\HTML5_Tokenizer
Synopsis
class HTML5_Tokenizer
{
- // constants
- const PCDATA = 0;
- const RCDATA = 1;
- const CDATA = 2;
- const PLAINTEXT = 3;
- const DOCTYPE = 0;
- const STARTTAG = 1;
- const ENDTAG = 2;
- const COMMENT = 3;
- const CHARACTER = 4;
- const SPACECHARACTER = 5;
- const EOF = 6;
- const PARSEERROR = 7;
- const ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
- const UPPER_ALPHA = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
- const LOWER_ALPHA = 'abcdefghijklmnopqrstuvwxyz';
- const DIGIT = '0123456789';
- const HEX = '0123456789ABCDEFabcdef';
- const WHITESPACE = "\t\n\x0c ";
- // members
- protected $stream;
- private $tree;
- protected $content_model;
- protected $token;
- // methods
- public void __construct()
- public void parseFragment()
- public void parse()
- public void save()
- public void stream()
- private void consumeCharacterReference()
- private void characterReferenceInAttributeValue()
- protected void emitToken()
Constants
Name | Value |
---|---|
PCDATA | 0 |
RCDATA | 1 |
CDATA | 2 |
PLAINTEXT | 3 |
DOCTYPE | 0 |
STARTTAG | 1 |
ENDTAG | 2 |
COMMENT | 3 |
CHARACTER | 4 |
SPACECHARACTER | 5 |
EOF | 6 |
PARSEERROR | 7 |
ALPHA | 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' |
UPPER_ALPHA | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' |
LOWER_ALPHA | 'abcdefghijklmnopqrstuvwxyz' |
DIGIT | '0123456789' |
HEX | '0123456789ABCDEFabcdef' |
WHITESPACE | "\t\n\x0c " |
Members
private
-
$tree
Tree builder that the tokenizer emits tokens to.
protected
-
$content_model
Current content model we are parsing as.
-
$stream
Points to an InputStream object.
-
$token
Current token that is being built, but not yet emitted. Also the last token emitted, if applicable.
Methods
private
- characterReferenceInAttributeValue()
- consumeCharacterReference()
protected
- emitToken() — Emits a token, passing it on to the tree builder.
public
- __construct()
- parse() — Performs the actual parsing of the document.
- parseFragment()
- save() — Returns a serialized representation of the tree.
- stream() — Returns the input stream.