Skip to content
This repository has been archived by the owner on Feb 15, 2023. It is now read-only.

Commit

Permalink
Add CDATA handling to parser, including a test for it.
Browse files Browse the repository at this point in the history
  • Loading branch information
nostrademons authored and vmg committed Feb 17, 2015
1 parent f9a515f commit 58d5fad
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 9 deletions.
12 changes: 9 additions & 3 deletions src/parser.c
Original file line number Diff line number Diff line change
Expand Up @@ -793,7 +793,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) {
}

assert(buffer_state->_type == GUMBO_NODE_WHITESPACE ||
buffer_state->_type == GUMBO_NODE_TEXT);
buffer_state->_type == GUMBO_NODE_TEXT ||
buffer_state->_type == GUMBO_NODE_CDATA);
GumboNode* text_node = create_node(parser, buffer_state->_type);
GumboText* text_node_data = &text_node->v.text;
text_node_data->text = gumbo_string_buffer_to_string(
Expand Down Expand Up @@ -1019,7 +1020,8 @@ static GumboNode* insert_foreign_element(

static void insert_text_token(GumboParser* parser, GumboToken* token) {
assert(token->type == GUMBO_TOKEN_WHITESPACE ||
token->type == GUMBO_TOKEN_CHARACTER);
token->type == GUMBO_TOKEN_CHARACTER ||
token->type == GUMBO_TOKEN_CDATA);
TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node;
if (buffer_state->_buffer.length == 0) {
// Initialize position fields.
Expand All @@ -1030,6 +1032,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) {
parser, token->v.character, &buffer_state->_buffer);
if (token->type == GUMBO_TOKEN_CHARACTER) {
buffer_state->_type = GUMBO_NODE_TEXT;
} else if (token->type == GUMBO_TOKEN_CDATA) {
buffer_state->_type = GUMBO_NODE_CDATA;
}
gumbo_debug("Inserting text token '%c'.\n", token->v.character);
}
Expand Down Expand Up @@ -2207,7 +2211,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
reconstruct_active_formatting_elements(parser);
insert_text_token(parser, token);
return true;
} else if (token->type == GUMBO_TOKEN_CHARACTER) {
} else if (token->type == GUMBO_TOKEN_CHARACTER ||
token->type == GUMBO_TOKEN_CDATA) {
reconstruct_active_formatting_elements(parser);
insert_text_token(parser, token);
set_frameset_not_ok(parser);
Expand Down Expand Up @@ -3492,6 +3497,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) {
case GUMBO_TOKEN_WHITESPACE:
insert_text_token(parser, token);
return true;
case GUMBO_TOKEN_CDATA:
case GUMBO_TOKEN_CHARACTER:
insert_text_token(parser, token);
set_frameset_not_ok(parser);
Expand Down
12 changes: 6 additions & 6 deletions src/tokenizer.c
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,11 @@ static int ensure_lowercase(int c) {
return c >= 'A' && c <= 'Z' ? c + 0x20 : c;
}

static GumboTokenType get_char_token_type(int c) {
static GumboTokenType get_char_token_type(bool is_in_cdata, int c) {
if (is_in_cdata && c != -1) {
return GUMBO_TOKEN_CDATA;
}

switch (c) {
case '\t':
case '\n':
Expand Down Expand Up @@ -479,11 +483,7 @@ static void finish_doctype_system_id(GumboParser* parser) {

// Writes a single specified character to the output token.
//
// parser: supplies the tokenizer state, whose _is_in_cdata flag is passed to
//         get_char_token_type when classifying the character.
// c:      the character to emit; stored verbatim in output->v.character.
// output: the token to fill in; it is handed to finish_token once its type
//         and character payload have been set.
static void emit_char(GumboParser* parser, int c, GumboToken* output) {
  // The token type is determined in exactly one place. A separate
  // _is_in_cdata branch here would only produce a value that this assignment
  // immediately overwrites.
  output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c);
  output->v.character = c;
  finish_token(parser, output);
}
Expand Down
15 changes: 15 additions & 0 deletions tests/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1522,6 +1522,21 @@ TEST_F(GumboParserTest, ImplicitlyCloseLists) {
ASSERT_EQ(1, GetChildCount(li2));
}

// A CDATA section inside foreign (SVG) content should produce a single
// GUMBO_NODE_CDATA child holding the section's text.
TEST_F(GumboParserTest, CData) {
  Parse("<svg><![CDATA[this is text]]></svg>");

  // <body> should contain only the <svg> element.
  GumboNode* body_node;
  GetAndAssertBody(root_, &body_node);
  ASSERT_EQ(1, GetChildCount(body_node));

  // <svg> should contain only the CDATA node.
  GumboNode* svg_node = GetChild(body_node, 0);
  ASSERT_EQ(1, GetChildCount(svg_node));

  // The child is typed as CDATA and carries the text between the delimiters.
  GumboNode* cdata_node = GetChild(svg_node, 0);
  ASSERT_EQ(GUMBO_NODE_CDATA, cdata_node->type);
  EXPECT_STREQ("this is text", cdata_node->v.text.text);
}

TEST_F(GumboParserTest, FormattingTagsInHeading) {
Parse("<h2>This is <b>old</h2>text");

Expand Down

0 comments on commit 58d5fad

Please sign in to comment.