diff --git a/src/parser.c b/src/parser.c index 0888794b..b2c1ad8b 100644 --- a/src/parser.c +++ b/src/parser.c @@ -793,7 +793,8 @@ static void maybe_flush_text_node_buffer(GumboParser* parser) { } assert(buffer_state->_type == GUMBO_NODE_WHITESPACE || - buffer_state->_type == GUMBO_NODE_TEXT); + buffer_state->_type == GUMBO_NODE_TEXT || + buffer_state->_type == GUMBO_NODE_CDATA); GumboNode* text_node = create_node(parser, buffer_state->_type); GumboText* text_node_data = &text_node->v.text; text_node_data->text = gumbo_string_buffer_to_string( @@ -1019,7 +1020,8 @@ static GumboNode* insert_foreign_element( static void insert_text_token(GumboParser* parser, GumboToken* token) { assert(token->type == GUMBO_TOKEN_WHITESPACE || - token->type == GUMBO_TOKEN_CHARACTER); + token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_CDATA); TextNodeBufferState* buffer_state = &parser->_parser_state->_text_node; if (buffer_state->_buffer.length == 0) { // Initialize position fields. @@ -1030,6 +1032,8 @@ static void insert_text_token(GumboParser* parser, GumboToken* token) { parser, token->v.character, &buffer_state->_buffer); if (token->type == GUMBO_TOKEN_CHARACTER) { buffer_state->_type = GUMBO_NODE_TEXT; + } else if (token->type == GUMBO_TOKEN_CDATA) { + buffer_state->_type = GUMBO_NODE_CDATA; } gumbo_debug("Inserting text token '%c'.\n", token->v.character); } @@ -2207,7 +2211,8 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) { reconstruct_active_formatting_elements(parser); insert_text_token(parser, token); return true; - } else if (token->type == GUMBO_TOKEN_CHARACTER) { + } else if (token->type == GUMBO_TOKEN_CHARACTER || + token->type == GUMBO_TOKEN_CDATA) { reconstruct_active_formatting_elements(parser); insert_text_token(parser, token); set_frameset_not_ok(parser); @@ -3492,6 +3497,7 @@ static bool handle_in_foreign_content(GumboParser* parser, GumboToken* token) { case GUMBO_TOKEN_WHITESPACE: insert_text_token(parser, token); return true; + case GUMBO_TOKEN_CDATA: case GUMBO_TOKEN_CHARACTER: insert_text_token(parser, token); set_frameset_not_ok(parser); diff --git a/src/tokenizer.c b/src/tokenizer.c index 297299b3..7a7ae3c0 100644 --- a/src/tokenizer.c +++ b/src/tokenizer.c @@ -319,7 +319,11 @@ static int ensure_lowercase(int c) { return c >= 'A' && c <= 'Z' ? c + 0x20 : c; } -static GumboTokenType get_char_token_type(int c) { +static GumboTokenType get_char_token_type(bool is_in_cdata, int c) { + if (is_in_cdata && c != -1) { + return GUMBO_TOKEN_CDATA; + } + switch (c) { case '\t': case '\n': @@ -479,11 +483,7 @@ static void finish_doctype_system_id(GumboParser* parser) { // Writes a single specified character to the output token. static void emit_char(GumboParser* parser, int c, GumboToken* output) { - if (parser->_tokenizer_state->_is_in_cdata) { - output->type = GUMBO_TOKEN_CDATA; - } else { - output->type = get_char_token_type(c); - } + output->type = get_char_token_type(parser->_tokenizer_state->_is_in_cdata, c); output->v.character = c; finish_token(parser, output); } diff --git a/tests/parser.cc b/tests/parser.cc index e565a248..c5877591 100644 --- a/tests/parser.cc +++ b/tests/parser.cc @@ -1522,6 +1522,21 @@ TEST_F(GumboParserTest, ImplicitlyCloseLists) { ASSERT_EQ(1, GetChildCount(li2)); } +TEST_F(GumboParserTest, CData) { + Parse(""); + + GumboNode* body; + GetAndAssertBody(root_, &body); + ASSERT_EQ(1, GetChildCount(body)); + + GumboNode* svg = GetChild(body, 0); + ASSERT_EQ(1, GetChildCount(svg)); + + GumboNode* cdata = GetChild(svg, 0); + ASSERT_EQ(GUMBO_NODE_CDATA, cdata->type); + EXPECT_STREQ("this is text", cdata->v.text.text); +} + TEST_F(GumboParserTest, FormattingTagsInHeading) { Parse("