Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Split inline from block parser class, and other small refactors #1270

Merged
merged 6 commits into from
Jan 15, 2025
205 changes: 104 additions & 101 deletions lib/model/content.dart
Original file line number Diff line number Diff line change
Expand Up @@ -786,7 +786,7 @@ class MathInlineNode extends InlineContentNode {
class GlobalTimeNode extends InlineContentNode {
const GlobalTimeNode({super.debugHtmlNode, required this.datetime});

/// Always in UTC, enforced in [_ZulipContentParser.parseInlineContent].
/// Always in UTC, enforced in [_ZulipInlineContentParser.parseInlineContent].
final DateTime datetime;

@override
Expand All @@ -806,72 +806,68 @@ class GlobalTimeNode extends InlineContentNode {

////////////////////////////////////////////////////////////////

/// What sort of nodes a [_ZulipContentParser] is currently expecting to find.
enum _ParserContext {
/// The parser is currently looking for block nodes.
block,
String? _parseMath(dom.Element element, {required bool block}) {
final dom.Element katexElement;
if (!block) {
assert(element.localName == 'span' && element.className == 'katex');

/// The parser is currently looking for inline nodes.
inline,
}

class _ZulipContentParser {
/// The current state of what sort of nodes the parser is looking for.
///
/// This exists for the sake of debug-mode checks,
/// and should be read or updated only inside an assertion.
_ParserContext _debugParserContext = _ParserContext.block;

String? parseMath(dom.Element element, {required bool block}) {
assert(block == (_debugParserContext == _ParserContext.block));

final dom.Element katexElement;
if (!block) {
assert(element.localName == 'span' && element.className == 'katex');
katexElement = element;
} else {
assert(element.localName == 'span' && element.className == 'katex-display');

katexElement = element;
} else {
assert(element.localName == 'span' && element.className == 'katex-display');

if (element.nodes.length != 1) return null;
final child = element.nodes.single;
if (child is! dom.Element) return null;
if (child.localName != 'span') return null;
if (child.className != 'katex') return null;
katexElement = child;
}

// Expect two children span.katex-mathml, span.katex-html .
// For now we only care about the .katex-mathml .
if (katexElement.nodes.isEmpty) return null;
final child = katexElement.nodes.first;
if (element.nodes.length != 1) return null;
final child = element.nodes.single;
if (child is! dom.Element) return null;
if (child.localName != 'span') return null;
if (child.className != 'katex-mathml') return null;

if (child.nodes.length != 1) return null;
final grandchild = child.nodes.single;
if (grandchild is! dom.Element) return null;
if (grandchild.localName != 'math') return null;
if (grandchild.attributes['display'] != (block ? 'block' : null)) return null;
if (grandchild.namespaceUri != 'http://www.w3.org/1998/Math/MathML') return null;

if (grandchild.nodes.length != 1) return null;
final greatgrand = grandchild.nodes.single;
if (greatgrand is! dom.Element) return null;
if (greatgrand.localName != 'semantics') return null;

if (greatgrand.nodes.isEmpty) return null;
final descendant4 = greatgrand.nodes.last;
if (descendant4 is! dom.Element) return null;
if (descendant4.localName != 'annotation') return null;
if (descendant4.attributes['encoding'] != 'application/x-tex') return null;

return descendant4.text.trim();
if (child.className != 'katex') return null;
katexElement = child;
}

// Expect two children span.katex-mathml, span.katex-html .
// For now we only care about the .katex-mathml .
if (katexElement.nodes.isEmpty) return null;
final child = katexElement.nodes.first;
if (child is! dom.Element) return null;
if (child.localName != 'span') return null;
if (child.className != 'katex-mathml') return null;

if (child.nodes.length != 1) return null;
final grandchild = child.nodes.single;
if (grandchild is! dom.Element) return null;
if (grandchild.localName != 'math') return null;
if (grandchild.attributes['display'] != (block ? 'block' : null)) return null;
if (grandchild.namespaceUri != 'http://www.w3.org/1998/Math/MathML') return null;

if (grandchild.nodes.length != 1) return null;
final greatgrand = grandchild.nodes.single;
if (greatgrand is! dom.Element) return null;
if (greatgrand.localName != 'semantics') return null;

if (greatgrand.nodes.isEmpty) return null;
final descendant4 = greatgrand.nodes.last;
if (descendant4 is! dom.Element) return null;
if (descendant4.localName != 'annotation') return null;
if (descendant4.attributes['encoding'] != 'application/x-tex') return null;

return descendant4.text.trim();
}

/// Parser for the inline-content subtrees within Zulip content HTML.
///
/// The only entry point to this class is [parseBlockInline].
///
/// After a call to [parseBlockInline] returns, the [_ZulipInlineContentParser]
/// instance has been reset to its starting state, and can be re-used for
/// parsing other subtrees.
class _ZulipInlineContentParser {
InlineContentNode? parseInlineMath(dom.Element element) {
final debugHtmlNode = kDebugMode ? element : null;
final texSource = _parseMath(element, block: false);
if (texSource == null) return null;
return MathInlineNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
}

UserMentionNode? parseUserMention(dom.Element element) {
assert(_debugParserContext == _ParserContext.inline);
assert(element.localName == 'span');
final debugHtmlNode = kDebugMode ? element : null;

Expand Down Expand Up @@ -945,7 +941,6 @@ class _ZulipContentParser {
static final _emojiCodeFromClassNameRegexp = RegExp(r"emoji-([^ ]+)");

InlineContentNode parseInlineContent(dom.Node node) {
assert(_debugParserContext == _ParserContext.inline);
final debugHtmlNode = kDebugMode ? node : null;
InlineContentNode unimplemented() => UnimplementedInlineContentNode(htmlNode: node);

Expand Down Expand Up @@ -1025,36 +1020,49 @@ class _ZulipContentParser {
}

if (localName == 'span' && className == 'katex') {
final texSource = parseMath(element, block: false);
if (texSource == null) return unimplemented();
return MathInlineNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
return parseInlineMath(element) ?? unimplemented();
}

// TODO more types of node
return unimplemented();
}

List<InlineContentNode> parseInlineContentList(List<dom.Node> nodes) {
assert(_debugParserContext == _ParserContext.inline);
return nodes.map(parseInlineContent).toList(growable: false);
}

/// Parse the children of a [BlockInlineContainerNode], making up a
/// complete subtree of inline content with no further inline ancestors.
({List<InlineContentNode> nodes, List<LinkNode>? links}) parseBlockInline(List<dom.Node> nodes) {
assert(_debugParserContext == _ParserContext.block);
assert(() {
_debugParserContext = _ParserContext.inline;
return true;
}());
final resultNodes = parseInlineContentList(nodes);
assert(() {
_debugParserContext = _ParserContext.block;
return true;
}());
return (nodes: resultNodes, links: _takeLinkNodes());
}
}

/// Parser for a complete piece of Zulip HTML content, a [ZulipContent].
///
/// The only entry point to this class is [parse].
class _ZulipContentParser {
/// The single inline-content parser used and re-used throughout parsing of
/// a complete piece of Zulip HTML content.
///
/// Because block content can never appear nested inside inline content,
/// there's never a need for more than one of these at a time,
/// so we can allocate just one up front.
final inlineParser = _ZulipInlineContentParser();

({List<InlineContentNode> nodes, List<LinkNode>? links}) parseBlockInline(List<dom.Node> nodes) {
return inlineParser.parseBlockInline(nodes);
}

BlockContentNode parseMathBlock(dom.Element element) {
final debugHtmlNode = kDebugMode ? element : null;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like we will end up with a different debugHtmlNode here, but that's a debug-mode-only behavior so it should be fine as a part of an [nfc] commit.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yeah, would have been good for me to make that explicit.

There's also a related change that if it fails to parse, the UnimplementedNode will get the span.katex-display child instead of the p parent. I think that change is good (and it's definitely needed in the #1156 context, and in fact the current revision of #1156 effectively already makes that change) but it's not quite NFC.

I actually had that thought at one point as I was writing these changes; but I guess I forgot at the moment I was making that commit.

final texSource = _parseMath(element, block: true);
if (texSource == null) return UnimplementedBlockContentNode(htmlNode: element);
return MathBlockNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
}

BlockContentNode parseListNode(dom.Element element) {
assert(_debugParserContext == _ParserContext.block);
ListStyle? listStyle;
switch (element.localName) {
case 'ol': listStyle = ListStyle.ordered; break;
Expand All @@ -1077,7 +1085,6 @@ class _ZulipContentParser {
}

BlockContentNode parseSpoilerNode(dom.Element divElement) {
assert(_debugParserContext == _ParserContext.block);
assert(divElement.localName == 'div'
&& divElement.className == 'spoiler-block');

Expand All @@ -1097,7 +1104,6 @@ class _ZulipContentParser {
}

BlockContentNode parseCodeBlock(dom.Element divElement) {
assert(_debugParserContext == _ParserContext.block);
final mainElement = () {
assert(divElement.localName == 'div'
&& divElement.className == "codehilite");
Expand Down Expand Up @@ -1180,7 +1186,6 @@ class _ZulipContentParser {
static final _imageDimensionsRegExp = RegExp(r'^(\d+)x(\d+)$');

BlockContentNode parseImageNode(dom.Element divElement) {
assert(_debugParserContext == _ParserContext.block);
final elements = () {
assert(divElement.localName == 'div'
&& divElement.className == 'message_inline_image');
Expand Down Expand Up @@ -1272,7 +1277,6 @@ class _ZulipContentParser {
}();

BlockContentNode parseInlineVideoNode(dom.Element divElement) {
assert(_debugParserContext == _ParserContext.block);
assert(divElement.localName == 'div'
&& _videoClassNameRegexp.hasMatch(divElement.className));

Expand Down Expand Up @@ -1305,7 +1309,6 @@ class _ZulipContentParser {
}

BlockContentNode parseEmbedVideoNode(dom.Element divElement) {
assert(_debugParserContext == _ParserContext.block);
assert(divElement.localName == 'div'
&& _videoClassNameRegexp.hasMatch(divElement.className));

Expand Down Expand Up @@ -1344,7 +1347,6 @@ class _ZulipContentParser {
}

BlockContentNode parseTableContent(dom.Element tableElement) {
assert(_debugParserContext == _ParserContext.block);
assert(tableElement.localName == 'table'
&& tableElement.className.isEmpty);

Expand Down Expand Up @@ -1452,7 +1454,6 @@ class _ZulipContentParser {
}

BlockContentNode parseBlockContent(dom.Node node) {
assert(_debugParserContext == _ParserContext.block);
final debugHtmlNode = kDebugMode ? node : null;
if (node is! dom.Element) {
return UnimplementedBlockContentNode(htmlNode: node);
Expand Down Expand Up @@ -1480,9 +1481,7 @@ class _ZulipContentParser {
// The case with the `<br>\n` can happen when at the end of a quote;
// it seems like a glitch in the server's Markdown processing,
// so hopefully there just aren't any further such glitches.
final texSource = parseMath(child, block: true);
if (texSource == null) return UnimplementedBlockContentNode(htmlNode: node);
return MathBlockNode(texSource: texSource, debugHtmlNode: debugHtmlNode);
return parseMathBlock(child);
}
}
}
Expand Down Expand Up @@ -1579,10 +1578,15 @@ class _ZulipContentParser {
///
/// See [ParagraphNode].
List<BlockContentNode> parseImplicitParagraphBlockContentList(dom.NodeList nodes) {
assert(_debugParserContext == _ParserContext.block);
final List<BlockContentNode> result = [];
final List<dom.Node> currentParagraph = [];

List<ImageNode> imageNodes = [];
void consumeImageNodes() {
result.add(ImageNodeList(imageNodes));
imageNodes = [];
}

final List<dom.Node> currentParagraph = [];
void consumeParagraph() {
final parsed = parseBlockInline(currentParagraph);
result.add(ParagraphNode(
Expand All @@ -1597,8 +1601,7 @@ class _ZulipContentParser {

if (_isPossibleInlineNode(node)) {
if (imageNodes.isNotEmpty) {
result.add(ImageNodeList(imageNodes));
imageNodes = [];
consumeImageNodes();
// In a context where paragraphs are implicit it should be impossible
// to have more paragraph content after image previews.
result.add(UnimplementedBlockContentNode(htmlNode: node));
Expand All @@ -1613,24 +1616,25 @@ class _ZulipContentParser {
imageNodes.add(block);
continue;
}
if (imageNodes.isNotEmpty) {
result.add(ImageNodeList(imageNodes));
imageNodes = [];
}
if (imageNodes.isNotEmpty) consumeImageNodes();
result.add(block);
}
if (currentParagraph.isNotEmpty) consumeParagraph();
if (imageNodes.isNotEmpty) result.add(ImageNodeList(imageNodes));

if (imageNodes.isNotEmpty) consumeImageNodes();
return result;
}

static final _redundantLineBreaksRegexp = RegExp(r'^\n+$');

List<BlockContentNode> parseBlockContentList(dom.NodeList nodes) {
assert(_debugParserContext == _ParserContext.block);
final List<BlockContentNode> result = [];

List<ImageNode> imageNodes = [];
void consumeImageNodes() {
result.add(ImageNodeList(imageNodes));
imageNodes = [];
}

for (final node in nodes) {
// We get a bunch of newline Text nodes between paragraphs.
// A browser seems to ignore these; let's do the same.
Expand All @@ -1643,13 +1647,10 @@ class _ZulipContentParser {
imageNodes.add(block);
continue;
}
if (imageNodes.isNotEmpty) {
result.add(ImageNodeList(imageNodes));
imageNodes = [];
}
if (imageNodes.isNotEmpty) consumeImageNodes();
result.add(block);
}
if (imageNodes.isNotEmpty) result.add(ImageNodeList(imageNodes));
if (imageNodes.isNotEmpty) consumeImageNodes();
return result;
}

Expand All @@ -1660,6 +1661,8 @@ class _ZulipContentParser {
}
}

/// Parse a complete piece of Zulip HTML content,
/// such as an entire value of [Message.content].
ZulipContent parseContent(String html) {
return _ZulipContentParser().parse(html);
}