diff --git a/CMakeLists.txt b/CMakeLists.txt index f7843638..54cd14d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,10 +8,10 @@ cmake_minimum_required (VERSION 2.6) set (My_Project_Title "MultiMarkdown") set (My_Project_Description "Lightweight markup processor to produce HTML, LaTeX, and more.") set (My_Project_Author "Fletcher T. Penney") -set (My_Project_Revised_Date "2017-03-13") +set (My_Project_Revised_Date "2017-03-15") set (My_Project_Version_Major 6) set (My_Project_Version_Minor 0) -set (My_Project_Version_Patch 0-b2) +set (My_Project_Version_Patch 0-rc1) set (My_Project_Version "${My_Project_Version_Major}.${My_Project_Version_Minor}.${My_Project_Version_Patch}") diff --git a/DevelopmentNotes/DevelopmentNotes.epub b/DevelopmentNotes/DevelopmentNotes.epub new file mode 100644 index 00000000..58c0aceb Binary files /dev/null and b/DevelopmentNotes/DevelopmentNotes.epub differ diff --git a/DevelopmentNotes/DevelopmentNotes.fodt b/DevelopmentNotes/DevelopmentNotes.fodt new file mode 100644 index 00000000..f02814d3 --- /dev/null +++ b/DevelopmentNotes/DevelopmentNotes.fodt @@ -0,0 +1,1408 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Bibliography + + + + MultiMarkdown v6 Development Notes + Fletcher T. Penney + 2017-03-15 + + + +Introduction + +This document includes some notes on the development of MultiMarkdown (MMD) v6. Most of it +will be interesting only to other developers or those needing to choose the +absolute “best” Markdown (MD) implementation for their needs – it is not required +reading to understand how the software works. + +Why a New Version? + +MultiMarkdown version 5 was released in November of 2015, but the codebase was +essentially the same as that of v4 – and that was released in beta in April +of 2013. A few key things prompted work on a new version: + + + +Accuracy – MMD v4 and v5 were the most accurate versions yet, and a lot of +effort went into finding and resolving various edge cases. However, it began +to feel like a game of whack-a-mole where new bugs would creep in every time I +fixed an old one. The PEGParsing Expression Grammar https://en.wikipedia.org/wiki/Parsing_expression_grammar began to feel rather convoluted in spots, even +though it did allow for a precise (if not always accurate) specification of +the grammar. + + +Performance – “Back in the day” peg-markdown was one of the fastest +Markdown parsers around. MMD v3 was based on peg-markdown, and would leap- +frog with it in terms of performance. Then CommonMark was released, which +was a bit faster. Then a couple of years went by and CommonMark became much +faster – in one of my test suites, MMD v 5.4.0 takes about 25 times longer to +process a long document than CommonMark 0.27.0. + + + +In the spring of 2016, I decided I wanted to rewrite MultiMarkdown from scratch, +building the parser myself rather than relying on a pre-rolled solution. (I +had been using greg to compile the PEG +into parser code. It worked well overall, but lacked some features I needed, +requiring a lot of workarounds.) 
+ +First Attempt + +My first attempt started by hand-crafting a parser that scanned through the +document a line at a time, deciding what to do with each line as it found +them. I used regex parsers made with re2c to +help classify each line, and then a separate parser layer to process groups of +lines into blocks. Initially this approach worked well, and was really +efficient. But I quickly began to code my way into a dead-end – the strategy +was not elegant enough to handle things like nested lists, etc. + +One thing that did turn out well from the first attempt, however, was an +approach for handling <emph> and <strong> parsing. I’ve learned over the +years that this can be one of the hardest parts of coding accurately for +Markdown. There are many examples that are obvious to a person, but difficult +to properly “explain” how to parse to a computer. + +No solution is perfect, but I developed an approach that seems to accurately +handle a wide range of situations without a great deal of complexity: + + + +Scan the documents for asterisks (*). Each one will be handled one at a +time. + + +Unlike brackets ([ and ]), an asterisk is “ambidextrous”, in that it +may be able to open a matched pair of asterisks, close a pair, or both. For +example, in foo *bar* foo: + + + +The first asterisk can open a pair, but not close one. + + +The second asterisk can close a pair, but not open one. + + + + +So, once the asterisks have been identified, each has to be examined to +determine whether it can open/close/both. The algorithm is not that complex, +but I’ll describe it in general terms. Check the code for more specifics. +This approach seems to work, but might still need some slight tweaking. In +the future, I’ll codify this better in language rather than just in code. + + + +If there is whitespace to the left of an asterisk, it can’t close. + + +If there is whitespace or punctuation to the right it can’t open. + + +“Runs” of asterisks, e.g. **bar are treated as a unit in terms of +looking left/right. + + +Asterisks inside a word are a bit trickier – we look at the number of +asterisks before the word, the number in the current run, and the number +of asterisks after the word to determine which combinations, if any, are +permitted. + + + + +Once all asterisks have been tagged as able to open/close/both, we proceed +through them in order: + + + +When we encounter a tag that can close, we look to see if there is a +previous opener that has not been paired off. If so, pair the two and +remove the opener from the list of available asterisks. + + +When we encounter an opener, add it to the stack of available openers. + + +When encounter an asterisk that can do both, see if it can close an +existing opener. If not, then add it to the stack. + + + + +After all tokens in the block have been paired, then we look for nesting +pairs of asterisks in order to create <emph> and <strong> sets. For +example, assume we have six asterisks wrapped around a word, three in front, +and three after. The asterisks are indicated with numbers: 123foo456. We +proceed in the following manner: + + + +Based on the pairing algorithm above, these asterisks would be paired as +follows, with matching asterisks sharing numbers – 123foo321. + + +Moving forwards, we come to asterisk “1”. It is followed by an +asterisk, so we check to see if they should be grouped as a <strong>. +Since the “1” asterisks are wrapped immediately outside the “2” asterisks, +they are joined together. 
More than two pairs can’t be joined, so we now +get the following – 112foo211, where the “11” represents the opening +and closing of a <strong>, and the “2” represents a <emph>. + + + + +When matching a pair, any unclosed openers that are on the stack are +removed, preventing pairs from “crossing” or “intersecting”. Pairs can wrap +around each other, e.g. [(foo)], but not intersect like [(foo]). In the +second case, the brackets would close, removing the ( from the stack. + + +This same approach is used in all tokens that are matched in pairs– +[foo], (foo), _foo_, etc. There’s slightly more to it, but once you +figure out how to assign opening/closing ability, the rest is easy. By using +a stack to track available openers, it can be performed efficiently. + + + +In my testing, this approach has worked quite well. It handles all the basic +scenarios I’ve thrown at it, and all of the “basic” and “devious” edge cases I +have thought of (some of these don’t necessarily have a “right” answer – but +v6 gives consistency answers that seem as reasonable as any others to me). +There are also three more edge cases I’ve come up can still stump it, and +ironically they are handled correctly by most implementations. They just +don’t follow the rules above. I’ll continue to work on this. + +In the end, I scrapped this effort, but kept the lessons learned in the token +pairing algorithm. + +Second Attempt + +I tried again this past Fall. This time, I approached the problem with lots +of reading. Lots and lots of reading – tons of websites, computer science +journal articles, PhD theses, etc. Learned a lot about lexers, and a lot +about parsers, including hand-crafting vs using parser generators. In brief: + + + +I learned about the Aho–Corasick algorithm, which is a great way to +efficiently search a string for multiple target strings at once. I used this +to create a custom lexer to identify tokens in a MultiMarkdown text document +(e.g. *, [, {++, etc.). I learned a lot, and had a good time working +out the implementation. This code efficiently allowed me to break a string of +text into the tokens that mattered for Markdown parsing. + + +However, in a few instances I really needed some features of regular +expressions to simplify more complex structures. After a quick bit of testing, +using re2c to create a tokenizer was just as efficient, and allowed me to +incorporate some regex functionality that simplified later parsing. I’ll keep +the Aho-Corasick stuff around, and will probably experiment more with it +later. But I didn’t need it for MMD now. lexer.re contains the source for +the tokenizer. + + + +I looked long and hard for a way to simplify the parsing algorithm to try and +“touch” each token only once. Ideally, the program could step through each +token, and decide when to create a new block, when to pair things together, +etc. But I’m not convinced it’s possible. Since Markdown’s grammar varies +based on context, it seems to work best when handled in distinct phases: + + + +Tokenize the string to identify key sections of text. This includes line +breaks, allowing the text to be examined one line at time. + + +Join series of lines together into blocks, such as paragraphs, code blocks, +lists, etc. + + +The tokens inside each block can then be paired together to create more +complex syntax such as links, strong, emphasis, etc. + + + +To handle the block parsing, I started off using the Aho-Corasick code to +handle my first attempt. 
I had actually implemented some basic regex +functionality, and used that to group lines together to create blocks. But +this quickly fell apart in the face of more complex structures such as +recursive lists. After a lot of searching, and tons more reading, I +ultimately decided to use a parser generator to handle the task of group lines +into blocks. parser.y has the source for this, and it is processed by the +lemon parser generator to create the actual +code. + +I chose to do this because hand-crafting the block parser would be complex. +The end result would likely be difficult to read and understand, which would +make it difficult to update later on. Using the parser generator allows me to +write things out in a way that can more easily be understood by a person. In +all likelihood, the performance is probably as good as anything I could do +anyway, if not better. + +Because lemon is a LALR(1) parser, it does require a bit of thinking ahead +about how to create the grammar used. But so far, it has been able to handle +everything I have thrown at it. + +Optimization + +One of my goals for MMD 6 was performance. So I’ve paid attention to speed +along the way, and have tried to use a few tricks to keep things fast. Here +are some things I’ve learned along the way. In no particular order: + +Memory Allocation + +When parsing a long document, a lot of token structures are created. Each +one requires a small bit of memory to be allocated. In aggregate, that time +added up and slowed down performance. + +After reading for a bit, I ended up coming up with an approach that uses +larger chunks of memory. I allocate pools of of memory in large slabs for +smaller “objects”". For example, I allocate memory for 1024 tokens at a +single time, and then dole that memory out as needed. When the slab is empty, +a new one is allocated. This dramatically improved performance. + +When pairing tokens, I created a new stack for each block. I realized that an +empty stack didn’t have any “leftover” cruft to interfere with re-use, so I +just used one for the entire document. Again a sizeable improvement in +performance from only allocating one object instead of many. When recursing +to a deeper level, the stack just gets deeper, but earlier levels aren’t +modified. + +Speaking of tokens, I realized that the average document contains a lot of +single spaces (there’s one between every two words I have written, for +example.) The vast majority of the time, these single spaces have no effect +on the output of Markdown documents. I changed my whitespace token search to +only flag runs of 2 or more spaces, dramatically reducing the number of +tokens. This gives the benefit of needing fewer memory allocations, and also +reduces the number of tokens that need to be processed later on. The only +downside is remember to check for a single space character in a few instances +where it matters. + +Proper input buffering + +When I first began last spring, I was amazed to see how much time was being +spent by MultiMarkdown simply reading the input file. Then I discovered it +was because I was reading it one character at a time. I switched to using a +buffered read approach and the time to read the file went to almost nothing. I +experimented with different buffer sizes, but they did not seem to make a +measurable difference. + +Output Buffering + +I experimented with different approaches to creating the output after parsing. +I tried printing directly to stdout, and even played with different +buffering settings. 
None of those seemed to work well, and all were slower +than using the d_string approach (formerly call GString in MMD 5). + +Fast Searches + +After getting basic Markdown functionality complete, I discovered during +testing that the time required to parse a document grew exponentially as the +document grew longer. Performance was on par with CommonMark for shorter +documents, but fell increasingly behind in larger tests. Time profiling found +that the culprit was searching for link definitions when they didn’t exist. +My first approach was to keep a stack of used link definitions, and to iterate +through them when necessary. In long documents, this performs very poorly. +More research and I ended up using +uthash. This allows me to search for +a link (or footnote, etc.) by “name” rather than searching through an array. +This allowed me to get MMD’s performance back to O(n), taking roughly twice as +much time to process a document that is twice as long. + +Efficient Utility Functions + +It is frequently necessary when parsing Markdown to check what sort of +character we are dealing with at a certain position – a letter, whitespace, +punctuation, etc. I created a lookup table for this via char_lookup.c and +hard-coded it in char.c. These routines allow me to quickly, and +consistently, classify any byte within a document. This saved a lot of +programming time, and saved time tracking down bugs from handling things +slightly differently under different circumstances. I also suspect it +improved performance, but don’t have the data to back it up. + +Testing While Writing + +I developed several chunks of code in parallel while creating MMD 6. The vast +majority of it was developed largely in a test-driven development approach. +The other code was largely created with extensive unit testing to accomplish +this. + +MMD isn’t particularly amenable to this approach at the small level, but +instead I relied more on integration testing with an ever-growing collection +of text files and the corresponding HTML files in the MMD 6 test suite. This +allowed me to ensure new features work properly and that old features aren’t +broken. At this time, there are 29 text files in the test suite, and many +more to come. + +Other Lessons + +Some things that didn’t do me any good…. + +I considered differences between using malloc and calloc when initializing +tokens. The time saved by using malloc was basically exactly offset by the +initial time required to initialize the token to default null values as +compared to using calloc. When trying calloc failed to help me out +(thinking that clearing a single slab in the object pool would be faster), I +stuck with malloc as it makes more sense to me in my workflow. + +I read a bit about struct padding and reordered some of my structs. It was +until later that I discovered the -Wpadded option, and it’s not clear +whether my changes modified anything. Since the structs were being padded +automatically, there was no noticeable performance change, and I didn’t have +the tools to measure whether I could have improved memory usage at all. Not +sure this would be worth the effort – much lower hanging fruit available. + +Performance + +Basic tests show that currently MMD 6 takes about 20–25% longer the CommonMark +0.27.0 to process long files (e.g. 0.2 MB). However, it is around 5% faster +than CommonMark when parsing a shorter file (27 kB) (measured by parsing the +same file 200 times over). 
This test suite is performed by using the Markdown +[syntax page], modified to avoid the use of the Setext header at the top. The +longer files tested are created by copying the same syntax page onto itself, +thereby doubling the length of the file with each iteration. + +The largest file I test is approximately 108 MB (4096 copies of the syntax +page). On my machine (2012 Mac mini with 2.3 GHz Intel Core i7, 16 GB RAM), +it takes approximately 4.4 seconds to parse with MMD 6 and 3.7 seconds with +CommonMark. MMD 6 processes approximately 25 MB/s on this test file. +CommonMark 0.27.0 gets about 29 MB/s on the same machine. + +There are some slight variations with the smaller test files (8–32 copies), +but overall the performance of both programs (MMD 6 and CommonMark) are +roughly linear as the test file gets bigger (double the file size and it takes +twice as long to parse, aka O(n)). + +Out of curiosity, I ran the same tests on the original Markdown.pl by Gruber +(v 1.0.2b8). It took approximately 178 seconds to parse 128 copies of the +file (3.4 MB) and was demonstrating quadratic performance characteristics +(double the file size and it takes 22 or 4 times longer to process, aka +O(n2)). I didn’t bother running it on larger versions of the test file. For +comparison, MMD 6 can process 128 copies in approximately 140 msec. + +Of note, the throughput speed drops when testing more complicated files +containing more advanced MultiMarkdown features, though it still seems to +maintain linear performance characteristics. A second test file is created by +concatenating all of the test suite files (including the Markdown syntax +file). In this case, MMD gets about 13 MB/s. CommonMark doesn’t support +these additional features, so testing it with that file is not relevant. I +will work to see whether there are certain features in particular that are +more challenging and see whether they can be reworked to improve performance. + +As above, I have done some high level optimization of the parse strategy, but +I’m sure there’s still a lot of room for further improvement to be made. +Suggestions welcome! + +Testing + +Test Suite + +The development of MMD v6 was heavily, but not absolutely, influenced by the +philosophy of test-driven development. While coding, I made use of test +suites to verify successful implementation of new features, to avoid +regression problems when adding new features, and to identify known edge cases +in need of proper handling. + +The test suite (located in tests/MMD6Tests) is a “living” collection of +documents that will continue to be updated as new bugs and edge cases are +identified. This helps make proper integration testing of the entire +application with every release. + +Fuzz Testing + +I was not familiar with the concept of Fuzz Testing +(https://en.wikipedia.org/wiki/Fuzzing) until a user mentioned something about +it to me a year or two ago. I had never used it before, but it seemed like a +good idea. I implement it in two ways. + +The first is that I created a simplified version of the line parser that +simply accepts various combinations of line type identifiers to see if they +would successfully parse. The line parser is responsible for taking a series +of line types (e.g. plain text, indented line, etc.) and determining what sort +of block they should become. The file test/parser_text.y is run through the +lemon program, compiled (with or without the -DNDEBUG flag) and then run. 
+It sequentially throws every combination of line types at the simplified line +parser to make sure that it doesn’t choke. When I first did this, I found +several combinations of lines that did not pass. + +NOTE: This does not verify accurate parsing, simply that the parser does +not crash by an unacceptable combination of lines. + +The second form of fuzz testing I have started using more recently. This is +using the American fuzzy lop program to try +to find text input that crashes MMD. This works by taking sample input (e.g. +files from the test suite), modifying them slightly, and trying the modified +versions. Do this over and over and over, and some interesting edge cases are +sometimes identified. I have found some interesting edge cases this way. +Definitely a very useful tool! + +Unit Testing + +Some of the original development was done with unit testing in some other +tools I developed. This code formed the basis of a few parts of MMD. +Otherwise, it was hard to see how to really create very good unit tests for +the development of MMD. So there is really not much unit testing built into +the code or used during the development. + +Changelog + + + +2017–03–15 – v 6.0.0-rc1: + + + + +FIXED: Add missing CriticMarkup tokens to LaTeX + + + +FIXED: Don‘t let labels end on ’' that is escaping the closing ‘]’ + + + +FIXED: Fix NULL pointer dereference + + + +FIXED: Fix bug in Aho–Corasick implementation + + + +FIXED: Fix bug with ATX Headers without newline + + + +FIXED: Fix bug with Setext header starting with ‘:’ + + + +FIXED: Fix bug with leading spaces in abbreviation references + + + +FIXED: Fix crash with empty definition + + + +FIXED: Fix edge case with URL definitions + + + +FIXED: Fix edge case with superscripts + + + +FIXED: Fix null dereference error in CriticMarkup substitution + + + +FIXED: Fix potential bug in Aho–Corasick search: + + + +FIXED: Fix potential bug in storing items to hash + + + +FIXED: Fix potential bug with line–>block parser + + + +FIXED: Fix potential crash in attribute parsing + + + +FIXED: Fix printing raw CriticMarkup tokens in LaTeX + + + +FIXED: Fix signedness bug in Aho–Corasick + + + +FIXED: Improve metadata edge cases; Fix NULL pointer dereference + + + +FIXED: Include non–breaking space (ASCII 160) in re2c patterns + + + +FIXED: Keep ‘:’ in false positive definitions + + + +FIXED: Lex space followed by tab as space, not text + + + +FIXED: Limit lines treated as ATX headers + + +FIXED: Update test code + + + + +2017–03–13 – v 6.0.0-b2: + + + + +ADDED: Add CriticMarkup preprocessor that works across empty lines when accepting/rejecting markup + + + +ADDED: Add back the mmd6 latex title file + + + +ADDED: Basic EPUB 3 support – uses ‘miniz’ library to zip creation + + + +ADDED: Update QuickStart and EPUB code + + + +CHANGED: Update QuickStart guide + + + +CHANGED: Update test suite + + + +FIXED: Don't duplicate LaTeX glossary definitions + + + +FIXED: Fix abbreviations in ODF; Improve test suite + + + +FIXED: Improve glossaries and abbreviations; Update QuickStart + + + +FIXED: Tidy up some compiler warnings in code + + +FIXED: Use custom UUID code to minimize external dependencies + + + + +2017–03–09 – v 6.0.0-b1: + + + + +ADDED: Add French translations; fix typo in German + + + +ADDED: Add Quick Start guide + + + +ADDED: Add functionality to automatically identify abbreviations and glossary terms in source + + + +ADDED: Improve LaTeX configuration files + + + +ADDED: Update German translations + + + +ADDED: Use native ODF table of contents instead 
of a manual list + + + +ADDED: Use native command for table of contents in LaTeX + + + +CHANGED: Bring HTML and ODF into line with LaTeX as to output of abbreviatinos on first and subsequent uses + + + +CHANGED: Slight performance tweak + + + +CHANGED: Update German test suite + + + +FIXED: Allow {{TOC}} in latex verbatim + + + +FIXED: Don't free token_pool if never initialized + + + +FIXED: Fix German typo + + + +FIXED: Fix missing token type + + + +FIXED: Improve performance of checking document for metadata, which improves performance when checking for possible transclusion + + +FIXED: Update test suite for abbreviation changes + + + + +2017–03–05 – v 0.4.2-b: + + + + +ADDED: Add and utility functions; fix memory leak + + + +ADDED: Initial abbreviation support + + + +ADDED: Keep working on Abbreviations/Glossaries + + + +ADDED: Refactor abbreviation code; Add inline abbreviations; Fix abbreviations in ODF + + + +ADDED: Update Inline Footnote test + + + +CHANGED: Add comments to i18n.h + + + +CHANGED: Finish refactoring note–related code + + + +CHANGED: Refactor footnotes + + + +CHANGED: Refactor glossary code + + + +CHANGED: Remove offset from html export functions + + + +FIXED: latex list items need to block optional argument to allow ‘[’ as first character + + +Merge branch ‘release/0.4.1-b’ into develop + + + + +2017–03–04 – v 0.4.1-b: + + + +FIXED: Add glossary localization + + + + +2017–03–04 – v 0.4.0-b: + + + + +ADDED: Add TOC support to ODF + + + +ADDED: Add glossary support to ODF + + + +ADDED: Add prelim code for handling abbreviations + + + +ADDED: Add support for Swift Package Maker; CHANGED: Restructure source directory + + + +ADDED: Added LaTeX support for escaped characters, fenced code blocks, images, links + + + +ADDED: Basic ODF Support + + + +ADDED: Better document strong/emph algorithm + + + +ADDED: Continue ODF progress + + + +ADDED: Continue to work on ODF export + + + +ADDED: Continue work on ODF + + + +ADDED: Finish ODF support for lists + + + +ADDED: Improve performance when exporting + + + +ADDED: Improve token_pool memory handling + + + +ADDED: Prototype support for Glossaries + + + +ADDED: Support ‘latexconfig’ metadata + + + +CHANGED: Use multiple cases in glossary tests + + + +FIXED: Don't force glossary terms into lowercase + + + +FIXED: Fix Makefile for new source file location + + + +FIXED: Fix algorithm for creating TOC to properly handle ‘incorrect’ levels + + + +FIXED: Fix linebreaks in LaTeX; ADDED: Add Linebreaks test file + + + +FIXED: Fix new_source script for new directory structure + + + +FIXED: Fix non–breaking space in ODF + + + +FIXED: Fix padding at end of document body in ODF + + + +FIXED: Fix underscores in raw latex + + + +FIXED: Potential bug + + +NOTE: Add shared library build option + + + + +2017–02–17 – v 0.3.1.a: + + + + +ADDED: ‘finalize’ beamer support + + + +ADDED: Add escaped newline as linebreak; start on beamer/memoir support + + + +ADDED: CriticMarkup test for LaTeX + + + +ADDED: Custom LaTeX output for CriticMarkup comments + + + +ADDED: Support mmd export format + + + +ADDED: Work on cpack installer – change project name for compatibility + + + +CHANGED: Adjust latex metadata configuration for consistency + + + +CHANGED: Configure cmake to use C99 + + + +FIXED: Add custom implementation for cross–platform support + + + +FIXED: Fix German HTML tests + + + +FIXED: Fix cpack destination directory issue + + + +FIXED: Fix memory leaks etc + + + +FIXED: Fix warning in custom vasprintf + + + +FIXED: Modify CMakeLists.txt to test 
for use of clang compiler + + + +FIXED: Work on memory leaks + + +NOTE: Adjust license width to improve display on smaller terminal windows + + + + +2017–02–14 – v 0.3.0a: + + + + +ADDED: Add basic image support to LaTeX + + + +ADDED: Add file transclusion + + + +ADDED: Add support for citation ‘locators’ + + + +ADDED: Add support for manual labels on ATX Headers + + + +ADDED: Add support for manual labels on Setext Headers + + + +ADDED: Add support for tables in LaTeX + + + +ADDED: HTML Comments appear as raw LaTeX + + + +ADDED: Improved citation support in LaTeX + + + +ADDED: Support \autoref{} in LaTeX + + + +ADDED: Support combined options in LaTeX citations that use the ‘][’ syntax + + + +ADDED: Support language specifier in fenced code blocks + + + +ADDED: Support metadata in LaTeX + + + +ADDED: Update Citations test suite + + + +FIXED: Escaped LaTeX characters + + + +FIXED: Fix bug in URL parsing + + + +FIXED: Fix bug in citation links + + + +FIXED: Fix bug when no closing divider or newline at end of last table cell + + + +FIXED: Fix issue printing ‘–’ + + + +FIXED: Fix scan_url test suite + + + +FIXED: Get Math working in LaTeX + + + +FIXED: Improve reliability or link scanner + + + +FIXED: Properly add id attribute to new instances of citation only + + + +FIXED: Properly handle manual labels with TOC + + + +FIXED: Properly print hash characters in LaTeX + + + +FIXED: Separate LaTeX verbatim and texttt character handling + + + +FIXED: Update Escapes test LaTeX result + + +FIXED: Work on escaping LaTeX characters + + + + +2017–02–08 – v 0.1.4a: + + + +ADDED: Add smart quote support for other languages (resolves #15) + + + + +2017–02–08 – v 0.1.3a: + + + + +ADDED: Add support for reference image id attributes + + + +ADDED: Add support for table captions + + + +ADDED: Metadata support for base header level + + + +ADDED: Support distinction between 3 and 5 backticks in fenced code blocks + + + +ADDED: Support Setext headers + + +FIXED: Fix issue with metadata disrupting smart quotes + + + + +2017–02–07 – v 0.1.2a: + + + +“pathologic” test suite – fix handling of nested brackets, e.g. +[[[[foo]]]] to avoid bogging down checking for reference links that +don’t exist. + + +Table support – a single blank line separates sections of tables, so +at least two blank lines are needed between adjacent tables. + + + +Definition list support + + + +“fuzz testing” – stress test the parser for unexpected failures + + + +Table of Contents support + + +Improved compatibility mode parsing + + + + +2017–01–28 – v 0.1.1a includes a few updates: + + + + +Metadata support + + + +Metadata variables support + + + +Extended ASCII range character checking + + + +Rudimentary language translations, including German + + + +Improved performance + + +Additional testing: + + + + +CriticMarkup + + + +HTML Blokcs + + + +Metadata/Variables + + +“pathologic” test cases from CommonMark + + + + + + + + + diff --git a/DevelopmentNotes/DevelopmentNotes.html b/DevelopmentNotes/DevelopmentNotes.html new file mode 100644 index 00000000..b7edd230 --- /dev/null +++ b/DevelopmentNotes/DevelopmentNotes.html @@ -0,0 +1,626 @@ + + + + + MultiMarkdown v6 Development Notes + + + + + +

Introduction

+ +

This document includes some notes on the development of MultiMarkdown (MMD) v6. Most of it +will be interesting only to other developers or those needing to choose the +absolute “best” Markdown (MD) implementation for their needs – it is not required +reading to understand how the software works.

+ +

Why a New Version?

+ +

MultiMarkdown version 5 was released in November of 2015, but the codebase was +essentially the same as that of v4 – and that was released in beta in April +of 2013. A few key things prompted work on a new version:

+ 
+ Accuracy – MMD v4 and v5 were the most accurate versions yet, and a lot of effort went into finding and resolving various edge cases. However, it began to feel like a game of whack-a-mole where new bugs would creep in every time I fixed an old one. The PEG began to feel rather convoluted in spots, even though it did allow for a precise (if not always accurate) specification of the grammar.
+ 
+ Performance – “Back in the day” peg-markdown was one of the fastest Markdown parsers around. MMD v3 was based on peg-markdown, and would leap-frog with it in terms of performance. Then CommonMark was released, which was a bit faster. Then a couple of years went by and CommonMark became much faster – in one of my test suites, MMD v 5.4.0 takes about 25 times longer to process a long document than CommonMark 0.27.0.
+ 

In the spring of 2016, I decided I wanted to rewrite MultiMarkdown from scratch, +building the parser myself rather than relying on a pre-rolled solution. (I +had been using greg to compile the PEG +into parser code. It worked well overall, but lacked some features I needed, +requiring a lot of workarounds.)

+ +

First Attempt

+ +

My first attempt started by hand-crafting a parser that scanned through the +document a line at a time, deciding what to do with each line as it found +them. I used regex parsers made with re2c to +help classify each line, and then a separate parser layer to process groups of +lines into blocks. Initially this approach worked well, and was really +efficient. But I quickly began to code my way into a dead-end – the strategy +was not elegant enough to handle things like nested lists, etc.

+ +

One thing that did turn out well from the first attempt, however, was an +approach for handling <emph> and <strong> parsing. I’ve learned over the +years that this can be one of the hardest parts of coding accurately for +Markdown. There are many examples that are obvious to a person, but difficult +to properly “explain” how to parse to a computer.

+ +

No solution is perfect, but I developed an approach that seems to accurately +handle a wide range of situations without a great deal of complexity:

+ +
    +
  1. Scan the document for asterisks (*). Each one will be handled one at a time.

  2. +
  3. Unlike brackets ([ and ]), an asterisk is “ambidextrous”, in that it +may be able to open a matched pair of asterisks, close a pair, or both. For +example, in foo *bar* foo:

    + +
      +
    1. The first asterisk can open a pair, but not close one.

    2. +
    3. The second asterisk can close a pair, but not open one.

    4. +
  4. +
  5. So, once the asterisks have been identified, each has to be examined to +determine whether it can open/close/both. The algorithm is not that complex, +but I’ll describe it in general terms. Check the code for more specifics. +This approach seems to work, but might still need some slight tweaking. In +the future, I’ll codify this better in language rather than just in code.

    + +
      +
    1. If there is whitespace to the left of an asterisk, it can’t close.

    2. +
    3. If there is whitespace or punctuation to the right it can’t open.

    4. +
    5. “Runs” of asterisks, e.g. **bar are treated as a unit in terms of +looking left/right.

    6. +
    7. Asterisks inside a word are a bit trickier – we look at the number of +asterisks before the word, the number in the current run, and the number +of asterisks after the word to determine which combinations, if any, are +permitted.

    8. +
  6. +
  7. Once all asterisks have been tagged as able to open/close/both, we proceed +through them in order:

    + +
      +
    1. When we encounter a tag that can close, we look to see if there is a +previous opener that has not been paired off. If so, pair the two and +remove the opener from the list of available asterisks.

    2. +
    3. When we encounter an opener, add it to the stack of available openers.

    4. +
    5. When we encounter an asterisk that can do both, see if it can close an existing opener. If not, then add it to the stack.

    6. +
  8. +
  9. After all tokens in the block have been paired, then we look for nesting +pairs of asterisks in order to create <emph> and <strong> sets. For +example, assume we have six asterisks wrapped around a word, three in front, +and three after. The asterisks are indicated with numbers: 123foo456. We +proceed in the following manner:

    + +
      +
    1. Based on the pairing algorithm above, these asterisks would be paired as +follows, with matching asterisks sharing numbers – 123foo321.

    2. +
    3. Moving forwards, we come to asterisk “1”. It is followed by an +asterisk, so we check to see if they should be grouped as a <strong>. +Since the “1” asterisks are wrapped immediately outside the “2” asterisks, +they are joined together. More than two pairs can’t be joined, so we now +get the following – 112foo211, where the “11” represents the opening +and closing of a <strong>, and the “2” represents a <emph>.

    4. +
  10. +
  11. When matching a pair, any unclosed openers that are on the stack are +removed, preventing pairs from “crossing” or “intersecting”. Pairs can wrap +around each other, e.g. [(foo)], but not intersect like [(foo]). In the +second case, the brackets would close, removing the ( from the stack.

  12. +
  13. This same approach is used in all tokens that are matched in pairs– +[foo], (foo), _foo_, etc. There’s slightly more to it, but once you +figure out how to assign opening/closing ability, the rest is easy. By using +a stack to track available openers, it can be performed efficiently.

  14. +
+ +

In my testing, this approach has worked quite well. It handles all the basic scenarios I’ve thrown at it, and all of the “basic” and “devious” edge cases I have thought of (some of these don’t necessarily have a “right” answer – but v6 gives consistent answers that seem as reasonable as any others to me). There are also three more edge cases I’ve come up with that can still stump it, and ironically they are handled correctly by most implementations. They just don’t follow the rules above. I’ll continue to work on this.
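
As a rough illustration of the pairing pass described above, here is a minimal C sketch. The token struct and field names are hypothetical (MMD's real structures differ), and it handles only a single token type, so the rule about discarding intervening openers for mixed pairs is omitted.

```c
#include <stdlib.h>

/* Hypothetical token record -- MMD's real structures differ. */
typedef struct token {
    int can_open;              /* may start a pair                 */
    int can_close;             /* may end a pair                   */
    struct token * mate;       /* set once a pair has been matched */
    struct token * next;
} token;

/* Pair off the tokens of one block using a stack of available openers. */
void pair_tokens(token * first) {
    size_t top = 0, cap = 64;
    token ** stack = malloc(cap * sizeof(token *));
    if (!stack) return;

    for (token * t = first; t; t = t->next) {
        /* A token that can close tries the most recent unpaired opener. */
        if (t->can_close && top > 0) {
            token * opener = stack[--top];
            opener->mate = t;
            t->mate = opener;
            continue;          /* matched, so it is not pushed as an opener */
        }

        /* Otherwise, a token that can open waits for a later closer. */
        if (t->can_open) {
            if (top == cap) {
                cap *= 2;
                stack = realloc(stack, cap * sizeof(token *));
                if (!stack) return;
            }
            stack[top++] = t;
        }
    }

    free(stack);               /* any leftover openers stay unpaired */
}
```

Note how an “ambidextrous” token tries to close first and is only pushed as an opener if nothing matched, which mirrors the rule above for tokens that can do both.
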

+ +

In the end, I scrapped this effort, but kept the lessons learned in the token +pairing algorithm.

+ +

Second Attempt

+ +

I tried again this past Fall. This time, I approached the problem with lots +of reading. Lots and lots of reading – tons of websites, computer science +journal articles, PhD theses, etc. Learned a lot about lexers, and a lot +about parsers, including hand-crafting vs using parser generators. In brief:

+ +
    +
  1. I learned about the Aho–Corasick algorithm, which is a great way to +efficiently search a string for multiple target strings at once. I used this +to create a custom lexer to identify tokens in a MultiMarkdown text document +(e.g. *, [, {++, etc.). I learned a lot, and had a good time working +out the implementation. This code efficiently allowed me to break a string of +text into the tokens that mattered for Markdown parsing.

  2. +
  3. However, in a few instances I really needed some features of regular +expressions to simplify more complex structures. After a quick bit of testing, +using re2c to create a tokenizer was just as efficient, and allowed me to +incorporate some regex functionality that simplified later parsing. I’ll keep +the Aho-Corasick stuff around, and will probably experiment more with it +later. But I didn’t need it for MMD now. lexer.re contains the source for +the tokenizer.

  4. +
+ +

I looked long and hard for a way to simplify the parsing algorithm to try and +“touch” each token only once. Ideally, the program could step through each +token, and decide when to create a new block, when to pair things together, +etc. But I’m not convinced it’s possible. Since Markdown’s grammar varies +based on context, it seems to work best when handled in distinct phases:

+ +
    +
  1. Tokenize the string to identify key sections of text. This includes line +breaks, allowing the text to be examined one line at time.

  2. +
  3. Join series of lines together into blocks, such as paragraphs, code blocks, +lists, etc.

  4. +
  5. The tokens inside each block can then be paired together to create more +complex syntax such as links, strong, emphasis, etc.

  6. +
+ +

To handle the block parsing, I started off using the Aho-Corasick code from my first attempt. I had actually implemented some basic regex functionality, and used that to group lines together to create blocks. But this quickly fell apart in the face of more complex structures such as recursive lists. After a lot of searching, and tons more reading, I ultimately decided to use a parser generator to handle the task of grouping lines into blocks. parser.y has the source for this, and it is processed by the lemon parser generator to create the actual code.

+ +

I chose to do this because hand-crafting the block parser would be complex. +The end result would likely be difficult to read and understand, which would +make it difficult to update later on. Using the parser generator allows me to +write things out in a way that can more easily be understood by a person. In +all likelihood, the performance is probably as good as anything I could do +anyway, if not better.

+ +

Because lemon is a LALR(1) parser, it does require a bit of thinking ahead +about how to create the grammar used. But so far, it has been able to handle +everything I have thrown at it.

+ +

Optimization

+ +

One of my goals for MMD 6 was performance. So I’ve paid attention to speed +along the way, and have tried to use a few tricks to keep things fast. Here +are some things I’ve learned along the way. In no particular order:

+ +

Memory Allocation

+ +

When parsing a long document, a lot of token structures are created. Each +one requires a small bit of memory to be allocated. In aggregate, that time +added up and slowed down performance.

+ +

After reading for a bit, I ended up coming up with an approach that uses larger chunks of memory. I allocate pools of memory in large slabs for smaller “objects”. For example, I allocate memory for 1024 tokens at a single time, and then dole that memory out as needed. When the slab is used up, a new one is allocated. This dramatically improved performance.
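
A minimal sketch of that slab idea follows, assuming a pool that only grows and is freed all at once. The names (slab, token_new, token_pool_drain) and the token fields are illustrative, not MMD's actual API.

```c
#include <stdlib.h>

#define SLAB_SIZE 1024            /* tokens handed out per slab */

typedef struct token {
    int type;
    size_t start;
    size_t len;
} token;

typedef struct slab {
    token items[SLAB_SIZE];
    size_t used;                  /* how many items are handed out */
    struct slab * prev;           /* chain of earlier (full) slabs */
} slab;

static slab * current = NULL;

/* Hand out one token, allocating a fresh slab only when needed. */
token * token_new(void) {
    if (!current || current->used == SLAB_SIZE) {
        slab * s = malloc(sizeof(slab));
        if (!s) return NULL;
        s->used = 0;
        s->prev = current;
        current = s;
    }
    return &current->items[current->used++];
}

/* Release every slab -- and therefore every token -- in one pass. */
void token_pool_drain(void) {
    while (current) {
        slab * prev = current->prev;
        free(current);
        current = prev;
    }
}
```

The point is that one malloc now serves 1024 tokens, and teardown is a short walk over a handful of slabs rather than thousands of individual frees.
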

+ +

When pairing tokens, I created a new stack for each block. I realized that an +empty stack didn’t have any “leftover” cruft to interfere with re-use, so I +just used one for the entire document. Again a sizeable improvement in +performance from only allocating one object instead of many. When recursing +to a deeper level, the stack just gets deeper, but earlier levels aren’t +modified.

+ +

Speaking of tokens, I realized that the average document contains a lot of single spaces (there’s one between every two words I have written, for example). The vast majority of the time, these single spaces have no effect on the output of Markdown documents. I changed my whitespace token search to only flag runs of 2 or more spaces, dramatically reducing the number of tokens. This gives the benefit of needing fewer memory allocations, and also reduces the number of tokens that need to be processed later on. The only downside is remembering to check for a single space character in a few instances where it matters.

+ +

Proper input buffering

+ +

When I first began last spring, I was amazed to see how much time was being +spent by MultiMarkdown simply reading the input file. Then I discovered it +was because I was reading it one character at a time. I switched to using a +buffered read approach and the time to read the file went to almost nothing. I +experimented with different buffer sizes, but they did not seem to make a +measurable difference.
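
For illustration, a buffered whole-file read along these lines might look like the following. The buffer size and the function name are arbitrary choices for this sketch, not MMD's actual code.

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Read an entire file into one string using a fixed-size read buffer,
   instead of pulling it in one character at a time. */
char * slurp(const char * path) {
    FILE * f = fopen(path, "rb");
    if (!f) return NULL;

    char buffer[4096];              /* read in 4 KB chunks */
    size_t len = 0, cap = 8192;
    char * out = malloc(cap);
    size_t n;

    if (!out) { fclose(f); return NULL; }

    while ((n = fread(buffer, 1, sizeof(buffer), f)) > 0) {
        if (len + n + 1 > cap) {    /* grow the result as needed */
            cap = (len + n + 1) * 2;
            out = realloc(out, cap);
            if (!out) { fclose(f); return NULL; }
        }
        memcpy(out + len, buffer, n);
        len += n;
    }

    out[len] = '\0';
    fclose(f);
    return out;
}
```
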

+ +

Output Buffering

+ +

I experimented with different approaches to creating the output after parsing. I tried printing directly to stdout, and even played with different buffering settings. None of those seemed to work well, and all were slower than using the d_string approach (formerly called GString in MMD 5).

+ +

Fast Searches

+ +

After getting basic Markdown functionality complete, I discovered during +testing that the time required to parse a document grew exponentially as the +document grew longer. Performance was on par with CommonMark for shorter +documents, but fell increasingly behind in larger tests. Time profiling found +that the culprit was searching for link definitions when they didn’t exist. +My first approach was to keep a stack of used link definitions, and to iterate +through them when necessary. In long documents, this performs very poorly. +More research and I ended up using +uthash. This allows me to search for +a link (or footnote, etc.) by “name” rather than searching through an array. +This allowed me to get MMD’s performance back to O(n), taking roughly twice as +much time to process a document that is twice as long.
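
A sketch of how a uthash-keyed lookup like this typically looks is below. The struct and function names are placeholders rather than MMD's actual ones, but HASH_ADD_KEYPTR and HASH_FIND_STR are standard uthash macros.

```c
#include <stdlib.h>
#include <string.h>
#include "uthash.h"

/* Hypothetical link-definition record keyed by its label. */
typedef struct link_def {
    char * label;            /* hash key                   */
    char * url;
    UT_hash_handle hh;       /* makes this struct hashable */
} link_def;

static link_def * definitions = NULL;

/* Store a definition so later lookups are O(1) on average. */
void store_link(char * label, char * url) {
    link_def * d = malloc(sizeof(link_def));
    if (!d) return;
    d->label = label;
    d->url = url;
    HASH_ADD_KEYPTR(hh, definitions, d->label, strlen(d->label), d);
}

/* Look a definition up by name instead of scanning an array. */
link_def * find_link(const char * label) {
    link_def * d = NULL;
    HASH_FIND_STR(definitions, label, d);
    return d;
}
```
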

+ +

Efficient Utility Functions

+ +

It is frequently necessary when parsing Markdown to check what sort of +character we are dealing with at a certain position – a letter, whitespace, +punctuation, etc. I created a lookup table for this via char_lookup.c and +hard-coded it in char.c. These routines allow me to quickly, and +consistently, classify any byte within a document. This saved a lot of +programming time, and saved time tracking down bugs from handling things +slightly differently under different circumstances. I also suspect it +improved performance, but don’t have the data to back it up.
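
The general shape of such a table is sketched below. The category flags and macro names are invented for illustration, and MMD generates its table ahead of time and hard-codes it rather than building it at startup as this sketch does.

```c
#include <stdint.h>

/* Illustrative category flags -- not MMD's actual definitions. */
enum {
    CHAR_ALPHA       = 1 << 0,
    CHAR_DIGIT       = 1 << 1,
    CHAR_WHITESPACE  = 1 << 2,
    CHAR_PUNCTUATION = 1 << 3
};

static uint8_t char_class[256];

/* Build the 256-entry table once; every later lookup is a single index. */
void char_class_init(void) {
    for (int c = 'a'; c <= 'z'; ++c) char_class[c] |= CHAR_ALPHA;
    for (int c = 'A'; c <= 'Z'; ++c) char_class[c] |= CHAR_ALPHA;
    for (int c = '0'; c <= '9'; ++c) char_class[c] |= CHAR_DIGIT;

    char_class[' ']  |= CHAR_WHITESPACE;
    char_class['\t'] |= CHAR_WHITESPACE;

    const char * punct = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
    for (const char * p = punct; *p; ++p)
        char_class[(uint8_t) *p] |= CHAR_PUNCTUATION;
}

#define char_is_whitespace(c)   (char_class[(uint8_t)(c)] & CHAR_WHITESPACE)
#define char_is_punctuation(c)  (char_class[(uint8_t)(c)] & CHAR_PUNCTUATION)
```
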

+ +

Testing While Writing

+ +

I developed several chunks of code in parallel while creating MMD 6. The vast majority of that supporting code was developed with a test-driven approach, relying on extensive unit testing along the way.

+ +

MMD isn’t particularly amenable to this approach at the small level, but +instead I relied more on integration testing with an ever-growing collection +of text files and the corresponding HTML files in the MMD 6 test suite. This +allowed me to ensure new features work properly and that old features aren’t +broken. At this time, there are 29 text files in the test suite, and many +more to come.

+ +

Other Lessons

+ +

Some things that didn’t do me any good….

+ +

I considered the differences between using malloc and calloc when initializing tokens. The time saved by using malloc was basically offset by the time then required to initialize the token to default null values, as compared to using calloc. When calloc failed to help me out (I had thought that clearing a whole slab in the object pool at once would be faster), I stuck with malloc, as it makes more sense to me in my workflow.

+ +

I read a bit about struct padding and reordered some of my structs. It wasn’t until later that I discovered the -Wpadded option, and it’s not clear whether my changes actually changed anything. Since the structs were being padded automatically, there was no noticeable performance change, and I didn’t have the tools to measure whether I could have improved memory usage at all. Not sure this would be worth the effort – much lower hanging fruit is available.
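
To make the padding point concrete, here is a small standalone example (not from MMD) showing how member order changes a struct's size on a typical 64-bit ABI; -Wpadded asks gcc or clang to warn about each hole the compiler inserts.

```c
#include <stdio.h>

/* On a typical 64-bit ABI, 'bad' is padded to 24 bytes while 'good',
   with the same members reordered, fits in 16 bytes. */
struct bad {
    char   flag;      /* 1 byte  + 7 bytes padding */
    void * ptr;       /* 8 bytes                   */
    short  kind;      /* 2 bytes + 6 bytes padding */
};

struct good {
    void * ptr;       /* 8 bytes                   */
    short  kind;      /* 2 bytes                   */
    char   flag;      /* 1 byte  + 5 bytes padding */
};

int main(void) {
    printf("bad: %zu bytes, good: %zu bytes\n",
           sizeof(struct bad), sizeof(struct good));
    return 0;
}
```
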

+ +

Performance

+ +

Basic tests show that currently MMD 6 takes about 20–25% longer than CommonMark 0.27.0 to process long files (e.g. 0.2 MB). However, it is around 5% faster than CommonMark when parsing a shorter file (27 kB), measured by parsing the same file 200 times over. This test is performed using the Markdown [syntax page], modified to avoid the use of the Setext header at the top. The longer files tested are created by copying the same syntax page onto itself, thereby doubling the length of the file with each iteration.

+ +

The largest file I test is approximately 108 MB (4096 copies of the syntax +page). On my machine (2012 Mac mini with 2.3 GHz Intel Core i7, 16 GB RAM), +it takes approximately 4.4 seconds to parse with MMD 6 and 3.7 seconds with +CommonMark. MMD 6 processes approximately 25 MB/s on this test file. +CommonMark 0.27.0 gets about 29 MB/s on the same machine.

+ +

There are some slight variations with the smaller test files (8–32 copies), +but overall the performance of both programs (MMD 6 and CommonMark) are +roughly linear as the test file gets bigger (double the file size and it takes +twice as long to parse, aka O(n)).

+ +

Out of curiosity, I ran the same tests on the original Markdown.pl by Gruber (v 1.0.2b8). It took approximately 178 seconds to parse 128 copies of the file (3.4 MB) and was demonstrating quadratic performance characteristics (double the file size and it takes 2², or 4, times longer to process, aka O(n²)). I didn’t bother running it on larger versions of the test file. For comparison, MMD 6 can process 128 copies in approximately 140 msec.

+ +

Of note, the throughput speed drops when testing more complicated files +containing more advanced MultiMarkdown features, though it still seems to +maintain linear performance characteristics. A second test file is created by +concatenating all of the test suite files (including the Markdown syntax +file). In this case, MMD gets about 13 MB/s. CommonMark doesn’t support +these additional features, so testing it with that file is not relevant. I +will work to see whether there are certain features in particular that are +more challenging and see whether they can be reworked to improve performance.

+ +

As above, I have done some high level optimization of the parse strategy, but +I’m sure there’s still a lot of room for further improvement to be made. +Suggestions welcome!

+ +

Testing

+ +

Test Suite

+ +

The development of MMD v6 was heavily, but not absolutely, influenced by the +philosophy of test-driven development. While coding, I made use of test +suites to verify successful implementation of new features, to avoid +regression problems when adding new features, and to identify known edge cases +in need of proper handling.

+ +

The test suite (located in tests/MMD6Tests) is a “living” collection of documents that will continue to be updated as new bugs and edge cases are identified. This allows for proper integration testing of the entire application with every release.

+ +

Fuzz Testing

+ +

I was not familiar with the concept of Fuzz Testing (https://en.wikipedia.org/wiki/Fuzzing) until a user mentioned something about it to me a year or two ago. I had never used it before, but it seemed like a good idea. I implemented it in two ways.

+ +

The first is that I created a simplified version of the line parser that +simply accepts various combinations of line type identifiers to see if they +would successfully parse. The line parser is responsible for taking a series +of line types (e.g. plain text, indented line, etc.) and determining what sort +of block they should become. The file test/parser_text.y is run through the +lemon program, compiled (with or without the -DNDEBUG flag) and then run. +It sequentially throws every combination of line types at the simplified line +parser to make sure that it doesn’t choke. When I first did this, I found +several combinations of lines that did not pass.
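
A sketch of what such an exhaustive driver could look like is below. The line-type count, the sequence length, and the parse_line_sequence() stub are stand-ins for the lemon-generated code, not the actual contents of test/parser_text.y.

```c
#include <stdio.h>

#define NUM_LINE_TYPES 16   /* assumed count of line types  */
#define MAX_LINES      4    /* longest sequence to exercise */

/* Stub standing in for the lemon-generated simplified line parser. */
static int parse_line_sequence(const int * types, int count) {
    (void)types; (void)count;
    return 0;
}

/* Feed every possible sequence of line types, up to MAX_LINES long,
   to the parser and make sure none of them crashes it. */
int main(void) {
    int types[MAX_LINES];

    for (int count = 1; count <= MAX_LINES; ++count) {
        long combos = 1;
        for (int i = 0; i < count; ++i) combos *= NUM_LINE_TYPES;

        for (long c = 0; c < combos; ++c) {
            long v = c;
            for (int i = 0; i < count; ++i) {
                types[i] = (int)(v % NUM_LINE_TYPES);  /* decode one combination */
                v /= NUM_LINE_TYPES;
            }
            parse_line_sequence(types, count);
        }
    }

    printf("all combinations parsed without crashing\n");
    return 0;
}
```
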

+ +

NOTE: This does not verify accurate parsing, simply that the parser does not crash when given an unexpected combination of lines.

+ +

The second form of fuzz testing is one I have started using more recently. It uses the American fuzzy lop program to try to find text input that crashes MMD. This works by taking sample input (e.g. files from the test suite), modifying it slightly, and trying the modified versions. Do this over and over and over, and interesting edge cases are sometimes identified; I have found several this way. Definitely a very useful tool!

+ +

Unit Testing

+ +

Some of the original development was done with unit testing in some other +tools I developed. This code formed the basis of a few parts of MMD. +Otherwise, it was hard to see how to really create very good unit tests for +the development of MMD. So there is really not much unit testing built into +the code or used during the development.

+ +

Changelog

+ + + +
+
+
    + +
  1. +PEG:

    Parsing Expression Grammar https://en.wikipedia.org/wiki/Parsing_expression_grammar  ↩

    +
  2. + +
+
+ + + + diff --git a/DevelopmentNotes/DevelopmentNotes.pdf b/DevelopmentNotes/DevelopmentNotes.pdf new file mode 100644 index 00000000..6f092a30 Binary files /dev/null and b/DevelopmentNotes/DevelopmentNotes.pdf differ diff --git a/DevelopmentNotes/DevelopmentNotes.txt b/DevelopmentNotes/DevelopmentNotes.txt new file mode 100644 index 00000000..da5c2b23 --- /dev/null +++ b/DevelopmentNotes/DevelopmentNotes.txt @@ -0,0 +1,637 @@ +Title: MultiMarkdown v6 Development Notes +Author: Fletcher T. Penney +Date: 2017-03-15 +LaTeX Config: tufte-handout +Base Header Level: 3 + + +# Introduction # + +This document includes some notes on the development of MMD v6. Most of it +will be interesting only to other developers or those needing to choose the +absolute "best" MD implementation for their needs -- it is not required +reading to understand how the software works. + + +## Why a New Version? ## + +MultiMarkdown version 5 was released in November of 2015, but the codebase was +essentially the same as that of v4 -- and that was released in beta in April +of 2013. A few key things prompted work on a new version: + +* Accuracy -- MMD v4 and v5 were the most accurate versions yet, and a lot of +effort went into finding and resolving various edge cases. However, it began +to feel like a game of whack-a-mole where new bugs would creep in every time I +fixed an old one. The PEG began to feel rather convoluted in spots, even +though it did allow for a precise (if not always accurate) specification of +the grammar. + +* Performance -- "Back in the day" [peg-markdown] was one of the fastest +Markdown parsers around. MMD v3 was based on peg-markdown, and would leap- +frog with it in terms of performance. Then [CommonMark] was released, which +was a bit faster. Then a couple of years went by and CommonMark became *much* +faster -- in one of my test suites, MMD v 5.4.0 takes about 25 times longer to +process a long document than CommonMark 0.27.0. + +[peg-markdown]: https://github.com/jgm/peg-markdown +[CommonMark]: http://commonmark.org/ + +In the spring of 2016, I decided I wanted to rewrite MultiMarkdown from scratch, +building the parser myself rather than relying on a pre-rolled solution. (I +had been using [greg](https://github.com/ooc-lang/greg) to compile the PEG +into parser code. It worked well overall, but lacked some features I needed, +requiring a lot of workarounds.) + + +# First Attempt # + +My first attempt started by hand-crafting a parser that scanned through the +document a line at a time, deciding what to do with each line as it found +them. I used regex parsers made with [re2c](http://re2c.org/index.html) to +help classify each line, and then a separate parser layer to process groups of +lines into blocks. Initially this approach worked well, and was really +efficient. But I quickly began to code my way into a dead-end -- the strategy +was not elegant enough to handle things like nested lists, etc. + +One thing that did turn out well from the first attempt, however, was an +approach for handling `` and `` parsing. I've learned over the +years that this can be one of the hardest parts of coding accurately for +Markdown. There are many examples that are obvious to a person, but difficult +to properly "explain" how to parse to a computer. + +No solution is perfect, but I developed an approach that seems to accurately +handle a wide range of situations without a great deal of complexity: + +1. Scan the documents for asterisks (`*`). Each one will be handled one at a +time. + +2. 
Unlike brackets (`[` and `]`), an asterisk is "ambidextrous", in that it +may be able to open a matched pair of asterisks, close a pair, or both. For +example, in `foo *bar* foo`: + + 1. The first asterisk can open a pair, but not close one. + + 2. The second asterisk can close a pair, but not open one. + +3. So, once the asterisks have been identified, each has to be examined to +determine whether it can open/close/both. The algorithm is not that complex, +but I'll describe it in general terms. Check the code for more specifics. +This approach seems to work, but might still need some slight tweaking. In +the future, I'll codify this better in language rather than just in code. + + 1. If there is whitespace to the left of an asterisk, it can't close. + + 2. If there is whitespace or punctuation to the right it can't open. + + 3. "Runs" of asterisks, e.g. `**bar` are treated as a unit in terms of + looking left/right. + + 4. Asterisks inside a word are a bit trickier -- we look at the number of + asterisks before the word, the number in the current run, and the number + of asterisks after the word to determine which combinations, if any, are + permitted. + +4. Once all asterisks have been tagged as able to open/close/both, we proceed +through them in order: + + 1. When we encounter a tag that can close, we look to see if there is a + previous opener that has not been paired off. If so, pair the two and + remove the opener from the list of available asterisks. + + 2. When we encounter an opener, add it to the stack of available openers. + + 3. When encounter an asterisk that can do both, see if it can close an + existing opener. If not, then add it to the stack. + +5. After all tokens in the block have been paired, then we look for nesting +pairs of asterisks in order to create `` and `` sets. For +example, assume we have six asterisks wrapped around a word, three in front, +and three after. The asterisks are indicated with numbers: `123foo456`. We +proceed in the following manner: + + 1. Based on the pairing algorithm above, these asterisks would be paired as + follows, with matching asterisks sharing numbers -- `123foo321`. + + 2. Moving forwards, we come to asterisk "1". It is followed by an + asterisk, so we check to see if they should be grouped as a ``. + Since the "1" asterisks are wrapped immediately outside the "2" asterisks, + they are joined together. More than two pairs can't be joined, so we now + get the following -- `112foo211`, where the "11" represents the opening + and closing of a ``, and the "2" represents a ``. + +6. When matching a pair, any unclosed openers that are on the stack are +removed, preventing pairs from "crossing" or "intersecting". Pairs can wrap +around each other, e.g. `[(foo)]`, but not intersect like `[(foo])`. In the +second case, the brackets would close, removing the `(` from the stack. + +7. This same approach is used in all tokens that are matched in pairs-- +`[foo]`, `(foo)`, `_foo_`, etc. There's slightly more to it, but once you +figure out how to assign opening/closing ability, the rest is easy. By using +a stack to track available openers, it can be performed efficiently. + +In my testing, this approach has worked quite well. It handles all the basic +scenarios I've thrown at it, and all of the "basic" and "devious" edge cases I +have thought of (some of these don't necessarily have a "right" answer -- but +v6 gives consistency answers that seem as reasonable as any others to me). 
+There are also three more edge cases I've come up with that can still stump it,
+and ironically they are handled correctly by most implementations. They just
+don't follow the rules above. I'll continue to work on this.
+
+In the end, I scrapped this effort, but kept the lessons learned in the token
+pairing algorithm.
+
+
+# Second Attempt #
+
+I tried again this past Fall. This time, I approached the problem with lots
+of reading. *Lots and lots* of reading -- tons of websites, computer science
+journal articles, PhD theses, etc. Learned a lot about lexers, and a lot
+about parsers, including hand-crafting vs using parser generators. In brief:
+
+1. I learned about the [Aho–Corasick algorithm], which is a great way to
+efficiently search a string for multiple target strings at once. I used this
+to create a custom lexer to identify tokens in a MultiMarkdown text document
+(e.g. `*`, `[ `, `{++`, etc.). I learned a lot, and had a good time working
+out the implementation. This code efficiently allowed me to break a string of
+text into the tokens that mattered for Markdown parsing.
+
+2. However, in a few instances I really needed some features of regular
+expressions to simplify more complex structures. After a quick bit of testing,
+using re2c to create a tokenizer was just as efficient, and allowed me to
+incorporate some regex functionality that simplified later parsing. I'll keep
+the Aho-Corasick stuff around, and will probably experiment more with it
+later. But I didn't need it for MMD now. `lexer.re` contains the source for
+the tokenizer.
+
+[Aho–Corasick algorithm]: https://en.wikipedia.org/wiki/Aho-Corasick_algorithm
+
+I looked long and hard for a way to simplify the parsing algorithm to try and
+"touch" each token only once. Ideally, the program could step through each
+token, and decide when to create a new block, when to pair things together,
+etc. But I'm not convinced it's possible. Since Markdown's grammar varies
+based on context, it seems to work best when handled in distinct phases:
+
+1. Tokenize the string to identify key sections of text. This includes line
+breaks, allowing the text to be examined one line at a time.
+
+2. Join series of lines together into blocks, such as paragraphs, code blocks,
+lists, etc.
+
+3. The tokens inside each block can then be paired together to create more
+complex syntax such as links, strong, emphasis, etc.
+
+To handle the block parsing, I started off using the Aho-Corasick code from
+my first attempt. I had actually implemented some basic regex functionality,
+and used that to group lines together to create blocks. But this quickly
+fell apart in the face of more complex structures such as recursive lists.
+After a lot of searching, and *tons* more reading, I ultimately decided to
+use a parser generator to handle the task of grouping lines into blocks.
+`parser.y` has the source for this, and it is processed by the
+[lemon](http://www.hwaci.com/sw/lemon/) parser generator to create the actual
+code.
+
+I chose to do this because hand-crafting the block parser would be complex.
+The end result would likely be difficult to read and understand, which would
+make it difficult to update later on. Using the parser generator allows me to
+write things out in a way that can more easily be understood by a person. In
+all likelihood, the performance is probably as good as anything I could do
+anyway, if not better.
+
+Because lemon is a LALR(1) parser, it does require a bit of thinking ahead
+about how to create the grammar used.
+But so far, it has been able to handle everything I have thrown at it.
+
+
+# Optimization #
+
+One of my goals for MMD 6 was performance. So I've paid attention to speed
+along the way, and have tried to use a few tricks to keep things fast. Here
+are some things I've learned, in no particular order:
+
+
+## Memory Allocation ##
+
+When parsing a long document, a *lot* of token structures are created. Each
+one requires a small bit of memory to be allocated. In aggregate, that time
+added up and slowed down performance.
+
+After reading for a bit, I ended up coming up with an approach that uses
+larger chunks of memory. I allocate pools of memory in large slabs for
+smaller "objects". For example, I allocate memory for 1024 tokens at a
+single time, and then dole that memory out as needed. When the slab is empty,
+a new one is allocated. This dramatically improved performance.
+
+When pairing tokens, I created a new stack for each block. I realized that an
+empty stack didn't have any "leftover" cruft to interfere with re-use, so I
+just used one for the entire document. Again, a sizeable improvement in
+performance from only allocating one object instead of many. When recursing
+to a deeper level, the stack just gets deeper, but earlier levels aren't
+modified.
+
+Speaking of tokens, I realized that the average document contains a lot of
+single spaces (there's one between every two words I have written, for
+example.) The vast majority of the time, these single spaces have no effect
+on the output of Markdown documents. I changed my whitespace token search to
+only flag runs of 2 or more spaces, dramatically reducing the number of
+tokens. This gives the benefit of needing fewer memory allocations, and also
+reduces the number of tokens that need to be processed later on. The only
+downside is remembering to check for a single space character in a few
+instances where it matters.
+
+
+## Proper Input Buffering ##
+
+When I first began last spring, I was amazed to see how much time was being
+spent by MultiMarkdown simply reading the input file. Then I discovered it
+was because I was reading it one character at a time. I switched to using a
+buffered read approach and the time to read the file went to almost nothing. I
+experimented with different buffer sizes, but they did not seem to make a
+measurable difference.
+
+
+## Output Buffering ##
+
+I experimented with different approaches to creating the output after parsing.
+I tried printing directly to `stdout`, and even played with different
+buffering settings. None of those seemed to work well, and all were slower
+than using the `d_string` approach (formerly called `GString` in MMD 5).
+
+
+## Fast Searches ##
+
+After getting basic Markdown functionality complete, I discovered during
+testing that the time required to parse a document grew exponentially as the
+document grew longer. Performance was on par with CommonMark for shorter
+documents, but fell increasingly behind in larger tests. Time profiling found
+that the culprit was searching for link definitions when they didn't exist.
+My first approach was to keep a stack of used link definitions, and to iterate
+through them when necessary. In long documents, this performs very poorly.
+After more research, I ended up using
+[uthash](http://troydhanson.github.io/uthash/). This allows me to search for
+a link (or footnote, etc.) by "name" rather than searching through an array.
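Conceptually, the hash-based lookup works something like the sketch below.
The structure and function names are illustrative only, not MMD's actual
definitions; see `uthash.h` and the MMD source for the real details:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "uthash.h"     /* single-header library: http://troydhanson.github.io/uthash/ */

typedef struct {
	const char * label;     /* key: the link label       */
	const char * url;       /* value: the destination    */
	UT_hash_handle hh;      /* makes this struct hashable */
} link_def;

static link_def * defs = NULL;

/* Store a definition, keyed by its label. */
static void store_link(const char * label, const char * url) {
	link_def * d = malloc(sizeof(link_def));
	d->label = label;
	d->url = url;
	HASH_ADD_KEYPTR(hh, defs, d->label, strlen(d->label), d);
}

/* Look a definition up by label -- average O(1), instead of scanning an array. */
static link_def * find_link(const char * label) {
	link_def * d = NULL;
	HASH_FIND_STR(defs, label, d);
	return d;
}

int main(void) {
	store_link("commonmark", "http://commonmark.org/");
	link_def * d = find_link("commonmark");
	printf("%s\n", d ? d->url : "(not found)");
	return 0;
}
```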
+This allowed me to get MMD's performance back to O(n), taking roughly twice as
+much time to process a document that is twice as long.
+
+
+## Efficient Utility Functions ##
+
+It is frequently necessary when parsing Markdown to check what sort of
+character we are dealing with at a certain position -- a letter, whitespace,
+punctuation, etc. I created a lookup table for this via `char_lookup.c` and
+hard-coded it in `char.c`. These routines allow me to quickly, and
+consistently, classify any byte within a document. This saved a lot of
+programming time, and saved time tracking down bugs from handling things
+slightly differently under different circumstances. I also suspect it
+improved performance, but don't have the data to back it up.
+
+
+## Testing While Writing ##
+
+I developed several chunks of code in parallel while creating MMD 6. The vast
+majority of it was developed using a [test-driven development] approach; the
+rest was created with extensive unit testing.
+
+[test-driven development]: https://en.wikipedia.org/wiki/Test-driven_development
+
+MMD isn't particularly amenable to this approach at the small level, so
+instead I relied more on integration testing with an ever-growing collection
+of text files and the corresponding HTML files in the MMD 6 test suite. This
+allowed me to ensure new features work properly and that old features aren't
+broken. At this time, there are 29 text files in the test suite, and many
+more to come.
+
+
+## Other Lessons ##
+
+Some things that didn't do me any good....
+
+I considered differences between using `malloc` and `calloc` when initializing
+tokens. The time saved by using `malloc` was basically exactly offset by the
+initial time required to initialize the token to default null values as
+compared to using `calloc`. When trying `calloc` failed to help me out
+(thinking that clearing a single slab in the object pool would be faster), I
+stuck with `malloc`, as it makes more sense to me in my workflow.
+
+I read a bit about [struct padding] and reordered some of my structs. It
+wasn't until later that I discovered the `-Wpadded` option, and it's not clear
+whether my changes accomplished anything. Since the structs were being padded
+automatically, there was no noticeable performance change, and I didn't have
+the tools to measure whether I could have improved memory usage at all. Not
+sure this would be worth the effort -- much lower hanging fruit available.
+
+[struct padding]: http://www.catb.org/esr/structure-packing/
+
+
+# Performance #
+
+Basic tests show that currently MMD 6 takes about 20-25% longer than
+CommonMark 0.27.0 to process long files (e.g. 0.2 MB). However, it is around
+5% *faster* than CommonMark when parsing a shorter file (27 kB) (measured by
+parsing the same file 200 times over). These tests are performed using the
+Markdown [syntax page], modified to avoid the use of the Setext header at the
+top. The longer files tested are created by copying the same syntax page onto
+itself, thereby doubling the length of the file with each iteration.
+
+The largest file I test is approximately 108 MB (4096 copies of the syntax
+page). On my machine (2012 Mac mini with 2.3 GHz Intel Core i7, 16 GB RAM),
+it takes approximately 4.4 seconds to parse with MMD 6 and 3.7 seconds with
+CommonMark. MMD 6 processes approximately 25 MB/s on this test file.
+CommonMark 0.27.0 gets about 29 MB/s on the same machine.
+ +There are some slight variations with the smaller test files (8-32 copies), +but overall the performance of both programs (MMD 6 and CommonMark) are +roughly linear as the test file gets bigger (double the file size and it takes +twice as long to parse, aka O(n)). + +Out of curiosity, I ran the same tests on the original Markdown.pl by Gruber +(v 1.0.2b8). It took approximately 178 seconds to parse 128 copies of the +file (3.4 MB) and was demonstrating quadratic performance characteristics +(double the file size and it takes 2^2 or 4 times longer to process, aka +O(n^2)). I didn't bother running it on larger versions of the test file. For +comparison, MMD 6 can process 128 copies in approximately 140 msec. + +Of note, the throughput speed drops when testing more complicated files +containing more advanced MultiMarkdown features, though it still seems to +maintain linear performance characteristics. A second test file is created by +concatenating all of the test suite files (including the Markdown syntax +file). In this case, MMD gets about 13 MB/s. CommonMark doesn't support +these additional features, so testing it with that file is not relevant. I +will work to see whether there are certain features in particular that are +more challenging and see whether they can be reworked to improve performance. + +As above, I have done some high level optimization of the parse strategy, but +I'm sure there's still a lot of room for further improvement to be made. +Suggestions welcome! + + +# Testing # + +## Test Suite ## + +The development of MMD v6 was heavily, but not absolutely, influenced by the +philosophy of test-driven development. While coding, I made use of test +suites to verify successful implementation of new features, to avoid +regression problems when adding new features, and to identify known edge cases +in need of proper handling. + +The test suite (located in `tests/MMD6Tests`) is a "living" collection of +documents that will continue to be updated as new bugs and edge cases are +identified. This helps make proper integration testing of the entire +application with every release. + + +## Fuzz Testing ## + +I was not familiar with the concept of [Fuzz Testing] +(https://en.wikipedia.org/wiki/Fuzzing) until a user mentioned something about +it to me a year or two ago. I had never used it before, but it seemed like a +good idea. I implement it in two ways. + +The first is that I created a simplified version of the line parser that +simply accepts various combinations of line type identifiers to see if they +would successfully parse. The line parser is responsible for taking a series +of line types (e.g. plain text, indented line, etc.) and determining what sort +of block they should become. The file `test/parser_text.y` is run through the +`lemon` program, compiled (with or without the `-DNDEBUG` flag) and then run. +It sequentially throws every combination of line types at the simplified line +parser to make sure that it doesn't choke. When I first did this, I found +several combinations of lines that did not pass. + +**NOTE**: This does not verify accurate parsing, simply that the parser does +not crash by an unacceptable combination of lines. + +The second form of fuzz testing I have started using more recently. This is +using the [American fuzzy lop](http://lcamtuf.coredump.cx/afl/) program to try +to find text input that crashes MMD. This works by taking sample input (e.g. +files from the test suite), modifying them slightly, and trying the modified +versions. 
Do this over and over and over, and some interesting edge cases are +sometimes identified. I have found some interesting edge cases this way. +Definitely a very useful tool! + + +## Unit Testing ## + +Some of the original development was done with unit testing in some other +tools I developed. This code formed the basis of a few parts of MMD. +Otherwise, it was hard to see how to really create very good unit tests for +the development of MMD. So there is really not much unit testing built into +the code or used during the development. + + + +[>MMD]: MultiMarkdown +[>MD]: Markdown + +[CriticMarkup]: http://criticmarkup.com/ + +[?PEG]: Parsing Expression Grammar + +[?AST]: Abstract Syntax Tree + + +# Changelog # + +* 2017-03-15 -- v 6.0.0-rc1: + + * FIXED: Add missing CriticMarkup tokens to LaTeX + * FIXED: Don't let labels end on '\' that is escaping the closing ']' + * FIXED: Fix NULL pointer dereference + * FIXED: Fix bug in Aho-Corasick implementation + * FIXED: Fix bug with ATX Headers without newline + * FIXED: Fix bug with Setext header starting with ':' + * FIXED: Fix bug with leading spaces in abbreviation references + * FIXED: Fix crash with empty definition + * FIXED: Fix edge case with URL definitions + * FIXED: Fix edge case with superscripts + * FIXED: Fix null dereference error in CriticMarkup substitution + * FIXED: Fix potential bug in Aho-Corasick search: + * FIXED: Fix potential bug in storing items to hash + * FIXED: Fix potential bug with line->block parser + * FIXED: Fix potential crash in attribute parsing + * FIXED: Fix printing raw CriticMarkup tokens in LaTeX + * FIXED: Fix signedness bug in Aho-Corasick + * FIXED: Improve metadata edge cases; Fix NULL pointer dereference + * FIXED: Include non-breaking space (ASCII 160) in re2c patterns + * FIXED: Keep ':' in false positive definitions + * FIXED: Lex space followed by tab as space, not text + * FIXED: Limit lines treated as ATX headers + * FIXED: Update test code + + +* 2017-03-13 -- v 6.0.0-b2: + + * ADDED: Add CriticMarkup preprocessor that works across empty lines when accepting/rejecting markup + * ADDED: Add back the mmd6 latex title file + * ADDED: Basic EPUB 3 support -- uses 'miniz' library to zip creation + * ADDED: Update QuickStart and EPUB code + * CHANGED: Update QuickStart guide + * CHANGED: Update test suite + * FIXED: Don't duplicate LaTeX glossary definitions + * FIXED: Fix abbreviations in ODF; Improve test suite + * FIXED: Improve glossaries and abbreviations; Update QuickStart + * FIXED: Tidy up some compiler warnings in code + * FIXED: Use custom UUID code to minimize external dependencies + + +* 2017-03-09 -- v 6.0.0-b1: + + * ADDED: Add French translations; fix typo in German + * ADDED: Add Quick Start guide + * ADDED: Add functionality to automatically identify abbreviations and glossary terms in source + * ADDED: Improve LaTeX configuration files + * ADDED: Update German translations + * ADDED: Use native ODF table of contents instead of a manual list + * ADDED: Use native command for table of contents in LaTeX + * CHANGED: Bring HTML and ODF into line with LaTeX as to output of abbreviatinos on first and subsequent uses + * CHANGED: Slight performance tweak + * CHANGED: Update German test suite + * FIXED: Allow `{{TOC}}` in latex verbatim + * FIXED: Don't free token_pool if never initialized + * FIXED: Fix German typo + * FIXED: Fix missing token type + * FIXED: Improve performance of checking document for metadata, which improves performance when checking for possible transclusion + * 
FIXED: Update test suite for abbreviation changes + + +* 2017-03-05 -- v 0.4.2-b: + + * ADDED: Add and utility functions; fix memory leak + * ADDED: Initial abbreviation support + * ADDED: Keep working on Abbreviations/Glossaries + * ADDED: Refactor abbreviation code; Add inline abbreviations; Fix abbreviations in ODF + * ADDED: Update Inline Footnote test + * CHANGED: Add comments to i18n.h + * CHANGED: Finish refactoring note-related code + * CHANGED: Refactor footnotes + * CHANGED: Refactor glossary code + * CHANGED: Remove offset from html export functions + * FIXED: latex list items need to block optional argument to allow '[' as first character + * Merge branch 'release/0.4.1-b' into develop + + +* 2017-03-04 -- v 0.4.1-b: + + * FIXED: Add glossary localization + + +* 2017-03-04 -- v 0.4.0-b: + + * ADDED: Add TOC support to ODF + * ADDED: Add glossary support to ODF + * ADDED: Add prelim code for handling abbreviations + * ADDED: Add support for Swift Package Maker; CHANGED: Restructure source directory + * ADDED: Added LaTeX support for escaped characters, fenced code blocks, images, links + * ADDED: Basic ODF Support + * ADDED: Better document strong/emph algorithm + * ADDED: Continue ODF progress + * ADDED: Continue to work on ODF export + * ADDED: Continue work on ODF + * ADDED: Finish ODF support for lists + * ADDED: Improve performance when exporting + * ADDED: Improve token_pool memory handling + * ADDED: Prototype support for Glossaries + * ADDED: Support 'latexconfig' metadata + * CHANGED: Use multiple cases in glossary tests + * FIXED: Don't force glossary terms into lowercase + * FIXED: Fix Makefile for new source file location + * FIXED: Fix algorithm for creating TOC to properly handle 'incorrect' levels + * FIXED: Fix linebreaks in LaTeX; ADDED: Add Linebreaks test file + * FIXED: Fix new_source script for new directory structure + * FIXED: Fix non-breaking space in ODF + * FIXED: Fix padding at end of document body in ODF + * FIXED: Fix underscores in raw latex + * FIXED: Potential bug + * NOTE: Add shared library build option + + +* 2017-02-17 -- v 0.3.1.a: + + * ADDED: 'finalize' beamer support + * ADDED: Add escaped newline as linebreak; start on beamer/memoir support + * ADDED: CriticMarkup test for LaTeX + * ADDED: Custom LaTeX output for CriticMarkup comments + * ADDED: Support mmd export format + * ADDED: Work on cpack installer -- change project name for compatibility + * CHANGED: Adjust latex metadata configuration for consistency + * CHANGED: Configure cmake to use C99 + * FIXED: Add custom implementation for cross-platform support + * FIXED: Fix German HTML tests + * FIXED: Fix cpack destination directory issue + * FIXED: Fix memory leaks etc + * FIXED: Fix warning in custom vasprintf + * FIXED: Modify CMakeLists.txt to test for use of clang compiler + * FIXED: Work on memory leaks + * NOTE: Adjust license width to improve display on smaller terminal windows + + +* 2017-02-14 -- v 0.3.0a: + + * ADDED: Add basic image support to LaTeX + * ADDED: Add file transclusion + * ADDED: Add support for citation 'locators' + * ADDED: Add support for manual labels on ATX Headers + * ADDED: Add support for manual labels on Setext Headers + * ADDED: Add support for tables in LaTeX + * ADDED: HTML Comments appear as raw LaTeX + * ADDED: Improved citation support in LaTeX + * ADDED: Support \autoref{} in LaTeX + * ADDED: Support combined options in LaTeX citations that use the '\]\[' syntax + * ADDED: Support language specifier in fenced code blocks + * ADDED: Support 
metadata in LaTeX + * ADDED: Update Citations test suite + * FIXED: Escaped LaTeX characters + * FIXED: Fix bug in URL parsing + * FIXED: Fix bug in citation links + * FIXED: Fix bug when no closing divider or newline at end of last table cell + * FIXED: Fix issue printing '-' + * FIXED: Fix scan_url test suite + * FIXED: Get Math working in LaTeX + * FIXED: Improve reliability or link scanner + * FIXED: Properly add id attribute to new instances of citation only + * FIXED: Properly handle manual labels with TOC + * FIXED: Properly print hash characters in LaTeX + * FIXED: Separate LaTeX verbatim and texttt character handling + * FIXED: Update Escapes test LaTeX result + * FIXED: Work on escaping LaTeX characters + + +* 2017-02-08 -- v 0.1.4a: + + * ADDED: Add smart quote support for other languages (resolves #15) + + +* 2017-02-08 -- v 0.1.3a: + + * ADDED: Add support for reference image id attributes + * ADDED: Add support for table captions + * ADDED: Metadata support for base header level + * ADDED: Support distinction between 3 and 5 backticks in fenced code blocks + * ADDED: Support Setext headers + * FIXED: Fix issue with metadata disrupting smart quotes + +* 2017-02-07 -- v 0.1.2a: + + * "pathologic" test suite -- fix handling of nested brackets, e.g. + `[[[[foo]]]]` to avoid bogging down checking for reference links that + don't exist. + * Table support -- a single blank line separates sections of tables, so + at least two blank lines are needed between adjacent tables. + * Definition list support + * "fuzz testing" -- stress test the parser for unexpected failures + * Table of Contents support + * Improved compatibility mode parsing + +* 2017-01-28 -- v 0.1.1a includes a few updates: + + * Metadata support + * Metadata variables support + * Extended ASCII range character checking + * Rudimentary language translations, including German + * Improved performance + * Additional testing: + * CriticMarkup + * HTML Blokcs + * Metadata/Variables + * "pathologic" test cases from CommonMark + diff --git a/QuickStart.epub b/QuickStart.epub deleted file mode 100644 index c18779b9..00000000 Binary files a/QuickStart.epub and /dev/null differ diff --git a/QuickStart/QuickStart.epub b/QuickStart/QuickStart.epub new file mode 100644 index 00000000..ec826597 Binary files /dev/null and b/QuickStart/QuickStart.epub differ diff --git a/QuickStart.fodt b/QuickStart/QuickStart.fodt similarity index 98% rename from QuickStart.fodt rename to QuickStart/QuickStart.fodt index 10238a14..ed5c9487 100644 --- a/QuickStart.fodt +++ b/QuickStart/QuickStart.fodt @@ -274,7 +274,7 @@ MultiMarkdown v6 Quick Start Guide Fletcher T. Penney - 6.0-b + 6.0.0-rc1 0d6313fa-9135-477e-9c14-7d62c1977833 @@ -308,7 +308,7 @@ Introduction -Version: 6.0-b +Version: 6.0.0-rc1 This document serves as a description of MultiMarkdown (MMD) v6, as well as a sample document to demonstrate the various features. Specifically, differences from @@ -503,10 +503,9 @@ older versions of the EPUB format, but other tools can convert to other document formats you need. Same goes for Amazon’s ebook formats – the Calibre program can also be used to interconvert between formats. -**NOTE: Because EPUB documents are binary files, MMD only creates them when -**run in batch mode (using the -b\--batch options). Otherwise, it simply -**outputs the HTML 5 file that would serve as the primary content for the -**EPUB. +NOTE: Because EPUB documents are binary files, MMD only creates them when +run in batch mode (using the -b\--batch options). 
Otherwise, it simply +outputs the HTML 5 file that would serve as the primary content for the EPUB. Fenced Code Blocks diff --git a/QuickStart.html b/QuickStart/QuickStart.html similarity index 97% rename from QuickStart.html rename to QuickStart/QuickStart.html index 4d1ec694..9dc59695 100644 --- a/QuickStart.html +++ b/QuickStart/QuickStart.html @@ -4,7 +4,7 @@ MultiMarkdown v6 Quick Start Guide - + @@ -35,7 +35,7 @@

Introduction

-

Version: 6.0-b

+

Version: 6.0.0-rc1

This document serves as a description of MultiMarkdown (MMD) v6, as well as a sample document to demonstrate the various features. Specifically, differences from @@ -217,10 +217,9 @@

EPUB 3 Support

document formats you need. Same goes for Amazon’s ebook formats – the Calibre program can also be used to interconvert between formats.

-

**NOTE: Because EPUB documents are binary files, MMD only creates them when -**run in batch mode (using the -b\--batch options). Otherwise, it simply -**outputs the HTML 5 file that would serve as the primary content for the -**EPUB.

+

NOTE: Because EPUB documents are binary files, MMD only creates them when +run in batch mode (using the -b\--batch options). Otherwise, it simply +outputs the HTML 5 file that would serve as the primary content for the EPUB.

Fenced Code Blocks

diff --git a/QuickStart.pdf b/QuickStart/QuickStart.pdf similarity index 81% rename from QuickStart.pdf rename to QuickStart/QuickStart.pdf index 6926d290..ad3bcd81 100644 Binary files a/QuickStart.pdf and b/QuickStart/QuickStart.pdf differ diff --git a/QuickStart.txt b/QuickStart/QuickStart.txt similarity index 98% rename from QuickStart.txt rename to QuickStart/QuickStart.txt index 491975eb..c3583aec 100644 --- a/QuickStart.txt +++ b/QuickStart/QuickStart.txt @@ -1,6 +1,6 @@ Title: MultiMarkdown v6 Quick Start Guide Author: Fletcher T. Penney -Version: 6.0-b +Version: 6.0.0-rc1 LaTeX Config: tufte-handout Base Header Level: 3 uuid: 0d6313fa-9135-477e-9c14-7d62c1977833 @@ -202,10 +202,9 @@ older versions of the EPUB format, but other tools can convert to other document formats you need. Same goes for Amazon's ebook formats -- the [Calibre] program can also be used to interconvert between formats. -**NOTE: Because EPUB documents are binary files, MMD only creates them when -**run in batch mode (using the `-b\--batch` options). Otherwise, it simply -**outputs the HTML 5 file that would serve as the primary content for the -**EPUB. +**NOTE**: Because EPUB documents are binary files, MMD only creates them when +run in batch mode (using the `-b\--batch` options). Otherwise, it simply +outputs the HTML 5 file that would serve as the primary content for the EPUB. ## Fenced Code Blocks ## diff --git a/README.md b/README.md index 9c4863c1..b7fa26cd 100644 --- a/README.md +++ b/README.md @@ -4,536 +4,59 @@ | ---------- | ------------------------- | | Title: | MultiMarkdown | | Author: | Fletcher T. Penney | -| Date: | 2017-03-13 | +| Date: | 2017-03-15 | | Copyright: | Copyright © 2016 - 2017 Fletcher T. Penney. | -| Version: | 6.0.0-b2 | - - -## Updates ## - -* 2017-03-13 -- v 6.0.0-b2: - - * ADDED: Add CriticMarkup preprocessor that works across empty lines when accepting/rejecting markup - * ADDED: Add back the mmd6 latex title file - * ADDED: Basic EPUB 3 support -- uses 'miniz' library to zip creation - * ADDED: Update QuickStart and EPUB code - * CHANGED: Update QuickStart guide - * CHANGED: Update test suite - * FIXED: Don't duplicate LaTeX glossary definitions - * FIXED: Fix abbreviations in ODF; Improve test suite - * FIXED: Improve glossaries and abbreviations; Update QuickStart - * FIXED: Tidy up some compiler warnings in code - * FIXED: Use custom UUID code to minimize external dependencies - - -* 2017-03-09 -- v 6.0.0-b1: - - * ADDED: Add French translations; fix typo in German - * ADDED: Add Quick Start guide - * ADDED: Add functionality to automatically identify abbreviations and glossary terms in source - * ADDED: Improve LaTeX configuration files - * ADDED: Update German translations - * ADDED: Use native ODF table of contents instead of a manual list - * ADDED: Use native command for table of contents in LaTeX - * CHANGED: Bring HTML and ODF into line with LaTeX as to output of abbreviatinos on first and subsequent uses - * CHANGED: Slight performance tweak - * CHANGED: Update German test suite - * FIXED: Allow {{TOC}} in latex verbatim - * FIXED: Don't free token_pool if never initialized - * FIXED: Fix German typo - * FIXED: Fix missing token type - * FIXED: Improve performance of checking document for metadata, which improves performance when checking for possible transclusion - * FIXED: Update test suite for abbreviation changes - - -* 2017-03-05 -- v 0.4.2-b: - - * ADDED: Add and utility functions; fix memory leak - * ADDED: Initial abbreviation support - * ADDED: Keep 
working on Abbreviations/Glossaries - * ADDED: Refactor abbreviation code; Add inline abbreviations; Fix abbreviations in ODF - * ADDED: Update Inline Footnote test - * CHANGED: Add comments to i18n.h - * CHANGED: Finish refactoring note-related code - * CHANGED: Refactor footnotes - * CHANGED: Refactor glossary code - * CHANGED: Remove offset from html export functions - * FIXED: latex list items need to block optional argument to allow '[' as first character - * Merge branch 'release/0.4.1-b' into develop - - -* 2017-03-04 -- v 0.4.1-b: - - * FIXED: Add glossary localization - - -* 2017-03-04 -- v 0.4.0-b: - - * ADDED: Add TOC support to ODF - * ADDED: Add glossary support to ODF - * ADDED: Add prelim code for handling abbreviations - * ADDED: Add support for Swift Package Maker; CHANGED: Restructure source directory - * ADDED: Added LaTeX support for escaped characters, fenced code blocks, images, links - * ADDED: Basic ODF Support - * ADDED: Better document strong/emph algorithm - * ADDED: Continue ODF progress - * ADDED: Continue to work on ODF export - * ADDED: Continue work on ODF - * ADDED: Finish ODF support for lists - * ADDED: Improve performance when exporting - * ADDED: Improve token_pool memory handling - * ADDED: Prototype support for Glossaries - * ADDED: Support 'latexconfig' metadata - * CHANGED: Use multiple cases in glossary tests - * FIXED: Don't force glossary terms into lowercase - * FIXED: Fix Makefile for new source file location - * FIXED: Fix algorithm for creating TOC to properly handle 'incorrect' levels - * FIXED: Fix linebreaks in LaTeX; ADDED: Add Linebreaks test file - * FIXED: Fix new_source script for new directory structure - * FIXED: Fix non-breaking space in ODF - * FIXED: Fix padding at end of document body in ODF - * FIXED: Fix underscores in raw latex - * FIXED: Potential bug - * NOTE: Add shared library build option - - -* 2017-02-17 -- v 0.3.1.a: - - * ADDED: 'finalize' beamer support - * ADDED: Add escaped newline as linebreak; start on beamer/memoir support - * ADDED: CriticMarkup test for LaTeX - * ADDED: Custom LaTeX output for CriticMarkup comments - * ADDED: Support mmd export format - * ADDED: Work on cpack installer -- change project name for compatibility - * CHANGED: Adjust latex metadata configuration for consistency - * CHANGED: Configure cmake to use C99 - * FIXED: Add custom implementation for cross-platform support - * FIXED: Fix German HTML tests - * FIXED: Fix cpack destination directory issue - * FIXED: Fix memory leaks etc - * FIXED: Fix warning in custom vasprintf - * FIXED: Modify CMakeLists.txt to test for use of clang compiler - * FIXED: Work on memory leaks - * NOTE: Adjust license width to improve display on smaller terminal windows - - -* 2017-02-14 -- v 0.3.0a: - - * ADDED: Add basic image support to LaTeX - * ADDED: Add file transclusion - * ADDED: Add support for citation 'locators' - * ADDED: Add support for manual labels on ATX Headers - * ADDED: Add support for manual labels on Setext Headers - * ADDED: Add support for tables in LaTeX - * ADDED: HTML Comments appear as raw LaTeX - * ADDED: Improved citation support in LaTeX - * ADDED: Support \autoref{} in LaTeX - * ADDED: Support combined options in LaTeX citations that use the '\]\[' syntax - * ADDED: Support language specifier in fenced code blocks - * ADDED: Support metadata in LaTeX - * ADDED: Update Citations test suite - * FIXED: Escaped LaTeX characters - * FIXED: Fix bug in URL parsing - * FIXED: Fix bug in citation links - * FIXED: Fix bug when no closing 
divider or newline at end of last table cell - * FIXED: Fix issue printing '-' - * FIXED: Fix scan_url test suite - * FIXED: Get Math working in LaTeX - * FIXED: Improve reliability or link scanner - * FIXED: Properly add id attribute to new instances of citation only - * FIXED: Properly handle manual labels with TOC - * FIXED: Properly print hash characters in LaTeX - * FIXED: Separate LaTeX verbatim and texttt character handling - * FIXED: Update Escapes test LaTeX result - * FIXED: Work on escaping LaTeX characters - - -* 2017-02-08 -- v 0.1.4a: - - * ADDED: Add smart quote support for other languages (resolves #15) - - -* 2017-02-08 -- v 0.1.3a: - - * ADDED: Add support for reference image id attributes - * ADDED: Add support for table captions - * ADDED: Metadata support for base header level - * ADDED: Support distinction between 3 and 5 backticks in fenced code blocks - * ADDED: Support Setext headers - * FIXED: Fix issue with metadata disrupting smart quotes - -* 2017-02-07 -- v 0.1.2a: - - * "pathologic" test suite -- fix handling of nested brackets, e.g. - `[[[[foo]]]]` to avoid bogging down checking for reference links that - don't exist. - * Table support -- a single blank line separates sections of tables, so - at least two blank lines are needed between adjacent tables. - * Definition list support - * "fuzz testing" -- stress test the parser for unexpected failures - * Table of Contents support - * Improved compatibility mode parsing - -* 2017-01-28 -- v 0.1.1a includes a few updates: - - * Metadata support - * Metadata variables support - * Extended ASCII range character checking - * Rudimentary language translations, including German - * Improved performance - * Additional testing: - * CriticMarkup - * HTML Blokcs - * Metadata/Variables - * "pathologic" test cases from CommonMark +| Version: | 6.0.0-rc1 | ## An Announcement! ## -I would like to officially announce that MultiMarkdown version 6 is in public -alpha. It's finally at a point where it is usable, but there are quite a few -caveats. +MultiMarkdown v6 is finally here! It's technically still in "beta" as I would +like to hear back from a few more users to make sure I'm not missing anything, +but it has been subjected to much more rigorous testing than any previous +versions of MultiMarkdown in the past. If you want more information about +testing, see `DevelopmentNotes`. It's basically feature complete as a +replacement for MMD v5, and included additional features beyond that. -This post is a way for me to organize some of my thoughts, provide some -history for those who are interested, and to provide some tips and tricks from -my experiences for those who are working on their own products. -But first, some background... +## Obtaining MultiMarkdown ## +You can download the latest installer for MacOS or Windows at Github: -### Why a New Version? ### + -MultiMarkdown version 5 was released in November of 2015, but the codebase was -essentially the same as that of v4 -- and that was released in beta in April -of 2013. A few key things prompted work on a new version: +To build from source, download from Github. Then: -* Accuracy -- MMD v4 and v5 were the most accurate versions yet, and a lot of -effort went into finding and resolving various edge cases. However, it began -to feel like a game of whack-a-mole where new bugs would creep in every time I -fixed an old one. The PEG began to feel rather convoluted in spots, even -though it did allow for a precise (if not always accurate) specification of -the grammar. 
+ make release + (OR) + make debug -* Performance -- "Back in the day" [peg-markdown] was one of the fastest -Markdown parsers around. MMD v3 was based on peg-markdown, and would leap- -frog with it in terms of performance. Then [CommonMark] was released, which -was a bit faster. Then a couple of years went by and CommonMark became *much* -faster -- in one of my test suites, MMD v 5.4.0 takes about 25 times longer to -process a long document than CommonMark 0.27.0. + cd build + make -[peg-markdown]: https://github.com/jgm/peg-markdown -[CommonMark]: http://commonmark.org/ +You can optionally test using the test suite: -Last spring, I decided I wanted to rewrite MultiMarkdown from scratch, -building the parser myself rather than relying on a pre-rolled solution. (I -had been using [greg](https://github.com/ooc-lang/greg) to compile the PEG -into parser code. It worked well overall, but lacked some features I needed, -requiring a lot of workarounds.) + ctest -## First Attempt ## +## Differences in the MultiMarkdown Syntax ## -My first attempt started by hand-crafting a parser that scanned through the -document a line at a time, deciding what to do with each line as it found -them. I used regex parsers made with [re2c](http://re2c.org/index.html) to -help classify each line, and then a separate parser layer to process groups of -lines into blocks. Initially this approach worked well, and was really -efficient. But I quickly began to code my way into a dead-end -- the strategy -was not elegant enough to handle things like nested lists, etc. +MultiMarkdown v6 is mostly about making a better MMD parser, but it involves a +few changes to the MultiMarkdown syntax itself. -One thing that did turn out well from the first attempt, however, was an -approach for handling `` and `` parsing. I've learned over the -years that this can be one of the hardest parts of coding accurately for -Markdown. There are many examples that are obvious to a person, but difficult -to properly "explain" how to parse to a computer. +1. Setext headers can consist of more than one line to be included in the +header: -No solution is perfect, but I developed an approach that seems to accurately -handle a wide range of situations without a great deal of complexity: - -1. Scan the documents for asterisks (`*`). Each one will be handled one at a -time. - -2. Unlike brackets (`[` and `]`), an asterisk is "ambidextrous", in that it -may be able to open a matched pair of asterisks, close a pair, or both. For -example, in `foo *bar* foo`: - - 1. The first asterisk can open a pair, but not close one. - - 2. The second asterisk can close a pair, but not open one. - -3. So, once the asterisks have been identified, each has to be examined to -determine whether it can open/close/both. The algorithm is not that complex, -but I'll describe it in general terms. Check the code for more specifics. -This approach seems to work, but might still need some slight tweaking. In -the future, I'll codify this better in language rather than just in code. - - 1. If there is whitespace to the left of an asterisk, it can't close. - - 2. If there is whitespace or punctuation to the right it can't open. - - 3. "Runs" of asterisks, e.g. `**bar` are treated as a unit in terms of - looking left/right. - - 4. Asterisks inside a word are a bit trickier -- we look at the number of - asterisks before the word, the number in the current run, and the number - of asterisks after the word to determine which combinations, if any, are - permitted. - -4. 
Once all asterisks have been tagged as able to open/close/both, we proceed -through them in order: - - 1. When we encounter a tag that can close, we look to see if there is a - previous opener that has not been paired off. If so, pair the two and - remove the opener from the list of available asterisks. - - 2. When we encounter an opener, add it to the stack of available openers. - - 3. When encounter an asterisk that can do both, see if it can close an - existing opener. If not, then add it to the stack. - -5. After all tokens in the block have been paired, then we look for nesting -pairs of asterisks in order to create `` and `` sets. For -example, assume we have six asterisks wrapped around a word, three in front, -and three after. The asterisks are indicated with numbers: `123foo456`. We -proceed in the following manner: - - 1. Based on the pairing algorithm above, these asterisks would be paired as - follows, with matching asterisks sharing numbers -- `123foo321`. - - 2. Moving forwards, we come to asterisk "1". It is followed by an - asterisk, so we check to see if they should be grouped as a ``. - Since the "1" asterisks are wrapped immediately outside the "2" asterisks, - they are joined together. More than two pairs can't be joined, so we now - get the following -- `112foo211`, where the "11" represents the opening - and closing of a ``, and the "2" represents a ``. - -6. When matching a pair, any unclosed openers that are on the stack are -removed, preventing pairs from "crossing" or "intersecting". Pairs can wrap -around each other, e.g. `[(foo)]`, but not intersect like `[(foo])`. In the -second case, the brackets would close, removing the `(` from the stack. - -7. This same approach is used in all tokens that are matched in pairs-- -`[foo]`, `(foo)`, `_foo_`, etc. There's slightly more to it, but once you -figure out how to assign opening/closing ability, the rest is easy. By using -a stack to track available openers, it can be performed efficiently. - -In my testing, this approach has worked quite well. It handles all the basic -scenarios I've thrown at it, and all of the "basic" and "devious" edge cases I -have thought of (some of these don't necessarily have a "right" answer -- but -v6 gives consistency answers that seem as reasonable as any others to me). -There are also three more edge cases I've come up can still stump it, and -ironically they are handled correctly by most implementations. They just -don't follow the rules above. I'll continue to work on this. - -In the end, I scrapped this effort, but kept the lessons learned in the token -pairing algorithm. - - -## Second Attempt ## - -I tried again this past Fall. This time, I approached the problem with lots -of reading. *Lots and lots* of reading -- tons of websites, computer science -journal articles, PhD theses, etc. Learned a lot about lexers, and a lot -about parsers, including hand-crafting vs using parser generators. In brief: - -1. I learned about the [Aho–Corasick algorithm], which is a great way to -efficiently search a string for multiple target strings at once. I used this -to create a custom lexer to identify tokens in a MultiMarkdown text document -(e.g. `*`, `[ `, `{++`, etc.). I learned a lot, and had a good time working -out the implementation. This code efficiently allowed me to break a string of -text into the tokens that mattered for Markdown parsing. - -2. However, in a few instances I really needed some features of regular -expressions to simplify more complex structures. 
After a quick bit of testing, -using re2c to create a tokenizer was just as efficient, and allowed me to -incorporate some regex functionality that simplified later parsing. I'll keep -the Aho-Corasick stuff around, and will probably experiment more with it -later. But I didn't need it for MMD now. `lexer.re` contains the source for -the tokenizer. - -[Aho–Corasick algorithm]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm - -I looked long and hard for a way to simplify the parsing algorithm to try and -"touch" each token only once. Ideally, the program could step through each -token, and decide when to create a new block, when to pair things together, -etc. But I'm not convinced it's possible. Since Markdown's grammar varies -based on context, it seems to work best when handled in distinct phases: - -1. Tokenize the string to identify key sections of text. This includes line -breaks, allowing the text to be examined one line at time. - -2. Join series of lines together into blocks, such as paragraphs, code blocks, -lists, etc. - -3. The tokens inside each block can then be paired together to create more -complex syntax such as links, strong, emphasis, etc. - -To handle the block parsing, I started off using the [Aho-Corasick] code to -handle my first attempt. I had actually implemented some basic regex -functionality, and used that to group lines together to create blocks. But -this quickly fell apart in the face of more complex structures such as -recursive lists. After a lot of searching, and *tons* more reading, I -ultimately decided to use a parser generator to handle the task of group lines -into blocks. `parser.y` has the source for this, and it is processed by the -[lemon](http://www.hwaci.com/sw/lemon/) parser generator to create the actual -code. - -I chose to do this because hand-crafting the block parser would be complex. -The end result would likely be difficult to read and understand, which would -make it difficult to update later on. Using the parser generator allows me to -write things out in a way that can more easily be understood by a person. In -all likelihood, the performance is probably as good as anything I could do -anyway, if not better. - -Because lemon is a LALR(1) parser, it does require a bit of thinking ahead -about how to create the grammar used. But so far, it has been able to handle -everything I have thrown at it. - - -## Optimization ## - -One of my goals for MMD 6 was performance. So I've paid attention to speed -along the way, and have tried to use a few tricks to keep things fast. Here -are some things I've learned along the way. In no particular order: - - -### Memory Allocation ### - -When parsing a long document, a *lot* of token structures are created. Each -one requires a small bit of memory to be allocated. In aggregate, that time -added up and slowed down performance. - -After reading for a bit, I ended up coming up with an approach that uses -larger chunks of memory. I allocate pools of of memory in large slabs for -smaller "objects"". For example, I allocate memory for 1024 tokens at a -single time, and then dole that memory out as needed. When the slab is empty, -a new one is allocated. This dramatically improved performance. - -When pairing tokens, I created a new stack for each block. I realized that an -empty stack didn't have any "leftover" cruft to interfere with re-use, so I -just used one for the entire document. Again a sizeable improvement in -performance from only allocating one object instead of many. 
When recursing -to a deeper level, the stack just gets deeper, but earlier levels aren't -modified. - -Speaking of tokens, I realized that the average document contains a lot of -single spaces (there's one between every two words I have written, for -example.) The vast majority of the time, these single spaces have no effect -on the output of Markdown documents. I changed my whitespace token search to -only flag runs of 2 or more spaces, dramatically reducing the number of -tokens. This gives the benefit of needing fewer memory allocations, and also -reduces the number of tokens that need to be processed later on. The only -downside is remember to check for a single space character in a few instances -where it matters. - - -### Proper input buffering ### - -When I first began last spring, I was amazed to see how much time was being -spent by MultiMarkdown simply reading the input file. Then I discovered it -was because I was reading it one character at a time. I switched to using a -buffered read approach and the time to read the file went to almost nothing. I -experimented with different buffer sizes, but they did not seem to make a -measurable difference. - - -### Output Buffering ### - -I experimented with different approaches to creating the output after parsing. -I tried printing directly to `stdout`, and even played with different -buffering settings. None of those seemed to work well, and all were slower -than using the `d_string` approach (formerly call `GString` in MMD 5). - - -### Fast Searches ### - -After getting basic Markdown functionality complete, I discovered during -testing that the time required to parse a document grew exponentially as the -document grew longer. Performance was on par with CommonMark for shorter -documents, but fell increasingly behind in larger tests. Time profiling found -that the culprit was searching for link definitions when they didn't exist. -My first approach was to keep a stack of used link definitions, and to iterate -through them when necessary. In long documents, this performs very poorly. -More research and I ended up using -[uthash](http://troydhanson.github.io/uthash/). This allows me to search for -a link (or footnote, etc.) by "name" rather than searching through an array. -This allowed me to get MMD's performance back to O(n), taking roughly twice as -much time to process a document that is twice as long. - - -### Efficient Utility Functions ### - -It is frequently necessary when parsing Markdown to check what sort of -character we are dealing with at a certain position -- a letter, whitespace, -punctuation, etc. I created a lookup table for this via `char_lookup.c` and -hard-coded it in `char.c`. These routines allow me to quickly, and -consistently, classify any byte within a document. This saved a lot of -programming time, and saved time tracking down bugs from handling things -slightly differently under different circumstances. I also suspect it -improved performance, but don't have the data to back it up. - - -### Testing While Writing ### - -I developed several chunks of code in parallel while creating MMD 6. The vast -majority of it was developed largely in a [test-driven development] approach. -The other code was largely created with extensive unit testing to accomplish -this. 
- -[test-driven development]: https://en.wikipedia.org/wiki/Test-driven_development - -MMD isn't particularly amenable to this approach at the small level, but -instead I relied more on integration testing with an ever-growing collection -of text files and the corresponding HTML files in the MMD 6 test suite. This -allowed me to ensure new features work properly and that old features aren't -broken. At this time, there are 29 text files in the test suite, and many -more to come. - - -### Other Lessons ### - -Some things that didn't do me any good.... - -I considered differences between using `malloc` and `calloc` when initializing -tokens. The time saved by using `malloc` was basically exactly offset by the -initial time required to initialize the token to default null values as -compared to using `calloc`. When trying `calloc` failed to help me out -(thinking that clearing a single slab in the object pool would be faster), I -stuck with `malloc` as it makes more sense to me in my workflow. - -I read a bit about [struct padding] and reordered some of my structs. It was -until later that I discovered the `-Wpadded` option, and it's not clear -whether my changes modified anything. Since the structs were being padded -automatically, there was no noticeable performance change, and I didn't have -the tools to measure whether I could have improved memory usage at all. Not -sure this would be worth the effort -- much lower hanging fruit available. - -[struct padding]: http://www.catb.org/esr/structure-packing/ - - -## Differences in MultiMarkdown Itself ## - -MultiMarkdown v6 is mostly about making a better MMD parser, but it will -likely involve a few changes to the MultiMarkdown language itself. - - -1. {--I am thinking about removing Setext headers from the language. I almost -never use them, much preferring to use ATX style headers (`# foo #`). -Additionally, I have never liked the fact that Setext headers allow the -meaning of a line to be completely changed by the following line. It makes -the parsing slightly more difficult on a technical level (requiring some -backtracking at times). I'm not 100% certain on this, but right now I believe -it's the only Markdown feature that doesn't exist in MMD 6 yet.--}{++I decided -to go ahead and implement Setext headers, as it can be done with the new -parser without backtracking. One difference with older versions of MMD, as -well as Markdown itself, is that a setext header can consist of more than one -line to be included in the header.++} + This is + a header + ======== 2. Whitespace is not allowed between the text brackets and label brackets in reference links, images, footnotes, etc. For example `[foo] [bar]` will no longer be the same as `[foo][bar]`. 3. Link and image titles can be quoted with `'foo'`, `"foo"`, or `(foo)`. +Link attributes can be used in both reference and inline links/images. 4. HTML elements are handled slightly differently. There is no longer a `markdown="1"` feature. Instead, HTML elements that are on a line by @@ -541,7 +64,8 @@ themselves will open an HTML block that will cause the rest of the "paragraph" to be treated as HTML such that Markdown will not be parsed in side of it. HTML block-level tags are even "stronger" at starting an HTML block. It is not quite as complex as the approach used in CommonMark, but is similar under -most circumstances. +most circumstances. Leaving a blank line after the opening tag will allow +MultiMarkdown parsing inside of the HTML block. 
For example, this would not be parsed: @@ -558,9 +82,10 @@ most circumstances. 5. "Malformed" reference link definitions are handled slightly differently. -For example, `Reference Footnotes.text` is parsed differently in compatibility -mode than MMD-5. This started as a side-effect of the parsing algorithm, but -I actually think it makes sense. This may or may not change in the future. +For example, the test suite file `Reference Footnotes.text` is parsed +differently in compatibility mode than MMD-5. This started as a side-effect +of the parsing algorithm, but I actually think it makes sense. This may or +may not change in the future. 6. Table captions in MMD-6 must come immediately *after* the table, not before it. @@ -570,103 +95,29 @@ before it. feature in MMD, but I don't see a problem with just making it default behavior. +8. Escaped spaces (`\ `) will be interpreted as a non-breaking space, if the +output format supports it. -## Where Does MultiMarkdown 6 Stand? ## - - -### Features ### - -I *think* that all basic Markdown features have been implemented. -Additionally, the following MultiMarkdown features have been implemented: - -* Automatic cross-reference targets -* Basic Citation support -* CriticMarkup support -* Definition lists -* Figures -* Footnotes -* Inline and reference footnotes -* Image and Link attributes (attributes can now be used with inline links as - well as reference links) -* Math support -* Smart quotes (support for languages other than english is not fully - implemented yet) -* Superscripts/subscripts -* Table of Contents -* Tables - - -Things that are partially completed: - -* Citations -- still need: - * Syntax for "not cited" entries - * Output format - * HTML --> separate footnotes and citations? - * Locators required? -* CriticMarkup -- need to decide: - * How to handle CM stretches that include blank lines -* Fenced code blocks -* Headers -- need support for manual labels -* Metadata -* Full/Snippet modes - - -Things yet to be completed: - -* Abbreviations -* Glossaries -* File Transclusion - - -### Accuracy ### - -MultiMarkdown v6 successfully parses the Markdown [syntax page], except for -the Setext header at the top. It passes the 29 test files currently in place. -There are a few at - -[syntax page]: https://daringfireball.net/projects/markdown/syntax - - -### Performance ### - -Basic tests show that currently MMD 6 takes about 20-25% longer the CommonMark -0.27.0 to process long files (e.g. 0.2 MB). However, it is around 5% *faster* -than CommonMark when parsing a shorter file (27 kB) (measured by parsing the -same file 200 times over). This test suite is performed by using the Markdown -[syntax page], modified to avoid the use of the Setext header at the top. The -longer files tested are created by copying the same syntax page onto itself, -thereby doubling the length of the file with each iteration. +9. CriticMarkup, Abbreviations, Glossary Terms, and Citations are handled +slightly differently. See the QuickStart guide for more information. -The largest file I test is approximately 108 MB (4096 copies of the syntax -page). On my machine (2012 Mac mini with 2.3 GHz Intel Core i7, 16 GB RAM), -it takes approximately 4.4 seconds to parse with MMD 6 and 3.7 seconds with -CommonMark. MMD 6 processes approximately 25 MB/s on this test file. -CommonMark 0.27.0 gets about 29 MB/s on the same machine. +10. Fenced code blocks can use leading/trailing "fences" of 3, 4, or 5 +backticks in length. 
That should be sufficient for complex documents without +requiring a more complex parser. If there is no trailing fence, then the +fenced block is considered to go through the end of the document (see the example after this list). -There are some slight variations with the smaller test files (8-32 copies), -but overall the performance of both programs (MMD 6 and CommonMark) is -roughly linear as the test file gets bigger (double the file size and it takes -twice as long to parse, aka O(n)). +11. Emph and Strong parsing is conceptually the same, but the implementation +is different. It is designed for speed, accuracy, and consistency. In +general, it seems to handle edge cases much more reliably, but there are still +a couple of situations that I would like to take into account, if possible. +These are not situations that should occur often in "real life." -Out of curiosity, I ran the same tests on the original Markdown.pl by Gruber -(v 1.0.2b8). It took approximately 178 seconds to parse 128 copies of the -file (3.4 MB) and was demonstrating quadratic performance characteristics -(double the file size and it takes 2^2 or 4 times longer to process, aka -O(n^2)). I didn't bother running it on larger versions of the test file. For -comparison, MMD 6 can process 128 copies in approximately 140 msec. +12. EPUB 3 output is supported without the need for any external tools. -Of note, the throughput speed drops when testing more complicated files -containing more advanced MultiMarkdown features, though it still seems to -maintain linear performance characteristics. A second test file is created by -concatenating all of the test suite files (including the Markdown syntax -file). In this case, MMD gets about 13 MB/s. CommonMark doesn't support -these additional features, so testing it with that file is not relevant. I -will work to see whether there are certain features in particular that are -more challenging and see whether they can be reworked to improve performance. +13. Internationalization support for HTML phrases, such as "see footnote". See +[GitHub](https://github.com/fletcher/MultiMarkdown-6/issues/37) for more +information. -As above, I have done some high-level optimization of the parse strategy, but -I'm sure there's still a lot of room for further improvement to be made. -Suggestions welcome!
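As a quick illustration of item 10 above (an invented example, not one taken from the test suite, and assuming that a fenced block is only closed by a trailing fence of the same length as the one that opened it), a longer fence can wrap a shorter one, which covers the common case of documenting fenced code blocks inside a fenced block:

    ````
    A four-backtick fence can safely contain a three-backtick fence:

    ```
    printf("hello, world\n");
    ```
    ````

Limiting fences to exactly 3, 4, or 5 backticks also keeps the grammar simple: each fence length maps to its own line token (the LINE_FENCE_BACKTICK_START_3/4/5 rules that appear in the parser tables further down), so the parser never has to match fences of arbitrary length.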
## License ## diff --git a/Sources/libMultiMarkdown/aho-corasick.c b/Sources/libMultiMarkdown/aho-corasick.c index ecd9ac4f..fb3255f6 100644 --- a/Sources/libMultiMarkdown/aho-corasick.c +++ b/Sources/libMultiMarkdown/aho-corasick.c @@ -195,13 +195,13 @@ size_t trie_node_search(trie * a, size_t s, const char * query) { return s; } - if (a->node[s].child[(int)query[0]] == 0) { + if (a->node[s].child[(unsigned char)query[0]] == 0) { // Failed to match return -1; } // Partial match, keep going - return trie_node_search(a, a->node[s].child[(int)query[0]], query + 1); + return trie_node_search(a, a->node[s].child[(unsigned char)query[0]], query + 1); } @@ -331,6 +331,7 @@ match * match_new(size_t start, size_t len, unsigned short match_type) { m->len = len; m->match_type = match_type; m->next = NULL; + m->prev = NULL; } return m; @@ -373,13 +374,13 @@ match * ac_trie_search(trie * a, const char * source, size_t start, size_t len) size_t temp_state; // Character being compared - int test_value; + unsigned char test_value; size_t counter = start; size_t stop = start + len; while ((counter < stop) && (source[counter] != '\0')) { // Read next character - test_value = (int)source[counter++]; + test_value = (unsigned char)source[counter++]; // Check for path that allows us to match next character while (state != 0 && a->node[state].child[test_value] == 0) { @@ -483,7 +484,8 @@ void match_set_filter_leftmost_longest(match * header) { } } - while (m->prev->len && + while (m->prev && + m->prev->len && m->prev->start >= m->start) { // We are "lefter" than previous n = m->prev; @@ -515,7 +517,7 @@ void Test_aho_trie_search(CuTest* tc) { ac_trie_prepare(a); - match * m = ac_trie_search(a, "this is a bar that serves food.", 31); + match * m = ac_trie_search(a, "this is a bar that serves food.", 0, 31); match_free(m); trie_free(a); diff --git a/Sources/libMultiMarkdown/html.c b/Sources/libMultiMarkdown/html.c index c6bf025e..1e54f6ce 100644 --- a/Sources/libMultiMarkdown/html.c +++ b/Sources/libMultiMarkdown/html.c @@ -1353,6 +1353,7 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc break; case PAIR_CRITIC_SUB_DEL: if ((scratch->extensions & EXT_CRITIC) && + (t->next) && (t->next->type == PAIR_CRITIC_SUB_ADD)) { t->child->type = TEXT_EMPTY; t->child->mate->type = TEXT_EMPTY; @@ -1371,6 +1372,7 @@ void mmd_export_token_html(DString * out, const char * source, token * t, scratc break; case PAIR_CRITIC_SUB_ADD: if ((scratch->extensions & EXT_CRITIC) && + (t->prev) && (t->prev->type == PAIR_CRITIC_SUB_DEL)) { t->child->type = TEXT_EMPTY; t->child->mate->type = TEXT_EMPTY; diff --git a/Sources/libMultiMarkdown/latex.c b/Sources/libMultiMarkdown/latex.c index 617f0649..ae05e4a6 100644 --- a/Sources/libMultiMarkdown/latex.c +++ b/Sources/libMultiMarkdown/latex.c @@ -787,6 +787,39 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat case COLON: print_const(":"); break; + case CRITIC_ADD_OPEN: + print_const("\\{++"); + break; + case CRITIC_ADD_CLOSE: + print_const("++\\}"); + break; + case CRITIC_COM_OPEN: + print_const("\\{>>"); + break; + case CRITIC_COM_CLOSE: + print_const("<<\\}"); + break; + case CRITIC_DEL_OPEN: + print_const("\\{--"); + break; + case CRITIC_DEL_CLOSE: + print_const("--\\}"); + break; + case CRITIC_HI_OPEN: + print_const("\\{=="); + break; + case CRITIC_HI_CLOSE: + print_const("==\\}"); + break; + case CRITIC_SUB_OPEN: + print_const("\\{~~"); + break; + case CRITIC_SUB_DIV: + print_const("~>"); + break; + case CRITIC_SUB_CLOSE: + 
print_const("~~\\}"); + break; case DASH_M: if (!(scratch->extensions & EXT_SMART)) { print_token(t); @@ -1345,6 +1378,7 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat break; case PAIR_CRITIC_SUB_DEL: if ((scratch->extensions & EXT_CRITIC) && + (t->next) && (t->next->type == PAIR_CRITIC_SUB_ADD)) { t->child->type = TEXT_EMPTY; t->child->mate->type = TEXT_EMPTY; @@ -1363,6 +1397,7 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat break; case PAIR_CRITIC_SUB_ADD: if ((scratch->extensions & EXT_CRITIC) && + (t->prev) && (t->prev->type == PAIR_CRITIC_SUB_DEL)) { t->child->type = TEXT_EMPTY; t->child->mate->type = TEXT_EMPTY; @@ -1419,7 +1454,7 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat break; case QUOTE_DOUBLE: if ((t->mate == NULL) || (!(scratch->extensions & EXT_SMART))) - print_const("""); + print_const("''"); else (t->start < t->mate->start) ? ( print_localized(QUOTE_LEFT_DOUBLE) ) : ( print_localized(QUOTE_RIGHT_DOUBLE) ); break; @@ -1537,6 +1572,9 @@ void mmd_export_token_latex(DString * out, const char * source, token * t, scrat case TEXT_PLAIN: print_token(t); break; + case TOC: + print_const("\\{\\{TOC\\}\\}"); + break; case UL: print_const("\\_"); break; @@ -1624,6 +1662,39 @@ void mmd_export_token_latex_tt(DString * out, const char * source, token * t, sc case ANGLE_RIGHT: print_const("$>$"); break; + case CRITIC_ADD_OPEN: + print_const("\\{++"); + break; + case CRITIC_ADD_CLOSE: + print_const("++\\}"); + break; + case CRITIC_COM_OPEN: + print_const("\\{>>"); + break; + case CRITIC_COM_CLOSE: + print_const("<<\\}"); + break; + case CRITIC_DEL_OPEN: + print_const("\\{--"); + break; + case CRITIC_DEL_CLOSE: + print_const("--\\}"); + break; + case CRITIC_HI_OPEN: + print_const("\\{=="); + break; + case CRITIC_HI_CLOSE: + print_const("==\\}"); + break; + case CRITIC_SUB_OPEN: + print_const("\\{~~"); + break; + case CRITIC_SUB_DIV: + print_const("~>"); + break; + case CRITIC_SUB_CLOSE: + print_const("~~\\}"); + break; case DASH_N: if (t->len == 1) { print_const("-"); diff --git a/Sources/libMultiMarkdown/lexer.c b/Sources/libMultiMarkdown/lexer.c index 9622db53..19520d1d 100644 --- a/Sources/libMultiMarkdown/lexer.c +++ b/Sources/libMultiMarkdown/lexer.c @@ -1,4 +1,4 @@ -/* Generated by re2c 0.14.3 on Fri Mar 10 13:21:57 2017 */ +/* Generated by re2c 0.14.3 on Wed Mar 15 00:32:21 2017 */ /** MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more. 
@@ -61,12 +61,11 @@ // Basic scanner struct -#define YYCTYPE char +#define YYCTYPE unsigned char #define YYCURSOR s->cur #define YYMARKER s->ptr #define YYCTXMARKER s->ctx - int scan(Scanner * s, const char * stop) { scan: @@ -85,19 +84,19 @@ int scan(Scanner * s, const char * stop) { yych = *YYCURSOR; switch (yych) { case '\t': goto yy45; - case '\n': goto yy53; - case '\r': goto yy55; + case '\n': goto yy54; + case '\r': goto yy56; case ' ': goto yy47; case '!': goto yy18; case '"': goto yy28; - case '#': goto yy48; + case '#': goto yy49; case '$': goto yy41; - case '%': goto yy50; + case '%': goto yy51; case '&': goto yy35; case '\'': goto yy30; case '(': goto yy20; case ')': goto yy22; - case '*': goto yy56; + case '*': goto yy57; case '+': goto yy4; case '-': goto yy6; case '.': goto yy32; @@ -111,7 +110,7 @@ int scan(Scanner * s, const char * stop) { case '6': case '7': case '8': - case '9': goto yy52; + case '9': goto yy53; case ':': goto yy33; case '<': goto yy8; case '=': goto yy12; @@ -120,24 +119,25 @@ int scan(Scanner * s, const char * stop) { case '\\': goto yy39; case ']': goto yy16; case '^': goto yy43; - case '_': goto yy58; - case '`': goto yy60; + case '_': goto yy59; + case '`': goto yy61; case '{': goto yy2; - case '|': goto yy62; + case '|': goto yy63; case '}': goto yy26; case '~': goto yy10; - default: goto yy64; + case 0xA0: goto yy48; + default: goto yy65; } yy2: yyaccept = 0; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case '+': goto yy264; - case '-': goto yy263; - case '=': goto yy260; - case '>': goto yy262; - case '{': goto yy258; - case '~': goto yy261; + case '+': goto yy267; + case '-': goto yy266; + case '=': goto yy263; + case '>': goto yy265; + case '{': goto yy261; + case '~': goto yy264; default: goto yy3; } yy3: @@ -146,7 +146,7 @@ int scan(Scanner * s, const char * stop) { yyaccept = 1; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case '+': goto yy255; + case '+': goto yy258; default: goto yy5; } yy5: @@ -154,7 +154,7 @@ int scan(Scanner * s, const char * stop) { yy6: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '-': goto yy249; + case '-': goto yy252; default: goto yy7; } yy7: @@ -163,7 +163,7 @@ int scan(Scanner * s, const char * stop) { yyaccept = 2; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case '<': goto yy246; + case '<': goto yy249; default: goto yy9; } yy9: @@ -172,8 +172,8 @@ int scan(Scanner * s, const char * stop) { yyaccept = 3; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case '>': goto yy242; - case '~': goto yy241; + case '>': goto yy245; + case '~': goto yy244; default: goto yy11; } yy11: @@ -182,7 +182,7 @@ int scan(Scanner * s, const char * stop) { yyaccept = 4; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case '=': goto yy238; + case '=': goto yy241; default: goto yy13; } yy13: @@ -190,11 +190,11 @@ int scan(Scanner * s, const char * stop) { yy14: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '#': goto yy234; - case '%': goto yy228; - case '>': goto yy236; - case '?': goto yy230; - case '^': goto yy232; + case '#': goto yy237; + case '%': goto yy231; + case '>': goto yy239; + case '?': goto yy233; + case '^': goto yy235; default: goto yy15; } yy15: @@ -205,7 +205,7 @@ int scan(Scanner * s, const char * stop) { yy18: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '[': goto yy226; + case '[': goto yy229; default: goto yy19; } yy19: @@ -222,7 +222,7 @@ int scan(Scanner * s, const char * stop) { yy26: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '}': goto yy224; + case '}': goto yy227; 
default: goto yy27; } yy27: @@ -233,7 +233,7 @@ int scan(Scanner * s, const char * stop) { yy30: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '\'': goto yy222; + case '\'': goto yy225; default: goto yy31; } yy31: @@ -242,8 +242,8 @@ int scan(Scanner * s, const char * stop) { yyaccept = 5; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case ' ': goto yy214; - case '.': goto yy215; + case ' ': goto yy217; + case '.': goto yy218; default: goto yy19; } yy33: @@ -254,7 +254,7 @@ int scan(Scanner * s, const char * stop) { yych = *(YYMARKER = ++YYCURSOR); switch (yych) { case 'A': - case 'a': goto yy209; + case 'a': goto yy212; default: goto yy36; } yy36: @@ -265,41 +265,41 @@ int scan(Scanner * s, const char * stop) { yy39: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '\n': goto yy132; - case '\r': goto yy134; - case ' ': goto yy137; - case '!': goto yy197; - case '"': goto yy187; - case '#': goto yy167; - case '$': goto yy165; - case '%': goto yy163; - case '&': goto yy151; - case '\'': goto yy185; - case '(': goto yy179; - case ')': goto yy177; - case '*': goto yy143; - case '+': goto yy161; - case ',': goto yy193; - case '-': goto yy159; - case '.': goto yy199; - case '/': goto yy147; - case ':': goto yy189; - case ';': goto yy191; - case '<': goto yy155; - case '=': goto yy157; - case '>': goto yy153; - case '?': goto yy195; - case '@': goto yy149; - case '[': goto yy171; - case '\\': goto yy135; - case ']': goto yy169; - case '^': goto yy145; - case '_': goto yy141; - case '`': goto yy183; - case '{': goto yy175; - case '|': goto yy139; - case '}': goto yy173; - case '~': goto yy181; + case '\n': goto yy135; + case '\r': goto yy137; + case ' ': goto yy140; + case '!': goto yy200; + case '"': goto yy190; + case '#': goto yy170; + case '$': goto yy168; + case '%': goto yy166; + case '&': goto yy154; + case '\'': goto yy188; + case '(': goto yy182; + case ')': goto yy180; + case '*': goto yy146; + case '+': goto yy164; + case ',': goto yy196; + case '-': goto yy162; + case '.': goto yy202; + case '/': goto yy150; + case ':': goto yy192; + case ';': goto yy194; + case '<': goto yy158; + case '=': goto yy160; + case '>': goto yy156; + case '?': goto yy198; + case '@': goto yy152; + case '[': goto yy174; + case '\\': goto yy138; + case ']': goto yy172; + case '^': goto yy148; + case '_': goto yy144; + case '`': goto yy186; + case '{': goto yy178; + case '|': goto yy142; + case '}': goto yy176; + case '~': goto yy184; default: goto yy40; } yy40: @@ -307,7 +307,7 @@ int scan(Scanner * s, const char * stop) { yy41: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '$': goto yy130; + case '$': goto yy133; default: goto yy42; } yy42: @@ -319,14 +319,26 @@ int scan(Scanner * s, const char * stop) { ++YYCURSOR; { return INDENT_TAB; } yy47: + YYCTXMARKER = YYCURSOR + 1; yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy53; - case '\r': goto yy121; - case ' ': goto yy119; + case '\t': goto yy120; + case '\n': goto yy54; + case '\r': goto yy132; + case ' ': + case 0xA0: goto yy122; default: goto yy19; } yy48: + YYCTXMARKER = YYCURSOR + 1; + yych = *++YYCURSOR; + switch (yych) { + case '\t': goto yy120; + case ' ': + case 0xA0: goto yy122; + default: goto yy19; + } +yy49: YYCTXMARKER = YYCURSOR + 1; yyaccept = 7; yych = *(YYMARKER = ++YYCURSOR); @@ -334,20 +346,21 @@ int scan(Scanner * s, const char * stop) { case '\t': case '\n': case '\r': - case ' ': goto yy80; - case '#': goto yy78; - default: goto yy49; + case ' ': + case 0xA0: goto yy81; + case '#': goto yy79; + default: goto yy50; } 
-yy49: - { return TEXT_HASH; } yy50: + { return TEXT_HASH; } +yy51: ++YYCURSOR; { return TEXT_PERCENT; } -yy52: +yy53: yyaccept = 5; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case '.': goto yy69; + case '.': goto yy70; case '0': case '1': case '2': @@ -357,67 +370,68 @@ int scan(Scanner * s, const char * stop) { case '6': case '7': case '8': - case '9': goto yy71; + case '9': goto yy72; default: goto yy19; } -yy53: - ++YYCURSOR; yy54: - { return TEXT_NL; } + ++YYCURSOR; yy55: + { return TEXT_NL; } +yy56: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy53; - default: goto yy54; + case '\n': goto yy54; + default: goto yy55; } -yy56: +yy57: ++YYCURSOR; { return STAR; } -yy58: +yy59: ++YYCURSOR; { return UL; } -yy60: +yy61: ++YYCURSOR; yych = *YYCURSOR; - goto yy68; -yy61: - { return BACKTICK; } + goto yy69; yy62: + { return BACKTICK; } +yy63: ++YYCURSOR; yych = *YYCURSOR; - goto yy66; -yy63: - { return PIPE; } + goto yy67; yy64: + { return PIPE; } +yy65: yych = *++YYCURSOR; goto yy19; -yy65: +yy66: ++YYCURSOR; yych = *YYCURSOR; -yy66: +yy67: switch (yych) { - case '|': goto yy65; - default: goto yy63; + case '|': goto yy66; + default: goto yy64; } -yy67: +yy68: ++YYCURSOR; yych = *YYCURSOR; -yy68: +yy69: switch (yych) { - case '`': goto yy67; - default: goto yy61; + case '`': goto yy68; + default: goto yy62; } -yy69: +yy70: YYCTXMARKER = YYCURSOR + 1; yych = *++YYCURSOR; switch (yych) { case '\t': - case ' ': goto yy76; - case '\n': goto yy73; - case '\r': goto yy75; - default: goto yy70; + case ' ': + case 0xA0: goto yy77; + case '\n': goto yy74; + case '\r': goto yy76; + default: goto yy71; } -yy70: +yy71: YYCURSOR = YYMARKER; switch (yyaccept) { case 0: goto yy3; @@ -427,15 +441,15 @@ int scan(Scanner * s, const char * stop) { case 4: goto yy13; case 5: goto yy19; case 6: goto yy36; - case 7: goto yy49; - case 8: goto yy127; - default: goto yy259; + case 7: goto yy50; + case 8: goto yy129; + default: goto yy262; } -yy71: +yy72: ++YYCURSOR; yych = *YYCURSOR; switch (yych) { - case '.': goto yy69; + case '.': goto yy70; case '0': case '1': case '2': @@ -445,622 +459,642 @@ int scan(Scanner * s, const char * stop) { case '6': case '7': case '8': - case '9': goto yy71; - default: goto yy70; + case '9': goto yy72; + default: goto yy71; } -yy73: - ++YYCURSOR; yy74: + ++YYCURSOR; +yy75: YYCURSOR = YYCTXMARKER; { return TEXT_NUMBER_POSS_LIST; } -yy75: +yy76: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy73; - default: goto yy74; + case '\n': goto yy74; + default: goto yy75; } -yy76: +yy77: ++YYCURSOR; yych = *YYCURSOR; switch (yych) { case '\t': - case ' ': goto yy76; - default: goto yy74; + case ' ': + case 0xA0: goto yy77; + default: goto yy75; } -yy78: +yy79: YYCTXMARKER = YYCURSOR + 1; yych = *++YYCURSOR; switch (yych) { case '\t': case '\n': case '\r': - case ' ': goto yy86; - case '#': goto yy91; - default: goto yy70; + case ' ': + case 0xA0: goto yy87; + case '#': goto yy92; + default: goto yy71; } -yy79: +yy80: ++YYCURSOR; yych = *YYCURSOR; -yy80: +yy81: switch (yych) { case '\t': - case ' ': goto yy79; - case '\n': goto yy82; - case '\r': goto yy84; - default: goto yy81; + case ' ': + case 0xA0: goto yy80; + case '\n': goto yy83; + case '\r': goto yy85; + default: goto yy82; } -yy81: - { return HASH1; } yy82: - ++YYCURSOR; + { return HASH1; } yy83: + ++YYCURSOR; +yy84: YYCURSOR = YYCTXMARKER; { return HASH1; } -yy84: +yy85: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy82; - default: goto yy83; + case '\n': goto yy83; + default: goto yy84; } -yy85: 
+yy86: ++YYCURSOR; yych = *YYCURSOR; -yy86: +yy87: switch (yych) { case '\t': - case ' ': goto yy85; - case '\n': goto yy88; - case '\r': goto yy90; - default: goto yy87; + case ' ': + case 0xA0: goto yy86; + case '\n': goto yy89; + case '\r': goto yy91; + default: goto yy88; } -yy87: - { return HASH2; } yy88: - ++YYCURSOR; + { return HASH2; } yy89: + ++YYCURSOR; +yy90: YYCURSOR = YYCTXMARKER; { return HASH2; } -yy90: +yy91: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy88; - default: goto yy89; + case '\n': goto yy89; + default: goto yy90; } -yy91: +yy92: YYCTXMARKER = YYCURSOR + 1; yych = *++YYCURSOR; switch (yych) { case '\t': case '\n': case '\r': - case ' ': goto yy94; - case '#': goto yy92; - default: goto yy70; + case ' ': + case 0xA0: goto yy95; + case '#': goto yy93; + default: goto yy71; } -yy92: +yy93: YYCTXMARKER = YYCURSOR + 1; yych = *++YYCURSOR; switch (yych) { case '\t': case '\n': case '\r': - case ' ': goto yy100; - case '#': goto yy105; - default: goto yy70; + case ' ': + case 0xA0: goto yy101; + case '#': goto yy106; + default: goto yy71; } -yy93: +yy94: ++YYCURSOR; yych = *YYCURSOR; -yy94: +yy95: switch (yych) { case '\t': - case ' ': goto yy93; - case '\n': goto yy96; - case '\r': goto yy98; - default: goto yy95; + case ' ': + case 0xA0: goto yy94; + case '\n': goto yy97; + case '\r': goto yy99; + default: goto yy96; } -yy95: - { return HASH3; } yy96: - ++YYCURSOR; + { return HASH3; } yy97: + ++YYCURSOR; +yy98: YYCURSOR = YYCTXMARKER; { return HASH3; } -yy98: +yy99: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy96; - default: goto yy97; + case '\n': goto yy97; + default: goto yy98; } -yy99: +yy100: ++YYCURSOR; yych = *YYCURSOR; -yy100: +yy101: switch (yych) { case '\t': - case ' ': goto yy99; - case '\n': goto yy102; - case '\r': goto yy104; - default: goto yy101; + case ' ': + case 0xA0: goto yy100; + case '\n': goto yy103; + case '\r': goto yy105; + default: goto yy102; } -yy101: - { return HASH4; } yy102: - ++YYCURSOR; + { return HASH4; } yy103: + ++YYCURSOR; +yy104: YYCURSOR = YYCTXMARKER; { return HASH4; } -yy104: +yy105: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy102; - default: goto yy103; + case '\n': goto yy103; + default: goto yy104; } -yy105: +yy106: YYCTXMARKER = YYCURSOR + 1; yych = *++YYCURSOR; switch (yych) { case '\t': case '\n': case '\r': - case ' ': goto yy108; - case '#': goto yy106; - default: goto yy70; + case ' ': + case 0xA0: goto yy109; + case '#': goto yy107; + default: goto yy71; } -yy106: +yy107: YYCTXMARKER = YYCURSOR + 1; yych = *++YYCURSOR; switch (yych) { case '\t': case '\n': case '\r': - case ' ': goto yy114; - default: goto yy70; + case ' ': + case 0xA0: goto yy115; + default: goto yy71; } -yy107: +yy108: ++YYCURSOR; yych = *YYCURSOR; -yy108: +yy109: switch (yych) { case '\t': - case ' ': goto yy107; - case '\n': goto yy110; - case '\r': goto yy112; - default: goto yy109; + case ' ': + case 0xA0: goto yy108; + case '\n': goto yy111; + case '\r': goto yy113; + default: goto yy110; } -yy109: - { return HASH5; } yy110: - ++YYCURSOR; + { return HASH5; } yy111: + ++YYCURSOR; +yy112: YYCURSOR = YYCTXMARKER; { return HASH5; } -yy112: +yy113: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy110; - default: goto yy111; + case '\n': goto yy111; + default: goto yy112; } -yy113: +yy114: ++YYCURSOR; yych = *YYCURSOR; -yy114: +yy115: switch (yych) { case '\t': - case ' ': goto yy113; - case '\n': goto yy116; - case '\r': goto yy118; - default: goto yy115; + case ' ': + case 0xA0: goto yy114; + case '\n': 
goto yy117; + case '\r': goto yy119; + default: goto yy116; } -yy115: - { return HASH6; } yy116: - ++YYCURSOR; + { return HASH6; } yy117: + ++YYCURSOR; +yy118: YYCURSOR = YYCTXMARKER; { return HASH6; } -yy118: +yy119: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy116; - default: goto yy117; + case '\n': goto yy117; + default: goto yy118; } -yy119: +yy120: + ++YYCURSOR; + YYCURSOR = YYCTXMARKER; + { return NON_INDENT_SPACE; } +yy122: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '\n': goto yy123; - case '\r': goto yy125; - case ' ': goto yy122; - default: goto yy120; + case '\n': goto yy125; + case '\r': goto yy127; + case ' ': + case 0xA0: goto yy124; + default: goto yy123; } -yy120: +yy123: { return NON_INDENT_SPACE; } -yy121: - yych = *++YYCURSOR; - switch (yych) { - case '\n': goto yy53; - default: goto yy54; - } -yy122: +yy124: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy123; - case '\r': goto yy125; - case ' ': goto yy126; - default: goto yy120; + case '\n': goto yy125; + case '\r': goto yy127; + case ' ': + case 0xA0: goto yy128; + default: goto yy123; } -yy123: +yy125: ++YYCURSOR; -yy124: +yy126: { return TEXT_LINEBREAK; } -yy125: +yy127: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy123; - default: goto yy124; + case '\n': goto yy125; + default: goto yy126; } -yy126: +yy128: yyaccept = 8; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case '\n': goto yy123; - case '\r': goto yy125; - case ' ': goto yy128; - default: goto yy127; + case '\n': goto yy125; + case '\r': goto yy127; + case ' ': + case 0xA0: goto yy130; + default: goto yy129; } -yy127: +yy129: { return INDENT_SPACE; } -yy128: +yy130: ++YYCURSOR; yych = *YYCURSOR; switch (yych) { - case '\n': goto yy123; - case '\r': goto yy125; - case ' ': goto yy128; - default: goto yy70; + case '\n': goto yy125; + case '\r': goto yy127; + case ' ': + case 0xA0: goto yy130; + default: goto yy71; } -yy130: +yy132: + yych = *++YYCURSOR; + switch (yych) { + case '\n': goto yy54; + default: goto yy55; + } +yy133: ++YYCURSOR; { return MATH_DOLLAR_DOUBLE; } -yy132: +yy135: ++YYCURSOR; -yy133: +yy136: { return TEXT_LINEBREAK; } -yy134: +yy137: yych = *++YYCURSOR; switch (yych) { - case '\n': goto yy132; - default: goto yy133; + case '\n': goto yy135; + default: goto yy136; } -yy135: +yy138: ++YYCURSOR; switch ((yych = *YYCURSOR)) { - case '(': goto yy201; - case ')': goto yy203; - case '[': goto yy205; - case ']': goto yy207; - default: goto yy136; + case '(': goto yy204; + case ')': goto yy206; + case '[': goto yy208; + case ']': goto yy210; + default: goto yy139; } -yy136: +yy139: { return ESCAPED_CHARACTER; } -yy137: +yy140: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy139: +yy142: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy141: +yy144: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy143: +yy146: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy145: +yy148: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy147: +yy150: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy149: +yy152: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy151: +yy154: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy153: +yy156: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy155: +yy158: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy157: +yy160: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy159: +yy162: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy161: +yy164: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy163: +yy166: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy165: +yy168: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy167: +yy170: ++YYCURSOR; { return 
ESCAPED_CHARACTER; } -yy169: +yy172: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy171: +yy174: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy173: +yy176: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy175: +yy178: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy177: +yy180: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy179: +yy182: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy181: +yy184: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy183: +yy186: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy185: +yy188: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy187: +yy190: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy189: +yy192: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy191: +yy194: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy193: +yy196: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy195: +yy198: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy197: +yy200: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy199: +yy202: ++YYCURSOR; { return ESCAPED_CHARACTER; } -yy201: +yy204: ++YYCURSOR; { return MATH_PAREN_OPEN; } -yy203: +yy206: ++YYCURSOR; { return MATH_PAREN_CLOSE; } -yy205: +yy208: ++YYCURSOR; { return MATH_BRACKET_OPEN; } -yy207: +yy210: ++YYCURSOR; { return MATH_BRACKET_CLOSE; } -yy209: +yy212: yych = *++YYCURSOR; switch (yych) { case 'M': - case 'm': goto yy210; - default: goto yy70; + case 'm': goto yy213; + default: goto yy71; } -yy210: +yy213: yych = *++YYCURSOR; switch (yych) { case 'P': - case 'p': goto yy211; - default: goto yy70; + case 'p': goto yy214; + default: goto yy71; } -yy211: +yy214: yych = *++YYCURSOR; switch (yych) { - case ';': goto yy212; - default: goto yy70; + case ';': goto yy215; + default: goto yy71; } -yy212: +yy215: ++YYCURSOR; { return AMPERSAND_LONG; } -yy214: +yy217: yych = *++YYCURSOR; switch (yych) { - case '.': goto yy218; - default: goto yy70; + case '.': goto yy221; + default: goto yy71; } -yy215: +yy218: yych = *++YYCURSOR; switch (yych) { - case '.': goto yy216; - default: goto yy70; + case '.': goto yy219; + default: goto yy71; } -yy216: +yy219: ++YYCURSOR; { return ELLIPSIS; } -yy218: +yy221: yych = *++YYCURSOR; switch (yych) { - case ' ': goto yy219; - default: goto yy70; + case ' ': goto yy222; + default: goto yy71; } -yy219: +yy222: yych = *++YYCURSOR; switch (yych) { - case '.': goto yy220; - default: goto yy70; + case '.': goto yy223; + default: goto yy71; } -yy220: +yy223: ++YYCURSOR; { return ELLIPSIS; } -yy222: +yy225: ++YYCURSOR; { return QUOTE_RIGHT_ALT; } -yy224: +yy227: ++YYCURSOR; { return BRACE_DOUBLE_RIGHT; } -yy226: +yy229: ++YYCURSOR; { return BRACKET_IMAGE_LEFT; } -yy228: +yy231: ++YYCURSOR; { return BRACKET_VARIABLE_LEFT; } -yy230: +yy233: ++YYCURSOR; { return BRACKET_GLOSSARY_LEFT; } -yy232: +yy235: ++YYCURSOR; { return BRACKET_FOOTNOTE_LEFT; } -yy234: +yy237: ++YYCURSOR; { return BRACKET_CITATION_LEFT; } -yy236: +yy239: ++YYCURSOR; { return BRACKET_ABBREVIATION_LEFT; } -yy238: +yy241: yych = *++YYCURSOR; switch (yych) { - case '}': goto yy239; - default: goto yy70; + case '}': goto yy242; + default: goto yy71; } -yy239: +yy242: ++YYCURSOR; { return CRITIC_HI_CLOSE; } -yy241: +yy244: yych = *++YYCURSOR; switch (yych) { - case '}': goto yy244; - default: goto yy70; + case '}': goto yy247; + default: goto yy71; } -yy242: +yy245: ++YYCURSOR; { return CRITIC_SUB_DIV; } -yy244: +yy247: ++YYCURSOR; { return CRITIC_SUB_CLOSE; } -yy246: +yy249: yych = *++YYCURSOR; switch (yych) { - case '}': goto yy247; - default: goto yy70; + case '}': goto yy250; + default: goto yy71; } -yy247: +yy250: ++YYCURSOR; { return CRITIC_COM_CLOSE; } -yy249: +yy252: ++YYCURSOR; 
switch ((yych = *YYCURSOR)) { - case '-': goto yy253; - case '}': goto yy251; - default: goto yy250; + case '-': goto yy256; + case '}': goto yy254; + default: goto yy253; } -yy250: +yy253: { return DASH_N; } -yy251: +yy254: ++YYCURSOR; { return CRITIC_DEL_CLOSE; } -yy253: +yy256: ++YYCURSOR; { return DASH_M; } -yy255: +yy258: yych = *++YYCURSOR; switch (yych) { - case '}': goto yy256; - default: goto yy70; + case '}': goto yy259; + default: goto yy71; } -yy256: +yy259: ++YYCURSOR; { return CRITIC_ADD_CLOSE; } -yy258: +yy261: yyaccept = 9; yych = *(YYMARKER = ++YYCURSOR); switch (yych) { - case 'T': goto yy275; - default: goto yy259; + case 'T': goto yy278; + default: goto yy262; } -yy259: +yy262: { return BRACE_DOUBLE_LEFT; } -yy260: +yy263: yych = *++YYCURSOR; switch (yych) { - case '=': goto yy273; - default: goto yy70; + case '=': goto yy276; + default: goto yy71; } -yy261: +yy264: yych = *++YYCURSOR; switch (yych) { - case '~': goto yy271; - default: goto yy70; + case '~': goto yy274; + default: goto yy71; } -yy262: +yy265: yych = *++YYCURSOR; switch (yych) { - case '>': goto yy269; - default: goto yy70; + case '>': goto yy272; + default: goto yy71; } -yy263: +yy266: yych = *++YYCURSOR; switch (yych) { - case '-': goto yy267; - default: goto yy70; + case '-': goto yy270; + default: goto yy71; } -yy264: +yy267: yych = *++YYCURSOR; switch (yych) { - case '+': goto yy265; - default: goto yy70; + case '+': goto yy268; + default: goto yy71; } -yy265: +yy268: ++YYCURSOR; { return CRITIC_ADD_OPEN; } -yy267: +yy270: ++YYCURSOR; { return CRITIC_DEL_OPEN; } -yy269: +yy272: ++YYCURSOR; { return CRITIC_COM_OPEN; } -yy271: +yy274: ++YYCURSOR; { return CRITIC_SUB_OPEN; } -yy273: +yy276: ++YYCURSOR; { return CRITIC_HI_OPEN; } -yy275: +yy278: yych = *++YYCURSOR; switch (yych) { - case 'O': goto yy276; - default: goto yy70; + case 'O': goto yy279; + default: goto yy71; } -yy276: +yy279: yych = *++YYCURSOR; switch (yych) { - case 'C': goto yy277; - default: goto yy70; + case 'C': goto yy280; + default: goto yy71; } -yy277: +yy280: yych = *++YYCURSOR; switch (yych) { - case '}': goto yy278; - default: goto yy70; + case '}': goto yy281; + default: goto yy71; } -yy278: +yy281: yych = *++YYCURSOR; switch (yych) { - case '}': goto yy279; - default: goto yy70; + case '}': goto yy282; + default: goto yy71; } -yy279: +yy282: ++YYCURSOR; { return TOC; } } diff --git a/Sources/libMultiMarkdown/lexer.re b/Sources/libMultiMarkdown/lexer.re index 9648f66f..998169f9 100644 --- a/Sources/libMultiMarkdown/lexer.re +++ b/Sources/libMultiMarkdown/lexer.re @@ -60,12 +60,11 @@ // Basic scanner struct -#define YYCTYPE char +#define YYCTYPE unsigned char #define YYCURSOR s->cur #define YYMARKER s->ptr #define YYCTXMARKER s->ctx - int scan(Scanner * s, const char * stop) { scan: @@ -80,15 +79,16 @@ int scan(Scanner * s, const char * stop) { re2c:yyfill:enable = 0; NL = "\r\n" | '\n' | '\r'; - SP = [ \t]+; + WS = [ \t\240]; // Whitespace from char_lookup.c + SP = WS+; - SPNL = [ \t]* NL; + SPNL = WS* NL; INDENT_TAB = '\t'; - INDENT_SPACE = ' '{4}; - NON_INDENT_SPACE = ' '{2,3}; + INDENT_SPACE = [ \240]{4}; + NON_INDENT_SPACE = [ \240]{2,3}; - TEXT_LINEBREAK = ' '{2,} NL; + TEXT_LINEBREAK = [ \240]{2,} NL; // The order of these seems to matter @@ -226,6 +226,7 @@ int scan(Scanner * s, const char * stop) { ' '? 
NL { return TEXT_NL; } NON_INDENT_SPACE { return NON_INDENT_SPACE; } + [ \240] / '\t' { return NON_INDENT_SPACE; } "*" { return STAR; } "+" { return PLUS; } diff --git a/Sources/libMultiMarkdown/mmd.c b/Sources/libMultiMarkdown/mmd.c index 53325997..ff003316 100644 --- a/Sources/libMultiMarkdown/mmd.c +++ b/Sources/libMultiMarkdown/mmd.c @@ -402,22 +402,26 @@ void mmd_assign_line_type(mmd_engine * e, token * line) { case HASH4: case HASH5: case HASH6: - line->type = (line->child->type - HASH1) + LINE_ATX_1; - line->child->type = (line->type - LINE_ATX_1) + MARKER_H1; - - // Strip trailing whitespace from '#' sequence - line->child->len = line->child->type - MARKER_H1 + 1; - - // Strip trailing '#' sequence if present - if (line->child->tail->type == TEXT_NL) { - if ((line->child->tail->prev->type >= HASH1) && - (line->child->tail->prev->type <= HASH6)) - line->child->tail->prev->type = TEXT_EMPTY; + if (scan_atx(&source[line->child->start])) { + line->type = (line->child->type - HASH1) + LINE_ATX_1; + line->child->type = (line->type - LINE_ATX_1) + MARKER_H1; + + // Strip trailing whitespace from '#' sequence + line->child->len = line->child->type - MARKER_H1 + 1; + + // Strip trailing '#' sequence if present + if (line->child->tail->type == TEXT_NL) { + if ((line->child->tail->prev->type >= HASH1) && + (line->child->tail->prev->type <= HASH6)) + line->child->tail->prev->type = TEXT_EMPTY; + } else { +// token_describe(line->child->tail, NULL); + if ((line->child->tail->type >= HASH1) && + (line->child->tail->type <= HASH6)) + line->child->tail->type = TEXT_EMPTY; + } } else { - token_describe(line->child->tail, NULL); - if ((line->child->tail->type >= HASH1) && - (line->child->tail->type <= HASH6)) - line->child->tail->type = TEXT_EMPTY; + line->type = LINE_PLAIN; } break; case TEXT_NUMBER_POSS_LIST: @@ -813,8 +817,13 @@ token * mmd_tokenize_string(mmd_engine * e, const char * str, size_t len, bool s token_append_child(line, t); } } + } else if (type == 0 && stop > last_stop) { + // Source text ends without newline + t = token_new(TEXT_PLAIN, (size_t)(last_stop - str), (size_t)(stop - last_stop)); + token_append_child(line, t); } + switch (type) { case 0: // 0 means we finished with input @@ -1316,12 +1325,17 @@ void mmd_assign_ambidextrous_tokens_in_block(mmd_engine * e, token * block, cons t->can_close = 0; // Shift next token right and move those characters as child node - if ((t->next != NULL) && ((t->next->type == TEXT_PLAIN) || (t->next->type == TEXT_NUMBER_POSS_LIST))) { - t->next->start += t->len - 1; - t->next->len -= t->len - 1; + // It's possible that one (or more?) tokens are entirely subsumed. 
+ while (t->next && t->next->start + t->next->len < offset) { + tokens_prune(t->next, t->next); + } - t->child = token_new(TEXT_PLAIN, t->start + 1, t->len - 1); + if ((t->next != NULL) && ((t->next->type == TEXT_PLAIN) || (t->next->type == TEXT_NUMBER_POSS_LIST))) { + t->next->len = t->next->start + t->next->len - offset; + t->next->start = offset; } + + t->child = token_new(TEXT_PLAIN, t->start + 1, t->len - 1); } } @@ -1359,7 +1373,8 @@ void pair_emphasis_tokens(token * t) { case STAR: case UL: closer = t->mate; - if ((t->next->mate == closer->prev) && + if (t->next && + (t->next->mate == closer->prev) && (t->type == t->next->type) && (t->next->mate != t) && (t->start+t->len == t->next->start) && @@ -1508,6 +1523,7 @@ void strip_line_tokens_from_metadata(mmd_engine * e, token * metadata) { while (l) { switch (l->type) { case LINE_META: + meta: if (m) { meta_set_value(m, d->str); d_string_erase(d, 0, -1); @@ -1526,9 +1542,16 @@ void strip_line_tokens_from_metadata(mmd_engine * e, token * metadata) { l->len--; } case LINE_PLAIN: + plain: d_string_append_c(d, '\n'); d_string_append_c_array(d, &source[l->start], l->len); break; + case LINE_TABLE: + if (scan_meta_line(&source[l->start])) { + goto meta; + } else { + goto plain; + } default: fprintf(stderr, "ERROR!\n"); token_describe(l, NULL); @@ -1739,6 +1762,26 @@ void strip_line_tokens_from_block(mmd_engine * e, token * block) { // Advance to next line l = l->next; break; + case BLOCK_DEFINITION: + // Sometimes these get created unintentionally inside other blocks + // Process inside it, then treat it like a line to be stripped + + // Change to plain line + l->child->type = LINE_PLAIN; + strip_line_tokens_from_block(e, l); + + // Move children to parent + // Add ':' back + if (e->dstr->str[l->child->start - 1] == ':') { + temp = token_new(COLON, l->child->start - 1, 1); + token_append_child(block, temp); + } + token_append_child(block, l->child); + l->child = NULL; + if (children == NULL) + children = l; + l = l->next; + break; case LINE_TABLE_SEPARATOR: case LINE_TABLE: if (block->type == BLOCK_TABLE_HEADER) { @@ -1751,7 +1794,8 @@ void strip_line_tokens_from_block(mmd_engine * e, token * block) { goto handle_line; } default: - //fprintf(stderr, "Unspecified line type %d inside block type %d\n", l->type, block->type); + // token_describe(block, e->dstr->str); + // fprintf(stderr, "Unspecified line type %d inside block type %d\n", l->type, block->type); // This is a block, need to remove it from chain and // Add to parent temp = l->next; diff --git a/Sources/libMultiMarkdown/odf.c b/Sources/libMultiMarkdown/odf.c index a610d7fe..25f4e586 100644 --- a/Sources/libMultiMarkdown/odf.c +++ b/Sources/libMultiMarkdown/odf.c @@ -1331,6 +1331,7 @@ void mmd_export_token_odf(DString * out, const char * source, token * t, scratch break; case PAIR_CRITIC_SUB_DEL: if ((scratch->extensions & EXT_CRITIC) && + (t->next) && (t->next->type == PAIR_CRITIC_SUB_ADD)) { t->child->type = TEXT_EMPTY; t->child->mate->type = TEXT_EMPTY; @@ -1349,6 +1350,7 @@ void mmd_export_token_odf(DString * out, const char * source, token * t, scratch break; case PAIR_CRITIC_SUB_ADD: if ((scratch->extensions & EXT_CRITIC) && + (t->prev) && (t->prev->type == PAIR_CRITIC_SUB_DEL)) { t->child->type = TEXT_EMPTY; t->child->mate->type = TEXT_EMPTY; diff --git a/Sources/libMultiMarkdown/parser.c b/Sources/libMultiMarkdown/parser.c index 8c4cd996..7926ab19 100644 --- a/Sources/libMultiMarkdown/parser.c +++ b/Sources/libMultiMarkdown/parser.c @@ -108,15 +108,15 @@ typedef union { #define 
ParseARG_STORE yypParser->engine = engine #define YYFALLBACK 1 #define YYNSTATE 44 -#define YYNRULE 135 +#define YYNRULE 136 #define YY_MAX_SHIFT 43 -#define YY_MIN_SHIFTREDUCE 137 -#define YY_MAX_SHIFTREDUCE 271 -#define YY_MIN_REDUCE 272 -#define YY_MAX_REDUCE 406 -#define YY_ERROR_ACTION 407 -#define YY_ACCEPT_ACTION 408 -#define YY_NO_ACTION 409 +#define YY_MIN_SHIFTREDUCE 138 +#define YY_MAX_SHIFTREDUCE 273 +#define YY_MIN_REDUCE 274 +#define YY_MAX_REDUCE 409 +#define YY_ERROR_ACTION 410 +#define YY_ACCEPT_ACTION 411 +#define YY_NO_ACTION 412 /************* End control #defines *******************************************/ /* Define the yytestcase() macro to be a no-op if is not already defined @@ -188,95 +188,93 @@ typedef union { ** yy_default[] Default action for each state. ** *********** Begin parsing tables **********************************************/ -#define YY_ACTTAB_COUNT (278) +#define YY_ACTTAB_COUNT (266) static const YYACTIONTYPE yy_action[] = { - /* 0 */ 408, 1, 139, 32, 149, 150, 151, 152, 153, 154, - /* 10 */ 43, 156, 31, 29, 40, 38, 30, 14, 163, 164, - /* 20 */ 165, 42, 218, 13, 13, 29, 241, 227, 228, 270, - /* 30 */ 34, 34, 27, 266, 26, 25, 42, 40, 38, 221, - /* 40 */ 8, 224, 41, 193, 15, 15, 272, 146, 270, 182, - /* 50 */ 16, 270, 266, 227, 228, 266, 255, 140, 141, 142, - /* 60 */ 143, 144, 145, 232, 7, 6, 17, 4, 3, 2, - /* 70 */ 18, 28, 219, 147, 5, 241, 243, 246, 249, 244, - /* 80 */ 247, 250, 261, 146, 224, 167, 16, 270, 189, 227, - /* 90 */ 228, 266, 255, 140, 141, 142, 143, 144, 145, 232, - /* 100 */ 7, 6, 17, 4, 3, 2, 18, 241, 263, 147, - /* 110 */ 5, 241, 243, 246, 249, 244, 247, 250, 261, 138, - /* 120 */ 32, 149, 150, 151, 152, 153, 154, 43, 156, 31, - /* 130 */ 29, 40, 38, 30, 14, 163, 164, 165, 262, 253, - /* 140 */ 254, 186, 29, 251, 37, 37, 256, 34, 34, 27, - /* 150 */ 257, 26, 25, 23, 40, 38, 216, 8, 199, 41, - /* 160 */ 217, 15, 15, 211, 213, 214, 252, 194, 196, 192, - /* 170 */ 195, 197, 253, 254, 28, 198, 251, 9, 42, 9, - /* 180 */ 13, 13, 33, 33, 172, 10, 10, 19, 19, 166, - /* 190 */ 5, 175, 175, 35, 35, 5, 28, 36, 36, 252, - /* 200 */ 190, 187, 188, 191, 28, 39, 39, 10, 10, 19, - /* 210 */ 19, 179, 234, 174, 174, 10, 10, 19, 19, 28, - /* 220 */ 206, 173, 173, 233, 253, 254, 28, 6, 251, 28, - /* 230 */ 10, 10, 19, 19, 28, 7, 180, 180, 166, 20, - /* 240 */ 20, 166, 24, 24, 201, 11, 11, 21, 21, 28, - /* 250 */ 207, 252, 183, 184, 185, 227, 228, 182, 217, 274, - /* 260 */ 12, 12, 22, 22, 274, 202, 274, 274, 274, 274, - /* 270 */ 274, 274, 274, 274, 274, 274, 274, 200, + /* 0 */ 411, 1, 140, 32, 150, 151, 152, 153, 154, 155, + /* 10 */ 43, 157, 30, 29, 40, 38, 31, 14, 164, 165, + /* 20 */ 166, 42, 219, 13, 13, 29, 268, 272, 37, 37, + /* 30 */ 34, 34, 27, 225, 26, 25, 28, 40, 38, 265, + /* 40 */ 8, 242, 41, 194, 15, 15, 274, 147, 167, 20, + /* 50 */ 20, 16, 228, 229, 268, 272, 256, 141, 142, 143, + /* 60 */ 144, 145, 146, 233, 7, 6, 17, 4, 3, 2, + /* 70 */ 18, 264, 28, 148, 5, 242, 244, 247, 250, 245, + /* 80 */ 248, 251, 263, 147, 167, 24, 24, 16, 228, 229, + /* 90 */ 268, 272, 256, 141, 142, 143, 144, 145, 146, 233, + /* 100 */ 7, 6, 17, 4, 3, 2, 18, 33, 33, 148, + /* 110 */ 5, 242, 244, 247, 250, 245, 248, 251, 263, 139, + /* 120 */ 32, 150, 151, 152, 153, 154, 155, 43, 157, 30, + /* 130 */ 29, 40, 38, 31, 14, 164, 165, 166, 254, 255, + /* 140 */ 252, 23, 29, 268, 272, 35, 35, 34, 34, 27, + /* 150 */ 42, 26, 25, 222, 40, 38, 190, 8, 187, 41, + /* 160 */ 200, 15, 15, 28, 220, 253, 195, 197, 193, 196, + /* 170 */ 198, 254, 255, 252, 10, 10, 19, 19, 
28, 199, + /* 180 */ 176, 176, 9, 42, 9, 13, 13, 214, 215, 10, + /* 190 */ 10, 19, 19, 228, 229, 175, 175, 28, 253, 191, + /* 200 */ 188, 189, 192, 254, 255, 252, 28, 173, 10, 10, + /* 210 */ 19, 19, 167, 5, 174, 174, 183, 10, 10, 19, + /* 220 */ 19, 228, 229, 181, 181, 28, 259, 36, 36, 257, + /* 230 */ 253, 184, 185, 186, 225, 258, 11, 11, 21, 21, + /* 240 */ 28, 208, 235, 28, 201, 39, 39, 218, 217, 212, + /* 250 */ 5, 12, 12, 22, 22, 234, 203, 168, 180, 242, + /* 260 */ 6, 7, 218, 183, 207, 202, }; static const YYCODETYPE yy_lookahead[] = { /* 0 */ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, /* 10 */ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, - /* 20 */ 58, 80, 81, 82, 83, 63, 29, 7, 8, 5, - /* 30 */ 68, 69, 70, 9, 72, 73, 80, 75, 76, 83, - /* 40 */ 78, 6, 80, 71, 82, 83, 0, 1, 5, 29, - /* 50 */ 4, 5, 9, 7, 8, 9, 10, 11, 12, 13, + /* 20 */ 58, 80, 81, 82, 83, 63, 8, 9, 68, 69, + /* 30 */ 68, 69, 70, 4, 72, 73, 48, 75, 76, 4, + /* 40 */ 78, 29, 80, 71, 82, 83, 0, 1, 60, 61, + /* 50 */ 62, 5, 6, 7, 8, 9, 10, 11, 12, 13, /* 60 */ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, - /* 70 */ 24, 48, 29, 27, 28, 29, 30, 31, 32, 33, - /* 80 */ 34, 35, 36, 1, 6, 62, 4, 5, 71, 7, + /* 70 */ 24, 36, 48, 27, 28, 29, 30, 31, 32, 33, + /* 80 */ 34, 35, 36, 1, 60, 61, 62, 5, 6, 7, /* 90 */ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, - /* 100 */ 18, 19, 20, 21, 22, 23, 24, 29, 6, 27, + /* 100 */ 18, 19, 20, 21, 22, 23, 24, 59, 60, 27, /* 110 */ 28, 29, 30, 31, 32, 33, 34, 35, 36, 40, /* 120 */ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, - /* 130 */ 51, 52, 53, 54, 55, 56, 57, 58, 36, 2, - /* 140 */ 3, 71, 63, 6, 68, 69, 6, 68, 69, 70, - /* 150 */ 10, 72, 73, 63, 75, 76, 5, 78, 63, 80, - /* 160 */ 9, 82, 83, 77, 2, 3, 29, 30, 31, 32, - /* 170 */ 33, 34, 2, 3, 48, 74, 6, 79, 80, 81, - /* 180 */ 82, 83, 59, 60, 67, 59, 60, 61, 62, 60, - /* 190 */ 28, 65, 66, 59, 60, 28, 48, 59, 60, 29, - /* 200 */ 30, 31, 32, 33, 48, 59, 60, 59, 60, 61, - /* 210 */ 62, 69, 6, 65, 66, 59, 60, 61, 62, 48, - /* 220 */ 76, 65, 66, 17, 2, 3, 48, 19, 6, 48, - /* 230 */ 59, 60, 61, 62, 48, 18, 65, 66, 60, 61, - /* 240 */ 62, 60, 61, 62, 75, 59, 60, 61, 62, 48, - /* 250 */ 64, 29, 30, 31, 32, 7, 8, 29, 9, 84, - /* 260 */ 59, 60, 61, 62, 84, 64, 84, 84, 84, 84, - /* 270 */ 84, 84, 84, 84, 84, 84, 84, 29, + /* 130 */ 51, 52, 53, 54, 55, 56, 57, 58, 2, 3, + /* 140 */ 4, 63, 63, 8, 9, 59, 60, 68, 69, 70, + /* 150 */ 80, 72, 73, 83, 75, 76, 71, 78, 71, 80, + /* 160 */ 63, 82, 83, 48, 29, 29, 30, 31, 32, 33, + /* 170 */ 34, 2, 3, 4, 59, 60, 61, 62, 48, 74, + /* 180 */ 65, 66, 79, 80, 81, 82, 83, 2, 3, 59, + /* 190 */ 60, 61, 62, 6, 7, 65, 66, 48, 29, 30, + /* 200 */ 31, 32, 33, 2, 3, 4, 48, 67, 59, 60, + /* 210 */ 61, 62, 60, 28, 65, 66, 29, 59, 60, 61, + /* 220 */ 62, 6, 7, 65, 66, 48, 1, 59, 60, 4, + /* 230 */ 29, 30, 31, 32, 4, 10, 59, 60, 61, 62, + /* 240 */ 48, 64, 4, 48, 29, 59, 60, 8, 9, 77, + /* 250 */ 28, 59, 60, 61, 62, 17, 64, 62, 69, 29, + /* 260 */ 19, 18, 8, 29, 76, 75, }; -#define YY_SHIFT_USE_DFLT (278) +#define YY_SHIFT_USE_DFLT (266) #define YY_SHIFT_COUNT (43) -#define YY_SHIFT_MIN (-3) -#define YY_SHIFT_MAX (249) -static const short yy_shift_ofst[] = { - /* 0 */ 82, 46, 78, 78, 78, 78, 78, 78, 24, 24, - /* 10 */ 78, 78, 78, 43, 162, 24, 35, 35, 35, -3, - /* 20 */ -3, -3, -3, 35, -3, 137, 170, 222, 20, 248, - /* 30 */ 102, 140, 206, 35, 167, 35, 35, 167, 208, 35, - /* 40 */ 217, 151, 249, 228, +#define YY_SHIFT_MIN (0) +#define YY_SHIFT_MAX (254) +static const unsigned short int yy_shift_ofst[] = { + /* 0 */ 82, 46, 230, 230, 230, 230, 230, 
230, 18, 18, + /* 10 */ 230, 230, 230, 135, 185, 18, 29, 29, 29, 12, + /* 20 */ 12, 12, 12, 29, 12, 136, 169, 201, 187, 215, + /* 30 */ 225, 35, 238, 29, 222, 29, 29, 222, 241, 29, + /* 40 */ 243, 239, 254, 234, }; #define YY_REDUCE_USE_DFLT (-60) #define YY_REDUCE_COUNT (40) #define YY_REDUCE_MIN (-59) -#define YY_REDUCE_MAX (201) +#define YY_REDUCE_MAX (195) static const short yy_reduce_ofst[] = { - /* 0 */ -38, 79, 126, 148, 156, 171, 186, 201, 98, -59, - /* 10 */ 178, 181, 181, -44, 76, -44, 123, 134, 138, 23, - /* 20 */ 23, 23, 23, 146, 23, -28, 17, 70, 90, 95, - /* 30 */ 86, 101, 117, 129, 142, 129, 129, 142, 144, 129, - /* 40 */ 169, + /* 0 */ -38, 79, 115, 130, 149, 158, 177, 192, 103, -59, + /* 10 */ -12, 24, 24, 70, -40, 70, 48, 86, 168, 195, + /* 20 */ 195, 195, 195, 186, 195, -28, 85, 87, 78, 97, + /* 30 */ 105, 172, 140, 152, 189, 152, 152, 189, 188, 152, + /* 40 */ 190, }; static const YYACTIONTYPE yy_default[] = { - /* 0 */ 407, 407, 372, 371, 370, 316, 345, 340, 400, 350, - /* 10 */ 364, 343, 338, 355, 297, 357, 399, 374, 373, 366, - /* 20 */ 306, 344, 339, 304, 305, 383, 380, 377, 361, 293, - /* 30 */ 296, 292, 283, 347, 406, 312, 311, 313, 295, 303, - /* 40 */ 294, 404, 404, 290, + /* 0 */ 410, 410, 374, 373, 372, 318, 347, 342, 403, 352, + /* 10 */ 366, 345, 340, 357, 299, 359, 402, 376, 375, 368, + /* 20 */ 308, 346, 341, 306, 307, 385, 382, 379, 363, 295, + /* 30 */ 294, 298, 285, 349, 409, 314, 313, 315, 297, 305, + /* 40 */ 296, 407, 407, 292, }; /********** End of lemon-generated parsing tables *****************************/ @@ -297,15 +295,15 @@ static const YYACTIONTYPE yy_default[] = { #ifdef YYFALLBACK static const YYCODETYPE yyFallback[] = { 0, /* $ => nothing */ - 10, /* LINE_HR => LINE_HTML */ + 0, /* LINE_HR => nothing */ 1, /* LINE_SETEXT_1 => LINE_HR */ 1, /* LINE_SETEXT_2 => LINE_HR */ - 6, /* LINE_PLAIN => LINE_CONTINUATION */ - 4, /* LINE_TABLE_SEPARATOR => LINE_PLAIN */ 0, /* LINE_CONTINUATION => nothing */ - 6, /* LINE_INDENTED_TAB => LINE_CONTINUATION */ - 6, /* LINE_INDENTED_SPACE => LINE_CONTINUATION */ - 6, /* LINE_TABLE => LINE_CONTINUATION */ + 4, /* LINE_PLAIN => LINE_CONTINUATION */ + 4, /* LINE_INDENTED_TAB => LINE_CONTINUATION */ + 4, /* LINE_INDENTED_SPACE => LINE_CONTINUATION */ + 4, /* LINE_TABLE => LINE_CONTINUATION */ + 4, /* LINE_TABLE_SEPARATOR => LINE_CONTINUATION */ 0, /* LINE_HTML => nothing */ 10, /* LINE_ATX_1 => LINE_HTML */ 10, /* LINE_ATX_2 => LINE_HTML */ @@ -409,8 +407,8 @@ void ParseTrace(FILE *TraceFILE, char *zTracePrompt){ ** are required. 
The following table supplies these names */ static const char *const yyTokenName[] = { "$", "LINE_HR", "LINE_SETEXT_1", "LINE_SETEXT_2", - "LINE_PLAIN", "LINE_TABLE_SEPARATOR", "LINE_CONTINUATION", "LINE_INDENTED_TAB", - "LINE_INDENTED_SPACE", "LINE_TABLE", "LINE_HTML", "LINE_ATX_1", + "LINE_CONTINUATION", "LINE_PLAIN", "LINE_INDENTED_TAB", "LINE_INDENTED_SPACE", + "LINE_TABLE", "LINE_TABLE_SEPARATOR", "LINE_HTML", "LINE_ATX_1", "LINE_ATX_2", "LINE_ATX_3", "LINE_ATX_4", "LINE_ATX_5", "LINE_ATX_6", "LINE_BLOCKQUOTE", "LINE_LIST_BULLETED", "LINE_LIST_ENUMERATED", "LINE_DEF_ABBREVIATION", "LINE_DEF_CITATION", "LINE_DEF_FOOTNOTE", "LINE_DEF_GLOSSARY", @@ -557,20 +555,21 @@ static const char *const yyRuleName[] = { /* 118 */ "html_block ::= LINE_HTML", /* 119 */ "html_line ::= LINE_CONTINUATION", /* 120 */ "html_line ::= LINE_HTML", - /* 121 */ "indented_code ::= indented_line", - /* 122 */ "list_bullet ::= item_bullet", - /* 123 */ "list_enum ::= item_enum", - /* 124 */ "meta_block ::= LINE_META", - /* 125 */ "meta_line ::= LINE_META", - /* 126 */ "meta_line ::= LINE_CONTINUATION", - /* 127 */ "para ::= LINE_PLAIN", - /* 128 */ "table ::= table_header", - /* 129 */ "header_rows ::= LINE_TABLE", - /* 130 */ "table_body ::= table_section", - /* 131 */ "all_rows ::= row", - /* 132 */ "row ::= header_rows", - /* 133 */ "row ::= LINE_TABLE_SEPARATOR", - /* 134 */ "para ::= defs", + /* 121 */ "html_line ::= LINE_HR", + /* 122 */ "indented_code ::= indented_line", + /* 123 */ "list_bullet ::= item_bullet", + /* 124 */ "list_enum ::= item_enum", + /* 125 */ "meta_block ::= LINE_META", + /* 126 */ "meta_line ::= LINE_META", + /* 127 */ "meta_line ::= LINE_CONTINUATION", + /* 128 */ "para ::= LINE_PLAIN", + /* 129 */ "table ::= table_header", + /* 130 */ "header_rows ::= LINE_TABLE", + /* 131 */ "table_body ::= table_section", + /* 132 */ "all_rows ::= row", + /* 133 */ "row ::= header_rows", + /* 134 */ "row ::= LINE_TABLE_SEPARATOR", + /* 135 */ "para ::= defs", }; #endif /* NDEBUG */ @@ -1040,6 +1039,7 @@ static const struct { { 50, 1 }, { 74, 1 }, { 74, 1 }, + { 74, 1 }, { 51, 1 }, { 52, 1 }, { 53, 1 }, @@ -1382,20 +1382,21 @@ static void yy_reduce( /* (118) html_block ::= LINE_HTML */ yytestcase(yyruleno==118); /* (119) html_line ::= LINE_CONTINUATION */ yytestcase(yyruleno==119); /* (120) html_line ::= LINE_HTML */ yytestcase(yyruleno==120); - /* (121) indented_code ::= indented_line (OPTIMIZED OUT) */ assert(yyruleno!=121); - /* (122) list_bullet ::= item_bullet (OPTIMIZED OUT) */ assert(yyruleno!=122); - /* (123) list_enum ::= item_enum (OPTIMIZED OUT) */ assert(yyruleno!=123); - /* (124) meta_block ::= LINE_META */ yytestcase(yyruleno==124); - /* (125) meta_line ::= LINE_META */ yytestcase(yyruleno==125); - /* (126) meta_line ::= LINE_CONTINUATION */ yytestcase(yyruleno==126); - /* (127) para ::= LINE_PLAIN */ yytestcase(yyruleno==127); - /* (128) table ::= table_header */ yytestcase(yyruleno==128); - /* (129) header_rows ::= LINE_TABLE */ yytestcase(yyruleno==129); - /* (130) table_body ::= table_section (OPTIMIZED OUT) */ assert(yyruleno!=130); - /* (131) all_rows ::= row (OPTIMIZED OUT) */ assert(yyruleno!=131); - /* (132) row ::= header_rows */ yytestcase(yyruleno==132); - /* (133) row ::= LINE_TABLE_SEPARATOR */ yytestcase(yyruleno==133); - /* (134) para ::= defs */ yytestcase(yyruleno==134); + /* (121) html_line ::= LINE_HR */ yytestcase(yyruleno==121); + /* (122) indented_code ::= indented_line (OPTIMIZED OUT) */ assert(yyruleno!=122); + /* (123) list_bullet ::= item_bullet (OPTIMIZED 
OUT) */ assert(yyruleno!=123); + /* (124) list_enum ::= item_enum (OPTIMIZED OUT) */ assert(yyruleno!=124); + /* (125) meta_block ::= LINE_META */ yytestcase(yyruleno==125); + /* (126) meta_line ::= LINE_META */ yytestcase(yyruleno==126); + /* (127) meta_line ::= LINE_CONTINUATION */ yytestcase(yyruleno==127); + /* (128) para ::= LINE_PLAIN */ yytestcase(yyruleno==128); + /* (129) table ::= table_header */ yytestcase(yyruleno==129); + /* (130) header_rows ::= LINE_TABLE */ yytestcase(yyruleno==130); + /* (131) table_body ::= table_section (OPTIMIZED OUT) */ assert(yyruleno!=131); + /* (132) all_rows ::= row (OPTIMIZED OUT) */ assert(yyruleno!=132); + /* (133) row ::= header_rows */ yytestcase(yyruleno==133); + /* (134) row ::= LINE_TABLE_SEPARATOR */ yytestcase(yyruleno==134); + /* (135) para ::= defs */ yytestcase(yyruleno==135); break; /********** End reduce actions ************************************************/ }; diff --git a/Sources/libMultiMarkdown/parser.h b/Sources/libMultiMarkdown/parser.h index 55917c25..5c1dc2c5 100644 --- a/Sources/libMultiMarkdown/parser.h +++ b/Sources/libMultiMarkdown/parser.h @@ -1,12 +1,12 @@ #define LINE_HR 1 #define LINE_SETEXT_1 2 #define LINE_SETEXT_2 3 -#define LINE_PLAIN 4 -#define LINE_TABLE_SEPARATOR 5 -#define LINE_CONTINUATION 6 -#define LINE_INDENTED_TAB 7 -#define LINE_INDENTED_SPACE 8 -#define LINE_TABLE 9 +#define LINE_CONTINUATION 4 +#define LINE_PLAIN 5 +#define LINE_INDENTED_TAB 6 +#define LINE_INDENTED_SPACE 7 +#define LINE_TABLE 8 +#define LINE_TABLE_SEPARATOR 9 #define LINE_HTML 10 #define LINE_ATX_1 11 #define LINE_ATX_2 12 diff --git a/Sources/libMultiMarkdown/parser.out b/Sources/libMultiMarkdown/parser.out index 91097891..ed89f4aa 100644 --- a/Sources/libMultiMarkdown/parser.out +++ b/Sources/libMultiMarkdown/parser.out @@ -110,10 +110,10 @@ State 0: LINE_HR shift-reduce 9 block ::= LINE_HR LINE_PLAIN shift 16 - LINE_TABLE_SEPARATOR shift-reduce 133 row ::= LINE_TABLE_SEPARATOR LINE_INDENTED_TAB shift-reduce 90 indented_line ::= LINE_INDENTED_TAB LINE_INDENTED_SPACE shift-reduce 91 indented_line ::= LINE_INDENTED_SPACE - LINE_TABLE shift-reduce 129 header_rows ::= LINE_TABLE + LINE_TABLE shift-reduce 130 header_rows ::= LINE_TABLE + LINE_TABLE_SEPARATOR shift-reduce 134 row ::= LINE_TABLE_SEPARATOR LINE_HTML shift-reduce 118 html_block ::= LINE_HTML LINE_ATX_1 shift-reduce 3 block ::= LINE_ATX_1 LINE_ATX_2 shift-reduce 4 block ::= LINE_ATX_2 @@ -138,7 +138,7 @@ State 0: LINE_FENCE_BACKTICK_START_3 shift-reduce 107 fenced_3 ::= LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 shift-reduce 110 fenced_4 ::= LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 shift-reduce 113 fenced_5 ::= LINE_FENCE_BACKTICK_START_5 - LINE_META shift-reduce 124 meta_block ::= LINE_META + LINE_META shift-reduce 125 meta_block ::= LINE_META doc accept blocks shift 1 block shift-reduce 2 blocks ::= block @@ -151,11 +151,11 @@ State 0: definition_block shift-reduce 17 block ::= definition_block empty shift 43 fenced_block shift-reduce 19 block ::= fenced_block - html_block shift 31 + html_block shift 30 indented_code shift 29 list_bullet shift 40 list_enum shift 38 - meta_block shift 30 + meta_block shift 31 para shift 14 setext_1 shift-reduce 26 block ::= setext_1 setext_2 shift-reduce 27 block ::= setext_2 @@ -285,10 +285,10 @@ State 1: $ reduce 0 doc ::= blocks LINE_HR shift-reduce 9 block ::= LINE_HR LINE_PLAIN shift 16 - LINE_TABLE_SEPARATOR shift-reduce 133 row ::= LINE_TABLE_SEPARATOR LINE_INDENTED_TAB shift-reduce 90 indented_line ::= 
LINE_INDENTED_TAB LINE_INDENTED_SPACE shift-reduce 91 indented_line ::= LINE_INDENTED_SPACE - LINE_TABLE shift-reduce 129 header_rows ::= LINE_TABLE + LINE_TABLE shift-reduce 130 header_rows ::= LINE_TABLE + LINE_TABLE_SEPARATOR shift-reduce 134 row ::= LINE_TABLE_SEPARATOR LINE_HTML shift-reduce 118 html_block ::= LINE_HTML LINE_ATX_1 shift-reduce 3 block ::= LINE_ATX_1 LINE_ATX_2 shift-reduce 4 block ::= LINE_ATX_2 @@ -313,7 +313,7 @@ State 1: LINE_FENCE_BACKTICK_START_3 shift-reduce 107 fenced_3 ::= LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 shift-reduce 110 fenced_4 ::= LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 shift-reduce 113 fenced_5 ::= LINE_FENCE_BACKTICK_START_5 - LINE_META shift-reduce 124 meta_block ::= LINE_META + LINE_META shift-reduce 125 meta_block ::= LINE_META block shift-reduce 1 blocks ::= blocks block blockquote shift 32 def_abbreviation shift-reduce 12 block ::= def_abbreviation @@ -324,11 +324,11 @@ State 1: definition_block shift-reduce 17 block ::= definition_block empty shift 43 fenced_block shift-reduce 19 block ::= fenced_block - html_block shift 31 + html_block shift 30 indented_code shift 29 list_bullet shift 40 list_enum shift 38 - meta_block shift 30 + meta_block shift 31 para shift 14 setext_1 shift-reduce 26 block ::= setext_1 setext_2 shift-reduce 27 block ::= setext_2 @@ -524,7 +524,7 @@ State 7: State 8: table ::= table_header * table_body - (128) table ::= table_header * + (129) table ::= table_header * header_rows ::= * header_rows LINE_TABLE header_rows ::= * LINE_TABLE table_body ::= * table_body table_section @@ -536,16 +536,16 @@ State 8: row ::= * header_rows row ::= * LINE_TABLE_SEPARATOR - LINE_TABLE_SEPARATOR shift-reduce 133 row ::= LINE_TABLE_SEPARATOR - LINE_TABLE_SEPARATOR reduce 128 ** Parsing conflict ** - LINE_TABLE shift-reduce 129 header_rows ::= LINE_TABLE - LINE_TABLE reduce 128 ** Parsing conflict ** + LINE_TABLE shift-reduce 130 header_rows ::= LINE_TABLE + LINE_TABLE reduce 129 ** Parsing conflict ** + LINE_TABLE_SEPARATOR shift-reduce 134 row ::= LINE_TABLE_SEPARATOR + LINE_TABLE_SEPARATOR reduce 129 ** Parsing conflict ** table_body shift 9 header_rows shift 42 table_section shift 9 /* because table_section==table_body */ all_rows shift 13 row shift 13 /* because row==all_rows */ - {default} reduce 128 table ::= table_header + {default} reduce 129 table ::= table_header State 9: (78) table ::= table_header table_body * @@ -559,10 +559,10 @@ State 9: row ::= * header_rows row ::= * LINE_TABLE_SEPARATOR - LINE_TABLE_SEPARATOR shift-reduce 133 row ::= LINE_TABLE_SEPARATOR - LINE_TABLE_SEPARATOR reduce 78 ** Parsing conflict ** - LINE_TABLE shift-reduce 129 header_rows ::= LINE_TABLE + LINE_TABLE shift-reduce 130 header_rows ::= LINE_TABLE LINE_TABLE reduce 78 ** Parsing conflict ** + LINE_TABLE_SEPARATOR shift-reduce 134 row ::= LINE_TABLE_SEPARATOR + LINE_TABLE_SEPARATOR reduce 78 ** Parsing conflict ** header_rows shift 42 table_section shift-reduce 81 table_body ::= table_body table_section all_rows shift 13 @@ -644,10 +644,10 @@ State 13: row ::= * header_rows row ::= * LINE_TABLE_SEPARATOR - LINE_TABLE_SEPARATOR shift-reduce 133 row ::= LINE_TABLE_SEPARATOR - LINE_TABLE_SEPARATOR reduce 83 ** Parsing conflict ** - LINE_TABLE shift-reduce 129 header_rows ::= LINE_TABLE + LINE_TABLE shift-reduce 130 header_rows ::= LINE_TABLE LINE_TABLE reduce 83 ** Parsing conflict ** + LINE_TABLE_SEPARATOR shift-reduce 134 row ::= LINE_TABLE_SEPARATOR + LINE_TABLE_SEPARATOR reduce 83 ** Parsing conflict ** 
LINE_EMPTY shift-reduce 82 table_section ::= all_rows LINE_EMPTY LINE_EMPTY reduce 83 ** Parsing conflict ** header_rows shift 42 @@ -680,10 +680,10 @@ State 15: row ::= * LINE_TABLE_SEPARATOR (85) para ::= all_rows * - LINE_TABLE_SEPARATOR shift-reduce 133 row ::= LINE_TABLE_SEPARATOR - LINE_TABLE_SEPARATOR reduce 85 ** Parsing conflict ** - LINE_TABLE shift-reduce 129 header_rows ::= LINE_TABLE + LINE_TABLE shift-reduce 130 header_rows ::= LINE_TABLE LINE_TABLE reduce 85 ** Parsing conflict ** + LINE_TABLE_SEPARATOR shift-reduce 134 row ::= LINE_TABLE_SEPARATOR + LINE_TABLE_SEPARATOR reduce 85 ** Parsing conflict ** header_rows shift 42 row shift-reduce 84 all_rows ::= all_rows row {default} reduce 85 para ::= all_rows @@ -693,12 +693,12 @@ State 16: chunk ::= * chunk_line chunk_line ::= * LINE_CONTINUATION para ::= LINE_PLAIN * chunk - (127) para ::= LINE_PLAIN * + (128) para ::= LINE_PLAIN * LINE_CONTINUATION shift-reduce 87 chunk_line ::= LINE_CONTINUATION chunk shift 33 chunk_line shift 33 /* because chunk_line==chunk */ - {default} reduce 127 para ::= LINE_PLAIN + {default} reduce 128 para ::= LINE_PLAIN State 17: chunk ::= * chunk chunk_line @@ -929,29 +929,32 @@ State 29: {default} reduce 21 block ::= indented_code State 30: - (24) block ::= meta_block * - meta_block ::= meta_block * meta_line - meta_line ::= * LINE_META - meta_line ::= * LINE_CONTINUATION - - LINE_CONTINUATION shift-reduce 126 meta_line ::= LINE_CONTINUATION - LINE_META shift-reduce 125 meta_line ::= LINE_META - LINE_META reduce 24 ** Parsing conflict ** - meta_line shift-reduce 74 meta_block ::= meta_block meta_line - {default} reduce 24 block ::= meta_block - -State 31: (20) block ::= html_block * html_block ::= html_block * html_line html_line ::= * LINE_CONTINUATION html_line ::= * LINE_HTML + html_line ::= * LINE_HR + LINE_HR shift-reduce 121 html_line ::= LINE_HR + LINE_HR reduce 20 ** Parsing conflict ** LINE_CONTINUATION shift-reduce 119 html_line ::= LINE_CONTINUATION LINE_HTML shift-reduce 120 html_line ::= LINE_HTML LINE_HTML reduce 20 ** Parsing conflict ** html_line shift-reduce 61 html_block ::= html_block html_line {default} reduce 20 block ::= html_block +State 31: + (24) block ::= meta_block * + meta_block ::= meta_block * meta_line + meta_line ::= * LINE_META + meta_line ::= * LINE_CONTINUATION + + LINE_CONTINUATION shift-reduce 127 meta_line ::= LINE_CONTINUATION + LINE_META shift-reduce 126 meta_line ::= LINE_META + LINE_META reduce 24 ** Parsing conflict ** + meta_line shift-reduce 74 meta_block ::= meta_block meta_line + {default} reduce 24 block ::= meta_block + State 32: (11) block ::= blockquote * blockquote ::= blockquote * quote_line @@ -977,12 +980,12 @@ State 34: defs ::= defs * def def ::= * LINE_DEFINITION tail def ::= * LINE_DEFINITION - (134) para ::= defs * + (135) para ::= defs * LINE_DEFINITION shift 5 - LINE_DEFINITION reduce 134 ** Parsing conflict ** + LINE_DEFINITION reduce 135 ** Parsing conflict ** def shift-reduce 42 defs ::= defs def - {default} reduce 134 para ::= defs + {default} reduce 135 para ::= defs State 35: chunk ::= chunk * chunk_line @@ -1051,21 +1054,21 @@ State 40: State 41: table_header ::= header_rows * LINE_TABLE_SEPARATOR header_rows ::= header_rows * LINE_TABLE - (132) row ::= header_rows * + (133) row ::= header_rows * - LINE_TABLE_SEPARATOR shift-reduce 79 table_header ::= header_rows LINE_TABLE_SEPARATOR - LINE_TABLE_SEPARATOR reduce 132 ** Parsing conflict ** LINE_TABLE shift-reduce 80 header_rows ::= header_rows LINE_TABLE - LINE_TABLE reduce 132 
** Parsing conflict ** - {default} reduce 132 row ::= header_rows + LINE_TABLE reduce 133 ** Parsing conflict ** + LINE_TABLE_SEPARATOR shift-reduce 79 table_header ::= header_rows LINE_TABLE_SEPARATOR + LINE_TABLE_SEPARATOR reduce 133 ** Parsing conflict ** + {default} reduce 133 row ::= header_rows State 42: header_rows ::= header_rows * LINE_TABLE - (132) row ::= header_rows * + (133) row ::= header_rows * LINE_TABLE shift-reduce 80 header_rows ::= header_rows LINE_TABLE - LINE_TABLE reduce 132 ** Parsing conflict ** - {default} reduce 132 row ::= header_rows + LINE_TABLE reduce 133 ** Parsing conflict ** + {default} reduce 133 row ::= header_rows State 43: (18) block ::= empty * @@ -1081,12 +1084,12 @@ Symbols: 1: LINE_HR 2: LINE_SETEXT_1 3: LINE_SETEXT_2 - 4: LINE_PLAIN - 5: LINE_TABLE_SEPARATOR - 6: LINE_CONTINUATION - 7: LINE_INDENTED_TAB - 8: LINE_INDENTED_SPACE - 9: LINE_TABLE + 4: LINE_CONTINUATION + 5: LINE_PLAIN + 6: LINE_INDENTED_TAB + 7: LINE_INDENTED_SPACE + 8: LINE_TABLE + 9: LINE_TABLE_SEPARATOR 10: LINE_HTML 11: LINE_ATX_1 12: LINE_ATX_2 @@ -1115,16 +1118,16 @@ Symbols: 35: LINE_FENCE_BACKTICK_START_5 36: LINE_META 37: error: - 38: doc: LINE_HR LINE_PLAIN LINE_TABLE_SEPARATOR LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_BLOCKQUOTE LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_ABBREVIATION LINE_DEF_CITATION LINE_DEF_FOOTNOTE LINE_DEF_GLOSSARY LINE_DEF_LINK LINE_TOC LINE_DEFINITION LINE_EMPTY LINE_FENCE_BACKTICK_3 LINE_FENCE_BACKTICK_4 LINE_FENCE_BACKTICK_5 LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 LINE_META - 39: blocks: LINE_HR LINE_PLAIN LINE_TABLE_SEPARATOR LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_BLOCKQUOTE LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_ABBREVIATION LINE_DEF_CITATION LINE_DEF_FOOTNOTE LINE_DEF_GLOSSARY LINE_DEF_LINK LINE_TOC LINE_DEFINITION LINE_EMPTY LINE_FENCE_BACKTICK_3 LINE_FENCE_BACKTICK_4 LINE_FENCE_BACKTICK_5 LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 LINE_META - 40: block: LINE_HR LINE_PLAIN LINE_TABLE_SEPARATOR LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_BLOCKQUOTE LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_ABBREVIATION LINE_DEF_CITATION LINE_DEF_FOOTNOTE LINE_DEF_GLOSSARY LINE_DEF_LINK LINE_TOC LINE_DEFINITION LINE_EMPTY LINE_FENCE_BACKTICK_3 LINE_FENCE_BACKTICK_4 LINE_FENCE_BACKTICK_5 LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 LINE_META + 38: doc: LINE_HR LINE_PLAIN LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE LINE_TABLE_SEPARATOR LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_BLOCKQUOTE LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_ABBREVIATION LINE_DEF_CITATION LINE_DEF_FOOTNOTE LINE_DEF_GLOSSARY LINE_DEF_LINK LINE_TOC LINE_DEFINITION LINE_EMPTY LINE_FENCE_BACKTICK_3 LINE_FENCE_BACKTICK_4 LINE_FENCE_BACKTICK_5 LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 LINE_META + 39: blocks: LINE_HR LINE_PLAIN LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE LINE_TABLE_SEPARATOR LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_BLOCKQUOTE LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_ABBREVIATION LINE_DEF_CITATION LINE_DEF_FOOTNOTE LINE_DEF_GLOSSARY 
LINE_DEF_LINK LINE_TOC LINE_DEFINITION LINE_EMPTY LINE_FENCE_BACKTICK_3 LINE_FENCE_BACKTICK_4 LINE_FENCE_BACKTICK_5 LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 LINE_META + 40: block: LINE_HR LINE_PLAIN LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE LINE_TABLE_SEPARATOR LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_BLOCKQUOTE LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_ABBREVIATION LINE_DEF_CITATION LINE_DEF_FOOTNOTE LINE_DEF_GLOSSARY LINE_DEF_LINK LINE_TOC LINE_DEFINITION LINE_EMPTY LINE_FENCE_BACKTICK_3 LINE_FENCE_BACKTICK_4 LINE_FENCE_BACKTICK_5 LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 LINE_META 41: blockquote: LINE_BLOCKQUOTE 42: def_abbreviation: LINE_DEF_ABBREVIATION 43: def_citation: LINE_DEF_CITATION 44: def_footnote: LINE_DEF_FOOTNOTE 45: def_glossary: LINE_DEF_GLOSSARY 46: def_link: LINE_DEF_LINK - 47: definition_block: LINE_PLAIN LINE_TABLE_SEPARATOR LINE_TABLE LINE_DEFINITION + 47: definition_block: LINE_PLAIN LINE_TABLE LINE_TABLE_SEPARATOR LINE_DEFINITION 48: empty: LINE_EMPTY 49: fenced_block: LINE_FENCE_BACKTICK_3 LINE_FENCE_BACKTICK_4 LINE_FENCE_BACKTICK_5 LINE_FENCE_BACKTICK_START_3 LINE_FENCE_BACKTICK_START_4 LINE_FENCE_BACKTICK_START_5 50: html_block: LINE_HTML @@ -1132,9 +1135,9 @@ Symbols: 52: list_bullet: LINE_LIST_BULLETED 53: list_enum: LINE_LIST_ENUMERATED 54: meta_block: LINE_META - 55: para: LINE_PLAIN LINE_TABLE_SEPARATOR LINE_TABLE LINE_DEFINITION - 56: setext_1: LINE_PLAIN LINE_TABLE_SEPARATOR LINE_TABLE LINE_DEFINITION - 57: setext_2: LINE_PLAIN LINE_TABLE_SEPARATOR LINE_TABLE LINE_DEFINITION + 55: para: LINE_PLAIN LINE_TABLE LINE_TABLE_SEPARATOR LINE_DEFINITION + 56: setext_1: LINE_PLAIN LINE_TABLE LINE_TABLE_SEPARATOR LINE_DEFINITION + 57: setext_2: LINE_PLAIN LINE_TABLE LINE_TABLE_SEPARATOR LINE_DEFINITION 58: table: LINE_TABLE 59: chunk: LINE_CONTINUATION 60: chunk_line: LINE_CONTINUATION @@ -1151,13 +1154,13 @@ Symbols: 71: fenced_line: LINE_SETEXT_1 LINE_SETEXT_2 LINE_CONTINUATION LINE_EMPTY 72: fenced_4: LINE_FENCE_BACKTICK_4 LINE_FENCE_BACKTICK_START_4 73: fenced_5: LINE_FENCE_BACKTICK_5 LINE_FENCE_BACKTICK_START_5 - 74: html_line: LINE_CONTINUATION LINE_HTML + 74: html_line: LINE_HR LINE_CONTINUATION LINE_HTML 75: item_bullet: LINE_LIST_BULLETED 76: item_enum: LINE_LIST_ENUMERATED 77: meta_line: LINE_CONTINUATION LINE_META 78: table_header: LINE_TABLE - 79: table_body: LINE_TABLE_SEPARATOR LINE_TABLE + 79: table_body: LINE_TABLE LINE_TABLE_SEPARATOR 80: header_rows: LINE_TABLE - 81: table_section: LINE_TABLE_SEPARATOR LINE_TABLE - 82: all_rows: LINE_TABLE_SEPARATOR LINE_TABLE - 83: row: LINE_TABLE_SEPARATOR LINE_TABLE + 81: table_section: LINE_TABLE LINE_TABLE_SEPARATOR + 82: all_rows: LINE_TABLE LINE_TABLE_SEPARATOR + 83: row: LINE_TABLE LINE_TABLE_SEPARATOR diff --git a/Sources/libMultiMarkdown/parser.y b/Sources/libMultiMarkdown/parser.y index d5f162c4..62e0e588 100644 --- a/Sources/libMultiMarkdown/parser.y +++ b/Sources/libMultiMarkdown/parser.y @@ -64,11 +64,11 @@ %fallback LINE_HR LINE_SETEXT_1 LINE_SETEXT_2. -%fallback LINE_PLAIN LINE_TABLE_SEPARATOR. +//%fallback LINE_PLAIN LINE_TABLE_SEPARATOR. -%fallback LINE_CONTINUATION LINE_PLAIN LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE. +%fallback LINE_CONTINUATION LINE_PLAIN LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE LINE_TABLE_SEPARATOR. 
-%fallback LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_HR LINE_BLOCKQUOTE +%fallback LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_BLOCKQUOTE LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_ABBREVIATION LINE_DEF_CITATION LINE_DEF_FOOTNOTE LINE_DEF_GLOSSARY LINE_DEF_LINK LINE_FENCE_BACKTICK LINE_FENCE_BACKTICK_START. @@ -271,6 +271,7 @@ html_block ::= LINE_HTML. html_line ::= LINE_CONTINUATION. html_line ::= LINE_HTML. +html_line ::= LINE_HR. // Indented code blocks diff --git a/Sources/libMultiMarkdown/scanners.c b/Sources/libMultiMarkdown/scanners.c index bf59122e..2d46d5ad 100644 --- a/Sources/libMultiMarkdown/scanners.c +++ b/Sources/libMultiMarkdown/scanners.c @@ -1,4 +1,4 @@ -/* Generated by re2c 0.14.3 on Mon Mar 6 14:49:25 2017 */ +/* Generated by re2c 0.14.3 on Wed Mar 15 00:32:22 2017 */ /** MultiMarkdown 6 -- Lightweight markup processor to produce HTML, LaTeX, and more. @@ -66,12 +66,13 @@ size_t scan_spnl(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case '\t': - case ' ': goto yy3; + case ' ': + case 0xA0: goto yy3; case '\n': goto yy4; case '\r': goto yy6; default: goto yy7; @@ -87,7 +88,8 @@ size_t scan_spnl(const char * c) { yy5: switch (yych) { case '\t': - case ' ': goto yy4; + case ' ': + case 0xA0: goto yy4; default: goto yy2; } yy6: @@ -111,7 +113,8 @@ size_t scan_spnl(const char * c) { yy11: switch (yych) { case '\t': - case ' ': goto yy10; + case ' ': + case 0xA0: goto yy10; case '\n': goto yy4; case '\r': goto yy9; default: goto yy2; @@ -126,7 +129,7 @@ size_t scan_key(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case '\n': goto yy14; @@ -281,7 +284,7 @@ size_t scan_value(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case '\n': goto yy22; @@ -488,11 +491,12 @@ size_t scan_attr(const char * c) { { - char yych; + unsigned char yych; yych = *(marker = c); switch (yych) { case '\t': - case ' ': goto yy39; + case ' ': + case 0xA0: goto yy39; case '\n': goto yy40; case '\r': goto yy43; case ':': @@ -613,7 +617,8 @@ size_t scan_attr(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy60; + case 'z': + case 0xA0: goto yy60; default: goto yy38; } yy40: @@ -622,7 +627,8 @@ size_t scan_attr(const char * c) { yy41: switch (yych) { case '\t': - case ' ': goto yy40; + case ' ': + case 0xA0: goto yy40; case ':': case 'A': case 'B': @@ -687,7 +693,8 @@ size_t scan_attr(const char * c) { switch (yych) { case '\t': case '\n': - case ' ': goto yy40; + case ' ': + case 0xA0: goto yy40; case ':': case 'A': case 'B': @@ -897,7 +904,8 @@ size_t scan_attr(const char * c) { yych = *c; switch (yych) { case '\t': - case ' ': goto yy48; + case ' ': + case 0xA0: goto yy48; case '"': goto yy50; case '\'': goto yy52; case '.': @@ -1071,7 +1079,8 @@ size_t scan_attr(const char * c) { yy60: switch (yych) { case '\t': - case ' ': goto yy59; + case ' ': + case 0xA0: goto yy59; case '\n': goto yy40; case '\r': goto yy58; case ':': @@ -1141,12 +1150,13 @@ size_t scan_attributes(const char * c) { { - char yych; + unsigned char yych; unsigned int yyaccept = 0; yych = *(marker = c); switch (yych) { case '\t': - case ' ': goto yy64; + case ' ': + case 0xA0: goto yy64; case '\n': goto yy65; case '\r': goto yy68; case ':': @@ -1268,7 +1278,8 @@ size_t scan_attributes(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy83; + case 'z': + case 0xA0: goto yy83; default: goto yy63; } yy65: @@ -1276,7 
+1287,8 @@ size_t scan_attributes(const char * c) { yych = *c; switch (yych) { case '\t': - case ' ': goto yy65; + case ' ': + case 0xA0: goto yy65; case ':': case 'A': case 'B': @@ -1346,7 +1358,8 @@ size_t scan_attributes(const char * c) { switch (yych) { case '\t': case '\n': - case ' ': goto yy65; + case ' ': + case 0xA0: goto yy65; case ':': case 'A': case 'B': @@ -1557,7 +1570,8 @@ size_t scan_attributes(const char * c) { yych = *c; switch (yych) { case '\t': - case ' ': goto yy73; + case ' ': + case 0xA0: goto yy73; case '"': goto yy75; case '\'': goto yy77; case '.': @@ -1651,7 +1665,8 @@ size_t scan_attributes(const char * c) { yych = *c; switch (yych) { case '\t': - case ' ': goto yy82; + case ' ': + case 0xA0: goto yy82; case '\n': goto yy65; case '\r': goto yy84; case '.': @@ -1729,7 +1744,8 @@ size_t scan_attributes(const char * c) { yy83: switch (yych) { case '\t': - case ' ': goto yy82; + case ' ': + case 0xA0: goto yy82; case '\n': goto yy65; case '\r': goto yy84; case ':': @@ -1794,7 +1810,8 @@ size_t scan_attributes(const char * c) { switch (yych) { case '\t': case '\n': - case ' ': goto yy65; + case ' ': + case 0xA0: goto yy65; case ':': case 'A': case 'B': @@ -1857,7 +1874,8 @@ size_t scan_attributes(const char * c) { yych = *c; switch (yych) { case '\t': - case ' ': goto yy82; + case ' ': + case 0xA0: goto yy82; case '\n': goto yy65; case '\r': goto yy84; case '-': @@ -1935,7 +1953,8 @@ size_t scan_attributes(const char * c) { yych = *c; switch (yych) { case '\t': - case ' ': goto yy82; + case ' ': + case 0xA0: goto yy82; case '\n': goto yy65; case '\r': goto yy84; case ':': @@ -2005,7 +2024,7 @@ size_t scan_email(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case '\n': goto yy90; @@ -2249,7 +2268,8 @@ size_t scan_email(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy95; + case '>': + case 0xA0: goto yy95; default: goto yy98; } yy95: @@ -2343,7 +2363,8 @@ size_t scan_email(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy100; + case '>': + case 0xA0: goto yy100; default: goto yy98; } yy100: @@ -2399,7 +2420,7 @@ size_t scan_url(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case '\n': goto yy109; @@ -2810,7 +2831,8 @@ size_t scan_url(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy115; + case '>': + case 0xA0: goto yy115; default: goto yy121; } yy119: @@ -2901,7 +2923,8 @@ size_t scan_url(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy123; + case '>': + case 0xA0: goto yy123; default: goto yy121; } yy123: @@ -2920,7 +2943,8 @@ size_t scan_url(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy115; + case '>': + case 0xA0: goto yy115; default: goto yy126; } yy126: @@ -2932,7 +2956,8 @@ size_t scan_url(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy128; + case '>': + case 0xA0: goto yy128; default: goto yy126; } yy128: @@ -3253,7 +3278,8 @@ size_t scan_url(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy115; + case '>': + case 0xA0: goto yy115; case '!': case '$': case '%': @@ -3337,7 +3363,8 @@ size_t scan_url(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy128; + case '>': + case 0xA0: goto yy128; case '!': case '$': case '%': @@ -3420,7 +3447,8 @@ size_t scan_url(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy128; + case '>': + case 0xA0: goto yy128; default: goto yy140; } yy140: @@ -3432,7 +3460,8 @@ size_t 
scan_url(const char * c) { case '\n': case '\r': case ' ': - case '>': goto yy123; + case '>': + case 0xA0: goto yy123; default: goto yy140; } } @@ -3446,11 +3475,12 @@ size_t scan_ref_abbreviation(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case '\n': goto yy144; - case ' ': goto yy145; + case ' ': + case 0xA0: goto yy145; case '[': goto yy146; default: goto yy147; } @@ -3459,8 +3489,9 @@ size_t scan_ref_abbreviation(const char * c) { yy145: yych = *(marker = ++c); switch (yych) { - case ' ': goto yy157; - case '[': goto yy158; + case ' ': + case 0xA0: goto yy158; + case '[': goto yy159; default: goto yy144; } yy146: @@ -3473,68 +3504,74 @@ size_t scan_ref_abbreviation(const char * c) { yych = *++c; goto yy144; yy148: - yych = *++c; + ++c; + yych = *c; switch (yych) { - case ']': goto yy149; + case 0x00: + case '\n': + case '\r': + case ']': goto yy150; + case '\\': goto yy148; default: goto yy151; } -yy149: +yy150: c = marker; goto yy144; -yy150: +yy151: ++c; yych = *c; -yy151: switch (yych) { case 0x00: case '\n': - case '\r': goto yy149; - case ']': goto yy152; - default: goto yy150; + case '\r': goto yy150; + case '\\': goto yy148; + case ']': goto yy153; + default: goto yy151; } -yy152: +yy153: yych = *++c; switch (yych) { - case ':': goto yy153; - default: goto yy149; + case ':': goto yy154; + default: goto yy150; } -yy153: +yy154: yych = *++c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy149; - default: goto yy154; + case '\r': goto yy150; + default: goto yy155; } -yy154: +yy155: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy156; - default: goto yy154; + case '\r': goto yy157; + default: goto yy155; } -yy156: - { return (size_t)( c - start ); } yy157: + { return (size_t)( c - start ); } +yy158: yych = *++c; switch (yych) { - case ' ': goto yy159; - case '[': goto yy158; - default: goto yy149; + case ' ': + case 0xA0: goto yy160; + case '[': goto yy159; + default: goto yy150; } -yy158: +yy159: yych = *++c; switch (yych) { case '>': goto yy148; - default: goto yy149; + default: goto yy150; } -yy159: +yy160: ++c; switch ((yych = *c)) { - case '[': goto yy158; - default: goto yy149; + case '[': goto yy159; + default: goto yy150; } } @@ -3547,95 +3584,103 @@ size_t scan_ref_citation(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy162; - case ' ': goto yy163; - case '[': goto yy164; - default: goto yy165; + case '\n': goto yy163; + case ' ': + case 0xA0: goto yy164; + case '[': goto yy165; + default: goto yy166; } -yy162: - { return 0; } yy163: + { return 0; } +yy164: yych = *(marker = ++c); switch (yych) { - case ' ': goto yy175; - case '[': goto yy176; - default: goto yy162; + case ' ': + case 0xA0: goto yy177; + case '[': goto yy178; + default: goto yy163; } -yy164: +yy165: yych = *(marker = ++c); switch (yych) { - case '#': goto yy166; - default: goto yy162; + case '#': goto yy167; + default: goto yy163; } -yy165: - yych = *++c; - goto yy162; yy166: yych = *++c; + goto yy163; +yy167: + ++c; + yych = *c; switch (yych) { - case ']': goto yy167; - default: goto yy169; + case 0x00: + case '\n': + case '\r': + case ']': goto yy169; + case '\\': goto yy167; + default: goto yy170; } -yy167: +yy169: c = marker; - goto yy162; -yy168: + goto yy163; +yy170: ++c; yych = *c; -yy169: switch (yych) { case 0x00: case '\n': - case '\r': goto yy167; - case ']': goto yy170; - default: goto yy168; + case '\r': goto yy169; + case '\\': goto yy167; + case ']': goto yy172; + 
default: goto yy170; } -yy170: +yy172: yych = *++c; switch (yych) { - case ':': goto yy171; - default: goto yy167; + case ':': goto yy173; + default: goto yy169; } -yy171: +yy173: yych = *++c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy167; - default: goto yy172; + case '\r': goto yy169; + default: goto yy174; } -yy172: +yy174: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy174; - default: goto yy172; + case '\r': goto yy176; + default: goto yy174; } -yy174: +yy176: { return (size_t)( c - start ); } -yy175: +yy177: yych = *++c; switch (yych) { - case ' ': goto yy177; - case '[': goto yy176; - default: goto yy167; + case ' ': + case 0xA0: goto yy179; + case '[': goto yy178; + default: goto yy169; } -yy176: +yy178: yych = *++c; switch (yych) { - case '#': goto yy166; - default: goto yy167; + case '#': goto yy167; + default: goto yy169; } -yy177: +yy179: ++c; switch ((yych = *c)) { - case '[': goto yy176; - default: goto yy167; + case '[': goto yy178; + default: goto yy169; } } @@ -3648,95 +3693,103 @@ size_t scan_ref_foot(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy180; - case ' ': goto yy181; - case '[': goto yy182; - default: goto yy183; - } -yy180: - { return 0; } -yy181: - yych = *(marker = ++c); - switch (yych) { - case ' ': goto yy193; - case '[': goto yy194; - default: goto yy180; + case '\n': goto yy182; + case ' ': + case 0xA0: goto yy183; + case '[': goto yy184; + default: goto yy185; } yy182: + { return 0; } +yy183: yych = *(marker = ++c); switch (yych) { - case '^': goto yy184; - default: goto yy180; + case ' ': + case 0xA0: goto yy196; + case '[': goto yy197; + default: goto yy182; } -yy183: - yych = *++c; - goto yy180; yy184: - yych = *++c; + yych = *(marker = ++c); switch (yych) { - case ']': goto yy185; - default: goto yy187; + case '^': goto yy186; + default: goto yy182; } yy185: - c = marker; - goto yy180; + yych = *++c; + goto yy182; yy186: ++c; yych = *c; -yy187: switch (yych) { case 0x00: case '\n': - case '\r': goto yy185; + case '\r': case ']': goto yy188; - default: goto yy186; + case '\\': goto yy186; + default: goto yy189; } yy188: + c = marker; + goto yy182; +yy189: + ++c; + yych = *c; + switch (yych) { + case 0x00: + case '\n': + case '\r': goto yy188; + case '\\': goto yy186; + case ']': goto yy191; + default: goto yy189; + } +yy191: yych = *++c; switch (yych) { - case ':': goto yy189; - default: goto yy185; + case ':': goto yy192; + default: goto yy188; } -yy189: +yy192: yych = *++c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy185; - default: goto yy190; + case '\r': goto yy188; + default: goto yy193; } -yy190: +yy193: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy192; - default: goto yy190; + case '\r': goto yy195; + default: goto yy193; } -yy192: +yy195: { return (size_t)( c - start ); } -yy193: +yy196: yych = *++c; switch (yych) { - case ' ': goto yy195; - case '[': goto yy194; - default: goto yy185; + case ' ': + case 0xA0: goto yy198; + case '[': goto yy197; + default: goto yy188; } -yy194: +yy197: yych = *++c; switch (yych) { - case '^': goto yy184; - default: goto yy185; + case '^': goto yy186; + default: goto yy188; } -yy195: +yy198: ++c; switch ((yych = *c)) { - case '[': goto yy194; - default: goto yy185; + case '[': goto yy197; + default: goto yy188; } } @@ -3749,95 +3802,103 @@ size_t scan_ref_glossary(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy198; - 
case ' ': goto yy199; - case '[': goto yy200; - default: goto yy201; + case '\n': goto yy201; + case ' ': + case 0xA0: goto yy202; + case '[': goto yy203; + default: goto yy204; } -yy198: +yy201: { return 0; } -yy199: +yy202: yych = *(marker = ++c); switch (yych) { - case ' ': goto yy211; - case '[': goto yy212; - default: goto yy198; + case ' ': + case 0xA0: goto yy215; + case '[': goto yy216; + default: goto yy201; } -yy200: +yy203: yych = *(marker = ++c); switch (yych) { - case '?': goto yy202; - default: goto yy198; + case '?': goto yy205; + default: goto yy201; } -yy201: - yych = *++c; - goto yy198; -yy202: +yy204: yych = *++c; + goto yy201; +yy205: + ++c; + yych = *c; switch (yych) { - case ']': goto yy203; - default: goto yy205; + case 0x00: + case '\n': + case '\r': + case ']': goto yy207; + case '\\': goto yy205; + default: goto yy208; } -yy203: +yy207: c = marker; - goto yy198; -yy204: + goto yy201; +yy208: ++c; yych = *c; -yy205: switch (yych) { case 0x00: case '\n': - case '\r': goto yy203; - case ']': goto yy206; - default: goto yy204; + case '\r': goto yy207; + case '\\': goto yy205; + case ']': goto yy210; + default: goto yy208; } -yy206: +yy210: yych = *++c; switch (yych) { - case ':': goto yy207; - default: goto yy203; + case ':': goto yy211; + default: goto yy207; } -yy207: +yy211: yych = *++c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy203; - default: goto yy208; + case '\r': goto yy207; + default: goto yy212; } -yy208: +yy212: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy210; - default: goto yy208; + case '\r': goto yy214; + default: goto yy212; } -yy210: +yy214: { return (size_t)( c - start ); } -yy211: +yy215: yych = *++c; switch (yych) { - case ' ': goto yy213; - case '[': goto yy212; - default: goto yy203; + case ' ': + case 0xA0: goto yy217; + case '[': goto yy216; + default: goto yy207; } -yy212: +yy216: yych = *++c; switch (yych) { - case '?': goto yy202; - default: goto yy203; + case '?': goto yy205; + default: goto yy207; } -yy213: +yy217: ++c; switch ((yych = *c)) { - case '[': goto yy212; - default: goto yy203; + case '[': goto yy216; + default: goto yy207; } } @@ -3850,974 +3911,1039 @@ size_t scan_ref_link_no_attributes(const char * c) { { - char yych; + unsigned char yych; unsigned int yyaccept = 0; yych = *c; switch (yych) { - case '\n': goto yy216; - case ' ': goto yy217; - case '[': goto yy218; - default: goto yy219; + case '\n': goto yy220; + case ' ': + case 0xA0: goto yy221; + case '[': goto yy222; + default: goto yy223; } -yy216: +yy220: { return 0; } -yy217: +yy221: yyaccept = 0; yych = *(marker = ++c); switch (yych) { - case ' ': goto yy353; - case '[': goto yy354; - default: goto yy216; + case ' ': + case 0xA0: goto yy359; + case '[': goto yy227; + default: goto yy220; } -yy218: +yy222: yyaccept = 0; yych = *(marker = ++c); switch (yych) { case 0x00: case '\n': case '\r': - case ']': goto yy216; - default: goto yy220; + case ']': goto yy220; + default: goto yy225; } -yy219: +yy223: yych = *++c; - goto yy216; -yy220: + goto yy220; +yy224: ++c; yych = *c; -yy221: +yy225: switch (yych) { case 0x00: case '\n': - case '\r': goto yy222; - case ']': goto yy223; - default: goto yy220; + case '\r': goto yy226; + case '\\': goto yy227; + case ']': goto yy229; + default: goto yy224; } -yy222: +yy226: c = marker; if (yyaccept == 0) { - goto yy216; + goto yy220; } else { - goto yy236; + goto yy242; } -yy223: +yy227: + ++c; + yych = *c; + switch (yych) { + case 0x00: + case '\n': + case '\r': + case ']': goto yy226; 
+ case '\\': goto yy227; + default: goto yy224; + } +yy229: yych = *++c; switch (yych) { - case ':': goto yy224; - default: goto yy222; + case ':': goto yy230; + default: goto yy226; } -yy224: +yy230: ++c; yych = *c; switch (yych) { - case 0x00: goto yy222; + case 0x00: goto yy226; case '\t': - case ' ': goto yy224; - case '\n': goto yy226; - case '\r': goto yy228; - case '<': goto yy229; - default: goto yy231; + case ' ': + case 0xA0: goto yy230; + case '\n': goto yy232; + case '\r': goto yy234; + case '<': goto yy235; + default: goto yy237; } -yy226: +yy232: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy222; + case '\r': goto yy226; case '\t': - case ' ': goto yy226; - case '<': goto yy229; - default: goto yy231; + case ' ': + case 0xA0: goto yy232; + case '<': goto yy235; + default: goto yy237; } -yy228: +yy234: yych = *++c; switch (yych) { case 0x00: - case '\r': goto yy222; + case '\r': goto yy226; case '\t': case '\n': - case ' ': goto yy226; - case '<': goto yy229; - default: goto yy231; + case ' ': + case 0xA0: goto yy232; + case '<': goto yy235; + default: goto yy237; } -yy229: +yy235: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy233; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy326; - case '\'': goto yy328; - case '(': goto yy330; - case '>': goto yy231; - default: goto yy229; + case ' ': + case 0xA0: goto yy239; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy332; + case '\'': goto yy334; + case '(': goto yy336; + case '>': goto yy237; + default: goto yy235; } -yy231: +yy237: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy233; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy239; - case '\'': goto yy241; - case '(': goto yy243; - default: goto yy231; + case ' ': + case 0xA0: goto yy239; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy245; + case '\'': goto yy247; + case '(': goto yy249; + default: goto yy237; } -yy233: +yy239: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy233; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy255; - case '\'': goto yy257; - case '(': goto yy259; - default: goto yy222; + case ' ': + case 0xA0: goto yy239; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy261; + case '\'': goto yy263; + case '(': goto yy265; + default: goto yy226; } -yy235: +yy241: ++c; -yy236: +yy242: { return (size_t)( c - start ); } -yy237: +yy243: yyaccept = 1; yych = *(marker = ++c); switch (yych) { - case '"': goto yy255; - case '\'': goto yy257; - case '(': goto yy259; - default: goto yy236; + case '"': goto yy261; + case '\'': goto yy263; + case '(': goto yy265; + default: goto yy242; } -yy238: +yy244: yyaccept = 1; yych = *(marker = ++c); switch (yych) { - case '\n': goto yy237; - case '"': goto yy255; - case '\'': goto yy257; - case '(': goto yy259; - default: goto yy236; + case '\n': goto yy243; + case '"': goto yy261; + case '\'': goto yy263; + case '(': goto yy265; + default: goto yy242; } -yy239: +yy245: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy324; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy312; - case '\'': goto yy302; - case '(': goto yy247; - default: goto yy239; + case ' ': + case 0xA0: goto yy330; + case '\n': goto yy243; 
+ case '\r': goto yy244; + case '"': goto yy318; + case '\'': goto yy308; + case '(': goto yy253; + default: goto yy245; } -yy241: +yy247: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy322; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy302; - case '\'': goto yy264; - case '(': goto yy249; - default: goto yy241; + case ' ': + case 0xA0: goto yy328; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy308; + case '\'': goto yy270; + case '(': goto yy255; + default: goto yy247; } -yy243: +yy249: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy245; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy247; - case '\'': goto yy249; - case ')': goto yy251; - default: goto yy243; + case ' ': + case 0xA0: goto yy251; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy253; + case '\'': goto yy255; + case ')': goto yy257; + default: goto yy249; } -yy245: +yy251: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy245; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy292; - case '\'': goto yy276; - case ')': goto yy261; - default: goto yy259; + case ' ': + case 0xA0: goto yy251; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy298; + case '\'': goto yy282; + case ')': goto yy267; + default: goto yy265; } -yy247: +yy253: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy314; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy316; - case '\'': goto yy268; - case ')': goto yy312; - default: goto yy247; + case ' ': + case 0xA0: goto yy320; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy322; + case '\'': goto yy274; + case ')': goto yy318; + default: goto yy253; } -yy249: +yy255: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy266; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy268; - case '\'': goto yy270; - case ')': goto yy264; - default: goto yy249; + case ' ': + case 0xA0: goto yy272; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy274; + case '\'': goto yy276; + case ')': goto yy270; + default: goto yy255; } -yy251: +yy257: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy252; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy239; - case '\'': goto yy241; - case '(': goto yy243; - default: goto yy231; + case ' ': + case 0xA0: goto yy258; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy245; + case '\'': goto yy247; + case '(': goto yy249; + default: goto yy237; } -yy252: +yy258: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy252; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy255; - case '\'': goto yy257; - case '(': goto yy259; - default: goto yy222; + case ' ': + case 0xA0: goto yy258; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy261; + case '\'': goto yy263; + case '(': goto yy265; + default: goto yy226; } -yy254: +yy260: yyaccept = 1; yych = *(marker = ++c); switch (yych) { - case '\n': goto yy237; - case '"': goto yy255; - case '\'': goto yy257; - case '(': goto yy259; - default: goto 
yy236; + case '\n': goto yy243; + case '"': goto yy261; + case '\'': goto yy263; + case '(': goto yy265; + default: goto yy242; } -yy255: +yy261: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy222; - case '"': goto yy261; - default: goto yy255; + case '\r': goto yy226; + case '"': goto yy267; + default: goto yy261; } -yy257: +yy263: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy222; - case '\'': goto yy261; - default: goto yy257; + case '\r': goto yy226; + case '\'': goto yy267; + default: goto yy263; } -yy259: +yy265: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy222; - case ')': goto yy261; - default: goto yy259; + case '\r': goto yy226; + case ')': goto yy267; + default: goto yy265; } -yy261: +yy267: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy235; + case '\n': goto yy241; case '\t': - case ' ': goto yy261; - case '\r': goto yy263; - default: goto yy222; + case ' ': + case 0xA0: goto yy267; + case '\r': goto yy269; + default: goto yy226; } -yy263: +yy269: yych = *++c; switch (yych) { - case '\n': goto yy235; - default: goto yy236; + case '\n': goto yy241; + default: goto yy242; } -yy264: +yy270: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy310; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy302; - case '\'': goto yy264; - case '(': goto yy249; - default: goto yy241; - } -yy266: - ++c; - yych = *c; - switch (yych) { - case 0x00: goto yy235; - case '\t': - case ' ': goto yy266; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy278; - case '\'': goto yy280; - case ')': goto yy274; - default: goto yy276; - } -yy268: - ++c; - yych = *c; - switch (yych) { - case 0x00: goto yy235; - case '\t': - case ' ': goto yy296; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': - case '\'': goto yy294; - case ')': goto yy298; - default: goto yy268; - } -yy270: - ++c; - yych = *c; - switch (yych) { - case 0x00: goto yy235; - case '\t': - case ' ': goto yy272; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy268; + case ' ': + case 0xA0: goto yy316; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy308; case '\'': goto yy270; - case ')': goto yy264; - default: goto yy249; + case '(': goto yy255; + default: goto yy247; } yy272: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy272; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy278; - case '\'': goto yy280; - case ')': goto yy274; - default: goto yy276; + case ' ': + case 0xA0: goto yy272; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy284; + case '\'': goto yy286; + case ')': goto yy280; + default: goto yy282; } yy274: ++c; yych = *c; switch (yych) { - case 0x00: - case '\n': goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy274; - case '\r': goto yy263; - case '\'': goto yy261; - default: goto yy257; + case ' ': + case 0xA0: goto yy302; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': + case '\'': goto yy300; + case ')': goto yy304; + default: goto yy274; } yy276: ++c; yych = *c; switch (yych) { - case 0x00: - case '\n': - case '\r': goto yy222; - case '\'': goto yy282; - case ')': goto yy274; - default: goto yy276; + case 0x00: goto yy241; + case '\t': + case ' ': + case 0xA0: goto yy278; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy274; + 
case '\'': goto yy276; + case ')': goto yy270; + default: goto yy255; } yy278: ++c; yych = *c; switch (yych) { - case 0x00: - case '\n': - case '\r': goto yy222; - case '"': goto yy280; - case '\'': goto yy284; - case ')': goto yy286; - default: goto yy278; + case 0x00: goto yy241; + case '\t': + case ' ': + case 0xA0: goto yy278; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy284; + case '\'': goto yy286; + case ')': goto yy280; + default: goto yy282; } yy280: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy235; + case '\n': goto yy241; case '\t': - case ' ': goto yy280; - case '\r': goto yy263; - case '\'': goto yy282; - case ')': goto yy274; - default: goto yy276; + case ' ': + case 0xA0: goto yy280; + case '\r': goto yy269; + case '\'': goto yy267; + default: goto yy263; } yy282: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy235; - case '\t': - case ' ': goto yy282; - case '\r': goto yy263; - case ')': goto yy261; - default: goto yy259; + case '\n': + case '\r': goto yy226; + case '\'': goto yy288; + case ')': goto yy280; + default: goto yy282; } yy284: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy235; - case '\t': - case ' ': goto yy284; - case '\r': goto yy263; - case '"': goto yy282; - case ')': goto yy288; - default: goto yy292; + case '\n': + case '\r': goto yy226; + case '"': goto yy286; + case '\'': goto yy290; + case ')': goto yy292; + default: goto yy284; } yy286: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy235; + case '\n': goto yy241; case '\t': - case ' ': goto yy286; - case '\r': goto yy263; - case '"': goto yy274; + case ' ': + case 0xA0: goto yy286; + case '\r': goto yy269; case '\'': goto yy288; - default: goto yy290; + case ')': goto yy280; + default: goto yy282; } yy288: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy235; + case '\n': goto yy241; case '\t': - case ' ': goto yy288; - case '\r': goto yy263; - case '"': goto yy261; - default: goto yy255; + case ' ': + case 0xA0: goto yy288; + case '\r': goto yy269; + case ')': goto yy267; + default: goto yy265; } yy290: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': - case '\r': goto yy222; - case '"': goto yy274; - case '\'': goto yy288; - default: goto yy290; + case '\n': goto yy241; + case '\t': + case ' ': + case 0xA0: goto yy290; + case '\r': goto yy269; + case '"': goto yy288; + case ')': goto yy294; + default: goto yy298; } yy292: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': - case '\r': goto yy222; - case '"': goto yy282; - case ')': goto yy288; - default: goto yy292; + case '\n': goto yy241; + case '\t': + case ' ': + case 0xA0: goto yy292; + case '\r': goto yy269; + case '"': goto yy280; + case '\'': goto yy294; + default: goto yy296; } yy294: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: + case '\n': goto yy241; case '\t': - case ' ': goto yy308; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': - case '\'': goto yy294; - case ')': goto yy298; - default: goto yy268; + case ' ': + case 0xA0: goto yy294; + case '\r': goto yy269; + case '"': goto yy267; + default: goto yy261; } yy296: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; - case '\t': - case ' ': goto yy296; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': - case '\'': goto yy306; - case ')': goto yy286; - default: goto yy278; + case 0x00: + case '\n': + case '\r': goto yy226; + case '"': goto yy280; + case '\'': goto yy294; + default: goto yy296; } yy298: ++c; yych = *c; 
switch (yych) { - case 0x00: goto yy235; - case '\t': - case ' ': goto yy300; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': - case '\'': goto yy298; - case '(': goto yy268; - default: goto yy302; + case 0x00: + case '\n': + case '\r': goto yy226; + case '"': goto yy288; + case ')': goto yy294; + default: goto yy298; } yy300: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy300; - case '\n': goto yy237; - case '\r': goto yy254; + case ' ': + case 0xA0: goto yy314; + case '\n': goto yy243; + case '\r': goto yy260; case '"': - case '\'': goto yy286; - case '(': goto yy278; - default: goto yy290; + case '\'': goto yy300; + case ')': goto yy304; + default: goto yy274; } yy302: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy304; - case '\n': goto yy237; - case '\r': goto yy238; + case ' ': + case 0xA0: goto yy302; + case '\n': goto yy243; + case '\r': goto yy244; case '"': - case '\'': goto yy298; - case '(': goto yy268; - default: goto yy302; + case '\'': goto yy312; + case ')': goto yy292; + default: goto yy284; } yy304: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy304; - case '\n': goto yy237; - case '\r': goto yy238; + case ' ': + case 0xA0: goto yy306; + case '\n': goto yy243; + case '\r': goto yy260; case '"': - case '\'': goto yy286; - case '(': goto yy278; - default: goto yy290; + case '\'': goto yy304; + case '(': goto yy274; + default: goto yy308; } yy306: ++c; yych = *c; switch (yych) { - case 0x00: - case '\n': goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy306; - case '\r': goto yy263; - case '"': goto yy280; - case '\'': goto yy284; - case ')': goto yy286; - default: goto yy278; + case ' ': + case 0xA0: goto yy306; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': + case '\'': goto yy292; + case '(': goto yy284; + default: goto yy296; } yy308: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy308; - case '\n': goto yy237; - case '\r': goto yy254; + case ' ': + case 0xA0: goto yy310; + case '\n': goto yy243; + case '\r': goto yy244; case '"': - case '\'': goto yy306; - case ')': goto yy286; - default: goto yy278; + case '\'': goto yy304; + case '(': goto yy274; + default: goto yy308; } yy310: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy310; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy290; - case '\'': goto yy274; - case '(': goto yy276; - default: goto yy257; + case ' ': + case 0xA0: goto yy310; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': + case '\'': goto yy292; + case '(': goto yy284; + default: goto yy296; } yy312: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: + case '\n': goto yy241; case '\t': - case ' ': goto yy320; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy312; - case '\'': goto yy302; - case '(': goto yy247; - default: goto yy239; + case ' ': + case 0xA0: goto yy312; + case '\r': goto yy269; + case '"': goto yy286; + case '\'': goto yy290; + case ')': goto yy292; + default: goto yy284; } yy314: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy314; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy284; - case '\'': goto yy278; - case 
')': goto yy288; - default: goto yy292; + case ' ': + case 0xA0: goto yy314; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': + case '\'': goto yy312; + case ')': goto yy292; + default: goto yy284; } yy316: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy318; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy316; - case '\'': goto yy268; - case ')': goto yy312; - default: goto yy247; + case ' ': + case 0xA0: goto yy316; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy296; + case '\'': goto yy280; + case '(': goto yy282; + default: goto yy263; } yy318: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy318; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy284; - case '\'': goto yy278; - case ')': goto yy288; - default: goto yy292; + case ' ': + case 0xA0: goto yy326; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy318; + case '\'': goto yy308; + case '(': goto yy253; + default: goto yy245; } yy320: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy320; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy288; - case '\'': goto yy290; - case '(': goto yy292; - default: goto yy255; + case ' ': + case 0xA0: goto yy320; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy290; + case '\'': goto yy284; + case ')': goto yy294; + default: goto yy298; } yy322: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy322; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy290; + case ' ': + case 0xA0: goto yy324; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy322; case '\'': goto yy274; - case '(': goto yy276; - default: goto yy257; + case ')': goto yy318; + default: goto yy253; } yy324: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy324; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy288; - case '\'': goto yy290; - case '(': goto yy292; - default: goto yy255; + case ' ': + case 0xA0: goto yy324; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy290; + case '\'': goto yy284; + case ')': goto yy294; + default: goto yy298; } yy326: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy324; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy349; - case '\'': goto yy347; - case '(': goto yy332; - case '>': goto yy239; - default: goto yy326; + case ' ': + case 0xA0: goto yy326; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy294; + case '\'': goto yy296; + case '(': goto yy298; + default: goto yy261; } yy328: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy322; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy347; - case '\'': goto yy337; - case '(': goto yy334; - case '>': goto yy241; - default: goto yy328; + case ' ': + case 0xA0: goto yy328; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy296; + case '\'': goto yy280; + case '(': goto yy282; + default: goto yy263; } yy330: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy245; - case 
'\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy332; - case '\'': goto yy334; - case ')': goto yy336; - case '>': goto yy243; - default: goto yy330; + case ' ': + case 0xA0: goto yy330; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy294; + case '\'': goto yy296; + case '(': goto yy298; + default: goto yy261; } yy332: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy314; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy351; - case '\'': goto yy339; - case ')': goto yy349; - case '>': goto yy247; + case ' ': + case 0xA0: goto yy330; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy355; + case '\'': goto yy353; + case '(': goto yy338; + case '>': goto yy245; default: goto yy332; } yy334: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy266; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': goto yy339; - case '\'': goto yy341; - case ')': goto yy337; - case '>': goto yy249; + case ' ': + case 0xA0: goto yy328; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy353; + case '\'': goto yy343; + case '(': goto yy340; + case '>': goto yy247; default: goto yy334; } yy336: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy252; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy326; - case '\'': goto yy328; - case '(': goto yy330; - case '>': goto yy231; - default: goto yy229; + case ' ': + case 0xA0: goto yy251; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy338; + case '\'': goto yy340; + case ')': goto yy342; + case '>': goto yy249; + default: goto yy336; } -yy337: +yy338: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy310; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy347; - case '\'': goto yy337; - case '(': goto yy334; - case '>': goto yy241; - default: goto yy328; + case ' ': + case 0xA0: goto yy320; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy357; + case '\'': goto yy345; + case ')': goto yy355; + case '>': goto yy253; + default: goto yy338; } -yy339: +yy340: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy296; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': - case '\'': goto yy343; - case ')': goto yy345; - case '>': goto yy268; - default: goto yy339; + case ' ': + case 0xA0: goto yy272; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': goto yy345; + case '\'': goto yy347; + case ')': goto yy343; + case '>': goto yy255; + default: goto yy340; } -yy341: +yy342: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy272; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy339; - case '\'': goto yy341; - case ')': goto yy337; - case '>': goto yy249; - default: goto yy334; + case ' ': + case 0xA0: goto yy258; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy332; + case '\'': goto yy334; + case '(': goto yy336; + case '>': goto yy237; + default: goto yy235; } yy343: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy308; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': + case ' ': + case 0xA0: goto yy316; 
+ case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy353; case '\'': goto yy343; - case ')': goto yy345; - case '>': goto yy268; - default: goto yy339; + case '(': goto yy340; + case '>': goto yy247; + default: goto yy334; } yy345: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy300; - case '\n': goto yy237; - case '\r': goto yy254; + case ' ': + case 0xA0: goto yy302; + case '\n': goto yy243; + case '\r': goto yy244; case '"': - case '\'': goto yy345; - case '(': goto yy339; - case '>': goto yy302; - default: goto yy347; + case '\'': goto yy349; + case ')': goto yy351; + case '>': goto yy274; + default: goto yy345; } yy347: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy304; - case '\n': goto yy237; - case '\r': goto yy238; - case '"': - case '\'': goto yy345; - case '(': goto yy339; - case '>': goto yy302; - default: goto yy347; + case ' ': + case 0xA0: goto yy278; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy345; + case '\'': goto yy347; + case ')': goto yy343; + case '>': goto yy255; + default: goto yy340; } yy349: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy320; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy349; - case '\'': goto yy347; - case '(': goto yy332; - case '>': goto yy239; - default: goto yy326; + case ' ': + case 0xA0: goto yy314; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': + case '\'': goto yy349; + case ')': goto yy351; + case '>': goto yy274; + default: goto yy345; } yy351: ++c; yych = *c; switch (yych) { - case 0x00: goto yy235; + case 0x00: goto yy241; case '\t': - case ' ': goto yy318; - case '\n': goto yy237; - case '\r': goto yy254; - case '"': goto yy351; - case '\'': goto yy339; - case ')': goto yy349; - case '>': goto yy247; - default: goto yy332; + case ' ': + case 0xA0: goto yy306; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': + case '\'': goto yy351; + case '(': goto yy345; + case '>': goto yy308; + default: goto yy353; } yy353: - yych = *++c; + ++c; + yych = *c; + switch (yych) { + case 0x00: goto yy241; + case '\t': + case ' ': + case 0xA0: goto yy310; + case '\n': goto yy243; + case '\r': goto yy244; + case '"': + case '\'': goto yy351; + case '(': goto yy345; + case '>': goto yy308; + default: goto yy353; + } +yy355: + ++c; + yych = *c; + switch (yych) { + case 0x00: goto yy241; + case '\t': + case ' ': + case 0xA0: goto yy326; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy355; + case '\'': goto yy353; + case '(': goto yy338; + case '>': goto yy245; + default: goto yy332; + } +yy357: + ++c; + yych = *c; switch (yych) { - case ' ': goto yy355; - case '[': goto yy354; - default: goto yy222; + case 0x00: goto yy241; + case '\t': + case ' ': + case 0xA0: goto yy324; + case '\n': goto yy243; + case '\r': goto yy260; + case '"': goto yy357; + case '\'': goto yy345; + case ')': goto yy355; + case '>': goto yy253; + default: goto yy338; } -yy354: +yy359: yych = *++c; switch (yych) { - case ']': goto yy222; - default: goto yy221; + case ' ': + case 0xA0: goto yy360; + case '[': goto yy227; + default: goto yy226; } -yy355: +yy360: ++c; switch ((yych = *c)) { - case '[': goto yy354; - default: goto yy222; + case '[': goto yy227; + default: goto yy226; } } @@ -4830,92 +4956,101 @@ size_t scan_ref_link(const char * c) { { - char yych; + unsigned char yych; 
yych = *c; switch (yych) { - case '\n': goto yy358; - case ' ': goto yy359; - case '[': goto yy360; - default: goto yy361; + case '\n': goto yy363; + case ' ': + case 0xA0: goto yy364; + case '[': goto yy365; + default: goto yy366; } -yy358: +yy363: { return 0; } -yy359: +yy364: yych = *(marker = ++c); switch (yych) { - case ' ': goto yy370; - case '[': goto yy371; - default: goto yy358; + case ' ': + case 0xA0: goto yy377; + case '[': goto yy370; + default: goto yy363; } -yy360: +yy365: yych = *(marker = ++c); switch (yych) { case 0x00: case '\n': case '\r': - case ']': goto yy358; - default: goto yy362; + case ']': goto yy363; + default: goto yy368; } -yy361: +yy366: yych = *++c; - goto yy358; -yy362: + goto yy363; +yy367: ++c; yych = *c; -yy363: +yy368: switch (yych) { case 0x00: case '\n': - case '\r': goto yy364; - case ']': goto yy365; - default: goto yy362; + case '\r': goto yy369; + case '\\': goto yy370; + case ']': goto yy372; + default: goto yy367; } -yy364: +yy369: c = marker; - goto yy358; -yy365: + goto yy363; +yy370: + ++c; + yych = *c; + switch (yych) { + case 0x00: + case '\n': + case '\r': + case ']': goto yy369; + case '\\': goto yy370; + default: goto yy367; + } +yy372: yych = *++c; switch (yych) { - case ':': goto yy366; - default: goto yy364; + case ':': goto yy373; + default: goto yy369; } -yy366: +yy373: yych = *++c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy364; - default: goto yy367; + case '\r': goto yy369; + default: goto yy374; } -yy367: +yy374: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy369; - default: goto yy367; + case '\r': goto yy376; + default: goto yy374; } -yy369: +yy376: { return (size_t)( c - start ); } -yy370: - yych = *++c; - switch (yych) { - case ' ': goto yy372; - case '[': goto yy371; - default: goto yy364; - } -yy371: +yy377: yych = *++c; switch (yych) { - case ']': goto yy364; - default: goto yy363; + case ' ': + case 0xA0: goto yy378; + case '[': goto yy370; + default: goto yy369; } -yy372: +yy378: ++c; switch ((yych = *c)) { - case '[': goto yy371; - default: goto yy364; + case '[': goto yy370; + default: goto yy369; } } @@ -4928,20 +5063,20 @@ size_t scan_html(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy375; - case '<': goto yy376; - default: goto yy377; + case '\n': goto yy381; + case '<': goto yy382; + default: goto yy383; } -yy375: +yy381: { return 0; } -yy376: +yy382: yych = *(marker = ++c); switch (yych) { - case '!': goto yy378; - case '/': goto yy380; + case '!': goto yy384; + case '/': goto yy386; case 'A': case 'B': case 'C': @@ -4993,22 +5128,22 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy381; - default: goto yy375; + case 'z': goto yy387; + default: goto yy381; } -yy377: +yy383: yych = *++c; - goto yy375; -yy378: + goto yy381; +yy384: yych = *++c; switch (yych) { - case '-': goto yy409; - default: goto yy379; + case '-': goto yy415; + default: goto yy385; } -yy379: +yy385: c = marker; - goto yy375; -yy380: + goto yy381; +yy386: yych = *++c; switch (yych) { case 'A': @@ -5062,17 +5197,18 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy405; - default: goto yy379; + case 'z': goto yy411; + default: goto yy385; } -yy381: +yy387: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy385; - case '\n': goto yy387; - case '\r': goto yy389; + case ' ': + case 0xA0: goto yy391; + case '\n': goto yy393; + case '\r': goto yy395; case '-': case '0': 
case '1': @@ -5083,11 +5219,11 @@ size_t scan_html(const char * c) { case '6': case '7': case '8': - case '9': goto yy381; - case '/': goto yy394; + case '9': goto yy387; + case '/': goto yy400; case ':': - case '_': goto yy390; - case '>': goto yy392; + case '_': goto yy396; + case '>': goto yy398; case 'A': case 'B': case 'C': @@ -5139,17 +5275,18 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy383; - default: goto yy379; + case 'z': goto yy389; + default: goto yy385; } -yy383: +yy389: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy385; - case '\n': goto yy387; - case '\r': goto yy389; + case ' ': + case 0xA0: goto yy391; + case '\n': goto yy393; + case '\r': goto yy395; case '-': case '0': case '1': @@ -5212,24 +5349,25 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy383; + case 'z': goto yy389; case '.': case ':': - case '_': goto yy390; - case '/': goto yy394; - case '=': goto yy395; - case '>': goto yy392; - default: goto yy379; + case '_': goto yy396; + case '/': goto yy400; + case '=': goto yy401; + case '>': goto yy398; + default: goto yy385; } -yy385: +yy391: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy385; - case '\n': goto yy387; - case '\r': goto yy389; - case '/': goto yy394; + case ' ': + case 0xA0: goto yy391; + case '\n': goto yy393; + case '\r': goto yy395; + case '/': goto yy400; case ':': case 'A': case 'B': @@ -5283,16 +5421,17 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy390; - case '>': goto yy392; - default: goto yy379; + case 'z': goto yy396; + case '>': goto yy398; + default: goto yy385; } -yy387: +yy393: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy387; + case ' ': + case 0xA0: goto yy393; case ':': case 'A': case 'B': @@ -5346,16 +5485,17 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy390; - default: goto yy379; + case 'z': goto yy396; + default: goto yy385; } -yy389: +yy395: ++c; yych = *c; switch (yych) { case '\t': case '\n': - case ' ': goto yy387; + case ' ': + case 0xA0: goto yy393; case ':': case 'A': case 'B': @@ -5409,10 +5549,10 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy390; - default: goto yy379; + case 'z': goto yy396; + default: goto yy385; } -yy390: +yy396: ++c; yych = *c; switch (yych) { @@ -5481,27 +5621,28 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy390; - case '=': goto yy395; - default: goto yy379; + case 'z': goto yy396; + case '=': goto yy401; + default: goto yy385; } -yy392: +yy398: ++c; { return (size_t)( c - start ); } -yy394: +yy400: yych = *++c; switch (yych) { - case '>': goto yy392; - default: goto yy379; + case '>': goto yy398; + default: goto yy385; } -yy395: +yy401: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy395; - case '"': goto yy397; - case '\'': goto yy399; + case ' ': + case 0xA0: goto yy401; + case '"': goto yy403; + case '\'': goto yy405; case '.': case '0': case '1': @@ -5564,37 +5705,38 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy401; - default: goto yy379; + case 'z': goto yy407; + default: goto yy385; } -yy397: +yy403: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy379; - case '"': goto yy385; - default: goto yy397; + case '\r': goto yy385; + case '"': goto yy391; + default: goto yy403; } -yy399: +yy405: ++c; yych = *c; switch (yych) { case 
0x00: case '\n': - case '\r': goto yy379; - case '\'': goto yy385; - default: goto yy399; + case '\r': goto yy385; + case '\'': goto yy391; + default: goto yy405; } -yy401: +yy407: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy385; - case '\n': goto yy387; - case '\r': goto yy389; + case ' ': + case 0xA0: goto yy391; + case '\n': goto yy393; + case '\r': goto yy395; case '.': case '0': case '1': @@ -5605,11 +5747,11 @@ size_t scan_html(const char * c) { case '6': case '7': case '8': - case '9': goto yy401; - case '/': goto yy394; + case '9': goto yy407; + case '/': goto yy400; case ':': - case '_': goto yy390; - case '>': goto yy392; + case '_': goto yy396; + case '>': goto yy398; case 'A': case 'B': case 'C': @@ -5661,20 +5803,21 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy403; - default: goto yy379; + case 'z': goto yy409; + default: goto yy385; } -yy403: +yy409: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy385; - case '\n': goto yy387; - case '\r': goto yy389; + case ' ': + case 0xA0: goto yy391; + case '\n': goto yy393; + case '\r': goto yy395; case '-': case ':': - case '_': goto yy390; + case '_': goto yy396; case '.': case '0': case '1': @@ -5737,18 +5880,19 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy403; - case '/': goto yy394; - case '=': goto yy395; - case '>': goto yy392; - default: goto yy379; + case 'z': goto yy409; + case '/': goto yy400; + case '=': goto yy401; + case '>': goto yy398; + default: goto yy385; } -yy405: +yy411: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy407; + case ' ': + case 0xA0: goto yy413; case '-': case '0': case '1': @@ -5811,58 +5955,59 @@ size_t scan_html(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy405; - case '>': goto yy392; - default: goto yy379; + case 'z': goto yy411; + case '>': goto yy398; + default: goto yy385; } -yy407: +yy413: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy407; - case '>': goto yy392; - default: goto yy379; + case ' ': + case 0xA0: goto yy413; + case '>': goto yy398; + default: goto yy385; } -yy409: +yy415: yych = *++c; switch (yych) { - case '-': goto yy410; - default: goto yy379; + case '-': goto yy416; + default: goto yy385; } -yy410: +yy416: yych = *++c; switch (yych) { - case '-': goto yy379; - default: goto yy412; + case '-': goto yy385; + default: goto yy418; } -yy411: +yy417: ++c; yych = *c; -yy412: +yy418: switch (yych) { case 0x00: - case '>': goto yy379; - case '-': goto yy413; - default: goto yy411; + case '>': goto yy385; + case '-': goto yy419; + default: goto yy417; } -yy413: +yy419: ++c; yych = *c; switch (yych) { case 0x00: - case '>': goto yy379; - case '-': goto yy414; - default: goto yy411; + case '>': goto yy385; + case '-': goto yy420; + default: goto yy417; } -yy414: +yy420: ++c; yych = *c; switch (yych) { - case 0x00: goto yy379; - case '-': goto yy414; - case '>': goto yy392; - default: goto yy411; + case 0x00: goto yy385; + case '-': goto yy420; + case '>': goto yy398; + default: goto yy417; } } @@ -5875,74 +6020,74 @@ size_t scan_html_comment(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy418; - case '<': goto yy419; - default: goto yy420; + case '\n': goto yy424; + case '<': goto yy425; + default: goto yy426; } -yy418: +yy424: { return 0; } -yy419: +yy425: yych = *(marker = ++c); switch (yych) { - case '!': goto yy421; - default: goto yy418; + case '!': goto yy427; + 
default: goto yy424; } -yy420: +yy426: yych = *++c; - goto yy418; -yy421: + goto yy424; +yy427: yych = *++c; switch (yych) { - case '-': goto yy423; - default: goto yy422; + case '-': goto yy429; + default: goto yy428; } -yy422: +yy428: c = marker; - goto yy418; -yy423: + goto yy424; +yy429: yych = *++c; switch (yych) { - case '-': goto yy424; - default: goto yy422; + case '-': goto yy430; + default: goto yy428; } -yy424: +yy430: yych = *++c; switch (yych) { - case '-': goto yy422; - default: goto yy426; + case '-': goto yy428; + default: goto yy432; } -yy425: +yy431: ++c; yych = *c; -yy426: +yy432: switch (yych) { case 0x00: - case '>': goto yy422; - case '-': goto yy427; - default: goto yy425; + case '>': goto yy428; + case '-': goto yy433; + default: goto yy431; } -yy427: +yy433: ++c; yych = *c; switch (yych) { case 0x00: - case '>': goto yy422; - case '-': goto yy428; - default: goto yy425; + case '>': goto yy428; + case '-': goto yy434; + default: goto yy431; } -yy428: +yy434: ++c; yych = *c; switch (yych) { - case 0x00: goto yy422; - case '-': goto yy428; - case '>': goto yy430; - default: goto yy425; + case 0x00: goto yy428; + case '-': goto yy434; + case '>': goto yy436; + default: goto yy431; } -yy430: +yy436: ++c; { return (size_t)( c - start ); } } @@ -5956,133 +6101,133 @@ size_t scan_html_block(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy434; - case '<': goto yy435; - default: goto yy436; + case '\n': goto yy440; + case '<': goto yy441; + default: goto yy442; } -yy434: +yy440: { return 0; } -yy435: +yy441: yych = *(marker = ++c); switch (yych) { - case '/': goto yy437; + case '/': goto yy443; case 'A': - case 'a': goto yy440; + case 'a': goto yy446; case 'B': - case 'b': goto yy441; + case 'b': goto yy447; case 'C': - case 'c': goto yy442; + case 'c': goto yy448; case 'D': - case 'd': goto yy443; + case 'd': goto yy449; case 'F': - case 'f': goto yy444; + case 'f': goto yy450; case 'H': - case 'h': goto yy445; + case 'h': goto yy451; case 'I': - case 'i': goto yy446; + case 'i': goto yy452; case 'L': - case 'l': goto yy447; + case 'l': goto yy453; case 'M': - case 'm': goto yy448; + case 'm': goto yy454; case 'N': - case 'n': goto yy449; + case 'n': goto yy455; case 'O': - case 'o': goto yy450; + case 'o': goto yy456; case 'P': - case 'p': goto yy439; + case 'p': goto yy445; case 'S': - case 's': goto yy451; + case 's': goto yy457; case 'T': - case 't': goto yy452; + case 't': goto yy458; case 'U': - case 'u': goto yy453; + case 'u': goto yy459; case 'V': - case 'v': goto yy454; - default: goto yy434; + case 'v': goto yy460; + default: goto yy440; } -yy436: +yy442: yych = *++c; - goto yy434; -yy437: + goto yy440; +yy443: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy440; + case 'a': goto yy446; case 'B': - case 'b': goto yy441; + case 'b': goto yy447; case 'C': - case 'c': goto yy442; + case 'c': goto yy448; case 'D': - case 'd': goto yy443; + case 'd': goto yy449; case 'F': - case 'f': goto yy444; + case 'f': goto yy450; case 'H': - case 'h': goto yy445; + case 'h': goto yy451; case 'I': - case 'i': goto yy446; + case 'i': goto yy452; case 'L': - case 'l': goto yy447; + case 'l': goto yy453; case 'M': - case 'm': goto yy448; + case 'm': goto yy454; case 'N': - case 'n': goto yy449; + case 'n': goto yy455; case 'O': - case 'o': goto yy450; + case 'o': goto yy456; case 'P': - case 'p': goto yy439; + case 'p': goto yy445; case 'S': - case 's': goto yy451; + case 's': goto yy457; case 'T': - case 't': goto 
yy452; + case 't': goto yy458; case 'U': - case 'u': goto yy453; + case 'u': goto yy459; case 'V': - case 'v': goto yy454; - default: goto yy438; + case 'v': goto yy460; + default: goto yy444; } -yy438: +yy444: c = marker; - goto yy434; -yy439: + goto yy440; +yy445: yych = *++c; switch (yych) { - case '/': goto yy466; - case '>': goto yy467; + case '/': goto yy472; + case '>': goto yy473; case 'R': - case 'r': goto yy586; - default: goto yy460; + case 'r': goto yy592; + default: goto yy466; } -yy440: +yy446: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy575; + case 'd': goto yy581; case 'R': - case 'r': goto yy574; + case 'r': goto yy580; case 'S': - case 's': goto yy573; - default: goto yy438; + case 's': goto yy579; + default: goto yy444; } -yy441: +yy447: yych = *++c; switch (yych) { case 'L': - case 'l': goto yy565; - default: goto yy438; + case 'l': goto yy571; + default: goto yy444; } -yy442: +yy448: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy558; + case 'a': goto yy564; case 'E': - case 'e': goto yy557; - default: goto yy438; + case 'e': goto yy563; + default: goto yy444; } -yy443: +yy449: yych = *++c; switch (yych) { case 'D': @@ -6090,23 +6235,23 @@ size_t scan_html_block(const char * c) { case 'T': case 'd': case 'l': - case 't': goto yy458; + case 't': goto yy464; case 'I': - case 'i': goto yy556; - default: goto yy438; + case 'i': goto yy562; + default: goto yy444; } -yy444: +yy450: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy532; + case 'i': goto yy538; case 'O': - case 'o': goto yy531; + case 'o': goto yy537; case 'R': - case 'r': goto yy530; - default: goto yy438; + case 'r': goto yy536; + default: goto yy444; } -yy445: +yy451: yych = *++c; switch (yych) { case '1': @@ -6116,122 +6261,123 @@ size_t scan_html_block(const char * c) { case '5': case '6': case 'R': - case 'r': goto yy458; + case 'r': goto yy464; case 'E': - case 'e': goto yy523; + case 'e': goto yy529; case 'G': - case 'g': goto yy522; - default: goto yy438; + case 'g': goto yy528; + default: goto yy444; } -yy446: +yy452: yych = *++c; switch (yych) { case 'S': - case 's': goto yy517; - default: goto yy438; + case 's': goto yy523; + default: goto yy444; } -yy447: +yy453: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy458; - default: goto yy438; + case 'i': goto yy464; + default: goto yy444; } -yy448: +yy454: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy514; + case 'a': goto yy520; case 'E': - case 'e': goto yy513; - default: goto yy438; + case 'e': goto yy519; + default: goto yy444; } -yy449: +yy455: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy502; + case 'a': goto yy508; case 'O': - case 'o': goto yy501; - default: goto yy438; + case 'o': goto yy507; + default: goto yy444; } -yy450: +yy456: yych = *++c; switch (yych) { case 'L': - case 'l': goto yy458; + case 'l': goto yy464; case 'U': - case 'u': goto yy497; - default: goto yy438; + case 'u': goto yy503; + default: goto yy444; } -yy451: +yy457: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy492; - default: goto yy438; + case 'e': goto yy498; + default: goto yy444; } -yy452: +yy458: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy482; + case 'a': goto yy488; case 'B': - case 'b': goto yy481; + case 'b': goto yy487; case 'D': case 'R': case 'd': - case 'r': goto yy458; + case 'r': goto yy464; case 'F': - case 'f': goto yy480; + case 'f': goto yy486; case 'H': - case 'h': goto yy479; - default: goto yy438; + case 'h': goto yy485; + default: goto yy444; } -yy453: 
+yy459: yych = *++c; switch (yych) { case 'L': - case 'l': goto yy458; - default: goto yy438; + case 'l': goto yy464; + default: goto yy444; } -yy454: +yy460: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy455; - default: goto yy438; + case 'i': goto yy461; + default: goto yy444; } -yy455: +yy461: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy456; - default: goto yy438; + case 'd': goto yy462; + default: goto yy444; } -yy456: +yy462: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy457; - default: goto yy438; + case 'e': goto yy463; + default: goto yy444; } -yy457: +yy463: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy458; - default: goto yy438; + case 'o': goto yy464; + default: goto yy444; } -yy458: +yy464: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy459; - case '\n': goto yy461; - case '\r': goto yy463; - case '/': goto yy466; + case ' ': + case 0xA0: goto yy465; + case '\n': goto yy467; + case '\r': goto yy469; + case '/': goto yy472; case ':': case 'A': case 'B': @@ -6285,19 +6431,20 @@ size_t scan_html_block(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy464; - case '>': goto yy467; - default: goto yy438; + case 'z': goto yy470; + case '>': goto yy473; + default: goto yy444; } -yy459: +yy465: ++c; yych = *c; -yy460: +yy466: switch (yych) { case '\t': - case ' ': goto yy459; - case '\n': goto yy461; - case '\r': goto yy463; + case ' ': + case 0xA0: goto yy465; + case '\n': goto yy467; + case '\r': goto yy469; case ':': case 'A': case 'B': @@ -6351,15 +6498,16 @@ size_t scan_html_block(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy464; - default: goto yy438; + case 'z': goto yy470; + default: goto yy444; } -yy461: +yy467: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy461; + case ' ': + case 0xA0: goto yy467; case ':': case 'A': case 'B': @@ -6413,16 +6561,17 @@ size_t scan_html_block(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy464; - default: goto yy438; + case 'z': goto yy470; + default: goto yy444; } -yy463: +yy469: ++c; yych = *c; switch (yych) { case '\t': case '\n': - case ' ': goto yy461; + case ' ': + case 0xA0: goto yy467; case ':': case 'A': case 'B': @@ -6476,13 +6625,13 @@ size_t scan_html_block(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy464; - default: goto yy438; + case 'z': goto yy470; + default: goto yy444; } -yy464: +yy470: ++c; yych = *c; -yy465: +yy471: switch (yych) { case '-': case '.': @@ -6549,27 +6698,28 @@ size_t scan_html_block(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy464; - case '=': goto yy469; - default: goto yy438; + case 'z': goto yy470; + case '=': goto yy475; + default: goto yy444; } -yy466: +yy472: yych = *++c; switch (yych) { - case '>': goto yy467; - default: goto yy438; + case '>': goto yy473; + default: goto yy444; } -yy467: +yy473: ++c; { return (size_t)( c - start ); } -yy469: +yy475: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy469; - case '"': goto yy471; - case '\'': goto yy473; + case ' ': + case 0xA0: goto yy475; + case '"': goto yy477; + case '\'': goto yy479; case '.': case '0': case '1': @@ -6632,37 +6782,38 @@ size_t scan_html_block(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy475; - default: goto yy438; + case 'z': goto yy481; + default: goto yy444; } -yy471: +yy477: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy438; - case '"': goto yy458; - default: goto yy471; + case '\r': goto 
yy444; + case '"': goto yy464; + default: goto yy477; } -yy473: +yy479: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy438; - case '\'': goto yy458; - default: goto yy473; + case '\r': goto yy444; + case '\'': goto yy464; + default: goto yy479; } -yy475: +yy481: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy459; - case '\n': goto yy461; - case '\r': goto yy463; + case ' ': + case 0xA0: goto yy465; + case '\n': goto yy467; + case '\r': goto yy469; case '.': case '0': case '1': @@ -6673,11 +6824,11 @@ size_t scan_html_block(const char * c) { case '6': case '7': case '8': - case '9': goto yy475; - case '/': goto yy466; + case '9': goto yy481; + case '/': goto yy472; case ':': - case '_': goto yy464; - case '>': goto yy467; + case '_': goto yy470; + case '>': goto yy473; case 'A': case 'B': case 'C': @@ -6729,20 +6880,21 @@ size_t scan_html_block(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy477; - default: goto yy438; + case 'z': goto yy483; + default: goto yy444; } -yy477: +yy483: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy459; - case '\n': goto yy461; - case '\r': goto yy463; + case ' ': + case 0xA0: goto yy465; + case '\n': goto yy467; + case '\r': goto yy469; case '-': case ':': - case '_': goto yy464; + case '_': goto yy470; case '.': case '0': case '1': @@ -6805,99 +6957,99 @@ size_t scan_html_block(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy477; - case '/': goto yy466; - case '=': goto yy469; - case '>': goto yy467; - default: goto yy438; + case 'z': goto yy483; + case '/': goto yy472; + case '=': goto yy475; + case '>': goto yy473; + default: goto yy444; } -yy479: +yy485: yych = *++c; switch (yych) { - case '/': goto yy466; - case '>': goto yy467; + case '/': goto yy472; + case '>': goto yy473; case 'E': - case 'e': goto yy489; - default: goto yy460; + case 'e': goto yy495; + default: goto yy466; } -yy480: +yy486: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy487; - default: goto yy438; + case 'o': goto yy493; + default: goto yy444; } -yy481: +yy487: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy485; - default: goto yy438; + case 'o': goto yy491; + default: goto yy444; } -yy482: +yy488: yych = *++c; switch (yych) { case 'B': - case 'b': goto yy483; - default: goto yy438; + case 'b': goto yy489; + default: goto yy444; } -yy483: +yy489: yych = *++c; switch (yych) { case 'L': - case 'l': goto yy484; - default: goto yy438; + case 'l': goto yy490; + default: goto yy444; } -yy484: +yy490: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy458; - default: goto yy438; + case 'e': goto yy464; + default: goto yy444; } -yy485: +yy491: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy486; - default: goto yy438; + case 'd': goto yy492; + default: goto yy444; } -yy486: +yy492: yych = *++c; switch (yych) { case 'Y': - case 'y': goto yy458; - default: goto yy438; + case 'y': goto yy464; + default: goto yy444; } -yy487: +yy493: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy488; - default: goto yy438; + case 'o': goto yy494; + default: goto yy444; } -yy488: +yy494: yych = *++c; switch (yych) { case 'T': - case 't': goto yy458; - default: goto yy438; + case 't': goto yy464; + default: goto yy444; } -yy489: +yy495: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy490; - default: goto yy465; + case 'a': goto yy496; + default: goto yy471; } -yy490: +yy496: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy491; - default: goto yy465; 
+ case 'd': goto yy497; + default: goto yy471; } -yy491: +yy497: yych = *++c; switch (yych) { case '-': @@ -6911,686 +7063,686 @@ size_t scan_html_block(const char * c) { case '6': case '7': case '8': - case '9': goto yy464; - case '/': goto yy466; - case '=': goto yy469; - case '>': goto yy467; - default: goto yy460; + case '9': goto yy470; + case '/': goto yy472; + case '=': goto yy475; + case '>': goto yy473; + default: goto yy466; } -yy492: +yy498: yych = *++c; switch (yych) { case 'C': - case 'c': goto yy493; - default: goto yy438; + case 'c': goto yy499; + default: goto yy444; } -yy493: +yy499: yych = *++c; switch (yych) { case 'T': - case 't': goto yy494; - default: goto yy438; + case 't': goto yy500; + default: goto yy444; } -yy494: +yy500: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy495; - default: goto yy438; + case 'i': goto yy501; + default: goto yy444; } -yy495: +yy501: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy496; - default: goto yy438; + case 'o': goto yy502; + default: goto yy444; } -yy496: +yy502: yych = *++c; switch (yych) { case 'N': - case 'n': goto yy458; - default: goto yy438; + case 'n': goto yy464; + default: goto yy444; } -yy497: +yy503: yych = *++c; switch (yych) { case 'T': - case 't': goto yy498; - default: goto yy438; + case 't': goto yy504; + default: goto yy444; } -yy498: +yy504: yych = *++c; switch (yych) { case 'P': - case 'p': goto yy499; - default: goto yy438; + case 'p': goto yy505; + default: goto yy444; } -yy499: +yy505: yych = *++c; switch (yych) { case 'U': - case 'u': goto yy500; - default: goto yy438; + case 'u': goto yy506; + default: goto yy444; } -yy500: +yy506: yych = *++c; switch (yych) { case 'T': - case 't': goto yy458; - default: goto yy438; + case 't': goto yy464; + default: goto yy444; } -yy501: +yy507: yych = *++c; switch (yych) { case 'F': - case 'f': goto yy503; + case 'f': goto yy509; case 'S': - case 's': goto yy504; - default: goto yy438; + case 's': goto yy510; + default: goto yy444; } -yy502: +yy508: yych = *++c; switch (yych) { case 'V': - case 'v': goto yy458; - default: goto yy438; + case 'v': goto yy464; + default: goto yy444; } -yy503: +yy509: yych = *++c; switch (yych) { case 'R': - case 'r': goto yy509; - default: goto yy438; + case 'r': goto yy515; + default: goto yy444; } -yy504: +yy510: yych = *++c; switch (yych) { case 'C': - case 'c': goto yy505; - default: goto yy438; + case 'c': goto yy511; + default: goto yy444; } -yy505: +yy511: yych = *++c; switch (yych) { case 'R': - case 'r': goto yy506; - default: goto yy438; + case 'r': goto yy512; + default: goto yy444; } -yy506: +yy512: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy507; - default: goto yy438; + case 'i': goto yy513; + default: goto yy444; } -yy507: +yy513: yych = *++c; switch (yych) { case 'P': - case 'p': goto yy508; - default: goto yy438; + case 'p': goto yy514; + default: goto yy444; } -yy508: +yy514: yych = *++c; switch (yych) { case 'T': - case 't': goto yy458; - default: goto yy438; + case 't': goto yy464; + default: goto yy444; } -yy509: +yy515: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy510; - default: goto yy438; + case 'a': goto yy516; + default: goto yy444; } -yy510: +yy516: yych = *++c; switch (yych) { case 'M': - case 'm': goto yy511; - default: goto yy438; + case 'm': goto yy517; + default: goto yy444; } -yy511: +yy517: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy512; - default: goto yy438; + case 'e': goto yy518; + default: goto yy444; } -yy512: +yy518: yych = *++c; switch 
(yych) { case 'S': - case 's': goto yy458; - default: goto yy438; + case 's': goto yy464; + default: goto yy444; } -yy513: +yy519: yych = *++c; switch (yych) { case 'N': - case 'n': goto yy516; - default: goto yy438; + case 'n': goto yy522; + default: goto yy444; } -yy514: +yy520: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy515; - default: goto yy438; + case 'i': goto yy521; + default: goto yy444; } -yy515: +yy521: yych = *++c; switch (yych) { case 'N': - case 'n': goto yy458; - default: goto yy438; + case 'n': goto yy464; + default: goto yy444; } -yy516: +yy522: yych = *++c; switch (yych) { case 'U': - case 'u': goto yy458; - default: goto yy438; + case 'u': goto yy464; + default: goto yy444; } -yy517: +yy523: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy518; - default: goto yy438; + case 'i': goto yy524; + default: goto yy444; } -yy518: +yy524: yych = *++c; switch (yych) { case 'N': - case 'n': goto yy519; - default: goto yy438; + case 'n': goto yy525; + default: goto yy444; } -yy519: +yy525: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy520; - default: goto yy438; + case 'd': goto yy526; + default: goto yy444; } -yy520: +yy526: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy521; - default: goto yy438; + case 'e': goto yy527; + default: goto yy444; } -yy521: +yy527: yych = *++c; switch (yych) { case 'X': - case 'x': goto yy458; - default: goto yy438; + case 'x': goto yy464; + default: goto yy444; } -yy522: +yy528: yych = *++c; switch (yych) { case 'R': - case 'r': goto yy527; - default: goto yy438; + case 'r': goto yy533; + default: goto yy444; } -yy523: +yy529: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy524; - default: goto yy438; + case 'a': goto yy530; + default: goto yy444; } -yy524: +yy530: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy525; - default: goto yy438; + case 'd': goto yy531; + default: goto yy444; } -yy525: +yy531: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy526; - default: goto yy438; + case 'e': goto yy532; + default: goto yy444; } -yy526: +yy532: yych = *++c; switch (yych) { case 'R': - case 'r': goto yy458; - default: goto yy438; + case 'r': goto yy464; + default: goto yy444; } -yy527: +yy533: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy528; - default: goto yy438; + case 'o': goto yy534; + default: goto yy444; } -yy528: +yy534: yych = *++c; switch (yych) { case 'U': - case 'u': goto yy529; - default: goto yy438; + case 'u': goto yy535; + default: goto yy444; } -yy529: +yy535: yych = *++c; switch (yych) { case 'P': - case 'p': goto yy458; - default: goto yy438; + case 'p': goto yy464; + default: goto yy444; } -yy530: +yy536: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy551; - default: goto yy438; + case 'a': goto yy557; + default: goto yy444; } -yy531: +yy537: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy547; + case 'o': goto yy553; case 'R': - case 'r': goto yy548; - default: goto yy438; + case 'r': goto yy554; + default: goto yy444; } -yy532: +yy538: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy533; + case 'e': goto yy539; case 'G': - case 'g': goto yy534; - default: goto yy438; + case 'g': goto yy540; + default: goto yy444; } -yy533: +yy539: yych = *++c; switch (yych) { case 'L': - case 'l': goto yy543; - default: goto yy438; + case 'l': goto yy549; + default: goto yy444; } -yy534: +yy540: yych = *++c; switch (yych) { case 'C': - case 'c': goto yy536; + case 'c': goto yy542; case 'U': - case 'u': goto yy535; - default: goto 
yy438; + case 'u': goto yy541; + default: goto yy444; } -yy535: +yy541: yych = *++c; switch (yych) { case 'R': - case 'r': goto yy542; - default: goto yy438; + case 'r': goto yy548; + default: goto yy444; } -yy536: +yy542: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy537; - default: goto yy438; + case 'a': goto yy543; + default: goto yy444; } -yy537: +yy543: yych = *++c; switch (yych) { case 'P': - case 'p': goto yy538; - default: goto yy438; + case 'p': goto yy544; + default: goto yy444; } -yy538: +yy544: yych = *++c; switch (yych) { case 'T': - case 't': goto yy539; - default: goto yy438; + case 't': goto yy545; + default: goto yy444; } -yy539: +yy545: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy540; - default: goto yy438; + case 'i': goto yy546; + default: goto yy444; } -yy540: +yy546: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy541; - default: goto yy438; + case 'o': goto yy547; + default: goto yy444; } -yy541: +yy547: yych = *++c; switch (yych) { case 'N': - case 'n': goto yy458; - default: goto yy438; + case 'n': goto yy464; + default: goto yy444; } -yy542: +yy548: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy458; - default: goto yy438; + case 'e': goto yy464; + default: goto yy444; } -yy543: +yy549: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy544; - default: goto yy438; + case 'd': goto yy550; + default: goto yy444; } -yy544: +yy550: yych = *++c; switch (yych) { case 'S': - case 's': goto yy545; - default: goto yy438; + case 's': goto yy551; + default: goto yy444; } -yy545: +yy551: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy546; - default: goto yy438; + case 'e': goto yy552; + default: goto yy444; } -yy546: +yy552: yych = *++c; switch (yych) { case 'T': - case 't': goto yy458; - default: goto yy438; + case 't': goto yy464; + default: goto yy444; } -yy547: +yy553: yych = *++c; switch (yych) { case 'T': - case 't': goto yy549; - default: goto yy438; + case 't': goto yy555; + default: goto yy444; } -yy548: +yy554: yych = *++c; switch (yych) { case 'M': - case 'm': goto yy458; - default: goto yy438; + case 'm': goto yy464; + default: goto yy444; } -yy549: +yy555: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy550; - default: goto yy438; + case 'e': goto yy556; + default: goto yy444; } -yy550: +yy556: yych = *++c; switch (yych) { case 'R': - case 'r': goto yy458; - default: goto yy438; + case 'r': goto yy464; + default: goto yy444; } -yy551: +yy557: yych = *++c; switch (yych) { case 'M': - case 'm': goto yy552; - default: goto yy438; + case 'm': goto yy558; + default: goto yy444; } -yy552: +yy558: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy553; - default: goto yy438; + case 'e': goto yy559; + default: goto yy444; } -yy553: +yy559: yych = *++c; switch (yych) { case 'S': - case 's': goto yy554; - default: goto yy438; + case 's': goto yy560; + default: goto yy444; } -yy554: +yy560: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy555; - default: goto yy438; + case 'e': goto yy561; + default: goto yy444; } -yy555: +yy561: yych = *++c; switch (yych) { case 'T': - case 't': goto yy458; - default: goto yy438; + case 't': goto yy464; + default: goto yy444; } -yy556: +yy562: yych = *++c; switch (yych) { case 'R': case 'V': case 'r': - case 'v': goto yy458; - default: goto yy438; + case 'v': goto yy464; + default: goto yy444; } -yy557: +yy563: yych = *++c; switch (yych) { case 'N': - case 'n': goto yy562; - default: goto yy438; + case 'n': goto yy568; + default: goto yy444; } 
-yy558: +yy564: yych = *++c; switch (yych) { case 'N': - case 'n': goto yy559; - default: goto yy438; + case 'n': goto yy565; + default: goto yy444; } -yy559: +yy565: yych = *++c; switch (yych) { case 'V': - case 'v': goto yy560; - default: goto yy438; + case 'v': goto yy566; + default: goto yy444; } -yy560: +yy566: yych = *++c; switch (yych) { case 'A': - case 'a': goto yy561; - default: goto yy438; + case 'a': goto yy567; + default: goto yy444; } -yy561: +yy567: yych = *++c; switch (yych) { case 'S': - case 's': goto yy458; - default: goto yy438; + case 's': goto yy464; + default: goto yy444; } -yy562: +yy568: yych = *++c; switch (yych) { case 'T': - case 't': goto yy563; - default: goto yy438; + case 't': goto yy569; + default: goto yy444; } -yy563: +yy569: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy564; - default: goto yy438; + case 'e': goto yy570; + default: goto yy444; } -yy564: +yy570: yych = *++c; switch (yych) { case 'R': - case 'r': goto yy458; - default: goto yy438; + case 'r': goto yy464; + default: goto yy444; } -yy565: +yy571: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy566; - default: goto yy438; + case 'o': goto yy572; + default: goto yy444; } -yy566: +yy572: yych = *++c; switch (yych) { case 'C': - case 'c': goto yy567; - default: goto yy438; + case 'c': goto yy573; + default: goto yy444; } -yy567: +yy573: yych = *++c; switch (yych) { case 'K': - case 'k': goto yy568; - default: goto yy438; + case 'k': goto yy574; + default: goto yy444; } -yy568: +yy574: yych = *++c; switch (yych) { case 'Q': - case 'q': goto yy569; - default: goto yy438; + case 'q': goto yy575; + default: goto yy444; } -yy569: +yy575: yych = *++c; switch (yych) { case 'U': - case 'u': goto yy570; - default: goto yy438; + case 'u': goto yy576; + default: goto yy444; } -yy570: +yy576: yych = *++c; switch (yych) { case 'O': - case 'o': goto yy571; - default: goto yy438; + case 'o': goto yy577; + default: goto yy444; } -yy571: +yy577: yych = *++c; switch (yych) { case 'T': - case 't': goto yy572; - default: goto yy438; + case 't': goto yy578; + default: goto yy444; } -yy572: +yy578: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy458; - default: goto yy438; + case 'e': goto yy464; + default: goto yy444; } -yy573: +yy579: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy584; - default: goto yy438; + case 'i': goto yy590; + default: goto yy444; } -yy574: +yy580: yych = *++c; switch (yych) { case 'T': - case 't': goto yy580; - default: goto yy438; + case 't': goto yy586; + default: goto yy444; } -yy575: +yy581: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy576; - default: goto yy438; + case 'd': goto yy582; + default: goto yy444; } -yy576: +yy582: yych = *++c; switch (yych) { case 'R': - case 'r': goto yy577; - default: goto yy438; + case 'r': goto yy583; + default: goto yy444; } -yy577: +yy583: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy578; - default: goto yy438; + case 'e': goto yy584; + default: goto yy444; } -yy578: +yy584: yych = *++c; switch (yych) { case 'S': - case 's': goto yy579; - default: goto yy438; + case 's': goto yy585; + default: goto yy444; } -yy579: +yy585: yych = *++c; switch (yych) { case 'S': - case 's': goto yy458; - default: goto yy438; + case 's': goto yy464; + default: goto yy444; } -yy580: +yy586: yych = *++c; switch (yych) { case 'I': - case 'i': goto yy581; - default: goto yy438; + case 'i': goto yy587; + default: goto yy444; } -yy581: +yy587: yych = *++c; switch (yych) { case 'C': - case 'c': goto yy582; - 
default: goto yy438; + case 'c': goto yy588; + default: goto yy444; } -yy582: +yy588: yych = *++c; switch (yych) { case 'L': - case 'l': goto yy583; - default: goto yy438; + case 'l': goto yy589; + default: goto yy444; } -yy583: +yy589: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy458; - default: goto yy438; + case 'e': goto yy464; + default: goto yy444; } -yy584: +yy590: yych = *++c; switch (yych) { case 'D': - case 'd': goto yy585; - default: goto yy438; + case 'd': goto yy591; + default: goto yy444; } -yy585: +yy591: yych = *++c; switch (yych) { case 'E': - case 'e': goto yy458; - default: goto yy438; + case 'e': goto yy464; + default: goto yy444; } -yy586: +yy592: ++c; switch ((yych = *c)) { case 'E': - case 'e': goto yy491; - default: goto yy465; + case 'e': goto yy497; + default: goto yy471; } } @@ -7603,20 +7755,20 @@ size_t scan_html_line(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy589; - case '<': goto yy590; - default: goto yy591; + case '\n': goto yy595; + case '<': goto yy596; + default: goto yy597; } -yy589: +yy595: { return 0; } -yy590: +yy596: yych = *(marker = ++c); switch (yych) { - case '!': goto yy592; - case '/': goto yy594; + case '!': goto yy598; + case '/': goto yy600; case 'A': case 'B': case 'C': @@ -7668,22 +7820,22 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy595; - default: goto yy589; + case 'z': goto yy601; + default: goto yy595; } -yy591: +yy597: yych = *++c; - goto yy589; -yy592: + goto yy595; +yy598: yych = *++c; switch (yych) { - case '-': goto yy626; - default: goto yy593; + case '-': goto yy632; + default: goto yy599; } -yy593: +yy599: c = marker; - goto yy589; -yy594: + goto yy595; +yy600: yych = *++c; switch (yych) { case 'A': @@ -7737,17 +7889,18 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy622; - default: goto yy593; + case 'z': goto yy628; + default: goto yy599; } -yy595: +yy601: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy599; - case '\n': goto yy601; - case '\r': goto yy603; + case ' ': + case 0xA0: goto yy605; + case '\n': goto yy607; + case '\r': goto yy609; case '-': case '0': case '1': @@ -7758,11 +7911,11 @@ size_t scan_html_line(const char * c) { case '6': case '7': case '8': - case '9': goto yy595; - case '/': goto yy608; + case '9': goto yy601; + case '/': goto yy614; case ':': - case '_': goto yy604; - case '>': goto yy606; + case '_': goto yy610; + case '>': goto yy612; case 'A': case 'B': case 'C': @@ -7814,17 +7967,18 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy597; - default: goto yy593; + case 'z': goto yy603; + default: goto yy599; } -yy597: +yy603: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy599; - case '\n': goto yy601; - case '\r': goto yy603; + case ' ': + case 0xA0: goto yy605; + case '\n': goto yy607; + case '\r': goto yy609; case '-': case '0': case '1': @@ -7887,24 +8041,25 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy597; + case 'z': goto yy603; case '.': case ':': - case '_': goto yy604; - case '/': goto yy608; - case '=': goto yy612; - case '>': goto yy606; - default: goto yy593; + case '_': goto yy610; + case '/': goto yy614; + case '=': goto yy618; + case '>': goto yy612; + default: goto yy599; } -yy599: +yy605: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy599; - case '\n': goto yy601; - case '\r': goto yy603; - 
case '/': goto yy608; + case ' ': + case 0xA0: goto yy605; + case '\n': goto yy607; + case '\r': goto yy609; + case '/': goto yy614; case ':': case 'A': case 'B': @@ -7958,16 +8113,17 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy604; - case '>': goto yy606; - default: goto yy593; + case 'z': goto yy610; + case '>': goto yy612; + default: goto yy599; } -yy601: +yy607: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy601; + case ' ': + case 0xA0: goto yy607; case ':': case 'A': case 'B': @@ -8021,16 +8177,17 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy604; - default: goto yy593; + case 'z': goto yy610; + default: goto yy599; } -yy603: +yy609: ++c; yych = *c; switch (yych) { case '\t': case '\n': - case ' ': goto yy601; + case ' ': + case 0xA0: goto yy607; case ':': case 'A': case 'B': @@ -8084,10 +8241,10 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy604; - default: goto yy593; + case 'z': goto yy610; + default: goto yy599; } -yy604: +yy610: ++c; yych = *c; switch (yych) { @@ -8156,44 +8313,46 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy604; - case '=': goto yy612; - default: goto yy593; + case 'z': goto yy610; + case '=': goto yy618; + default: goto yy599; } -yy606: +yy612: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy606; - case '\n': goto yy609; - case '\r': goto yy611; - default: goto yy593; + case ' ': + case 0xA0: goto yy612; + case '\n': goto yy615; + case '\r': goto yy617; + default: goto yy599; } -yy608: +yy614: yych = *++c; switch (yych) { - case '>': goto yy606; - default: goto yy593; + case '>': goto yy612; + default: goto yy599; } -yy609: +yy615: ++c; -yy610: +yy616: { return (size_t)( c - start ); } -yy611: +yy617: yych = *++c; switch (yych) { - case '\n': goto yy609; - default: goto yy610; + case '\n': goto yy615; + default: goto yy616; } -yy612: +yy618: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy612; - case '"': goto yy614; - case '\'': goto yy616; + case ' ': + case 0xA0: goto yy618; + case '"': goto yy620; + case '\'': goto yy622; case '.': case '0': case '1': @@ -8256,37 +8415,38 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy618; - default: goto yy593; + case 'z': goto yy624; + default: goto yy599; } -yy614: +yy620: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy593; - case '"': goto yy599; - default: goto yy614; + case '\r': goto yy599; + case '"': goto yy605; + default: goto yy620; } -yy616: +yy622: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy593; - case '\'': goto yy599; - default: goto yy616; + case '\r': goto yy599; + case '\'': goto yy605; + default: goto yy622; } -yy618: +yy624: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy599; - case '\n': goto yy601; - case '\r': goto yy603; + case ' ': + case 0xA0: goto yy605; + case '\n': goto yy607; + case '\r': goto yy609; case '.': case '0': case '1': @@ -8297,11 +8457,11 @@ size_t scan_html_line(const char * c) { case '6': case '7': case '8': - case '9': goto yy618; - case '/': goto yy608; + case '9': goto yy624; + case '/': goto yy614; case ':': - case '_': goto yy604; - case '>': goto yy606; + case '_': goto yy610; + case '>': goto yy612; case 'A': case 'B': case 'C': @@ -8353,20 +8513,21 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': 
goto yy620; - default: goto yy593; + case 'z': goto yy626; + default: goto yy599; } -yy620: +yy626: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy599; - case '\n': goto yy601; - case '\r': goto yy603; + case ' ': + case 0xA0: goto yy605; + case '\n': goto yy607; + case '\r': goto yy609; case '-': case ':': - case '_': goto yy604; + case '_': goto yy610; case '.': case '0': case '1': @@ -8429,18 +8590,19 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy620; - case '/': goto yy608; - case '=': goto yy612; - case '>': goto yy606; - default: goto yy593; + case 'z': goto yy626; + case '/': goto yy614; + case '=': goto yy618; + case '>': goto yy612; + default: goto yy599; } -yy622: +yy628: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy624; + case ' ': + case 0xA0: goto yy630; case '-': case '0': case '1': @@ -8503,58 +8665,59 @@ size_t scan_html_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy622; - case '>': goto yy606; - default: goto yy593; + case 'z': goto yy628; + case '>': goto yy612; + default: goto yy599; } -yy624: +yy630: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy624; - case '>': goto yy606; - default: goto yy593; + case ' ': + case 0xA0: goto yy630; + case '>': goto yy612; + default: goto yy599; } -yy626: +yy632: yych = *++c; switch (yych) { - case '-': goto yy627; - default: goto yy593; + case '-': goto yy633; + default: goto yy599; } -yy627: +yy633: yych = *++c; switch (yych) { - case '-': goto yy593; - default: goto yy629; + case '-': goto yy599; + default: goto yy635; } -yy628: +yy634: ++c; yych = *c; -yy629: +yy635: switch (yych) { case 0x00: - case '>': goto yy593; - case '-': goto yy630; - default: goto yy628; + case '>': goto yy599; + case '-': goto yy636; + default: goto yy634; } -yy630: +yy636: ++c; yych = *c; switch (yych) { case 0x00: - case '>': goto yy593; - case '-': goto yy631; - default: goto yy628; + case '>': goto yy599; + case '-': goto yy637; + default: goto yy634; } -yy631: +yy637: ++c; yych = *c; switch (yych) { - case 0x00: goto yy593; - case '-': goto yy631; - case '>': goto yy606; - default: goto yy628; + case 0x00: goto yy599; + case '-': goto yy637; + case '>': goto yy612; + default: goto yy634; } } @@ -8567,111 +8730,114 @@ size_t scan_fence_start(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy635; - case ' ': goto yy636; + case '\n': goto yy641; + case ' ': + case 0xA0: goto yy642; case '`': - case '~': goto yy637; - default: goto yy638; + case '~': goto yy643; + default: goto yy644; } -yy635: +yy641: { return 0; } -yy636: +yy642: yych = *(marker = ++c); switch (yych) { - case ' ': goto yy650; + case ' ': + case 0xA0: goto yy656; case '`': - case '~': goto yy651; - default: goto yy635; + case '~': goto yy657; + default: goto yy641; } -yy637: +yy643: yych = *(marker = ++c); switch (yych) { case '`': - case '~': goto yy639; - default: goto yy635; + case '~': goto yy645; + default: goto yy641; } -yy638: +yy644: yych = *++c; - goto yy635; -yy639: + goto yy641; +yy645: yych = *++c; switch (yych) { case '`': - case '~': goto yy641; - default: goto yy640; + case '~': goto yy647; + default: goto yy646; } -yy640: +yy646: c = marker; - goto yy635; -yy641: + goto yy641; +yy647: ++c; yych = *c; switch (yych) { case 0x00: case '\n': case '\r': - case '\'': goto yy640; - case '`': goto yy641; - case '~': goto yy645; - default: goto yy643; + case '\'': goto yy646; + case '`': goto yy647; + case '~': 
goto yy651; + default: goto yy649; } -yy643: +yy649: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy647; - case '\r': goto yy649; + case '\n': goto yy653; + case '\r': goto yy655; case '\'': - case '`': goto yy640; - default: goto yy643; + case '`': goto yy646; + default: goto yy649; } -yy645: +yy651: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy647; - case '\r': goto yy649; - case '\'': goto yy640; - case '`': goto yy641; - case '~': goto yy645; - default: goto yy643; + case '\n': goto yy653; + case '\r': goto yy655; + case '\'': goto yy646; + case '`': goto yy647; + case '~': goto yy651; + default: goto yy649; } -yy647: +yy653: ++c; -yy648: +yy654: { return (size_t)( c - start ); } -yy649: +yy655: yych = *++c; switch (yych) { - case '\n': goto yy647; - default: goto yy648; + case '\n': goto yy653; + default: goto yy654; } -yy650: +yy656: yych = *++c; switch (yych) { - case ' ': goto yy652; + case ' ': + case 0xA0: goto yy658; case '`': - case '~': goto yy651; - default: goto yy640; + case '~': goto yy657; + default: goto yy646; } -yy651: +yy657: yych = *++c; switch (yych) { case '`': - case '~': goto yy639; - default: goto yy640; + case '~': goto yy645; + default: goto yy646; } -yy652: +yy658: ++c; switch ((yych = *c)) { case '`': - case '~': goto yy651; - default: goto yy640; + case '~': goto yy657; + default: goto yy646; } } @@ -8684,100 +8850,105 @@ size_t scan_fence_end(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy655; - case ' ': goto yy656; + case '\n': goto yy661; + case ' ': + case 0xA0: goto yy662; case '`': - case '~': goto yy657; - default: goto yy658; + case '~': goto yy663; + default: goto yy664; } -yy655: +yy661: { return 0; } -yy656: +yy662: yych = *(marker = ++c); switch (yych) { - case ' ': goto yy668; + case ' ': + case 0xA0: goto yy674; case '`': - case '~': goto yy669; - default: goto yy655; + case '~': goto yy675; + default: goto yy661; } -yy657: +yy663: yych = *(marker = ++c); switch (yych) { case '`': - case '~': goto yy659; - default: goto yy655; + case '~': goto yy665; + default: goto yy661; } -yy658: +yy664: yych = *++c; - goto yy655; -yy659: + goto yy661; +yy665: yych = *++c; switch (yych) { case '`': - case '~': goto yy661; - default: goto yy660; + case '~': goto yy667; + default: goto yy666; } -yy660: +yy666: c = marker; - goto yy655; -yy661: + goto yy661; +yy667: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy665; + case '\n': goto yy671; case '\t': - case ' ': goto yy663; - case '\r': goto yy667; + case ' ': + case 0xA0: goto yy669; + case '\r': goto yy673; case '`': - case '~': goto yy661; - default: goto yy660; + case '~': goto yy667; + default: goto yy666; } -yy663: +yy669: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy665; + case '\n': goto yy671; case '\t': - case ' ': goto yy663; - case '\r': goto yy667; - default: goto yy660; + case ' ': + case 0xA0: goto yy669; + case '\r': goto yy673; + default: goto yy666; } -yy665: +yy671: ++c; -yy666: +yy672: { return (size_t)( c - start ); } -yy667: +yy673: yych = *++c; switch (yych) { - case '\n': goto yy665; - default: goto yy666; + case '\n': goto yy671; + default: goto yy672; } -yy668: +yy674: yych = *++c; switch (yych) { - case ' ': goto yy670; + case ' ': + case 0xA0: goto yy676; case '`': - case '~': goto yy669; - default: goto yy660; + case '~': goto yy675; + default: goto yy666; } -yy669: +yy675: yych = *++c; switch (yych) { case '`': - case '~': goto yy659; - default: goto yy660; 
+ case '~': goto yy665; + default: goto yy666; } -yy670: +yy676: ++c; switch ((yych = *c)) { case '`': - case '~': goto yy669; - default: goto yy660; + case '~': goto yy675; + default: goto yy666; } } @@ -8790,10 +8961,10 @@ size_t scan_meta_line(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy673; + case '\n': goto yy679; case '0': case '1': case '2': @@ -8855,12 +9026,12 @@ size_t scan_meta_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy674; - default: goto yy675; + case 'z': goto yy680; + default: goto yy681; } -yy673: +yy679: { return 0; } -yy674: +yy680: yych = *(marker = ++c); switch (yych) { case '\t': @@ -8930,16 +9101,17 @@ size_t scan_meta_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy677; - default: goto yy673; + case 'z': + case 0xA0: goto yy683; + default: goto yy679; } -yy675: +yy681: yych = *++c; - goto yy673; -yy676: + goto yy679; +yy682: ++c; yych = *c; -yy677: +yy683: switch (yych) { case '\t': case ' ': @@ -9007,39 +9179,40 @@ size_t scan_meta_line(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy676; - case ':': goto yy679; - default: goto yy678; + case 'z': + case 0xA0: goto yy682; + case ':': goto yy685; + default: goto yy684; } -yy678: +yy684: c = marker; - goto yy673; -yy679: + goto yy679; +yy685: yych = *++c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy678; - default: goto yy680; + case '\r': goto yy684; + default: goto yy686; } -yy680: +yy686: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy682; - case '\r': goto yy684; - default: goto yy680; + case '\n': goto yy688; + case '\r': goto yy690; + default: goto yy686; } -yy682: +yy688: ++c; -yy683: +yy689: { return (size_t)( c - start ); } -yy684: +yy690: ++c; switch ((yych = *c)) { - case '\n': goto yy682; - default: goto yy683; + case '\n': goto yy688; + default: goto yy689; } } @@ -9051,10 +9224,10 @@ size_t scan_meta_key(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy687; + case '\n': goto yy693; case '0': case '1': case '2': @@ -9116,24 +9289,24 @@ size_t scan_meta_key(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy688; - default: goto yy690; + case 'z': goto yy694; + default: goto yy696; } -yy687: +yy693: { return 0; } -yy688: +yy694: ++c; yych = *c; - goto yy692; -yy689: + goto yy698; +yy695: { return (size_t)( c - start ); } -yy690: +yy696: yych = *++c; - goto yy687; -yy691: + goto yy693; +yy697: ++c; yych = *c; -yy692: +yy698: switch (yych) { case '\t': case ' ': @@ -9201,8 +9374,9 @@ size_t scan_meta_key(const char * c) { case 'w': case 'x': case 'y': - case 'z': goto yy691; - default: goto yy689; + case 'z': + case 0xA0: goto yy697; + default: goto yy695; } } @@ -9215,74 +9389,78 @@ size_t scan_definition(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy695; - case ' ': goto yy696; - case ':': goto yy697; - default: goto yy698; + case '\n': goto yy701; + case ' ': + case 0xA0: goto yy702; + case ':': goto yy703; + default: goto yy704; } -yy695: +yy701: { return 0; } -yy696: +yy702: yych = *(marker = ++c); switch (yych) { - case ' ': goto yy703; - case ':': goto yy705; - default: goto yy695; + case ' ': + case 0xA0: goto yy709; + case ':': goto yy711; + default: goto yy701; } -yy697: +yy703: yych = *++c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy695; - default: goto yy700; + case '\r': goto yy701; + default: goto 
yy706; } -yy698: +yy704: yych = *++c; - goto yy695; -yy699: + goto yy701; +yy705: ++c; yych = *c; -yy700: +yy706: switch (yych) { case 0x00: case '\n': - case '\r': goto yy701; + case '\r': goto yy707; case '\t': - case ' ': goto yy699; - default: goto yy702; + case ' ': + case 0xA0: goto yy705; + default: goto yy708; } -yy701: +yy707: { return (size_t)( c - start ); } -yy702: +yy708: yych = *++c; - goto yy701; -yy703: + goto yy707; +yy709: yych = *++c; switch (yych) { - case ' ': goto yy706; - case ':': goto yy705; - default: goto yy704; + case ' ': + case 0xA0: goto yy712; + case ':': goto yy711; + default: goto yy710; } -yy704: +yy710: c = marker; - goto yy695; -yy705: + goto yy701; +yy711: yych = *++c; switch (yych) { case 0x00: case '\n': - case '\r': goto yy704; - default: goto yy700; + case '\r': goto yy710; + default: goto yy706; } -yy706: +yy712: ++c; switch ((yych = *c)) { - case ':': goto yy705; - default: goto yy704; + case ':': goto yy711; + default: goto yy710; } } @@ -9295,7 +9473,7 @@ size_t scan_table_separator(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case '\t': @@ -9303,14 +9481,15 @@ size_t scan_table_separator(const char * c) { case '+': case '-': case ':': - case '=': goto yy711; - case '\n': goto yy709; - case '|': goto yy710; - default: goto yy712; + case '=': + case 0xA0: goto yy717; + case '\n': goto yy715; + case '|': goto yy716; + default: goto yy718; } -yy709: +yy715: { return 0; } -yy710: +yy716: yych = *(marker = ++c); switch (yych) { case 0x00: @@ -9322,10 +9501,11 @@ size_t scan_table_separator(const char * c) { case '-': case ':': case '=': - case '|': goto yy722; - default: goto yy709; + case '|': + case 0xA0: goto yy728; + default: goto yy715; } -yy711: +yy717: yych = *(marker = ++c); switch (yych) { case '\t': @@ -9333,33 +9513,35 @@ size_t scan_table_separator(const char * c) { case '+': case '-': case ':': - case '=': goto yy716; - case '|': goto yy713; - default: goto yy709; + case '=': + case 0xA0: goto yy722; + case '|': goto yy719; + default: goto yy715; } -yy712: +yy718: yych = *++c; - goto yy709; -yy713: + goto yy715; +yy719: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy718; + case '\n': goto yy724; case '\t': case ' ': case '+': case '-': case ':': case '=': - case '|': goto yy713; - case '\r': goto yy720; - default: goto yy715; + case '|': + case 0xA0: goto yy719; + case '\r': goto yy726; + default: goto yy721; } -yy715: +yy721: c = marker; - goto yy709; -yy716: + goto yy715; +yy722: ++c; yych = *c; switch (yych) { @@ -9368,36 +9550,38 @@ size_t scan_table_separator(const char * c) { case '+': case '-': case ':': - case '=': goto yy716; - case '|': goto yy713; - default: goto yy715; + case '=': + case 0xA0: goto yy722; + case '|': goto yy719; + default: goto yy721; } -yy718: +yy724: ++c; -yy719: +yy725: { return (size_t)( c - start ); } -yy720: +yy726: yych = *++c; switch (yych) { - case '\n': goto yy718; - default: goto yy719; + case '\n': goto yy724; + default: goto yy725; } -yy721: +yy727: ++c; yych = *c; -yy722: +yy728: switch (yych) { case 0x00: - case '\n': goto yy718; + case '\n': goto yy724; case '\t': case ' ': case '+': case '-': case ':': case '=': - case '|': goto yy721; - case '\r': goto yy720; - default: goto yy715; + case '|': + case 0xA0: goto yy727; + case '\r': goto yy726; + default: goto yy721; } } @@ -9409,238 +9593,248 @@ size_t scan_alignment_string(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case '\t': - case ' ': goto yy726; 
- case '\n': goto yy725; + case ' ': + case 0xA0: goto yy732; + case '\n': goto yy731; case '-': - case '=': goto yy728; - case ':': goto yy727; - default: goto yy729; + case '=': goto yy734; + case ':': goto yy733; + default: goto yy735; } -yy725: +yy731: { return 0; } -yy726: +yy732: yych = *(marker = ++c); switch (yych) { case '\t': - case ' ': goto yy768; + case ' ': + case 0xA0: goto yy774; case '-': - case '=': goto yy732; - case ':': goto yy767; - default: goto yy725; + case '=': goto yy738; + case ':': goto yy773; + default: goto yy731; } -yy727: +yy733: yych = *(marker = ++c); switch (yych) { case '-': - case '=': goto yy744; - default: goto yy725; + case '=': goto yy750; + default: goto yy731; } -yy728: +yy734: yych = *(marker = ++c); switch (yych) { case '-': - case '=': goto yy732; - case ':': goto yy730; - default: goto yy725; + case '=': goto yy738; + case ':': goto yy736; + default: goto yy731; } -yy729: +yy735: yych = *++c; - goto yy725; -yy730: + goto yy731; +yy736: yych = *++c; switch (yych) { - case '+': goto yy739; - default: goto yy735; + case '+': goto yy745; + default: goto yy741; } -yy731: +yy737: c = marker; - goto yy725; -yy732: + goto yy731; +yy738: ++c; yych = *c; switch (yych) { case '-': - case '=': goto yy732; - case ':': goto yy730; - default: goto yy731; + case '=': goto yy738; + case ':': goto yy736; + default: goto yy737; } -yy734: +yy740: ++c; yych = *c; -yy735: +yy741: switch (yych) { case 0x00: case '\n': - case '|': goto yy736; + case '|': goto yy742; case '\t': - case ' ': goto yy734; - case '\r': goto yy738; - default: goto yy731; + case ' ': + case 0xA0: goto yy740; + case '\r': goto yy744; + default: goto yy737; } -yy736: +yy742: ++c; -yy737: +yy743: { return ALIGN_RIGHT; } -yy738: +yy744: yych = *++c; switch (yych) { - case '\n': goto yy736; - default: goto yy737; + case '\n': goto yy742; + default: goto yy743; } -yy739: +yy745: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '|': goto yy741; + case '|': goto yy747; case '\t': - case ' ': goto yy739; - case '\r': goto yy743; - default: goto yy731; + case ' ': + case 0xA0: goto yy745; + case '\r': goto yy749; + default: goto yy737; } -yy741: +yy747: ++c; -yy742: +yy748: { return ALIGN_WRAP | ALIGN_RIGHT; } -yy743: +yy749: yych = *++c; switch (yych) { - case '\n': goto yy741; - default: goto yy742; + case '\n': goto yy747; + default: goto yy748; } -yy744: +yy750: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '|': goto yy748; + case '|': goto yy754; case '\t': - case ' ': goto yy746; - case '\r': goto yy750; - case '+': goto yy752; + case ' ': + case 0xA0: goto yy752; + case '\r': goto yy756; + case '+': goto yy758; case '-': - case '=': goto yy744; - case ':': goto yy751; - default: goto yy731; + case '=': goto yy750; + case ':': goto yy757; + default: goto yy737; } -yy746: +yy752: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '|': goto yy748; + case '|': goto yy754; case '\t': - case ' ': goto yy746; - case '\r': goto yy750; - default: goto yy731; + case ' ': + case 0xA0: goto yy752; + case '\r': goto yy756; + default: goto yy737; } -yy748: +yy754: ++c; -yy749: +yy755: { return ALIGN_LEFT; } -yy750: +yy756: yych = *++c; switch (yych) { - case '\n': goto yy748; - default: goto yy749; + case '\n': goto yy754; + default: goto yy755; } -yy751: +yy757: yych = *++c; switch (yych) { - case '+': goto yy757; - default: goto yy760; + case '+': goto yy763; + default: goto yy766; } -yy752: +yy758: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '|': goto 
yy754; + case '|': goto yy760; case '\t': - case ' ': goto yy752; - case '\r': goto yy756; - default: goto yy731; + case ' ': + case 0xA0: goto yy758; + case '\r': goto yy762; + default: goto yy737; } -yy754: +yy760: ++c; -yy755: +yy761: { return ALIGN_WRAP | ALIGN_LEFT; } -yy756: +yy762: yych = *++c; switch (yych) { - case '\n': goto yy754; - default: goto yy755; + case '\n': goto yy760; + default: goto yy761; } -yy757: +yy763: ++c; yych = *c; switch (yych) { case 0x00: case '\n': - case '|': goto yy764; + case '|': goto yy770; case '\t': - case ' ': goto yy757; - case '\r': goto yy766; - default: goto yy731; + case ' ': + case 0xA0: goto yy763; + case '\r': goto yy772; + default: goto yy737; } -yy759: +yy765: ++c; yych = *c; -yy760: +yy766: switch (yych) { case 0x00: case '\n': - case '|': goto yy761; + case '|': goto yy767; case '\t': - case ' ': goto yy759; - case '\r': goto yy763; - default: goto yy731; + case ' ': + case 0xA0: goto yy765; + case '\r': goto yy769; + default: goto yy737; } -yy761: +yy767: ++c; -yy762: +yy768: { return ALIGN_CENTER; } -yy763: +yy769: yych = *++c; switch (yych) { - case '\n': goto yy761; - default: goto yy762; + case '\n': goto yy767; + default: goto yy768; } -yy764: +yy770: ++c; -yy765: +yy771: { return ALIGN_WRAP | ALIGN_CENTER; } -yy766: +yy772: yych = *++c; switch (yych) { - case '\n': goto yy764; - default: goto yy765; + case '\n': goto yy770; + default: goto yy771; } -yy767: +yy773: yych = *++c; switch (yych) { case '-': - case '=': goto yy744; - default: goto yy731; + case '=': goto yy750; + default: goto yy737; } -yy768: +yy774: ++c; yych = *c; switch (yych) { case '\t': - case ' ': goto yy768; + case ' ': + case 0xA0: goto yy774; case '-': - case '=': goto yy732; - case ':': goto yy767; - default: goto yy731; + case '=': goto yy738; + case ':': goto yy773; + default: goto yy737; } } @@ -9652,55 +9846,58 @@ size_t scan_destination(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { case 0x00: case '\t': case '\r': - case ' ': goto yy776; - case '\n': goto yy772; - case '<': goto yy773; - default: goto yy775; + case ' ': + case 0xA0: goto yy782; + case '\n': goto yy778; + case '<': goto yy779; + default: goto yy781; } -yy772: +yy778: { return 0; } -yy773: +yy779: ++c; yych = *c; - goto yy780; -yy774: + goto yy786; +yy780: { return (size_t)( c - start ); } -yy775: +yy781: yych = *++c; - goto yy778; -yy776: + goto yy784; +yy782: yych = *++c; - goto yy772; -yy777: + goto yy778; +yy783: ++c; yych = *c; -yy778: +yy784: switch (yych) { case 0x00: case '\t': case '\n': case '\r': - case ' ': goto yy774; - default: goto yy777; + case ' ': + case 0xA0: goto yy780; + default: goto yy783; } -yy779: +yy785: ++c; yych = *c; -yy780: +yy786: switch (yych) { case 0x00: case '\t': case '\n': case '\r': - case ' ': goto yy774; - case '>': goto yy777; - default: goto yy779; + case ' ': + case 0xA0: goto yy780; + case '>': goto yy783; + default: goto yy785; } } @@ -9713,82 +9910,82 @@ size_t scan_title(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy783; - case '"': goto yy784; - case '\'': goto yy785; - case '(': goto yy786; - default: goto yy787; + case '\n': goto yy789; + case '"': goto yy790; + case '\'': goto yy791; + case '(': goto yy792; + default: goto yy793; } -yy783: +yy789: { return 0; } -yy784: +yy790: yych = *(marker = ++c); switch (yych) { case 0x00: case '\n': - case '\r': goto yy783; - default: goto yy796; + case '\r': goto yy789; + default: goto yy802; } -yy785: 
+yy791: yych = *(marker = ++c); switch (yych) { case 0x00: case '\n': - case '\r': goto yy783; - default: goto yy794; + case '\r': goto yy789; + default: goto yy800; } -yy786: +yy792: yych = *(marker = ++c); switch (yych) { case 0x00: case '\n': - case '\r': goto yy783; - default: goto yy789; + case '\r': goto yy789; + default: goto yy795; } -yy787: +yy793: yych = *++c; - goto yy783; -yy788: + goto yy789; +yy794: ++c; yych = *c; -yy789: +yy795: switch (yych) { case 0x00: case '\n': - case '\r': goto yy790; - case ')': goto yy791; - default: goto yy788; + case '\r': goto yy796; + case ')': goto yy797; + default: goto yy794; } -yy790: +yy796: c = marker; - goto yy783; -yy791: + goto yy789; +yy797: ++c; { return (size_t)( c - start ); } -yy793: +yy799: ++c; yych = *c; -yy794: +yy800: switch (yych) { case 0x00: case '\n': - case '\r': goto yy790; - case '\'': goto yy791; - default: goto yy793; + case '\r': goto yy796; + case '\'': goto yy797; + default: goto yy799; } -yy795: +yy801: ++c; yych = *c; -yy796: +yy802: switch (yych) { case 0x00: case '\n': - case '\r': goto yy790; - case '"': goto yy791; - default: goto yy795; + case '\r': goto yy796; + case '"': goto yy797; + default: goto yy801; } } @@ -9800,114 +9997,176 @@ size_t scan_setext(const char * c) { { - char yych; + unsigned char yych; yych = *c; switch (yych) { - case '\n': goto yy799; - case ' ': goto yy800; - case '-': goto yy802; - case '=': goto yy801; - default: goto yy803; + case '\n': goto yy805; + case ' ': + case 0xA0: goto yy806; + case '-': goto yy808; + case '=': goto yy807; + default: goto yy809; } -yy799: +yy805: { return 0; } -yy800: +yy806: yych = *(marker = ++c); switch (yych) { - case ' ': goto yy815; - case '-': goto yy816; - case '=': goto yy817; - default: goto yy799; + case ' ': + case 0xA0: goto yy821; + case '-': goto yy822; + case '=': goto yy823; + default: goto yy805; } -yy801: +yy807: yych = *(marker = ++c); switch (yych) { - case '=': goto yy810; - default: goto yy799; + case '=': goto yy816; + default: goto yy805; } -yy802: +yy808: yych = *(marker = ++c); switch (yych) { - case '-': goto yy804; - default: goto yy799; + case '-': goto yy810; + default: goto yy805; } -yy803: +yy809: yych = *++c; - goto yy799; -yy804: + goto yy805; +yy810: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy807; - case '\r': goto yy809; - case '-': goto yy804; - default: goto yy806; + case '\n': goto yy813; + case '\r': goto yy815; + case '-': goto yy810; + default: goto yy812; } -yy806: +yy812: c = marker; - goto yy799; -yy807: + goto yy805; +yy813: ++c; -yy808: +yy814: { return (size_t)( c - start ); } -yy809: +yy815: yych = *++c; switch (yych) { - case '\n': goto yy807; - default: goto yy808; + case '\n': goto yy813; + default: goto yy814; } -yy810: +yy816: ++c; yych = *c; switch (yych) { case 0x00: - case '\n': goto yy812; - case '\r': goto yy814; - case '=': goto yy810; - default: goto yy806; + case '\n': goto yy818; + case '\r': goto yy820; + case '=': goto yy816; + default: goto yy812; } -yy812: +yy818: ++c; -yy813: +yy819: { return (size_t)( c - start ); } -yy814: +yy820: yych = *++c; switch (yych) { - case '\n': goto yy812; - default: goto yy813; + case '\n': goto yy818; + default: goto yy819; } -yy815: +yy821: yych = *++c; switch (yych) { - case ' ': goto yy818; - case '-': goto yy816; - case '=': goto yy817; - default: goto yy806; + case ' ': + case 0xA0: goto yy824; + case '-': goto yy822; + case '=': goto yy823; + default: goto yy812; } -yy816: +yy822: yych = *++c; switch (yych) { - case '-': goto 
yy804; - default: goto yy806; + case '-': goto yy810; + default: goto yy812; } -yy817: +yy823: yych = *++c; switch (yych) { - case '=': goto yy810; - default: goto yy806; + case '=': goto yy816; + default: goto yy812; } -yy818: +yy824: ++c; switch ((yych = *c)) { - case '-': goto yy816; - case '=': goto yy817; - default: goto yy806; + case '-': goto yy822; + case '=': goto yy823; + default: goto yy812; } } } +size_t scan_atx(const char * c) { + const char * marker = NULL; + const char * start = c; + + +{ + unsigned char yych; + yych = *c; + switch (yych) { + case '\n': goto yy827; + case '#': goto yy828; + default: goto yy829; + } +yy827: + { return 0; } +yy828: + yych = *(marker = ++c); + switch (yych) { + case '\t': + case ' ': + case 0xA0: goto yy830; + case '#': goto yy833; + default: goto yy827; + } +yy829: + yych = *++c; + goto yy827; +yy830: + ++c; + yych = *c; + switch (yych) { + case 0x00: + case '\n': + case '\r': goto yy832; + case '\t': + case ' ': + case 0xA0: goto yy830; + default: goto yy835; + } +yy832: + c = marker; + goto yy827; +yy833: + ++c; + yych = *c; + switch (yych) { + case '\t': + case ' ': + case 0xA0: goto yy830; + case '#': goto yy833; + default: goto yy832; + } +yy835: + ++c; + { return (size_t)( c - start ); } +} + +} + #ifdef TEST void Test_scan_url(CuTest* tc) { diff --git a/Sources/libMultiMarkdown/scanners.h b/Sources/libMultiMarkdown/scanners.h index c0460854..cb6591c8 100644 --- a/Sources/libMultiMarkdown/scanners.h +++ b/Sources/libMultiMarkdown/scanners.h @@ -72,6 +72,7 @@ enum alignments { size_t scan_alignment_string(const char * c); size_t scan_attr(const char * c); size_t scan_attributes(const char * c); +size_t scan_atx(const char * c); size_t scan_definition(const char * c); size_t scan_destination(const char * c); size_t scan_email(const char * c); diff --git a/Sources/libMultiMarkdown/scanners.re b/Sources/libMultiMarkdown/scanners.re index ef4bcc35..5b7e0d58 100644 --- a/Sources/libMultiMarkdown/scanners.re +++ b/Sources/libMultiMarkdown/scanners.re @@ -59,21 +59,21 @@ /*!re2c - re2c:define:YYCTYPE = "char"; + re2c:define:YYCTYPE = "unsigned char"; re2c:define:YYCURSOR = c; re2c:define:YYMARKER = marker; re2c:define:YYCTXMARKER = marker; re2c:yyfill:enable = 0; nl = ( '\n' | '\r' '\n'?); - sp = [ \t]*; + sp = [ \t\240]*; spnl = sp (nl sp)?; - non_indent = ' '{0,3}; + non_indent = [ \240]{0,3}; nl_eof = nl | '\x00'; - email = 'mailto:'? [-A-Za-z0-9+_./!%~$]+ '@' [^ \t\n\r\x00>]+; + email = 'mailto:'? [-A-Za-z0-9+_./!%~$]+ '@' [^ \240\t\n\r\x00>]+; - url = [A-Za-z\-]+ '://' [^ \t\n\r\x00>]+; + url = [A-Za-z\-]+ '://' [^ \240\t\n\r\x00>]+; name = [A-Za-z_:] [A-Za-z0-9_.:-]*; quoted_d = '"' [^"\n\r\x00]* '"'; @@ -86,7 +86,7 @@ attributes = (attr)+; title = (quoted_d | quoted_s | quoted_p); - label = [^\]\n\r\x00]+; + label = [^\]\n\r\x00]* [^\]\n\r\x00\\]; finish_line = [^\n\r\x00]+; ref_abbr = non_indent '[>' label ']' ':' finish_line; @@ -99,7 +99,7 @@ ref_link = non_indent '[' label ']' ':' finish_line; - destination = ('<' [^ \t\n\r\x00>]* '>') | [^ \t\n\r\x00]+; + destination = ('<' [^ \240\t\n\r\x00>]* '>') | [^ \240\t\n\r\x00]+; ref_link_no_attributes = non_indent '[' label ']' ':' spnl destination sp (nl_eof | (nl? 
(title) sp) nl_eof); @@ -136,7 +136,7 @@ fence_end = non_indent [`~]{3,} sp nl_eof; - meta_key = [A-Za-z0-9] [A-Za-z0-9_ \t\-\.]*; + meta_key = [A-Za-z0-9] [A-Za-z0-9_ \240\t\-\.]*; meta_value = [^\n\r\x00]+; @@ -144,7 +144,7 @@ definition = non_indent ':' sp [^\n\r\x00]; - table_separator = (('|' [:\-= \t|+]*) | ([:\-= \t+]+ '|' [:\-= \t|+]*)) nl_eof; + table_separator = (('|' [:\-= \240\t|+]*) | ([:\-= \240\t+]+ '|' [:\-= \240\t|+]*)) nl_eof; align = [\-=]+; align_left = sp ':' align sp ('|' | nl_eof); @@ -158,6 +158,7 @@ setext_2 = non_indent '-'{2,} nl_eof; + atx = '#'+ [ \240\t]+ [^ \240\t\n\r\x00]; */ @@ -460,6 +461,16 @@ size_t scan_setext(const char * c) { */ } +size_t scan_atx(const char * c) { + const char * marker = NULL; + const char * start = c; + +/*!re2c + atx { return (size_t)( c - start ); } + .? { return 0; } +*/ +} + #ifdef TEST void Test_scan_url(CuTest* tc) { diff --git a/Sources/libMultiMarkdown/writer.c b/Sources/libMultiMarkdown/writer.c index 853d355a..d547eb5d 100644 --- a/Sources/libMultiMarkdown/writer.c +++ b/Sources/libMultiMarkdown/writer.c @@ -532,7 +532,7 @@ attr * parse_attributes(char * source) { size_t scan_len; size_t pos = 0; - while (scan_attr(&source[pos])) { + while (source[pos] != '\0' && scan_attr(&source[pos])) { pos += scan_spnl(&source[pos]); // Get key @@ -556,7 +556,7 @@ attr * parse_attributes(char * source) { attributes = a; } - free(value); // We stored a copy + free(value); // We stored a modified copy } return attributes; @@ -612,21 +612,25 @@ void store_link(scratch_pad * scratch, link * l) { link * temp_link; // Add link via `clean_text`? - HASH_FIND_STR(scratch->link_hash, l->clean_text, temp_link); - - if (!temp_link) { - // Only add if another link is not found with clean_text - temp_link = link_shallow_copy(l); - HASH_ADD_KEYPTR(hh, scratch->link_hash, l->clean_text, strlen(l->clean_text), temp_link); + if (l->clean_text && l->clean_text[0] != '\0') { + HASH_FIND_STR(scratch->link_hash, l->clean_text, temp_link); + + if (!temp_link) { + // Only add if another link is not found with clean_text + temp_link = link_shallow_copy(l); + HASH_ADD_KEYPTR(hh, scratch->link_hash, l->clean_text, strlen(l->clean_text), temp_link); + } } // Add link via `label_text`? - HASH_FIND_STR(scratch->link_hash, l->label_text, temp_link); + if (l->label_text && l->label_text[0] != '\0') { + HASH_FIND_STR(scratch->link_hash, l->label_text, temp_link); - if (!temp_link) { - // Only add if another link is not found with label_text - temp_link = link_shallow_copy(l); - HASH_ADD_KEYPTR(hh, scratch->link_hash, l->label_text, strlen(l->label_text), temp_link); + if (!temp_link) { + // Only add if another link is not found with label_text + temp_link = link_shallow_copy(l); + HASH_ADD_KEYPTR(hh, scratch->link_hash, l->label_text, strlen(l->label_text), temp_link); + } } } @@ -663,19 +667,23 @@ void store_footnote(scratch_pad * scratch, footnote * f) { fn_holder * temp_holder; // Store by `clean_text`? - HASH_FIND_STR(scratch->footnote_hash, f->clean_text, temp_holder); + if (f->clean_text && f->clean_text[0] != '\0') { + HASH_FIND_STR(scratch->footnote_hash, f->clean_text, temp_holder); - if (!temp_holder) { - temp_holder = fn_holder_new(f); - HASH_ADD_KEYPTR(hh, scratch->footnote_hash, f->clean_text, strlen(f->clean_text), temp_holder); + if (!temp_holder) { + temp_holder = fn_holder_new(f); + HASH_ADD_KEYPTR(hh, scratch->footnote_hash, f->clean_text, strlen(f->clean_text), temp_holder); + } } // Store by `label_text`? 
- HASH_FIND_STR(scratch->footnote_hash, f->label_text, temp_holder); + if (f->label_text && f->label_text[0] != '\0') { + HASH_FIND_STR(scratch->footnote_hash, f->label_text, temp_holder); - if (!temp_holder) { - temp_holder = fn_holder_new(f); - HASH_ADD_KEYPTR(hh, scratch->footnote_hash, f->label_text, strlen(f->label_text), temp_holder); + if (!temp_holder) { + temp_holder = fn_holder_new(f); + HASH_ADD_KEYPTR(hh, scratch->footnote_hash, f->label_text, strlen(f->label_text), temp_holder); + } } } @@ -684,19 +692,23 @@ void store_citation(scratch_pad * scratch, footnote * f) { fn_holder * temp_holder; // Store by `clean_text`? - HASH_FIND_STR(scratch->citation_hash, f->clean_text, temp_holder); + if (f->clean_text && f->clean_text[0] != '\0') { + HASH_FIND_STR(scratch->citation_hash, f->clean_text, temp_holder); - if (!temp_holder) { - temp_holder = fn_holder_new(f); - HASH_ADD_KEYPTR(hh, scratch->citation_hash, f->clean_text, strlen(f->clean_text), temp_holder); + if (!temp_holder) { + temp_holder = fn_holder_new(f); + HASH_ADD_KEYPTR(hh, scratch->citation_hash, f->clean_text, strlen(f->clean_text), temp_holder); + } } // Store by `label_text`? - HASH_FIND_STR(scratch->citation_hash, f->label_text, temp_holder); + if (f->label_text && f->label_text[0] != '\0') { + HASH_FIND_STR(scratch->citation_hash, f->label_text, temp_holder); - if (!temp_holder) { - temp_holder = fn_holder_new(f); - HASH_ADD_KEYPTR(hh, scratch->citation_hash, f->label_text, strlen(f->label_text), temp_holder); + if (!temp_holder) { + temp_holder = fn_holder_new(f); + HASH_ADD_KEYPTR(hh, scratch->citation_hash, f->label_text, strlen(f->label_text), temp_holder); + } } } @@ -705,19 +717,23 @@ void store_glossary(scratch_pad * scratch, footnote * f) { fn_holder * temp_holder; // Store by `clean_text`? - HASH_FIND_STR(scratch->glossary_hash, f->clean_text, temp_holder); + if (f->clean_text && f->clean_text[0] != '\0') { + HASH_FIND_STR(scratch->glossary_hash, f->clean_text, temp_holder); - if (!temp_holder) { - temp_holder = fn_holder_new(f); - HASH_ADD_KEYPTR(hh, scratch->glossary_hash, f->clean_text, strlen(f->clean_text), temp_holder); + if (!temp_holder) { + temp_holder = fn_holder_new(f); + HASH_ADD_KEYPTR(hh, scratch->glossary_hash, f->clean_text, strlen(f->clean_text), temp_holder); + } } // Store by `label_text`? 
- HASH_FIND_STR(scratch->glossary_hash, f->label_text, temp_holder); + if (f->label_text && f->label_text[0] != '\0') { + HASH_FIND_STR(scratch->glossary_hash, f->label_text, temp_holder); - if (!temp_holder) { - temp_holder = fn_holder_new(f); - HASH_ADD_KEYPTR(hh, scratch->glossary_hash, f->label_text, strlen(f->label_text), temp_holder); + if (!temp_holder) { + temp_holder = fn_holder_new(f); + HASH_ADD_KEYPTR(hh, scratch->glossary_hash, f->label_text, strlen(f->label_text), temp_holder); + } } } @@ -726,10 +742,12 @@ void store_metadata(scratch_pad * scratch, meta * m) { meta * temp; // Store by `key` - HASH_FIND_STR(scratch->meta_hash, m->key, temp); + if (m->key && m->key[0] != '\0') { + HASH_FIND_STR(scratch->meta_hash, m->key, temp); - if (!temp) { - HASH_ADD_KEYPTR(hh, scratch->meta_hash, m->key, strlen(m->key), m); + if (!temp) { + HASH_ADD_KEYPTR(hh, scratch->meta_hash, m->key, strlen(m->key), m); + } } } @@ -738,11 +756,13 @@ void store_abbreviation(scratch_pad * scratch, footnote * f) { fn_holder * temp_holder; // Store by `label_text` - HASH_FIND_STR(scratch->abbreviation_hash, f->label_text, temp_holder); + if (f->label_text && f->label_text[0] != '\0') { + HASH_FIND_STR(scratch->abbreviation_hash, f->label_text, temp_holder); - if (!temp_holder) { - temp_holder = fn_holder_new(f); - HASH_ADD_KEYPTR(hh, scratch->abbreviation_hash, f->label_text, strlen(f->label_text), temp_holder); + if (!temp_holder) { + temp_holder = fn_holder_new(f); + HASH_ADD_KEYPTR(hh, scratch->abbreviation_hash, f->label_text, strlen(f->label_text), temp_holder); + } } } @@ -831,12 +851,15 @@ char * destination_accept(const char * source, token ** remainder, bool validate // Grab destination string url = my_strndup(&source[start], scan_len); - // Advance remainder - while ((*remainder)->start < start + scan_len) + // Advance remainder to end of destination + while ((*remainder)->next && + (*remainder)->next->start < start + scan_len) { *remainder = (*remainder)->next; + } - - t = (*remainder)->prev; + t = (*remainder); // We need to remember this for below + // Move remainder beyond destination + *remainder = (*remainder)->next; // Is there a space in a URL concatenated with a title or attribute? // e.g. 
[foo]: http://foo.bar/ class="foo" @@ -1244,14 +1267,21 @@ void process_definition_block(mmd_engine * e, token * block) { case BLOCK_DEF_ABBREVIATION: // Strip leading '>'' from term f = footnote_new(e->dstr->str, label, block->child, false); - if (f && f->clean_text) + if (f && f->clean_text) { memmove(f->clean_text, &(f->clean_text)[1],strlen(f->clean_text)); + while (char_is_whitespace((f->clean_text)[0])) { + memmove(f->clean_text, &(f->clean_text)[1],strlen(f->clean_text)); + } + } // Adjust the properties free(f->label_text); f->label_text = f->clean_text; - f->clean_text = clean_string_from_range(e->dstr->str, f->content->child->next->next->start, block->start + block->len - f->content->child->next->next->start, false); - + if (f->content->child->next->next) { + f->clean_text = clean_string_from_range(e->dstr->str, f->content->child->next->next->start, block->start + block->len - f->content->child->next->next->start, false); + } else { + f->clean_text = NULL; + } stack_push(e->abbreviation_stack, f); break; case BLOCK_DEF_CITATION: @@ -1326,7 +1356,7 @@ token * manual_label_from_header(token * h, const char * source) { break; case PAIR_BRACKET: label = walker; - while(walker->type == PAIR_BRACKET) { + while(walker && walker->type == PAIR_BRACKET) { walker = walker->prev; count++; } diff --git a/templates/README.md.in b/templates/README.md.in index e5056129..e4999a6f 100644 --- a/templates/README.md.in +++ b/templates/README.md.in @@ -9,531 +9,54 @@ | Version: | @My_Project_Version@ | -## Updates ## - -* 2017-03-13 -- v 6.0.0-b2: - - * ADDED: Add CriticMarkup preprocessor that works across empty lines when accepting/rejecting markup - * ADDED: Add back the mmd6 latex title file - * ADDED: Basic EPUB 3 support -- uses 'miniz' library to zip creation - * ADDED: Update QuickStart and EPUB code - * CHANGED: Update QuickStart guide - * CHANGED: Update test suite - * FIXED: Don't duplicate LaTeX glossary definitions - * FIXED: Fix abbreviations in ODF; Improve test suite - * FIXED: Improve glossaries and abbreviations; Update QuickStart - * FIXED: Tidy up some compiler warnings in code - * FIXED: Use custom UUID code to minimize external dependencies - - -* 2017-03-09 -- v 6.0.0-b1: - - * ADDED: Add French translations; fix typo in German - * ADDED: Add Quick Start guide - * ADDED: Add functionality to automatically identify abbreviations and glossary terms in source - * ADDED: Improve LaTeX configuration files - * ADDED: Update German translations - * ADDED: Use native ODF table of contents instead of a manual list - * ADDED: Use native command for table of contents in LaTeX - * CHANGED: Bring HTML and ODF into line with LaTeX as to output of abbreviatinos on first and subsequent uses - * CHANGED: Slight performance tweak - * CHANGED: Update German test suite - * FIXED: Allow {{TOC}} in latex verbatim - * FIXED: Don't free token_pool if never initialized - * FIXED: Fix German typo - * FIXED: Fix missing token type - * FIXED: Improve performance of checking document for metadata, which improves performance when checking for possible transclusion - * FIXED: Update test suite for abbreviation changes - - -* 2017-03-05 -- v 0.4.2-b: - - * ADDED: Add and utility functions; fix memory leak - * ADDED: Initial abbreviation support - * ADDED: Keep working on Abbreviations/Glossaries - * ADDED: Refactor abbreviation code; Add inline abbreviations; Fix abbreviations in ODF - * ADDED: Update Inline Footnote test - * CHANGED: Add comments to i18n.h - * CHANGED: Finish refactoring note-related code - * 
CHANGED: Refactor footnotes - * CHANGED: Refactor glossary code - * CHANGED: Remove offset from html export functions - * FIXED: latex list items need to block optional argument to allow '[' as first character - * Merge branch 'release/0.4.1-b' into develop - - -* 2017-03-04 -- v 0.4.1-b: - - * FIXED: Add glossary localization - - -* 2017-03-04 -- v 0.4.0-b: - - * ADDED: Add TOC support to ODF - * ADDED: Add glossary support to ODF - * ADDED: Add prelim code for handling abbreviations - * ADDED: Add support for Swift Package Maker; CHANGED: Restructure source directory - * ADDED: Added LaTeX support for escaped characters, fenced code blocks, images, links - * ADDED: Basic ODF Support - * ADDED: Better document strong/emph algorithm - * ADDED: Continue ODF progress - * ADDED: Continue to work on ODF export - * ADDED: Continue work on ODF - * ADDED: Finish ODF support for lists - * ADDED: Improve performance when exporting - * ADDED: Improve token_pool memory handling - * ADDED: Prototype support for Glossaries - * ADDED: Support 'latexconfig' metadata - * CHANGED: Use multiple cases in glossary tests - * FIXED: Don't force glossary terms into lowercase - * FIXED: Fix Makefile for new source file location - * FIXED: Fix algorithm for creating TOC to properly handle 'incorrect' levels - * FIXED: Fix linebreaks in LaTeX; ADDED: Add Linebreaks test file - * FIXED: Fix new_source script for new directory structure - * FIXED: Fix non-breaking space in ODF - * FIXED: Fix padding at end of document body in ODF - * FIXED: Fix underscores in raw latex - * FIXED: Potential bug - * NOTE: Add shared library build option - - -* 2017-02-17 -- v 0.3.1.a: - - * ADDED: 'finalize' beamer support - * ADDED: Add escaped newline as linebreak; start on beamer/memoir support - * ADDED: CriticMarkup test for LaTeX - * ADDED: Custom LaTeX output for CriticMarkup comments - * ADDED: Support mmd export format - * ADDED: Work on cpack installer -- change project name for compatibility - * CHANGED: Adjust latex metadata configuration for consistency - * CHANGED: Configure cmake to use C99 - * FIXED: Add custom implementation for cross-platform support - * FIXED: Fix German HTML tests - * FIXED: Fix cpack destination directory issue - * FIXED: Fix memory leaks etc - * FIXED: Fix warning in custom vasprintf - * FIXED: Modify CMakeLists.txt to test for use of clang compiler - * FIXED: Work on memory leaks - * NOTE: Adjust license width to improve display on smaller terminal windows - - -* 2017-02-14 -- v 0.3.0a: - - * ADDED: Add basic image support to LaTeX - * ADDED: Add file transclusion - * ADDED: Add support for citation 'locators' - * ADDED: Add support for manual labels on ATX Headers - * ADDED: Add support for manual labels on Setext Headers - * ADDED: Add support for tables in LaTeX - * ADDED: HTML Comments appear as raw LaTeX - * ADDED: Improved citation support in LaTeX - * ADDED: Support \autoref{} in LaTeX - * ADDED: Support combined options in LaTeX citations that use the '\]\[' syntax - * ADDED: Support language specifier in fenced code blocks - * ADDED: Support metadata in LaTeX - * ADDED: Update Citations test suite - * FIXED: Escaped LaTeX characters - * FIXED: Fix bug in URL parsing - * FIXED: Fix bug in citation links - * FIXED: Fix bug when no closing divider or newline at end of last table cell - * FIXED: Fix issue printing '-' - * FIXED: Fix scan_url test suite - * FIXED: Get Math working in LaTeX - * FIXED: Improve reliability or link scanner - * FIXED: Properly add id attribute to new instances of 
citation only - * FIXED: Properly handle manual labels with TOC - * FIXED: Properly print hash characters in LaTeX - * FIXED: Separate LaTeX verbatim and texttt character handling - * FIXED: Update Escapes test LaTeX result - * FIXED: Work on escaping LaTeX characters - - -* 2017-02-08 -- v 0.1.4a: - - * ADDED: Add smart quote support for other languages (resolves #15) - - -* 2017-02-08 -- v 0.1.3a: - - * ADDED: Add support for reference image id attributes - * ADDED: Add support for table captions - * ADDED: Metadata support for base header level - * ADDED: Support distinction between 3 and 5 backticks in fenced code blocks - * ADDED: Support Setext headers - * FIXED: Fix issue with metadata disrupting smart quotes - -* 2017-02-07 -- v 0.1.2a: - - * "pathologic" test suite -- fix handling of nested brackets, e.g. - `[[[[foo]]]]` to avoid bogging down checking for reference links that - don't exist. - * Table support -- a single blank line separates sections of tables, so - at least two blank lines are needed between adjacent tables. - * Definition list support - * "fuzz testing" -- stress test the parser for unexpected failures - * Table of Contents support - * Improved compatibility mode parsing - -* 2017-01-28 -- v 0.1.1a includes a few updates: - - * Metadata support - * Metadata variables support - * Extended ASCII range character checking - * Rudimentary language translations, including German - * Improved performance - * Additional testing: - * CriticMarkup - * HTML Blokcs - * Metadata/Variables - * "pathologic" test cases from CommonMark - - ## An Announcement! ## -I would like to officially announce that MultiMarkdown version 6 is in public -alpha. It's finally at a point where it is usable, but there are quite a few -caveats. - -This post is a way for me to organize some of my thoughts, provide some -history for those who are interested, and to provide some tips and tricks from -my experiences for those who are working on their own products. - -But first, some background... - - -### Why a New Version? ### - -MultiMarkdown version 5 was released in November of 2015, but the codebase was -essentially the same as that of v4 -- and that was released in beta in April -of 2013. A few key things prompted work on a new version: - -* Accuracy -- MMD v4 and v5 were the most accurate versions yet, and a lot of -effort went into finding and resolving various edge cases. However, it began -to feel like a game of whack-a-mole where new bugs would creep in every time I -fixed an old one. The PEG began to feel rather convoluted in spots, even -though it did allow for a precise (if not always accurate) specification of -the grammar. - -* Performance -- "Back in the day" [peg-markdown] was one of the fastest -Markdown parsers around. MMD v3 was based on peg-markdown, and would leap- -frog with it in terms of performance. Then [CommonMark] was released, which -was a bit faster. Then a couple of years went by and CommonMark became *much* -faster -- in one of my test suites, MMD v 5.4.0 takes about 25 times longer to -process a long document than CommonMark 0.27.0. - -[peg-markdown]: https://github.com/jgm/peg-markdown -[CommonMark]: http://commonmark.org/ - -Last spring, I decided I wanted to rewrite MultiMarkdown from scratch, -building the parser myself rather than relying on a pre-rolled solution. (I -had been using [greg](https://github.com/ooc-lang/greg) to compile the PEG -into parser code. It worked well overall, but lacked some features I needed, -requiring a lot of workarounds.) 
- - -## First Attempt ## - -My first attempt started by hand-crafting a parser that scanned through the -document a line at a time, deciding what to do with each line as it found -them. I used regex parsers made with [re2c](http://re2c.org/index.html) to -help classify each line, and then a separate parser layer to process groups of -lines into blocks. Initially this approach worked well, and was really -efficient. But I quickly began to code my way into a dead-end -- the strategy -was not elegant enough to handle things like nested lists, etc. - -One thing that did turn out well from the first attempt, however, was an -approach for handling `` and `` parsing. I've learned over the -years that this can be one of the hardest parts of coding accurately for -Markdown. There are many examples that are obvious to a person, but difficult -to properly "explain" how to parse to a computer. - -No solution is perfect, but I developed an approach that seems to accurately -handle a wide range of situations without a great deal of complexity: - -1. Scan the documents for asterisks (`*`). Each one will be handled one at a -time. - -2. Unlike brackets (`[` and `]`), an asterisk is "ambidextrous", in that it -may be able to open a matched pair of asterisks, close a pair, or both. For -example, in `foo *bar* foo`: - - 1. The first asterisk can open a pair, but not close one. - - 2. The second asterisk can close a pair, but not open one. - -3. So, once the asterisks have been identified, each has to be examined to -determine whether it can open/close/both. The algorithm is not that complex, -but I'll describe it in general terms. Check the code for more specifics. -This approach seems to work, but might still need some slight tweaking. In -the future, I'll codify this better in language rather than just in code. - - 1. If there is whitespace to the left of an asterisk, it can't close. - - 2. If there is whitespace or punctuation to the right it can't open. - - 3. "Runs" of asterisks, e.g. `**bar` are treated as a unit in terms of - looking left/right. - - 4. Asterisks inside a word are a bit trickier -- we look at the number of - asterisks before the word, the number in the current run, and the number - of asterisks after the word to determine which combinations, if any, are - permitted. - -4. Once all asterisks have been tagged as able to open/close/both, we proceed -through them in order: - - 1. When we encounter a tag that can close, we look to see if there is a - previous opener that has not been paired off. If so, pair the two and - remove the opener from the list of available asterisks. - - 2. When we encounter an opener, add it to the stack of available openers. - - 3. When encounter an asterisk that can do both, see if it can close an - existing opener. If not, then add it to the stack. - -5. After all tokens in the block have been paired, then we look for nesting -pairs of asterisks in order to create `` and `` sets. For -example, assume we have six asterisks wrapped around a word, three in front, -and three after. The asterisks are indicated with numbers: `123foo456`. We -proceed in the following manner: - - 1. Based on the pairing algorithm above, these asterisks would be paired as - follows, with matching asterisks sharing numbers -- `123foo321`. - - 2. Moving forwards, we come to asterisk "1". It is followed by an - asterisk, so we check to see if they should be grouped as a ``. - Since the "1" asterisks are wrapped immediately outside the "2" asterisks, - they are joined together. 
More than two pairs can't be joined, so we now - get the following -- `112foo211`, where the "11" represents the opening - and closing of a ``, and the "2" represents a ``. - -6. When matching a pair, any unclosed openers that are on the stack are -removed, preventing pairs from "crossing" or "intersecting". Pairs can wrap -around each other, e.g. `[(foo)]`, but not intersect like `[(foo])`. In the -second case, the brackets would close, removing the `(` from the stack. - -7. This same approach is used in all tokens that are matched in pairs-- -`[foo]`, `(foo)`, `_foo_`, etc. There's slightly more to it, but once you -figure out how to assign opening/closing ability, the rest is easy. By using -a stack to track available openers, it can be performed efficiently. - -In my testing, this approach has worked quite well. It handles all the basic -scenarios I've thrown at it, and all of the "basic" and "devious" edge cases I -have thought of (some of these don't necessarily have a "right" answer -- but -v6 gives consistency answers that seem as reasonable as any others to me). -There are also three more edge cases I've come up can still stump it, and -ironically they are handled correctly by most implementations. They just -don't follow the rules above. I'll continue to work on this. - -In the end, I scrapped this effort, but kept the lessons learned in the token -pairing algorithm. - - -## Second Attempt ## - -I tried again this past Fall. This time, I approached the problem with lots -of reading. *Lots and lots* of reading -- tons of websites, computer science -journal articles, PhD theses, etc. Learned a lot about lexers, and a lot -about parsers, including hand-crafting vs using parser generators. In brief: - -1. I learned about the [Aho–Corasick algorithm], which is a great way to -efficiently search a string for multiple target strings at once. I used this -to create a custom lexer to identify tokens in a MultiMarkdown text document -(e.g. `*`, `[ `, `{++`, etc.). I learned a lot, and had a good time working -out the implementation. This code efficiently allowed me to break a string of -text into the tokens that mattered for Markdown parsing. - -2. However, in a few instances I really needed some features of regular -expressions to simplify more complex structures. After a quick bit of testing, -using re2c to create a tokenizer was just as efficient, and allowed me to -incorporate some regex functionality that simplified later parsing. I'll keep -the Aho-Corasick stuff around, and will probably experiment more with it -later. But I didn't need it for MMD now. `lexer.re` contains the source for -the tokenizer. - -[Aho–Corasick algorithm]: https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm - -I looked long and hard for a way to simplify the parsing algorithm to try and -"touch" each token only once. Ideally, the program could step through each -token, and decide when to create a new block, when to pair things together, -etc. But I'm not convinced it's possible. Since Markdown's grammar varies -based on context, it seems to work best when handled in distinct phases: - -1. Tokenize the string to identify key sections of text. This includes line -breaks, allowing the text to be examined one line at time. - -2. Join series of lines together into blocks, such as paragraphs, code blocks, -lists, etc. - -3. The tokens inside each block can then be paired together to create more -complex syntax such as links, strong, emphasis, etc. 
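To make step 3 of that pipeline concrete, here is a minimal sketch of the opener-stack pairing pass. It handles only a single delimiter type, and every name in it (`delim_t`, `CAN_OPEN`, `pair_delims`) is invented for illustration -- this is not the actual MMD 6 source:

    /* Sketch only: pair delimiters in document order using a stack of
     * unpaired openers.  The real parser tracks more state (runs, types). */
    #include <stdio.h>
    #include <stdlib.h>

    #define CAN_OPEN  1
    #define CAN_CLOSE 2

    typedef struct {
        char ch;      /* delimiter character, e.g. '*'        */
        int  flags;   /* CAN_OPEN, CAN_CLOSE, or both         */
        int  mate;    /* index of the paired delimiter, or -1 */
    } delim_t;

    static void pair_delims(delim_t * d, int n) {
        int * stack = malloc(sizeof(int) * n);   /* indices of unpaired openers */
        int top = 0;

        for (int i = 0; i < n; ++i) {
            d[i].mate = -1;

            /* A delimiter that can close pairs with the most recent unpaired opener. */
            if ((d[i].flags & CAN_CLOSE) && top > 0) {
                int o = stack[--top];
                d[o].mate = i;
                d[i].mate = o;
                continue;
            }

            /* Otherwise, if it can open, it waits for a later closer. */
            if (d[i].flags & CAN_OPEN) {
                stack[top++] = i;
            }
        }

        free(stack);
    }

    int main(void) {
        /* "foo *bar* foo": the first '*' can only open, the second can only close. */
        delim_t d[2] = { { '*', CAN_OPEN, -1 }, { '*', CAN_CLOSE, -1 } };
        pair_delims(d, 2);
        printf("delimiter 0 pairs with %d\n", d[0].mate);   /* prints 1 */
        return 0;
    }

Because each delimiter index is pushed and popped at most once, the pairing pass stays linear in the number of delimiters.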
- -To handle the block parsing, I started off using the [Aho-Corasick] code to -handle my first attempt. I had actually implemented some basic regex -functionality, and used that to group lines together to create blocks. But -this quickly fell apart in the face of more complex structures such as -recursive lists. After a lot of searching, and *tons* more reading, I -ultimately decided to use a parser generator to handle the task of group lines -into blocks. `parser.y` has the source for this, and it is processed by the -[lemon](http://www.hwaci.com/sw/lemon/) parser generator to create the actual -code. - -I chose to do this because hand-crafting the block parser would be complex. -The end result would likely be difficult to read and understand, which would -make it difficult to update later on. Using the parser generator allows me to -write things out in a way that can more easily be understood by a person. In -all likelihood, the performance is probably as good as anything I could do -anyway, if not better. - -Because lemon is a LALR(1) parser, it does require a bit of thinking ahead -about how to create the grammar used. But so far, it has been able to handle -everything I have thrown at it. - - -## Optimization ## - -One of my goals for MMD 6 was performance. So I've paid attention to speed -along the way, and have tried to use a few tricks to keep things fast. Here -are some things I've learned along the way. In no particular order: - - -### Memory Allocation ### - -When parsing a long document, a *lot* of token structures are created. Each -one requires a small bit of memory to be allocated. In aggregate, that time -added up and slowed down performance. - -After reading for a bit, I ended up coming up with an approach that uses -larger chunks of memory. I allocate pools of of memory in large slabs for -smaller "objects"". For example, I allocate memory for 1024 tokens at a -single time, and then dole that memory out as needed. When the slab is empty, -a new one is allocated. This dramatically improved performance. - -When pairing tokens, I created a new stack for each block. I realized that an -empty stack didn't have any "leftover" cruft to interfere with re-use, so I -just used one for the entire document. Again a sizeable improvement in -performance from only allocating one object instead of many. When recursing -to a deeper level, the stack just gets deeper, but earlier levels aren't -modified. +MultiMarkdown v6 is finally here! It's technically still in "beta" as I would +like to hear back from a few more users to make sure I'm not missing anything, +but it has been subjected to much more rigorous testing than any previous +versions of MultiMarkdown in the past. If you want more information about +testing, see `DevelopmentNotes`. It's basically feature complete as a +replacement for MMD v5, and included additional features beyond that. -Speaking of tokens, I realized that the average document contains a lot of -single spaces (there's one between every two words I have written, for -example.) The vast majority of the time, these single spaces have no effect -on the output of Markdown documents. I changed my whitespace token search to -only flag runs of 2 or more spaces, dramatically reducing the number of -tokens. This gives the benefit of needing fewer memory allocations, and also -reduces the number of tokens that need to be processed later on. The only -downside is remember to check for a single space character in a few instances -where it matters. 
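As a rough illustration of the slab idea described above -- not the actual `token_pool` implementation; the names and layout here are assumptions -- the allocator can be as small as this:

    /* Sketch of a slab-style object pool: allocate tokens 1024 at a time
     * and hand them out individually.  Caller starts with {NULL, 0}. */
    #include <stdlib.h>

    #define SLAB_SIZE 1024                /* tokens allocated per slab  */

    typedef struct token {
        int    type;
        size_t start;
        size_t len;
    } token;

    typedef struct slab {
        struct slab * next;
        token         items[SLAB_SIZE];
    } slab;

    typedef struct {
        slab * head;                      /* newest slab first          */
        size_t used;                      /* tokens handed out from it  */
    } token_pool;

    /* Hand out one token, allocating a new slab only when the current
     * one is exhausted. */
    static token * pool_new_token(token_pool * p) {
        if (!p->head || p->used == SLAB_SIZE) {
            slab * s = malloc(sizeof(slab));
            if (!s) {
                return NULL;
            }
            s->next = p->head;
            p->head = s;
            p->used = 0;
        }
        return &p->head->items[p->used++];
    }

    /* Draining the pool releases every token in one pass. */
    static void pool_drain(token_pool * p) {
        while (p->head) {
            slab * next = p->head->next;
            free(p->head);
            p->head = next;
        }
        p->used = 0;
    }

Thousands of per-token `malloc` calls collapse into one allocation per slab, and teardown is a single walk over the slab list.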
+## Obtaining MultiMarkdown ## -### Proper input buffering ### +You can download the latest installer for MacOS or Windows at Github: -When I first began last spring, I was amazed to see how much time was being -spent by MultiMarkdown simply reading the input file. Then I discovered it -was because I was reading it one character at a time. I switched to using a -buffered read approach and the time to read the file went to almost nothing. I -experimented with different buffer sizes, but they did not seem to make a -measurable difference. + +To build from source, download from Github. Then: -### Output Buffering ### - -I experimented with different approaches to creating the output after parsing. -I tried printing directly to `stdout`, and even played with different -buffering settings. None of those seemed to work well, and all were slower -than using the `d_string` approach (formerly call `GString` in MMD 5). + make release + (OR) + make debug + cd build + make -### Fast Searches ### +You can optionally test using the test suite: -After getting basic Markdown functionality complete, I discovered during -testing that the time required to parse a document grew exponentially as the -document grew longer. Performance was on par with CommonMark for shorter -documents, but fell increasingly behind in larger tests. Time profiling found -that the culprit was searching for link definitions when they didn't exist. -My first approach was to keep a stack of used link definitions, and to iterate -through them when necessary. In long documents, this performs very poorly. -More research and I ended up using -[uthash](http://troydhanson.github.io/uthash/). This allows me to search for -a link (or footnote, etc.) by "name" rather than searching through an array. -This allowed me to get MMD's performance back to O(n), taking roughly twice as -much time to process a document that is twice as long. + ctest -### Efficient Utility Functions ### +## Differences in the MultiMarkdown Syntax ## -It is frequently necessary when parsing Markdown to check what sort of -character we are dealing with at a certain position -- a letter, whitespace, -punctuation, etc. I created a lookup table for this via `char_lookup.c` and -hard-coded it in `char.c`. These routines allow me to quickly, and -consistently, classify any byte within a document. This saved a lot of -programming time, and saved time tracking down bugs from handling things -slightly differently under different circumstances. I also suspect it -improved performance, but don't have the data to back it up. +MultiMarkdown v6 is mostly about making a better MMD parser, but it involves a +few changes to the MultiMarkdown syntax itself. +1. Setext headers can consist of more than one line to be included in the +header: -### Testing While Writing ### - -I developed several chunks of code in parallel while creating MMD 6. The vast -majority of it was developed largely in a [test-driven development] approach. -The other code was largely created with extensive unit testing to accomplish -this. - -[test-driven development]: https://en.wikipedia.org/wiki/Test-driven_development - -MMD isn't particularly amenable to this approach at the small level, but -instead I relied more on integration testing with an ever-growing collection -of text files and the corresponding HTML files in the MMD 6 test suite. This -allowed me to ensure new features work properly and that old features aren't -broken. At this time, there are 29 text files in the test suite, and many -more to come. 
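For reference, a minimal sketch of the uthash-backed lookup described above. The struct and function names are invented for illustration, but the `HASH_FIND_STR`/`HASH_ADD_KEYPTR` pattern mirrors the `store_link` code touched in `writer.c` earlier in this diff, including the guard against NULL or empty keys that this commit adds:

    /* Sketch only -- assumes uthash.h is on the include path. */
    #include <stdlib.h>
    #include <string.h>
    #include "uthash.h"

    typedef struct link_def {
        char *         label;    /* hash key: the reference label */
        char *         url;
        UT_hash_handle hh;       /* makes the struct hashable     */
    } link_def;

    static link_def * defs = NULL;    /* head of the hash table */

    static void def_store(char * label, char * url) {
        link_def * found = NULL;

        /* Skip NULL/empty keys -- the same guard this commit adds around
         * HASH_FIND_STR in writer.c. */
        if (!label || label[0] == '\0') {
            return;
        }

        HASH_FIND_STR(defs, label, found);

        if (!found) {
            found = malloc(sizeof(link_def));
            if (!found) {
                return;
            }
            found->label = label;
            found->url = url;
            HASH_ADD_KEYPTR(hh, defs, found->label, strlen(found->label), found);
        }
    }

    static link_def * def_lookup(const char * label) {
        link_def * found = NULL;
        HASH_FIND_STR(defs, label, found);   /* average O(1), not a linear scan */
        return found;
    }

Looking definitions up by label in (on average) constant time is what restored the roughly O(n) scaling over document length mentioned above.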
- - -### Other Lessons ### - -Some things that didn't do me any good.... - -I considered differences between using `malloc` and `calloc` when initializing -tokens. The time saved by using `malloc` was basically exactly offset by the -initial time required to initialize the token to default null values as -compared to using `calloc`. When trying `calloc` failed to help me out -(thinking that clearing a single slab in the object pool would be faster), I -stuck with `malloc` as it makes more sense to me in my workflow. - -I read a bit about [struct padding] and reordered some of my structs. It was -until later that I discovered the `-Wpadded` option, and it's not clear -whether my changes modified anything. Since the structs were being padded -automatically, there was no noticeable performance change, and I didn't have -the tools to measure whether I could have improved memory usage at all. Not -sure this would be worth the effort -- much lower hanging fruit available. - -[struct padding]: http://www.catb.org/esr/structure-packing/ - - -## Differences in MultiMarkdown Itself ## - -MultiMarkdown v6 is mostly about making a better MMD parser, but it will -likely involve a few changes to the MultiMarkdown language itself. - - -1. {--I am thinking about removing Setext headers from the language. I almost -never use them, much preferring to use ATX style headers (`# foo #`). -Additionally, I have never liked the fact that Setext headers allow the -meaning of a line to be completely changed by the following line. It makes -the parsing slightly more difficult on a technical level (requiring some -backtracking at times). I'm not 100% certain on this, but right now I believe -it's the only Markdown feature that doesn't exist in MMD 6 yet.--}{++I decided -to go ahead and implement Setext headers, as it can be done with the new -parser without backtracking. One difference with older versions of MMD, as -well as Markdown itself, is that a setext header can consist of more than one -line to be included in the header.++} + This is + a header + ======== 2. Whitespace is not allowed between the text brackets and label brackets in reference links, images, footnotes, etc. For example `[foo] [bar]` will no longer be the same as `[foo][bar]`. 3. Link and image titles can be quoted with `'foo'`, `"foo"`, or `(foo)`. +Link attributes can be used in both reference and inline links/images. 4. HTML elements are handled slightly differently. There is no longer a `markdown="1"` feature. Instead, HTML elements that are on a line by @@ -541,7 +64,8 @@ themselves will open an HTML block that will cause the rest of the "paragraph" to be treated as HTML such that Markdown will not be parsed in side of it. HTML block-level tags are even "stronger" at starting an HTML block. It is not quite as complex as the approach used in CommonMark, but is similar under -most circumstances. +most circumstances. Leaving a blank line after the opening tag will allow +MultiMarkdown parsing inside of the HTML block. For example, this would not be parsed: @@ -558,9 +82,10 @@ most circumstances. 5. "Malformed" reference link definitions are handled slightly differently. -For example, `Reference Footnotes.text` is parsed differently in compatibility -mode than MMD-5. This started as a side-effect of the parsing algorithm, but -I actually think it makes sense. This may or may not change in the future. +For example, the test suite file `Reference Footnotes.text` is parsed +differently in compatibility mode than MMD-5. 
This started as a side-effect +of the parsing algorithm, but I actually think it makes sense. This may or +may not change in the future. 6. Table captions in MMD-6 must come immediately *after* the table, not before it. @@ -570,103 +95,29 @@ before it. feature in MMD, but I don't see a problem with just making it default behavior. +8. Escaped spaces (`\ `) will be interpreted as a non-breaking space, if the +output format supports it. -## Where Does MultiMarkdown 6 Stand? ## - - -### Features ### - -I *think* that all basic Markdown features have been implemented. -Additionally, the following MultiMarkdown features have been implemented: - -* Automatic cross-reference targets -* Basic Citation support -* CriticMarkup support -* Definition lists -* Figures -* Footnotes -* Inline and reference footnotes -* Image and Link attributes (attributes can now be used with inline links as - well as reference links) -* Math support -* Smart quotes (support for languages other than english is not fully - implemented yet) -* Superscripts/subscripts -* Table of Contents -* Tables - - -Things that are partially completed: - -* Citations -- still need: - * Syntax for "not cited" entries - * Output format - * HTML --> separate footnotes and citations? - * Locators required? -* CriticMarkup -- need to decide: - * How to handle CM stretches that include blank lines -* Fenced code blocks -* Headers -- need support for manual labels -* Metadata -* Full/Snippet modes - - -Things yet to be completed: - -* Abbreviations -* Glossaries -* File Transclusion - - -### Accuracy ### - -MultiMarkdown v6 successfully parses the Markdown [syntax page], except for -the Setext header at the top. It passes the 29 test files currently in place. -There are a few at - -[syntax page]: https://daringfireball.net/projects/markdown/syntax - - -### Performance ### - -Basic tests show that currently MMD 6 takes about 20-25% longer the CommonMark -0.27.0 to process long files (e.g. 0.2 MB). However, it is around 5% *faster* -than CommonMark when parsing a shorter file (27 kB) (measured by parsing the -same file 200 times over). This test suite is performed by using the Markdown -[syntax page], modified to avoid the use of the Setext header at the top. The -longer files tested are created by copying the same syntax page onto itself, -thereby doubling the length of the file with each iteration. +9. CriticMarkup, Abbreviations, Glossary Terms, and Citations are handled +slightly differently. See the QuickStart guide for more information. -The largest file I test is approximately 108 MB (4096 copies of the syntax -page). On my machine (2012 Mac mini with 2.3 GHz Intel Core i7, 16 GB RAM), -it takes approximately 4.4 seconds to parse with MMD 6 and 3.7 seconds with -CommonMark. MMD 6 processes approximately 25 MB/s on this test file. -CommonMark 0.27.0 gets about 29 MB/s on the same machine. +10. Fenced code blocks can use leading/trailing "fences" of 3, 4, or 5 +backticks in length. That should be sufficient for complex documents without +requiring a more complex parser. If there is no trailing fence, then the +fenced block is considered to go through the end of the document. -There are some slight variations with the smaller test files (8-32 copies), -but overall the performance of both programs (MMD 6 and CommonMark) are -roughly linear as the test file gets bigger (double the file size and it takes -twice as long to parse, aka O(n)). +11. Emph and Strong parsing is conceptually the same, but the implementation +is different. 
It is designed for speed, accuracy, and consistency. In +general, it seems to handle edge cases much more reliably, but there are still +a couple of situations that I would like to take into account, if possible. +These are not situations that should occur often in "real life." -Out of curiosity, I ran the same tests on the original Markdown.pl by Gruber -(v 1.0.2b8). It took approximately 178 seconds to parse 128 copies of the -file (3.4 MB) and was demonstrating quadratic performance characteristics -(double the file size and it takes 2^2 or 4 times longer to process, aka -O(n^2)). I didn't bother running it on larger versions of the test file. For -comparison, MMD 6 can process 128 copies in approximately 140 msec. +12. EPUB 3 output is supported without need of any external tools. -Of note, the throughput speed drops when testing more complicated files -containing more advanced MultiMarkdown features, though it still seems to -maintain linear performance characteristics. A second test file is created by -concatenating all of the test suite files (including the Markdown syntax -file). In this case, MMD gets about 13 MB/s. CommonMark doesn't support -these additional features, so testing it with that file is not relevant. I -will work to see whether there are certain features in particular that are -more challenging and see whether they can be reworked to improve performance. +13. Internationalization support for HTML phrases, such as "see footnote". See +[Github](https://github.com/fletcher/MultiMarkdown-6/issues/37) for more +information. -As above, I have done some high level optimization of the parse strategy, but -I'm sure there's still a lot of room for further improvement to be made. -Suggestions welcome! ## License ## diff --git a/test/parser_test.y b/test/parser_test.y index 46af54b4..118bea16 100644 --- a/test/parser_test.y +++ b/test/parser_test.y @@ -69,11 +69,13 @@ %fallback LINE_HR LINE_SETEXT_1 LINE_SETEXT_2. -%fallback LINE_PLAIN LINE_TABLE_SEPARATOR. +//%fallback LINE_PLAIN LINE_TABLE_SEPARATOR. -%fallback LINE_CONTINUATION LINE_PLAIN LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE. +%fallback LINE_CONTINUATION LINE_INDENTED_TAB LINE_INDENTED_SPACE LINE_TABLE LINE_TABLE_SEPARATOR. -%fallback LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_HR LINE_BLOCKQUOTE LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_CITATION LINE_DEF_FOOTNOTE LINE_DEF_LINK LINE_FENCE_BACKTICK LINE_FENCE_BACKTICK_START. +%fallback LINE_HTML LINE_ATX_1 LINE_ATX_2 LINE_ATX_3 LINE_ATX_4 LINE_ATX_5 LINE_ATX_6 LINE_BLOCKQUOTE +LINE_LIST_BULLETED LINE_LIST_ENUMERATED LINE_DEF_ABBREVIATION LINE_DEF_CITATION LINE_DEF_FOOTNOTE +LINE_DEF_GLOSSARY LINE_DEF_LINK LINE_FENCE_BACKTICK LINE_FENCE_BACKTICK_START. // Copy clean grammar via `lemon -g parser.y` here @@ -90,8 +92,10 @@ block ::= LINE_ATX_6. block ::= LINE_HR. block ::= LINE_TOC. block ::= blockquote. +block ::= def_abbreviation. block ::= def_citation. block ::= def_footnote. +block ::= def_glossary. block ::= def_link. block ::= definition_block. block ::= empty. @@ -114,7 +118,9 @@ opt_ext_chunk ::= chunk nested_chunks. blockquote ::= blockquote quote_line. def_citation ::= LINE_DEF_CITATION tail. def_footnote ::= LINE_DEF_FOOTNOTE tail. +def_glossary ::= LINE_DEF_GLOSSARY tail. def_link ::= LINE_DEF_LINK chunk. +def_abbreviation ::= LINE_DEF_ABBREVIATION chunk. definition_block ::= para defs. defs ::= defs def. def ::= LINE_DEFINITION tail. @@ -174,7 +180,9 @@ quote_line ::= LINE_BLOCKQUOTE. 
quote_line ::= LINE_CONTINUATION. def_citation ::= LINE_DEF_CITATION. def_footnote ::= LINE_DEF_FOOTNOTE. +def_glossary ::= LINE_DEF_GLOSSARY. def_link ::= LINE_DEF_LINK. +def_abbreviation ::= LINE_DEF_ABBREVIATION. defs ::= def. empty ::= LINE_EMPTY. fenced_block ::= fenced_3. @@ -225,7 +233,7 @@ para ::= defs. void ParseFree(); void ParseTrace(); - #define kMaxToken 34 + #define kMaxToken 36 int i,j,k,l,m, n; diff --git a/tests/MMD6Tests/What Is MMD.text b/tests/Disabled/What Is MMD.text similarity index 100% rename from tests/MMD6Tests/What Is MMD.text rename to tests/Disabled/What Is MMD.text diff --git a/tests/MMD6Tests/Definition Lists.fodt b/tests/MMD6Tests/Definition Lists.fodt index 9e439dbd..60d786e7 100644 --- a/tests/MMD6Tests/Definition Lists.fodt +++ b/tests/MMD6Tests/Definition Lists.fodt @@ -299,6 +299,9 @@ bar foo baz* bat foo + +:foo +
diff --git a/tests/MMD6Tests/Definition Lists.html b/tests/MMD6Tests/Definition Lists.html index 76ea0bc0..4419ca23 100644 --- a/tests/MMD6Tests/Definition Lists.html +++ b/tests/MMD6Tests/Definition Lists.html @@ -32,6 +32,9 @@

foo

+

:foo +

+ diff --git a/tests/MMD6Tests/Definition Lists.htmlc b/tests/MMD6Tests/Definition Lists.htmlc index 24e00b83..65bb395a 100644 --- a/tests/MMD6Tests/Definition Lists.htmlc +++ b/tests/MMD6Tests/Definition Lists.htmlc @@ -21,3 +21,5 @@ bar foo : baz bat

foo

+ +

:foo

diff --git a/tests/MMD6Tests/Definition Lists.tex b/tests/MMD6Tests/Definition Lists.tex index 12256ed3..8e663e9c 100644 --- a/tests/MMD6Tests/Definition Lists.tex +++ b/tests/MMD6Tests/Definition Lists.tex @@ -34,5 +34,8 @@ foo +:foo + + \input{mmd6-article-footer} \end{document} diff --git a/tests/MMD6Tests/Definition Lists.text b/tests/MMD6Tests/Definition Lists.text index 7f46a57c..7a012065 100644 --- a/tests/MMD6Tests/Definition Lists.text +++ b/tests/MMD6Tests/Definition Lists.text @@ -22,3 +22,4 @@ foo foo +:foo diff --git a/tests/MMD6Tests/Fuzz.fodt b/tests/MMD6Tests/Fuzz.fodt new file mode 100644 index 00000000..b0be3a5f --- /dev/null +++ b/tests/MMD6Tests/Fuzz.fodt @@ -0,0 +1,322 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Bibliography + + + + Fuzz Testing + + + +Collection of test cases identified by American fuzzy lop. + +>bar~~} + + + + +list + + +tems + + + +:Escapes + +[>MM]: MultiMarkdown + +foo1 + +foo2 + +]: And aÄe footn.IThi1. $ + +[MMD]: + +f F O O (o o) f o o + + + + +foo + + + foo + + + +(This must be at end of file without trailing newline) + +A + + + diff --git a/tests/MMD6Tests/Fuzz.html b/tests/MMD6Tests/Fuzz.html new file mode 100644 index 00000000..b625f30a --- /dev/null +++ b/tests/MMD6Tests/Fuzz.html @@ -0,0 +1,43 @@ + + + + + Fuzz Testing + + + +

Collection of test cases identified by American fuzzy lop.

+ +

>bar~~}

+ +
    +
  • list
  • +
  • tems
  • +
+ +

:Escapes

+ +

[>MM]: MultiMarkdown

+ +

foo1

+ +

foo2

+ +

]: And aÄe footn.IThi1. $

+ +

[MMD]:

+ +

f F O O (o o) f o o

+ +
    +
  • foo
  • +
  • Â foo
  • +
+ +

(This must be at end of file without trailing newline)

+ +

A

+ + + + diff --git a/tests/MMD6Tests/Fuzz.htmlc b/tests/MMD6Tests/Fuzz.htmlc new file mode 100644 index 00000000..88d697b1 --- /dev/null +++ b/tests/MMD6Tests/Fuzz.htmlc @@ -0,0 +1,38 @@ +

Title: Fuzz Testing +latex config: article

+ +

Collection of test cases identified by American fuzzy lop.

+ +

û~~foo~>bar~~}

+ +
    +
  • list
  • +
  • tems
  • +
+ +

:Escapes [escaped]

+ +

[>MM]: MultiMarkdown

+ +

[?terí¢ıı[?term]: A term to be defined.

+ +

foo1

+ +

foo2

+ +

]: And aÄe footn.I^Thi1. $

+ +

[MMD]:

+ +

f o o f o o

+ +

[> o o]: F O O

+ +
    +
  • foo
  • +
  • Â foo
  • +
+ +

(This must be at end of file without trailing newline)

+ +

A

diff --git a/tests/MMD6Tests/Fuzz.tex b/tests/MMD6Tests/Fuzz.tex new file mode 100644 index 00000000..e0ba52dd --- /dev/null +++ b/tests/MMD6Tests/Fuzz.tex @@ -0,0 +1,46 @@ +\input{mmd6-article-leader} +\def\mytitle{Fuzz Testing} +\newacronym{o o}{o o}{F O O} + +\input{mmd6-article-begin} + +Collection of test cases identified by \href{http://lcamtuf.coredump.cx/afl/}{American fuzzy lop}\footnote{\href{http://lcamtuf.coredump.cx/afl/}{http:\slash \slash lcamtuf.coredump.cx\slash afl\slash }}. + +>bar~~} + +\begin{itemize} +\item{} list + +\item{} tems + +\end{itemize} + +\chapter{:Escapes } +\label{escaped} + +[>MM]: MultiMarkdown + +foo1 (\autoref{ba\}) + +foo2 (\autoref{bar}) + +]: And aÄe footn.I\textsuperscript{Thi1}. \$ + +[MMD]: + +f \gls{o o} f \gls{o o} + +\begin{itemize} +\item{} foo + +\item{}  foo + +\end{itemize} + +(This must be at end of file without trailing newline) + +\part{A} +\label{a} + +\input{mmd6-article-footer} +\end{document} diff --git a/tests/MMD6Tests/Fuzz.text b/tests/MMD6Tests/Fuzz.text new file mode 100644 index 00000000..67e0fc30 --- /dev/null +++ b/tests/MMD6Tests/Fuzz.text @@ -0,0 +1,40 @@ +Title: Fuzz Testing +latex config: article + +Collection of test cases identified by [American fuzzy lop](http://lcamtuf.coredump.cx/afl/). + +û~~foo~>bar~~} + +* list +* tems + +:Escapes [escaped] +---------------- + +[>MM\]: MultiMarkdown + +[?terí¢ıı[?term]: A term to be defined. + +[foo1] + +[foo2] + +[foo1]: #ba\ +[foo2]: #bar + +]: And aÄe footn.I^Thi1. \$ + +[MMD]: + +f o o f o o + +[> o o]: F O O + +* foo +*   foo + + + +(This must be at end of file without trailing newline) + +# A \ No newline at end of file