diff --git a/.gitignore b/.gitignore index d4071e6..a4e1b2c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ test/coverage doc/markup.odocl doc/html doc/publish +doc/*.zip opam diff --git a/Makefile b/Makefile index 94c5048..5847736 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ LIB := markup +VERSION := 0.5 if_package = ! ocamlfind query $(1) > /dev/null 2> /dev/null || ( $(2) ) @@ -118,13 +119,7 @@ docs-odocl : PUBLISH := doc/publish .PHONY : publish-docs -publish-docs : docs - @test $(OCAML_VERSION) -ne 402 \ - || (echo "\nocamldoc is broken in 4.02" && false) - @ocamlfind query lwt.unix > /dev/null 2> /dev/null \ - || (echo "\nLwt not installed" && false) - @ocamlfind query lambdasoup > /dev/null 2> /dev/null \ - || (echo "\nLambda Soup not installed" && false) +publish-docs : check-doc-prereqs docs rm -rf $(PUBLISH) mkdir -p $(PUBLISH) cd $(PUBLISH) \ @@ -135,6 +130,22 @@ publish-docs : docs && git commit -m 'Markup.ml documentation.' \ && git push -uf github master:gh-pages +DOC_ZIP := doc/$(LIB)-$(VERSION)-doc.zip + +.PHONY : package-docs +package-docs : check-doc-prereqs docs + rm -f $(DOC_ZIP) + zip -9 $(DOC_ZIP) $(HTML)/* + +.PHONY : check-doc-prereqs +check-doc-prereqs : + @test $(OCAML_VERSION) -ne 402 \ + || (echo "\nocamldoc is broken in 4.02" && false) + @ocamlfind query lwt.unix > /dev/null 2> /dev/null \ + || (echo "\nLwt not installed" && false) + @ocamlfind query lambdasoup > /dev/null 2> /dev/null \ + || (echo "\nLambda Soup not installed" && false) + need_package = \ ocamlfind query $(1) > /dev/null 2> /dev/null \ || echo "Missing package '$(1)' (opam install $(2))" @@ -193,5 +204,5 @@ uninstall : .PHONY : clean clean : $(OCAMLBUILD) -clean - rm -rf bisect*.out $(COVERAGE) $(HTML) $(PUBLISH) opam + rm -rf bisect*.out $(COVERAGE) $(HTML) $(PUBLISH) $(DOC_ZIP) opam cd $(DEP_TEST_DIR) && $(OCAMLBUILD) -clean diff --git a/README.md b/README.md index 7151c65..627dbc5 100644 --- a/README.md +++ b/README.md @@ -82,13 +82,14 @@ let report 
= count := !count + 1; if !count >= 10 then raise_notrace Exit -string "some xml" |> parse_xml ~report |> drain +string "some xml" |> parse_xml ~report |> signals |> drain (* Load HTML into a custom document tree data type. *) type html = Text of string | Element of string * html list file "some_file" |> parse_html +|> signals |> tree ~text:(fun ss -> Text (String.concat "" ss)) ~element:(fun (_, name) _ children -> Element (name, children)) @@ -151,19 +152,12 @@ opam install lwt cohttp lambdasoup markup ## Installing -Until Markup.ml is added to OPAM, the easiest way to install it is by cloning -this repository, then running +Simply ```sh -make install +opam install markup ``` -in the cloned directory. This will use OPAM to pin Markup.ml, install the -dependency Uutf, then build and install Markup.ml. If you want to use the module -`Markup_lwt`, check that Lwt is installed before installing Markup.ml. - -To remove the pin later, run `make uninstall`. - ## Documentation The interface of Markup.ml is three modules [`Markup`][Markup], diff --git a/doc/postprocess.ml b/doc/postprocess.ml index 96b6e33..4666ded 100644 --- a/doc/postprocess.ml +++ b/doc/postprocess.ml @@ -229,7 +229,8 @@ let clean_up_content soup = uncolor "constructor" "ASYNCHRONOUS"; uncolor "constructor" "Pervasives"; uncolor "constructor" "Lwt_io"; - uncolor "keyword" "false" + uncolor "keyword" "false"; + uncolor "keyword" "parser" let add_with_type soup type_name = let extra = diff --git a/doc/style.css b/doc/style.css index 041c153..e35df21 100644 --- a/doc/style.css +++ b/doc/style.css @@ -183,6 +183,12 @@ h2 { line-height: 1; } +h3 { + font-size: 100%; + margin-top: 1.5em; + margin-bottom: 1.5em; +} + body > pre:first-of-type { margin-top: 1.5em; } diff --git a/src/markup.mli b/src/markup.mli index 92dd935..f36bfa9 100644 --- a/src/markup.mli +++ b/src/markup.mli @@ -77,7 +77,9 @@ val write_xml : signal stream -> char stream Markup.ml is developed on {{:https://github.com/aantron/markup.ml} 
GitHub} and distributed under the {{:https://github.com/aantron/markup.ml/blob/master/doc/LICENSE} - BSD license}. This documentation is for version 0.5 of the library. *) + BSD license}. This documentation is for version 0.5 of the library. + Documentation for older versions can be found on the + {{:https://github.com/aantron/markup.ml/releases} releases page}. *) @@ -193,7 +195,7 @@ sig val decode : ?report:(location -> Error.t -> unit) -> t -> (char, 's) stream -> (int, 's) stream - (** Applies a decoder to byte stream. Illegal input byte sequences result in + (** Applies a decoder to a byte stream. Illegal input byte sequences result in calls to the error handler [~report] with error kind [`Decoding_error]. The illegal bytes are then skipped, and zero or more U+FFFD replacement characters are emitted. The default handler ignores errors. @@ -308,9 +310,9 @@ val signal_to_string : [< signal ] -> string (** {2 Parsers} *) type 's parser -(** A ['s parser] is a thin wrapper around a [(signal, 's) stream] that supports - access to additional information that is not carried directly in the stream, - such as source locations. *) +(** An ['s parser] is a thin wrapper around a [(signal, 's) stream] that + supports access to additional information that is not carried directly in + the stream, such as source locations. *) val signals : 's parser -> (signal, 's) stream (** Converts a parser to its underlying signal stream. *) @@ -761,3 +763,66 @@ val kstream : ('a, _) stream -> 'a Kstream.t val of_kstream : 'a Kstream.t -> ('a, _) stream (**/**) + + + +(** {2 Conformance status} + + The HTML parser seeks to implement section 8 of the HTML5 specification. + That section describes a parser, part of a full-blown user agent, that is + building up a DOM representation of an HTML document. Markup.ml is neither + inherently part of a user agent, nor does it build up a DOM representation. + With respect to section 8 of HTML5, Markup.ml is concerned with only the + syntax. 
When that section requires that the user agent perform an action, + Markup.ml emits enough information for a hypothetical user agent based on it + to be able to decide to perform this action. Likewise, Markup.ml seeks to + emit enough information for a hypothetical user agent to build up a + conforming DOM. + + The XML parser seeks to be a non-validating implementation of the XML and + Namespaces in XML specifications. + + The rest of this section lists known deviations from HTML5, XML, and + Namespaces in XML. Some of these deviations are meant to be corrected in + future versions of Markup.ml, while others will probably remain. The latter + satisfy some or all of the following properties: + + - They require non-local adjustment, especially of past nodes. For example, + adjusting the start signal of the root node mid-way through the signal + stream is difficult for a one-pass parser. + - They are minor. Users implementing less than a conforming browser + typically don't care about them, and they typically have to do with + obscure error recovery. + - They can easily be corrected by code written over Markup.ml that builds up + a DOM or maintains other auxiliary data structures during parsing. + + {3 To be corrected} + + - XML: There is no attribute value normalization. + - HTML: The {e adoption agency algorithm} is not implemented, because it + requires non-local adjustments. + - HTML: {e foster parenting} is not implemented, because it requires + non-local adjustments. + - HTML: Quirks mode is not honored. This affects the interaction between + automatic closing of [p] elements and opening of [table] elements. + - HTML: The parser ignores the {e head element pointer}. + - HTML: The parser ignores the {e form element pointer}. + - HTML: The parser ignores interactions between [form] and [template]. + - HTML: The form translation for [isindex] is completely ignored. [isindex] + is handled as an unknown element. 
+ + {3 To remain} + + - HTML: Except when detecting encodings, the parser does not try to read + [<meta>] tags for encoding declarations. The user of Markup.ml should read + these, if necessary. They are part of the emitted signal stream. + - HTML: [noscript] elements are always parsed, as are [script] elements. For + conforming behavior, if the user of Markup.ml "supports scripts," the user + should serialize the content of [noscript] to a [`Text] signal using + [write_html]. + - HTML: Elements such as [title] that belong in [head], but are found + between [head] and [body], are not moved into [head]. + - HTML: [<html>] tags found in the body do not have their attributes added + to the [`Start_element "html"] signal emitted at the beginning of the + document. +*) diff --git a/test/test_encoding.ml b/test/test_encoding.ml index c2a665f..1671e20 100644 --- a/test/test_encoding.ml +++ b/test/test_encoding.ml @@ -9,8 +9,6 @@ open Kstream open Stream_io open Encoding -(* TODO Test exception pass-through. *) - let ok = wrong_k "failed" let test_ucs_4 (f : Encoding.t) name s1 s2 bad_bytes = diff --git a/test/test_html_parser.ml b/test/test_html_parser.ml index 69a9cec..aacbdb8 100644 --- a/test/test_html_parser.ml +++ b/test/test_html_parser.ml @@ -114,7 +114,6 @@ let tests = [ 1, 31, S `End_element; 1, 31, S `End_element]); - (* TODO Document deviation for non-iframe srcdoc documents. *) ("html.parser.no-doctype" >:: fun _ -> expect ~prefix:true "foo" [ 1, 1, S (start_element "html"); @@ -357,7 +356,6 @@ let tests = [ 1, 18, S `End_element; 1, 18, S `End_element]); - (* TODO It is strange that the tag always causes a parse error. *) ("html.parser.plaintext" >:: fun _ -> expect "<p><plaintext>foo</plaintext></p>" [ 1, 1, S (start_element "html"); @@ -413,7 +411,6 @@ let tests = [ 1, 40, S `End_element; 1, 40, S `End_element]); - (* TODO Test condition in EOF case, likewise HTML case. 
*) ("html.parser.truncated-body" >:: fun _ -> expect "<body>" [ 1, 1, S (start_element "html"); @@ -459,8 +456,6 @@ let tests = [ 1, 22, S `End_element; 1, 29, S `End_element]); - (* TODO Don't double-report errors on the same start tag. *) - (* TODO Change the location of implied start tags? *) ("html.parser.reconstruct-active-formatting-elements" >:: fun _ -> expect "<p><em><strong>foo<p>bar" [ 1, 1, S (start_element "html"); @@ -533,7 +528,6 @@ let tests = [ [ 1, 1, E (`Bad_token ("U+0000", "foreign content", "null")); 1, 1, S (`Text ["\xef\xbf\xbdfoo"])]; - (* TODO Throttle `Bad_content. *) expect ~context:(Some (`Fragment "body")) "<table>\x00foo</table>" [ 1, 1, S (start_element "table"); 1, 8, E (`Bad_token ("U+0000", "table", "null")); diff --git a/test/test_html_writer.ml b/test/test_html_writer.ml index 25048c3..0538ea3 100644 --- a/test/test_html_writer.ml +++ b/test/test_html_writer.ml @@ -5,10 +5,6 @@ open OUnit2 open Test_support open Common -(* TODO Test qnames for non-SVG,MathML,HTML elements. *) -(* TODO Attribute qnames. *) -(* TODO Test xmlns, xlink, and xml namespaces in the parser as well. *) - let expect id signals strings = let _, iterate, ended = expect_strings id strings in @@ -93,7 +89,6 @@ let tests = [ S "id"; S "=\""; S "foo<>&quot;&amp;&nbsp;"; S "\""; S ">"; S "</"; S "p"; S ">"]); - (* TODO Implement self-closing foreign elements. *) ("html.writer.foreign-element" >:: fun _ -> expect "foreign element" [`Start_element ((svg_ns, "use"), [(xlink_ns, "href"), "#foo"]); diff --git a/test/test_support.ml b/test/test_support.ml index 9c43dd6..a22c3cf 100644 --- a/test/test_support.ml +++ b/test/test_support.ml @@ -42,7 +42,6 @@ let expect_error : sprintf "no error\nexpected \"%s\"" (Error.to_string ~location:l error) |> assert_failure -(* TODO Rename id to label. 
*) let expect_sequence ?(prefix = false) id to_string sequence = let assert_failure s = assert_failure (id ^ "\n" ^ s) in diff --git a/test/test_xml_parser.ml b/test/test_xml_parser.ml index 45eb9e0..087c9cb 100644 --- a/test/test_xml_parser.ml +++ b/test/test_xml_parser.ml @@ -368,5 +368,3 @@ let tests = [ (xmlns_ns, "c"), "baz"])); 1, 1, S `End_element]) ] - -(* TODO Test fragment argument. *) diff --git a/test/test_xml_tokenizer.ml b/test/test_xml_tokenizer.ml index 1398f9a..5bac7f5 100644 --- a/test/test_xml_tokenizer.ml +++ b/test/test_xml_tokenizer.ml @@ -5,8 +5,6 @@ open OUnit2 open Test_support open Common -(* TODO Test exception pass-through in integration. *) - let xml_decl version encoding standalone = `Xml {version; encoding; standalone} diff --git a/test/test_xml_writer.ml b/test/test_xml_writer.ml index fd0c6eb..91fe095 100644 --- a/test/test_xml_writer.ml +++ b/test/test_xml_writer.ml @@ -150,7 +150,6 @@ let tests = [ S "xmlns:a"; S "=\""; S "other_ns"; S "\""; S "/>"; S "</"; S "foo"; S ">"]; - (* TODO Do this test in both orders. *) expect "shadowing resolution" [`Start_element (("", "foo"), [(xmlns_ns, "a"), "some_ns"; @@ -185,5 +184,3 @@ let tests = [ S "<"; S "foo"; S " "; S "xmlns:a"; S "=\""; S "other_ns"; S "\""; S "/>"]) ] - -(* TODO Ill-formed signal sequences. *)