diff --git a/.gitignore b/.gitignore index d4071e6..a4e1b2c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ test/coverage doc/markup.odocl doc/html doc/publish +doc/*.zip opam diff --git a/Makefile b/Makefile index 94c5048..5847736 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,5 @@ LIB := markup +VERSION := 0.5 if_package = ! ocamlfind query $(1) > /dev/null 2> /dev/null || ( $(2) ) @@ -118,13 +119,7 @@ docs-odocl : PUBLISH := doc/publish .PHONY : publish-docs -publish-docs : docs - @test $(OCAML_VERSION) -ne 402 \ - || (echo "\nocamldoc is broken in 4.02" && false) - @ocamlfind query lwt.unix > /dev/null 2> /dev/null \ - || (echo "\nLwt not installed" && false) - @ocamlfind query lambdasoup > /dev/null 2> /dev/null \ - || (echo "\nLambda Soup not installed" && false) +publish-docs : check-doc-prereqs docs rm -rf $(PUBLISH) mkdir -p $(PUBLISH) cd $(PUBLISH) \ @@ -135,6 +130,22 @@ publish-docs : docs && git commit -m 'Markup.ml documentation.' \ && git push -uf github master:gh-pages +DOC_ZIP := doc/$(LIB)-$(VERSION)-doc.zip + +.PHONY : package-docs +package-docs : check-doc-prereqs docs + rm -f $(DOC_ZIP) + zip -9 $(DOC_ZIP) $(HTML)/* + +.PHONY : check-doc-prereqs +check-doc-prereqs : + @test $(OCAML_VERSION) -ne 402 \ + || (echo "\nocamldoc is broken in 4.02" && false) + @ocamlfind query lwt.unix > /dev/null 2> /dev/null \ + || (echo "\nLwt not installed" && false) + @ocamlfind query lambdasoup > /dev/null 2> /dev/null \ + || (echo "\nLambda Soup not installed" && false) + need_package = \ ocamlfind query $(1) > /dev/null 2> /dev/null \ || echo "Missing package '$(1)' (opam install $(2))" @@ -193,5 +204,5 @@ uninstall : .PHONY : clean clean : $(OCAMLBUILD) -clean - rm -rf bisect*.out $(COVERAGE) $(HTML) $(PUBLISH) opam + rm -rf bisect*.out $(COVERAGE) $(HTML) $(PUBLISH) $(DOC_ZIP) opam cd $(DEP_TEST_DIR) && $(OCAMLBUILD) -clean diff --git a/README.md b/README.md index 7151c65..627dbc5 100644 --- a/README.md +++ b/README.md @@ -82,13 +82,14 @@ let report 
= count := !count + 1; if !count >= 10 then raise_notrace Exit -string "some xml" |> parse_xml ~report |> drain +string "some xml" |> parse_xml ~report |> signals |> drain (* Load HTML into a custom document tree data type. *) type html = Text of string | Element of string * html list file "some_file" |> parse_html +|> signals |> tree ~text:(fun ss -> Text (String.concat "" ss)) ~element:(fun (_, name) _ children -> Element (name, children)) @@ -151,19 +152,12 @@ opam install lwt cohttp lambdasoup markup ## Installing -Until Markup.ml is added to OPAM, the easiest way to install it is by cloning -this repository, then running +Simply ```sh -make install +opam install markup ``` -in the cloned directory. This will use OPAM to pin Markup.ml, install the -dependency Uutf, then build and install Markup.ml. If you want to use the module -`Markup_lwt`, check that Lwt is installed before installing Markup.ml. - -To remove the pin later, run `make uninstall`. - ## Documentation The interface of Markup.ml is three modules [`Markup`][Markup], diff --git a/doc/postprocess.ml b/doc/postprocess.ml index 96b6e33..4666ded 100644 --- a/doc/postprocess.ml +++ b/doc/postprocess.ml @@ -229,7 +229,8 @@ let clean_up_content soup = uncolor "constructor" "ASYNCHRONOUS"; uncolor "constructor" "Pervasives"; uncolor "constructor" "Lwt_io"; - uncolor "keyword" "false" + uncolor "keyword" "false"; + uncolor "keyword" "parser" let add_with_type soup type_name = let extra = diff --git a/doc/style.css b/doc/style.css index 041c153..e35df21 100644 --- a/doc/style.css +++ b/doc/style.css @@ -183,6 +183,12 @@ h2 { line-height: 1; } +h3 { + font-size: 100%; + margin-top: 1.5em; + margin-bottom: 1.5em; +} + body > pre:first-of-type { margin-top: 1.5em; } diff --git a/src/markup.mli b/src/markup.mli index 92dd935..f36bfa9 100644 --- a/src/markup.mli +++ b/src/markup.mli @@ -77,7 +77,9 @@ val write_xml : signal stream -> char stream Markup.ml is developed on {{:https://github.com/aantron/markup.ml} 
GitHub} and distributed under the {{:https://github.com/aantron/markup.ml/blob/master/doc/LICENSE} - BSD license}. This documentation is for version 0.5 of the library. *) + BSD license}. This documentation is for version 0.5 of the library. + Documentation for older versions can be found on the + {{: https://github.com/aantron/markup.ml/releases} releases page}. *) @@ -193,7 +195,7 @@ sig val decode : ?report:(location -> Error.t -> unit) -> t -> (char, 's) stream -> (int, 's) stream - (** Applies a decoder to byte stream. Illegal input byte sequences result in + (** Applies a decoder to a byte stream. Illegal input byte sequences result in calls to the error handler [~report] with error kind [`Decoding_error]. The illegal bytes are then skipped, and zero or more U+FFFD replacement characters are emitted. The default handler ignores errors. @@ -308,9 +310,9 @@ val signal_to_string : [< signal ] -> string (** {2 Parsers} *) type 's parser -(** A ['s parser] is a thin wrapper around a [(signal, 's) stream] that supports - access to additional information that is not carried directly in the stream, - such as source locations. *) +(** An ['s parser] is a thin wrapper around a [(signal, 's) stream] that + supports access to additional information that is not carried directly in + the stream, such as source locations. *) val signals : 's parser -> (signal, 's) stream (** Converts a parser to its underlying signal stream. *) @@ -761,3 +763,66 @@ val kstream : ('a, _) stream -> 'a Kstream.t val of_kstream : 'a Kstream.t -> ('a, _) stream (**/**) + + + +(** {2 Conformance status} + + The HTML parser seeks to implement section 8 of the HTML5 specification. + That section describes a parser, part of a full-blown user agent, that is + building up a DOM representation of an HTML document. Markup.ml is neither + inherently part of a user agent, nor does it build up a DOM representation. + With respect to section 8 of HTML5, Markup.ml is concerned with only the + syntax. 
When that section requires that the user agent perform an action, + Markup.ml emits enough information for a hypothetical user agent based on it + to be able to decide to perform this action. Likewise, Markup.ml seeks to + emit enough information for a hypothetical user agent to build up a + conforming DOM. + + The XML parser seeks to be a non-validating implementation of the XML and + Namespaces in XML specifications. + + The rest of this section lists known deviations from HTML5, XML, and + Namespaces in XML. Some of these deviations are meant to be corrected in + future versions of Markup.ml, while others will probably remain. The latter + satisfy some or all of the following properties: + + - They require non-local adjustment, especially of past nodes. For example, + adjusting the start signal of the root node mid-way through the signal + stream is difficult for a one-pass parser. + - They are minor. Users implementing less than a conforming browser + typically don't care about them, and they typically have to do with + obscure error recovery. + - They can easily be corrected by code written over Markup.ml that builds up + a DOM or maintains other auxiliary data structures during parsing. + + {3 To be corrected} + + - XML: There is no attribute value normalization. + - HTML: The {e adoption agency algorithm} is not implemented, because it + requires non-local adjustments. + - HTML: {e foster parenting} is not implemented, because it requires + non-local adjustments. + - HTML: Quirks mode is not honored. This affects the interaction between + automatic closing of [p] elements and opening of [table] elements. + - HTML: The parser ignores the {e head element pointer}. + - HTML: The parser ignores the {e form element pointer}. + - HTML: The parser ignores interactions between [form] and [template]. + - HTML: The form translation for [isindex] is completely ignored. [isindex] + is handled as an unknown element. 
+ + {3 To remain} + + - HTML: Except when detecting encodings, the parser does not try to read + [<meta>] tags for encoding declarations. The user of Markup.ml should read + these, if necessary. They are part of the emitted signal stream. + - HTML: [noscript] elements are always parsed, as are [script] elements. For + conforming behavior, if the user of Markup.ml "supports scripts," the user + should serialize the content of [noscript] to a [`Text] signal using + [write_html]. + - HTML: Elements such as [title] that belong in [head], but are found + between [head] and [body], are not moved into [head]. + - HTML: [<html>] tags found in the body do not have their attributes added + to the [`Start_element "html"] signal emitted at the beginning of the + document. +*) diff --git a/test/test_encoding.ml b/test/test_encoding.ml index c2a665f..1671e20 100644 --- a/test/test_encoding.ml +++ b/test/test_encoding.ml @@ -9,8 +9,6 @@ open Kstream open Stream_io open Encoding -(* TODO Test exception pass-through. *) - let ok = wrong_k "failed" let test_ucs_4 (f : Encoding.t) name s1 s2 bad_bytes = diff --git a/test/test_html_parser.ml b/test/test_html_parser.ml index 69a9cec..aacbdb8 100644 --- a/test/test_html_parser.ml +++ b/test/test_html_parser.ml @@ -114,7 +114,6 @@ let tests = [ 1, 31, S `End_element; 1, 31, S `End_element]); - (* TODO Document deviation for non-iframe srcdoc documents. *) ("html.parser.no-doctype" >:: fun _ -> expect ~prefix:true "
foo bar"
[ 1, 1, S (start_element "html");
@@ -533,7 +528,6 @@ let tests = [
[ 1, 1, E (`Bad_token ("U+0000", "foreign content", "null"));
1, 1, S (`Text ["\xef\xbf\xbdfoo"])];
- (* TODO Throttle `Bad_content. *)
expect ~context:(Some (`Fragment "body")) "\x00foo
"
[ 1, 1, S (start_element "table");
1, 8, E (`Bad_token ("U+0000", "table", "null"));
diff --git a/test/test_html_writer.ml b/test/test_html_writer.ml
index 25048c3..0538ea3 100644
--- a/test/test_html_writer.ml
+++ b/test/test_html_writer.ml
@@ -5,10 +5,6 @@ open OUnit2
open Test_support
open Common
-(* TODO Test qnames for non-SVG,MathML,HTML elements. *)
-(* TODO Attribute qnames. *)
-(* TODO Test xmlns, xlink, and xml namespaces in the parser as well. *)
-
let expect id signals strings =
let _, iterate, ended = expect_strings id strings in
@@ -93,7 +89,6 @@ let tests = [
S "id"; S "=\""; S "foo<>"& "; S "\""; S ">";
S ""; S "p"; S ">"]);
- (* TODO Implement self-closing foreign elements. *)
("html.writer.foreign-element" >:: fun _ ->
expect "foreign element"
[`Start_element ((svg_ns, "use"), [(xlink_ns, "href"), "#foo"]);
diff --git a/test/test_support.ml b/test/test_support.ml
index 9c43dd6..a22c3cf 100644
--- a/test/test_support.ml
+++ b/test/test_support.ml
@@ -42,7 +42,6 @@ let expect_error :
sprintf "no error\nexpected \"%s\"" (Error.to_string ~location:l error)
|> assert_failure
-(* TODO Rename id to label. *)
let expect_sequence ?(prefix = false) id to_string sequence =
let assert_failure s = assert_failure (id ^ "\n" ^ s) in
diff --git a/test/test_xml_parser.ml b/test/test_xml_parser.ml
index 45eb9e0..087c9cb 100644
--- a/test/test_xml_parser.ml
+++ b/test/test_xml_parser.ml
@@ -368,5 +368,3 @@ let tests = [
(xmlns_ns, "c"), "baz"]));
1, 1, S `End_element])
]
-
-(* TODO Test fragment argument. *)
diff --git a/test/test_xml_tokenizer.ml b/test/test_xml_tokenizer.ml
index 1398f9a..5bac7f5 100644
--- a/test/test_xml_tokenizer.ml
+++ b/test/test_xml_tokenizer.ml
@@ -5,8 +5,6 @@ open OUnit2
open Test_support
open Common
-(* TODO Test exception pass-through in integration. *)
-
let xml_decl version encoding standalone =
`Xml {version; encoding; standalone}
diff --git a/test/test_xml_writer.ml b/test/test_xml_writer.ml
index fd0c6eb..91fe095 100644
--- a/test/test_xml_writer.ml
+++ b/test/test_xml_writer.ml
@@ -150,7 +150,6 @@ let tests = [
S "xmlns:a"; S "=\""; S "other_ns"; S "\""; S "/>";
S ""; S "foo"; S ">"];
- (* TODO Do this test in both orders. *)
expect "shadowing resolution"
[`Start_element (("", "foo"),
[(xmlns_ns, "a"), "some_ns";
@@ -185,5 +184,3 @@ let tests = [
S "<"; S "foo"; S " ";
S "xmlns:a"; S "=\""; S "other_ns"; S "\""; S "/>"])
]
-
-(* TODO Ill-formed signal sequences. *)