From 6c3135ec3a4b2de030379a97d3b3fa60e5635fe4 Mon Sep 17 00:00:00 2001 From: Jeroen Koops Date: Fri, 15 Oct 2010 16:14:20 +0200 Subject: [PATCH 1/3] Fix for parsing broken processing instructions. Some HTML contains broken processing instructions, such as this one encountered in the wild: `<?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" />`. The parser crashes on this since it only accepts `?>` (or end-of-file) as the end of a processing instruction. This patch fixes it by also allowing `>` or `/>` to end a processing instruction. --- src/mochiweb_html.erl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl index b10bbe39..dda2a701 100644 --- a/src/mochiweb_html.erl +++ b/src/mochiweb_html.erl @@ -576,6 +576,10 @@ find_qgt(Bin, S=#decoder{offset=O}) -> case Bin of <<_:O/binary, "?>", _/binary>> -> ?ADV_COL(S, 2); + <<_:O/binary, ">", _/binary>> -> + ?ADV_COL(S, 1); + <<_:O/binary, "/>", _/binary>> -> + ?ADV_COL(S, 2); %% tokenize_attributes takes care of this state: %% <<_:O/binary, C, _/binary>> -> %% find_qgt(Bin, ?INC_CHAR(S, C)); @@ -1236,5 +1240,15 @@ parse_missing_attr_name_test() -> {<<"html">>, [ { <<"=">>, <<"=">> }, { <<"black">>, <<"black">> } ], [] }, mochiweb_html:parse(D0)), ok. + +parse_broken_pi_test() -> + D0 = <<"<html><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></html>">>, + ?assertEqual( + {<<"html">>, [], [ + { pi, <<"xml:namespace">>, [ { <<"prefix">>, <<"o">> }, + { <<"ns">>, <<"urn:schemas-microsoft-com:office:office">> } ] } + ] }, + mochiweb_html:parse(D0)), + ok. -endif. From 9e45e2ebf93caba04d8b09272ced7678dc9561a3 Mon Sep 17 00:00:00 2001 From: Jeroen Koops Date: Fri, 15 Oct 2010 22:14:20 +0800 Subject: [PATCH 2/3] Fix for parsing broken processing instructions. Some HTML contains broken processing instructions, such as this one encountered in the wild: `<?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" />`. The parser crashes on this since it only accepts `?>` (or end-of-file) as the end of a processing instruction. This patch fixes it by also allowing `>` or `/>` to end a processing instruction. 
--- src/mochiweb_html.erl | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl index b10bbe39..dda2a701 100644 --- a/src/mochiweb_html.erl +++ b/src/mochiweb_html.erl @@ -576,6 +576,10 @@ find_qgt(Bin, S=#decoder{offset=O}) -> case Bin of <<_:O/binary, "?>", _/binary>> -> ?ADV_COL(S, 2); + <<_:O/binary, ">", _/binary>> -> + ?ADV_COL(S, 1); + <<_:O/binary, "/>", _/binary>> -> + ?ADV_COL(S, 2); %% tokenize_attributes takes care of this state: %% <<_:O/binary, C, _/binary>> -> %% find_qgt(Bin, ?INC_CHAR(S, C)); @@ -1236,5 +1240,15 @@ parse_missing_attr_name_test() -> {<<"html">>, [ { <<"=">>, <<"=">> }, { <<"black">>, <<"black">> } ], [] }, mochiweb_html:parse(D0)), ok. + +parse_broken_pi_test() -> + D0 = <<"<html><?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\" /></html>">>, + ?assertEqual( + {<<"html">>, [], [ + { pi, <<"xml:namespace">>, [ { <<"prefix">>, <<"o">> }, + { <<"ns">>, <<"urn:schemas-microsoft-com:office:office">> } ] } + ] }, + mochiweb_html:parse(D0)), + ok. -endif. From c89abd6501b916ad8c89a4253544c07c71c3d3ab Mon Sep 17 00:00:00 2001 From: Jeroen Koops Date: Wed, 20 Oct 2010 10:34:17 +0200 Subject: [PATCH 3/3] Fix for error in parsing particular incorrect singletons. When the HTML parser attempts to parse tags that should be singletons but are not, AND they have content, an exception occurs: 1> mochiweb_html:parse("<html><input><input>x</input></input></html>"). ** exception error: no case clause matching {[],[{<<"input">>,[],[<<"x">>]},{<<"input">>,[],[]}]} in function mochiweb_html:destack/2 in call from mochiweb_html:tree/2 in call from mochiweb_html:parse_tokens/1 This patch provides a fix. 
--- src/mochiweb_html.erl | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/mochiweb_html.erl b/src/mochiweb_html.erl index dda2a701..0f281db9 100644 --- a/src/mochiweb_html.erl +++ b/src/mochiweb_html.erl @@ -404,8 +404,8 @@ destack(TagName, Stack) when is_list(Stack) -> {_, []} -> %% Actually was a singleton Stack; - {Pre, [{T1, A1, []} | Post1]} -> - [{T0, A0, [{T1, A1, lists:reverse(Pre)} | Post1]} + {Pre, [{T1, A1, Acc1} | Post1]} -> + [{T0, A0, [{T1, A1, Acc1 ++ lists:reverse(Pre)} | Post1]} | Post0] end; _ -> @@ -1250,5 +1250,15 @@ parse_broken_pi_test() -> ] }, mochiweb_html:parse(D0)), ok. + +parse_funny_singletons_test() -> + D0 = <<"<html><input><input>x</input></input></html>">>, + ?assertEqual( + {<<"html">>, [], [ + { <<"input">>, [], [] }, + { <<"input">>, [], [ <<"x">> ] } + ] }, + mochiweb_html:parse(D0)), + ok. -endif.