Skip to content

Commit 50178f6

Browse files
authored
Improve KNBC HTML Parser (#137)
1 parent 181e85e commit 50178f6

File tree

2 files changed

+73
-17
lines changed

2 files changed

+73
-17
lines changed

scripts/prepare_knbc.py

Lines changed: 43 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -33,36 +33,62 @@
3333

3434

3535
class KNBCHTMLParser(HTMLParser):
36-
"""Parses the HTML files in the KNBC corpus and outputs the chunks."""
36+
"""Parses the HTML files in the KNBC corpus to collect chunks.
37+
38+
Attributes:
39+
chunks: The collected chunks.
40+
row: The current row index.
41+
col: The current column index.
42+
current_word: The current word to process.
43+
on_split_row: Whether the scan is on the splitting row.
44+
split_tab: Whether to split by tags in addition to Bunsetsu.
45+
"""
46+
47+
BUNSETSU_SPLIT_ID = 'bnst-kugiri'
48+
TAG_SPLIT_ID = 'tag-kugiri'
49+
50+
def __init__(self, split_tab: bool = False) -> None:
51+
"""Initializes the HTML parser for the KNBC corpus.
3752
38-
def __init__(self, split_tab: bool = True) -> None:
53+
Args:
54+
split_tab: Split by tags in addition to Bunsetsu. (default: False)
55+
"""
3956
super().__init__()
4057
self.chunks = ['']
41-
self.n_rows = 0
42-
self.n_cols = 0
43-
self.current_word: typing.Optional[str] = None
58+
self.row = 0
59+
self.col = 0
60+
self.current_word = ''
61+
self.on_split_row = False
4462
self.split_tab = split_tab
4563

46-
def handle_starttag(self, tag: str, _: typing.Any) -> None:
64+
def handle_starttag(
65+
self, tag: str,
66+
attributes: typing.List[typing.Tuple[str, typing.Optional[str]]]) -> None:
4767
if tag == 'tr':
48-
self.n_rows += 1
49-
self.n_cols = 0
50-
self.current_word = None
68+
self.row += 1
69+
self.col = 0
70+
self.current_word = ''
71+
self.on_split_row = False
72+
5173
if tag == 'td':
52-
self.n_cols += 1
74+
self.col += 1
75+
for name, value in attributes:
76+
if (name == 'id' and value == self.BUNSETSU_SPLIT_ID) or (
77+
self.split_tab and name == 'id' and value == self.TAG_SPLIT_ID):
78+
self.on_split_row = True
5379

5480
def handle_endtag(self, tag: str) -> None:
55-
if tag != 'tr':
81+
if tag != 'tr': # Skip all tags but TR.
82+
return None
83+
if self.row < 3: # Skip the first two rows.
5684
return None
57-
flag1 = self.n_rows > 2 and self.n_cols == 1
58-
flag2 = self.split_tab or self.current_word == '文節区切り'
59-
if flag1 and flag2:
60-
self.chunks.append('')
61-
if self.n_cols == 5 and type(self.current_word) is str:
85+
if self.on_split_row:
86+
return self.chunks.append('')
87+
if self.col == 5:
6288
self.chunks[-1] += self.current_word
6389

6490
def handle_data(self, data: str) -> None:
65-
if self.n_cols == 1:
91+
if self.col == 1:
6692
self.current_word = data
6793

6894

scripts/tests/test_prepare_knbc.py

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,3 +40,33 @@ def test_multiple_hit(self) -> None:
4040
chunks = ['abcabc', 'def']
4141
result = prepare_knbc.break_before_sequence(chunks, 'bc')
4242
self.assertListEqual(result, ['a', 'bca', 'bc', 'def'])
43+
44+
45+
class TestKNBCHTMLParser(unittest.TestCase):
46+
example_html = '''
47+
<html>
48+
<body>
49+
<table>
50+
<tr><th>HA</th><th>HB</th><th>HC</th><th>HD</th><th>HE</th></tr>
51+
<tr><td colspan="5" id="bnst-kugiri"><a>文節区切り</a></td></tr>
52+
<tr><td>abc</td><td></td><td></td><td></td><td></td></tr>
53+
<tr><td>de</td><td></td><td></td><td></td><td></td></tr>
54+
<tr><td colspan="5" id="tag-kugiri"><a>タグ区切り</a></td></tr>
55+
<tr><td>fgh</td><td></td><td></td><td></td><td> </td></tr>
56+
<tr><td>ijkl</td><td></td><td></td><td></td><td> </td></tr>
57+
<tr><td colspan="5" id="bnst-kugiri"><a>文節区切り</a></td></tr>
58+
<tr><td>mn</td><td></td><td></td><td></td><td> </td></tr>
59+
</table>
60+
</body>
61+
</html>
62+
'''
63+
64+
def test_parse(self) -> None:
65+
parser = prepare_knbc.KNBCHTMLParser(False)
66+
parser.feed(self.example_html)
67+
self.assertListEqual(parser.chunks, ['abcdefghijkl', 'mn'])
68+
69+
def test_parse_split_tags(self) -> None:
70+
parser = prepare_knbc.KNBCHTMLParser(True)
71+
parser.feed(self.example_html)
72+
self.assertListEqual(parser.chunks, ['abcde', 'fghijkl', 'mn'])

0 commit comments

Comments
 (0)