|
33 | 33 |
|
34 | 34 |
|
class KNBCHTMLParser(HTMLParser):
  """Parses the HTML files in the KNBC corpus to collect chunks.

  The parser walks the table rows (`<tr>`) of a document. A row containing a
  `<td>` whose `id` is one of the active split ids starts a new chunk; a row
  with exactly five `<td>` cells appends the text of its first cell to the
  current chunk. The first two rows are always ignored.

  Attributes:
    chunks: The collected chunks.
    row: The current row index (1-based once the first `<tr>` is seen).
    col: The current column index (1-based within the current row).
    current_word: The current word to process.
    on_split_row: Whether the scan is on the splitting row.
    split_tab: Whether to split by tags in addition to Bunsetsu.
  """

  BUNSETSU_SPLIT_ID = 'bnst-kugiri'
  TAG_SPLIT_ID = 'tag-kugiri'

  def __init__(self, split_tab: bool = False) -> None:
    """Initializes the HTML parser for the KNBC corpus.

    Args:
      split_tab: Split by tags in addition to Bunsetsu. (default: False)
    """
    super().__init__()
    self.chunks: typing.List[str] = ['']
    self.row = 0
    self.col = 0
    self.current_word = ''
    self.on_split_row = False
    self.split_tab = split_tab

  def handle_starttag(
      self, tag: str,
      attributes: typing.List[typing.Tuple[str, typing.Optional[str]]]
  ) -> None:
    """Tracks the row/column position and detects splitting rows.

    Args:
      tag: The name of the opened tag.
      attributes: The (name, value) pairs of the tag's attributes.
    """
    if tag == 'tr':
      # A new row starts: advance the row counter and reset per-row state.
      self.row += 1
      self.col = 0
      self.current_word = ''
      self.on_split_row = False

    if tag == 'td':
      self.col += 1
      # Bunsetsu boundaries always split; tag boundaries only on request.
      split_ids = {self.BUNSETSU_SPLIT_ID}
      if self.split_tab:
        split_ids.add(self.TAG_SPLIT_ID)
      if any(name == 'id' and value in split_ids
             for name, value in attributes):
        self.on_split_row = True

  def handle_endtag(self, tag: str) -> None:
    """Finalizes a row: opens a new chunk or extends the current one.

    Args:
      tag: The name of the closed tag.
    """
    if tag != 'tr':  # Skip all tags but TR.
      return
    if self.row < 3:  # Skip the first two rows.
      return
    if self.on_split_row:
      # A splitting row closes the current chunk and opens an empty one.
      self.chunks.append('')
      return
    if self.col == 5:
      # Only rows with exactly five cells carry a word to collect.
      self.chunks[-1] += self.current_word

  def handle_data(self, data: str) -> None:
    """Records the text seen while the column index is 1.

    Args:
      data: The text content reported by the parser.
    """
    # NOTE(review): this overwrites rather than accumulates, and `col` stays 1
    # until the next `<td>` opens, so the last text seen before the second
    # cell wins. Presumably the corpus has exactly one text node there;
    # confirm against the data.
    if self.col == 1:
      self.current_word = data
67 | 93 |
|
68 | 94 |
|
|
0 commit comments