Skip to content

Commit b07fcb6

Browse files
ddaspit and isaac091 authored
Properly include paragraph markers in row text (#200)
- fixes #197
---------
Co-authored-by: Isaac Schifferer <[email protected]>
1 parent 417c95f commit b07fcb6

File tree

2 files changed

+23
-31
lines changed

2 files changed

+23
-31
lines changed

machine/corpora/usfm_text_base.py

Lines changed: 4 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from .usfm_parser import UsfmParser
1515
from .usfm_parser_state import UsfmParserState
1616
from .usfm_stylesheet import UsfmStylesheet
17-
from .usfm_token import UsfmAttribute, UsfmToken, UsfmTokenType
17+
from .usfm_token import UsfmAttribute, UsfmTokenType
1818
from .usfm_tokenizer import UsfmTokenizer
1919

2020

@@ -82,10 +82,8 @@ def __init__(self, text: UsfmTextBase) -> None:
8282

8383
self._text = text
8484
self._rows: List[TextRow] = []
85-
self._next_para_tokens: List[UsfmToken] = []
8685
self._row_texts_stack: List[str] = []
8786
self._sentence_start: bool = False
88-
self._next_para_text_started = False
8987

9088
@property
9189
def rows(self) -> Iterable[TextRow]:
@@ -98,18 +96,6 @@ def start_book(self, state: UsfmParserState, marker: str, code: str) -> None:
9896
if code != self._text.id:
9997
raise ValueError(f"The \\id marker {code} does not match the text id {self._text.id}.")
10098

101-
def verse(
102-
self,
103-
state: UsfmParserState,
104-
number: str,
105-
marker: str,
106-
alt_number: Optional[str],
107-
pub_number: Optional[str],
108-
) -> None:
109-
super().verse(state, number, marker, alt_number, pub_number)
110-
self._next_para_text_started = True
111-
self._next_para_tokens.clear()
112-
11399
def start_para(
114100
self,
115101
state: UsfmParserState,
@@ -195,12 +181,6 @@ def text(self, state: UsfmParserState, text: str) -> None:
195181
if self._text._include_markers:
196182
text = text.rstrip("\r\n")
197183
if len(text) > 0:
198-
if not text.isspace():
199-
if self._current_text_type == ScriptureTextType.VERSE:
200-
for token in self._next_para_tokens:
201-
row_text += str(token) + " "
202-
self._next_para_tokens.clear()
203-
self._next_para_text_started = True
204184
if len(row_text) == 0 or row_text[-1].isspace():
205185
text = text.lstrip()
206186
row_text += text
@@ -223,9 +203,6 @@ def _start_verse_text(self, state: UsfmParserState, scripture_refs: Sequence[Scr
223203

224204
def _end_verse_text(self, state: UsfmParserState, scripture_refs: Sequence[ScriptureRef]) -> None:
225205
text = self._row_texts_stack.pop()
226-
if self._text._include_markers:
227-
for token in self._next_para_tokens:
228-
text += str(token) + " "
229206
self._rows.extend(self._text._create_scripture_rows(scripture_refs, text, self._sentence_start))
230207
self._sentence_start = (state.token and state.token.marker == "c") or has_sentence_ending(text)
231208

@@ -243,10 +220,7 @@ def _output_marker(self, state: UsfmParserState) -> None:
243220

244221
assert state.token is not None
245222

246-
if self._next_para_text_started:
247-
self._row_texts_stack[-1] += str(state.token)
248-
else:
249-
self._next_para_tokens.append(state.token)
223+
self._row_texts_stack[-1] += str(state.token)
250224

251225
def _handle_para(self, state: UsfmParserState) -> None:
252226
if len(self._row_texts_stack) == 0:
@@ -257,8 +231,7 @@ def _handle_para(self, state: UsfmParserState) -> None:
257231
for i, row_text in enumerate(self._row_texts_stack):
258232
if len(row_text) > 0 and not row_text[-1].isspace():
259233
self._row_texts_stack[i] += " "
260-
if self._current_text_type == ScriptureTextType.VERSE:
261-
self._next_para_tokens.append(state.token)
262-
self._next_para_text_started = False
234+
if self._current_text_type == ScriptureTextType.VERSE and self._text._include_markers:
235+
self._row_texts_stack[-1] += str(state.token) + " "
263236
if not state.is_verse_para:
264237
self._sentence_start = True

tests/corpora/test_usfm_memory_text.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,25 @@ def test_get_rows_paragraph_before_nonverse_paragraph() -> None:
171171
assert rows[2].text == "header"
172172

173173

174+
def test_get_rows_style_starting_nonverse_paragraph_after_empty_paragraph() -> None:
175+
rows: List[TextRow] = get_rows(
176+
r"""\id MAT - Test
177+
\c 1
178+
\p
179+
\v 1 verse 1
180+
\b
181+
\s1 \w header\w*
182+
\q1
183+
\v 2 verse 2
184+
""",
185+
include_all_text=True,
186+
include_markers=True,
187+
)
188+
assert len(rows) == 4, str.join(",", [tr.text for tr in rows])
189+
assert rows[1].text == "verse 1 \\b \\q1"
190+
assert rows[2].text == "\\w header\\w*"
191+
192+
174193
def get_rows(usfm: str, include_markers: bool = False, include_all_text: bool = False) -> List[TextRow]:
175194
text = UsfmMemoryText(
176195
UsfmStylesheet("usfm.sty"),

0 commit comments

Comments
 (0)