-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinput_formats.py
More file actions
509 lines (401 loc) · 22.2 KB
/
input_formats.py
File metadata and controls
509 lines (401 loc) · 22.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
#pylint: disable=multiple-statements
import os
import copy, re, json
from lxml import etree
import xml.etree.ElementTree as ET
from collections import OrderedDict, defaultdict
from usfm_grammar import USFMParser
import utils
import output_formats
def get_element_text(element):
    """
    Collect every piece of text under an XML element, including text nested
    inside child elements and the tail text that follows them.

    Args:
        element: an ElementTree element, or None.

    Returns:
        str: the concatenated, whitespace-normalized text, or "" when the
        element is None or holds no text.
    """
    if element is None:
        return ""
    pieces = []
    # Direct text of this element comes first.
    if element.text:
        pieces.append(element.text.strip())
    # Then, in document order, each child's full text followed by the
    # tail text sitting between that child and the next sibling.
    for sub_element in element:
        nested_text = get_element_text(sub_element)
        if nested_text:
            pieces.append(nested_text)
        if sub_element.tail:
            pieces.append(sub_element.tail.strip())
    # Collapse any runs of whitespace (newlines, repeated spaces) into
    # single spaces while joining the collected fragments.
    return ' '.join(' '.join(pieces).split())
def chop_with_regex( content, regex ):
    """
    Split *content* at every match of *regex*, whose first capture group
    must be an integer marker (e.g. a chapter or verse number).

    Returns:
        dict: marker number -> the text running from one character past the
        end of that match (skipping the separator after the number) up to
        the start of the next match, or to the end of the content for the
        last match.  Empty when the regex never matches.
    """
    matches = list( re.finditer( regex, content ) )
    # Each chunk ends where the next match begins; the final chunk runs to
    # the end of the content.
    stops = [m.start() for m in matches[1:]] + [len(content)]
    chopped = {}
    for match, stop in zip( matches, stops ):
        chopped[int(match.group(1))] = content[match.end() + 1:stop]
    return chopped
def hacked_usfm_parser( text, book_finder_id="toc3" ):
    """The reason for this is that the usfm library I am using keeps on not working,
    so this is a hacked version which will just get the job done even if there are problems.

    Chops the raw USFM into chapters, then verses, strips a fixed set of
    markers from each verse, and returns biblenlp-style output:
    {'vref': [...], 'text': [...]}.
    """
    # Identify the book code, preferring the configured marker (\toc3 by
    # default) and falling back to the \id line.
    book_match = re.search( r'\\' + book_finder_id + r' (\w+)', text )
    if book_match is None:
        book_match = re.search( r'\\id (\w+)', text )
    book_name = book_match.group(1).upper()
    # Markers stripped wholesale from every verse body.
    markers_to_strip = [ r'\\s\d+', r'\\p', r'\\q(\d+)?', r'\\m', r'\\f (.*)\\f\*', r'\\b',
                         r'\\f (.*)\\fqa', r'\\1', r'\\nb', r'\\s\d?.*[^\n]*', r'\\r' ]
    vref = []
    verse_texts = []
    # Chop into chapters, then each chapter into verses, then clean up
    # whatever misc markers remain inside each verse.
    for chapter_number, chapter_text in chop_with_regex( text, r'\\c (\d+)' ).items():
        for verse_number, verse_text in chop_with_regex( chapter_text, r'\\v (\d+)' ).items():
            for marker in markers_to_strip:
                verse_text = re.sub( marker, '', verse_text )
            if '\\' in verse_text:
                # A leftover backslash means a marker we don't strip yet.
                print( f"Found \\ in {verse_text}" )
            vref.append( f"{book_name} {chapter_number}:{verse_number}" )
            verse_texts.append( verse_text )
    return {'vref':vref, 'text':verse_texts }
def sort_verses( verses, reference_key ):
    """
    Return *verses* sorted canonically: by book (in output_formats.USFM_NAME
    order), then chapter, then verse.  A verse range like "4-6" sorts by its
    first verse number.
    """
    book_names = list( output_formats.USFM_NAME.keys() )

    def sort_key( verse_obj ):
        book, chapter, verse = utils.split_ref( utils.look_up_key( verse_obj, reference_key ))
        # Position of the book in the canonical ordering; substring match
        # mirrors how the book codes are keyed.
        book_index = next(
            ( i for i, name in enumerate( book_names ) if book.upper() in name.upper() ),
            -1 )
        assert book_index != -1, f"Didn't find the book \"{book}\" in known book names"
        if isinstance(verse, str) and '-' in verse:
            # Ranges sort by their starting verse.
            verse = int(verse.split('-')[0])
        return (book_index, chapter, verse)

    return sorted( verses, key=sort_key )
def load_format( settings, reference_key, translation_key, source_key = None ):
    """
    Load scripture verses from the files described by *settings* and return
    them as a list of verse dicts.

    Args:
        settings (dict): must contain 'format' — one of 'USX', 'hacked_usfm',
            'codex', 'xliff', 'usfm', 'biblenlp' or 'sblgnt_txt' — plus the
            format-specific location keys ('folder', or 'vref'/'source' for
            biblenlp).  Optional keys: 'sort' (default True where supported),
            'ignore_errors' (usfm-grammar parsers), 'book_finder_id'
            (hacked_usfm).
        reference_key: key path under which each verse's reference is stored.
        translation_key: key path under which each verse's text is stored.
        source_key: xliff only — when given, <source> text is stored under
            this key and <target> under translation_key; when None the target
            (or, failing that, the source) text goes to translation_key.

    Returns:
        list of verse dicts, sorted canonically unless settings['sort'] is
        falsy (codex and biblenlp are never sorted here).

    Raises:
        AssertionError: unrecognized format, unsupported codex cell type,
            unknown book name (sblgnt_txt), or missing vref (biblenlp).
        Exception: malformed xliff trans-units.
    """
    def _append_verses( dict_output, result ):
        #Shared conversion of biblenlp-style parser output
        #({'vref': [...], 'text': [...]}) into verse dicts.
        for vref, text in zip( dict_output['vref'], dict_output['text'] ):
            new_verse = {}
            utils.set_key(new_verse, reference_key, vref)
            utils.set_key(new_verse, translation_key, text)
            result.append(new_verse)

    if settings['format'] == 'USX':
        result = []
        import_folder = settings['folder']
        #iterate through the xml files that have usx extensions:
        for filename in os.listdir(import_folder):
            if filename.lower().endswith('.usx'):
                print(f"Loading {filename}")
                usx_file = os.path.join(import_folder, filename)
                # Load the XML file
                #https://pypi.org/project/usfm-grammar/#:~:text=USX%20TO%20USFM%2C%20USJ%20OR%20TABLE
                with open( usx_file, 'r', encoding='utf-8' ) as f:
                    usx_str = f.read()
                usx_obj = etree.fromstring(usx_str)
                my_parser = USFMParser(from_usx=usx_obj)
                dict_output = my_parser.to_biblenlp_format( ignore_errors=settings.get('ignore_errors', True) )
                _append_verses( dict_output, result )
        if settings.get( "sort", True ): result = sort_verses( result, reference_key )
    elif settings['format'] == 'hacked_usfm':
        result = []
        import_folder = settings['folder']
        #iterate through the usfm files:
        for filename in os.listdir(import_folder):
            if filename.lower().endswith('.usfm') or filename.lower().endswith('.sfm'):
                print(f"Loading {filename}")
                full_filename = os.path.join(import_folder, filename)
                with open( full_filename, 'r', encoding='utf-8' ) as f:
                    usfm_string = f.read()
                dict_output = hacked_usfm_parser( usfm_string, settings.get('book_finder_id','toc3') )
                _append_verses( dict_output, result )
        if settings.get( "sort", True ): result = sort_verses( result, reference_key )
    elif settings['format'] == 'codex':
        result = []
        import_folder = settings['folder']
        #iterate through the codex files:
        for filename in os.listdir(import_folder):
            if filename.lower().endswith('.codex') or filename.lower().endswith('.source'):
                print( f"Loading {filename}" )
                full_filename = os.path.join(import_folder, filename)
                with open( full_filename, 'r', encoding='utf-8' ) as f:
                    codex_structure = json.load(f)
                for codex_cell in codex_structure.get('cells', []):
                    text = codex_cell.get('value', '' )
                    vref = codex_cell.get('metadata', {} ).get('id', '' )
                    cell_type = codex_cell.get('metadata', {} ).get('type', '' )
                    assert cell_type == 'text', f'Unsupported type: {cell_type}'
                    new_verse = {}
                    utils.set_key(new_verse, reference_key, vref)
                    utils.set_key(new_verse, translation_key, text)
                    result.append(new_verse)
    elif settings['format'] == 'xliff':
        result = []
        import_folder = settings['folder']
        #iterate through the xliff files:
        for filename in os.listdir(import_folder):
            if filename.lower().endswith('.xliff'):
                print(f"Loading {filename}")
                full_filename = os.path.join(import_folder, filename)
                #parse as xml
                #https://docs.python.org/3/library/xml.etree.elementtree.html
                tree = ET.parse(full_filename)
                root = tree.getroot()
                #walk every element looking for trans-unit tags; the tag may
                #be namespace-prefixed, e.g.
                #{urn:oasis:names:tc:xliff:document:1.2}trans-unit
                for elem in root.iter():
                    if elem.tag == 'trans-unit' or elem.tag.endswith('}trans-unit'):
                        # Extract the namespace prefix (if any) so child
                        # lookups work; an empty prefix degrades to a plain
                        # tag search.
                        namespace = ''
                        if elem.tag.endswith('}trans-unit'):
                            namespace = elem.tag[:elem.tag.find('}')+1]
                        new_verse = {}
                        reference = elem.attrib['id']
                        utils.set_key(new_verse, reference_key, reference)
                        target_elem = elem.find(f'{namespace}target')
                        source_elem = elem.find(f'{namespace}source')
                        if source_key is None:
                            # Only one text stream wanted: prefer the target
                            # translation, fall back to the source.
                            target_text = get_element_text(target_elem)
                            source_text = get_element_text(source_elem)
                            if target_text:
                                utils.set_key( new_verse, translation_key, target_text )
                            elif source_text:
                                utils.set_key( new_verse, translation_key, source_text )
                            else:
                                raise Exception("Unable to find either target or source element with text in trans-unit")
                        else:
                            # Both streams wanted: each must exist and carry text.
                            if source_elem is None:
                                raise Exception("Unable to find source element in trans-unit")
                            source_text = get_element_text(source_elem)
                            if not source_text:
                                raise Exception("Source element has no text content")
                            utils.set_key( new_verse, source_key, source_text )
                            if target_elem is None:
                                raise Exception("Unable to find target element in trans-unit")
                            target_text = get_element_text(target_elem)
                            if not target_text:
                                raise Exception("Target element has no text content")
                            utils.set_key( new_verse, translation_key, target_text )
                        result.append(new_verse)
        if settings.get( "sort", True ):
            result = sort_verses( result, reference_key )
    elif settings['format'] == 'usfm':
        result = []
        import_folder = settings['folder']
        #iterate through the usfm files:
        for filename in os.listdir(import_folder):
            if filename.lower().endswith('.usfm') or filename.lower().endswith('.sfm'):
                print(f"Loading {filename}")
                full_filename = os.path.join(import_folder, filename)
                #https://pypi.org/project/usfm-grammar/#:~:text=USX%20TO%20USFM%2C%20USJ%20OR%20TABLE
                with open( full_filename, 'r', encoding='utf-8' ) as f:
                    usfm_string = f.read()
                my_parser = USFMParser(usfm_string)
                dict_output = my_parser.to_biblenlp_format( ignore_errors=settings.get('ignore_errors', True) )
                for vref,text in zip( dict_output['vref'], dict_output['text'] ):
                    try:
                        utils.split_ref( vref )
                        new_verse = {}
                        utils.set_key(new_verse, reference_key, vref)
                        utils.set_key(new_verse, translation_key, text)
                        result.append(new_verse)
                    except ValueError:
                        #don't include "verses" without parsable
                        #references.
                        print( f"Unparsable reference in {filename}:\n {vref}: {text}" )
        if settings.get( "sort", True ): result = sort_verses( result, reference_key )
    elif settings['format'] == 'biblenlp':
        vref_file = settings['vref']
        source_file = settings['source']
        vrefs = utils.load_file_to_list( vref_file )
        source = utils.load_file_to_list( source_file )
        result = []
        for vref, source_verse in zip( vrefs, source ):
            if source_verse:
                assert vref, "missing vref for verse"
                new_verse = {}
                utils.set_key( new_verse, reference_key, vref )
                utils.set_key( new_verse, translation_key, source_verse )
                result.append(new_verse)
        #biblenlp files are positional against the vref list, so no sort here.
    elif settings['format'] == 'sblgnt_txt':
        #build a map from every known book name to the standardized 3 letter code.
        ref_reverse_hash = {}
        for key, value in output_formats.USFM_NAME.items():
            if len( key ) == 3:
                ref_reverse_hash[value] = key
        normalization_hash = {}
        for key, value in output_formats.USFM_NAME.items():
            normalization_hash[ key ] = ref_reverse_hash[ value ]
        import_folder = settings['folder']
        result = []
        #iterate through the txt files in the sblgnt or sblgnt like folder
        for filename in os.listdir(import_folder):
            if filename.lower().endswith('.txt'):
                with open( os.path.join(import_folder, filename), 'r', encoding='utf-8' ) as f:
                    for line in f:
                        if '\t' in line:
                            vref, text = line.split('\t')
                            book, chapter, start_verse, end_verse = utils.split_ref2( vref )
                            # Normalize the book name to its 3-letter code,
                            # trying the name as written and then uppercased.
                            if book in normalization_hash:
                                book = normalization_hash[ book ]
                            elif book.upper() in normalization_hash:
                                book = normalization_hash[ book.upper() ]
                            else:
                                assert False, f"Unknown book name {book} in {filename}"
                            if start_verse == end_verse:
                                vref = f"{book} {chapter}:{start_verse}"
                            else:
                                vref = f"{book} {chapter}:{start_verse}-{end_verse}"
                            new_verse = {}
                            utils.set_key(new_verse, reference_key, vref.strip().upper())
                            utils.set_key(new_verse, translation_key, text.strip())
                            result.append(new_verse)
        if settings.get( "sort", True ): result = sort_verses( result, reference_key )
    else:
        assert False, f"Unrecognized format {settings['format']}"
    return result
def merge_source_and_target( settings, source, target, reference_key, source_key, translation_key ):
    """
    Merge the source-side and target-side verse lists into one list of verse
    dicts.  Verses whose references overlap through ranges (e.g. "GEN 1:4-6"
    on one side and "GEN 1:5" on the other) are grouped with a union-find
    structure; texts landing in the same group are concatenated with newlines
    and the group's reference is widened to cover both sides.

    Args:
        settings: merge options — 'missing_white_list' (books or full refs
            allowed to lack one side) and 'missing_level' ('error' asserts on
            a missing side, anything else just prints a warning).
        source, target: lists of verse dicts as produced by load_format.
        reference_key: key path of the verse reference in each dict.
        source_key: key path for source text in the merged output.
        translation_key: key path for target text in the merged output.

    Returns:
        Canonically sorted list of merged verse dicts.
    """
    #so to make it so that both the source and the target can be ranges,
    #what I will do is do the group thing where there
    #is a leader which is in change of each group,
    #and then we will go through each verse injecting
    #their contents up into the leader and then
    #we collect all the leaders and spit out the content.
    class VerseCluster:
        #Union-find (disjoint-set) node with path compression.
        parent_ptr = None  # class-level default; set on an instance when it is unioned under another root
        def connect( self, other ):
            #Union: attach this node's root under the other node's root.
            self_parent = self.get_parent()
            other_parent = other.get_parent()
            if self_parent != other_parent:
                self_parent.parent_ptr = other_parent
        def get_parent( self ):
            #Find with path compression: return the root of this cluster.
            if self.parent_ptr is None:
                return self
            self.parent_ptr = self.parent_ptr.get_parent()
            return self.parent_ptr
    verse_to_cluster = defaultdict( VerseCluster )
    #First pass: union together every verse covered by a range reference,
    #so each (book, chapter, verse) key maps into one shared cluster.
    for v in source + target:
        vref = utils.look_up_key( v, reference_key )
        book, chapter, verse = utils.split_ref( vref )
        if isinstance( verse, str ) and '-' in verse:
            start_verse, end_verse = [int(x) for x in verse.split('-')]
            cluster = verse_to_cluster[(book,chapter,start_verse)].get_parent()
            for verse_count in range(start_verse+1, end_verse+1):
                cluster.connect(verse_to_cluster[(book,chapter,verse_count)])
        else:
            #just touch it in the defaultdict.
            verse_to_cluster[(book,chapter,verse)]
    hashed_results = OrderedDict()
    #Second pass: fold every verse (all of source first, then all of target)
    #into its cluster's merged dict, widening the stored reference range and
    #appending text with newline separators.
    for side, side_key in [[source,source_key], [target,translation_key]]:
        for v in side:
            vref = utils.look_up_key( v, reference_key )
            book, chapter, start_verse, end_verse = utils.split_ref2( vref )
            cluster = verse_to_cluster[(book,chapter,start_verse)].get_parent()
            if cluster not in hashed_results:
                hashed_results[cluster] = {}
            merged_verse = hashed_results[cluster]
            #Widen the merged range to cover both the existing entry and this
            #verse; on first use the existing ref defaults to this vref.
            _, _, existing_start, existing_end = utils.split_ref2( utils.look_up_key( merged_verse, reference_key, default=vref))
            start_verse = min( start_verse, existing_start )
            end_verse = max( end_verse, existing_end )
            if start_verse != end_verse:
                utils.set_key( merged_verse, reference_key, f"{book} {chapter}:{start_verse}-{end_verse}")
            else:
                utils.set_key( merged_verse, reference_key, f"{book} {chapter}:{start_verse}" )
            #Append this verse's text to whatever the cluster already holds.
            text = utils.look_up_key( merged_verse, side_key, "" )
            if text: text += "\n"
            text += utils.look_up_key( v, side_key )
            utils.set_key( merged_verse, side_key, text )
    result = list(hashed_results.values())
    result = sort_verses( result, reference_key )
    #now see if there are not any white listed verses which have a
    # target and not a source or the other way around.
    for verse_obj in result:
        vref = utils.look_up_key( verse_obj, reference_key )
        source_text = utils.look_up_key( verse_obj, source_key )
        target_text = utils.look_up_key( verse_obj, translation_key )
        book, chapter, _ = utils.split_ref( vref )
        missing_white_list = settings.get( 'missing_white_list', [] )
        missing_level = settings.get( 'missing_level', 'error' )
        if book not in missing_white_list and vref not in missing_white_list:
            if target_text:
                if missing_level == 'error':
                    assert source_text, f"For {vref} missing source text"
                else:
                    if not source_text:
                        print( f"For {vref} missing source text" )
            if source_text:
                if missing_level == 'error':
                    assert target_text, f"For {vref} missing target text"
                else:
                    if not target_text:
                        print( f"For {vref} missing target text" )
    return result
def main():
    """Drive the import: for every active config in input_formats.yaml, load
    the source/target data, merge the two sides where needed, normalize verse
    ranges, and write the result as a jsonl file under ./output."""
    configs = utils.load_yaml_configuration( 'input_formats.yaml' )['configs']
    for name, config in configs.items():
        if not config.get('active', True):
            continue
        reference_key = config.get('reference_key', ['vref'])
        translation_key = config.get('translation_key', ['fresh_translation','text'] )
        source_key = config.get('source_key', ['source'])
        if 'input_source_target' in config:
            combined_settings = config['input_source_target']
            if combined_settings.get('format', '' ) == 'codex':
                #A codex project keeps source and target in fixed
                #sub-folders, so load each side separately and merge.
                source_settings = combined_settings.copy()
                source_settings['folder'] = os.path.join( combined_settings['folder'], '.project/sourceTexts' )
                loaded_source = load_format( source_settings, reference_key, source_key )
                target_settings = combined_settings.copy()
                target_settings['folder'] = os.path.join( combined_settings['folder'], 'files/target' )
                loaded_target = load_format( target_settings, reference_key, translation_key )
                combined = merge_source_and_target( config.get('merge',{}), loaded_source, loaded_target, reference_key, source_key, translation_key )
            else:
                #A single location holds both sides (e.g. xliff).
                combined = load_format( combined_settings, reference_key, translation_key, source_key = source_key )
        else:
            #Separate configs for each side; load then merge.
            loaded_target = load_format( config['input_target'], reference_key, translation_key )
            loaded_source = load_format( config['input_source'], reference_key, source_key )
            combined = merge_source_and_target( config.get('merge',{}), loaded_source, loaded_target, reference_key, source_key, translation_key )
        combined = utils.normalize_ranges( combined, reference_key, translation_key, source_key )
        #now save it out.
        utils.save_jsonl( os.path.join( "output", f"{name}.jsonl" ), combined )
        print( "loaded")
#Script entry point: run the conversion, then report completion.
#NOTE(review): original indentation was lost in this copy; the final print
#is assumed to sit inside the guard — confirm against the repository.
if __name__ == '__main__':
    main()
    print( "Done" )