-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsplit.py
More file actions
111 lines (90 loc) · 4.04 KB
/
split.py
File metadata and controls
111 lines (90 loc) · 4.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import re
import math
def read_file(file_path):
    """
    Loads a UTF-8 encoded text file into memory as a list of lines.
    :param file_path: Path to the input file.
    :return: List of lines exactly as produced by ``readlines()`` (line endings are kept).
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        return source.readlines()
def filter_lines(array_of_lines, skip_tags=None):
    """
    Generates array without empty fields and specified tags.

    A line is dropped when, after stripping surrounding whitespace, it begins
    with an empty element (``<tag></tag>``, whitespace-only content allowed)
    or with a complete ``<tag>...</tag>`` element whose name is listed in
    ``skip_tags``. Every line that is kept has all ``<...>`` markup removed,
    so only the text between the tags (and the original line ending) remains.

    :param array_of_lines: List of lines to filter.
    :param skip_tags: List of tag names to skip. Names are matched literally;
        ``None`` or an empty list disables tag skipping.
    :return: List of the kept lines with markup stripped.
    """
    # Compile the constant patterns once instead of once per line.
    empty_field = re.compile(r"<(\w+)>\s*</\1>")
    markup = re.compile(r"<.*?>")

    skip_pattern = None
    if skip_tags:
        # Escape each name so characters such as "." or "+" inside a tag name
        # are matched literally rather than interpreted as regex syntax.
        alternatives = "|".join(re.escape(tag) for tag in skip_tags)
        skip_pattern = re.compile(r"<\s*({})\s*>.*?</\s*\1\s*>".format(alternatives))

    filtered_lines = []
    for line in array_of_lines:
        stripped = line.strip()
        if empty_field.match(stripped):
            continue  # Skip empty fields
        if skip_pattern is not None and skip_pattern.match(stripped):
            continue  # Skip lines matching the skip pattern
        # The tags are removed from the original (unstripped) line so that
        # indentation and the trailing newline are preserved.
        filtered_lines.append(markup.sub("", line))
    return filtered_lines
def write_out(filtered_lines, output_dir, output_filename, max_lines=250000, total_parts=1):
    """
    Writes out the filtered lines to a file or splits them into parts.

    With ``total_parts == 1`` every line is written to
    ``<output_filename>.xml`` (the file is created even when there are no
    lines). Otherwise the lines are written in consecutive chunks of at most
    ``max_lines`` lines to ``<output_filename>_part_<n>.xml`` (n starts at 1);
    each part begins with a ``<!-- Part n of total_parts -->`` comment line
    when ``total_parts`` is greater than 1.

    :param filtered_lines: List of filtered lines.
    :param output_dir: Directory to store the output files (created if missing).
    :param output_filename: Base name of the output file(s), without extension.
    :param max_lines: Maximum number of lines per split file.
    :param total_parts: Total number of parts (set to 1 for a single file).
    """
    os.makedirs(output_dir, exist_ok=True)

    if total_parts == 1:
        # Single-file mode must not chunk: writing chunk after chunk to the
        # same path in 'w' mode would leave only the last chunk on disk.
        single_path = os.path.join(output_dir, f"{output_filename}.xml")
        with open(single_path, 'w', encoding='utf-8') as out_file:
            out_file.writelines(filtered_lines)
        print(f"XML file successfully written to {single_path}.")
        return

    # A non-positive max_lines degrades to one line per part.
    chunk_size = max(max_lines, 1)
    parts_written = 0
    for part_number, start in enumerate(range(0, len(filtered_lines), chunk_size), start=1):
        part_path = os.path.join(output_dir, f"{output_filename}_part_{part_number}.xml")
        with open(part_path, 'w', encoding='utf-8') as part_file:
            if total_parts > 1:
                part_file.write(f"<!-- Part {part_number} of {total_parts} -->\n")
            part_file.writelines(filtered_lines[start:start + chunk_size])
        parts_written = part_number
    print(f"XML file successfully split into {parts_written} parts.")
def process_xml(file_path, output_dir, output_filename, max_lines=250000, skip_tags=None, split=False):
    """
    Reads an XML file, filters its lines with ``filter_lines`` and hands the
    result to ``write_out``, either as one file or as several parts.
    :param file_path: Path to the input XML file.
    :param output_dir: Directory to store the output file(s); created if it does not exist.
    :param output_filename: Base name of the output file(s).
    :param max_lines: Maximum number of lines per split file.
    :param skip_tags: List of tags to skip.
    :param split: Whether to split the file into multiple parts.
    """
    # Ensure the destination directory is present before reading the input.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    kept_lines = filter_lines(read_file(file_path), skip_tags)

    # The part count is derived from the number of lines left after filtering.
    # It is computed unconditionally and only replaced by 1 afterwards when
    # splitting is turned off.
    part_count = math.ceil(len(kept_lines) / max_lines)
    if not split:
        part_count = 1

    write_out(kept_lines, output_dir, output_filename, max_lines, part_count)
# Tag names passed to process_xml below as the elements to skip.
skip_tags = [
    "image",
    "id",
    "created",
    "updated",
    "hidden",
    "position",
    "column",
    "map_id",
    "enable_proxy",
]

# Disabled call for a WordPress export, kept for reference
# (split_large_xml is not defined in the visible part of this file):
# split_large_xml('../wordpress_unc_oct_2024.xml', './xml/wordpress', 'wordpress_export')

# Filter the LibGuides export and write the result split into parts.
process_xml(
    file_path='../unprocessed_files/libguides-241008.xml',
    output_dir='./xml/libguides',
    output_filename='libguides_export',
    skip_tags=skip_tags,
    split=True,
)