-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhandler.py
More file actions
84 lines (75 loc) · 2.94 KB
/
Copy pathhandler.py
File metadata and controls
84 lines (75 loc) · 2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""Creates a ContentHandler for the SAX parser to handle the wiki dump"""
import io
from os import mkdir, path
from collections import defaultdict
from xml.sax.handler import ContentHandler
from processor import process_text, process_title
import config
class WikiHandler(ContentHandler):
    """SAX content handler that builds an inverted index from a wiki dump.

    Character data inside ``<title>`` and ``<text>`` elements is collected,
    tokenised through ``process_title`` / ``process_text`` and counted in
    ``self.index`` as ``index[word][doc_id][field] -> occurrences``.  The
    in-memory index is flushed to a numbered temporary file every
    ``config.MAX_DOCS_IN_MEMORY`` pages and once more at end of document.

    The class itself avoids Python-2-only syntax so it parses on both
    Python 2 and Python 3.
    """

    def __init__(self):
        """Initialise counters, text buffers and an empty index."""
        # Explicit base call (not super()) so it works whether or not the
        # base class is a new-style class.
        ContentHandler.__init__(self)
        # Pages completed so far; doubles as the id of the page being read.
        self.docs_length = 0
        # Number of temporary index files written so far.
        self.temp_files_length = 0
        # Newline-terminated titles waiting to be appended to the title file.
        self.titles = []
        # Joined content of the element that has just been closed; only
        # populated while endElement() is processing it.
        self.title_tmp = ""
        self.text_tmp = ""
        # 1 while the parser is inside the corresponding element, else 0.
        self.title_flag = 0
        self.text_flag = 0
        # characters() may be called many times per element; chunks are
        # collected here and joined once, which keeps accumulation linear.
        self._title_parts = []
        self._text_parts = []
        self.index = self._new_index()

    def _new_index(self):
        """Return an empty ``word -> doc_id -> field -> count`` mapping."""
        return defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    def dump_index(self):
        """Flush the buffered titles and the in-memory index to disk.

        Titles are appended to ``config.TITLE_FILE``.  The index is written
        to a new file named ``config.TEMP_OUT_DIR`` + running file number,
        one line per word in sorted order::

            word|doc_id|<one count per config.FIELDS>|doc_id|...

        Both buffers are emptied afterwards.
        """
        self.temp_files_length += 1
        if not path.exists(config.TEMP_OUT_DIR):
            mkdir(config.TEMP_OUT_DIR)
        # The directory name is used as a raw prefix so that file names stay
        # exactly as they were.
        # NOTE(review): this presumes TEMP_OUT_DIR ends with a path
        # separator - confirm against config.
        out_file = config.TEMP_OUT_DIR + str(self.temp_files_length)

        with io.open(config.TITLE_FILE, "a", encoding="utf-8") as title_file:
            title_file.writelines(self.titles)
        self.titles = []

        with io.open(out_file, "w", encoding="utf-8") as index_file:
            for word in sorted(self.index):
                postings = self.index[word]
                # io.open() in text mode only accepts text.  Byte-string
                # tokens are decoded as UTF-8 explicitly; an implicit ASCII
                # conversion would raise on any non-ASCII token.
                if isinstance(word, bytes):
                    parts = [word.decode("utf-8")]
                else:
                    parts = [word]
                for doc_id in sorted(postings):
                    counts = postings[doc_id]
                    parts.append(str(doc_id))
                    parts.extend(str(counts[field]) for field in config.FIELDS)
                index_file.write(u"|".join(parts) + u"\n")
        self.index = self._new_index()

    def startDocument(self):
        """Announce the start of parsing on stdout."""
        print("Starting document parsing")

    def endDocument(self):
        """Flush whatever is still buffered and announce completion."""
        self.dump_index()
        print("Finished parsing document")

    def startElement(self, name, attrs):
        """Start capturing character data for ``title`` / ``text`` elements."""
        if name == "title":
            self.title_flag = 1
        elif name == "text":
            self.text_flag = 1

    def endElement(self, name):
        """Index a finished ``title`` / ``text`` element or close a page.

        Closing a ``page`` advances the document counter and flushes the
        index whenever the counter reaches a multiple of
        ``config.MAX_DOCS_IN_MEMORY``.
        """
        if name == "page":
            self.docs_length += 1
            if self.docs_length % config.MAX_DOCS_IN_MEMORY == 0:
                self.dump_index()
        elif name == "title":
            self.title_flag = 0
            # u"" keeps the result text even for an empty title, which the
            # text-mode title file requires on Python 2.
            self.title_tmp = u"".join(self._title_parts)
            self._title_parts = []
            # %-formatting rather than str.format(): on Python 2 the latter
            # would try to ASCII-encode a unicode title.
            print("%s %s" % (self.docs_length, self.title_tmp))
            self.titles.append(self.title_tmp + "\n")
            for word in process_title(self.title_tmp.encode("utf-8")):
                self.index[word][self.docs_length]["t"] += 1
            self.title_tmp = ""
        elif name == "text":
            self.text_flag = 0
            self.text_tmp = u"".join(self._text_parts)
            self._text_parts = []
            fields = process_text(self.text_tmp.encode("utf-8"))
            for field, words in fields.items():
                for word in words:
                    self.index[word][self.docs_length][field] += 1
            self.text_tmp = ""

    def characters(self, content):
        """Buffer a chunk of character data for the element being captured."""
        if self.title_flag == 1:
            self._title_parts.append(content)
        if self.text_flag == 1:
            self._text_parts.append(content)