-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathget_data.py
More file actions
51 lines (42 loc) · 1.21 KB
/
get_data.py
File metadata and controls
51 lines (42 loc) · 1.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import glob
import pickle
def get(num_files=500):
    """Load sentences from up to ``num_files`` pickle files under ``AA/``.

    Every file matching ``AA/*.pkl`` (relative to the current working
    directory) is a candidate. Each selected file is unpickled and iterated
    as a collection of sentences; the last element of every sentence is
    dropped before the sentence is added to the result.

    Args:
        num_files: Upper bound on the number of files to read. It is used as
            a slice bound, so ``None`` reads every matching file.

    Returns:
        A single flat list holding the trimmed sentences of all selected
        files, in file order.
    """
    # glob returns matches in arbitrary, platform-dependent order; sorting
    # makes the selected subset identical on every run and every machine.
    files = sorted(glob.glob('AA/*.pkl'))
    print("Total number of files",len(files)) # original author's note: 17150
    files = files[:num_files] # original author's note: 500 = ~3%
    print("Training number of files", len(files))

    X = []
    for file in files:
        # NOTE: pickle.load can execute arbitrary code embedded in the file;
        # only point this at data from a trusted source.
        with open(file, "rb") as f:
            sentences = pickle.load(f)
        # Drop the trailing element of each sentence before collecting it.
        X.extend(sentence[:-1] for sentence in sentences)
    return X
def find_a_correct_paragraph(test_sents,y_pred, num=4):
    """Collect text of ``num`` consecutive sentences that were all predicted correctly.

    A sentence counts as correct when, for every (word, predicted tag) pair
    produced by zipping the sentence with its prediction, ``word[1]`` equals
    the predicted tag. Correct sentences are accumulated into a run; any
    incorrect sentence discards the run and is itself excluded from the next
    one. When a run reaches exactly ``num`` sentences and contains more than
    ``num * 3`` words whose tag is not ``'O'``, the words' ``word[0]`` values
    are joined with single spaces and recorded.

    The run is not reset after it reaches ``num`` sentences, so each unbroken
    streak of correct sentences yields at most one paragraph (its first
    ``num`` sentences).

    Args:
        test_sents: Sequence of sentences; each word is indexable with the
            token at index 0 and the gold tag at index 1.
        y_pred: Sequence of predicted tag sequences, aligned with
            ``test_sents``.
        num: Number of consecutive correct sentences forming a paragraph.

    Returns:
        List of paragraph strings, in the order they were found.
    """
    paragraphs = []
    run = []
    for sentence, predicted in zip(test_sents, y_pred):
        if any(word[1] != tag for word, tag in zip(sentence, predicted)):
            # A mispredicted sentence breaks the streak and must not be
            # carried into the next candidate paragraph.
            run = []
            continue

        run.append(sentence)
        if len(run) != num:
            continue

        # Only keep paragraphs that are dense enough in non-'O' tags.
        tagged_words = sum(1 for sent in run for word in sent if word[1] != 'O')
        if tagged_words > num * 3:
            paragraphs.append(' '.join(word[0] for sent in run for word in sent))
    return paragraphs
def get_output_level(paragraph, level=3):
    """Return a copy of ``paragraph`` with every tag cut to ``level`` parts.

    Each word's tag (``word[1]``) is split on ``"-"``; the first ``level``
    pieces are re-joined with ``"-"``. The token (``word[0]``) is kept as is.

    Args:
        paragraph: Iterable of sentences, each an iterable of words indexable
            with the token at index 0 and the tag string at index 1.
        level: How many leading dash-separated tag components to keep
            (the original author's note gives the range as 1..n).

    Returns:
        A new list of sentences, each a list of ``[token, truncated_tag]``
        two-element lists.
    """
    def _truncate(label):
        # Keep only the leading `level` dash-separated components.
        pieces = label.split("-")
        return '-'.join(pieces[:level])

    return [
        [[token[0], _truncate(token[1])] for token in sent]
        for sent in paragraph
    ]