-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPOSTagger.py
More file actions
115 lines (110 loc) · 5.16 KB
/
POSTagger.py
File metadata and controls
115 lines (110 loc) · 5.16 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#POS Tagger
import string, math
import re
def applypostagging(keywordsentencedict):
#Grab training counts
trainset = open("POS.train.large",'r')
postoworddict = {}
vocab = []
print('training...')
for line in trainset:
line = line.split(" ")
for wordposinstance in line:
try:
wordposinstance.rindex("/")
except ValueError:
continue
word = wordposinstance[:wordposinstance.rindex("/")]
vocab.append(word)
#split | to account for multiple pos tags
pos = wordposinstance[wordposinstance.rindex("/") + 1:].split("|")
#Account for multiple POS tags
for individualpos in pos:
if individualpos not in postoworddict:
postoworddict[individualpos] = {}
elif word in postoworddict[individualpos]:
postoworddict[individualpos][word] += 1
else:
postoworddict[individualpos][word] = 1
#Unique vocab
vocab = list(set(vocab))
print('summing pos tags...')
totalpostags = 0
for key in postoworddict.keys():
totalpostags += sum(postoworddict[key].values())
print('iterating over sentences containing keywords...')
count = 1
#Iterate through each of the sentences containing a keyword
for keyword in keywordsentencedict:
#print('keyword:' + keyword + '/')
if keyword == '':
continue
#count += 1
sentenceslist = keywordsentencedict[keyword]
#print(str(len(sentenceslist)) + " sentences to parse for the keyword...")
#Iterate through each sentence
for sentence in sentenceslist:
sentencetags = []
wordslst = sentence.split(" ")
for word in wordslst:
newword = [word]
#Split punctuation if it exists
if any(c for c in word for c in string.punctuation):
#Parse punctuation
newword = ""
for c in word:
if c in string.punctuation:
newword = newword + " " + c + " "
else:
newword = newword + c
newword = newword.split(" ")
scores = []
for pos in postoworddict.keys():
conditionals = []
for token in newword:
if word not in postoworddict[pos]:
wordoccurenceinpos = 0
else:
wordoccurenceinpos = postoworddict[pos][word]
wordconditionalprob = float(wordoccurenceinpos + 1) / float(sum(postoworddict[pos].values()) + len(vocab))
#Use log space in base 10
wordconditionalprob = math.log10(wordconditionalprob)
conditionals.append(wordconditionalprob)
sumconditionals = sum(conditionals)
probabilitypos = math.log10(float(sum(postoworddict[pos].values())) / float(totalpostags))
score = sumconditionals + (probabilitypos)
#Append to scores list
scores.append([score, pos])
predictedpos = max(scores)[1]
sentencetags.append(predictedpos)
#Split just incase of multiple words
wordsinkey = keyword.split(" ")
tokensfromkeyword = []
for word in wordsinkey:
newword = [word]
#Split punctuation if it exists
if any(c for c in word for c in string.punctuation):
#Parse punctuation
newword = ""
for c in word:
if c in string.punctuation:
newword = newword + " " + c + " "
else:
newword = newword + c
newword = newword.split(" ")
tokensfromkeyword.append(newword)
#Catch instances where keyword = represent and the actual word in the sentence is represented
try:
startingindexofkeyword = wordslst.index(wordsinkey[0])
except ValueError:
startingindexofkeyword = [word for word in wordslst if wordsinkey[0] in word]
startingindexofkeyword = wordslst.index(startingindexofkeyword[0])
tokenindices = [startingindexofkeyword + i for i in range(0, len(tokensfromkeyword))]
nounadjectiveverb = False
for i in tokenindices:
if 'NN' == sentencetags[i] or 'JJ' == sentencetags[i] or 'VB' == sentencetags[i] or 'VBD' == sentencetags[i] or 'VBG' == sentencetags[i] or 'VBN' == sentencetags[i] or 'VBP' == sentencetags[i] or 'VBZ' == sentencetags[i]:
nounadjectiveverb = True
if nounadjectiveverb == False:
sentenceslist.remove(sentence)
keywordsentencedict[keyword] = len(sentenceslist)
return keywordsentencedict