blacklistsTest/wordsExtractionTest_trembl.py at master · samiscoding/blacklistsTest · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
'''
@author: Samaneh

'''
from Bio import SeqIO
import re
import json

# Input FASTA file. The commented-out assignments are alternative inputs;
# exactly one `db` assignment should be active.
#db = "../data/reference/non_red_sprot_batch_3_references.fasta"
db = "/home/samaneh/AHRD/data/db/uniprot_trembl.fasta"
#db = "../uniprot_sprot.fasta"
# SeqIO.parse returns an iterator over the records, so it can be consumed once.
seqs = SeqIO.parse(db, "fasta")

# Distinct lower-cased first words of the descriptions, in first-seen order.
firstWordList = []
# NOTE(review): testList is not referenced anywhere in the visible script.
testList = []
# Lower-cased word -> number of occurrences over all descriptions.
wordsDictCount = dict()
# Lower-cased first word -> its occurrence count taken from wordsDictCount.
firstWordDictCount = dict()
# Tokens to ignore: optional digits, optional dots, optional hyphens and
# optional digits, in that order (e.g. "12", "1.5", "3-4"). Every part is
# optional, so the empty string matches as well.
# Raw string so that "\." reaches the regex engine without relying on
# Python's handling of unknown string escapes.
pattern = r"^[0-9]*\.*-*[0-9]*$"

# Compiled once: the same pattern is tested against every word of every record.
numberLike = re.compile(pattern)
# Set mirror of firstWordList for constant-time membership tests; the list
# itself is kept so the first-seen order of the words is preserved.
seenFirstWords = set(firstWordList)

for seq in seqs:

	#a. omit the record name from the front of the description.
	# The name is removed as a prefix; str.strip(name) would treat the name as
	# a set of characters and also eat matching characters from the right end.
	name = str(seq.name)
	des = str(seq.description)
	if des.startswith(name):
		des = des[len(name):]
	des = des.strip(" ")
	#b. omit the unusable end part of the description ("OS=" and everything after it)
	pureDes = re.sub("OS=.*","",des)
	words = pureDes.split(" ")

	#first word c. extract the first word and remember it if it is new
	word_2 = words[0].lower()
	#d. omit words that consist only of numbers (and the empty string)
	if not numberLike.match(word_2) and word_2 not in seenFirstWords:
		seenFirstWords.add(word_2)
		firstWordList.append(word_2)

	#all words c. count the number of appearances of every word
	for w in words:
		w_2 = w.lower()
		#d. omit words that consist only of numbers (and the empty string)
		if not numberLike.match(w_2):
			# dict.get avoids a separate membership test on the key collection.
			wordsDictCount[w_2] = wordsDictCount.get(w_2, 0) + 1

# For each distinct first word, record how many times that word appears
# anywhere in the descriptions; words already recorded are left untouched.
for wrd in firstWordList:
	if wrd in firstWordDictCount:
		continue
	firstWordDictCount[wrd] = wordsDictCount[wrd]


#for i in range(0,lSprot):
#	if sorted_wordsDictCount[i][0]=="protein":
#			print sorted_wordsDictCount[i]


###f = open('../outputs/firstWordsDictionary_trembl.json','w')
###sorted_firstWords_sprot = [(k,v) for v,k in sorted( [ (v,k) for k,v in firstWordDictCount.items() ] ) ]
#print sorted_firstWords_sprot
###json.dump(sorted_firstWords_sprot,f)
###f.close()


# (word, count) pairs ordered by ascending count, ties broken by the word.
sorted_wordsDictCount = sorted(wordsDictCount.items(), key=lambda item: (item[1], item[0]))
#print sorted_wordsDictCount
# The with-block closes the file even if json.dump raises.
# NOTE(review): the path is relative, so it resolves against the current
# working directory; '../outputs' presumably has to exist already.
with open('../outputs/wordsDictionary_trembl_all.json','w') as f:
	json.dump(sorted_wordsDictCount,f)