-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwordsExtractionTest_trembl.py
More file actions
77 lines (55 loc) · 2.02 KB
/
wordsExtractionTest_trembl.py
File metadata and controls
77 lines (55 loc) · 2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
'''
@author: Samaneh
'''
from Bio import SeqIO
import re
import json
#db = "../data/reference/non_red_sprot_batch_3_references.fasta"
db = "/home/samaneh/AHRD/data/db/uniprot_trembl.fasta"
#db = "../uniprot_sprot.fasta"

# Destination of the JSON word-frequency table written by main().
OUTPUT_PATH = '../outputs/wordsDictionary_trembl_all.json'

# Matches tokens that carry no text: digits, optionally followed by dots,
# dashes and more digits ("12", "3.5", "1-2").  Every part is optional, so the
# empty string matches too; that is what discards the empty tokens produced by
# splitting on single spaces.
NUMERIC_TOKEN = re.compile(r"^[0-9]*\.*-*[0-9]*$")

# Everything from the first "OS=" to the end of the line is dropped from a
# description (the original script calls it the "unusable end part").
ORGANISM_SUFFIX = re.compile("OS=.*")


def is_numeric_token(token):
    """Return True if *token* is empty or consists only of a number-like run.

    Such tokens are excluded from both the first-word list and the word
    counts.
    """
    return NUMERIC_TOKEN.match(token) is not None


def clean_description(description, name):
    """Return *description* without its leading record name and "OS=" tail.

    The name is removed only when it is a literal prefix of the description.
    The previous ``description.strip(name)`` treated the name as a *set of
    characters* and could also eat trailing letters of the description that
    happen to occur in the name.

    Args:
        description: header text of one record.
        name: record name; presumably the first word of the header
            (TODO confirm against the parser in use).

    Returns:
        The remaining free text, with surrounding spaces removed.
    """
    if name and description.startswith(name):
        description = description[len(name):]
    description = description.strip(" ")
    return ORGANISM_SUFFIX.sub("", description).strip(" ")


def count_words(descriptions):
    """Count lower-cased words over an iterable of cleaned descriptions.

    Descriptions are split on single spaces; number-like and empty tokens are
    skipped (see ``is_numeric_token``).

    Args:
        descriptions: iterable of strings, consumed once (may be a generator).

    Returns:
        A ``(word_counts, first_words)`` tuple: ``word_counts`` maps each word
        to its number of occurrences, ``first_words`` lists the distinct
        leading words of the descriptions in order of first appearance.
    """
    word_counts = {}
    first_words = []
    seen_first_words = set()  # O(1) membership test instead of scanning the list
    for description in descriptions:
        tokens = [token.lower() for token in description.split(" ")]
        # split(" ") always yields at least one (possibly empty) token.
        first = tokens[0]
        if not is_numeric_token(first) and first not in seen_first_words:
            seen_first_words.add(first)
            first_words.append(first)
        for token in tokens:
            if not is_numeric_token(token):
                word_counts[token] = word_counts.get(token, 0) + 1
    return word_counts, first_words


def count_first_words(first_words, word_counts):
    """Map every first word to its occurrence count over all descriptions."""
    return dict((word, word_counts[word]) for word in first_words)


def sort_by_count(counts):
    """Return ``(word, count)`` pairs ordered by ascending count, then word."""
    return sorted(counts.items(), key=lambda item: (item[1], item[0]))


def dump_sorted_counts(counts, path):
    """Write *counts* to *path* as a JSON list of ``[word, count]`` pairs."""
    # The context manager closes the file even if serialisation fails.
    with open(path, 'w') as handle:
        json.dump(sort_by_count(counts), handle)


def main():
    """Read the FASTA file at ``db`` and write its word-frequency table."""
    records = SeqIO.parse(db, "fasta")
    # Generator: the descriptions are streamed, never held in memory at once.
    descriptions = (
        clean_description(str(record.description), str(record.name))
        for record in records
    )
    word_counts, first_words = count_words(descriptions)
    # NOTE: the original script also prepared a first-word dictionary for
    # '../outputs/firstWordsDictionary_trembl.json' but had that output
    # disabled.  To produce it again, call
    # dump_sorted_counts(count_first_words(first_words, word_counts), path).
    dump_sorted_counts(word_counts, OUTPUT_PATH)


if __name__ == "__main__":
    main()