-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathfeatures_public.py
More file actions
95 lines (76 loc) · 2.89 KB
/
Copy pathfeatures_public.py
File metadata and controls
95 lines (76 loc) · 2.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import numpy as np
import pandas as pd
import os, re
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import csv
from collections import Counter
import pickle
from sklearn.metrics.cluster import *
from sklearn.metrics import hamming_loss
# Expose every pickled lookup table found in cachedata/ as a module-level
# global named after the file (text before the first '.'), e.g.
# cachedata/paxdbValue.pkl -> paxdbValue.
# NOTE(review): pickle.load executes arbitrary code embedded in the file;
# only trusted, locally generated caches should live in cachedata/.
for pkl in (entry for entry in os.listdir('cachedata/')
            if entry.rsplit('.', 1)[-1] == 'pkl'):
    dictN = pkl.partition('.')[0]
    with open('cachedata/%s' % pkl, 'rb') as tf:
        globals()[dictN] = pickle.load(tf)
## Obtain human consensus protein abundance deposited in PaxDb v4.2
def getPaxdb(df, paxdb=None):
    """
    Get protein expression from PAXdb v4.2.

    Looks up each row's ``uni`` value in a mapping of protein identifier to
    abundance and stores the hit in the ``feature_paxdb_log10`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``uni`` column. The frame is modified in place.
    paxdb : mapping, optional
        Object with a ``.get(key)`` method returning the abundance for a
        protein identifier, or ``None`` when unknown. Defaults to the
        module-level ``paxdbValue`` table (loaded from
        ``cachedata/paxdbValue.pkl`` when the module is imported).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Rows without a hit keep whatever value the
        column already had (NaN when the column is newly created).
    """
    if paxdb is None:
        paxdb = paxdbValue

    column = 'feature_paxdb_log10'
    # Always create the feature column so downstream code can rely on it
    # even when none of the proteins is present in the table.
    if column not in df.columns:
        df[column] = np.nan

    for row_label, uni in zip(df.index, df['uni']):
        abundance = paxdb.get(uni)
        # Compare against None rather than testing truthiness: a stored value
        # of 0.0 is a real measurement and must not be dropped.
        if abundance is not None:
            df.loc[row_label, column] = abundance
    return df
## Determine whether the phosphorylation site is a known phosphorylation site in PhosphoSitePlus or as reported by Ochoa et al.
def get_psite_existed(df, cache_dir='cachedata'):
    """
    Count in how many reference tables each phosphosite is already listed.

    Two tab-separated tables are read from ``cache_dir``:

    * ``nbt.txt`` with columns ``uniprot`` and ``position``
      (presumably the Ochoa et al. site list -- confirm with the data owner);
    * ``psp/Phosphorylation_site_dataset`` with columns ``ACC_ID``,
      ``ORGANISM`` and ``MOD_RSD`` after three skipped preamble lines
      (presumably the PhosphoSitePlus export). Only rows whose ``ORGANISM``
      is ``'human'`` are used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``uni`` (protein accession) and ``ploc`` (site position,
        convertible with ``int``). The frame is modified in place.
    cache_dir : str, optional
        Directory holding the two reference tables. Defaults to
        ``'cachedata'``, the location that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``feature_psite_existed`` set to 0, 1 or
        2: the number of tables that contain the ``(uni, ploc)`` pair.
    """
    nbt = pd.read_csv(os.path.join(cache_dir, 'nbt.txt'), sep='\t')
    # Sets of (accession, position) give O(1) membership tests instead of a
    # full boolean scan of each table for every row of df.
    nbt_sites = set(zip(nbt['uniprot'], nbt['position']))

    psp_sites = set()
    for mod in ('Phosphorylation',):
        psp = pd.read_csv(
            os.path.join(cache_dir, 'psp', '%s_site_dataset' % mod),
            sep='\t',
            skiprows=3,
            low_memory=False,
        )
        psp = psp[psp['ORGANISM'] == 'human']
        # MOD_RSD is split on '-' and the first character of the leading
        # piece is dropped before the integer conversion, e.g. 'S15-p' -> 15.
        # The .str accessors also work when no human rows remain.
        site_loc = psp['MOD_RSD'].str.split('-').str[0].str[1:].astype(int)
        psp_sites.update(zip(psp['ACC_ID'], site_loc))

    query_sites = [(uni, int(ploc)) for uni, ploc in zip(df['uni'], df['ploc'])]
    df['feature_psite_existed'] = [
        int(site in nbt_sites) + int(site in psp_sites) for site in query_sites
    ]
    return df
## Obtain multiple scores from dbNSFP v4.3a
def get_vep(df, vepPath):
    """
    Attach dbNSFP scores to each variant row of ``df``.

    A join key ``dnnsfp`` is built on both sides and used for a left merge,
    so every row of ``df`` is kept and unmatched rows get NaN scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs columns ``chr``, ``chrl``, ``NNchange`` and ``AAchange``.
        ``NNchange`` / ``AAchange`` must already look like ``ref/alt`` and
        ``aaref/aaalt`` for keys to match, because the table side joins those
        pairs with ``'/'``. A ``dnnsfp`` column is added to ``df`` in place.
    vepPath : str
        Path of the tab-separated annotation table. It must provide
        ``#chr``, ``pos(1-based)``, ``ref``, ``alt``, ``aaref``, ``aaalt``
        and the score columns listed below.

    Returns
    -------
    pandas.DataFrame
        New frame: ``df`` left-merged with the de-duplicated score columns.
        Cells equal to ``'.'`` in the table are replaced by NaN.
    """
    key = 'dnnsfp'
    score_columns = [
        'phyloP17way_primate', 'phyloP17way_primate_rankscore',
        'BayesDel_addAF_score', 'BayesDel_addAF_rankscore',
        'integrated_fitCons_score', 'integrated_fitCons_rankscore',
        'GERP++_RS', 'GERP++_RS_rankscore',
    ]

    # Query-side key: <chr>_<position>_<nucleotide change>_<amino-acid change>
    df[key] = (
        df['chr'].astype(str) + '_' + df['chrl'].astype(str)
        + '_' + df['NNchange'] + '_' + df['AAchange']
    )

    table = pd.read_csv(vepPath, sep='\t')

    # Table-side key assembled from the individual columns in the same layout.
    position = table['#chr'].astype(str) + '_' + table['pos(1-based)'].astype(str)
    nucleotide_change = table['ref'].astype(str) + '/' + table['alt'].astype(str)
    residue_change = table['aaref'].astype(str) + '/' + table['aaalt'].astype(str)
    table[key] = position + '_' + nucleotide_change + '_' + residue_change

    scores = table[score_columns + [key]]
    scores = scores.drop_duplicates()
    scores = scores.replace('.', np.nan)

    return df.merge(scores, left_on=[key], right_on=[key], how='left')