stmp3/segregation_util.py at master · AshleyLab/stmp3 · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#author: Noah Friedman
import sys

sys.path.append("/home/noahfrie/noahfrie/devCode/stmp2/code/trio")
import pedigreeUtils
import general_utils
import vcfUtils
import vcf
import os
import logging
import subprocess

#Initialization of a logger
logger = logging.getLogger("segregation")
FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, filemode='w', filename='segregation_util.log')
logger.setLevel(logging.INFO)
logger.info("\n\nLog File for segregation")

#a dictionary that tells you what index you need for accessing a certain argument value from the command line.  In my opinion this solution is simpler than using an argument parser especially since this is a mini utility.
args_dict = {"ped": 0, "vcf": 1, "udnid": 2, 'modelType': 3, "dir": 4}

#------------------------------------------------------------------------------------
#UNIT TESTS

#function to ensure that the matricies are consistent with the order we ought to expect
def consistentcy_checks(familyMembersList, familyMembersMatrix, vcfMatrix, inputVcf, affectedDict, verbose=False, n = 100):
	#confirm that familyMembersMatrix is consistent with affected dict
	print "verifying that familyMembersMatrix is consistent with affected dict"
	for i in range(len(familyMembersList)):
		if verbose: print familyMembersList[i], familyMembersMatrix[i], affectedDict[familyMembersList[i]]
		if familyMembersMatrix[i] != affectedDict[familyMembersList[i]]:
			print "error inconsistency detected"
			sys.exit()
		else:
			if verbose: print "affection status for ", familyMembersList[i], " is consistent"
	print "passed"

	#confirm that order is consistent between familyMembers list and samples in the first n lines of the vcf
	print "verifying that order is consistent between familyMembers list and samples in the first ", n, " lines of the vcf"
	vcf_reader = vcf.Reader(open(inputVcf, 'r'))
	cntr = 0
	for record in vcf_reader:
		for i in range(len(record.samples)):
			if verbose: print record.samples[i].sample, familyMembersList[i]
			if record.samples[i].sample != familyMembersList[i]:
				print "error inconsistency in ordering of samples"
				sys.exit()
		cntr += 1
		if cntr >= n: break
	print "passed"

	vcf_reader2 = vcf.Reader(open(inputVcf, 'r'))
	cntr = 0
	for record in vcf_reader2:
		for i in range(len(record.samples)):
			gt = record.samples[i]['GT']
			if gt != vcfMatrix[cntr][i]:
				print "error matrix and vcf are inconsistent"
			else:
				if verbose: print gt, vcfMatrix[cntr][i]
		cntr += 1
		if cntr >= n: break
	print "passed"

	print "passed all tests"

#------------------------------------------------------------------------------------------------

def get_arg(args, arg):
	return args[args_dict[arg]]

def get_gt_matrix_from_vcf(inputVcf):
	#iterates through a vcf and gives back the gts
	vcf_reader = vcf.Reader(open(inputVcf, 'r'))
	m = []
	cntr = 0
	for record in vcf_reader:
		row = []
		#alert added for testing throw out
		cntr += 1
		#if cntr > 20: return m
		for sample in record.samples:
			row.append(sample['GT'])
		m.append(row)
	return m

#parse the first vcf record to figure out which family members show up in the vcf
#note: it is important that we do this instead of simply using the pedigree in case the pedigree lists family members who aren't in the vcf
def get_family_members_in_vcf(inputVcf):
	vcf_reader = vcf.Reader(open(inputVcf, 'r'))
	#just check the first record (I know this is not the prettiest method but still..)
	cntr = 0
	familyMembers = []
	for record in vcf_reader:
		for s in record.samples:
			familyMember = s.sample
			familyMembers.append(familyMember)
		cntr += 1
		if cntr >= 1:
			return familyMembers

#take the full affection status dict from trio tools and specifcially pulls out info relevant to the family in question
def get_fam_affection_status(affectedDict, familyMembers):
	tempDict = dict()
	for key, value in affectedDict.items():
		if key in familyMembers:
			tempDict[key] = value
	fMatrix = []
	#return this information as a list which conforms to the order we found in the vcf!  maintaining this order is important!
	for familyMember in familyMembers:
		fMatrix.append(tempDict[familyMember])
	return fMatrix

#IMPORTANT NOTE--our code makes the assumption that the order that the samples occur in the vcf is consistent

#checks to see if anyone in the family is missing a genotype for the spcified line
def no_missing_genotypes(gtLine):
	for gt in gtLine:
		if gt == './.': return False
	return True

#returns true if all genotypes for the current alelle in the vcf are not the same
#NOTE: IN The future this can (should) be adjusted so that it only takes into account parents and affecteds.  If we have an unaffected sibling whose genotype is different from unaffected mom and dad we will include the variant even if everyone else is the same
def all_genotypes_are_not_the_same(gtLine):
	gt1 = gtLine[0]
	#if any genotype is different than the first genotype we can conclude that all genotypes are not the same
	#as a fun exercise, the reader can prove this by induction:)
	for gt in gtLine:
		if gt != gt1:
			return True
	return False

#returns true if the genotypes of at least one of the affecteds is not 0/0 (homozygous ref)
def no_homozygous_ref_in_all_affected(gtLine, aIdxs):
	for idxA in aIdxs:
		print gtLine[idxA]
		if gtLine[idxA] != '0/0':
			return True
	return False

#returns true if there are no homozygous alternate genotypes in the unaffecteds that are composed of alternate alleles the proband has
#some examples: proband is 1/1, mom is 1/1, dad is 0/1--throw out variant cant be disease causing
#proband is 0/1, mom is 1/1, dad is 0/0--throw out, variant cant be implicated in compound het model
#proband is 0/1, mom is 1/2, dad is 0/2, unaffected sister is 2/2--keep sister's homozygous alt isnt disase causing
#proband is 1/2, mom is 2/2, dad is 0/1--throw out
#proband is 1/2, mom is 0/2, dad is 1/1--throw out
#proband is 1/2, mom is 0/2, dad is 0/1--KEEP
def no_homozygous_alt_in_any_unaffecteds(gtLine, uIdxs, aIdxs):
	#get the genotype of the proband
	gtProband = gtLine[aIdxs[0]]
	#isolate their alleles
	pAllele1 = gtProband[0]
	pAllele2 = gtProband[2]
	#create a list of the 1 or 2 homozygous alts that can be formed from the probands alelles
	pHomozygotes = []
	if pAllele1 == '1' or pAllele2 == '1': pHomozygotes.append('1/1')
	if pAllele1 == '2' or pAllele2 == '2': pHomozygotes.append('2/2')
	if pAllele1 == '3' or pAllele2 == '3': pHomozygotes.append('3/3')

	for idxU in uIdxs:
		if gtLine[idxU] in pHomozygotes:
			return False
	return True

#DEPRECATED CURRENTLY
def segregation_inconsistent_between_afffected_and_unaffected(gtLine, aIdxs, uIdxs):
	#we can't filter on lines with incomplete data
	#print gtLine, aIdxs, uIdxs
	#if len(gtLine) != len(uIdxs): return True
	for idxU in uIdxs:
		gtU = gtLine[idxU]
		for idxA in aIdxs:
			gtA = gtLine[idxA]
			if gtU == gtA:
				return False
	return True

#DEPRECATED CURRENTLY
def segregation_consistent_across_affecteds(gtLine, aIdxs):
	#we can't filter on lines with incomplete data
	if len(gtLine) != len(aIdxs): return True
	#by default segregation is consistent if there aren't two affecteds
	if len(aIdxs) < 2: return True
	else:
		gt1 = gtLine[aIdxs[0]]
		for idx in aIdxs:
			gt = gtLine[idx]
			if gt1 != gt: return False
	return True

#to be consistent with a dominant model of inheritance, all affecteds must have the alt, and not unaffecteds can have the alt
#note it permits missing records
def consistent_with_dominant_model_of_inheritance(gtLine, aIdxs, uIdxs):
	for idxA in aIdxs:
		gtA = gtLine[idxA]
		if gtA == '0/0': return False
	for idxU in uIdxs:
		gtU = gtLine[idxU]
		if gtU == '0/1' or gtU == '1/1': return False
	return True

#to be consistent with a recessive model of inheritance, all affecteds must be homozygous recessive and their parents must be heterozygous carriers
#again it is permissive with missing records
def consistent_with_recessive_model_of_inheritance(gtLine, aIdxs, uIdxs):
	#check that affecteds are homozygous alt
	for idxA in aIdxs:
		gtA = gtLine[idxA]
		if gtA == '0/1' or gtA == '0/0':
			return False
	#check that unaffecteds are heterozygous
	for idxU in uIdxs:
		gtU = gtLine[idxU]
		if gtU == '0/0' or gtU == '1/1':
			return False
	return True


#returns the indicies of affecteds and unaffecteds in list form
def get_idxs(fMatrix):
	cntr = 0
	aIdxs = []
	uIdxs = []
	for f in fMatrix:
		if f == '2': aIdxs.append(cntr)
		elif f == '1': uIdxs.append(cntr)
		cntr += 1
	return aIdxs, uIdxs

#main function called from tiering that checks ped file to see if the pedigree is consistent
def filter_vcf_by_segregation(gtMatrix, fMatrix, inputVcf, modelType):
	aIdxs, uIdxs = get_idxs(fMatrix)
	inputVcf = general_utils.open_compressed_or_regular(inputVcf,"r")
	lines = inputVcf.readlines()
	linesToWrite = []
	rowNum = 0
	for line in lines:
		segregationFailReason = ''
		if line[0] == '#':
			print 'Line has been added because it begins with a #'
			linesToWrite.append(line)
		else:
			#by deafult we automatically include a variant if anyone in the family is missing a genotype
			if not no_missing_genotypes(gtMatrix[rowNum]):
				#print 'Line has been added because there are missing genotypes'
				#linesToWrite.append(line)
				pass
				#Do nothing right now

			#Core Segregation Logic
			else:
				if all_genotypes_are_not_the_same(gtMatrix[rowNum]):
					if no_homozygous_ref_in_all_affected(gtMatrix[rowNum], aIdxs):
						if no_homozygous_alt_in_any_unaffecteds(gtMatrix[rowNum], uIdxs, aIdxs):

							#the dominant model type requires a different test to see if it fits a dominant model of inheritance
							if modelType == 'Dominant':
								if consistent_with_dominant_model_of_inheritance(gtMatrix[rowNum], aIdxs, uIdxs):
									print 'Line has been added because it is consistent with a dominant model of inheritance'
									linesToWrite.append(line)
								else:
									segregationFailReason = 'inconsistent with a dominant model of inheritance'
							#the recessive model type requires us to do an additional test to see if it fits a recessive model of inheritance
							elif modelType == 'Recessive':
								if consistent_with_recessive_model_of_inheritance(gtMatrix[rowNum], aIdxs, uIdxs):
									print 'Line has been added because it is consistent with a recessive model of inheritance'
									linesToWrite.append(line)
								else:
									segregationFailReason = 'inconsistent with a recessive model of inheritance'

							#deafult model is the most permissive
							elif modelType == 'Default':
								print 'Line has been added because it is consistent with the default model of inheritance'
								linesToWrite.append(line)

						else:
							segregationFailReason = 'homozygous alt present in an unaffected'
							#print segregationFailReason
					else:
						segregationFailReason = 'homozygous reference present in all affecteds'
						#print segregationFailReason
				else:
					segregationFailReason = 'All genotypes are the same'
					#print segregationFailReason

			#if segregation failed
			if segregationFailReason != '':
				logger.info('excluded line for failing segregation: ' + line)
				logger.info('failed because ' + segregationFailReason)
				logger.info('\n')

			rowNum += 1
	return linesToWrite

#----------------------------------------------------------------------------------
def write_vcf(linesToWrite, filename, outputDir):
	vcf_name = vcfUtils.getSampleName(filename)
	#change this please please NOAH
	outfilepath = os.path.join(outputDir, vcf_name+'_segreation_filtered_TEST_IN_JANUARY'+ '.vcf')
	f = open(outfilepath, 'w')
	for line in linesToWrite:
		f.write(line)
	f.close()

	cmd = 'bgzip -f {vcf1}'.format(vcf1=outfilepath)
	proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
	proc.wait()

	outfilepath += '.gz'

	cmd2 = 'bcftools index {vcf1}'.format(vcf1=outfilepath)
	proc = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True)
	proc.wait()

	return outfilepath

#main function for segregation filtering that is called by other scripts
def filter_by_segregation(inputVcf, inputPed, outputDir, modelType):
	#munge the ped file using the traverse function in pedigree utils
	#note that this provides a lot of extra information that we don't currently use
	#i just wanted to reuse the function instead of reinventing the wheel
	founders, parent_dict, trios, quartets, unique_trios, affectedDict = pedigreeUtils.traverse(inputPed)
	#probandUDNID = get_arg('udnid')

	familyMembersList = get_family_members_in_vcf(inputVcf)
	fMatrix = get_fam_affection_status(affectedDict, familyMembersList)
	print "generating gt_matrix"
	m = get_gt_matrix_from_vcf(inputVcf)

	#consistentcy_checks(familyMembersList, fMatrix, m, inputVcf, affectedDict)

	print "filtering by segregation"

	lineToWrite = filter_vcf_by_segregation(m, fMatrix, inputVcf, modelType)
	print "writing vcf"

	outputPath = write_vcf(lineToWrite, inputVcf, outputDir)
	print outputPath
	return outputPath

def main():
	print "usage segregation_util.py inputVcf udnid inputPed modelType outputDir (note it is order sensitive); model type can be Dominant, Default or Recessive"
	args = sys.argv[1:]
	inputVcf = get_arg(args, 'vcf')
	inputPed = get_arg(args, 'ped')
	print(inputPed)
	modelType = get_arg(args, 'modelType')
	outputDir = get_arg(args, 'dir')
	#NOTE THE TEMPORARY HACK BEING USED FOR FILEPATH HERE
	filter_by_segregation(inputVcf, inputPed, outputDir, modelType)

if __name__ == '__main__':
	main()