Skip to content

Commit d663b81

Browse files
authored
Merge pull request #6 from yhoogstrate/2_1_0
2 1 0
2 parents 1ddd2f7 + 9bc9a47 commit d663b81

File tree

8 files changed

+65
-9
lines changed

8 files changed

+65
-9
lines changed

Changelog

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
11-sept-2018: Youri Hoogstrate
2+
* v2.1.0 - Adds unit tests and can calculate ROC for Lorenz curves
3+
14
05-sept-2018: Youri Hoogstrate
25
* v2.0.0 - does not require to use samtools, does not take all
36
pysam.samtools.depth output into memory, does not export

README renamed to README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
bam-lorenz-coverage
22
===================
33

4-
Generate Lorenz and Cumulative Coverage plots on a bam file.
4+
Generate Lorenz plots and Coverage plots directly from BAM files
55

66
Implemented in:
7-
Python3 + Matplotlib + Pysam
7+
* Python3 + Matplotlib + Pysam
88

99

1010
Installation:
@@ -27,10 +27,13 @@ Usage: bam-lorenz-coverage [OPTIONS] INPUT_ALIGNMENT_FILE
2727
Options:
2828
--version Show the version and exit.
2929
-l, --lorenz-table TEXT Output table Lorenz-curve (for stdout use: -)
30+
-x, --roc Output Lorenz-curve ROC to $lorenz_table.roc.txt
31+
[ requires --lorenz-table to be set to file ]
3032
-c, --coverage-table TEXT Output table Coverage-graph (for stdout use: -)
3133
-L, --lorenz-svg TEXT Output figure Lorenz-curve (SVG).
3234
-C, --coverage-svg TEXT Output figure Coverage-graph (SVG).
3335
--help Show this message and exit.
36+
3437
```
3538

3639
The lowercase arguments (-l, -c) allow extraction of the raw data tables for custom plotting. The uppercase arguments (-L, -C) directly generate a plot. The implemented plot only contains one sample per plot. For multi-sample plots, use the column tables and your imagination.

bin/bam-lorenz-coverage

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,11 @@ from blc.blc import bamlorenzcoverage
1212
@click.version_option(blc.__version__ + "\n\n" + blc.__license_notice__ + "\n\nCopyright (C) 2018 " + blc.__author__ + ".\n\nFor more info please visit:\n" + blc.__homepage__)
1313
@click.argument('input_alignment_file', type=click.Path(exists=True))
1414
@click.option('-l', '--lorenz-table', nargs=1, help='Output table Lorenz-curve (for stdout use: -)')
15+
@click.option('-x', '--roc', is_flag=True, help='Output Lorenz-curve ROC to $lorenz_table.roc.txt\n[ requires --lorenz-table to be set to file ]')
1516
@click.option('-c', '--coverage-table', nargs=1, help='Output table Coverage-graph (for stdout use: -)')
1617
@click.option('-L', '--lorenz-svg', help='Output figure Lorenz-curve (SVG).')
1718
@click.option('-C', '--coverage-svg', help='Output figure Coverage-graph (SVG).')
18-
def CLI(lorenz_table, coverage_table, lorenz_svg, coverage_svg, input_alignment_file):
19+
def CLI(lorenz_table, roc, coverage_table, lorenz_svg, coverage_svg, input_alignment_file):
1920
b = bamlorenzcoverage()
2021
idx_observed = b.bam_file_to_idx(input_alignment_file)
2122

@@ -41,6 +42,9 @@ def CLI(lorenz_table, coverage_table, lorenz_svg, coverage_svg, input_alignment_
4142
else:
4243
with open(lorenz_table, 'w') as fh:
4344
b.export_lorenz_curves(lorenz_curves, fh)
45+
if roc:
46+
with open(lorenz_table + '.roc.txt', 'w') as fh:
47+
fh.write(str(lorenz_curves['roc']) + "\n")
4448

4549
if lorenz_svg:
4650
b.export_lorenz_plot(lorenz_curves, lorenz_svg)

blc/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"""[License: GNU General Public License v3 (GPLv3)]
66
"""
77

8-
__version_info__ = ('2', '0', '0')
8+
__version_info__ = ('2', '1', '0')
99
__version__ = '.'.join(__version_info__) if (len(__version_info__) == 3) else '.'.join(__version_info__[0:3]) + "-" + __version_info__[3]
1010
__author__ = 'Youri Hoogstrate'
1111
__homepage__ = 'https://github.com/yhoogstrate/bam-lorenz-coverage'

blc/blc.py

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,22 +248,37 @@ def estimate_lorenz_curves(self, idx_observed):
248248
total_covered_positions_of_genome = cumu_frac_of_genome
249249
total_sequenced_bases = cumulative_sequenced_bases
250250

251+
previous = [0, 0]
252+
top = 0
253+
denom = total_sequenced_bases * total_covered_positions_of_genome * 2
254+
251255
lorenz_curves = {'fraction_reads': [0.0], 'fraction_genome': [0.0]}
252256
for depth in sorted(lorenz_idx, reverse=True):
253257
if depth != 0:
254258
lorenz_curves['fraction_reads'].append(round(1.0 * lorenz_idx[depth][1] / total_sequenced_bases, 4))
255259
lorenz_curves['fraction_genome'].append(round(1.0 * lorenz_idx[depth][0] / total_covered_positions_of_genome, 4))
256260

261+
top += (lorenz_idx[depth][1] - previous[1]) * (lorenz_idx[depth][0] - previous[0])
262+
top += (lorenz_idx[depth][1] - previous[1]) * (previous[0]) * 2
263+
264+
previous = lorenz_idx[depth]
265+
266+
roc = 1.0 * top / denom
267+
268+
lorenz_curves['roc'] = roc
257269
return lorenz_curves
258270

259271
def export_lorenz_curves(self, lorenz_curves, output_stream):
260272
output_stream.write("X-fraction-sequenced-bases\tY-fraction-genome-covered\n")
261273
for n in range(len(lorenz_curves['fraction_reads'])):
262274
output_stream.write(str(lorenz_curves['fraction_reads'][n]) + "\t" + str(lorenz_curves['fraction_genome'][n]) + "\n")
263275

264-
def export_lorenz_plot(self, lorenz_curves, output_file):
276+
def export_lorenz_plot(self, lorenz_curves, output_file, sign_digits=3):
265277
plt.plot([0.0, 1.0], [0.0, 1.0], 'k--')
266278
plt.plot(lorenz_curves['fraction_reads'], lorenz_curves['fraction_genome'], '-bo')
279+
str_roc = round(lorenz_curves['roc'], sign_digits)
280+
a = '{0:.' + str(sign_digits) + 'f}'
281+
plt.text(0.0, 0.95, 'ROC=' + a.format(str_roc), fontsize=14)
267282
plt.xlabel('Fraction sequenced bases')
268283
plt.ylabel('Fraction covered genome')
269284
plt.savefig(output_file)

requirements.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
click
21
numpy
2+
click
33
pysam
44
tqdm
55
flake8

tests/blc/test_blc_011.sam

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
@HD VN:1.4
2+
@SQ SN:chr1 LN:14
3+
@CO user command line: test-data
4+
read_01 0 chr1 4 100 5M * 0 0 NNNNN BCCBC NH:i:1 HI:i:1 AS:i:232 nM:i:0
5+
read_02 0 chr1 7 100 5M * 0 0 NNNNN BCCBC NH:i:1 HI:i:1 AS:i:232 nM:i:0

tests/test_class_blc.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def test_008_lorenz_01(self):
146146

147147
# print(idx, file=sys.stderr)
148148
# denote that it only considers the (size of the) sequences described in the SAM header
149-
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0], 'fraction_reads': [0.0, 1.0]})
149+
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0], 'fraction_reads': [0.0, 1.0], 'roc': 0.5})
150150

151151
def test_009_lorenz_02(self):
152152
# everything covered, is at least covered even densely
@@ -161,7 +161,7 @@ def test_009_lorenz_02(self):
161161

162162
# print(idx, file=sys.stderr)
163163
# denote that it only considers the (size of the) sequences described in the SAM header
164-
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0], 'fraction_reads': [0.0, 1.0]})
164+
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0], 'fraction_reads': [0.0, 1.0], 'roc': 0.5})
165165

166166
def test_010_lorenz_03(self):
167167
# everything covered, is at least covered even densely
@@ -176,7 +176,33 @@ def test_010_lorenz_03(self):
176176

177177
# print(idx, file=sys.stderr)
178178
# denote that it only considers the (size of the) sequences described in the SAM header
179-
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0 * 2 / 8, 1.0], 'fraction_reads': [0.0, 1.0 * 4 / 10, 1.0]})
179+
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0 * 2 / 8, 1.0], 'fraction_reads': [0.0, 1.0 * 4 / 10, 1.0], 'roc': 0.425})
180+
181+
def test_011_lorenz_03(self):
182+
# everything covered, is at least covered even densely
183+
# x x x x x
184+
# x x x x x
185+
# - - - - - - - - - - - - - -
186+
187+
test_id = 'blc_011'
188+
189+
input_file_sam = TEST_DIR + "test_" + test_id + ".sam"
190+
input_file_bam = T_TEST_DIR + "test_" + test_id + ".bam"
191+
192+
sam_to_sorted_bam(input_file_sam, input_file_bam)
193+
194+
b = bamlorenzcoverage()
195+
idx = b.bam_file_to_idx(input_file_bam)
196+
197+
# print(idx, file=sys.stderr)
198+
# denote that it only considers the (size of the) sequences described in the SAM header
199+
self.assertDictEqual(idx, {0: 6, 1: 6, 2: 2})
200+
201+
lc = b.estimate_lorenz_curves(idx)
202+
203+
# print(idx, file=sys.stderr)
204+
# denote that it only considers the (size of the) sequences described in the SAM header
205+
self.assertDictEqual(lc, {'fraction_genome': [0.0, 1.0 * 2 / 8, 1.0], 'fraction_reads': [0.0, 1.0 * 4 / 10, 1.0], 'roc': 0.425})
180206

181207

182208
if __name__ == '__main__':

0 commit comments

Comments
 (0)