Skip to content

Commit 790c317

Browse files
committed
Add the misc/vrfs-variances script
1 parent 068da7a commit 790c317

File tree

1 file changed

+160
-0
lines changed

1 file changed

+160
-0
lines changed

misc/vrfs-variances

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
#!/usr/bin/env perl
2+
#
3+
# The MIT License
4+
#
5+
# Copyright (c) 2024-2025 Genome Research Ltd.
6+
#
7+
# Author: petr.danecek@sanger
8+
#
9+
# Permission is hereby granted, free of charge, to any person obtaining a copy
10+
# of this software and associated documentation files (the "Software"), to deal
11+
# in the Software without restriction, including without limitation the rights
12+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13+
# copies of the Software, and to permit persons to whom the Software is
14+
# furnished to do so, subject to the following conditions:
15+
#
16+
# The above copyright notice and this permission notice shall be included in
17+
# all copies or substantial portions of the Software.
18+
#
19+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25+
# THE SOFTWARE.
26+
27+
28+
use strict;
29+
use warnings;
30+
use Carp;
31+
32+
my $opts = parse_params();
33+
parse_and_calc($opts);
34+
35+
exit;
36+
37+
#--------------------------------
38+
39+
sub error
40+
{
41+
my (@msg) = @_;
42+
if ( scalar @msg ) { confess @msg; }
43+
print
44+
"About: Parse bcftools/vrfs output and from a subset of sites calculate variances.\n",
45+
"Usage: vrfs-variances [OPTIONS]\n",
46+
"Options:\n",
47+
" -n, --ndat NUM Number of sites to include, fraction (FLOAT) or absolute (INT) [0.2]\n",
48+
" -r, --rand-noise INT Add random noise, INT is a seed for reproducibility, or 0 for no seed [0]\n",
49+
" -s, --list-sites List sites passing the -n setting\n",
50+
" -v, --list-var2 Output in a format suitable for `bcftools +vrfs -r file`\n",
51+
" -h, -?, --help This help message\n",
52+
"\n";
53+
exit -1;
54+
}
55+
sub parse_params
56+
{
57+
my $opts = { ndat=>0.2 };
58+
if ( -t STDIN && !@ARGV ) { error(); }
59+
while (defined(my $arg=shift(@ARGV)))
60+
{
61+
if ( $arg eq '-r' or $arg eq '--rand-noise' ) { $$opts{rand_noise}=shift(@ARGV); next }
62+
if ( $arg eq '-s' or $arg eq '--list-sites' ) { $$opts{list_sites}=1; next }
63+
if ( $arg eq '-v' or $arg eq '--list-var2' ) { $$opts{list_var2}=1; next }
64+
if ( $arg eq '-n' or $arg eq '--ndat' ) { $$opts{ndat}=shift(@ARGV); next }
65+
if ( $arg eq '-?' or $arg eq '-h' or $arg eq '--help' ) { error(); }
66+
error("Unknown parameter \"$arg\". Run -h for help.\n");
67+
}
68+
if ( exists($$opts{rand_noise}) )
69+
{
70+
if ( $$opts{rand_noise} ) { srand($$opts{rand_noise}); }
71+
else { srand(); }
72+
}
73+
return $opts;
74+
}
75+
76+
sub cmp_dist
77+
{
78+
for (my $i=@{$$a{dist}}-1; $i>=0; $i--)
79+
{
80+
if ( $$a{dist}[$i] == $$b{dist}[$i] ) { next; }
81+
return $$a{dist}[$i] <=> $$b{dist}[$i];
82+
}
83+
return 0;
84+
}
85+
86+
sub parse_and_calc
87+
{
88+
my ($opts) = @_;
89+
my @dat = ();
90+
while (my $line=<STDIN>)
91+
{
92+
# SITE chr15 79031596 AAG A 5.746144e+01 926-181-22-12-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0
93+
if ( !($line=~/^SITE/) ) { next; }
94+
chomp($line);
95+
my (@col) = split(/\t/,$line);
96+
my @dist = split(/-/,$col[-1]);
97+
push @dat, { line=>$line, dist=>\@dist };
98+
}
99+
my @sdat = sort cmp_dist @dat;
100+
my $nmax = $$opts{ndat};
101+
if ( $nmax <= 1 ) { $nmax = int($nmax * scalar @sdat); }
102+
my $n = 0;
103+
my @avg = ();
104+
my @avg2 = ();
105+
for my $x (@sdat)
106+
{
107+
my $rand = -1;
108+
if ( exists($$opts{rand_noise}) && rand(1000)<10 ) { $rand = int(rand(@{$$x{dist}})); }
109+
my $max = 0;
110+
for (my $i=0; $i<@{$$x{dist}}; $i++)
111+
{
112+
if ( $rand==$i ) { $$x{dist}[$i]++; }
113+
if ( $max < $$x{dist}[$i] ) { $max = $$x{dist}[$i]; }
114+
}
115+
for (my $i=0; $i<@{$$x{dist}}; $i++)
116+
{
117+
my $val = $$x{dist}[$i] / $max;
118+
$avg[$i] += $val;
119+
$avg2[$i] += $val * $val;
120+
}
121+
if ( $$opts{list_sites} ) { print $$x{line}."\n"; }
122+
if ( ++$n >= $nmax )
123+
{
124+
if ( !$$opts{list_var2} ) { print $$x{line}."\n"; }
125+
last;
126+
}
127+
}
128+
if ( $$opts{list_sites} ) { return; }
129+
$avg2[0] = 1;
130+
for (my $i=0; $i<@avg; $i++)
131+
{
132+
$avg[$i] = sprintf("%e",$avg[$i]/$n);
133+
$avg2[$i] = $avg2[$i]/$n - $avg[$i]*$avg[$i];
134+
if ( $avg2[$i]<=0 )
135+
{
136+
# yes, it be smaller than zero, machine precision in play when the values are close to zero
137+
$avg2[$i] = $i>0 ? $avg2[$i-1]/2 : 1;
138+
}
139+
if ( !exists($$opts{rand_noise}) && $avg2[$i] < 1e-9 )
140+
{
141+
$avg2[$i] = $avg2[$i-1] * ($i-1) / $i;
142+
}
143+
$avg2[$i] = sprintf("%e",$avg2[$i]);
144+
}
145+
# make it monotonic
146+
for (my $i=@avg2-1; $i>0; $i--)
147+
{
148+
if ( $avg2[$i-1] < $avg2[$i] ) { $avg2[$i-1] = $avg2[$i]; }
149+
}
150+
if ( $$opts{list_var2} )
151+
{
152+
print join("\n",@avg2)."\n";
153+
}
154+
else
155+
{
156+
print STDERR "MEAN\t".join(" ",@avg)."\n";
157+
print STDERR "VAR2\t".join(" ",@avg2)."\n";
158+
}
159+
}
160+

0 commit comments

Comments
 (0)