|
| 1 | +#!/usr/bin/env perl |
| 2 | +# |
| 3 | +# The MIT License |
| 4 | +# |
| 5 | +# Copyright (c) 2024-2025 Genome Research Ltd. |
| 6 | +# |
| 7 | +# Author: petr.danecek@sanger |
| 8 | +# |
| 9 | +# Permission is hereby granted, free of charge, to any person obtaining a copy |
| 10 | +# of this software and associated documentation files (the "Software"), to deal |
| 11 | +# in the Software without restriction, including without limitation the rights |
| 12 | +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 13 | +# copies of the Software, and to permit persons to whom the Software is |
| 14 | +# furnished to do so, subject to the following conditions: |
| 15 | +# |
| 16 | +# The above copyright notice and this permission notice shall be included in |
| 17 | +# all copies or substantial portions of the Software. |
| 18 | +# |
| 19 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 20 | +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 21 | +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 22 | +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 23 | +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 24 | +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 25 | +# THE SOFTWARE. |
| 26 | + |
| 27 | + |
| 28 | +use strict; |
| 29 | +use warnings; |
| 30 | +use Carp; |
| 31 | + |
| 32 | +my $opts = parse_params(); |
| 33 | +parse_and_calc($opts); |
| 34 | + |
| 35 | +exit; |
| 36 | + |
| 37 | +#-------------------------------- |
| 38 | + |
| 39 | +sub error |
| 40 | +{ |
| 41 | + my (@msg) = @_; |
| 42 | + if ( scalar @msg ) { confess @msg; } |
| 43 | + print |
| 44 | + "About: Parse bcftools/vrfs output and from a subset of sites calculate variances.\n", |
| 45 | + "Usage: vrfs-variances [OPTIONS]\n", |
| 46 | + "Options:\n", |
| 47 | + " -n, --ndat NUM Number of sites to include, fraction (FLOAT) or absolute (INT) [0.2]\n", |
| 48 | + " -r, --rand-noise INT Add random noise, INT is a seed for reproducibility, or 0 for no seed [0]\n", |
| 49 | + " -s, --list-sites List sites passing the -n setting\n", |
| 50 | + " -v, --list-var2 Output in a format suitable for `bcftools +vrfs -r file`\n", |
| 51 | + " -h, -?, --help This help message\n", |
| 52 | + "\n"; |
| 53 | + exit -1; |
| 54 | +} |
| 55 | +sub parse_params |
| 56 | +{ |
| 57 | + my $opts = { ndat=>0.2 }; |
| 58 | + if ( -t STDIN && !@ARGV ) { error(); } |
| 59 | + while (defined(my $arg=shift(@ARGV))) |
| 60 | + { |
| 61 | + if ( $arg eq '-r' or $arg eq '--rand-noise' ) { $$opts{rand_noise}=shift(@ARGV); next } |
| 62 | + if ( $arg eq '-s' or $arg eq '--list-sites' ) { $$opts{list_sites}=1; next } |
| 63 | + if ( $arg eq '-v' or $arg eq '--list-var2' ) { $$opts{list_var2}=1; next } |
| 64 | + if ( $arg eq '-n' or $arg eq '--ndat' ) { $$opts{ndat}=shift(@ARGV); next } |
| 65 | + if ( $arg eq '-?' or $arg eq '-h' or $arg eq '--help' ) { error(); } |
| 66 | + error("Unknown parameter \"$arg\". Run -h for help.\n"); |
| 67 | + } |
| 68 | + if ( exists($$opts{rand_noise}) ) |
| 69 | + { |
| 70 | + if ( $$opts{rand_noise} ) { srand($$opts{rand_noise}); } |
| 71 | + else { srand(); } |
| 72 | + } |
| 73 | + return $opts; |
| 74 | +} |
| 75 | + |
| 76 | +sub cmp_dist |
| 77 | +{ |
| 78 | + for (my $i=@{$$a{dist}}-1; $i>=0; $i--) |
| 79 | + { |
| 80 | + if ( $$a{dist}[$i] == $$b{dist}[$i] ) { next; } |
| 81 | + return $$a{dist}[$i] <=> $$b{dist}[$i]; |
| 82 | + } |
| 83 | + return 0; |
| 84 | +} |
| 85 | + |
| 86 | +sub parse_and_calc |
| 87 | +{ |
| 88 | + my ($opts) = @_; |
| 89 | + my @dat = (); |
| 90 | + while (my $line=<STDIN>) |
| 91 | + { |
| 92 | + # SITE chr15 79031596 AAG A 5.746144e+01 926-181-22-12-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0-0 |
| 93 | + if ( !($line=~/^SITE/) ) { next; } |
| 94 | + chomp($line); |
| 95 | + my (@col) = split(/\t/,$line); |
| 96 | + my @dist = split(/-/,$col[-1]); |
| 97 | + push @dat, { line=>$line, dist=>\@dist }; |
| 98 | + } |
| 99 | + my @sdat = sort cmp_dist @dat; |
| 100 | + my $nmax = $$opts{ndat}; |
| 101 | + if ( $nmax <= 1 ) { $nmax = int($nmax * scalar @sdat); } |
| 102 | + my $n = 0; |
| 103 | + my @avg = (); |
| 104 | + my @avg2 = (); |
| 105 | + for my $x (@sdat) |
| 106 | + { |
| 107 | + my $rand = -1; |
| 108 | + if ( exists($$opts{rand_noise}) && rand(1000)<10 ) { $rand = int(rand(@{$$x{dist}})); } |
| 109 | + my $max = 0; |
| 110 | + for (my $i=0; $i<@{$$x{dist}}; $i++) |
| 111 | + { |
| 112 | + if ( $rand==$i ) { $$x{dist}[$i]++; } |
| 113 | + if ( $max < $$x{dist}[$i] ) { $max = $$x{dist}[$i]; } |
| 114 | + } |
| 115 | + for (my $i=0; $i<@{$$x{dist}}; $i++) |
| 116 | + { |
| 117 | + my $val = $$x{dist}[$i] / $max; |
| 118 | + $avg[$i] += $val; |
| 119 | + $avg2[$i] += $val * $val; |
| 120 | + } |
| 121 | + if ( $$opts{list_sites} ) { print $$x{line}."\n"; } |
| 122 | + if ( ++$n >= $nmax ) |
| 123 | + { |
| 124 | + if ( !$$opts{list_var2} ) { print $$x{line}."\n"; } |
| 125 | + last; |
| 126 | + } |
| 127 | + } |
| 128 | + if ( $$opts{list_sites} ) { return; } |
| 129 | + $avg2[0] = 1; |
| 130 | + for (my $i=0; $i<@avg; $i++) |
| 131 | + { |
| 132 | + $avg[$i] = sprintf("%e",$avg[$i]/$n); |
| 133 | + $avg2[$i] = $avg2[$i]/$n - $avg[$i]*$avg[$i]; |
| 134 | + if ( $avg2[$i]<=0 ) |
| 135 | + { |
| 136 | + # yes, it be smaller than zero, machine precision in play when the values are close to zero |
| 137 | + $avg2[$i] = $i>0 ? $avg2[$i-1]/2 : 1; |
| 138 | + } |
| 139 | + if ( !exists($$opts{rand_noise}) && $avg2[$i] < 1e-9 ) |
| 140 | + { |
| 141 | + $avg2[$i] = $avg2[$i-1] * ($i-1) / $i; |
| 142 | + } |
| 143 | + $avg2[$i] = sprintf("%e",$avg2[$i]); |
| 144 | + } |
| 145 | + # make it monotonic |
| 146 | + for (my $i=@avg2-1; $i>0; $i--) |
| 147 | + { |
| 148 | + if ( $avg2[$i-1] < $avg2[$i] ) { $avg2[$i-1] = $avg2[$i]; } |
| 149 | + } |
| 150 | + if ( $$opts{list_var2} ) |
| 151 | + { |
| 152 | + print join("\n",@avg2)."\n"; |
| 153 | + } |
| 154 | + else |
| 155 | + { |
| 156 | + print STDERR "MEAN\t".join(" ",@avg)."\n"; |
| 157 | + print STDERR "VAR2\t".join(" ",@avg2)."\n"; |
| 158 | + } |
| 159 | +} |
| 160 | + |
0 commit comments