Skip to content

Commit 64f63a2

Browse files
authored
Merge pull request #36 from broadinstitute/fn_update_docker
Updating Docker with Lydia's Perl Scripts
2 parents 948a783 + 0a8164f commit 64f63a2

20 files changed

+2711
-2
lines changed

Dockerfile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ ENV VIRAL_CLASSIFY_PATH=$INSTALL_PATH/viral-classify \
66
PATH="$PATH:$MINICONDA_PATH/envs/env2/bin"
77

88
COPY requirements-conda.txt requirements-conda-env2.txt $VIRAL_CLASSIFY_PATH/
9-
109
# install most dependencies to the main environment
1110
RUN $VIRAL_NGS_PATH/docker/install-conda-dependencies.sh $VIRAL_CLASSIFY_PATH/requirements-conda.txt
1211

@@ -22,7 +21,7 @@ RUN CONDA_PREFIX="$MINICONDA_PATH/envs/env2"; \
2221
COPY . $VIRAL_CLASSIFY_PATH
2322

2423
# Link key bits of python code into the path
25-
RUN ln -s $VIRAL_CLASSIFY_PATH/metagenomics.py $VIRAL_CLASSIFY_PATH/taxon_filter.py $VIRAL_CLASSIFY_PATH/kmer_utils.py $VIRAL_CLASSIFY_PATH/classify $VIRAL_NGS_PATH
24+
RUN ln -s $VIRAL_CLASSIFY_PATH/taxon_id_scripts/* $VIRAL_CLASSIFY_PATH/metagenomics.py $VIRAL_CLASSIFY_PATH/taxon_filter.py $VIRAL_CLASSIFY_PATH/kmer_utils.py $VIRAL_CLASSIFY_PATH/classify $VIRAL_NGS_PATH
2625

2726
# This not only prints the current version string, but it
2827
# also saves it to the VERSION file for later use and also
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
#!/usr/bin/env perl
2+
3+
# Reads in column containing taxon id and adds column containing the superkingdom of
4+
# that taxon id.
5+
6+
# Usage:
7+
# perl add_column_with_superkingdom_of_taxon_id.pl [table]
8+
# [title of column containing taxon ids] [nodes.dmp file from NCBI]
9+
10+
# Prints to console. To print to file, use
11+
# perl add_column_with_superkingdom_of_taxon_id.pl [table]
12+
# [title of column containing taxon ids] [nodes.dmp file from NCBI] > [output table path]
13+
14+
15+
use strict;
16+
use warnings;
17+
18+
19+
my $table = $ARGV[0];
20+
my $taxon_id_column_title = $ARGV[1];
21+
my $nodes_file = $ARGV[2]; # nodes.dmp file from NCBI: ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz
22+
23+
24+
my $NO_DATA = "NA";
25+
my $NEWLINE = "\n";
26+
my $DELIMITER = "\t";
27+
my $TAXONDUMP_DELIMITER = "\t[|]\t"; # nodes.dmp and names.dmp
28+
29+
my $SUPERKINGDOM_COLUMN_TITLE = "superkingdom"; # column to add
30+
31+
# superkingdoms
32+
my %TAXON_ID_TO_SUPERKINGDOM = (
33+
2157 => "Archaea",
34+
2 => "Bacteria",
35+
2759 => "Eukaryota",
36+
10239 => "Viruses");
37+
my $ROOT_TAXON_ID = 1;
38+
39+
# nodes.dmp and names.dmp
40+
my $TAXONID_COLUMN = 0; # both
41+
my $PARENTID_COLUMN = 1; # nodes.dmp
42+
my $RANK_COLUMN = 2; # nodes.dmp
43+
my $NAMES_COLUMN = 1; # names.dmp
44+
my $NAME_TYPE_COLUMN = 3; # names.dmp
45+
46+
47+
# verifies that all input files exist and are non-empty
48+
if(!$nodes_file or !-e $nodes_file or -z $nodes_file)
49+
{
50+
print STDERR "Error: nodes.dmp file not provided, does not exist, or empty:\n\t"
51+
.$nodes_file."\nExiting.\n";
52+
die;
53+
}
54+
if(!$table or !-e $table or -z $table)
55+
{
56+
print STDERR "Error: input table not provided, does not exist, or is empty:\n\t"
57+
.$table."\nExiting.\n";
58+
die;
59+
}
60+
61+
62+
# reads in nodes file
63+
my %taxonid_to_parent = (); # key: taxon id -> value: taxon id of parent taxon
64+
my %taxonid_to_rank = (); # key: taxon id -> value: rank of taxon
65+
open NODES_FILE, "<$nodes_file" || die "Could not open $nodes_file to read\n";
66+
while(<NODES_FILE>)
67+
{
68+
chomp;
69+
if($_ =~ /\S/)
70+
{
71+
my @items = split($TAXONDUMP_DELIMITER, $_);
72+
my $taxonid = $items[$TAXONID_COLUMN];
73+
my $parent_taxonid = $items[$PARENTID_COLUMN];
74+
my $rank = $items[$RANK_COLUMN];
75+
76+
$taxonid_to_parent{$taxonid} = $parent_taxonid;
77+
$taxonid_to_rank{$taxonid} = $rank;
78+
}
79+
}
80+
close NODES_FILE;
81+
82+
83+
# reads in taxon id column of table and adds superkingdom column
84+
my $first_line = 1;
85+
my $taxon_id_column = -1;
86+
open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
87+
while(<TABLE>) # for each row in the file
88+
{
89+
chomp;
90+
my $line = $_;
91+
if($line =~ /\S/) # if row not empty
92+
{
93+
my @items_in_line = split($DELIMITER, $line, -1);
94+
if($first_line) # column titles
95+
{
96+
# identifies column to merge by and columns to save
97+
my $column = 0;
98+
foreach my $column_title(@items_in_line)
99+
{
100+
if(defined $column_title and $column_title eq $taxon_id_column_title)
101+
{
102+
if($taxon_id_column != -1)
103+
{
104+
print STDERR "Warning: column title ".$taxon_id_column_title
105+
." appears more than once in input table:\n\t".$table."\n";
106+
}
107+
$taxon_id_column = $column;
108+
}
109+
$column++;
110+
}
111+
112+
# verifies that we have found column to merge by
113+
if($taxon_id_column == -1)
114+
{
115+
print STDERR "Warning: column title ".$taxon_id_column_title
116+
." not found in input table:\n\t".$table."\nExiting.\n";
117+
die;
118+
}
119+
$first_line = 0; # next line is not column titles
120+
121+
# prints line as is
122+
print $line;
123+
124+
# prints titles of new superkingdom column
125+
print $DELIMITER.$SUPERKINGDOM_COLUMN_TITLE.$NEWLINE;
126+
}
127+
else # column values (not column titles)
128+
{
129+
# retrieves taxon id
130+
my $taxon_id = $items_in_line[$taxon_id_column];
131+
132+
# retrieves superkingdom
133+
my $superkingdom = $NO_DATA;
134+
my $ancestor_taxon_id = $taxon_id;
135+
while($superkingdom eq $NO_DATA
136+
and $taxonid_to_parent{$ancestor_taxon_id}
137+
and $ancestor_taxon_id ne $taxonid_to_parent{$ancestor_taxon_id}
138+
and $ancestor_taxon_id ne $ROOT_TAXON_ID)
139+
{
140+
if($TAXON_ID_TO_SUPERKINGDOM{$ancestor_taxon_id})
141+
{
142+
$superkingdom = $TAXON_ID_TO_SUPERKINGDOM{$ancestor_taxon_id};
143+
}
144+
$ancestor_taxon_id = $taxonid_to_parent{$ancestor_taxon_id}
145+
}
146+
147+
# prints line as is
148+
print $line;
149+
150+
# prints superkingdom column value
151+
print $DELIMITER.$superkingdom.$NEWLINE;
152+
}
153+
}
154+
}
155+
close TABLE;
156+
157+
158+
# April 4, 2023
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#!/usr/bin/env perl
2+
3+
# Adds column with specified title and specified value for all values.
4+
5+
# Usage:
6+
# perl add_one_value_column.pl [table to add column to] "[title of column to add]"
7+
# "[value of column to add]"
8+
9+
# Prints to console. To print to file, use
10+
# perl add_one_value_column.pl [table to add column to] "[title of column to add]"
11+
# "[value of column to add]" > [output table path]
12+
13+
14+
use strict;
15+
use warnings;
16+
17+
18+
my $table = $ARGV[0];
19+
my $title_of_column_to_add = $ARGV[1];
20+
my $value_of_column_to_add = $ARGV[2];
21+
22+
my $NEWLINE = "\n";
23+
my $DELIMITER = "\t";
24+
25+
26+
# verifies that input table exists and is not empty
27+
if(!$table or !-e $table or -z $table)
28+
{
29+
print STDERR "Error: table to add column to not provided, does not exist, or empty:\n\t"
30+
.$table."\nExiting.\n";
31+
die;
32+
}
33+
34+
35+
# reads in and adds column to table to add columns to
36+
my $first_line = 1;
37+
open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
38+
while(<TABLE>) # for each row in the file
39+
{
40+
chomp;
41+
my $line = $_;
42+
if($line =~ /\S/) # if row not empty
43+
{
44+
if($first_line) # column titles
45+
{
46+
# prints line as is
47+
print $line;
48+
49+
# prints title of new column
50+
print $DELIMITER;
51+
print $title_of_column_to_add;
52+
print $NEWLINE;
53+
54+
$first_line = 0;
55+
}
56+
else # column values (not column titles)
57+
{
58+
# prints line as is
59+
print $line;
60+
61+
# prints value of new column
62+
print $DELIMITER;
63+
print $value_of_column_to_add;
64+
print $NEWLINE;
65+
}
66+
}
67+
}
68+
close TABLE;
69+
70+
71+
# September 26, 2021
72+
# November 8, 2021
Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,91 @@
1+
#!/usr/bin/env perl
2+
3+
# Given a list of accession numbers, one per line, downloads and prints fasta sequences
4+
# corresponding to each accession number.
5+
6+
# Based on option 2 in https://edwards.flinders.edu.au/ncbi-sequence-or-fasta-batch-download-using-entrez/
7+
8+
9+
# Usage:
10+
# perl download_fasta_sequences_from_accession_numbers.pl
11+
# [path of file with list of accession numbers, one per line]
12+
# [database (nucleotide by default)]
13+
14+
# Prints to console. To print to file, use
15+
# perl download_fasta_sequences_from_accession_numbers.pl
16+
# [path of file with list of accession numbers, one per line]
17+
# [database (nucleotide by default)] > [output fasta file path]
18+
19+
use strict;
20+
use warnings;
21+
22+
23+
my $accession_numbers_file = $ARGV[0]; # list of accession numbers, one per line
24+
my $database = $ARGV[1]; # nucleotide by default
25+
if(!$database)
26+
{
27+
$database = "nucleotide";
28+
}
29+
30+
31+
my $NEWLINE = "\n";
32+
my $MAXIMUM_NUMBER_ACCESSION_NUMBERS_IN_ONE_URL = 400;
33+
my $TEMP_FILE_EXTENSION = "_temp.txt";
34+
35+
36+
# reads in accession numbers and retrieves corresponding fasta sequences
37+
my @accession_numbers_lists = ();
38+
my $current_accession_numbers_command_list = "";
39+
my $current_number_accession_numbers = 0;
40+
open ACCESSION_NUMBERS, "<$accession_numbers_file" || die "Could not open $accession_numbers_file to read\n";
41+
while(<ACCESSION_NUMBERS>)
42+
{
43+
chomp;
44+
if($_ =~ /\S/)
45+
{
46+
my $accession_number = $_;
47+
if($current_accession_numbers_command_list)
48+
{
49+
$current_accession_numbers_command_list .= ",";
50+
}
51+
$current_accession_numbers_command_list .= $accession_number;
52+
$current_number_accession_numbers++;
53+
54+
if($current_number_accession_numbers >= $MAXIMUM_NUMBER_ACCESSION_NUMBERS_IN_ONE_URL)
55+
{
56+
push(@accession_numbers_lists, $current_accession_numbers_command_list);
57+
$current_number_accession_numbers = 0;
58+
$current_accession_numbers_command_list = "";
59+
}
60+
}
61+
}
62+
close ACCESSION_NUMBERS;
63+
push(@accession_numbers_lists, $current_accession_numbers_command_list);
64+
65+
66+
# builds and runs command to download fasta file
67+
# example URL: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&retmode=text&id=D90600.1
68+
my $temp_file = $accession_numbers_file.$TEMP_FILE_EXTENSION;
69+
foreach my $accession_numbers_list(@accession_numbers_lists)
70+
{
71+
# print STDERR $accession_numbers_list."\n";
72+
my $command = "curl https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db="
73+
.$database."\\&rettype=fasta\\&retmode=text\\&id=".$accession_numbers_list; # ." > ".$output_fasta;
74+
`$command > $temp_file`;
75+
76+
open FASTA_SEQUENCES, "<$temp_file" || die "Could not open $temp_file to read\n";
77+
while(<FASTA_SEQUENCES>)
78+
{
79+
chomp;
80+
if($_ =~ /\S/)
81+
{
82+
print $_.$NEWLINE;
83+
}
84+
}
85+
close FASTA_SEQUENCES;
86+
}
87+
`rm $temp_file`;
88+
89+
90+
# December 27, 2022
91+
# December 29, 2022

0 commit comments

Comments
 (0)