Skip to content

Commit 0dff10a

Browse files
authored
Merge pull request #45 from broadinstitute/fn_add_perl_script
Adding kraken output perl scripts
2 parents 486dd5b + 3cbaed6 commit 0dff10a

23 files changed

+964
-1660
lines changed
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
#!/usr/bin/env perl
2+
3+
# Converts LCA output table to kraken output format. Treats paired reads as separate reads.
4+
5+
# Input table is output of retrieve_top_blast_hits_LCA_for_each_sequence.pl, with
6+
# column titles (tab-separated):
7+
# - sequence_name
8+
# - LCA_taxon_id
9+
# - LCA_taxon_rank
10+
# - LCA_taxon_species
11+
# - LCA_taxon_genus
12+
# - LCA_taxon_family
13+
# - evalue_of_top_hits
14+
# - lowest_pident_of_top_hits
15+
# - mean_pident_of_top_hits
16+
# - highest_pident_of_top_hits
17+
# - lowest_qcovs_of_top_hits
18+
# - mean_qcovs_of_top_hits
19+
# - highest_qcovs_of_top_hits
20+
# - number_top_hits
21+
22+
# Kraken output format:
23+
# https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#standard-kraken-output-format
24+
25+
26+
# Usage:
27+
# perl LCA_table_to_kraken_output_format.pl
28+
# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search]
29+
# [fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)]
30+
31+
32+
# Prints to console. To print to file, use
33+
# perl LCA_table_to_kraken_output_format.pl
34+
# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search]
35+
# [fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)]
36+
# > [output kraken format table]
37+
38+
39+
use strict;
40+
use warnings;
41+
42+
43+
my $LCA_matches = $ARGV[0]; # output of retrieve_top_blast_hits_LCA_for_each_sequence.pl
44+
my $fasta_file = $ARGV[1]; # fasta file that was input to blast search (to retrieve sequence lengths and names of unclassified sequences)
45+
46+
my $NO_DATA = "NA";
47+
my $NEWLINE = "\n";
48+
my $DELIMITER = "\t";
49+
50+
my $INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH = 1;
51+
52+
# blast LCA table columns:
53+
my $sequence_name_column = 0;
54+
my $LCA_taxon_id_column = 1;
55+
my $LCA_taxon_rank_column = 2;
56+
my $LCA_taxon_species_column = 3;
57+
my $LCA_taxon_genus_column = 4;
58+
my $LCA_taxon_family_column = 5;
59+
my $LCA_taxon_superkingdom_column = 6;
60+
my $evalue_of_top_hits_column = 7;
61+
my $lowest_pident_of_top_hits_column = 8;
62+
my $mean_pident_of_top_hits_column = 9;
63+
my $highest_pident_of_top_hits_column = 10;
64+
my $lowest_qcovs_of_top_hits_column = 11;
65+
my $mean_qcovs_of_top_hits_column = 12;
66+
my $highest_qcovs_of_top_hits_column = 13;
67+
my $number_top_hits_column = 14;
68+
69+
70+
# reads in sequence names and lengths from fasta file
71+
my %sequence_name_to_length = (); # key: sequence name -> value: length of sequence
72+
my %sequence_name_appears_in_LCA_table = (); # key: sequence name from fasta file -> value: 1 if sequence appears in LCA table, 0 if not
73+
open FASTA_FILE, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n";
74+
my $sequence_name = "";
75+
my $sequence = "";
76+
while(<FASTA_FILE>) # for each line in the file
77+
{
78+
chomp;
79+
if($_ =~ /^>(.*)$/) # header line
80+
{
81+
# processes previous sequence
82+
$sequence_name_to_length{$sequence_name} = sequence_length($sequence);
83+
$sequence_name_appears_in_LCA_table{$sequence_name} = 0;
84+
85+
# starts processing this sequence
86+
$sequence_name = $1;
87+
$sequence = "";
88+
}
89+
elsif($_ =~ /\S/)
90+
{
91+
$sequence .= $_;
92+
}
93+
}
94+
close FASTA_FILE;
95+
if($sequence and $sequence_name)
96+
{
97+
# processes last sequence
98+
$sequence_name_to_length{$sequence_name} = sequence_length($sequence);
99+
$sequence_name_appears_in_LCA_table{$sequence_name} = 0;
100+
}
101+
102+
103+
# reads LCA table and prints rows for reads with hits
104+
my $first_row = 1;
105+
open LCA_MATCHES, "<$LCA_matches" || die "Could not open $LCA_matches to read\n";
106+
while(<LCA_MATCHES>)
107+
{
108+
chomp;
109+
my $line = $_;
110+
if($line =~ /\S/)
111+
{
112+
if($first_row)
113+
{
114+
# ignore header line
115+
$first_row = 0;
116+
}
117+
else
118+
{
119+
# reads in relevant lines in row
120+
my @items = split($DELIMITER, $line);
121+
my $sequence_name = $items[$sequence_name_column];
122+
my $assigned_taxon_id = $items[$LCA_taxon_id_column];
123+
124+
# records that we have seen this sequence
125+
$sequence_name_appears_in_LCA_table{$sequence_name} = 1;
126+
127+
# prints kraken format row for unclassified sequence
128+
# "C"/"U": a one letter code indicating that the sequence was either classified
129+
# or unclassified.
130+
if($assigned_taxon_id == 0)
131+
{
132+
print "U".$DELIMITER;
133+
}
134+
else
135+
{
136+
print "C".$DELIMITER;
137+
}
138+
139+
# The sequence ID, obtained from the FASTA/FASTQ header.
140+
print $sequence_name.$DELIMITER;
141+
142+
# The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence
143+
# is unclassified.
144+
print $assigned_taxon_id.$DELIMITER;
145+
146+
# The length of the sequence in bp. In the case of paired read data, this will be
147+
# a string containing the lengths of the two sequences in bp, separated by a pipe
148+
# character, e.g. "98|94".
149+
print $sequence_name_to_length{$sequence_name}.$DELIMITER;
150+
151+
# A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s)
152+
print $NO_DATA.$NEWLINE;
153+
}
154+
}
155+
}
156+
close LCA_MATCHES;
157+
158+
159+
# prints rows for reads without hits
160+
foreach my $sequence_name(sort keys %sequence_name_appears_in_LCA_table)
161+
{
162+
if($sequence_name and !$sequence_name_appears_in_LCA_table{$sequence_name})
163+
{
164+
# prints kraken format row for unclassified sequence
165+
# "C"/"U": a one letter code indicating that the sequence was either classified
166+
# or unclassified.
167+
print "U".$DELIMITER;
168+
169+
# The sequence ID, obtained from the FASTA/FASTQ header.
170+
print $sequence_name.$DELIMITER;
171+
172+
# The taxonomy ID Kraken 2 used to label the sequence; this is 0 if the sequence
173+
# is unclassified.
174+
print "0".$DELIMITER;
175+
176+
# The length of the sequence in bp. In the case of paired read data, this will be
177+
# a string containing the lengths of the two sequences in bp, separated by a pipe
178+
# character, e.g. "98|94".
179+
print $sequence_name_to_length{$sequence_name}.$DELIMITER;
180+
181+
# A space-delimited list indicating the LCA mapping of each k-mer in the sequence(s)
182+
print $NO_DATA.$NEWLINE;
183+
}
184+
}
185+
186+
187+
# returns sequence length
188+
sub sequence_length
189+
{
190+
my $sequence = $_[0];
191+
192+
# capitalize sequence
193+
$sequence = uc($sequence);
194+
195+
# counts number bases or unambiguous bases in this sequence
196+
my $sequence_length = 0;
197+
foreach my $base(split //, $sequence)
198+
{
199+
if(!$INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH and is_unambiguous_base($base))
200+
{
201+
$sequence_length++;
202+
}
203+
elsif($INCLUDE_AMBIGUOUS_BASES_IN_SEQUENCE_LENGTH and is_base($base))
204+
{
205+
$sequence_length++;
206+
}
207+
}
208+
return $sequence_length;
209+
}
210+
211+
# returns 1 if base is A, T, C, G; returns 0 if not
212+
# input base must be capitalized
213+
sub is_unambiguous_base
214+
{
215+
my $base = $_[0]; # must be capitalized
216+
if($base eq "A" or $base eq "T" or $base eq "C" or $base eq "G")
217+
{
218+
return 1;
219+
}
220+
return 0;
221+
}
222+
223+
# returns 1 if base is not gap, 0 if base is a gap
224+
sub is_base
225+
{
226+
my $base = $_[0];
227+
228+
# empty value
229+
if(!$base)
230+
{
231+
return 0;
232+
}
233+
234+
# only whitespace
235+
if($base !~ /\S/)
236+
{
237+
return 0;
238+
}
239+
240+
# gap
241+
if($base eq "-")
242+
{
243+
return 0;
244+
}
245+
246+
# base
247+
return 1;
248+
}
249+
250+
# September 29, 2023

0 commit comments

Comments
 (0)