Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,14 @@ Optional additional parameters:
|--split-tables | output 2 tables: (1) abundances only at specified rank and (2) taxonomic lineages down to specified rank |
|--counts | output estimated counts rather than relative abundance percentage in combined table. Only includes Emu relative abundance outputs that already have 'estimated counts' |

### Verify Database
The verify_db function can be used to validate a database prior to running emu. This function checks that the input directory contains two files named exactly species_taxid.fasta and taxonomy.tsv, and raises an error if either file is not found. Additionally, it checks for duplicate entries in taxonomy.tsv and raises an error if duplicates are found.

```bash
emu verify_db <directory_path>
```


### System Requirements

All software dependencies are listed in environment.yml. Emu v3.0.0 has been tested on Python v3.7 and used to generate results in manuscript.
Expand Down
94 changes: 64 additions & 30 deletions emu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env python3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# pylint: disable=consider-using-f-string
"""
Expand All @@ -9,6 +9,7 @@ import os
import argparse
import pathlib
import subprocess
from os.path import exists
from sys import stdout
from operator import add, mul
from pathlib import Path
Expand All @@ -22,7 +23,6 @@ from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


# static global variables
CIGAR_OPS = [1, 2, 4, 10]
CIGAR_OPS_ALL = [0, 1, 2, 4]
Expand Down Expand Up @@ -143,7 +143,7 @@ def get_cigar_op_log_probabilities(sam_path):
for i in sorted(zero_locs, reverse=True):
del cigar_stats_primary[i]
n_char = sum(cigar_stats_primary)
return [math.log(x) for x in np.array(cigar_stats_primary)/n_char], zero_locs, \
return [math.log(x) for x in np.array(cigar_stats_primary) / n_char], zero_locs, \
dict_longest_align


Expand All @@ -163,7 +163,7 @@ def compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op, dict_longest_al

ref_name, query_name = alignment.reference_name, alignment.query_name
log_score = sum(list(map(mul, log_p_cigar_op, cigar_stats))) * \
(dict_longest_align[query_name]/align_len)
(dict_longest_align[query_name] / align_len)
species_tid = int(ref_name.split(":")[0])
return log_score, query_name, species_tid

Expand Down Expand Up @@ -192,7 +192,7 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z
cigar_stats = get_align_stats(alignment)
log_score, query_name, species_tid = \
compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op,
dict_longest_align, align_len)
dict_longest_align, align_len)

if query_name not in log_p_rgs:
log_p_rgs[query_name] = ([species_tid], [log_score])
Expand All @@ -217,12 +217,12 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z
del cigar_stats[i]
log_score, query_name, species_tid = \
compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op,
dict_longest_align, align_len)
dict_longest_align, align_len)

if query_name not in log_p_rgs:
log_p_rgs[query_name] = ([species_tid], [log_score])
elif query_name in log_p_rgs and species_tid not in log_p_rgs[query_name][0]:
log_p_rgs[query_name] = (log_p_rgs[query_name][0] +[species_tid],
log_p_rgs[query_name] = (log_p_rgs[query_name][0] + [species_tid],
log_p_rgs[query_name][1] + [log_score])
else:
logprgs_idx = log_p_rgs[query_name][0].index(species_tid)
Expand All @@ -238,8 +238,8 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z

## remove low likelihood alignments?
## remove if p(r|s) < 0.01
#min_p_thresh = math.log(0.01)
#log_p_rgs = {r_map: val for r_map, val in log_p_rgs.items() if val > min_p_thresh}
# min_p_thresh = math.log(0.01)
# log_p_rgs = {r_map: val for r_map, val in log_p_rgs.items() if val > min_p_thresh}
return log_p_rgs, unassigned_count, len(assigned_reads)


Expand Down Expand Up @@ -276,7 +276,7 @@ def expectation_maximization(log_p_rgs, freq):
log_p_rns.append(logprns_val)

if len(valid_seqs) != 0:
#np.array([log_p_rns], dtype=np.float64)
# np.array([log_p_rns], dtype=np.float64)
logc = -np.max(log_p_rns) # calculate fixed multiplier, c
prnsc = np.exp(log_p_rns + logc) # calculates exp(log(L(r|s) * f(s) * c))
prc = np.sum(prnsc) # calculates sum of (L(r|s) * f(s) * c) for each read
Expand Down Expand Up @@ -312,9 +312,9 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr
freq, counter = dict.fromkeys(db_ids, 1 / n_db), 1

# set output abundance threshold
freq_thresh = 1/n_reads
freq_thresh = 1 / n_reads
if n_reads > 1000:
freq_thresh = 10/n_reads
freq_thresh = 10 / n_reads

# performs iterations of the expectation_maximization algorithm
total_log_likelihood = -math.inf
Expand All @@ -336,7 +336,7 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr
if log_likelihood_diff < lli_thresh:
stdout.write("Number of EM iterations: {}\n".format(counter))
freq = {k: v for k, v in freq.items() if v >= freq_thresh}
# remove tax id if less than the frequency threshold
# remove tax id if less than the frequency threshold
freq_full, updated_log_likelihood, p_sgr = expectation_maximization(log_p_rgs, freq)
freq_set_thresh = None
if freq_thresh < input_threshold:
Expand All @@ -345,8 +345,8 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr
expectation_maximization(log_p_rgs, freq)
return freq_full, freq_set_thresh, p_sgr

#output current estimation
#freq_to_lineage_df(freq, f"{out_file}_{counter}", df_nodes, df_names)
# output current estimation
# freq_to_lineage_df(freq, f"{out_file}_{counter}", df_nodes, df_names)
counter += 1


Expand All @@ -368,7 +368,7 @@ def lineage_dict_from_tid(taxid, nodes_dict, names_dict):
while names_dict[taxid] != "root":
tup = nodes_dict[taxid]
# find the name for each taxonomic rank
if tup[1] in RANKS_PRINTOUT: # check rank in printout list
if tup[1] in RANKS_PRINTOUT: # check rank in printout list
idx = RANKS_PRINTOUT.index(tup[1])
lineage_list[idx] = names_dict[taxid]
taxid = tup[0]
Expand All @@ -388,12 +388,12 @@ def freq_to_lineage_df(freq, tsv_output_path, taxonomy_df, assigned_count,
counts(boolean): True if include estimated counts in output .tsv file
returns(df): pandas df with lineage and abundances for values in f
"""
#add tax lineage for values in freq
# add tax lineage for values in freq
results_df = pd.DataFrame(zip(list(freq.keys()) + ['unassigned'],
list(freq.values()) + [0]),
columns=["tax_id", "abundance"]).set_index('tax_id')
results_df = results_df.join(taxonomy_df, how='left').reset_index()
#add in the estimated count values for the assigned and unassigned counts
# add in the estimated count values for the assigned and unassigned counts
if counts:
counts_series = pd.concat([(results_df["abundance"] * assigned_count)[:-1],
pd.Series(unassigned_count)], ignore_index=True)
Expand All @@ -408,7 +408,7 @@ def generate_alignments(in_file_list, out_basename, database):
in_file_list(list(str)): list of path(s) to input sequences
out_basename(str): path and basename for output files
"""
#indicate filepath and create file
# indicate filepath and create file
input_file = " ".join(in_file_list)
filetype = pathlib.PurePath(args.input_file[0]).suffix
if filetype == '.sam':
Expand Down Expand Up @@ -461,7 +461,7 @@ def create_names_dict(names_path):
'name_txt' as values
"""
name_headers = ['tax_id', 'name_txt', 'name_class']
names_df = pd.read_csv(names_path, sep='|', index_col=False, header=None, dtype=str)\
names_df = pd.read_csv(names_path, sep='|', index_col=False, header=None, dtype=str) \
.drop([2, 4], axis=1)
names_df.columns = name_headers
for col in names_df.columns:
Expand Down Expand Up @@ -582,7 +582,7 @@ def build_ncbi_taxonomy(unique_tids, nodes_dict, names_dict, filepath):
"""
with open(filepath, "w", encoding="utf8") as file:
# write the header to the file
dummy_str = '\t'.join(['%s',] * len(RANKS_PRINTOUT)) + '\n'
dummy_str = '\t'.join(['%s', ] * len(RANKS_PRINTOUT)) + '\n'
file.write(dummy_str % tuple(RANKS_PRINTOUT))
# write each lineage as a row in the file
for tid in unique_tids:
Expand All @@ -606,7 +606,7 @@ def build_direct_taxonomy(tid_set, lineage_path, taxonomy_file):
tax_output_file.write(first_line.split("\t", 1)[1])
for line in file:
tax_id = line.split("\t", 1)[0]
if tax_id in tid_set: #only add if taxid is in fasta
if tax_id in tid_set: # only add if taxid is in fasta
tax_output_file.write(line)


Expand Down Expand Up @@ -638,6 +638,7 @@ def collapse_rank(path, rank):
df_emu_copy.to_csv(output_path, sep='\t')
stdout.write("File generated: {}\n".format(output_path))


def combine_outputs(dir_path, rank, split_files=False, count_table=False):
""" Combines multiple Emu output relative abundance tables into a single table.
Inlcudes all .tsv files with 'rel-abundance' in the filename in the provided dir_path.
Expand All @@ -653,7 +654,7 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False):
metric = 'estimated counts'
for file in os.listdir(dir_path):
file_extension = pathlib.Path(file).suffix
if file_extension == '.tsv' and 'rel-abundance' in file:
if file_extension == '.tsv' and 'rel-abundance' in file and not file.startswith('.'):
name = pathlib.Path(file).stem
name = name.replace('_rel-abundance', '')
df_sample = pd.read_csv(os.path.join(dir_path, file), sep='\t', dtype=str)
Expand Down Expand Up @@ -685,12 +686,41 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False):
stdout.write("Combined table generated: {}\n".format(out_path))
return df_combined_full


def find_repeats(path_to_tsv):
    """Return the set of tax_ids that appear more than once in a tsv file.

    :param path_to_tsv: path to a tab-separated file containing a 'tax_id' column
    :return: set of tax_id values occurring two or more times (empty set if none)
    """
    # only the tax_id column is needed, so skip parsing the remaining columns
    df = pd.read_csv(path_to_tsv, sep="\t", usecols=["tax_id"])
    # duplicated() marks every occurrence after the first as True
    boolean_series = df.duplicated(subset=["tax_id"])
    repeats = df["tax_id"][boolean_series]
    return set(repeats)


def verify_db(path_to_db):
    """Validate an Emu database directory prior to running abundance estimation.

    Checks that the directory contains both required files,
    species_taxid.fasta and taxonomy.tsv (with those exact names), then
    checks taxonomy.tsv for duplicate tax_id entries.

    :param path_to_db: path to the directory expected to contain
        species_taxid.fasta and taxonomy.tsv
    :raises FileNotFoundError: if either required file is missing
    :raises ValueError: if taxonomy.tsv contains one or more duplicate tax_ids
    """
    # build paths portably rather than concatenating with a hard-coded "/"
    fasta_path = os.path.join(path_to_db, "species_taxid.fasta")
    taxonomy_path = os.path.join(path_to_db, "taxonomy.tsv")
    # both files must be present under these exact names
    if not (os.path.exists(fasta_path) and os.path.exists(taxonomy_path)):
        raise FileNotFoundError("Missing species_taxid.fasta or taxonomy.tsv")
    # each tax_id may only map to a single taxonomy entry
    repeats = find_repeats(taxonomy_path)
    if repeats:
        raise ValueError("There are one or more duplicate entries in this database:\n"
                         + str(repeats))
    stdout.write("Database looks good!\n")


if __name__ == "__main__":
__version__ = "3.4.5"
parser = argparse.ArgumentParser()
parser.add_argument('--version', '-v', action='version', version='%(prog)s v' + __version__)
subparsers = parser.add_subparsers(dest="subparser_name", help='sub-commands')
abundance_parser = subparsers.\
abundance_parser = subparsers. \
add_parser("abundance", help="Generate relative abundance estimates")
abundance_parser.add_argument(
'input_file', type=str, nargs='+',
Expand Down Expand Up @@ -733,6 +763,9 @@ if __name__ == "__main__":
'--threads', type=int, default=3,
help='threads utilized by minimap [3]')

verify_db_parser = subparsers.add_parser("verify_db", help="Check the DB for duplicates")
verify_db_parser.add_argument('path_to_db', type=str, help='path to the directory containing two files')

build_db_parser = subparsers.add_parser("build-database",
help="Build custom Emu database")
build_db_parser.add_argument(
Expand Down Expand Up @@ -762,7 +795,7 @@ if __name__ == "__main__":
help='collapsed taxonomic rank')

combine_parser = subparsers.add_parser("combine-outputs",
help="Combine Emu rel abundance outputs to a single table")
help="Combine Emu rel abundance outputs to a single table")
combine_parser.add_argument(
'dir_path', type=str,
help='path to directory containing Emu output files')
Expand All @@ -775,8 +808,12 @@ if __name__ == "__main__":
combine_parser.add_argument(
'--counts', action="store_true",
help='counts rather than abundances in output table')

args = parser.parse_args()

if args.subparser_name == "verify_db":
verify_db(args.path_to_db)

if args.subparser_name == "abundance":
# check input file is fasta/q or sam alignment file
# validate_input(args.input_file[0])
Expand All @@ -803,12 +840,11 @@ if __name__ == "__main__":
log_prob_rgs, counts_unassigned, counts_assigned = log_prob_rgs_dict(
SAM_FILE, log_prob_cigar_op, longest_align_dict, locs_p_cigar_zero)
f_full, f_set_thresh, read_dist = expectation_maximization_iterations(log_prob_rgs,
db_species_tids,
.01, args.min_abundance)
db_species_tids,
.01, args.min_abundance)
freq_to_lineage_df(f_full, "{}_rel-abundance".format(out_file), df_taxonomy,
counts_assigned, counts_unassigned, args.keep_counts)


# output read assignment distributions as a tsv
if args.keep_read_assignments:
output_read_assignments(read_dist, "{}_read-assignment-distributions".format(out_file))
Expand All @@ -833,7 +869,6 @@ if __name__ == "__main__":
if os.path.exists(SAM_FILE):
os.remove(SAM_FILE)


# create a custom database with ncbi taxonomy
if args.subparser_name == "build-database":
emu_db_path = os.getcwd()
Expand Down Expand Up @@ -864,7 +899,6 @@ if __name__ == "__main__":
build_direct_taxonomy(db_unique_ids, args.taxonomy_list, output_taxonomy_location)
stdout.write("Database creation successful\n")


# collapse Emu results at desired taxonomic rank
if args.subparser_name == "collapse-taxonomy":
collapse_rank(args.input_path, args.rank)
Expand Down
6 changes: 6 additions & 0 deletions emu_database/.ipynb_checkpoints/verify_db-checkpoint.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"cells": [],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
Loading