diff --git a/README.md b/README.md index a5fda68..0f6597b 100644 --- a/README.md +++ b/README.md @@ -206,6 +206,14 @@ Optional additional parameters: |--split-tables | output 2 tables: (1) abundances only at specified rank and (2) taxonomic lineages down to specified rank | |--counts | output estimated counts rather than relative abundance percentage in combined table. Only includes Emu relative abundance outputs that already have 'estimated counts' | +### Verify Database +The verify_db function can be used to validate a database prior to running emu. This function checks if there are two files species_taxid.fasta and taxonomy.tsv in the input directory (with exactly those names) and raises an error if either file is not found. Additionally, it checks if there are duplicate entries in taxonomy.tsv and raises an error if duplicates are found. + +```bash +emu verify_db <path_to_db> +``` + + ### System Requirements All software dependencies are listed in environment.yml. Emu v3.0.0 has been tested on Python v3.7 and used to generate results in manuscript. 
diff --git a/emu b/emu index c26cb31..0babd25 100755 --- a/emu +++ b/emu @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- # pylint: disable=consider-using-f-string """ @@ -9,6 +9,7 @@ import os import argparse import pathlib import subprocess +from os.path import exists from sys import stdout from operator import add, mul from pathlib import Path @@ -22,7 +23,6 @@ from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord - # static global variables CIGAR_OPS = [1, 2, 4, 10] CIGAR_OPS_ALL = [0, 1, 2, 4] @@ -143,7 +143,7 @@ def get_cigar_op_log_probabilities(sam_path): for i in sorted(zero_locs, reverse=True): del cigar_stats_primary[i] n_char = sum(cigar_stats_primary) - return [math.log(x) for x in np.array(cigar_stats_primary)/n_char], zero_locs, \ + return [math.log(x) for x in np.array(cigar_stats_primary) / n_char], zero_locs, \ dict_longest_align @@ -163,7 +163,7 @@ def compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op, dict_longest_al ref_name, query_name = alignment.reference_name, alignment.query_name log_score = sum(list(map(mul, log_p_cigar_op, cigar_stats))) * \ - (dict_longest_align[query_name]/align_len) + (dict_longest_align[query_name] / align_len) species_tid = int(ref_name.split(":")[0]) return log_score, query_name, species_tid @@ -192,7 +192,7 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z cigar_stats = get_align_stats(alignment) log_score, query_name, species_tid = \ compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op, - dict_longest_align, align_len) + dict_longest_align, align_len) if query_name not in log_p_rgs: log_p_rgs[query_name] = ([species_tid], [log_score]) @@ -217,12 +217,12 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z del cigar_stats[i] log_score, query_name, species_tid = \ compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op, - dict_longest_align, align_len) + 
dict_longest_align, align_len) if query_name not in log_p_rgs: log_p_rgs[query_name] = ([species_tid], [log_score]) elif query_name in log_p_rgs and species_tid not in log_p_rgs[query_name][0]: - log_p_rgs[query_name] = (log_p_rgs[query_name][0] +[species_tid], + log_p_rgs[query_name] = (log_p_rgs[query_name][0] + [species_tid], log_p_rgs[query_name][1] + [log_score]) else: logprgs_idx = log_p_rgs[query_name][0].index(species_tid) @@ -238,8 +238,8 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z ## remove low likelihood alignments? ## remove if p(r|s) < 0.01 - #min_p_thresh = math.log(0.01) - #log_p_rgs = {r_map: val for r_map, val in log_p_rgs.items() if val > min_p_thresh} + # min_p_thresh = math.log(0.01) + # log_p_rgs = {r_map: val for r_map, val in log_p_rgs.items() if val > min_p_thresh} return log_p_rgs, unassigned_count, len(assigned_reads) @@ -276,7 +276,7 @@ def expectation_maximization(log_p_rgs, freq): log_p_rns.append(logprns_val) if len(valid_seqs) != 0: - #np.array([log_p_rns], dtype=np.float64) + # np.array([log_p_rns], dtype=np.float64) logc = -np.max(log_p_rns) # calculate fixed multiplier, c prnsc = np.exp(log_p_rns + logc) # calculates exp(log(L(r|s) * f(s) * c)) prc = np.sum(prnsc) # calculates sum of (L(r|s) * f(s) * c) for each read @@ -312,9 +312,9 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr freq, counter = dict.fromkeys(db_ids, 1 / n_db), 1 # set output abundance threshold - freq_thresh = 1/n_reads + freq_thresh = 1 / n_reads if n_reads > 1000: - freq_thresh = 10/n_reads + freq_thresh = 10 / n_reads # performs iterations of the expectation_maximization algorithm total_log_likelihood = -math.inf @@ -336,7 +336,7 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr if log_likelihood_diff < lli_thresh: stdout.write("Number of EM iterations: {}\n".format(counter)) freq = {k: v for k, v in freq.items() if v >= freq_thresh} - # remove tax id 
if less than the frequency threshold + # remove tax id if less than the frequency threshold freq_full, updated_log_likelihood, p_sgr = expectation_maximization(log_p_rgs, freq) freq_set_thresh = None if freq_thresh < input_threshold: @@ -345,8 +345,8 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr expectation_maximization(log_p_rgs, freq) return freq_full, freq_set_thresh, p_sgr - #output current estimation - #freq_to_lineage_df(freq, f"{out_file}_{counter}", df_nodes, df_names) + # output current estimation + # freq_to_lineage_df(freq, f"{out_file}_{counter}", df_nodes, df_names) counter += 1 @@ -368,7 +368,7 @@ def lineage_dict_from_tid(taxid, nodes_dict, names_dict): while names_dict[taxid] != "root": tup = nodes_dict[taxid] # find the name for each taxonomic rank - if tup[1] in RANKS_PRINTOUT: # check rank in printout list + if tup[1] in RANKS_PRINTOUT: # check rank in printout list idx = RANKS_PRINTOUT.index(tup[1]) lineage_list[idx] = names_dict[taxid] taxid = tup[0] @@ -388,12 +388,12 @@ def freq_to_lineage_df(freq, tsv_output_path, taxonomy_df, assigned_count, counts(boolean): True if include estimated counts in output .tsv file returns(df): pandas df with lineage and abundances for values in f """ - #add tax lineage for values in freq + # add tax lineage for values in freq results_df = pd.DataFrame(zip(list(freq.keys()) + ['unassigned'], list(freq.values()) + [0]), columns=["tax_id", "abundance"]).set_index('tax_id') results_df = results_df.join(taxonomy_df, how='left').reset_index() - #add in the estimated count values for the assigned and unassigned counts + # add in the estimated count values for the assigned and unassigned counts if counts: counts_series = pd.concat([(results_df["abundance"] * assigned_count)[:-1], pd.Series(unassigned_count)], ignore_index=True) @@ -408,7 +408,7 @@ def generate_alignments(in_file_list, out_basename, database): in_file_list(list(str)): list of path(s) to input sequences 
out_basename(str): path and basename for output files """ - #indicate filepath and create file + # indicate filepath and create file input_file = " ".join(in_file_list) filetype = pathlib.PurePath(args.input_file[0]).suffix if filetype == '.sam': @@ -461,7 +461,7 @@ def create_names_dict(names_path): 'name_txt' as values """ name_headers = ['tax_id', 'name_txt', 'name_class'] - names_df = pd.read_csv(names_path, sep='|', index_col=False, header=None, dtype=str)\ + names_df = pd.read_csv(names_path, sep='|', index_col=False, header=None, dtype=str) \ .drop([2, 4], axis=1) names_df.columns = name_headers for col in names_df.columns: @@ -582,7 +582,7 @@ def build_ncbi_taxonomy(unique_tids, nodes_dict, names_dict, filepath): """ with open(filepath, "w", encoding="utf8") as file: # write the header to the file - dummy_str = '\t'.join(['%s',] * len(RANKS_PRINTOUT)) + '\n' + dummy_str = '\t'.join(['%s', ] * len(RANKS_PRINTOUT)) + '\n' file.write(dummy_str % tuple(RANKS_PRINTOUT)) # write each lineage as a row in the file for tid in unique_tids: @@ -606,7 +606,7 @@ def build_direct_taxonomy(tid_set, lineage_path, taxonomy_file): tax_output_file.write(first_line.split("\t", 1)[1]) for line in file: tax_id = line.split("\t", 1)[0] - if tax_id in tid_set: #only add if taxid is in fasta + if tax_id in tid_set: # only add if taxid is in fasta tax_output_file.write(line) @@ -638,6 +638,7 @@ def collapse_rank(path, rank): df_emu_copy.to_csv(output_path, sep='\t') stdout.write("File generated: {}\n".format(output_path)) + def combine_outputs(dir_path, rank, split_files=False, count_table=False): """ Combines multiple Emu output relative abundance tables into a single table. Inlcudes all .tsv files with 'rel-abundance' in the filename in the provided dir_path. 
@@ -653,7 +654,7 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False): metric = 'estimated counts' for file in os.listdir(dir_path): file_extension = pathlib.Path(file).suffix - if file_extension == '.tsv' and 'rel-abundance' in file: + if file_extension == '.tsv' and 'rel-abundance' in file and not file.startswith('.'): name = pathlib.Path(file).stem name = name.replace('_rel-abundance', '') df_sample = pd.read_csv(os.path.join(dir_path, file), sep='\t', dtype=str) @@ -685,12 +686,41 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False): stdout.write("Combined table generated: {}\n".format(out_path)) return df_combined_full + +def find_repeats(path_to_tsv): + """ + Given a tsv file, return a set of duplicate taxids. + :param path_to_tsv: the path to the tsv file. + """ + df = pd.read_csv(path_to_tsv, sep="\t") + boolean_series = df.duplicated(subset=["tax_id"]) + repeats = df["tax_id"][boolean_series] + return set(repeats) + + +def verify_db(path_to_db): + """ + Verify if there are two files species_taxid.fasta and taxonomy.tsv in the input directory (with the exact same name). + Raise an error if either of the two files is missing. + Then verify if there are duplicate entries in taxonomy.tsv. Raise an error if there are duplicates. 
+ :param path_to_db: the path to the directory that's supposed to contain species_taxid.fasta and taxonomy.tsv + """ + # Check whether both species_taxid.fasta and taxonomy.tsv are present + if not (exists(path_to_db + "/" + "species_taxid.fasta") and exists(path_to_db + "/" + "taxonomy.tsv")): + raise FileNotFoundError("Missing species_taxid.fasta or taxonomy.tsv") + # Check for duplicates in taxonomy.tsv + repeats = find_repeats(path_to_db + "/" + "taxonomy.tsv") + if len(repeats) > 0: + raise ValueError("There are one or more duplicate entries in this database:\n" + str(repeats)) + print("Database looks good!") + + if __name__ == "__main__": __version__ = "3.4.5" parser = argparse.ArgumentParser() parser.add_argument('--version', '-v', action='version', version='%(prog)s v' + __version__) subparsers = parser.add_subparsers(dest="subparser_name", help='sub-commands') - abundance_parser = subparsers.\ + abundance_parser = subparsers. \ add_parser("abundance", help="Generate relative abundance estimates") abundance_parser.add_argument( 'input_file', type=str, nargs='+', @@ -733,6 +763,9 @@ if __name__ == "__main__": '--threads', type=int, default=3, help='threads utilized by minimap [3]') + verify_db_parser = subparsers.add_parser("verify_db", help="Check the DB for duplicates") + verify_db_parser.add_argument('path_to_db', type=str, help='path to the directory containing two files') + build_db_parser = subparsers.add_parser("build-database", help="Build custom Emu database") build_db_parser.add_argument( @@ -762,7 +795,7 @@ if __name__ == "__main__": help='collapsed taxonomic rank') combine_parser = subparsers.add_parser("combine-outputs", - help="Combine Emu rel abundance outputs to a single table") + help="Combine Emu rel abundance outputs to a single table") combine_parser.add_argument( 'dir_path', type=str, help='path to directory containing Emu output files') @@ -775,8 +808,12 @@ if __name__ == "__main__": combine_parser.add_argument( '--counts', 
action="store_true", help='counts rather than abundances in output table') + args = parser.parse_args() + if args.subparser_name == "verify_db": + verify_db(args.path_to_db) + if args.subparser_name == "abundance": # check input file is fasta/q or sam alignment file # validate_input(args.input_file[0]) @@ -803,12 +840,11 @@ if __name__ == "__main__": log_prob_rgs, counts_unassigned, counts_assigned = log_prob_rgs_dict( SAM_FILE, log_prob_cigar_op, longest_align_dict, locs_p_cigar_zero) f_full, f_set_thresh, read_dist = expectation_maximization_iterations(log_prob_rgs, - db_species_tids, - .01, args.min_abundance) + db_species_tids, + .01, args.min_abundance) freq_to_lineage_df(f_full, "{}_rel-abundance".format(out_file), df_taxonomy, counts_assigned, counts_unassigned, args.keep_counts) - # output read assignment distributions as a tsv if args.keep_read_assignments: output_read_assignments(read_dist, "{}_read-assignment-distributions".format(out_file)) @@ -833,7 +869,6 @@ if __name__ == "__main__": if os.path.exists(SAM_FILE): os.remove(SAM_FILE) - # create a custom database with ncbi taxonomy if args.subparser_name == "build-database": emu_db_path = os.getcwd() @@ -864,7 +899,6 @@ if __name__ == "__main__": build_direct_taxonomy(db_unique_ids, args.taxonomy_list, output_taxonomy_location) stdout.write("Database creation successful\n") - # collapse Emu results at desired taxonomic rank if args.subparser_name == "collapse-taxonomy": collapse_rank(args.input_path, args.rank) diff --git a/emu_database/.ipynb_checkpoints/verify_db-checkpoint.ipynb b/emu_database/.ipynb_checkpoints/verify_db-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/emu_database/.ipynb_checkpoints/verify_db-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/emu_database/verify_db.ipynb b/emu_database/verify_db.ipynb new file mode 100644 index 0000000..8b696c1 --- /dev/null +++ 
b/emu_database/verify_db.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "6c044dc2", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "87cabaca", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"/Users/hubingbing/Downloads/Duplicate_taxid/taxonomy.tsv\", sep=\"\\t\") " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "17d0da19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tax_idspeciesgenusfamilyorderclassphylumcladesuperkingdomsubspeciesspecies subgroupspecies group
0622671Desulfobotulus alkaliphilusDesulfobotulusDesulfobacteraceaeDesulfobacteralesDeltaproteobacteriaProteobacteriaNaNBacteriaNaNNaNNaN
11338382Desulfitobacterium sp. PRDesulfitobacteriumDesulfitobacteriaceaeEubacterialesClostridiaFirmicutesTerrabacteria groupBacteriaNaNNaNNaN
21610527Devosia honganensisDevosiaDevosiaceaeHyphomicrobialesAlphaproteobacteriaProteobacteriaNaNBacteriaNaNNaNNaN
3426706Desulfomicrobium aestuariiDesulfomicrobiumDesulfomicrobiaceaeDesulfovibrionalesDeltaproteobacteriaProteobacteriaNaNBacteriaNaNNaNNaN
481459Entomoplasma melaleucaeEntomoplasmaEntomoplasmataceaeEntomoplasmatalesMollicutesTenericutesTerrabacteria groupBacteriaNaNNaNNaN
.......................................
47558171390Chroococcidiopsis sp. BB96.1ChroococcidiopsisChroococcidiopsidaceaeChroococcidiopsidalesNaNCyanobacteriaTerrabacteria groupBacteriaNaNNaNNaN
47559171290Mycoplasma oxoniensisMycoplasmaMycoplasmataceaeMycoplasmatalesMollicutesTenericutesTerrabacteria groupBacteriaNaNNaNNaN
47560171388Chroococcidiopsis sp. BB82.3ChroococcidiopsisChroococcidiopsidaceaeChroococcidiopsidalesNaNCyanobacteriaTerrabacteria groupBacteriaNaNNaNNaN
47561171387Chroococcidiopsis sp. BB79.2ChroococcidiopsisChroococcidiopsidaceaeChroococcidiopsidalesNaNCyanobacteriaTerrabacteria groupBacteriaNaNNaNNaN
47562171395Fischerella sp. BB98.1FischerellaHapalosiphonaceaeNostocalesNaNCyanobacteriaTerrabacteria groupBacteriaNaNNaNNaN
\n", + "

47563 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " tax_id species genus \\\n", + "0 622671 Desulfobotulus alkaliphilus Desulfobotulus \n", + "1 1338382 Desulfitobacterium sp. PR Desulfitobacterium \n", + "2 1610527 Devosia honganensis Devosia \n", + "3 426706 Desulfomicrobium aestuarii Desulfomicrobium \n", + "4 81459 Entomoplasma melaleucae Entomoplasma \n", + "... ... ... ... \n", + "47558 171390 Chroococcidiopsis sp. BB96.1 Chroococcidiopsis \n", + "47559 171290 Mycoplasma oxoniensis Mycoplasma \n", + "47560 171388 Chroococcidiopsis sp. BB82.3 Chroococcidiopsis \n", + "47561 171387 Chroococcidiopsis sp. BB79.2 Chroococcidiopsis \n", + "47562 171395 Fischerella sp. BB98.1 Fischerella \n", + "\n", + " family order class \\\n", + "0 Desulfobacteraceae Desulfobacterales Deltaproteobacteria \n", + "1 Desulfitobacteriaceae Eubacteriales Clostridia \n", + "2 Devosiaceae Hyphomicrobiales Alphaproteobacteria \n", + "3 Desulfomicrobiaceae Desulfovibrionales Deltaproteobacteria \n", + "4 Entomoplasmataceae Entomoplasmatales Mollicutes \n", + "... ... ... ... \n", + "47558 Chroococcidiopsidaceae Chroococcidiopsidales NaN \n", + "47559 Mycoplasmataceae Mycoplasmatales Mollicutes \n", + "47560 Chroococcidiopsidaceae Chroococcidiopsidales NaN \n", + "47561 Chroococcidiopsidaceae Chroococcidiopsidales NaN \n", + "47562 Hapalosiphonaceae Nostocales NaN \n", + "\n", + " phylum clade superkingdom subspecies \\\n", + "0 Proteobacteria NaN Bacteria NaN \n", + "1 Firmicutes Terrabacteria group Bacteria NaN \n", + "2 Proteobacteria NaN Bacteria NaN \n", + "3 Proteobacteria NaN Bacteria NaN \n", + "4 Tenericutes Terrabacteria group Bacteria NaN \n", + "... ... ... ... ... 
\n", + "47558 Cyanobacteria Terrabacteria group Bacteria NaN \n", + "47559 Tenericutes Terrabacteria group Bacteria NaN \n", + "47560 Cyanobacteria Terrabacteria group Bacteria NaN \n", + "47561 Cyanobacteria Terrabacteria group Bacteria NaN \n", + "47562 Cyanobacteria Terrabacteria group Bacteria NaN \n", + "\n", + " species subgroup species group \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "47558 NaN NaN \n", + "47559 NaN NaN \n", + "47560 NaN NaN \n", + "47561 NaN NaN \n", + "47562 NaN NaN \n", + "\n", + "[47563 rows x 12 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "7aa22f8c", + "metadata": {}, + "outputs": [], + "source": [ + "boolean_series = df.duplicated(subset=[\"tax_id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "319879be", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "repeats = df[\"tax_id\"][boolean_series]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "22c37a16", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "246\n" + ] + } + ], + "source": [ + "print(repeats.size)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "f6b3c696", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "print(type(repeats))" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "c663d580", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "897 1428\n", + "4063 623\n", + "8264 1894\n", + "8641 1245\n", + "15318 358\n", + " ... 
\n", + "47535 1534743\n", + "47541 2083055\n", + "47542 142651\n", + "47543 2083054\n", + "47544 2122\n", + "Name: tax_id, Length: 246, dtype: int64" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "repeats" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "eb13749a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1428, 623, 1894, 1245, 358, 13687, 1883, 1886637, 584, 307643, 1532905, 1696, 531938, 714, 47760, 1883, 53445, 332101, 286, 298, 395600, 698769, 1089306, 1841, 32008, 2056873, 1961, 67384, 32046, 1883, 84567, 71190, 1649279, 476, 51669, 394095, 2236, 413401, 496866, 157076, 330922, 512649, 930805, 371036, 727, 1921578, 577492, 1074311, 317, 535712, 81034, 43997, 28189, 1486246, 53462, 2093, 691, 44249, 306, 813, 381300, 2093, 669, 223527, 2093, 1458492, 2093, 2093, 1882268, 316, 1935978, 76832, 256, 669, 1283291, 33, 83456, 77097, 1917449, 316, 1126240, 29284, 65093, 261685, 395935, 452084, 215690, 1416776, 502829, 669, 669, 669, 44933, 370769, 1534743, 588898, 2337, 2093, 2746, 35622, 286, 1827300, 1508420, 1508419, 2093, 44249, 2104, 1028989, 483539, 2109, 158080, 1278643, 669, 2093, 2093, 76630, 1346688, 1636610, 669, 1278643, 376489, 114886, 2093, 2746, 2109, 2054919, 1338689, 1278643, 2746, 1338689, 52021, 370323, 2093, 370769, 286, 52021, 444459, 413816, 376489, 253237, 413815, 1534743, 2067572, 2067572, 146785, 2336, 691825, 150396, 2093, 2104, 29557, 2054919, 1636610, 2093, 2746, 669, 2083054, 368454, 1338689, 2083054, 146785, 245294, 2093, 380547, 142651, 425309, 669, 353800, 29561, 171632, 520762, 2067572, 224312, 224312, 1500686, 329163, 28227, 1534743, 2109, 1278643, 2161724, 669, 2109, 370769, 52021, 142651, 64713, 669, 650850, 370323, 691825, 370323, 118675, 353800, 29557, 2109, 245294, 114885, 370769, 1500686, 150396, 392421, 2746, 1547897, 872983, 2746, 44933, 2054919, 376489, 2336, 
2746, 1827300, 392421, 35622, 1500686, 29283, 337243, 2746, 1534743, 368454, 337243, 2083055, 48003, 2337, 286, 2746, 2107, 1338689, 171632, 416585, 2109, 44933, 1534743, 28227, 1383851, 64713, 353800, 50740, 1534743, 1500687, 380547, 1534743, 2083055, 142651, 2083054, 2122\n" + ] + } + ], + "source": [ + "repeated_id_str = \"\"\n", + "for i in repeats:\n", + " repeated_id_str += str(i)+\", \"\n", + "print(repeated_id_str[:-2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/emu_default 2.tar.gz b/emu_default 2.tar.gz new file mode 100644 index 0000000..159c446 Binary files /dev/null and b/emu_default 2.tar.gz differ diff --git a/verify_DB.py b/verify_DB.py new file mode 100644 index 0000000..f8f689a --- /dev/null +++ b/verify_DB.py @@ -0,0 +1,21 @@ +from os.path import exists +import pandas as pd + + +def find_repeats(path_to_tsv): + df = pd.read_csv(path_to_tsv, sep="\t") + boolean_series = df.duplicated(subset=["tax_id"]) + repeats = df["tax_id"][boolean_series] + return set(repeats) + + +def verify_db(path_to_db): + # Check whether both species_taxid.fasta and taxonomy.tsv are present + if not (exists(path_to_db + "/" + "species_taxid.fasta") and exists(path_to_db + "/" + "taxonomy.tsv")): + raise FileNotFoundError("Missing species_taxid.fasta or taxonomy.tsv") + # Check for duplicates in taxonomy.tsv + repeats = find_repeats(path_to_db + "/" + "taxonomy.tsv") + if len(repeats) > 0: + raise ValueError("There are one or more duplicate entries in this database:\n" + str(repeats)) + print("Database looks good!") +