diff --git a/README.md b/README.md index a5fda68..0f6597b 100644 --- a/README.md +++ b/README.md @@ -206,6 +206,14 @@ Optional additional parameters: |--split-tables | output 2 tables: (1) abundances only at specified rank and (2) taxonomic lineages down to specified rank | |--counts | output estimated counts rather than relative abundance percentage in combined table. Only includes Emu relative abundance outputs that already have 'estimated counts' | +### Verify Database +The verify_db function can be used to validate a database prior to running emu. This function checks if there are two files species_taxid.fasta and taxonomy.tsv in the input directory (with exactly those names) and raises an error if either file is not found. Additionally, it checks if there are duplicate entries in taxonomy.tsv and raises an error if duplicates are found. + +```bash +emu verify_db <path_to_db> +``` + + ### System Requirements All software dependencies are listed in environment.yml. Emu v3.0.0 has been tested on Python v3.7 and used to generate results in manuscript. 
diff --git a/emu b/emu index c26cb31..0babd25 100755 --- a/emu +++ b/emu @@ -1,4 +1,4 @@ -#!/usr/bin/env python3 +#!/usr/bin/env python # -*- coding: utf-8 -*- # pylint: disable=consider-using-f-string """ @@ -9,6 +9,7 @@ import os import argparse import pathlib import subprocess +from os.path import exists from sys import stdout from operator import add, mul from pathlib import Path @@ -22,7 +23,6 @@ from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord - # static global variables CIGAR_OPS = [1, 2, 4, 10] CIGAR_OPS_ALL = [0, 1, 2, 4] @@ -143,7 +143,7 @@ def get_cigar_op_log_probabilities(sam_path): for i in sorted(zero_locs, reverse=True): del cigar_stats_primary[i] n_char = sum(cigar_stats_primary) - return [math.log(x) for x in np.array(cigar_stats_primary)/n_char], zero_locs, \ + return [math.log(x) for x in np.array(cigar_stats_primary) / n_char], zero_locs, \ dict_longest_align @@ -163,7 +163,7 @@ def compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op, dict_longest_al ref_name, query_name = alignment.reference_name, alignment.query_name log_score = sum(list(map(mul, log_p_cigar_op, cigar_stats))) * \ - (dict_longest_align[query_name]/align_len) + (dict_longest_align[query_name] / align_len) species_tid = int(ref_name.split(":")[0]) return log_score, query_name, species_tid @@ -192,7 +192,7 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z cigar_stats = get_align_stats(alignment) log_score, query_name, species_tid = \ compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op, - dict_longest_align, align_len) + dict_longest_align, align_len) if query_name not in log_p_rgs: log_p_rgs[query_name] = ([species_tid], [log_score]) @@ -217,12 +217,12 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z del cigar_stats[i] log_score, query_name, species_tid = \ compute_log_prob_rgs(alignment, cigar_stats, log_p_cigar_op, - dict_longest_align, align_len) + 
dict_longest_align, align_len) if query_name not in log_p_rgs: log_p_rgs[query_name] = ([species_tid], [log_score]) elif query_name in log_p_rgs and species_tid not in log_p_rgs[query_name][0]: - log_p_rgs[query_name] = (log_p_rgs[query_name][0] +[species_tid], + log_p_rgs[query_name] = (log_p_rgs[query_name][0] + [species_tid], log_p_rgs[query_name][1] + [log_score]) else: logprgs_idx = log_p_rgs[query_name][0].index(species_tid) @@ -238,8 +238,8 @@ def log_prob_rgs_dict(sam_path, log_p_cigar_op, dict_longest_align, p_cigar_op_z ## remove low likelihood alignments? ## remove if p(r|s) < 0.01 - #min_p_thresh = math.log(0.01) - #log_p_rgs = {r_map: val for r_map, val in log_p_rgs.items() if val > min_p_thresh} + # min_p_thresh = math.log(0.01) + # log_p_rgs = {r_map: val for r_map, val in log_p_rgs.items() if val > min_p_thresh} return log_p_rgs, unassigned_count, len(assigned_reads) @@ -276,7 +276,7 @@ def expectation_maximization(log_p_rgs, freq): log_p_rns.append(logprns_val) if len(valid_seqs) != 0: - #np.array([log_p_rns], dtype=np.float64) + # np.array([log_p_rns], dtype=np.float64) logc = -np.max(log_p_rns) # calculate fixed multiplier, c prnsc = np.exp(log_p_rns + logc) # calculates exp(log(L(r|s) * f(s) * c)) prc = np.sum(prnsc) # calculates sum of (L(r|s) * f(s) * c) for each read @@ -312,9 +312,9 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr freq, counter = dict.fromkeys(db_ids, 1 / n_db), 1 # set output abundance threshold - freq_thresh = 1/n_reads + freq_thresh = 1 / n_reads if n_reads > 1000: - freq_thresh = 10/n_reads + freq_thresh = 10 / n_reads # performs iterations of the expectation_maximization algorithm total_log_likelihood = -math.inf @@ -336,7 +336,7 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr if log_likelihood_diff < lli_thresh: stdout.write("Number of EM iterations: {}\n".format(counter)) freq = {k: v for k, v in freq.items() if v >= freq_thresh} - # remove tax id 
if less than the frequency threshold + # remove tax id if less than the frequency threshold freq_full, updated_log_likelihood, p_sgr = expectation_maximization(log_p_rgs, freq) freq_set_thresh = None if freq_thresh < input_threshold: @@ -345,8 +345,8 @@ def expectation_maximization_iterations(log_p_rgs, db_ids, lli_thresh, input_thr expectation_maximization(log_p_rgs, freq) return freq_full, freq_set_thresh, p_sgr - #output current estimation - #freq_to_lineage_df(freq, f"{out_file}_{counter}", df_nodes, df_names) + # output current estimation + # freq_to_lineage_df(freq, f"{out_file}_{counter}", df_nodes, df_names) counter += 1 @@ -368,7 +368,7 @@ def lineage_dict_from_tid(taxid, nodes_dict, names_dict): while names_dict[taxid] != "root": tup = nodes_dict[taxid] # find the name for each taxonomic rank - if tup[1] in RANKS_PRINTOUT: # check rank in printout list + if tup[1] in RANKS_PRINTOUT: # check rank in printout list idx = RANKS_PRINTOUT.index(tup[1]) lineage_list[idx] = names_dict[taxid] taxid = tup[0] @@ -388,12 +388,12 @@ def freq_to_lineage_df(freq, tsv_output_path, taxonomy_df, assigned_count, counts(boolean): True if include estimated counts in output .tsv file returns(df): pandas df with lineage and abundances for values in f """ - #add tax lineage for values in freq + # add tax lineage for values in freq results_df = pd.DataFrame(zip(list(freq.keys()) + ['unassigned'], list(freq.values()) + [0]), columns=["tax_id", "abundance"]).set_index('tax_id') results_df = results_df.join(taxonomy_df, how='left').reset_index() - #add in the estimated count values for the assigned and unassigned counts + # add in the estimated count values for the assigned and unassigned counts if counts: counts_series = pd.concat([(results_df["abundance"] * assigned_count)[:-1], pd.Series(unassigned_count)], ignore_index=True) @@ -408,7 +408,7 @@ def generate_alignments(in_file_list, out_basename, database): in_file_list(list(str)): list of path(s) to input sequences 
out_basename(str): path and basename for output files """ - #indicate filepath and create file + # indicate filepath and create file input_file = " ".join(in_file_list) filetype = pathlib.PurePath(args.input_file[0]).suffix if filetype == '.sam': @@ -461,7 +461,7 @@ def create_names_dict(names_path): 'name_txt' as values """ name_headers = ['tax_id', 'name_txt', 'name_class'] - names_df = pd.read_csv(names_path, sep='|', index_col=False, header=None, dtype=str)\ + names_df = pd.read_csv(names_path, sep='|', index_col=False, header=None, dtype=str) \ .drop([2, 4], axis=1) names_df.columns = name_headers for col in names_df.columns: @@ -582,7 +582,7 @@ def build_ncbi_taxonomy(unique_tids, nodes_dict, names_dict, filepath): """ with open(filepath, "w", encoding="utf8") as file: # write the header to the file - dummy_str = '\t'.join(['%s',] * len(RANKS_PRINTOUT)) + '\n' + dummy_str = '\t'.join(['%s', ] * len(RANKS_PRINTOUT)) + '\n' file.write(dummy_str % tuple(RANKS_PRINTOUT)) # write each lineage as a row in the file for tid in unique_tids: @@ -606,7 +606,7 @@ def build_direct_taxonomy(tid_set, lineage_path, taxonomy_file): tax_output_file.write(first_line.split("\t", 1)[1]) for line in file: tax_id = line.split("\t", 1)[0] - if tax_id in tid_set: #only add if taxid is in fasta + if tax_id in tid_set: # only add if taxid is in fasta tax_output_file.write(line) @@ -638,6 +638,7 @@ def collapse_rank(path, rank): df_emu_copy.to_csv(output_path, sep='\t') stdout.write("File generated: {}\n".format(output_path)) + def combine_outputs(dir_path, rank, split_files=False, count_table=False): """ Combines multiple Emu output relative abundance tables into a single table. Inlcudes all .tsv files with 'rel-abundance' in the filename in the provided dir_path. 
@@ -653,7 +654,7 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False): metric = 'estimated counts' for file in os.listdir(dir_path): file_extension = pathlib.Path(file).suffix - if file_extension == '.tsv' and 'rel-abundance' in file: + if file_extension == '.tsv' and 'rel-abundance' in file and not file.startswith('.'): name = pathlib.Path(file).stem name = name.replace('_rel-abundance', '') df_sample = pd.read_csv(os.path.join(dir_path, file), sep='\t', dtype=str) @@ -685,12 +686,41 @@ def combine_outputs(dir_path, rank, split_files=False, count_table=False): stdout.write("Combined table generated: {}\n".format(out_path)) return df_combined_full + +def find_repeats(path_to_tsv): + """ + Given a tsv file, return a set of duplicate taxids. + :param path_to_tsv: the path to the tsv file. + """ + df = pd.read_csv(path_to_tsv, sep="\t") + boolean_series = df.duplicated(subset=["tax_id"]) + repeats = df["tax_id"][boolean_series] + return set(repeats) + + +def verify_db(path_to_db): + """ + Verify if there are two files species_taxid.fasta and taxonomy.tsv in the input directory (with the exact same name). + Raise an error if either of the two files is missing. + Then verify if there are duplicate entries in taxonomy.tsv. Raise an error if there are duplicates. 
+ :param path_to_db: the path to the directory that's supposed to contain species_taxid.fasta and taxonomy.tsv + """ + # Check whether both species_taxid.fasta and taxonomy.tsv are present + if not (exists(path_to_db + "/" + "species_taxid.fasta") and exists(path_to_db + "/" + "taxonomy.tsv")): + raise FileNotFoundError("Missing species_taxid.fasta or taxonomy.tsv") + # Check for duplicates in taxonomy.tsv + repeats = find_repeats(path_to_db + "/" + "taxonomy.tsv") + if len(repeats) > 0: + raise ValueError("There are one or more duplicate entries in this database:\n" + str(repeats)) + print("Database looks good!") + + if __name__ == "__main__": __version__ = "3.4.5" parser = argparse.ArgumentParser() parser.add_argument('--version', '-v', action='version', version='%(prog)s v' + __version__) subparsers = parser.add_subparsers(dest="subparser_name", help='sub-commands') - abundance_parser = subparsers.\ + abundance_parser = subparsers. \ add_parser("abundance", help="Generate relative abundance estimates") abundance_parser.add_argument( 'input_file', type=str, nargs='+', @@ -733,6 +763,9 @@ if __name__ == "__main__": '--threads', type=int, default=3, help='threads utilized by minimap [3]') + verify_db_parser = subparsers.add_parser("verify_db", help="Check the DB for duplicates") + verify_db_parser.add_argument('path_to_db', type=str, help='path to the directory containing two files') + build_db_parser = subparsers.add_parser("build-database", help="Build custom Emu database") build_db_parser.add_argument( @@ -762,7 +795,7 @@ if __name__ == "__main__": help='collapsed taxonomic rank') combine_parser = subparsers.add_parser("combine-outputs", - help="Combine Emu rel abundance outputs to a single table") + help="Combine Emu rel abundance outputs to a single table") combine_parser.add_argument( 'dir_path', type=str, help='path to directory containing Emu output files') @@ -775,8 +808,12 @@ if __name__ == "__main__": combine_parser.add_argument( '--counts', 
action="store_true", help='counts rather than abundances in output table') + args = parser.parse_args() + if args.subparser_name == "verify_db": + verify_db(args.path_to_db) + if args.subparser_name == "abundance": # check input file is fasta/q or sam alignment file # validate_input(args.input_file[0]) @@ -803,12 +840,11 @@ if __name__ == "__main__": log_prob_rgs, counts_unassigned, counts_assigned = log_prob_rgs_dict( SAM_FILE, log_prob_cigar_op, longest_align_dict, locs_p_cigar_zero) f_full, f_set_thresh, read_dist = expectation_maximization_iterations(log_prob_rgs, - db_species_tids, - .01, args.min_abundance) + db_species_tids, + .01, args.min_abundance) freq_to_lineage_df(f_full, "{}_rel-abundance".format(out_file), df_taxonomy, counts_assigned, counts_unassigned, args.keep_counts) - # output read assignment distributions as a tsv if args.keep_read_assignments: output_read_assignments(read_dist, "{}_read-assignment-distributions".format(out_file)) @@ -833,7 +869,6 @@ if __name__ == "__main__": if os.path.exists(SAM_FILE): os.remove(SAM_FILE) - # create a custom database with ncbi taxonomy if args.subparser_name == "build-database": emu_db_path = os.getcwd() @@ -864,7 +899,6 @@ if __name__ == "__main__": build_direct_taxonomy(db_unique_ids, args.taxonomy_list, output_taxonomy_location) stdout.write("Database creation successful\n") - # collapse Emu results at desired taxonomic rank if args.subparser_name == "collapse-taxonomy": collapse_rank(args.input_path, args.rank) diff --git a/emu_database/.ipynb_checkpoints/verify_db-checkpoint.ipynb b/emu_database/.ipynb_checkpoints/verify_db-checkpoint.ipynb new file mode 100644 index 0000000..363fcab --- /dev/null +++ b/emu_database/.ipynb_checkpoints/verify_db-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/emu_database/verify_db.ipynb b/emu_database/verify_db.ipynb new file mode 100644 index 0000000..8b696c1 --- /dev/null +++ 
b/emu_database/verify_db.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "id": "6c044dc2", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "87cabaca", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "df = pd.read_csv(\"/Users/hubingbing/Downloads/Duplicate_taxid/taxonomy.tsv\", sep=\"\\t\") " + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "17d0da19", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tax_idspeciesgenusfamilyorderclassphylumcladesuperkingdomsubspeciesspecies subgroupspecies group
0622671Desulfobotulus alkaliphilusDesulfobotulusDesulfobacteraceaeDesulfobacteralesDeltaproteobacteriaProteobacteriaNaNBacteriaNaNNaNNaN
11338382Desulfitobacterium sp. PRDesulfitobacteriumDesulfitobacteriaceaeEubacterialesClostridiaFirmicutesTerrabacteria groupBacteriaNaNNaNNaN
21610527Devosia honganensisDevosiaDevosiaceaeHyphomicrobialesAlphaproteobacteriaProteobacteriaNaNBacteriaNaNNaNNaN
3426706Desulfomicrobium aestuariiDesulfomicrobiumDesulfomicrobiaceaeDesulfovibrionalesDeltaproteobacteriaProteobacteriaNaNBacteriaNaNNaNNaN
481459Entomoplasma melaleucaeEntomoplasmaEntomoplasmataceaeEntomoplasmatalesMollicutesTenericutesTerrabacteria groupBacteriaNaNNaNNaN
.......................................
47558171390Chroococcidiopsis sp. BB96.1ChroococcidiopsisChroococcidiopsidaceaeChroococcidiopsidalesNaNCyanobacteriaTerrabacteria groupBacteriaNaNNaNNaN
47559171290Mycoplasma oxoniensisMycoplasmaMycoplasmataceaeMycoplasmatalesMollicutesTenericutesTerrabacteria groupBacteriaNaNNaNNaN
47560171388Chroococcidiopsis sp. BB82.3ChroococcidiopsisChroococcidiopsidaceaeChroococcidiopsidalesNaNCyanobacteriaTerrabacteria groupBacteriaNaNNaNNaN
47561171387Chroococcidiopsis sp. BB79.2ChroococcidiopsisChroococcidiopsidaceaeChroococcidiopsidalesNaNCyanobacteriaTerrabacteria groupBacteriaNaNNaNNaN
47562171395Fischerella sp. BB98.1FischerellaHapalosiphonaceaeNostocalesNaNCyanobacteriaTerrabacteria groupBacteriaNaNNaNNaN
\n", + "

47563 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " tax_id species genus \\\n", + "0 622671 Desulfobotulus alkaliphilus Desulfobotulus \n", + "1 1338382 Desulfitobacterium sp. PR Desulfitobacterium \n", + "2 1610527 Devosia honganensis Devosia \n", + "3 426706 Desulfomicrobium aestuarii Desulfomicrobium \n", + "4 81459 Entomoplasma melaleucae Entomoplasma \n", + "... ... ... ... \n", + "47558 171390 Chroococcidiopsis sp. BB96.1 Chroococcidiopsis \n", + "47559 171290 Mycoplasma oxoniensis Mycoplasma \n", + "47560 171388 Chroococcidiopsis sp. BB82.3 Chroococcidiopsis \n", + "47561 171387 Chroococcidiopsis sp. BB79.2 Chroococcidiopsis \n", + "47562 171395 Fischerella sp. BB98.1 Fischerella \n", + "\n", + " family order class \\\n", + "0 Desulfobacteraceae Desulfobacterales Deltaproteobacteria \n", + "1 Desulfitobacteriaceae Eubacteriales Clostridia \n", + "2 Devosiaceae Hyphomicrobiales Alphaproteobacteria \n", + "3 Desulfomicrobiaceae Desulfovibrionales Deltaproteobacteria \n", + "4 Entomoplasmataceae Entomoplasmatales Mollicutes \n", + "... ... ... ... \n", + "47558 Chroococcidiopsidaceae Chroococcidiopsidales NaN \n", + "47559 Mycoplasmataceae Mycoplasmatales Mollicutes \n", + "47560 Chroococcidiopsidaceae Chroococcidiopsidales NaN \n", + "47561 Chroococcidiopsidaceae Chroococcidiopsidales NaN \n", + "47562 Hapalosiphonaceae Nostocales NaN \n", + "\n", + " phylum clade superkingdom subspecies \\\n", + "0 Proteobacteria NaN Bacteria NaN \n", + "1 Firmicutes Terrabacteria group Bacteria NaN \n", + "2 Proteobacteria NaN Bacteria NaN \n", + "3 Proteobacteria NaN Bacteria NaN \n", + "4 Tenericutes Terrabacteria group Bacteria NaN \n", + "... ... ... ... ... 
\n", + "47558 Cyanobacteria Terrabacteria group Bacteria NaN \n", + "47559 Tenericutes Terrabacteria group Bacteria NaN \n", + "47560 Cyanobacteria Terrabacteria group Bacteria NaN \n", + "47561 Cyanobacteria Terrabacteria group Bacteria NaN \n", + "47562 Cyanobacteria Terrabacteria group Bacteria NaN \n", + "\n", + " species subgroup species group \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "... ... ... \n", + "47558 NaN NaN \n", + "47559 NaN NaN \n", + "47560 NaN NaN \n", + "47561 NaN NaN \n", + "47562 NaN NaN \n", + "\n", + "[47563 rows x 12 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "7aa22f8c", + "metadata": {}, + "outputs": [], + "source": [ + "boolean_series = df.duplicated(subset=[\"tax_id\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "319879be", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "repeats = df[\"tax_id\"][boolean_series]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "22c37a16", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "246\n" + ] + } + ], + "source": [ + "print(repeats.size)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "f6b3c696", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "print(type(repeats))" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "c663d580", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "897 1428\n", + "4063 623\n", + "8264 1894\n", + "8641 1245\n", + "15318 358\n", + " ... 
\n", + "47535 1534743\n", + "47541 2083055\n", + "47542 142651\n", + "47543 2083054\n", + "47544 2122\n", + "Name: tax_id, Length: 246, dtype: int64" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "repeats" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "eb13749a", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1428, 623, 1894, 1245, 358, 13687, 1883, 1886637, 584, 307643, 1532905, 1696, 531938, 714, 47760, 1883, 53445, 332101, 286, 298, 395600, 698769, 1089306, 1841, 32008, 2056873, 1961, 67384, 32046, 1883, 84567, 71190, 1649279, 476, 51669, 394095, 2236, 413401, 496866, 157076, 330922, 512649, 930805, 371036, 727, 1921578, 577492, 1074311, 317, 535712, 81034, 43997, 28189, 1486246, 53462, 2093, 691, 44249, 306, 813, 381300, 2093, 669, 223527, 2093, 1458492, 2093, 2093, 1882268, 316, 1935978, 76832, 256, 669, 1283291, 33, 83456, 77097, 1917449, 316, 1126240, 29284, 65093, 261685, 395935, 452084, 215690, 1416776, 502829, 669, 669, 669, 44933, 370769, 1534743, 588898, 2337, 2093, 2746, 35622, 286, 1827300, 1508420, 1508419, 2093, 44249, 2104, 1028989, 483539, 2109, 158080, 1278643, 669, 2093, 2093, 76630, 1346688, 1636610, 669, 1278643, 376489, 114886, 2093, 2746, 2109, 2054919, 1338689, 1278643, 2746, 1338689, 52021, 370323, 2093, 370769, 286, 52021, 444459, 413816, 376489, 253237, 413815, 1534743, 2067572, 2067572, 146785, 2336, 691825, 150396, 2093, 2104, 29557, 2054919, 1636610, 2093, 2746, 669, 2083054, 368454, 1338689, 2083054, 146785, 245294, 2093, 380547, 142651, 425309, 669, 353800, 29561, 171632, 520762, 2067572, 224312, 224312, 1500686, 329163, 28227, 1534743, 2109, 1278643, 2161724, 669, 2109, 370769, 52021, 142651, 64713, 669, 650850, 370323, 691825, 370323, 118675, 353800, 29557, 2109, 245294, 114885, 370769, 1500686, 150396, 392421, 2746, 1547897, 872983, 2746, 44933, 2054919, 376489, 2336, 
2746, 1827300, 392421, 35622, 1500686, 29283, 337243, 2746, 1534743, 368454, 337243, 2083055, 48003, 2337, 286, 2746, 2107, 1338689, 171632, 416585, 2109, 44933, 1534743, 28227, 1383851, 64713, 353800, 50740, 1534743, 1500687, 380547, 1534743, 2083055, 142651, 2083054, 2122\n" + ] + } + ], + "source": [ + "repeated_id_str = \"\"\n", + "for i in repeats:\n", + " repeated_id_str += str(i)+\", \"\n", + "print(repeated_id_str[:-2])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/emu_default 2.tar.gz b/emu_default 2.tar.gz new file mode 100644 index 0000000..159c446 Binary files /dev/null and b/emu_default 2.tar.gz differ diff --git a/verify_DB.py b/verify_DB.py new file mode 100644 index 0000000..f8f689a --- /dev/null +++ b/verify_DB.py @@ -0,0 +1,21 @@ +from os.path import exists +import pandas as pd + + +def find_repeats(path_to_tsv): + df = pd.read_csv(path_to_tsv, sep="\t") + boolean_series = df.duplicated(subset=["tax_id"]) + repeats = df["tax_id"][boolean_series] + return set(repeats) + + +def verify_db(path_to_db): + # Check whether both species_taxid.fasta and taxonomy.tsv are present + if not (exists(path_to_db + "/" + "species_taxid.fasta") and exists(path_to_db + "/" + "taxonomy.tsv")): + raise FileNotFoundError("Missing species_taxid.fasta or taxonomy.tsv") + # Check for duplicates in taxonomy.tsv + repeats = find_repeats(path_to_db + "/" + "taxonomy.tsv") + if len(repeats) > 0: + raise ValueError("There are one or more duplicate entries in this database:\n" + str(repeats)) + print("Database looks good!") +