Tutorials (#184)

Koeng101 · TimothyStiles · web-flow · commit 082f1d1d83c8 · 2021-08-09T15:18:18.000-07:00
* Added example_test for genbank and gff

* Added SIMPLE tutorials and notes for each parser.

* Revamped seqhash docs and added simple tutorial

* moved example seqhash to example_test - does this make it runnable?

Co-authored-by: Timothy Stiles <tim@stiles.io>
diff --git a/io/fasta/example_test.go b/io/fasta/example_test.go
@@ -0,0 +1,14 @@
+package fasta_test
+
+import (
+	"fmt"
+	"github.com/TimothyStiles/poly/io/fasta"
+)
+
+// This example shows how to open a file with the fasta parser. The sequences
+// within that file can then be analyzed further with different software.
+func Example_basic() {
+	fastas := fasta.Read("data/base.fasta")
+	fmt.Println(fastas[1].Sequence)
+	// Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK*
+}
diff --git a/io/fasta/fasta.go b/io/fasta/fasta.go
@@ -1,3 +1,14 @@
+/*
+Package fasta contains fasta parsers and writers.
+
+Fasta is a flat text file format developed in 1985 to store nucleotide and
+amino acid sequences. It is extremely simple and well supported across many
+languages. However, this simplicity means that annotation of genetic objects
+is not supported.
+
+This package provides a parser and writer for working with Fasta formatted
+genetic sequences.
+*/
 package fasta
 
 import (
@@ -13,21 +24,21 @@ import (
 /******************************************************************************
 Apr 25, 2021
 
- Parser begins here
+Fasta Parser begins here
 
 Many thanks to Jordan Campbell (https://github.com/0x106) for building the first
 parser for Poly and thanks to Tim Stiles (https://github.com/TimothyStiles)
 for helping complete that PR. This work expands on the previous work by allowing
 for concurrent Fasta parsing and giving Poly a specific Fasta parser subpackage,
 as well as a few bug fixes.
 
- is a very simple file format for working with DNA, RNA, or protein sequences.
+Fasta is a very simple file format for working with DNA, RNA, or protein sequences.
 It was first released in 1985 and is still widely used in bioinformatics.
 
 https://en.wikipedia.org/wiki/FASTA_format
 
 One interesting use of the concurrent Fasta parser is working with the Uniprot
- dump files, which are far too large to fit into RAM. This parser is able
+fasta dump files, which are far too large to fit into RAM. This parser is able
 to easily handle those files by doing computation actively while the data dump
 is getting parsed.
 
diff --git a/io/genbank/example_test.go b/io/genbank/example_test.go
@@ -0,0 +1,19 @@
+package genbank_test
+
+import (
+	"fmt"
+
+	"github.com/TimothyStiles/poly/io/genbank"
+)
+
+// This example shows how to open a genbank file and search for a gene given
+// its name. After finding it, notes about the particular gene are read.
+func Example_basic() {
+	sequence := genbank.Read("../../data/puc19.gbk")
+	for _, feature := range sequence.Features {
+		if feature.Attributes["gene"] == "bla" {
+			fmt.Println(feature.Attributes["note"])
+		}
+	}
+	// Output: confers resistance to ampicillin, carbenicillin, andrelated antibiotics
+}
diff --git a/io/genbank/genbank.go b/io/genbank/genbank.go
@@ -1,3 +1,13 @@
+/*
+Package genbank provides genbank parsers and writers.
+
+GenBank is a flat text file format developed in the 1980s to annotate genetic
+sequences, and has since become the standard for sharing annotated genetic
+sequences.
+
+This package provides a parser and writer to convert between the GenBank file
+format and the more general poly.Sequence struct.
+*/
 package genbank
 
 import (
diff --git a/io/gff/example_test.go b/io/gff/example_test.go
@@ -0,0 +1,18 @@
+package gff_test
+
+import (
+	"fmt"
+	"github.com/TimothyStiles/poly/io/gff"
+)
+
+// This example shows how to open a gff file and search for a gene given its
+// locus tag. We then display the EC number of that particular gene.
+func Example_basic() {
+	sequence := gff.Read("../../data/ecoli-mg1655-short.gff")
+	for _, feature := range sequence.Features {
+		if feature.Attributes["locus_tag"] == "b0003" {
+			fmt.Println(feature.Attributes["EC_number"])
+		}
+	}
+	// Output: 2.7.1.39
+}
diff --git a/io/gff/gff.go b/io/gff/gff.go
@@ -1,3 +1,14 @@
+/*
+Package gff provides gff parsers and writers.
+
+GFF stands for "general feature format". It is an alternative to GenBank for
+storing data about genomic sequences. While not often used in synthetic biology
+research, it is more commonly used in bioinformatics for digesting features of
+genomic sequences.
+
+This package provides a parser and writer to convert between the gff file
+format and the more general poly.Sequence struct.
+*/
 package gff
 
 import (
diff --git a/io/rebase/example_test.go b/io/rebase/example_test.go
@@ -0,0 +1,14 @@
+package rebase_test
+
+import (
+	"fmt"
+	"github.com/TimothyStiles/poly/io/rebase"
+)
+
+// This example reads rebase into an enzymeMap and prints the AarI recognition
+// sequence.
+func Example_basic() {
+	enzymeMap, _ := rebase.Read("data/rebase_test.txt")
+	fmt.Println(enzymeMap["AarI"].RecognitionSequence)
+	// Output: CACCTGC(4/8)
+}
diff --git a/io/rebase/rebase.go b/io/rebase/rebase.go
@@ -1,15 +1,9 @@
-package rebase
+/*
+Package rebase contains a rebase parser for rebase data dump #31.
 
-import (
-	"encoding/json"
-	"io/ioutil"
-	"strings"
-)
-
-/******************************************************************************
-Apr 25, 2021
-
-REBASE Parser #31 here.
+In order to effectively simulate cloning reactions, we need to know how each
+restriction enzyme in the reaction functions. This data can be derived, in
+bulk, from the REBASE database.
 
 REBASE is an amazing resource run by New England Biolabs listing essentially
 every known restriction enzyme. In particular, this parser parses the REBASE
@@ -21,17 +15,9 @@ http://rebase.neb.com/rebase/rebase.f31.html
 The actual data dump itself is linked here and updated once a month:
 http://rebase.neb.com/rebase/link_withrefm
 
-In ./data/rebase_test.txt we have the first 1002 lines of link_withrefm from
-Apr 25, 2021. From this file, we will extract lists of restriction enzymes,
-which can be used for simulation of restriction enzyme cutting and for cloning.
-
 The header of this file gives a wonderful explanation of its structure. Here is the
 header with the commercial suppliers format and an example enzyme.
 
-Cheers,
-Keoni Gandall
-
-
 ```
 REBASE version 104                                              withrefm.104
 
@@ -152,7 +138,14 @@ REBASE codes for commercial sources of enzymes
 <8>Tagami, H., Tayama, K., Tohyama, T., Fukaya, M., Okumura, H., Kawamura, Y., Horinouchi, S., Beppu, T., (1988) FEMS Microbiol. Lett., vol. 56, pp. 161-166.
 
 ```
-******************************************************************************/
+*/
+package rebase
+
+import (
+	"encoding/json"
+	"io/ioutil"
+	"strings"
+)
 
 // Enzyme represents a single enzyme within the Rebase database
 type Enzyme struct {
diff --git a/io/uniprot/example_test.go b/io/uniprot/example_test.go
@@ -0,0 +1,20 @@
+package uniprot_test
+
+import (
+	"fmt"
+	"github.com/TimothyStiles/poly/io/uniprot"
+)
+
+// This example shows how to open a uniprot data dump file and iterate over
+// its entries, keeping the last one. For the Trembl data dump, consume the
+// channel directly like this rather than collecting entries into an array.
+func Example_basic() {
+	entries, _, _ := uniprot.Read("data/uniprot_sprot_mini.xml.gz")
+
+	var entry uniprot.Entry
+	for singleEntry := range entries {
+		entry = singleEntry
+	}
+	fmt.Println(entry.Accession[0])
+	// Output: O55723
+}
diff --git a/io/uniprot/uniprot.go b/io/uniprot/uniprot.go
@@ -1,3 +1,23 @@
+/*
+Package uniprot provides an XML parser for Uniprot data dumps.
+
+Uniprot is a comprehensive, high-quality and freely accessible resource of protein
+sequence and functional information. It is the best(1) protein database out there.
+
+Uniprot database dumps are available as gzipped FASTA files or gzipped XML files.
+The XML files have significantly more information than the FASTA files, and this
+parser specifically works on the gzipped XML files from Uniprot.
+
+Uniprot provides an XML schema of their data dumps(3), which is useful for
+autogeneration of Golang structs. xsdgen was used to automatically generate
+xml.go from uniprot.xsd.
+
+Each protein in Uniprot is known as an "Entry" (as defined in xml.go).
+
+The function Parse stream-reads Uniprot into an Entry channel, from which you
+can use the entries however you want. Read simplifies reading gzipped files
+from a disk into an Entry channel.
+*/
 package uniprot
 
 import (
diff --git a/seqhash/example_test.go b/seqhash/example_test.go
@@ -0,0 +1,29 @@
+package seqhash_test
+
+import (
+	"fmt"
+	"github.com/TimothyStiles/poly/seqhash"
+)
+
+// This example shows how to seqhash a sequence.
+func Example_basic() {
+	sequence := "ATGC"
+	sequenceType := "DNA"
+	circular := false
+	doubleStranded := true
+
+	sequenceSeqhash, _ := seqhash.Hash(sequence, sequenceType, circular, doubleStranded)
+	fmt.Println(sequenceSeqhash)
+	// Output: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
+}
+
+func ExampleHash() {
+	sequence := "ATGC"
+	sequenceType := "DNA"
+	circular := false
+	doubleStranded := true
+
+	sequenceSeqhash, _ := seqhash.Hash(sequence, sequenceType, circular, doubleStranded)
+	fmt.Println(sequenceSeqhash)
+	// Output: v1_DLD_f4028f93e08c5c23cbb8daa189b0a9802b378f1a1c919dcbcf1608a615f46350
+}
diff --git a/seqhash/seqhash.go b/seqhash/seqhash.go
@@ -1,3 +1,59 @@
+/*
+Package seqhash contains the seqhash algorithm.
+
+This package contains the reference seqhash algorithm.
+
+There is a big problem with current sequence databases - they all use different
+identifiers and accession numbers. This means cross-referencing databases is
+a complicated exercise, especially as the quantity of databases increases, or if
+you need to compare "wild" DNA sequences.
+
+Seqhash is a simple algorithm to produce consistent identifiers for any genetic sequence. The
+basic premise of the Seqhash algorithm is to hash sequences with the hash being a robust
+cross-database identifier. Sequences themselves shouldn't be used as a database index
+(often, they're too big), so a hash based off of a sequence is the next best thing.
+
+Usability wise, you should be able to Seqhash any rotation of a sequence in any direction and
+get a consistent hash.
+
+The Seqhash algorithm makes several opinionated design choices, primarily to make working
+with Seqhashes more consistent and nice. The Seqhash algorithm only uses a single hash function,
+Blake3, and only operates on DNA, RNA, and Protein sequences. These identifiers will be seen
+by human beings, so versioning and metadata are attached to the front of the hashes so that
+a human operator can quickly identify problems with hashing.
+
+If the sequence is DNA or RNA, the Seqhash algorithm needs to know whether or not the nucleic
+acid is circular and/or double stranded. If circular, the sequence is rotated to a deterministic
+point. If double stranded, the sequence is compared to its reverse complement, and the lexicographically
+minimal sequence is taken (whether or not the min or max is used doesn't matter, just needs to
+be consistent).
+
+If the sequence is RNA, the sequence will be converted to DNA before hashing. While the full Seqhash
+will still be different between RNA and DNA (due to the metadata string), the hash afterwards will be the same.
+This makes it easy to cross reference DNA and RNA sequences. This fact is important for parts of Poly
+store that relate to storing and searching large quantities of sequences - deduplication can easily
+be used on those Seqhashes to save a lot of space.
+
+For DNA or RNA sequences, only ATUGCYRSWKMBDHVNZ characters are allowed. For Proteins,
+only ACDEFGHIKLMNPQRSTVWYUO*BXZ characters are allowed in sequences. Selenocysteine (Sec; U) and pyrrolysine
+(Pyl; O) are included in the protein character set - usually U and O don't occur within protein sequences,
+but for certain organisms they do, and they are certainly relevant amino acids for those particular proteins.
+
+A Seqhash is separated into 3 different elements divided by underscores. It looks like the following:
+
+v1_DCD_4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9
+
+The first element is the version tag (v1 for version 1). If there is ever a Seqhash version 2, this tag
+will differentiate seqhashes. The second element is the metadata tag, which has 3 letters. The first letter
+codes for the sequenceType (D for DNA, R for RNA, and P for Protein). The second letter codes for whether or
+not the sequence is circular (C for Circular, L for Linear). The final letter codes for whether or not the
+sequence is double stranded (D for Double stranded, S for Single stranded). The final element is the blake3
+hash of the sequence (once rotated and complemented, as stated above).
+
+Seqhash is a simple algorithm that allows for much better indexing of genetic sequences than what is
+currently available.
+
+*/
 package seqhash
 
 import (
@@ -76,66 +132,6 @@ func RotateSequence(sequence string) string {
 	return sequence
 }
 
-/******************************************************************************
-Dec, 2, 2020
-
-Seqhash stuff starts here.
-
-There is a big problem with current sequence databases - they all use different
-identifiers and accession numbers. This means cross-referencing databases is
-a complicated exercise, especially as the quantity of databases increases, or if
-you need to compare "wild" DNA sequences.
-
-Seqhash is a simple algorithm to produce consistent identifiers for any genetic sequence. The
-basic premise of the Seqhash algorithm is to hash sequences with the hash being a robust
-cross-database identifier. Sequences themselves shouldn't be used as a database index
-(often, they're too big), so a hash based off of a sequence is the next best thing.
-
-Usability wise, you should be able to Seqhash any rotation of a sequence in any direction and
-get a consistent hash.
-
-The Seqhash algorithm makes several opinionated design choices, primarily to make working
-with Seqhashes more consistent and nice. The Seqhash algorithm only uses a single hash function,
-Blake3, and only operates on DNA, RNA, and Protein sequences. These identifiers will be seen
-by human beings, so versioning and metadata is attached to the front of the hashes so that
-a human operator can quickly identify problems with hashing.
-
-If the sequence is DNA or RNA, the Seqhash algorithm needs to know whether or not the nucleic
-acid is circular and/or double stranded. If circular, the sequence is rotated to a deterministic
-point. If double stranded, the sequence is compared to its reverse complement, and the lexiographically
-minimal sequence is taken (whether or not the min or max is used doesn't matter, just needs to
-be consistent).
-
-If the sequence is RNA, the sequence will be converted to DNA before hashing. While the full Seqhash
-will still be different between RNA and DNA (due to the metadata string), the hash afterwards will be the same.
-This makes it easy to cross reference DNA and RNA sequences. This fact is important for parts of Poly
-store that relate to storing and searching large quantities of sequences - deduplication can easily
-be used on those Seqhashes to save a lot of space.
-
-For DNA or RNA sequences, only ATUGCYRSWKMBDHVNZ characters are allowed. For Proteins,
-only ACDEFGHIKLMNPQRSTVWYUO*BXZ characters are allowed in sequences. Selenocysteine (Sec; U) and pyrrolysine
-(Pyl; O) are included in the protein character set - usually U and O don't occur within protein sequences,
-but for certain organisms they do, and it is certainly a relevant amino acid for those particular proteins.
-
-A Seqhash is separated into 3 different elements divided by underscores. It looks like the following:
-
-v1_DCD_4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9
-
-The first element is the version tag (v1 for version 1). If there is ever a Seqhash version 2, this tag
-will differentiate seqhashes. The second element is the metadata tag, which has 3 letters. The first letter
-codes for the sequenceType (D for DNA, R for RNA, and P for Protein). The second letter codes for whether or
-not the sequence is circular (C for Circular, L for Linear). The final letter codes for whether or not the
-sequence is double stranded (D for Double stranded, S for Single stranded). The final element is the blake3
-hash of the sequence (once rotated and complemented, as stated above).
-
-Seqhash is a simple algorithm that allows for much better indexing of genetic sequences than what is
-currently available. I hope it will be widely adopted someday.
-
-En Taro Adun,
-Keoni
-
-******************************************************************************/
-
 // Hash is a function to create Seqhashes, a specific kind of identifier.
 func Hash(sequence string, sequenceType string, circular bool, doubleStranded bool) (string, error) {
 	// By definition, Seqhashes are of uppercase sequences
diff --git a/seqhash/seqhash_test.go b/seqhash/seqhash_test.go