Skip to content

Commit 082f1d1

Browse files
Koeng101 and TimothyStiles
authored
Tutorials (#184)
* Added example_test for genbank and gff * Added SIMPLE tutorials and notes for each parser. * Revamped seqhash docs and added simple tutorial * moved example seqhash to example_test - does this make it runnable? Co-authored-by: Timothy Stiles <[email protected]>
1 parent 09350cb commit 082f1d1

File tree

13 files changed

+238
-91
lines changed

13 files changed

+238
-91
lines changed

io/fasta/example_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package fasta_test
2+
3+
import (
4+
"fmt"
5+
"github.com/TimothyStiles/poly/io/fasta"
6+
)
7+
8+
// This example shows how to open a file with the fasta parser. The sequences
9+
// within that file can then be analyzed further with different software.
10+
func Example_basic() {
11+
fastas := fasta.Read("data/base.fasta")
12+
fmt.Println(fastas[1].Sequence)
13+
// Output: ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTIDFPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREADIDGDGQVNYEEFVQMMTAK*
14+
}

io/fasta/fasta.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
/*
2+
Package fasta contains fasta parsers and writers.
3+
4+
Fasta is a flat text file format developed in 1985 to store nucleotide and
5+
amino acid sequences. It is extremely simple and well supported across many
6+
languages. However, this simplicity means that annotation of genetic objects
7+
is not supported.
8+
9+
This package provides a parser and writer for working with Fasta formatted
10+
genetic sequences.
11+
*/
112
package fasta
213

314
import (
@@ -13,21 +24,21 @@ import (
1324
/******************************************************************************
1425
Apr 25, 2021
1526
16-
Parser begins here
27+
Fasta Parser begins here
1728
1829
Many thanks to Jordan Campbell (https://github.com/0x106) for building the first
1930
parser for Poly and thanks to Tim Stiles (https://github.com/TimothyStiles)
2031
for helping complete that PR. This work expands on the previous work by allowing
2132
for concurrent parsing and giving Poly a specific parser subpackage,
2233
as well as a few bug fixes.
2334
24-
is a very simple file format for working with DNA, RNA, or protein sequences.
35+
Fasta is a very simple file format for working with DNA, RNA, or protein sequences.
2536
It was first released in 1985 and is still widely used in bioinformatics.
2637
2738
https://en.wikipedia.org/wiki/FASTA_format
2839
2940
One interesting use of the concurrent parser is working with the Uniprot
30-
dump files, which are far too large to fit into RAM. This parser is able
41+
fasta dump files, which are far too large to fit into RAM. This parser is able
3142
to easily handle those files by doing computation actively while the data dump
3243
is getting parsed.
3344

io/genbank/example_test.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
package genbank_test
2+
3+
import (
4+
"fmt"
5+
6+
"github.com/TimothyStiles/poly/io/genbank"
7+
)
8+
9+
// This example shows how to open a genbank file and search for a gene given
10+
// its name. After finding it, notes about the particular gene are read.
11+
func Example_basic() {
12+
sequence := genbank.Read("../../data/puc19.gbk")
13+
for _, feature := range sequence.Features {
14+
if feature.Attributes["gene"] == "bla" {
15+
fmt.Println(feature.Attributes["note"])
16+
}
17+
}
18+
// Output: confers resistance to ampicillin, carbenicillin, andrelated antibiotics
19+
}

io/genbank/genbank.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
/*
2+
Package genbank provides genbank parsers and writers.
3+
4+
GenBank is a flat text file format developed in the 1980s to annotate genetic
5+
sequences, and has since become the standard for sharing annotated genetic
6+
sequences.
7+
8+
This package provides a parser and writer to convert between the GenBank file
9+
format and the more general poly.Sequence struct.
10+
*/
111
package genbank
212

313
import (

io/gff/example_test.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
package gff_test
2+
3+
import (
4+
"fmt"
5+
"github.com/TimothyStiles/poly/io/gff"
6+
)
7+
8+
// This example shows how to open a gff file and search for a gene given its
9+
// locus tag. We then display the EC number of that particular gene.
10+
func Example_basic() {
11+
sequence := gff.Read("../../data/ecoli-mg1655-short.gff")
12+
for _, feature := range sequence.Features {
13+
if feature.Attributes["locus_tag"] == "b0003" {
14+
fmt.Println(feature.Attributes["EC_number"])
15+
}
16+
}
17+
// Output: 2.7.1.39
18+
}

io/gff/gff.go

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
/*
2+
Package gff provides gff parsers and writers.
3+
4+
GFF stands for "general feature format". It is an alternative to GenBank for
5+
storing data about genomic sequences. While not often used in synthetic biology
6+
research, it is more commonly used in bioinformatics for digesting features of
7+
genomic sequences.
8+
9+
This package provides a parser and writer to convert between the gff file
10+
format and the more general poly.Sequence struct.
11+
*/
112
package gff
213

314
import (

io/rebase/example_test.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
package rebase_test
2+
3+
import (
4+
"fmt"
5+
"github.com/TimothyStiles/poly/io/rebase"
6+
)
7+
8+
// This example reads rebase into an enzymeMap and prints the AarI recognition
9+
// sequence.
10+
func Example_basic() {
11+
enzymeMap, _ := rebase.Read("data/rebase_test.txt")
12+
fmt.Println(enzymeMap["AarI"].RecognitionSequence)
13+
// Output: CACCTGC(4/8)
14+
}

io/rebase/rebase.go

Lines changed: 13 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,9 @@
1-
package rebase
1+
/*
2+
Package rebase contains a rebase parser for rebase data dump #31.
23
3-
import (
4-
"encoding/json"
5-
"io/ioutil"
6-
"strings"
7-
)
8-
9-
/******************************************************************************
10-
Apr 25, 2021
11-
12-
REBASE Parser #31 here.
4+
In order to effectively simulate cloning reactions, we need to know how each
5+
restriction enzyme in the reaction functions. This data can be derived, in
6+
bulk, from the REBASE database.
137
148
REBASE is an amazing resource run by New England Biolabs listing essentially
159
every known restriction enzyme. In particular, this parser parses the REBASE
@@ -21,17 +15,9 @@ http://rebase.neb.com/rebase/rebase.f31.html
2115
The actual data dump itself is linked here and updated once a month:
2216
http://rebase.neb.com/rebase/link_withrefm
2317
24-
In ./data/rebase_test.txt we have the first 1002 lines of link_withrefm from
25-
Apr 25, 2021. From this file, we will extract lists of restriction enzymes,
26-
which can be used for simulation of restriction enzyme cutting and for cloning.
27-
2818
The header of this file gives a wonderful explanation of its structure. Here is the
2919
header with the commercial suppliers format and an example enzyme.
3020
31-
Cheers,
32-
Keoni Gandall
33-
34-
3521
```
3622
REBASE version 104 withrefm.104
3723
@@ -152,7 +138,14 @@ REBASE codes for commercial sources of enzymes
152138
<8>Tagami, H., Tayama, K., Tohyama, T., Fukaya, M., Okumura, H., Kawamura, Y., Horinouchi, S., Beppu, T., (1988) FEMS Microbiol. Lett., vol. 56, pp. 161-166.
153139
154140
```
155-
******************************************************************************/
141+
*/
142+
package rebase
143+
144+
import (
145+
"encoding/json"
146+
"io/ioutil"
147+
"strings"
148+
)
156149

157150
// Enzyme represents a single enzyme within the Rebase database
158151
type Enzyme struct {

io/uniprot/example_test.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
package uniprot_test
2+
3+
import (
4+
"fmt"
5+
"github.com/TimothyStiles/poly/io/uniprot"
6+
)
7+
8+
// This example shows how to open a uniprot data dump file and iterate over the
9+
// entries channel, keeping the last entry read. Consuming the channel directly,
10+
// without collecting the entries into a slice, should be used for the Trembl data dump.
11+
func Example_basic() {
12+
entries, _, _ := uniprot.Read("data/uniprot_sprot_mini.xml.gz")
13+
14+
var entry uniprot.Entry
15+
for singleEntry := range entries {
16+
entry = singleEntry
17+
}
18+
fmt.Println(entry.Accession[0])
19+
// Output: O55723
20+
}

io/uniprot/uniprot.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,23 @@
1+
/*
2+
Package uniprot provides an XML parser for Uniprot data dumps.
3+
4+
Uniprot is a comprehensive, high-quality, and freely accessible resource of protein
5+
sequence and functional information. It is the best(1) protein database out there.
6+
7+
Uniprot database dumps are available as gzipped FASTA files or gzipped XML files.
8+
The XML files have significantly more information than the FASTA files, and this
9+
parser specifically works on the gzipped XML files from Uniprot.
10+
11+
Uniprot provides an XML schema of their data dumps(3), which is useful for
12+
autogeneration of Golang structs. xsdgen was used to automatically generate
13+
xml.go from uniprot.xsd.
14+
15+
Each protein in Uniprot is known as an "Entry" (as defined in xml.go).
16+
17+
The function Parse stream-reads Uniprot into an Entry channel, from which you
18+
can use the entries however you want. Read simplifies reading gzipped files
19+
from a disk into an Entry channel.
20+
*/
121
package uniprot
222

323
import (

0 commit comments

Comments
 (0)