Skip to content

Commit 9cd47eb

Browse files
Refactor docs (#60)
* added a couple of example functions to io_test.go. * added Gff IO example tests. * added gbk IO example tests. * added JSON IO example tests. * refactored parse functions to accept bytes instead of strings. * adding sample json file for testing. * added example tests for primer functions. * make complementBaseRuneMap private. * made defaultCodonTable maps private. * made getCodonFrequency private. * recommented BoothLeastRotation. * added example test to hash_test.go. * modified RotateSequence and made boothLeastRotation private. * added example tests to translation_test.go.
1 parent 67bc453 commit 9cd47eb

File tree

12 files changed

+423
-46
lines changed

12 files changed

+423
-46
lines changed

data/sample.json

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
{
2+
"Meta": {
3+
"Name": "",
4+
"GffVersion": "",
5+
"RegionStart": 0,
6+
"RegionEnd": 0,
7+
"Size": 0,
8+
"Type": "",
9+
"GenbankDivision": "",
10+
"Date": "",
11+
"Definition": "Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p (AXL2) and Rev7p (REV7) genes, complete cds.",
12+
"Accession": "U49845",
13+
"Version": "U49845.1 GI:1293613",
14+
"Keywords": ".",
15+
"Organism": "Saccharomyces cerevisiae Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes; Saccharomycetales; Saccharomycetaceae; Saccharomyces.",
16+
"Source": "Saccharomyces cerevisiae (baker's yeast)",
17+
"Origin": "",
18+
"Locus": {
19+
"Name": "SCU49845",
20+
"SequenceLength": "5028",
21+
"MoleculeType": "DNA",
22+
"GenBankDivision": "PLN",
23+
"ModDate": "21-JUN-1999",
24+
"SequenceCoding": "bp",
25+
"Circular": false
26+
},
27+
"References": [
28+
{
29+
"Index": "1",
30+
"Authors": "Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.",
31+
"Title": "Cloning and sequence of REV7, a gene whose function is required for DNA damage-induced mutagenesis in Saccharomyces cerevisiae",
32+
"Journal": "Yeast 10 (11), 1503-1509 (1994)",
33+
"PubMed": "7871890",
34+
"Remark": "",
35+
"Range": "(bases 1 to 5028)"
36+
},
37+
{
38+
"Index": "2",
39+
"Authors": "Roemer,T., Madden,K., Chang,J. and Snyder,M.",
40+
"Title": "Selection of axial growth sites in yeast requires Axl2p, a novel plasma membrane glycoprotein",
41+
"Journal": "Genes Dev. 10 (7), 777-793 (1996)",
42+
"PubMed": "8846915",
43+
"Remark": "",
44+
"Range": "(bases 1 to 5028)"
45+
},
46+
{
47+
"Index": "3",
48+
"Authors": "Roemer,T.",
49+
"Title": "Direct Submission",
50+
"Journal": "Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New Haven, CT, USA",
51+
"PubMed": "",
52+
"Remark": "",
53+
"Range": "(bases 1 to 5028)"
54+
}
55+
],
56+
"Primaries": null
57+
},
58+
"Features": [
59+
{
60+
"Name": "",
61+
"Source": "",
62+
"Type": "source",
63+
"Start": 1,
64+
"End": 5028,
65+
"Complement": false,
66+
"FivePrimePartial": false,
67+
"ThreePrimePartial": false,
68+
"Score": "",
69+
"Strand": "",
70+
"Phase": "",
71+
"Attributes": {
72+
"chromosome": "IX",
73+
"db_xref": "taxon:4932",
74+
"map": "9",
75+
"organism": "Saccharomyces cerevisiae"
76+
},
77+
"Location": "1..5028",
78+
"Sequence": ""
79+
},
80+
{
81+
"Name": "",
82+
"Source": "",
83+
"Type": "CDS",
84+
"Start": 1,
85+
"End": 206,
86+
"Complement": false,
87+
"FivePrimePartial": true,
88+
"ThreePrimePartial": false,
89+
"Score": "",
90+
"Strand": "",
91+
"Phase": "",
92+
"Attributes": {
93+
"codon_start": "3",
94+
"db_xref": "GI:1293614",
95+
"product": "TCP1-beta",
96+
"protein_id": "AAA98665.1",
97+
"translation": "SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEAAEVLLRVDNIIRARPRTANRQHM"
98+
},
99+
"Location": "\u003c1..206",
100+
"Sequence": ""
101+
},
102+
{
103+
"Name": "",
104+
"Source": "",
105+
"Type": "gene",
106+
"Start": 687,
107+
"End": 3158,
108+
"Complement": false,
109+
"FivePrimePartial": false,
110+
"ThreePrimePartial": false,
111+
"Score": "",
112+
"Strand": "",
113+
"Phase": "",
114+
"Attributes": {
115+
"gene": "AXL2"
116+
},
117+
"Location": "687..3158",
118+
"Sequence": ""
119+
},
120+
{
121+
"Name": "",
122+
"Source": "",
123+
"Type": "CDS",
124+
"Start": 687,
125+
"End": 3158,
126+
"Complement": false,
127+
"FivePrimePartial": false,
128+
"ThreePrimePartial": false,
129+
"Score": "",
130+
"Strand": "",
131+
"Phase": "",
132+
"Attributes": {
133+
"codon_start": "1",
134+
"db_xref": "GI:1293615",
135+
"function": "required for axial budding pattern of S.cerevisiae",
136+
"gene": "AXL2",
137+
"note": "plasma membrane glycoprotein",
138+
"product": "Axl2p",
139+
"protein_id": "AAA98666.1",
140+
"translation": "MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESFTFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFNVILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNEVFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPETSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYVYLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYGDVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQDHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSANATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIACGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLNNPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQSQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDSYGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTKHRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRLVDFSNKSNVNVGQVKDIHGRIPEML"
141+
},
142+
"Location": "687..3158",
143+
"Sequence": ""
144+
},
145+
{
146+
"Name": "",
147+
"Source": "",
148+
"Type": "gene",
149+
"Start": 3300,
150+
"End": 4037,
151+
"Complement": true,
152+
"FivePrimePartial": false,
153+
"ThreePrimePartial": false,
154+
"Score": "",
155+
"Strand": "",
156+
"Phase": "",
157+
"Attributes": {
158+
"gene": "REV7"
159+
},
160+
"Location": "complement(3300..4037)",
161+
"Sequence": ""
162+
},
163+
{
164+
"Name": "",
165+
"Source": "",
166+
"Type": "CDS",
167+
"Start": 3300,
168+
"End": 4037,
169+
"Complement": true,
170+
"FivePrimePartial": false,
171+
"ThreePrimePartial": false,
172+
"Score": "",
173+
"Strand": "",
174+
"Phase": "",
175+
"Attributes": {
176+
"codon_start": "1",
177+
"db_xref": "GI:1293616",
178+
"gene": "REV7",
179+
"product": "Rev7p",
180+
"protein_id": "AAA98667.1",
181+
"translation": "MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQFVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVDKDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNRRVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEKLISGDDKILNGVYSQYEEGESIFGSLF"
182+
},
183+
"Location": "complement(3300..4037)",
184+
"Sequence": ""
185+
}
186+
],
187+
"Sequence": {
188+
"Description": "",
189+
"Hash": "",
190+
"HashFunction": "",
191+
"Sequence": "gatcctccatatacaacggtatctccacctcaggtttagatctcaacaacggaaccattgccgacatgagacagttaggtatcgtcgagagttacaagctaaaacgagcagtagtcagctctgcatctgaagccgctgaagttctactaagggtggataacatcatccgtgcaagaccaagaaccgccaatagacaacatatgtaacatatttaggatatacctcgaaaataataaaccgccacactgtcattattataattagaaacagaacgcaaaaattatccactatataattcaaagacgcgaaaaaaaaagaacaacgcgtcatagaacttttggcaattcgcgtcacaaataaattttggcaacttatgtttcctcttcgagcagtactcgagccctgtctcaagaatgtaataatacccatcgtaggtatggttaaagatagcatctccacaacctcaaagctccttgccgagagtcgccctcctttgtcgagtaattttcacttttcatatgagaacttattttcttattctttactctcacatcctgtagtgattgacactgcaacagccaccatcactagaagaacagaacaattacttaatagaaaaattatatcttcctcgaaacgatttcctgcttccaacatctacgtatatcaagaagcattcacttaccatgacacagcttcagatttcattattgctgacagctactatatcactactccatctagtagtggccacgccctatgaggcatatcctatcggaaaacaataccccccagtggcaagagtcaatgaatcgtttacatttcaaatttccaatgatacctataaatcgtctgtagacaagacagctcaaataacatacaattgcttcgacttaccgagctggctttcgtttgactctagttctagaacgttctcaggtgaaccttcttctgacttactatctgatgcgaacaccacgttgtatttcaatgtaatactcgagggtacggactctgccgacagcacgtctttgaacaatacataccaatttgttgttacaaaccgtccatccatctcgctatcgtcagatttcaatctattggcgttgttaaaaaactatggttatactaacggcaaaaacgctctgaaactagatcctaatgaagtcttcaacgtgacttttgaccgttcaatgttcactaacgaagaatccattgtgtcgtattacggacgttctcagttgtataatgcgccgttacccaattggctgttcttcgattctggcgagttgaagtttactgggacggcaccggtgataaactcggcgattgctccagaaacaagctacagttttgtcatcatcgctacagacattgaaggattttctgccgttgaggtagaattcgaattagtcatcggggctcaccagttaactacctctattcaaaatagtttgataatcaacgttactgacacaggtaacgtttcatatgacttacctctaaactatgtttatctcgatgacgatcctatttcttctgataaattgggttctataaacttattggatgctccagactgggtggcattagataatgctaccatttccgggtctgtcccagatgaattactcggtaagaactccaatcctgccaatttttctgtgtccatttatgatacttatggtgatgtgatttatttcaacttcgaagttgtctccacaacggatttgtttgccattagttctcttcccaatattaacgctacaaggggtgaatggttctcctactattttttgccttctcagtttacagactacgtgaatacaaacgtttcattagagtttactaattcaagccaagaccatgactgggtgaaattccaatcatctaatttaacattagctggagaagtgcccaagaatttcgacaagctttcattaggtttgaaagcgaaccaaggttcacaatctcaagagctatattt
taacatcattggcatggattcaaagataactcactcaaaccacagtgcgaatgcaacgtccacaagaagttctcaccactccacctcaacaagttcttacacatcttctacttacactgcaaaaatttcttctacctccgctgctgctacttcttctgctccagcagcgctgccagcagccaataaaacttcatctcacaataaaaaagcagtagcaattgcgtgcggtgttgctatcccattaggcgttatcctagtagctctcatttgcttcctaatattctggagacgcagaagggaaaatccagacgatgaaaacttaccgcatgctattagtggacctgatttgaataatcctgcaaataaaccaaatcaagaaaacgctacacctttgaacaacccctttgatgatgatgcttcctcgtacgatgatacttcaatagcaagaagattggctgctttgaacactttgaaattggataaccactctgccactgaatctgatatttccagcgtggatgaaaagagagattctctatcaggtatgaatacatacaatgatcagttccaatcccaaagtaaagaagaattattagcaaaacccccagtacagcctccagagagcccgttctttgacccacagaataggtcttcttctgtgtatatggatagtgaaccagcagtaaataaatcctggcgatatactggcaacctgtcaccagtctctgatattgtcagagacagttacggatcacaaaaaactgttgatacagaaaaacttttcgatttagaagcaccagagaaggaaaaacgtacgtcaagggatgtcactatgtcttcactggacccttggaacagcaatattagcccttctcccgtaagaaaatcagtaacaccatcaccatataacgtaacgaagcatcgtaaccgccacttacaaaatattcaagactctcaaagcggtaaaaacggaatcactcccacaacaatgtcaacttcatcttctgacgattttgttccggttaaagatggtgaaaatttttgctgggtccatagcatggaaccagacagaagaccaagtaagaaaaggttagtagatttttcaaataagagtaatgtcaatgttggtcaagttaaggacattcacggacgcatcccagaaatgctgtgattatacgcaacgatattttgcttaattttattttcctgttttattttttattagtggtttacagataccctatattttatttagtttttatacttagagacatttaattttaattccattcttcaaatttcatttttgcacttaaaacaaagatccaaaaatgctctcgccctcttcatattgagaatacactccattcaaaattttgtcgtcaccgctgattaatttttcactaaactgatgaataatcaaaggccccacgtcagaaccgactaaagaagtgagttttattttaggaggttgaaaaccattattgtctggtaaattttcatcttcttgacatttaacccagtttgaatccctttcaatttctgctttttcctccaaactatcgaccctcctgtttctgtccaacttatgtcctagttccaattcgatcgcattaataactgcttcaaatgttattgtgtcatcgttgactttaggtaatttctccaaatgcataatcaaactatttaaggaagatcggaattcgtcgaacacttcagtttccgtaatgatctgatcgtctttatccacatgttgtaattcactaaaatctaaaacgtatttttcaatgcataaatcgttctttttattaataatgcagatggaaaatctgtaaacgtgcgttaatttagaaagaacatccagtataagttcttctatatagtcaattaaagcaggatgcctattaatgggaacgaactgcggcaagttgaatgactggtaagtagtgtagtcgaatgactgaggtgggtatacatttctataaaataaaatcaaatta
atgtagcattttaagtataccctcagccacttctctacccatctattcataaagctgacgcaacgattactattttttttttcttcttggatctcagtcgtcgcaaaaacgtataccttctttttccgaccttttttttagctttctggaaaagtttatattagttaaacagggtctagtcttagtgtgaaagctagtggtttcgattgactgatattaagaaagtggaaattaaattagtagtgtagacgtatatgcatatgtatttctcgcctgtttatgtttctacgtacttttgatttatagcaaggggaaaagaaatacatactattttttggtaaaggtgaaagcataatgtaaaagctagaataaaatggacgaaataaagagaggcttagttcatcttttttccaaaaagcacccaatgataataactaaaatgaaaaggatttgccatctgtcagcaacatcagttgtgtgagcaataataaaatcatcacctccgttgcctttagcgcgtttgtcgtttgtatcttccgtaattttagtcttatcaatgggaatcataaattttccaatgaattagcaatttcgtccaattctttttgagcttcttcatatttgctttggaattcttcgcacttcttttcccattcatctctttcttcttccaaagcaacgatccttctacccatttgctcagagttcaaatcggcctctttcagtttatccattgcttccttcagtttggcttcactgtcttctagctgttgttctagatcctggtttttcttggtgtagttctcattattagatctcaagttattggagtcttcagccaattgctttgtatcagacaattgactctctaacttctccacttcactgtcgagttgctcgtttttagcggacaaagatttaatctcgttttctttttcagtgttagattgctctaattctttgagctgttctctcagctcctcatatttttcttgccatgactcagattctaattttaagctattcaatttctctttgatc"
192+
}
193+
}

hash.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,11 @@ import (
3636
// BLAKE2b_384 // import golang.org/x/crypto/blake2b
3737
// BLAKE2b_512 // import golang.org/x/crypto/blake2b
3838

39-
// BoothLeastRotation gets the least rotation of a circular string.
40-
// https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation
41-
// this is generally over commented but I'm keeping it this way for now. - Tim
42-
func BoothLeastRotation(sequence string) int {
39+
// boothLeastRotation gets the least rotation of a circular string.
40+
func boothLeastRotation(sequence string) int {
41+
42+
// https://en.wikipedia.org/wiki/Lexicographically_minimal_string_rotation
43+
// this is generally over commented but I'm keeping it this way for now. - Tim
4344

4445
// first concatenate the sequence to itself to avoid modular arithmetic
4546
sequence += sequence // maybe do this as a buffer just for speed? May get annoying with larger sequences.
@@ -89,8 +90,14 @@ func BoothLeastRotation(sequence string) int {
8990

9091
// RotateSequence rotates circular sequences to a deterministic point.
9192
func RotateSequence(sequence string) string {
92-
rotationIndex := BoothLeastRotation(sequence)
93-
concatenatedSequence := sequence + sequence
93+
rotationIndex := boothLeastRotation(sequence)
94+
var sequenceBuilder strings.Builder
95+
96+
// write the same sequence twice, using a builder in case of a very long circular genome.
97+
sequenceBuilder.WriteString(sequence)
98+
sequenceBuilder.WriteString(sequence)
99+
100+
concatenatedSequence := sequenceBuilder.String()
94101
sequence = concatenatedSequence[rotationIndex : rotationIndex+len(sequence)]
95102
return sequence
96103
}

hash_test.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,13 @@ import (
99
"lukechampine.com/blake3"
1010
)
1111

12+
func ExampleHash() {
13+
puc19 := ReadGbk("data/puc19.gbk")
14+
fmt.Println(puc19.Hash(blake3.New(32, nil))) // passing new hash.Hash struct to Hasher
15+
16+
// output: 4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9
17+
}
18+
1219
func TestHashRegression(t *testing.T) {
1320
puc19GbkBlake3Hash := "4b0616d1b3fc632e42d78521deb38b44fba95cca9fde159e01cd567fa996ceb9"
1421
puc19 := ReadGbk("data/puc19.gbk")
@@ -28,6 +35,15 @@ func TestHashRegression(t *testing.T) {
2835
}
2936
}
3037

38+
func ExampleRotateSequence() {
39+
sequence := ReadGbk("data/puc19.gbk")
40+
sequenceLength := len(sequence.Sequence)
41+
testSequence := sequence.Sequence[sequenceLength/2:] + sequence.Sequence[0:sequenceLength/2]
42+
43+
fmt.Println(RotateSequence(sequence.Sequence) == RotateSequence(testSequence))
44+
// output: true
45+
}
46+
3147
func TestLeastRotation(t *testing.T) {
3248
sequence := ReadGbk("data/puc19.gbk")
3349
var sequenceBuffer bytes.Buffer

io.go

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,9 @@ GFF specific IO related things begin here.
139139
******************************************************************************/
140140

141141
// ParseGff takes in a byte slice representing a gffv3 file and parses it into a Sequence object.
142-
func ParseGff(gff string) Sequence {
142+
func ParseGff(gffBytes []byte) Sequence {
143+
144+
gff := string(gffBytes)
143145
sequence := Sequence{}
144146

145147
lines := strings.Split(gff, "\n")
@@ -327,7 +329,7 @@ func ReadGff(path string) Sequence {
327329
if err != nil {
328330
// return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err)
329331
} else {
330-
sequence = ParseGff(string(file))
332+
sequence = ParseGff(file)
331333
}
332334
return sequence
333335
}
@@ -392,8 +394,8 @@ FASTA specific IO related things begin here.
392394
******************************************************************************/
393395

394396
// ParseFASTA parses a Sequence struct from a FASTA file and adds appropriate pointers to the structs.
395-
func ParseFASTA(fasta string) Sequence {
396-
397+
func ParseFASTA(fastaBytes []byte) Sequence {
398+
fasta := string(fastaBytes)
397399
var sequence Sequence
398400
var feature Feature
399401
var features []Feature
@@ -500,8 +502,8 @@ func ReadFASTA(path string) Sequence {
500502
if err != nil {
501503
// return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err)
502504
}
503-
annotatedSequenceArray := ParseFASTA(string(file))
504-
return annotatedSequenceArray
505+
sequence := ParseFASTA(file)
506+
return sequence
505507
}
506508

507509
// WriteFASTA writes a Sequence struct out to FASTA.
@@ -522,8 +524,9 @@ GBK specific IO related things begin here.
522524
******************************************************************************/
523525

524526
// ParseGbk takes in a byte slice representing a gbk/gb/genbank file and parses it into a Sequence object.
525-
func ParseGbk(gbk string) Sequence {
527+
func ParseGbk(gbkBytes []byte) Sequence {
526528

529+
gbk := string(gbkBytes)
527530
lines := strings.Split(gbk, "\n")
528531

529532
// Create meta struct
@@ -716,8 +719,7 @@ func ReadGbk(path string) Sequence {
716719
if err != nil {
717720
// return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err)
718721
} else {
719-
gbkString := string(file)
720-
sequence = ParseGbk(gbkString)
722+
sequence = ParseGbk(file)
721723

722724
}
723725
return sequence

0 commit comments

Comments
 (0)