Skip to content

Commit 09350cb

Browse files
author
Koeng101
authored
Added concurrent parsing feature for genbank files (#182)
* Added concurrent parsing feature for genbank files * Added ParseFlatConcurrent * Updated to have generic CheckSum instead of just MD5
1 parent ac4a3f3 commit 09350cb

File tree

5 files changed

+80
-33
lines changed

5 files changed

+80
-33
lines changed

cmd/poly/commands_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func TestConvertPipe(t *testing.T) {
6262
baseTestSequence := parseExt(match)
6363
pipeOutputTestSequence := polyjson.Parse(writeBuffer.Bytes())
6464

65-
if diff := cmp.Diff(baseTestSequence, pipeOutputTestSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" {
65+
if diff := cmp.Diff(baseTestSequence, pipeOutputTestSequence, []cmp.Option{cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence"), cmpopts.IgnoreFields(poly.Sequence{}, "CheckSum")}...); diff != "" {
6666
t.Errorf(" mismatch converting from %q to json (-want +got):\n%s", extension, diff)
6767
}
6868
}
@@ -92,7 +92,7 @@ func TestConvertIO(t *testing.T) {
9292

9393
pipeOutputTestSequence := parseFlag(writeBuffer.Bytes(), extension)
9494

95-
if diff := cmp.Diff(baseTestSequence, pipeOutputTestSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" {
95+
if diff := cmp.Diff(baseTestSequence, pipeOutputTestSequence, []cmp.Option{cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence"), cmpopts.IgnoreFields(poly.Sequence{}, "CheckSum")}...); diff != "" {
9696
t.Errorf(" mismatch reading and writing %q (-want +got):\n%s", extension, diff)
9797
}
9898
}
@@ -121,7 +121,7 @@ func TestConvertWriteFile(t *testing.T) {
121121
outputSequence := parseExt(testOutputPath)
122122
os.Remove(testOutputPath)
123123

124-
if diff := cmp.Diff(baseTestSequence, outputSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" {
124+
if diff := cmp.Diff(baseTestSequence, outputSequence, []cmp.Option{cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence"), cmpopts.IgnoreFields(poly.Sequence{}, "CheckSum")}...); diff != "" {
125125
t.Errorf(" mismatch reading and writing %q (-want +got):\n%s", extension, diff)
126126
}
127127
}

io/genbank/genbank.go

Lines changed: 69 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,13 @@
11
package genbank
22

33
import (
4+
"bufio"
45
"bytes"
56
"compress/gzip"
7+
"io"
68
"io/ioutil"
79
"log"
10+
"lukechampine.com/blake3"
811
"regexp"
912
"strconv"
1013
"strings"
@@ -35,6 +38,9 @@ func Parse(file []byte) poly.Sequence {
3538
// Create sequence struct
3639
sequence := poly.Sequence{}
3740

41+
// Add the CheckSum to sequence (blake3)
42+
sequence.CheckSum = blake3.Sum256(file)
43+
3844
for numLine := 0; numLine < len(lines); numLine++ {
3945
line := lines[numLine]
4046
splitLine := strings.Split(line, " ")
@@ -760,41 +766,29 @@ Genbank Flat specific IO related things begin here.
760766

761767
// ParseMulti parses multiple Genbank files in a byte array to multiple sequences
762768
func ParseMulti(file []byte) []poly.Sequence {
769+
r := bytes.NewReader(file)
770+
sequences := make(chan poly.Sequence)
771+
go ParseConcurrent(r, sequences)
763772

764-
gbk := string(file)
765-
genbankFiles := strings.SplitAfter(gbk, "//\n")
766-
767-
//Remove last genbankFile in list. The real file terminates with //, which
768-
//will be interpreted as an empty genbankFile.
769-
genbankFiles = genbankFiles[:len(genbankFiles)-1]
770-
771-
//Iterate through each genbankFile in genbankFiles list and Parse it
772-
//using the Parse function. Return output.
773-
var sequences []poly.Sequence
774-
for _, f := range genbankFiles {
775-
sequences = append(sequences, Parse([]byte(f)))
773+
var outputGenbanks []poly.Sequence
774+
for sequence := range sequences {
775+
outputGenbanks = append(outputGenbanks, sequence)
776776
}
777-
778-
return sequences
779-
777+
return outputGenbanks
780778
}
781779

782780
// ParseFlat specifically takes the output of a Genbank Flat file that comes from
783781
// the genbank ftp dumps. These files have 10 line headers, which are entirely
784782
// removed
785783
func ParseFlat(file []byte) []poly.Sequence {
786-
787-
gbk := string(file)
788-
789-
// This code removes the header, which is 10 lines long. This is inefficient
790-
// and gets rid of the data in the header, which may be useful for some
791-
// application. Header data is not needed to parse the Genbank files, though
792-
gbkWithoutHeader := []byte(strings.Join(strings.Split(gbk, "\n")[10:], "\n"))
793-
794-
// Pass gbkWithoutHeader to ParseMulti, which should handle
795-
// the rest of the parsing just fine
796-
sequences := ParseMulti(gbkWithoutHeader)
797-
return sequences
784+
r := bytes.NewReader(file)
785+
sequences := make(chan poly.Sequence)
786+
go ParseFlatConcurrent(r, sequences)
787+
var outputGenbanks []poly.Sequence
788+
for sequence := range sequences {
789+
outputGenbanks = append(outputGenbanks, sequence)
790+
}
791+
return outputGenbanks
798792
}
799793

800794
// ReadMulti reads multiple genbank files from a single file
@@ -826,3 +820,51 @@ func ReadFlatGz(path string) []poly.Sequence {
826820
Genbank Flat specific IO related things end here.
827821
828822
******************************************************************************/
823+
824+
/******************************************************************************
825+
826+
Genbank Concurrent specific IO related things begin here.
827+
828+
******************************************************************************/
829+
830+
// ParseConcurrent concurrently parses a given multi-Genbank file in an io.Reader into a channel of poly.Sequence.
831+
func ParseConcurrent(r io.Reader, sequences chan<- poly.Sequence) {
832+
var gbkStr string
833+
var gbk poly.Sequence
834+
835+
// Start a new scanner
836+
scanner := bufio.NewScanner(r)
837+
for scanner.Scan() {
838+
line := scanner.Text()
839+
if line == "//" {
840+
gbkStr = gbkStr + "//"
841+
// Parse the genbank string and send it to the channel
842+
gbk = Parse([]byte(gbkStr))
843+
sequences <- gbk
844+
// Reset the genbank string
845+
gbkStr = ""
846+
} else {
847+
// Append new lines of the Genbank file to a growing string
848+
gbkStr = gbkStr + line + "\n"
849+
}
850+
}
851+
close(sequences)
852+
}
853+
854+
// ParseFlatConcurrent concurrently parses a given flat-Genbank file in an io.Reader into a channel of poly.Sequence.
855+
func ParseFlatConcurrent(r io.Reader, sequences chan<- poly.Sequence) {
856+
// Start a new reader
857+
reader := bufio.NewReader(r)
858+
// Read 10 lines, or the header of a flat file
859+
// Header data is not needed to parse the Genbank files, though it may contain useful information.
860+
for i := 0; i < 10; i++ {
861+
_, _, _ = reader.ReadLine()
862+
}
863+
go ParseConcurrent(reader, sequences)
864+
}
865+
866+
/******************************************************************************
867+
868+
Genbank Concurrent specific IO related things end here.
869+
870+
******************************************************************************/

io/genbank/genbank_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ func TestGbkIO(t *testing.T) {
7373
Write(gbk, tmpGbkFilePath)
7474

7575
writeTestGbk := Read(tmpGbkFilePath)
76-
if diff := cmp.Diff(gbk, writeTestGbk, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" {
76+
if diff := cmp.Diff(gbk, writeTestGbk, []cmp.Option{cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence"), cmpopts.IgnoreFields(poly.Sequence{}, "CheckSum")}...); diff != "" {
7777
t.Errorf("Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff)
7878
}
7979

@@ -108,7 +108,7 @@ func TestGbkLocationStringBuilder(t *testing.T) {
108108
testInputGbk := Read("../../data/sample.gbk")
109109
testOutputGbk := Read(tmpGbkFilePath)
110110

111-
if diff := cmp.Diff(testInputGbk, testOutputGbk, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" {
111+
if diff := cmp.Diff(testInputGbk, testOutputGbk, []cmp.Option{cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence"), cmpopts.IgnoreFields(poly.Sequence{}, "CheckSum")}...); diff != "" {
112112
t.Errorf("Issue with partial location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff)
113113
}
114114
}
@@ -133,7 +133,7 @@ func TestGbLocationStringBuilder(t *testing.T) {
133133
testInputGb := Read("../../data/t4_intron.gb")
134134
testOutputGb := Read(tmpGbFilePath)
135135

136-
if diff := cmp.Diff(testInputGb, testOutputGb, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" {
136+
if diff := cmp.Diff(testInputGb, testOutputGb, []cmp.Option{cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence"), cmpopts.IgnoreFields(poly.Sequence{}, "CheckSum")}...); diff != "" {
137137
t.Errorf("Issue with either Join or complement location building. Parsing the output of Build() does not produce the same output as parsing the original file read with Read(). Got this diff:\n%s", diff)
138138
}
139139
}

io/gff/gff.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"bytes"
55
"io/ioutil"
66
"log"
7+
"lukechampine.com/blake3"
78
"regexp"
89
"sort"
910
"strconv"
@@ -18,6 +19,9 @@ func Parse(file []byte) poly.Sequence {
1819
gff := string(file)
1920
sequence := poly.Sequence{}
2021

22+
// Add the CheckSum to sequence (blake3)
23+
sequence.CheckSum = blake3.Sum256(file)
24+
2125
lines := strings.Split(gff, "\n")
2226
metaString := lines[0:2]
2327
versionString := metaString[0]

poly.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ type Sequence struct {
103103
SequenceHashFunction string `json:"hash_function"`
104104
Sequence string `json:"sequence"`
105105
Features []Feature `json:"features"`
106+
CheckSum [32]byte `json:"checkSum"` // blake3 checksum of the parsed file itself. Useful for if you want to check if incoming genbank/gff files are different.
106107
}
107108

108109
// AddFeature is the canonical way to add a Feature into a Sequence struct. Appending a Feature struct directly to Sequence.Features will break the .GetSequence() method.

0 commit comments

Comments
 (0)