|
1 | 1 | package genbank |
2 | 2 |
|
3 | 3 | import ( |
| 4 | + "bufio" |
4 | 5 | "bytes" |
5 | 6 | "compress/gzip" |
| 7 | + "io" |
6 | 8 | "io/ioutil" |
7 | 9 | "log" |
| 10 | + "lukechampine.com/blake3" |
8 | 11 | "regexp" |
9 | 12 | "strconv" |
10 | 13 | "strings" |
@@ -35,6 +38,9 @@ func Parse(file []byte) poly.Sequence { |
35 | 38 | // Create sequence struct |
36 | 39 | sequence := poly.Sequence{} |
37 | 40 |
|
| 41 | + // Add the CheckSum to sequence (blake3) |
| 42 | + sequence.CheckSum = blake3.Sum256(file) |
| 43 | + |
38 | 44 | for numLine := 0; numLine < len(lines); numLine++ { |
39 | 45 | line := lines[numLine] |
40 | 46 | splitLine := strings.Split(line, " ") |
@@ -760,41 +766,29 @@ Genbank Flat specific IO related things begin here. |
760 | 766 |
|
761 | 767 | // ParseMulti parses multiple Genbank files in a byte array to multiple sequences |
762 | 768 | func ParseMulti(file []byte) []poly.Sequence { |
| 769 | + r := bytes.NewReader(file) |
| 770 | + sequences := make(chan poly.Sequence) |
| 771 | + go ParseConcurrent(r, sequences) |
763 | 772 |
|
764 | | - gbk := string(file) |
765 | | - genbankFiles := strings.SplitAfter(gbk, "//\n") |
766 | | - |
767 | | - //Remove last genbankFile in list. The real file terminates with //, which |
768 | | - //will be interpreted as an empty genbankFile. |
769 | | - genbankFiles = genbankFiles[:len(genbankFiles)-1] |
770 | | - |
771 | | - //Iterate through each genbankFile in genbankFiles list and Parse it |
772 | | - //using the Parse function. Return output. |
773 | | - var sequences []poly.Sequence |
774 | | - for _, f := range genbankFiles { |
775 | | - sequences = append(sequences, Parse([]byte(f))) |
| 773 | + var outputGenbanks []poly.Sequence |
| 774 | + for sequence := range sequences { |
| 775 | + outputGenbanks = append(outputGenbanks, sequence) |
776 | 776 | } |
777 | | - |
778 | | - return sequences |
779 | | - |
| 777 | + return outputGenbanks |
780 | 778 | } |
781 | 779 |
|
782 | 780 | // ParseFlat specifically takes the output of a Genbank Flat file that from |
783 | 781 | // the genbank ftp dumps. These files have 10 line headers, which are entirely |
784 | 782 | // removed |
785 | 783 | func ParseFlat(file []byte) []poly.Sequence { |
786 | | - |
787 | | - gbk := string(file) |
788 | | - |
789 | | - // This code removes the header, which is 10 lines long. This is inefficient |
790 | | - // and gets rid of the data in the header, which may be useful for some |
791 | | - // application. Header data is not needed to parse the Genbank files, though |
792 | | - gbkWithoutHeader := []byte(strings.Join(strings.Split(gbk, "\n")[10:], "\n")) |
793 | | - |
794 | | - // Pass gbkWithoutHeader to ParseMulti, which should handle |
795 | | - // the rest of the parsing just fine |
796 | | - sequences := ParseMulti(gbkWithoutHeader) |
797 | | - return sequences |
| 784 | + r := bytes.NewReader(file) |
| 785 | + sequences := make(chan poly.Sequence) |
| 786 | + go ParseFlatConcurrent(r, sequences) |
| 787 | + var outputGenbanks []poly.Sequence |
| 788 | + for sequence := range sequences { |
| 789 | + outputGenbanks = append(outputGenbanks, sequence) |
| 790 | + } |
| 791 | + return outputGenbanks |
798 | 792 | } |
799 | 793 |
|
800 | 794 | // ReadMulti reads multiple genbank files from a single file |
@@ -826,3 +820,51 @@ func ReadFlatGz(path string) []poly.Sequence { |
826 | 820 | Genbank Flat specific IO related things end here. |
827 | 821 |
|
828 | 822 | ******************************************************************************/ |
| 823 | + |
| 824 | +/****************************************************************************** |
| 825 | +
|
| 826 | +Genbank Concurrent specific IO related things begin here. |
| 827 | +
|
| 828 | +******************************************************************************/ |
| 829 | + |
| 830 | +// ParseConcurrent concurrently parses a given multi-Genbank file in an io.Reader into a channel of poly.Sequence. |
| 831 | +func ParseConcurrent(r io.Reader, sequences chan<- poly.Sequence) { |
| 832 | + var gbkStr string |
| 833 | + var gbk poly.Sequence |
| 834 | + |
| 835 | + // Start a new scanner |
| 836 | + scanner := bufio.NewScanner(r) |
| 837 | + for scanner.Scan() { |
| 838 | + line := scanner.Text() |
| 839 | + if line == "//" { |
| 840 | + gbkStr = gbkStr + "//" |
| 841 | + // Parse the genbank string and send it to the channel |
| 842 | + gbk = Parse([]byte(gbkStr)) |
| 843 | + sequences <- gbk |
| 844 | + // Reset the genbank string |
| 845 | + gbkStr = "" |
| 846 | + } else { |
| 847 | + // Append new lines of the Genbank file to a growing string |
| 848 | + gbkStr = gbkStr + line + "\n" |
| 849 | + } |
| 850 | + } |
| 851 | + close(sequences) |
| 852 | +} |
| 853 | + |
| 854 | +// ParseFlatConcurrent concurrently parses a given flat-Genbank file in an io.Reader into a channel of poly.Sequnce. |
| 855 | +func ParseFlatConcurrent(r io.Reader, sequences chan<- poly.Sequence) { |
| 856 | + // Start a new reader |
| 857 | + reader := bufio.NewReader(r) |
| 858 | + // Read 10 lines, or the header of a flat file |
| 859 | + // Header data is not needed to parse the Genbank files, though it may contain useful information. |
| 860 | + for i := 0; i < 10; i++ { |
| 861 | + _, _, _ = reader.ReadLine() |
| 862 | + } |
| 863 | + go ParseConcurrent(reader, sequences) |
| 864 | +} |
| 865 | + |
| 866 | +/****************************************************************************** |
| 867 | +
|
| 868 | +Genbank Concurrent specific IO related things end here. |
| 869 | +
|
| 870 | +******************************************************************************/ |
0 commit comments