From ebe83db6de87b5c821eb01311e28561940a2b8f6 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Tue, 3 Mar 2026 12:06:51 +1100 Subject: [PATCH 1/2] early lib --- Makefile | 20 +++++++-- examples/example.c | 19 +++++++++ include/squigulator.h | 20 +++++++++ src/sim.c | 9 ++-- src/sq_api.c | 98 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 158 insertions(+), 8 deletions(-) create mode 100644 examples/example.c create mode 100755 include/squigulator.h create mode 100755 src/sq_api.c diff --git a/Makefile b/Makefile index 6f21823..634d329 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,14 @@ ifeq ($(zstd),1) LDFLAGS += -lzstd endif +STATICLIB = $(BUILD_DIR)/libsquigulator.a + BINARY = squigulator -OBJ = $(BUILD_DIR)/main.o \ - $(BUILD_DIR)/model.o \ + +OBJ_BIN = $(BUILD_DIR)/main.o $(OBJ) +OBJ_LIB = $(BUILD_DIR)/sq_api.o $(OBJ) + +OBJ = $(BUILD_DIR)/model.o \ $(BUILD_DIR)/methmodel.o \ $(BUILD_DIR)/misc.o \ $(BUILD_DIR)/sim.o \ @@ -31,8 +36,12 @@ endif .PHONY: clean distclean test -$(BINARY): $(OBJ) slow5lib/lib/libslow5.a - $(CC) $(CFLAGS) $(OBJ) slow5lib/lib/libslow5.a $(LDFLAGS) -o $@ +$(BINARY): $(OBJ_BIN) slow5lib/lib/libslow5.a + $(CC) $(CFLAGS) $(OBJ_BIN) slow5lib/lib/libslow5.a $(LDFLAGS) -o $@ + +$(STATICLIB): $(OBJ_LIB) slow5lib/lib/libslow5.a + cp slow5lib/lib/libslow5.a $@ + $(AR) rcs $@ $(OBJ_LIB) HEADERS = src/error.h src/format.h src/misc.h src/model.h \ src/rand.h src/ref.h src/seq.h src/sq.h src/str.h src/version.h \ @@ -41,6 +50,9 @@ HEADERS = src/error.h src/format.h src/misc.h src/model.h \ $(BUILD_DIR)/main.o: src/main.c $(HEADERS) $(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@ +$(BUILD_DIR)/sq_api.o: src/sq_api.c $(HEADERS) include/squigulator.h + $(CC) $(CFLAGS) $(CPPFLAGS) -I include/ $< -c -o $@ + $(BUILD_DIR)/model.o: src/model.c src/model.h src/misc.h $(CC) $(CFLAGS) $(CPPFLAGS) $< -c -o $@ diff --git a/examples/example.c b/examples/example.c new file mode 100644 index 0000000..9890c4f --- /dev/null +++ b/examples/example.c @@ -0,0 +1,19 @@ +// gcc -Wall -I include/ examples/example.c build/libsquigulator.a -o example -lz -lm -lpthread + +#include +#include + +int main(){ + squig_t *sq = squig_init("test/nCoV-2019.reference.fasta", "out.blow5","dna-r9-prom"); + squig_batch_t *batch = squig_sim_batch(sq, 10); + + for(int i=0; i<10; i++){ + slow5_rec_t *rec = squig_decode(sq, batch, i); + fprintf(stdout, "%s\n", rec->read_id); + slow5_rec_free(rec); + } + + squig_free_batch(batch); + squig_free(sq); + return 0; +} \ No newline at end of file diff --git a/include/squigulator.h b/include/squigulator.h new file mode 100755 index 0000000..142eb68 --- /dev/null +++ b/include/squigulator.h @@ -0,0 +1,20 @@ +#ifndef SQUIGULATOR_H +#define SQUIGULATOR_H + +#include + +typedef struct squig_s squig_t; +typedef struct squig_batch_s squig_batch_t; + + +squig_t *squig_init(char *ref_fasta, char* output_blow5, char *profile ); + +void squig_free(squig_t *sq); + +squig_batch_t* squig_sim_batch(squig_t *sq, int num_reads); + +void squig_free_batch(squig_batch_t *batch); + +slow5_rec_t *squig_decode(squig_t *sq, squig_batch_t *batch, int i); + +#endif \ No newline at end of file diff --git a/src/sim.c b/src/sim.c index cf516c4..e61d70e 100644 --- a/src/sim.c +++ b/src/sim.c @@ -149,7 +149,8 @@ profile_t minion_rna004_rna_prof = { .dwell_std=0.0 }; -static inline profile_t set_profile(char *prof_name, opt_t *opt){ + +profile_t set_profile(char *prof_name, opt_t *opt){ //opt->rna = 0; if(strcmp(prof_name, "dna-r9-min") == 0){ return minion_r9_dna_prof; @@ -191,14 +192,14 @@ static inline profile_t set_profile(char *prof_name, opt_t *opt){ } } -static void print_model_stat(profile_t p){ +static inline void print_model_stat(profile_t p){ VERBOSE("digitisation: %.1f; sample_rate: %.1f; range: %.1f; offset_mean: %.1f; offset_std: %.1f; dwell_mean: %.1f; dwell_std: %.1f", p.digitisation,p.sample_rate,p.range,p.offset_mean,p.offset_std,p.dwell_mean,p.dwell_std); } ///////////////////////////////////////////// CORE and DB ///////////////////////////////////////////// -static void init_opt(opt_t *opt){ +void init_opt(opt_t *opt){ //opt->ideal = 0; //opt->ideal_time = 0; //opt->ideal_amp = 0; @@ -261,7 +262,7 @@ static void init_rand(core_t *core){ } } -static core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_file, char *fasta, char *paf, char *sam, char *trans_count){ +core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_file, char *fasta, char *paf, char *sam, char *trans_count){ core_t *core = (core_t *)malloc(sizeof(core_t)); MALLOC_CHK(core); core->opt = opt; diff --git a/src/sq_api.c b/src/sq_api.c new file mode 100755 index 0000000..2a1d10e --- /dev/null +++ b/src/sq_api.c @@ -0,0 +1,98 @@ +#include "squigulator.h" +#include "version.h" +#include "sq.h" +#include "ref.h" +#include "format.h" +#include "seq.h" +#include "rand.h" +#include "error.h" +#include "misc.h" + +struct squig_s { + core_t *core; + slow5_file_t *sp; +}; + +struct squig_batch_s { + db_t *db; +}; + +// db_t* init_db(core_t* core, int32_t n_rec); +core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_file, char *fasta, char *paf, char *sam, char *trans_count); +void init_opt(opt_t *opt); +profile_t set_profile(char *prof_name, opt_t *opt); +void free_core(core_t *core); +void free_db(db_t* db); +db_t* init_db(core_t* core, int32_t n_rec); +void process_db(core_t* core,db_t* db); +void output_db(core_t* core, db_t* db); + +squig_t *squig_init(char *ref_fasta, char* output_blow5, char *profile){ + squig_t *sq = (squig_t *)malloc(sizeof(squig_t *)); + MALLOC_CHK(sq); + + opt_t opt; + init_opt(&opt); + + profile_t p = set_profile(profile, &opt); + + char *output_file = output_blow5; + char *fasta = NULL; + char *paf = NULL; + char *sam = NULL; + char *trans_count = NULL; + + opt.seed = 1; //todo, fix this + + sq->core = init_core(opt, p, ref_fasta, output_file, fasta, paf, sam, trans_count); + + return sq; +} + +void squig_free(squig_t *sq){ + free_core(sq->core); + free(sq); + +} + + +void squig_free_batch(squig_batch_t *batch){ + free_db(batch->db); + free(batch); +} + +squig_batch_t *squig_sim_batch(squig_t *sq, int num_reads){ + //todo check if the n<=c + squig_batch_t *batch = (squig_batch_t *)malloc(sizeof(squig_batch_t *)); + batch->db = init_db(sq->core, num_reads); + process_db(sq->core, batch->db); + output_db(sq->core, batch->db); + return batch; +} + +slow5_rec_t *squig_decode(squig_t *sq, squig_batch_t *batch, int i){ + slow5_rec_t *rec = NULL; + + size_t bytes = batch->db->mem_bytes[i]; + char *mem = (char *)malloc(bytes); + MALLOC_CHK(mem); + + if(sq->core->sp->format == SLOW5_FORMAT_ASCII){ + memcpy(mem, batch->db->mem_records[i], bytes); + mem[bytes-1] = '\0'; + } else if (sq->core->sp->format == SLOW5_FORMAT_BINARY){ + memcpy(mem, batch->db->mem_records[i]+8, bytes-8); + } else { + fprintf(stderr, "Something is not right!\n"); + exit(EXIT_FAILURE); + } + + if (slow5_decode(&mem, &batch->db->mem_bytes[i], &rec, sq->core->sp) < 0){ + fprintf(stderr,"Error in decoding record\n"); + exit(EXIT_FAILURE); + } + + free(mem); + + return rec; +} From be456f909796de02258abcb14b3055b61b0432c9 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Tue, 3 Mar 2026 13:39:51 +1100 Subject: [PATCH 2/2] mostly done --- .gitignore | 2 + examples/example.c | 36 ++++++++++++---- include/squigulator.h | 44 +++++++++++++++---- src/sq_api.c | 99 +++++++++++++++++++++++++++++++++++++------ 4 files changed, 152 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index ba6879c..a1ff030 100644 --- a/.gitignore +++ b/.gitignore @@ -87,3 +87,5 @@ test/ecoli_2kb_region/reads.fasta.index.readdb /test/*.index /*.paf /test/hu +/example +build/libsquigulator.a \ No newline at end of file diff --git a/examples/example.c b/examples/example.c index 9890c4f..df77bca 100644 --- a/examples/example.c +++ b/examples/example.c @@ -1,19 +1,39 @@ // gcc -Wall -I include/ examples/example.c build/libsquigulator.a -o example -lz -lm -lpthread +#include #include -#include int main(){ - squig_t *sq = squig_init("test/nCoV-2019.reference.fasta", "out.blow5","dna-r9-prom"); - squig_batch_t *batch = squig_sim_batch(sq, 10); - for(int i=0; i<10; i++){ - slow5_rec_t *rec = squig_decode(sq, batch, i); - fprintf(stdout, "%s\n", rec->read_id); - slow5_rec_free(rec); + squig_opt_t opt; + squig_init_opt(&opt); + opt.output_blow5 = "test.blow5"; + opt.profile = "dna-r9-prom"; + opt.num_threads = 8; + opt.batch_size = 1000; + + squig_t *sq = squig_init("test/nCoV-2019.reference.fasta", &opt); + + fprintf(stdout,"read_id\tread_group\tdigitisation\toffset\trange\tlen_raw_signal\tsum_raw_signal\n"); + + for(int n=0; n<5; n++){ //1000*5 reads + + squig_batch_t *batch = squig_sim_batch(sq); + + for(int i=0; i < opt.batch_size; i++){ + squig_rec_t *rec = squig_get_rec(sq, batch, i); + fprintf(stdout, "%s\t%d\t%.0f\t%.0f\t%.0f\t%.0f\t%ld\t", rec->read_id, rec->read_group, rec->digitisation, rec->offset, rec->range, rec->sampling_rate, rec->len_raw_signal); + int64_t raw_signal_sum = 0; + for(int j=0; j < rec->len_raw_signal; j++){ + raw_signal_sum += rec->raw_signal[j]; + } + fprintf(stdout, "%ld\n", raw_signal_sum); + } + + squig_free_batch(batch); } - squig_free_batch(batch); squig_free(sq); + return 0; } \ No newline at end of file diff --git a/include/squigulator.h b/include/squigulator.h index 142eb68..43d6dee 100755 --- a/include/squigulator.h +++ b/include/squigulator.h @@ -1,20 +1,48 @@ #ifndef SQUIGULATOR_H #define SQUIGULATOR_H -#include +#include + +typedef struct { + //from primary SLOW5 fields + char* read_id; + uint32_t read_group; + double digitisation; + double offset; + double range; + double sampling_rate; + uint64_t len_raw_signal; + int16_t* raw_signal; + + //from auxilliary SLOW5 fields + char *channel_number; + double median_before; + int32_t read_number; + uint8_t start_mux; + uint64_t start_time; + uint8_t end_reason; + +} squig_rec_t; + +typedef struct { + int32_t avg_rlen; + int64_t random_seed; + int32_t num_threads; + int32_t batch_size; + char *output_blow5; + char *profile; + //things to add later ... +} squig_opt_t; typedef struct squig_s squig_t; typedef struct squig_batch_s squig_batch_t; - -squig_t *squig_init(char *ref_fasta, char* output_blow5, char *profile ); - +void squig_init_opt(squig_opt_t *opts); +squig_t *squig_init(char *ref_fasta, squig_opt_t *opts); void squig_free(squig_t *sq); - -squig_batch_t* squig_sim_batch(squig_t *sq, int num_reads); - +squig_batch_t* squig_sim_batch(squig_t *sq); void squig_free_batch(squig_batch_t *batch); +squig_rec_t *squig_get_rec(squig_t *sq, squig_batch_t *batch, int i); -slow5_rec_t *squig_decode(squig_t *sq, squig_batch_t *batch, int i); #endif \ No newline at end of file diff --git a/src/sq_api.c b/src/sq_api.c index 2a1d10e..2fc58af 100755 --- a/src/sq_api.c +++ b/src/sq_api.c @@ -1,3 +1,8 @@ + +#define _XOPEN_SOURCE 700 +#include +#include + #include "squigulator.h" #include "version.h" #include "sq.h" @@ -8,16 +13,22 @@ #include "error.h" #include "misc.h" + + + + struct squig_s { core_t *core; - slow5_file_t *sp; + squig_opt_t opt; + // can later have the header attributes and enum labels here }; struct squig_batch_s { db_t *db; + squig_rec_t **rec; + int n_rec; }; -// db_t* init_db(core_t* core, int32_t n_rec); core_t *init_core(opt_t opt, profile_t p, char *refname, char *output_file, char *fasta, char *paf, char *sam, char *trans_count); void init_opt(opt_t *opt); profile_t set_profile(char *prof_name, opt_t *opt); @@ -27,25 +38,43 @@ db_t* init_db(core_t* core, int32_t n_rec); void process_db(core_t* core,db_t* db); void output_db(core_t* core, db_t* db); -squig_t *squig_init(char *ref_fasta, char* output_blow5, char *profile){ - squig_t *sq = (squig_t *)malloc(sizeof(squig_t *)); +void squig_init_opt(squig_opt_t *opts){ + opts->avg_rlen = 1000; + opts->random_seed = 0; + opts->num_threads = 8; + opts->batch_size = 1000; + opts->output_blow5 = "output.blow5"; + opts->profile = "dna-r9-prom"; +} + +squig_t *squig_init(char *ref_fasta, squig_opt_t *opts){ + squig_t *sq = (squig_t *)malloc(sizeof(squig_t)); MALLOC_CHK(sq); opt_t opt; init_opt(&opt); - profile_t p = set_profile(profile, &opt); + opt.rlen = opts->avg_rlen; + opt.num_thread = opts->num_threads; + opt.batch_size = opts->batch_size; + if (opts->random_seed == 0){ + opts->random_seed = realtime(); + } + opt.seed = opts->random_seed; - char *output_file = output_blow5; + profile_t p = set_profile(opts->profile, &opt); + + char *output_file = opts->output_blow5; char *fasta = NULL; char *paf = NULL; char *sam = NULL; char *trans_count = NULL; - opt.seed = 1; //todo, fix this sq->core = init_core(opt, p, ref_fasta, output_file, fasta, paf, sam, trans_count); + sq->opt = *opts; + return sq; } @@ -57,20 +86,43 @@ void squig_free(squig_t *sq){ void squig_free_batch(squig_batch_t *batch){ + for(int i = 0; i < batch->n_rec; i++){ + free(batch->rec[i]->read_id); + free(batch->rec[i]->raw_signal); + free(batch->rec[i]->channel_number); + free(batch->rec[i]); + } + free(batch->rec); free_db(batch->db); free(batch); } -squig_batch_t *squig_sim_batch(squig_t *sq, int num_reads){ - //todo check if the n<=c - squig_batch_t *batch = (squig_batch_t *)malloc(sizeof(squig_batch_t *)); - batch->db = init_db(sq->core, num_reads); +squig_batch_t *squig_sim_batch(squig_t *sq){ + + squig_batch_t *batch = (squig_batch_t *)malloc(sizeof(squig_batch_t)); + MALLOC_CHK(batch); + batch->n_rec = sq->opt.batch_size; + batch->db = init_db(sq->core, batch->n_rec); + batch->rec = (squig_rec_t **)malloc(batch->n_rec * sizeof(squig_rec_t *)); + MALLOC_CHK(batch->rec); + for(int i = 0; i < batch->n_rec; i++){ + batch->rec[i] = (squig_rec_t *)malloc(sizeof(squig_rec_t)); + MALLOC_CHK(batch->rec[i]); + } process_db(sq->core, batch->db); output_db(sq->core, batch->db); return batch; } -slow5_rec_t *squig_decode(squig_t *sq, squig_batch_t *batch, int i){ +//this function is hacky, inefficient and repeats unnecessary work. Can be fixed later if performance becomes an issue, +// by integrating into core function. Not wanting to touch those at the moment +squig_rec_t *squig_get_rec(squig_t *sq, squig_batch_t *batch, int i){ + + if(i < 0 || i >= batch->n_rec){ + fprintf(stderr, "Index out of bounds. batch size: %d, requested index: %d\n", batch->n_rec, i); + return NULL; + } + slow5_rec_t *rec = NULL; size_t bytes = batch->db->mem_bytes[i]; @@ -92,7 +144,28 @@ slow5_rec_t *squig_decode(squig_t *sq, squig_batch_t *batch, int i){ exit(EXIT_FAILURE); } + //from primary SLOW5 fields + batch->rec[i]->read_id = strdup(rec->read_id); + batch->rec[i]->read_group = rec->read_group; + batch->rec[i]->digitisation = rec->digitisation; + batch->rec[i]->offset = rec->offset; + batch->rec[i]->range = rec->range; + batch->rec[i]->sampling_rate = rec->sampling_rate; + batch->rec[i]->len_raw_signal = rec->len_raw_signal; + batch->rec[i]->raw_signal = rec->raw_signal; rec->raw_signal = NULL; + + //from auxilliary SLOW5 fields + int ret=0; + uint64_t len=0; + batch->rec[i]->channel_number = strdup(slow5_aux_get_string(rec,"channel_number", &len, &ret)); NEG_CHK(ret); + batch->rec[i]->median_before = slow5_aux_get_double(rec,"median_before", &ret); NEG_CHK(ret); + batch->rec[i]->read_number = slow5_aux_get_int32(rec,"read_number", &ret); NEG_CHK(ret); + batch->rec[i]->start_mux = slow5_aux_get_uint8(rec,"start_mux", &ret); NEG_CHK(ret); + batch->rec[i]->start_time = slow5_aux_get_uint64(rec,"start_time", &ret); NEG_CHK(ret); + batch->rec[i]->end_reason = slow5_aux_get_enum(rec,"end_reason", &ret); NEG_CHK(ret); + free(mem); + slow5_rec_free(rec); - return rec; + return batch->rec[i]; }