4141#include < utility>
4242#include < vector>
4343
44- // NOLINTBEGIN(*-avoid-magic-numbers,*- narrowing-conversions,*-prefer-member-initializer )
44+ // NOLINTBEGIN(*-narrowing-conversions)
4545
4646struct hmr_summary {
47- explicit hmr_summary (const std::vector<GenomicRegion> &hmrs) {
48- hmr_count = hmrs. size ();
49- hmr_total_size =
47+ explicit hmr_summary (const std::vector<GenomicRegion> &hmrs) :
48+ hmr_count{ std:: size (hmrs)},
49+ hmr_total_size{
5050 std::accumulate (std::cbegin (hmrs), std::cend (hmrs), 0ul ,
5151 [](const std::uint64_t t, const GenomicRegion &p) {
5252 return t + p.get_width ();
53- });
53+ })} {
5454 hmr_mean_size = static_cast <double >(hmr_total_size) /
5555 std::max (hmr_count, static_cast <std::uint64_t >(1 ));
5656 }
@@ -91,27 +91,28 @@ get_stepup_cutoff(std::vector<double> scores, const double cutoff) {
9191 else if (cutoff > 1 )
9292 return std::numeric_limits<double >::min ();
9393
94- const std::size_t n = scores. size ();
94+ const std::size_t n = std:: size (scores );
9595 std::sort (begin (scores), std::end (scores));
9696 std::size_t i = 1 ;
9797 while (i < n && scores[i - 1 ] < (cutoff * i) / n)
9898 ++i;
9999 return scores[i - 1 ];
100100}
101101
102- static void
102+ static auto
103103get_domain_scores (const std::vector<bool > &state_ids,
104104 const std::vector<std::pair<double , double >> &meth,
105- const std::vector<std::size_t > &reset_points,
106- std::vector<double > &scores) {
105+ const std::vector<std::size_t > &reset_points)
106+ -> std::vector<double> {
107107 std::size_t reset_idx = 1 ;
108108 bool in_domain = false ;
109109 double score = 0 ;
110- for (std::size_t i = 0 ; i < state_ids.size (); ++i) {
110+ std::vector<double > domain_scores;
111+ for (std::size_t i = 0 ; i < std::size (state_ids); ++i) {
111112 if (reset_points[reset_idx] == i) {
112113 if (in_domain) {
113114 in_domain = false ;
114- scores .push_back (score);
115+ domain_scores .push_back (score);
115116 score = 0 ;
116117 }
117118 ++reset_idx;
@@ -122,13 +123,13 @@ get_domain_scores(const std::vector<bool> &state_ids,
122123 }
123124 else if (in_domain) {
124125 in_domain = false ;
125- scores .push_back (score);
126+ domain_scores .push_back (score);
126127 score = 0 ;
127128 }
128129 }
129-
130130 if (in_domain)
131- scores.push_back (score);
131+ domain_scores.push_back (score);
132+ return domain_scores;
132133}
133134
134135static void
@@ -138,7 +139,7 @@ build_domains(const std::vector<MSite> &cpgs,
138139 std::vector<GenomicRegion> &domains) {
139140 std::size_t n_cpgs = 0 , reset_idx = 1 , prev_end = 0 ;
140141 bool in_domain = false ;
141- for (std::size_t i = 0 ; i < state_ids. size (); ++i) {
142+ for (std::size_t i = 0 ; i < std:: size (state_ids ); ++i) {
142143 if (reset_points[reset_idx] == i) {
143144 if (in_domain) {
144145 in_domain = false ;
@@ -152,7 +153,7 @@ build_domains(const std::vector<MSite> &cpgs,
152153 if (!in_domain) {
153154 in_domain = true ;
154155 domains.push_back (as_gen_rgn (cpgs[i]));
155- domains.back ().set_name (" HYPO" + std::to_string (domains. size ()));
156+ domains.back ().set_name (" HYPO" + std::to_string (std:: size (domains )));
156157 }
157158 ++n_cpgs;
158159 }
@@ -177,10 +178,10 @@ separate_regions(const bool verbose, const std::size_t desert_size,
177178 std::vector<U> &reads,
178179 std::vector<std::size_t > &reset_points) {
179180 if (verbose)
180- std::cerr << " [separating by cpg desert]" << ' \n ' ;
181+ std::cerr << " [separating by cpg desert]\n " ;
181182 // eliminate the zero-read cpgs
182183 std::size_t j = 0 ;
183- for (std::size_t i = 0 ; i < cpgs. size (); ++i)
184+ for (std::size_t i = 0 ; i < std:: size (cpgs ); ++i)
184185 if (reads[i] > 0 ) {
185186 cpgs[j] = cpgs[i];
186187 meth[j] = meth[i];
@@ -195,7 +196,7 @@ separate_regions(const bool verbose, const std::size_t desert_size,
195196 double bases_in_deserts = 0 ;
196197 // segregate cpgs
197198 std::size_t prev_pos = 0 ;
198- for (std::size_t i = 0 ; i < cpgs. size (); ++i) {
199+ for (std::size_t i = 0 ; i < std:: size (cpgs ); ++i) {
199200 const std::size_t dist = (i > 0 && cpgs[i].chrom == cpgs[i - 1 ].chrom )
200201 ? cpgs[i].pos - prev_pos
201202 : std::numeric_limits<std::size_t >::max ();
@@ -209,13 +210,13 @@ separate_regions(const bool verbose, const std::size_t desert_size,
209210
210211 prev_pos = cpgs[i].pos ;
211212 }
212- reset_points.push_back (cpgs. size ());
213+ reset_points.push_back (std:: size (cpgs ));
213214
214215 if (verbose)
215- std::cerr << " [cpgs retained: " << cpgs. size () << " ]" << ' \n '
216- << " [deserts removed: " << reset_points. size () - 2 << " ]" << ' \n '
216+ std::cerr << " [cpgs retained: " << std:: size (cpgs ) << " ]" << ' \n '
217+ << " [deserts removed: " << std:: size (reset_points ) - 2 << " ]\n "
217218 << " [genome fraction covered: "
218- << 1.0 - (bases_in_deserts / total_bases) << " ]" << ' \n ' ;
219+ << 1.0 - (bases_in_deserts / total_bases) << " ]\n " ;
219220}
220221
221222/* function to "fold" the methylation profile so that the middle
@@ -225,43 +226,45 @@ separate_regions(const bool verbose, const std::size_t desert_size,
225226static void
226227make_partial_meth (const std::vector<std::uint32_t > &reads,
227228 std::vector<std::pair<double , double >> &meth) {
228- for (std::size_t i = 0 ; i < reads.size (); ++i) {
229+ static constexpr auto half = 0.5 ;
230+ for (std::size_t i = 0 ; i < std::size (reads); ++i) {
229231 double m = meth[i].first / reads[i];
230- m = (m <= 0.5 ) ? (1.0 - 2 * m) : (1.0 - 2 * (1.0 - m));
232+ m = (m <= half ) ? (1.0 - 2 * m) : (1.0 - 2 * (1.0 - m));
231233 meth[i].first = reads[i] * m;
232234 meth[i].second = (reads[i] - meth[i].first );
233235 }
234236}
235237
236- static void
238+ [[nodiscard]] static auto
237239shuffle_cpgs (const std::size_t rng_seed, const TwoStateHMM &hmm,
238240 std::vector<std::pair<double , double >> meth,
239241 const std::vector<std::size_t > &reset_points, const double p_fb,
240242 const double p_bf, const double fg_alpha, const double fg_beta,
241- const double bg_alpha, const double bg_beta,
242- std::vector<double > &domain_scores) {
243+ const double bg_alpha,
244+ const double bg_beta) -> std::vector<double> {
243245 auto eng = std::default_random_engine (rng_seed);
244246 std::shuffle (std::begin (meth), std::end (meth), eng);
245-
246247 std::vector<bool > state_ids;
247248 std::vector<double > scores;
248249 hmm.PosteriorDecoding (meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta,
249250 bg_alpha, bg_beta, state_ids, scores);
250- get_domain_scores (state_ids, meth, reset_points, domain_scores );
251+ auto domain_scores = get_domain_scores (state_ids, meth, reset_points);
251252 std::sort (std::begin (domain_scores), std::end (domain_scores));
253+ return domain_scores;
252254}
253255
254- static void
255- assign_p_values (const std::vector<double > &random_scores,
256- const std::vector<double > &observed_scores,
257- std::vector<double > &p_values) {
258- const double n_randoms = random_scores.empty () ? 1 : random_scores.size ();
259- for (std::size_t i = 0 ; i < observed_scores.size (); ++i)
260- p_values.push_back (
261- (std::end (random_scores) - upper_bound (std::begin (random_scores),
262- std::end (random_scores),
263- observed_scores[i])) /
264- n_randoms);
256+ [[nodiscard]] static auto
257+ assign_p_values (const std::vector<double > &random,
258+ const std::vector<double > &observed) -> std::vector<double> {
259+ const double n_randoms = random.empty () ? 1 : std::size (random);
260+ const auto n_scores = std::size (observed);
261+ std::vector<double > p_values (n_scores);
262+ for (auto i = 0u ; i < n_scores; ++i) {
263+ const auto ub =
264+ std::upper_bound (std::cbegin (random), std::cend (random), observed[i]);
265+ p_values[i] = std::distance (ub, std::end (random)) / n_randoms;
266+ }
267+ return p_values;
265268}
266269
267270static void
@@ -290,12 +293,13 @@ write_params_file(const std::string &outfile, const double fg_alpha,
290293 const double fg_beta, const double bg_alpha,
291294 const double bg_beta, const double p_fb, const double p_bf,
292295 const double domain_score_cutoff) {
296+ static constexpr auto precision_value = 30 ;
293297 std::ofstream of;
294298 if (!outfile.empty ())
295299 of.open (outfile.c_str ());
296300 std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
297301
298- out.precision (30 );
302+ out.precision (precision_value );
299303 out << " FG_ALPHA\t " << fg_alpha << ' \n '
300304 << " FG_BETA\t " << fg_beta << ' \n '
301305 << " BG_ALPHA\t " << bg_alpha << ' \n '
@@ -382,24 +386,28 @@ check_sorted_within_chroms(T first, const T last) {
382386int
383387main_hmr (int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
384388 try {
385- constexpr double min_coverage = 1.0 ;
389+ static constexpr auto min_coverage = 1.0 ;
390+ static constexpr auto tolerance = 1e-10 ; // corrections for small values
391+ static constexpr auto p_value_cutoff = 0.01 ;
392+ static constexpr auto alpha_frac = 0.33 ;
386393
387394 std::string outfile;
388395 std::string hypo_post_outfile;
389396 std::string meth_post_outfile;
390397
391- std::size_t desert_size = 1000 ;
392- std::size_t max_iterations = 10 ;
393- std::size_t rng_seed = 408 ;
398+ // NOLINTBEGIN(*-avoid-magic-numbers)
399+ std::size_t desert_size{1000 };
400+ std::size_t max_iterations{10 };
401+ std::size_t rng_seed{408 };
402+ double p_fb{0.25 };
403+ double p_bf{0.25 };
404+ // NOLINTEND(*-avoid-magic-numbers)
394405
395406 // run mode flags
396407 bool verbose = false ;
397408 bool PARTIAL_METH = false ;
398409 bool allow_extra_fields = false ;
399410
400- // corrections for small values
401- const double tolerance = 1e-10 ;
402-
403411 std::string summary_file;
404412
405413 std::string params_in_file;
@@ -416,8 +424,7 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
416424 /* ***************** COMMAND LINE OPTIONS ********************/
417425 OptionParser opt_parse (argv[0 ], // NOLINT(*-pointer-arithmetic)
418426 description, " <methylation-file>" );
419- opt_parse.add_opt (" out" , ' o' , " output file (default: stdout)" , false ,
420- outfile);
427+ opt_parse.add_opt (" out" , ' o' , " output file" , true , outfile);
421428 opt_parse.add_opt (" desert" , ' d' , " max dist btwn covered cpgs in HMR" , false ,
422429 desert_size);
423430 opt_parse.add_opt (" itr" , ' i' , " max iterations" , false , max_iterations);
@@ -474,6 +481,10 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
474481 if (!is_msite_file (cpgs_file))
475482 throw std::runtime_error (" malformed counts file: " + cpgs_file);
476483
484+ std::ofstream out (outfile);
485+ if (!out)
486+ throw std::runtime_error (" failed to open output file: " + outfile);
487+
477488 // separate the regions by chrom and by desert
478489 std::vector<MSite> cpgs;
479490 std::vector<std::pair<double , double >> meth;
@@ -491,8 +502,8 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
491502 const auto mean_coverage = get_mean (std::cbegin (reads), std::cend (reads));
492503
493504 if (verbose)
494- std::cerr << " [total_cpgs=" << size (cpgs) << " ]" << ' \n '
495- << " [mean_coverage=" << mean_coverage << " ]" << ' \n ' ;
505+ std::cerr << " [total_cpgs=" << std:: size (cpgs) << " ]\n "
506+ << " [mean_coverage=" << mean_coverage << " ]\n " ;
496507
497508 // check for sufficient data
498509 if (mean_coverage < static_cast <double >(min_coverage)) {
@@ -516,11 +527,8 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
516527
517528 const TwoStateHMM hmm (tolerance, max_iterations, verbose);
518529
519- double p_fb = 0.25 ;
520- double p_bf = 0.25 ;
521-
522- double fg_alpha = 0 , fg_beta = 0 ;
523- double bg_alpha = 0 , bg_beta = 0 ;
530+ double fg_alpha{}, fg_beta{};
531+ double bg_alpha{}, bg_beta{};
524532 double domain_score_cutoff = std::numeric_limits<double >::max ();
525533
526534 if (!params_in_file.empty ()) { // read parameters file
@@ -530,10 +538,10 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
530538 }
531539 else {
532540 const double n_reads = get_mean (std::cbegin (reads), std::cend (reads));
533- fg_alpha = 0.33 * n_reads;
534- fg_beta = 0.67 * n_reads;
535- bg_alpha = 0.67 * n_reads;
536- bg_beta = 0.33 * n_reads;
541+ fg_alpha = alpha_frac * n_reads;
542+ fg_beta = ( 1.0 - alpha_frac) * n_reads;
543+ bg_alpha = ( 1.0 - alpha_frac) * n_reads;
544+ bg_beta = alpha_frac * n_reads;
537545 }
538546
539547 if (max_iterations > 0 )
@@ -545,19 +553,15 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
545553 hmm.PosteriorDecoding (meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta,
546554 bg_alpha, bg_beta, state_ids, posteriors);
547555
548- std::vector<double > domain_scores;
549- get_domain_scores (state_ids, meth, reset_points, domain_scores);
550-
551- std::vector<double > random_scores;
552- shuffle_cpgs (rng_seed, hmm, meth, reset_points, p_fb, p_bf, fg_alpha,
553- fg_beta, bg_alpha, bg_beta, random_scores);
554-
555- std::vector<double > p_values;
556- assign_p_values (random_scores, domain_scores, p_values);
556+ const auto domain_scores = get_domain_scores (state_ids, meth, reset_points);
557+ const auto random_scores =
558+ shuffle_cpgs (rng_seed, hmm, meth, reset_points, p_fb, p_bf, fg_alpha,
559+ fg_beta, bg_alpha, bg_beta);
560+ const auto p_values = assign_p_values (random_scores, domain_scores);
557561
558562 if (domain_score_cutoff == std::numeric_limits<double >::max () &&
559563 !domain_scores.empty ())
560- domain_score_cutoff = get_stepup_cutoff (p_values, 0.01 );
564+ domain_score_cutoff = get_stepup_cutoff (p_values, p_value_cutoff );
561565
562566 // write parameters if requested
563567 if (!params_out_file.empty ())
@@ -569,13 +573,8 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
569573 build_domains (cpgs, reset_points, state_ids, domains);
570574
571575 {
572- std::ofstream of;
573- if (!outfile.empty ())
574- of.open (outfile);
575- std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
576-
577576 std::size_t good_hmr_count = 0 ;
578- for (auto i = 0u ; i < size (domains); ++i)
577+ for (auto i = 0u ; i < std:: size (domains); ++i)
579578 if (p_values[i] < domain_score_cutoff) {
580579 domains[good_hmr_count] = domains[i];
581580 domains[good_hmr_count].set_name (" HYPO" +
@@ -591,24 +590,30 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
591590 if (!hypo_post_outfile.empty ()) {
592591 if (verbose)
593592 std::cerr << " [writing=" << hypo_post_outfile << " ]\n " ;
594- std::ofstream out (hypo_post_outfile);
595- for (std::size_t i = 0 ; i < cpgs.size (); ++i) {
593+ std::ofstream out_hypo_post (hypo_post_outfile);
594+ if (!out_hypo_post)
595+ throw std::runtime_error (" failed to open output hypo post file: " +
596+ hypo_post_outfile);
597+ for (std::size_t i = 0 ; i < std::size (cpgs); ++i) {
596598 GenomicRegion cpg (as_gen_rgn (cpgs[i]));
597599 cpg.set_name (format_cpg_meth_tag (meth[i]));
598600 cpg.set_score (posteriors[i]);
599- out << cpg << ' \n ' ;
601+ out_hypo_post << cpg << ' \n ' ;
600602 }
601603 }
602604
603605 if (!meth_post_outfile.empty ()) {
604- std::ofstream out (meth_post_outfile);
606+ std::ofstream out_meth_post (meth_post_outfile);
607+ if (!out_meth_post)
608+ throw std::runtime_error (" failed to open meth post output file: " +
609+ meth_post_outfile);
605610 if (verbose)
606611 std::cerr << " [writing=" << meth_post_outfile << " ]\n " ;
607- for (std::size_t i = 0 ; i < cpgs. size (); ++i) {
612+ for (std::size_t i = 0 ; i < std:: size (cpgs ); ++i) {
608613 GenomicRegion cpg (as_gen_rgn (cpgs[i]));
609614 cpg.set_name (format_cpg_meth_tag (meth[i]));
610615 cpg.set_score (1.0 - posteriors[i]);
611- out << cpg << ' \n ' ;
616+ out_meth_post << cpg << ' \n ' ;
612617 }
613618 }
614619 if (!summary_file.empty ()) {
@@ -625,4 +630,4 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
625630 return EXIT_SUCCESS;
626631}
627632
628- // NOLINTEND(*-avoid-magic-numbers,*- narrowing-conversions,*-prefer-member-initializer )
633+ // NOLINTEND(*-narrowing-conversions)
0 commit comments