Skip to content

Commit e7e3cba

Browse files
src/analysis/hmr.cpp: checking the output file can open at the start of the run
1 parent 2c48553 commit e7e3cba

File tree

1 file changed

+90
-85
lines changed

1 file changed

+90
-85
lines changed

src/analysis/hmr.cpp

Lines changed: 90 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,16 @@
4141
#include <utility>
4242
#include <vector>
4343

44-
// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer)
44+
// NOLINTBEGIN(*-narrowing-conversions)
4545

4646
struct hmr_summary {
47-
explicit hmr_summary(const std::vector<GenomicRegion> &hmrs) {
48-
hmr_count = hmrs.size();
49-
hmr_total_size =
47+
explicit hmr_summary(const std::vector<GenomicRegion> &hmrs) :
48+
hmr_count{std::size(hmrs)},
49+
hmr_total_size{
5050
std::accumulate(std::cbegin(hmrs), std::cend(hmrs), 0ul,
5151
[](const std::uint64_t t, const GenomicRegion &p) {
5252
return t + p.get_width();
53-
});
53+
})} {
5454
hmr_mean_size = static_cast<double>(hmr_total_size) /
5555
std::max(hmr_count, static_cast<std::uint64_t>(1));
5656
}
@@ -91,27 +91,28 @@ get_stepup_cutoff(std::vector<double> scores, const double cutoff) {
9191
else if (cutoff > 1)
9292
return std::numeric_limits<double>::min();
9393

94-
const std::size_t n = scores.size();
94+
const std::size_t n = std::size(scores);
9595
std::sort(begin(scores), std::end(scores));
9696
std::size_t i = 1;
9797
while (i < n && scores[i - 1] < (cutoff * i) / n)
9898
++i;
9999
return scores[i - 1];
100100
}
101101

102-
static void
102+
static auto
103103
get_domain_scores(const std::vector<bool> &state_ids,
104104
const std::vector<std::pair<double, double>> &meth,
105-
const std::vector<std::size_t> &reset_points,
106-
std::vector<double> &scores) {
105+
const std::vector<std::size_t> &reset_points)
106+
-> std::vector<double> {
107107
std::size_t reset_idx = 1;
108108
bool in_domain = false;
109109
double score = 0;
110-
for (std::size_t i = 0; i < state_ids.size(); ++i) {
110+
std::vector<double> domain_scores;
111+
for (std::size_t i = 0; i < std::size(state_ids); ++i) {
111112
if (reset_points[reset_idx] == i) {
112113
if (in_domain) {
113114
in_domain = false;
114-
scores.push_back(score);
115+
domain_scores.push_back(score);
115116
score = 0;
116117
}
117118
++reset_idx;
@@ -122,13 +123,13 @@ get_domain_scores(const std::vector<bool> &state_ids,
122123
}
123124
else if (in_domain) {
124125
in_domain = false;
125-
scores.push_back(score);
126+
domain_scores.push_back(score);
126127
score = 0;
127128
}
128129
}
129-
130130
if (in_domain)
131-
scores.push_back(score);
131+
domain_scores.push_back(score);
132+
return domain_scores;
132133
}
133134

134135
static void
@@ -138,7 +139,7 @@ build_domains(const std::vector<MSite> &cpgs,
138139
std::vector<GenomicRegion> &domains) {
139140
std::size_t n_cpgs = 0, reset_idx = 1, prev_end = 0;
140141
bool in_domain = false;
141-
for (std::size_t i = 0; i < state_ids.size(); ++i) {
142+
for (std::size_t i = 0; i < std::size(state_ids); ++i) {
142143
if (reset_points[reset_idx] == i) {
143144
if (in_domain) {
144145
in_domain = false;
@@ -152,7 +153,7 @@ build_domains(const std::vector<MSite> &cpgs,
152153
if (!in_domain) {
153154
in_domain = true;
154155
domains.push_back(as_gen_rgn(cpgs[i]));
155-
domains.back().set_name("HYPO" + std::to_string(domains.size()));
156+
domains.back().set_name("HYPO" + std::to_string(std::size(domains)));
156157
}
157158
++n_cpgs;
158159
}
@@ -177,10 +178,10 @@ separate_regions(const bool verbose, const std::size_t desert_size,
177178
std::vector<U> &reads,
178179
std::vector<std::size_t> &reset_points) {
179180
if (verbose)
180-
std::cerr << "[separating by cpg desert]" << '\n';
181+
std::cerr << "[separating by cpg desert]\n";
181182
// eliminate the zero-read cpgs
182183
std::size_t j = 0;
183-
for (std::size_t i = 0; i < cpgs.size(); ++i)
184+
for (std::size_t i = 0; i < std::size(cpgs); ++i)
184185
if (reads[i] > 0) {
185186
cpgs[j] = cpgs[i];
186187
meth[j] = meth[i];
@@ -195,7 +196,7 @@ separate_regions(const bool verbose, const std::size_t desert_size,
195196
double bases_in_deserts = 0;
196197
// segregate cpgs
197198
std::size_t prev_pos = 0;
198-
for (std::size_t i = 0; i < cpgs.size(); ++i) {
199+
for (std::size_t i = 0; i < std::size(cpgs); ++i) {
199200
const std::size_t dist = (i > 0 && cpgs[i].chrom == cpgs[i - 1].chrom)
200201
? cpgs[i].pos - prev_pos
201202
: std::numeric_limits<std::size_t>::max();
@@ -209,13 +210,13 @@ separate_regions(const bool verbose, const std::size_t desert_size,
209210

210211
prev_pos = cpgs[i].pos;
211212
}
212-
reset_points.push_back(cpgs.size());
213+
reset_points.push_back(std::size(cpgs));
213214

214215
if (verbose)
215-
std::cerr << "[cpgs retained: " << cpgs.size() << "]" << '\n'
216-
<< "[deserts removed: " << reset_points.size() - 2 << "]" << '\n'
216+
std::cerr << "[cpgs retained: " << std::size(cpgs) << "]" << '\n'
217+
<< "[deserts removed: " << std::size(reset_points) - 2 << "]\n"
217218
<< "[genome fraction covered: "
218-
<< 1.0 - (bases_in_deserts / total_bases) << "]" << '\n';
219+
<< 1.0 - (bases_in_deserts / total_bases) << "]\n";
219220
}
220221

221222
/* function to "fold" the methylation profile so that the middle
@@ -225,43 +226,45 @@ separate_regions(const bool verbose, const std::size_t desert_size,
225226
static void
226227
make_partial_meth(const std::vector<std::uint32_t> &reads,
227228
std::vector<std::pair<double, double>> &meth) {
228-
for (std::size_t i = 0; i < reads.size(); ++i) {
229+
static constexpr auto half = 0.5;
230+
for (std::size_t i = 0; i < std::size(reads); ++i) {
229231
double m = meth[i].first / reads[i];
230-
m = (m <= 0.5) ? (1.0 - 2 * m) : (1.0 - 2 * (1.0 - m));
232+
m = (m <= half) ? (1.0 - 2 * m) : (1.0 - 2 * (1.0 - m));
231233
meth[i].first = reads[i] * m;
232234
meth[i].second = (reads[i] - meth[i].first);
233235
}
234236
}
235237

236-
static void
238+
[[nodiscard]] static auto
237239
shuffle_cpgs(const std::size_t rng_seed, const TwoStateHMM &hmm,
238240
std::vector<std::pair<double, double>> meth,
239241
const std::vector<std::size_t> &reset_points, const double p_fb,
240242
const double p_bf, const double fg_alpha, const double fg_beta,
241-
const double bg_alpha, const double bg_beta,
242-
std::vector<double> &domain_scores) {
243+
const double bg_alpha,
244+
const double bg_beta) -> std::vector<double> {
243245
auto eng = std::default_random_engine(rng_seed);
244246
std::shuffle(std::begin(meth), std::end(meth), eng);
245-
246247
std::vector<bool> state_ids;
247248
std::vector<double> scores;
248249
hmm.PosteriorDecoding(meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta,
249250
bg_alpha, bg_beta, state_ids, scores);
250-
get_domain_scores(state_ids, meth, reset_points, domain_scores);
251+
auto domain_scores = get_domain_scores(state_ids, meth, reset_points);
251252
std::sort(std::begin(domain_scores), std::end(domain_scores));
253+
return domain_scores;
252254
}
253255

254-
static void
255-
assign_p_values(const std::vector<double> &random_scores,
256-
const std::vector<double> &observed_scores,
257-
std::vector<double> &p_values) {
258-
const double n_randoms = random_scores.empty() ? 1 : random_scores.size();
259-
for (std::size_t i = 0; i < observed_scores.size(); ++i)
260-
p_values.push_back(
261-
(std::end(random_scores) - upper_bound(std::begin(random_scores),
262-
std::end(random_scores),
263-
observed_scores[i])) /
264-
n_randoms);
256+
[[nodiscard]] static auto
257+
assign_p_values(const std::vector<double> &random,
258+
const std::vector<double> &observed) -> std::vector<double> {
259+
const double n_randoms = random.empty() ? 1 : std::size(random);
260+
const auto n_scores = std::size(observed);
261+
std::vector<double> p_values(n_scores);
262+
for (auto i = 0u; i < n_scores; ++i) {
263+
const auto ub =
264+
std::upper_bound(std::cbegin(random), std::cend(random), observed[i]);
265+
p_values[i] = std::distance(ub, std::end(random)) / n_randoms;
266+
}
267+
return p_values;
265268
}
266269

267270
static void
@@ -290,12 +293,13 @@ write_params_file(const std::string &outfile, const double fg_alpha,
290293
const double fg_beta, const double bg_alpha,
291294
const double bg_beta, const double p_fb, const double p_bf,
292295
const double domain_score_cutoff) {
296+
static constexpr auto precision_value = 30;
293297
std::ofstream of;
294298
if (!outfile.empty())
295299
of.open(outfile.c_str());
296300
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
297301

298-
out.precision(30);
302+
out.precision(precision_value);
299303
out << "FG_ALPHA\t" << fg_alpha << '\n'
300304
<< "FG_BETA\t" << fg_beta << '\n'
301305
<< "BG_ALPHA\t" << bg_alpha << '\n'
@@ -382,24 +386,28 @@ check_sorted_within_chroms(T first, const T last) {
382386
int
383387
main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
384388
try {
385-
constexpr double min_coverage = 1.0;
389+
static constexpr auto min_coverage = 1.0;
390+
static constexpr auto tolerance = 1e-10; // corrections for small values
391+
static constexpr auto p_value_cutoff = 0.01;
392+
static constexpr auto alpha_frac = 0.33;
386393

387394
std::string outfile;
388395
std::string hypo_post_outfile;
389396
std::string meth_post_outfile;
390397

391-
std::size_t desert_size = 1000;
392-
std::size_t max_iterations = 10;
393-
std::size_t rng_seed = 408;
398+
// NOLINTBEGIN(*-avoid-magic-numbers)
399+
std::size_t desert_size{1000};
400+
std::size_t max_iterations{10};
401+
std::size_t rng_seed{408};
402+
double p_fb{0.25};
403+
double p_bf{0.25};
404+
// NOLINTEND(*-avoid-magic-numbers)
394405

395406
// run mode flags
396407
bool verbose = false;
397408
bool PARTIAL_METH = false;
398409
bool allow_extra_fields = false;
399410

400-
// corrections for small values
401-
const double tolerance = 1e-10;
402-
403411
std::string summary_file;
404412

405413
std::string params_in_file;
@@ -416,8 +424,7 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
416424
/****************** COMMAND LINE OPTIONS ********************/
417425
OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic)
418426
description, "<methylation-file>");
419-
opt_parse.add_opt("out", 'o', "output file (default: stdout)", false,
420-
outfile);
427+
opt_parse.add_opt("out", 'o', "output file", true, outfile);
421428
opt_parse.add_opt("desert", 'd', "max dist btwn covered cpgs in HMR", false,
422429
desert_size);
423430
opt_parse.add_opt("itr", 'i', "max iterations", false, max_iterations);
@@ -474,6 +481,10 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
474481
if (!is_msite_file(cpgs_file))
475482
throw std::runtime_error("malformed counts file: " + cpgs_file);
476483

484+
std::ofstream out(outfile);
485+
if (!out)
486+
throw std::runtime_error("failed to open output file: " + outfile);
487+
477488
// separate the regions by chrom and by desert
478489
std::vector<MSite> cpgs;
479490
std::vector<std::pair<double, double>> meth;
@@ -491,8 +502,8 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
491502
const auto mean_coverage = get_mean(std::cbegin(reads), std::cend(reads));
492503

493504
if (verbose)
494-
std::cerr << "[total_cpgs=" << size(cpgs) << "]" << '\n'
495-
<< "[mean_coverage=" << mean_coverage << "]" << '\n';
505+
std::cerr << "[total_cpgs=" << std::size(cpgs) << "]\n"
506+
<< "[mean_coverage=" << mean_coverage << "]\n";
496507

497508
// check for sufficient data
498509
if (mean_coverage < static_cast<double>(min_coverage)) {
@@ -516,11 +527,8 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
516527

517528
const TwoStateHMM hmm(tolerance, max_iterations, verbose);
518529

519-
double p_fb = 0.25;
520-
double p_bf = 0.25;
521-
522-
double fg_alpha = 0, fg_beta = 0;
523-
double bg_alpha = 0, bg_beta = 0;
530+
double fg_alpha{}, fg_beta{};
531+
double bg_alpha{}, bg_beta{};
524532
double domain_score_cutoff = std::numeric_limits<double>::max();
525533

526534
if (!params_in_file.empty()) { // read parameters file
@@ -530,10 +538,10 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
530538
}
531539
else {
532540
const double n_reads = get_mean(std::cbegin(reads), std::cend(reads));
533-
fg_alpha = 0.33 * n_reads;
534-
fg_beta = 0.67 * n_reads;
535-
bg_alpha = 0.67 * n_reads;
536-
bg_beta = 0.33 * n_reads;
541+
fg_alpha = alpha_frac * n_reads;
542+
fg_beta = (1.0 - alpha_frac) * n_reads;
543+
bg_alpha = (1.0 - alpha_frac) * n_reads;
544+
bg_beta = alpha_frac * n_reads;
537545
}
538546

539547
if (max_iterations > 0)
@@ -545,19 +553,15 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
545553
hmm.PosteriorDecoding(meth, reset_points, p_fb, p_bf, fg_alpha, fg_beta,
546554
bg_alpha, bg_beta, state_ids, posteriors);
547555

548-
std::vector<double> domain_scores;
549-
get_domain_scores(state_ids, meth, reset_points, domain_scores);
550-
551-
std::vector<double> random_scores;
552-
shuffle_cpgs(rng_seed, hmm, meth, reset_points, p_fb, p_bf, fg_alpha,
553-
fg_beta, bg_alpha, bg_beta, random_scores);
554-
555-
std::vector<double> p_values;
556-
assign_p_values(random_scores, domain_scores, p_values);
556+
const auto domain_scores = get_domain_scores(state_ids, meth, reset_points);
557+
const auto random_scores =
558+
shuffle_cpgs(rng_seed, hmm, meth, reset_points, p_fb, p_bf, fg_alpha,
559+
fg_beta, bg_alpha, bg_beta);
560+
const auto p_values = assign_p_values(random_scores, domain_scores);
557561

558562
if (domain_score_cutoff == std::numeric_limits<double>::max() &&
559563
!domain_scores.empty())
560-
domain_score_cutoff = get_stepup_cutoff(p_values, 0.01);
564+
domain_score_cutoff = get_stepup_cutoff(p_values, p_value_cutoff);
561565

562566
// write parameters if requested
563567
if (!params_out_file.empty())
@@ -569,13 +573,8 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
569573
build_domains(cpgs, reset_points, state_ids, domains);
570574

571575
{
572-
std::ofstream of;
573-
if (!outfile.empty())
574-
of.open(outfile);
575-
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
576-
577576
std::size_t good_hmr_count = 0;
578-
for (auto i = 0u; i < size(domains); ++i)
577+
for (auto i = 0u; i < std::size(domains); ++i)
579578
if (p_values[i] < domain_score_cutoff) {
580579
domains[good_hmr_count] = domains[i];
581580
domains[good_hmr_count].set_name("HYPO" +
@@ -591,24 +590,30 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
591590
if (!hypo_post_outfile.empty()) {
592591
if (verbose)
593592
std::cerr << "[writing=" << hypo_post_outfile << "]\n";
594-
std::ofstream out(hypo_post_outfile);
595-
for (std::size_t i = 0; i < cpgs.size(); ++i) {
593+
std::ofstream out_hypo_post(hypo_post_outfile);
594+
if (!out_hypo_post)
595+
throw std::runtime_error("failed to open output hypo post file: " +
596+
hypo_post_outfile);
597+
for (std::size_t i = 0; i < std::size(cpgs); ++i) {
596598
GenomicRegion cpg(as_gen_rgn(cpgs[i]));
597599
cpg.set_name(format_cpg_meth_tag(meth[i]));
598600
cpg.set_score(posteriors[i]);
599-
out << cpg << '\n';
601+
out_hypo_post << cpg << '\n';
600602
}
601603
}
602604

603605
if (!meth_post_outfile.empty()) {
604-
std::ofstream out(meth_post_outfile);
606+
std::ofstream out_meth_post(meth_post_outfile);
607+
if (!out_meth_post)
608+
throw std::runtime_error("failed to open meth post output file: " +
609+
meth_post_outfile);
605610
if (verbose)
606611
std::cerr << "[writing=" << meth_post_outfile << "]\n";
607-
for (std::size_t i = 0; i < cpgs.size(); ++i) {
612+
for (std::size_t i = 0; i < std::size(cpgs); ++i) {
608613
GenomicRegion cpg(as_gen_rgn(cpgs[i]));
609614
cpg.set_name(format_cpg_meth_tag(meth[i]));
610615
cpg.set_score(1.0 - posteriors[i]);
611-
out << cpg << '\n';
616+
out_meth_post << cpg << '\n';
612617
}
613618
}
614619
if (!summary_file.empty()) {
@@ -625,4 +630,4 @@ main_hmr(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
625630
return EXIT_SUCCESS;
626631
}
627632

628-
// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer)
633+
// NOLINTEND(*-narrowing-conversions)

0 commit comments

Comments
 (0)