28
28
29
29
#include " formats/format_utils.hpp"
30
30
31
- namespace {
32
-
33
- // Returns percentage of live documents
34
- inline double FillFactor (const irs::SegmentInfo& segment) noexcept {
35
- return static_cast <double >(segment.live_docs_count ) /
36
- static_cast <double >(segment.docs_count );
37
- }
38
-
39
- // Returns approximated size of a segment in the absence of removals
40
- inline size_t SizeWithoutRemovals (const irs::SegmentInfo& segment) noexcept {
41
- return size_t (static_cast <double >(segment.byte_size ) * FillFactor (segment));
42
- }
43
-
44
31
namespace tier {
45
32
46
- struct SegmentStats {
47
- // cppcheck-suppress noExplicitConstructor
48
- SegmentStats (const irs::SubReader& reader) noexcept
49
- : reader{&reader},
50
- meta{&reader.Meta ()},
51
- size{SizeWithoutRemovals (*meta)},
52
- fill_factor{FillFactor (*meta)} {}
53
-
54
- bool operator <(const SegmentStats& rhs) const noexcept {
55
- // cppcheck-suppress constVariable
56
- auto & lhs = *this ;
57
-
58
- if (lhs.size == rhs.size ) {
59
- if (lhs.fill_factor > rhs.fill_factor ) {
60
- return true ;
61
- } else if (lhs.fill_factor < rhs.fill_factor ) {
62
- return false ;
63
- }
33
+ // ConsolidationConfig static constants.
34
+ const size_t ConsolidationConfig::candidate_size { 4 }; // consolidation window size
35
+ const size_t ConsolidationConfig::tier1 { 1 << 22 }; // 4 MB
36
+ const double ConsolidationConfig::maxMergeScore { 1.5 }; // Skip consolidation if candidate score is greater
64
37
65
- return lhs.meta ->name < rhs.meta ->name ;
66
- }
38
+ ConsolidationCandidate::ConsolidationCandidate (
39
+ iterator_t start,
40
+ iterator_t end) noexcept
41
+ : segments(start, end) {
67
42
68
- return lhs.size < rhs.size ;
69
- }
43
+ initialized = true ;
70
44
71
- operator const irs::SubReader*() const noexcept { return reader; }
72
-
73
- const irs::SubReader* reader;
74
- const irs::SegmentInfo* meta;
75
- size_t size; // approximate size of segment without removals
76
- double_t fill_factor;
77
- };
78
-
79
- struct ConsolidationCandidate {
80
- using iterator_t = std::vector<SegmentStats>::const_iterator;
81
- using range_t = std::pair<iterator_t , iterator_t >;
82
-
83
- explicit ConsolidationCandidate (iterator_t i) noexcept : segments(i, i) {}
84
-
85
- iterator_t begin () const noexcept { return segments.first ; }
86
- iterator_t end () const noexcept { return segments.second ; }
87
-
88
- range_t segments;
89
- size_t count{0 };
90
- size_t size{0 }; // estimated size of the level
91
- double_t score{DBL_MIN}; // how good this permutation is
92
- };
93
-
94
- // / @returns score of the consolidation bucket
95
- double_t consolidation_score (const ConsolidationCandidate& consolidation,
96
- const size_t segments_per_tier,
97
- const size_t floor_segment_bytes) noexcept {
98
- // to detect how skewed the consolidation we do the following:
99
- // 1. evaluate coefficient of variation, less is better
100
- // 2. good candidates are in range [0;1]
101
- // 3. favor condidates where number of segments is equal to
102
- // 'segments_per_tier' approx
103
- // 4. prefer smaller consolidations
104
- // 5. prefer consolidations which clean removals
105
-
106
- switch (consolidation.count ) {
107
- case 0 :
108
- // empty consolidation makes not sense
109
- return DBL_MIN;
110
- case 1 : {
111
- auto & meta = *consolidation.segments .first ->meta ;
112
-
113
- if (meta.docs_count == meta.live_docs_count ) {
114
- // singletone without removals makes no sense
115
- return DBL_MIN;
116
- }
45
+ // Calculate initial cost
46
+ auto itr = start;
47
+ do
48
+ {
49
+ auto itrMeta = itr->meta ;
117
50
118
- // FIXME honor number of deletes???
119
- // signletone with removals makes sense if nothing better is found
120
- return DBL_MIN + DBL_EPSILON ;
121
- }
122
- }
51
+ mergeBytes += itrMeta-> byte_size ;
52
+ skew = static_cast < double >(itrMeta-> byte_size ) / mergeBytes;
53
+ delCount += (itrMeta-> docs_count - itrMeta-> live_docs_count ) ;
54
+ mergeScore = skew + ( 1.0 / ( 1 + delCount));
55
+ cost = mergeBytes * mergeScore;
123
56
124
- size_t size_before_consolidation = 0 ;
125
- size_t size_after_consolidation = 0 ;
126
- size_t size_after_consolidation_floored = 0 ;
127
- for (auto & segment_stat : consolidation) {
128
- size_before_consolidation += segment_stat.meta ->byte_size ;
129
- size_after_consolidation += segment_stat.size ;
130
- size_after_consolidation_floored +=
131
- std::max (segment_stat.size , floor_segment_bytes);
57
+ } while (itr++ != end);
132
58
}
133
59
134
- // evaluate coefficient of variation
135
- double sum_square_differences = 0 ;
136
- const auto segment_size_after_consolidaton_mean =
137
- static_cast <double >(size_after_consolidation_floored) /
138
- static_cast <double >(consolidation.count );
139
- for (auto & segment_stat : consolidation) {
140
- const auto diff =
141
- static_cast <double >(std::max (segment_stat.size , floor_segment_bytes)) -
142
- segment_size_after_consolidaton_mean;
143
- sum_square_differences += diff * diff;
144
- }
60
+ // Caller is responsible for ensuring that
61
+ // the segment iterators aren't past the
62
+ // last element before calling advance().
63
+ void ConsolidationCandidate::advance () noexcept {
64
+ if (!initialized)
65
+ return ;
145
66
146
- const auto stdev = std::sqrt (sum_square_differences /
147
- static_cast <double >(consolidation.count ));
148
- const auto cv = (stdev / segment_size_after_consolidaton_mean);
67
+ const auto & removeMeta = segments.first ->meta ;
68
+ const auto & addMeta = (segments.second + 1 )->meta ;
149
69
150
- // evaluate initial score
151
- auto score = 1 . - cv ;
70
+ std::advance (segments. first , 1 );
71
+ std::advance (segments. second , 1 ) ;
152
72
153
- // favor consolidations that contain approximately the requested number of
154
- // segments
155
- score *= std::pow (static_cast <double >(consolidation.count ) /
156
- static_cast <double >(segments_per_tier),
157
- 1.5 );
73
+ auto getDelCount = [](const irs::SegmentInfo* itemMeta) {
74
+ return (itemMeta->docs_count - itemMeta->live_docs_count );
75
+ };
158
76
159
- // FIXME use relative measure, e.g. cosolidation_size/total_size
160
- // carefully prefer smaller consolidations over the bigger ones
161
- score /= std::pow (size_after_consolidation, 0.5 );
77
+ mergeBytes = mergeBytes - removeMeta->byte_size + addMeta->byte_size ;
78
+ skew = static_cast <double >(addMeta->byte_size ) / mergeBytes;
79
+ delCount = delCount - getDelCount (removeMeta) + getDelCount (addMeta);
80
+ mergeScore = skew + (1 / (1 + delCount));
81
+ cost = mergeBytes * mergeScore;
82
+ }
83
+
84
+ // Currently we're using powers of 4 to define tiers,
85
+ // with the smallest tier being 0-4MB. We select subsequent
86
+ // tiers by multiplying the last tier by 4.
87
+ // So we get 0-4MB, 4MB-16MB and so on.
88
+ size_t getConsolidationTier (size_t num) {
162
89
163
- // favor consolidations which clean out removals
164
- score /= std::pow (static_cast <double >(size_after_consolidation) /
165
- static_cast <double >(size_before_consolidation),
166
- 2 );
90
+ size_t nextTier = ConsolidationConfig::tier1;
91
+ while (nextTier < num)
92
+ nextTier = nextTier << 2 ;
167
93
168
- return score ;
94
+ return nextTier ;
169
95
}
170
96
171
97
} // namespace tier
172
- } // namespace
173
98
174
99
namespace irs ::index_utils {
175
100
@@ -391,6 +316,9 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
391
316
// / if
392
317
// / - segment size is greater than 'max_segments_bytes / 2'
393
318
// / - segment has many documents but only few deletions
319
+ // /
320
+ // / TODO - too_big_segments_threshold formula is unreasonable
321
+ // / - add unit tests as well
394
322
// /////////////////////////////////////////////////////////////////////////
395
323
396
324
const double_t total_fill_factor =
@@ -413,63 +341,36 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
413
341
}
414
342
}
415
343
344
+ // No point in attempting consolidation if we don't have
345
+ // enough segments to fill the consolidation window
346
+ if (sorted_segments.size () < tier::ConsolidationConfig::candidate_size)
347
+ return ;
348
+
416
349
// /////////////////////////////////////////////////////////////////////////
417
350
// / Stage 3
418
- // / sort candidates
351
+ // / sort candidates and organize them into tiers
419
352
// /////////////////////////////////////////////////////////////////////////
420
353
421
354
std::sort (sorted_segments.begin (), sorted_segments.end ());
422
355
423
- // /////////////////////////////////////////////////////////////////////////
424
- // / Stage 4
425
- // / find proper candidates
426
- // /////////////////////////////////////////////////////////////////////////
427
-
428
- tier::ConsolidationCandidate best (sorted_segments.begin ());
429
-
430
- if (sorted_segments.size () >= min_segments_per_tier) {
431
- for (auto i = sorted_segments.begin (), end = sorted_segments.end ();
432
- i != end; ++i) {
433
- tier::ConsolidationCandidate candidate (i);
434
-
435
- while (candidate.segments .second != end &&
436
- candidate.count < max_segments_per_tier) {
437
- candidate.size += candidate.segments .second ->size ;
438
-
439
- if (candidate.size > max_segments_bytes) {
440
- // overcome the limit
441
- break ;
442
- }
443
-
444
- ++candidate.count ;
445
- ++candidate.segments .second ;
356
+ auto getSegmentSize = [](const tier::SegmentStats& segment) {
357
+ return segment.meta ->byte_size ;
358
+ };
446
359
447
- if (candidate.count < min_segments_per_tier) {
448
- // not enough segments yet
449
- continue ;
450
- }
451
-
452
- candidate.score = tier::consolidation_score (
453
- candidate, max_segments_per_tier, floor_segment_bytes);
454
-
455
- if (candidate.score < min_score) {
456
- // score is too small
457
- continue ;
458
- }
459
-
460
- if (best.score < candidate.score ) {
461
- best = candidate;
462
- }
463
- }
464
- }
465
- }
360
+ auto tiers = tier::mapToTiers (sorted_segments, getSegmentSize);
466
361
467
362
// /////////////////////////////////////////////////////////////////////////
468
363
// / Stage 4
469
- // / pick the best candidate
364
+ // / Find best candidate for consolidation.
470
365
// /////////////////////////////////////////////////////////////////////////
471
366
472
- std::copy (best.begin (), best.end (), std::back_inserter (candidates));
367
+ tier::ConsolidationCandidate best;
368
+ auto ret = tier::findBestConsolidationCandidate<tier::SegmentStats>(tiers, max_segments_bytes, best);
369
+ if (!ret) {
370
+ return ;
371
+ }
372
+
373
+ std::copy (best.first (), best.last () + 1 , std::back_inserter (candidates));
473
374
};
474
375
}
475
376
0 commit comments