Skip to content

Commit c1e6ebb

Browse files
committed
Consolidation issue fix.
- Fixed the tier-based candidate selection.
- Default tiers are powers of 4: the first tier is 0-4 MB, followed by 4-16 MB, 16-64 MB, and so on.
- The consolidation window now has a fixed size of 4 segments.
1 parent 07286d8 commit c1e6ebb

File tree

3 files changed

+279
-169
lines changed

3 files changed

+279
-169
lines changed

core/index/index_meta.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@ class format;
3939
class IndexWriter;
4040

4141
struct SegmentInfo {
42+
SegmentInfo() = default;
43+
44+
// Added for testing purposes.
45+
SegmentInfo(
46+
const std::string& _name,
47+
uint64_t _byte_size
48+
) : name(_name), byte_size(_byte_size)
49+
{}
50+
4251
bool operator==(const SegmentInfo&) const = default;
4352

4453
std::string name; // FIXME(gnusi): move to SegmentMeta

core/utils/index_utils.cpp

Lines changed: 70 additions & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -28,148 +28,73 @@
2828

2929
#include "formats/format_utils.hpp"
3030

31-
namespace {
32-
33-
// Returns the fraction of live documents (live_docs_count / docs_count)
34-
inline double FillFactor(const irs::SegmentInfo& segment) noexcept {
35-
return static_cast<double>(segment.live_docs_count) /
36-
static_cast<double>(segment.docs_count);
37-
}
38-
39-
// Returns approximated size of a segment in the absence of removals
40-
inline size_t SizeWithoutRemovals(const irs::SegmentInfo& segment) noexcept {
41-
return size_t(static_cast<double>(segment.byte_size) * FillFactor(segment));
42-
}
43-
4431
namespace tier {
4532

46-
struct SegmentStats {
47-
// cppcheck-suppress noExplicitConstructor
48-
SegmentStats(const irs::SubReader& reader) noexcept
49-
: reader{&reader},
50-
meta{&reader.Meta()},
51-
size{SizeWithoutRemovals(*meta)},
52-
fill_factor{FillFactor(*meta)} {}
53-
54-
bool operator<(const SegmentStats& rhs) const noexcept {
55-
// cppcheck-suppress constVariable
56-
auto& lhs = *this;
57-
58-
if (lhs.size == rhs.size) {
59-
if (lhs.fill_factor > rhs.fill_factor) {
60-
return true;
61-
} else if (lhs.fill_factor < rhs.fill_factor) {
62-
return false;
63-
}
33+
// ConsolidationConfig static constants.
34+
const size_t ConsolidationConfig::candidate_size { 4 }; // consolidation window size
35+
const size_t ConsolidationConfig::tier1 { 1 << 22 }; // 4 MB
36+
const double ConsolidationConfig::maxMergeScore { 1.5 }; // Skip consolidation if candidate score is greater
6437

65-
return lhs.meta->name < rhs.meta->name;
66-
}
38+
ConsolidationCandidate::ConsolidationCandidate(
39+
iterator_t start,
40+
iterator_t end) noexcept
41+
: segments(start, end) {
6742

68-
return lhs.size < rhs.size;
69-
}
43+
initialized = true;
7044

71-
operator const irs::SubReader*() const noexcept { return reader; }
72-
73-
const irs::SubReader* reader;
74-
const irs::SegmentInfo* meta;
75-
size_t size; // approximate size of segment without removals
76-
double_t fill_factor;
77-
};
78-
79-
struct ConsolidationCandidate {
80-
using iterator_t = std::vector<SegmentStats>::const_iterator;
81-
using range_t = std::pair<iterator_t, iterator_t>;
82-
83-
explicit ConsolidationCandidate(iterator_t i) noexcept : segments(i, i) {}
84-
85-
iterator_t begin() const noexcept { return segments.first; }
86-
iterator_t end() const noexcept { return segments.second; }
87-
88-
range_t segments;
89-
size_t count{0};
90-
size_t size{0}; // estimated size of the level
91-
double_t score{DBL_MIN}; // how good this permutation is
92-
};
93-
94-
/// @returns score of the consolidation bucket
95-
double_t consolidation_score(const ConsolidationCandidate& consolidation,
96-
const size_t segments_per_tier,
97-
const size_t floor_segment_bytes) noexcept {
98-
// to detect how skewed the consolidation is, we do the following:
99-
// 1. evaluate coefficient of variation, less is better
100-
// 2. good candidates are in range [0;1]
101-
// 3. favor candidates where number of segments is equal to
102-
// 'segments_per_tier' approx
103-
// 4. prefer smaller consolidations
104-
// 5. prefer consolidations which clean removals
105-
106-
switch (consolidation.count) {
107-
case 0:
108-
// empty consolidation makes no sense
109-
return DBL_MIN;
110-
case 1: {
111-
auto& meta = *consolidation.segments.first->meta;
112-
113-
if (meta.docs_count == meta.live_docs_count) {
114-
// singleton without removals makes no sense
115-
return DBL_MIN;
116-
}
45+
// Calculate initial cost
46+
auto itr = start;
47+
do
48+
{
49+
auto itrMeta = itr->meta;
11750

118-
// FIXME honor number of deletes???
119-
// singleton with removals makes sense if nothing better is found
120-
return DBL_MIN + DBL_EPSILON;
121-
}
122-
}
51+
mergeBytes += itrMeta->byte_size;
52+
skew = static_cast<double>(itrMeta->byte_size) / mergeBytes;
53+
delCount += (itrMeta->docs_count - itrMeta->live_docs_count);
54+
mergeScore = skew + (1.0 / (1 + delCount));
55+
cost = mergeBytes * mergeScore;
12356

124-
size_t size_before_consolidation = 0;
125-
size_t size_after_consolidation = 0;
126-
size_t size_after_consolidation_floored = 0;
127-
for (auto& segment_stat : consolidation) {
128-
size_before_consolidation += segment_stat.meta->byte_size;
129-
size_after_consolidation += segment_stat.size;
130-
size_after_consolidation_floored +=
131-
std::max(segment_stat.size, floor_segment_bytes);
57+
} while (itr++ != end);
13258
}
13359

134-
// evaluate coefficient of variation
135-
double sum_square_differences = 0;
136-
const auto segment_size_after_consolidaton_mean =
137-
static_cast<double>(size_after_consolidation_floored) /
138-
static_cast<double>(consolidation.count);
139-
for (auto& segment_stat : consolidation) {
140-
const auto diff =
141-
static_cast<double>(std::max(segment_stat.size, floor_segment_bytes)) -
142-
segment_size_after_consolidaton_mean;
143-
sum_square_differences += diff * diff;
144-
}
60+
// Caller is responsible for ensuring that at least one
61+
// more segment follows the current window (segments.second
62+
// is not the last element) before calling advance().
63+
void ConsolidationCandidate::advance() noexcept {
64+
if (!initialized)
65+
return;
14566

146-
const auto stdev = std::sqrt(sum_square_differences /
147-
static_cast<double>(consolidation.count));
148-
const auto cv = (stdev / segment_size_after_consolidaton_mean);
67+
const auto& removeMeta = segments.first->meta;
68+
const auto& addMeta = (segments.second + 1)->meta;
14969

150-
// evaluate initial score
151-
auto score = 1. - cv;
70+
std::advance(segments.first, 1);
71+
std::advance(segments.second, 1);
15272

153-
// favor consolidations that contain approximately the requested number of
154-
// segments
155-
score *= std::pow(static_cast<double>(consolidation.count) /
156-
static_cast<double>(segments_per_tier),
157-
1.5);
73+
auto getDelCount = [](const irs::SegmentInfo* itemMeta) {
74+
return (itemMeta->docs_count - itemMeta->live_docs_count);
75+
};
15876

159-
// FIXME use relative measure, e.g. consolidation_size/total_size
160-
// carefully prefer smaller consolidations over the bigger ones
161-
score /= std::pow(size_after_consolidation, 0.5);
77+
mergeBytes = mergeBytes - removeMeta->byte_size + addMeta->byte_size;
78+
skew = static_cast<double>(addMeta->byte_size) / mergeBytes;
79+
delCount = delCount - getDelCount(removeMeta) + getDelCount(addMeta);
80+
mergeScore = skew + (1 / (1 + delCount));
81+
cost = mergeBytes * mergeScore;
82+
}
83+
84+
// Currently we're using powers of 4 to define tiers,
85+
// with the smallest tier being 0-4MB. We select subsequent
86+
// tiers by multiplying the last tier by 4.
87+
// So we get 0-4MB, 4MB-16MB and so on.
88+
size_t getConsolidationTier(size_t num) {
16289

163-
// favor consolidations which clean out removals
164-
score /= std::pow(static_cast<double>(size_after_consolidation) /
165-
static_cast<double>(size_before_consolidation),
166-
2);
90+
size_t nextTier = ConsolidationConfig::tier1;
91+
while (nextTier < num)
92+
nextTier = nextTier << 2;
16793

168-
return score;
94+
return nextTier;
16995
}
17096

17197
} // namespace tier
172-
} // namespace
17398

17499
namespace irs::index_utils {
175100

@@ -391,6 +316,9 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
391316
/// if
392317
/// - segment size is greater than 'max_segments_bytes / 2'
393318
/// - segment has many documents but only few deletions
319+
///
320+
/// TODO - too_big_segments_threshold formula is unreasonable
321+
/// - add unit tests as well
394322
///////////////////////////////////////////////////////////////////////////
395323

396324
const double_t total_fill_factor =
@@ -413,63 +341,36 @@ ConsolidationPolicy MakePolicy(const ConsolidateTier& options) {
413341
}
414342
}
415343

344+
// No point in attempting consolidation if we don't have
345+
// enough segments to fill the consolidation window
346+
if (sorted_segments.size() < tier::ConsolidationConfig::candidate_size)
347+
return;
348+
416349
///////////////////////////////////////////////////////////////////////////
417350
/// Stage 3
418-
/// sort candidates
351+
/// sort candidates and organize them into tiers
419352
///////////////////////////////////////////////////////////////////////////
420353

421354
std::sort(sorted_segments.begin(), sorted_segments.end());
422355

423-
///////////////////////////////////////////////////////////////////////////
424-
/// Stage 4
425-
/// find proper candidates
426-
///////////////////////////////////////////////////////////////////////////
427-
428-
tier::ConsolidationCandidate best(sorted_segments.begin());
429-
430-
if (sorted_segments.size() >= min_segments_per_tier) {
431-
for (auto i = sorted_segments.begin(), end = sorted_segments.end();
432-
i != end; ++i) {
433-
tier::ConsolidationCandidate candidate(i);
434-
435-
while (candidate.segments.second != end &&
436-
candidate.count < max_segments_per_tier) {
437-
candidate.size += candidate.segments.second->size;
438-
439-
if (candidate.size > max_segments_bytes) {
440-
// overcome the limit
441-
break;
442-
}
443-
444-
++candidate.count;
445-
++candidate.segments.second;
356+
auto getSegmentSize = [](const tier::SegmentStats& segment) {
357+
return segment.meta->byte_size;
358+
};
446359

447-
if (candidate.count < min_segments_per_tier) {
448-
// not enough segments yet
449-
continue;
450-
}
451-
452-
candidate.score = tier::consolidation_score(
453-
candidate, max_segments_per_tier, floor_segment_bytes);
454-
455-
if (candidate.score < min_score) {
456-
// score is too small
457-
continue;
458-
}
459-
460-
if (best.score < candidate.score) {
461-
best = candidate;
462-
}
463-
}
464-
}
465-
}
360+
auto tiers = tier::mapToTiers(sorted_segments, getSegmentSize);
466361

467362
///////////////////////////////////////////////////////////////////////////
468363
/// Stage 4
469-
/// pick the best candidate
364+
/// Find best candidate for consolidation.
470365
///////////////////////////////////////////////////////////////////////////
471366

472-
std::copy(best.begin(), best.end(), std::back_inserter(candidates));
367+
tier::ConsolidationCandidate best;
368+
auto ret = tier::findBestConsolidationCandidate<tier::SegmentStats>(tiers, max_segments_bytes, best);
369+
if (!ret) {
370+
return;
371+
}
372+
373+
std::copy(best.first(), best.last() + 1, std::back_inserter(candidates));
473374
};
474375
}
475376

0 commit comments

Comments
 (0)