Skip to content

Commit 06d4d26

Browse files
authored
Implement parallelism in validator engine (#1812)
* Apply saved patches for parallel checking of accounts' transactions * Cleanup unneeded code * Fix account_dict update logic * Implement thread-safe scan_account_libraries * Implement thread-safe deferring messages * Implement thread-safe timer * Implement thread-safe storage stat cache * Implement thread-safe work time stat * Cleanup threadpool * Add option to control parallel validation of accounts * Final touches * Integrate with td::actor system * Implement synchronous `CheckAccountTxs` execution * Implement parallel `CheckAccountTxs` execution * Cleanup unneeded code * Remove unneeded lock-free structure * Final polishing * Fix ValidateQuery actor dying before all CheckAccountTxs actors finish * Catch VM errors inside CheckAccountTxs actor * Prevent ValidateQuery actor dying by timeout before CheckAccountTxs * Better naming for CheckAccountTxs actors * Fixes after merge * Fix timer measurements * More accurate validation stats * More accurate error handling * Remove unneeded alarm in CheckAccountTxs actor * Fix incorrect error reporting * Final fix of error handling * Fix error handling (again) * Rename option * Add fixes from review * Codestyle * Remove debug code
1 parent 244789a commit 06d4d26

File tree

10 files changed

+466
-207
lines changed

10 files changed

+466
-207
lines changed

tl/generate/scheme/ton_api.tl

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -980,10 +980,11 @@ validatorStats.validatedBlock
980980
self:int256
981981
valid:Bool comment:string
982982
bytes:int collated_data_bytes:int
983-
total_time:double work_time:double cpu_work_time:double time_stats:string
983+
total_time:double work_time:double actual_time:double cpu_work_time:double time_stats:string
984984
work_time_real_stats:string
985985
work_time_cpu_stats:string
986-
storage_stat_cache:validatorStats.storageStatCacheStats = validatorStats.ValidatedBlock;
986+
storage_stat_cache:validatorStats.storageStatCacheStats
987+
parallel_accounts_validation:Bool = validatorStats.ValidatedBlock;
987988

988989
validatorStats.newValidatorGroup.node id:int256 pubkey:PublicKey adnl_id:int256 weight:long = validatorStats.newValidatorGroup.Node;
989990
validatorStats.newValidatorGroup session_id:int256 shard:tonNode.shardId cc_seqno:int

validator-engine/validator-engine.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1638,6 +1638,7 @@ td::Status ValidatorEngine::load_global_config() {
16381638
}
16391639
validator_options_.write().set_hardforks(std::move(h));
16401640
validator_options_.write().set_catchain_broadcast_speed_multiplier(broadcast_speed_multiplier_catchain_);
1641+
validator_options_.write().set_parallel_validation(parallel_validation_);
16411642

16421643
for (auto &id : config_.collator_node_whitelist) {
16431644
validator_options_.write().set_collator_node_whitelisted_validator(id, true);
@@ -5576,6 +5577,9 @@ int main(int argc, char *argv[]) {
55765577
}
55775578
return td::Status::OK();
55785579
});
5580+
p.add_option('\0', "parallel-validation", "parallel validation over different accounts", [&]() {
5581+
acts.push_back([&x]() { td::actor::send_closure(x, &ValidatorEngine::set_parallel_validation, true); });
5582+
});
55795583
auto S = p.run(argc, argv);
55805584
if (S.is_error()) {
55815585
LOG(ERROR) << "failed to parse options: " << S.move_as_error();

validator-engine/validator-engine.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ class ValidatorEngine : public td::actor::Actor {
257257
td::optional<ton::BlockSeqno> sync_shards_upto_;
258258
ton::adnl::AdnlNodeIdShort shard_block_retainer_adnl_id_ = ton::adnl::AdnlNodeIdShort::zero();
259259
bool shard_block_retainer_adnl_id_fullnode_ = false;
260+
bool parallel_validation_ = false;
260261
double initial_sync_delay_ = 60.0;
261262

262263
std::set<ton::CatchainSeqno> unsafe_catchains_;
@@ -382,6 +383,9 @@ class ValidatorEngine : public td::actor::Actor {
382383
void set_shard_block_retainer_adnl_id_fullnode() {
383384
shard_block_retainer_adnl_id_fullnode_ = true;
384385
}
386+
void set_parallel_validation(bool value) {
387+
parallel_validation_ = value;
388+
}
385389
void set_initial_sync_delay(double value) {
386390
initial_sync_delay_ = value;
387391
}

validator/fabric.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ struct ValidateParams {
5454

5555
// Optional - used for validation of optimistic candidates
5656
Ref<BlockData> optimistic_prev_block = {};
57+
58+
bool parallel_validation = false;
5759
};
5860

5961
td::actor::ActorOwn<Db> create_db_actor(td::actor::ActorId<ValidatorManager> manager, std::string db_root_,

validator/impl/validate-query.cpp

Lines changed: 358 additions & 188 deletions
Large diffs are not rendered by default.

validator/impl/validate-query.hpp

Lines changed: 62 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@
1818
*/
1919
#pragma once
2020

21+
#include <atomic>
2122
#include <map>
23+
#include <optional>
2224
#include <string>
2325
#include <vector>
2426

@@ -103,9 +105,9 @@ inline ErrorCtxSet ErrorCtx::set_guard(std::vector<std::string> str_list) {
103105
/*
104106
*
105107
* must write candidate to disk, if accepted
106-
* can reject block only if it is invalid (i.e. in case of
108+
* can reject block only if it is invalid (i.e. in case of
107109
* internal errors must retry or crash)
108-
* only exception: block can be rejected, if it is known from
110+
* only exception: block can be rejected, if it is known from
109111
* masterchain, that it will not be part of shardchain finalized
110112
* state
111113
*
@@ -150,6 +152,13 @@ class ValidateQuery : public td::actor::Actor {
150152
bool prev_key_block_exists_{false};
151153
bool debug_checks_{false};
152154
bool outq_cleanup_partial_{false};
155+
bool parallel_accounts_validation_{false};
156+
bool parallel_accounts_validation_pending_{false};
157+
bool check_account_failed_{false};
158+
td::RealCpuTimer parallel_work_timer_{/*is_paused=*/true};
159+
std::optional<td::Status> check_account_fatal_error_ = std::nullopt;
160+
std::optional<std::string> check_account_reject_error_ = std::nullopt;
161+
std::optional<td::BufferSlice> check_account_reject_reason_ = std::nullopt;
153162
BlockSeqno prev_key_seqno_{~0u};
154163
int stage_{0};
155164
td::BitArray<64> shard_pfx_;
@@ -201,8 +210,7 @@ class ValidateQuery : public td::actor::Actor {
201210
ton::BlockIdExt prev_key_block_;
202211
ton::LogicalTime prev_key_block_lt_;
203212
std::unique_ptr<block::BlockLimits> block_limits_;
204-
std::unique_ptr<block::BlockLimitStatus> block_limit_status_;
205-
td::uint64 total_gas_used_{0}, total_special_gas_used_{0};
213+
mutable std::atomic_uint64_t total_gas_used_{0}, total_special_gas_used_{0};
206214

207215
LogicalTime start_lt_, end_lt_;
208216
UnixTime prev_now_{~0u}, now_{~0u};
@@ -330,6 +338,7 @@ class ValidateQuery : public td::actor::Actor {
330338
bool unpack_block_candidate();
331339
bool extract_collated_data_from(Ref<vm::Cell> croot, int idx);
332340
bool extract_collated_data();
341+
bool check_account_failures();
333342
bool try_validate();
334343
bool compute_prev_state();
335344
bool compute_next_state();
@@ -395,13 +404,56 @@ class ValidateQuery : public td::actor::Actor {
395404
td::Bits256& msg_hash);
396405
bool check_in_queue();
397406
bool check_delivered_dequeued();
398-
std::unique_ptr<block::Account> make_account_from(td::ConstBitPtr addr, Ref<vm::CellSlice> account);
399-
std::unique_ptr<block::Account> unpack_account(td::ConstBitPtr addr);
400-
bool check_one_transaction(block::Account& account, LogicalTime lt, Ref<vm::Cell> trans_root, bool is_first,
401-
bool is_last);
402-
bool check_account_transactions(const StdSmcAddress& acc_addr, Ref<vm::CellSlice> acc_tr);
407+
408+
class CheckAccountTxs : public Actor {
409+
public:
410+
struct Context {
411+
std::vector<std::tuple<Bits256, LogicalTime, LogicalTime>> msg_proc_lt{};
412+
block::CurrencyCollection total_burned{0};
413+
std::vector<std::tuple<Bits256, Bits256, bool>> lib_publishers{};
414+
bool defer_all_messages = false;
415+
std::vector<std::pair<td::Ref<vm::Cell>, td::uint32>> storage_stat_cache_update{};
416+
ValidationStats::WorkTimeStats work_time{};
417+
418+
std::optional<td::Status> fatal_error;
419+
std::optional<std::string> reject_error;
420+
std::optional<td::BufferSlice> reject_reason;
421+
};
422+
423+
CheckAccountTxs(const ValidateQuery& vq, td::actor::ActorId<ValidateQuery> vq_id, StdSmcAddress address,
424+
Ref<vm::CellSlice> acc_tr, Context ctx);
425+
426+
bool try_check();
427+
Context extract_context();
428+
429+
private:
430+
void start_up() override;
431+
432+
void abort_query(td::Status error);
433+
bool reject_query(std::string error, td::BufferSlice reason = {});
434+
bool reject_query(std::string err_msg, td::Status error, td::BufferSlice reason = {});
435+
bool fatal_error(td::Status error);
436+
bool fatal_error(std::string err_msg, int err_code = -666);
437+
438+
std::unique_ptr<block::Account> make_account_from(td::ConstBitPtr addr, Ref<vm::CellSlice> account);
439+
std::unique_ptr<block::Account> unpack_account(td::ConstBitPtr addr);
440+
bool check_one_transaction(block::Account& account, LogicalTime lt, Ref<vm::Cell> trans_root, bool is_first,
441+
bool is_last);
442+
bool scan_account_libraries(Ref<vm::Cell> orig_libs, Ref<vm::Cell> final_libs, const td::Bits256& addr);
443+
444+
const ValidateQuery& vq_;
445+
td::actor::ActorId<ValidateQuery> vq_id_;
446+
StdSmcAddress address_;
447+
Ref<vm::CellSlice> acc_tr_;
448+
Context ctx_;
449+
};
450+
friend CheckAccountTxs;
451+
452+
CheckAccountTxs::Context load_check_account_transactions_context(const StdSmcAddress& address);
453+
void save_account_transactions_context(const StdSmcAddress& address, CheckAccountTxs::Context ctx);
454+
455+
void after_check_account_finished(StdSmcAddress address, CheckAccountTxs::Context context);
403456
bool check_transactions();
404-
bool scan_account_libraries(Ref<vm::Cell> orig_libs, Ref<vm::Cell> final_libs, const td::Bits256& addr);
405457
bool check_all_ticktock_processed();
406458
bool check_message_processing_order();
407459
bool check_special_message(Ref<vm::Cell> in_msg_root, const block::CurrencyCollection& amount,

validator/interfaces/validator-manager.h

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,21 @@ struct AsyncSerializerState {
5757
};
5858

5959
struct StorageStatCacheStats {
60-
td::uint64 small_cnt = 0, small_cells = 0;
61-
td::uint64 hit_cnt = 0, hit_cells = 0;
62-
td::uint64 miss_cnt = 0, miss_cells = 0;
60+
std::atomic<td::uint64> small_cnt = 0, small_cells = 0;
61+
std::atomic<td::uint64> hit_cnt = 0, hit_cells = 0;
62+
std::atomic<td::uint64> miss_cnt = 0, miss_cells = 0;
63+
64+
StorageStatCacheStats() {
65+
}
66+
67+
StorageStatCacheStats(const StorageStatCacheStats& other)
68+
: small_cnt(other.small_cnt.load())
69+
, small_cells(other.small_cells.load())
70+
, hit_cnt(other.hit_cnt.load())
71+
, hit_cells(other.hit_cells.load())
72+
, miss_cnt(other.miss_cnt.load())
73+
, miss_cells(other.miss_cells.load()) {
74+
}
6375

6476
tl_object_ptr<ton_api::validatorStats_storageStatCacheStats> tl() const {
6577
return create_tl_object<ton_api::validatorStats_storageStatCacheStats>(small_cnt, small_cells, hit_cnt, hit_cells,
@@ -180,6 +192,8 @@ struct ValidationStats {
180192
td::uint32 actual_bytes = 0, actual_collated_data_bytes = 0;
181193
double total_time = 0.0;
182194
std::string time_stats;
195+
double actual_time = 0.0;
196+
bool parallel_accounts_validation = false;
183197

184198
struct WorkTimeStats {
185199
td::RealCpuTimer::Time total;
@@ -195,13 +209,14 @@ struct ValidationStats {
195209
}
196210
};
197211
WorkTimeStats work_time;
198-
StorageStatCacheStats storage_stat_cache;
212+
mutable StorageStatCacheStats storage_stat_cache;
199213

200214
tl_object_ptr<ton_api::validatorStats_validatedBlock> tl() const {
201215
return create_tl_object<ton_api::validatorStats_validatedBlock>(
202216
create_tl_block_id(block_id), collated_data_hash, validated_at, self.bits256_value(), valid, comment,
203-
actual_bytes, actual_collated_data_bytes, total_time, work_time.total.real, work_time.total.cpu, time_stats,
204-
work_time.to_str(false), work_time.to_str(true), storage_stat_cache.tl());
217+
actual_bytes, actual_collated_data_bytes, total_time, actual_time, work_time.total.real, work_time.total.cpu,
218+
time_stats, work_time.to_str(false), work_time.to_str(true), storage_stat_cache.tl(),
219+
parallel_accounts_validation);
205220
}
206221
};
207222

validator/validator-group.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,8 @@ void ValidatorGroup::validate_block_candidate(validatorsession::BlockSourceInfo
248248
.prev = std::move(prev),
249249
.validator_set = validator_set_,
250250
.local_validator_id = local_id_,
251-
.optimistic_prev_block = optimistic_prev_block_data},
251+
.optimistic_prev_block = optimistic_prev_block_data,
252+
.parallel_validation = opts_.get()->get_parallel_validation()},
252253
manager_, td::Timestamp::in(15.0), std::move(P));
253254
}
254255

validator/validator-options.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,9 @@ struct ValidatorManagerOptionsImpl : public ValidatorManagerOptions {
173173
td::Ref<ShardBlockVerifierConfig> get_shard_block_verifier_config() const override {
174174
return shard_block_verifier_config_;
175175
}
176+
bool get_parallel_validation() const override {
177+
return parallel_validation;
178+
}
176179

177180
void set_zero_block_id(BlockIdExt block_id) override {
178181
zero_block_id_ = block_id;
@@ -295,6 +298,10 @@ struct ValidatorManagerOptionsImpl : public ValidatorManagerOptions {
295298
shard_block_verifier_config_ = std::move(config);
296299
}
297300

301+
void set_parallel_validation(bool value) override {
302+
parallel_validation = value;
303+
}
304+
298305
ValidatorManagerOptionsImpl* make_copy() const override {
299306
return new ValidatorManagerOptionsImpl(*this);
300307
}
@@ -352,6 +359,7 @@ struct ValidatorManagerOptionsImpl : public ValidatorManagerOptions {
352359
std::set<adnl::AdnlNodeIdShort> collator_node_whitelist_;
353360
bool collator_node_whitelist_enabled_ = false;
354361
td::Ref<ShardBlockVerifierConfig> shard_block_verifier_config_{true};
362+
bool parallel_validation = false;
355363
};
356364

357365
} // namespace validator

validator/validator.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ struct ValidatorManagerOptions : public td::CntObject {
144144
virtual td::optional<double> get_catchain_max_block_delay_slow() const = 0;
145145
virtual bool get_state_serializer_enabled() const = 0;
146146
virtual td::Ref<CollatorOptions> get_collator_options() const = 0;
147+
virtual bool get_parallel_validation() const = 0;
147148
virtual double get_catchain_broadcast_speed_multiplier() const = 0;
148149
virtual bool get_permanent_celldb() const = 0;
149150
virtual td::Ref<CollatorsList> get_collators_list() const = 0;
@@ -188,6 +189,7 @@ struct ValidatorManagerOptions : public td::CntObject {
188189
virtual void set_collator_node_whitelisted_validator(adnl::AdnlNodeIdShort id, bool add) = 0;
189190
virtual void set_collator_node_whitelist_enabled(bool enabled) = 0;
190191
virtual void set_shard_block_verifier_config(td::Ref<ShardBlockVerifierConfig> config) = 0;
192+
virtual void set_parallel_validation(bool value) = 0;
191193

192194
static td::Ref<ValidatorManagerOptions> create(BlockIdExt zero_block_id, BlockIdExt init_block_id,
193195
bool allow_blockchain_init = false, double sync_blocks_before = 3600,

0 commit comments

Comments
 (0)