From 5c8115e032ebc90e982e22c105328fcaf0c74ac6 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 23 Mar 2026 16:08:34 +0300 Subject: [PATCH 01/32] cb for ch output plugin --- _mydata/offsets.yaml | 6 ++ _mydata/push-image.sh | 21 ++++ _mydata/test_config.yaml | 21 ++++ _mydata/welcome.json | 6 ++ plugin/output/clickhouse/clickhouse.go | 135 ++++++++++++++++++++----- 5 files changed, 165 insertions(+), 24 deletions(-) create mode 100644 _mydata/offsets.yaml create mode 100755 _mydata/push-image.sh create mode 100644 _mydata/test_config.yaml create mode 100644 _mydata/welcome.json diff --git a/_mydata/offsets.yaml b/_mydata/offsets.yaml new file mode 100644 index 000000000..cc2b75b8a --- /dev/null +++ b/_mydata/offsets.yaml @@ -0,0 +1,6 @@ +- file: /Users/serlazarenko/dev/repos/file.d/_mydata/welcome.json + inode: 5043215 + source_id: 2049134542 + last_read_timestamp: 1772625961393116000 + streams: + not_set: 499 diff --git a/_mydata/push-image.sh b/_mydata/push-image.sh new file mode 100755 index 000000000..45f398f05 --- /dev/null +++ b/_mydata/push-image.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +VERSION=$(git describe --abbrev=4 --dirty --always --tags | sed -E 's/-.+$//') + +if [[ ! -z "$1" ]]; then + VERSION=$1 +fi + +echo "Build version $VERSION" +docker buildx build --file ./build/package/Dockerfile_dev --build-arg VERSION=$VERSION --platform linux/amd64 -t gitlab-registry.ozon.ru/sre/images/file-d:$VERSION . + +read -p "Do you want to push image? (y/n) " -r +echo +if [[ ! $REPLY =~ ^[Yy]$ ]] +then +echo "Cancelled by user"; +exit 1; +fi + +echo "Push image to registry"; +docker push gitlab-registry.ozon.ru/sre/images/file-d:$VERSION \ No newline at end of file diff --git a/_mydata/test_config.yaml b/_mydata/test_config.yaml new file mode 100644 index 000000000..af7606b7d --- /dev/null +++ b/_mydata/test_config.yaml @@ -0,0 +1,21 @@ +pipelines: + test_pipeline: + input: + type: file + persistence_mode: async + watching_dir: ./_mydata + filename_pattern: "welcome.json" + offsets_file: ./_mydata/offsets.yaml + actions: + - type: rename + override: true + systemd\.unit: service + syslog\.identifier: service + k8s_pod_label_app: service + - type: discard + do_if: + op: equal + field: service + values: [null, ""] + output: + type: stdout diff --git a/_mydata/welcome.json b/_mydata/welcome.json new file mode 100644 index 000000000..6b8585777 --- /dev/null +++ b/_mydata/welcome.json @@ -0,0 +1,6 @@ +{"level":"error","message":"1: k8s_pod_label_app","k8s_pod_label_app":"k8s-service"} +{"level":"info","message":"2: systemd.unit","systemd.unit":"systemd-service"} +{"level":"warning","message":"3: syslog.identifier","syslog.identifier":"syslog-service"} +{"level":"debug","message":"4: No service field"} +{"level":"error","message":"5: Empty k8s_pod_label_app","k8s_pod_label_app":""} +{"level":"info","message":"6: Multiple sources","k8s_pod_label_app":"k8s-service","systemd.unit":"systemd-service"} diff --git a/plugin/output/clickhouse/clickhouse.go b/plugin/output/clickhouse/clickhouse.go index 7f6c8366a..63fb7a7af 100644 --- a/plugin/output/clickhouse/clickhouse.go +++ b/plugin/output/clickhouse/clickhouse.go @@ -8,6 +8,7 @@ import ( "fmt" "net" "strings" + "sync" "time" "github.com/ClickHouse/ch-go" @@ -61,6 +62,10 @@ type Plugin struct { queriesCountMetric *metric.Counter router *pipeline.Router + + bannedHosts map[Address]time.Time + pendingHosts map[Address]struct{} + mu sync.Mutex } type Setting struct { @@ -334,6 +339,18 @@ type Config struct { // > After this timeout batch will be sent even if batch isn't completed. BatchFlushTimeout cfg.Duration `json:"batch_flush_timeout" default:"200ms" parse:"duration"` // * BatchFlushTimeout_ time.Duration + + // > @3@4@5@6 + // > + // > Timeout for banning host if it fails. + BanTimeout cfg.Duration `json:"ban_timeout" default:"10s" parse:"duration"` // * + BanTimeout_ time.Duration + + // > @3@4@5@6 + // > + // > Interval for retrying connections to banned hosts + RetryInterval cfg.Duration `json:"retry_interval" default:"5s" parse:"duration"` // * + RetryInterval_ time.Duration } func init() { @@ -355,6 +372,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginParams) { p.logger = params.Logger.Desugar() + p.bannedHosts = make(map[Address]time.Time, len(p.config.Addresses)) // Заранее выделил память под мапку + p.pendingHosts = make(map[Address]struct{}, len(p.config.Addresses)) // не знал, как обыграть момент с удалением успешно + // подключенного(ранее ожидающего) хоста, поэтому поменял на мапу (этот момент видно будет в checkBannedHosts) p.config = config.(*Config) p.registerMetrics(params.MetricCtl) p.ctx, p.cancelFunc = context.WithCancel(context.Background()) @@ -404,28 +424,14 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP } for _, addr := range p.config.Addresses { - addr.Addr = addrWithDefaultPort(addr.Addr, "9000") - pool, err := chpool.New(p.ctx, chpool.Options{ - ClientOptions: ch.Options{ - Logger: p.logger.Named("driver"), - Address: addr.Addr, - Database: p.config.Database, - User: p.config.User, - Password: p.config.Password, - QuotaKey: p.config.QuotaKey, - Compression: compression, - Settings: p.config.ClickhouseSettings.toProtoSettings(), - DialTimeout: time.Second * 10, - TLS: b.Build(), - HandshakeTimeout: time.Minute, - }, - MaxConnLifetime: p.config.MaxConnLifetime_, - MaxConnIdleTime: p.config.MaxConnIdleTime_, - MaxConns: p.config.MaxConns_, - MinConns: p.config.MinConns_, - HealthCheckPeriod: p.config.HealthCheckPeriod_, - }) + pool, err := p.createConnection(addr, compression, b) if err != nil { + var netError net.Error + if errors.As(err, &netError) { + p.mu.Lock() + p.bannedHosts[addr] = time.Now().Add(p.config.BanTimeout_) + p.mu.Unlock() + } p.logger.Error("create clickhouse connection pool", zap.Error(err), zap.String("addr", addr.Addr)) } else { for j := 0; j < *addr.Weight; j++ { @@ -433,11 +439,16 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP } } } - - if len(p.instances) == 0 { + // CORELOG-3268 если все хосты недоступны и fatal_on_failed_insert=false, пропускаем логи и идем дальше, + // если все хосты недоступны и fatal_on_failed_insert=true — падаем. + if len(p.instances) == 0 && p.config.FatalOnFailedInsert == true { p.logger.Fatal("cannot start: no available clickhouse addresses in config") } - + // Объясни момент, в чем критическая важность reconnect делать именно в out, + // а не сразу в start, чтобы к моменту out мб поднялся какой-то хост и мы создали instance + if len(p.bannedHosts) > 0 { + go p.checkBannedHosts(compression, b) + } batcherOpts := pipeline.BatcherOptions{ PipelineName: params.PipelineName, OutputType: outPluginType, @@ -484,6 +495,82 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(p.ctx) } +func (p *Plugin) createConnection(addr Address, compression ch.Compression, b xtls.ConfigBuilder) (*chpool.Pool, error) { + addr.Addr = addrWithDefaultPort(addr.Addr, "9000") + pool, err := chpool.New(p.ctx, chpool.Options{ + ClientOptions: ch.Options{ + Logger: p.logger.Named("driver"), + Address: addr.Addr, + Database: p.config.Database, + User: p.config.User, + Password: p.config.Password, + QuotaKey: p.config.QuotaKey, + Compression: compression, + Settings: p.config.ClickhouseSettings.toProtoSettings(), + DialTimeout: time.Second * 10, + TLS: b.Build(), + HandshakeTimeout: time.Minute, + }, + MaxConnLifetime: p.config.MaxConnLifetime_, + MaxConnIdleTime: p.config.MaxConnIdleTime_, + MaxConns: p.config.MaxConns_, + MinConns: p.config.MinConns_, + HealthCheckPeriod: p.config.HealthCheckPeriod_, + }) + + return pool, err +} + +func (p *Plugin) checkBannedHosts(compression ch.Compression, b xtls.ConfigBuilder) { + ticker := time.NewTicker(p.config.RetryInterval_) + for { + select { + case <-p.ctx.Done(): + return + case <-ticker.C: + p.mu.Lock() + if len(p.bannedHosts) == 0 { // Так обыграл раннее завершение горутины + p.mu.Unlock() + return + } + for host, banUntil := range p.bannedHosts { + if time.Now().After(banUntil) { + p.pendingHosts[host] = struct{}{} + } + } + for host := range p.pendingHosts { + err := p.retryConnect(host, compression, b) + if err != nil { + p.logger.Error("failed to reconnect to banned host", zap.Error(err), zap.String("addr", host.Addr)) + continue + } + delete(p.bannedHosts, host) + delete(p.pendingHosts, host) + } + p.mu.Unlock() + } + } +} + +func (p *Plugin) retryConnect(addr Address, compression ch.Compression, b xtls.ConfigBuilder) error { + pool, err := p.createConnection(addr, compression, b) + if err != nil { + var netError net.Error + if errors.As(err, &netError) { + p.mu.Lock() + p.bannedHosts[addr] = time.Now().Add(p.config.BanTimeout_) + p.mu.Unlock() + } + return err + } + + for j := 0; j < *addr.Weight; j++ { + p.instances = append(p.instances, pool) + } + + return nil +} + func (p *Plugin) Stop() { p.cancelFunc() p.batcher.Stop() From ca693aff32c35a85cdaa2f2c264173e49caf87cd Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 23 Mar 2026 16:12:40 +0300 Subject: [PATCH 02/32] cb for ch output plugin --- _mydata/offsets.yaml | 6 ------ _mydata/push-image.sh | 21 --------------------- _mydata/test_config.yaml | 21 --------------------- _mydata/welcome.json | 6 ------ 4 files changed, 54 deletions(-) delete mode 100644 _mydata/offsets.yaml delete mode 100755 _mydata/push-image.sh delete mode 100644 _mydata/test_config.yaml delete mode 100644 _mydata/welcome.json diff --git a/_mydata/offsets.yaml b/_mydata/offsets.yaml deleted file mode 100644 index cc2b75b8a..000000000 --- a/_mydata/offsets.yaml +++ /dev/null @@ -1,6 +0,0 @@ -- file: /Users/serlazarenko/dev/repos/file.d/_mydata/welcome.json - inode: 5043215 - source_id: 2049134542 - last_read_timestamp: 1772625961393116000 - streams: - not_set: 499 diff --git a/_mydata/push-image.sh b/_mydata/push-image.sh deleted file mode 100755 index 45f398f05..000000000 --- a/_mydata/push-image.sh +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash - -VERSION=$(git describe --abbrev=4 --dirty --always --tags | sed -E 's/-.+$//') - -if [[ ! -z "$1" ]]; then - VERSION=$1 -fi - -echo "Build version $VERSION" -docker buildx build --file ./build/package/Dockerfile_dev --build-arg VERSION=$VERSION --platform linux/amd64 -t gitlab-registry.ozon.ru/sre/images/file-d:$VERSION . - -read -p "Do you want to push image? (y/n) " -r -echo -if [[ ! $REPLY =~ ^[Yy]$ ]] -then -echo "Cancelled by user"; -exit 1; -fi - -echo "Push image to registry"; -docker push gitlab-registry.ozon.ru/sre/images/file-d:$VERSION \ No newline at end of file diff --git a/_mydata/test_config.yaml b/_mydata/test_config.yaml deleted file mode 100644 index af7606b7d..000000000 --- a/_mydata/test_config.yaml +++ /dev/null @@ -1,21 +0,0 @@ -pipelines: - test_pipeline: - input: - type: file - persistence_mode: async - watching_dir: ./_mydata - filename_pattern: "welcome.json" - offsets_file: ./_mydata/offsets.yaml - actions: - - type: rename - override: true - systemd\.unit: service - syslog\.identifier: service - k8s_pod_label_app: service - - type: discard - do_if: - op: equal - field: service - values: [null, ""] - output: - type: stdout diff --git a/_mydata/welcome.json b/_mydata/welcome.json deleted file mode 100644 index 6b8585777..000000000 --- a/_mydata/welcome.json +++ /dev/null @@ -1,6 +0,0 @@ -{"level":"error","message":"1: k8s_pod_label_app","k8s_pod_label_app":"k8s-service"} -{"level":"info","message":"2: systemd.unit","systemd.unit":"systemd-service"} -{"level":"warning","message":"3: syslog.identifier","syslog.identifier":"syslog-service"} -{"level":"debug","message":"4: No service field"} -{"level":"error","message":"5: Empty k8s_pod_label_app","k8s_pod_label_app":""} -{"level":"info","message":"6: Multiple sources","k8s_pod_label_app":"k8s-service","systemd.unit":"systemd-service"} From ab289dc43f10cb91bf76bfb6f428f703648edc25 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 23 Mar 2026 16:18:46 +0300 Subject: [PATCH 03/32] docs generate --- plugin/output/clickhouse/README.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/plugin/output/clickhouse/README.md b/plugin/output/clickhouse/README.md index 08b1b374e..e64bd85d1 100644 --- a/plugin/output/clickhouse/README.md +++ b/plugin/output/clickhouse/README.md @@ -215,5 +215,17 @@ After this timeout batch will be sent even if batch isn't completed.
+**`ban_timeout`** *`cfg.Duration`* *`default=10s`* -
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* \ No newline at end of file +Timeout for banning host if it fails. + +
+ +**`retry_interval`** *`cfg.Duration`* *`default=5s`* + +Interval for retrying connections to banned hosts + +
+ + +
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* From df2e46f62d864cad7f874547a22922ad6bf19f77 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 23 Mar 2026 16:22:37 +0300 Subject: [PATCH 04/32] fix --- plugin/output/clickhouse/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/plugin/output/clickhouse/README.md b/plugin/output/clickhouse/README.md index e64bd85d1..a7e25405c 100644 --- a/plugin/output/clickhouse/README.md +++ b/plugin/output/clickhouse/README.md @@ -229,3 +229,4 @@ Interval for retrying connections to banned hosts
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* + From 56abe5a6f3db637a10fae7cad54ecf5fe06f84f7 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 3 Apr 2026 01:27:25 +0300 Subject: [PATCH 05/32] fix --- plugin/output/clickhouse/README.md | 11 +- plugin/output/clickhouse/clickhouse.go | 204 +++++++++++++------- plugin/output/clickhouse/clickhouse_test.go | 26 ++- 3 files changed, 159 insertions(+), 82 deletions(-) diff --git a/plugin/output/clickhouse/README.md b/plugin/output/clickhouse/README.md index a7e25405c..4ca236c6c 100644 --- a/plugin/output/clickhouse/README.md +++ b/plugin/output/clickhouse/README.md @@ -215,18 +215,17 @@ After this timeout batch will be sent even if batch isn't completed.
-**`ban_timeout`** *`cfg.Duration`* *`default=10s`* +**`failure_cooldown_period`** *`cfg.Duration`* *`default=10s`* -Timeout for banning host if it fails. +Period for which addresses will be banned in case of unavailability.
-**`retry_interval`** *`cfg.Duration`* *`default=5s`* +**`reconnect_interval`** *`cfg.Duration`* *`default=5s`* -Interval for retrying connections to banned hosts +Interval for reconnecting to addresses that are unavailable during initialization.
-
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* - +
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* \ No newline at end of file diff --git a/plugin/output/clickhouse/clickhouse.go b/plugin/output/clickhouse/clickhouse.go index 63fb7a7af..d2dccc6ed 100644 --- a/plugin/output/clickhouse/clickhouse.go +++ b/plugin/output/clickhouse/clickhouse.go @@ -3,6 +3,7 @@ package clickhouse import ( "bytes" "context" + "crypto/tls" "encoding/json" "errors" "fmt" @@ -43,6 +44,11 @@ type Clickhouse interface { Do(ctx context.Context, query ch.Query) error } +type instance struct { + addr Address + pool Clickhouse +} + type Plugin struct { logger *zap.Logger @@ -54,7 +60,7 @@ type Plugin struct { query string // TODO: support shards - instances []Clickhouse + instances []instance requestID atomic.Int64 // plugin metrics @@ -63,9 +69,13 @@ type Plugin struct { router *pipeline.Router + compression ch.Compression + tlsConfig *tls.Config + + poolsByAddr map[Address]Clickhouse bannedHosts map[Address]time.Time pendingHosts map[Address]struct{} - mu sync.Mutex + mu sync.RWMutex } type Setting struct { @@ -342,15 +352,15 @@ type Config struct { // > @3@4@5@6 // > - // > Timeout for banning host if it fails. - BanTimeout cfg.Duration `json:"ban_timeout" default:"10s" parse:"duration"` // * - BanTimeout_ time.Duration + // > Period for which addresses will be banned in case of unavailability. + FailureCooldownPeriod cfg.Duration `json:"failure_cooldown_period" default:"10s" parse:"duration"` // * + FailureCooldownPeriod_ time.Duration // > @3@4@5@6 // > - // > Interval for retrying connections to banned hosts - RetryInterval cfg.Duration `json:"retry_interval" default:"5s" parse:"duration"` // * - RetryInterval_ time.Duration + // > Interval for reconnecting to addresses that are unavailable during initialization. + ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * + ReconnectInterval_ time.Duration } func init() { @@ -371,11 +381,12 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginParams) { p.logger = params.Logger.Desugar() - - p.bannedHosts = make(map[Address]time.Time, len(p.config.Addresses)) // Заранее выделил память под мапку - p.pendingHosts = make(map[Address]struct{}, len(p.config.Addresses)) // не знал, как обыграть момент с удалением успешно - // подключенного(ранее ожидающего) хоста, поэтому поменял на мапу (этот момент видно будет в checkBannedHosts) p.config = config.(*Config) + + p.bannedHosts = make(map[Address]time.Time, len(p.config.Addresses)) + p.pendingHosts = make(map[Address]struct{}, len(p.config.Addresses)) + p.poolsByAddr = make(map[Address]Clickhouse, len(p.config.Addresses)) + p.registerMetrics(params.MetricCtl) p.ctx, p.cancelFunc = context.WithCancel(context.Background()) @@ -385,6 +396,12 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP if p.config.InsertTimeout_ < 1 { p.logger.Fatal("'db_request_timeout' can't be <1") } + if p.config.ReconnectInterval_ < 1 { + p.logger.Fatal("'reconnect_interval' can't be <1") + } + if p.config.FailureCooldownPeriod_ < 1 { + p.logger.Fatal("'failure_cooldown_period' cant't be <1") + } schema, err := inferInsaneColInputs(p.config.Columns) if err != nil { @@ -400,55 +417,56 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.config.InsertStrategy_ = StrategyInOrder } - var compression ch.Compression switch strings.ToLower(p.config.Compression) { default: fallthrough case "disabled": - compression = ch.CompressionDisabled + p.compression = ch.CompressionDisabled case "lz4": - compression = ch.CompressionLZ4 + p.compression = ch.CompressionLZ4 case "zstd": - compression = ch.CompressionZSTD + p.compression = ch.CompressionZSTD case "none": - compression = ch.CompressionNone + p.compression = ch.CompressionNone } - var b xtls.ConfigBuilder if p.config.CACert != "" { b := xtls.NewConfigBuilder() - err := b.AppendCARoot(p.config.CACert) - if err != nil { + if err := b.AppendCARoot(p.config.CACert); err != nil { p.logger.Fatal("can't append CA root", zap.Error(err)) } + p.tlsConfig = b.Build() } for _, addr := range p.config.Addresses { - pool, err := p.createConnection(addr, compression, b) + pool, err := p.createConnection(addr) if err != nil { var netError net.Error if errors.As(err, &netError) { - p.mu.Lock() - p.bannedHosts[addr] = time.Now().Add(p.config.BanTimeout_) - p.mu.Unlock() + p.pendingHosts[addr] = struct{}{} } p.logger.Error("create clickhouse connection pool", zap.Error(err), zap.String("addr", addr.Addr)) - } else { - for j := 0; j < *addr.Weight; j++ { - p.instances = append(p.instances, pool) - } + continue + } + p.poolsByAddr[addr] = pool + for j := 0; j < *addr.Weight; j++ { + p.instances = append(p.instances, instance{ + addr: addr, + pool: pool, + }) } } - // CORELOG-3268 если все хосты недоступны и fatal_on_failed_insert=false, пропускаем логи и идем дальше, - // если все хосты недоступны и fatal_on_failed_insert=true — падаем. - if len(p.instances) == 0 && p.config.FatalOnFailedInsert == true { + + if len(p.instances) == 0 && p.config.FatalOnFailedInsert { p.logger.Fatal("cannot start: no available clickhouse addresses in config") } - // Объясни момент, в чем критическая важность reconnect делать именно в out, - // а не сразу в start, чтобы к моменту out мб поднялся какой-то хост и мы создали instance - if len(p.bannedHosts) > 0 { - go p.checkBannedHosts(compression, b) + + go p.checkBannedHosts() + + if len(p.pendingHosts) > 0 { + go p.checkPendingHosts() } + batcherOpts := pipeline.BatcherOptions{ PipelineName: params.PipelineName, OutputType: outPluginType, @@ -495,7 +513,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(p.ctx) } -func (p *Plugin) createConnection(addr Address, compression ch.Compression, b xtls.ConfigBuilder) (*chpool.Pool, error) { +func (p *Plugin) createConnection(addr Address) (*chpool.Pool, error) { addr.Addr = addrWithDefaultPort(addr.Addr, "9000") pool, err := chpool.New(p.ctx, chpool.Options{ ClientOptions: ch.Options{ @@ -505,10 +523,10 @@ func (p *Plugin) createConnection(addr Address, compression ch.Compression, b xt User: p.config.User, Password: p.config.Password, QuotaKey: p.config.QuotaKey, - Compression: compression, + Compression: p.compression, Settings: p.config.ClickhouseSettings.toProtoSettings(), DialTimeout: time.Second * 10, - TLS: b.Build(), + TLS: p.tlsConfig, HandshakeTimeout: time.Minute, }, MaxConnLifetime: p.config.MaxConnLifetime_, @@ -521,61 +539,72 @@ func (p *Plugin) createConnection(addr Address, compression ch.Compression, b xt return pool, err } -func (p *Plugin) checkBannedHosts(compression ch.Compression, b xtls.ConfigBuilder) { - ticker := time.NewTicker(p.config.RetryInterval_) +func (p *Plugin) checkPendingHosts() { + ticker := time.NewTicker(p.config.ReconnectInterval_) for { select { case <-p.ctx.Done(): return case <-ticker.C: - p.mu.Lock() - if len(p.bannedHosts) == 0 { // Так обыграл раннее завершение горутины - p.mu.Unlock() + if len(p.pendingHosts) == 0 { return } - for host, banUntil := range p.bannedHosts { - if time.Now().After(banUntil) { - p.pendingHosts[host] = struct{}{} - } - } - for host := range p.pendingHosts { - err := p.retryConnect(host, compression, b) + + for addr := range p.pendingHosts { + pool, err := p.createConnection(addr) if err != nil { - p.logger.Error("failed to reconnect to banned host", zap.Error(err), zap.String("addr", host.Addr)) + p.logger.Error("failed to reconnect to pending host", zap.Error(err), zap.String("addr", addr.Addr)) continue } - delete(p.bannedHosts, host) - delete(p.pendingHosts, host) + + p.mu.Lock() + p.poolsByAddr[addr] = pool + for j := 0; j < *addr.Weight; j++ { + p.instances = append(p.instances, instance{ + addr: addr, + pool: pool, + }) + } + delete(p.pendingHosts, addr) + p.mu.Unlock() } - p.mu.Unlock() } } } -func (p *Plugin) retryConnect(addr Address, compression ch.Compression, b xtls.ConfigBuilder) error { - pool, err := p.createConnection(addr, compression, b) - if err != nil { - var netError net.Error - if errors.As(err, &netError) { +func (p *Plugin) checkBannedHosts() { + ticker := time.NewTicker(p.config.ReconnectInterval_) + for { + select { + case <-p.ctx.Done(): + return + case <-ticker.C: p.mu.Lock() - p.bannedHosts[addr] = time.Now().Add(p.config.BanTimeout_) + for addr, banUntil := range p.bannedHosts { + if time.Now().After(banUntil) { + pool, ok := p.poolsByAddr[addr] + if ok { + for i := 0; i < *addr.Weight; i++ { + p.instances = append(p.instances, instance{ + addr: addr, + pool: pool, + }) + } + } + delete(p.bannedHosts, addr) + } + } p.mu.Unlock() } - return err - } - - for j := 0; j < *addr.Weight; j++ { - p.instances = append(p.instances, pool) } - - return nil } func (p *Plugin) Stop() { p.cancelFunc() p.batcher.Stop() - for _, clickhouse := range p.instances { - clickhouse.Close() + + for _, pool := range p.poolsByAddr { + pool.Close() } } @@ -638,14 +667,27 @@ func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) err } }) + p.mu.RLock() + attempts := len(p.instances) + p.mu.RUnlock() + + if attempts == 0 && p.config.FatalOnFailedInsert { + p.logger.Fatal("no available clickhouse addresses") + } + var err error - for i := range p.instances { + for i := 0; i < attempts; i++ { requestID := p.requestID.Inc() - clickhouse := p.getInstance(requestID, i) - err = p.do(clickhouse, data.input) + instance := p.getInstance(requestID, i) + err = p.do(instance.pool, data.input) if err == nil { return nil } + + var netErr net.Error + if errors.As(err, &netErr) { + p.banInstance(instance) + } } if err != nil { p.insertErrorsMetric.Inc() @@ -670,7 +712,23 @@ func (p *Plugin) do(clickhouse Clickhouse, queryInput proto.Input) error { }) } -func (p *Plugin) getInstance(requestID int64, retry int) Clickhouse { +func (p *Plugin) banInstance(inst instance) { + p.mu.Lock() + defer p.mu.Unlock() + + filtered := p.instances[:0] + for _, it := range p.instances { + if it.addr != inst.addr { + filtered = append(filtered, it) + } + } + p.instances = filtered + p.bannedHosts[inst.addr] = time.Now().Add(p.config.FailureCooldownPeriod_) +} + +func (p *Plugin) getInstance(requestID int64, retry int) instance { + p.mu.RLock() + defer p.mu.RUnlock() var instanceIdx int switch p.config.InsertStrategy_ { case StrategyInOrder: diff --git a/plugin/output/clickhouse/clickhouse_test.go b/plugin/output/clickhouse/clickhouse_test.go index 9adf5afa4..14829360a 100644 --- a/plugin/output/clickhouse/clickhouse_test.go +++ b/plugin/output/clickhouse/clickhouse_test.go @@ -14,7 +14,7 @@ func TestPlugin_getInstance(t *testing.T) { ctrl := gomock.NewController(t) - instances := []Clickhouse{ + pools := []Clickhouse{ mockclickhouse.NewMockClickhouse(ctrl), mockclickhouse.NewMockClickhouse(ctrl), mockclickhouse.NewMockClickhouse(ctrl), @@ -22,16 +22,32 @@ func TestPlugin_getInstance(t *testing.T) { mockclickhouse.NewMockClickhouse(ctrl), } + addrs := []Address{ + {Addr: "addr1", Weight: intPtr(1)}, + {Addr: "addr2", Weight: intPtr(2)}, + {Addr: "addr3", Weight: intPtr(3)}, + {Addr: "addr4", Weight: intPtr(4)}, + {Addr: "addr5", Weight: intPtr(5)}, + } + + instances := []instance{ + {addr: addrs[0], pool: pools[0]}, + {addr: addrs[1], pool: pools[1]}, + {addr: addrs[2], pool: pools[2]}, + {addr: addrs[3], pool: pools[3]}, + {addr: addrs[4], pool: pools[4]}, + } + type args struct { id int64 retry int } tests := []struct { name string - instances []Clickhouse + instances []instance stategy InsertStrategy args args - want Clickhouse + want instance }{ // in-order { @@ -223,3 +239,7 @@ func TestAddress_UnmarshalJSON(t *testing.T) { }) } } + +func intPtr(a int) *int { + return &a +} From 7bcf16ac086f83e00015733b7dcca9b2ff13aa14 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 13 Apr 2026 18:22:49 +0300 Subject: [PATCH 06/32] cb --- plugin/output/elasticsearch/elasticsearch.go | 51 +++++++++++++++++--- plugin/output/http/http.go | 47 ++++++++++++++++-- plugin/output/loki/loki.go | 35 ++++++++++++-- plugin/output/splunk/splunk.go | 40 +++++++++++++-- plugin/output/splunk/splunk_test.go | 4 +- 5 files changed, 156 insertions(+), 21 deletions(-) diff --git a/plugin/output/elasticsearch/elasticsearch.go b/plugin/output/elasticsearch/elasticsearch.go index 4bdaea55e..7a0f25065 100644 --- a/plugin/output/elasticsearch/elasticsearch.go +++ b/plugin/output/elasticsearch/elasticsearch.go @@ -48,6 +48,8 @@ type Plugin struct { cancel context.CancelFunc mu *sync.Mutex + cb *xhttp.CircuitBreaker[string] + // plugin metrics sendErrorMetric *metric.CounterVec indexingErrorsMetric *metric.Counter @@ -203,6 +205,18 @@ type Config struct { // > // > Process ES response and report errors, if any. ProcessResponse bool `json:"process_response" default:"true"` // * + + // > @3@4@5@6 + // > + // > Period for which addresses will be banned in case of unavailability. + BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * + BanPeriod_ time.Duration + + // > @3@4@5@6 + // > + // > Interval for reconnecting to addresses that are unavailable during initialization. + ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * + ReconnectInterval_ time.Duration } type KeepAliveConfig struct { @@ -244,7 +258,21 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.config.IndexValues = append(p.config.IndexValues, "@time") } - p.prepareClient() + endpoints := prepareEndpoints(p.config.Endpoints, p.config.IngestPipeline) + + p.prepareClient(endpoints) + + capacity := p.cb.CalcActiveTargetsCapacity( + endpoints, + func(_ string) int { return 1 }, + ) + p.cb = xhttp.NewCircuitBreaker[string]( + p.config.BanPeriod_, + capacity, + ) + for _, endpoint := range endpoints { + p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) + } p.maintenance(nil) @@ -299,6 +327,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.cancel = cancel p.batcher.Start(ctx) + + go xhttp.CheckBannedHosts(ctx, p.cb, p.config.ReconnectInterval_) } func (p *Plugin) Stop() { @@ -315,9 +345,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.indexingErrorsMetric = ctl.RegisterCounter("output_elasticsearch_index_error_total", "Number of elasticsearch indexing errors") } -func (p *Plugin) prepareClient() { +func (p *Plugin) prepareClient(endpoints []string) { config := &xhttp.ClientConfig{ - Endpoints: prepareEndpoints(p.config.Endpoints, p.config.IngestPipeline), + Endpoints: endpoints, ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), KeepAlive: &xhttp.ClientKeepAliveConfig{ @@ -420,14 +450,19 @@ func (p *Plugin) send(data []byte) (int, error) { if !p.config.ProcessResponse { processFn = nil } - - return p.client.DoTimeout( + statusCode, endpoint, err := p.client.DoTimeout( http.MethodPost, NDJSONContentType, data, p.config.ConnectionTimeout_, processFn, ) + + if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + p.cb.BanTarget(xhttp.TargetID(endpoint)) + } + + return statusCode, err } func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, error) { @@ -440,7 +475,7 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, processFn = nil } - statusCode, err := p.client.DoTimeout( + statusCode, endpoint, err := p.client.DoTimeout( http.MethodPost, NDJSONContentType, data[begin[left]:begin[right]], @@ -448,6 +483,10 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, processFn, ) + if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + p.cb.BanTarget(xhttp.TargetID(endpoint)) + } + if err != nil { p.sendErrorMetric.WithLabelValues(strconv.Itoa(statusCode)).Inc() switch statusCode { diff --git a/plugin/output/http/http.go b/plugin/output/http/http.go index c3ed464ca..925c666f4 100644 --- a/plugin/output/http/http.go +++ b/plugin/output/http/http.go @@ -42,6 +42,7 @@ type Plugin struct { cancel context.CancelFunc mu *sync.Mutex + cb *xhttp.CircuitBreaker[string] // plugin metrics sendErrorMetric *metric.CounterVec @@ -164,6 +165,18 @@ type Config struct { // > // > After a non-retryable write error, fall with a non-zero exit code or not Strict bool `json:"strict" default:"false"` // * + + // > @3@4@5@6 + // > + // > Period for which addresses will be banned in case of unavailability. + BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * + BanPeriod_ time.Duration + + // > @3@4@5@6 + // > + // > Interval for reconnecting to addresses that are unavailable during initialization. + ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * + ReconnectInterval_ time.Duration } type KeepAliveConfig struct { @@ -199,9 +212,21 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.config = config.(*Config) p.registerMetrics(params.MetricCtl) p.mu = &sync.Mutex{} + endpoints := p.prepareEndpoints() - p.prepareClient() + p.prepareClient(endpoints) + capacity := p.cb.CalcActiveTargetsCapacity( + endpoints, + func(_ string) int { return 1 }, + ) + p.cb = xhttp.NewCircuitBreaker[string]( + p.config.BanPeriod_, + capacity, + ) + for _, endpoint := range endpoints { + p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) + } p.logger.Info("starting batcher", zap.Duration("timeout", p.config.BatchFlushTimeout_)) batcherOpts := pipeline.BatcherOptions{ @@ -253,6 +278,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.cancel = cancel p.batcher.Start(ctx) + + go xhttp.CheckBannedHosts(ctx, p.cb, p.config.ReconnectInterval_) } func (p *Plugin) Stop() { @@ -268,9 +295,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.sendErrorMetric = ctl.RegisterCounterVec("output_http_send_error_total", "Total HTTP send errors", "status_code") } -func (p *Plugin) prepareClient() { +func (p *Plugin) prepareClient(endpoints []string) { config := &xhttp.ClientConfig{ - Endpoints: p.prepareEndpoints(), + Endpoints: endpoints, ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), KeepAlive: &xhttp.ClientKeepAliveConfig{ @@ -361,13 +388,19 @@ func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) err } func (p *Plugin) send(data []byte) (int, error) { - return p.client.DoTimeout( + statusCode, endpoint, err := p.client.DoTimeout( http.MethodPost, p.config.ContentType, data, p.config.ConnectionTimeout_, nil, ) + + if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + p.cb.BanTarget(xhttp.TargetID(endpoint)) + } + + return statusCode, err } func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, error) { @@ -375,7 +408,7 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, return http.StatusOK, nil } - statusCode, err := p.client.DoTimeout( + statusCode, endpoint, err := p.client.DoTimeout( http.MethodPost, p.config.ContentType, data[begin[left]:begin[right]], @@ -383,6 +416,10 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, nil, ) + if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + p.cb.BanTarget(xhttp.TargetID(endpoint)) + } + if err != nil { p.sendErrorMetric.WithLabelValues(strconv.Itoa(statusCode)).Inc() switch statusCode { diff --git a/plugin/output/loki/loki.go b/plugin/output/loki/loki.go index 160054675..4d03ebad3 100644 --- a/plugin/output/loki/loki.go +++ b/plugin/output/loki/loki.go @@ -178,6 +178,18 @@ type Config struct { // > // > Multiplier for exponential increase of retention between retries RetentionExponentMultiplier int `json:"retention_exponentially_multiplier" default:"2"` // * + + // > @3@4@5@6 + // > + // > Period for which addresses will be banned in case of unavailability. + BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * + BanPeriod_ time.Duration + + // > @3@4@5@6 + // > + // > Interval for reconnecting to addresses that are unavailable during initialization. + ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * + ReconnectInterval_ time.Duration } type AuthStrategy byte @@ -231,6 +243,8 @@ type Plugin struct { client *xhttp.Client batcher *pipeline.RetriableBatcher + cb *xhttp.CircuitBreaker[string] + // plugin metrics sendErrorMetric *metric.CounterVec @@ -259,8 +273,16 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.labels = p.parseLabels() - p.prepareClient() + endpoints := []string{fmt.Sprintf("%s/loki/api/v1/push", p.config.Address)} + p.prepareClient(endpoints) + p.cb = xhttp.NewCircuitBreaker[string]( + p.config.BanPeriod_, + 1, + ) + for _, endpoint := range endpoints { + p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) + } batcherOpts := &pipeline.BatcherOptions{ PipelineName: params.PipelineName, OutputType: outPluginType, @@ -308,6 +330,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.cancel = cancel p.batcher.Start(ctx) + + go xhttp.CheckBannedHosts(ctx, p.cb, p.config.ReconnectInterval_) } func (p *Plugin) Stop() { @@ -412,7 +436,7 @@ func (p *Plugin) send(root *insaneJSON.Root) (int, error) { p.logger.Debug("sent", zap.String("data", string(data))) - statusCode, err := p.client.DoTimeout( + statusCode, endpoint, err := p.client.DoTimeout( http.MethodPost, "application/json", data, @@ -422,6 +446,9 @@ func (p *Plugin) send(root *insaneJSON.Root) (int, error) { if statusCode != http.StatusNoContent { return statusCode, fmt.Errorf("bad response: code=%d, err=%v", statusCode, err) } + if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + p.cb.BanTarget(xhttp.TargetID(endpoint)) + } return statusCode, nil } @@ -430,9 +457,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.sendErrorMetric = ctl.RegisterCounterVec("output_loki_send_error_total", "Total Loki send errors", "status_code") } -func (p *Plugin) prepareClient() { +func (p *Plugin) prepareClient(endpoints []string) { config := &xhttp.ClientConfig{ - Endpoints: []string{fmt.Sprintf("%s/loki/api/v1/push", p.config.Address)}, + Endpoints: endpoints, ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), CustomHeaders: p.getCustomHeaders(), diff --git a/plugin/output/splunk/splunk.go b/plugin/output/splunk/splunk.go index 090ae5048..75f4d9f1b 100644 --- a/plugin/output/splunk/splunk.go +++ b/plugin/output/splunk/splunk.go @@ -95,6 +95,8 @@ type Plugin struct { cancel context.CancelFunc + cb *xhttp.CircuitBreaker[string] + // plugin metrics sendErrorMetric *metric.CounterVec @@ -202,6 +204,18 @@ type Config struct { // > Supports copying whole original event, but does not allow to copy directly to the output root // > or the "event" key with any of its subkeys. CopyFields []CopyField `json:"copy_fields" slice:"true"` // * + + // > @3@4@5@6 + // > + // > Period for which addresses will be banned in case of unavailability. + BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * + BanPeriod_ time.Duration + + // > @3@4@5@6 + // > + // > Interval for reconnecting to addresses that are unavailable during initialization. + ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * + ReconnectInterval_ time.Duration } type KeepAliveConfig struct { @@ -235,8 +249,20 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.avgEventSize = params.PipelineSettings.AvgEventSize p.config = config.(*Config) p.registerMetrics(params.MetricCtl) - p.prepareClient() + endpoints := []string{p.config.Endpoint} + p.prepareClient(endpoints) + capacity := p.cb.CalcActiveTargetsCapacity( + endpoints, + func(_ string) int { return 1 }, + ) + p.cb = xhttp.NewCircuitBreaker[string]( + p.config.BanPeriod_, + capacity, + ) + for _, endpoint := range endpoints { + p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) + } for _, cf := range p.config.CopyFields { if cf.To == "" { p.logger.Error("copies to the root are not allowed") @@ -300,6 +326,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.cancel = cancel p.batcher.Start(ctx) + + go xhttp.CheckBannedHosts(ctx, p.cb, p.config.ReconnectInterval_) } func (p *Plugin) Stop() { @@ -319,9 +347,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { ) } -func (p *Plugin) prepareClient() { +func (p *Plugin) prepareClient(endpoints []string) { config := &xhttp.ClientConfig{ - Endpoints: []string{p.config.Endpoint}, + Endpoints: endpoints, ConnectionTimeout: p.config.RequestTimeout_, AuthHeader: "Splunk " + p.config.Token, KeepAlive: &xhttp.ClientKeepAliveConfig{ @@ -379,9 +407,13 @@ func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) err p.logger.Debugf("trying to send: %s", outBuf) - code, err := p.client.DoTimeout(http.MethodPost, "", outBuf, + code, endpoint, err := p.client.DoTimeout(http.MethodPost, "", outBuf, p.config.RequestTimeout_, parseSplunkError) + if err != nil && xhttp.ShouldBanEndpoint(code, err) { + p.cb.BanTarget(xhttp.TargetID(endpoint)) + } + if err != nil { p.sendErrorMetric.WithLabelValues(strconv.Itoa(code)).Inc() p.logger.Errorf("can't send data to splunk address=%s: %s", p.config.Endpoint, err.Error()) diff --git a/plugin/output/splunk/splunk_test.go b/plugin/output/splunk/splunk_test.go index 40626e076..0592e53ff 100644 --- a/plugin/output/splunk/splunk_test.go +++ b/plugin/output/splunk/splunk_test.go @@ -54,7 +54,7 @@ func TestSplunk(t *testing.T) { }, logger: zap.NewExample().Sugar(), } - plugin.prepareClient() + plugin.prepareClient([]string{plugin.config.Endpoint}) batch := pipeline.NewPreparedBatch([]*pipeline.Event{ {Root: input}, @@ -185,7 +185,7 @@ func TestCopyFields(t *testing.T) { copyFieldsPaths: tt.copyFields, logger: zap.NewExample().Sugar(), } - plugin.prepareClient() + plugin.prepareClient([]string{plugin.config.Endpoint}) batch := pipeline.NewPreparedBatch([]*pipeline.Event{ {Root: input}, From ad16cfc6ddb86841c7e6f6cf54e00992658cf46a Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 13 Apr 2026 18:28:44 +0300 Subject: [PATCH 07/32] fix --- xhttp/circuit_breaker.go | 114 +++++++++++++++++++++++++++++++++++++++ xhttp/client.go | 22 ++++++-- 2 files changed, 131 insertions(+), 5 deletions(-) create mode 100644 xhttp/circuit_breaker.go diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go new file mode 100644 index 000000000..af8bf24ab --- /dev/null +++ b/xhttp/circuit_breaker.go @@ -0,0 +1,114 @@ +package xhttp + +import ( + "context" + "sync" + "time" + + "github.com/ozontech/file.d/xtime" +) + +type TargetID string + +type Target[T any] struct { + ID TargetID + Client T + Weight int +} + +type CircuitBreaker[T any] struct { + activeTargets []Target[T] + targetsByID map[TargetID]T + weightsByID map[TargetID]int + bannedUntil map[TargetID]time.Time + banPeriod time.Duration + mu sync.RWMutex +} + +func NewCircuitBreaker[T any](banPeriod time.Duration, activeTargetsCap int) *CircuitBreaker[T] { + return &CircuitBreaker[T]{ + activeTargets: make([]Target[T], 0, activeTargetsCap), + targetsByID: make(map[TargetID]T, activeTargetsCap), + weightsByID: make(map[TargetID]int, activeTargetsCap), + bannedUntil: make(map[TargetID]time.Time, activeTargetsCap), + banPeriod: banPeriod, + } +} + +func (cb *CircuitBreaker[T]) AddTarget(id TargetID, client T, weight int) { + cb.mu.Lock() + defer cb.mu.Unlock() + + cb.targetsByID[id] = client + cb.weightsByID[id] = weight + + for i := 0; i < weight; i++ { + cb.activeTargets = append(cb.activeTargets, Target[T]{ + ID: id, + Client: client, + Weight: weight, + }) + } +} + +func (cb *CircuitBreaker[T]) BanTarget(id TargetID) { + cb.mu.Lock() + defer cb.mu.Unlock() + + filtered := cb.activeTargets[:0] + for _, target := range cb.activeTargets { + if target.ID != id { + filtered = append(filtered, target) + } + } + + cb.activeTargets = filtered + cb.bannedUntil[id] = xtime.GetInaccurateTime().Add(cb.banPeriod) +} + +func (cb *CircuitBreaker[T]) RestoreBannedTargets() { + cb.mu.Lock() + defer cb.mu.Unlock() + + for id, until := range cb.bannedUntil { + if xtime.GetInaccurateTime().Before(until) { + continue + } + + client := cb.targetsByID[id] + weight := cb.weightsByID[id] + + for i := 0; i < weight; i++ { + cb.activeTargets = append(cb.activeTargets, Target[T]{ + ID: id, + Client: client, + Weight: weight, + }) + } + + delete(cb.bannedUntil, id) + } +} + +func (cb *CircuitBreaker[T]) CalcActiveTargetsCapacity(target []T, getWeight func(T) int) int { + totalCap := 0 + for _, t := range target { + w := getWeight(t) + totalCap += w + } + return totalCap +} + +func CheckBannedHosts[T any](ctx context.Context, cb *CircuitBreaker[T], reconnectInterval time.Duration) { + ticker := time.NewTicker(reconnectInterval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + cb.RestoreBannedTargets() + } + } +} diff --git a/xhttp/client.go b/xhttp/client.go index f9ea6e6a9..93d25ffcf 100644 --- a/xhttp/client.go +++ b/xhttp/client.go @@ -83,7 +83,7 @@ func (c *Client) DoTimeout( body []byte, timeout time.Duration, processResponse func([]byte) error, -) (int, error) { +) (int, string, error) { req := fasthttp.AcquireRequest() defer fasthttp.ReleaseRequest(req) resp := fasthttp.AcquireResponse() @@ -99,20 +99,20 @@ func (c *Client) DoTimeout( c.prepareRequest(req, endpoint, method, contentType, body) if err := c.client.DoTimeout(req, resp, timeout); err != nil { - return 0, fmt.Errorf("can't send request to %s: %w", endpoint.String(), err) + return 0, endpoint.String(), fmt.Errorf("can't send request to %s: %w", endpoint.String(), err) } respContent := resp.Body() statusCode := resp.Header.StatusCode() if !(http.StatusOK <= statusCode && statusCode <= http.StatusAccepted) { - return statusCode, fmt.Errorf("response status from %s isn't OK: status=%d, body=%s", endpoint.String(), statusCode, string(respContent)) + return statusCode, endpoint.String(), fmt.Errorf("response status from %s isn't OK: status=%d, body=%s", endpoint.String(), statusCode, string(respContent)) } if processResponse != nil { - return statusCode, processResponse(respContent) + return statusCode, endpoint.String(), processResponse(respContent) } - return statusCode, nil + return statusCode, endpoint.String(), nil } func (c *Client) prepareRequest(req *fasthttp.Request, endpoint *fasthttp.URI, method, contentType string, body []byte) { @@ -168,3 +168,15 @@ func parseGzipCompressionLevel(level string) int { return -1 } } + +func ShouldBanEndpoint(statusCode int, err error) bool { + switch statusCode { + case http.StatusBadGateway, + http.StatusServiceUnavailable, + http.StatusGatewayTimeout, + http.StatusTooManyRequests: + return true + default: + return false + } +} From 87d2efbb7d0de6af65e23001b18d99e916fb98ad Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 13 Apr 2026 19:58:20 +0300 Subject: [PATCH 08/32] magic --- plugin/output/clickhouse/README.md | 2 +- plugin/output/clickhouse/clickhouse.go | 110 ++++++------------- plugin/output/clickhouse/clickhouse_test.go | 104 ++++++++---------- plugin/output/elasticsearch/README.md | 12 ++ plugin/output/elasticsearch/elasticsearch.go | 2 +- plugin/output/http/README.md | 12 ++ plugin/output/http/http.go | 2 +- plugin/output/loki/README.md | 12 ++ plugin/output/splunk/README.md | 12 ++ plugin/output/splunk/splunk.go | 6 +- xhttp/circuit_breaker.go | 16 ++- 11 files changed, 146 insertions(+), 144 deletions(-) diff --git a/plugin/output/clickhouse/README.md b/plugin/output/clickhouse/README.md index 4ca236c6c..4d12e9860 100644 --- a/plugin/output/clickhouse/README.md +++ b/plugin/output/clickhouse/README.md @@ -215,7 +215,7 @@ After this timeout batch will be sent even if batch isn't completed.
-**`failure_cooldown_period`** *`cfg.Duration`* *`default=10s`* +**`ban_period`** *`cfg.Duration`* *`default=10s`* Period for which addresses will be banned in case of unavailability. diff --git a/plugin/output/clickhouse/clickhouse.go b/plugin/output/clickhouse/clickhouse.go index d2dccc6ed..a164899db 100644 --- a/plugin/output/clickhouse/clickhouse.go +++ b/plugin/output/clickhouse/clickhouse.go @@ -19,6 +19,7 @@ import ( "github.com/ozontech/file.d/fd" "github.com/ozontech/file.d/metric" "github.com/ozontech/file.d/pipeline" + "github.com/ozontech/file.d/xhttp" "github.com/ozontech/file.d/xtls" "go.uber.org/atomic" "go.uber.org/zap" @@ -44,11 +45,6 @@ type Clickhouse interface { Do(ctx context.Context, query ch.Query) error } -type instance struct { - addr Address - pool Clickhouse -} - type Plugin struct { logger *zap.Logger @@ -60,7 +56,6 @@ type Plugin struct { query string // TODO: support shards - instances []instance requestID atomic.Int64 // plugin metrics @@ -72,8 +67,8 @@ type Plugin struct { compression ch.Compression tlsConfig *tls.Config + cb *xhttp.CircuitBreaker[Clickhouse] poolsByAddr map[Address]Clickhouse - bannedHosts map[Address]time.Time pendingHosts map[Address]struct{} mu sync.RWMutex } @@ -353,8 +348,8 @@ type Config struct { // > @3@4@5@6 // > // > Period for which addresses will be banned in case of unavailability. - FailureCooldownPeriod cfg.Duration `json:"failure_cooldown_period" default:"10s" parse:"duration"` // * - FailureCooldownPeriod_ time.Duration + BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * + BanPeriod_ time.Duration // > @3@4@5@6 // > @@ -383,7 +378,6 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.logger = params.Logger.Desugar() p.config = config.(*Config) - p.bannedHosts = make(map[Address]time.Time, len(p.config.Addresses)) p.pendingHosts = make(map[Address]struct{}, len(p.config.Addresses)) p.poolsByAddr = make(map[Address]Clickhouse, len(p.config.Addresses)) @@ -399,7 +393,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP if p.config.ReconnectInterval_ < 1 { p.logger.Fatal("'reconnect_interval' can't be <1") } - if p.config.FailureCooldownPeriod_ < 1 { + if p.config.BanPeriod_ < 1 { p.logger.Fatal("'failure_cooldown_period' cant't be <1") } @@ -438,6 +432,17 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.tlsConfig = b.Build() } + capacity := xhttp.CalcActiveTargetsCapacity( + p.config.Addresses, + func(addr Address) int { + return *addr.Weight + }, + ) + + p.cb = xhttp.NewCircuitBreaker[Clickhouse]( + p.config.BanPeriod_, + capacity, + ) for _, addr := range p.config.Addresses { pool, err := p.createConnection(addr) if err != nil { @@ -448,20 +453,16 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.logger.Error("create clickhouse connection pool", zap.Error(err), zap.String("addr", addr.Addr)) continue } + p.poolsByAddr[addr] = pool - for j := 0; j < *addr.Weight; j++ { - p.instances = append(p.instances, instance{ - addr: addr, - pool: pool, - }) - } + p.cb.AddTarget(xhttp.TargetID(addr.Addr), pool, *addr.Weight) } - if len(p.instances) == 0 && p.config.FatalOnFailedInsert { + if p.cb.ActiveCount() == 0 && p.config.FatalOnFailedInsert { p.logger.Fatal("cannot start: no available clickhouse addresses in config") } - go p.checkBannedHosts() + go xhttp.CheckBannedHosts(p.ctx, p.cb, p.config.ReconnectInterval_) if len(p.pendingHosts) > 0 { go p.checkPendingHosts() @@ -541,6 +542,7 @@ func (p *Plugin) createConnection(addr Address) (*chpool.Pool, error) { func (p *Plugin) checkPendingHosts() { ticker := time.NewTicker(p.config.ReconnectInterval_) + defer ticker.Stop() for { select { case <-p.ctx.Done(): @@ -559,42 +561,11 @@ func (p *Plugin) checkPendingHosts() { p.mu.Lock() p.poolsByAddr[addr] = pool - for j := 0; j < *addr.Weight; j++ { - p.instances = append(p.instances, instance{ - addr: addr, - pool: pool, - }) - } delete(p.pendingHosts, addr) p.mu.Unlock() - } - } - } -} -func (p *Plugin) checkBannedHosts() { - ticker := time.NewTicker(p.config.ReconnectInterval_) - for { - select { - case <-p.ctx.Done(): - return - case <-ticker.C: - p.mu.Lock() - for addr, banUntil := range p.bannedHosts { - if time.Now().After(banUntil) { - pool, ok := p.poolsByAddr[addr] - if ok { - for i := 0; i < *addr.Weight; i++ { - p.instances = append(p.instances, instance{ - addr: addr, - pool: pool, - }) - } - } - delete(p.bannedHosts, addr) - } + p.cb.AddTarget(xhttp.TargetID(addr.Addr), pool, *addr.Weight) } - p.mu.Unlock() } } } @@ -667,26 +638,25 @@ func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) err } }) - p.mu.RLock() - attempts := len(p.instances) - p.mu.RUnlock() + instancesCount := p.cb.ActiveCount() - if attempts == 0 && p.config.FatalOnFailedInsert { + if instancesCount == 0 && p.config.FatalOnFailedInsert { p.logger.Fatal("no available clickhouse addresses") } var err error - for i := 0; i < attempts; i++ { + for i := 0; i < instancesCount; i++ { requestID := p.requestID.Inc() - instance := p.getInstance(requestID, i) - err = p.do(instance.pool, data.input) + idx := p.getInstanceIndex(requestID, i, instancesCount) + instance := p.cb.GetActiveTargetByIndex(idx) + err = p.do(instance.Client, data.input) if err == nil { return nil } var netErr net.Error if errors.As(err, &netErr) { - p.banInstance(instance) + p.cb.BanTarget(instance.ID) } } if err != nil { @@ -712,31 +682,17 @@ func (p *Plugin) do(clickhouse Clickhouse, queryInput proto.Input) error { }) } -func (p *Plugin) banInstance(inst instance) { - p.mu.Lock() - defer p.mu.Unlock() - - filtered := p.instances[:0] - for _, it := range p.instances { - if it.addr != inst.addr { - filtered = append(filtered, it) - } - } - p.instances = filtered - p.bannedHosts[inst.addr] = time.Now().Add(p.config.FailureCooldownPeriod_) -} - -func (p *Plugin) getInstance(requestID int64, retry int) instance { +func (p *Plugin) getInstanceIndex(requestID int64, retry, instanceCount int) int { p.mu.RLock() defer p.mu.RUnlock() var instanceIdx int switch p.config.InsertStrategy_ { case StrategyInOrder: - instanceIdx = retry % len(p.instances) + instanceIdx = retry % instanceCount case StrategyRoundRobin: - instanceIdx = int(requestID) % len(p.instances) + instanceIdx = int(requestID) % instanceCount } - return p.instances[instanceIdx] + return instanceIdx } func addrWithDefaultPort(addr string, defaultPort string) string { diff --git a/plugin/output/clickhouse/clickhouse_test.go b/plugin/output/clickhouse/clickhouse_test.go index 14829360a..76cc8c22e 100644 --- a/plugin/output/clickhouse/clickhouse_test.go +++ b/plugin/output/clickhouse/clickhouse_test.go @@ -1,11 +1,14 @@ package clickhouse import ( + "fmt" "math/rand" "testing" + "time" "github.com/golang/mock/gomock" mockclickhouse "github.com/ozontech/file.d/plugin/output/clickhouse/mock" + "github.com/ozontech/file.d/xhttp" "github.com/stretchr/testify/assert" ) @@ -22,76 +25,60 @@ func TestPlugin_getInstance(t *testing.T) { mockclickhouse.NewMockClickhouse(ctrl), } - addrs := []Address{ - {Addr: "addr1", Weight: intPtr(1)}, - {Addr: "addr2", Weight: intPtr(2)}, - {Addr: "addr3", Weight: intPtr(3)}, - {Addr: "addr4", Weight: intPtr(4)}, - {Addr: "addr5", Weight: intPtr(5)}, - } - - instances := []instance{ - {addr: addrs[0], pool: pools[0]}, - {addr: addrs[1], pool: pools[1]}, - {addr: addrs[2], pool: pools[2]}, - {addr: addrs[3], pool: pools[3]}, - {addr: addrs[4], pool: pools[4]}, - } - type args struct { id int64 retry int } tests := []struct { - name string - instances []instance - stategy InsertStrategy - args args - want instance + name string + stategy InsertStrategy + args args + instanceCount int + want Clickhouse }{ // in-order { - name: "one instance and first retry", - instances: instances[:1], - stategy: StrategyInOrder, - args: args{id: rand.Int63(), retry: 0}, - want: instances[0], + name: "one instance and first retry", + stategy: StrategyInOrder, + args: args{id: rand.Int63(), retry: 0}, + instanceCount: 1, + want: pools[0], }, { - name: "one instance and some retry", - instances: instances[:1], - stategy: StrategyInOrder, - args: args{id: rand.Int63(), retry: 123}, - want: instances[0], + name: "one instance and some retry", + stategy: StrategyInOrder, + args: args{id: rand.Int63(), retry: 123}, + instanceCount: 1, + want: pools[0], }, { - name: "many instances and some retry", - instances: instances, - stategy: StrategyInOrder, - args: args{id: rand.Int63(), retry: 123}, - want: instances[3], // 123%3 + name: "many instances and some retry", + stategy: StrategyInOrder, + args: args{id: rand.Int63(), retry: 123}, + instanceCount: 2, + want: pools[1], // 123%2 }, // round-robin { - name: "many instances and first retry", - instances: instances, - stategy: StrategyRoundRobin, - args: args{id: 123, retry: 0}, - want: instances[3], // 123%3 + name: "many instances and first retry", + stategy: StrategyRoundRobin, + args: args{id: 123, retry: 0}, + instanceCount: 3, + want: pools[0], // 123%3 }, { - name: "many instances and rand retry", - instances: instances, - stategy: StrategyRoundRobin, - args: args{id: 0, retry: rand.Int()}, - want: instances[0], + name: "many instances and rand retry", + stategy: StrategyRoundRobin, + args: args{id: 0, retry: rand.Int()}, + instanceCount: 5, + want: pools[0], }, { - name: "one instances and rand retry", - instances: instances[:1], - stategy: StrategyRoundRobin, - args: args{id: rand.Int63(), retry: rand.Int()}, - want: instances[0], + name: "one instances and rand retry", + stategy: StrategyRoundRobin, + args: args{id: rand.Int63(), retry: rand.Int()}, + instanceCount: 1, + want: pools[0], }, } @@ -100,10 +87,15 @@ func TestPlugin_getInstance(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - p := &Plugin{instances: tt.instances, config: &Config{InsertStrategy_: tt.stategy}} + cb := xhttp.NewCircuitBreaker[Clickhouse](time.Second, tt.instanceCount) + for i := 0; i < tt.instanceCount; i++ { + cb.AddTarget(xhttp.TargetID(fmt.Sprintf("addr%d", i)), pools[i], 1) + } + p := &Plugin{cb: cb, config: &Config{InsertStrategy_: tt.stategy}} - instance := p.getInstance(tt.args.id, tt.args.retry) - if instance != tt.want { + idx := p.getInstanceIndex(tt.args.id, tt.args.retry, tt.instanceCount) + instance := p.cb.GetActiveTargetByIndex(idx) + if instance.Client != tt.want { t.Fatal("instances are not equal") } }) @@ -239,7 +231,3 @@ func TestAddress_UnmarshalJSON(t *testing.T) { }) } } - -func intPtr(a int) *int { - return &a -} diff --git a/plugin/output/elasticsearch/README.md b/plugin/output/elasticsearch/README.md index bcf034035..1ebe92281 100755 --- a/plugin/output/elasticsearch/README.md +++ b/plugin/output/elasticsearch/README.md @@ -170,5 +170,17 @@ Process ES response and report errors, if any.
+**`ban_period`** *`cfg.Duration`* *`default=10s`* + +Period for which addresses will be banned in case of unavailability. + +
+ +**`reconnect_interval`** *`cfg.Duration`* *`default=5s`* + +Interval for reconnecting to addresses that are unavailable during initialization. + +
+
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* \ No newline at end of file diff --git a/plugin/output/elasticsearch/elasticsearch.go b/plugin/output/elasticsearch/elasticsearch.go index 7a0f25065..32fbe41c8 100644 --- a/plugin/output/elasticsearch/elasticsearch.go +++ b/plugin/output/elasticsearch/elasticsearch.go @@ -262,7 +262,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.prepareClient(endpoints) - capacity := p.cb.CalcActiveTargetsCapacity( + capacity := xhttp.CalcActiveTargetsCapacity( endpoints, func(_ string) int { return 1 }, ) diff --git a/plugin/output/http/README.md b/plugin/output/http/README.md index c5b659b2d..959dd85f0 100755 --- a/plugin/output/http/README.md +++ b/plugin/output/http/README.md @@ -133,5 +133,17 @@ After a non-retryable write error, fall with a non-zero exit code or not
+**`ban_period`** *`cfg.Duration`* *`default=10s`* + +Period for which addresses will be banned in case of unavailability. + +
+ +**`reconnect_interval`** *`cfg.Duration`* *`default=5s`* + +Interval for reconnecting to addresses that are unavailable during initialization. + +
+
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* \ No newline at end of file diff --git a/plugin/output/http/http.go b/plugin/output/http/http.go index 925c666f4..66081a627 100644 --- a/plugin/output/http/http.go +++ b/plugin/output/http/http.go @@ -216,7 +216,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.prepareClient(endpoints) - capacity := p.cb.CalcActiveTargetsCapacity( + capacity := xhttp.CalcActiveTargetsCapacity( endpoints, func(_ string) int { return 1 }, ) diff --git a/plugin/output/loki/README.md b/plugin/output/loki/README.md index 1ce47cee1..d969cf214 100644 --- a/plugin/output/loki/README.md +++ b/plugin/output/loki/README.md @@ -149,5 +149,17 @@ Multiplier for exponential increase of retention between retries
+**`ban_period`** *`cfg.Duration`* *`default=10s`* + +Period for which addresses will be banned in case of unavailability. + +
+ +**`reconnect_interval`** *`cfg.Duration`* *`default=5s`* + +Interval for reconnecting to addresses that are unavailable during initialization. + +
+
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* \ No newline at end of file diff --git a/plugin/output/splunk/README.md b/plugin/output/splunk/README.md index b9df5013f..ff0ed6400 100755 --- a/plugin/output/splunk/README.md +++ b/plugin/output/splunk/README.md @@ -153,5 +153,17 @@ or the "event" key with any of its subkeys.
+**`ban_period`** *`cfg.Duration`* *`default=10s`* + +Period for which addresses will be banned in case of unavailability. + +
+ +**`reconnect_interval`** *`cfg.Duration`* *`default=5s`* + +Interval for reconnecting to addresses that are unavailable during initialization. + +
+
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* \ No newline at end of file diff --git a/plugin/output/splunk/splunk.go b/plugin/output/splunk/splunk.go index 75f4d9f1b..5ef1b1e83 100644 --- a/plugin/output/splunk/splunk.go +++ b/plugin/output/splunk/splunk.go @@ -252,13 +252,9 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP endpoints := []string{p.config.Endpoint} p.prepareClient(endpoints) - capacity := p.cb.CalcActiveTargetsCapacity( - endpoints, - func(_ string) int { return 1 }, - ) p.cb = xhttp.NewCircuitBreaker[string]( p.config.BanPeriod_, - capacity, + 1, ) for _, endpoint := range endpoints { p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index af8bf24ab..046a78886 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -90,7 +90,21 @@ func (cb *CircuitBreaker[T]) RestoreBannedTargets() { } } -func (cb *CircuitBreaker[T]) CalcActiveTargetsCapacity(target []T, getWeight func(T) int) int { +func (cb *CircuitBreaker[T]) ActiveCount() int { + cb.mu.RLock() + defer cb.mu.RUnlock() + + return len(cb.activeTargets) +} + +func (cb *CircuitBreaker[T]) GetActiveTargetByIndex(idx int) Target[T] { + cb.mu.RLock() + defer cb.mu.RUnlock() + + return cb.activeTargets[idx] +} + +func CalcActiveTargetsCapacity[T any](target []T, getWeight func(T) int) int { totalCap := 0 for _, t := range target { w := getWeight(t) From cd432f8277b3f5ceed3bfea00ab1af156b403ce7 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Tue, 14 Apr 2026 11:03:13 +0300 Subject: [PATCH 09/32] fix --- plugin/output/loki/loki.go | 2 +- xhttp/client.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/plugin/output/loki/loki.go b/plugin/output/loki/loki.go index 4d03ebad3..9a3214604 100644 --- a/plugin/output/loki/loki.go +++ b/plugin/output/loki/loki.go @@ -446,7 +446,7 @@ func (p *Plugin) send(root *insaneJSON.Root) (int, error) { if statusCode != http.StatusNoContent { return statusCode, fmt.Errorf("bad response: code=%d, err=%v", statusCode, err) } - if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + if err != nil && xhttp.ShouldBanEndpoint(statusCode) { p.cb.BanTarget(xhttp.TargetID(endpoint)) } diff --git a/xhttp/client.go b/xhttp/client.go index 93d25ffcf..cac2d0ac0 100644 --- a/xhttp/client.go +++ b/xhttp/client.go @@ -169,7 +169,7 @@ func parseGzipCompressionLevel(level string) int { } } -func ShouldBanEndpoint(statusCode int, err error) bool { +func ShouldBanEndpoint(statusCode int) bool { switch statusCode { case http.StatusBadGateway, http.StatusServiceUnavailable, From 6cdd9950ff59cab6fdc323d85a129414a040ebb9 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Tue, 14 Apr 2026 11:25:57 +0300 Subject: [PATCH 10/32] fix --- plugin/output/elasticsearch/elasticsearch.go | 4 ++-- plugin/output/http/http.go | 4 ++-- plugin/output/splunk/splunk.go | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/plugin/output/elasticsearch/elasticsearch.go b/plugin/output/elasticsearch/elasticsearch.go index 32fbe41c8..c4503b9a4 100644 --- a/plugin/output/elasticsearch/elasticsearch.go +++ b/plugin/output/elasticsearch/elasticsearch.go @@ -458,7 +458,7 @@ func (p *Plugin) send(data []byte) (int, error) { processFn, ) - if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + if err != nil && xhttp.ShouldBanEndpoint(statusCode) { p.cb.BanTarget(xhttp.TargetID(endpoint)) } @@ -483,7 +483,7 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, processFn, ) - if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + if err != nil && xhttp.ShouldBanEndpoint(statusCode) { p.cb.BanTarget(xhttp.TargetID(endpoint)) } diff --git a/plugin/output/http/http.go b/plugin/output/http/http.go index 66081a627..5aa1ec079 100644 --- a/plugin/output/http/http.go +++ b/plugin/output/http/http.go @@ -396,7 +396,7 @@ func (p *Plugin) send(data []byte) (int, error) { nil, ) - if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + if err != nil && xhttp.ShouldBanEndpoint(statusCode) { p.cb.BanTarget(xhttp.TargetID(endpoint)) } @@ -416,7 +416,7 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, nil, ) - if err != nil && xhttp.ShouldBanEndpoint(statusCode, err) { + if err != nil && xhttp.ShouldBanEndpoint(statusCode) { p.cb.BanTarget(xhttp.TargetID(endpoint)) } diff --git a/plugin/output/splunk/splunk.go b/plugin/output/splunk/splunk.go index 5ef1b1e83..f8f1a4bab 100644 --- a/plugin/output/splunk/splunk.go +++ b/plugin/output/splunk/splunk.go @@ -406,7 +406,7 @@ func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) err code, endpoint, err := p.client.DoTimeout(http.MethodPost, "", outBuf, p.config.RequestTimeout_, parseSplunkError) - if err != nil && xhttp.ShouldBanEndpoint(code, err) { + if err != nil && xhttp.ShouldBanEndpoint(code) { p.cb.BanTarget(xhttp.TargetID(endpoint)) } From 91dffa4e7e13e7effe63450a769590125bdf54e1 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 27 Apr 2026 12:13:24 +0300 Subject: [PATCH 11/32] remove cb in ch --- plugin/output/clickhouse/README.md | 12 -- plugin/output/clickhouse/clickhouse.go | 191 +++++--------------- plugin/output/clickhouse/clickhouse_test.go | 86 ++++----- 3 files changed, 84 insertions(+), 205 deletions(-) diff --git a/plugin/output/clickhouse/README.md b/plugin/output/clickhouse/README.md index 4d12e9860..08b1b374e 100644 --- a/plugin/output/clickhouse/README.md +++ b/plugin/output/clickhouse/README.md @@ -215,17 +215,5 @@ After this timeout batch will be sent even if batch isn't completed.
-**`ban_period`** *`cfg.Duration`* *`default=10s`* - -Period for which addresses will be banned in case of unavailability. - -
- -**`reconnect_interval`** *`cfg.Duration`* *`default=5s`* - -Interval for reconnecting to addresses that are unavailable during initialization. - -
-
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* \ No newline at end of file diff --git a/plugin/output/clickhouse/clickhouse.go b/plugin/output/clickhouse/clickhouse.go index a164899db..7f6c8366a 100644 --- a/plugin/output/clickhouse/clickhouse.go +++ b/plugin/output/clickhouse/clickhouse.go @@ -3,13 +3,11 @@ package clickhouse import ( "bytes" "context" - "crypto/tls" "encoding/json" "errors" "fmt" "net" "strings" - "sync" "time" "github.com/ClickHouse/ch-go" @@ -19,7 +17,6 @@ import ( "github.com/ozontech/file.d/fd" "github.com/ozontech/file.d/metric" "github.com/ozontech/file.d/pipeline" - "github.com/ozontech/file.d/xhttp" "github.com/ozontech/file.d/xtls" "go.uber.org/atomic" "go.uber.org/zap" @@ -56,6 +53,7 @@ type Plugin struct { query string // TODO: support shards + instances []Clickhouse requestID atomic.Int64 // plugin metrics @@ -63,14 +61,6 @@ type Plugin struct { queriesCountMetric *metric.Counter router *pipeline.Router - - compression ch.Compression - tlsConfig *tls.Config - - cb *xhttp.CircuitBreaker[Clickhouse] - poolsByAddr map[Address]Clickhouse - pendingHosts map[Address]struct{} - mu sync.RWMutex } type Setting struct { @@ -344,18 +334,6 @@ type Config struct { // > After this timeout batch will be sent even if batch isn't completed. BatchFlushTimeout cfg.Duration `json:"batch_flush_timeout" default:"200ms" parse:"duration"` // * BatchFlushTimeout_ time.Duration - - // > @3@4@5@6 - // > - // > Period for which addresses will be banned in case of unavailability. - BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * - BanPeriod_ time.Duration - - // > @3@4@5@6 - // > - // > Interval for reconnecting to addresses that are unavailable during initialization. - ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * - ReconnectInterval_ time.Duration } func init() { @@ -376,11 +354,8 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginParams) { p.logger = params.Logger.Desugar() - p.config = config.(*Config) - - p.pendingHosts = make(map[Address]struct{}, len(p.config.Addresses)) - p.poolsByAddr = make(map[Address]Clickhouse, len(p.config.Addresses)) + p.config = config.(*Config) p.registerMetrics(params.MetricCtl) p.ctx, p.cancelFunc = context.WithCancel(context.Background()) @@ -390,12 +365,6 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP if p.config.InsertTimeout_ < 1 { p.logger.Fatal("'db_request_timeout' can't be <1") } - if p.config.ReconnectInterval_ < 1 { - p.logger.Fatal("'reconnect_interval' can't be <1") - } - if p.config.BanPeriod_ < 1 { - p.logger.Fatal("'failure_cooldown_period' cant't be <1") - } schema, err := inferInsaneColInputs(p.config.Columns) if err != nil { @@ -411,63 +380,64 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.config.InsertStrategy_ = StrategyInOrder } + var compression ch.Compression switch strings.ToLower(p.config.Compression) { default: fallthrough case "disabled": - p.compression = ch.CompressionDisabled + compression = ch.CompressionDisabled case "lz4": - p.compression = ch.CompressionLZ4 + compression = ch.CompressionLZ4 case "zstd": - p.compression = ch.CompressionZSTD + compression = ch.CompressionZSTD case "none": - p.compression = ch.CompressionNone + compression = ch.CompressionNone } + var b xtls.ConfigBuilder if p.config.CACert != "" { b := xtls.NewConfigBuilder() - if err := b.AppendCARoot(p.config.CACert); err != nil { + err := b.AppendCARoot(p.config.CACert) + if err != nil { p.logger.Fatal("can't append CA root", zap.Error(err)) } - p.tlsConfig = b.Build() } - capacity := xhttp.CalcActiveTargetsCapacity( - p.config.Addresses, - func(addr Address) int { - return *addr.Weight - }, - ) - - p.cb = xhttp.NewCircuitBreaker[Clickhouse]( - p.config.BanPeriod_, - capacity, - ) for _, addr := range p.config.Addresses { - pool, err := p.createConnection(addr) + addr.Addr = addrWithDefaultPort(addr.Addr, "9000") + pool, err := chpool.New(p.ctx, chpool.Options{ + ClientOptions: ch.Options{ + Logger: p.logger.Named("driver"), + Address: addr.Addr, + Database: p.config.Database, + User: p.config.User, + Password: p.config.Password, + QuotaKey: p.config.QuotaKey, + Compression: compression, + Settings: p.config.ClickhouseSettings.toProtoSettings(), + DialTimeout: time.Second * 10, + TLS: b.Build(), + HandshakeTimeout: time.Minute, + }, + MaxConnLifetime: p.config.MaxConnLifetime_, + MaxConnIdleTime: p.config.MaxConnIdleTime_, + MaxConns: p.config.MaxConns_, + MinConns: p.config.MinConns_, + HealthCheckPeriod: p.config.HealthCheckPeriod_, + }) if err != nil { - var netError net.Error - if errors.As(err, &netError) { - p.pendingHosts[addr] = struct{}{} - } p.logger.Error("create clickhouse connection pool", zap.Error(err), zap.String("addr", addr.Addr)) - continue + } else { + for j := 0; j < *addr.Weight; j++ { + p.instances = append(p.instances, pool) + } } - - p.poolsByAddr[addr] = pool - p.cb.AddTarget(xhttp.TargetID(addr.Addr), pool, *addr.Weight) } - if p.cb.ActiveCount() == 0 && p.config.FatalOnFailedInsert { + if len(p.instances) == 0 { p.logger.Fatal("cannot start: no available clickhouse addresses in config") } - go xhttp.CheckBannedHosts(p.ctx, p.cb, p.config.ReconnectInterval_) - - if len(p.pendingHosts) > 0 { - go p.checkPendingHosts() - } - batcherOpts := pipeline.BatcherOptions{ PipelineName: params.PipelineName, OutputType: outPluginType, @@ -514,68 +484,11 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(p.ctx) } -func (p *Plugin) createConnection(addr Address) (*chpool.Pool, error) { - addr.Addr = addrWithDefaultPort(addr.Addr, "9000") - pool, err := chpool.New(p.ctx, chpool.Options{ - ClientOptions: ch.Options{ - Logger: p.logger.Named("driver"), - Address: addr.Addr, - Database: p.config.Database, - User: p.config.User, - Password: p.config.Password, - QuotaKey: p.config.QuotaKey, - Compression: p.compression, - Settings: p.config.ClickhouseSettings.toProtoSettings(), - DialTimeout: time.Second * 10, - TLS: p.tlsConfig, - HandshakeTimeout: time.Minute, - }, - MaxConnLifetime: p.config.MaxConnLifetime_, - MaxConnIdleTime: p.config.MaxConnIdleTime_, - MaxConns: p.config.MaxConns_, - MinConns: p.config.MinConns_, - HealthCheckPeriod: p.config.HealthCheckPeriod_, - }) - - return pool, err -} - -func (p *Plugin) checkPendingHosts() { - ticker := time.NewTicker(p.config.ReconnectInterval_) - defer ticker.Stop() - for { - select { - case <-p.ctx.Done(): - return - case <-ticker.C: - if len(p.pendingHosts) == 0 { - return - } - - for addr := range p.pendingHosts { - pool, err := p.createConnection(addr) - if err != nil { - p.logger.Error("failed to reconnect to pending host", zap.Error(err), zap.String("addr", addr.Addr)) - continue - } - - p.mu.Lock() - p.poolsByAddr[addr] = pool - delete(p.pendingHosts, addr) - p.mu.Unlock() - - p.cb.AddTarget(xhttp.TargetID(addr.Addr), pool, *addr.Weight) - } - } - } -} - func (p *Plugin) Stop() { p.cancelFunc() p.batcher.Stop() - - for _, pool := range p.poolsByAddr { - pool.Close() + for _, clickhouse := range p.instances { + clickhouse.Close() } } @@ -638,26 +551,14 @@ func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) err } }) - instancesCount := p.cb.ActiveCount() - - if instancesCount == 0 && p.config.FatalOnFailedInsert { - p.logger.Fatal("no available clickhouse addresses") - } - var err error - for i := 0; i < instancesCount; i++ { + for i := range p.instances { requestID := p.requestID.Inc() - idx := p.getInstanceIndex(requestID, i, instancesCount) - instance := p.cb.GetActiveTargetByIndex(idx) - err = p.do(instance.Client, data.input) + clickhouse := p.getInstance(requestID, i) + err = p.do(clickhouse, data.input) if err == nil { return nil } - - var netErr net.Error - if errors.As(err, &netErr) { - p.cb.BanTarget(instance.ID) - } } if err != nil { p.insertErrorsMetric.Inc() @@ -682,17 +583,15 @@ func (p *Plugin) do(clickhouse Clickhouse, queryInput proto.Input) error { }) } -func (p *Plugin) getInstanceIndex(requestID int64, retry, instanceCount int) int { - p.mu.RLock() - defer p.mu.RUnlock() +func (p *Plugin) getInstance(requestID int64, retry int) Clickhouse { var instanceIdx int switch p.config.InsertStrategy_ { case StrategyInOrder: - instanceIdx = retry % instanceCount + instanceIdx = retry % len(p.instances) case StrategyRoundRobin: - instanceIdx = int(requestID) % instanceCount + instanceIdx = int(requestID) % len(p.instances) } - return instanceIdx + return p.instances[instanceIdx] } func addrWithDefaultPort(addr string, defaultPort string) string { diff --git a/plugin/output/clickhouse/clickhouse_test.go b/plugin/output/clickhouse/clickhouse_test.go index 76cc8c22e..9adf5afa4 100644 --- a/plugin/output/clickhouse/clickhouse_test.go +++ b/plugin/output/clickhouse/clickhouse_test.go @@ -1,14 +1,11 @@ package clickhouse import ( - "fmt" "math/rand" "testing" - "time" "github.com/golang/mock/gomock" mockclickhouse "github.com/ozontech/file.d/plugin/output/clickhouse/mock" - "github.com/ozontech/file.d/xhttp" "github.com/stretchr/testify/assert" ) @@ -17,7 +14,7 @@ func TestPlugin_getInstance(t *testing.T) { ctrl := gomock.NewController(t) - pools := []Clickhouse{ + instances := []Clickhouse{ mockclickhouse.NewMockClickhouse(ctrl), mockclickhouse.NewMockClickhouse(ctrl), mockclickhouse.NewMockClickhouse(ctrl), @@ -30,55 +27,55 @@ func TestPlugin_getInstance(t *testing.T) { retry int } tests := []struct { - name string - stategy InsertStrategy - args args - instanceCount int - want Clickhouse + name string + instances []Clickhouse + stategy InsertStrategy + args args + want Clickhouse }{ // in-order { - name: "one instance and first retry", - stategy: StrategyInOrder, - args: args{id: rand.Int63(), retry: 0}, - instanceCount: 1, - want: pools[0], + name: "one instance and first retry", + instances: instances[:1], + stategy: StrategyInOrder, + args: args{id: rand.Int63(), retry: 0}, + want: instances[0], }, { - name: "one instance and some retry", - stategy: StrategyInOrder, - args: args{id: rand.Int63(), retry: 123}, - instanceCount: 1, - want: pools[0], + name: "one instance and some retry", + instances: instances[:1], + stategy: StrategyInOrder, + args: args{id: rand.Int63(), retry: 123}, + want: instances[0], }, { - name: "many instances and some retry", - stategy: StrategyInOrder, - args: args{id: rand.Int63(), retry: 123}, - instanceCount: 2, - want: pools[1], // 123%2 + name: "many instances and some retry", + instances: instances, + stategy: StrategyInOrder, + args: args{id: rand.Int63(), retry: 123}, + want: instances[3], // 123%3 }, // round-robin { - name: "many instances and first retry", - stategy: StrategyRoundRobin, - args: args{id: 123, retry: 0}, - instanceCount: 3, - want: pools[0], // 123%3 + name: "many instances and first retry", + instances: instances, + stategy: StrategyRoundRobin, + args: args{id: 123, retry: 0}, + want: instances[3], // 123%3 }, { - name: "many instances and rand retry", - stategy: StrategyRoundRobin, - args: args{id: 0, retry: rand.Int()}, - instanceCount: 5, - want: pools[0], + name: "many instances and rand retry", + instances: instances, + stategy: StrategyRoundRobin, + args: args{id: 0, retry: rand.Int()}, + want: instances[0], }, { - name: "one instances and rand retry", - stategy: StrategyRoundRobin, - args: args{id: rand.Int63(), retry: rand.Int()}, - instanceCount: 1, - want: pools[0], + name: "one instances and rand retry", + instances: instances[:1], + stategy: StrategyRoundRobin, + args: args{id: rand.Int63(), retry: rand.Int()}, + want: instances[0], }, } @@ -87,15 +84,10 @@ func TestPlugin_getInstance(t *testing.T) { t.Run(tt.name, func(t *testing.T) { t.Parallel() - cb := xhttp.NewCircuitBreaker[Clickhouse](time.Second, tt.instanceCount) - for i := 0; i < tt.instanceCount; i++ { - cb.AddTarget(xhttp.TargetID(fmt.Sprintf("addr%d", i)), pools[i], 1) - } - p := &Plugin{cb: cb, config: &Config{InsertStrategy_: tt.stategy}} + p := &Plugin{instances: tt.instances, config: &Config{InsertStrategy_: tt.stategy}} - idx := p.getInstanceIndex(tt.args.id, tt.args.retry, tt.instanceCount) - instance := p.cb.GetActiveTargetByIndex(idx) - if instance.Client != tt.want { + instance := p.getInstance(tt.args.id, tt.args.retry) + if instance != tt.want { t.Fatal("instances are not equal") } }) From c9eeca875cba9604e946031bdd5acd6f7c94835f Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 27 Apr 2026 18:35:58 +0300 Subject: [PATCH 12/32] refactor cb for http plugins --- plugin/output/elasticsearch/elasticsearch.go | 41 ++----- plugin/output/http/http.go | 37 ++---- plugin/output/loki/loki.go | 25 ++-- plugin/output/splunk/splunk.go | 26 ++-- plugin/output/splunk/splunk_test.go | 4 +- xhttp/circuit_breaker.go | 120 +++++++------------ xhttp/client.go | 44 +++++-- 7 files changed, 109 insertions(+), 188 deletions(-) diff --git a/plugin/output/elasticsearch/elasticsearch.go b/plugin/output/elasticsearch/elasticsearch.go index c4503b9a4..f6e08fd1f 100644 --- a/plugin/output/elasticsearch/elasticsearch.go +++ b/plugin/output/elasticsearch/elasticsearch.go @@ -48,8 +48,6 @@ type Plugin struct { cancel context.CancelFunc mu *sync.Mutex - cb *xhttp.CircuitBreaker[string] - // plugin metrics sendErrorMetric *metric.CounterVec indexingErrorsMetric *metric.Counter @@ -258,21 +256,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.config.IndexValues = append(p.config.IndexValues, "@time") } - endpoints := prepareEndpoints(p.config.Endpoints, p.config.IngestPipeline) - - p.prepareClient(endpoints) - - capacity := xhttp.CalcActiveTargetsCapacity( - endpoints, - func(_ string) int { return 1 }, - ) - p.cb = xhttp.NewCircuitBreaker[string]( - p.config.BanPeriod_, - capacity, - ) - for _, endpoint := range endpoints { - p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) - } + p.prepareClient() p.maintenance(nil) @@ -328,7 +312,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(ctx) - go xhttp.CheckBannedHosts(ctx, p.cb, p.config.ReconnectInterval_) + go p.client.CircuitBreaker.CheckBannedEndpoints(ctx, p.config.ReconnectInterval_) } func (p *Plugin) Stop() { @@ -345,9 +329,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.indexingErrorsMetric = ctl.RegisterCounter("output_elasticsearch_index_error_total", "Number of elasticsearch indexing errors") } -func (p *Plugin) prepareClient(endpoints []string) { +func (p *Plugin) prepareClient() { config := &xhttp.ClientConfig{ - Endpoints: endpoints, + Endpoints: prepareEndpoints(p.config.Endpoints, p.config.IngestPipeline), ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), KeepAlive: &xhttp.ClientKeepAliveConfig{ @@ -369,6 +353,8 @@ func (p *Plugin) prepareClient(endpoints []string) { if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } + + p.client.CircuitBreaker = xhttp.NewCircuitBreaker(p.client.GetEndpoints(), p.config.BanPeriod_) } func prepareEndpoints(endpoints []string, ingestPipeline string) []string { @@ -450,19 +436,14 @@ func (p *Plugin) send(data []byte) (int, error) { if !p.config.ProcessResponse { processFn = nil } - statusCode, endpoint, err := p.client.DoTimeout( + + return p.client.DoTimeout( http.MethodPost, NDJSONContentType, data, p.config.ConnectionTimeout_, processFn, ) - - if err != nil && xhttp.ShouldBanEndpoint(statusCode) { - p.cb.BanTarget(xhttp.TargetID(endpoint)) - } - - return statusCode, err } func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, error) { @@ -475,7 +456,7 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, processFn = nil } - statusCode, endpoint, err := p.client.DoTimeout( + statusCode, err := p.client.DoTimeout( http.MethodPost, NDJSONContentType, data[begin[left]:begin[right]], @@ -483,10 +464,6 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, processFn, ) - if err != nil && xhttp.ShouldBanEndpoint(statusCode) { - p.cb.BanTarget(xhttp.TargetID(endpoint)) - } - if err != nil { p.sendErrorMetric.WithLabelValues(strconv.Itoa(statusCode)).Inc() switch statusCode { diff --git a/plugin/output/http/http.go b/plugin/output/http/http.go index 5aa1ec079..24d57d65b 100644 --- a/plugin/output/http/http.go +++ b/plugin/output/http/http.go @@ -42,7 +42,6 @@ type Plugin struct { cancel context.CancelFunc mu *sync.Mutex - cb *xhttp.CircuitBreaker[string] // plugin metrics sendErrorMetric *metric.CounterVec @@ -212,21 +211,9 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.config = config.(*Config) p.registerMetrics(params.MetricCtl) p.mu = &sync.Mutex{} - endpoints := p.prepareEndpoints() - p.prepareClient(endpoints) + p.prepareClient() - capacity := xhttp.CalcActiveTargetsCapacity( - endpoints, - func(_ string) int { return 1 }, - ) - p.cb = xhttp.NewCircuitBreaker[string]( - p.config.BanPeriod_, - capacity, - ) - for _, endpoint := range endpoints { - p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) - } p.logger.Info("starting batcher", zap.Duration("timeout", p.config.BatchFlushTimeout_)) batcherOpts := pipeline.BatcherOptions{ @@ -279,7 +266,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(ctx) - go xhttp.CheckBannedHosts(ctx, p.cb, p.config.ReconnectInterval_) + go p.client.CircuitBreaker.CheckBannedEndpoints(ctx, p.config.ReconnectInterval_) } func (p *Plugin) Stop() { @@ -295,9 +282,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.sendErrorMetric = ctl.RegisterCounterVec("output_http_send_error_total", "Total HTTP send errors", "status_code") } -func (p *Plugin) prepareClient(endpoints []string) { +func (p *Plugin) prepareClient() { config := &xhttp.ClientConfig{ - Endpoints: endpoints, + Endpoints: p.prepareEndpoints(), ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), KeepAlive: &xhttp.ClientKeepAliveConfig{ @@ -319,6 +306,8 @@ func (p *Plugin) prepareClient(endpoints []string) { if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } + + p.client.CircuitBreaker = xhttp.NewCircuitBreaker(p.client.GetEndpoints(), p.config.BanPeriod_) } func (p *Plugin) prepareEndpoints() []string { @@ -388,19 +377,13 @@ func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) err } func (p *Plugin) send(data []byte) (int, error) { - statusCode, endpoint, err := p.client.DoTimeout( + return p.client.DoTimeout( http.MethodPost, p.config.ContentType, data, p.config.ConnectionTimeout_, nil, ) - - if err != nil && xhttp.ShouldBanEndpoint(statusCode) { - p.cb.BanTarget(xhttp.TargetID(endpoint)) - } - - return statusCode, err } func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, error) { @@ -408,7 +391,7 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, return http.StatusOK, nil } - statusCode, endpoint, err := p.client.DoTimeout( + statusCode, err := p.client.DoTimeout( http.MethodPost, p.config.ContentType, data[begin[left]:begin[right]], @@ -416,10 +399,6 @@ func (p *Plugin) sendSplit(left int, right int, begin []int, data []byte) (int, nil, ) - if err != nil && xhttp.ShouldBanEndpoint(statusCode) { - p.cb.BanTarget(xhttp.TargetID(endpoint)) - } - if err != nil { p.sendErrorMetric.WithLabelValues(strconv.Itoa(statusCode)).Inc() switch statusCode { diff --git a/plugin/output/loki/loki.go b/plugin/output/loki/loki.go index 9a3214604..935b71c1e 100644 --- a/plugin/output/loki/loki.go +++ b/plugin/output/loki/loki.go @@ -243,8 +243,6 @@ type Plugin struct { client *xhttp.Client batcher *pipeline.RetriableBatcher - cb *xhttp.CircuitBreaker[string] - // plugin metrics sendErrorMetric *metric.CounterVec @@ -273,16 +271,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.labels = p.parseLabels() - endpoints := []string{fmt.Sprintf("%s/loki/api/v1/push", p.config.Address)} - p.prepareClient(endpoints) + p.prepareClient() - p.cb = xhttp.NewCircuitBreaker[string]( - p.config.BanPeriod_, - 1, - ) - for _, endpoint := range endpoints { - p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) - } batcherOpts := &pipeline.BatcherOptions{ PipelineName: params.PipelineName, OutputType: outPluginType, @@ -331,7 +321,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(ctx) - go xhttp.CheckBannedHosts(ctx, p.cb, p.config.ReconnectInterval_) + go p.client.CircuitBreaker.CheckBannedEndpoints(ctx, p.config.ReconnectInterval_) } func (p *Plugin) Stop() { @@ -436,7 +426,7 @@ func (p *Plugin) send(root *insaneJSON.Root) (int, error) { p.logger.Debug("sent", zap.String("data", string(data))) - statusCode, endpoint, err := p.client.DoTimeout( + statusCode, err := p.client.DoTimeout( http.MethodPost, "application/json", data, @@ -446,9 +436,6 @@ func (p *Plugin) send(root *insaneJSON.Root) (int, error) { if statusCode != http.StatusNoContent { return statusCode, fmt.Errorf("bad response: code=%d, err=%v", statusCode, err) } - if err != nil && xhttp.ShouldBanEndpoint(statusCode) { - p.cb.BanTarget(xhttp.TargetID(endpoint)) - } return statusCode, nil } @@ -457,9 +444,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.sendErrorMetric = ctl.RegisterCounterVec("output_loki_send_error_total", "Total Loki send errors", "status_code") } -func (p *Plugin) prepareClient(endpoints []string) { +func (p *Plugin) prepareClient() { config := &xhttp.ClientConfig{ - Endpoints: endpoints, + Endpoints: []string{fmt.Sprintf("%s/loki/api/v1/push", p.config.Address)}, ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), CustomHeaders: p.getCustomHeaders(), @@ -474,6 +461,8 @@ func (p *Plugin) prepareClient(endpoints []string) { if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } + + p.client.CircuitBreaker = xhttp.NewCircuitBreaker(p.client.GetEndpoints(), p.config.BanPeriod_) } func (p *Plugin) getCustomHeaders() map[string]string { diff --git a/plugin/output/splunk/splunk.go b/plugin/output/splunk/splunk.go index f8f1a4bab..eea6b3e72 100644 --- a/plugin/output/splunk/splunk.go +++ b/plugin/output/splunk/splunk.go @@ -95,8 +95,6 @@ type Plugin struct { cancel context.CancelFunc - cb *xhttp.CircuitBreaker[string] - // plugin metrics sendErrorMetric *metric.CounterVec @@ -249,16 +247,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.avgEventSize = params.PipelineSettings.AvgEventSize p.config = config.(*Config) p.registerMetrics(params.MetricCtl) - endpoints := []string{p.config.Endpoint} - p.prepareClient(endpoints) + p.prepareClient() - p.cb = xhttp.NewCircuitBreaker[string]( - p.config.BanPeriod_, - 1, - ) - for _, endpoint := range endpoints { - p.cb.AddTarget(xhttp.TargetID(endpoint), endpoint, 1) - } for _, cf := range p.config.CopyFields { if cf.To == "" { p.logger.Error("copies to the root are not allowed") @@ -323,7 +313,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(ctx) - go xhttp.CheckBannedHosts(ctx, p.cb, p.config.ReconnectInterval_) + go p.client.CircuitBreaker.CheckBannedEndpoints(ctx, p.config.ReconnectInterval_) } func (p *Plugin) Stop() { @@ -343,9 +333,9 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { ) } -func (p *Plugin) prepareClient(endpoints []string) { +func (p *Plugin) prepareClient() { config := &xhttp.ClientConfig{ - Endpoints: endpoints, + Endpoints: []string{p.config.Endpoint}, ConnectionTimeout: p.config.RequestTimeout_, AuthHeader: "Splunk " + p.config.Token, KeepAlive: &xhttp.ClientKeepAliveConfig{ @@ -366,6 +356,8 @@ func (p *Plugin) prepareClient(endpoints []string) { if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } + + p.client.CircuitBreaker = xhttp.NewCircuitBreaker(p.client.GetEndpoints(), p.config.BanPeriod_) } func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) error { @@ -403,13 +395,9 @@ func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) err p.logger.Debugf("trying to send: %s", outBuf) - code, endpoint, err := p.client.DoTimeout(http.MethodPost, "", outBuf, + code, err := p.client.DoTimeout(http.MethodPost, "", outBuf, p.config.RequestTimeout_, parseSplunkError) - if err != nil && xhttp.ShouldBanEndpoint(code) { - p.cb.BanTarget(xhttp.TargetID(endpoint)) - } - if err != nil { p.sendErrorMetric.WithLabelValues(strconv.Itoa(code)).Inc() p.logger.Errorf("can't send data to splunk address=%s: %s", p.config.Endpoint, err.Error()) diff --git a/plugin/output/splunk/splunk_test.go b/plugin/output/splunk/splunk_test.go index 0592e53ff..40626e076 100644 --- a/plugin/output/splunk/splunk_test.go +++ b/plugin/output/splunk/splunk_test.go @@ -54,7 +54,7 @@ func TestSplunk(t *testing.T) { }, logger: zap.NewExample().Sugar(), } - plugin.prepareClient([]string{plugin.config.Endpoint}) + plugin.prepareClient() batch := pipeline.NewPreparedBatch([]*pipeline.Event{ {Root: input}, @@ -185,7 +185,7 @@ func TestCopyFields(t *testing.T) { copyFieldsPaths: tt.copyFields, logger: zap.NewExample().Sugar(), } - plugin.prepareClient([]string{plugin.config.Endpoint}) + plugin.prepareClient() batch := pipeline.NewPreparedBatch([]*pipeline.Event{ {Root: input}, diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index 046a78886..a9e29ccb9 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -2,71 +2,70 @@ package xhttp import ( "context" + "math/rand" "sync" "time" "github.com/ozontech/file.d/xtime" + "github.com/valyala/fasthttp" ) -type TargetID string - -type Target[T any] struct { - ID TargetID - Client T - Weight int +type CircuitBreaker struct { + activeEndpoints []*fasthttp.URI + endpointsByID map[string]*fasthttp.URI + bannedUntil map[string]time.Time + banPeriod time.Duration + mu sync.RWMutex } -type CircuitBreaker[T any] struct { - activeTargets []Target[T] - targetsByID map[TargetID]T - weightsByID map[TargetID]int - bannedUntil map[TargetID]time.Time - banPeriod time.Duration - mu sync.RWMutex -} +func NewCircuitBreaker(endpoints []*fasthttp.URI, banPeriod time.Duration) *CircuitBreaker { + cb := &CircuitBreaker{ + activeEndpoints: make([]*fasthttp.URI, 0, len(endpoints)), + endpointsByID: make(map[string]*fasthttp.URI, len(endpoints)), + bannedUntil: make(map[string]time.Time, len(endpoints)), + banPeriod: banPeriod, + } -func NewCircuitBreaker[T any](banPeriod time.Duration, activeTargetsCap int) *CircuitBreaker[T] { - return &CircuitBreaker[T]{ - activeTargets: make([]Target[T], 0, activeTargetsCap), - targetsByID: make(map[TargetID]T, activeTargetsCap), - weightsByID: make(map[TargetID]int, activeTargetsCap), - bannedUntil: make(map[TargetID]time.Time, activeTargetsCap), - banPeriod: banPeriod, + for _, endpoint := range endpoints { + id := endpoint.String() + cb.endpointsByID[id] = endpoint + cb.activeEndpoints = append(cb.activeEndpoints, endpoint) } -} -func (cb *CircuitBreaker[T]) AddTarget(id TargetID, client T, weight int) { - cb.mu.Lock() - defer cb.mu.Unlock() + return cb +} - cb.targetsByID[id] = client - cb.weightsByID[id] = weight +func (cb *CircuitBreaker) GetEndpoint() *fasthttp.URI { + cb.mu.RLock() + defer cb.mu.RUnlock() - for i := 0; i < weight; i++ { - cb.activeTargets = append(cb.activeTargets, Target[T]{ - ID: id, - Client: client, - Weight: weight, - }) + switch len(cb.activeEndpoints) { + case 0: + return nil + case 1: + return cb.activeEndpoints[0] + default: + return cb.activeEndpoints[rand.Int()%len(cb.activeEndpoints)] } } -func (cb *CircuitBreaker[T]) BanTarget(id TargetID) { +func (cb *CircuitBreaker) BanEndpoint(endpoint *fasthttp.URI) { cb.mu.Lock() defer cb.mu.Unlock() - filtered := cb.activeTargets[:0] - for _, target := range cb.activeTargets { - if target.ID != id { - filtered = append(filtered, target) + id := endpoint.String() + filtered := cb.activeEndpoints[:0] + for _, endpoint := range cb.activeEndpoints { + if endpoint.String() != id { + filtered = append(filtered, endpoint) } } - cb.activeTargets = filtered + cb.activeEndpoints = filtered cb.bannedUntil[id] = xtime.GetInaccurateTime().Add(cb.banPeriod) } -func (cb *CircuitBreaker[T]) RestoreBannedTargets() { +func (cb *CircuitBreaker) RestoreBannedEndpoints() { cb.mu.Lock() defer cb.mu.Unlock() @@ -74,46 +73,13 @@ func (cb *CircuitBreaker[T]) RestoreBannedTargets() { if xtime.GetInaccurateTime().Before(until) { continue } - - client := cb.targetsByID[id] - weight := cb.weightsByID[id] - - for i := 0; i < weight; i++ { - cb.activeTargets = append(cb.activeTargets, Target[T]{ - ID: id, - Client: client, - Weight: weight, - }) - } - + endpoint := cb.endpointsByID[id] + cb.activeEndpoints = append(cb.activeEndpoints, endpoint) delete(cb.bannedUntil, id) } } -func (cb *CircuitBreaker[T]) ActiveCount() int { - cb.mu.RLock() - defer cb.mu.RUnlock() - - return len(cb.activeTargets) -} - -func (cb *CircuitBreaker[T]) GetActiveTargetByIndex(idx int) Target[T] { - cb.mu.RLock() - defer cb.mu.RUnlock() - - return cb.activeTargets[idx] -} - -func CalcActiveTargetsCapacity[T any](target []T, getWeight func(T) int) int { - totalCap := 0 - for _, t := range target { - w := getWeight(t) - totalCap += w - } - return totalCap -} - -func CheckBannedHosts[T any](ctx context.Context, cb *CircuitBreaker[T], reconnectInterval time.Duration) { +func (cb *CircuitBreaker) CheckBannedEndpoints(ctx context.Context, reconnectInterval time.Duration) { ticker := time.NewTicker(reconnectInterval) defer ticker.Stop() @@ -122,7 +88,7 @@ func CheckBannedHosts[T any](ctx context.Context, cb *CircuitBreaker[T], reconne case <-ctx.Done(): return case <-ticker.C: - cb.RestoreBannedTargets() + cb.RestoreBannedEndpoints() } } } diff --git a/xhttp/client.go b/xhttp/client.go index cac2d0ac0..dae762f22 100644 --- a/xhttp/client.go +++ b/xhttp/client.go @@ -35,6 +35,7 @@ type ClientConfig struct { type Client struct { client *fasthttp.Client endpoints []*fasthttp.URI + CircuitBreaker *CircuitBreaker authHeader string customHeaders map[string]string gzipCompressionLevel int @@ -83,36 +84,38 @@ func (c *Client) DoTimeout( body []byte, timeout time.Duration, processResponse func([]byte) error, -) (int, string, error) { +) (int, error) { req := fasthttp.AcquireRequest() defer fasthttp.ReleaseRequest(req) resp := fasthttp.AcquireResponse() defer fasthttp.ReleaseResponse(resp) - var endpoint *fasthttp.URI - if len(c.endpoints) == 1 { - endpoint = c.endpoints[0] - } else { - endpoint = c.endpoints[rand.Int()%len(c.endpoints)] + endpoint := c.getEndpoint() + if endpoint == nil { + return 0, fmt.Errorf("no available endpoints") } c.prepareRequest(req, endpoint, method, contentType, body) if err := c.client.DoTimeout(req, resp, timeout); err != nil { - return 0, endpoint.String(), fmt.Errorf("can't send request to %s: %w", endpoint.String(), err) + c.CircuitBreaker.BanEndpoint(endpoint) + return 0, fmt.Errorf("can't send request to %s: %w", endpoint.String(), err) } respContent := resp.Body() statusCode := resp.Header.StatusCode() if !(http.StatusOK <= statusCode && statusCode <= http.StatusAccepted) { - return statusCode, endpoint.String(), fmt.Errorf("response status from %s isn't OK: status=%d, body=%s", endpoint.String(), statusCode, string(respContent)) + if shouldBanEndpoint(statusCode) { + c.CircuitBreaker.BanEndpoint(endpoint) + } + return statusCode, fmt.Errorf("response status from %s isn't OK: status=%d, body=%s", endpoint.String(), statusCode, string(respContent)) } if processResponse != nil { - return statusCode, endpoint.String(), processResponse(respContent) + return statusCode, processResponse(respContent) } - return statusCode, endpoint.String(), nil + return statusCode, nil } func (c *Client) prepareRequest(req *fasthttp.Request, endpoint *fasthttp.URI, method, contentType string, body []byte) { @@ -140,6 +143,10 @@ func (c *Client) prepareRequest(req *fasthttp.Request, endpoint *fasthttp.URI, m } } +func (c *Client) GetEndpoints() []*fasthttp.URI { + return c.endpoints +} + func parseEndpoints(endpoints []string) ([]*fasthttp.URI, error) { res := make([]*fasthttp.URI, 0, len(endpoints)) for _, e := range endpoints { @@ -169,7 +176,22 @@ func parseGzipCompressionLevel(level string) int { } } -func ShouldBanEndpoint(statusCode int) bool { +func (c *Client) getEndpoint() *fasthttp.URI { + if c.CircuitBreaker != nil { + return c.CircuitBreaker.GetEndpoint() + } + + switch len(c.endpoints) { + case 0: + return nil + case 1: + return c.endpoints[0] + default: + return c.endpoints[rand.Int()%len(c.endpoints)] + } +} + +func shouldBanEndpoint(statusCode int) bool { switch statusCode { case http.StatusBadGateway, http.StatusServiceUnavailable, From 7f184805f1bd1a4b4b8be7fed5fe0efe46edc090 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Mon, 27 Apr 2026 18:52:15 +0300 Subject: [PATCH 13/32] fix --- plugin/output/elasticsearch/elasticsearch.go | 6 ++++++ plugin/output/http/http.go | 7 +++++++ plugin/output/loki/loki.go | 7 +++++++ plugin/output/splunk/splunk.go | 8 ++++++++ 4 files changed, 28 insertions(+) diff --git a/plugin/output/elasticsearch/elasticsearch.go b/plugin/output/elasticsearch/elasticsearch.go index f6e08fd1f..d46e99b39 100644 --- a/plugin/output/elasticsearch/elasticsearch.go +++ b/plugin/output/elasticsearch/elasticsearch.go @@ -255,6 +255,12 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP if len(p.config.IndexValues) == 0 { p.config.IndexValues = append(p.config.IndexValues, "@time") } + if p.config.ReconnectInterval_ < 1 { + p.logger.Fatal("'reconnect_interval' can't be <1") + } + if p.config.BanPeriod_ < 1 { + p.logger.Fatal("'ban_period' cant't be <1") + } p.prepareClient() diff --git a/plugin/output/http/http.go b/plugin/output/http/http.go index 24d57d65b..8ef2f2f91 100644 --- a/plugin/output/http/http.go +++ b/plugin/output/http/http.go @@ -212,6 +212,13 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.registerMetrics(params.MetricCtl) p.mu = &sync.Mutex{} + if p.config.ReconnectInterval_ < 1 { + p.logger.Fatal("'reconnect_interval' can't be <1") + } + if p.config.BanPeriod_ < 1 { + p.logger.Fatal("'ban_period' cant't be <1") + } + p.prepareClient() p.logger.Info("starting batcher", zap.Duration("timeout", p.config.BatchFlushTimeout_)) diff --git a/plugin/output/loki/loki.go b/plugin/output/loki/loki.go index 935b71c1e..faa32cecd 100644 --- a/plugin/output/loki/loki.go +++ b/plugin/output/loki/loki.go @@ -271,6 +271,13 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.labels = p.parseLabels() + if p.config.ReconnectInterval_ < 1 { + p.logger.Fatal("'reconnect_interval' can't be <1") + } + if p.config.BanPeriod_ < 1 { + p.logger.Fatal("'ban_period' cant't be <1") + } + p.prepareClient() batcherOpts := &pipeline.BatcherOptions{ diff --git a/plugin/output/splunk/splunk.go b/plugin/output/splunk/splunk.go index eea6b3e72..c8ddac718 100644 --- a/plugin/output/splunk/splunk.go +++ b/plugin/output/splunk/splunk.go @@ -247,6 +247,14 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.avgEventSize = params.PipelineSettings.AvgEventSize p.config = config.(*Config) p.registerMetrics(params.MetricCtl) + + if p.config.ReconnectInterval_ < 1 { + p.logger.Fatal("'reconnect_interval' can't be <1") + } + if p.config.BanPeriod_ < 1 { + p.logger.Fatal("'ban_period' cant't be <1") + } + p.prepareClient() for _, cf := range p.config.CopyFields { From 341a95aa4d008a23d118712990a1eec01bc2870a Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Wed, 6 May 2026 22:45:47 +0300 Subject: [PATCH 14/32] fix --- plugin/output/elasticsearch/README.md | 1 + plugin/output/elasticsearch/elasticsearch.go | 11 +-- plugin/output/http/README.md | 1 + plugin/output/http/http.go | 11 +-- plugin/output/loki/README.md | 1 + plugin/output/loki/loki.go | 11 +-- plugin/output/splunk/README.md | 1 + plugin/output/splunk/splunk.go | 11 +-- xhttp/circuit_breaker.go | 81 ++++++++++---------- xhttp/client.go | 42 +++++++--- 10 files changed, 100 insertions(+), 71 deletions(-) diff --git a/plugin/output/elasticsearch/README.md b/plugin/output/elasticsearch/README.md index 1ebe92281..00df09dc5 100755 --- a/plugin/output/elasticsearch/README.md +++ b/plugin/output/elasticsearch/README.md @@ -173,6 +173,7 @@ Process ES response and report errors, if any. **`ban_period`** *`cfg.Duration`* *`default=10s`* Period for which addresses will be banned in case of unavailability. +If set to 0, circuit breaker is disabled.
diff --git a/plugin/output/elasticsearch/elasticsearch.go b/plugin/output/elasticsearch/elasticsearch.go index d46e99b39..303de4e12 100644 --- a/plugin/output/elasticsearch/elasticsearch.go +++ b/plugin/output/elasticsearch/elasticsearch.go @@ -207,6 +207,7 @@ type Config struct { // > @3@4@5@6 // > // > Period for which addresses will be banned in case of unavailability. + // > If set to 0, circuit breaker is disabled. BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * BanPeriod_ time.Duration @@ -258,8 +259,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP if p.config.ReconnectInterval_ < 1 { p.logger.Fatal("'reconnect_interval' can't be <1") } - if p.config.BanPeriod_ < 1 { - p.logger.Fatal("'ban_period' cant't be <1") + if p.config.BanPeriod_ < 0 { + p.logger.Fatal("'ban_period' cant't be <0") } p.prepareClient() @@ -318,7 +319,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(ctx) - go p.client.CircuitBreaker.CheckBannedEndpoints(ctx, p.config.ReconnectInterval_) + p.client.Start(ctx) } func (p *Plugin) Stop() { @@ -340,6 +341,8 @@ func (p *Plugin) prepareClient() { Endpoints: prepareEndpoints(p.config.Endpoints, p.config.IngestPipeline), ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), + BanPeriod: p.config.BanPeriod_, + ReconnectInterval: p.config.ReconnectInterval_, KeepAlive: &xhttp.ClientKeepAliveConfig{ MaxConnDuration: p.config.KeepAlive.MaxConnDuration_, MaxIdleConnDuration: p.config.KeepAlive.MaxIdleConnDuration_, @@ -359,8 +362,6 @@ func (p *Plugin) prepareClient() { if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } - - p.client.CircuitBreaker = xhttp.NewCircuitBreaker(p.client.GetEndpoints(), p.config.BanPeriod_) } func prepareEndpoints(endpoints []string, ingestPipeline string) []string { diff --git a/plugin/output/http/README.md b/plugin/output/http/README.md index 959dd85f0..a1bf967d1 100755 --- a/plugin/output/http/README.md +++ b/plugin/output/http/README.md @@ -136,6 +136,7 @@ After a non-retryable write error, fall with a non-zero exit code or not **`ban_period`** *`cfg.Duration`* *`default=10s`* Period for which addresses will be banned in case of unavailability. +If set to 0, circuit breaker is disabled.
diff --git a/plugin/output/http/http.go b/plugin/output/http/http.go index 8ef2f2f91..6f6d58208 100644 --- a/plugin/output/http/http.go +++ b/plugin/output/http/http.go @@ -168,6 +168,7 @@ type Config struct { // > @3@4@5@6 // > // > Period for which addresses will be banned in case of unavailability. + // > If set to 0, circuit breaker is disabled. BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * BanPeriod_ time.Duration @@ -215,8 +216,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP if p.config.ReconnectInterval_ < 1 { p.logger.Fatal("'reconnect_interval' can't be <1") } - if p.config.BanPeriod_ < 1 { - p.logger.Fatal("'ban_period' cant't be <1") + if p.config.BanPeriod_ < 0 { + p.logger.Fatal("'ban_period' cant't be <0") } p.prepareClient() @@ -273,7 +274,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(ctx) - go p.client.CircuitBreaker.CheckBannedEndpoints(ctx, p.config.ReconnectInterval_) + p.client.Start(ctx) } func (p *Plugin) Stop() { @@ -294,6 +295,8 @@ func (p *Plugin) prepareClient() { Endpoints: p.prepareEndpoints(), ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), + BanPeriod: p.config.BanPeriod_, + ReconnectInterval: p.config.ReconnectInterval_, KeepAlive: &xhttp.ClientKeepAliveConfig{ MaxConnDuration: p.config.KeepAlive.MaxConnDuration_, MaxIdleConnDuration: p.config.KeepAlive.MaxIdleConnDuration_, @@ -313,8 +316,6 @@ func (p *Plugin) prepareClient() { if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } - - p.client.CircuitBreaker = xhttp.NewCircuitBreaker(p.client.GetEndpoints(), p.config.BanPeriod_) } func (p *Plugin) prepareEndpoints() []string { diff --git a/plugin/output/loki/README.md b/plugin/output/loki/README.md index d969cf214..b08ced5d4 100644 --- a/plugin/output/loki/README.md +++ b/plugin/output/loki/README.md @@ -152,6 +152,7 @@ Multiplier for exponential increase of retention between retries **`ban_period`** *`cfg.Duration`* *`default=10s`* Period for which addresses will be banned in case of unavailability. +If set to 0, circuit breaker is disabled.
diff --git a/plugin/output/loki/loki.go b/plugin/output/loki/loki.go index faa32cecd..ad13698d0 100644 --- a/plugin/output/loki/loki.go +++ b/plugin/output/loki/loki.go @@ -182,6 +182,7 @@ type Config struct { // > @3@4@5@6 // > // > Period for which addresses will be banned in case of unavailability. + // > If set to 0, circuit breaker is disabled. BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * BanPeriod_ time.Duration @@ -274,8 +275,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP if p.config.ReconnectInterval_ < 1 { p.logger.Fatal("'reconnect_interval' can't be <1") } - if p.config.BanPeriod_ < 1 { - p.logger.Fatal("'ban_period' cant't be <1") + if p.config.BanPeriod_ < 0 { + p.logger.Fatal("'ban_period' cant't be <0") } p.prepareClient() @@ -328,7 +329,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(ctx) - go p.client.CircuitBreaker.CheckBannedEndpoints(ctx, p.config.ReconnectInterval_) + p.client.Start(ctx) } func (p *Plugin) Stop() { @@ -457,6 +458,8 @@ func (p *Plugin) prepareClient() { ConnectionTimeout: p.config.ConnectionTimeout_ * 2, AuthHeader: p.getAuthHeader(), CustomHeaders: p.getCustomHeaders(), + BanPeriod: p.config.BanPeriod_, + ReconnectInterval: p.config.ReconnectInterval_, KeepAlive: &xhttp.ClientKeepAliveConfig{ MaxConnDuration: p.config.KeepAlive.MaxConnDuration_, MaxIdleConnDuration: p.config.KeepAlive.MaxIdleConnDuration_, @@ -468,8 +471,6 @@ func (p *Plugin) prepareClient() { if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } - - p.client.CircuitBreaker = xhttp.NewCircuitBreaker(p.client.GetEndpoints(), p.config.BanPeriod_) } func (p *Plugin) getCustomHeaders() map[string]string { diff --git a/plugin/output/splunk/README.md b/plugin/output/splunk/README.md index ff0ed6400..55aa91e39 100755 --- a/plugin/output/splunk/README.md +++ b/plugin/output/splunk/README.md @@ -156,6 +156,7 @@ or the "event" key with any of its subkeys. **`ban_period`** *`cfg.Duration`* *`default=10s`* Period for which addresses will be banned in case of unavailability. +If set to 0, circuit breaker is disabled.
diff --git a/plugin/output/splunk/splunk.go b/plugin/output/splunk/splunk.go index c8ddac718..ea2cffe0f 100644 --- a/plugin/output/splunk/splunk.go +++ b/plugin/output/splunk/splunk.go @@ -206,6 +206,7 @@ type Config struct { // > @3@4@5@6 // > // > Period for which addresses will be banned in case of unavailability. + // > If set to 0, circuit breaker is disabled. BanPeriod cfg.Duration `json:"ban_period" default:"10s" parse:"duration"` // * BanPeriod_ time.Duration @@ -251,8 +252,8 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP if p.config.ReconnectInterval_ < 1 { p.logger.Fatal("'reconnect_interval' can't be <1") } - if p.config.BanPeriod_ < 1 { - p.logger.Fatal("'ban_period' cant't be <1") + if p.config.BanPeriod_ < 0 { + p.logger.Fatal("'ban_period' cant't be <0") } p.prepareClient() @@ -321,7 +322,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.batcher.Start(ctx) - go p.client.CircuitBreaker.CheckBannedEndpoints(ctx, p.config.ReconnectInterval_) + p.client.Start(ctx) } func (p *Plugin) Stop() { @@ -346,6 +347,8 @@ func (p *Plugin) prepareClient() { Endpoints: []string{p.config.Endpoint}, ConnectionTimeout: p.config.RequestTimeout_, AuthHeader: "Splunk " + p.config.Token, + BanPeriod: p.config.BanPeriod_, + ReconnectInterval: p.config.ReconnectInterval_, KeepAlive: &xhttp.ClientKeepAliveConfig{ MaxConnDuration: p.config.KeepAlive.MaxConnDuration_, MaxIdleConnDuration: p.config.KeepAlive.MaxIdleConnDuration_, @@ -364,8 +367,6 @@ func (p *Plugin) prepareClient() { if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } - - p.client.CircuitBreaker = xhttp.NewCircuitBreaker(p.client.GetEndpoints(), p.config.BanPeriod_) } func (p *Plugin) out(workerData *pipeline.WorkerData, batch *pipeline.Batch) error { diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index a9e29ccb9..eb9b91a6c 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -10,76 +10,77 @@ import ( "github.com/valyala/fasthttp" ) -type CircuitBreaker struct { - activeEndpoints []*fasthttp.URI - endpointsByID map[string]*fasthttp.URI - bannedUntil map[string]time.Time - banPeriod time.Duration - mu sync.RWMutex +type endpoint struct { + uri *fasthttp.URI + banUntil time.Time } -func NewCircuitBreaker(endpoints []*fasthttp.URI, banPeriod time.Duration) *CircuitBreaker { - cb := &CircuitBreaker{ - activeEndpoints: make([]*fasthttp.URI, 0, len(endpoints)), - endpointsByID: make(map[string]*fasthttp.URI, len(endpoints)), - bannedUntil: make(map[string]time.Time, len(endpoints)), - banPeriod: banPeriod, +type circuitBreaker struct { + endpoints []endpoint + idxByURI map[string]int + banPeriod time.Duration + mu sync.RWMutex +} + +func NewCircuitBreaker(uris []*fasthttp.URI, banPeriod time.Duration) *circuitBreaker { + cb := &circuitBreaker{ + endpoints: make([]endpoint, 0, len(uris)), + idxByURI: make(map[string]int, len(uris)), + banPeriod: banPeriod, } - for _, endpoint := range endpoints { - id := endpoint.String() - cb.endpointsByID[id] = endpoint - cb.activeEndpoints = append(cb.activeEndpoints, endpoint) + for i, uri := range uris { + cb.endpoints = append(cb.endpoints, endpoint{uri: uri}) + cb.idxByURI[uri.String()] = i } return cb } -func (cb *CircuitBreaker) GetEndpoint() *fasthttp.URI { +func (cb *circuitBreaker) getEndpoint() *fasthttp.URI { cb.mu.RLock() defer cb.mu.RUnlock() - switch len(cb.activeEndpoints) { + now := xtime.GetInaccurateTime() + activeEndpoints := make([]*fasthttp.URI, 0, len(cb.endpoints)) + for i := range cb.endpoints { + e := cb.endpoints[i] + if e.banUntil.IsZero() || now.After(e.banUntil) { + activeEndpoints = append(activeEndpoints, e.uri) + } + } + switch len(activeEndpoints) { case 0: return nil case 1: - return cb.activeEndpoints[0] + return activeEndpoints[0] default: - return cb.activeEndpoints[rand.Int()%len(cb.activeEndpoints)] + return activeEndpoints[rand.Int()%len(activeEndpoints)] } } -func (cb *CircuitBreaker) BanEndpoint(endpoint *fasthttp.URI) { +func (cb *circuitBreaker) banEndpoint(uri *fasthttp.URI) { cb.mu.Lock() defer cb.mu.Unlock() - id := endpoint.String() - filtered := cb.activeEndpoints[:0] - for _, endpoint := range cb.activeEndpoints { - if endpoint.String() != id { - filtered = append(filtered, endpoint) - } - } - - cb.activeEndpoints = filtered - cb.bannedUntil[id] = xtime.GetInaccurateTime().Add(cb.banPeriod) + idx := cb.idxByURI[uri.String()] + cb.endpoints[idx].banUntil = xtime.GetInaccurateTime().Add(cb.banPeriod) } -func (cb *CircuitBreaker) RestoreBannedEndpoints() { +func (cb *circuitBreaker) restoreBannedEndpoints() { cb.mu.Lock() defer cb.mu.Unlock() - for id, until := range cb.bannedUntil { - if xtime.GetInaccurateTime().Before(until) { - continue + now := xtime.GetInaccurateTime() + for i := range cb.endpoints { + e := &cb.endpoints[i] + if !e.banUntil.IsZero() && now.After(e.banUntil) { + e.banUntil = time.Time{} } - endpoint := cb.endpointsByID[id] - cb.activeEndpoints = append(cb.activeEndpoints, endpoint) - delete(cb.bannedUntil, id) } } -func (cb *CircuitBreaker) CheckBannedEndpoints(ctx context.Context, reconnectInterval time.Duration) { +func (cb *circuitBreaker) checkBannedEndpoints(ctx context.Context, reconnectInterval time.Duration) { ticker := time.NewTicker(reconnectInterval) defer ticker.Stop() @@ -88,7 +89,7 @@ func (cb *CircuitBreaker) CheckBannedEndpoints(ctx context.Context, reconnectInt case <-ctx.Done(): return case <-ticker.C: - cb.RestoreBannedEndpoints() + cb.restoreBannedEndpoints() } } } diff --git a/xhttp/client.go b/xhttp/client.go index dae762f22..2d1a7176b 100644 --- a/xhttp/client.go +++ b/xhttp/client.go @@ -1,6 +1,7 @@ package xhttp import ( + "context" "fmt" "math/rand" "net/http" @@ -30,12 +31,15 @@ type ClientConfig struct { GzipCompressionLevel string TLS *ClientTLSConfig KeepAlive *ClientKeepAliveConfig + BanPeriod time.Duration + ReconnectInterval time.Duration } type Client struct { client *fasthttp.Client endpoints []*fasthttp.URI - CircuitBreaker *CircuitBreaker + cb *circuitBreaker + reconnectInterval time.Duration authHeader string customHeaders map[string]string gzipCompressionLevel int @@ -70,13 +74,27 @@ func NewClient(cfg *ClientConfig) (*Client, error) { return nil, err } - return &Client{ + c := &Client{ client: client, endpoints: endpoints, authHeader: cfg.AuthHeader, customHeaders: cfg.CustomHeaders, gzipCompressionLevel: parseGzipCompressionLevel(cfg.GzipCompressionLevel), - }, nil + } + + if cfg.BanPeriod > 0 { + c.cb = NewCircuitBreaker(endpoints, cfg.BanPeriod) + } + + return c, nil +} + +func (c *Client) Start(ctx context.Context) { + if c.cb == nil { + return + } + + go c.cb.checkBannedEndpoints(ctx, c.reconnectInterval) } func (c *Client) DoTimeout( @@ -98,7 +116,7 @@ func (c *Client) DoTimeout( c.prepareRequest(req, endpoint, method, contentType, body) if err := c.client.DoTimeout(req, resp, timeout); err != nil { - c.CircuitBreaker.BanEndpoint(endpoint) + c.banEndpoint(endpoint) return 0, fmt.Errorf("can't send request to %s: %w", endpoint.String(), err) } @@ -107,7 +125,7 @@ func (c *Client) DoTimeout( if !(http.StatusOK <= statusCode && statusCode <= http.StatusAccepted) { if shouldBanEndpoint(statusCode) { - c.CircuitBreaker.BanEndpoint(endpoint) + c.banEndpoint(endpoint) } return statusCode, fmt.Errorf("response status from %s isn't OK: status=%d, body=%s", endpoint.String(), statusCode, string(respContent)) } @@ -143,10 +161,6 @@ func (c *Client) prepareRequest(req *fasthttp.Request, endpoint *fasthttp.URI, m } } -func (c *Client) GetEndpoints() []*fasthttp.URI { - return c.endpoints -} - func parseEndpoints(endpoints []string) ([]*fasthttp.URI, error) { res := make([]*fasthttp.URI, 0, len(endpoints)) for _, e := range endpoints { @@ -177,8 +191,8 @@ func parseGzipCompressionLevel(level string) int { } func (c *Client) getEndpoint() *fasthttp.URI { - if c.CircuitBreaker != nil { - return c.CircuitBreaker.GetEndpoint() + if c.cb != nil { + return c.cb.getEndpoint() } switch len(c.endpoints) { @@ -191,6 +205,12 @@ func (c *Client) getEndpoint() *fasthttp.URI { } } +func (c *Client) banEndpoint(endpoint *fasthttp.URI) { + if c.cb != nil { + c.cb.banEndpoint(endpoint) + } +} + func shouldBanEndpoint(statusCode int) bool { switch statusCode { case http.StatusBadGateway, From 6c18f29cb650e94c83c12a4d755c9737ab448161 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Wed, 6 May 2026 22:51:47 +0300 Subject: [PATCH 15/32] fix --- xhttp/circuit_breaker.go | 2 +- xhttp/client.go | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index eb9b91a6c..983473fd7 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -22,7 +22,7 @@ type circuitBreaker struct { mu sync.RWMutex } -func NewCircuitBreaker(uris []*fasthttp.URI, banPeriod time.Duration) *circuitBreaker { +func newCircuitBreaker(uris []*fasthttp.URI, banPeriod time.Duration) *circuitBreaker { cb := &circuitBreaker{ endpoints: make([]endpoint, 0, len(uris)), idxByURI: make(map[string]int, len(uris)), diff --git a/xhttp/client.go b/xhttp/client.go index 2d1a7176b..9e3463c57 100644 --- a/xhttp/client.go +++ b/xhttp/client.go @@ -77,13 +77,14 @@ func NewClient(cfg *ClientConfig) (*Client, error) { c := &Client{ client: client, endpoints: endpoints, + reconnectInterval: cfg.ReconnectInterval, authHeader: cfg.AuthHeader, customHeaders: cfg.CustomHeaders, gzipCompressionLevel: parseGzipCompressionLevel(cfg.GzipCompressionLevel), } if cfg.BanPeriod > 0 { - c.cb = NewCircuitBreaker(endpoints, cfg.BanPeriod) + c.cb = newCircuitBreaker(endpoints, cfg.BanPeriod) } return c, nil From a334814094c8d6a5a29ae50c8da24654b9ceab1e Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 8 May 2026 14:42:05 +0300 Subject: [PATCH 16/32] fix --- plugin/output/elasticsearch/elasticsearch.go | 14 +++++------- plugin/output/http/http.go | 14 +++++------- plugin/output/loki/loki.go | 16 ++++++-------- plugin/output/splunk/splunk.go | 14 +++++------- plugin/output/splunk/splunk_test.go | 5 +++-- xhttp/circuit_breaker.go | 8 ++++++- xhttp/client.go | 23 ++++---------------- 7 files changed, 39 insertions(+), 55 deletions(-) diff --git a/plugin/output/elasticsearch/elasticsearch.go b/plugin/output/elasticsearch/elasticsearch.go index 303de4e12..5daccf124 100644 --- a/plugin/output/elasticsearch/elasticsearch.go +++ b/plugin/output/elasticsearch/elasticsearch.go @@ -263,7 +263,10 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.logger.Fatal("'ban_period' cant't be <0") } - p.prepareClient() + ctx, cancel := context.WithCancel(context.Background()) + p.cancel = cancel + + p.prepareClient(ctx) p.maintenance(nil) @@ -314,12 +317,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP onError, ) - ctx, cancel := context.WithCancel(context.Background()) - p.cancel = cancel - p.batcher.Start(ctx) - - p.client.Start(ctx) } func (p *Plugin) Stop() { @@ -336,7 +334,7 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.indexingErrorsMetric = ctl.RegisterCounter("output_elasticsearch_index_error_total", "Number of elasticsearch indexing errors") } -func (p *Plugin) prepareClient() { +func (p *Plugin) prepareClient(ctx context.Context) { config := &xhttp.ClientConfig{ Endpoints: prepareEndpoints(p.config.Endpoints, p.config.IngestPipeline), ConnectionTimeout: p.config.ConnectionTimeout_ * 2, @@ -358,7 +356,7 @@ func (p *Plugin) prepareClient() { } var err error - p.client, err = xhttp.NewClient(config) + p.client, err = xhttp.NewClient(ctx, config) if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } diff --git a/plugin/output/http/http.go b/plugin/output/http/http.go index 6f6d58208..6ee41ceff 100644 --- a/plugin/output/http/http.go +++ b/plugin/output/http/http.go @@ -220,7 +220,10 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.logger.Fatal("'ban_period' cant't be <0") } - p.prepareClient() + ctx, cancel := context.WithCancel(context.Background()) + p.cancel = cancel + + p.prepareClient(ctx) p.logger.Info("starting batcher", zap.Duration("timeout", p.config.BatchFlushTimeout_)) @@ -269,12 +272,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP onError, ) - ctx, cancel := context.WithCancel(context.Background()) - p.cancel = cancel - p.batcher.Start(ctx) - - p.client.Start(ctx) } func (p *Plugin) Stop() { @@ -290,7 +288,7 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.sendErrorMetric = ctl.RegisterCounterVec("output_http_send_error_total", "Total HTTP send errors", "status_code") } -func (p *Plugin) prepareClient() { +func (p *Plugin) prepareClient(ctx context.Context) { config := &xhttp.ClientConfig{ Endpoints: p.prepareEndpoints(), ConnectionTimeout: p.config.ConnectionTimeout_ * 2, @@ -312,7 +310,7 @@ func (p *Plugin) prepareClient() { } var err error - p.client, err = xhttp.NewClient(config) + p.client, err = xhttp.NewClient(ctx, config) if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } diff --git a/plugin/output/loki/loki.go b/plugin/output/loki/loki.go index ad13698d0..b5753c5db 100644 --- a/plugin/output/loki/loki.go +++ b/plugin/output/loki/loki.go @@ -279,7 +279,11 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.logger.Fatal("'ban_period' cant't be <0") } - p.prepareClient() + ctx, cancel := context.WithCancel(context.Background()) + p.ctx = ctx + p.cancel = cancel + + p.prepareClient(ctx) batcherOpts := &pipeline.BatcherOptions{ PipelineName: params.PipelineName, @@ -323,13 +327,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP onError, ) - ctx, cancel := context.WithCancel(context.Background()) - p.ctx = ctx - p.cancel = cancel - p.batcher.Start(ctx) - - p.client.Start(ctx) } func (p *Plugin) Stop() { @@ -452,7 +450,7 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { p.sendErrorMetric = ctl.RegisterCounterVec("output_loki_send_error_total", "Total Loki send errors", "status_code") } -func (p *Plugin) prepareClient() { +func (p *Plugin) prepareClient(ctx context.Context) { config := &xhttp.ClientConfig{ Endpoints: []string{fmt.Sprintf("%s/loki/api/v1/push", p.config.Address)}, ConnectionTimeout: p.config.ConnectionTimeout_ * 2, @@ -467,7 +465,7 @@ func (p *Plugin) prepareClient() { } var err error - p.client, err = xhttp.NewClient(config) + p.client, err = xhttp.NewClient(ctx, config) if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } diff --git a/plugin/output/splunk/splunk.go b/plugin/output/splunk/splunk.go index ea2cffe0f..742eddb32 100644 --- a/plugin/output/splunk/splunk.go +++ b/plugin/output/splunk/splunk.go @@ -256,7 +256,10 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP p.logger.Fatal("'ban_period' cant't be <0") } - p.prepareClient() + ctx, cancel := context.WithCancel(context.Background()) + p.cancel = cancel + + p.prepareClient(ctx) for _, cf := range p.config.CopyFields { if cf.To == "" { @@ -317,12 +320,7 @@ func (p *Plugin) Start(config pipeline.AnyConfig, params *pipeline.OutputPluginP onError, ) - ctx, cancel := context.WithCancel(context.Background()) - p.cancel = cancel - p.batcher.Start(ctx) - - p.client.Start(ctx) } func (p *Plugin) Stop() { @@ -342,7 +340,7 @@ func (p *Plugin) registerMetrics(ctl *metric.Ctl) { ) } -func (p *Plugin) prepareClient() { +func (p *Plugin) prepareClient(ctx context.Context) { config := &xhttp.ClientConfig{ Endpoints: []string{p.config.Endpoint}, ConnectionTimeout: p.config.RequestTimeout_, @@ -363,7 +361,7 @@ func (p *Plugin) prepareClient() { } var err error - p.client, err = xhttp.NewClient(config) + p.client, err = xhttp.NewClient(ctx, config) if err != nil { p.logger.Fatal("can't create http client", zap.Error(err)) } diff --git a/plugin/output/splunk/splunk_test.go b/plugin/output/splunk/splunk_test.go index 40626e076..fec6de177 100644 --- a/plugin/output/splunk/splunk_test.go +++ b/plugin/output/splunk/splunk_test.go @@ -1,6 +1,7 @@ package splunk import ( + "context" "io" "net/http" "net/http/httptest" @@ -54,7 +55,7 @@ func TestSplunk(t *testing.T) { }, logger: zap.NewExample().Sugar(), } - plugin.prepareClient() + plugin.prepareClient(context.Background()) batch := pipeline.NewPreparedBatch([]*pipeline.Event{ {Root: input}, @@ -185,7 +186,7 @@ func TestCopyFields(t *testing.T) { copyFieldsPaths: tt.copyFields, logger: zap.NewExample().Sugar(), } - plugin.prepareClient() + plugin.prepareClient(context.Background()) batch := pipeline.NewPreparedBatch([]*pipeline.Event{ {Root: input}, diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index 983473fd7..73099281d 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -22,7 +22,11 @@ type circuitBreaker struct { mu sync.RWMutex } -func newCircuitBreaker(uris []*fasthttp.URI, banPeriod time.Duration) *circuitBreaker { +func newCircuitBreaker(ctx context.Context, uris []*fasthttp.URI, banPeriod, reconnectInterval time.Duration) *circuitBreaker { + if banPeriod <= 0 { + return nil + } + cb := &circuitBreaker{ endpoints: make([]endpoint, 0, len(uris)), idxByURI: make(map[string]int, len(uris)), @@ -34,6 +38,8 @@ func newCircuitBreaker(uris []*fasthttp.URI, banPeriod time.Duration) *circuitBr cb.idxByURI[uri.String()] = i } + go cb.checkBannedEndpoints(ctx, reconnectInterval) + return cb } diff --git a/xhttp/client.go b/xhttp/client.go index 9e3463c57..7d13c3973 100644 --- a/xhttp/client.go +++ b/xhttp/client.go @@ -39,13 +39,12 @@ type Client struct { client *fasthttp.Client endpoints []*fasthttp.URI cb *circuitBreaker - reconnectInterval time.Duration authHeader string customHeaders map[string]string gzipCompressionLevel int } -func NewClient(cfg *ClientConfig) (*Client, error) { +func NewClient(ctx context.Context, cfg *ClientConfig) (*Client, error) { client := &fasthttp.Client{ ReadTimeout: cfg.ConnectionTimeout, WriteTimeout: cfg.ConnectionTimeout, @@ -74,28 +73,14 @@ func NewClient(cfg *ClientConfig) (*Client, error) { return nil, err } - c := &Client{ + return &Client{ client: client, endpoints: endpoints, - reconnectInterval: cfg.ReconnectInterval, + cb: newCircuitBreaker(ctx, endpoints, cfg.BanPeriod, cfg.ReconnectInterval), authHeader: cfg.AuthHeader, customHeaders: cfg.CustomHeaders, gzipCompressionLevel: parseGzipCompressionLevel(cfg.GzipCompressionLevel), - } - - if cfg.BanPeriod > 0 { - c.cb = newCircuitBreaker(endpoints, cfg.BanPeriod) - } - - return c, nil -} - -func (c *Client) Start(ctx context.Context) { - if c.cb == nil { - return - } - - go c.cb.checkBannedEndpoints(ctx, c.reconnectInterval) + }, nil } func (c *Client) DoTimeout( From 8371f261c8c15afe52481d1bcace34a16e736e48 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Wed, 20 May 2026 21:05:55 +0300 Subject: [PATCH 17/32] fix docs --- plugin/output/http/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/plugin/output/http/README.md b/plugin/output/http/README.md index 5ab75548e..79d6ea6af 100755 --- a/plugin/output/http/README.md +++ b/plugin/output/http/README.md @@ -157,5 +157,4 @@ Interval for reconnecting to addresses that are unavailable during initializatio
-
*Generated using [__insane-doc__](https://github.com/vitkovskii/insane-doc)* \ No newline at end of file From b3ee12e8e632cc5f6252923e7d6e9780dc9e853f Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 22 May 2026 18:04:13 +0300 Subject: [PATCH 18/32] change switch to if --- xhttp/circuit_breaker.go | 9 +++------ xhttp/client.go | 8 ++------ 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index 73099281d..cf7e0a42c 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -55,14 +55,11 @@ func (cb *circuitBreaker) getEndpoint() *fasthttp.URI { activeEndpoints = append(activeEndpoints, e.uri) } } - switch len(activeEndpoints) { - case 0: + + if len(activeEndpoints) == 0 { return nil - case 1: - return activeEndpoints[0] - default: - return activeEndpoints[rand.Int()%len(activeEndpoints)] } + return activeEndpoints[rand.Intn(len(activeEndpoints))] } func (cb *circuitBreaker) banEndpoint(uri *fasthttp.URI) { diff --git a/xhttp/client.go b/xhttp/client.go index 7d13c3973..ec9bb9880 100644 --- a/xhttp/client.go +++ b/xhttp/client.go @@ -181,14 +181,10 @@ func (c *Client) getEndpoint() *fasthttp.URI { return c.cb.getEndpoint() } - switch len(c.endpoints) { - case 0: + if len(c.endpoints) == 0 { return nil - case 1: - return c.endpoints[0] - default: - return c.endpoints[rand.Int()%len(c.endpoints)] } + return c.endpoints[rand.Intn(len(c.endpoints))] } func (c *Client) banEndpoint(endpoint *fasthttp.URI) { From b43ebce6c1e8bd0e95a3aedf4dd182b09c6dabb3 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Tue, 26 May 2026 17:54:25 +0300 Subject: [PATCH 19/32] fix docs --- plugin/output/elasticsearch/README.md | 2 +- plugin/output/elasticsearch/elasticsearch.go | 2 +- plugin/output/http/README.md | 2 +- plugin/output/http/http.go | 2 +- plugin/output/loki/README.md | 2 +- plugin/output/loki/loki.go | 2 +- plugin/output/splunk/README.md | 2 +- plugin/output/splunk/splunk.go | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/plugin/output/elasticsearch/README.md b/plugin/output/elasticsearch/README.md index 00df09dc5..03df043be 100755 --- a/plugin/output/elasticsearch/README.md +++ b/plugin/output/elasticsearch/README.md @@ -179,7 +179,7 @@ If set to 0, circuit breaker is disabled. **`reconnect_interval`** *`cfg.Duration`* *`default=5s`* -Interval for reconnecting to addresses that are unavailable during initialization. +Interval for checking banned endpoints availability.
diff --git a/plugin/output/elasticsearch/elasticsearch.go b/plugin/output/elasticsearch/elasticsearch.go index 5daccf124..dbf5d801e 100644 --- a/plugin/output/elasticsearch/elasticsearch.go +++ b/plugin/output/elasticsearch/elasticsearch.go @@ -213,7 +213,7 @@ type Config struct { // > @3@4@5@6 // > - // > Interval for reconnecting to addresses that are unavailable during initialization. + // > Interval for checking banned endpoints availability. ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * ReconnectInterval_ time.Duration } diff --git a/plugin/output/http/README.md b/plugin/output/http/README.md index 79d6ea6af..7c6a51b66 100755 --- a/plugin/output/http/README.md +++ b/plugin/output/http/README.md @@ -153,7 +153,7 @@ If set to 0, circuit breaker is disabled. **`reconnect_interval`** *`cfg.Duration`* *`default=5s`* -Interval for reconnecting to addresses that are unavailable during initialization. +Interval for checking banned endpoints availability.
diff --git a/plugin/output/http/http.go b/plugin/output/http/http.go index edb9f9cfb..425658199 100644 --- a/plugin/output/http/http.go +++ b/plugin/output/http/http.go @@ -186,7 +186,7 @@ type Config struct { // > @3@4@5@6 // > - // > Interval for reconnecting to addresses that are unavailable during initialization. + // > Interval for checking banned endpoints availability. ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * ReconnectInterval_ time.Duration } diff --git a/plugin/output/loki/README.md b/plugin/output/loki/README.md index b08ced5d4..26b9fec02 100644 --- a/plugin/output/loki/README.md +++ b/plugin/output/loki/README.md @@ -158,7 +158,7 @@ If set to 0, circuit breaker is disabled. **`reconnect_interval`** *`cfg.Duration`* *`default=5s`* -Interval for reconnecting to addresses that are unavailable during initialization. +Interval for checking banned endpoints availability.
diff --git a/plugin/output/loki/loki.go b/plugin/output/loki/loki.go index b5753c5db..85b7f8c36 100644 --- a/plugin/output/loki/loki.go +++ b/plugin/output/loki/loki.go @@ -188,7 +188,7 @@ type Config struct { // > @3@4@5@6 // > - // > Interval for reconnecting to addresses that are unavailable during initialization. + // > Interval for checking banned endpoints availability. ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * ReconnectInterval_ time.Duration } diff --git a/plugin/output/splunk/README.md b/plugin/output/splunk/README.md index 55aa91e39..e6b8f2913 100755 --- a/plugin/output/splunk/README.md +++ b/plugin/output/splunk/README.md @@ -162,7 +162,7 @@ If set to 0, circuit breaker is disabled. **`reconnect_interval`** *`cfg.Duration`* *`default=5s`* -Interval for reconnecting to addresses that are unavailable during initialization. +Interval for checking banned endpoints availability.
diff --git a/plugin/output/splunk/splunk.go b/plugin/output/splunk/splunk.go index 742eddb32..9f0e9bdf2 100644 --- a/plugin/output/splunk/splunk.go +++ b/plugin/output/splunk/splunk.go @@ -212,7 +212,7 @@ type Config struct { // > @3@4@5@6 // > - // > Interval for reconnecting to addresses that are unavailable during initialization. + // > Interval for checking banned endpoints availability. ReconnectInterval cfg.Duration `json:"reconnect_interval" default:"5s" parse:"duration"` // * ReconnectInterval_ time.Duration } From 0cad8f66959357b713ddaf6eaad6d560d123306f Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Wed, 27 May 2026 11:17:53 +0300 Subject: [PATCH 20/32] fix --- xhttp/circuit_breaker.go | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index cf7e0a42c..4efbe08f6 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -16,10 +16,11 @@ type endpoint struct { } type circuitBreaker struct { - endpoints []endpoint - idxByURI map[string]int - banPeriod time.Duration - mu sync.RWMutex + endpoints []endpoint + activeEndpoints []int + idxByURI map[string]int + banPeriod time.Duration + mu sync.Mutex } func newCircuitBreaker(ctx context.Context, uris []*fasthttp.URI, banPeriod, reconnectInterval time.Duration) *circuitBreaker { @@ -28,9 +29,10 @@ func newCircuitBreaker(ctx context.Context, uris []*fasthttp.URI, banPeriod, rec } cb := &circuitBreaker{ - endpoints: make([]endpoint, 0, len(uris)), - idxByURI: make(map[string]int, len(uris)), - banPeriod: banPeriod, + endpoints: make([]endpoint, 0, len(uris)), + activeEndpoints: make([]int, 0, len(uris)), + idxByURI: make(map[string]int, len(uris)), + banPeriod: banPeriod, } for i, uri := range uris { @@ -44,22 +46,23 @@ func newCircuitBreaker(ctx context.Context, uris []*fasthttp.URI, banPeriod, rec } func (cb *circuitBreaker) getEndpoint() *fasthttp.URI { - cb.mu.RLock() - defer cb.mu.RUnlock() + cb.mu.Lock() + defer cb.mu.Unlock() now := xtime.GetInaccurateTime() - activeEndpoints := make([]*fasthttp.URI, 0, len(cb.endpoints)) - for i := range cb.endpoints { - e := cb.endpoints[i] + cb.activeEndpoints = cb.activeEndpoints[:0] + for i, e := range cb.endpoints { if e.banUntil.IsZero() || now.After(e.banUntil) { - activeEndpoints = append(activeEndpoints, e.uri) + cb.activeEndpoints = append(cb.activeEndpoints, i) } } - if len(activeEndpoints) == 0 { + if len(cb.activeEndpoints) == 0 { return nil } - return activeEndpoints[rand.Intn(len(activeEndpoints))] + + idx := rand.Intn(len(cb.activeEndpoints)) + return cb.endpoints[cb.activeEndpoints[idx]].uri } func (cb *circuitBreaker) banEndpoint(uri *fasthttp.URI) { From d0fc78ac127b757f3c5599a596b00e8744d6fb36 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Thu, 28 May 2026 10:10:29 +0300 Subject: [PATCH 21/32] fix --- xhttp/circuit_breaker.go | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index 4efbe08f6..ffbca7461 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -20,7 +20,7 @@ type circuitBreaker struct { activeEndpoints []int idxByURI map[string]int banPeriod time.Duration - mu sync.Mutex + mu sync.RWMutex } func newCircuitBreaker(ctx context.Context, uris []*fasthttp.URI, banPeriod, reconnectInterval time.Duration) *circuitBreaker { @@ -38,6 +38,7 @@ func newCircuitBreaker(ctx context.Context, uris []*fasthttp.URI, banPeriod, rec for i, uri := range uris { cb.endpoints = append(cb.endpoints, endpoint{uri: uri}) cb.idxByURI[uri.String()] = i + cb.activeEndpoints = append(cb.activeEndpoints, i) } go cb.checkBannedEndpoints(ctx, reconnectInterval) @@ -46,16 +47,8 @@ func newCircuitBreaker(ctx context.Context, uris []*fasthttp.URI, banPeriod, rec } func (cb *circuitBreaker) getEndpoint() *fasthttp.URI { - cb.mu.Lock() - defer cb.mu.Unlock() - - now := xtime.GetInaccurateTime() - cb.activeEndpoints = cb.activeEndpoints[:0] - for i, e := range cb.endpoints { - if e.banUntil.IsZero() || now.After(e.banUntil) { - cb.activeEndpoints = append(cb.activeEndpoints, i) - } - } + cb.mu.RLock() + defer cb.mu.RUnlock() if len(cb.activeEndpoints) == 0 { return nil @@ -71,6 +64,14 @@ func (cb *circuitBreaker) banEndpoint(uri *fasthttp.URI) { idx := cb.idxByURI[uri.String()] cb.endpoints[idx].banUntil = xtime.GetInaccurateTime().Add(cb.banPeriod) + + for i, activeIdx := range cb.activeEndpoints { + if activeIdx == idx { + cb.activeEndpoints[i] = cb.activeEndpoints[len(cb.activeEndpoints)-1] + cb.activeEndpoints = cb.activeEndpoints[:len(cb.activeEndpoints)-1] + break + } + } } func (cb *circuitBreaker) restoreBannedEndpoints() { @@ -82,6 +83,7 @@ func (cb *circuitBreaker) restoreBannedEndpoints() { e := &cb.endpoints[i] if !e.banUntil.IsZero() && now.After(e.banUntil) { e.banUntil = time.Time{} + cb.activeEndpoints = append(cb.activeEndpoints, i) } } } From 9ebe601942d864ae6d5439203ab266a1049aa05d Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Thu, 28 May 2026 15:13:01 +0300 Subject: [PATCH 22/32] fix --- xhttp/circuit_breaker.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index ffbca7461..60e16a2d4 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -75,6 +75,13 @@ func (cb *circuitBreaker) banEndpoint(uri *fasthttp.URI) { } func (cb *circuitBreaker) restoreBannedEndpoints() { + cb.mu.RLock() + if len(cb.endpoints) == len(cb.activeEndpoints) { + cb.mu.RUnlock() + return + } + cb.mu.RUnlock() + cb.mu.Lock() defer cb.mu.Unlock() From 4002f458ad22c23bda8f5cf941c2f7966ff45497 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 11:54:38 +0300 Subject: [PATCH 23/32] without cb when one host --- xhttp/circuit_breaker.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xhttp/circuit_breaker.go b/xhttp/circuit_breaker.go index 60e16a2d4..ad6c3e896 100644 --- a/xhttp/circuit_breaker.go +++ b/xhttp/circuit_breaker.go @@ -24,7 +24,7 @@ type circuitBreaker struct { } func newCircuitBreaker(ctx context.Context, uris []*fasthttp.URI, banPeriod, reconnectInterval time.Duration) *circuitBreaker { - if banPeriod <= 0 { + if banPeriod <= 0 || len(uris) == 1 { return nil } From 86a05270b1f3d519758fefc6289f5c0f926a6454 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 15:05:09 +0300 Subject: [PATCH 24/32] update e2e --- e2e/file_elasticsearch/config_cb.yml | 19 +++++ e2e/file_elasticsearch/docker-compose.yml | 27 +++++- e2e/file_elasticsearch/file_elasticsearch.go | 39 +++++---- e2e/file_elasticsearch/helpers.go | 87 ++++++++++---------- e2e/start_work_test.go | 21 +++-- 5 files changed, 127 insertions(+), 66 deletions(-) create mode 100644 e2e/file_elasticsearch/config_cb.yml diff --git a/e2e/file_elasticsearch/config_cb.yml b/e2e/file_elasticsearch/config_cb.yml new file mode 100644 index 000000000..ec629ea8f --- /dev/null +++ b/e2e/file_elasticsearch/config_cb.yml @@ -0,0 +1,19 @@ +pipelines: + file_elasticsearch_cb: + input: + type: file + persistence_mode: async + watching_dir: SOME_DIR + offsets_file: SOME_FILE + offsets_op: reset + output: + type: elasticsearch + endpoints: + - http://localhost:9200 + - http://localhost:9201 + username: SOME_USERNAME + password: SOME_PASSWORD + index_format: SOME_INDEX + batch_size: 2 + ban_period: 6s + reconnect_interval: 3s diff --git a/e2e/file_elasticsearch/docker-compose.yml b/e2e/file_elasticsearch/docker-compose.yml index 162df6eef..044407894 100644 --- a/e2e/file_elasticsearch/docker-compose.yml +++ b/e2e/file_elasticsearch/docker-compose.yml @@ -1,10 +1,9 @@ -# https://github.com/elastic/start-local/tree/main services: elasticsearch: - image: elasticsearch:8.17.0 + image: elasticsearch:9.4.0 container_name: es-local-test ports: - - "19200:9200" + - "9200:9200" environment: - discovery.type=single-node - ELASTIC_PASSWORD=elastic @@ -15,7 +14,27 @@ services: test: [ "CMD-SHELL", - "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch:19200", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch:9200", + ] + interval: 10s + timeout: 10s + retries: 10 + elasticsearch2: + image: elasticsearch:9.4.0 + container_name: es-local-test-2 + ports: + - "9201:9200" + environment: + - discovery.type=single-node + - ELASTIC_PASSWORD=elastic + - xpack.security.enabled=true + - xpack.security.http.ssl.enabled=false + mem_limit: 1073741824 + healthcheck: + test: + [ + "CMD-SHELL", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch:9200", ] interval: 10s timeout: 10s diff --git a/e2e/file_elasticsearch/file_elasticsearch.go b/e2e/file_elasticsearch/file_elasticsearch.go index 7ada448dd..9dda4874c 100644 --- a/e2e/file_elasticsearch/file_elasticsearch.go +++ b/e2e/file_elasticsearch/file_elasticsearch.go @@ -18,13 +18,13 @@ import ( // Config for file-elasticsearch plugin e2e test type Config struct { - Count int - Endpoint string - Pipeline string - Username string - Password string - dir string - index string + Count int + Endpoints []string + Pipeline string + Username string + Password string + dir string + index string } // Configure sets additional fields for input and output plugins @@ -43,10 +43,12 @@ func (c *Config) Configure(t *testing.T, conf *cfg.Config, pipelineName string) output.Set("ingest_pipeline", c.Pipeline) output.Set("username", c.Username) output.Set("password", c.Password) - output.Set("endpoints", []string{c.Endpoint}) + output.Set("endpoints", c.Endpoints) - err := createIngestPipeline(c.Endpoint, c.Pipeline, c.Username, c.Password) - require.NoError(t, err) + for _, endpoint := range c.Endpoints { + err := createIngestPipeline(endpoint, c.Pipeline, c.Username, c.Password) + require.NoError(t, err) + } } // Send creates file and writes messages @@ -63,12 +65,19 @@ func (c *Config) Send(t *testing.T) { // Validate waits for the message processing to complete func (c *Config) Validate(t *testing.T) { - err := waitUntilIndexReady(c.Endpoint, c.index, c.Username, c.Password, c.Count, 10, 250*time.Millisecond) - require.NoError(t, err) - docs, err := getDocumentsFromIndex(c.Endpoint, c.index, c.Username, c.Password) + err := waitUntilIndexReady(c.Endpoints, c.index, c.Username, c.Password, c.Count, 10, 250*time.Millisecond) require.NoError(t, err) - require.Len(t, docs, c.Count) - for _, doc := range docs { + + var allDocs []map[string]any + for _, endpoint := range c.Endpoints { + docs, err := getDocumentsFromIndex(endpoint, c.index, c.Username, c.Password) + require.NoError(t, err) + t.Logf("endpoint %s: %d docs", endpoint, len(docs)) + allDocs = append(allDocs, docs...) + } + + require.Len(t, allDocs, c.Count) + for _, doc := range allDocs { if _, ok := doc["processed_at"]; !ok { t.Errorf("doc %v doesn't have processed_at field", doc) } diff --git a/e2e/file_elasticsearch/helpers.go b/e2e/file_elasticsearch/helpers.go index 42269eeba..421bf3e85 100644 --- a/e2e/file_elasticsearch/helpers.go +++ b/e2e/file_elasticsearch/helpers.go @@ -43,7 +43,7 @@ func createIngestPipeline(elasticURL, pipelineID, username, password string) err return nil } -func getDocumentsFromIndex(elasticURL, indexName, username, password string) ([]map[string]interface{}, error) { +func getDocumentsFromIndex(elasticURL, indexName, username, password string) ([]map[string]any, error) { url := fmt.Sprintf("%s/%s/_search", elasticURL, indexName) body := `{"query":{"match_all":{}}}` @@ -77,7 +77,7 @@ func getDocumentsFromIndex(elasticURL, indexName, username, password string) ([] type searchResponse struct { Hits struct { Hits []struct { - Source map[string]interface{} `json:"_source"` + Source map[string]any `json:"_source"` } `json:"hits"` } `json:"hits"` } @@ -87,7 +87,7 @@ func getDocumentsFromIndex(elasticURL, indexName, username, password string) ([] return nil, fmt.Errorf("failed to decode response: %w", err) } - resultDocs := make([]map[string]interface{}, 0, len(result.Hits.Hits)) + resultDocs := make([]map[string]any, 0, len(result.Hits.Hits)) for _, hit := range result.Hits.Hits { resultDocs = append(resultDocs, hit.Source) @@ -96,15 +96,11 @@ func getDocumentsFromIndex(elasticURL, indexName, username, password string) ([] return resultDocs, nil } -func waitUntilIndexReady(elasticURL, indexName, username, password string, minDocs, retries int, delay time.Duration) error { - client := &http.Client{ - Timeout: time.Second, - } - +func getDocCount(client *http.Client, elasticURL, indexName, username, password string) (int, error) { url := fmt.Sprintf("%s/%s/_count", elasticURL, indexName) req, err := http.NewRequest(http.MethodGet, url, http.NoBody) if err != nil { - return fmt.Errorf("failed to create request: %w", err) + return 0, fmt.Errorf("failed to create request: %w", err) } req.Header.Set("Content-Type", "application/json") @@ -112,47 +108,54 @@ func waitUntilIndexReady(elasticURL, indexName, username, password string, minDo req.SetBasicAuth(username, password) } - for i := 0; i < retries; i++ { - ok, err := func() (bool, error) { - resp, err := client.Do(req) - if err != nil { - return false, fmt.Errorf("failed to make request: %w", err) - } - defer func() { _ = resp.Body.Close() }() + resp, err := client.Do(req) + if err != nil { + return 0, fmt.Errorf("failed to make request: %w", err) + } + defer func() { _ = resp.Body.Close() }() - if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusServiceUnavailable { - return false, nil - } + if resp.StatusCode == http.StatusNotFound || resp.StatusCode == http.StatusServiceUnavailable { + return 0, nil + } - if resp.StatusCode != http.StatusOK { - return false, fmt.Errorf("unexpected status code: %d", resp.StatusCode) - } + if resp.StatusCode != http.StatusOK { + return 0, fmt.Errorf("unexpected status code: %d", resp.StatusCode) + } - body, err := io.ReadAll(resp.Body) - if err != nil { - return false, fmt.Errorf("failed to read response: %w", err) - } + body, err := io.ReadAll(resp.Body) + if err != nil { + return 0, fmt.Errorf("failed to read response: %w", err) + } - var result map[string]interface{} - if err := json.Unmarshal(body, &result); err != nil { - return false, fmt.Errorf("failed to decode response: %w", err) - } + var result map[string]any + if err := json.Unmarshal(body, &result); err != nil { + return 0, fmt.Errorf("failed to decode response: %w", err) + } - if count, ok := result["count"].(float64); ok { - if int(count) >= minDocs { - return true, nil - } - } else { - return false, fmt.Errorf("unexpected response structure") - } + count, ok := result["count"].(float64) + if !ok { + return 0, fmt.Errorf("unexpected response structure") + } - return false, nil - }() + return int(count), nil +} + +func waitUntilIndexReady(elasticURLs []string, indexName, username, password string, minDocs, retries int, delay time.Duration) error { + client := &http.Client{ + Timeout: time.Second, + } - if err != nil { - return err + for range retries { + total := 0 + for _, elasticURL := range elasticURLs { + count, err := getDocCount(client, elasticURL, indexName, username, password) + if err != nil { + return err + } + total += count } - if ok { + + if total >= minDocs { return nil } time.Sleep(delay) diff --git a/e2e/start_work_test.go b/e2e/start_work_test.go index c7c349eb2..051741040 100644 --- a/e2e/start_work_test.go +++ b/e2e/start_work_test.go @@ -166,14 +166,25 @@ func TestE2EStabilityWorkCase(t *testing.T) { { name: "file_elasticsearch", e2eTest: &file_elasticsearch.Config{ - Count: 10, - Pipeline: "test-ingest-pipeline", - Endpoint: "http://localhost:19200", - Username: "elastic", - Password: "elastic", + Count: 10, + Pipeline: "test-ingest-pipeline", + Endpoints: []string{"http://localhost:9200"}, + Username: "elastic", + Password: "elastic", }, cfgPath: "./file_elasticsearch/config.yml", }, + { + name: "file_elasticsearch_cb", + e2eTest: &file_elasticsearch.Config{ + Count: 10, + Pipeline: "test-ingest-pipeline", + Endpoints: []string{"http://localhost:9200", "http://localhost:9201"}, + Username: "elastic", + Password: "elastic", + }, + cfgPath: "./file_elasticsearch/config_cb.yml", + }, { name: "file_es_split", e2eTest: &file_es_split.Config{}, From b1ae2af033d793c6f26994f0d8950cfc5a8f53a2 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 15:14:58 +0300 Subject: [PATCH 25/32] fix --- e2e/file_elasticsearch/docker-compose.yml | 4 ++-- e2e/file_elasticsearch/file_elasticsearch.go | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/e2e/file_elasticsearch/docker-compose.yml b/e2e/file_elasticsearch/docker-compose.yml index 044407894..1ac741697 100644 --- a/e2e/file_elasticsearch/docker-compose.yml +++ b/e2e/file_elasticsearch/docker-compose.yml @@ -1,6 +1,6 @@ services: elasticsearch: - image: elasticsearch:9.4.0 + image: elasticsearch:8.17.0 container_name: es-local-test ports: - "9200:9200" @@ -20,7 +20,7 @@ services: timeout: 10s retries: 10 elasticsearch2: - image: elasticsearch:9.4.0 + image: elasticsearch:8.17.0 container_name: es-local-test-2 ports: - "9201:9200" diff --git a/e2e/file_elasticsearch/file_elasticsearch.go b/e2e/file_elasticsearch/file_elasticsearch.go index 9dda4874c..2c0a5712d 100644 --- a/e2e/file_elasticsearch/file_elasticsearch.go +++ b/e2e/file_elasticsearch/file_elasticsearch.go @@ -68,7 +68,7 @@ func (c *Config) Validate(t *testing.T) { err := waitUntilIndexReady(c.Endpoints, c.index, c.Username, c.Password, c.Count, 10, 250*time.Millisecond) require.NoError(t, err) - var allDocs []map[string]any + allDocs := make([]map[string]any, 0, c.Count) for _, endpoint := range c.Endpoints { docs, err := getDocumentsFromIndex(endpoint, c.index, c.Username, c.Password) require.NoError(t, err) From a91fbc1a206e1f24990a7e0994aa2ea6cce760c5 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 15:28:30 +0300 Subject: [PATCH 26/32] fix --- e2e/file_elasticsearch/config.yml | 2 +- e2e/file_elasticsearch/config_cb.yml | 4 ++-- e2e/file_elasticsearch/docker-compose.yml | 8 ++++---- e2e/start_work_test.go | 4 ++-- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/e2e/file_elasticsearch/config.yml b/e2e/file_elasticsearch/config.yml index 47063cb3f..d5967fe36 100644 --- a/e2e/file_elasticsearch/config.yml +++ b/e2e/file_elasticsearch/config.yml @@ -9,7 +9,7 @@ pipelines: output: type: elasticsearch endpoints: - - http://localhost:9200 + - http://localhost:19200 username: SOME_USERNAME password: SOME_PASSWORD index_format: SOME_INDEX diff --git a/e2e/file_elasticsearch/config_cb.yml b/e2e/file_elasticsearch/config_cb.yml index ec629ea8f..584661e4e 100644 --- a/e2e/file_elasticsearch/config_cb.yml +++ b/e2e/file_elasticsearch/config_cb.yml @@ -9,8 +9,8 @@ pipelines: output: type: elasticsearch endpoints: - - http://localhost:9200 - - http://localhost:9201 + - http://localhost:19200 + - http://localhost:19201 username: SOME_USERNAME password: SOME_PASSWORD index_format: SOME_INDEX diff --git a/e2e/file_elasticsearch/docker-compose.yml b/e2e/file_elasticsearch/docker-compose.yml index 1ac741697..a7f14dc6a 100644 --- a/e2e/file_elasticsearch/docker-compose.yml +++ b/e2e/file_elasticsearch/docker-compose.yml @@ -3,7 +3,7 @@ services: image: elasticsearch:8.17.0 container_name: es-local-test ports: - - "9200:9200" + - "19200:9200" environment: - discovery.type=single-node - ELASTIC_PASSWORD=elastic @@ -14,7 +14,7 @@ services: test: [ "CMD-SHELL", - "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch:9200", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://localhost:9200", ] interval: 10s timeout: 10s @@ -23,7 +23,7 @@ services: image: elasticsearch:8.17.0 container_name: es-local-test-2 ports: - - "9201:9200" + - "19201:9200" environment: - discovery.type=single-node - ELASTIC_PASSWORD=elastic @@ -34,7 +34,7 @@ services: test: [ "CMD-SHELL", - "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch:9200", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://localhost:9200", ] interval: 10s timeout: 10s diff --git a/e2e/start_work_test.go b/e2e/start_work_test.go index 051741040..7dcaca2c2 100644 --- a/e2e/start_work_test.go +++ b/e2e/start_work_test.go @@ -168,7 +168,7 @@ func TestE2EStabilityWorkCase(t *testing.T) { e2eTest: &file_elasticsearch.Config{ Count: 10, Pipeline: "test-ingest-pipeline", - Endpoints: []string{"http://localhost:9200"}, + Endpoints: []string{"http://localhost:19200"}, Username: "elastic", Password: "elastic", }, @@ -179,7 +179,7 @@ func TestE2EStabilityWorkCase(t *testing.T) { e2eTest: &file_elasticsearch.Config{ Count: 10, Pipeline: "test-ingest-pipeline", - Endpoints: []string{"http://localhost:9200", "http://localhost:9201"}, + Endpoints: []string{"http://localhost:19200", "http://localhost:19201"}, Username: "elastic", Password: "elastic", }, From 7507ca880de00f05429a0416d71fa7b04032b712 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 20:31:53 +0300 Subject: [PATCH 27/32] fix --- e2e/file_elasticsearch/file_elasticsearch.go | 2 +- e2e/file_elasticsearch/helpers.go | 60 ++++++++++++-------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/e2e/file_elasticsearch/file_elasticsearch.go b/e2e/file_elasticsearch/file_elasticsearch.go index 2c0a5712d..ae3512a70 100644 --- a/e2e/file_elasticsearch/file_elasticsearch.go +++ b/e2e/file_elasticsearch/file_elasticsearch.go @@ -46,7 +46,7 @@ func (c *Config) Configure(t *testing.T, conf *cfg.Config, pipelineName string) output.Set("endpoints", c.Endpoints) for _, endpoint := range c.Endpoints { - err := createIngestPipeline(endpoint, c.Pipeline, c.Username, c.Password) + err := createIngestPipeline(endpoint, c.Pipeline, c.Username, c.Password, c.Count) require.NoError(t, err) } } diff --git a/e2e/file_elasticsearch/helpers.go b/e2e/file_elasticsearch/helpers.go index 421bf3e85..21c477e5a 100644 --- a/e2e/file_elasticsearch/helpers.go +++ b/e2e/file_elasticsearch/helpers.go @@ -9,38 +9,52 @@ import ( "time" ) -func createIngestPipeline(elasticURL, pipelineID, username, password string) error { +func createIngestPipeline(elasticURL, pipelineID, username, password string, retries int) error { url := fmt.Sprintf("%s/_ingest/pipeline/%s", elasticURL, pipelineID) - pipelineBody := `{"description":"test ingest pipeline","processors":[{"set":{"field":"processed_at","value":"{{_ingest.timestamp}}"}}]}` - req, err := http.NewRequest(http.MethodPut, url, strings.NewReader(pipelineBody)) - if err != nil { - return fmt.Errorf("failed to create request: %w", err) - } + var err error + for i := range retries { + err = func() error { + req, err := http.NewRequest(http.MethodPut, url, strings.NewReader(pipelineBody)) + if err != nil { + return fmt.Errorf("failed to create request: %w", err) + } - req.Header.Set("Content-Type", "application/json") - if username != "" && password != "" { - req.SetBasicAuth(username, password) - } + req.Header.Set("Content-Type", "application/json") + if username != "" && password != "" { + req.SetBasicAuth(username, password) + } - client := &http.Client{Timeout: time.Second} - resp, err := client.Do(req) - if err != nil { - return fmt.Errorf("failed to make HTTP request: %w", err) - } - defer func() { _ = resp.Body.Close() }() + client := &http.Client{Timeout: time.Second} + resp, err := client.Do(req) + if err != nil { + return fmt.Errorf("failed to make HTTP request: %w", err) + } + defer func() { _ = resp.Body.Close() }() - respBody, err := io.ReadAll(resp.Body) - if err != nil { - return fmt.Errorf("failed to read body response: %w", err) - } + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read body response: %w", err) + } - if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated { - return fmt.Errorf("unexpected status: %d, body: %s", resp.StatusCode, string(respBody)) + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusCreated { + return fmt.Errorf("unexpected status: %d, body: %s", resp.StatusCode, string(respBody)) + } + + return nil + }() + + if err == nil { + return nil + } + + if i < retries-1 { + time.Sleep(200 * time.Millisecond) + } } - return nil + return fmt.Errorf("can't create pipeline after %d retries: %w", retries, err) } func getDocumentsFromIndex(elasticURL, indexName, username, password string) ([]map[string]any, error) { From ccf7aacfaec3dfff9c90ebcd9ba72a164ee5b0c1 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 20:43:51 +0300 Subject: [PATCH 28/32] fix --- e2e/start_work_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/start_work_test.go b/e2e/start_work_test.go index 7dcaca2c2..b688b97b4 100644 --- a/e2e/start_work_test.go +++ b/e2e/start_work_test.go @@ -166,7 +166,7 @@ func TestE2EStabilityWorkCase(t *testing.T) { { name: "file_elasticsearch", e2eTest: &file_elasticsearch.Config{ - Count: 10, + Count: 50, Pipeline: "test-ingest-pipeline", Endpoints: []string{"http://localhost:19200"}, Username: "elastic", @@ -177,7 +177,7 @@ func TestE2EStabilityWorkCase(t *testing.T) { { name: "file_elasticsearch_cb", e2eTest: &file_elasticsearch.Config{ - Count: 10, + Count: 50, Pipeline: "test-ingest-pipeline", Endpoints: []string{"http://localhost:19200", "http://localhost:19201"}, Username: "elastic", From 6969042c3a2a49d9813f67c020c50226a491e1d6 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 21:00:26 +0300 Subject: [PATCH 29/32] fix --- e2e/file_elasticsearch/helpers.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/file_elasticsearch/helpers.go b/e2e/file_elasticsearch/helpers.go index 21c477e5a..11145499e 100644 --- a/e2e/file_elasticsearch/helpers.go +++ b/e2e/file_elasticsearch/helpers.go @@ -50,7 +50,7 @@ func createIngestPipeline(elasticURL, pipelineID, username, password string, ret } if i < retries-1 { - time.Sleep(200 * time.Millisecond) + time.Sleep(time.Second) } } From e4c0dcd76e512ab5f6559bc5b8390fbe2a9691a5 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 21:37:42 +0300 Subject: [PATCH 30/32] fix --- e2e/file_elasticsearch/docker-compose.yml | 4 ++-- e2e/start_work_test.go | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/e2e/file_elasticsearch/docker-compose.yml b/e2e/file_elasticsearch/docker-compose.yml index a7f14dc6a..a472a40d4 100644 --- a/e2e/file_elasticsearch/docker-compose.yml +++ b/e2e/file_elasticsearch/docker-compose.yml @@ -14,7 +14,7 @@ services: test: [ "CMD-SHELL", - "curl --output /dev/null --silent --head --fail -u elastic:elastic http://localhost:9200", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://localhost:19200", ] interval: 10s timeout: 10s @@ -34,7 +34,7 @@ services: test: [ "CMD-SHELL", - "curl --output /dev/null --silent --head --fail -u elastic:elastic http://localhost:9200", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://localhost:19201", ] interval: 10s timeout: 10s diff --git a/e2e/start_work_test.go b/e2e/start_work_test.go index b688b97b4..7dcaca2c2 100644 --- a/e2e/start_work_test.go +++ b/e2e/start_work_test.go @@ -166,7 +166,7 @@ func TestE2EStabilityWorkCase(t *testing.T) { { name: "file_elasticsearch", e2eTest: &file_elasticsearch.Config{ - Count: 50, + Count: 10, Pipeline: "test-ingest-pipeline", Endpoints: []string{"http://localhost:19200"}, Username: "elastic", @@ -177,7 +177,7 @@ func TestE2EStabilityWorkCase(t *testing.T) { { name: "file_elasticsearch_cb", e2eTest: &file_elasticsearch.Config{ - Count: 50, + Count: 10, Pipeline: "test-ingest-pipeline", Endpoints: []string{"http://localhost:19200", "http://localhost:19201"}, Username: "elastic", From 90902c88fb805d9e9a56acda87a7fce9ed6b036b Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 21:43:31 +0300 Subject: [PATCH 31/32] fix --- e2e/file_elasticsearch/docker-compose.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/e2e/file_elasticsearch/docker-compose.yml b/e2e/file_elasticsearch/docker-compose.yml index a472a40d4..994c488e4 100644 --- a/e2e/file_elasticsearch/docker-compose.yml +++ b/e2e/file_elasticsearch/docker-compose.yml @@ -14,7 +14,7 @@ services: test: [ "CMD-SHELL", - "curl --output /dev/null --silent --head --fail -u elastic:elastic http://localhost:19200", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch:19200", ] interval: 10s timeout: 10s @@ -34,7 +34,7 @@ services: test: [ "CMD-SHELL", - "curl --output /dev/null --silent --head --fail -u elastic:elastic http://localhost:19201", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch:19201", ] interval: 10s timeout: 10s From fe08704b8c532d9ba5729f4a71965a2835db2ff1 Mon Sep 17 00:00:00 2001 From: Sergey Lazarenko Date: Fri, 29 May 2026 22:00:04 +0300 Subject: [PATCH 32/32] fix --- e2e/file_elasticsearch/docker-compose.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e/file_elasticsearch/docker-compose.yml b/e2e/file_elasticsearch/docker-compose.yml index 994c488e4..17a65833f 100644 --- a/e2e/file_elasticsearch/docker-compose.yml +++ b/e2e/file_elasticsearch/docker-compose.yml @@ -34,7 +34,7 @@ services: test: [ "CMD-SHELL", - "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch:19201", + "curl --output /dev/null --silent --head --fail -u elastic:elastic http://elasticsearch2:19201", ] interval: 10s timeout: 10s