Skip to content

CP-308811: Add an option to limit the span depth in tracing #6607

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 84 additions & 14 deletions ocaml/libs/tracing/tracing.ml
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,8 @@ module TraceContext = struct

let empty = {traceparent= None; baggage= None}

let depth_key = "span.depth"

let with_traceparent traceparent ctx = {ctx with traceparent}

let with_baggage baggage ctx = {ctx with baggage}
Expand All @@ -230,6 +232,20 @@ module TraceContext = struct

let baggage_of ctx = ctx.baggage

let baggage_depth_of ctx =
Option.bind (baggage_of ctx) (List.assoc_opt depth_key)
|> Option.value ~default:"1"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is 1 a good default? Is the default not unlimited depth? Or has 1 special meaning? It seems there is quite a bit of int/string conversion and I assume that is because the this is a string/string map. If we need more structured data, the whole baggage could be a JSON value.

Do we have to do this conversion at every function boundary because we need to update the depth that is passed on?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is 1 a good default?

Everything ought to have a depth baggage, which is derived from the parent's depth + 1. If this doesn't exist, it must be a new root span (no parent) and should have depth 1. If there were another situation where the depth was empty, we wouldn't know any other value to pick anyway and choosing 1 errs on the side of keeping spans rather than throwing them away.

Do we have to do this conversion at every function boundary because we need to update the depth that is passed on?

Yes, it's really a limitation of the Span.attributes which are string * string. It would be nice for depth to keep it as an int to avoid these conversions but I think other baggage values can use string.

Using JSON for the bagage (and attributes) does sound like a good future improvement, as that's what it is trying to represent really, and is how it ends up when it's exported to jaeger/bugtool

|> int_of_string

let update_with_baggage k v ctx =
let new_baggage =
baggage_of ctx
|> Option.value ~default:[]
|> List.remove_assoc k
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we manipulate this a lot we might want to replace this with a StringMap in the future, for now it is probably ok if we only have 1 or 2 keys

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be called update? Always try to remove the key if it exists first and add the new k/v pair.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree that update makes more sense here :)

|> List.cons (k, v)
in
with_baggage (Some new_baggage) ctx

let parse input =
let open Astring.String in
let trim_pair (key, value) = (trim key, trim value) in
Expand Down Expand Up @@ -322,22 +338,36 @@ module Span = struct

let start ?(attributes = Attributes.empty)
?(trace_context : TraceContext.t option) ~name ~parent ~span_kind () =
let trace_id, extra_context =
let trace_id, extra_context, depth =
match parent with
| None ->
(Trace_id.make (), TraceContext.empty)
(Trace_id.make (), TraceContext.empty, 1)
| Some span_parent ->
(span_parent.context.trace_id, span_parent.context.trace_context)
( span_parent.context.trace_id
, span_parent.context.trace_context
, TraceContext.baggage_depth_of span_parent.context.trace_context + 1
)
in
let span_id = Span_id.make () in
let extra_context_with_depth =
TraceContext.(
update_with_baggage depth_key (string_of_int depth) extra_context
)
in
let context : SpanContext.t =
{trace_id; span_id; trace_context= extra_context}
{trace_id; span_id; trace_context= extra_context_with_depth}
in
let context =
(* If trace_context is provided to the call, override any inherited trace context. *)
trace_context
|> Option.fold ~none:context
~some:(Fun.flip SpanContext.with_trace_context context)
(* If trace_context is provided to the call, override any inherited trace
context except span.depth which should still be maintained. *)
match trace_context with
| Some tc ->
let tc_with_depth =
TraceContext.(update_with_baggage depth_key (string_of_int depth) tc)
in
SpanContext.with_trace_context tc_with_depth context
| None ->
context
in
(* Using gettimeofday over Mtime as it is better for sharing timestamps between the systems *)
let begin_time = Unix.gettimeofday () in
Expand Down Expand Up @@ -473,6 +503,11 @@ module Spans = struct

let set_max_traces x = Atomic.set max_traces x

(* Default is much larger than the largest current traces, so effectively off *)
let max_depth = Atomic.make 100

let set_max_depth x = Atomic.set max_depth x

let finished_spans = Atomic.make ([], 0)

let span_hashtbl_is_empty () = TraceMap.is_empty (Atomic.get spans)
Expand Down Expand Up @@ -713,12 +748,18 @@ module Tracer = struct
let get_tracer ~name:_ = TracerProvider.get_current ()

let span_of_span_context context name : Span.t =
let tc = SpanContext.context_of_span_context context in
let new_depth = TraceContext.baggage_depth_of tc in
let new_tc =
TraceContext.(update_with_baggage depth_key (string_of_int new_depth) tc)
in
let context = SpanContext.with_trace_context new_tc context in
{
context
; status= {status_code= Status.Unset; _description= None}
; name
; parent= None
; span_kind= SpanKind.Client (* This will be the span of the client call*)
; span_kind= SpanKind.Client (* This will be the span of the client call *)
; begin_time= Unix.gettimeofday ()
; end_time= None
; links= []
Expand All @@ -730,10 +771,23 @@ module Tracer = struct
?(span_kind = SpanKind.Internal) ~name ~parent () :
(Span.t option, exn) result =
let open TracerProvider in
(* Do not start span if the TracerProvider is disabled*)
let parent_depth =
Option.fold ~none:1
~some:(fun parent ->
parent.Span.context
|> SpanContext.context_of_span_context
|> TraceContext.baggage_depth_of
)
parent
in
(* Do not start span if the TracerProvider is disabled *)
if not t.enabled then
ok_none (* Do not start span if the max depth has been reached *)
else if parent_depth >= Atomic.get Spans.max_depth then (
let parent_trace_id = Option.fold ~none:"None" ~some:(fun p -> p.Span.context |> SpanContext.span_id_of_span_context |> Span_id.to_string) parent in
debug "Max_span_depth limit reached, not creating span %s (parent %s)" name parent_trace_id ;
ok_none
else
) else
let attributes = Attributes.merge_into t.attributes attributes in
let span =
Span.start ~attributes ?trace_context ~name ~parent ~span_kind ()
Expand All @@ -750,16 +804,24 @@ module Tracer = struct
|> Spans.remove_from_spans
|> Option.map (fun existing_span ->
let old_context = Span.get_context existing_span in
let parent_trace_context = Span.get_trace_context parent in
let new_depth =
TraceContext.baggage_depth_of parent_trace_context + 1
in
let new_context : SpanContext.t =
let trace_context = span.Span.context.trace_context in
let trace_context =
TraceContext.(
update_with_baggage depth_key (string_of_int new_depth)
span.Span.context.trace_context
)
in
SpanContext.context
(SpanContext.trace_id_of_span_context parent.context)
old_context.span_id
|> SpanContext.with_trace_context trace_context
in
let updated_span = {existing_span with parent= Some parent} in
let updated_span = {updated_span with context= new_context} in

let () = Spans.add_to_spans ~span:updated_span in
updated_span
)
Expand Down Expand Up @@ -926,7 +988,15 @@ module Propagator = struct
let trace_context' =
TraceContext.with_traceparent (Some traceparent) trace_context
in
let carrier' = P.inject_into trace_context' carrier in
let new_depth =
TraceContext.baggage_depth_of trace_context' + 1 |> string_of_int
in
let trace_context'' =
TraceContext.(
update_with_baggage depth_key new_depth trace_context'
)
in
let carrier' = P.inject_into trace_context'' carrier in
f carrier'
| _ ->
f carrier
Expand Down
2 changes: 2 additions & 0 deletions ocaml/libs/tracing/tracing.mli
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,8 @@ module Spans : sig

val set_max_traces : int -> unit

val set_max_depth : int -> unit

val span_count : unit -> int

val since : unit -> Span.t list * int
Expand Down
28 changes: 26 additions & 2 deletions ocaml/libs/tracing/tracing_export.ml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ let export_interval = ref 30.

let set_export_interval t = export_interval := t

let export_chunk_size = Atomic.make 10000

let set_export_chunk_size x = Atomic.set export_chunk_size x

let host_id = ref "localhost"

let set_host_id id = host_id := id
Expand Down Expand Up @@ -289,17 +293,37 @@ module Destination = struct
with exn ->
debug "Tracing: unable to export span : %s" (Printexc.to_string exn)

(*TODO move this into Spans.since itself? *)
let rec span_info_chunks span_info batch_size =
let rec list_to_chunks_inner l n curr chunks =
if n = 0 then
if List.length l > 0 then
list_to_chunks_inner l batch_size [] ((curr, batch_size) :: chunks)
else
(curr, batch_size) :: chunks
else
match l with
| [] -> (curr, List.length curr) :: chunks
| h :: t ->
list_to_chunks_inner t (n-1) (h :: curr) chunks
in
list_to_chunks_inner (fst span_info) batch_size [] []

let flush_spans () =
let ((_span_list, span_count) as span_info) = Spans.since () in
let attributes = [("export.traces.count", string_of_int span_count)] in
let@ parent =
with_tracing ~span_kind:Server ~trace_context:TraceContext.empty
~parent:None ~attributes ~name:"Tracing.flush_spans"
in
TracerProvider.get_tracer_providers ()
let endpoints = TracerProvider.get_tracer_providers ()
|> List.filter TracerProvider.get_enabled
|> List.concat_map TracerProvider.get_endpoints
|> List.iter (export_to_endpoint parent span_info)
in
let span_info_chunks = span_info_chunks span_info (Atomic.get export_chunk_size) in
List.iter (fun s_i ->
List.iter (export_to_endpoint parent s_i) endpoints
) span_info_chunks

let delay = Delay.make ()

Expand Down
7 changes: 7 additions & 0 deletions ocaml/libs/tracing/tracing_export.mli
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,13 @@ val set_export_interval : float -> unit
Default is every [30.] seconds.
*)

val set_export_chunk_size : int -> unit
(** [set_export_chunk_size size] sets the maximum number of finished spans that
can be exported in one chunk to [size].

Default is 10000 spans.
*)

val set_host_id : string -> unit
(** [set_host_id id] sets the id of the host to [id].

Expand Down
1 change: 1 addition & 0 deletions ocaml/tests/test_observer.ml
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ let verify_json_fields_and_values ~json =
; ("xs.host.uuid", `String _)
; ("xs.host.name", `String _)
; ("service.name", `String _)
; ("span.depth", `String _)
]
)
; ("annotations", `List _)
Expand Down
14 changes: 14 additions & 0 deletions ocaml/xapi/xapi_globs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -1051,10 +1051,14 @@ let trace_log_dir = ref "/var/log/dt/zipkinv2/json"

let export_interval = ref 30.

let export_chunk_size = ref 10000

let max_spans = ref 10000

let max_traces = ref 10000

let max_span_depth = ref 100

let use_xmlrpc = ref true

let compress_tracing_files = ref true
Expand Down Expand Up @@ -1671,6 +1675,11 @@ let other_options =
, (fun () -> string_of_float !export_interval)
, "The interval for exports in Tracing"
)
; ( "export-chunk-size"
, Arg.Set_int export_chunk_size
, (fun () -> string_of_int !export_chunk_size)
, "The span chunk size for exports in Tracing"
)
; ( "max-spans"
, Arg.Set_int max_spans
, (fun () -> string_of_int !max_spans)
Expand Down Expand Up @@ -1778,6 +1787,11 @@ let other_options =
, (fun () -> string_of_float !vm_sysprep_wait)
, "Time in seconds to wait for VM to recognise inserted CD"
)
; ( "max-span-depth"
, Arg.Set_int max_span_depth
, (fun () -> string_of_int !max_span_depth)
, "The maximum depth to which spans are recorded in a trace in Tracing"
)
]

(* The options can be set with the variable xapiflags in /etc/sysconfig/xapi.
Expand Down
2 changes: 2 additions & 0 deletions ocaml/xapi/xapi_observer.ml
Original file line number Diff line number Diff line change
Expand Up @@ -599,6 +599,8 @@ let initialise_observer ~__context component =
initialise_observer_component ~__context component

let initialise ~__context =
Tracing.Spans.set_max_depth !Xapi_globs.max_span_depth ;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do we need a similar initialization in xenopsd?
If the xapi/xenopsd propagation is fixed then the depth limit might be reached inside xenopsd instead, and currentlythat would always default to 100?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See how set_max_spans is implemented, introduce a new Observer.set_... function and call it from initialize_observer_meta.

Tracing_export.set_export_chunk_size !Xapi_globs.export_chunk_size ;
List.iter (initialise_observer_meta ~__context) (startup_components ()) ;
Db.Observer.get_all ~__context
|> List.iter (fun self ->
Expand Down
Loading