-
-
Notifications
You must be signed in to change notification settings - Fork 245
Description
I noticed that the instance names sometimes weren't rendered correctly while testing the Vector container overview dashboard. I investigated and it looks like there's a race condition when two contexts with two different containers are calling the /pmapi/indom
endpoint.
To verify this I wrote a tiny python script which:
- creates two contexts (one for each container; in this example the containers are named `grafana` and `grafana2`)
- calls `/pmapi/<ctx>/fetch` and `/pmapi/<ctx>/indom` for each context and checks whether the instance IDs match
It's using the cgroup.io.stat.wbytes
metric, i.e. the instances are disks (which shouldn't change).
#!/usr/bin/python3
import asyncio
import urllib.parse
import json
async def get_json(url):
    """Send a minimal HTTP/1.0 GET request for *url* and return the parsed JSON body.

    The request is written by hand over an asyncio stream connection to the
    host and port taken from *url*. The whole response is read, everything up
    to the first blank line is discarded as headers, and the remainder is
    decoded with ``json.loads``.

    Raises:
        OSError: if the connection cannot be established or fails.
        ValueError: if the response has no header/body separator or the body
            is not valid JSON.
    """
    # unfortunately Python doesn't come with a builtin async HTTP client
    url = urllib.parse.urlsplit(url)
    query = f"GET {url.path}?{url.query} HTTP/1.0\r\nHost: {url.hostname}\r\n\r\n"
    reader, writer = await asyncio.open_connection(url.hostname, url.port)
    try:
        writer.write(query.encode('latin-1'))
        # read() without a size reads until EOF, so this relies on the server
        # closing the connection once the response has been sent.
        response = await reader.read()
    finally:
        # Always release the socket, including when the request fails or the
        # task is cancelled part-way through; otherwise the connection leaks.
        writer.close()
    response = response.decode('latin-1')
    _headers, content = response.split('\r\n\r\n', maxsplit=1)
    return json.loads(content)
async def get_ctx(container=None):
    """Create a PMAPI context via the local web API and return its identifier.

    Args:
        container: Name of the container to attach the context to. When
            ``None`` (the default), no ``container`` parameter is added to the
            host specification, instead of sending the literal string "None".

    Returns:
        The value of the ``context`` key in the JSON response.
    """
    hostspec = "pcp://127.0.0.1"
    if container is not None:
        hostspec += f"?container={container}"
    # The host specification is itself a URL, so it must be escaped before it
    # is embedded as a query parameter value.
    hostspec = urllib.parse.quote_plus(hostspec)
    r = await get_json(f"http://localhost:44322/pmapi/context?hostspec={hostspec}&polltimeout=120")
    return r['context']
async def get_instids_from_indoms(ctx):
    """Return the instance ids reported by the indom endpoint for context *ctx*.

    Queries ``/pmapi/<ctx>/indom`` for the ``cgroup.io.stat.wbytes`` metric and
    collects the ``instance`` field of every entry under ``instances``.
    """
    endpoint = f"http://localhost:44322/pmapi/{ctx}/indom?name=cgroup.io.stat.wbytes"
    payload = await get_json(endpoint)
    instance_ids = []
    for entry in payload["instances"]:
        instance_ids.append(entry["instance"])
    return instance_ids
async def get_instids_from_fetch(ctx):
    """Return the instance ids reported by the fetch endpoint for context *ctx*.

    Queries ``/pmapi/<ctx>/fetch`` for the ``cgroup.io.stat.wbytes`` metric and
    collects the ``instance`` field of every entry under the first element of
    ``values``.
    """
    endpoint = f"http://localhost:44322/pmapi/{ctx}/fetch?name=cgroup.io.stat.wbytes"
    payload = await get_json(endpoint)
    # Only one metric is requested, so only the first value set is inspected.
    first_metric = payload["values"][0]
    instance_ids = []
    for entry in first_metric["instances"]:
        instance_ids.append(entry["instance"])
    return instance_ids
async def run_challenge(ctx):
    """Compare the instance ids from the fetch and indom endpoints for *ctx*.

    Prints one line: ``PASS`` when both endpoints report the same set of
    instance ids, ``FAIL`` when they differ, or ``FAIL network error`` followed
    by the exception when either request could not be completed or parsed.
    """
    try:
        fetch = str(sorted(await get_instids_from_fetch(ctx)))
        indoms = str(sorted(await get_instids_from_indoms(ctx)))
    except Exception as exc:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt
        # and (on Python 3.8+) task cancellation still propagate. The error is
        # included so the cause is not lost.
        print(f"FAIL network error: {exc!r}")
        return
    status = "PASS" if fetch == indoms else "FAIL"
    print(f"{status} context {ctx:>10} fetch {fetch:>10} indoms {indoms:>10}")
async def main():
    """Run the fetch/indom comparison sequentially and concurrently, twice.

    Two contexts are created, one for container ``grafana`` and one for
    ``grafana2``. Each round runs ten challenges per context one after the
    other ("Sync"), then the same number concurrently via ``asyncio.gather``
    ("Async").
    """
    ctx1 = await get_ctx(container='grafana')
    ctx2 = await get_ctx(container='grafana2')

    def build_batch():
        # Ten challenges for the first context followed by ten for the second.
        return [run_challenge(ctx) for ctx in (ctx1, ctx2) for _ in range(10)]

    for _round in range(2):
        print("Sync")
        for pending in build_batch():
            await pending
        print("Async")
        await asyncio.gather(*build_batch())
# Only start the reproduction when executed as a script, so importing this
# module does not open network connections as a side effect.
if __name__ == "__main__":
    asyncio.run(main())
which gives me an output like this:
Sync
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
Async
FAIL context 1913315220 fetch [0, 1, 2] indoms []
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1600781486 fetch [3, 4, 5] indoms []
FAIL context 1600781486 fetch [3, 4, 5] indoms []
FAIL context 1600781486 fetch [3, 4, 5] indoms [3, 5]
FAIL context 1600781486 fetch [3, 4, 5] indoms []
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
FAIL context 1600781486 fetch [3, 4, 5] indoms [5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
Sync
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1913315220 fetch [0, 1, 2] indoms [0, 1, 2]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
Async
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms []
FAIL context 1913315220 fetch [0, 1, 2] indoms []
FAIL context 1913315220 fetch [0, 1, 2] indoms []
FAIL context 1913315220 fetch [0, 1, 2] indoms [5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1600781486 fetch [3, 4, 5] indoms []
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
FAIL context 1600781486 fetch [3, 4, 5] indoms [5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
FAIL context 1600781486 fetch [3, 4, 5] indoms [5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
FAIL context 1913315220 fetch [0, 1, 2] indoms [3, 4, 5]
PASS context 1600781486 fetch [3, 4, 5] indoms [3, 4, 5]
Everything is fine if the requests are sent synchronously, but if they're sent asynchronously I get wrong results. The instance IDs from the fetch calls are always correct, but the indom calls return inconsistent data.
My guess is that there is some non-threadsafe function in the callstack of the indom response handler.