Skip to content

pmproxy: race condition with multiple containers/contexts and pmapi indom request #991

@andreasgerstmayr

Description

@andreasgerstmayr

I noticed that the instance names sometimes weren't rendered correctly while testing the Vector container overview dashboard. I investigated and it looks like there's a race condition when two contexts with two different containers are calling the /pmapi/indom endpoint.

To verify this, I wrote a tiny Python script which:

  • creates two contexts (one for each container, in the example containers named grafana and grafana2)
  • calls /pmapi/<ctx>/fetch and /pmapi/<ctx>/indom and checks if the instance ids match

It's using the cgroup.io.stat.wbytes metric, i.e. the instances are disks (which shouldn't change).

#!/usr/bin/python3
import asyncio
import urllib.parse
import json

async def get_json(url):
    """Fetch *url* with a minimal HTTP/1.0 GET and return the decoded JSON body.

    The HTTP status line and headers are not inspected; everything after the
    first blank line of the response is handed to ``json.loads``.

    Args:
        url: Absolute URL of the form ``http://host[:port]/path?query``.

    Returns:
        The object produced by ``json.loads`` on the response body.

    Raises:
        OSError: if the connection cannot be established.
        ValueError: if the response has no header/body separator or the body
            is not valid JSON.
    """
    # unfortunately Python doesn't come with a builtin async HTTP client
    parts = urllib.parse.urlsplit(url)
    request = f"GET {parts.path}?{parts.query} HTTP/1.0\r\nHost: {parts.hostname}\r\n\r\n"
    # Fall back to the default HTTP port when the URL does not name one.
    reader, writer = await asyncio.open_connection(parts.hostname, parts.port or 80)
    try:
        writer.write(request.encode('latin-1'))
        # read() without a size reads until EOF, so this relies on the peer
        # closing the connection once the HTTP/1.0 response has been sent.
        raw = await reader.read()
    finally:
        # Release the socket even if the read fails.
        writer.close()

    _headers, content = raw.decode('latin-1').split('\r\n\r\n', maxsplit=1)
    return json.loads(content)

async def get_ctx(container=None):
    """Create a pmproxy context and return its context identifier.

    Args:
        container: Name of the container to attach the context to. When
            ``None`` the ``container`` host-spec attribute is left out
            entirely instead of being sent as the literal text "None".

    Returns:
        The value of the ``context`` key in the JSON response.
    """
    hostspec = "pcp://127.0.0.1"
    if container is not None:
        hostspec += f"?container={container}"
    # The host spec is itself a URL, so it must be escaped before being
    # embedded as a query parameter value.
    hostspec = urllib.parse.quote_plus(hostspec)
    r = await get_json(f"http://localhost:44322/pmapi/context?hostspec={hostspec}&polltimeout=120")
    return r['context']

async def get_instids_from_indoms(ctx):
    """Return the instance ids listed by the indom endpoint for context *ctx*.

    Queries ``/pmapi/<ctx>/indom`` for ``cgroup.io.stat.wbytes`` and collects
    the ``instance`` field of every entry under ``instances``, in response
    order.
    """
    endpoint = f"http://localhost:44322/pmapi/{ctx}/indom?name=cgroup.io.stat.wbytes"
    payload = await get_json(endpoint)
    instids = []
    for entry in payload["instances"]:
        instids.append(entry["instance"])
    return instids

async def get_instids_from_fetch(ctx):
    """Return the instance ids reported by the fetch endpoint for context *ctx*.

    Queries ``/pmapi/<ctx>/fetch`` for ``cgroup.io.stat.wbytes`` and collects
    the ``instance`` field of every entry under the first element of
    ``values``, in response order.
    """
    endpoint = f"http://localhost:44322/pmapi/{ctx}/fetch?name=cgroup.io.stat.wbytes"
    payload = await get_json(endpoint)
    first_metric = payload["values"][0]
    instids = []
    for entry in first_metric["instances"]:
        instids.append(entry["instance"])
    return instids

async def run_challenge(ctx):
    """Compare fetch and indom instance ids for *ctx* and print the verdict.

    Prints one ``PASS``/``FAIL`` line showing the context and both sorted
    instance-id lists, or ``FAIL network error`` if either request (or the
    decoding of its response) raises.

    Args:
        ctx: Context identifier to query.
    """
    try:
        fetch = str(sorted(await get_instids_from_fetch(ctx)))
        indoms = str(sorted(await get_instids_from_indoms(ctx)))
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # (and, on Python 3.8+, asyncio.CancelledError) derive from
        # BaseException and must propagate rather than be reported as a
        # failed challenge.
        print("FAIL network error")
        return

    verdict = "PASS" if fetch == indoms else "FAIL"
    print(f"{verdict} context {ctx:>10} fetch {fetch:>10} indoms {indoms:>10}")

async def main():
    """Run the reproduction: two rounds of sequential then concurrent checks.

    Creates one context per container (``grafana`` and ``grafana2``). Each
    round runs ten challenges per context one after another ("Sync"), then
    the same number concurrently through ``asyncio.gather`` ("Async").
    """
    first_ctx = await get_ctx(container='grafana')
    second_ctx = await get_ctx(container='grafana2')

    def make_batch():
        # Ten challenges for the first context followed by ten for the second.
        batch = []
        for context in (first_ctx, second_ctx):
            batch.extend(run_challenge(context) for _ in range(10))
        return batch

    for _round in range(2):
        print("Sync")
        for pending in make_batch():
            await pending

        print("Async")
        await asyncio.gather(*make_batch())

# Only start the reproduction when executed as a script, so that importing
# this module does not fire requests at pmproxy.
if __name__ == "__main__":
    asyncio.run(main())

which gives me an output like this:

Sync
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
Async
FAIL context 1913315220 fetch  [0, 1, 2] indoms         []
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms        [5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms     [4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms     [4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1600781486 fetch  [3, 4, 5] indoms         []
FAIL context 1600781486 fetch  [3, 4, 5] indoms         []
FAIL context 1600781486 fetch  [3, 4, 5] indoms     [3, 5]
FAIL context 1600781486 fetch  [3, 4, 5] indoms         []
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
FAIL context 1600781486 fetch  [3, 4, 5] indoms        [5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
Sync
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1913315220 fetch  [0, 1, 2] indoms  [0, 1, 2]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
Async
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms         []
FAIL context 1913315220 fetch  [0, 1, 2] indoms         []
FAIL context 1913315220 fetch  [0, 1, 2] indoms         []
FAIL context 1913315220 fetch  [0, 1, 2] indoms        [5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1600781486 fetch  [3, 4, 5] indoms         []
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
FAIL context 1600781486 fetch  [3, 4, 5] indoms        [5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
FAIL context 1600781486 fetch  [3, 4, 5] indoms        [5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]
FAIL context 1913315220 fetch  [0, 1, 2] indoms  [3, 4, 5]
PASS context 1600781486 fetch  [3, 4, 5] indoms  [3, 4, 5]

Everything is fine if the requests are sent synchronously, but if they're sent asynchronously I get wrong results. The instance IDs from the fetch call are always correct, but the indom calls return inconsistent data.

My guess is that there is some non-thread-safe function in the call stack of the indom response handler.

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions