diff --git a/api/api_tests/internal/transform/test_embed_text.py b/api/api_tests/internal/transform/test_embed_text.py index a30adbc62..b524f90bd 100644 --- a/api/api_tests/internal/transform/test_embed_text.py +++ b/api/api_tests/internal/transform/test_embed_text.py @@ -369,10 +369,12 @@ def test_async_request_handler_empty_prompts_list(mock_make_async_request): assert result == [] -@patch("nv_ingest_api.internal.transform.embed_text.infer_microservice") -def test_make_async_request_happy_path(im_mock): +@patch("nv_ingest_api.util.nim.infer_microservice") +@patch(f"{MODULE_UNDER_TEST}.infer_microservice", create=True) +def test_make_async_request_happy_path(module_im_mock, nim_im_mock): # Assign - im_mock.return_value = [[0.1, 0.2, 0.3]] + nim_im_mock.return_value = [[0.1, 0.2, 0.3]] + module_im_mock.return_value = [[0.1, 0.2, 0.3]] # Act result = module_under_test._make_async_request( prompts=["Hello world"], @@ -385,8 +387,8 @@ def test_make_async_request_happy_path(im_mock): filter_errors=False, dimensions=None, ) - # Assert: client called as expected - im_mock.assert_called_once_with( + # Assert: client called as expected (module-level import is used in embed_text) + module_im_mock.assert_called_once_with( ["Hello world"], "dummy_model", embedding_endpoint="http://dummy-endpoint", @@ -403,10 +405,12 @@ def test_make_async_request_happy_path(im_mock): assert result == {"embedding": [[0.1, 0.2, 0.3]], "info_msg": None} -@patch("nv_ingest_api.internal.transform.embed_text.infer_microservice") -def test_make_async_request_failure_returns_none_embedding_and_info_message(im_mock): +@patch("nv_ingest_api.util.nim.infer_microservice") +@patch(f"{MODULE_UNDER_TEST}.infer_microservice", create=True) +def test_make_async_request_failure_returns_none_embedding_and_info_message(module_im_mock, nim_im_mock): # Arrange - im_mock.side_effect = RuntimeError("Simulated client failure") + nim_im_mock.side_effect = RuntimeError("Simulated client failure") + module_im_mock.side_effect = RuntimeError("Simulated client failure") # Act & Assert with pytest.raises(RuntimeError) as excinfo: diff --git a/api/src/nv_ingest_api/internal/primitives/tracing/tagging.py b/api/src/nv_ingest_api/internal/primitives/tracing/tagging.py index dd771152d..75715bd57 100644 --- a/api/src/nv_ingest_api/internal/primitives/tracing/tagging.py +++ b/api/src/nv_ingest_api/internal/primitives/tracing/tagging.py @@ -15,7 +15,7 @@ def traceable(trace_name: Optional[str] = None): """ - A decorator that adds entry and exit trace timestamps to a IngestControlMessage's metadata + A decorator that adds entry and exit trace timestamps to an IngestControlMessage's metadata based on the presence of a 'config::add_trace_tagging' flag. This decorator checks if the 'config::add_trace_tagging' flag is set to True in the @@ -37,7 +37,7 @@ def traceable(trace_name: Optional[str] = None): Notes ----- - The decorated function must accept a IngestControlMessage object as one of its arguments. + The decorated function must accept an IngestControlMessage object as one of its arguments. For a regular function, this is expected to be the first argument; for a class method, this is expected to be the second argument (after 'self'). 
The IngestControlMessage object must implement `has_metadata`, `get_metadata`, and `set_metadata` methods used by the decorator @@ -51,7 +51,7 @@ def traceable(trace_name: Optional[str] = None): -------- Automatic stage name detection (recommended): - >>> @traceable() # Uses self.stage_name automatically + >>> @traceable() # Uses self.stage_name automatically ... def process_message(self, message): ... pass @@ -253,14 +253,14 @@ def set_trace_timestamps_with_parent_context(control_message, execution_trace_lo -------- Basic usage in a stage: - >>> execution_trace_log = {"trace::entry::yolox_inference": ts1, "trace::exit::yolox_inference": ts2} + >>> execution_trace_log = {"trace::entry::yolox_inference": ts1, "trace::exit::yolox_inference": ts2} # noqa >>> set_trace_timestamps_with_parent_context( ... control_message, execution_trace_log, "pdf_extractor", logger ... ) This transforms: - trace::entry::yolox_inference -> trace::entry::pdf_extractor::yolox_inference - - trace::exit::yolox_inference -> trace::exit::pdf_extractor::yolox_inference + - trace::exit::yolox_inference -> trace::exit::pdf_extractor::yolox_inference """ if not execution_trace_log: return diff --git a/assets/style.css b/assets/style.css new file mode 100644 index 000000000..36fe2ebba --- /dev/null +++ b/assets/style.css @@ -0,0 +1,111 @@ +/* Global fonts and tokens */ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&display=swap'); + +:root { + --bg: #0f1115; + --surface: #151821; + --surface-2: #1b2030; + --text: #e5e7eb; + --text-muted: #9aa3b2; + --border: #2a2f3d; + --accent: #4f46e5; + --accent-2: #06b6d4; + --success: #10b981; + --danger: #ef4444; + --warning: #f59e0b; + --radius: 8px; + --shadow: 0 1px 2px rgba(0,0,0,0.25), 0 8px 24px rgba(0,0,0,0.18); + --gap-xs: 4px; --gap-sm: 8px; --gap-md: 12px; --gap-lg: 16px; --gap-xl: 24px; +} + +/* Light theme support via data-theme=light on body (optional) */ +body[data-theme="light"] { + --bg: #ffffff; + --surface: #f6f7fb; + --surface-2: #eef1f7; + --text: #111827; + --text-muted: #6b7280; + --border: #e5e7eb; +} + +html, body { height: 100%; } +body { + margin: 0; + background: var(--bg); + color: var(--text); + font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; + font-size: 14px; + line-height: 1.5; +} + +/* Layout */ +.grid { display: grid; grid-template-columns: 300px 1fr; gap: var(--gap-lg); align-items: start; } +.sidebar { + position: sticky; top: var(--gap-lg); + padding: var(--gap-lg); + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius); + box-shadow: var(--shadow); +} +.section-title { font-size: 16px; font-weight: 600; margin: var(--gap-sm) 0; } +.label { color: var(--text-muted); font-size: 12px; text-transform: uppercase; letter-spacing: 0.04em; margin-bottom: 4px; display: block; } +.help, .muted { color: var(--text-muted); font-size: 12px; } +.control { margin: var(--gap-md) 0; } +.kpis { display: grid; grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); gap: var(--gap-md); margin: var(--gap-sm) 0 var(--gap-lg); } +.graph { margin-bottom: var(--gap-lg); background: var(--surface); border: 1px solid var(--border); border-radius: var(--radius); padding: var(--gap-sm); } + +/* Inputs & Buttons */ +input, textarea { + background: var(--surface-2); + color: var(--text); + border: 1px solid var(--border); + border-radius: 6px; + padding: 8px 10px; +} +input::placeholder, textarea::placeholder { color: var(--text-muted); } +button, .button 
{ + appearance: none; border: 1px solid var(--border); background: var(--surface-2); + color: var(--text); padding: 6px 12px; border-radius: 6px; cursor: pointer; +} +button:hover { border-color: color-mix(in srgb, var(--accent) 35%, var(--border)); } +button.primary { background: var(--accent); border-color: var(--accent); color: #fff; } +button.primary:hover { background: color-mix(in srgb, var(--accent) 85%, #000); } + +/* Dash core components */ +/* RadioItems */ +input[type="radio"] { accent-color: var(--accent); } +input[type="checkbox"] { accent-color: var(--accent); } + +/* Dropdown (react-select) */ +.Select-control { background: var(--surface-2); border-color: var(--border); color: var(--text); } +.Select--single > .Select-control .Select-value, .Select-placeholder { color: var(--text); } +.Select-menu-outer { background: var(--surface-2); border-color: var(--border); color: var(--text); z-index: 1000; } +.Select-option { background: var(--surface-2); color: var(--text); } +.Select-option.is-focused { background: color-mix(in srgb, var(--accent) 14%, var(--surface-2)); } +.Select-option.is-selected { background: color-mix(in srgb, var(--accent) 28%, var(--surface-2)); } + +/* Slider (rc-slider) */ +.rc-slider { padding: 6px 0; } +.rc-slider-rail { background: var(--border); } +.rc-slider-track { background: var(--accent); } +.rc-slider-handle { border-color: var(--accent); background: #fff; } + +/* Tabs */ +.tabs, .dash-tabs { background: transparent; } +.tab { background: var(--surface); border: 1px solid var(--border) !important; color: var(--text); border-radius: 6px 6px 0 0; margin-right: 4px; padding: 8px 12px; } +.tab--selected { border-bottom-color: transparent !important; background: var(--surface-2); } + +/* Upload */ +.dccUpload { border: 1px dashed var(--border); border-radius: 6px; padding: 6px 10px; color: var(--text-muted); } + +/* Cytoscape container */ +#proctree-cyto-container { background: var(--surface); border: 1px solid var(--border); border-radius: var(--radius); } +#proctree-graph { background: var(--surface); } + +/* Helper spacing */ +hr { border: none; border-top: 1px solid var(--border); margin: var(--gap-md) 0; opacity: 0.7; } + +/* Plotly figure background harmonization */ +.js-plotly-plot .plotly .bg, .js-plotly-plot .plotly .bglayer, .js-plotly-plot .plotly .plot { background: transparent !important; } +.js-plotly-plot .plotly .infolayer { color: var(--text); } diff --git a/client/src/nv_ingest_client/client/ingest_job_handler.py b/client/src/nv_ingest_client/client/ingest_job_handler.py index c3c9af815..c2f81dd2a 100644 --- a/client/src/nv_ingest_client/client/ingest_job_handler.py +++ b/client/src/nv_ingest_client/client/ingest_job_handler.py @@ -259,6 +259,59 @@ def _save_response_data( clean_doc_name = os.path.basename(doc_name) output_name = f"{clean_doc_name}.metadata.json" + # Additionally, write out parallel files for trace timings and annotations (if present) + try: + os.makedirs(output_directory, exist_ok=True) + except Exception: + # Best-effort; directory should generally exist already + pass + + # Build primitive breakdown from response_data + primitive_total: int = 0 + primitive_counts_by_type: Dict[str, int] = defaultdict(int) + structured_by_subtype: Dict[str, int] = defaultdict(int) + try: + for document in response_data: + # Each document is treated as one primitive + primitive_total += 1 + meta: Dict[str, Any] = document.get("metadata", {}) + content_meta: Dict[str, Any] = meta.get("content_metadata", {}) + doc_type: str = 
content_meta.get("type", "unknown") + primitive_counts_by_type[doc_type] += 1 + if doc_type == "structured": + subtype: str = content_meta.get("subtype", "unknown") + structured_by_subtype[subtype] += 1 + except Exception: + # Be resilient; don't let counting failures block output + pass + + # Merge trace (if any) with primitive counts and always write a traces file + try: + trace_obj = response.get("trace") or response.get("traces") or {} + trace_out = dict(trace_obj) + trace_out["primitive_counts"] = { + "total": primitive_total, + "by_type": dict(primitive_counts_by_type), + "structured_by_subtype": dict(structured_by_subtype), + } + + trace_path = os.path.join(output_directory, f"{clean_doc_name}.traces.json") + with open(trace_path, "w") as f: + f.write(json.dumps(trace_out, indent=2)) + logger.debug("Wrote trace output to %s", trace_path) + except Exception as e: + logger.error("Failed to write traces for %s: %s", clean_doc_name, e) + + annotations_obj = response.get("annotations") + if annotations_obj: + try: + annotations_path = os.path.join(output_directory, f"{clean_doc_name}.annotations.json") + with open(annotations_path, "w") as f: + f.write(json.dumps(annotations_obj, indent=2)) + logger.debug("Wrote annotations output to %s", annotations_path) + except Exception as e: + logger.error("Failed to write annotations for %s: %s", clean_doc_name, e) + # Organize by document type doc_map: Dict[str, List[Dict[str, Any]]] = {} for document in response_data: diff --git a/scripts/support/__init__.py b/scripts/support/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/scripts/support/system_monitor/README.md b/scripts/support/system_monitor/README.md new file mode 100644 index 000000000..a7c3ef1d7 --- /dev/null +++ b/scripts/support/system_monitor/README.md @@ -0,0 +1,166 @@ +# System Monitor + +A lightweight system tracing and dashboard toolkit with two primary workflows: + +1) Real-time monitoring: start/stop tracing directly from the dashboard UI and visualize live metrics. +2) Offline exploration: collect Parquet/CSV on one system and explore it on another system without a running tracer. + +This package contains: +- `system_monitor.py`: Dash dashboard (UI, charts, event annotations, process tree). +- `system_tracer.py`: Tracer library and CLI (collects metrics, writes Parquet atomically). + + +## Requirements + +- Python 3.9+ +- Core: `pandas`, `psutil`, `plotly`, `dash`, `click` +- Optional: + - `pyarrow` (preferred) or `fastparquet` for Parquet + - `dash-cytoscape` for process tree graph view (text view works without it) + - `docker` Python package and a local Docker daemon for container metrics + - `pynvml` for NVIDIA GPU metrics (if NVIDIA drivers are present) + +Install common dependencies (example): + +```bash +pip install pandas psutil plotly dash click pyarrow +# Optional extras +pip install fastparquet dash-cytoscape docker pynvml +``` + +Environment setup +- Ensure the `system_monitor` package is importable at the top level. If you are running from the repo without installing, set PYTHONPATH so `python -m system_monitor` works: + +```bash +export PYTHONPATH=$(pwd)/scripts/support:$PYTHONPATH +``` + +Alternatively, install the package into your environment (recommended for reuse). If you maintain a packaging config, use `pip install -e .` at the repo root. + + +## Quickstart A: Real-time Monitoring (single machine) + +Launch the dashboard on the machine you want to monitor and control tracing from the UI. 
+ +```bash +python -m system_monitor --datafile system_monitor.parquet --port 8050 +``` + +- Open the URL printed (default http://0.0.0.0:8050). +- In the left sidebar under Tracing: + - Set Output Parquet Path (defaults to `system_monitor.parquet`). + - Adjust Sampling and Write Interval. + - Toggle Enable GPU / Enable Docker as needed. + - Click Start to begin live tracing. Click Stop to end. Click Snapshot Now to force an immediate write. +- The graphs update as data is written. You can: + - Switch theme (Light/Dark) + - Change time range and smoothing + - Add/import events and toggle event markers + - Inspect the process tree (text or Cytoscape if installed) + +Notes +- Writes are atomic (tmp + replace) to avoid partial reads. +- If `pyarrow` is unavailable, the tracer falls back to `fastparquet` via pandas. +- Docker/GPU stats are optional and automatically disabled if their deps/daemons are unavailable. +- Timezones: Graphs default to Local display time. You can switch graphs to UTC or a custom IANA zone. Use the "Data timezone (source)" selector if your data was recorded in UTC. + + +## Quickstart B: Offline Data Collection and Exploration (two machines) + +Use one system to collect data (headless), then transfer the file to another system for exploration in the dashboard. + +1) Collect on Source (headless CLI): + +```bash +# Run continuously until interrupted (local timestamps by default) +python -m system_monitor.system_tracer run \ + --output /tmp/system_monitor.parquet \ + --sample-interval 2 \ + --write-interval 10 + +# Record timestamps in UTC instead of local +python -m system_monitor.system_tracer run \ + --output /tmp/system_monitor_utc.parquet \ + --sample-interval 2 \ + --write-interval 10 \ + --utc + +# Or run for a fixed duration (e.g., 5 minutes) +python -m system_monitor.system_tracer run \ + --output /tmp/system_monitor.parquet \ + --sample-interval 2 \ + --write-interval 10 \ + --duration 300 +``` + +2) Transfer the Parquet/CSV to your analysis machine: + +```bash +scp source:/tmp/system_monitor.parquet ./ +``` + +3) Explore on Destination (no tracer needed): + +```bash +python -m system_monitor --datafile ./system_monitor.parquet --port 8050 +``` + +- The dashboard loads the provided file and renders metrics. +- Tracing controls in the UI only affect the local machine; they are independent of the loaded file. + + +## Process Tree Inspection + +From dashboard: +- Go to the Process Tree view. +- Enter a PID or use the PID finder to search by name/command. +- Click Inspect to load the tree and thread counts. +- If `dash-cytoscape` is installed, toggle to Graph view for a visual tree with node details. + +From CLI: + +```bash +python -m system_monitor.system_tracer proctree +``` + + +## Events and Timezones + +- Add events via the sidebar date/time picker. +- Import CSV with two columns: `event,timestamp`. Timestamps are normalized internally. +- Display timezone can be set to Local, UTC, or a custom IANA zone. Event markers and data align accordingly. + + +## Tips & Troubleshooting + +- Parquet engines: Install `pyarrow` for best compatibility. `fastparquet` is used as a fallback. +- Docker metrics: Requires the Docker daemon running and the `docker` Python package. If unavailable, container graphs will be empty. +- GPU metrics: Requires `pynvml` and NVIDIA drivers. If unavailable, GPU graphs will be empty. +- Assets override: Set `SYSTEM_MONITOR_ASSETS` to point to a custom assets directory if desired. 
+- Permissions: Some process/thread info may require elevated privileges; run as a user with sufficient permissions if you see AccessDenied errors. + + +## Programmatic API (optional) + +Collect a one-off snapshot in Python: + +```python +from system_monitor.system_tracer import collect_system_snapshot +snap = collect_system_snapshot(enable_gpu=False, enable_docker=False) +``` + +Run tracer to Parquet in-process: + +```python +from system_monitor.system_tracer import monitor_to_parquet +monitor_to_parquet(output_file="system_monitor.parquet", sample_interval=2, write_interval=10) +``` + +UTC vs Local +- Tracer defaults to local timestamps. Pass `--utc` to record timestamps in UTC. +- Dashboard defaults to Local display time. Use the "Display timezone" control to switch to UTC or a custom IANA zone, and "Data timezone (source)" to inform the dashboard whether your stored data timestamps are Local or UTC. + + +## License + +Internal project module; follow repository licensing and contribution guidelines. diff --git a/scripts/support/system_monitor/__init__.py b/scripts/support/system_monitor/__init__.py new file mode 100644 index 000000000..9939ca4e2 --- /dev/null +++ b/scripts/support/system_monitor/__init__.py @@ -0,0 +1,11 @@ +"""System Monitor package. + +Provides the canonical entry point for the dashboard and tracer. + +Usage: + python -m system_monitor --datafile system_monitor.parquet +""" + +from .system_monitor import run_dashboard + +__all__ = ["run_dashboard"] diff --git a/scripts/support/system_monitor/__main__.py b/scripts/support/system_monitor/__main__.py new file mode 100644 index 000000000..24b1cf381 --- /dev/null +++ b/scripts/support/system_monitor/__main__.py @@ -0,0 +1,19 @@ +# flake8: noqa +# noqa +""" +Module entry point for the System Monitor dashboard. 
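+
+If a bundled assets directory ships with the package, it is exported via the
+SYSTEM_MONITOR_ASSETS environment variable (unless already set) before the
+dashboard module is imported, so the dashboard can locate the packaged styles.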
+ +Run with: + python -m system_monitor --datafile system_monitor.parquet +""" +import os + +_HERE = os.path.abspath(os.path.dirname(__file__)) +_PKG_ASSETS = os.path.join(_HERE, "assets") +if os.path.isdir(_PKG_ASSETS) and not os.environ.get("SYSTEM_MONITOR_ASSETS"): + os.environ["SYSTEM_MONITOR_ASSETS"] = _PKG_ASSETS + +from .system_monitor import run_dashboard + +if __name__ == "__main__": + run_dashboard() diff --git a/scripts/support/system_monitor/assets/style.css b/scripts/support/system_monitor/assets/style.css new file mode 100644 index 000000000..ea4fa438a --- /dev/null +++ b/scripts/support/system_monitor/assets/style.css @@ -0,0 +1,137 @@ +/* Global fonts and tokens */ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&display=swap'); + +:root { + --bg: #0f1115; + --surface: #151821; + --surface-2: #1b2030; + --text: #e5e7eb; + --text-muted: #9aa3b2; + --border: #2a2f3d; + --accent: #4f46e5; + --accent-2: #06b6d4; + --success: #10b981; + --danger: #ef4444; + --warning: #f59e0b; + --radius: 8px; + --shadow: 0 1px 2px rgba(0,0,0,0.25), 0 8px 24px rgba(0,0,0,0.18); + --gap-xs: 4px; --gap-sm: 8px; --gap-md: 12px; --gap-lg: 16px; --gap-xl: 24px; +} + +/* Light theme support via data-theme=light on body (optional) */ +body[data-theme="light"] { + --bg: #ffffff; + --surface: #f6f7fb; + --surface-2: #eef1f7; + --text: #111827; + --text-muted: #6b7280; + --border: #e5e7eb; +} + +html, body { height: 100%; } +body { + margin: 0; + background: var(--bg); + color: var(--text); + font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif; + font-size: 14px; + line-height: 1.5; +} + +/* Layout */ +.grid { display: grid; grid-template-columns: 320px 1fr; gap: var(--gap-lg); align-items: start; } +.sidebar { + position: sticky; top: var(--gap-lg); + padding: var(--gap-lg); + background: var(--surface); + border: 1px solid var(--border); + border-radius: var(--radius); + box-shadow: var(--shadow); +} +.section-title { font-size: 16px; font-weight: 600; margin: var(--gap-sm) 0; } +.label { color: var(--text-muted); font-size: 12px; text-transform: uppercase; letter-spacing: 0.04em; margin-bottom: 4px; display: block; } +.help, .muted { color: var(--text-muted); font-size: 12px; } +.control { margin: var(--gap-md) 0; } +.kpis { display: grid; grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); gap: var(--gap-md); margin: var(--gap-sm) 0 var(--gap-lg); } +.graph { margin-bottom: var(--gap-lg); background: var(--surface); border: 1px solid var(--border); border-radius: var(--radius); padding: var(--gap-sm); } + +/* Inputs & Buttons */ +input, textarea { + background: var(--surface-2); + color: var(--text); + border: 1px solid var(--border); + border-radius: 6px; + padding: 8px 10px; +} +input::placeholder, textarea::placeholder { color: var(--text-muted); } +button, .button { + appearance: none; border: 1px solid var(--border); background: var(--surface-2); + color: var(--text); padding: 6px 12px; border-radius: 6px; cursor: pointer; +} +button:hover { border-color: color-mix(in srgb, var(--accent) 35%, var(--border)); } +button.primary { background: var(--accent); border-color: var(--accent); color: #fff; } +button.primary:hover { background: color-mix(in srgb, var(--accent) 85%, #000); } + +/* Dash core components */ +/* RadioItems */ +input[type="radio"] { accent-color: var(--accent); } +input[type="checkbox"] { accent-color: var(--accent); } + +/* Dropdown (react-select) */ +.Select-control { background: var(--surface-2); 
border-color: var(--border); color: var(--text); } +.Select--single > .Select-control .Select-value, .Select-placeholder { color: var(--text); } +.Select-menu-outer { background: var(--surface-2); border-color: var(--border); color: var(--text); z-index: 1000; } +.Select-option { background: var(--surface-2); color: var(--text); } +.Select-option.is-focused { background: color-mix(in srgb, var(--accent) 14%, var(--surface-2)); } +.Select-option.is-selected { background: color-mix(in srgb, var(--accent) 28%, var(--surface-2)); } + +/* Slider (rc-slider) */ +.rc-slider { padding: 6px 0; } +.rc-slider-rail { background: var(--border); } +.rc-slider-track { background: var(--accent); } +.rc-slider-handle { border-color: var(--accent); background: #fff; } + +/* Tabs */ +.tabs, .dash-tabs { background: transparent; } +.tab { background: var(--surface); border: 1px solid var(--border) !important; color: var(--text); border-radius: 6px 6px 0 0; margin-right: 4px; padding: 8px 12px; } +.tab--selected { border-bottom-color: transparent !important; background: var(--surface-2); } + +/* Upload */ +.dccUpload { border: 1px dashed var(--border); border-radius: 6px; padding: 6px 10px; color: var(--text-muted); } + +/* Cytoscape container */ +#proctree-cyto-container { background: var(--surface); border: 1px solid var(--border); border-radius: var(--radius); } +#proctree-graph { background: var(--surface); } + +/* Helper spacing */ +hr { border: none; border-top: 1px solid var(--border); margin: var(--gap-md) 0; opacity: 0.7; } + +/* Plotly figure background harmonization */ +.js-plotly-plot .plotly .bg, .js-plotly-plot .plotly .bglayer, .js-plotly-plot .plotly .plot { background: transparent !important; } +.js-plotly-plot .plotly .infolayer { color: var(--text); } + +/* Sidebar spacing & control layout improvements */ +/* Add breathing room between stacked elements inside controls */ +.sidebar .control > *:not(:last-child) { margin-bottom: var(--gap-sm); } + +/* Space out button groups (e.g., Start/Stop/Reset/Snapshot) */ +.sidebar .control button, +.sidebar .control .button { margin-right: var(--gap-sm); margin-bottom: var(--gap-sm); } + +/* Make text/number inputs full-width and comfortable in the sidebar */ +.sidebar .control input[type="text"], +.sidebar .control input[type="number"] { width: 100%; box-sizing: border-box; } + +/* RadioItems and Checklist: wrap and space options nicely */ +.sidebar .control .dash-radio-items, +.sidebar .control .dash-checklist { display: flex; flex-wrap: wrap; gap: 6px 12px; } +.sidebar .control .dash-radio-items label, +.sidebar .control .dash-checklist label { margin: 0; display: inline-flex; align-items: center; gap: 6px; } + +/* Upload widget spacing */ +.sidebar .control .dccUpload { margin-top: var(--gap-sm); } + +/* Details sections: subtle separators and clickable summaries */ +.sidebar details { padding: 8px 0; border-top: 1px solid var(--border); } +.sidebar details:first-of-type { border-top: none; } +.sidebar details > summary { cursor: pointer; padding: 6px 0; } diff --git a/scripts/support/system_monitor/callbacks/__init__.py b/scripts/support/system_monitor/callbacks/__init__.py new file mode 100644 index 000000000..e51c9195c --- /dev/null +++ b/scripts/support/system_monitor/callbacks/__init__.py @@ -0,0 +1,28 @@ +from typing import Any + + +def register_callbacks(app: Any, *, cy_available: bool) -> None: + """Register all dashboard callbacks grouped by domain. 
+ + This function aggregates domain callback registration to keep the main + system_monitor.py body declarative. + + Parameters + ---------- + app : Any + Dash app instance. + cy_available : bool + Whether dash_cytoscape is available (for Process Tree graph callbacks). + """ + from .overview import register_overview_callbacks + from .proctree import register_proctree_callbacks + from .events import register_events_callbacks + from .containers import register_containers_callbacks + from .theme import register_theme_callbacks + + # Registration order is chosen for readability; the registrars are functionally independent. + register_overview_callbacks(app) + register_proctree_callbacks(app, cy_available=cy_available) + register_events_callbacks(app) + register_containers_callbacks(app) + register_theme_callbacks(app) diff --git a/scripts/support/system_monitor/callbacks/containers.py b/scripts/support/system_monitor/callbacks/containers.py new file mode 100644 index 000000000..aae098b62 --- /dev/null +++ b/scripts/support/system_monitor/callbacks/containers.py @@ -0,0 +1,10 @@ +from typing import Any + + +def register_containers_callbacks(app: Any) -> None: + """Register containers tab callbacks. + + Placeholder registrar; existing callbacks remain in system_monitor.py + until migrated. + """ + return diff --git a/scripts/support/system_monitor/callbacks/events.py b/scripts/support/system_monitor/callbacks/events.py new file mode 100644 index 000000000..7b1d8fc3d --- /dev/null +++ b/scripts/support/system_monitor/callbacks/events.py @@ -0,0 +1,10 @@ +from typing import Any + + +def register_events_callbacks(app: Any) -> None: + """Register events and annotations callbacks. + + Placeholder registrar; existing callbacks remain in system_monitor.py + until migrated. + """ + return diff --git a/scripts/support/system_monitor/callbacks/overview.py b/scripts/support/system_monitor/callbacks/overview.py new file mode 100644 index 000000000..f981c2858 --- /dev/null +++ b/scripts/support/system_monitor/callbacks/overview.py @@ -0,0 +1,10 @@ +from typing import Any + + +def register_overview_callbacks(app: Any) -> None: + """Register overview tab callbacks. + + Placeholder registrar; existing callbacks remain in system_monitor.py + until migrated.
+ """ + return diff --git a/scripts/support/system_monitor/callbacks/proctree.py b/scripts/support/system_monitor/callbacks/proctree.py new file mode 100644 index 000000000..2541ab43a --- /dev/null +++ b/scripts/support/system_monitor/callbacks/proctree.py @@ -0,0 +1,293 @@ +if True: + from typing import Any + from datetime import datetime + import dash + from dash import dcc + from dash.dependencies import Output, Input, State + import psutil + + def register_proctree_callbacks(app: Any, *, cy_available: bool) -> None: + # PID search: suggest processes matching a search string (similar to `ps -AFl | grep `) + @app.callback( + Output("proctree-suggestions", "options"), + [Input("proctree-search", "value")], + prevent_initial_call=False, + ) + def update_proctree_suggestions(search_text): + opts = [] + try: + pattern = (search_text or "").strip() + if not pattern: + return [] + pattern_low = pattern.lower() + # Collect processes with safe attribute access + matches = [] + for p in psutil.process_iter(attrs=["pid", "name", "username", "cmdline", "num_threads"]): + try: + info = p.info + pid = info.get("pid") + name = info.get("name") or "" + username = info.get("username") or "" + cmdline_list = info.get("cmdline") or [] + cmdline = " ".join(cmdline_list) + haystack = f"{name} {cmdline}".lower() + if pattern_low in haystack: + threads = info.get("num_threads") or 0 + label = f"PID {pid} • {username} • thr={threads} • {name} — {cmdline}".strip() + matches.append({"label": label[:300], "value": pid}) + except Exception: + continue + # Limit to first 50 + opts = matches[:50] + except Exception: + opts = [] + return opts + + # When a suggestion is selected, set the PID input + @app.callback( + Output("proctree-pid", "value"), + [Input("proctree-suggestions", "value")], + prevent_initial_call=True, + ) + def set_pid_from_selection(selected_pid): + try: + if selected_pid is None: + return dash.no_update + return int(selected_pid) + except Exception: + return dash.no_update + + # Theme-aware high-contrast styles for PID search controls + @app.callback( + Output("proctree-suggestions", "style"), + [Input("theme-toggle", "value")], + ) + def style_proctree_dropdown(theme_value): + light = { + "width": "420px", + "display": "inline-block", + "color": "#111", + "backgroundColor": "#ffffff", + "border": "1px solid #888", + } + dark = { + "width": "420px", + "display": "inline-block", + "color": "#eee", + "backgroundColor": "#222", + "border": "1px solid #555", + } + return dark if theme_value == "dark" else light + + @app.callback( + Output("proctree-search", "style"), + [Input("theme-toggle", "value")], + ) + def style_proctree_search(theme_value): + base = {"width": "320px", "marginLeft": "6px", "marginRight": "8px"} + if theme_value == "dark": + base.update({"backgroundColor": "#222", "color": "#eee", "border": "1px solid #555"}) + else: + base.update({"backgroundColor": "#fff", "color": "#111", "border": "1px solid #888"}) + return base + + # Toggle between text and graph tree containers + @app.callback( + [Output("proctree-cyto-container", "style"), Output("proctree-tree-text-container", "style")], + [Input("proctree-view-mode", "value")], + ) + def toggle_tree_view(view_mode): + # Always show the Graph container when Graph is selected, even if dash-cytoscape + # is not installed, so the fallback help message is visible. 
+ if view_mode == "graph": + return {"display": "block"}, {"display": "none"} + # default to text view + return {"display": "none"}, {"display": "block"} + + # Build cytoscape elements from last summary + if cy_available: + + @app.callback( + Output("proctree-graph", "elements"), + [Input("proctree-last-summary", "data")], + ) + def build_cytoscape_elements(summary): + try: + plist = (summary or {}).get("processes", []) + if not plist: + return [] + nodes = [] + edges = [] + pids = set() + for p in plist: + pid = p.get("pid") + name = p.get("name") or "?" + threads = int(p.get("threads") or 0) + pids.add(pid) + nodes.append( + { + "data": { + "id": str(pid), + "label": f"{name}({pid}) t={threads}", + "threads": threads, + } + } + ) + for p in plist: + pid = p.get("pid") + ppid = p.get("ppid") + if ppid in pids and pid in pids and ppid is not None and pid is not None: + edges.append({"data": {"source": str(ppid), "target": str(pid)}}) + return nodes + edges + except Exception: + return [] + + @app.callback( + Output("proctree-graph", "stylesheet"), + [Input("theme-toggle", "value")], + ) + def cytoscape_stylesheet(theme_value): + # map threads to size/color + node_color_dark = "#4aa3ff" + node_color_light = "#1f77b4" + text_color_dark = "#e5e5e5" + text_color_light = "#222222" + edge_color_dark = "#888" + edge_color_light = "#aaa" + base = [ + { + "selector": "node", + "style": { + "label": "data(label)", + "font-size": 10, + "color": (text_color_dark if theme_value == "dark" else text_color_light), + "background-color": (node_color_dark if theme_value == "dark" else node_color_light), + "width": "mapData(threads, 0, 64, 20, 60)", + "height": "mapData(threads, 0, 64, 20, 60)", + "text-valign": "center", + "text-halign": "center", + }, + }, + { + "selector": "edge", + "style": { + "line-color": (edge_color_dark if theme_value == "dark" else edge_color_light), + "target-arrow-color": (edge_color_dark if theme_value == "dark" else edge_color_light), + "target-arrow-shape": "triangle", + "curve-style": "bezier", + "width": 1.5, + }, + }, + ] + return base + + @app.callback( + Output("proctree-node-details", "children"), + [Input("proctree-graph", "tapNodeData")], + [State("proctree-last-summary", "data")], + ) + def show_node_details(tap_node, summary): + try: + if not tap_node: + return "" + pid = int(tap_node.get("id")) + plist = (summary or {}).get("processes", []) + ent = next((p for p in plist if p.get("pid") == pid), None) + if not ent: + return "" + name = ent.get("name") or "?" + ppid = ent.get("ppid") + threads = ent.get("threads") + return f"Selected: {name} ({pid}) — PPID={ppid}, Threads={threads}" + except Exception: + return "" + + # Force a layout re-run whenever elements change + @app.callback( + Output("proctree-graph", "layout"), + [Input("proctree-graph", "elements")], + ) + def refresh_cyto_layout(elements): + return {"name": "breadthfirst", "directed": True} + + # Status helper under the graph container + @app.callback( + Output("proctree-graph-status", "children"), + [Input("proctree-view-mode", "value"), Input("proctree-last-summary", "data")], + ) + def update_graph_status(view_mode, summary): + if view_mode != "graph": + return "" + if not cy_available: + return "Graph view requires dash-cytoscape. Install with: pip install dash-cytoscape" + plist = (summary or {}).get("processes", []) + if not plist: + return "No graph data yet. Click 'Inspect' after entering a valid PID." 
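+            # Estimate the edges the graph will draw: bucket children by ppid and
+            # count those whose parent is known; roots (ppid=None) add no edges.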
+ # count edges + by_ppid = {} + for p in plist: + by_ppid.setdefault(p.get("ppid"), []).append(p) + edge_count = sum(len(v) for k, v in by_ppid.items() if k is not None) + return f"Graph ready: {len(plist)} node(s), {edge_count} edge(s). Tip: click a node to see details." + + # Snapshot current summary + @app.callback( + [Output("proctree-snapshot", "data"), Output("proctree-snapshot-status", "children")], + [Input("proctree-snapshot-btn", "n_clicks")], + [State("proctree-last-summary", "data")], + prevent_initial_call=True, + ) + def take_snapshot(n_clicks, summary): + try: + if not summary or not summary.get("processes"): + return dash.no_update, "No current tree to snapshot. Run Inspect first." + ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + snap = {"timestamp": ts, "summary": summary} + return snap, f"Snapshot captured at {ts}." + except Exception: + return dash.no_update, "Failed to capture snapshot." + + # Diff current summary to snapshot + @app.callback( + Output("proctree-diff-result", "children"), + [Input("proctree-diff-btn", "n_clicks")], + [State("proctree-snapshot", "data"), State("proctree-last-summary", "data")], + prevent_initial_call=True, + ) + def diff_to_snapshot(n_clicks, snapshot, current): + try: + if not snapshot or not snapshot.get("summary"): + return "No snapshot available. Click 'Take Snapshot' after Inspect." + snap = snapshot.get("summary") or {} + snap_plist = snap.get("processes", []) + cur_plist = (current or {}).get("processes", []) + snap_by_pid = {p.get("pid"): p for p in snap_plist} + cur_by_pid = {p.get("pid"): p for p in cur_plist} + added = sorted([pid for pid in cur_by_pid.keys() if pid not in snap_by_pid]) + removed = sorted([pid for pid in snap_by_pid.keys() if pid not in cur_by_pid]) + changed = [] + for pid in set(cur_by_pid.keys()).intersection(snap_by_pid.keys()): + t0 = int(snap_by_pid[pid].get("threads") or 0) + t1 = int(cur_by_pid[pid].get("threads") or 0) + if t0 != t1: + changed.append((pid, t0, t1)) + lines = [] + lines.append(f"Added: {len(added)}") + if added: + lines.extend([f" + {pid}" for pid in added[:50]]) + if len(added) > 50: + lines.append(" …") + lines.append(f"Removed: {len(removed)}") + if removed: + lines.extend([f" - {pid}" for pid in removed[:50]]) + if len(removed) > 50: + lines.append(" …") + lines.append(f"Thread changes: {len(changed)}") + if changed: + for pid, t0, t1 in changed[:50]: + lines.append(f" ~ {pid}: {t0} -> {t1}") + if len(changed) > 50: + lines.append(" …") + return dcc.Markdown("```text\n" + "\n".join(lines) + "\n```") + except Exception: + return "Diff failed." diff --git a/scripts/support/system_monitor/callbacks/theme.py b/scripts/support/system_monitor/callbacks/theme.py new file mode 100644 index 000000000..bee8e234a --- /dev/null +++ b/scripts/support/system_monitor/callbacks/theme.py @@ -0,0 +1,10 @@ +from typing import Any + + +def register_theme_callbacks(app: Any) -> None: + """Register theme-related callbacks. + + Placeholder registrar; existing callbacks remain in system_monitor.py + until migrated. + """ + return diff --git a/scripts/support/system_monitor/layout.py b/scripts/support/system_monitor/layout.py new file mode 100644 index 000000000..8b47769d3 --- /dev/null +++ b/scripts/support/system_monitor/layout.py @@ -0,0 +1,1219 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# pylint: skip-file +# flake8: noqa + +from typing import Any +from dash import dcc, html + +try: + import dash_cytoscape as cy # type: ignore +except Exception: # pragma: no cover + cy = None + + +def build_layout(*, datafile: str, interval: int, cy_available: bool, cy_module: Any = None): + """Build the System Monitor dashboard layout (sidebar + tabs + stores + interval). + + Parameters + ---------- + datafile : str + Default datafile path to display in the Output Parquet Path field and initial store. + interval : int + Refresh interval in seconds for the dcc.Interval component. + cy_available : bool + If True, render the Cytoscape process tree graph; otherwise show helper text. + cy_module : Any, optional + The dash_cytoscape module to use if available; if None, falls back to local import if present. + """ + _cy = cy_module if cy_module is not None else cy + + return html.Div( + [ + # Stores + dcc.Store(id="datafile-store", data=datafile, storage_type="local"), + dcc.Store(id="event-store", data=[], storage_type="local"), + dcc.Store(id="event-auto-store", data=[], storage_type="local"), + dcc.Store(id="watch-state-store", data={}, storage_type="session"), + dcc.Store(id="proctree-last-summary", data={}, storage_type="memory"), + dcc.Store(id="proctree-snapshot", data=None, storage_type="local"), + dcc.Store(id="theme-sink", data=None, storage_type="memory"), + # body-bg-sync is provided as a hidden Div below for clientside callback output + html.Div( + [ + # Sidebar controls + html.Div( + [ + html.H2("System Monitor", className="section-title"), + html.Div([html.Span(id="data-source", className="muted")]), + html.Div( + [ + html.Span("Last updated: ", className="muted"), + html.Span("Never", id="last-updated", className="muted"), + ], + style={"marginBottom": "6px"}, + ), + html.Details( + [ + html.Summary("Time & Display", className="section-title"), + html.Div( + [ + html.Div("Time range", className="label"), + dcc.RadioItems( + id="time-range", + options=[ + {"label": "10m", "value": 10}, + {"label": "30m", "value": 30}, + {"label": "1h", "value": 60}, + {"label": "3h", "value": 180}, + {"label": "All", "value": 0}, + ], + value=30, + persistence=True, + persisted_props=["value"], + persistence_type="local", + labelStyle={"display": "inline-block", "marginRight": "8px"}, + ), + ], + className="control", + ), + html.Div( + [ + html.Div("Display timezone", className="label"), + dcc.Dropdown( + id="display-tz", + options=[ + {"label": "Local", "value": "local"}, + {"label": "UTC", "value": "utc"}, + {"label": "Custom…", "value": "custom"}, + ], + value="local", + clearable=False, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + ], + className="control", + ), + html.Div( + [ + dcc.Input( + id="display-tz-custom", + type="text", + placeholder="e.g., America/Denver", + debounce=True, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ) + ], + id="display-tz-custom-wrap", + className="control", + style={"display": "none"}, + ), + html.Div( + [ + html.Div("Data timezone (source)", className="label"), + dcc.RadioItems( + id="data-tz", + options=[ + {"label": "Local", "value": "local"}, + {"label": "UTC", "value": "utc"}, + ], + value="local", + persistence=True, + persisted_props=["value"], + persistence_type="local", + labelStyle={"display": "inline-block", "marginRight": "8px"}, + ), + html.Div( + "Tip: Set to UTC if you started the tracer with --utc.", + className="muted", + 
style={"marginTop": "4px"}, + ), + ], + className="control", + ), + html.Div( + [ + html.Div("Smoothing (samples)", className="label"), + dcc.Slider( + id="smoothing-window", + min=1, + max=10, + step=1, + value=3, + marks={1: "1", 5: "5", 10: "10"}, + tooltip={"placement": "bottom", "always_visible": False}, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + ], + className="control", + ), + html.Div( + [ + dcc.Checklist( + id="pause-refresh", + options=[{"label": "Pause auto-refresh", "value": "pause"}], + value=[], + inline=True, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ) + ], + className="control", + ), + ], + open=True, + ), + html.Details( + [ + html.Summary("Live Tracing", className="section-title"), + html.Div( + "Configure and control in-process tracing. For offline files, " + "leave tracing stopped.", + className="muted", + style={"marginTop": "-6px", "marginBottom": "6px"}, + ), + html.Div( + [ + html.Div("Data source mode", className="label"), + dcc.RadioItems( + id="data-source-mode", + options=[ + {"label": "Auto (prefer live if running)", "value": "auto"}, + {"label": "Live tracer", "value": "live"}, + {"label": "File (Parquet/CSV)", "value": "file"}, + ], + value="auto", + persistence=True, + persisted_props=["value"], + persistence_type="local", + labelStyle={"display": "block", "marginRight": "8px"}, + ), + ], + className="control", + ), + html.Div( + [ + html.Div("Output Parquet Path", className="label"), + dcc.Input( + id="tracer-output-path", + type="text", + value=datafile, + debounce=True, + persistence=True, + persisted_props=["value"], + persistence_type="local", + style={"width": "100%"}, + ), + ], + className="control", + ), + html.Div( + [ + html.Div("Sampling (s)", className="label"), + dcc.Input( + id="tracer-sample-interval", + type="number", + min=0.1, + step=0.1, + value=5.0, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + ], + className="control", + ), + html.Div( + [ + html.Div("Write Interval (s)", className="label"), + dcc.Input( + id="tracer-write-interval", + type="number", + min=1, + step=1, + value=10.0, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + ], + className="control", + ), + html.Div( + [ + dcc.Checklist( + id="tracer-options", + options=[ + {"label": "Enable GPU", "value": "gpu"}, + {"label": "Enable Docker", "value": "docker"}, + {"label": "UTC timestamps", "value": "utc"}, + ], + value=["gpu", "docker"], + inline=True, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ) + ], + className="control", + ), + html.Div( + [ + html.Button( + "Start", id="tracer-start-btn", n_clicks=0, className="button primary" + ), + html.Button( + "Stop", + id="tracer-stop-btn", + n_clicks=0, + className="button", + style={"marginLeft": "6px"}, + ), + html.Button( + "Reset Buffer", + id="tracer-reset-btn", + n_clicks=0, + className="button", + style={"marginLeft": "6px"}, + ), + html.Button( + "Snapshot Now", + id="tracer-snapshot-btn", + n_clicks=0, + className="button primary", + style={"marginLeft": "6px"}, + ), + ], + className="control", + ), + html.Div(id="tracer-status", className="muted", style={"marginTop": "6px"}), + ], + open=False, + ), + html.Hr(style={"opacity": 0.2}), + html.Details( + [ + html.Summary("Events & Annotations", className="section-title"), + html.Div( + "Add named events to appear as vertical markers on all charts.", + className="muted", + style={"marginTop": 
"-6px", "marginBottom": "6px"}, + ), + html.Div( + [ + dcc.Input( + id="event-name", + type="text", + placeholder="Event name", + persistence=True, + persisted_props=["value"], + persistence_type="local", + style={"width": "100%"}, + ) + ], + className="control", + ), + html.Div( + [ + html.Div("Event time", className="label"), + html.Div( + [ + dcc.DatePickerSingle( + id="event-date", + display_format="YYYY-MM-DD", + persistence=True, + persistence_type="local", + ), + dcc.Input( + id="event-time", + type="text", + placeholder="HH:MM[:SS]", + persistence=True, + persisted_props=["value"], + persistence_type="local", + style={"width": "120px", "marginLeft": "8px"}, + ), + ], + style={"display": "flex", "alignItems": "center"}, + ), + html.Div( + "Date/time interpreted in selected Display Timezone; stored as UTC.", + className="muted", + style={"marginTop": "4px"}, + ), + ], + className="control", + ), + html.Div( + [ + html.Button( + "Add", id="add-event-btn", n_clicks=0, className="inline button primary" + ), + html.Button( + "Add (Now)", + id="add-event-now-btn", + n_clicks=0, + className="inline button primary", + ), + html.Button("Clear", id="clear-events-btn", n_clicks=0, className="inline"), + dcc.Upload( + id="event-upload", + children=html.Div(["Import CSV"]), + className="dccUpload inline", + ), + ], + className="control", + ), + html.Div( + [ + dcc.Checklist( + id="event-display-options", + options=[{"label": "Show event markers", "value": "markers"}], + value=["markers"], + inline=True, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ) + ], + className="control", + ), + html.Div(id="event-list", className="control"), + ], + open=False, + ), + html.Details( + [ + html.Summary( + "Watch Points (auto-create events when thresholds are exceeded)", + className="section-title", + ), + html.Div( + [ + dcc.Checklist( + id="watch-enable", + options=[ + {"label": "CPU %", "value": "cpu"}, + {"label": "Memory %", "value": "mem"}, + {"label": "Threads", "value": "threads"}, + {"label": "Processes", "value": "procs"}, + ], + value=[], + inline=True, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + html.Div( + [ + html.Span("CPU % >"), + dcc.Input( + id="watch-cpu", + type="number", + min=0, + max=100, + step=1, + style={"width": "70px", "marginRight": "12px"}, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + html.Span("Mem % >"), + dcc.Input( + id="watch-mem", + type="number", + min=0, + max=100, + step=1, + style={"width": "70px", "marginRight": "12px"}, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + html.Span("Threads >"), + dcc.Input( + id="watch-threads", + type="number", + min=0, + step=1, + style={"width": "90px", "marginRight": "12px"}, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + html.Span("Processes >"), + dcc.Input( + id="watch-procs", + type="number", + min=0, + step=1, + style={"width": "90px", "marginRight": "12px"}, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + ], + style={"marginTop": "6px"}, + ), + html.Div( + [ + html.Button( + "Clear Auto Events", + id="clear-auto-events-btn", + n_clicks=0, + className="inline", + ) + ], + className="control", + ), + ] + ), + ], + open=False, + ), + html.Hr(style={"opacity": 0.2}), + html.Details( + [ + html.Summary("Appearance", className="section-title"), + html.Div( + [ + html.Div("Theme", className="label"), + dcc.RadioItems( 
+ id="theme-toggle", + options=[ + {"label": "Light", "value": "light"}, + {"label": "Dark", "value": "dark"}, + ], + value="dark", + persistence=True, + persisted_props=["value"], + persistence_type="local", + labelStyle={"display": "inline-block", "marginRight": "8px"}, + ), + ], + className="control", + ), + ], + open=False, + ), + ], + className="sidebar", + ), + # Main content + html.Div( + [ + # Contextual notice banner (filled by callback) + html.Div( + id="notice-banner", + style={ + "marginBottom": "10px", + "border": "1px solid var(--border)", + "padding": "8px", + "display": "block", + }, + ), + # Tabs + dcc.Tabs( + id="main-tabs", + value="tab-overview", + colors={ + "border": "var(--border)", + "primary": "var(--text)", + "background": "var(--surface)", + }, + children=[ + dcc.Tab( + label="Overview", + value="tab-overview", + children=[ + html.Div( + [ + html.Div( + [ + html.H2("System Overview", className="section-title"), + html.Div(id="kpi-row", className="kpis"), + dcc.Graph( + id="system-overview-graph", + className="graph", + style={"height": "340px"}, + ), + ] + ), + ], + style={"marginTop": "4px"}, + ) + ], + ), + dcc.Tab( + label="Process Tree", + value="tab-proctree", + children=[ + html.Div( + [ + html.H2("Process Tree Inspector", className="section-title"), + dcc.Loading( + id="proctree-loading-controls", + type="dot", + children=html.Div( + [ + html.Div( + [ + html.Span("Root PID:"), + dcc.Input( + id="proctree-pid", + type="number", + placeholder="Enter PID", + debounce=True, + style={ + "width": "140px", + "marginLeft": "6px", + "marginRight": "12px", + }, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + dcc.Checklist( + id="proctree-verbose", + options=[ + {"label": "Verbose", "value": "verbose"} + ], + value=[], + inline=True, + persistence=True, + persisted_props=["value"], + persistence_type="local", + style={ + "display": "inline-block", + "marginRight": "12px", + }, + ), + html.Span("Find PID:"), + dcc.Input( + id="proctree-search", + type="text", + placeholder="e.g. 
" + "microservice_entrypoint.py", + debounce=False, + style={ + "width": "320px", + "marginLeft": "6px", + "marginRight": "8px", + }, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + dcc.Dropdown( + id="proctree-suggestions", + options=[], + placeholder="Select a matching PID", + style={ + "width": "420px", + "display": "inline-block", + }, + clearable=True, + ), + html.Button( + "Inspect", + id="proctree-run", + n_clicks=0, + className="button primary", + style={"marginLeft": "12px"}, + ), + html.Div( + "Type to search; " + "select a row to populate Root PID.", + className="muted", + style={"marginTop": "6px"}, + ), + html.Div( + [ + html.Span( + "Tree view:", + style={"marginRight": "8px"}, + ), + dcc.RadioItems( + id="proctree-view-mode", + options=[ + { + "label": "Text", + "value": "text", + }, + { + "label": "Graph", + "value": "graph", + }, + ], + value="text", + labelStyle={ + "display": "inline-block", + "marginRight": "8px", + }, + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + html.Button( + "Take Snapshot", + id="proctree-snapshot-btn", + n_clicks=0, + className="button", + style={"marginLeft": "12px"}, + ), + html.Button( + "Compare to Snapshot", + id="proctree-diff-btn", + n_clicks=0, + className="button", + style={"marginLeft": "8px"}, + ), + ], + style={"marginTop": "8px"}, + ), + ], + className="control", + ), + ], + className="control", + ), + ), + dcc.Loading( + id="proctree-loading-totals", + type="dot", + children=html.Div( + id="proctree-totals", + className="muted", + style={"marginBottom": "8px"}, + ), + ), + dcc.Loading( + id="proctree-loading-graphs", + type="default", + style={"width": "100%"}, + children=html.Div( + [ + html.Div( + [ + dcc.Graph( + id="proctree-procs-by-cmd", + className="graph", + style={"height": "260px", "width": "100%"}, + config={"responsive": True}, + ) + ], + style={ + "flex": "1 1 0", + "minWidth": "0", + "boxSizing": "border-box", + "overflow": "hidden", + }, + ), + html.Div( + [ + dcc.Graph( + id="proctree-threads-by-cmd", + className="graph", + style={"height": "260px", "width": "100%"}, + config={"responsive": True}, + ) + ], + style={ + "flex": "1 1 0", + "minWidth": "0", + "boxSizing": "border-box", + "overflow": "hidden", + }, + ), + ], + style={ + "display": "flex", + "gap": "12px", + "alignItems": "stretch", + "width": "100%", + }, + ), + ), + dcc.Loading( + id="proctree-loading-tree", + type="cube", + style={"width": "100%"}, + children=html.Div( + [ + html.H3("Tree", className="section-title"), + html.Div( + [ + ( + _cy.Cytoscape( + id="proctree-graph", + elements=[], + layout={ + "name": "breadthfirst", + "directed": True, + }, + style={ + "width": "100%", + "height": "520px", + "border": "1px solid #444", + "boxSizing": "border-box", + "maxWidth": "100%", + "overflow": "hidden", + }, + stylesheet=[], + ) + if cy_available and _cy is not None + else html.Div( + "Graph view requires " + "dash-cytoscape. 
Install with: " + "pip install dash-cytoscape", + className="muted", + style={"padding": "8px"}, + ) + ) + ], + id="proctree-cyto-container", + style={ + "display": "none", + "width": "100%", + "maxWidth": "100%", + "overflowX": "hidden", + }, + ), + html.Div( + id="proctree-graph-status", + className="muted", + style={"marginTop": "6px"}, + ), + html.Div( + [ + dcc.Markdown( + id="proctree-tree-md", + style={"whiteSpace": "pre-wrap"}, + ) + ], + id="proctree-tree-text-container", + style={"display": "block"}, + ), + html.Div( + id="proctree-node-details", + className="muted", + style={"marginTop": "8px"}, + ), + html.Div( + id="proctree-snapshot-status", + className="muted", + style={"marginTop": "8px"}, + ), + html.Div( + id="proctree-diff-result", + className="muted", + style={"marginTop": "8px"}, + ), + ] + ), + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ) + ], + ), + dcc.Tab( + label="CPU & Memory", + value="tab-cpu-mem", + children=[ + html.Div( + [ + html.H2("CPU Utilization", className="section-title"), + html.Div( + [ + html.Div( + [ + dcc.Graph( + id="cpu-aggregated-utilization-graph", + className="graph", + style={"height": "320px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + html.Div( + [ + dcc.Graph( + id="cpu-individual-utilization-graph", + className="graph", + style={"height": "320px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + ] + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + html.Div( + [ + html.H2("Memory Usage", className="section-title"), + dcc.Graph( + id="memory-graph", className="graph", style={"height": "320px"} + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + html.Div( + [ + html.H2("File Descriptor Usage", className="section-title"), + html.Div( + [ + html.Div( + [ + dcc.Graph( + id="file-count-graph", + className="graph", + style={"height": "280px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + html.Div( + [ + dcc.Graph( + id="fd-usage-graph", + className="graph", + style={"height": "280px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + ] + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + html.Div( + [ + html.H2("Processes and Threads", className="section-title"), + html.Div( + [ + html.Div( + [ + dcc.Graph( + id="process-count-graph", + className="graph", + style={"height": "260px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + html.Div( + [ + dcc.Graph( + id="thread-count-graph", + className="graph", + style={"height": "260px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + ] + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + ], + ), + dcc.Tab( + label="I/O & Network", + value="tab-io-net", + children=[ + html.Div( + [ + html.H2("Network Activity", className="section-title"), + dcc.Graph( + id="network-graph", className="graph", style={"height": "320px"} + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + html.Div( + [ + html.H2("Disk I/O", className="section-title"), + dcc.Graph( + id="disk-io-graph", className="graph", style={"height": "320px"} + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + ], + ), + dcc.Tab( + label="GPU", + value="tab-gpu", + children=[ + html.Div( + [ + 
html.H2("GPU Usage", className="section-title"), + html.Div( + [ + html.Div( + [ + dcc.Graph( + id="gpu-utilization-graph", + className="graph", + style={"height": "300px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + html.Div( + [ + dcc.Graph( + id="gpu-memory-graph", + className="graph", + style={"height": "300px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + ] + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + ], + ), + dcc.Tab( + label="Containers", + value="tab-containers", + children=[ + html.Div( + [ + html.Details( + [ + html.Summary("Container Focus"), + html.Div( + [ + dcc.Checklist( + id="container-auto-top", + options=[ + { + "label": "Auto select top by CPU", + "value": "auto", + } + ], + value=["auto"], + persistence=True, + persisted_props=["value"], + persistence_type="local", + labelStyle={ + "display": "inline-block", + "marginRight": "10px", + }, + ), + html.Label("Top N (when auto):"), + dcc.Slider( + id="container-top-n", + min=1, + max=10, + step=1, + value=5, + marks={1: "1", 5: "5", 10: "10"}, + tooltip={ + "placement": "bottom", + "always_visible": False, + }, + ), + html.Label("Or select containers:"), + dcc.Dropdown( + id="container-select", + options=[], + value=[], + multi=True, + placeholder="Select containers...", + persistence=True, + persisted_props=["value"], + persistence_type="local", + ), + ], + style={"margin": "8px 0"}, + ), + ], + open=False, + ) + ] + ), + html.Div( + [ + html.H2("Container Metrics", className="section-title"), + html.Div( + [ + html.Div( + [ + dcc.Graph( + id="container-cpu-utilization-graph", + className="graph", + style={"height": "280px"}, + ) + ], + style={ + "width": "33%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + html.Div( + [ + dcc.Graph( + id="container-memory-utilization-graph", + className="graph", + style={"height": "280px"}, + ) + ], + style={ + "width": "33%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + html.Div( + [ + dcc.Graph( + id="container-files-graph", + className="graph", + style={"height": "280px"}, + ) + ], + style={ + "width": "33%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + ] + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + html.Div( + [ + html.H2( + "Container Network Throughput (MB/s)", className="section-title" + ), + dcc.Graph( + id="container-net-graph", + className="graph", + style={"height": "300px"}, + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + html.Div( + [ + html.H2("Container Disk I/O (MB/s)", className="section-title"), + dcc.Graph( + id="container-io-graph", + className="graph", + style={"height": "300px"}, + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + html.Div( + [ + html.H2( + "Top Containers (Latest Sample)", className="section-title" + ), + html.Div( + [ + html.Div( + [ + dcc.Graph( + id="container-top-cpu-graph", + className="graph", + style={"height": "280px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + html.Div( + [ + dcc.Graph( + id="container-top-mem-graph", + className="graph", + style={"height": "280px"}, + ) + ], + style={ + "width": "50%", + "display": "inline-block", + "verticalAlign": "top", + }, + ), + ] + ), + ], + style={"width": "100%", "marginBottom": "16px"}, + ), + ], + ), + ], + ), + ] + ), + ], + className="grid", + ), + # Refresh interval + 
dcc.Interval( + id="interval-component", interval=interval * 1000, n_intervals=0 # convert seconds to milliseconds + ), + # Hidden div to sync body background with theme (already declared store above) + html.Div(id="body-bg-sync", style={"display": "none"}), + ], + id="page-container", + style={"padding": "16px"}, + ) diff --git a/scripts/support/system_monitor/system_monitor.py b/scripts/support/system_monitor/system_monitor.py new file mode 100644 index 000000000..d6a7a89f1 --- /dev/null +++ b/scripts/support/system_monitor/system_monitor.py @@ -0,0 +1,1715 @@ +import dash +from dash import dcc, html +from dash.dependencies import Output, Input, State, ALL +import pandas as pd +import plotly.graph_objects as go +import os +import click +import base64 +import io +import csv +from datetime import datetime +import json +import psutil + +# noqa +# flake8: noqa + +# Use absolute package import only (no relatives or fallbacks) +from system_tracer import ( + get_process_tree_summary, + SystemTracer, +) +from layout import build_layout +from callbacks import register_callbacks +from helpers import apply_theme, style_minimal_figure + +try: + from dateutil.tz import tzlocal, gettz +except Exception: + tzlocal = None + gettz = None + +try: + import dash_cytoscape as cy # type: ignore + + CY_AVAILABLE = True +except Exception: + CY_AVAILABLE = False + cy = None + +try: + import pyarrow.parquet as pq # noqa + + PARQUET_AVAILABLE = True +except ImportError: + PARQUET_AVAILABLE = False + print("Warning: pyarrow is not available. Please install pyarrow for Parquet file support.") + + +@click.command() +@click.option("--datafile", "-d", default="system_monitor.parquet", help="Path to the parquet data file") +@click.option("--port", "-p", default=8050, help="Port to run the dashboard server on") +@click.option("--host", "-h", default="0.0.0.0", help="Host to run the dashboard server on") +@click.option("--interval", "-i", default=10, help="Refresh interval in seconds") +@click.option("--debug/--no-debug", default=True, help="Run in debug mode") +def run_dashboard(datafile, port, host, interval, debug): + """Run the system monitoring dashboard with the specified parameters.""" + + # Validate the data file (be permissive; pandas can use either pyarrow or fastparquet) + if not os.path.exists(datafile): + print(f"Error: Data file '{datafile}' not found.") + print("Dashboard will start but won't display data until the file exists.") + elif datafile.endswith(".parquet") and not PARQUET_AVAILABLE: + print("Warning: pyarrow is not installed; will attempt to read parquet via pandas (fastparquet if available).") + elif not datafile.endswith(".parquet") and not datafile.endswith(".csv"): + print(f"Warning: Data file '{datafile}' is not a .parquet or .csv file.") + print("Attempting to load it anyway, but this may cause errors.") + + # Initialize the Dash app and ensure assets/ resolves to the packaged assets by default + pkg_dir = os.path.abspath(os.path.dirname(__file__)) + default_assets = os.path.join(pkg_dir, "assets") + assets_override = os.environ.get("SYSTEM_MONITOR_ASSETS") + assets_path = assets_override if assets_override and os.path.isdir(assets_override) else default_assets + app = dash.Dash(__name__, assets_folder=assets_path) + + # Global Plotly defaults: Inter font and transparent backgrounds (Tufte-style minimalism) + try: + import plotly.io as pio # type: ignore + + tufted = go.layout.Template() + tufted.layout.font.family = "Inter, system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, 
sans-serif" + tufted.layout.paper_bgcolor = "rgba(0,0,0,0)" + tufted.layout.plot_bgcolor = "rgba(0,0,0,0)" + pio.templates["tufted"] = tufted + pio.templates.default = "tufted" + except Exception: + pass + + # Keep index minimal; styling handled via assets/style.css + app.index_string = """ + + + + {%metas%} + {%title%} + {%favicon%} + {%css%} + + + {%app_entry%} +
+ {%config%} + {%scripts%} + {%renderer%} + </footer> + </body> + </html>
+ + + """ + + # Set the app title + app.title = "System Resource Monitor" + + # Use extracted layout factory (overrides inline layout above) + app.layout = build_layout( + datafile=datafile, + interval=interval, + cy_available=CY_AVAILABLE, + cy_module=cy, + ) + # Register callbacks by domain (existing in this module; future modules will hook here) + register_callbacks(app, cy_available=CY_AVAILABLE) + + # Helper function to load and filter data + def load_data(data_path, time_range_minutes, source_mode="auto"): + try: + # Live buffer branch (explicit when mode == live, or auto + running) + if source_mode in ("live", "auto"): + try: + if _is_running(): + with _tracer_lock: + tracer = _tracer_obj.get("tracer") + if tracer is not None and getattr(tracer, "data_buffer", None) is not None: + df = pd.DataFrame(list(tracer.data_buffer)) + if time_range_minutes > 0 and not df.empty and "timestamp" in df.columns: + latest_time = pd.to_datetime(df["timestamp"]).max() + time_threshold = latest_time - pd.Timedelta(minutes=time_range_minutes) + df = df[pd.to_datetime(df["timestamp"]) >= time_threshold] + return df + except Exception as le: + print(f"Error reading live buffer: {le}") + + # File branch (explicit when mode == file, or auto + no live) + if data_path and os.path.exists(data_path): + # Prefer parquet if extension says so; let pandas pick available engine (pyarrow/fastparquet) + if data_path.endswith(".parquet"): + try: + df = pd.read_parquet(data_path) + except Exception as pe: + print(f"Parquet read failed via pandas: {pe}. Trying CSV fallback (may fail)...") + df = pd.read_csv(data_path, parse_dates=["timestamp"]) # best-effort + elif data_path.endswith(".csv"): + df = pd.read_csv(data_path, parse_dates=["timestamp"]) + else: + # Try parquet first, then CSV + try: + df = pd.read_parquet(data_path) + except Exception: + df = pd.read_csv(data_path, parse_dates=["timestamp"]) # best-effort + + # Filter by time range if specified + if time_range_minutes > 0 and not df.empty and "timestamp" in df.columns: + latest_time = pd.to_datetime(df["timestamp"]).max() + time_threshold = latest_time - pd.Timedelta(minutes=time_range_minutes) + df = df[pd.to_datetime(df["timestamp"]) >= time_threshold] + + return df + else: + return pd.DataFrame() + except Exception as e: + print(f"Error loading data: {e}") + return pd.DataFrame() + + # Interval enable/disable based on pause toggle + @app.callback( + Output("interval-component", "disabled"), + [Input("pause-refresh", "value")], + ) + def _toggle_interval_disabled(pause_values): + try: + return isinstance(pause_values, list) and ("pause" in pause_values) + except Exception: + return False + + # Notice banner content (guides first-time usage) + @app.callback( + Output("notice-banner", "children"), + [ + Input("interval-component", "n_intervals"), + Input("datafile-store", "data"), + Input("data-source-mode", "value"), + ], + ) + def _notice_banner(_n, data_path, source_mode): + try: + running = _is_running() + has_file = bool(data_path) and os.path.exists(data_path) + if source_mode == "live": + if not running: + return html.Div( + [ + html.Strong("No live data. "), + "Click Start in Live Tracing to begin collecting metrics. ", + "Or switch Data source mode to File and set a Parquet/CSV path.", + ] + ) + elif source_mode == "file": + if not has_file: + return html.Div( + [ + html.Strong("No file loaded. 
"), + "Set Output Parquet Path to an existing Parquet/CSV and Start the tracer, ", + "or switch Data source mode to Live to collect data in-memory.", + ] + ) + else: # auto + if not running and not has_file: + return html.Div( + [ + html.Strong("No data available. "), + "Start live tracing (left) or set a Parquet/CSV in Output Parquet Path. ", + "Data source mode is Auto: it will prefer live data when the tracer is running.", + ] + ) + except Exception: + pass + return "" + + # ---------------------------- + # Helper utilities (theme, events, small figure helpers) + # ---------------------------- + def normalize_ts(ts_any): + try: + ts = pd.to_datetime(ts_any) + # drop tz if present + if getattr(ts, "tzinfo", None) is not None: + try: + ts = ts.tz_convert("UTC").tz_localize(None) + except Exception: + try: + ts = ts.tz_localize("UTC").tz_localize(None) + except Exception: + pass + return ts + except Exception: + return pd.Timestamp.utcnow() + + # apply_theme and style_minimal_figure are imported from helpers.py + + def make_sparkline(ts, series, theme_value): + """Create a tiny sparkline figure for KPI cards. + + Expects ts (Datetime-like Series) and series (numeric Series) of same length. + """ + fig = go.Figure() + try: + fig.add_trace(go.Scatter(x=ts, y=series, mode="lines", name="", hoverinfo="skip")) + except Exception: + # fallback empty + pass + # Minimal styling + apply_theme(fig, theme_value) + style_minimal_figure(fig, theme_value) + fig.update_layout(margin=dict(l=0, r=0, t=0, b=0)) + fig.update_xaxes(visible=False) + fig.update_yaxes(visible=False) + return fig + + # ---------------------------- + # Strategy pattern: Graph components and registry + # ---------------------------- + class GraphContext: + def __init__(self, smoothing_window, apply_theme_fn, smooth_series_fn): + self.smoothing_window = smoothing_window + self.apply_theme = apply_theme_fn + self.smooth_series = smooth_series_fn + + class GraphComponent: + component_id = "" + title = None + is_time_series = True # whether event markers should be applied + + def build(self, df: pd.DataFrame, ts: pd.Series, ctx: GraphContext, params: dict) -> go.Figure: + raise NotImplementedError + + class CPUIndividualGraph(GraphComponent): + component_id = "cpu-individual-utilization-graph" + title = "CPU Utilization (per core)" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + cpu_cols = [c for c in df.columns if c.startswith("cpu_") and c.endswith("_utilization")] + for c in sorted(cpu_cols): + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df[c]), mode="lines", name=c.replace("_utilization", "")) + ) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + class CPUAggregateGraph(GraphComponent): + component_id = "cpu-aggregated-utilization-graph" + title = "CPU Utilization (aggregate)" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + cpu_cols = [c for c in df.columns if c.startswith("cpu_") and c.endswith("_utilization")] + if cpu_cols: + cpu_mean = ctx.smooth_series(df[cpu_cols].mean(axis=1)) + fig.add_trace(go.Scatter(x=ts, y=cpu_mean, mode="lines", name="CPU %")) + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + class MemoryGraph(GraphComponent): + component_id = "memory-graph" + title = "Memory Utilization" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + if {"sys_used", "sys_total"}.issubset(df.columns): + mem_pct = (df["sys_used"] / df["sys_total"] * 100.0).clip(lower=0, upper=100) + 
fig.add_trace(go.Scatter(x=ts, y=ctx.smooth_series(mem_pct), mode="lines", name="Mem %")) + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + class FileCountGraph(GraphComponent): + component_id = "file-count-graph" + title = "Total Open Files" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + if "total_open_files" in df.columns: + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df["total_open_files"]), mode="lines", name="Open Files") + ) + fig.update_layout(title=self.title) + return fig + + class FDUsageGraph(GraphComponent): + component_id = "fd-usage-graph" + title = "FD Usage %" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + if "fd_usage_percent" in df.columns: + fig.add_trace(go.Scatter(x=ts, y=ctx.smooth_series(df["fd_usage_percent"]), mode="lines", name="FD %")) + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + class NetworkGraph(GraphComponent): + component_id = "network-graph" + title = "Network Throughput" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + recv_col = "net_bytes_recv_per_sec" if "net_bytes_recv_per_sec" in df.columns else None + sent_col = "net_bytes_sent_per_sec" if "net_bytes_sent_per_sec" in df.columns else None + if recv_col: + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df[recv_col]) / (1024**2), mode="lines", name="Down MB/s") + ) + if sent_col: + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df[sent_col]) / (1024**2), mode="lines", name="Up MB/s") + ) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="MB/s") + return fig + + class DiskIOGraph(GraphComponent): + component_id = "disk-io-graph" + title = "Disk I/O" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + r_col = "disk_read_bytes_per_sec" if "disk_read_bytes_per_sec" in df.columns else None + w_col = "disk_write_bytes_per_sec" if "disk_write_bytes_per_sec" in df.columns else None + if r_col: + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df[r_col]) / (1024**2), mode="lines", name="Read MB/s") + ) + if w_col: + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df[w_col]) / (1024**2), mode="lines", name="Write MB/s") + ) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="MB/s") + return fig + + class GPUUtilGraph(GraphComponent): + component_id = "gpu-utilization-graph" + title = "GPU Utilization %" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + util_cols = [c for c in df.columns if c.endswith("_utilization") and c.startswith("gpu_")] + for c in sorted(util_cols): + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df[c]), mode="lines", name=c.replace("_utilization", " util")) + ) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + class GPUMemoryGraph(GraphComponent): + component_id = "gpu-memory-graph" + title = "GPU Memory %" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + gpu_mem_used = [c for c in df.columns if c.startswith("gpu_") and c.endswith("_used")] + for c in sorted(gpu_mem_used): + idx = c.split("_")[1] + tot_col = f"gpu_{idx}_total" + if tot_col in df.columns: + pct = (df[c] / df[tot_col] * 100.0).clip(lower=0, upper=100) + fig.add_trace(go.Scatter(x=ts, y=ctx.smooth_series(pct), mode="lines", name=f"GPU {idx} Mem %")) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + # Helpers for Docker naming (support old and new) — module scope so callbacks can access + def 
docker_container_names(df): + try: + names = set() + for col in df.columns: + if col.endswith("_container_cpu_percent"): + names.add(col[: -len("_container_cpu_percent")]) + elif col.startswith("docker_") and col.endswith("_cpu_percent"): + names.add(col[len("docker_") : -len("_cpu_percent")]) + return sorted(names) + except Exception: + return [] + + # (migrated) proctree suggestion/search callbacks are registered in callbacks/proctree.py + + # (migrated) proctree PID selection registered in callbacks/proctree.py + + def docker_pick_col(df, name, new_suffix, old_suffix): + for cand in (f"docker_{name}_{new_suffix}", f"{name}_{old_suffix}"): + if cand in df.columns: + return cand + return None + + class ContainerCPUGraph(GraphComponent): + component_id = "container-cpu-utilization-graph" + title = "Container CPU %" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + for name in params.get("selected_containers", []): + col = docker_pick_col(df, name, "cpu_percent", "container_cpu_percent") + if col: + fig.add_trace(go.Scatter(x=ts, y=ctx.smooth_series(df[col]), mode="lines", name=f"{name} CPU%")) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + class ContainerMemGraph(GraphComponent): + component_id = "container-memory-utilization-graph" + title = "Container Memory %" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + for name in params.get("selected_containers", []): + col = docker_pick_col(df, name, "mem_percent", "container_mem_percent") + if col: + fig.add_trace(go.Scatter(x=ts, y=ctx.smooth_series(df[col]), mode="lines", name=f"{name} Mem%")) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + class ContainerFilesGraph(GraphComponent): + component_id = "container-files-graph" + title = "Container Open Files" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + for name in params.get("selected_containers", []): + col = docker_pick_col(df, name, "open_files", "container_open_files") + if col: + fig.add_trace(go.Scatter(x=ts, y=ctx.smooth_series(df[col]), mode="lines", name=f"{name} Files")) + if len(fig.data) > 0: + fig.update_layout(title=self.title) + return fig + + class ContainerNetGraph(GraphComponent): + component_id = "container-net-graph" + title = "Container Network (selected sum)" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + rx_cols = [] + tx_cols = [] + for n in params.get("selected_containers", []): + col_rx = docker_pick_col(df, n, "net_rx_bytes_per_sec", "container_net_rx_bytes_per_sec") + col_tx = docker_pick_col(df, n, "net_tx_bytes_per_sec", "container_net_tx_bytes_per_sec") + if col_rx: + rx_cols.append(col_rx) + if col_tx: + tx_cols.append(col_tx) + if rx_cols: + fig.add_trace( + go.Scatter( + x=ts, y=ctx.smooth_series(df[rx_cols].sum(axis=1)) / (1024**2), mode="lines", name="RX MB/s" + ) + ) + if tx_cols: + fig.add_trace( + go.Scatter( + x=ts, y=ctx.smooth_series(df[tx_cols].sum(axis=1)) / (1024**2), mode="lines", name="TX MB/s" + ) + ) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="MB/s") + return fig + + class ContainerIOGraph(GraphComponent): + component_id = "container-io-graph" + title = "Container Disk I/O (selected sum)" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + r_cols = [] + w_cols = [] + for n in params.get("selected_containers", []): + col_r = docker_pick_col(df, n, "blkio_read_bytes_per_sec", "container_blkio_read_bytes_per_sec") + col_w = docker_pick_col(df, n, 
"blkio_write_bytes_per_sec", "container_blkio_write_bytes_per_sec") + if col_r: + r_cols.append(col_r) + if col_w: + w_cols.append(col_w) + if r_cols: + fig.add_trace( + go.Scatter( + x=ts, y=ctx.smooth_series(df[r_cols].sum(axis=1)) / (1024**2), mode="lines", name="Read MB/s" + ) + ) + if w_cols: + fig.add_trace( + go.Scatter( + x=ts, y=ctx.smooth_series(df[w_cols].sum(axis=1)) / (1024**2), mode="lines", name="Write MB/s" + ) + ) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="MB/s") + return fig + + class TopContainersCPUBar(GraphComponent): + component_id = "container-top-cpu-graph" + title = "Top Containers by CPU (latest)" + is_time_series = False + + def build(self, df, ts, ctx, params): + fig = go.Figure() + all_containers = docker_container_names(df) + if not all_containers or df.empty: + return fig + latest = df.iloc[-1] + pairs = [] + for name in all_containers: + col = docker_pick_col(df, name, "cpu_percent", "container_cpu_percent") + if col: + pairs.append((name, latest[col])) + pairs = sorted(pairs, key=lambda x: x[1], reverse=True)[:10] + if pairs: + fig.add_trace(go.Bar(x=[n for n, _ in pairs], y=[v for _, v in pairs], name="CPU %")) + fig.update_layout(title=self.title) + return fig + + class TopContainersMemBar(GraphComponent): + component_id = "container-top-mem-graph" + title = "Top Containers by Mem (latest)" + is_time_series = False + + def build(self, df, ts, ctx, params): + fig = go.Figure() + all_containers = docker_container_names(df) + if not all_containers or df.empty: + return fig + latest = df.iloc[-1] + pairs = [] + for name in all_containers: + col = docker_pick_col(df, name, "mem_percent", "container_mem_percent") + if col: + pairs.append((name, latest[col])) + pairs = sorted(pairs, key=lambda x: x[1], reverse=True)[:10] + if pairs: + fig.add_trace(go.Bar(x=[n for n, _ in pairs], y=[v for _, v in pairs], name="Mem %")) + fig.update_layout(title=self.title) + return fig + + class ProcessCountGraph(GraphComponent): + component_id = "process-count-graph" + title = "System Process Count" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + if "system_process_count" in df.columns: + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df["system_process_count"]), mode="lines", name="Processes") + ) + fig.update_layout(title=self.title) + return fig + + class ThreadCountGraph(GraphComponent): + component_id = "thread-count-graph" + title = "System Thread Count" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + if "system_thread_count" in df.columns: + fig.add_trace( + go.Scatter(x=ts, y=ctx.smooth_series(df["system_thread_count"]), mode="lines", name="Threads") + ) + fig.update_layout(title=self.title) + return fig + + class OverviewGraph(GraphComponent): + component_id = "system-overview-graph" + title = "System Overview" + + def build(self, df, ts, ctx, params): + fig = go.Figure() + # include CPU aggregate and Memory % if available + cpu_cols = [c for c in df.columns if c.startswith("cpu_") and c.endswith("_utilization")] + if cpu_cols: + cpu_mean = ctx.smooth_series(df[cpu_cols].mean(axis=1)) + fig.add_trace(go.Scatter(x=ts, y=cpu_mean, mode="lines", name="CPU %")) + if {"sys_used", "sys_total"}.issubset(df.columns): + mem_pct = (df["sys_used"] / df["sys_total"] * 100.0).clip(lower=0, upper=100) + fig.add_trace(go.Scatter(x=ts, y=ctx.smooth_series(mem_pct), mode="lines", name="Mem %")) + if len(fig.data) > 0: + fig.update_layout(title=self.title, yaxis_title="%") + return fig + + def add_event_markers(fig, 
events_list, display_tz, display_tz_custom): + try: + events_list = events_list or [] + for evt in events_list: + ts_raw = evt.get("timestamp") + name = evt.get("name", "event") + if not ts_raw: + continue + ts = pd.to_datetime(ts_raw) + # Events are stored UTC-naive internally. Convert to selected display tz (naive) for rendering. + try: + if display_tz == "local" and tzlocal is not None: + ts = ts.tz_localize("UTC").tz_convert(tzlocal()).tz_localize(None) + elif display_tz == "custom" and display_tz_custom and gettz is not None: + tz = gettz(display_tz_custom) + if tz is not None: + ts = ts.tz_localize("UTC").tz_convert(tz).tz_localize(None) + else: + # utc: leave as UTC-naive + pass + except Exception: + pass + # Draw vertical line via shape for broader Plotly compatibility + try: + fig.add_shape( + type="line", + xref="x", + x0=ts, + x1=ts, + yref="paper", + y0=0, + y1=1, + line=dict(color="#8888d8", width=1, dash="dash"), + layer="above", + ) + fig.add_annotation( + x=ts, + y=1, + xref="x", + yref="paper", + text=name, + showarrow=False, + xanchor="left", + yanchor="bottom", + font=dict(size=10), + bgcolor="rgba(136,132,216,0.15)", + ) + except Exception: + pass + except Exception: + pass + return fig + + def render_event_list(events_list): + events_list = events_list or [] + items = [] + for idx, e in enumerate(events_list): + ts_txt = e.get("timestamp", "") + name_txt = e.get("name", "event") + items.append( + html.Div( + [ + html.Span(f"{ts_txt} — {name_txt}", className="event-text"), + html.Button( + "Delete", + id={"type": "event-delete", "index": idx}, + n_clicks=0, + className="inline button tiny", + style={"marginLeft": "8px"}, + ), + ], + className="event-item", + style={"display": "flex", "alignItems": "center", "justifyContent": "space-between"}, + ) + ) + if not items: + return html.Div("No events", style={"opacity": 0.7}) + return items + + def make_empty_fig(title): + fig = go.Figure() + fig.add_annotation(text=title, showarrow=False, yref="paper", y=0.5, xref="paper", x=0.5) + fig.update_layout(margin=dict(l=30, r=10, t=30, b=30)) + return fig + + def convert_ts_for_display(ts_series, display_tz, display_tz_custom, data_tz): + try: + ts = pd.to_datetime(ts_series) + # First, assign the correct base timezone to the stored timestamps (they are naive on disk) + base = None + if data_tz == "utc": + base = "UTC" + else: + try: + base = tzlocal() if tzlocal is not None else None + except Exception: + base = None + + if base is not None: + try: + ts = ts.dt.tz_localize(base) + except Exception: + # fallback: try scalar localize if Series.dt failed + try: + ts = pd.DatetimeIndex(ts).tz_localize(base) + except Exception: + pass + + # Convert to target display timezone and drop tz to keep axes naive + try: + target = None + if display_tz == "local" and tzlocal is not None: + target = tzlocal() + elif display_tz == "custom" and display_tz_custom and gettz is not None: + target = gettz(display_tz_custom) + else: + target = "UTC" + if target is not None: + ts = pd.DatetimeIndex(ts).tz_convert(target).tz_localize(None) + except Exception: + pass + return ts + except Exception: + return pd.to_datetime(ts_series, errors="coerce") + + def event_times_for_display(events_list, display_tz, display_tz_custom): + try: + if not events_list: + return pd.Series([], dtype="datetime64[ns]") + ets = pd.to_datetime([e.get("timestamp") for e in events_list if e.get("timestamp")]) + try: + if display_tz == "local" and tzlocal is not None: + ets = 
pd.DatetimeIndex(ets).tz_localize("UTC").tz_convert(tzlocal()).tz_localize(None) + elif display_tz == "custom" and display_tz_custom and gettz is not None: + tz = gettz(display_tz_custom) + if tz is not None: + ets = pd.DatetimeIndex(ets).tz_localize("UTC").tz_convert(tz).tz_localize(None) + except Exception: + pass + return pd.Series(ets) + except Exception: + return pd.Series([], dtype="datetime64[ns]") + + # Callback to update all graphs periodically + @app.callback( + Output("display-tz-custom-wrap", "style"), + [Input("display-tz", "value")], + ) + def _toggle_custom_tz(display_value): + if display_value == "custom": + return {"display": "block"} + return {"display": "none"} + + @app.callback( + [ + Output("page-container", "style"), + Output("data-source", "children"), + Output("last-updated", "children"), + Output("kpi-row", "children"), + Output("system-overview-graph", "figure"), + Output("cpu-individual-utilization-graph", "figure"), + Output("cpu-aggregated-utilization-graph", "figure"), + Output("memory-graph", "figure"), + Output("file-count-graph", "figure"), + Output("fd-usage-graph", "figure"), + Output("network-graph", "figure"), + Output("disk-io-graph", "figure"), + Output("gpu-utilization-graph", "figure"), + Output("gpu-memory-graph", "figure"), + Output("container-cpu-utilization-graph", "figure"), + Output("container-memory-utilization-graph", "figure"), + Output("container-files-graph", "figure"), + Output("container-net-graph", "figure"), + Output("container-io-graph", "figure"), + Output("container-top-cpu-graph", "figure"), + Output("container-top-mem-graph", "figure"), + Output("process-count-graph", "figure"), + Output("thread-count-graph", "figure"), + ], + [ + Input("interval-component", "n_intervals"), + Input("time-range", "value"), + Input("theme-toggle", "value"), + Input("smoothing-window", "value"), + Input("datafile-store", "data"), + Input("data-source-mode", "value"), + Input("container-auto-top", "value"), + Input("container-top-n", "value"), + Input("container-select", "value"), + Input("event-store", "data"), + Input("event-auto-store", "data"), + Input("event-display-options", "value"), + Input("display-tz", "value"), + Input("display-tz-custom", "value"), + Input("data-tz", "value"), + ], + ) + def update_graphs( + n, + time_range, + theme_value, + smoothing_window, + data_path, + source_mode, + auto_top, + top_n, + selected_manual, + events_data, + auto_events, + event_display_options, + display_tz, + display_tz_custom, + data_tz, + ): + # Load data (respect selected source mode) + df = load_data(data_path or datafile, time_range, source_mode or "auto") + last_timestamp = "Never" + if not df.empty and "timestamp" in df.columns: + try: + last_timestamp = pd.to_datetime(df["timestamp"].max()).strftime("%Y-%m-%d %H:%M:%S") + except Exception: + last_timestamp = str(df["timestamp"].max()) + + # Helpers + def smooth_series(s): + try: + w = max(1, int(smoothing_window or 1)) + if w > 1: + return s.rolling(window=w, min_periods=1).mean() + except Exception: + pass + return s + + ts = ( + convert_ts_for_display(df["timestamp"], display_tz, display_tz_custom, data_tz) + if (not df.empty and "timestamp" in df.columns) + else pd.Series([]) + ) + + # Build display timezone label and offset for UI + def display_tz_info(): + try: + label = "UTC" + tzinfo = None + if display_tz == "local" and tzlocal is not None: + tzinfo = tzlocal() + label = "Local" + elif display_tz == "custom" and display_tz_custom and gettz is not None: + tzinfo = 
gettz(display_tz_custom) + label = f"Custom ({display_tz_custom})" + else: + from dateutil.tz import tzutc + + tzinfo = tzutc() + label = "UTC" + now_dt = datetime.now(tzinfo) if tzinfo is not None else datetime.utcnow() + offset = now_dt.utcoffset() or pd.Timedelta(0) + total_minutes = int(offset.total_seconds() // 60) + sign = "+" if total_minutes >= 0 else "-" + hh = abs(total_minutes) // 60 + mm = abs(total_minutes) % 60 + offset_str = f"UTC{sign}{hh:02d}:{mm:02d}" + # Try to get a friendly tz name + tzname = now_dt.tzname() if tzinfo is not None else "UTC" + return label, tzname, offset_str + except Exception: + return "UTC", "UTC", "UTC+00:00" + + disp_label, disp_name, disp_offset = display_tz_info() + + # KPI row (compact, minimal) + def kpi_card(label, value, spark_fig=None): + return html.Div( + [ + html.Div(label, style={"fontSize": "12px", "opacity": 0.7}), + html.Div(value, style={"fontSize": "18px", "fontWeight": "600"}), + (dcc.Graph(figure=spark_fig, style={"height": "36px"}) if spark_fig is not None else None), + ], + style={"border": "1px solid #333", "borderRadius": "6px", "padding": "8px", "marginRight": "8px"}, + ) + + kpi_children = [kpi_card("Samples", f"{len(df)}")] + + # Add key latest metrics if present (Tufte-inspired: high information density, no chartjunk) + try: + if not df.empty: + latest_row = df.iloc[-1] + # CPU % (aggregate) + cpu_cols = [c for c in df.columns if c.startswith("cpu_") and c.endswith("_utilization")] + if cpu_cols: + cpu_latest = float(pd.to_numeric(latest_row[cpu_cols], errors="coerce").mean()) + # sparkline for CPU mean + try: + cpu_mean_series = pd.to_numeric(df[cpu_cols], errors="coerce").mean(axis=1) + cpu_spark = make_sparkline(ts, cpu_mean_series, theme_value) + except Exception: + cpu_spark = None + kpi_children.append(kpi_card("CPU %", f"{cpu_latest:0.1f}", cpu_spark)) + # Mem % + if {"sys_used", "sys_total"}.issubset(df.columns): + try: + mem_latest = float(latest_row["sys_used"]) / float(latest_row["sys_total"]) * 100.0 + mem_latest = max(0.0, min(100.0, mem_latest)) + try: + mem_pct_series = (df["sys_used"] / df["sys_total"] * 100.0).clip(lower=0, upper=100) + mem_spark = make_sparkline(ts, mem_pct_series, theme_value) + except Exception: + mem_spark = None + kpi_children.append(kpi_card("Mem %", f"{mem_latest:0.1f}", mem_spark)) + except Exception: + pass + # Processes / Threads + if "system_process_count" in df.columns: + try: + kpi_children.append(kpi_card("Procs", f"{int(latest_row['system_process_count'])}")) + except Exception: + pass + if "system_thread_count" in df.columns: + try: + kpi_children.append(kpi_card("Threads", f"{int(latest_row['system_thread_count'])}")) + except Exception: + pass + # Net MB/s (sum up/down) + rx_col = "net_bytes_recv_per_sec" if "net_bytes_recv_per_sec" in df.columns else None + tx_col = "net_bytes_sent_per_sec" if "net_bytes_sent_per_sec" in df.columns else None + if rx_col or tx_col: + try: + rx = float(latest_row.get(rx_col, 0.0) or 0.0) + tx = float(latest_row.get(tx_col, 0.0) or 0.0) + mbps = (rx + tx) / (1024**2) + try: + rx_series = df[rx_col] if rx_col in df.columns else 0.0 + tx_series = df[tx_col] if tx_col in df.columns else 0.0 + net_series = ( + pd.to_numeric(rx_series, errors="coerce").fillna(0) + + pd.to_numeric(tx_series, errors="coerce").fillna(0) + ) / (1024**2) + net_spark = make_sparkline(ts, net_series, theme_value) + except Exception: + net_spark = None + kpi_children.append(kpi_card("Net MB/s", f"{mbps:0.2f}", net_spark)) + except Exception: + pass + # Disk MB/s (sum 
r+w) + r_col = "disk_read_bytes_per_sec" if "disk_read_bytes_per_sec" in df.columns else None + w_col = "disk_write_bytes_per_sec" if "disk_write_bytes_per_sec" in df.columns else None + if r_col or w_col: + try: + r = float(latest_row.get(r_col, 0.0) or 0.0) + w = float(latest_row.get(w_col, 0.0) or 0.0) + mbps = (r + w) / (1024**2) + try: + r_series = df[r_col] if r_col in df.columns else 0.0 + w_series = df[w_col] if w_col in df.columns else 0.0 + io_series = ( + pd.to_numeric(r_series, errors="coerce").fillna(0) + + pd.to_numeric(w_series, errors="coerce").fillna(0) + ) / (1024**2) + io_spark = make_sparkline(ts, io_series, theme_value) + except Exception: + io_spark = None + kpi_children.append(kpi_card("Disk MB/s", f"{mbps:0.2f}", io_spark)) + except Exception: + pass + except Exception: + pass + + # Build component registry and figures using strategy pattern + # Determine selected containers first (support old and new docker column names) + all_containers = docker_container_names(df) if (not df.empty) else [] + if all_containers: + if auto_top and "auto" in (auto_top or []): + latest = df.iloc[-1] + scored = [] + for name in all_containers: + col = docker_pick_col(df, name, "cpu_percent", "container_cpu_percent") + if col: + scored.append((name, latest[col])) + selected_containers = [ + n for n, _ in sorted(scored, key=lambda x: x[1], reverse=True)[: int(top_n or 5)] + ] + else: + selected_containers = selected_manual or [] + if not selected_containers: + selected_containers = all_containers[: min(5, len(all_containers))] + else: + selected_containers = [] + + ctx_obj = GraphContext(smoothing_window, apply_theme, smooth_series) + params = {"selected_containers": selected_containers} + + components = [ + OverviewGraph(), + CPUIndividualGraph(), + CPUAggregateGraph(), + MemoryGraph(), + FileCountGraph(), + FDUsageGraph(), + NetworkGraph(), + DiskIOGraph(), + GPUUtilGraph(), + GPUMemoryGraph(), + ContainerCPUGraph(), + ContainerMemGraph(), + ContainerFilesGraph(), + ContainerNetGraph(), + ContainerIOGraph(), + TopContainersCPUBar(), + TopContainersMemBar(), + ProcessCountGraph(), + ThreadCountGraph(), + ] + + # Build figures map by id + figures_by_id = {c.component_id: go.Figure() for c in components} + if not df.empty and "timestamp" in df.columns: + for comp in components: + try: + figures_by_id[comp.component_id] = comp.build(df, ts, ctx_obj, params) + except Exception: + figures_by_id[comp.component_id] = go.Figure() + + # Merge events and theme + merged_events = (events_data or []) + (auto_events or []) + # Add events KPI + try: + evt_count = len(merged_events) + except Exception: + evt_count = 0 + kpi_children.append( + html.Div( + [ + html.Div("Events", style={"fontSize": "12px", "opacity": 0.7}), + html.Div(f"{evt_count}", style={"fontSize": "18px", "fontWeight": "600"}), + ], + style={"border": "1px solid #333", "borderRadius": "6px", "padding": "8px"}, + ) + ) + # Compute visible event time bounds to ensure markers are in-range + evt_ts = event_times_for_display(merged_events, display_tz, display_tz_custom) + # Removed unused figs_all variable (was redundant with time_series_figs and not referenced) + # Time-series figs exclude categorical bar charts (top_*). Bar charts won't get event lines. 
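+ # NOTE: keep time_series_ids below in sync with the graph component registry above; + # a time-series id omitted from the list gets no event markers or theme styling.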
+ # Post-process: apply x-axis type, include events and theme to time-series figs + time_series_ids = [ + "system-overview-graph", + "cpu-individual-utilization-graph", + "cpu-aggregated-utilization-graph", + "memory-graph", + "file-count-graph", + "fd-usage-graph", + "network-graph", + "disk-io-graph", + "gpu-utilization-graph", + "gpu-memory-graph", + "container-cpu-utilization-graph", + "container-memory-utilization-graph", + "container-files-graph", + "container-net-graph", + "container-io-graph", + "process-count-graph", + "thread-count-graph", + ] + markers_enabled = True + try: + markers_enabled = (event_display_options is None) or ("markers" in (event_display_options or [])) + except Exception: + markers_enabled = True + + for fig_id in time_series_ids: + f = figures_by_id.get(fig_id, go.Figure()) + # Ensure date x-axis for proper placement of vertical lines + f.update_xaxes(type="date") + # If we have both data ts and event ts, expand range to include both + try: + if len(ts) > 0 and len(evt_ts) > 0: + xmin = min(pd.to_datetime(ts.min()), pd.to_datetime(evt_ts.min())) + xmax = max(pd.to_datetime(ts.max()), pd.to_datetime(evt_ts.max())) + # Add small padding + pad = pd.Timedelta(seconds=1) + f.update_xaxes(range=[xmin - pad, xmax + pad]) + elif len(ts) == 0 and len(evt_ts) > 0: + # No data, but we do have events: center axis around events so markers are visible + xmin = pd.to_datetime(evt_ts.min()) + xmax = pd.to_datetime(evt_ts.max()) + if xmin == xmax: + xmax = xmin + pd.Timedelta(minutes=1) + pad = pd.Timedelta(seconds=1) + f.update_xaxes(range=[xmin - pad, xmax + pad]) + except Exception: + pass + if markers_enabled: + add_event_markers(f, merged_events, display_tz, display_tz_custom) + apply_theme(f, theme_value) + style_minimal_figure(f, theme_value) + # Always show legends for clarity (after styling which doesn't override legend) + f.update_layout( + showlegend=True, + legend=dict(orientation="h", x=0.0, y=1.0, yanchor="top"), + margin=dict(l=40, r=10, t=40, b=28), + ) + + # Process Tree handled by manual callback; nothing to build here + + # Apply theme to bar charts as well + for fig_id in ["container-top-cpu-graph", "container-top-mem-graph"]: + f = figures_by_id.get(fig_id, go.Figure()) + apply_theme(f, theme_value) + style_minimal_figure(f, theme_value) + f.update_layout(showlegend=True) + + # Page style per theme + page_style = { + "padding": "20px", + "backgroundColor": ("#111111" if theme_value == "dark" else "#ffffff"), + "color": ("#e5e5e5" if theme_value == "dark" else "#222222"), + } + + return ( + page_style, + f"Data source: {data_path or datafile} | Data TZ: UTC-naive (stored); Displayed in:" + f" {disp_label} ({disp_name}, {disp_offset})", + f"Last updated: {last_timestamp} | Display TZ: {disp_label} ({disp_name}, {disp_offset})", + kpi_children, + figures_by_id["system-overview-graph"], + figures_by_id["cpu-individual-utilization-graph"], + figures_by_id["cpu-aggregated-utilization-graph"], + figures_by_id["memory-graph"], + figures_by_id["file-count-graph"], + figures_by_id["fd-usage-graph"], + figures_by_id["network-graph"], + figures_by_id["disk-io-graph"], + figures_by_id["gpu-utilization-graph"], + figures_by_id["gpu-memory-graph"], + figures_by_id["container-cpu-utilization-graph"], + figures_by_id["container-memory-utilization-graph"], + figures_by_id["container-files-graph"], + figures_by_id["container-net-graph"], + figures_by_id["container-io-graph"], + figures_by_id["container-top-cpu-graph"], + figures_by_id["container-top-mem-graph"], + 
figures_by_id["process-count-graph"], + figures_by_id["thread-count-graph"], + ) + + # (migrated) Manual process tree callback registered in callbacks/proctree_impl.py + + # (migrated) proctree style callbacks registered in callbacks/proctree.py + + # (migrated) proctree view toggle registered in callbacks/proctree.py + + # (migrated) proctree Cytoscape callbacks registered in callbacks/proctree.py + + # (migrated) proctree status helper registered in callbacks/proctree.py + + # (migrated) proctree snapshot callback registered in callbacks/proctree.py + + # (migrated) proctree diff callback registered in callbacks/proctree.py + + # Populate container dropdown options dynamically + @app.callback( + Output("container-select", "options"), + [Input("interval-component", "n_intervals"), Input("time-range", "value"), Input("datafile-store", "data")], + ) + def update_container_options(n, time_range, data_path): + try: + df = load_data(data_path or datafile, time_range) + if df.empty: + return [] + names = sorted( + { + col.replace("_container_cpu_percent", "") + for col in df.columns + if col.endswith("_container_cpu_percent") + } + ) + return [{"label": name, "value": name} for name in names] + except Exception: + return [] + + # Clientside callback to sync CSS theme via data-theme attribute + app.clientside_callback( + """ + function(theme){ + var isLight = (theme === 'light'); + var body = document.body, html = document.documentElement; + if (isLight) { body.setAttribute('data-theme','light'); html.setAttribute('data-theme','light'); } + else { body.removeAttribute('data-theme'); html.removeAttribute('data-theme'); } + return ''; + } + """, + Output("body-bg-sync", "children"), + Input("theme-toggle", "value"), + ) + + # Manage events: add, add-now, clear + @app.callback( + [Output("event-store", "data"), Output("event-list", "children")], + [ + Input("add-event-btn", "n_clicks"), + Input("add-event-now-btn", "n_clicks"), + Input("clear-events-btn", "n_clicks"), + Input("event-upload", "contents"), + Input({"type": "event-delete", "index": ALL}, "n_clicks"), + ], + [ + State("event-name", "value"), + State("event-date", "date"), + State("event-time", "value"), + State("event-store", "data"), + State("event-auto-store", "data"), + State("time-range", "value"), + State("display-tz", "value"), + State("display-tz-custom", "value"), + State("datafile-store", "data"), + ], + ) + def manage_events( + n_add, + n_now, + n_clear, + upload_contents, + delete_clicks, + name, + date_val, + time_val, + data, + auto_data, + time_range, + display_tz, + display_tz_custom, + data_path, + ): + data = data or [] + triggered = getattr(dash, "callback_context", None) + trig_id = "" + if triggered and triggered.triggered: + trig_id = triggered.triggered[0]["prop_id"].split(".")[0] + + # Helper to get latest timestamp from datafile + def latest_data_ts(): + df = load_data(data_path or datafile, time_range) + if not df.empty and "timestamp" in df.columns: + try: + return df["timestamp"].max() + except Exception: + pass + # Fallback: now in selected display timezone (naive), will be normalized to UTC-naive below + return pd.Timestamp(datetime.now()) + + def selected_tzinfo(): + if display_tz == "utc": + try: + from dateutil.tz import tzutc + + return tzutc() + except Exception: + return None + if display_tz == "custom" and display_tz_custom and gettz is not None: + tz = gettz(display_tz_custom) + if tz is not None: + return tz + return tzlocal() if tzlocal is not None else None + + def 
normalize_ts_to_utc_naive(ts_any): + ts = pd.to_datetime(ts_any) + # If tz-aware: convert to UTC then drop tz + if getattr(ts, "tzinfo", None) is not None and ts.tzinfo is not None: + try: + ts = ts.tz_convert("UTC").tz_localize(None) + except Exception: + try: + # If tz_convert fails, maybe it's offset-naive; localize first assuming UTC + ts = ts.tz_localize("UTC").tz_localize(None) + except Exception: + pass + return ts + # tz-naive: assume in the selected display tz, convert to UTC, drop tz + try: + tzinf = selected_tzinfo() + if tzinf is not None: + ts_loc = ts.tz_localize(tzinf) + ts = ts_loc.tz_convert("UTC").tz_localize(None) + return ts + except Exception: + pass + return ts + + def from_text_to_utc_naive(val): + if not val: + raise ValueError("no datetime provided") + base = pd.to_datetime(val) + # Treat input as wall time in selected display tz; convert to UTC, then drop tz + tzinf = selected_tzinfo() + if tzinf is not None: + try: + base_loc = base.tz_localize(tzinf) + base = base_loc.tz_convert("UTC").tz_localize(None) + except Exception: + pass + return base + + # Handle per-item delete via pattern-matching id + if trig_id.startswith("{") and "event-delete" in trig_id: + try: + obj = json.loads(trig_id) + idx = int(obj.get("index", -1)) + except Exception: + idx = -1 + if 0 <= idx < len(data): + data.pop(idx) + return data, render_event_list(data) + + if trig_id == "clear-events-btn": + return [], html.Div("No events", style={"opacity": 0.7}) + + if trig_id in ("add-event-btn", "add-event-now-btn"): + evt_name = (name or "").strip() or "event" + if trig_id == "add-event-now-btn": + # Use current time in selected tz -> convert to UTC-naive for storage + now_dt = pd.Timestamp(datetime.now()) + tzinf = selected_tzinfo() + if tzinf is not None: + try: + now_dt = now_dt.tz_localize(tzinf).tz_convert("UTC").tz_localize(None) + except Exception: + pass + ts = now_dt + else: + # Combine date + time into a single wall time in selected display tz + if date_val: + t_str = (time_val or "00:00:00").strip() + # normalize time format HH:MM[:SS] + parts = t_str.split(":") + if len(parts) == 1: + t_str = f"{parts[0]}:00:00" + elif len(parts) == 2: + t_str = f"{parts[0]}:{parts[1]}:00" + try: + ts = from_text_to_utc_naive(f"{date_val} {t_str}") + except Exception: + ts = latest_data_ts() + else: + ts = latest_data_ts() + # Normalize to isoformat string + try: + ts_norm = normalize_ts(ts) + ts_str = pd.to_datetime(ts_norm).strftime("%Y-%m-%d %H:%M:%S") + except Exception: + ts_str = str(ts) + new_data = data + [{"name": evt_name, "timestamp": ts_str}] + return new_data, render_event_list(new_data) + + if trig_id == "event-upload": + try: + content_type, content_string = upload_contents.split(",") + decoded = base64.b64decode(content_string) + text = decoded.decode("utf-8", errors="ignore") + reader = csv.reader(io.StringIO(text)) + imported = [] + for row in reader: + if not row: + continue + if len(row) == 1: + # try split by comma manually + parts = row[0].split(",") + if len(parts) >= 2: + row = [parts[0], ",".join(parts[1:])] + else: + continue + evt_name = (row[0] or "").strip() or "event" + ts_text = (row[1] or "").strip() + if not ts_text: + continue + try: + ts = normalize_ts_to_utc_naive(ts_text) + ts_str = pd.to_datetime(ts).strftime("%Y-%m-%d %H:%M:%S") + except Exception: + ts_str = str(ts_text) + imported.append({"name": evt_name, "timestamp": ts_str}) + new_data = (data or []) + imported + return new_data, render_event_list(new_data) + except Exception: + return data, 
render_event_list(data) + + # default: just render existing + return data, render_event_list(data) + + # Auto event triggers (watch points) + @app.callback( + [Output("event-auto-store", "data"), Output("watch-state-store", "data")], + [Input("interval-component", "n_intervals"), Input("clear-auto-events-btn", "n_clicks")], + [ + State("watch-enable", "value"), + State("watch-cpu", "value"), + State("watch-mem", "value"), + State("watch-threads", "value"), + State("watch-procs", "value"), + State("event-auto-store", "data"), + State("watch-state-store", "data"), + State("datafile-store", "data"), + State("time-range", "value"), + ], + ) + def apply_watch_points( + n, + n_clear_auto, + enable_vals, + thr_cpu, + thr_mem, + thr_thr, + thr_prc, + auto_events, + watch_state, + data_path, + time_range, + ): + auto_events = auto_events or [] + watch_state = watch_state or {"cpu": False, "mem": False, "threads": False, "procs": False} + enabled = set(enable_vals or []) + # If clear button triggered, reset auto events and watch state immediately + triggered = getattr(dash, "callback_context", None) + trig_id = "" + if triggered and triggered.triggered: + trig_id = triggered.triggered[0]["prop_id"].split(".")[0] + if trig_id == "clear-auto-events-btn": + # Clear auto events and set watch_state according to current readings + try: + df_now = load_data(data_path or datafile, time_range) + except Exception: + return [], watch_state + if df_now.empty or "timestamp" not in df_now.columns: + return [], watch_state + latest_now = df_now.iloc[-1] + # Compute over-threshold flags to avoid immediate re-trigger if still over + new_state = {"cpu": False, "mem": False, "threads": False, "procs": False} + try: + if "cpu" in enabled and thr_cpu is not None: + cpu_cols = [c for c in df_now.columns if c.startswith("cpu_") and c.endswith("_utilization")] + cpu_avg = float(latest_now[cpu_cols].mean()) if cpu_cols else None + new_state["cpu"] = cpu_avg is not None and cpu_avg >= float(thr_cpu) + except Exception: + pass + try: + if "mem" in enabled and thr_mem is not None: + used = float(latest_now.get("sys_used", 0.0)) + total = float(latest_now.get("sys_total", 0.0)) + mem_pct = (used / total * 100.0) if total > 0 else None + new_state["mem"] = mem_pct is not None and mem_pct >= float(thr_mem) + except Exception: + pass + try: + if "threads" in enabled and thr_thr is not None: + thr_count = float(latest_now.get("system_thread_count", 0)) + new_state["threads"] = thr_count >= float(thr_thr) + except Exception: + pass + try: + if "procs" in enabled and thr_prc is not None: + prc_count = float(latest_now.get("system_process_count", 0)) + new_state["procs"] = prc_count >= float(thr_prc) + except Exception: + pass + return [], new_state + try: + df = load_data(data_path or datafile, time_range) + except Exception: + return auto_events, watch_state + if df.empty or "timestamp" not in df.columns: + return auto_events, watch_state + latest = df.iloc[-1] + ts = latest.get("timestamp") + try: + ts_norm = pd.to_datetime(normalize_ts(ts)).strftime("%Y-%m-%d %H:%M:%S") + except Exception: + ts_norm = str(ts) + + new_events = [] + # CPU avg threshold + if "cpu" in enabled and thr_cpu is not None: + try: + cpu_cols = [c for c in df.columns if c.startswith("cpu_") and c.endswith("_utilization")] + cpu_avg = float(latest[cpu_cols].mean()) if cpu_cols else None + if cpu_avg is not None: + over = cpu_avg >= float(thr_cpu) + if over and not watch_state.get("cpu", False): + new_events.append( + { + "name": f"CPU% > {float(thr_cpu):.0f} 
(now {cpu_avg:.1f})", + "timestamp": ts_norm, + } + ) + watch_state["cpu"] = over + except Exception: + pass + # Memory percent threshold + if "mem" in enabled and thr_mem is not None: + try: + used = float(latest.get("sys_used", 0.0)) + total = float(latest.get("sys_total", 0.0)) + mem_pct = (used / total * 100.0) if total > 0 else None + if mem_pct is not None: + over = mem_pct >= float(thr_mem) + if over and not watch_state.get("mem", False): + new_events.append( + { + "name": f"Mem% > {float(thr_mem):.0f} (now {mem_pct:.1f})", + "timestamp": ts_norm, + } + ) + watch_state["mem"] = over + except Exception: + pass + # Threads threshold + if "threads" in enabled and thr_thr is not None: + try: + thr_count = float(latest.get("system_thread_count", 0)) + over = thr_count >= float(thr_thr) + if over and not watch_state.get("threads", False): + new_events.append( + { + "name": f"Threads > {float(thr_thr):.0f} (now {thr_count:.0f})", + "timestamp": ts_norm, + } + ) + watch_state["threads"] = over + except Exception: + pass + # Processes threshold + if "procs" in enabled and thr_prc is not None: + try: + prc_count = float(latest.get("system_process_count", 0)) + over = prc_count >= float(thr_prc) + if over and not watch_state.get("procs", False): + new_events.append( + { + "name": f"Processes > {float(thr_prc):.0f} (now {prc_count:.0f})", + "timestamp": ts_norm, + } + ) + watch_state["procs"] = over + except Exception: + pass + + if new_events: + return (auto_events + new_events), watch_state + return auto_events, watch_state + + # ---------------------------- + # Tracer integration + # ---------------------------- + import threading + + _tracer_lock = threading.Lock() + _tracer_obj = {"tracer": None, "thread": None} # type: ignore + + def _is_running(): + t = _tracer_obj.get("thread") + return t is not None and t.is_alive() + + @app.callback( + [Output("tracer-status", "children"), Output("datafile-store", "data")], + [ + Input("tracer-start-btn", "n_clicks"), + Input("tracer-stop-btn", "n_clicks"), + Input("tracer-reset-btn", "n_clicks"), + Input("tracer-snapshot-btn", "n_clicks"), + ], + [ + State("tracer-output-path", "value"), + State("tracer-sample-interval", "value"), + State("tracer-write-interval", "value"), + State("tracer-options", "value"), + State("datafile-store", "data"), + ], + prevent_initial_call=True, + ) + def manage_tracer(n_start, n_stop, n_reset, n_snap, out_path, sample_iv, write_iv, options, current_path): + trig = getattr(dash, "callback_context", None) + trig_id = "" + if trig and trig.triggered: + trig_id = trig.triggered[0]["prop_id"].split(".")[0] + + out_path = (out_path or current_path or datafile).strip() + enable_gpu = (options or []) and ("gpu" in (options or [])) + enable_docker = (options or []) and ("docker" in (options or [])) + use_utc = (options or []) and ("utc" in (options or [])) + + if trig_id == "tracer-start-btn": + with _tracer_lock: + if _is_running(): + return (f"Tracer already running -> {out_path}", out_path) + tracer = SystemTracer( + sample_interval=float(sample_iv or 5.0), + # Keep data in memory when running from dashboard; only snapshot persists + write_interval=0.0, + output_file=out_path, + enable_gpu=bool(enable_gpu), + enable_docker=bool(enable_docker), + use_utc=bool(use_utc), + write_final=False, + ) + th = threading.Thread(target=tracer.run, kwargs={"duration": None, "verbose": False}, daemon=True) + _tracer_obj["tracer"] = tracer + _tracer_obj["thread"] = th + th.start() + return (f"Tracer started -> {out_path}", out_path) + + if 
trig_id == "tracer-stop-btn": + with _tracer_lock: + tracer = _tracer_obj.get("tracer") + th = _tracer_obj.get("thread") + if tracer is not None: + try: + tracer.stop() + except Exception: + pass + if th is not None: + try: + th.join(timeout=2.0) + except Exception: + pass + _tracer_obj["tracer"] = None + _tracer_obj["thread"] = None + return ("Tracer stopped.", out_path) + + if trig_id == "tracer-reset-btn": + with _tracer_lock: + tracer = _tracer_obj.get("tracer") + if tracer is not None: + try: + tracer.reset() + except Exception: + pass + return ("Tracer buffer reset.", out_path) + + if trig_id == "tracer-snapshot-btn": + with _tracer_lock: + tracer = _tracer_obj.get("tracer") + if tracer is not None: + try: + tracer.set_output_file(out_path) + path = tracer.snapshot(out_path) + return (f"Snapshot saved to {path}", path) + except Exception as e: + return (f"Snapshot failed: {e}", out_path) + # If tracer not running, still write empty/new file to path + try: + # Create a unique suffixed filename if destination exists + base_path = (out_path or "system_monitor.parquet").strip() + root, ext = os.path.splitext(base_path) + if not ext: + ext = ".parquet" + root = base_path # original base without extension + base_path = base_path + ext + path = base_path + if os.path.exists(path): + idx = 0 + while True: + candidate = f"{root}_{idx}{ext}" + if not os.path.exists(candidate): + path = candidate + break + idx += 1 + pd.DataFrame([]).to_parquet(path) + return (f"Snapshot (empty) saved to {path}", path) + except Exception as e: + return (f"Snapshot failed: {e}", out_path) + + return ("", out_path) + + # Start the server + print(f"Starting dashboard server on http://{host}:{port}/") + print(f"Using data file: {datafile}") + print(f"Refresh interval: {interval} seconds") + print("Press Ctrl+C to stop the server") + app.run(debug=debug, host=host, port=port) + + +if __name__ == "__main__": + run_dashboard() # This invokes the Click command diff --git a/scripts/support/system_monitor/system_tracer.py b/scripts/support/system_monitor/system_tracer.py new file mode 100644 index 000000000..1b08fef7d --- /dev/null +++ b/scripts/support/system_monitor/system_tracer.py @@ -0,0 +1,980 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES. +# All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import time +import json +import argparse +from typing import Optional, Dict, Any, List +import threading +import os + +import pandas as pd +import psutil + +try: + import pynvml # type: ignore +except Exception: # NVML is optional + pynvml = None # type: ignore +try: + import docker # type: ignore +except Exception: # Docker is optional + docker = None # type: ignore +import subprocess + +# Add these to your requirements.txt: +# pyarrow>=12.0.0 +# fastparquet>=2023.2.0 + +# --- Helpers to mirror docker stats behavior --- +# Keep names internal to avoid changing external API + + +def _docker_cpu_percent(stats: dict) -> float: + """Compute CPU% similar to `docker stats` using precpu_stats. + Falls back gracefully if fields are missing. 
+ """ + try: + cpu_stats = stats.get("cpu_stats", {}) + precpu_stats = stats.get("precpu_stats", {}) + + cpu_total = cpu_stats.get("cpu_usage", {}).get("total_usage") + pre_cpu_total = precpu_stats.get("cpu_usage", {}).get("total_usage") + + system_total = cpu_stats.get("system_cpu_usage") + pre_system_total = precpu_stats.get("system_cpu_usage") + + if cpu_total is None or pre_cpu_total is None or system_total is None or pre_system_total is None: + return 0.0 + + cpu_delta = cpu_total - pre_cpu_total + system_delta = system_total - pre_system_total + + # Prefer online_cpus when available (cgroup v2 aware); otherwise percpu length + online_cpus = cpu_stats.get("online_cpus") + if not online_cpus: + percpu = cpu_stats.get("cpu_usage", {}).get("percpu_usage") or [] + online_cpus = len(percpu) if percpu else (psutil.cpu_count() or 1) + + if system_delta > 0 and cpu_delta > 0: + return (cpu_delta / system_delta) * online_cpus * 100.0 + return 0.0 + except Exception: + return 0.0 + + +def _docker_memory_usage_limit_percent(mem_stats: dict): + """Return (used_bytes, limit_bytes, percent) using docker's approach. + On cgroup v1: used = usage - cache. On v2: prefer inactive_file subtraction if present. + """ + try: + usage = mem_stats.get("usage", 0) or 0 + limit = mem_stats.get("limit", 0) or 0 + stats = mem_stats.get("stats", {}) or {} + + # Prefer inactive_file (cgroup v2) when present; otherwise cache (v1) + inactive_file = stats.get("inactive_file") + if inactive_file is None: + inactive_file = stats.get("total_inactive_file") + cache = stats.get("cache") + + if inactive_file is not None: + used = max(usage - inactive_file, 0) + elif cache is not None: + used = max(usage - cache, 0) + else: + used = usage + + percent = (used / limit * 100.0) if limit and limit > 0 else 0.0 + return used, limit, percent + except Exception: + return 0, 0, 0.0 + + +def _aggregate_network_bytes(stats: dict): + """Sum rx/tx across all interfaces from docker stats JSON.""" + rx = 0 + tx = 0 + try: + networks = stats.get("networks", {}) or {} + for _if, vals in networks.items(): + rx += int(vals.get("rx_bytes", 0) or 0) + tx += int(vals.get("tx_bytes", 0) or 0) + except Exception: + pass + return rx, tx + + +def _aggregate_blkio_bytes(stats: dict): + """Sum blkio read/write bytes from docker stats JSON.""" + read = 0 + write = 0 + try: + entries = stats.get("blkio_stats", {}).get("io_service_bytes_recursive", []) or [] + for e in entries: + op = (e.get("op") or "").lower() + val = int(e.get("value", 0) or 0) + if op == "read": + read += val + elif op == "write": + write += val + except Exception: + pass + return read, write + + +class BaseCollector: + def collect(self) -> Dict[str, Any]: # pragma: no cover - interface + return {} + + def close(self) -> None: # pragma: no cover - optional + pass + + +class MemoryCollector(BaseCollector): + def collect(self) -> Dict[str, Any]: + mem = psutil.virtual_memory() + return {"sys_total": mem.total, "sys_used": mem.used, "sys_free": mem.free} + + +class CPUCollector(BaseCollector): + def __init__(self, percpu: bool = True, interval: Optional[float] = None) -> None: + self.percpu = percpu + self.interval = interval + + def collect(self) -> Dict[str, Any]: + utils = psutil.cpu_percent(percpu=self.percpu, interval=self.interval) + if self.percpu: + return {f"cpu_{i}_utilization": v for i, v in enumerate(utils)} + else: + return {"cpu_avg_utilization": utils} + + +class OpenFilesCollector(BaseCollector): + def __init__(self, use_lsof_fallback: bool = True) -> None: + 
self.use_lsof_fallback = use_lsof_fallback + + def collect(self) -> Dict[str, Any]: + try: + total_open_files = len(psutil.Process().net_connections()) + for proc in psutil.process_iter(["pid", "name"]): + try: + total_open_files += len(proc.open_files()) + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + pass + + if self.use_lsof_fallback: + try: + result = subprocess.run(["lsof", "-n"], capture_output=True, text=True) + lsof_count = len(result.stdout.splitlines()) - 1 + total_open_files = max(total_open_files, lsof_count) + except (subprocess.SubprocessError, FileNotFoundError): + pass + + max_files = 0 + max_files_process = "None" + for proc in psutil.process_iter(["pid", "name"]): + try: + open_count = len(proc.open_files()) + if open_count > max_files: + max_files = open_count + max_files_process = f"{proc.name()}({proc.pid})" + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + pass + + try: + with open("/proc/sys/fs/file-max", "r") as f: + fd_max = int(f.read().strip()) + with open("/proc/sys/fs/file-nr", "r") as f: + fd_used = int(f.read().split()[0]) + fd_percentage = (fd_used / fd_max) * 100 if fd_max > 0 else 0 + except (FileNotFoundError, ValueError, IndexError): + fd_max = 0 + fd_used = 0 + fd_percentage = 0 + + return { + "total_open_files": total_open_files, + "max_files_process": max_files_process, + "max_files_count": max_files, + "fd_used": fd_used, + "fd_max": fd_max, + "fd_usage_percent": fd_percentage, + } + except Exception as e: + print(f"Error getting open files count: {e}") + return { + "total_open_files": -1, + "max_files_process": f"Error: {str(e)}", + "max_files_count": -1, + "fd_used": -1, + "fd_max": -1, + "fd_usage_percent": -1, + } + + +# -------- Process tree/thread inspector (Python equivalent of thread_checker.sh) -------- +def get_process_tree_summary(root_pid: int, verbose: bool = False) -> Dict[str, Any]: + """Return a summary of a process tree rooted at root_pid. + + Provides per-process thread counts and command names, totals, and aggregation by command. + This mirrors the functionality of thread_checker.sh using psutil. 
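+
+    Example (illustrative):
+
+        >>> summary = get_process_tree_summary(os.getpid())  # doctest: +SKIP
+        >>> summary["totals"]["total_processes"] >= 1  # doctest: +SKIP
+        True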
+ """ + result: Dict[str, Any] = { + "root_pid": root_pid, + "processes": [], # list of {pid, ppid, name, threads} + "totals": {"total_processes": 0, "total_threads": 0}, + "aggregated_by_command": [], # list of {command, processes, total_threads} + "verbose": verbose, + } + try: + if root_pid <= 0: + return result + try: + root = psutil.Process(root_pid) + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + # Fallback: scan process table, locate root by pid and build tree using PPID relationships + all_infos: Dict[int, Dict[str, Any]] = {} + try: + for p in psutil.process_iter(attrs=["pid", "ppid", "name", "num_threads"]): + info = p.info + all_infos[info.get("pid")] = { + "pid": info.get("pid"), + "ppid": info.get("ppid"), + "name": info.get("name") or "(unknown)", + "threads": int(info.get("num_threads") or 0), + } + except Exception: + pass + if root_pid not in all_infos: + return result + # Build children map + by_ppid: Dict[Optional[int], list] = {} + for it in all_infos.values(): + by_ppid.setdefault(it.get("ppid"), []).append(it) + # DFS from root_pid to collect subtree + stack = [root_pid] + per_pid = [] + total_threads = 0 + seen = set() + while stack: + cur = stack.pop() + if cur in seen: + continue + seen.add(cur) + ent = all_infos.get(cur) + if not ent: + continue + per_pid.append(ent) + total_threads += ent.get("threads", 0) + for child in by_ppid.get(cur, []): + cid = child.get("pid") + if cid is not None: + stack.append(cid) + result["processes"] = sorted(per_pid, key=lambda x: (x.get("ppid") or -1, x.get("pid") or -1)) + result["totals"] = {"total_processes": len(per_pid), "total_threads": total_threads} + # Aggregate by command + agg: Dict[str, Dict[str, int]] = {} + for it in per_pid: + cmd = it.get("name") or "(unknown)" + ent = agg.setdefault(cmd, {"processes": 0, "total_threads": 0}) + ent["processes"] += 1 + ent["total_threads"] += int(it.get("threads") or 0) + result["aggregated_by_command"] = [ + {"command": k, **v} for k, v in sorted(agg.items(), key=lambda kv: kv[1]["total_threads"], reverse=True) + ] + return result + + # Gather all processes in the tree (root + recursive children) + procs = [root] + try: + procs.extend(root.children(recursive=True)) + except Exception: + pass + + per_pid = [] + total_threads = 0 + for p in procs: + if p is None: + continue + pid = None + ppid = None + name = None + threads = 0 + try: + pid = p.pid + except Exception: + continue + try: + ppid = p.ppid() + except Exception: + ppid = None + try: + name = p.name() + except Exception: + name = "(access-denied)" + try: + threads = int(p.num_threads()) + except Exception: + # If threads cannot be read due to permissions, treat as 0 but still include the process + threads = 0 + info = {"pid": pid, "ppid": ppid, "name": name, "threads": threads} + per_pid.append(info) + total_threads += threads + + result["processes"] = sorted(per_pid, key=lambda x: x["pid"]) + result["totals"] = {"total_processes": len(per_pid), "total_threads": total_threads} + + # Aggregate by command + agg: Dict[str, Dict[str, int]] = {} + for it in per_pid: + cmd = it["name"] or "(unknown)" + ent = agg.setdefault(cmd, {"processes": 0, "total_threads": 0}) + ent["processes"] += 1 + ent["total_threads"] += it["threads"] + result["aggregated_by_command"] = [ + {"command": k, **v} for k, v in sorted(agg.items(), key=lambda kv: kv[1]["total_threads"], reverse=True) + ] + except Exception as e: + result["error"] = str(e) + return result + + +class DiskIOCollector(BaseCollector): + def 
collect(self) -> Dict[str, Any]: + try: + io_counters = psutil.disk_io_counters() + return { + "disk_read_bytes": io_counters.read_bytes, + "disk_write_bytes": io_counters.write_bytes, + "disk_read_count": io_counters.read_count, + "disk_write_count": io_counters.write_count, + "disk_busy_time": io_counters.busy_time if hasattr(io_counters, "busy_time") else 0, + } + except Exception as e: + print(f"Error getting disk I/O stats: {e}") + return { + "disk_read_bytes": -1, + "disk_write_bytes": -1, + "disk_read_count": -1, + "disk_write_count": -1, + "disk_busy_time": -1, + } + + +class NetworkCollector(BaseCollector): + def collect(self) -> Dict[str, Any]: + try: + net_io = psutil.net_io_counters() + return { + "net_bytes_sent": net_io.bytes_sent, + "net_bytes_recv": net_io.bytes_recv, + "net_packets_sent": net_io.packets_sent, + "net_packets_recv": net_io.packets_recv, + "net_errin": net_io.errin, + "net_errout": net_io.errout, + "net_dropin": net_io.dropin, + "net_dropout": net_io.dropout, + } + except Exception as e: + print(f"Error getting network stats: {e}") + return { + "net_bytes_sent": -1, + "net_bytes_recv": -1, + "net_packets_sent": -1, + "net_packets_recv": -1, + "net_errin": -1, + "net_errout": -1, + "net_dropin": -1, + "net_dropout": -1, + } + + +class GPUCollector(BaseCollector): + def __init__(self) -> None: + self._inited = False + self._available = False + + def _init(self): + if self._inited: + return + try: + pynvml.nvmlInit() + self._available = True + except Exception as e: + print(f"GPU monitoring not available: {e}") + self._available = False + finally: + self._inited = True + + def collect(self) -> Dict[str, Any]: + self._init() + gpu_stats: Dict[str, Any] = {} + if not self._available: + return gpu_stats + try: + device_count = pynvml.nvmlDeviceGetCount() + for i in range(device_count): + try: + handle = pynvml.nvmlDeviceGetHandleByIndex(i) + memory_info = pynvml.nvmlDeviceGetMemoryInfo(handle) + utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) + gpu_stats[f"gpu_{i}_total"] = memory_info.total + gpu_stats[f"gpu_{i}_used"] = memory_info.used + gpu_stats[f"gpu_{i}_free"] = memory_info.free + gpu_stats[f"gpu_{i}_utilization"] = utilization.gpu + try: + temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU) + gpu_stats[f"gpu_{i}_temp"] = temp + except: # noqa: E722 + pass + except pynvml.NVMLError as e: + print(f"Error retrieving info for GPU {i}: {e}") + except Exception as e: + print(f"Error initializing GPU monitoring: {e}") + return gpu_stats + + def close(self) -> None: + if self._inited and self._available: + try: + pynvml.nvmlShutdown() + except: # noqa: E722 + pass + finally: + self._inited = False + self._available = False + + +class ProcessThreadCollector(BaseCollector): + def collect(self) -> Dict[str, Any]: + proc_count = 0 + thread_count = 0 + try: + for proc in psutil.process_iter(["pid"]): + proc_count += 1 + try: + thread_count += proc.num_threads() + except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + pass + except Exception as e: + print(f"Error counting processes/threads: {e}") + return {"system_process_count": proc_count, "system_thread_count": thread_count} + + +class DockerCollector(BaseCollector): + """Collect per-container stats and flatten into the row namespace. 
+
+    Key format (clear, consistent):
+        <prefix>_<container_name>_<metric>
+    Example: docker_nginx_cpu_percent, docker_postgres_mem_used_bytes
+    """
+
+    def __init__(
+        self,
+        client: Optional["docker.DockerClient"] = None,
+        key_prefix: str = "docker",
+        separator: str = "_",
+    ) -> None:
+        self.client = client
+        self.key_prefix = key_prefix
+        self.separator = separator
+        if self.client is None:
+            try:
+                self.client = docker.from_env()
+            except Exception as e:
+                print("Error connecting to Docker daemon:", e)
+                self.client = None
+
+    def collect(self) -> Dict[str, Any]:
+        out: Dict[str, Any] = {}
+        if self.client is None:
+            return out
+        try:
+            containers = self.client.containers.list()
+            for container in containers:
+                try:
+                    stats_raw = container.stats(stream=False)
+                    stats = (
+                        json.loads(stats_raw.decode("utf-8"))
+                        if isinstance(stats_raw, (bytes, bytearray))
+                        else stats_raw
+                    )
+
+                    cpu_percent = _docker_cpu_percent(stats)
+                    used_bytes, limit_bytes, mem_percent = _docker_memory_usage_limit_percent(
+                        stats.get("memory_stats", {})
+                    )
+                    mem_usage_gb = used_bytes / (1024**3)  # noqa: F841
+                    mem_limit_gb = limit_bytes / (1024**3) if limit_bytes else 0  # noqa: F841
+
+                    rx_bytes, tx_bytes = _aggregate_network_bytes(stats)
+                    blk_read, blk_write = _aggregate_blkio_bytes(stats)
+
+                    # Best-effort open files from container init PID
+                    try:
+                        inspect_data = container.attrs
+                        pid = inspect_data.get("State", {}).get("Pid", 0)
+                        if pid and pid > 0:
+                            proc = psutil.Process(pid)
+                            open_files_count = len(proc.open_files())
+                        else:
+                            open_files_count = -1
+                    except Exception:
+                        open_files_count = -1
+
+                    cname = container.name
+                    sep = self.separator
+                    pref = f"{self.key_prefix}{sep}{cname}" if self.key_prefix else cname
+                    out.update(
+                        {
+                            f"{pref}{sep}cpu_percent": cpu_percent,
+                            # memory (expose both raw bytes and derived percent/limit)
+                            f"{pref}{sep}mem_used_bytes": int(used_bytes),
+                            f"{pref}{sep}mem_limit_bytes": int(limit_bytes),
+                            f"{pref}{sep}mem_percent": mem_percent,
+                            # open files (best-effort)
+                            f"{pref}{sep}open_files": open_files_count,
+                            # cumulative counters for per-second derivation
+                            f"{pref}{sep}net_rx_bytes": rx_bytes,
+                            f"{pref}{sep}net_tx_bytes": tx_bytes,
+                            f"{pref}{sep}blkio_read_bytes": blk_read,
+                            f"{pref}{sep}blkio_write_bytes": blk_write,
+                        }
+                    )
+                except Exception as e:
+                    print(f"Error retrieving stats for container {container.name}: {e}")
+        except Exception as e:
+            print("Error listing Docker containers:", e)
+        return out
+
+
+def calculate_deltas(current, previous, delta_keys):
+    """Return per-second rates ("<key>_per_sec") for cumulative counters between two samples."""
+    deltas = {}
+    if previous:
+        for key in delta_keys:
+            if key in current and key in previous:
+                if isinstance(current[key], (int, float)) and isinstance(previous[key], (int, float)):
+                    time_diff = (current["timestamp"] - previous["timestamp"]).total_seconds()
+                    if time_diff > 0:
+                        delta_per_sec = (current[key] - previous[key]) / time_diff
+                        deltas[f"{key}_per_sec"] = delta_per_sec
+    return deltas
+
+
+class SystemTracer:
+    """Encapsulated system monitoring with configurable options.
+
+    Provides collection of system metrics, optional Docker and GPU stats,
+    delta computation for cumulative counters, and periodic Parquet writing.
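+
+    Example (illustrative; the output path is a placeholder):
+
+        >>> tracer = SystemTracer(sample_interval=1.0, output_file="trace.parquet")  # doctest: +SKIP
+        >>> tracer.run(duration=10.0, verbose=False)  # doctest: +SKIP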
+ """ + + def __init__( + self, + sample_interval: float = 5.0, + write_interval: float = 10.0, + output_file: str = "system_monitor.parquet", + enable_gpu: bool = True, + enable_docker: bool = True, + docker_client: Optional["docker.DockerClient"] = None, + collectors: Optional[List[BaseCollector]] = None, + use_utc: bool = False, + write_final: bool = True, + ) -> None: + self.sample_interval = sample_interval + self.write_interval = write_interval + self.output_file = output_file + self.enable_gpu = enable_gpu + self.enable_docker = enable_docker + self.data_buffer: List[Dict[str, Any]] = [] + self.previous_row: Optional[Dict[str, Any]] = None + self.delta_keys: List[str] = [ + "disk_read_bytes", + "disk_write_bytes", + "disk_read_count", + "disk_write_count", + "net_bytes_sent", + "net_bytes_recv", + "net_packets_sent", + "net_packets_recv", + ] + self.last_write_time = time.time() + self.docker_client = docker_client + self.use_utc = use_utc + self.write_final = write_final + if self.enable_docker and self.docker_client is None: + try: + self.docker_client = docker.from_env() + except Exception as e: + print(f"Docker client not available: {e}") + self.docker_client = None + + # Initialize per-metric collectors (allow override) + if collectors is not None: + self.collectors = collectors + else: + self.collectors: List[BaseCollector] = [ + MemoryCollector(), + CPUCollector(percpu=True, interval=None), + OpenFilesCollector(), + DiskIOCollector(), + NetworkCollector(), + ProcessThreadCollector(), + ] + self.gpu_collector: Optional[GPUCollector] = None + if self.enable_gpu: + self.gpu_collector = GPUCollector() + self.collectors.append(self.gpu_collector) + self.docker_collector: Optional[DockerCollector] = None + if self.enable_docker and self.docker_client is not None: + self.docker_collector = DockerCollector(client=self.docker_client) + self.collectors.append(self.docker_collector) + # Control flags/state + self._stop_event = threading.Event() + + def _shutdown_gpu(self) -> None: + # Back-compat: close GPU collector if present + if self.gpu_collector is not None: + try: + self.gpu_collector.close() + except Exception: + pass + + def write_parquet_to(self, df: pd.DataFrame, destination_path: str) -> None: + """Atomically write the dataframe to a specific parquet destination path. + + Mirrors write_parquet but targets the provided destination rather than self.output_file. 
+ """ + tmp_path = f"{destination_path}.tmp" + # Try pyarrow first + try: + import pyarrow as pa # type: ignore + import pyarrow.parquet as pq # type: ignore + + table = pa.Table.from_pandas(df) + pq.write_table(table, tmp_path) + os.replace(tmp_path, destination_path) + return + except ImportError: + pass + except Exception: + try: + if os.path.exists(tmp_path): + os.remove(tmp_path) + except Exception: + pass + raise + + # Fallback: fastparquet via pandas + try: + df.to_parquet(tmp_path, engine="fastparquet") + os.replace(tmp_path, destination_path) + finally: + try: + if os.path.exists(tmp_path): + os.remove(tmp_path) + except Exception: + pass + + def collect_once(self) -> Dict[str, Any]: + """Collect a single snapshot of system metrics, computing deltas if possible.""" + timestamp = pd.Timestamp.utcnow() if self.use_utc else pd.Timestamp.now() + row: Dict[str, Any] = {"timestamp": timestamp} + # Aggregate from all collectors + for collector in self.collectors: + try: + data = collector.collect() + if not data: + continue + row.update(data) + except Exception as e: + print(f"Collector {collector.__class__.__name__} failed: {e}") + + # If Docker collector present, add its cumulative keys to delta set + if self.docker_collector is not None: + # Match new naming: docker__ + suffixes = ( + "_net_rx_bytes", + "_net_tx_bytes", + "_blkio_read_bytes", + "_blkio_write_bytes", + ) + for k in list(row.keys()): + if any(k.endswith(sfx) for sfx in suffixes) and k not in self.delta_keys: + self.delta_keys.append(k) + + # Deltas + if self.previous_row: + deltas = calculate_deltas(row, self.previous_row, self.delta_keys) + row.update(deltas) + self.previous_row = row.copy() + return row + + def write_parquet(self, df: pd.DataFrame) -> None: + """Atomically write the current dataframe to the parquet output path. + + Prefers pyarrow; falls back to fastparquet if available. Writes to a temp + file and atomically replaces the target so readers never see partial data. + """ + tmp_path = f"{self.output_file}.tmp" + # Try pyarrow first + try: + import pyarrow as pa # type: ignore + import pyarrow.parquet as pq # type: ignore + + table = pa.Table.from_pandas(df) + pq.write_table(table, tmp_path) + os.replace(tmp_path, self.output_file) + return + except ImportError: + pass + except Exception: + # If pyarrow present but write failed, clean up and re-raise to try fallback + try: + if os.path.exists(tmp_path): + os.remove(tmp_path) + except Exception: + pass + raise + + # Fallback: fastparquet via pandas + try: + df.to_parquet(tmp_path, engine="fastparquet") + os.replace(tmp_path, self.output_file) + finally: + try: + if os.path.exists(tmp_path): + os.remove(tmp_path) + except Exception: + pass + + def run(self, duration: Optional[float] = None, verbose: bool = True) -> None: + """Run monitoring loop. If duration is set (seconds), stop after duration; else run until Ctrl+C.""" + start_time = time.time() + if verbose: + print( + f"Starting system monitoring. Data will be written to {self.output_file} every " + f"{self.write_interval} seconds." + ) + print("Press Ctrl+C to stop monitoring." 
if duration is None else f"Stopping after {duration} seconds...") + try: + while not self._stop_event.is_set(): + row = self.collect_once() + self.data_buffer.append(row) + + if verbose: + ts = row["timestamp"] + cpu_cols = [k for k in row.keys() if k.startswith("cpu_") and k.endswith("_utilization")] + cpu_vals = [row[k] for k in cpu_cols] + cpu_avg = sum(cpu_vals) / len(cpu_vals) if cpu_vals else 0.0 + mem_pct = (row["sys_used"] / row["sys_total"] * 100.0) if row.get("sys_total") else 0.0 + print( + f"\n[{ts}] CPU Avg: {cpu_avg:.1f}% | Memory: {mem_pct:.1f}% | " + f"Open Files: {row.get('total_open_files', 0)}" + ) + if "disk_read_bytes_per_sec" in row: + print( + f"Disk I/O: {row['disk_read_bytes_per_sec']/1024**2:.2f} MB/s read, " + f"{row['disk_write_bytes_per_sec']/1024**2:.2f} MB/s write" + ) + if "net_bytes_recv_per_sec" in row: + print( + f"Network: {row['net_bytes_recv_per_sec']/1024**2:.2f} MB/s down, " + f"{row['net_bytes_sent_per_sec']/1024**2:.2f} MB/s up" + ) + + # Periodic write (overwrite full buffered data) if enabled + now = time.time() + if ( + self.write_interval + and self.write_interval > 0 + and (now - self.last_write_time) >= self.write_interval + ): + df = pd.DataFrame(self.data_buffer) + try: + self.write_parquet(df) + if verbose: + print( + f"Total accumulated data ({len(self.data_buffer)} rows) " + f"written to {self.output_file} at {row['timestamp']}" + ) + self.last_write_time = now + except Exception as e: + print(f"Error writing periodic data: {e}") + + # Stop conditions + if duration is not None and (now - start_time) >= duration: + break + + time.sleep(self.sample_interval) + except KeyboardInterrupt: + if verbose: + print("\nStopping monitoring. Writing final data batch...") + finally: + # Final write if enabled + if self.write_final and self.data_buffer: + df = pd.DataFrame(self.data_buffer) + try: + self.write_parquet(df) + if verbose: + print(f"Final data written to {self.output_file}. Exiting.") + except Exception as e: + print(f"Error writing final data: {e}") + # Close any collectors that need cleanup + try: + self._shutdown_gpu() + except Exception: + pass + + def stop(self) -> None: + """Signal the run loop to stop.""" + self._stop_event.set() + + def reset(self) -> None: + """Clear accumulated data and deltas. Does not change output file.""" + self.data_buffer = [] + self.previous_row = None + self.last_write_time = time.time() + # Do not clear stop flag to allow caller to decide lifecycle + + def set_output_file(self, output_file: str) -> None: + """Update the output parquet path used by periodic writes.""" + self.output_file = output_file + + def snapshot(self, output_file: Optional[str] = None) -> str: + """Write the current buffered dataframe to the specified parquet path (or self.output_file). + + Returns the path written. + """ + base_path = output_file or self.output_file + if not base_path: + raise ValueError("No output_file specified for snapshot.") + # If destination exists, create a unique suffixed name: file.parquet -> file_0.parquet, ... 
+ root, ext = os.path.splitext(base_path) + if not ext: + ext = ".parquet" + root = base_path # original base without extension + base_path = base_path + ext + + path = base_path + if os.path.exists(path): + idx = 0 + while True: + candidate = f"{root}_{idx}{ext}" + if not os.path.exists(candidate): + path = candidate + break + idx += 1 + df = pd.DataFrame(self.data_buffer) + if not df.empty: + self.write_parquet_to(df, path) + else: + # Still write an empty table with schema + self.write_parquet_to(pd.DataFrame([]), path) + return path + + +# -------- Functional API -------- +def collect_system_snapshot(enable_gpu: bool = True, enable_docker: bool = True, docker_client=None) -> Dict[str, Any]: + tracer = SystemTracer( + sample_interval=0.0, + write_interval=0.0, + output_file="", + enable_gpu=enable_gpu, + enable_docker=enable_docker, + docker_client=docker_client, + use_utc=False, + ) + return tracer.collect_once() + + +def monitor_to_parquet( + output_file: str = "system_monitor.parquet", + sample_interval: float = 5.0, + write_interval: float = 10.0, + duration: Optional[float] = None, + enable_gpu: bool = True, + enable_docker: bool = True, + docker_client=None, + verbose: bool = True, + use_utc: bool = False, +) -> None: + tracer = SystemTracer( + sample_interval=sample_interval, + write_interval=write_interval, + output_file=output_file, + enable_gpu=enable_gpu, + enable_docker=enable_docker, + docker_client=docker_client, + use_utc=use_utc, + ) + tracer.run(duration=duration, verbose=verbose) + + +# -------- CLI utility -------- +def main(): + parser = argparse.ArgumentParser(description="System monitor/tracer CLI") + sub = parser.add_subparsers(dest="command") + + # run (default) + p_run = sub.add_parser("run", help="Run continuous monitoring and write Parquet") + p_run.add_argument("--output", default="system_monitor.parquet", help="Parquet output file path") + p_run.add_argument("--sample-interval", type=float, default=5.0, help="Sampling interval seconds") + p_run.add_argument("--write-interval", type=float, default=10.0, help="Write interval seconds") + p_run.add_argument("--duration", type=float, default=None, help="Optional duration to run (seconds)") + p_run.add_argument("--no-gpu", action="store_true", help="Disable GPU collection") + p_run.add_argument("--no-docker", action="store_true", help="Disable Docker collection") + p_run.add_argument("--quiet", action="store_true", help="Reduce console output") + p_run.add_argument("--utc", action="store_true", help="Record timestamps in UTC (default is local time)") + + # snapshot + p_snap = sub.add_parser("snapshot", help="Collect a single snapshot and print JSON") + p_snap.add_argument("--no-gpu", action="store_true", help="Disable GPU collection") + p_snap.add_argument("--no-docker", action="store_true", help="Disable Docker collection") + p_snap.add_argument("--utc", action="store_true", help="Use UTC timestamp for the snapshot") + + # proctree (process/thread inspection) + p_tree = sub.add_parser("proctree", help="Inspect a process tree and summarize threads") + p_tree.add_argument("pid", type=int, help="Root PID to inspect") + p_tree.add_argument("--verbose", action="store_true", help="Verbose per-PID output in JSON") + + args = parser.parse_args() + if not getattr(args, "command", None): + # No subcommand provided; default to 'run' so that subparser defaults are applied + args = parser.parse_args(["run"]) + cmd = args.command + + if cmd == "snapshot": + # One-off snapshot; use_utc affects only the timestamp on this row + 
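+        # Example output (illustrative): a single JSON object such as
+        # {"timestamp": "...", "sys_total": ..., "sys_used": ..., "cpu_0_utilization": ..., ...}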
tracer = SystemTracer(
+            sample_interval=0.0,
+            write_interval=0.0,
+            output_file="",
+            enable_gpu=not args.no_gpu,
+            enable_docker=not args.no_docker,
+            docker_client=None,
+            use_utc=bool(getattr(args, "utc", False)),
+        )
+        snap = tracer.collect_once()
+        print(json.dumps(snap, default=str))
+        return
+
+    if cmd == "proctree":
+        summary = get_process_tree_summary(args.pid, verbose=args.verbose)
+        print(json.dumps(summary, default=str))
+        return
+
+    # default: run
+    monitor_to_parquet(
+        output_file=args.output,
+        sample_interval=args.sample_interval,
+        write_interval=args.write_interval,
+        duration=args.duration,
+        enable_gpu=not args.no_gpu,
+        enable_docker=not args.no_docker,
+        verbose=not args.quiet,
+        use_utc=bool(getattr(args, "utc", False)),
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/support/system_monitor/tracer.py b/scripts/support/system_monitor/tracer.py
new file mode 100644
index 000000000..98e9483b9
--- /dev/null
+++ b/scripts/support/system_monitor/tracer.py
@@ -0,0 +1,25 @@
+"""Stable import surface for tracer utilities.
+
+Lightweight wrapper exposing tracer helpers (and the tracer CLI) under a short module path.
+
+Usage:
+    from system_monitor.tracer import (
+        get_process_tree_summary,
+    )
+"""
+
+# flake8: noqa
+
+from .system_tracer import (
+    get_process_tree_summary,
+)
+
+__all__ = ["get_process_tree_summary"]
+
+if __name__ == "__main__":
+    import sys
+
+    # Delegate to the full CLI in system_tracer; run via `python -m system_monitor.tracer`
+    # so the relative import above resolves.
+    from .system_tracer import main as tracer_main
+
+    sys.exit(tracer_main())
diff --git a/scripts/support/trace_summarizer.py b/scripts/support/trace_summarizer.py
new file mode 100644
index 000000000..22b588ffb
--- /dev/null
+++ b/scripts/support/trace_summarizer.py
@@ -0,0 +1,377 @@
+#!/usr/bin/env python3
+"""
+Trace Summarizer
+
+Parses NV-Ingest trace maps (trace::entry::<name>, trace::exit::<name>) and computes
+total time spent in each area. Aggregates across numbered suffixes like _0, _1.
+
+Usage:
+  - From file(s):
+      python scripts/support/trace_summarizer.py traces.json [more.json]
+  - From stdin:
+      cat traces.json | python scripts/support/trace_summarizer.py -
+
+Options (see --help for the full set of flags):
+  --units [ns|ms|s]  Output units (default: ms)
+  --json             Output JSON summary
+  --top N            Show top N entries only
+"""
+
+import sys
+import json
+import re
+import argparse
+import os
+import glob
+import math
+from collections import defaultdict
+from typing import Dict, Any, List, Tuple
+
+
+ENTRY_PREFIX = "trace::entry::"
+EXIT_PREFIX = "trace::exit::"
+SUFFIX_NUM_RE = re.compile(r"(.*)_(\d+)$")
+
+
+def _normalize_name(name: str) -> str:
+    """
+    Normalize a trace name by stripping a trailing _<n> numeric suffix to aggregate repeated items.
+    Example: "pdf_extractor::pdf_extraction::pdfium_pages_to_numpy_0" -> "...::pdfium_pages_to_numpy"
+    """
+    m = SUFFIX_NUM_RE.match(name)
+    return m.group(1) if m else name
+
+
+def _load_trace_map(path: str) -> Dict[str, Any]:
+    if path == "-":
+        data = sys.stdin.read()
+        return json.loads(data)
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def _iter_entries(trace_map: Dict[str, Any]) -> List[Tuple[str, int]]:
+    """
+    Return list of (name, entry_ts) for all entry keys.
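+
+    Example: {"trace::entry::stage": 5, "trace::exit::stage": 9} -> [("stage", 5)]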
+ """ + out = [] + for k, v in trace_map.items(): + if isinstance(k, str) and k.startswith(ENTRY_PREFIX): + name = k[len(ENTRY_PREFIX) :] + try: + ts = int(v) + except Exception: + # Attempt float -> int if provided as float + ts = int(float(v)) + out.append((name, ts)) + return out + + +def _get_exit_ts(trace_map: Dict[str, Any], name: str) -> int | None: + k = EXIT_PREFIX + name + if k not in trace_map: + return None + v = trace_map[k] + try: + return int(v) + except Exception: + return int(float(v)) + + +def _convert_units(ns: int, units: str) -> float: + if units == "ns": + return float(ns) + if units == "ms": + return ns / 1e6 + if units == "s": + return ns / 1e9 + raise ValueError("Unsupported units: " + units) + + +def summarize_durations(trace_maps: List[Dict[str, Any]], normalize_suffixes: bool = True) -> Dict[str, List[int]]: + """ + Compute list of durations (in ns) per normalized area name across all maps. + """ + durations: Dict[str, List[int]] = defaultdict(list) + for trace_map in trace_maps: + entries = _iter_entries(trace_map) + for name, ts_entry in entries: + ts_exit = _get_exit_ts(trace_map, name) + if ts_exit is None: + print(f"[warn] missing exit for '{name}'", file=sys.stderr) + continue + if ts_exit < ts_entry: + print(f"[warn] exit < entry for '{name}'", file=sys.stderr) + continue + norm = _normalize_name(name) if normalize_suffixes else name + durations[norm].append(ts_exit - ts_entry) + return dict(durations) + + +def summarize(trace_maps: List[Dict[str, Any]], normalize_suffixes: bool = True) -> Dict[str, int]: + """ + Compute total durations in nanoseconds per normalized area name across all maps. + """ + totals_ns: Dict[str, int] = defaultdict(int) + durations = summarize_durations(trace_maps, normalize_suffixes=normalize_suffixes) + for k, vals in durations.items(): + totals_ns[k] = sum(vals) + return dict(totals_ns) + + +def main(): + ap = argparse.ArgumentParser(description="Summarize NV-Ingest trace timings.") + ap.add_argument("inputs", nargs="+", help="JSON files or '-' for stdin") + ap.add_argument("--units", choices=["ns", "ms", "s"], default="ms", help="Output units (default: ms)") + ap.add_argument("--json", action="store_true", help="Output JSON instead of table") + ap.add_argument("--tree", action="store_true", help="Output hierarchical, indented tree view") + ap.add_argument( + "--no-aggregate-suffixes", + action="store_true", + help="Do not strip trailing numeric suffixes; keep entries like *_0, *_1 separate", + ) + ap.add_argument( + "--cumulative", + action="store_true", + help="Tree mode: show parent totals including the sum of all descendants", + ) + ap.add_argument("--top", type=int, default=0, help="Show top N entries") + ap.add_argument( + "--threshold", + type=float, + default=0.0, + help="Minimum fraction (0..1) of total aggregate time a stage must account for to be shown", + ) + ap.add_argument( + "--exclude-channel-in", + action="store_true", + help="Exclude entries whose names contain 'channel_in' or 'network_in' from listings", + ) + args = ap.parse_args() + + trace_maps = [] + # Aggregate primitive counts across inputs if present + primitive_total = 0 + primitive_by_type = defaultdict(int) + structured_by_subtype = defaultdict(int) + # Expand inputs: allow directories and glob patterns + expanded_inputs: List[str] = [] + for spec in args.inputs: + if spec == "-": + expanded_inputs.append(spec) + continue + if os.path.isdir(spec): + # Aggregate all *.traces.json files in the directory (non-recursive) + 
expanded_inputs.extend(sorted(glob.glob(os.path.join(spec, "*.traces.json")))) + continue + # Glob pattern (supports recursive with **) + matches = glob.glob(spec, recursive=True) + if matches: + expanded_inputs.extend(sorted(matches)) + else: + # Fallback: treat as a single file path + expanded_inputs.append(spec) + + for p in expanded_inputs: + try: + m = _load_trace_map(p) + trace_maps.append(m) + pc = m.get("primitive_counts") + if isinstance(pc, dict): + try: + primitive_total += int(pc.get("total", 0)) + except Exception: + pass + by_type = pc.get("by_type") or {} + if isinstance(by_type, dict): + for k, v in by_type.items(): + try: + primitive_by_type[k] += int(v) + except Exception: + continue + by_sub = pc.get("structured_by_subtype") or {} + if isinstance(by_sub, dict): + for k, v in by_sub.items(): + try: + structured_by_subtype[k] += int(v) + except Exception: + continue + except Exception as e: + print(f"[error] failed to load {p}: {e}", file=sys.stderr) + return 2 + + normalize_suffixes = not args.no_aggregate_suffixes + durations_by_name = summarize_durations(trace_maps, normalize_suffixes=normalize_suffixes) + + # Optionally exclude channel_in/network_in entries + if args.exclude_channel_in: + durations_by_name = { + k: v for k, v in durations_by_name.items() if ("channel_in" not in k and "network_in" not in k) + } + + # Compute stats per name + def _percentile(values: List[int], p: float) -> float: + if not values: + return 0.0 + vs = sorted(values) + n = len(vs) + # Nearest-rank method + rank = max(1, int(math.ceil(p * n))) + return float(vs[rank - 1]) + + stats_map: Dict[str, Dict[str, float]] = {} + for name, vals in durations_by_name.items(): + total = float(sum(vals)) + count = len(vals) + mean = (total / count) if count else 0.0 + p95 = _percentile(vals, 0.95) + p99 = _percentile(vals, 0.99) + stats_map[name] = { + "total_ns": total, + "count": count, + "mean_ns": mean, + "p95_ns": p95, + "p99_ns": p99, + } + + # Apply threshold filtering as a fraction of total aggregate time + total_ns_all = sum(meta["total_ns"] for meta in stats_map.values()) + if args.threshold and total_ns_all > 0: + stats_map = { + name: meta for name, meta in stats_map.items() if (meta["total_ns"] / total_ns_all) >= args.threshold + } + + # Sort by total time desc + items = sorted(stats_map.items(), key=lambda kv: kv[1]["total_ns"], reverse=True) + + if args.json: + out = { + name: { + "total": _convert_units(int(meta["total_ns"]), args.units), + "count": int(meta["count"]), + "mean": _convert_units(int(meta["mean_ns"]), args.units), + "p95": _convert_units(int(meta["p95_ns"]), args.units), + "p99": _convert_units(int(meta["p99_ns"]), args.units), + } + for name, meta in items + } + print(json.dumps(out, indent=2)) + return 0 + + if args.tree: + # Print primitive distribution before timing tree (non-JSON) + print("primitive distribution") + print("-" * 98) + print(f"{'total':<80} {primitive_total:>16}") + if primitive_by_type: + for k, v in sorted(primitive_by_type.items(), key=lambda kv: kv[1], reverse=True): + print(f"{k:<80} {v:>16}") + if structured_by_subtype: + print() + print("structured by subtype") + print("-" * 98) + for k, v in sorted(structured_by_subtype.items(), key=lambda kv: kv[1], reverse=True): + print(f"{k:<80} {v:>16}") + print() + + # Human-readable hierarchical tree + def build_tree(pairs: List[Tuple[str, int]]): + tree = {} + for full_name, total_ns in pairs: + parts = full_name.split("::") if full_name else [full_name] + cur = tree + for i, part in enumerate(parts): + if 
part not in cur: + cur[part] = {"__total_ns__": 0, "__children__": {}} + if i == len(parts) - 1: + cur[part]["__total_ns__"] += total_ns + cur = cur[part]["__children__"] + return tree + + def flatten_tree(node: dict, level: int = 0): + rows = [] + children = [(k, v) for k, v in node.items() if not k.startswith("__")] + children.sort(key=lambda kv: kv[1]["__total_ns__"], reverse=True) + for name, meta in children: + rows.append((level, name, meta["__total_ns__"])) + rows.extend(flatten_tree(meta["__children__"], level + 1)) + return rows + + # Apply --top to root-level only (tree mode) + root_totals = defaultdict(int) + for name, meta in items: + total_ns = int(meta["total_ns"]) + root = name.split("::")[0] + root_totals[root] += total_ns + sorted_roots = sorted(root_totals.items(), key=lambda kv: kv[1], reverse=True) + if args.top and args.top > 0: + keep_roots = set([r for r, _ in sorted_roots[: args.top]]) + filtered_items = [ + (name, int(meta["total_ns"])) for name, meta in items if name.split("::")[0] in keep_roots + ] + else: + filtered_items = [(name, int(meta["total_ns"])) for name, meta in items] + + tree = build_tree(filtered_items) + rows = flatten_tree(tree) + # Print as table with indentation for the area column + col1 = "area" + col2 = f"total ({args.units})" + col3 = "count" + col4 = f"mean ({args.units})" + col5 = f"p95 ({args.units})" + col6 = f"p99 ({args.units})" + print(f"{col1:<60} {col2:>12} {col3:>8} {col4:>12} {col5:>12} {col6:>12}") + print("-" * 120) + for level, name, total_ns in rows: + total_val = _convert_units(total_ns, args.units) + display = f"{' ' * level}{name}" + meta = stats_map.get(name) + if meta: + count = int(meta["count"]) + mean_val = _convert_units(int(meta["mean_ns"]), args.units) + p95_val = _convert_units(int(meta["p95_ns"]), args.units) + p99_val = _convert_units(int(meta["p99_ns"]), args.units) + print( + f"{display:<60} {total_val:>12.3f} {count:>8} {mean_val:>12.3f} {p95_val:>12.3f} {p99_val:>12.3f}" + ) + else: + print(f"{display:<60} {total_val:>12.3f} {'-':>8} {'-':>12} {'-':>12} {'-':>12}") + else: + # Print primitive distribution before flat table (non-JSON) + print("primitive distribution") + print("-" * 98) + print(f"{'total':<80} {primitive_total:>16}") + if primitive_by_type: + for k, v in sorted(primitive_by_type.items(), key=lambda kv: kv[1], reverse=True): + print(f"{k:<80} {v:>16}") + if structured_by_subtype: + print() + print("structured by subtype") + print("-" * 98) + for k, v in sorted(structured_by_subtype.items(), key=lambda kv: kv[1], reverse=True): + print(f"{k:<80} {v:>16}") + print() + # Flat table output with stats + flat_items = items[: args.top] if (args.top and args.top > 0) else items + col1 = "area" + col2 = f"total ({args.units})" + col3 = "count" + col4 = f"mean ({args.units})" + col5 = f"p95 ({args.units})" + col6 = f"p99 ({args.units})" + print(f"{col1:<60} {col2:>12} {col3:>8} {col4:>12} {col5:>12} {col6:>12}") + print("-" * 120) + for name, meta in flat_items: + total_val = _convert_units(int(meta["total_ns"]), args.units) + count = int(meta["count"]) + mean_val = _convert_units(int(meta["mean_ns"]), args.units) + p95_val = _convert_units(int(meta["p95_ns"]), args.units) + p99_val = _convert_units(int(meta["p99_ns"]), args.units) + print(f"{name:<60} {total_val:>12.3f} {count:>8} {mean_val:>12.3f} {p95_val:>12.3f} {p99_val:>12.3f}") + + return 0 + + +if __name__ == "__main__": + sys.exit(main())
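+
+
+# Example usage (illustrative; assumes trace maps dumped as *.traces.json):
+#   python scripts/support/trace_summarizer.py traces/ --tree --units ms
+#   python scripts/support/trace_summarizer.py run_0.traces.json run_1.traces.json --json --top 10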