diff --git a/nvidia/dgx-spark-mcp-server/README.md b/nvidia/dgx-spark-mcp-server/README.md new file mode 100644 index 0000000..1cd78cf --- /dev/null +++ b/nvidia/dgx-spark-mcp-server/README.md @@ -0,0 +1,84 @@ +# DGX Spark MCP Server Playbook + +This playbook installs and configures the **DGX Spark MCP Server**, a tool that provides hardware-aware Apache Spark optimization for NVIDIA DGX systems via the Model Context Protocol (MCP). + +## Overview + +The DGX Spark MCP Server enables MCP clients (like Claude Desktop or Claude Code) to: +* **Detect Hardware**: Automatically read DGX GPU topology, memory, and CPU specs. +* **Optimize Spark**: Generate tuned Spark configurations (`spark-submit` args) based on detected hardware and workload type (ETL, ML Training, Inference). +* **Monitor**: Check real-time GPU availability before submitting jobs. + +## Prerequisites + +* **NVIDIA DGX System** (or compatible GPU server) +* **NVIDIA Drivers** installed (`nvidia-smi` available) +* **Node.js 18+** +* **Root access** (for systemd service installation) + +## Directory Structure + +``` +. +├── config/ +│ └── default.json # Default configuration +├── deploy/ +│ └── dgx-spark-mcp.service # Systemd service file +└── scripts/ + └── install.sh # Automated installer +``` + +## Installation + +1. **Run the installer**: + ```bash + sudo ./scripts/install.sh + ``` + This script will: + * Install `dgx-spark-mcp` globally via `npm`. + * Create a dedicated system user (`dgx`). + * Set up the logging directory `/var/log/dgx-spark-mcp`. + * Install and start the systemd service. + +2. **Verify Installation**: + ```bash + systemctl status dgx-spark-mcp + ``` + +## Configuration + +The configuration file is located at `/etc/dgx-spark-mcp/config.json`. + +### Key Settings + +* **`mcp.transport`**: `stdio` (default) or `sse`. +* **`hardware.enableGpuMonitoring`**: Set to `true` to enable real-time `nvidia-smi` queries. +* **`logging.level`**: `info` or `debug`. 
+ +## Usage with Claude Desktop + +Add the following to your `claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "dgx-spark": { + "command": "dgx-spark-mcp" + } + } +} +``` + +## Troubleshooting + +**Service fails to start?** +Check logs: +```bash +journalctl -u dgx-spark-mcp -f +``` + +**Permission denied?** +Ensure the `dgx` user has permissions to access `nvidia-smi`. You may need to add the user to the `video` group: +```bash +usermod -a -G video dgx +``` diff --git a/nvidia/dgx-spark-mcp-server/config/default.json b/nvidia/dgx-spark-mcp-server/config/default.json new file mode 100644 index 0000000..3fee04c --- /dev/null +++ b/nvidia/dgx-spark-mcp-server/config/default.json @@ -0,0 +1,33 @@ +{ + "server": { + "port": 3000, + "host": "localhost", + "nodeEnv": "production" + }, + "logging": { + "level": "info", + "format": "json", + "dir": "/var/log/dgx-spark-mcp", + "maxFiles": 10, + "maxSize": "10m" + }, + "mcp": { + "serverName": "dgx-spark-mcp", + "serverVersion": "0.1.0", + "transport": "stdio" + }, + "hardware": { + "nvidiaSmiPath": "/usr/bin/nvidia-smi", + "cacheTTL": 30000, + "enableGpuMonitoring": true + }, + "spark": {}, + "performance": { + "enableMetrics": true, + "metricsInterval": 60000, + "healthCheckInterval": 30000 + }, + "security": { + "enableAuth": false + } +} diff --git a/nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service b/nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service new file mode 100644 index 0000000..bf8cd39 --- /dev/null +++ b/nvidia/dgx-spark-mcp-server/deploy/dgx-spark-mcp.service @@ -0,0 +1,48 @@ +[Unit] +Description=DGX Spark MCP Server +Documentation=https://github.com/raibid-labs/dgx-spark-mcp +After=network.target +Wants=network-online.target + +[Service] +Type=simple +User=dgx +Group=dgx +# Environment variables +Environment="NODE_ENV=production" +Environment="DGX_MCP_CONFIG_PATH=/etc/dgx-spark-mcp/config.json" + +# Start the service +# Assumes installed globally via npm 
+ExecStart=/usr/local/bin/dgx-spark-mcp
+
+# Restart policy
+Restart=on-failure
+RestartSec=10
+StartLimitInterval=600
+StartLimitBurst=5
+
+# Resource limits
+LimitNOFILE=65536
+LimitNPROC=4096
+
+# Security hardening
+NoNewPrivileges=true
+PrivateTmp=true
+ProtectSystem=strict
+ProtectHome=true
+# Allow write access to logs
+ReadWritePaths=/var/log/dgx-spark-mcp
+
+# Logging
+StandardOutput=journal
+StandardError=journal
+SyslogIdentifier=dgx-spark-mcp
+
+# Process management
+KillMode=mixed
+KillSignal=SIGTERM
+TimeoutStopSec=30
+
+[Install]
+WantedBy=multi-user.target
diff --git a/nvidia/dgx-spark-mcp-server/scripts/install.sh b/nvidia/dgx-spark-mcp-server/scripts/install.sh
new file mode 100755
index 0000000..c2f033b
--- /dev/null
+++ b/nvidia/dgx-spark-mcp-server/scripts/install.sh
@@ -0,0 +1,130 @@
+#!/bin/bash
+set -euo pipefail
+
+# DGX Spark MCP Server - Playbook Installation Script
+#
+# Installs the server from the npm registry, creates the service account,
+# seeds /etc/dgx-spark-mcp/config.json and registers the systemd unit.
+# Must be run as root and takes no arguments. Re-running is safe for the
+# configuration: an existing config.json is left untouched.
+
+# Configuration
+readonly PACKAGE_NAME="dgx-spark-mcp"
+readonly SERVICE_NAME="dgx-spark-mcp"
+readonly CONFIG_DIR="/etc/dgx-spark-mcp"
+readonly LOG_DIR="/var/log/dgx-spark-mcp"
+# Deliberately not USER/GROUP: USER is normally an exported login variable,
+# so reassigning it would hand "dgx" to npm and systemctl as the caller.
+readonly SERVICE_USER="dgx"
+readonly SERVICE_GROUP="dgx"
+# Must stay in sync with ExecStart= in deploy/dgx-spark-mcp.service.
+readonly EXEC_PATH="/usr/local/bin/dgx-spark-mcp"
+readonly MIN_NODE_MAJOR=18
+
+# Colors
+readonly RED='\033[0;31m'
+readonly GREEN='\033[0;32m'
+readonly NC='\033[0m'
+
+# Print an informational message to stdout.
+log_info() { printf '%b[INFO]%b %s\n' "$GREEN" "$NC" "$1"; }
+# Print an error message to stderr so it is not lost when stdout is piped.
+log_error() { printf '%b[ERROR]%b %s\n' "$RED" "$NC" "$1" >&2; }
+# Report a fatal error and abort the installation with status 1.
+die() {
+  log_error "$1"
+  exit 1
+}
+
+# Check root
+if [[ $EUID -ne 0 ]]; then
+  die "This script must be run as root"
+fi
+
+# Resolve the playbook files first so that a missing unit file aborts the
+# run before anything has been installed.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+readonly SCRIPT_DIR
+readonly DEFAULT_CONFIG="$SCRIPT_DIR/../config/default.json"
+readonly UNIT_SOURCE="$SCRIPT_DIR/../deploy/$SERVICE_NAME.service"
+if [[ ! -f "$UNIT_SOURCE" ]]; then
+  die "Service file not found."
+fi
+
+# 1. Check Node.js and npm (they are required, not installed, by this script)
+if ! command -v node &>/dev/null; then
+  die "Node.js not found. Please install Node.js 18+."
+fi
+if ! command -v npm &>/dev/null; then
+  die "npm not found. Please install npm together with Node.js 18+."
+fi
+node_major="$(node -p 'process.versions.node.split(".")[0]')"
+if ((node_major < MIN_NODE_MAJOR)); then
+  die "Node.js 18+ is required, found $(node --version)."
+fi
+
+# 2. Install Package
+log_info "Installing $PACKAGE_NAME from registry..."
+npm install -g "$PACKAGE_NAME"
+
+# The unit file hard-codes EXEC_PATH, but npm links global binaries under
+# "<prefix>/bin", and the prefix is not always /usr/local. Bridge the gap
+# with a symlink. Assumes the package's binary is named like the service,
+# which is what the unit's ExecStart already expects.
+if [[ ! -x "$EXEC_PATH" ]]; then
+  npm_bin="$(npm config get prefix)/bin/$SERVICE_NAME"
+  if [[ ! -x "$npm_bin" ]]; then
+    die "Installed binary not found at $EXEC_PATH or $npm_bin"
+  fi
+  log_info "Linking $npm_bin to $EXEC_PATH..."
+  ln -sfn "$npm_bin" "$EXEC_PATH"
+fi
+
+# 3. Create User
+# The group is created explicitly: whether useradd makes a same-named group
+# depends on USERGROUPS_ENAB in /etc/login.defs, and the chown calls below
+# fail without it.
+if ! getent group "$SERVICE_GROUP" &>/dev/null; then
+  log_info "Creating group $SERVICE_GROUP..."
+  groupadd --system "$SERVICE_GROUP"
+fi
+if ! id -u "$SERVICE_USER" &>/dev/null; then
+  log_info "Creating user $SERVICE_USER..."
+  useradd --system --gid "$SERVICE_GROUP" --no-create-home \
+    --shell /bin/false "$SERVICE_USER"
+fi
+
+# 4. Setup Directories
+log_info "Setting up directories..."
+mkdir -p "$CONFIG_DIR"
+mkdir -p "$LOG_DIR"
+
+# Copy config if provided in playbook, but never overwrite a config.json
+# that is already in place (it may carry local edits).
+if [[ -f "$CONFIG_DIR/config.json" ]]; then
+  log_info "Existing config found, keeping $CONFIG_DIR/config.json."
+elif [[ -f "$DEFAULT_CONFIG" ]]; then
+  cp "$DEFAULT_CONFIG" "$CONFIG_DIR/config.json"
+else
+  log_info "No default config found, using internal defaults."
+fi
+
+# Permissions
+chown -R "$SERVICE_USER:$SERVICE_GROUP" "$LOG_DIR"
+chown -R "$SERVICE_USER:$SERVICE_GROUP" "$CONFIG_DIR"
+chmod 755 "$LOG_DIR"
+chmod 755 "$CONFIG_DIR"
+
+# 5. Setup Service
+log_info "Configuring systemd service..."
+# install(1) pins owner and mode, so the unit never ends up executable or
+# writable by others regardless of the permissions in the checkout.
+install -m 0644 -o root -g root "$UNIT_SOURCE" \
+  "/etc/systemd/system/$SERVICE_NAME.service"
+systemctl daemon-reload
+systemctl enable "$SERVICE_NAME"
+systemctl restart "$SERVICE_NAME"
+log_info "Service started."
+
+log_info "Installation complete."
+log_info "Status: systemctl status $SERVICE_NAME"