Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 63 additions & 5 deletions databricks-mcp-server/databricks_mcp_server/tools/sql.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""SQL tools - Execute SQL queries and get table information."""

from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Union

from databricks_tools_core.sql import (
execute_sql as _execute_sql,
Expand All @@ -14,6 +14,45 @@
from ..server import mcp


def _format_results_markdown(rows: List[Dict[str, Any]]) -> str:
"""Format SQL results as a markdown table.

Markdown tables state column names once in the header instead of repeating
them on every row (as JSON does), reducing token usage by ~50%.

Args:
rows: List of row dicts from the SQL executor.

Returns:
Markdown table string, or "(no results)" if empty.
"""
if not rows:
return "(no results)"

columns = list(rows[0].keys())

# Build header
header = "| " + " | ".join(columns) + " |"
separator = "| " + " | ".join("---" for _ in columns) + " |"

# Build rows — convert None to empty string, stringify everything
data_lines = []
for row in rows:
cells = []
for col in columns:
val = row.get(col)
cell = "" if val is None else str(val)
# Escape pipe characters inside cell values
cell = cell.replace("|", "\\|")
cells.append(cell)
data_lines.append("| " + " | ".join(cells) + " |")

parts = [header, separator] + data_lines
# Append row count for awareness
parts.append(f"\n({len(rows)} row{'s' if len(rows) != 1 else ''})")
return "\n".join(parts)


@mcp.tool
def execute_sql(
sql_query: str,
Expand All @@ -22,7 +61,8 @@ def execute_sql(
schema: str = None,
timeout: int = 180,
query_tags: str = None,
) -> List[Dict[str, Any]]:
output_format: str = "markdown",
) -> Union[str, List[Dict[str, Any]]]:
"""
Execute a SQL query on a Databricks SQL Warehouse.

Expand All @@ -40,18 +80,25 @@ def execute_sql(
timeout: Timeout in seconds (default: 180)
query_tags: Optional query tags for cost attribution (e.g., "team:eng,cost_center:701").
Appears in system.query.history and Query History UI.
output_format: Result format — "markdown" (default) or "json".
Markdown tables are ~50% smaller than JSON because column names appear
only once in the header instead of on every row. Use "json" when you
need machine-parseable output.

Returns:
List of dictionaries, each representing a row with column names as keys.
Markdown table string (default) or list of row dictionaries (if output_format="json").
"""
return _execute_sql(
rows = _execute_sql(
sql_query=sql_query,
warehouse_id=warehouse_id,
catalog=catalog,
schema=schema,
timeout=timeout,
query_tags=query_tags,
)
if output_format == "json":
return rows
return _format_results_markdown(rows)


@mcp.tool
Expand All @@ -63,6 +110,7 @@ def execute_sql_multi(
timeout: int = 180,
max_workers: int = 4,
query_tags: str = None,
output_format: str = "markdown",
) -> Dict[str, Any]:
"""
Execute multiple SQL statements with dependency-aware parallelism.
Expand All @@ -82,11 +130,14 @@ def execute_sql_multi(
timeout: Timeout per query in seconds (default: 180)
max_workers: Maximum parallel queries per group (default: 4)
query_tags: Optional query tags for cost attribution (e.g., "team:eng,cost_center:701").
output_format: Result format — "markdown" (default) or "json".
Markdown tables are ~50% smaller than JSON because column names appear
only once in the header instead of on every row.

Returns:
Dictionary with results per query and execution summary.
"""
return _execute_sql_multi(
result = _execute_sql_multi(
sql_content=sql_content,
warehouse_id=warehouse_id,
catalog=catalog,
Expand All @@ -95,6 +146,13 @@ def execute_sql_multi(
max_workers=max_workers,
query_tags=query_tags,
)
# Format sample_results in each query result if markdown requested
if output_format != "json" and "results" in result:
for query_result in result["results"].values():
sample = query_result.get("sample_results")
if sample and isinstance(sample, list) and len(sample) > 0:
query_result["sample_results"] = _format_results_markdown(sample)
return result


@mcp.tool
Expand Down
78 changes: 78 additions & 0 deletions databricks-mcp-server/tests/test_sql_output_format.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
"""Unit tests for SQL output formatting (markdown vs JSON)."""

from databricks_mcp_server.tools.sql import _format_results_markdown


class TestFormatResultsMarkdown:
    """Tests for _format_results_markdown helper."""

    def test_empty_list_returns_no_results(self):
        """An empty result set yields the placeholder string, not a table."""
        assert _format_results_markdown([]) == "(no results)"

    def test_single_row(self):
        """One row produces header, separator, one data line and a singular count."""
        rows = [{"id": "1", "name": "Alice"}]
        result = _format_results_markdown(rows)
        lines = result.strip().split("\n")
        assert lines[0] == "| id | name |"
        assert lines[1] == "| --- | --- |"
        assert lines[2] == "| 1 | Alice |"
        assert "(1 row)" in result

    def test_multiple_rows(self):
        """Rows are emitted in input order and the count footer is pluralised."""
        rows = [
            {"id": "1", "name": "Alice", "city": "NYC"},
            {"id": "2", "name": "Bob", "city": "Chicago"},
            {"id": "3", "name": "Carol", "city": "Denver"},
        ]
        result = _format_results_markdown(rows)
        lines = result.strip().split("\n")
        # Header + separator + 3 data rows + blank + count
        assert lines[0] == "| id | name | city |"
        assert lines[1] == "| --- | --- | --- |"
        assert lines[2] == "| 1 | Alice | NYC |"
        assert lines[3] == "| 2 | Bob | Chicago |"
        assert lines[4] == "| 3 | Carol | Denver |"
        assert "(3 rows)" in result

    def test_none_values_become_empty(self):
        """None renders as an empty cell."""
        rows = [{"id": "1", "name": None}]
        result = _format_results_markdown(rows)
        # The empty cell sits between the " | " delimiter and the closing " |",
        # so two spaces separate the last two pipes.
        assert "| 1 |  |" in result

    def test_pipe_chars_escaped(self):
        """Pipes inside values are backslash-escaped so they do not split the cell."""
        rows = [{"expr": "a | b"}]
        result = _format_results_markdown(rows)
        assert "a \\| b" in result

    def test_column_names_appear_once(self):
        """The whole point: column names should appear exactly once (in the header)."""
        rows = [
            {"event_id": "1", "event_name": "Concert A"},
            {"event_id": "2", "event_name": "Concert B"},
            {"event_id": "3", "event_name": "Concert C"},
        ]
        result = _format_results_markdown(rows)
        # Column name should appear once in header, not repeated per row
        assert result.count("event_id") == 1
        assert result.count("event_name") == 1

    def test_markdown_smaller_than_json(self):
        """Markdown output should be significantly smaller than JSON for many rows."""
        import json

        rows = [
            {
                "id": str(i),
                "name": f"User {i}",
                "email": f"user{i}@example.com",
                "department": "Engineering",
                "status": "Active",
            }
            for i in range(50)
        ]
        md = _format_results_markdown(rows)
        js = json.dumps(rows)
        # Markdown should be at least 30% smaller
        assert len(md) < len(js) * 0.7, (
            f"Markdown ({len(md)} chars) should be <70% of JSON ({len(js)} chars)"
        )
Loading