From 53d3ed158bfa2e885ca222e3d15435fc83186c1b Mon Sep 17 00:00:00 2001 From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com> Date: Sun, 19 Apr 2026 18:55:16 +0200 Subject: [PATCH 1/5] fix: reconfigure stdio to UTF-8 + LF at startup (#36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Python's text-mode stdio defaults on Windows break line-delimited JSON-RPC framing: stdout translates \n to \r\n and stdin decodes input as cp1252. MCP clients send UTF-8 JSON-RPC framed by LF, so: - CRLF on stdout corrupts the response framing for strict parsers. - cp1252 stdin corrupts non-ASCII bytes — a path containing "ä" (UTF-8 \xc3\xa4) arrives as "Ã¤" and the subsequent file operation fails with Security violation: invalid path. Reconfigure sys.stdin/stdout at the top of main() so the server behaves identically on Windows and Unix without requiring a wrapper script. No-op on platforms that already default to UTF-8/LF. Refs: #36 Co-Authored-By: Claude Opus 4.7 (1M context) --- markitdown_mcp/server.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py index 1bdfa42..605ccdb 100644 --- a/markitdown_mcp/server.py +++ b/markitdown_mcp/server.py @@ -1289,6 +1289,14 @@ async def run(self) -> None: def main() -> None: """Main entry point for console script.""" + # Force UTF-8 + LF on stdio. Python's text-mode defaults on Windows are + # cp1252 for stdin/stdout and CRLF translation on stdout, both of which + # break the MCP protocol: clients send UTF-8 JSON-RPC framed by LF, so + # cp1252 stdin corrupts non-ASCII input (e.g. path names with umlauts) + # and CRLF stdout corrupts the line-delimited framing. No-op on platforms + # that already default to UTF-8/LF. 
+ sys.stdout.reconfigure(encoding="utf-8", newline="\n") + sys.stdin.reconfigure(encoding="utf-8") async def run_server() -> None: """Run the MCP server asynchronously.""" From 15997abf5c89d86233a1b65155baa759c0d66bbd Mon Sep 17 00:00:00 2001 From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com> Date: Sun, 19 Apr 2026 18:55:30 +0200 Subject: [PATCH 2/5] fix: handle missing HOME/USERPROFILE in get_safe_working_directories (#36) Path.home() raises RuntimeError if neither HOME nor USERPROFILE is set in the process environment. Some MCP clients (including Claude Code) spawn stdio servers with an effectively empty environment on Windows, which causes MarkItDownMCPServer.__init__ to abort with an opaque traceback before any request is processed. Wrap the call in try/except, log a warning, and skip the home-subdir additions. The server still starts with CWD, tempdir, and fixtures as safe directories. Refs: #36 Co-Authored-By: Claude Opus 4.7 (1M context) --- markitdown_mcp/server.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py index 605ccdb..e7be569 100644 --- a/markitdown_mcp/server.py +++ b/markitdown_mcp/server.py @@ -640,13 +640,22 @@ def get_safe_working_directories() -> list[str]: # Add current working directory safe_dirs.append(str(Path.cwd())) - # Add home directory subdirectories (but not root directories) - home = Path.home() - safe_subdirs = ["Documents", "Downloads", "Desktop", "tmp"] - for subdir in safe_subdirs: - potential_dir = home / subdir - if potential_dir.exists(): - safe_dirs.append(str(potential_dir)) + # Home directory subdirectories. Path.home() raises RuntimeError if neither + # HOME nor USERPROFILE is set, which can happen when MCP clients spawn the + # server with an empty environment. Fall back gracefully rather than + # crashing the server at init. 
+ try: + home = Path.home() + except RuntimeError: + logger.warning("Could not determine user home; skipping home subdirs") + home = None + + if home is not None: + safe_subdirs = ["Documents", "Downloads", "Desktop", "tmp"] + for subdir in safe_subdirs: + potential_dir = home / subdir + if potential_dir.exists(): + safe_dirs.append(str(potential_dir)) # Add temp directories temp_dir = Path(tempfile.gettempdir()) From d4bd710b52ef4d88b62d6bc945b18d477ed2cc6b Mon Sep 17 00:00:00 2001 From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com> Date: Sun, 19 Apr 2026 18:55:50 +0200 Subject: [PATCH 3/5] fix: do not reply to JSON-RPC notifications (#36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit JSON-RPC 2.0 §4.1 requires that servers MUST NOT reply to notifications (messages without an "id" field). The current dispatch in MarkItDownMCPServer.run() invents a fake id of "unknown" and always writes a response, which breaks any strict MCP client on the notifications/initialized handshake message. - Detect notifications before handling and skip the response write. - Pass through the real id (including numeric or null) instead of coercing to "unknown". - Widen MCPRequest.id / MCPResponse.id to str | int | None to reflect what JSON-RPC actually allows. Spec: https://www.jsonrpc.org/specification#notification Refs: #36 Co-Authored-By: Claude Opus 4.7 (1M context) --- markitdown_mcp/server.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py index e7be569..7f7bdb8 100644 --- a/markitdown_mcp/server.py +++ b/markitdown_mcp/server.py @@ -673,7 +673,8 @@ def get_safe_working_directories() -> list[str]: class MCPRequest: """Represents an incoming MCP protocol request.""" - id: str + # JSON-RPC 2.0: id may be str, int, or absent (notification → None here). 
+ id: str | int | None method: str params: dict[str, Any] @@ -682,7 +683,7 @@ class MCPRequest: class MCPResponse: """Represents an MCP protocol response.""" - id: str + id: str | int | None result: dict[str, Any] | None = None error: dict[str, Any] | None = None @@ -1268,14 +1269,22 @@ async def run(self) -> None: try: message = json.loads(line.strip()) + + # JSON-RPC 2.0 §4.1: messages without "id" are notifications; + # the server MUST NOT reply to them. + is_notification = "id" not in message + request = MCPRequest( - id=message.get("id", "unknown"), + id=message.get("id"), method=message["method"], params=message.get("params", {}), ) response = await self.handle_request(request) + if is_notification: + continue + # Send response response_dict: dict[str, Any] = {"jsonrpc": "2.0", "id": response.id} if response.result is not None: From a55a0ec30c719f7a6ca70c48b6bf28d6a86dcb9f Mon Sep 17 00:00:00 2001 From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com> Date: Sun, 19 Apr 2026 18:56:12 +0200 Subject: [PATCH 4/5] fix: drop anyOf from convert_file inputSchema (#36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Anthropic Messages API rejects tool schemas with oneOf/allOf/anyOf at the top level of input_schema: input_schema does not support oneOf, allOf, or anyOf at the top level → the convert_file tool silently fails to load for any Anthropic-based client (Claude Code, Claude Desktop). Drop the anyOf and move the either-file_path-or-file_content rule into the tool description. The handler already enforces the requirement at runtime, so there is no behavior change for callers providing either shape. The schema is duplicated in get_tools() and inline in the tools/list handler; both are updated. Consider consolidating them in a follow-up. 
Refs: #36 Co-Authored-By: Claude Opus 4.7 (1M context) --- markitdown_mcp/server.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py index 7f7bdb8..cac6fbf 100644 --- a/markitdown_mcp/server.py +++ b/markitdown_mcp/server.py @@ -768,7 +768,11 @@ def get_tools(self) -> list[dict[str, Any]]: return [ { "name": "convert_file", - "description": "Convert a file to Markdown using MarkItDown", + "description": ( + "Convert a file to Markdown using MarkItDown. " + "Provide either 'file_path' OR both 'file_content' (base64) " + "and 'filename'." + ), "inputSchema": { "type": "object", "properties": { @@ -787,10 +791,6 @@ def get_tools(self) -> list[dict[str, Any]]: "description": "Original filename when using file_content", }, }, - "anyOf": [ - {"required": ["file_path"]}, - {"required": ["file_content", "filename"]}, - ], }, }, { @@ -838,7 +838,11 @@ async def handle_request(self, request: MCPRequest) -> MCPResponse: "tools": [ { "name": "convert_file", - "description": "Convert a file to Markdown using MarkItDown", + "description": ( + "Convert a file to Markdown using MarkItDown. " + "Provide either 'file_path' OR both " + "'file_content' (base64) and 'filename'." + ), "inputSchema": { "type": "object", "properties": { @@ -859,10 +863,6 @@ async def handle_request(self, request: MCPRequest) -> MCPResponse: "file_content", }, }, - "anyOf": [ - {"required": ["file_path"]}, - {"required": ["file_content", "filename"]}, - ], }, }, { From e1094c68703bf8983304fd2fd0d58b82273766ea Mon Sep 17 00:00:00 2001 From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com> Date: Sun, 19 Apr 2026 18:56:34 +0200 Subject: [PATCH 5/5] fix: dispatch format-specific validation on exact MIME type (#36) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit validate_file_content_security() selected the XML sanitizer via a substring check: `"xml" in mime_type`. 
Every Office OpenXML format has a MIME type containing "openxmlformats-officedocument.…" — including docx, xlsx, and pptx. They were therefore routed into validate_xml_security(), which opens the file as text, scans it for XML entity patterns, and writes a "sanitized" .xml copy. MarkItDown then received a UTF-8-decoded ZIP container and produced ~400 KB of garbled output instead of Markdown. Replace the three substring checks (xml/json/csv) with exact matches against module-level MIME-type sets. json and csv were less explosive in practice but had the same anti-pattern. Refs: #36 Co-Authored-By: Claude Opus 4.7 (1M context) --- markitdown_mcp/server.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py index cac6fbf..152f38b 100644 --- a/markitdown_mcp/server.py +++ b/markitdown_mcp/server.py @@ -29,6 +29,14 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger("markitdown-mcp") +# Exact MIME types for format-specific security validation. A substring check +# like `"xml" in mime_type` incorrectly matches Office OpenXML formats (docx, +# xlsx, pptx — whose MIME contains "openxmlformats") and routes them through +# the XML sanitizer, corrupting the binary payload. +_XML_MIME_TYPES = {"text/xml", "application/xml"} +_JSON_MIME_TYPES = {"application/json", "text/json"} +_CSV_MIME_TYPES = {"text/csv", "application/csv"} + class SecurityError(Exception): """Raised when a security violation is detected.""" @@ -281,12 +289,13 @@ def validate_file_content_security(file_path: str) -> str: mime_type, _ = mimetypes.guess_type(file_path) file_ext = Path(file_path).suffix.lower() - # Apply format-specific validation - if (mime_type and "xml" in mime_type) or file_ext in [".xml", ".xhtml"]: + # Apply format-specific validation. Use exact MIME-type matching to + # avoid false positives (e.g. docx MIME contains "xml" as substring). 
+ if (mime_type in _XML_MIME_TYPES) or file_ext in [".xml", ".xhtml"]: return validate_xml_security(file_path) - if (mime_type and "json" in mime_type) or file_ext == ".json": + if (mime_type in _JSON_MIME_TYPES) or file_ext == ".json": return validate_json_security(file_path) - if (mime_type and "csv" in mime_type) or file_ext == ".csv": + if (mime_type in _CSV_MIME_TYPES) or file_ext == ".csv": return validate_csv_security(file_path) # General file size check