From 53d3ed158bfa2e885ca222e3d15435fc83186c1b Mon Sep 17 00:00:00 2001
From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com>
Date: Sun, 19 Apr 2026 18:55:16 +0200
Subject: [PATCH 1/5] fix: reconfigure stdio to UTF-8 + LF at startup (#36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Python's text-mode stdio defaults on Windows break line-delimited
JSON-RPC framing: stdout translates \n to \r\n and stdin decodes input
with the ANSI code page (cp1252 on Western European systems). MCP
clients send UTF-8 JSON-RPC framed by LF, so:

- CRLF on stdout corrupts the response framing for strict parsers.
- cp1252 stdin corrupts non-ASCII bytes — a path containing "ä"
  (UTF-8 \xc3\xa4) arrives as "Ã¤" and the subsequent file operation
  fails with Security violation: invalid path.

Reconfigure sys.stdin/stdout at the top of main() so the server
behaves identically on Windows and Unix without requiring a wrapper
script. No-op on platforms that already default to UTF-8/LF.

Refs: #36

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 markitdown_mcp/server.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py
index 1bdfa42..605ccdb 100644
--- a/markitdown_mcp/server.py
+++ b/markitdown_mcp/server.py
@@ -1289,6 +1289,14 @@ async def run(self) -> None:
 
 def main() -> None:
     """Main entry point for console script."""
+    # Force UTF-8 + LF on stdio. Python's text-mode defaults on Windows are
+    # the ANSI code page (e.g. cp1252) for stdin/stdout and CRLF translation
+    # on stdout, both of which break the MCP protocol: clients send UTF-8
+    # JSON-RPC framed by LF, so a legacy code page corrupts non-ASCII input
+    # (e.g. path names with umlauts) and CRLF stdout corrupts the line-delimited
+    # framing. No-op on platforms that already default to UTF-8/LF.
+    sys.stdout.reconfigure(encoding="utf-8", newline="\n")
+    sys.stdin.reconfigure(encoding="utf-8")
 
     async def run_server() -> None:
         """Run the MCP server asynchronously."""

From 15997abf5c89d86233a1b65155baa759c0d66bbd Mon Sep 17 00:00:00 2001
From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com>
Date: Sun, 19 Apr 2026 18:55:30 +0200
Subject: [PATCH 2/5] fix: handle missing HOME/USERPROFILE in
 get_safe_working_directories (#36)

Path.home() raises RuntimeError when the home directory cannot be
resolved from the process environment: on Windows that means
USERPROFILE (and the HOMEDRIVE/HOMEPATH fallback) is unset, with HOME
being the POSIX counterpart. Some MCP clients (including Claude Code)
spawn stdio servers with an effectively empty environment on Windows,
which causes MarkItDownMCPServer.__init__ to abort with an opaque
traceback before any request is processed.

Wrap the call in try/except, log a warning, and skip the home-subdir
additions. The server still starts with CWD, tempdir, and fixtures as
safe directories.

Refs: #36

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 markitdown_mcp/server.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py
index 605ccdb..e7be569 100644
--- a/markitdown_mcp/server.py
+++ b/markitdown_mcp/server.py
@@ -640,13 +640,22 @@ def get_safe_working_directories() -> list[str]:
     # Add current working directory
     safe_dirs.append(str(Path.cwd()))
 
-    # Add home directory subdirectories (but not root directories)
-    home = Path.home()
-    safe_subdirs = ["Documents", "Downloads", "Desktop", "tmp"]
-    for subdir in safe_subdirs:
-        potential_dir = home / subdir
-        if potential_dir.exists():
-            safe_dirs.append(str(potential_dir))
+    # Home directory subdirectories. Path.home() raises RuntimeError if neither
+    # HOME nor USERPROFILE is set, which can happen when MCP clients spawn the
+    # server with an empty environment. Fall back gracefully rather than
+    # crashing the server at init.
+    try:
+        home = Path.home()
+    except RuntimeError:
+        logger.warning("Could not determine user home; skipping home subdirs")
+        home = None
+
+    if home is not None:
+        safe_subdirs = ["Documents", "Downloads", "Desktop", "tmp"]
+        for subdir in safe_subdirs:
+            potential_dir = home / subdir
+            if potential_dir.exists():
+                safe_dirs.append(str(potential_dir))
 
     # Add temp directories
     temp_dir = Path(tempfile.gettempdir())

From d4bd710b52ef4d88b62d6bc945b18d477ed2cc6b Mon Sep 17 00:00:00 2001
From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com>
Date: Sun, 19 Apr 2026 18:55:50 +0200
Subject: [PATCH 3/5] fix: do not reply to JSON-RPC notifications (#36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

JSON-RPC 2.0 §4.1 states that a server MUST NOT reply to a notification
(a request object without an "id" member; an explicit "id": null is
still a request). The current dispatch in MarkItDownMCPServer.run()
invents a fake id of "unknown" and always writes a response, which
breaks any strict MCP client on the notifications/initialized handshake
message.

- Detect notifications before handling and skip the response write.
- Pass through the real id (including numeric or null) instead of
  coercing to "unknown".
- Widen MCPRequest.id / MCPResponse.id to str | int | None to reflect
  what JSON-RPC actually allows.

Spec: https://www.jsonrpc.org/specification#notification

Refs: #36

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 markitdown_mcp/server.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py
index e7be569..7f7bdb8 100644
--- a/markitdown_mcp/server.py
+++ b/markitdown_mcp/server.py
@@ -673,7 +673,8 @@ def get_safe_working_directories() -> list[str]:
 class MCPRequest:
     """Represents an incoming MCP protocol request."""
 
-    id: str
+    # JSON-RPC 2.0: id is str, int, or null; absent (notification) is stored as None too.
+    id: str | int | None
     method: str
     params: dict[str, Any]
 
@@ -682,7 +683,7 @@ class MCPRequest:
 class MCPResponse:
     """Represents an MCP protocol response."""
 
-    id: str
+    id: str | int | None
     result: dict[str, Any] | None = None
     error: dict[str, Any] | None = None
 
@@ -1268,14 +1269,22 @@ async def run(self) -> None:
 
                 try:
                     message = json.loads(line.strip())
+
+                    # JSON-RPC 2.0 §4.1: messages without "id" are notifications;
+                    # the server MUST NOT reply to them.
+                    is_notification = "id" not in message
+
                     request = MCPRequest(
-                        id=message.get("id", "unknown"),
+                        id=message.get("id"),
                         method=message["method"],
                         params=message.get("params", {}),
                     )
 
                     response = await self.handle_request(request)
 
+                    if is_notification:
+                        continue
+
                     # Send response
                     response_dict: dict[str, Any] = {"jsonrpc": "2.0", "id": response.id}
                     if response.result is not None:

From a55a0ec30c719f7a6ca70c48b6bf28d6a86dcb9f Mon Sep 17 00:00:00 2001
From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com>
Date: Sun, 19 Apr 2026 18:56:12 +0200
Subject: [PATCH 4/5] fix: drop anyOf from convert_file inputSchema (#36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Anthropic Messages API rejects tool schemas with oneOf/allOf/anyOf
at the top level of input_schema:

  input_schema does not support oneOf, allOf, or anyOf at the top level

As a result, the convert_file tool silently fails to load for any
Anthropic-based client (Claude Code, Claude Desktop).

Drop the anyOf and move the either-file_path-or-file_content rule into
the tool description. The handler already enforces the requirement at
runtime, so there is no behavior change for callers providing either
shape.

The schema is duplicated in get_tools() and inline in the tools/list
handler; both are updated. Consider consolidating them in a follow-up.

Refs: #36

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 markitdown_mcp/server.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py
index 7f7bdb8..cac6fbf 100644
--- a/markitdown_mcp/server.py
+++ b/markitdown_mcp/server.py
@@ -768,7 +768,11 @@ def get_tools(self) -> list[dict[str, Any]]:
         return [
             {
                 "name": "convert_file",
-                "description": "Convert a file to Markdown using MarkItDown",
+                "description": (
+                    "Convert a file to Markdown using MarkItDown. "
+                    "Provide either 'file_path' OR both 'file_content' (base64) "
+                    "and 'filename'."
+                ),
                 "inputSchema": {
                     "type": "object",
                     "properties": {
@@ -787,10 +791,6 @@ def get_tools(self) -> list[dict[str, Any]]:
                             "description": "Original filename when using file_content",
                         },
                     },
-                    "anyOf": [
-                        {"required": ["file_path"]},
-                        {"required": ["file_content", "filename"]},
-                    ],
                 },
             },
             {
@@ -838,7 +838,11 @@ async def handle_request(self, request: MCPRequest) -> MCPResponse:
                         "tools": [
                             {
                                 "name": "convert_file",
-                                "description": "Convert a file to Markdown using MarkItDown",
+                                "description": (
+                                    "Convert a file to Markdown using MarkItDown. "
+                                    "Provide either 'file_path' OR both "
+                                    "'file_content' (base64) and 'filename'."
+                                ),
                                 "inputSchema": {
                                     "type": "object",
                                     "properties": {
@@ -859,10 +863,6 @@ async def handle_request(self, request: MCPRequest) -> MCPResponse:
                                             "file_content",
                                         },
                                     },
-                                    "anyOf": [
-                                        {"required": ["file_path"]},
-                                        {"required": ["file_content", "filename"]},
-                                    ],
                                 },
                             },
                             {

From e1094c68703bf8983304fd2fd0d58b82273766ea Mon Sep 17 00:00:00 2001
From: kdjkdjkdj <26911657+kdjkdjkdj@users.noreply.github.com>
Date: Sun, 19 Apr 2026 18:56:34 +0200
Subject: [PATCH 5/5] fix: dispatch format-specific validation on exact MIME
 type (#36)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

validate_file_content_security() selected the XML sanitizer via a
substring check: `"xml" in mime_type`. Every Office Open XML format
(docx, xlsx, pptx) has a MIME type of the form
"application/vnd.openxmlformats-officedocument.…", which contains the
substring "xml". They were therefore routed into
validate_xml_security(), which opens the file as text, scans it for
XML entity patterns, and writes a "sanitized" .xml copy. MarkItDown
then received a UTF-8-decoded ZIP container and produced ~400 KB of
garbled output instead of Markdown.

Replace the three substring checks (xml/json/csv) with exact matches
against module-level MIME-type sets. json and csv were less explosive
in practice but had the same anti-pattern.

Refs: #36

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 markitdown_mcp/server.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/markitdown_mcp/server.py b/markitdown_mcp/server.py
index cac6fbf..152f38b 100644
--- a/markitdown_mcp/server.py
+++ b/markitdown_mcp/server.py
@@ -29,6 +29,14 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger("markitdown-mcp")
 
+# Exact MIME types for format-specific security validation. A substring check
+# like `"xml" in mime_type` incorrectly matches Office OpenXML formats (docx,
+# xlsx, pptx — whose MIME contains "openxmlformats") and routes them through
+# the XML sanitizer, corrupting the binary payload.
+_XML_MIME_TYPES = {"text/xml", "application/xml"}
+_JSON_MIME_TYPES = {"application/json", "text/json"}
+_CSV_MIME_TYPES = {"text/csv", "application/csv"}
+
 
 class SecurityError(Exception):
     """Raised when a security violation is detected."""
@@ -281,12 +289,13 @@ def validate_file_content_security(file_path: str) -> str:
         mime_type, _ = mimetypes.guess_type(file_path)
         file_ext = Path(file_path).suffix.lower()
 
-        # Apply format-specific validation
-        if (mime_type and "xml" in mime_type) or file_ext in [".xml", ".xhtml"]:
+        # Apply format-specific validation. Use exact MIME-type matching to
+        # avoid false positives (e.g. docx MIME contains "xml" as substring).
+        if (mime_type in _XML_MIME_TYPES) or file_ext in [".xml", ".xhtml"]:
             return validate_xml_security(file_path)
-        if (mime_type and "json" in mime_type) or file_ext == ".json":
+        if (mime_type in _JSON_MIME_TYPES) or file_ext == ".json":
             return validate_json_security(file_path)
-        if (mime_type and "csv" in mime_type) or file_ext == ".csv":
+        if (mime_type in _CSV_MIME_TYPES) or file_ext == ".csv":
             return validate_csv_security(file_path)
 
         # General file size check