Skip to content

feat: add HTML to markdown conversion for http_request tool #63

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,13 @@ response = agent.tool.http_request(
auth_type="Bearer",
auth_token="your_token_here"
)

# Convert HTML webpages to markdown for better readability
response = agent.tool.http_request(
method="GET",
url="https://example.com/article",
convert_to_markdown=True
)
```

### Python Code Execution
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ dependencies = [
"tenacity>=9.1.2,<10.0.0",
"watchdog>=6.0.0,<7.0.0",
"slack_bolt>=1.23.0,<2.0.0",
"markdownify>=1.0.0,<2.0.0",
"readabilipy>=0.2.0,<1.0.0",
# Note: Always want the latest tzdata
"tzdata ; platform_system == 'Windows'",
]
Expand Down
59 changes: 58 additions & 1 deletion src/strands_tools/http_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
from typing import Any, Dict, Optional, Union
from urllib.parse import urlparse

import markdownify
import readabilipy.simple_json
import requests
from aws_requests_auth.aws_auth import AWSRequestsAuth
from requests.adapters import HTTPAdapter
Expand All @@ -50,7 +52,7 @@
"JWT, AWS SigV4, Digest auth, and enterprise authentication patterns. Automatically reads tokens from "
"environment variables (GITHUB_TOKEN, GITLAB_TOKEN, AWS credentials, etc.) when auth_env_var is specified. "
"Use environment(action='list') to view available variables. Includes session management, metrics, "
"streaming support, cookie handling, and redirect control."
"streaming support, cookie handling, redirect control, and optional HTML to markdown conversion."
),
"inputSchema": {
"json": {
Expand Down Expand Up @@ -133,6 +135,10 @@
"type": "integer",
"description": "Maximum number of redirects to follow (default: 30)",
},
"convert_to_markdown": {
"type": "boolean",
"description": "Convert HTML responses to markdown format (default: False).",
},
"aws_auth": {
"type": "object",
"description": "AWS auth configuration for SigV4",
Expand Down Expand Up @@ -185,6 +191,30 @@
REQUEST_METRICS = collections.defaultdict(list)


def extract_content_from_html(html: str) -> str:
    """Extract content from an HTML document and convert it to Markdown.

    The HTML is parsed with ``readabilipy`` (called with ``use_readability=True``)
    and the ``content`` field of its result is rendered to Markdown by
    ``markdownify`` using ATX-style headings. Conversion is best-effort: this
    function never raises on a conversion failure and falls back to the input.

    Args:
        html: Raw HTML content to process.

    Returns:
        The Markdown rendering of the extracted content, or ``html`` unchanged
        when it is empty or whitespace-only, when the parser yields no content,
        or when parsing/conversion raises.
    """
    # Nothing can be extracted from empty or blank input, so skip the parser entirely.
    if not html or (isinstance(html, str) and not html.strip()):
        return html

    try:
        article = readabilipy.simple_json.simple_json_from_html_string(html, use_readability=True)
        extracted = article.get("content")
        if not extracted:
            # The parser found no content; the raw HTML is more useful than an empty string.
            return html

        return markdownify.markdownify(extracted, heading_style=markdownify.ATX)
    except Exception:
        # Deliberate best-effort boundary: a conversion problem must not fail the
        # HTTP request itself. Record the failure so it can still be diagnosed.
        import logging

        logging.getLogger(__name__).debug(
            "HTML to markdown conversion failed; returning original content",
            exc_info=True,
        )
        return html


def create_session(config: Dict[str, Any]) -> requests.Session:
"""Create and configure a requests Session object."""
session = requests.Session()
Expand Down Expand Up @@ -569,6 +599,15 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult:
)
```

6. Convert HTML responses to markdown:
```python
http_request(
method="GET",
url="https://example.com/article",
convert_to_markdown=True, # Converts HTML content to readable markdown
)
```

Environment Variables:
- Authentication tokens are read from environment when auth_env_var is specified
- AWS credentials are automatically loaded from environment variables or credentials file
Expand Down Expand Up @@ -798,6 +837,24 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult:
else:
content = response.text

# Convert HTML to markdown if requested
convert_to_markdown = tool_input.get("convert_to_markdown", False)
if convert_to_markdown:
content_type = response.headers.get("content-type", "")
is_html_content = (
"text/html" in content_type.lower()
or "<html" in content[:100].lower()
or "<!doctype html" in content[:100].lower()
)

if is_html_content:
original_content = content
content = extract_content_from_html(content)

# Add a note if conversion was successful
if content != original_content:
console.print(Text("✓ Converted HTML content to markdown", style="green"))

# Format and display the response
response_panel = format_response_preview(response, content, metrics if metrics is not None else None)
console.print(response_panel)
Expand Down
85 changes: 85 additions & 0 deletions tests/test_http_request.py
Original file line number Diff line number Diff line change
Expand Up @@ -961,3 +961,88 @@ def test_http_request_via_agent(agent):
result_text = extract_result_text(result)
assert "Status Code: 200" in result_text
assert "success via agent" in result_text


@responses.activate
def test_markdown_conversion():
    """Check that ``convert_to_markdown`` turns an HTML response body into markdown.

    The same mocked HTML page is requested twice: first without the flag, where
    the HTML markup must still be present in the result, and then with the flag,
    where the markup must be gone while the text content is preserved.
    """
    article_url = "https://example.com/article"
    html_content = """
<!DOCTYPE html>
<html>
<head>
<title>Test Page</title>
</head>
<body>
<h1>Main Heading</h1>
<p>This is a paragraph with <strong>bold text</strong> and <em>italic text</em>.</p>
<ul>
<li>List item 1</li>
<li>List item 2</li>
</ul>
<a href="https://example.com">Link to example</a>
</body>
</html>
"""

    # Serve the page as text/html for both requests below
    responses.add(responses.GET, article_url, body=html_content, status=200, content_type="text/html")

    # Baseline request: no conversion flag, so the HTML markup should come back
    with patch("strands_tools.http_request.get_user_input") as confirm_prompt:
        confirm_prompt.return_value = "y"
        plain_tool_use = {"input": {"method": "GET", "url": article_url}, "toolUseId": "test1"}
        plain_result = http_request.http_request(plain_tool_use)

    plain_text = extract_result_text(plain_result)
    assert "Status Code: 200" in plain_text
    assert "<html>" in plain_text

    # Same request with the conversion flag enabled
    with patch("strands_tools.http_request.get_user_input") as confirm_prompt:
        confirm_prompt.return_value = "y"
        markdown_tool_use = {
            "input": {"method": "GET", "url": article_url, "convert_to_markdown": True},
            "toolUseId": "test2",
        }
        markdown_result = http_request.http_request(markdown_tool_use)

    markdown_text = extract_result_text(markdown_result)
    assert "Status Code: 200" in markdown_text

    # HTML tags must have been stripped by the conversion...
    for html_tag in ("<html>", "<h1>", "<p>"):
        assert html_tag not in markdown_text

    # ...while the human-readable text survives
    for expected_text in ("Main Heading", "bold text", "italic text", "List item 1", "List item 2"):
        assert expected_text in markdown_text


@responses.activate
def test_markdown_conversion_non_html():
    """Check that a JSON response is left as JSON when ``convert_to_markdown`` is set."""
    api_url = "https://example.com/api/data"
    json_payload = {"message": "hello", "data": [1, 2, 3]}

    # Mock a JSON endpoint rather than an HTML page
    responses.add(responses.GET, api_url, json=json_payload, status=200)

    tool_use = {
        "input": {"method": "GET", "url": api_url, "convert_to_markdown": True},
        "toolUseId": "test3",
    }

    # Request with the conversion flag enabled even though the body is not HTML
    with patch("strands_tools.http_request.get_user_input") as confirm_prompt:
        confirm_prompt.return_value = "y"
        outcome = http_request.http_request(tool_use)

    outcome_text = extract_result_text(outcome)
    assert "Status Code: 200" in outcome_text
    # The JSON body must still be present in its original form
    assert '"message": "hello"' in outcome_text