diff --git a/README.md b/README.md index 1a361f1..36853ef 100644 --- a/README.md +++ b/README.md @@ -178,6 +178,13 @@ response = agent.tool.http_request( auth_type="Bearer", auth_token="your_token_here" ) + +# Convert HTML webpages to markdown for better readability +response = agent.tool.http_request( + method="GET", + url="https://example.com/article", + convert_to_markdown=True +) ``` ### Python Code Execution diff --git a/pyproject.toml b/pyproject.toml index 3042e36..6398340 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,8 @@ dependencies = [ "tenacity>=9.1.2,<10.0.0", "watchdog>=6.0.0,<7.0.0", "slack_bolt>=1.23.0,<2.0.0", + "markdownify>=1.0.0,<2.0.0", + "readabilipy>=0.2.0,<1.0.0", # Note: Always want the latest tzdata "tzdata ; platform_system == 'Windows'", ] diff --git a/src/strands_tools/http_request.py b/src/strands_tools/http_request.py index 4198f17..884ff21 100644 --- a/src/strands_tools/http_request.py +++ b/src/strands_tools/http_request.py @@ -26,6 +26,8 @@ from typing import Any, Dict, Optional, Union from urllib.parse import urlparse +import markdownify +import readabilipy.simple_json import requests from aws_requests_auth.aws_auth import AWSRequestsAuth from requests.adapters import HTTPAdapter @@ -50,7 +52,7 @@ "JWT, AWS SigV4, Digest auth, and enterprise authentication patterns. Automatically reads tokens from " "environment variables (GITHUB_TOKEN, GITLAB_TOKEN, AWS credentials, etc.) when auth_env_var is specified. " "Use environment(action='list') to view available variables. Includes session management, metrics, " - "streaming support, cookie handling, and redirect control." + "streaming support, cookie handling, redirect control, and optional HTML to markdown conversion." 
def extract_content_from_html(html: str) -> str:
    """Convert an HTML document to Markdown, falling back to the input on failure.

    The HTML is passed to ``readabilipy.simple_json.simple_json_from_html_string``
    (with ``use_readability=True``) and the ``"content"`` entry of its result is
    rendered by ``markdownify`` using ATX (``#``) headings.

    This is a best-effort helper: it never raises for conversion problems.

    Args:
        html: Raw HTML content to process.

    Returns:
        The markdown rendering of the extracted content. The original ``html``
        is returned unchanged when the input is empty or whitespace-only, when
        no content could be extracted, when the conversion raises, or when the
        conversion produces only whitespace.
    """
    # Nothing to convert: avoid invoking the parser on empty input.
    if not html or not html.strip():
        return html

    try:
        article = readabilipy.simple_json.simple_json_from_html_string(html, use_readability=True)
        article_html = article.get("content")
        if not article_html:
            return html

        markdown = markdownify.markdownify(article_html, heading_style=markdownify.ATX)
    except Exception:
        # Deliberate best-effort: a parser failure must not break the HTTP
        # request itself, but leave a trace so the failure can be diagnosed.
        import logging

        logging.getLogger(__name__).debug(
            "HTML to markdown conversion failed; returning original HTML", exc_info=True
        )
        return html

    # A blank rendering would silently discard the response body; prefer the
    # original HTML in that case.
    return markdown if markdown.strip() else html
Convert HTML responses to markdown: + ```python + http_request( + method="GET", + url="https://example.com/article", + convert_to_markdown=True, # Converts HTML content to readable markdown + ) + ``` + Environment Variables: - Authentication tokens are read from environment when auth_env_var is specified - AWS credentials are automatically loaded from environment variables or credentials file @@ -798,6 +837,24 @@ def http_request(tool: ToolUse, **kwargs: Any) -> ToolResult: else: content = response.text + # Convert HTML to markdown if requested + convert_to_markdown = tool_input.get("convert_to_markdown", False) + if convert_to_markdown: + content_type = response.headers.get("content-type", "") + is_html_content = ( + "text/html" in content_type.lower() + or " + +
+This is a paragraph with bold text and italic text.
@responses.activate
def test_markdown_conversion_non_html():
    """A JSON response is returned as JSON even when convert_to_markdown is enabled."""
    api_url = "https://example.com/api/data"

    # Register a mocked JSON endpoint.
    responses.add(responses.GET, api_url, json={"message": "hello", "data": [1, 2, 3]}, status=200)

    tool_use = {
        "input": {"method": "GET", "url": api_url, "convert_to_markdown": True},
        "toolUseId": "test3",
    }

    # Stub get_user_input so that it answers "y" for the duration of the call.
    with patch("strands_tools.http_request.get_user_input", return_value="y"):
        result = http_request.http_request(tool_use)

    result_text = extract_result_text(result)

    # The status line is present and the body is still JSON (no conversion for non-HTML).
    for expected in ("Status Code: 200", '"message": "hello"'):
        assert expected in result_text