diff --git a/examples/markdownify/markdownify_scrapegraphai.py b/examples/markdownify/markdownify_scrapegraphai.py index de36607d..30d9713f 100644 --- a/examples/markdownify/markdownify_scrapegraphai.py +++ b/examples/markdownify/markdownify_scrapegraphai.py @@ -1,35 +1,20 @@ """ -Example script demonstrating the markdownify functionality +Scrape a webpage as clean markdown using scrapegraph-py v2 API. +Replaces the old markdownify() call with scrape(). """ +import json import os + from dotenv import load_dotenv from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -def main(): - # Load environment variables - load_dotenv() - # Set up logging - sgai_logger.set_logging(level="INFO") +load_dotenv() - # Initialize the client - api_key = os.getenv("SCRAPEGRAPH_API_KEY") - if not api_key: - raise ValueError("SCRAPEGRAPH_API_KEY environment variable not found") - sgai_client = Client(api_key=api_key) +api_key = os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SCRAPEGRAPH_API_KEY environment variable not found") - # Example 1: Convert a website to Markdown - print("Example 1: Converting website to Markdown") - print("-" * 50) - response = sgai_client.markdownify( - website_url="https://example.com" - ) - print("Markdown output:") - print(response["result"]) # Access the result key from the dictionary - print("\nMetadata:") - print(response.get("metadata", {})) # Use get() with default value - print("\n" + "=" * 50 + "\n") -if __name__ == "__main__": - main() +with Client(api_key=api_key) as client: + response = client.scrape(url="https://example.com") + print(json.dumps(response, indent=2)) diff --git a/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai.py b/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai.py index e88a92ce..2f8bce3f 100644 --- a/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai.py +++ b/examples/search_graph/scrapegraphai/searchscraper_scrapegraphai.py @@ -1,83 +1,20 @@ """ -Example implementation of search-based scraping using Scrapegraph AI. -This example demonstrates how to use the searchscraper to extract information from the web. +Search the web and extract AI-structured results using scrapegraph-py v2 API. +Replaces the old searchscraper() call with search(). """ +import json import os -from typing import Dict, Any + from dotenv import load_dotenv from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -def format_response(response: Dict[str, Any]) -> None: - """ - Format and print the search response in a readable way. - - Args: - response (Dict[str, Any]): The response from the search API - """ - print("\n" + "="*50) - print("SEARCH RESULTS") - print("="*50) - - # Print request ID - print(f"\nRequest ID: {response['request_id']}") - - # Print number of sources - urls = response.get('reference_urls', []) - print(f"\nSources Processed: {len(urls)}") - - # Print the extracted information - print("\nExtracted Information:") - print("-"*30) - if isinstance(response['result'], dict): - for key, value in response['result'].items(): - print(f"\n{key.upper()}:") - if isinstance(value, list): - for item in value: - print(f" • {item}") - else: - print(f" {value}") - else: - print(response['result']) - - # Print source URLs - if urls: - print("\nSources:") - print("-"*30) - for i, url in enumerate(urls, 1): - print(f"{i}. {url}") - print("\n" + "="*50) - -def main(): - # Load environment variables - load_dotenv() - - # Get API key - api_key = os.getenv("SCRAPEGRAPH_API_KEY") - if not api_key: - raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables") - - # Configure logging - sgai_logger.set_logging(level="INFO") - - # Initialize client - sgai_client = Client(api_key=api_key) - - try: - # Basic search scraper example - print("\nSearching for information...") - search_response = sgai_client.searchscraper( - user_prompt="Extract webpage information" - ) - format_response(search_response) +load_dotenv() - except Exception as e: - print(f"\nError occurred: {str(e)}") - finally: - # Always close the client - sgai_client.close() +api_key = os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables") -if __name__ == "__main__": - main() +with Client(api_key=api_key) as client: + response = client.search(query="Extract webpage information") + print(json.dumps(response, indent=2)) diff --git a/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai.py b/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai.py index 47181cbb..944a6a6e 100644 --- a/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai.py +++ b/examples/smart_scraper_graph/scrapegraphai/smartscraper_scrapegraphai.py @@ -1,45 +1,23 @@ """ -Example implementation using scrapegraph-py client directly. +Extract structured data from a webpage using scrapegraph-py v2 API. +Replaces the old smartscraper() call with extract(). """ +import json import os + from dotenv import load_dotenv from scrapegraph_py import Client -from scrapegraph_py.logger import sgai_logger - -def main(): - # Load environment variables from .env file - load_dotenv() - - # Get API key from environment variables - api_key = os.getenv("SCRAPEGRAPH_API_KEY") - if not api_key: - raise ValueError("SCRAPEGRAPH_API_KEY non trovato nelle variabili d'ambiente") - - # Set up logging - sgai_logger.set_logging(level="INFO") - - # Initialize the client with API key from environment - sgai_client = Client(api_key=api_key) - - try: - # SmartScraper request - response = sgai_client.smartscraper( - website_url="https://scrapegraphai.com", - user_prompt="Extract the founders' informations" - ) - # Print the response - print(f"Request ID: {response['request_id']}") - print(f"Result: {response['result']}") - if response.get('reference_urls'): - print(f"Reference URLs: {response['reference_urls']}") +load_dotenv() - except Exception as e: - print(f"Error occurred: {str(e)}") - finally: - # Always close the client - sgai_client.close() +api_key = os.getenv("SCRAPEGRAPH_API_KEY") +if not api_key: + raise ValueError("SCRAPEGRAPH_API_KEY not found in environment variables") -if __name__ == "__main__": - main() +with Client(api_key=api_key) as client: + response = client.extract( + url="https://scrapegraphai.com", + prompt="Extract the founders' informations", + ) + print(json.dumps(response, indent=2)) diff --git a/pyproject.toml b/pyproject.toml index 6537bbcf..5dd2b198 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ dependencies = [ "jsonschema>=4.25.1", "duckduckgo-search>=8.1.1", "pydantic>=2.12.5", - "scrapegraph-py>=1.44.0", + "scrapegraph-py>=2.0.0", ] readme = "README.md" diff --git a/scrapegraphai/graphs/smart_scraper_graph.py b/scrapegraphai/graphs/smart_scraper_graph.py index ffcd3dbe..2807125b 100644 --- a/scrapegraphai/graphs/smart_scraper_graph.py +++ b/scrapegraphai/graphs/smart_scraper_graph.py @@ -79,31 +79,23 @@ def _create_graph(self) -> BaseGraph: if self.llm_model == "scrapegraphai/smart-scraper": try: from scrapegraph_py import Client - from scrapegraph_py.logger import sgai_logger except ImportError: raise ImportError( "scrapegraph_py is not installed. Please install it using 'pip install scrapegraph-py'." ) - sgai_logger.set_logging(level="INFO") - - # Initialize the client with explicit API key - sgai_client = Client(api_key=self.config.get("api_key")) - - # SmartScraper request - response = sgai_client.smartscraper( - website_url=self.source, - user_prompt=self.prompt, - ) - - # Use logging instead of print for better production practices - if "request_id" in response and "result" in response: - logger.info(f"Request ID: {response['request_id']}") - logger.info(f"Result: {response['result']}") - else: - logger.warning("Missing expected keys in response.") + with Client(api_key=self.config.get("api_key")) as sgai_client: + # Extract request (v2 API) + response = sgai_client.extract( + url=self.source, + prompt=self.prompt, + output_schema=self.schema, + ) - sgai_client.close() + if "id" in response: + logger.info(f"Request ID: {response['id']}") + if "data" in response: + logger.info(f"Result: {response['data']}") return response