Skip to content
This repository was archived by the owner on Mar 26, 2025. It is now read-only.

Commit a11d727

Browse files
committed
Fixing scraping behaviour
1 parent 5514b33 commit a11d727

File tree

2 files changed

+19
-23
lines changed

2 files changed

+19
-23
lines changed

llm-server/workers/tasks/url_parsers.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -70,24 +70,22 @@ def get_url_fragments(self, content) -> List[LinkInformation]:
7070
def find_all_headings_and_highlights(
    self, content: str
) -> Tuple[str, List[Tuple[str, str]]]:
    """Extract the page title and the (text, id) pairs of in-page anchor targets.

    Parses *content* as HTML and returns:
      * the text of the ``<title>`` tag ("" when the document has none), and
      * one ``(element_text, id)`` pair for each element whose ``id`` is
        referenced by at least one ``<a href="#id">`` link on the same page.

    :param content: raw HTML of the page.
    :return: ``(title, pairs)`` where ``pairs`` is a list of
        ``(element text, fragment id)`` tuples.
    """
    soup = BeautifulSoup(content, "lxml")
    title = soup.title.text if soup.title else ""

    # Collect every anchor's href once, so each id below costs one O(1)
    # set lookup instead of a full rescan of all <a> tags (the previous
    # version was O(ids * links)).
    anchor_hrefs = {link.get("href") for link in soup.find_all("a")}

    pairs: List[Tuple[str, str]] = []
    for element in soup.find_all(id=True):
        element_id = element.get("id")
        # Guard against empty id="" values; emit each linked target exactly
        # once (the previous version appended a duplicate pair for every
        # anchor that referenced the same fragment).
        if element_id and ("#" + element_id) in anchor_hrefs:
            pairs.append((element.get_text(strip=True), element_id))

    return title, pairs
9189

9290
def parse_text_content(self, content) -> str:
9391
text = BeautifulSoup(content, "lxml").get_text()

llm-server/workers/tasks/web_crawl.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ def scrape_url(url: str, bot_id: str):
7373
for heading_text, heading_id in headings
7474
]
7575

76-
add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
76+
if len(items) > 0:
77+
add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
7778
return parser.parse_text_content(content)
7879
except ValueError as e:
7980
# Log an error message if no parser is available for the content type
@@ -140,11 +141,8 @@ def scrape_website(url: str, bot_id: str, max_pages: int) -> int:
140141
chatbot_id=bot_id, url=current_url, status="SUCCESS"
141142
)
142143

143-
# Get links on the current page
144-
links = get_links(current_url)
145-
146-
# Add new links to the queue
147-
queue.extend(links)
144+
links = get_links(current_url)
145+
queue.extend(links)
148146

149147
except Exception as e:
150148
logger.error("WEB_SCRAPE_ERROR", error=e)

0 commit comments

Comments (0)