Skip to content
This repository was archived by the owner on Mar 26, 2025. It is now read-only.

Commit a11d727

Browse files
committed
Fixing scraping behaviour
1 parent 5514b33 commit a11d727

File tree

2 files changed

+19
-23
lines changed

2 files changed

+19
-23
lines changed

llm-server/workers/tasks/url_parsers.py

Lines changed: 15 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -70,24 +70,22 @@ def get_url_fragments(self, content) -> List[LinkInformation]:
7070
def find_all_headings_and_highlights(
    self, content: str
) -> Tuple[str, List[Tuple[str, str]]]:
    """Extract the page title and the (text, id) pairs of in-page anchor targets.

    Parses *content* as HTML and returns:
      * the text of the ``<title>`` tag ("" when the document has none), and
      * one ``(element_text, id)`` pair for each element whose ``id`` is
        referenced by at least one ``<a href="#id">`` link on the same page.

    :param content: raw HTML of the page.
    :return: ``(title, pairs)`` where ``pairs`` is a list of
        ``(element text, fragment id)`` tuples.
    """
    soup = BeautifulSoup(content, "lxml")
    title = soup.title.text if soup.title else ""

    # Collect every anchor's href once, so each id below costs one O(1)
    # set lookup instead of a full rescan of all <a> tags (the previous
    # version was O(ids * links)).
    anchor_hrefs = {link.get("href") for link in soup.find_all("a")}

    pairs: List[Tuple[str, str]] = []
    for element in soup.find_all(id=True):
        element_id = element.get("id")
        # Guard against empty id="" values; emit each linked target exactly
        # once (the previous version appended a duplicate pair for every
        # anchor that referenced the same fragment).
        if element_id and ("#" + element_id) in anchor_hrefs:
            pairs.append((element.get_text(strip=True), element_id))

    return title, pairs
9189

9290
def parse_text_content(self, content) -> str:
9391
text = BeautifulSoup(content, "lxml").get_text()

llm-server/workers/tasks/web_crawl.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,8 @@ def scrape_url(url: str, bot_id: str):
7373
for heading_text, heading_id in headings
7474
]
7575

76-
add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
76+
if len(items) > 0:
77+
add_cmdbar_data(items, {"url": url, "bot_id": bot_id})
7778
return parser.parse_text_content(content)
7879
except ValueError as e:
7980
# Log an error message if no parser is available for the content type
@@ -140,11 +141,8 @@ def scrape_website(url: str, bot_id: str, max_pages: int) -> int:
140141
chatbot_id=bot_id, url=current_url, status="SUCCESS"
141142
)
142143

143-
# Get links on the current page
144-
links = get_links(current_url)
145-
146-
# Add new links to the queue
147-
queue.extend(links)
144+
links = get_links(current_url)
145+
queue.extend(links)
148146

149147
except Exception as e:
150148
logger.error("WEB_SCRAPE_ERROR", error=e)

0 commit comments

Comments (0)