@@ -70,24 +70,22 @@ def get_url_fragments(self, content) -> List[LinkInformation]:
     def find_all_headings_and_highlights(
         self, content: str
     ) -> Tuple[str, List[Tuple[str, str]]]:
-        soup = BeautifulSoup(content, "lxml")
-        title_tag = soup.title
-        title = ""
-        if title_tag is not None:
-            title = title_tag.get_text(strip=True)
-
-        headings: List[Tuple[str, str]] = []
-
-        for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
-            heading_text = heading.get_text(strip=True)
 
-            # Check if the heading or one of its children has an 'id' attribute
-            id_tag = heading.find(attrs={"id": True})
-            if id_tag:
-                heading_id = id_tag["id"]
-                headings.append((heading_text, heading_id))
-
-        return title, headings
+        soup = BeautifulSoup(content, "lxml")
+        title = soup.title.text if soup.title else ""
+        elements_with_id = soup.find_all(id=True)
+        links = soup.find_all("a")
+        pairs = []
+        for element in elements_with_id:
+            id_ = element.get("id")
+            if id_:  # A simple check if the id exists
+                corresponding_links = [
+                    link for link in links if link.get("href") == "#" + id_
+                ]  # Removed "./#" prefix
+                if corresponding_links:
+                    for link in corresponding_links:
+                        pairs.append((element.get_text(strip=True), id_))
+        return title, pairs
 
     def parse_text_content(self, content) -> str:
         text = BeautifulSoup(content, "lxml").get_text()
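For reference, a rough, self-contained usage sketch of the rewritten method. The enclosing parser class is not shown in this hunk, so the _DemoParser wrapper and the sample HTML below are placeholders added for illustration; the method body mirrors the "+" side of the diff and only assumes beautifulsoup4 with the lxml backend is installed.

from typing import List, Tuple

from bs4 import BeautifulSoup  # requires beautifulsoup4 and lxml


class _DemoParser:
    """Placeholder stand-in for the real parser class (not shown in this hunk)."""

    def find_all_headings_and_highlights(
        self, content: str
    ) -> Tuple[str, List[Tuple[str, str]]]:
        # Same logic as the "+" side of the diff above.
        soup = BeautifulSoup(content, "lxml")
        title = soup.title.text if soup.title else ""
        elements_with_id = soup.find_all(id=True)
        links = soup.find_all("a")
        pairs = []
        for element in elements_with_id:
            id_ = element.get("id")
            if id_:
                # Keep only elements whose id is targeted by some <a href="#id"> link.
                corresponding_links = [
                    link for link in links if link.get("href") == "#" + id_
                ]
                if corresponding_links:
                    for link in corresponding_links:
                        pairs.append((element.get_text(strip=True), id_))
        return title, pairs


html = """
<html>
  <head><title>Guide</title></head>
  <body>
    <h2 id="setup">Setup</h2>
    <p>See <a href="#setup">setup</a> for details.</p>
    <h2 id="usage">Usage</h2>
  </body>
</html>
"""

title, pairs = _DemoParser().find_all_headings_and_highlights(html)
print(title)  # "Guide"
print(pairs)  # [("Setup", "setup")] -- "usage" has no matching anchor link, so it is skipped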