@@ -139,19 +139,22 @@ Readability.prototype = {
139139 // Readability-readerable.js. Please keep both copies in sync.
140140 unlikelyCandidates :
141141 / - a d - | a i 2 h t m l | b a n n e r | b r e a d c r u m b s | c o m b x | c o m m e n t | c o m m u n i t y | c o v e r - w r a p | d i s q u s | e x t r a | f o o t e r | g d p r | h e a d e r | l e g e n d s | m e n u | r e l a t e d | r e m a r k | r e p l i e s | r s s | s h o u t b o x | s i d e b a r | s k y s c r a p e r | s o c i a l | s p o n s o r | s u p p l e m e n t a l | a d - b r e a k | a g e g a t e | p a g i n a t i o n | p a g e r | p o p u p | y o m - r e m o t e / i,
142- okMaybeItsACandidate : / a n d | a r t i c l e | b o d y | c o l u m n | c o n t e n t | m a i n | s h a d o w / i,
143-
142+ okMaybeItsACandidate : / a n d | a r t i c l e | b o d y | c o l u m n | c o n t e n t | m a i n | s h a d o w | h l j s - | k a t e x - | m p - c o m m o n - v i d e o s n a p | p r e s e r v e - c l a s s - t a g / i,
143+ needSaveHeaderTitle : / H [ 1 - 7 ] { 1 } / ,
144+ ignoreCleanClassesWhitelist : / ^ ( h l j s | k a t e x | m p - c o m m o n - v i d e o s n a p | p r e s e r v e - c l a s s - t a g ) ( - .* ) ? $ / i,
145+ stylePreserveClassCandidates : / ^ ( k a t e x - | m p - c o m m o n - v i d e o s n a p ) ( - .* ) ? $ / i,
146+ ignoreCleanStylesWhitelist : / ^ ( h l j s | k a t e x | m p - c o m m o n - v i d e o s n a p ) ( - .* ) ? $ / i,
144147 positive :
145148 / a r t i c l e | b o d y | c o n t e n t | e n t r y | h e n t r y | h - e n t r y | m a i n | p a g e | p a g i n a t i o n | p o s t | t e x t | b l o g | s t o r y / i,
146149 negative :
147150 / - a d - | h i d d e n | ^ h i d $ | h i d $ | h i d | ^ h i d | b a n n e r | c o m b x | c o m m e n t | c o m - | c o n t a c t | f o o t e r | g d p r | m a s t h e a d | m e d i a | m e t a | o u t b r a i n | p r o m o | r e l a t e d | s c r o l l | s h a r e | s h o u t b o x | s i d e b a r | s k y s c r a p e r | s p o n s o r | s h o p p i n g | t a g s | w i d g e t / i,
148151 extraneous :
149152 / p r i n t | a r c h i v e | c o m m e n t | d i s c u s s | e [ \- ] ? m a i l | s h a r e | r e p l y | a l l | l o g i n | s i g n | s i n g l e | u t i l i t y / i,
150- byline : / b y l i n e | a u t h o r | d a t e l i n e | w r i t t e n b y | p - a u t h o r / i,
153+ byline : / b y l i n e | a u t h o r | d a t e l i n e | w r i t t e n b y | p - a u t h o r | p r o f i l e B t | b i o / i,
151154 replaceFonts : / < ( \/ ? ) f o n t [ ^ > ] * > / gi,
152155 normalize : / \s { 2 , } / g,
153156 videos :
154- / \/ \/ ( w w w \. ) ? ( ( d a i l y m o t i o n | y o u t u b e | y o u t u b e - n o c o o k i e | p l a y e r \. v i m e o | v \. q q ) \. c o m | ( a r c h i v e | u p l o a d \. w i k i m e d i a ) \. o r g | p l a y e r \. t w i t c h \. t v ) / i,
157+ / \/ \/ ( w w w \. ) ? ( ( d a i l y m o t i o n | y o u t u b e | y o u t u b e - n o c o o k i e | p l a y e r \. v i m e o | m p v i d e o | q p i c | v \. q q ) \. c o m | ( a r c h i v e | u p l o a d \. w i k i m e d i a ) \. o r g | p l a y e r \. t w i t c h \. t v ) | . c n / i,
155158 shareElements : / ( \b | _ ) ( s h a r e | s h a r e d a d d y ) ( \b | _ ) / i,
156159 nextLink : / ( n e x t | w e i t e r | c o n t i n u e | > ( [ ^ \| ] | $ ) | » ( [ ^ \| ] | $ ) ) / i,
157160 prevLink : / ( p r e v | e a r l | o l d | n e w | < | « ) / i,
@@ -208,7 +211,6 @@ Readability.prototype = {
208211 "frame" ,
209212 "hspace" ,
210213 "rules" ,
211- "style" ,
212214 "valign" ,
213215 "vspace" ,
214216 ] ,
@@ -258,6 +260,22 @@ Readability.prototype = {
258260 "TIME" ,
259261 "VAR" ,
260262 "WBR" ,
263+ "SVG" ,
264+ "PATH" ,
265+ "G" ,
266+ "FIGURE" ,
267+ "FIGCAPTION" ,
268+ "PICTURE" ,
269+ "SOURCE" ,
270+ "TRACK" ,
271+ "AREA" ,
272+ "MAP" ,
273+ "TABLE" ,
274+ "ARTICLE" ,
275+ "SECTION" ,
276+ "P" ,
277+ "OL" ,
278+ "UL" ,
261279 ] ,
262280
263281 // These are the classes that readability sets itself.
@@ -416,9 +434,19 @@ Readability.prototype = {
416434 */
417435 _cleanClasses ( node ) {
418436 var classesToPreserve = this . _classesToPreserve ;
437+ var ignoreCleanClassesWhitelist = this . REGEXPS . ignoreCleanClassesWhitelist
438+ var hasWhitelistClass = false
439+
419440 var className = ( node . getAttribute ( "class" ) || "" )
420441 . split ( / \s + / )
421- . filter ( cls => classesToPreserve . includes ( cls ) )
442+ . filter ( cls => {
443+ if ( ignoreCleanClassesWhitelist . test ( cls ) ) {
444+ hasWhitelistClass = true
445+ return true
446+ }
447+
448+ return classesToPreserve . includes ( cls )
449+ } )
422450 . join ( " " ) ;
423451
424452 if ( className ) {
@@ -427,7 +455,9 @@ Readability.prototype = {
427455 node . removeAttribute ( "class" ) ;
428456 }
429457
430- for ( node = node . firstElementChild ; node ; node = node . nextElementSibling ) {
458+
459+
460+ for ( node = ! hasWhitelistClass ? node . firstElementChild : null ; node ; node = node . nextElementSibling ) {
431461 this . _cleanClasses ( node ) ;
432462 }
433463 } ,
@@ -595,12 +625,11 @@ Readability.prototype = {
595625 // If there's a separator in the title, first remove the final part
596626 if ( / [ \| \- \\ \/ > » ] / . test ( curTitle ) ) {
597627 titleHadHierarchicalSeparators = / [ \\ \/ > » ] / . test ( curTitle ) ;
598- let allSeparators = Array . from ( origTitle . matchAll ( / [ \| \- \\ \/ > » ] / gi) ) ;
599- curTitle = origTitle . substring ( 0 , allSeparators . pop ( ) . index ) ;
628+ curTitle = origTitle . replace ( / ( .* ) [ \| \- \\ \/ > » ] .* / gi, "$1" ) ;
600629
601630 // If the resulting title is too short, remove the first part instead:
602631 if ( wordCount ( curTitle ) < 3 ) {
603- curTitle = origTitle . replace ( / ^ [ ^ \| \- \\ \/ > » ] * [ \| \- \\ \/ > » ] / gi, "" ) ;
632+ curTitle = origTitle . replace ( / [ ^ \| \- \\ \/ > » ] * [ \| \- \\ \/ > » ] ( . * ) / gi, "$1 " ) ;
604633 }
605634 } else if ( curTitle . includes ( ": " ) ) {
606635 // Check if we have an heading containing this exact string, so we
@@ -825,6 +854,11 @@ Readability.prototype = {
825854 this . _cleanConditionally ( articleContent , "ul" ) ;
826855 this . _cleanConditionally ( articleContent , "div" ) ;
827856
857+ this . _replaceNodeTags (
858+ this . _getAllNodesWithTag ( articleContent , [ "slax-mark" ] ) ,
859+ "span"
860+ ) ;
861+
828862 // replace H1 with H2 as H1 should be only title that is displayed separately
829863 this . _replaceNodeTags (
830864 this . _getAllNodesWithTag ( articleContent , [ "h1" ] ) ,
@@ -1064,9 +1098,11 @@ Readability.prototype = {
10641098 var matchString = node . className + " " + node . id ;
10651099
10661100 if ( ! this . _isProbablyVisible ( node ) ) {
1067- this . log ( "Removing hidden node - " + matchString ) ;
1068- node = this . _removeAndGetNext ( node ) ;
1069- continue ;
1101+ if ( ! this . _haveAllowedVideoTag ( node ) ) {
1102+ this . log ( "Removing hidden node - " + matchString ) ;
1103+ node = this . _removeAndGetNext ( node ) ;
1104+ continue ;
1105+ }
10701106 }
10711107
10721108 // User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
@@ -1121,7 +1157,8 @@ Readability.prototype = {
11211157 ! this . _hasAncestorTag ( node , "table" ) &&
11221158 ! this . _hasAncestorTag ( node , "code" ) &&
11231159 node . tagName !== "BODY" &&
1124- node . tagName !== "A"
1160+ node . tagName !== "A" &&
1161+ ! this . REGEXPS . needSaveHeaderTitle . test ( node . tagName )
11251162 ) {
11261163 this . log ( "Removing unlikely candidate - " + matchString ) ;
11271164 node = this . _removeAndGetNext ( node ) ;
@@ -2100,7 +2137,12 @@ Readability.prototype = {
21002137 e . removeAttribute ( "height" ) ;
21012138 }
21022139
2103- var cur = e . firstElementChild ;
2140+ if ( ! this . REGEXPS . stylePreserveClassCandidates . test ( e . className ) ) {
2141+ e . removeAttribute ( 'style' )
2142+ }
2143+
2144+ const ignore = this . REGEXPS . ignoreCleanStylesWhitelist . test ( e . className ) ;
2145+ var cur = ! ignore ? e . firstElementChild : null ;
21042146 while ( cur !== null ) {
21052147 this . _cleanStyles ( cur ) ;
21062148 cur = cur . nextElementSibling ;
@@ -2416,6 +2458,10 @@ Readability.prototype = {
24162458 if ( textLength === 0 ) {
24172459 return 0 ;
24182460 }
2461+ if ( e . querySelector ( 'h1, h2, h3, h4, h5, h6, h7' ) &&
2462+ ( e . textContent . trim ( ) || e . querySelector ( 'a' ) || e . querySelector ( 'svg' ) ) ) {
2463+ return 1 ;
2464+ }
24192465 var childrenLength = 0 ;
24202466 var children = this . _getAllNodesWithTag ( e , tags ) ;
24212467 this . _forEachNode (
@@ -2480,6 +2526,11 @@ Readability.prototype = {
24802526 return false ;
24812527 }
24822528
2529+ const iframe = e . querySelector ( 'iframe' )
2530+ if ( iframe && this . _allowedVideoRegex . test ( iframe . src ) ) {
2531+ return false
2532+ }
2533+
24832534 var weight = this . _getClassWeight ( node ) ;
24842535
24852536 this . log ( "Cleaning Conditionally" , node ) ;
@@ -2496,6 +2547,7 @@ Readability.prototype = {
24962547 // ominous signs, remove the element.
24972548 var p = node . getElementsByTagName ( "p" ) . length ;
24982549 var img = node . getElementsByTagName ( "img" ) . length ;
2550+ var video = node . getElementsByTagName ( 'video' ) . length
24992551 var li = node . getElementsByTagName ( "li" ) . length - 100 ;
25002552 var input = node . getElementsByTagName ( "input" ) . length ;
25012553 var headingDensity = this . _getTextDensity ( node , [
@@ -2594,9 +2646,9 @@ Readability.prototype = {
25942646 `Suspicious embed. (embedCount=${ embedCount } , contentLength=${ contentLength } )`
25952647 ) ;
25962648 }
2597- if ( img === 0 && textDensity === 0 ) {
2649+ if ( img === 0 && textDensity === 0 && video === 0 ) {
25982650 errs . push (
2599- `No useful content. (img=${ img } , textDensity=${ textDensity } )`
2651+ `No useful content. (img=${ img } , textDensity=${ textDensity } , video= ${ video } )`
26002652 ) ;
26012653 }
26022654
@@ -2631,6 +2683,11 @@ Readability.prototype = {
26312683 } ) ;
26322684 } ,
26332685
2686+ _haveAllowedVideoTag ( e ) {
2687+ const videos = Array . from ( e . querySelectorAll ( 'video' ) ) || [ ]
2688+ return ! ! videos . find ( video => this . _allowedVideoRegex . test ( video . getAttribute ( 'src' ) ) )
2689+ } ,
2690+
26342691 /**
26352692 * Clean out elements that match the specified conditions
26362693 *
@@ -2680,7 +2737,7 @@ Readability.prototype = {
26802737 }
26812738 var heading = this . _getInnerText ( node , false ) ;
26822739 this . log ( "Evaluating similarity of header:" , heading , this . _articleTitle ) ;
2683- return this . _textSimilarity ( this . _articleTitle , heading ) > 0.75 ;
2740+ return this . _textSimilarity ( this . _articleTitle , heading ) === 1 ;
26842741 } ,
26852742
26862743 _flagIsActive ( flag ) {
0 commit comments