Skip to content

Commit 032b51d

Browse files
committed
📦 synchronising historical changes
1 parent 04fd32f commit 032b51d

File tree

4 files changed

+116
-20
lines changed

4 files changed

+116
-20
lines changed

‎.github/publish.yml‎

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
name: Node.js Package
2+
3+
on:
4+
push:
5+
branches:
6+
- main
7+
8+
jobs:
9+
build:
10+
runs-on: ubuntu-latest
11+
permissions:
12+
packages: write
13+
contents: read
14+
steps:
15+
- uses: actions/checkout@v4
16+
- uses: actions/setup-node@v4
17+
with:
18+
node-version: 20.11.1
19+
registry-url: https://npm.pkg.github.com/
20+
- run: npm ci
21+
- run: npm install
22+
- run: npm publish
23+
env:
24+
NODE_AUTH_TOKEN: ${{secrets.GITHUB_TOKEN}}

‎README.md‎

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,18 @@
1+
# Slax-Readability.js
2+
3+
The original Readability is prone to removing too much of an article's legitimate content. This fork is intended only to address the following specific issues, so that article content is not lost:
4+
5+
- Add a unified whitelist for clean class
6+
- Add a whitelist for ignoring style removal
7+
- Retain elements in the highlights block
8+
- Adjust the video regular expression
9+
- Retain iframes with video src
10+
- Add a unified `preserve-class-tag` class name for preserving classes
11+
- Added ignore handling for microsoft video numbers
12+
- Expand the scope of paragraph preservation
13+
- Preserve wechat image style, skip processing
14+
- Treat H1–H7 heading tags more leniently so they are less likely to be removed
15+
116
# Readability.js
217

318
A standalone version of the readability library used for [Firefox Reader View](https://support.mozilla.org/kb/firefox-reader-view-clutter-free-web-pages).

‎Readability.js‎

Lines changed: 75 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -139,19 +139,22 @@ Readability.prototype = {
139139
// Readability-readerable.js. Please keep both copies in sync.
140140
unlikelyCandidates:
141141
/-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|footer|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
142-
okMaybeItsACandidate: /and|article|body|column|content|main|shadow/i,
143-
142+
okMaybeItsACandidate: /and|article|body|column|content|main|shadow|hljs-|katex-|mp-common-videosnap|preserve-class-tag/i,
143+
needSaveHeaderTitle: /H[1-7]{1}/,
144+
ignoreCleanClassesWhitelist: /^(hljs|katex|mp-common-videosnap|preserve-class-tag)(-.*)?$/i,
145+
stylePreserveClassCandidates: /^(katex-|mp-common-videosnap)(-.*)?$/i,
146+
ignoreCleanStylesWhitelist: /^(hljs|katex|mp-common-videosnap)(-.*)?$/i,
144147
positive:
145148
/article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
146149
negative:
147150
/-ad-|hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|footer|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|widget/i,
148151
extraneous:
149152
/print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
150-
byline: /byline|author|dateline|writtenby|p-author/i,
153+
byline: /byline|author|dateline|writtenby|p-author|profileBt|bio/i,
151154
replaceFonts: /<(\/?)font[^>]*>/gi,
152155
normalize: /\s{2,}/g,
153156
videos:
154-
/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)/i,
157+
/\/\/(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|mpvideo|qpic|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)|.cn/i,
155158
shareElements: /(\b|_)(share|sharedaddy)(\b|_)/i,
156159
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
157160
prevLink: /(prev|earl|old|new|<|«)/i,
@@ -208,7 +211,6 @@ Readability.prototype = {
208211
"frame",
209212
"hspace",
210213
"rules",
211-
"style",
212214
"valign",
213215
"vspace",
214216
],
@@ -258,6 +260,22 @@ Readability.prototype = {
258260
"TIME",
259261
"VAR",
260262
"WBR",
263+
"SVG",
264+
"PATH",
265+
"G",
266+
"FIGURE",
267+
"FIGCAPTION",
268+
"PICTURE",
269+
"SOURCE",
270+
"TRACK",
271+
"AREA",
272+
"MAP",
273+
"TABLE",
274+
"ARTICLE",
275+
"SECTION",
276+
"P",
277+
"OL",
278+
"UL",
261279
],
262280

263281
// These are the classes that readability sets itself.
@@ -416,9 +434,19 @@ Readability.prototype = {
416434
*/
417435
_cleanClasses(node) {
418436
var classesToPreserve = this._classesToPreserve;
437+
var ignoreCleanClassesWhitelist = this.REGEXPS.ignoreCleanClassesWhitelist
438+
var hasWhitelistClass = false
439+
419440
var className = (node.getAttribute("class") || "")
420441
.split(/\s+/)
421-
.filter(cls => classesToPreserve.includes(cls))
442+
.filter(cls => {
443+
if (ignoreCleanClassesWhitelist.test(cls)) {
444+
hasWhitelistClass = true
445+
return true
446+
}
447+
448+
return classesToPreserve.includes(cls)
449+
})
422450
.join(" ");
423451

424452
if (className) {
@@ -427,7 +455,9 @@ Readability.prototype = {
427455
node.removeAttribute("class");
428456
}
429457

430-
for (node = node.firstElementChild; node; node = node.nextElementSibling) {
458+
459+
460+
for (node = !hasWhitelistClass ? node.firstElementChild : null; node; node = node.nextElementSibling) {
431461
this._cleanClasses(node);
432462
}
433463
},
@@ -595,12 +625,11 @@ Readability.prototype = {
595625
// If there's a separator in the title, first remove the final part
596626
if (/ [\|\-\\\/>»] /.test(curTitle)) {
597627
titleHadHierarchicalSeparators = / [\\\/>»] /.test(curTitle);
598-
let allSeparators = Array.from(origTitle.matchAll(/ [\|\-\\\/>»] /gi));
599-
curTitle = origTitle.substring(0, allSeparators.pop().index);
628+
curTitle = origTitle.replace(/(.*)[\|\-\\\/>»] .*/gi, "$1");
600629

601630
// If the resulting title is too short, remove the first part instead:
602631
if (wordCount(curTitle) < 3) {
603-
curTitle = origTitle.replace(/^[^\|\-\\\/>»]*[\|\-\\\/>»]/gi, "");
632+
curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, "$1");
604633
}
605634
} else if (curTitle.includes(": ")) {
606635
// Check if we have an heading containing this exact string, so we
@@ -825,6 +854,11 @@ Readability.prototype = {
825854
this._cleanConditionally(articleContent, "ul");
826855
this._cleanConditionally(articleContent, "div");
827856

857+
this._replaceNodeTags(
858+
this._getAllNodesWithTag(articleContent, ["slax-mark"]),
859+
"span"
860+
);
861+
828862
// replace H1 with H2 as H1 should be only title that is displayed separately
829863
this._replaceNodeTags(
830864
this._getAllNodesWithTag(articleContent, ["h1"]),
@@ -1064,9 +1098,11 @@ Readability.prototype = {
10641098
var matchString = node.className + " " + node.id;
10651099

10661100
if (!this._isProbablyVisible(node)) {
1067-
this.log("Removing hidden node - " + matchString);
1068-
node = this._removeAndGetNext(node);
1069-
continue;
1101+
if (!this._haveAllowedVideoTag(node)) {
1102+
this.log("Removing hidden node - " + matchString);
1103+
node = this._removeAndGetNext(node);
1104+
continue;
1105+
}
10701106
}
10711107

10721108
// User is not able to see elements applied with both "aria-modal = true" and "role = dialog"
@@ -1121,7 +1157,8 @@ Readability.prototype = {
11211157
!this._hasAncestorTag(node, "table") &&
11221158
!this._hasAncestorTag(node, "code") &&
11231159
node.tagName !== "BODY" &&
1124-
node.tagName !== "A"
1160+
node.tagName !== "A" &&
1161+
!this.REGEXPS.needSaveHeaderTitle.test(node.tagName)
11251162
) {
11261163
this.log("Removing unlikely candidate - " + matchString);
11271164
node = this._removeAndGetNext(node);
@@ -2100,7 +2137,12 @@ Readability.prototype = {
21002137
e.removeAttribute("height");
21012138
}
21022139

2103-
var cur = e.firstElementChild;
2140+
if (!this.REGEXPS.stylePreserveClassCandidates.test(e.className)) {
2141+
e.removeAttribute('style')
2142+
}
2143+
2144+
const ignore = this.REGEXPS.ignoreCleanStylesWhitelist.test(e.className);
2145+
var cur = !ignore ? e.firstElementChild : null;
21042146
while (cur !== null) {
21052147
this._cleanStyles(cur);
21062148
cur = cur.nextElementSibling;
@@ -2416,6 +2458,10 @@ Readability.prototype = {
24162458
if (textLength === 0) {
24172459
return 0;
24182460
}
2461+
if (e.querySelector('h1, h2, h3, h4, h5, h6, h7') &&
2462+
(e.textContent.trim() || e.querySelector('a') || e.querySelector('svg'))) {
2463+
return 1;
2464+
}
24192465
var childrenLength = 0;
24202466
var children = this._getAllNodesWithTag(e, tags);
24212467
this._forEachNode(
@@ -2480,6 +2526,11 @@ Readability.prototype = {
24802526
return false;
24812527
}
24822528

2529+
const iframe = e.querySelector('iframe')
2530+
if (iframe && this._allowedVideoRegex.test(iframe.src)) {
2531+
return false
2532+
}
2533+
24832534
var weight = this._getClassWeight(node);
24842535

24852536
this.log("Cleaning Conditionally", node);
@@ -2496,6 +2547,7 @@ Readability.prototype = {
24962547
// ominous signs, remove the element.
24972548
var p = node.getElementsByTagName("p").length;
24982549
var img = node.getElementsByTagName("img").length;
2550+
var video = node.getElementsByTagName('video').length
24992551
var li = node.getElementsByTagName("li").length - 100;
25002552
var input = node.getElementsByTagName("input").length;
25012553
var headingDensity = this._getTextDensity(node, [
@@ -2594,9 +2646,9 @@ Readability.prototype = {
25942646
`Suspicious embed. (embedCount=${embedCount}, contentLength=${contentLength})`
25952647
);
25962648
}
2597-
if (img === 0 && textDensity === 0) {
2649+
if (img === 0 && textDensity === 0 && video === 0) {
25982650
errs.push(
2599-
`No useful content. (img=${img}, textDensity=${textDensity})`
2651+
`No useful content. (img=${img}, textDensity=${textDensity}, video=${video})`
26002652
);
26012653
}
26022654

@@ -2631,6 +2683,11 @@ Readability.prototype = {
26312683
});
26322684
},
26332685

2686+
_haveAllowedVideoTag(e) {
2687+
const videos = Array.from(e.querySelectorAll('video')) || []
2688+
return !!videos.find(video => this._allowedVideoRegex.test(video.getAttribute('src')))
2689+
},
2690+
26342691
/**
26352692
* Clean out elements that match the specified conditions
26362693
*
@@ -2680,7 +2737,7 @@ Readability.prototype = {
26802737
}
26812738
var heading = this._getInnerText(node, false);
26822739
this.log("Evaluating similarity of header:", heading, this._articleTitle);
2683-
return this._textSimilarity(this._articleTitle, heading) > 0.75;
2740+
return this._textSimilarity(this._articleTitle, heading) === 1;
26842741
},
26852742

26862743
_flagIsActive(flag) {

‎package.json‎

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
2-
"name": "@mozilla/readability",
3-
"version": "0.6.0",
2+
"name": "@slax-lab/slax-readability",
3+
"version": "0.6.1",
44
"description": "A standalone version of the readability library used for Firefox Reader View.",
55
"main": "index.js",
66
"types": "index.d.ts",

0 commit comments

Comments
 (0)