Skip to content

Commit be1ee53

Browse files
authored
BlockRules Fixes (0.4.3) (#75)
- blockrules fix: when checking an iframe nav request, match inFrameUrl against the parent iframe, not current one - blockrules: cleanup, always allow 'pywb.proxy' static files - logging: when 'debug' logging enabled, log urls blocked and conditional iframe checks from blockrules - tests: add more complex test for blockrules - update CHANGES and support info in README - bump to 0.4.3
1 parent f0c5ca1 commit be1ee53

File tree

6 files changed

+85
-26
lines changed

6 files changed

+85
-26
lines changed

CHANGES.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
## CHANGES
22

3+
v0.4.3
4+
- BlockRules Fixes: When considering the 'inFrameUrl' for a navigation request for an iframe, use the URL of the parent frame.
5+
- BlockRules Fixes: Always allow pywb proxy scripts.
6+
- Logging: Improved debug logging for block rules (log blocked requests and conditional iframe requests) when 'debug' is set in 'logging'.
7+
38
v0.4.2
49
- Compose/docs: Build latest image by default, update README to refer to latest image
510
- Fix typo in `crawler.capturePrefix` that resulted in `directFetchCapture()` always failing

README.md

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -484,10 +484,9 @@ Then, loading the `http://localhost:8080/wr-net/https://webrecorder.net/` should
484484
Support
485485
-------
486486

487-
Initial support for development of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/)
487+
Initial support for the development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
488488

489-
Initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between
490-
Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.
489+
Additional support for Browsertrix Crawler, including for the development of the 0.4.x version, has been provided by [Portico](https://www.portico.org/).
491490

492491

493492
License

crawler.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ class Crawler {
329329
await this.initPages();
330330

331331
if (this.params.blockRules && this.params.blockRules.length) {
332-
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage);
332+
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, (text) => this.debugLog(text));
333333
}
334334

335335
if (this.params.screencastPort) {

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "browsertrix-crawler",
3-
"version": "0.4.2",
3+
"version": "0.4.3",
44
"main": "browsertrix-crawler",
55
"repository": "https://github.com/webrecorder/browsertrix-crawler",
66
"author": "Ilya Kreymer <[email protected]>, Webrecorder Software",

tests/blockrules.test.js

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,4 +130,36 @@ test("test block url in frame url", () => {
130130
});
131131

132132

133+
test("test block rules complex example, block external urls on main frame, but not on youtube", () => {
134+
const config = {
135+
"seeds": [
136+
"https://archiveweb.page/guide/troubleshooting/errors.html",
137+
],
138+
"depth": "0",
139+
"blockRules": [{
140+
"url": "(archiveweb.page|www.youtube.com)",
141+
"type": "allowOnly",
142+
"inFrameUrl": "archiveweb.page"
143+
}, {
144+
"url": "https://archiveweb.page/assets/js/vendor/lunr.min.js",
145+
"inFrameUrl": "archiveweb.page"
146+
}, {
147+
"url": "https://www.youtube.com/embed/",
148+
"type": "allowOnly",
149+
"frameTextMatch": "(\\\\\"channelId\\\\\":\\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\\")"
150+
}],
151+
152+
"combineWARC": true,
153+
154+
"logging": "stats,debug"
155+
};
156+
157+
158+
runCrawl("block-7", config);
159+
160+
expect(doesCDXContain("block-7", "\"https://archiveweb.page/assets/js/vendor/lunr.min.js\"")).toBe(false);
161+
expect(doesCDXContain("block-7", "\"video/mp4\"")).toBe(true);
162+
});
163+
164+
133165

util/blockrules.js

Lines changed: 44 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ const fetch = require("node-fetch");
22

33
const RULE_TYPES = ["block", "allowOnly"];
44

5+
const ALWAYS_ALLOW = ["https://pywb.proxy/", "http://pywb.proxy/"];
6+
57

68
// ===========================================================================
79
class BlockRule
@@ -37,20 +39,21 @@ ${this.frameTextMatch ? "Frame Text Regex: " + this.frameTextMatch : ""}
3739
// ===========================================================================
3840
class BlockRules
3941
{
40-
constructor(blockRules, blockPutUrl, blockErrMsg) {
42+
constructor(blockRules, blockPutUrl, blockErrMsg, debugLog) {
4143
this.rules = [];
4244
this.blockPutUrl = blockPutUrl;
4345
this.blockErrMsg = blockErrMsg;
46+
this.debugLog = debugLog;
4447
this.putUrlSet = new Set();
4548

4649
for (const ruleData of blockRules) {
4750
this.rules.push(new BlockRule(ruleData));
4851
}
4952

5053
if (this.rules.length) {
51-
console.log("URL Block Rules:\n");
54+
this.debugLog("URL Block Rules:\n");
5255
for (const rule of this.rules) {
53-
console.log(rule.toString());
56+
this.debugLog(rule.toString());
5457
}
5558
}
5659
}
@@ -79,15 +82,20 @@ class BlockRules
7982
return;
8083
}
8184

85+
// always allow special pywb proxy script
86+
for (const allowUrl of ALWAYS_ALLOW) {
87+
if (url.startsWith(allowUrl)) {
88+
request.continue();
89+
return;
90+
}
91+
}
92+
8293
for (const rule of this.rules) {
83-
const {done, block} = await this.shouldBlock(rule, request);
94+
const {done, block, frameUrl} = await this.shouldBlock(rule, request, url);
8495

8596
if (block) {
86-
//const frameUrl = request.frame().url();
87-
//console.log("Blocking/Aborting Request for: " + request.url());
88-
// not allowed, abort loading this response
8997
request.abort();
90-
await this.recordBlockMsg(request.url());
98+
await this.recordBlockMsg(url, frameUrl);
9199
return;
92100
}
93101
if (done) {
@@ -98,42 +106,56 @@ class BlockRules
98106
request.continue();
99107
}
100108

101-
async shouldBlock(rule, request) {
102-
const reqUrl = request.url();
103-
109+
async shouldBlock(rule, request, reqUrl) {
104110
const {url, inFrameUrl, frameTextMatch} = rule;
105111

106112
const type = rule.type || "block";
107113
const allowOnly = (type === "allowOnly");
108114

109-
const frameUrl = request.frame().url();
115+
const isNavReq = request.isNavigationRequest();
116+
117+
const frame = request.frame();
118+
119+
let frameUrl = null;
120+
121+
if (isNavReq) {
122+
const parentFrame = frame.parentFrame();
123+
if (parentFrame) {
124+
frameUrl = parentFrame.url();
125+
} else {
126+
frameUrl = frame.url();
127+
}
128+
} else {
129+
frameUrl = frame.url();
130+
}
110131

111132
// ignore initial page
112133
if (frameUrl === "about:blank") {
113-
return {block: false, done: true};
134+
return {block: false, done: true, frameUrl};
114135
}
115136

116137
// not a frame match, skip rule
117138
if (inFrameUrl && !frameUrl.match(inFrameUrl)) {
118-
return {block: false, done: false};
139+
return {block: false, done: false, frameUrl};
119140
}
120141

121142
const urlMatched = (url && reqUrl.match(url));
122143

123144
// if frame text-based rule: if url matched and a frame request
124145
// frame text-based match: only applies to nav requests, never block otherwise
125146
if (frameTextMatch) {
126-
if (!urlMatched || !request.isNavigationRequest()) {
127-
return {block: false, done: false};
147+
if (!urlMatched || !isNavReq) {
148+
return {block: false, done: false, frameUrl};
128149
}
129150

130151
const block = await this.isTextMatch(request, reqUrl, frameTextMatch) ? !allowOnly : allowOnly;
131-
return {block, done: true};
152+
this.debugLog(`iframe ${url} conditionally ${block ? "BLOCKED" : "ALLOWED"}, parent frame ${frameUrl}`);
153+
return {block, done: true, frameUrl};
132154
}
133155

134156
// for non frame text rule, simply match by URL
135157
const block = urlMatched ? !allowOnly : allowOnly;
136-
return {block, done: false};
158+
return {block, done: false, frameUrl};
137159
}
138160

139161
async isTextMatch(request, reqUrl, frameTextMatch) {
@@ -144,11 +166,13 @@ class BlockRules
144166
return !!text.match(frameTextMatch);
145167

146168
} catch (e) {
147-
console.log(e);
169+
this.debugLog(e);
148170
}
149171
}
150172

151-
async recordBlockMsg(url) {
173+
async recordBlockMsg(url, frameUrl) {
174+
this.debugLog(`URL Blocked/Aborted: ${url} in frame ${frameUrl}`);
175+
152176
if (!this.blockErrMsg || !this.blockPutUrl) {
153177
return;
154178
}
@@ -162,7 +186,6 @@ class BlockRules
162186
const body = this.blockErrMsg;
163187
const putUrl = new URL(this.blockPutUrl);
164188
putUrl.searchParams.set("url", url);
165-
//console.log("put url", putUrl.href);
166189
await fetch(putUrl.href, {method: "PUT", headers: {"Content-Type": "text/html"}, body});
167190
}
168191
}

0 commit comments

Comments
 (0)