Skip to content

Commit 8d7fb1e

Browse files
authored
1.2.8 updates: (#668)
- rewriting: update wabac.js, use getCustomRewriter(), don't truncate POST request bodies for URLs that use a custom rewriter
- browser: disable --enable-automation (which was setting webdriver = true), so the navigator.webdriver override is no longer needed
- deps: update puppeteer-core, with the changes necessary for the latest puppeteer
1 parent bb34c5e commit 8d7fb1e

File tree

7 files changed

+53
-50
lines changed

7 files changed

+53
-50
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ ADD config/ /app/
4242

4343
ADD html/ /app/html/
4444

45-
ARG RWP_VERSION=2.1.3
45+
ARG RWP_VERSION=2.1.4
4646
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
4747
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
4848
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

package.json

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "browsertrix-crawler",
3-
"version": "1.2.7",
3+
"version": "1.2.8",
44
"main": "browsertrix-crawler",
55
"type": "module",
66
"repository": "https://github.com/webrecorder/browsertrix-crawler",
@@ -18,8 +18,8 @@
1818
"dependencies": {
1919
"@novnc/novnc": "^1.4.0",
2020
"@types/sax": "^1.2.7",
21-
"@webrecorder/wabac": "^2.19.4",
22-
"browsertrix-behaviors": "^0.6.3",
21+
"@webrecorder/wabac": "^2.19.7",
22+
"browsertrix-behaviors": "^0.6.4",
2323
"fetch-socks": "^1.3.0",
2424
"get-folder-size": "^4.0.0",
2525
"husky": "^8.0.3",
@@ -30,7 +30,7 @@
3030
"p-queue": "^7.3.4",
3131
"pixelmatch": "^5.3.0",
3232
"pngjs": "^7.0.0",
33-
"puppeteer-core": "^22.14.0",
33+
"puppeteer-core": "^23.0.2",
3434
"sax": "^1.3.0",
3535
"sharp": "^0.32.6",
3636
"tsc": "^2.0.4",

src/util/browser.ts

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ export class Browser {
113113
headless,
114114
executablePath: this.getBrowserExe(),
115115
ignoreDefaultArgs: ["--enable-automation", "--hide-scrollbars"],
116-
ignoreHTTPSErrors: true,
116+
acceptInsecureCerts: true,
117117
handleSIGHUP: signals,
118118
handleSIGINT: signals,
119119
handleSIGTERM: signals,
@@ -140,11 +140,6 @@ export class Browser {
140140
}
141141

142142
async setupPage({ page }: { page: Page; cdp: CDPSession }) {
143-
await this.addInitScript(
144-
page,
145-
'Object.defineProperty(navigator, "webdriver", {value: false});',
146-
);
147-
148143
switch (this.swOpt) {
149144
case "disabled":
150145
logger.debug("Service Workers: always disabled", {}, "browser");

src/util/recorder.ts

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,8 @@ import {
1414

1515
import { fetch, Response } from "undici";
1616

17-
import {
18-
baseRules as baseDSRules,
19-
htmlRules as htmlDSRules,
20-
// @ts-expect-error TODO fill in why error is expected
21-
} from "@webrecorder/wabac/src/rewrite/index.js";
17+
// @ts-expect-error TODO fill in why error is expected
18+
import { getCustomRewriter } from "@webrecorder/wabac/src/rewrite/index.js";
2219
import {
2320
rewriteDASH,
2421
rewriteHLS,
@@ -1003,10 +1000,9 @@ export class Recorder {
10031000
case "text/javascript":
10041001
case "application/javascript":
10051002
case "application/x-javascript": {
1006-
const rules = contentType === "text/html" ? htmlDSRules : baseDSRules;
1007-
const rw = rules.getRewriter(url);
1003+
const rw = getCustomRewriter(url, isHTMLMime(contentType));
10081004

1009-
if (rw !== rules.defaultRewriter) {
1005+
if (rw) {
10101006
string = payload.toString();
10111007
newString = rw.rewrite(string, { live: true, save: extraOpts });
10121008
}

src/util/reqresp.ts

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
// @ts-expect-error TODO fill in why error is expected
22
import { getStatusText } from "@webrecorder/wabac/src/utils.js";
3+
// @ts-expect-error TODO fill in why error is expected
4+
import { getCustomRewriter } from "@webrecorder/wabac/src/rewrite/index.js";
35

46
import { Protocol } from "puppeteer-core";
57
import { postToGetUrl } from "warcio";
@@ -372,8 +374,11 @@ export class RequestResponseInfo {
372374
};
373375

374376
if (postToGetUrl(convData)) {
375-
//this.requestBody = convData.requestBody;
376-
// truncate to avoid extra long URLs
377+
// if not custom rewrite, truncate to avoid extra long URLs
378+
if (getCustomRewriter(this.url, isHTMLMime(this.getMimeType() || ""))) {
379+
return convData.url;
380+
}
381+
377382
try {
378383
const url = new URL(convData.url);
379384
for (const [key, value] of url.searchParams.entries()) {

src/util/screenshots.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ export class Screenshots {
7070
});
7171
}
7272
const options = screenshotTypes[screenshotType];
73-
const screenshotBuffer = await this.page.screenshot(options);
73+
const screenshotBuffer = Buffer.from(await this.page.screenshot(options));
7474
if (state && screenshotType === "view") {
7575
state.screenshotView = screenshotBuffer;
7676
}

yarn.lock

Lines changed: 35 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1300,21 +1300,21 @@
13001300
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
13011301
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
13021302

1303-
"@webrecorder/wabac@^2.19.4":
1304-
version "2.19.4"
1305-
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.4.tgz#6c91a65928413b8394f17b57f57a803dcb111dbe"
1306-
integrity sha512-USWUoreSfgyeYYrC2/o2YYr4dCUSwgOSzbpdapqh90VQ4Fb0fjwPAiessBCH4rA5yd9QpOgWdkapDmXvLx6Bww==
1303+
"@webrecorder/wabac@^2.19.7":
1304+
version "2.19.7"
1305+
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.19.7.tgz#3afe48f79752bcd189cffd5d5e6a8dbe4f394053"
1306+
integrity sha512-X9UFxWCww1KWDnAaEjg7vpg6SznBov5a88FPxbOvo5yCT/UkJcQHaa0qo1L52l46sIAUnSbsYz1ur9yMd6ygVA==
13071307
dependencies:
13081308
"@peculiar/asn1-ecc" "^2.3.4"
13091309
"@peculiar/asn1-schema" "^2.3.3"
13101310
"@peculiar/x509" "^1.9.2"
1311-
"@webrecorder/wombat" "^3.7.11"
1311+
"@webrecorder/wombat" "^3.7.14"
13121312
acorn "^8.10.0"
13131313
auto-js-ipfs "^2.1.1"
13141314
base64-js "^1.5.1"
13151315
brotli "^1.3.3"
13161316
buffer "^6.0.3"
1317-
fast-xml-parser "^4.4.0"
1317+
fast-xml-parser "^4.4.1"
13181318
hash-wasm "^4.9.0"
13191319
http-link-header "^1.1.3"
13201320
http-status-codes "^2.1.4"
@@ -1329,10 +1329,10 @@
13291329
stream-browserify "^3.0.0"
13301330
warcio "^2.2.1"
13311331

1332-
"@webrecorder/wombat@^3.7.11":
1333-
version "3.7.11"
1334-
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.11.tgz#27539f52317b2d80af4f28d971d59b53bc0f2b96"
1335-
integrity sha512-WlGpKjHUpP2aZo/OrY5aduNX/TVdo+hSkzu9as/63wSQ4ZFWIqZ+pxYXci43hjV5oVjcMP4KALLq+V+Fuo8qSA==
1332+
"@webrecorder/wombat@^3.7.14":
1333+
version "3.7.14"
1334+
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.7.14.tgz#3779e4cadb256755bbbfd2960805965ec4daacd8"
1335+
integrity sha512-sDNH+c8WstQrK91y8kIPJh1XAC2WXLU5rC8wztANzK1mVzA7v6XB5gk3Yp7OIAn4bn1XuGRVjubhKhmxVVZ9kg==
13361336
dependencies:
13371337
warcio "^2.2.0"
13381338

@@ -1677,10 +1677,10 @@ browserslist@^4.22.2:
16771677
node-releases "^2.0.14"
16781678
update-browserslist-db "^1.0.13"
16791679

1680-
browsertrix-behaviors@^0.6.3:
1681-
version "0.6.3"
1682-
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.3.tgz#cdd6457bcc718cc30257fd754a2c12191a6431a2"
1683-
integrity sha512-fr9w8ANqmxDid4Ile+dYjwcU5nD4+ZhTBVID2zBYWNoSoFkrEILUtpSAbBmLtr5Ujulxjn71uUQwMOfAFAUqzw==
1680+
browsertrix-behaviors@^0.6.4:
1681+
version "0.6.4"
1682+
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.6.4.tgz#33fe9a433108f2faac3a03af91aff940433e5b87"
1683+
integrity sha512-xaiO/VqqeSd5FnAkIKQINxC/q3Med33Lqw3LGxD4NBtkcMSh1Anz/+830QHVlQbp08nIPUXYV96hDrx1Uv0PmQ==
16841684
dependencies:
16851685
query-selector-shadow-dom "^1.0.1"
16861686

@@ -1801,10 +1801,10 @@ chownr@^1.1.1:
18011801
resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b"
18021802
integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==
18031803

1804-
chromium-bidi@0.6.2:
1805-
version "0.6.2"
1806-
resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-0.6.2.tgz#91f9daa20984833b52221084480fbe0465b29c67"
1807-
integrity sha512-4WVBa6ijmUTVr9cZD4eicQD8Mdy/HCX3bzEIYYpmk0glqYLoWH+LqQEvV9RpDRzoQSbY1KJHloYXbDMXMbDPhg==
1804+
chromium-bidi@0.6.4:
1805+
version "0.6.4"
1806+
resolved "https://registry.yarnpkg.com/chromium-bidi/-/chromium-bidi-0.6.4.tgz#627d76bae2819d59b61a413babe9664e0a16b71d"
1807+
integrity sha512-8zoq6ogmhQQkAKZVKO2ObFTl4uOkqoX1PlKQX3hZQ5E9cbUotcAb7h4pTNVAGGv8Z36PF3CtdOriEp/Rz82JqQ==
18081808
dependencies:
18091809
mitt "3.0.1"
18101810
urlpattern-polyfill "10.0.0"
@@ -1973,6 +1973,13 @@ debug@^4.3.5:
19731973
dependencies:
19741974
ms "2.1.2"
19751975

1976+
debug@^4.3.6:
1977+
version "4.3.6"
1978+
resolved "https://registry.yarnpkg.com/debug/-/debug-4.3.6.tgz#2ab2c38fbaffebf8aa95fdfe6d88438c7a13c52b"
1979+
integrity sha512-O/09Bd4Z1fBrU4VzkhFqVgpPzaGbw6Sm9FEkBT1A/YBXQFGuuSxa1dN2nxgxS34JmKXqYx8CZAwEVoJFImUXIg==
1980+
dependencies:
1981+
ms "2.1.2"
1982+
19761983
decode-uri-component@^0.2.2:
19771984
version "0.2.2"
19781985
resolved "https://registry.yarnpkg.com/decode-uri-component/-/decode-uri-component-0.2.2.tgz#e69dbe25d37941171dd540e024c444cd5188e1e9"
@@ -2428,10 +2435,10 @@ fast-xml-parser@^4.2.2:
24282435
dependencies:
24292436
strnum "^1.0.5"
24302437

2431-
fast-xml-parser@^4.4.0:
2432-
version "4.4.0"
2433-
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.0.tgz#341cc98de71e9ba9e651a67f41f1752d1441a501"
2434-
integrity sha512-kLY3jFlwIYwBNDojclKsNAC12sfD6NwW74QB2CoNGPvtVxjliYehVunB3HYyNi+n4Tt1dAcgwYvmKF/Z18flqg==
2438+
fast-xml-parser@^4.4.1:
2439+
version "4.4.1"
2440+
resolved "https://registry.yarnpkg.com/fast-xml-parser/-/fast-xml-parser-4.4.1.tgz#86dbf3f18edf8739326447bcaac31b4ae7f6514f"
2441+
integrity sha512-xkjOecfnKGkSsOwtZ5Pz7Us/T6mrbPQrq0nh+aCO5V9nk5NLWmasAHumTKjiPJPWANe+kAZ84Jc8ooJkzZ88Sw==
24352442
dependencies:
24362443
strnum "^1.0.5"
24372444

@@ -4345,14 +4352,14 @@ punycode@^2.1.0:
43454352
resolved "https://registry.yarnpkg.com/punycode/-/punycode-2.1.1.tgz#b58b010ac40c22c5657616c8d2c2c02c7bf479ec"
43464353
integrity sha512-XRsRjdf+j5ml+y/6GKHPZbrF/8p2Yga0JPtdqTIY2Xe5ohJPD9saDJJLPvp9+NSBprVvevdXZybnj2cv8OEd0A==
43474354

4348-
puppeteer-core@^22.14.0:
4349-
version "22.14.0"
4350-
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-22.14.0.tgz#5bb466adba725c966b0a86f0337a476d4c68ebec"
4351-
integrity sha512-rl4tOY5LcA3e374GAlsGGHc05HL3eGNf5rZ+uxkl6id9zVZKcwcp1Z+Nd6byb6WPiPeecT/dwz8f/iUm+AZQSw==
4355+
puppeteer-core@^23.0.2:
4356+
version "23.0.2"
4357+
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-23.0.2.tgz#343c8d003e609620febfe35f76847a0014cdc97c"
4358+
integrity sha512-MvOHn+g1TYkAR2oVd/bf/YWXKqFTJmkhyyurYgxkrjh8rBOL1ZH5VyOsLJi0bLO7/yoipAmk1gFZEx9HUJnaoA==
43524359
dependencies:
43534360
"@puppeteer/browsers" "2.3.0"
4354-
chromium-bidi "0.6.2"
4355-
debug "^4.3.5"
4361+
chromium-bidi "0.6.4"
4362+
debug "^4.3.6"
43564363
devtools-protocol "0.0.1312386"
43574364
ws "^8.18.0"
43584365

0 commit comments

Comments
 (0)