Skip to content

Commit 9a3fa33

Browse files
committed
serialization fixes:
- serialize original POST request to WARC to avoid confusion (partial fix for webrecorder/replayweb.page#61)
- don't serialize empty revisit records with just a status line (fixed via warcio.js 1.4.7)
optimization: don't convert the response body to a string unless rewriting!
also: don't convert POST-to-GET on replay, as this is now supported in Chrome (via wombat 3.3.0)
fidelity improvements from wabac.js 2.8.0-beta.0 + wombat 3.3.0
recorder: when getting text nodes, time out after 10s in case the getDOM() call does not return (can happen in the case of PDFs)
dependencies: update to wabac.js 2.8.0-beta.0 + wombat 3.3.0 + warcio.js
signing: update to new format, store signed data in the signedData field in datapackage-digest.json
bump to 0.6.12
1 parent f6f1dfe commit 9a3fa33

File tree

10 files changed

+94
-52
lines changed

10 files changed

+94
-52
lines changed

package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
{
22
"name": "archiveweb.page",
3-
"version": "0.6.11",
3+
"version": "0.6.12",
44
"main": "index.js",
55
"description": "Create Web Archives directly in your browser",
66
"repository": "https://github.com/webrecorder/archiveweb.page",
77
"author": "Webrecorder Software",
88
"license": "AGPL-3.0-or-later",
99
"dependencies": {
1010
"@fortawesome/fontawesome-free": "^5.13.0",
11-
"@webrecorder/wabac": "^2.7.11",
11+
"@webrecorder/wabac": "^2.8.0-beta.0",
1212
"browsertrix-behaviors": "^0.2.3",
1313
"bulma": "^0.9.2",
1414
"flexsearch": "^0.6.32",
@@ -23,7 +23,7 @@
2323
"pretty-bytes": "^5.3.0",
2424
"replaywebpage": "^1.4.6",
2525
"uuid": "^8.3.2",
26-
"warcio": "^1.4.5"
26+
"warcio": "^1.4.7"
2727
},
2828
"devDependencies": {
2929
"copy-webpack-plugin": "^6.4.0",

src/downloader.js

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -453,9 +453,7 @@ class Downloader
453453

454454
if (this.signer) {
455455
try {
456-
const {signature, publicKey} = await this.signer.sign(hash);
457-
data.signature = signature;
458-
data.publicKey = publicKey;
456+
data.signedData = await this.signer.sign(hash, this.createdDate);
459457

460458
this.signer.close();
461459
this.signer = null;
@@ -600,10 +598,10 @@ class Downloader
600598
}
601599

602600
async createWARCRecord(resource) {
603-
const url = resource.url;
601+
let url = resource.url;
604602
const date = new Date(resource.ts).toISOString();
605603
resource.timestamp = getTSMillis(date);
606-
const httpHeaders = resource.respHeaders;
604+
const httpHeaders = resource.respHeaders || {};
607605
const warcVersion = "WARC/1.1";
608606

609607
// remove, as never preserved in browser-based capture
@@ -616,12 +614,26 @@ class Downloader
616614

617615
let refersToUrl, refersToDate;
618616

617+
let method = "GET";
618+
let requestBody;
619+
620+
// non-GET request/response:
621+
// if original request body + original requestURL is preserved, write that with original method
622+
// otherwise, just serialize the converted-to-GET form
623+
if (resource.method && resource.method !== "GET" && resource.requestBody && resource.requestUrl) {
624+
requestBody = resource.requestBody;
625+
method = resource.method;
626+
url = resource.requestUrl;
627+
} else {
628+
requestBody = new Uint8Array([]);
629+
}
630+
619631
const digestOriginal = this.digestsVisted[resource.digest];
620632

621633
if (resource.digest && digestOriginal) {
622634

623635
// if exact resource in a row, and same page, then just skip instead of writing revisit
624-
if (url === this.lastUrl && pageId === this.lastPageId) {
636+
if (url === this.lastUrl && pageId === this.lastPageId && method === "GET") {
625637
//console.log("Skip Dupe: " + url);
626638
return null;
627639
}
@@ -657,7 +669,9 @@ class Downloader
657669
return null;
658670
}
659671

660-
this.digestsVisted[resource.digest] = {url, date};
672+
if (method === "GET") {
673+
this.digestsVisted[resource.digest] = {url, date};
674+
}
661675
}
662676

663677
const status = resource.status || 200;
@@ -700,7 +714,6 @@ class Downloader
700714
"WARC-Concurrent-To": record.warcHeader("WARC-Record-ID"),
701715
};
702716

703-
const method = resource.method || "GET";
704717
const urlParsed = new URL(url);
705718
const statusline = method + " " + url.slice(urlParsed.origin.length);
706719

@@ -709,7 +722,7 @@ class Downloader
709722
warcHeaders: reqWarcHeaders,
710723
httpHeaders: resource.reqHeaders,
711724
statusline,
712-
}, getPayload(new Uint8Array([])));
725+
}, getPayload(requestBody));
713726

714727
records.push(await WARCSerializer.serialize(reqRecord, {gzip: true}));
715728
}

src/keystore.js

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@ export class Signer
7474
}
7575
}
7676

77-
async sign(string) {
77+
async sign(string, created) {
7878
let keyPair;
7979
let keys = await this.loadKeys();
8080

@@ -119,7 +119,14 @@ export class Signer
119119

120120
signature = encodeBase64(new Uint8Array(signature));
121121

122-
return { signature, publicKey: keys.public };
122+
return {
123+
hash: string,
124+
signature,
125+
publicKey: keys.public,
126+
created,
127+
// eslint-disable-next-line no-undef
128+
software: `ArchiveWeb.page ${__VERSION__}`
129+
};
123130
}
124131

125132
async saveKeys(keys, id = "_userkey") {

src/recorder.js

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -627,9 +627,11 @@ class Recorder {
627627
}
628628

629629
try {
630-
//const startTime = new Date().getTime();
631-
return await this.send("DOM.getDocument", {"depth": -1, "pierce": true});
632-
//console.log(`Time getting text for ${this.pageInfo.id}: ${(new Date().getTime() - startTime)}`);
630+
// wait up to 10s for getDocument, otherwise proceed
631+
return await Promise.race([
632+
this.send("DOM.getDocument", {"depth": -1, "pierce": true}),
633+
sleep(10000)
634+
]);
633635
} catch(e) {
634636
console.log(e);
635637
return null;
@@ -962,10 +964,10 @@ class Recorder {
962964
case "text/javascript":
963965
case "application/javascript":
964966
case "application/x-javascript": {
965-
string = payload.toString("utf-8");
966967
const rw = baseDSRules.getRewriter(params.request.url);
967968

968969
if (rw !== baseDSRules.defaultRewriter) {
970+
string = payload.toString("utf-8");
969971
newString = rw.rewrite(string, {live: true});
970972
}
971973
break;

src/requestresponseinfo.js

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ const MAX_URL_LENGTH = 4096;
99

1010
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
1111

12+
const encoder = new TextEncoder();
13+
1214

1315
// ===========================================================================
1416
class RequestResponseInfo
@@ -154,6 +156,8 @@ class RequestResponseInfo
154156
respHeaders.headersDict["x-wabac-preset-cookie"] = cookie;
155157
}
156158

159+
const reqUrl = this.url;
160+
157161
if (this.postData) {
158162
const convData = {
159163
url: this.url,
@@ -168,7 +172,8 @@ class RequestResponseInfo
168172
}
169173
}
170174

171-
const data = {url: this.url,
175+
const data = {
176+
url: this.url,
172177
ts: this.ts,
173178
status: this.status,
174179
statusText:this.statusText,
@@ -182,6 +187,14 @@ class RequestResponseInfo
182187

183188
if (this.method !== "GET") {
184189
data.method = this.method;
190+
if (this.postData) {
191+
if (typeof(this.postData) === "string") {
192+
data.requestBody = encoder.encode(this.postData);
193+
} else {
194+
data.requestBody = this.postData;
195+
}
196+
data.requestUrl = reqUrl;
197+
}
185198
}
186199

187200
return data;

wr-ext/bg.js

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wr-ext/manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "Webrecorder ArchiveWeb.page",
33
"description": "Create high-fidelity web archives directly in your browser",
4-
"version": "0.6.11",
4+
"version": "0.6.12",
55
"content_security_policy": "script-src 'self' 'unsafe-eval'; object-src 'self'",
66
"permissions": [
77
"debugger",

wr-ext/replay/sw.js

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wr-ext/replay/ui.js

Lines changed: 3 additions & 3 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

yarn.lock

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -876,12 +876,12 @@
876876
warcio "^1.4.6"
877877
wbn "^0.0.3"
878878

879-
"@webrecorder/wabac@^2.7.11":
880-
version "2.7.11"
881-
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.7.11.tgz#6ffaada9bcf702245e8bd948d387ca0a675d9e37"
882-
integrity sha512-uZz3b2kJRuGzE8wP0tJ848oIaqJHPrXivnDHd6wew3bPDhLr5s9twO1ozn9P6hJSkL3H6f5Yf8gw3HTKbNtygA==
879+
"@webrecorder/wabac@^2.8.0-beta.0":
880+
version "2.8.0-beta.0"
881+
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.8.0-beta.0.tgz#0bdc8b95cea811d0eef9af11602f85cde28cdd87"
882+
integrity sha512-JWZ0/VtuJ0ax1GmQwpcqYnU6rWjvgLTgoHRTO0pV343gsMlBPXfBopEkPTmN0eUhgN7iLPTJownBNboRLGyECA==
883883
dependencies:
884-
"@webrecorder/wombat" "^3.2.2"
884+
"@webrecorder/wombat" "^3.3.0"
885885
brotli "github:foliojs/brotli.js"
886886
fast-xml-parser "^3.15.1"
887887
format-link-header "^3.1.1"
@@ -903,6 +903,13 @@
903903
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.2.2.tgz#e803d6929f9c55d0e68bf0f4b363114c27695924"
904904
integrity sha512-oe+EMpPZUo5YoPjvrfxrf8+z/atBh9yzr7S2xFOeXXpeBt/s62rFY84n/tuE6poIKtYHSD9Radjap7JSuE9F6w==
905905

906+
"@webrecorder/wombat@^3.3.0":
907+
version "3.3.0"
908+
resolved "https://registry.yarnpkg.com/@webrecorder/wombat/-/wombat-3.3.0.tgz#9f14cb75ce722683a047c8687d1ef6353a3e58ed"
909+
integrity sha512-BpyRndvu6RaOVfIY4rdjcN8PM4ezcRItQp94/GVeksXahMlkf9CHsr/ATtPYUv1g7UIqq+O540WG9p3BmjwNqQ==
910+
dependencies:
911+
warcio "^1.4.6"
912+
906913
"@xtuc/ieee754@^1.2.0":
907914
version "1.2.0"
908915
resolved "https://registry.yarnpkg.com/@xtuc/ieee754/-/ieee754-1.2.0.tgz#eef014a3145ae477a1cbc00cd1e552336dceb790"
@@ -10863,10 +10870,10 @@ vm-browserify@^1.0.1:
1086310870
resolved "https://registry.yarnpkg.com/vm-browserify/-/vm-browserify-1.1.2.tgz#78641c488b8e6ca91a75f511e7a3b32a86e5dda0"
1086410871
integrity sha512-2ham8XPWTONajOR0ohOKOHXkm3+gaBmGut3SRuu75xLd/RRaY6vqgh8NBYYk7+RW3u5AtzPQZG8F10LHkl0lAQ==
1086510872

10866-
warcio@^1.4.5:
10867-
version "1.4.5"
10868-
resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.5.tgz#24ca61f799185c5d88cdd0a65d279f376b4f9a63"
10869-
integrity sha512-VwFBdmEQhWHmxsdyiLM0INHD1KZ2+EGYzslZXFe6JdbuTfSF/dYRQ/wEdvp+m28mydphROF6D32KfkIMRU1NZw==
10873+
warcio@^1.4.6:
10874+
version "1.4.6"
10875+
resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.6.tgz#56ea9a118b89b7b9c71eec493d140876f65d94ee"
10876+
integrity sha512-uncb/Xokc9XL4/1UMHPteAWpiLToOkcu2qhT2ivhVN1t8HwApHGAmLY4y9gqewgtO3s5aRq6T7lJ6WSP5IlA+A==
1087010877
dependencies:
1087110878
"@peculiar/webcrypto" "^1.1.1"
1087210879
esm "^3.2.25"
@@ -10876,10 +10883,10 @@ warcio@^1.4.5:
1087610883
uuid-random "^1.3.0"
1087710884
yargs "^15.3.1"
1087810885

10879-
warcio@^1.4.6:
10880-
version "1.4.6"
10881-
resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.6.tgz#56ea9a118b89b7b9c71eec493d140876f65d94ee"
10882-
integrity sha512-uncb/Xokc9XL4/1UMHPteAWpiLToOkcu2qhT2ivhVN1t8HwApHGAmLY4y9gqewgtO3s5aRq6T7lJ6WSP5IlA+A==
10886+
warcio@^1.4.7:
10887+
version "1.4.7"
10888+
resolved "https://registry.yarnpkg.com/warcio/-/warcio-1.4.7.tgz#16c2b02946926f80e346ef924178d9d8b558b2cf"
10889+
integrity sha512-0Yu5qVkV09PhvBVWXNidaH/OqUEKlnzZnNraCsvNJiYnoqxAlqeyBT3u6cbT36CIgxqHQSZlrkklr3fICzFoHg==
1088310890
dependencies:
1088410891
"@peculiar/webcrypto" "^1.1.1"
1088510892
esm "^3.2.25"

0 commit comments

Comments
 (0)