Skip to content

Commit f811c65

Browse files
committed
more download improvements:
- fix missing http protocol in request records
- condense download menu: download only selected if any pages are selected
- set content-length when removing encoding to ensure correct http content-length
- add option to download warc/1.0 warcs
ensure content-length is set to actual content-length after recording
recorder: don't detach if not running
dependency: update to replaywebpage 1.5.1 for improved download options
bump to 0.6.16
1 parent 055affa commit f811c65

File tree

10 files changed

+179
-160
lines changed

10 files changed

+179
-160
lines changed

package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "archiveweb.page",
3-
"version": "0.6.15",
3+
"version": "0.6.16",
44
"main": "index.js",
55
"description": "Create Web Archives directly in your browser",
66
"repository": "https://github.com/webrecorder/archiveweb.page",
@@ -11,7 +11,7 @@
1111
"@webrecorder/wabac": "^2.9.0",
1212
"browsertrix-behaviors": "^0.2.3",
1313
"btoa": "^1.2.1",
14-
"bulma": "^0.9.2",
14+
"bulma": "^0.9.3",
1515
"flexsearch": "^0.6.32",
1616
"hash-wasm": "^4.4.1",
1717
"http-status-codes": "^1.4.0",
@@ -22,7 +22,7 @@
2222
"lodash": "^4.17.20",
2323
"node-fetch": "^2.6.1",
2424
"pretty-bytes": "^5.3.0",
25-
"replaywebpage": "^1.5.0",
25+
"replaywebpage": "^1.5.1",
2626
"uuid": "^8.3.2",
2727
"warcio": "^1.4.7"
2828
},

src/downloader.js

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ class Downloader
8787
this.modifiedDate = coll.config.metadata.mtime ? new Date(coll.config.metadata.mtime).toISOString() : null;
8888

8989
this.format = format;
90+
this.warcVersion = (format === "warc1.0") ? "WARC/1.0" : "WARC/1.1";
9091

9192
this.filename = filename;
9293

@@ -122,6 +123,7 @@ class Downloader
122123
return this.downloadWACZ(this.filename, sizeCallback);
123124

124125
case "warc":
126+
case "warc1.0":
125127
return this.downloadWARC(this.filename, sizeCallback);
126128

127129
default:
@@ -557,7 +559,7 @@ class Downloader
557559
}
558560

559561
async createWARCInfo(filename) {
560-
const warcVersion = "WARC/1.1";
562+
const warcVersion = this.warcVersion;
561563
const type = "warcinfo";
562564

563565
const info = {
@@ -579,21 +581,24 @@ class Downloader
579581
return buffer;
580582
}
581583

582-
removeEncodingHeaders(headersMap) {
584+
fixupHttpHeaders(headersMap, length) {
583585
let count = 0;
584586
for (const [name] of Object.entries(headersMap)) {
585587
const lowerName = name.toLowerCase();
586-
if (lowerName === "content-encoding") {
588+
switch (lowerName) {
589+
case "content-encoding":
590+
case "transfer-encoding":
587591
delete headersMap[name];
588-
if (++count === 2) {
589-
break;
590-
}
592+
++count;
593+
break;
594+
595+
case "content-length":
596+
headersMap[name] = "" + length;
597+
++count;
598+
break;
591599
}
592-
if (lowerName === "transfer-encoding") {
593-
delete headersMap[name];
594-
if (++count === 2) {
595-
break;
596-
}
600+
if (count === 3) {
601+
break;
597602
}
598603
}
599604
}
@@ -603,10 +608,7 @@ class Downloader
603608
const date = new Date(resource.ts).toISOString();
604609
resource.timestamp = getTSMillis(date);
605610
const httpHeaders = resource.respHeaders || {};
606-
const warcVersion = "WARC/1.1";
607-
608-
// remove aas never preserved in browser-based capture
609-
this.removeEncodingHeaders(httpHeaders);
611+
const warcVersion = this.warcVersion;
610612

611613
const pageId = resource.pageId;
612614

@@ -693,6 +695,9 @@ class Downloader
693695
warcHeaders["WARC-Payload-Digest"] = resource.digest;
694696
}
695697

698+
// remove encoding, set content-length as encoding never preserved in browser-based capture
699+
this.fixupHttpHeaders(httpHeaders, payload.length);
700+
696701
const record = await WARCRecord.create({
697702
url, date, type, warcVersion, warcHeaders, statusline, httpHeaders,
698703
refersToUrl, refersToDate}, getPayload(payload));
@@ -716,7 +721,7 @@ class Downloader
716721
};
717722

718723
const urlParsed = new URL(url);
719-
const statusline = method + " " + url.slice(urlParsed.origin.length);
724+
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;
720725

721726
const reqRecord = await WARCRecord.create({
722727
url, date, warcVersion, type,
@@ -740,7 +745,7 @@ class Downloader
740745

741746
const type = "resource";
742747
const warcHeaders = {"Content-Type": "text/plain; charset=\"UTF-8\""};
743-
const warcVersion = "WARC/1.1";
748+
const warcVersion = this.warcVersion;
744749

745750
const payload = getPayload(encoder.encode(resource.text));
746751

src/recorder.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,10 @@ class Recorder {
132132
}
133133

134134
async detach() {
135+
if (!this.running) {
136+
return;
137+
}
138+
135139
this.stopping = true;
136140

137141
const domNodes = await this.getFullText(true);
@@ -1188,8 +1192,8 @@ class Recorder {
11881192

11891193
console.log("Start Async Load: " + request.url);
11901194

1191-
const result = await this.pageEval("__awp_async_fetch__", expression, sessions);
1192-
console.log("Async Fetch Result: " + JSON.stringify(result));
1195+
await this.pageEval("__awp_async_fetch__", expression, sessions);
1196+
//console.log("Async Fetch Result: " + JSON.stringify(result));
11931197
}
11941198

11951199
async doAsyncFetch(request, sessions) {

src/requestresponseinfo.js

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@ import { postToGetUrl } from "warcio";
77
// max URL length for post/put payload-converted URLs
88
const MAX_URL_LENGTH = 4096;
99

10+
const CONTENT_LENGTH = "content-length";
11+
const CONTENT_TYPE = "content-type";
1012
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];
1113

1214
const encoder = new TextEncoder();
@@ -146,10 +148,10 @@ class RequestResponseInfo
146148

147149
this.ts = new Date().getTime();
148150

149-
const respHeaders = this.getResponseHeadersDict();
151+
const respHeaders = this.getResponseHeadersDict(payload.length);
150152
const reqHeaders = this.getRequestHeadersDict();
151153

152-
const mime = (respHeaders.headers.get("content-type") || "").split(";")[0];
154+
const mime = (respHeaders.headers.get(CONTENT_TYPE) || "").split(";")[0];
153155
const cookie = reqHeaders.headers.get("cookie");
154156

155157
if (cookie) {
@@ -230,19 +232,24 @@ class RequestResponseInfo
230232
return this._getHeadersDict(this.requestHeaders, null);
231233
}
232234

233-
getResponseHeadersDict() {
234-
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList);
235+
getResponseHeadersDict(length) {
236+
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
235237
}
236238

237-
_getHeadersDict(headersDict, headersList) {
239+
_getHeadersDict(headersDict, headersList, actualContentLength) {
238240
if (!headersDict && headersList) {
239241
headersDict = {};
240242

241243
for (const header of headersList) {
242-
if (EXCLUDE_HEADERS.includes(header.name.toLowerCase())) {
244+
const headerName = header.name.toLowerCase();
245+
if (EXCLUDE_HEADERS.includes(headerName)) {
243246
continue;
244247
}
245-
headersDict[header.name] = header.value.replace(/\n/g, ", ");
248+
if (actualContentLength && headerName === CONTENT_LENGTH) {
249+
headersDict[headerName] = "" + actualContentLength;
250+
continue;
251+
}
252+
headersDict[headerName] = header.value.replace(/\n/g, ", ");
246253
}
247254
}
248255

@@ -260,7 +267,12 @@ class RequestResponseInfo
260267
delete headersDict[key];
261268
continue;
262269
}
263-
if (EXCLUDE_HEADERS.includes(key.toLowerCase())) {
270+
const keyLower = key.toLowerCase();
271+
if (EXCLUDE_HEADERS.includes(keyLower)) {
272+
continue;
273+
}
274+
if (actualContentLength && keyLower === CONTENT_LENGTH) {
275+
headersDict[key] = "" + actualContentLength;
264276
continue;
265277
}
266278
headersDict[key] = headersDict[key].replace(/\n/g, ", ");
@@ -284,8 +296,8 @@ class RequestResponseInfo
284296
const length = this.payload.length;
285297

286298
const { headers } = this.getResponseHeadersDict();
287-
const contentType = headers.get("content-type");
288-
const contentLength = headers.get("content-length");
299+
const contentType = headers.get(CONTENT_TYPE);
300+
const contentLength = headers.get(CONTENT_LENGTH);
289301

290302
if (Number(contentLength) !== length) {
291303
return false;

wr-ext/bg.js

Lines changed: 4 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wr-ext/manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"name": "Webrecorder ArchiveWeb.page",
33
"description": "Create high-fidelity web archives directly in your browser",
4-
"version": "0.6.15",
4+
"version": "0.6.16",
55
"content_security_policy": "script-src 'self' 'unsafe-eval'; object-src 'self'",
66
"permissions": [
77
"debugger",

wr-ext/popup.js

Lines changed: 44 additions & 44 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wr-ext/replay/sw.js

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

wr-ext/replay/ui.js

Lines changed: 66 additions & 68 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

yarn.lock

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1997,10 +1997,10 @@ builtin-status-codes@^3.0.0:
19971997
resolved "https://registry.yarnpkg.com/builtin-status-codes/-/builtin-status-codes-3.0.0.tgz#85982878e21b98e1c66425e03d0174788f569ee8"
19981998
integrity sha1-hZgoeOIbmOHGZCXgPQF0eI9Wnug=
19991999

2000-
bulma@^0.9.2:
2001-
version "0.9.2"
2002-
resolved "https://registry.yarnpkg.com/bulma/-/bulma-0.9.2.tgz#340011e119c605f19b8ca886bfea595f1deaf23c"
2003-
integrity sha512-e14EF+3VSZ488yL/lJH0tR8mFWiEQVCMi/BQUMi2TGMBOk+zrDg4wryuwm/+dRSHJw0gMawp2tsW7X1JYUCE3A==
2000+
bulma@^0.9.3:
2001+
version "0.9.3"
2002+
resolved "https://registry.yarnpkg.com/bulma/-/bulma-0.9.3.tgz#ddccb7436ebe3e21bf47afe01d3c43a296b70243"
2003+
integrity sha512-0d7GNW1PY4ud8TWxdNcP6Cc8Bu7MxcntD/RRLGWuiw/s0a9P+XlH/6QoOIrmbj6o8WWJzJYhytiu9nFjTszk1g==
20042004

20052005
20062006
version "3.0.0"
@@ -9688,14 +9688,14 @@ repeating@^2.0.0:
96889688
dependencies:
96899689
is-finite "^1.0.0"
96909690

9691-
replaywebpage@^1.5.0:
9692-
version "1.5.0"
9693-
resolved "https://registry.yarnpkg.com/replaywebpage/-/replaywebpage-1.5.0.tgz#e2bf21e927a12e0f253073c7da38920ed5c159d5"
9694-
integrity sha512-vGhsYWfV55M7JobbvG+O+Uf9/VXkE2nkx9lkvgb4ze53Bl2aMV3yzjsHzsCsEQ6fE/XmBlYTEbIaC090W3lv/A==
9691+
replaywebpage@^1.5.1:
9692+
version "1.5.1"
9693+
resolved "https://registry.yarnpkg.com/replaywebpage/-/replaywebpage-1.5.1.tgz#0f2cf3f3a9b485eb8b112e486c5f6fd9b9853eca"
9694+
integrity sha512-etgrBrYH+kXQSEdximKV4/UYQB+23ZXRHi0W1g2d6L/fxHaAqAHSLTzQWpiaAYvyUjGN0H2h+fhQCSOJvQSb9w==
96959695
dependencies:
96969696
"@fortawesome/fontawesome-free" "^5.13.0"
96979697
"@webrecorder/wabac" "^2.9.0-beta.1"
9698-
bulma "^0.9.2"
9698+
bulma "^0.9.3"
96999699
electron-log "^4.3.0"
97009700
electron-updater "^4.3.5"
97019701
fetch-ndjson "^1.1.0"

0 commit comments

Comments
 (0)