mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-24 19:10:15 +00:00
track size of page resources:
- add 'size' entry to each resource in urn:pageinfo records
- add 'size' entry to pages in pages.jsonl, set to the sum of the sizes of all resources listed in the urn:pageinfo record
This commit is contained in:
@@ -98,6 +98,7 @@ type PageEntry = {
|
||||
title?: string;
|
||||
loadState?: number;
|
||||
mime?: string;
|
||||
size?: number;
|
||||
seed?: boolean;
|
||||
text?: string;
|
||||
favIconUrl?: string;
|
||||
@@ -2650,6 +2651,7 @@ self.__bx_behaviors.selectMainBehavior();
|
||||
text,
|
||||
loadState,
|
||||
mime,
|
||||
size,
|
||||
favicon,
|
||||
status,
|
||||
} = state;
|
||||
@@ -2673,6 +2675,9 @@ self.__bx_behaviors.selectMainBehavior();
|
||||
if (mime) {
|
||||
row.mime = mime;
|
||||
}
|
||||
if (size) {
|
||||
row.size = size;
|
||||
}
|
||||
|
||||
if (status) {
|
||||
row.status = status;
|
||||
|
||||
@@ -66,6 +66,7 @@ export type PageInfoValue = {
|
||||
mime?: string;
|
||||
type?: string;
|
||||
error?: string;
|
||||
size?: number;
|
||||
fromBrowserCache?: boolean;
|
||||
};
|
||||
|
||||
@@ -116,6 +117,7 @@ export class Recorder extends EventEmitter {
|
||||
pendingRequests!: Map<string, RequestResponseInfo>;
|
||||
skipIds!: Set<string>;
|
||||
pageInfo!: PageInfoRecord;
|
||||
pageSize = 0;
|
||||
mainFrameId: string | null = null;
|
||||
skipRangeUrls!: Map<string, number>;
|
||||
skipPageInfo = false;
|
||||
@@ -950,6 +952,7 @@ export class Recorder extends EventEmitter {
|
||||
this.skipRangeUrls = new Map<string, number>();
|
||||
this.skipPageInfo = false;
|
||||
this.pageFinished = false;
|
||||
this.pageSize = 0;
|
||||
this.pageInfo = {
|
||||
pageid,
|
||||
urls: {},
|
||||
@@ -963,8 +966,9 @@ export class Recorder extends EventEmitter {
|
||||
addPageRecord(reqresp: RequestResponseInfo) {
|
||||
if (this.isValidUrl(reqresp.url)) {
|
||||
const { status, resourceType: type } = reqresp;
|
||||
const size = reqresp.readSize || reqresp.payload?.length || 0;
|
||||
const mime = reqresp.getMimeType();
|
||||
const info: PageInfoValue = { status, mime, type };
|
||||
const info: PageInfoValue = { status, mime, type, size };
|
||||
if (reqresp.errorText) {
|
||||
info.error = reqresp.errorText;
|
||||
}
|
||||
@@ -973,6 +977,7 @@ export class Recorder extends EventEmitter {
|
||||
// info.fromBrowserCache = true;
|
||||
// }
|
||||
this.pageInfo.urls[reqresp.getCanonURL()] = info;
|
||||
this.pageSize += size;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -983,7 +988,7 @@ export class Recorder extends EventEmitter {
|
||||
{ url: "urn:pageinfo:" + this.pageUrl },
|
||||
"recorder",
|
||||
);
|
||||
return;
|
||||
return null;
|
||||
}
|
||||
const text = JSON.stringify(this.pageInfo, null, 2);
|
||||
|
||||
@@ -1000,7 +1005,7 @@ export class Recorder extends EventEmitter {
|
||||
"recorder",
|
||||
);
|
||||
|
||||
return this.pageInfo.ts;
|
||||
return { ts: this.pageInfo.ts, size: this.pageSize };
|
||||
}
|
||||
|
||||
async awaitPageResources() {
|
||||
|
||||
@@ -94,6 +94,7 @@ export class PageState {
|
||||
title?: string;
|
||||
mime?: string;
|
||||
ts?: Date;
|
||||
size?: number;
|
||||
|
||||
callbacks: PageCallbacks = {};
|
||||
|
||||
|
||||
@@ -301,7 +301,12 @@ export class PageWorker {
|
||||
} finally {
|
||||
try {
|
||||
if (this.recorder) {
|
||||
opts.data.ts = this.recorder.writePageInfoRecord();
|
||||
const res = this.recorder.writePageInfoRecord();
|
||||
if (res) {
|
||||
const { size, ts } = res;
|
||||
opts.data.ts = ts;
|
||||
opts.data.size = size;
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
logger.error(
|
||||
|
||||
Reference in New Issue
Block a user