mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-24 19:10:15 +00:00
behaviors: don't run behaviors in iframes that are about:blank or are… (#211)
* behaviors: don't run behaviors in iframes that are about:blank or are from an ad-host (even if ad-blocking is not enabled), fixes #210
* logging: log behavior wait start and success, in addition to error, with url in details
This commit is contained in:
26
crawler.js
26
crawler.js
@@ -412,7 +412,11 @@ export class Crawler {
|
||||
if (!page.isHTMLPage) {
|
||||
this.logger.info("Skipping behaviors for non-HTML page", data, "behavior");
|
||||
} else {
|
||||
await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")));
|
||||
await Promise.allSettled(
|
||||
page.frames().
|
||||
filter(frame => this.shouldRunBehavior(frame)).
|
||||
map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();"))
|
||||
);
|
||||
|
||||
// also wait for general net idle
|
||||
await this.netIdle(page);
|
||||
@@ -433,6 +437,20 @@ export class Crawler {
|
||||
}
|
||||
}
|
||||
|
||||
async shouldRunBehavior(frame) {
|
||||
if (!frame.parentFrame()) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const url = frame.url();
|
||||
|
||||
if (url === "about:blank") {
|
||||
return false;
|
||||
}
|
||||
|
||||
return !!(await this.adBlockRules.shouldBlock(null, url));
|
||||
}
|
||||
|
||||
async createWARCInfo(filename) {
|
||||
const warcVersion = "WARC/1.0";
|
||||
const type = "warcinfo";
|
||||
@@ -592,9 +610,7 @@ export class Crawler {
|
||||
|
||||
await this.initPages();
|
||||
|
||||
if (this.params.blockAds) {
|
||||
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);
|
||||
}
|
||||
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);
|
||||
|
||||
if (this.params.blockRules && this.params.blockRules.length) {
|
||||
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, this.logger);
|
||||
@@ -794,7 +810,7 @@ export class Crawler {
|
||||
}
|
||||
}
|
||||
|
||||
if (this.adBlockRules) {
|
||||
if (this.adBlockRules && this.params.blockAds) {
|
||||
await this.adBlockRules.initPage(page);
|
||||
}
|
||||
|
||||
|
||||
@@ -143,14 +143,17 @@ export function chromeArgs (proxy, userAgent=null, extraArgs=[]) {
|
||||
}
|
||||
|
||||
|
||||
export async function evaluateWithCLI(frame, funcString) {
|
||||
export async function evaluateWithCLI(frame, funcString, name = "behaviors") {
|
||||
const context = await frame.executionContext();
|
||||
const url = frame.url();
|
||||
|
||||
logger.info(`Running ${name}...`, {url});
|
||||
|
||||
// from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
|
||||
const contextId = context._contextId;
|
||||
const expression = funcString + "\n//# sourceURL=__puppeteer_evaluation_script__";
|
||||
|
||||
const { exceptionDetails, result: remoteObject } = await context._client
|
||||
const { exceptionDetails, result } = await context._client
|
||||
.send("Runtime.evaluate", {
|
||||
expression,
|
||||
contextId,
|
||||
@@ -161,12 +164,16 @@ export async function evaluateWithCLI(frame, funcString) {
|
||||
});
|
||||
|
||||
if (exceptionDetails) {
|
||||
const details = exceptionDetails.stackTrace || {};
|
||||
details.url = url;
|
||||
logger.error(
|
||||
"Behavior Evaluation Failed: " + exceptionDetails.text, exceptionDetails.stackTrace || {}
|
||||
`Run ${name} failed: ${exceptionDetails.text}`, details
|
||||
);
|
||||
} else {
|
||||
logger.info(`Run ${name} finished`, {url});
|
||||
}
|
||||
|
||||
return remoteObject.value;
|
||||
return result.value;
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user