behaviors: don't run behaviors in iframes that are about:blank or are… (#211)

* behaviors: don't run behaviors in iframes that are about:blank or are from an ad-host (even if ad-blocking is not enabled), fixes #210

* logging: log behavior wait start and success, in addition to error, with url in details
This commit is contained in:
Ilya Kreymer
2023-01-23 16:47:33 -08:00
committed by GitHub
parent c0b0d5b87f
commit 38a9dbdaae
2 changed files with 32 additions and 9 deletions

View File

@@ -412,7 +412,11 @@ export class Crawler {
if (!page.isHTMLPage) {
this.logger.info("Skipping behaviors for non-HTML page", data, "behavior");
} else {
await Promise.allSettled(page.frames().map(frame => evaluateWithCLI(frame, "self.__bx_behaviors.run();")));
// shouldRunBehavior() is async, so it cannot be used directly as an
// Array.prototype.filter predicate: filter would see a pending Promise,
// which is always truthy, and no frame would ever be excluded.
// Await the decision per frame instead; any rejection (from the check or
// from the evaluation) is still absorbed by Promise.allSettled.
await Promise.allSettled(
  page.frames().map(async (frame) => {
    if (!(await this.shouldRunBehavior(frame))) {
      return;
    }
    await evaluateWithCLI(frame, "self.__bx_behaviors.run();");
  })
);
// also wait for general net idle
await this.netIdle(page);
@@ -433,6 +437,20 @@ export class Crawler {
}
}
async shouldRunBehavior(frame) {
if (!frame.parentFrame()) {
return true;
}
const url = frame.url();
if (url === "about:blank") {
return false;
}
return !!(await this.adBlockRules.shouldBlock(null, url));
}
async createWARCInfo(filename) {
const warcVersion = "WARC/1.0";
const type = "warcinfo";
@@ -592,9 +610,7 @@ export class Crawler {
await this.initPages();
if (this.params.blockAds) {
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);
}
this.adBlockRules = new AdBlockRules(this.captureBasePrefix, this.params.adBlockMessage, this.logger);
if (this.params.blockRules && this.params.blockRules.length) {
this.blockRules = new BlockRules(this.params.blockRules, this.captureBasePrefix, this.params.blockMessage, this.logger);
@@ -794,7 +810,7 @@ export class Crawler {
}
}
if (this.adBlockRules) {
if (this.adBlockRules && this.params.blockAds) {
await this.adBlockRules.initPage(page);
}

View File

@@ -143,14 +143,17 @@ export function chromeArgs (proxy, userAgent=null, extraArgs=[]) {
}
export async function evaluateWithCLI(frame, funcString) {
export async function evaluateWithCLI(frame, funcString, name = "behaviors") {
const context = await frame.executionContext();
const url = frame.url();
logger.info(`Running ${name}...`, {url});
// from puppeteer _evaluateInternal() but with includeCommandLineAPI: true
const contextId = context._contextId;
const expression = funcString + "\n//# sourceURL=__puppeteer_evaluation_script__";
const { exceptionDetails, result: remoteObject } = await context._client
const { exceptionDetails, result } = await context._client
.send("Runtime.evaluate", {
expression,
contextId,
@@ -161,12 +164,16 @@ export async function evaluateWithCLI(frame, funcString) {
});
if (exceptionDetails) {
const details = exceptionDetails.stackTrace || {};
details.url = url;
logger.error(
"Behavior Evaluation Failed: " + exceptionDetails.text, exceptionDetails.stackTrace || {}
`Run ${name} failed: ${exceptionDetails.text}`, details
);
} else {
logger.info(`Run ${name} finished`, {url});
}
return remoteObject.value;
return result.value;
}