background behaviors refactor: (fixes #23)

- move auto-play, auto-fetch and auto-scroll behaviors to behaviors/global/*
- behaviors/bgbehaviors.js (the BackgroundBehaviors class) sets up and manages these background behaviors
- command line --bgbehaviors option specifies which background behaviors to run (defaults to auto-fetch and auto-play)
This commit is contained in:
Ilya Kreymer
2021-02-08 22:21:34 -08:00
parent 7cfeefd19b
commit 8c85ca2749
8 changed files with 408 additions and 288 deletions

View File

@@ -35,6 +35,7 @@ RUN yarn install
ADD config.yaml /app/
ADD uwsgi.ini /app/
ADD *.js /app/
ADD behaviors/ /app/behaviors/
RUN ln -s /app/main.js /usr/bin/crawl

View File

@@ -1,203 +0,0 @@
// AutoFetcher script
// extract and fetch all urls from srcsets, from images as well as audio/video
// also extract any urls from media query stylesheets that have not necessarily been loaded
// (May not work for cross-origin stylesheets)
// CSS selector for img/video/audio elements, and <source> children of
// picture/video/audio, that carry a srcset, data-srcset or data-src attribute.
const SRC_SET_SELECTOR = 'img[srcset], img[data-srcset], img[data-src], ' +
'video[srcset], video[data-srcset], video[data-src], audio[srcset], audio[data-srcset], audio[data-src], ' +
'picture > source[srcset], picture > source[data-srcset], picture > source[data-src], ' +
'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';
// Separator pattern passed to String.split() to break a srcset value into
// candidates; the capture group keeps a "url descriptor" pair in the result.
const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;
// Matches url(...) references in CSS text; capture group 2 is the URL itself.
const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
// Matches @import references in CSS text; capture group 2 is the imported URL.
const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;
// ===========================================================================
/**
 * In-page helper that looks for media URLs (src / srcset / data-* variants) and
 * URLs referenced from media-query CSS rules, and fetches each distinct
 * http(s) URL once.
 */
class AutoFetcher
{
  constructor() {
    // URLs already handed to doFetch(), used for de-duplication.
    this.urlSet = new Set();
    // URLs waiting to be fetched.
    this.urlqueue = [];
    // Number of fetches currently in flight.
    this.numPending = 0;
  }

  /** Registers a window "load" handler that runs the initial scan and starts observing mutations. */
  init() {
    console.log("init autofetch");

    const onLoad = () => {
      this.extractSrcSrcSetAll(document);
      this.extractStyleSheets();
      this.initObserver();
    };

    window.addEventListener("load", onLoad);
  }

  /** Returns a truthy value only for non-empty http: / https: URLs (a falsy input is returned as-is). */
  isValidUrl(url) {
    if (!url) {
      return url;
    }
    return url.startsWith("http:") || url.startsWith("https:");
  }

  /** Resolves url against document.baseURI and fetches it unless it is invalid or already seen. */
  queueUrl(url) {
    let absolute;

    try {
      absolute = new URL(url, document.baseURI).href;
    } catch (e) {
      return;
    }

    if (!this.isValidUrl(absolute) || this.urlSet.has(absolute)) {
      return;
    }

    this.urlSet.add(absolute);
    this.doFetch(absolute);
  }

  /**
   * Enqueues url; while numPending is at most 6 this call also drains the
   * queue, fetching each entry and reading its body. Fetch errors are logged.
   */
  async doFetch(url) {
    this.urlqueue.push(url);

    if (this.numPending > 6) {
      return;
    }

    while (this.urlqueue.length > 0) {
      const next = this.urlqueue.shift();
      try {
        this.numPending++;
        console.log("AutoFetching: " + next);
        const response = await fetch(next);
        await response.blob();
      } catch (e) {
        console.log(e)
      }
      this.numPending--;
    }
  }

  /** Observes the whole document for child-list changes and srcset attribute changes. */
  initObserver() {
    const observerOptions = {
      characterData: false,
      characterDataOldValue: false,
      attributes: true,
      attributeOldValue: true,
      subtree: true,
      childList: true,
      attributeFilter: ['srcset']
    };

    this.mutobz = new MutationObserver((changes) => this.observeChange(changes));
    this.mutobz.observe(document.documentElement, observerOptions);
  }

  /** Dispatches on the node type of a changed/added node and extracts URLs from it. */
  processChangedNode(target) {
    const kind = target.nodeType;

    if (kind === Node.ATTRIBUTE_NODE) {
      if (target.nodeName === "srcset") {
        this.extractSrcSetAttr(target.nodeValue);
      }
    } else if (kind === Node.TEXT_NODE) {
      const parent = target.parentNode;
      if (parent && parent.tagName === "STYLE") {
        this.extractStyleText(target.nodeValue);
      }
    } else if (kind === Node.ELEMENT_NODE) {
      if (target.sheet) {
        this.extractStyleSheet(target.sheet);
      }
      this.extractSrcSrcSet(target);
      // Re-scan the subtree one second later as well.
      setTimeout(() => this.extractSrcSrcSetAll(target), 1000);
    }
  }

  /** MutationObserver callback: processes each record's target, then any added nodes. */
  observeChange(changes) {
    for (const record of changes) {
      this.processChangedNode(record.target);

      if (record.type !== "childList") {
        continue;
      }

      for (const added of record.addedNodes) {
        this.processChangedNode(added);
      }
    }
  }

  /** Extracts URLs from every element under root matching SRC_SET_SELECTOR. */
  extractSrcSrcSetAll(root) {
    for (const match of root.querySelectorAll(SRC_SET_SELECTOR)) {
      this.extractSrcSrcSet(match);
    }
  }

  /** Queues the src / data-src URL and the srcset / data-srcset candidates of one element. */
  extractSrcSrcSet(elem) {
    const isElement = Boolean(elem) && elem.nodeType === Node.ELEMENT_NODE;
    if (!isElement) {
      console.warn("No elem to extract from");
      return;
    }

    const source = elem.src || elem.getAttribute("data-src");
    if (source) {
      this.queueUrl(source);
    }

    const sourceSet = elem.srcset || elem.getAttribute("data-srcset");
    if (sourceSet) {
      this.extractSrcSetAttr(sourceSet);
    }
  }

  /** Splits a srcset value and queues the URL part of every non-empty candidate. */
  extractSrcSetAttr(srcset) {
    srcset
      .split(SRCSET_REGEX)
      .filter(Boolean)
      .forEach((candidate) => {
        const [candidateUrl] = candidate.trim().split(" ");
        this.queueUrl(candidateUrl);
      });
  }

  /** Scans every stylesheet of root (defaults to document). */
  extractStyleSheets(root) {
    const scope = root || document;
    for (const sheet of scope.styleSheets) {
      this.extractStyleSheet(sheet);
    }
  }

  /** Extracts URLs from the media rules of one stylesheet; inaccessible sheets are logged and skipped. */
  extractStyleSheet(sheet) {
    let rules;

    try {
      rules = sheet.cssRules || sheet.rules;
    } catch (e) {
      console.log("Can't access stylesheet");
      return;
    }

    for (const rule of rules) {
      if (rule.type !== CSSRule.MEDIA_RULE) {
        continue;
      }
      this.extractStyleText(rule.cssText);
    }
  }

  /** Queues every url(...) and @import target found in a piece of CSS text. */
  extractStyleText(text) {
    // replace() is used only to visit each match; the replacer returns the
    // match unchanged, so the text itself is not modified.
    const collect = (match, prefix, target, suffix) => {
      this.queueUrl(target);
      return prefix + target + suffix;
    };

    const afterUrls = text.replace(STYLE_REGEX, collect);
    afterUrls.replace(IMPORT_REGEX, collect);
  }
}
// Create the fetcher as soon as this script is evaluated; init() registers the window "load" handler.
new AutoFetcher().init();

44
behaviors/bgbehaviors.js Normal file
View File

@@ -0,0 +1,44 @@
const AutoPlayBehavior = require("./global/autoplay");
const AutoFetchBehavior = require("./global/autofetcher");
const AutoScrollBehavior = require("./global/autoscroll");
// ===========================================================================
/**
 * Selects which background behaviors (auto-fetch, auto-play, auto-scroll) run
 * for a page and drives their beforeLoad / afterLoad hooks.
 */
class BackgroundBehaviors
{
  /**
   * @param bgbehaviors collection of enabled behavior names, queried with includes()
   */
  constructor(bgbehaviors) {
    const isEnabled = (name) => bgbehaviors.includes(name);

    this.doAutoFetch = isEnabled("auto-fetch");
    this.doAutoPlay = isEnabled("auto-play");
    this.doAutoScroll = isEnabled("auto-scroll");
  }

  /**
   * Instantiates the enabled behaviors and awaits their beforeLoad hooks.
   * Errors raised while doing so are logged, not rethrown.
   *
   * @returns a function that, when called, runs afterLoad on every behavior
   *          that was instantiated and resolves once all of them finish
   */
  async setup(page, crawler) {
    const behaviors = [];

    try {
      // Factories keep construction lazy, and the list order fixes the run order.
      const factories = [
        [this.doAutoFetch, () => new AutoFetchBehavior()],
        [this.doAutoPlay, () => new AutoPlayBehavior()],
        [this.doAutoScroll, () => new AutoScrollBehavior()],
      ];

      for (const [enabled, create] of factories) {
        if (enabled) {
          behaviors.push(create());
        }
      }

      const pending = behaviors.map((behavior) => behavior.beforeLoad(page, crawler));
      await Promise.all(pending);
    } catch (err) {
      console.log(err);
    }

    const runAfterLoad = () => {
      const finishing = behaviors.map((behavior) => behavior.afterLoad(page, crawler));
      return Promise.all(finishing);
    };

    return runAfterLoad;
  }
}
module.exports = BackgroundBehaviors;

View File

@@ -0,0 +1,232 @@
// AutoFetcher script
// extract and fetch all urls from srcsets, from images as well as audio/video
// also extract any urls from media query stylesheets that have not necessarily been loaded
// (May not work for cross-origin stylesheets)
/**
 * Page-side autofetch script.
 *
 * AutoFetchBehavior serializes this function with toString() and injects it
 * into every new document, so it must stay self-contained: nothing outside
 * this function body may be referenced. Calling it creates an AutoFetcher and
 * registers its window "load" handler.
 */
function autofetcher() {
  // Elements (and <source> children) carrying srcset, data-srcset or data-src.
  const SRC_SET_SELECTOR = 'img[srcset], img[data-srcset], img[data-src], ' +
    'video[srcset], video[data-srcset], video[data-src], audio[srcset], audio[data-srcset], audio[data-src], ' +
    'picture > source[srcset], picture > source[data-srcset], picture > source[data-src], ' +
    'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
    'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';

  // Separator used to split a srcset value into candidates.
  const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;

  // url(...) and @import references in CSS text; group 2 is the URL.
  const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
  const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;

  class AutoFetcher
  {
    constructor() {
      // URLs already handed to doFetch(), used for de-duplication.
      this.urlSet = new Set();
      // URLs waiting to be fetched.
      this.urlqueue = [];
      // Number of fetches currently in flight.
      this.numPending = 0;
    }

    /** Registers the window "load" handler that starts scanning and observing. */
    init() {
      console.log("init autofetch");

      window.addEventListener("load", () => {
        // run() is async: handle its rejection here so a failure during
        // extraction is logged instead of becoming an unhandled rejection.
        this.run().catch((e) => console.log(e));
        this.initObserver();
      });
    }

    /**
     * Scans the document and its stylesheets. When window.__crawler_nextPhase
     * exists it is awaited after each pass and the scan repeats; otherwise a
     * single pass is made.
     */
    async run() {
      while (true) {
        this.extractSrcSrcSetAll(document);
        this.extractStyleSheets();

        if (window.__crawler_nextPhase) {
          await window.__crawler_nextPhase();
        } else {
          break;
        }
      }
    }

    /** Returns a truthy value only for non-empty http: / https: URLs. */
    isValidUrl(url) {
      return url && (url.startsWith("http:") || url.startsWith("https:"));
    }

    /** Resolves url against document.baseURI and fetches it unless invalid or already seen. */
    queueUrl(url) {
      try {
        url = new URL(url, document.baseURI).href;
      } catch (e) {
        // Not resolvable as a URL: nothing to fetch.
        return;
      }

      if (!this.isValidUrl(url)) {
        return;
      }

      if (this.urlSet.has(url)) {
        return;
      }

      this.urlSet.add(url);
      this.doFetch(url);
    }

    /**
     * Enqueues url. While numPending is at most 6 this call also drains the
     * queue; otherwise the URL waits for an already-running drain loop.
     * Fetch errors are logged and do not stop the loop.
     */
    async doFetch(url) {
      this.urlqueue.push(url);

      if (this.numPending <= 6) {
        while (this.urlqueue.length > 0) {
          const nextUrl = this.urlqueue.shift();
          try {
            this.numPending++;
            console.log("AutoFetching: " + nextUrl);
            const resp = await fetch(nextUrl);
            // Read the body so the whole response is actually transferred.
            await resp.blob();
          } catch (e) {
            console.log(e);
          }
          this.numPending--;
        }
      }
    }

    /** Observes the whole document for child-list changes and srcset attribute changes. */
    initObserver() {
      this.mutobz = new MutationObserver((changes) => this.observeChange(changes));

      this.mutobz.observe(document.documentElement, {
        characterData: false,
        characterDataOldValue: false,
        attributes: true,
        attributeOldValue: true,
        subtree: true,
        childList: true,
        attributeFilter: ['srcset']
      });
    }

    /** Dispatches on the node type of a changed/added node and extracts URLs from it. */
    processChangedNode(target) {
      switch (target.nodeType) {
        case Node.ATTRIBUTE_NODE:
          if (target.nodeName === "srcset") {
            this.extractSrcSetAttr(target.nodeValue);
          }
          break;

        case Node.TEXT_NODE:
          if (target.parentNode && target.parentNode.tagName === "STYLE") {
            this.extractStyleText(target.nodeValue);
          }
          break;

        case Node.ELEMENT_NODE:
          if (target.sheet) {
            this.extractStyleSheet(target.sheet);
          }
          this.extractSrcSrcSet(target);
          // Re-scan the subtree one second later as well.
          setTimeout(() => this.extractSrcSrcSetAll(target), 1000);
          break;
      }
    }

    /** MutationObserver callback: processes each record's target, then any added nodes. */
    observeChange(changes) {
      for (const change of changes) {
        this.processChangedNode(change.target);

        if (change.type === "childList") {
          for (const node of change.addedNodes) {
            this.processChangedNode(node);
          }
        }
      }
    }

    /** Extracts URLs from every element under root matching SRC_SET_SELECTOR. */
    extractSrcSrcSetAll(root) {
      const elems = root.querySelectorAll(SRC_SET_SELECTOR);

      for (const elem of elems) {
        this.extractSrcSrcSet(elem);
      }
    }

    /** Queues the src / data-src URL and the srcset / data-srcset candidates of one element. */
    extractSrcSrcSet(elem) {
      if (!elem || elem.nodeType !== Node.ELEMENT_NODE) {
        console.warn("No elem to extract from");
        return;
      }

      const src = elem.src || elem.getAttribute("data-src");
      if (src) {
        this.queueUrl(src);
      }

      const srcset = elem.srcset || elem.getAttribute("data-srcset");
      if (srcset) {
        this.extractSrcSetAttr(srcset);
      }
    }

    /** Splits a srcset value and queues the URL part of every non-empty candidate. */
    extractSrcSetAttr(srcset) {
      for (const v of srcset.split(SRCSET_REGEX)) {
        if (v) {
          const parts = v.trim().split(" ");
          this.queueUrl(parts[0]);
        }
      }
    }

    /** Scans every stylesheet of root (defaults to document). */
    extractStyleSheets(root) {
      root = root || document;

      for (const sheet of root.styleSheets) {
        this.extractStyleSheet(sheet);
      }
    }

    /**
     * Extracts URLs from the media rules of one stylesheet. Sheets whose rules
     * cannot be read (access throws, or no rule list is exposed) are skipped
     * so that the remaining sheets are still processed.
     */
    extractStyleSheet(sheet) {
      let rules;

      try {
        rules = sheet.cssRules || sheet.rules;
      } catch (e) {
        console.log("Can't access stylesheet");
        return;
      }

      // Without this guard a sheet exposing neither list would throw here and
      // abort the scan of every following stylesheet.
      if (!rules) {
        return;
      }

      for (const rule of rules) {
        if (rule.type === CSSRule.MEDIA_RULE) {
          this.extractStyleText(rule.cssText);
        }
      }
    }

    /** Queues every url(...) and @import target found in a piece of CSS text. */
    extractStyleText(text) {
      // replace() is used only to visit each match; the replacer returns the
      // match unchanged, so the text itself is not modified.
      const urlExtractor = (m, n1, n2, n3) => {
        this.queueUrl(n2);
        return n1 + n2 + n3;
      };

      text.replace(STYLE_REGEX, urlExtractor).replace(IMPORT_REGEX, urlExtractor);
    }
  }

  new AutoFetcher().init();
}
// ===========================================================================
/**
 * Background behavior that injects the autofetcher script into every new
 * document of a page.
 */
class AutoFetchBehavior
{
  /** Registers the serialized autofetcher function as a self-invoking script on the page. */
  async beforeLoad(page) {
    const source = autofetcher.toString();
    const script = "(" + source + ")();";

    await page.evaluateOnNewDocument(script);
  }

  /** Nothing to do after load; present so all behaviors expose the same hooks. */
  async afterLoad() {
  }
}
module.exports = AutoFetchBehavior;

View File

@@ -1,4 +1,5 @@
(() => {
// ===========================================================================
function autoplay() {
function run() {
if (self.navigator.__crawler_autoplay) {
return;
@@ -87,5 +88,52 @@
}
}, 3000);
})();
};
// ===========================================================================
/**
 * Background behavior that injects the autoplay script into a page and
 * captures the media URLs the script reports back.
 */
class AutoPlayBehavior
{
  constructor() {
    // One promise per media URL reported by the page; awaited in afterLoad().
    this.mediaPromises = [];
    // Set once the page reports that it started loading an autoplay URL.
    this.waitForVideo = false;
  }

  /**
   * Captures one media URL through the crawler. Failures are logged here, at
   * the point where the capture starts, so a capture that fails while the page
   * is still loading never turns into an unhandled promise rejection.
   *
   * @param crawler object providing directFetchCapture(url)
   * @param url media URL reported by the page
   */
  async captureMedia(crawler, url) {
    try {
      await crawler.directFetchCapture(url);
    } catch (e) {
      console.log("Error loading media URLs", e);
    }
  }

  /**
   * Exposes the two callbacks used by the in-page script and registers the
   * serialized autoplay function for every new document. Errors are logged.
   */
  async beforeLoad(page, crawler) {
    try {
      await page.exposeFunction("__crawler_queueUrls", async (url) => {
        // Not awaited: the capture proceeds in the background and is awaited
        // collectively in afterLoad().
        this.mediaPromises.push(this.captureMedia(crawler, url));
      });

      await page.exposeFunction("__crawler_autoplayLoad", (url) => {
        console.log("*** Loading autoplay URL: " + url);
        this.waitForVideo = true;
      });

      const iife = `(${autoplay.toString()})();`;
      await page.evaluateOnNewDocument(iife);

    } catch(err) {
      console.log(err);
    }
  }

  /**
   * Waits for all pending media captures, then sleeps an extra 15 seconds if
   * the page reported an autoplay load.
   */
  async afterLoad(page, crawler) {
    try {
      await Promise.all(this.mediaPromises);
    } catch (e) {
      console.log("Error loading media URLs", e);
    }

    if (this.waitForVideo) {
      console.log("Extra wait 15s for video loading");
      await crawler.sleep(15000);
    }
  }
}
module.exports = AutoPlayBehavior

View File

@@ -0,0 +1,37 @@
/**
 * Scrolls the page down in 250px steps, pausing 500ms after each step, until
 * the bottom of the viewport reaches the tallest of the body / document
 * element height measurements. Uses only `self` and `setTimeout`, so it stays
 * self-contained.
 */
async function autoScroll() {
  const step = { top: 250, left: 0, behavior: "auto" };

  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Re-measured on every iteration because the page may grow while scrolling.
  const pageHeight = () => {
    const { body, documentElement: root } = self.document;
    return Math.max(
      body.scrollHeight,
      body.offsetHeight,
      root.clientHeight,
      root.scrollHeight,
      root.offsetHeight
    );
  };

  while (self.scrollY + self.innerHeight < pageHeight()) {
    self.scrollBy(step);
    await pause(500);
  }
}
// ===========================================================================
/**
 * Background behavior that scrolls the page after it has loaded.
 */
class AutoScrollBehavior
{
  /** Nothing to prepare before load; present so all behaviors expose the same hooks. */
  async beforeLoad(page, crawler) {
  }

  /**
   * Runs autoScroll in the page, giving up waiting after 30 seconds.
   * Failures are logged as a warning and not rethrown.
   */
  async afterLoad(page, crawler) {
    try {
      // The function is named autoScroll (capital S). The previous lowercase
      // reference threw a ReferenceError on every call, which the catch below
      // swallowed, so the page was never scrolled.
      await Promise.race([page.evaluate(autoScroll), crawler.sleep(30000)]);
    } catch (e) {
      console.warn("Autoscroll Behavior Failed", e);
    }
  }
}
module.exports = AutoScrollBehavior;

View File

@@ -8,6 +8,9 @@ const fs = require("fs");
const Sitemapper = require("sitemapper");
const { v4: uuidv4 } = require("uuid");
const BackgroundBehaviors = require("./behaviors/bgbehaviors");
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
@@ -61,6 +64,9 @@ class Crawler {
// pages file
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
// background behaviors
this.bgbehaviors = new BackgroundBehaviors(this.params.bgbehaviors || []);
}
configureUA() {
@@ -241,7 +247,13 @@ class Crawler {
"statsFilename": {
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
}
},
"bgbehaviors": {
describe: "Which background behaviors to enable on each page",
default: "auto-play,auto-fetch",
type: "string",
},
};
}
@@ -281,6 +293,9 @@ class Crawler {
}
}
// background behaviors to apply
argv.bgbehaviors = argv.bgbehaviors.split(",");
if (!argv.newContext) {
argv.newContext = "page";
}
@@ -380,6 +395,31 @@ class Crawler {
}
}
async crawlPage({page, data}) {
try {
if (this.emulateDevice) {
await page.emulate(this.emulateDevice);
}
const bgbehavior = await this.bgbehaviors.setup(page, this);
// run custom driver here
await this.driver({page, data, crawler: this});
const title = await page.title();
this.writePage(data.url, title);
if (bgbehavior) {
await bgbehavior();
}
this.writeStats();
} catch (e) {
console.warn(e);
}
}
async crawl() {
try {
this.driver = require(this.params.driver);
@@ -399,17 +439,7 @@ class Crawler {
monitor: this.monitor
});
this.cluster.task(async (opts) => {
try {
await this.driver({...opts, crawler: this});
const title = await opts.page.title();
this.writePage(opts.data.url, title);
this.writeStats();
} catch (e) {
console.warn(e);
}
});
this.cluster.task((opts) => this.crawlPage(opts));
this.initPages();

View File

@@ -1,9 +1,3 @@
const fs = require("fs");
const autoplayScript = fs.readFileSync("/app/autoplay.js", "utf-8");
const autofetchScript = fs.readFileSync("/app/autofetcher.js", "utf-8");
//const autoplayScript = require("/app/autoplay.js");
/* eslint-disable no-undef */
@@ -11,87 +5,24 @@ const autofetchScript = fs.readFileSync("/app/autofetcher.js", "utf-8");
// Crawl driver: receives the URL data, the browser page and the crawler.
// Non-HTML URLs are captured directly. For HTML pages it exposes the autoplay
// callbacks, injects the autoplay/autofetch scripts, loads the page, waits for
// media, optionally scrolls, and finally extracts links.
// NOTE(review): this span is shown in a diff view without +/- markers, so it
// may mix removed and retained lines — confirm against the actual file.
module.exports = async ({data, page, crawler}) => {
const {url} = data;
//page.on("requestfailed", message => console.warn(message._failureText));
// Non-HTML resources are captured directly instead of being loaded in the page.
if (!await crawler.isHTML(url)) {
await crawler.directFetchCapture(url);
return;
}
if (crawler.emulateDevice) {
await page.emulate(crawler.emulateDevice);
}
// Results of direct captures requested by the page via __crawler_queueUrls.
// NOTE(review): the callback awaits directFetchCapture before pushing, so this
// array holds results rather than pending promises; the Promise.all further
// down then presumably has nothing left to wait for — confirm.
const mediaResults = [];
await page.exposeFunction("__crawler_queueUrls", async (url) => {
mediaResults.push(await crawler.directFetchCapture(url));
});
// Set from the page through __crawler_autoplayLoad; triggers the extra wait below.
let waitForVideo = false;
await page.exposeFunction("__crawler_autoplayLoad", (url) => {
console.log("*** Loading autoplay URL: " + url);
waitForVideo = true;
});
// Register both scripts for every new document; a failure is logged and the crawl continues.
try {
await page.evaluateOnNewDocument(autoplayScript);
await page.evaluateOnNewDocument(autofetchScript);
} catch(e) {
console.log(e);
}
// Navigation options are taken from the crawler parameters.
const gotoOpts = {
waitUntil: crawler.params.waitUntil,
timeout: crawler.params.timeout
};
// Any navigation error is logged as a load timeout and the driver carries on.
try {
console.log("Wait page load...");
await page.goto(url, gotoOpts);
console.log("Done");
} catch (e) {
console.log(`Load timeout for ${url}`, e);
}
try {
await Promise.all(mediaResults);
} catch (e) {
console.log("Error loading media URLs", e);
}
if (waitForVideo) {
console.log("Extra wait 15s for video loading");
await crawler.sleep(15000);
}
// Optional scrolling: stop waiting after 30 seconds.
if (crawler.params.scroll) {
try {
await Promise.race([page.evaluate(autoScroll), crawler.sleep(30000)]);
} catch (e) {
console.warn("Behavior Failed", e);
}
}
// Extract links from the anchors that have an href attribute.
await crawler.extractLinks(page, "a[href]");
};
/**
 * Scrolls down in 250px steps with a 500ms pause after each step, stopping
 * once the bottom of the viewport is no longer above the largest of the body /
 * document element height measurements. Relies only on `self` and
 * `setTimeout`, so it can be evaluated inside the page.
 */
async function autoScroll() {
  const scrollStep = { top: 250, left: 0, behavior: "auto" };

  const wait = (ms) => new Promise((done) => setTimeout(done, ms));

  // Measured again before every step, since content may be added while scrolling.
  const measureHeight = () => {
    const { body, documentElement: docElem } = self.document;
    return Math.max(
      body.scrollHeight,
      body.offsetHeight,
      docElem.clientHeight,
      docElem.scrollHeight,
      docElem.offsetHeight
    );
  };

  while (self.scrollY + self.innerHeight < measureHeight()) {
    self.scrollBy(scrollStep);
    await wait(500);
  }
}