mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-26 20:00:19 +00:00
background behaviors refactor: (fixes #23)
- move auto-play, auto-fetch and auto-scroll behaviors to behaviors/global/* - bgbehaviors manages these background behaviors - command line --bgbehaviors option specifies which background behaviors to run (defaults to auto-fetch and auto-play)
This commit is contained in:
@@ -35,6 +35,7 @@ RUN yarn install
|
||||
ADD config.yaml /app/
|
||||
ADD uwsgi.ini /app/
|
||||
ADD *.js /app/
|
||||
ADD behaviors/ /app/behaviors/
|
||||
|
||||
RUN ln -s /app/main.js /usr/bin/crawl
|
||||
|
||||
|
||||
203
autofetcher.js
203
autofetcher.js
@@ -1,203 +0,0 @@
|
||||
// AutoFetcher script
// extract and fetch all urls from srcsets, from images as well as audio/video
// also extract any urls from media query stylesheets that have not necessarily been loaded
// (May not work for cross-origin stylesheets)

const SRC_SET_SELECTOR = 'img[srcset], img[data-srcset], img[data-src], ' +
  'video[srcset], video[data-srcset], video[data-src], audio[srcset], audio[data-srcset], audio[data-src], ' +
  'picture > source[srcset], picture > source[data-srcset], picture > source[data-src], ' +
  'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
  'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';

const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;

const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;


// ===========================================================================
// Collects candidate resource URLs (srcset/src attributes and media-query
// stylesheets) from the live DOM and fetches them so they are captured,
// even when the browser itself would never request them.
class AutoFetcher
{
  constructor() {
    // URLs already queued once -- prevents duplicate fetches.
    this.urlSet = new Set();
    // FIFO of URLs still waiting to be fetched.
    this.urlqueue = [];
    // Count of fetch loops currently draining the queue.
    this.numPending = 0;
  }

  // Start extraction once the page has finished loading, then keep
  // watching for DOM mutations.
  init() {
    console.log("init autofetch");

    window.addEventListener("load", () => {
      this.extractSrcSrcSetAll(document);
      this.extractStyleSheets();
      this.initObserver();
    });
  }

  // Only absolute http(s) URLs are worth fetching.
  isValidUrl(url) {
    return url && (url.startsWith("https:") || url.startsWith("http:"));
  }

  // Resolve a raw URL against the document base, dedupe it, and kick off
  // a fetch for it.
  queueUrl(url) {
    let resolved;
    try {
      resolved = new URL(url, document.baseURI).href;
    } catch (e) {
      // Malformed URL -- nothing to fetch.
      return;
    }

    if (!this.isValidUrl(resolved) || this.urlSet.has(resolved)) {
      return;
    }

    this.urlSet.add(resolved);
    this.doFetch(resolved);
  }

  // Enqueue the url; unless enough workers are already draining the queue,
  // become a worker and fetch queued urls one at a time until empty.
  async doFetch(url) {
    this.urlqueue.push(url);
    if (this.numPending > 6) {
      return;
    }
    while (this.urlqueue.length > 0) {
      const next = this.urlqueue.shift();
      try {
        this.numPending++;
        console.log("AutoFetching: " + next);
        const resp = await fetch(next);
        // Drain the body so the response is fully downloaded (and recorded).
        await resp.blob();
      } catch (e) {
        console.log(e);
      }
      this.numPending--;
    }
  }

  // Watch the whole document for added nodes and srcset changes so
  // lazily-inserted media is also captured.
  initObserver() {
    this.mutobz = new MutationObserver((changes) => this.observeChange(changes));

    this.mutobz.observe(document.documentElement, {
      characterData: false,
      characterDataOldValue: false,
      attributes: true,
      attributeOldValue: true,
      subtree: true,
      childList: true,
      attributeFilter: ['srcset']
    });
  }

  // Dispatch a mutated node to the appropriate extractor by node type.
  processChangedNode(target) {
    if (target.nodeType === Node.ATTRIBUTE_NODE) {
      if (target.nodeName === "srcset") {
        this.extractSrcSetAttr(target.nodeValue);
      }
    } else if (target.nodeType === Node.TEXT_NODE) {
      if (target.parentNode && target.parentNode.tagName === "STYLE") {
        this.extractStyleText(target.nodeValue);
      }
    } else if (target.nodeType === Node.ELEMENT_NODE) {
      if (target.sheet) {
        this.extractStyleSheet(target.sheet);
      }
      this.extractSrcSrcSet(target);
      // Re-scan descendants shortly after, once lazy attributes settle.
      setTimeout(() => this.extractSrcSrcSetAll(target), 1000);
    }
  }

  // MutationObserver callback: handle each change target plus any
  // newly-added child nodes.
  observeChange(changes) {
    for (const change of changes) {
      this.processChangedNode(change.target);

      if (change.type !== "childList") {
        continue;
      }
      for (const node of change.addedNodes) {
        this.processChangedNode(node);
      }
    }
  }

  // Run src/srcset extraction over every matching element under root.
  extractSrcSrcSetAll(root) {
    for (const elem of root.querySelectorAll(SRC_SET_SELECTOR)) {
      this.extractSrcSrcSet(elem);
    }
  }

  // Queue the src / data-src URL and parse the srcset / data-srcset
  // attribute of a single element.
  extractSrcSrcSet(elem) {
    if (!elem || elem.nodeType !== Node.ELEMENT_NODE) {
      console.warn("No elem to extract from");
      return;
    }

    const srcValue = elem.src || elem.getAttribute("data-src");
    if (srcValue) {
      this.queueUrl(srcValue);
    }

    const srcsetValue = elem.srcset || elem.getAttribute("data-srcset");
    if (srcsetValue) {
      this.extractSrcSetAttr(srcsetValue);
    }
  }

  // Queue the URL portion of every candidate in a srcset attribute value.
  extractSrcSetAttr(srcset) {
    for (const candidate of srcset.split(SRCSET_REGEX)) {
      if (!candidate) {
        continue;
      }
      this.queueUrl(candidate.trim().split(" ")[0]);
    }
  }

  // Scan all stylesheets of root (defaults to the document).
  extractStyleSheets(root) {
    for (const sheet of (root || document).styleSheets) {
      this.extractStyleSheet(sheet);
    }
  }

  // Queue URLs referenced from media-query rules of one stylesheet.
  extractStyleSheet(sheet) {
    let rules;

    try {
      rules = sheet.cssRules || sheet.rules;
    } catch (e) {
      // Cross-origin sheets throw on cssRules access.
      console.log("Can't access stylesheet");
      return;
    }

    for (const rule of rules) {
      if (rule.type === CSSRule.MEDIA_RULE) {
        this.extractStyleText(rule.cssText);
      }
    }
  }

  // Queue every url(...) and @import target found in a blob of CSS text.
  extractStyleText(text) {
    const urlExtractor = (match, prefix, href, suffix) => {
      this.queueUrl(href);
      return prefix + href + suffix;
    };

    text.replace(STYLE_REGEX, urlExtractor).replace(IMPORT_REGEX, urlExtractor);
  }
}

new AutoFetcher().init();
|
||||
44
behaviors/bgbehaviors.js
Normal file
44
behaviors/bgbehaviors.js
Normal file
@@ -0,0 +1,44 @@
|
||||
const AutoPlayBehavior = require("./global/autoplay");
|
||||
|
||||
const AutoFetchBehavior = require("./global/autofetcher");
|
||||
|
||||
const AutoScrollBehavior = require("./global/autoscroll");
|
||||
|
||||
|
||||
// ===========================================================================
// Manages the set of background behaviors (auto-fetch, auto-play,
// auto-scroll) selected for a crawl via the --bgbehaviors option.
class BackgroundBehaviors
{
  // bgbehaviors: array of behavior names ("auto-fetch", "auto-play",
  // "auto-scroll") that should be enabled.
  constructor(bgbehaviors) {
    const enabled = (name) => bgbehaviors.includes(name);
    this.doAutoFetch = enabled("auto-fetch");
    this.doAutoPlay = enabled("auto-play");
    this.doAutoScroll = enabled("auto-scroll");
  }

  // Instantiate the enabled behaviors and run their pre-navigation hooks
  // on the page. Returns a callback that runs their post-load steps.
  async setup(page, crawler) {
    const active = [];

    try {
      if (this.doAutoFetch) {
        active.push(new AutoFetchBehavior());
      }

      if (this.doAutoPlay) {
        active.push(new AutoPlayBehavior());
      }

      if (this.doAutoScroll) {
        active.push(new AutoScrollBehavior());
      }

      // Let every enabled behavior hook the page before navigation.
      await Promise.all(active.map((behavior) => behavior.beforeLoad(page, crawler)));

    } catch (err) {
      // Behavior setup is best-effort: log and continue the crawl.
      console.log(err);
    }

    return () => Promise.all(active.map((behavior) => behavior.afterLoad(page, crawler)));
  }
}
|
||||
|
||||
module.exports = BackgroundBehaviors;
|
||||
|
||||
232
behaviors/global/autofetcher.js
Normal file
232
behaviors/global/autofetcher.js
Normal file
@@ -0,0 +1,232 @@
|
||||
// AutoFetcher script
// extract and fetch all urls from srcsets, from images as well as audio/video
// also extract any urls from media query stylesheets that have not necessarily been loaded
// (May not work for cross-origin stylesheets)

// Page-side entry point: the crawler serializes this entire function and
// injects it into each new document as an IIFE.
function autofetcher() {

  const SRC_SET_SELECTOR = 'img[srcset], img[data-srcset], img[data-src], ' +
    'video[srcset], video[data-srcset], video[data-src], audio[srcset], audio[data-srcset], audio[data-src], ' +
    'picture > source[srcset], picture > source[data-srcset], picture > source[data-src], ' +
    'video > source[srcset], video > source[data-srcset], video > source[data-src], ' +
    'audio > source[srcset], audio > source[data-srcset], audio > source[data-src]';

  const SRCSET_REGEX = /\s*(\S*\s+[\d\.]+[wx]),|(?:\s*,(?:\s+|(?=https?:)))/;

  const STYLE_REGEX = /(url\s*\(\s*[\\"']*)([^)'"]+)([\\"']*\s*\))/gi;
  const IMPORT_REGEX = /(@import\s*[\\"']*)([^)'";]+)([\\"']*\s*;?)/gi;

  // Collects candidate resource URLs (srcset/src attributes and media-query
  // stylesheets) from the live DOM and fetches them so they are captured,
  // even when the browser itself would never request them.
  class AutoFetcher
  {
    constructor() {
      // URLs already queued once -- prevents duplicate fetches.
      this.urlSet = new Set();
      // FIFO of URLs still waiting to be fetched.
      this.urlqueue = [];
      // Count of fetch loops currently draining the queue.
      this.numPending = 0;
    }

    // Start extraction once the page has finished loading, then keep
    // watching for DOM mutations.
    init() {
      console.log("init autofetch");

      window.addEventListener("load", () => {
        this.run();
        this.initObserver();
      });
    }

    // Extract once, and when the crawler exposes a phase hook
    // (__crawler_nextPhase), re-extract at the start of each new phase.
    async run() {
      do {
        this.extractSrcSrcSetAll(document);
        this.extractStyleSheets();
        if (!window.__crawler_nextPhase) {
          return;
        }
        await window.__crawler_nextPhase();
      } while (true);
    }

    // Only absolute http(s) URLs are worth fetching.
    isValidUrl(url) {
      return url && (url.startsWith("https:") || url.startsWith("http:"));
    }

    // Resolve a raw URL against the document base, dedupe it, and kick off
    // a fetch for it.
    queueUrl(url) {
      let resolved;
      try {
        resolved = new URL(url, document.baseURI).href;
      } catch (e) {
        // Malformed URL -- nothing to fetch.
        return;
      }

      if (!this.isValidUrl(resolved) || this.urlSet.has(resolved)) {
        return;
      }

      this.urlSet.add(resolved);
      this.doFetch(resolved);
    }

    // Enqueue the url; unless enough workers are already draining the queue,
    // become a worker and fetch queued urls one at a time until empty.
    async doFetch(url) {
      this.urlqueue.push(url);
      if (this.numPending > 6) {
        return;
      }
      while (this.urlqueue.length > 0) {
        const next = this.urlqueue.shift();
        try {
          this.numPending++;
          console.log("AutoFetching: " + next);
          const resp = await fetch(next);
          // Drain the body so the response is fully downloaded (and recorded).
          await resp.blob();
        } catch (e) {
          console.log(e);
        }
        this.numPending--;
      }
    }

    // Watch the whole document for added nodes and srcset changes so
    // lazily-inserted media is also captured.
    initObserver() {
      this.mutobz = new MutationObserver((changes) => this.observeChange(changes));

      this.mutobz.observe(document.documentElement, {
        characterData: false,
        characterDataOldValue: false,
        attributes: true,
        attributeOldValue: true,
        subtree: true,
        childList: true,
        attributeFilter: ['srcset']
      });
    }

    // Dispatch a mutated node to the appropriate extractor by node type.
    processChangedNode(target) {
      if (target.nodeType === Node.ATTRIBUTE_NODE) {
        if (target.nodeName === "srcset") {
          this.extractSrcSetAttr(target.nodeValue);
        }
      } else if (target.nodeType === Node.TEXT_NODE) {
        if (target.parentNode && target.parentNode.tagName === "STYLE") {
          this.extractStyleText(target.nodeValue);
        }
      } else if (target.nodeType === Node.ELEMENT_NODE) {
        if (target.sheet) {
          this.extractStyleSheet(target.sheet);
        }
        this.extractSrcSrcSet(target);
        // Re-scan descendants shortly after, once lazy attributes settle.
        setTimeout(() => this.extractSrcSrcSetAll(target), 1000);
      }
    }

    // MutationObserver callback: handle each change target plus any
    // newly-added child nodes.
    observeChange(changes) {
      for (const change of changes) {
        this.processChangedNode(change.target);

        if (change.type !== "childList") {
          continue;
        }
        for (const node of change.addedNodes) {
          this.processChangedNode(node);
        }
      }
    }

    // Run src/srcset extraction over every matching element under root.
    extractSrcSrcSetAll(root) {
      for (const elem of root.querySelectorAll(SRC_SET_SELECTOR)) {
        this.extractSrcSrcSet(elem);
      }
    }

    // Queue the src / data-src URL and parse the srcset / data-srcset
    // attribute of a single element.
    extractSrcSrcSet(elem) {
      if (!elem || elem.nodeType !== Node.ELEMENT_NODE) {
        console.warn("No elem to extract from");
        return;
      }

      const srcValue = elem.src || elem.getAttribute("data-src");
      if (srcValue) {
        this.queueUrl(srcValue);
      }

      const srcsetValue = elem.srcset || elem.getAttribute("data-srcset");
      if (srcsetValue) {
        this.extractSrcSetAttr(srcsetValue);
      }
    }

    // Queue the URL portion of every candidate in a srcset attribute value.
    extractSrcSetAttr(srcset) {
      for (const candidate of srcset.split(SRCSET_REGEX)) {
        if (!candidate) {
          continue;
        }
        this.queueUrl(candidate.trim().split(" ")[0]);
      }
    }

    // Scan all stylesheets of root (defaults to the document).
    extractStyleSheets(root) {
      for (const sheet of (root || document).styleSheets) {
        this.extractStyleSheet(sheet);
      }
    }

    // Queue URLs referenced from media-query rules of one stylesheet.
    extractStyleSheet(sheet) {
      let rules;

      try {
        rules = sheet.cssRules || sheet.rules;
      } catch (e) {
        // Cross-origin sheets throw on cssRules access.
        console.log("Can't access stylesheet");
        return;
      }

      for (const rule of rules) {
        if (rule.type === CSSRule.MEDIA_RULE) {
          this.extractStyleText(rule.cssText);
        }
      }
    }

    // Queue every url(...) and @import target found in a blob of CSS text.
    extractStyleText(text) {
      const urlExtractor = (match, prefix, href, suffix) => {
        this.queueUrl(href);
        return prefix + href + suffix;
      };

      text.replace(STYLE_REGEX, urlExtractor).replace(IMPORT_REGEX, urlExtractor);
    }
  }

  new AutoFetcher().init();
}
|
||||
|
||||
|
||||
// ===========================================================================
// Behavior wrapper that injects the page-side autofetcher script into every
// new document created by the page.
class AutoFetchBehavior
{
  // Serialize the autofetcher function into an IIFE and register it so it
  // runs in each new document before any page script.
  async beforeLoad(page) {
    await page.evaluateOnNewDocument(`(${autofetcher.toString()})();`);
  }

  // The injected script is self-driving: nothing to do after page load.
  async afterLoad() {
  }
}
|
||||
|
||||
|
||||
module.exports = AutoFetchBehavior;
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
(() => {
|
||||
// ===========================================================================
|
||||
function autoplay() {
|
||||
function run() {
|
||||
if (self.navigator.__crawler_autoplay) {
|
||||
return;
|
||||
@@ -87,5 +88,52 @@
|
||||
}
|
||||
}, 3000);
|
||||
|
||||
})();
|
||||
};
|
||||
|
||||
|
||||
// ===========================================================================
// Behavior that injects the autoplay script into the page and directly
// captures any media URLs the page reports back via exposed callbacks.
class AutoPlayBehavior
{
  constructor() {
    // Pending direct-capture fetches for media URLs reported by the page.
    this.mediaPromises = [];
    // Set when the page signals that an autoplay URL actually started loading.
    this.waitForVideo = false;
  }

  // Expose the page->crawler callbacks and register the autoplay script
  // to run in every new document.
  async beforeLoad(page, crawler) {
    try {
      // Page-side script reports media URLs here for direct capture.
      await page.exposeFunction("__crawler_queueUrls", async (url) => {
        this.mediaPromises.push(crawler.directFetchCapture(url));
      });

      // Page-side script signals that an autoplay URL began loading.
      await page.exposeFunction("__crawler_autoplayLoad", (url) => {
        console.log("*** Loading autoplay URL: " + url);
        this.waitForVideo = true;
      });

      await page.evaluateOnNewDocument(`(${autoplay.toString()})();`);

    } catch(e) {
      // Setup is best-effort: log and let the crawl continue.
      console.log(e);
    }
  }

  // Wait for all captured media, then give a playing video extra time.
  async afterLoad(page, crawler) {
    try {
      await Promise.all(this.mediaPromises);
    } catch (err) {
      console.log("Error loading media URLs", err);
    }

    if (!this.waitForVideo) {
      return;
    }

    console.log("Extra wait 15s for video loading");
    await crawler.sleep(15000);
  }
}
|
||||
|
||||
|
||||
|
||||
|
||||
module.exports = AutoPlayBehavior
|
||||
|
||||
37
behaviors/global/autoscroll.js
Normal file
37
behaviors/global/autoscroll.js
Normal file
@@ -0,0 +1,37 @@
|
||||
// Page-side script: scroll the window downward in 250px steps, pausing
// half a second between steps, until the bottom of the document is reached.
async function autoScroll() {
  const canScrollMore = () =>
    self.scrollY + self.innerHeight <
    Math.max(
      self.document.body.scrollHeight,
      self.document.body.offsetHeight,
      self.document.documentElement.clientHeight,
      self.document.documentElement.scrollHeight,
      self.document.documentElement.offsetHeight
    );

  const scrollOpts = { top: 250, left: 0, behavior: "auto" };

  while (canScrollMore()) {
    self.scrollBy(scrollOpts);
    await new Promise(resolve => setTimeout(resolve, 500));
  }
}


// ===========================================================================
// Behavior that auto-scrolls the page after load so lazily-loaded content
// is requested and captured.
class AutoScrollBehavior
{
  // Nothing to inject before navigation.
  async beforeLoad(page, crawler) {
  }

  // Evaluate autoScroll in the page, racing against a 30s timeout.
  async afterLoad(page, crawler) {
    try {
      // Fix: was `page.evaluate(autoscroll)` -- `autoscroll` is undefined
      // (the function is `autoScroll`), so scrolling never actually ran and
      // every call logged a ReferenceError instead.
      await Promise.race([page.evaluate(autoScroll), crawler.sleep(30000)]);
    } catch (e) {
      console.warn("Autoscroll Behavior Failed", e);
    }
  }
}
|
||||
|
||||
module.exports = AutoScrollBehavior;
|
||||
54
crawler.js
54
crawler.js
@@ -8,6 +8,9 @@ const fs = require("fs");
|
||||
const Sitemapper = require("sitemapper");
|
||||
const { v4: uuidv4 } = require("uuid");
|
||||
|
||||
const BackgroundBehaviors = require("./behaviors/bgbehaviors");
|
||||
|
||||
|
||||
const HTML_TYPES = ["text/html", "application/xhtml", "application/xhtml+xml"];
|
||||
const WAIT_UNTIL_OPTS = ["load", "domcontentloaded", "networkidle0", "networkidle2"];
|
||||
|
||||
@@ -61,6 +64,9 @@ class Crawler {
|
||||
|
||||
// pages file
|
||||
this.pagesFile = path.join(this.pagesDir, "pages.jsonl");
|
||||
|
||||
// background behaviors
|
||||
this.bgbehaviors = new BackgroundBehaviors(this.params.bgbehaviors || []);
|
||||
}
|
||||
|
||||
configureUA() {
|
||||
@@ -241,7 +247,13 @@ class Crawler {
|
||||
|
||||
"statsFilename": {
|
||||
describe: "If set, output stats as JSON to this file. (Relative filename resolves to crawl working directory)"
|
||||
}
|
||||
},
|
||||
|
||||
"bgbehaviors": {
|
||||
describe: "Which background behaviors to enable on each page",
|
||||
default: "auto-play,auto-fetch",
|
||||
type: "string",
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
@@ -281,6 +293,9 @@ class Crawler {
|
||||
}
|
||||
}
|
||||
|
||||
// background behaviors to apply
|
||||
argv.bgbehaviors = argv.bgbehaviors.split(",");
|
||||
|
||||
if (!argv.newContext) {
|
||||
argv.newContext = "page";
|
||||
}
|
||||
@@ -380,6 +395,31 @@ class Crawler {
|
||||
}
|
||||
}
|
||||
|
||||
async crawlPage({page, data}) {
|
||||
try {
|
||||
if (this.emulateDevice) {
|
||||
await page.emulate(this.emulateDevice);
|
||||
}
|
||||
|
||||
const bgbehavior = await this.bgbehaviors.setup(page, this);
|
||||
|
||||
// run custom driver here
|
||||
await this.driver({page, data, crawler: this});
|
||||
|
||||
const title = await page.title();
|
||||
this.writePage(data.url, title);
|
||||
|
||||
if (bgbehavior) {
|
||||
await bgbehavior();
|
||||
}
|
||||
|
||||
this.writeStats();
|
||||
|
||||
} catch (e) {
|
||||
console.warn(e);
|
||||
}
|
||||
}
|
||||
|
||||
async crawl() {
|
||||
try {
|
||||
this.driver = require(this.params.driver);
|
||||
@@ -399,17 +439,7 @@ class Crawler {
|
||||
monitor: this.monitor
|
||||
});
|
||||
|
||||
this.cluster.task(async (opts) => {
|
||||
try {
|
||||
await this.driver({...opts, crawler: this});
|
||||
const title = await opts.page.title();
|
||||
this.writePage(opts.data.url, title);
|
||||
this.writeStats();
|
||||
|
||||
} catch (e) {
|
||||
console.warn(e);
|
||||
}
|
||||
});
|
||||
this.cluster.task((opts) => this.crawlPage(opts));
|
||||
|
||||
this.initPages();
|
||||
|
||||
|
||||
@@ -1,9 +1,3 @@
|
||||
const fs = require("fs");
|
||||
|
||||
const autoplayScript = fs.readFileSync("/app/autoplay.js", "utf-8");
|
||||
|
||||
const autofetchScript = fs.readFileSync("/app/autofetcher.js", "utf-8");
|
||||
|
||||
//const autoplayScript = require("/app/autoplay.js");
|
||||
|
||||
/* eslint-disable no-undef */
|
||||
@@ -11,87 +5,24 @@ const autofetchScript = fs.readFileSync("/app/autofetcher.js", "utf-8");
|
||||
module.exports = async ({data, page, crawler}) => {
|
||||
const {url} = data;
|
||||
|
||||
//page.on("requestfailed", message => console.warn(message._failureText));
|
||||
|
||||
if (!await crawler.isHTML(url)) {
|
||||
await crawler.directFetchCapture(url);
|
||||
return;
|
||||
}
|
||||
|
||||
if (crawler.emulateDevice) {
|
||||
await page.emulate(crawler.emulateDevice);
|
||||
}
|
||||
|
||||
const mediaResults = [];
|
||||
|
||||
await page.exposeFunction("__crawler_queueUrls", async (url) => {
|
||||
mediaResults.push(await crawler.directFetchCapture(url));
|
||||
});
|
||||
|
||||
let waitForVideo = false;
|
||||
|
||||
await page.exposeFunction("__crawler_autoplayLoad", (url) => {
|
||||
console.log("*** Loading autoplay URL: " + url);
|
||||
waitForVideo = true;
|
||||
});
|
||||
|
||||
try {
|
||||
await page.evaluateOnNewDocument(autoplayScript);
|
||||
await page.evaluateOnNewDocument(autofetchScript);
|
||||
} catch(e) {
|
||||
console.log(e);
|
||||
}
|
||||
|
||||
const gotoOpts = {
|
||||
waitUntil: crawler.params.waitUntil,
|
||||
timeout: crawler.params.timeout
|
||||
};
|
||||
|
||||
try {
|
||||
console.log("Wait page load...");
|
||||
await page.goto(url, gotoOpts);
|
||||
console.log("Done");
|
||||
} catch (e) {
|
||||
console.log(`Load timeout for ${url}`, e);
|
||||
}
|
||||
|
||||
try {
|
||||
await Promise.all(mediaResults);
|
||||
} catch (e) {
|
||||
console.log("Error loading media URLs", e);
|
||||
}
|
||||
|
||||
if (waitForVideo) {
|
||||
console.log("Extra wait 15s for video loading");
|
||||
await crawler.sleep(15000);
|
||||
}
|
||||
|
||||
if (crawler.params.scroll) {
|
||||
try {
|
||||
await Promise.race([page.evaluate(autoScroll), crawler.sleep(30000)]);
|
||||
} catch (e) {
|
||||
console.warn("Behavior Failed", e);
|
||||
}
|
||||
}
|
||||
|
||||
await crawler.extractLinks(page, "a[href]");
|
||||
};
|
||||
|
||||
// Scroll the window toward the bottom of the document in 250px steps,
// pausing half a second between steps, until no further scrolling is
// possible. Runs inside the page via page.evaluate, hence `self`.
async function autoScroll() {
  // Total scrollable height, across the various places browsers report it.
  const docHeight = () =>
    Math.max(
      self.document.body.scrollHeight,
      self.document.body.offsetHeight,
      self.document.documentElement.clientHeight,
      self.document.documentElement.scrollHeight,
      self.document.documentElement.offsetHeight
    );

  const step = { top: 250, left: 0, behavior: "auto" };

  // Keep stepping while the viewport bottom is above the document bottom.
  while (self.scrollY + self.innerHeight < docHeight()) {
    self.scrollBy(step);
    await new Promise(resolve => setTimeout(resolve, 500));
  }
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user