From a767721f5e3482d5fdaf55fe02af64e2b5a1bd76 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 23 Jan 2023 10:43:12 -0800 Subject: [PATCH] =?UTF-8?q?crawl=20state:=20add=20getPendingList()=20to=20?= =?UTF-8?q?return=20pending=20state=20from=20either=E2=80=A6=20(#205)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * crawl state: add getPendingList() to return pending state from either memory or redis crawl state, fix stats logging with redis state. Return pending list as json object. logging: check if data object is an error, log fields from error. Convert missing console.* to new logger * evaluate failure: log with error, not fatal --- crawler.js | 14 +++++++------- util/browser.js | 4 ++-- util/logger.js | 4 +++- util/state.js | 11 ++++++++++- util/storage.js | 2 +- 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/crawler.js b/crawler.js index d79d7fd3..296a8d37 100644 --- a/crawler.js +++ b/crawler.js @@ -384,7 +384,7 @@ export class Crawler { if (this.params.screenshot) { if (!page.isHTMLPage) { - console.log("Skipping screenshots for non-HTML page"); + this.logger.info("Skipping screenshots for non-HTML page"); } const archiveDir = path.join(this.collDir, "archive"); const screenshots = new Screenshots({page, url: data.url, directory: archiveDir}); @@ -428,7 +428,7 @@ export class Crawler { await this.serializeConfig(); } catch (e) { - this.logger.error(`Error crawling page ${data.url}`, e.message); + this.logger.error(`Error crawling page ${data.url}`, e); await this.markPageFailed(page); } } @@ -755,16 +755,16 @@ export class Crawler { } const realSize = await this.crawlState.realSize(); - const pending = await this.crawlState.numRealPending(); + const pendingList = await this.crawlState.getPendingList(); const done = await this.crawlState.numDone(); - const total = realSize + pending + done; + const total = realSize + pendingList.length + done; const limit = {max: this.params.limit || 0, hit: 
this.limitHit}; const stats = { "crawled": done, "total": total, - "pending": pending, + "pending": pendingList.length, "limit": limit, - "pendingPages": Array.from(this.crawlState.pending.values()).map(x => JSON.stringify(x)) + "pendingPages": pendingList.map(x => JSON.stringify(x)) }; this.logger.info("Crawl statistics", stats, "crawlState"); @@ -947,7 +947,7 @@ export class Crawler { await this.sleep(5.5); } } catch (e) { - this.logger.warn("Check CF failed, ignoring", e.message); + this.logger.warn("Check CF failed, ignoring", e); } } diff --git a/util/browser.js b/util/browser.js index 0d2d6cd5..63f20aa3 100644 --- a/util/browser.js +++ b/util/browser.js @@ -161,8 +161,8 @@ export async function evaluateWithCLI(frame, funcString) { }); if (exceptionDetails) { - logger.fatal( - "Behavior Evaluation Failed" + exceptionDetails.text + logger.error( + "Behavior Evaluation Failed: " + exceptionDetails.text, exceptionDetails.stackTrace || {} ); } diff --git a/util/logger.js b/util/logger.js index 2340d539..fd66e04c 100644 --- a/util/logger.js +++ b/util/logger.js @@ -7,7 +7,9 @@ export class Logger } logAsJSON(message, data, context, logLevel="info") { - if (typeof data !== "object") { + if (data instanceof Error) { + data = {"type": "exception", "message": data.message, "stack": data.stack}; + } else if (typeof data !== "object") { data = {"message": data.toString()}; } let dataToLog = { diff --git a/util/state.js b/util/state.js index 57f349c6..4562650d 100644 --- a/util/state.js +++ b/util/state.js @@ -130,8 +130,8 @@ export class MemoryCrawlState extends BaseState async serialize() { const queued = this.queue.map(x => JSON.stringify(x)); - const pending = Array.from(this.pending.values()).map(x => JSON.stringify(x)); const done = this.done.map(x => JSON.stringify(x)); + const pending = (await this.getPendingList()).map(x => JSON.stringify(x)); return {queued, pending, done}; } @@ -179,6 +179,10 @@ export class MemoryCrawlState extends BaseState async 
numRealPending() { return this.pending.size; } + + async getPendingList() { + return Array.from(this.pending.values()); + } } @@ -445,6 +449,11 @@ return 0; return res; } + async getPendingList() { + const list = await this.redis.hvals(this.pkey); + return list.map(x => JSON.parse(x)); + } + async resetPendings() { const pendingUrls = await this.redis.hkeys(this.pkey); diff --git a/util/storage.js b/util/storage.js index 0402e180..0d6135ac 100644 --- a/util/storage.js +++ b/util/storage.js @@ -132,7 +132,7 @@ export function initStorage() { userId: process.env.STORE_USER, }; - console.log("Initing Storage..."); + logger.info("Initing Storage..."); return new S3StorageSync(storeInfo, opts); }