diff --git a/crawler.js b/crawler.js index d79d7fd3..296a8d37 100644 --- a/crawler.js +++ b/crawler.js @@ -384,7 +384,7 @@ export class Crawler { if (this.params.screenshot) { if (!page.isHTMLPage) { - console.log("Skipping screenshots for non-HTML page"); + this.logger.info("Skipping screenshots for non-HTML page"); } const archiveDir = path.join(this.collDir, "archive"); const screenshots = new Screenshots({page, url: data.url, directory: archiveDir}); @@ -428,7 +428,7 @@ export class Crawler { await this.serializeConfig(); } catch (e) { - this.logger.error(`Error crawling page ${data.url}`, e.message); + this.logger.error(`Error crawling page ${data.url}`, e); await this.markPageFailed(page); } } @@ -755,16 +755,16 @@ export class Crawler { } const realSize = await this.crawlState.realSize(); - const pending = await this.crawlState.numRealPending(); + const pendingList = await this.crawlState.getPendingList(); const done = await this.crawlState.numDone(); - const total = realSize + pending + done; + const total = realSize + pendingList.length + done; const limit = {max: this.params.limit || 0, hit: this.limitHit}; const stats = { "crawled": done, "total": total, - "pending": pending, + "pending": pendingList.length, "limit": limit, - "pendingPages": Array.from(this.crawlState.pending.values()).map(x => JSON.stringify(x)) + "pendingPages": pendingList.map(x => JSON.stringify(x)) }; this.logger.info("Crawl statistics", stats, "crawlState"); @@ -947,7 +947,7 @@ export class Crawler { await this.sleep(5.5); } } catch (e) { - this.logger.warn("Check CF failed, ignoring", e.message); + this.logger.warn("Check CF failed, ignoring", e); } } diff --git a/util/browser.js b/util/browser.js index 0d2d6cd5..63f20aa3 100644 --- a/util/browser.js +++ b/util/browser.js @@ -161,8 +161,8 @@ export async function evaluateWithCLI(frame, funcString) { }); if (exceptionDetails) { - logger.fatal( - "Behavior Evaluation Failed" + exceptionDetails.text + logger.error( + "Behavior Evaluation Failed: " + exceptionDetails.text, exceptionDetails.stackTrace || {} ); } diff --git a/util/logger.js b/util/logger.js index 2340d539..fd66e04c 100644 --- a/util/logger.js +++ b/util/logger.js @@ -7,7 +7,9 @@ export class Logger } logAsJSON(message, data, context, logLevel="info") { - if (typeof data !== "object") { + if (data instanceof Error) { + data = {"type": "exception", "message": data.message, "stack": data.stack}; + } else if (typeof data !== "object") { data = {"message": data.toString()}; } let dataToLog = { diff --git a/util/state.js b/util/state.js index 57f349c6..4562650d 100644 --- a/util/state.js +++ b/util/state.js @@ -130,8 +130,8 @@ export class MemoryCrawlState extends BaseState async serialize() { const queued = this.queue.map(x => JSON.stringify(x)); - const pending = Array.from(this.pending.values()).map(x => JSON.stringify(x)); const done = this.done.map(x => JSON.stringify(x)); + const pending = (await this.getPendingList()).map(x => JSON.stringify(x)); return {queued, pending, done}; } @@ -179,6 +179,10 @@ export class MemoryCrawlState extends BaseState async numRealPending() { return this.pending.size; } + + async getPendingList() { + return Array.from(this.pending.values()); + } } @@ -445,6 +449,11 @@ return 0; return res; } + async getPendingList() { + const list = await this.redis.hvals(this.pkey); + return list.map(x => JSON.parse(x)); + } + async resetPendings() { const pendingUrls = await this.redis.hkeys(this.pkey); diff --git a/util/storage.js b/util/storage.js index 0402e180..0d6135ac 100644 --- a/util/storage.js +++ b/util/storage.js @@ -132,7 +132,7 @@ export function initStorage() { userId: process.env.STORE_USER, }; - console.log("Initing Storage..."); + logger.info("Initing Storage..."); return new S3StorageSync(storeInfo, opts); }