mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-25 11:20:18 +00:00
crawl state: add getPendingList() to return pending state from either… (#205)
* crawl state: add getPendingList() to return pending state from either memory or redis crawl state, fix stats logging with redis state. Return pending list as JSON object. * logging: check if data object is an error, log fields from error. Convert remaining console.* calls to the new logger. * evaluate failure: log with error, not fatal
This commit is contained in:
14
crawler.js
14
crawler.js
@@ -384,7 +384,7 @@ export class Crawler {
|
||||
|
||||
if (this.params.screenshot) {
|
||||
if (!page.isHTMLPage) {
|
||||
console.log("Skipping screenshots for non-HTML page");
|
||||
this.logger.info("Skipping screenshots for non-HTML page");
|
||||
}
|
||||
const archiveDir = path.join(this.collDir, "archive");
|
||||
const screenshots = new Screenshots({page, url: data.url, directory: archiveDir});
|
||||
@@ -428,7 +428,7 @@ export class Crawler {
|
||||
await this.serializeConfig();
|
||||
|
||||
} catch (e) {
|
||||
this.logger.error(`Error crawling page ${data.url}`, e.message);
|
||||
this.logger.error(`Error crawling page ${data.url}`, e);
|
||||
await this.markPageFailed(page);
|
||||
}
|
||||
}
|
||||
@@ -755,16 +755,16 @@ export class Crawler {
|
||||
}
|
||||
|
||||
const realSize = await this.crawlState.realSize();
|
||||
const pending = await this.crawlState.numRealPending();
|
||||
const pendingList = await this.crawlState.getPendingList();
|
||||
const done = await this.crawlState.numDone();
|
||||
const total = realSize + pending + done;
|
||||
const total = realSize + pendingList.length + done;
|
||||
const limit = {max: this.params.limit || 0, hit: this.limitHit};
|
||||
const stats = {
|
||||
"crawled": done,
|
||||
"total": total,
|
||||
"pending": pending,
|
||||
"pending": pendingList.length,
|
||||
"limit": limit,
|
||||
"pendingPages": Array.from(this.crawlState.pending.values()).map(x => JSON.stringify(x))
|
||||
"pendingPages": pendingList.map(x => JSON.stringify(x))
|
||||
};
|
||||
|
||||
this.logger.info("Crawl statistics", stats, "crawlState");
|
||||
@@ -947,7 +947,7 @@ export class Crawler {
|
||||
await this.sleep(5.5);
|
||||
}
|
||||
} catch (e) {
|
||||
this.logger.warn("Check CF failed, ignoring", e.message);
|
||||
this.logger.warn("Check CF failed, ignoring", e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -161,8 +161,8 @@ export async function evaluateWithCLI(frame, funcString) {
|
||||
});
|
||||
|
||||
if (exceptionDetails) {
|
||||
logger.fatal(
|
||||
"Behavior Evaluation Failed" + exceptionDetails.text
|
||||
logger.error(
|
||||
"Behavior Evaluation Failed: " + exceptionDetails.text, exceptionDetails.stackTrace || {}
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,9 @@ export class Logger
|
||||
}
|
||||
|
||||
logAsJSON(message, data, context, logLevel="info") {
|
||||
if (typeof data !== "object") {
|
||||
if (data instanceof Error) {
|
||||
data = {"type": "exception", "message": data.message, "stack": data.stack};
|
||||
} else if (typeof data !== "object") {
|
||||
data = {"message": data.toString()};
|
||||
}
|
||||
let dataToLog = {
|
||||
|
||||
@@ -130,8 +130,8 @@ export class MemoryCrawlState extends BaseState
|
||||
|
||||
async serialize() {
|
||||
const queued = this.queue.map(x => JSON.stringify(x));
|
||||
const pending = Array.from(this.pending.values()).map(x => JSON.stringify(x));
|
||||
const done = this.done.map(x => JSON.stringify(x));
|
||||
const pending = (await this.getPendingList()).map(x => JSON.stringify(x));
|
||||
|
||||
return {queued, pending, done};
|
||||
}
|
||||
@@ -179,6 +179,10 @@ export class MemoryCrawlState extends BaseState
|
||||
async numRealPending() {
|
||||
return this.pending.size;
|
||||
}
|
||||
|
||||
async getPendingList() {
|
||||
return Array.from(this.pending.values());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -445,6 +449,11 @@ return 0;
|
||||
return res;
|
||||
}
|
||||
|
||||
async getPendingList() {
|
||||
const list = await this.redis.hvals(this.pkey);
|
||||
return list.map(x => JSON.parse(x));
|
||||
}
|
||||
|
||||
async resetPendings() {
|
||||
const pendingUrls = await this.redis.hkeys(this.pkey);
|
||||
|
||||
|
||||
@@ -132,7 +132,7 @@ export function initStorage() {
|
||||
userId: process.env.STORE_USER,
|
||||
};
|
||||
|
||||
console.log("Initing Storage...");
|
||||
logger.info("Initing Storage...");
|
||||
return new S3StorageSync(storeInfo, opts);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user