crawl state: add getPendingList() to return pending state from either… (#205)

* crawl state: add getPendingList() to return pending state from either memory or redis crawl state, fix stats logging with redis state. Return pending list as json object
* logging: check if the data object is an Error and, if so, log its fields (type, message, stack). Convert remaining console.* calls to the new logger
* evaluate failure: log with error, not fatal
This commit is contained in:
Ilya Kreymer
2023-01-23 10:43:12 -08:00
committed by GitHub
parent 1a066dbd7b
commit a767721f5e
5 changed files with 23 additions and 12 deletions

View File

@@ -384,7 +384,7 @@ export class Crawler {
if (this.params.screenshot) {
if (!page.isHTMLPage) {
console.log("Skipping screenshots for non-HTML page");
this.logger.info("Skipping screenshots for non-HTML page");
}
const archiveDir = path.join(this.collDir, "archive");
const screenshots = new Screenshots({page, url: data.url, directory: archiveDir});
@@ -428,7 +428,7 @@ export class Crawler {
await this.serializeConfig();
} catch (e) {
this.logger.error(`Error crawling page ${data.url}`, e.message);
this.logger.error(`Error crawling page ${data.url}`, e);
await this.markPageFailed(page);
}
}
@@ -755,16 +755,16 @@ export class Crawler {
}
const realSize = await this.crawlState.realSize();
const pending = await this.crawlState.numRealPending();
const pendingList = await this.crawlState.getPendingList();
const done = await this.crawlState.numDone();
const total = realSize + pending + done;
const total = realSize + pendingList.length + done;
const limit = {max: this.params.limit || 0, hit: this.limitHit};
const stats = {
"crawled": done,
"total": total,
"pending": pending,
"pending": pendingList.length,
"limit": limit,
"pendingPages": Array.from(this.crawlState.pending.values()).map(x => JSON.stringify(x))
"pendingPages": pendingList.map(x => JSON.stringify(x))
};
this.logger.info("Crawl statistics", stats, "crawlState");
@@ -947,7 +947,7 @@ export class Crawler {
await this.sleep(5.5);
}
} catch (e) {
this.logger.warn("Check CF failed, ignoring", e.message);
this.logger.warn("Check CF failed, ignoring", e);
}
}

View File

@@ -161,8 +161,8 @@ export async function evaluateWithCLI(frame, funcString) {
});
if (exceptionDetails) {
logger.fatal(
"Behavior Evaluation Failed" + exceptionDetails.text
logger.error(
"Behavior Evaluation Failed: " + exceptionDetails.text, exceptionDetails.stackTrace || {}
);
}

View File

@@ -7,7 +7,9 @@ export class Logger
}
logAsJSON(message, data, context, logLevel="info") {
if (typeof data !== "object") {
if (data instanceof Error) {
data = {"type": "exception", "message": data.message, "stack": data.stack};
} else if (typeof data !== "object") {
data = {"message": data.toString()};
}
let dataToLog = {

View File

@@ -130,8 +130,8 @@ export class MemoryCrawlState extends BaseState
async serialize() {
const queued = this.queue.map(x => JSON.stringify(x));
const pending = Array.from(this.pending.values()).map(x => JSON.stringify(x));
const done = this.done.map(x => JSON.stringify(x));
const pending = (await this.getPendingList()).map(x => JSON.stringify(x));
return {queued, pending, done};
}
@@ -179,6 +179,10 @@ export class MemoryCrawlState extends BaseState
async numRealPending() {
return this.pending.size;
}
async getPendingList() {
return Array.from(this.pending.values());
}
}
@@ -445,6 +449,11 @@ return 0;
return res;
}
async getPendingList() {
const list = await this.redis.hvals(this.pkey);
return list.map(x => JSON.parse(x));
}
async resetPendings() {
const pendingUrls = await this.redis.hkeys(this.pkey);

View File

@@ -132,7 +132,7 @@ export function initStorage() {
userId: process.env.STORE_USER,
};
console.log("Initing Storage...");
logger.info("Initing Storage...");
return new S3StorageSync(storeInfo, opts);
}