Mirror of https://github.com/webrecorder/browsertrix-crawler.git, synced 2025-12-22 10:07:06 +00:00
Don't remove excluded-on-redirect URLs from seen list (#936)
Fixes #937

- Don't remove URLs from the seen list.
- Add a new excluded key; URLs to be excluded (out-of-scope on redirect) are added to the excluded set. The size of this set gives the number of URLs excluded in this way, which is used to compute the number of discovered URLs.
- Don't write urn:pageinfo records for excluded pages, along with not writing them to pages/extraPages.jsonl.
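In short: excluded URLs (for example, URLs that redirect out of scope) stay in the seen set so they are never re-queued, but they are also added to a separate excluded set, and the number of discovered URLs is then computed as seen minus extra seeds minus excluded. Below is a minimal illustrative sketch of that bookkeeping in TypeScript with ioredis; it is not code from this commit, and the key names and helper functions are hypothetical stand-ins for the crawler's own Redis keys and methods. The actual diff follows.

import Redis from "ioredis";

const redis = new Redis();

// Hypothetical key names for illustration; the crawler derives its keys from the crawl id.
const seenKey = "example:seen";             // set of every URL ever queued
const extraSeedsKey = "example:extraSeeds"; // list of seeds added mid-crawl
const excludedKey = "example:excluded";     // set of seen-but-excluded URLs

// Mark a URL as excluded (e.g. it redirected to an excluded URL):
// it stays in the seen set so it will not be re-queued, but it is
// tracked separately so it is not counted as "found".
async function markExcluded(url: string): Promise<void> {
  await redis.sadd(excludedKey, url);
}

// Number of discovered URLs, mirroring the "numfound" Lua command in the diff:
// |seen| - |extra seeds| - |excluded|.
async function numFound(): Promise<number> {
  const [seen, extraSeeds, excluded] = await Promise.all([
    redis.scard(seenKey),
    redis.llen(extraSeedsKey),
    redis.scard(excludedKey),
  ]);
  return seen - extraSeeds - excluded;
}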
@@ -2076,12 +2076,11 @@ self.__bx_behaviors.selectMainBehavior();
       return;
     }
 
-    const realSize = await this.crawlState.queueSize();
     const pendingPages = await this.crawlState.getPendingList();
     const pending = pendingPages.length;
     const crawled = await this.crawlState.numDone();
     const failed = await this.crawlState.numFailed();
-    const total = realSize + pendingPages.length + crawled + failed;
+    const total = await this.crawlState.numFound();
     const limit = { max: this.pageLimit || 0, hit: this.limitHit };
     const stats = {
       crawled,
@@ -2219,6 +2218,7 @@ self.__bx_behaviors.selectMainBehavior();
         // excluded in recorder
+        data.pageSkipped = true;
         logger.warn("Page Load Blocked, skipping", { msg, loadState });
         throw new Error("logged");
       } else {
         return this.pageFailed("Page Load Failed", retry, {
           msg,

@@ -118,6 +118,7 @@ export class Recorder extends EventEmitter {
   pageInfo!: PageInfoRecord;
   mainFrameId: string | null = null;
   skipRangeUrls!: Map<string, number>;
+  skipPageInfo = false;
 
   swTargetId?: string | null;
   swFrameIds = new Set<string>();
@@ -743,6 +744,7 @@ export class Recorder extends EventEmitter {
     );
 
     if (errorReason) {
+      this.skipPageInfo = true;
       await cdp.send("Fetch.failRequest", {
         requestId,
         errorReason,
@@ -946,6 +948,7 @@ export class Recorder extends EventEmitter {
     this.pendingRequests = new Map();
     this.skipIds = new Set();
     this.skipRangeUrls = new Map<string, number>();
+    this.skipPageInfo = false;
     this.pageFinished = false;
     this.pageInfo = {
       pageid,
@@ -974,6 +977,14 @@ export class Recorder extends EventEmitter {
   }
 
   writePageInfoRecord() {
+    if (this.skipPageInfo) {
+      logger.debug(
+        "Skipping writing pageinfo for blocked page",
+        { url: "urn:pageinfo:" + this.pageUrl },
+        "recorder",
+      );
+      return;
+    }
     const text = JSON.stringify(this.pageInfo, null, 2);
 
     const url = this.pageUrl;

@@ -130,11 +130,18 @@ export class PageState {
 // ============================================================================
 declare module "ioredis" {
   interface RedisCommander<Context> {
+    numfound(
+      skey: string,
+      esKey: string,
+      exKey: string,
+    ): Result<number, Context>;
+
     addqueue(
       pkey: string,
       qkey: string,
       skey: string,
       esKey: string,
+      exKey: string,
       url: string,
       score: number,
       data: string,
@@ -203,6 +210,7 @@ export type SaveState = {
   errors: string[];
   extraSeeds: string[];
   sitemapDone: boolean;
+  excluded?: string[];
 };
 
 // ============================================================================
@@ -228,6 +236,8 @@ export class RedisCrawlState {
   esKey: string;
   esMap: string;
 
+  exKey: string;
+
   sitemapDoneKey: string;
 
   waczFilename: string | null = null;
@@ -267,16 +277,27 @@ export class RedisCrawlState {
     this.esKey = this.key + ":extraSeeds";
     this.esMap = this.key + ":esMap";
 
+    // stores URLs that have been seen but excluded
+    // (eg. redirect-to-excluded or trimmed)
+    this.exKey = this.key + ":excluded";
+
     this.sitemapDoneKey = this.key + ":sitemapDone";
 
     this._initLuaCommands(this.redis);
   }
 
   _initLuaCommands(redis: Redis) {
-    redis.defineCommand("addqueue", {
-      numberOfKeys: 4,
+    redis.defineCommand("numfound", {
+      numberOfKeys: 3,
       lua: `
-local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
+return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]) - redis.call('scard', KEYS[3]);
+`,
+    });
+
+    redis.defineCommand("addqueue", {
+      numberOfKeys: 5,
+      lua: `
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]) - redis.call('scard', KEYS[5]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
 return 1;
@@ -303,7 +324,7 @@ return 0;
 if json then
   local data = cjson.decode(json);
   redis.call('hdel', KEYS[2], data.url);
-  redis.call('srem', KEYS[3], data.url);
+  redis.call('sadd', KEYS[3], data.url);
 end
 return 1;
 `,
@@ -464,7 +485,7 @@ return inx;
   async markExcluded(url: string) {
     await this.redis.hdel(this.pkey, url);
 
-    await this.redis.srem(this.skey, url);
+    await this.redis.sadd(this.exKey, url);
   }
 
   recheckScope(data: QueueEntry, seeds: ScopedSeed[]) {
@@ -486,6 +507,10 @@ return inx;
     );
   }
 
+  async numFound() {
+    return await this.redis.numfound(this.skey, this.esKey, this.exKey);
+  }
+
   async trimToLimit(limit: number) {
     if (limit === 0) {
       return;
@@ -501,7 +526,7 @@ return inx;
     const remain = Math.max(0, limit - totalComplete);
     // trim queue until size <= remain
     while (
-      (await this.redis.trimqueue(this.qkey, this.pkey, this.skey, remain)) ===
+      (await this.redis.trimqueue(this.qkey, this.pkey, this.exKey, remain)) ===
       1
     ) {
       /* ignore */
@@ -721,6 +746,7 @@ return inx;
       this.qkey,
       this.skey,
       this.esKey,
+      this.exKey,
       url,
       this._getScore(data),
       JSON.stringify(data),
@@ -763,8 +789,10 @@ return inx;
     const errors = await this.getErrorList();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
+    const excludedSet = await this._iterSet(this.exKey);
 
     const finished = [...seen.values()];
+    const excluded = [...excludedSet.values()];
 
     return {
       extraSeeds,
@@ -774,6 +802,7 @@ return inx;
       sitemapDone,
       failed,
       errors,
+      excluded,
     };
   }
 
@@ -860,6 +889,7 @@ return inx;
     await this.redis.del(this.fkey);
     await this.redis.del(this.skey);
     await this.redis.del(this.ekey);
+    await this.redis.del(this.exKey);
 
     let seen: string[] = [];
 
@@ -955,6 +985,11 @@ return inx;
     }
 
     await this.redis.sadd(this.skey, seen);
+
+    if (state.excluded?.length) {
+      await this.redis.sadd(this.exKey, state.excluded);
+    }
+
     return seen.length;
   }
 

@@ -6,7 +6,7 @@ import { execSync } from "child_process";
 
 test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
   execSync(
-    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
+    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
 
   // no entries besides header
   expect(
@@ -19,3 +19,28 @@ test("ensure exclusion is applied on redirected URL, which contains 'help', so i
 
 });
 
+
+test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => {
+  execSync(
+    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX");
+
+
+  // no entries besides header
+  expect(
+    fs
+      .readFileSync(
+        "test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl",
+        "utf8",
+      ).trim().split("\n").length
+  ).toBe(1);
+
+
+  const data = fs.readFileSync(
+    "test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
+    { encoding: "utf-8" },
+  );
+
+  // expect no urn:pageinfo records for excluded page
+  const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
+  expect(first < 0).toBe(true);
+});