Don't remove excluded-on-redirect URLs from seen list (#936)

Fixes #937

- Don't remove URLs from the seen list when they turn out to be excluded.
- Add a new excluded key: URLs that are excluded after being seen (eg.
out-of-scope on redirect) are added to this excluded set. The size of this
set gives the number of URLs excluded in this way, which is subtracted when
computing the number of discovered URLs (see the sketch below).
- Don't write urn:pageinfo records for excluded pages, along with not
writing them to pages.jsonl / extraPages.jsonl.
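
A minimal sketch of the resulting count, using illustrative names rather than the crawler's actual API: extra seeds and excluded URLs both remain in the seen set, so both are subtracted when computing how many URLs were discovered.

// Sketch only (hypothetical helper): discovered = |seen| - |extraSeeds| - |excluded|.
// Extra seeds (eg. a seed's redirect target) and excluded URLs stay in the
// seen set so they are never re-queued, but neither counts as a discovered page.
function numFoundSketch(
  seenSize: number,
  extraSeedCount: number,
  excludedCount: number,
): number {
  return seenSize - extraSeedCount - excludedCount;
}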
commit 850a6a6665 (parent 4a703cdc09)
Author: Ilya Kreymer (committed by GitHub)
Date: 2025-12-08 22:41:52 -08:00

4 changed files with 80 additions and 9 deletions


@@ -2076,12 +2076,11 @@ self.__bx_behaviors.selectMainBehavior();
       return;
     }
 
-    const realSize = await this.crawlState.queueSize();
     const pendingPages = await this.crawlState.getPendingList();
     const pending = pendingPages.length;
     const crawled = await this.crawlState.numDone();
     const failed = await this.crawlState.numFailed();
-    const total = realSize + pendingPages.length + crawled + failed;
+    const total = await this.crawlState.numFound();
     const limit = { max: this.pageLimit || 0, hit: this.limitHit };
     const stats = {
       crawled,
@@ -2219,6 +2218,7 @@ self.__bx_behaviors.selectMainBehavior();
         // excluded in recorder
         data.pageSkipped = true;
         logger.warn("Page Load Blocked, skipping", { msg, loadState });
+        throw new Error("logged");
       } else {
         return this.pageFailed("Page Load Failed", retry, {
           msg,
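
The added throw in the second hunk reuses a "logged" sentinel error: the warning was already emitted via logger.warn, so a caller that catches the error can skip duplicate reporting. A condensed sketch of the pattern, with an assumed caller shape (not the crawler's actual call site):

// Sentinel-error sketch: errors whose message is "logged" were already
// reported at the throw site, so only unexpected failures are logged here.
async function runPageStep(step: () => Promise<void>): Promise<void> {
  try {
    await step();
  } catch (e) {
    if (e instanceof Error && e.message === "logged") {
      return; // already logged where it was thrown; page treated as skipped
    }
    console.warn("unexpected page failure", e);
  }
}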


@@ -118,6 +118,7 @@ export class Recorder extends EventEmitter {
   pageInfo!: PageInfoRecord;
   mainFrameId: string | null = null;
   skipRangeUrls!: Map<string, number>;
+  skipPageInfo = false;
 
   swTargetId?: string | null;
   swFrameIds = new Set<string>();
@@ -743,6 +744,7 @@ export class Recorder extends EventEmitter {
     );
 
     if (errorReason) {
+      this.skipPageInfo = true;
       await cdp.send("Fetch.failRequest", {
         requestId,
         errorReason,
@@ -946,6 +948,7 @@ export class Recorder extends EventEmitter {
     this.pendingRequests = new Map();
     this.skipIds = new Set();
     this.skipRangeUrls = new Map<string, number>();
+    this.skipPageInfo = false;
     this.pageFinished = false;
     this.pageInfo = {
       pageid,
@@ -974,6 +977,14 @@ export class Recorder extends EventEmitter {
   }
 
   writePageInfoRecord() {
+    if (this.skipPageInfo) {
+      logger.debug(
+        "Skipping writing pageinfo for blocked page",
+        { url: "urn:pageinfo:" + this.pageUrl },
+        "recorder",
+      );
+      return;
+    }
     const text = JSON.stringify(this.pageInfo, null, 2);
 
     const url = this.pageUrl;
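
Taken together, the recorder hunks give skipPageInfo a three-step lifecycle: reset when a new page starts, set when the recorder blocks the page's request via Fetch.failRequest, and checked before writing the urn:pageinfo record. A condensed paraphrase (not a drop-in replacement for the Recorder class):

// Paraphrased lifecycle of the skipPageInfo flag added above.
class PageInfoGate {
  skipPageInfo = false;

  startPage(): void {
    this.skipPageInfo = false; // fresh page, assume pageinfo will be written
  }

  onRequestBlocked(): void {
    this.skipPageInfo = true; // page excluded in the recorder
  }

  shouldWritePageInfo(): boolean {
    return !this.skipPageInfo; // no urn:pageinfo record for excluded pages
  }
}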


@@ -130,11 +130,18 @@ export class PageState {
 // ============================================================================
 declare module "ioredis" {
   interface RedisCommander<Context> {
+    numfound(
+      skey: string,
+      esKey: string,
+      exKey: string,
+    ): Result<number, Context>;
+
     addqueue(
       pkey: string,
       qkey: string,
       skey: string,
       esKey: string,
+      exKey: string,
       url: string,
       score: number,
       data: string,
@@ -203,6 +210,7 @@ export type SaveState = {
   errors: string[];
   extraSeeds: string[];
   sitemapDone: boolean;
+  excluded?: string[];
 };
 
 // ============================================================================
@@ -228,6 +236,8 @@ export class RedisCrawlState {
   esKey: string;
   esMap: string;
 
+  exKey: string;
+
   sitemapDoneKey: string;
 
   waczFilename: string | null = null;
@@ -267,16 +277,27 @@ export class RedisCrawlState {
     this.esKey = this.key + ":extraSeeds";
     this.esMap = this.key + ":esMap";
 
+    // stores URLs that have been seen but excluded
+    // (eg. redirect-to-excluded or trimmed)
+    this.exKey = this.key + ":excluded";
+
     this.sitemapDoneKey = this.key + ":sitemapDone";
 
     this._initLuaCommands(this.redis);
   }
 
   _initLuaCommands(redis: Redis) {
-    redis.defineCommand("addqueue", {
-      numberOfKeys: 4,
+    redis.defineCommand("numfound", {
+      numberOfKeys: 3,
       lua: `
-local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]);
+return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]) - redis.call('scard', KEYS[3]);
+`,
+    });
+
+    redis.defineCommand("addqueue", {
+      numberOfKeys: 5,
+      lua: `
+local size = redis.call('scard', KEYS[3]) - redis.call('llen', KEYS[4]) - redis.call('scard', KEYS[5]);
 local limit = tonumber(ARGV[4]);
 if limit > 0 and size >= limit then
   return 1;
@@ -303,7 +324,7 @@ return 0;
 if json then
   local data = cjson.decode(json);
   redis.call('hdel', KEYS[2], data.url);
-  redis.call('srem', KEYS[3], data.url);
+  redis.call('sadd', KEYS[3], data.url);
 end
 return 1;
 `,
@@ -464,7 +485,7 @@ return inx;
 
   async markExcluded(url: string) {
     await this.redis.hdel(this.pkey, url);
-    await this.redis.srem(this.skey, url);
+    await this.redis.sadd(this.exKey, url);
   }
 
   recheckScope(data: QueueEntry, seeds: ScopedSeed[]) {
@@ -486,6 +507,10 @@ return inx;
     );
   }
 
+  async numFound() {
+    return await this.redis.numfound(this.skey, this.esKey, this.exKey);
+  }
+
   async trimToLimit(limit: number) {
     if (limit === 0) {
       return;
@@ -501,7 +526,7 @@ return inx;
     const remain = Math.max(0, limit - totalComplete);
     // trim queue until size <= remain
     while (
-      (await this.redis.trimqueue(this.qkey, this.pkey, this.skey, remain)) ===
+      (await this.redis.trimqueue(this.qkey, this.pkey, this.exKey, remain)) ===
       1
     ) {
       /* ignore */
@@ -721,6 +746,7 @@ return inx;
       this.qkey,
       this.skey,
       this.esKey,
+      this.exKey,
       url,
       this._getScore(data),
       JSON.stringify(data),
@@ -763,8 +789,10 @@ return inx;
     const errors = await this.getErrorList();
     const extraSeeds = await this._iterListKeys(this.esKey, seen);
     const sitemapDone = await this.isSitemapDone();
+    const excludedSet = await this._iterSet(this.exKey);
 
     const finished = [...seen.values()];
+    const excluded = [...excludedSet.values()];
 
     return {
       extraSeeds,
@@ -774,6 +802,7 @@ return inx;
       sitemapDone,
       failed,
       errors,
+      excluded,
     };
   }
@@ -860,6 +889,7 @@ return inx;
     await this.redis.del(this.fkey);
     await this.redis.del(this.skey);
     await this.redis.del(this.ekey);
+    await this.redis.del(this.exKey);
 
     let seen: string[] = [];
@@ -955,6 +985,11 @@ return inx;
     }
     await this.redis.sadd(this.skey, seen);
+
+    if (state.excluded?.length) {
+      await this.redis.sadd(this.exKey, state.excluded);
+    }
+
     return seen.length;
   }
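
The new numfound command can also be exercised standalone; a minimal ioredis sketch, where the Lua body matches the diff and the key names are examples only:

import Redis from "ioredis";

async function main() {
  const redis = new Redis();

  // found = |seen set| - |extra seeds list| - |excluded set|
  redis.defineCommand("numfound", {
    numberOfKeys: 3,
    lua: `
return redis.call('scard', KEYS[1]) - redis.call('llen', KEYS[2]) - redis.call('scard', KEYS[3]);
`,
  });

  // Without the module augmentation from the diff, cast to invoke the command.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  const found = await (redis as any).numfound(
    "crawl:seen",
    "crawl:extraSeeds",
    "crawl:excluded",
  );
  console.log("urls found:", found);

  redis.disconnect();
}

main().catch(console.error);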


@@ -6,7 +6,7 @@ import { execSync } from "child_process";
 
 test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
   execSync(
-    "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
+    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
 
   // no entries besides header
   expect(
@@ -19,3 +19,28 @@ test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
 });
+
+test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => {
+  execSync(
+    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX");
+
+  // no entries besides header
+  expect(
+    fs
+      .readFileSync(
+        "test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl",
+        "utf8",
+      )
+      .trim()
+      .split("\n").length,
+  ).toBe(1);
+
+  const data = fs.readFileSync(
+    "test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
+    { encoding: "utf-8" },
+  );
+
+  // expect no urn:pageinfo records for excluded page
+  const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
+  expect(first < 0).toBe(true);
+});
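
The final assertion generalizes into a small helper; hypothetical code mirroring the check above (the helper name is ours, not part of the test suite):

import fs from "fs";

// Hypothetical helper: does the CDXJ index contain a pageinfo record for the URL?
function hasPageInfoRecord(cdxjPath: string, pageUrl: string): boolean {
  const data = fs.readFileSync(cdxjPath, { encoding: "utf-8" });
  return data.includes(`"urn:pageinfo:${pageUrl}"`);
}

// Usage mirroring the test:
// expect(
//   hasPageInfoRecord(
//     "test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
//     "https://www.iana.org/domains/example",
//   ),
// ).toBe(false);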