Files
browsertrix-crawler/tests/exclude-redirected.test.js
Ilya Kreymer 850a6a6665 Don't remove excluded-on-redirect URLs from seen list (#936)
Fixes #937 
- Don't remove URLs from seen list
- Add new excluded key, add URLs to be excluded (out-of-scope on
redirect) to excluded set. The size of this set can be used to get the
URLs that have been excluded in this way, to compute number of
discovered URLs.
- Don't write urn:pageinfo records for excluded pages, along with not
writing to pages/extraPages.jsonl
2025-12-08 22:41:52 -08:00

47 lines
1.7 KiB
JavaScript

import fs from "fs";
import { execSync } from "child_process";
// example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains'
// page loading should be blocked on redirect due to the exclusion of 'help', though the initial link is loaded
test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
  // Run a one-extra-hop crawl of the seed page with 'help' as the exclusion pattern.
  const crawlCommand =
    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1";
  execSync(crawlCommand);

  // Read the extra-pages listing produced by the crawl and split it into lines.
  const extraPagesPath =
    "test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl";
  const extraPagesText = fs.readFileSync(extraPagesPath, "utf8");
  const extraPagesLines = extraPagesText.trim().split("\n");

  // Only the header line may be present: no extra page should have been recorded.
  expect(extraPagesLines.length).toBe(1);
});
test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => {
  // Same crawl as the basic exclusion case, but the redirecting URL and a second
  // page are also passed as seeds, and a CDX index is generated for inspection.
  const crawlCommand =
    "docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX";
  execSync(crawlCommand);

  // The extra-pages listing must contain nothing besides its header line.
  const extraPagesPath =
    "test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl";
  const extraPagesLines = fs
    .readFileSync(extraPagesPath, "utf8")
    .trim()
    .split("\n");
  expect(extraPagesLines.length).toBe(1);

  // The generated index must not reference a urn:pageinfo record for the excluded page.
  const indexPath =
    "test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj";
  const indexText = fs.readFileSync(indexPath, { encoding: "utf-8" });
  const excludedPageInfoKey = `"urn:pageinfo:https://www.iana.org/domains/example"`;
  expect(indexText.includes(excludedPageInfoKey)).toBe(false);
});