mirror of
https://github.com/webrecorder/browsertrix-crawler.git
synced 2025-12-25 11:20:18 +00:00
Fixes #937 - Don't remove URLs from seen list - Add new excluded key, add URLs to be excluded (out-of-scope on redirect) to excluded set. The size of this set can be used to get the URLs that have been excluded in this way, to compute number of discovered URLs. - Don't write urn:pageinfo records for excluded pages, along with not writing to pages/extraPages.jsonl
47 lines
1.7 KiB
JavaScript
47 lines
1.7 KiB
JavaScript
import fs from "fs";
|
|
import { execSync } from "child_process";
|
|
|
|
// example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains'
|
|
// pgae loading should be blocked on redirected due to exclusion of 'help', though the initial link is loaded
|
|
|
|
test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => {
|
|
execSync(
|
|
"docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test --extraHops 1");
|
|
|
|
// no entries besides header
|
|
expect(
|
|
fs
|
|
.readFileSync(
|
|
"test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl",
|
|
"utf8",
|
|
).trim().split("\n").length
|
|
).toBe(1);
|
|
|
|
});
|
|
|
|
|
|
test("ensure exclusion applied on redirect URL, and URL is not requeued again", () => {
|
|
execSync(
|
|
"docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example-com.webrecorder.net/ --exclude help --collection redir-exclude-test-2 --extraHops 1 --url https://www.iana.org/domains/example --url https://example-com.webrecorder.net/page-2 --generateCDX");
|
|
|
|
|
|
// no entries besides header
|
|
expect(
|
|
fs
|
|
.readFileSync(
|
|
"test-crawls/collections/redir-exclude-test-2/pages/extraPages.jsonl",
|
|
"utf8",
|
|
).trim().split("\n").length
|
|
).toBe(1);
|
|
|
|
|
|
const data = fs.readFileSync(
|
|
"test-crawls/collections/redir-exclude-test-2/indexes/index.cdxj",
|
|
{ encoding: "utf-8" },
|
|
);
|
|
|
|
// expect no urn:pageinfo records for excluded page
|
|
const first = data.indexOf(`"urn:pageinfo:https://www.iana.org/domains/example"`);
|
|
expect(first < 0).toBe(true);
|
|
});
|