mirror of https://github.com/webrecorder/browsertrix-crawler.git (synced 2025-12-26 03:40:19 +00:00)

Dockerfile: switch to cmd 'crawl', instead of entrypoint to support running 'pywb' also
update README with docker-compose and docker run examples, update commandline example default output to './crawls' subdirectory
.gitignore (2 changes)

@@ -3,4 +3,4 @@ __pycache__
 *.egg-info/
 collections/
 node_modules/
-output/
+crawls/
Dockerfile

@@ -32,7 +32,9 @@ ADD config.yaml /app/
 ADD uwsgi.ini /app/
 ADD *.js /app/

+RUN ln -s /app/main.js /usr/bin/crawl
+
 WORKDIR /output

-ENTRYPOINT ["node", "/app/main.js"]
+CMD ["crawl"]

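Because the image now ends with `CMD ["crawl"]` instead of an `ENTRYPOINT`, the first argument to `docker run` picks what runs inside the container, so the same image can start either the crawler or pywb. A minimal sketch of both invocations, assuming the image tag used in this repo; the URL is a placeholder and the Chrome-related `--cap-add`/`--shm-size` flags described later in the README are omitted for brevity:

```bash
# Default command: run a crawl via the /usr/bin/crawl symlink to /app/main.js
docker run -it webrecorder/browsertrix-crawler crawl --url https://example.com/

# Override the command to serve previously captured collections with pywb
docker run -it -p 8080:8080 webrecorder/browsertrix-crawler pywb
```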
README.md (107 changes)

@@ -1,7 +1,6 @@
-Browsertrix Core
-================
+# Browsertrix Crawler

-Browsertrix Core is a simplified browser-based high-fidliety crawling system, designed to run a single crawl in a single Docker container.
+Browsertrix Crawler is a simplified browser-based high-fidelity crawling system, designed to run a single crawl in a single Docker container.

 It is designed as part of a more streamlined replacement of the original [Browsertrix](https://github.com/webrecorder/browsertrix).

@@ -18,47 +17,101 @@ The system uses:
 - `pywb` - in recording mode for capturing the content

-The crawl produces a single pywb collection, at `/output/collections/capture`.
+The crawl produces a single pywb collection, at `/output/collections/<collection name>`.

 The collection can be mounted as a Docker volume and then accessed in pywb.

-Crawling Parameters
--------------------
+## Crawling Parameters

 The image currently accepts the following parameters:

-- `--url URL` - the url to be crawled (required)
-- `--workers N` - number of crawl workers to be run in parallel
-- `--wait-until` - Puppeteer setting for how long to wait for page load. See [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options). The default is `load`, but for static sites, `--wait-until domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load for example).
-- `--name` - Name of ZIM file (defaults to the hostname of the URL)
-- `--output` - output directory (defaults to `/output`)
-- `--limit U` - Limit capture to at most U URLs
-- `--exclude <regex>` - skip URLs that match the regex from crawling. Can be specified multiple times.
-- `--scroll [N]` - if set, will activate a simple auto-scroll behavior on each page to scroll for upto N seconds

-The following is an example usage. The `--cap-add` and `--shm-size`
-flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips).
-
-Example command:
-
-```bash
-docker run -v ./collections/my-crawl:/output/collections/capture --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://www.iana.org/ --workers 2
+```
+browsertrix-crawler [options]
+
+Options:
+      --help         Show help                                         [boolean]
+      --version      Show version number                               [boolean]
+  -u, --url          The URL to start crawling from          [string] [required]
+  -w, --workers      The number of workers to run in parallel
+                                                           [number] [default: 1]
+      --newContext   The context for each new capture, can be a new: page,
+                     session or browser.              [string] [default: "page"]
+      --waitUntil    Puppeteer page.goto() condition to wait for before
+                     continuing                                [default: "load"]
+      --limit        Limit crawl to this number of pages   [number] [default: 0]
+      --timeout      Timeout for each page to load (in seconds)
+                                                          [number] [default: 90]
+      --scope        Regex of page URLs that should be included in the crawl
+                     (defaults to the immediate directory of URL)
+      --exclude      Regex of page URLs that should be excluded from the crawl.
+      --scroll       If set, will autoscroll to bottom of the page
+                                                       [boolean] [default: false]
+  -c, --collection   Collection name to crawl to (replay will be accessible
+                     under this name in pywb preview)
+                                                     [string] [default: "capture"]
+      --headless     Run in headless mode, otherwise start xvfb
+                                                       [boolean] [default: false]
+      --driver       JS driver for the crawler
+           [string] [default: "/Users/ilya/work/browsertrix-crawler/defaultDriver.js"]
+      --generateCDX  If set, generate index (CDXJ) for use with pywb after crawl
+                     is done                           [boolean] [default: false]
 ```

-The puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log.
+For the `--waitUntil` flag, see [page.goto waitUntil options](https://github.com/puppeteer/puppeteer/blob/main/docs/api.md#pagegotourl-options).

-With the above example, when the crawl is finished, you can run pywb and browse the collection from: `http://localhost:8080/my-crawl/https://www.iana.org/`
+The default is `load`, but for static sites, `--waitUntil domcontentloaded` may be used to speed up the crawl (to avoid waiting for ads to load, for example),
+while `--waitUntil networkidle0` may make sense for dynamic sites.

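To illustrate the trade-off described above, here is a minimal sketch using the `crawler` service defined in this commit's docker-compose file; the URLs are placeholders:

```bash
# Static site: skip waiting for ads/trackers, continue once the DOM is ready
docker-compose run crawler crawl --url https://example.com/ --waitUntil domcontentloaded

# JavaScript-heavy site: wait until network activity has gone idle
docker-compose run crawler crawl --url https://example.org/ --waitUntil networkidle0
```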
+### Example Usage
+
+#### With Docker-Compose
+
+The Docker Compose file can simplify building and running a crawl, and includes some required settings for `docker run`, including mounting a volume.
+
+For example, the following commands demonstrate building the image and running a simple crawl with 2 workers:
+
+```
+docker-compose build
+docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --collection wr-net --workers 2
+```
+
+While the crawl is running, puppeteer-cluster provides monitoring output which is enabled by default and prints the crawl status to the Docker log.
+
+The output is written to `./crawls/collections/wr-net` by default.
+
+When done, you can even use the browsertrix-crawler image to also start a local [pywb](https://github.com/webrecorder/pywb) instance
+to preview the crawl:
+
+```
+cd crawls
+docker run -it -p 8080:8080 webrecorder/browsertrix-crawler pywb
+```
+
+Then, loading `http://localhost:8080/wr-net/https://webrecorder.net/` should show a recent crawl of the `https://webrecorder.net/` site.
+
+#### With `docker run`
+
+Browsertrix Crawler can of course also be run directly with `docker run`, but this requires a few more options.
+
+In particular, the `--cap-add` and `--shm-size`
+flags are [needed to run Chrome in Docker](https://github.com/puppeteer/puppeteer/blob/v1.0.0/docs/troubleshooting.md#tips).
+
+```bash
+docker run -v $PWD/crawls:/output --cap-add=SYS_ADMIN --cap-add=NET_ADMIN --shm-size=1g -it webrecorder/browsertrix-crawler --url https://webrecorder.net/ --workers 2
+```

 Support
 -------

-Initial support for development of Browsertrix Core, was provided by [Kiwix](https://kiwix.org/)
+Initial support for development of Browsertrix Crawler was provided by [Kiwix](https://kiwix.org/)

-Initial functionality for Browsertrix Core was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between
+Initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between
 Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder.

@@ -83,6 +83,7 @@ class Crawler {
         alias: "u",
         describe: "The URL to start crawling from",
         type: "string",
+        demandOption: true,
       },

       "workers": {
@@ -131,7 +132,7 @@ class Crawler {

       "collection": {
         alias: "c",
-        describe: "Collection Name",
+        describe: "Collection name to crawl to (replay will be accessible under this name in pywb preview)",
         type: "string",
         default: "capture"
       },
@@ -148,7 +149,7 @@ class Crawler {
         default: path.join(__dirname, "defaultDriver.js"),
       },

-      "generate-cdx": {
+      "generateCDX": {
         describe: "If set, generate index (CDXJ) for use with pywb after crawl is done",
         type: "boolean",
         default: false,
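A rough sketch of what these option changes mean on the command line: `--url` is now mandatory (`demandOption: true`), and the CDX flag is spelled `--generateCDX` rather than `--generate-cdx`. The invocations below are illustrative and reuse the docker-compose service from this commit:

```bash
# Expected to fail: yargs should reject the call because the required --url is missing
docker-compose run crawler crawl --workers 2

# Expected to work: URL supplied, CDX index generated with the renamed camelCase flag
docker-compose run crawler crawl --url https://example.com/ --generateCDX --collection my-crawl
```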
docker-compose.yml

@@ -1,11 +1,14 @@
 version: '3.5'

 services:
-  browsertrix-crawler:
+  crawler:
     image: webrecorder/browsertrix-crawler
     build:
       context: ./

+    volumes:
+      - ./crawls:/output
+
     cap_add:
       - NET_ADMIN
       - SYS_ADMIN
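With the added volume mapping, anything the crawler writes to `/output` inside the container lands in `./crawls` on the host, under the chosen collection name. A small sketch (the collection name matches the README example):

```bash
# Run a crawl through the compose service defined above
docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --collection wr-net

# The pywb collection then appears on the host
ls ./crawls/collections/wr-net
```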