
MITLibraries / browsertrix-harvester / build 6303100575

25 Sep 2023 06:02PM UTC coverage: 58.583%. First build.

Pull Request #2 (github): Timx 247 initial build
Committed by ghukill: "expected btrix CLI arg ordering"

50 of 50 new or added lines in 4 files covered. (100.0%)
215 of 367 relevant lines covered (58.58%)
0.59 hits per line

Source File: /browsertrix_harvester/crawl.py

75 of 78 relevant lines in this file covered (96.15%). Every covered line is hit once; the three uncovered lines are those in _build_subprocess_command() that parse and apply btrix_args_json.
"""browsertrix_harvester.crawl"""

import json
import logging
import os
import shutil
import subprocess

import smart_open  # type: ignore[import]

from browsertrix_harvester.exceptions import ConfigYamlError
from browsertrix_harvester.utils import require_container

logger = logging.getLogger(__name__)


class Crawler:
    """Class that manages browsertrix crawls."""

    DOCKER_CONTAINER_CONFIG_YAML_FILEPATH = "/btrixharvest/crawl-config.yaml"

    # ruff: noqa: FBT001, FBT002
    def __init__(
        self,
        crawl_name: str,
        config_yaml_filepath: str,
        sitemap_from_date: str | None = None,
        num_workers: int = 4,
        btrix_args_json: str | None = None,
    ) -> None:
        self.crawl_name = crawl_name
        self.config_yaml_filepath = config_yaml_filepath
        self.sitemap_from_date = sitemap_from_date
        self.num_workers = num_workers
        self.btrix_args_json = btrix_args_json

    @property
    def crawl_output_dir(self) -> str:
        return f"/crawls/collections/{self.crawl_name}"

    @property
    def wacz_filepath(self) -> str:
        """Location of WACZ archive after crawl is completed."""
        return f"{self.crawl_output_dir}/{self.crawl_name}.wacz"

    @require_container
    def crawl(self) -> tuple[int, list[str], list[str]]:
        """Perform a browsertrix crawl.

        This method is decorated with @require_container, which prevents it from
        running unless inside a Docker container that has the alias executable
        "crawl", a symlink to the Browsertrix node application.

        The crawl itself is invoked via a subprocess OS command that runs and waits
        for the crawl to complete.
        """
        # copy config yaml to known, local file location
        self._copy_config_yaml_local()

        # remove pre-existing crawl
        self._remove_previous_crawl()

        # build subprocess command
        cmd = self._build_subprocess_command()

        stdout, stderr = [], []
        # ruff: noqa: S603
        with subprocess.Popen(
            cmd,
            cwd="/crawls",
            env=self._get_subprocess_env_vars(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        ) as process:
            if process.stdout is not None:  # pragma: no cover
                for line in process.stdout:
                    # ruff: noqa: PLW2901
                    line = line.strip()
                    if line is not None and line != "":
                        logger.debug(line)
                        stdout.append(line)
            if process.stderr is not None:  # pragma: no cover
                for line in process.stderr:
                    # ruff: noqa: PLW2901
                    line = line.strip()
                    if line is not None and line != "":
                        logger.debug(line)
                        stderr.append(line)
            return_code = process.wait()
        return return_code, stdout, stderr

    def _copy_config_yaml_local(self) -> None:
        """Download and/or copy config YAML to expected location."""
        logger.info(
            "creating docker container copy of config YAML from: %s",
            self.config_yaml_filepath,
        )
        try:
            with smart_open.open(
                self.config_yaml_filepath, "rb"
            ) as f_in, smart_open.open(
                self.DOCKER_CONTAINER_CONFIG_YAML_FILEPATH, "wb"
            ) as f_out:
                f_out.write(f_in.read())
        except Exception as e:
            logger.exception(
                "could not open file locally or from S3: %s", self.config_yaml_filepath
            )
            raise ConfigYamlError from e

    def _remove_previous_crawl(self) -> None:
        """Remove previous crawl if one exists.

        Browsertrix will APPEND to previous crawls -- WARC files, indexed data,
        etc. -- if the crawl directory already exists.  While the WACZ file will be
        overwritten, this can still introduce unneeded complexity for a container
        that should only ever have one crawl per invocation.
        """
        if os.path.exists(self.crawl_output_dir):
            logger.warning("removing pre-existing crawl at: %s", self.crawl_output_dir)
            shutil.rmtree(self.crawl_output_dir)

    def _build_subprocess_command(self) -> list:
        """Build subprocess command that will execute browsertrix-crawler with args.

        Builds a dictionary of key/value pairs from defaults defined here, common
        arguments broken out as explicit CLI arguments, and any additional
        browsertrix-accepted arguments passed as a JSON string via the CLI.  They
        are applied and overridden in that order, then serialized as a flat list
        for the OS command.
        """
        # build base command
        cmd = [
            "crawl",
            "--useSitemap",
        ]

        # default args
        btrix_args = {
            "--collection": self.crawl_name,
            "--config": self.DOCKER_CONTAINER_CONFIG_YAML_FILEPATH,
            "--logging": "stats",
        }

        # apply common arguments as standalone CLI arguments
        if self.num_workers is not None:
            btrix_args["--workers"] = str(self.num_workers)
        if self.sitemap_from_date is not None:
            btrix_args["--sitemapFromDate"] = self.sitemap_from_date

        # lastly, if JSON string of btrix args provided, parse and apply
        if self.btrix_args_json is not None:
            btrix_additional_args = json.loads(self.btrix_args_json)
            for arg_name, arg_value in btrix_additional_args.items():
                btrix_args[arg_name] = str(arg_value)

        # flatten to list and extend base command
        btrix_args_list = [item for sublist in btrix_args.items() for item in sublist]
        cmd.extend(btrix_args_list)

        logger.info(cmd)

        return cmd

    @staticmethod
    def _get_subprocess_env_vars() -> dict:
        """Prepare env vars for the subprocess that calls browsertrix-crawler.

        Browsertrix is a node application that runs in this container and relies on
        some global python libraries.  Because this CLI app, browsertrix-harvester,
        runs in a pipenv virtual environment, a couple of env vars must be UNSET
        when calling the OS command to crawl.
        """
        env_vars = dict(os.environ)
        env_vars.pop("VIRTUAL_ENV", None)
        return env_vars
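
As a usage sketch (not part of the covered source), the class above might be driven as follows, assuming the code runs inside the harvester Docker container so that @require_container permits crawl(); the config YAML path and the extra browsertrix-crawler flag shown are purely illustrative placeholders.

import json

from browsertrix_harvester.crawl import Crawler

# The S3 path and the "--timeout" value below are hypothetical examples, not
# values taken from the repository. Extra browsertrix-crawler arguments are
# passed as a JSON string whose keys are the literal CLI flags; values are
# stringified before being flattened into the subprocess command.
crawler = Crawler(
    crawl_name="example-crawl",
    config_yaml_filepath="s3://example-bucket/crawl-config.yaml",
    sitemap_from_date="2023-01-01",
    num_workers=2,
    btrix_args_json=json.dumps({"--timeout": 90}),
)

# Inside the container, crawl() shells out to the "crawl" executable and
# returns the exit code plus the captured stdout/stderr lines.
return_code, stdout, stderr = crawler.crawl()
print(return_code, crawler.wacz_filepath)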