
MITLibraries / browsertrix-harvester / build 6303100575

25 Sep 2023 06:02PM UTC coverage: 58.583%. First build.

Pull Request #2 (github): Timx 247 initial build
Committed by ghukill: "expected btrix CLI arg ordering"

50 of 50 new or added lines in 4 files covered. (100.0%)
215 of 367 relevant lines covered (58.58%)
0.59 hits per line

Source File: /browsertrix_harvester/crawl.py

75 of 78 relevant lines in this file covered (96.15%). Every covered line is hit once; the three uncovered lines are those in _build_subprocess_command() that parse and apply btrix_args_json.
"""browsertrix_harvester.crawl"""

import json
import logging
import os
import shutil
import subprocess

import smart_open  # type: ignore[import]

from browsertrix_harvester.exceptions import ConfigYamlError
from browsertrix_harvester.utils import require_container

logger = logging.getLogger(__name__)


class Crawler:
    """Class that manages browsertrix crawls."""

    DOCKER_CONTAINER_CONFIG_YAML_FILEPATH = "/btrixharvest/crawl-config.yaml"

    # ruff: noqa: FBT001, FBT002
    def __init__(
        self,
        crawl_name: str,
        config_yaml_filepath: str,
        sitemap_from_date: str | None = None,
        num_workers: int = 4,
        btrix_args_json: str | None = None,
    ) -> None:
        self.crawl_name = crawl_name
        self.config_yaml_filepath = config_yaml_filepath
        self.sitemap_from_date = sitemap_from_date
        self.num_workers = num_workers
        self.btrix_args_json = btrix_args_json

    @property
    def crawl_output_dir(self) -> str:
        return f"/crawls/collections/{self.crawl_name}"

    @property
    def wacz_filepath(self) -> str:
        """Location of WACZ archive after crawl is completed."""
        return f"{self.crawl_output_dir}/{self.crawl_name}.wacz"

    @require_container
    def crawl(self) -> tuple[int, list[str], list[str]]:
        """Perform a browsertrix crawl.

        This method is decorated with @require_container, which prevents it from
        running unless inside a Docker container that has the alias executable
        "crawl", a symlink to the Browsertrix node application.

        The crawl itself is invoked via a subprocess OS command that runs and waits
        for the crawl to complete.
        """
        # copy config yaml to known, local file location
        self._copy_config_yaml_local()

        # remove pre-existing crawl
        self._remove_previous_crawl()

        # build subprocess command
        cmd = self._build_subprocess_command()

        stdout, stderr = [], []
        # ruff: noqa: S603
        with subprocess.Popen(
            cmd,
            cwd="/crawls",
            env=self._get_subprocess_env_vars(),
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        ) as process:
            if process.stdout is not None:  # pragma: no cover
                for line in process.stdout:
                    # ruff: noqa: PLW2901
                    line = line.strip()
                    if line is not None and line != "":
                        logger.debug(line)
                        stdout.append(line)
            if process.stderr is not None:  # pragma: no cover
                for line in process.stderr:
                    # ruff: noqa: PLW2901
                    line = line.strip()
                    if line is not None and line != "":
                        logger.debug(line)
                        stderr.append(line)
            return_code = process.wait()
        return return_code, stdout, stderr

    def _copy_config_yaml_local(self) -> None:
        """Download and/or copy config YAML to expected location."""
        logger.info(
            "creating docker container copy of config YAML from: %s",
            self.config_yaml_filepath,
        )
        try:
            with smart_open.open(
                self.config_yaml_filepath, "rb"
            ) as f_in, smart_open.open(
                self.DOCKER_CONTAINER_CONFIG_YAML_FILEPATH, "wb"
            ) as f_out:
                f_out.write(f_in.read())
        except Exception as e:
            logger.exception(
                "could not open file locally or from S3: %s", self.config_yaml_filepath
            )
            raise ConfigYamlError from e

    def _remove_previous_crawl(self) -> None:
        """Remove previous crawl if one exists.

        Browsertrix will APPEND to previous crawls -- WARC files, indexed data,
        etc. -- if the crawl directory already exists.  While the WACZ file will be
        overwritten, this can still introduce unneeded complexity for a container
        that should only ever have one crawl per invocation.
        """
        if os.path.exists(self.crawl_output_dir):
            logger.warning("removing pre-existing crawl at: %s", self.crawl_output_dir)
            shutil.rmtree(self.crawl_output_dir)

    def _build_subprocess_command(self) -> list:
        """Build subprocess command that will execute browsertrix-crawler with args.

        Builds a dictionary of key/value pairs from defaults defined here, common
        arguments broken out as explicit CLI arguments, and any additional
        browsertrix-accepted arguments passed as a JSON string via the CLI.  They
        are applied and overridden in that order, then serialized as a flat list
        for the OS command.
        """
        # build base command
        cmd = [
            "crawl",
            "--useSitemap",
        ]

        # default args
        btrix_args = {
            "--collection": self.crawl_name,
            "--config": self.DOCKER_CONTAINER_CONFIG_YAML_FILEPATH,
            "--logging": "stats",
        }

        # apply common arguments as standalone CLI arguments
        if self.num_workers is not None:
            btrix_args["--workers"] = str(self.num_workers)
        if self.sitemap_from_date is not None:
            btrix_args["--sitemapFromDate"] = self.sitemap_from_date

        # lastly, if JSON string of btrix args provided, parse and apply
        if self.btrix_args_json is not None:
            btrix_additional_args = json.loads(self.btrix_args_json)
            for arg_name, arg_value in btrix_additional_args.items():
                btrix_args[arg_name] = str(arg_value)

        # flatten to list and extend base command
        btrix_args_list = [item for sublist in btrix_args.items() for item in sublist]
        cmd.extend(btrix_args_list)

        logger.info(cmd)

        return cmd

    @staticmethod
    def _get_subprocess_env_vars() -> dict:
        """Prepare env vars for the subprocess that calls browsertrix-crawler.

        Browsertrix is a node application that runs in this container and relies on
        some global python libraries.  Because this CLI app, browsertrix-harvester,
        runs in a pipenv virtual environment, a couple of env vars must be UNSET
        when calling the OS command to crawl.
        """
        env_vars = dict(os.environ)
        env_vars.pop("VIRTUAL_ENV", None)
        return env_vars
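
As a usage sketch (not part of the covered source), the class above might be driven as follows, assuming the code runs inside the harvester Docker container so that @require_container permits crawl(); the config YAML path and the extra browsertrix-crawler flag shown are purely illustrative placeholders.

import json

from browsertrix_harvester.crawl import Crawler

# The S3 path and the "--timeout" value below are hypothetical examples, not
# values taken from the repository. Extra browsertrix-crawler arguments are
# passed as a JSON string whose keys are the literal CLI flags; values are
# stringified before being flattened into the subprocess command.
crawler = Crawler(
    crawl_name="example-crawl",
    config_yaml_filepath="s3://example-bucket/crawl-config.yaml",
    sitemap_from_date="2023-01-01",
    num_workers=2,
    btrix_args_json=json.dumps({"--timeout": 90}),
)

# Inside the container, crawl() shells out to the "crawl" executable and
# returns the exit code plus the captured stdout/stderr lines.
return_code, stdout, stderr = crawler.crawl()
print(return_code, crawler.wacz_filepath)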