• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mborsetti / webchanges / 24951690269

26 Apr 2026 07:56AM UTC coverage: 73.149% (-0.09%) from 73.235%
24951690269

push

github

mborsetti
Version 3.36.0

1525 of 2484 branches covered (61.39%)

Branch coverage included in aggregate %.

5291 of 6834 relevant lines covered (77.42%)

11.13 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

78.97
/webchanges/filters/_html.py
1
"""HTML/XML/CSS/XPath filters."""
2

3
# The code below is subject to the license contained in the LICENSE.md file, which is part of the source code.
4

5
from __future__ import annotations
15✔
6

7
import html
15✔
8
import importlib.util
15✔
9
import logging
15✔
10
import re
15✔
11
import warnings
15✔
12
from abc import ABC
15✔
13
from enum import Enum
15✔
14
from html.parser import HTMLParser
15✔
15
from typing import TYPE_CHECKING, Any
15✔
16
from urllib.parse import urljoin
15✔
17
from xml.dom import minidom
15✔
18

19
import html2text
15✔
20
from lxml import etree
15✔
21
from lxml.cssselect import CSSSelector
15✔
22

23
from webchanges.filters._base import AutoMatchFilter, FilterBase, RegexMatchFilter
15✔
24

25
if TYPE_CHECKING:
26
    from webchanges.jobs import JobBase
27

28
try:
15✔
29
    import bs4
15✔
30
except ImportError as e:  # pragma: has-bs4
×
31
    bs4 = str(e)  # ty:ignore[invalid-assignment]
×
32

33
try:
15✔
34
    import cssbeautifier
15✔
35
except ImportError as e:  # pragma: no cover
36
    cssbeautifier = str(e)  # ty:ignore[invalid-assignment]
37

38
try:
15✔
39
    import jsbeautifier
15✔
40
except ImportError as e:  # pragma: no cover
41
    jsbeautifier = str(e)  # ty:ignore[invalid-assignment]
42

43
logger = logging.getLogger(__name__)
15✔
44

45
__all__ = [
15✔
46
    'AutoMatchFilter',
47
    'RegexMatchFilter',
48
]
49

50

51
class BeautifyFilter(FilterBase):
    """Beautify HTML (requires Python package ``BeautifulSoup`` and optionally ``jsbeautifier`` and/or
    ``cssbeautifier``).
    """

    __kind__ = 'beautify'

    __supported_subfilters__: dict[str, str] = {
        'absolute_links': 'Convert relative links to absolute ones.',
        'indent': 'Number of spaces by which to indent HTML output.',
    }

    __default_subfilter__ = 'indent'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Filter (process) the data.

        :param data: The data to be filtered (processed).
        :param subfilter: The subfilter information.
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        if isinstance(bs4, str):
            # bs4 holds the ImportError message when BeautifulSoup could not be imported.
            self.raise_import_error('BeautifulSoup', self.__kind__, bs4)

        features = 'lxml' if importlib.util.find_spec('lxml') is not None else 'html'
        soup = bs4.BeautifulSoup(data, features=features)

        # Beautify inline JavaScript when the optional jsbeautifier package is importable.
        if isinstance(jsbeautifier, str):
            logger.warning(
                f"Python package 'jsbeautifier' cannot be imported; will not beautify <script> tags"
                f' ({self.job.get_indexed_location()})\n{jsbeautifier}'
            )
        else:
            for script_tag in soup.find_all('script'):
                if script_tag.string:
                    script_tag.string = jsbeautifier.beautify(script_tag.string)

        # Beautify inline CSS when the optional cssbeautifier package is importable.
        if isinstance(cssbeautifier, str):
            logger.warning(
                "Python package 'cssbeautifier' cannot be imported; will not beautify <style> tags"
                f' ({self.job.get_indexed_location()})\n{cssbeautifier}'
            )
        else:
            for style_tag in soup.find_all('style'):
                if style_tag.string:
                    style_tag.string = cssbeautifier.beautify(style_tag.string)

        # Absolute links are on by default; only an explicitly falsy subfilter value disables them.
        absolute_links = subfilter.get('absolute_links')
        if absolute_links is None or absolute_links:
            for anchor in soup.find_all('a', href=True):
                anchor['href'] = urljoin(self.job.url, anchor['href'])

        indent = subfilter.get('indent', 1)
        return soup.prettify(formatter=bs4.formatter.HTMLFormatter(indent=indent)), mime_type
108

109

110
class AbsoluteLinksFilter(FilterBase):
    """Replace relative HTML <a> href links with absolute ones."""

    __kind__ = 'absolute_links'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Rewrite URL-carrying attributes in the document to absolute URLs based on the job's URL.

        :param data: The data to be filtered (processed).
        :param subfilter: The subfilter information (unused).
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        tree = etree.HTML(data)
        node: etree._Element
        # (XPath selector, attribute name) pairs; note 'data' is only rewritten on <object> elements.
        for selector, attribute in (
            ('//*[@action]', 'action'),
            ('//object[@data]', 'data'),
            ('//*[@href]', 'href'),
            ('//*[@src]', 'src'),
        ):
            for node in tree.xpath(selector):  # ty:ignore[invalid-assignment, not-iterable]
                node.attrib[attribute] = urljoin(self.job.url, node.attrib[attribute])
        return etree.tostring(tree, encoding='unicode', method='html'), mime_type
127

128

129
class Html2TextFilter(FilterBase):
    """Convert a string consisting of HTML to Unicode plain text for easy difference checking."""

    __kind__ = 'html2text'

    __supported_subfilters__: dict[str, str] = {
        'method': 'Method to use for conversion (html2text [default], bs4, or strip_tags)',
        'separator': 'bs4: Strings will be concatenated using this separator',
        'strip': 'bs4: If True, strings will be stripped before being concatenated',
        '<any>': 'html2text: Library-specific options (see '
        'https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options)',
    }

    __default_subfilter__ = 'method'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Filter (process) the data.

        Subfilter key can be ``method`` and any method-specific option to be passed to it.
        The following ``method`` keys are supported:

        * ``html2text`` (default): Use html2text Python library to extract text (in Markdown).

          * options: See
            https://github.com/Alir3z4/html2text/blob/master/docs/usage.md#available-options,
            however the following options are set to non-default values:

            * ``unicode_snob = True``
            * ``body_width = 0``
            * ``ignore_images = True``
            * ``single_line_break = True``
            * ``wrap_links = False``

        * ``bs4``: Use Beautiful Soup Python library to extract plain text.

          * options:

            * parser: the type of markup you want to parse (currently supported are ``html``, ``xml``, and ``html5``)
              or the name of the parser library you want to use (currently supported options are ``lxml``,
              ``html5lib`` and ``html.parser``) as per
              https://www.crummy.com/software/BeautifulSoup/bs4/doc/#specifying-the-parser-to-use.  Different parsers
              are compared at https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser.
              Note: ``html5lib``requires having the ``html5lib`` Python package already installed. Defaults to 'lxml'.
            * separator: Strings will be concatenated using this separator. Defaults to `````` (empty string).
            * strip: If True, strings will be stripped before being concatenated. Defaults to False.

        * ``strip_tags``: A simple and fast regex-based HTML tag stripper.

        :param data: The data to be filtered (processed).
        :param subfilter: The subfilter information.
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        # extract method and options from subfilter, defaulting to method html2text
        if not isinstance(data, str):
            raise ValueError
        options = subfilter.copy()
        method = options.pop('method', 'html2text')

        if method in {'html2text', 'pyhtml2text'}:  # pythtml2text for backward compatibility
            if method == 'pyhtml2text':
                warnings.warn(
                    f"Filter html2text's method 'pyhtml2text' is deprecated: remove method as it's now the "
                    f"filter's default ({self.job.get_indexed_location()})",
                    DeprecationWarning,
                    stacklevel=1,
                )
            parser = html2text.HTML2Text()
            # Non-default html2text settings (documented in the docstring above).
            parser.unicode_snob = True
            parser.body_width = 0
            parser.ignore_images = True
            parser.single_line_break = True
            parser.wrap_links = False
            if hasattr(self.job, 'url'):
                # Allows html2text to resolve relative links against the job's URL.
                parser.baseurl = self.job.url
            # Any remaining subfilter keys are passed straight through as html2text attributes.
            for k, v in options.items():
                setattr(parser, k.lower(), v)
                if k == 'pad_tables':
                    # Remember table padding on the job so downstream rendering can account for it.
                    self.job.markdown_padded_tables = v

            # html2text returns lines with spaces at the end even if they are ignored when rendered
            return '\n'.join(line.rstrip() for line in parser.handle(data).splitlines()), 'text/markdown'

        if method == 'bs4':
            if isinstance(bs4, str):
                # bs4 holds the ImportError message when BeautifulSoup could not be imported.
                self.raise_import_error('BeautifulSoup', self.__kind__, bs4)

            # Prefer the lxml parser when installed, otherwise fall back to the built-in 'html' one.
            default_bs4_parser = 'lxml' if importlib.util.find_spec('lxml') is not None else 'html'
            bs4_parser: str = options.pop('parser', default_bs4_parser)
            try:
                soup = bs4.BeautifulSoup(data, features=bs4_parser)
            except bs4.FeatureNotFound:
                raise ValueError(  # noqa: B904
                    f"Filter html2text's method 'bs4' has been invoked with parser '{bs4_parser}', which is either not "
                    f'installed or is not supported by Beautiful Soup. Please refer to the documentation at '
                    f'https://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser. '
                    f'({self.job.get_indexed_location()})'
                )
            separator: str = options.pop('separator', '')
            strip: bool = options.pop('strip', False)
            return soup.get_text(separator=separator, strip=strip), 'text/plain'

        if method in {'strip_tags', 're'}:  # re for backward compatibility
            if method == 're':
                warnings.warn(
                    f"Filter html2text's method 're' is deprecated: replace with 'strip_tags' "
                    f'({self.job.get_indexed_location()})',
                    DeprecationWarning,
                    stacklevel=1,
                )
            # Drop anything between angle brackets, then decode HTML entities; blank lines are removed.
            stripped_tags = html.unescape(re.sub(r'<[^>]*>', '', data))
            return '\n'.join((line.rstrip() for line in stripped_tags.splitlines() if line.strip() != '')), 'text/plain'

        if method == 'lynx':
            raise NotImplementedError(
                f"Filter html2text's method 'lynx' is no longer supported; for similar results, use the filter without "
                f'specifying a method. ({self.job.get_indexed_location()})'
            )

        raise ValueError(f"Unknown method {method} for filter 'html2text'. ({self.job.get_indexed_location()})")
248

249

250
class FormatXMLFilter(FilterBase):
    """Convert to formatted XML using lxml.etree."""

    __kind__ = 'format-xml'

    __no_subfilter__ = True

    # __supported_subfilters__: dict[str, str] = {
    #     'indentation': 'Indentation level for pretty-printing',
    # }
    #
    # __default_subfilter__ = 'indentation'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Parse the data as XML and return it pretty-printed.

        :param data: The data to be filtered (processed).
        :param subfilter: The subfilter information (unused).
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        document = etree.XML(data)
        out_mime_type = mime_type if mime_type.endswith('xml') else 'application/xml'
        return etree.tostring(document, encoding='unicode', pretty_print=True), out_mime_type
268

269

270
class PrettyXMLFilter(FilterBase):
    """Pretty-print XML using xml.dom.minidom."""

    __kind__ = 'pretty-xml'

    __supported_subfilters__: dict[str, str] = {
        'indentation': 'Indentation level for pretty-printing',
    }

    __default_subfilter__ = 'indentation'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Parse the data as XML and return it pretty-printed with the configured indentation.

        :param data: The data to be filtered (processed).
        :param subfilter: The subfilter information ('indentation' defaults to 2 spaces).
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        indent_width = int(subfilter.get('indentation', 2))
        document = minidom.parseString(data)  # noqa: S318 use defusedxml.
        out_mime_type = mime_type if mime_type.endswith('xml') else 'application/xml'
        return document.toprettyxml(indent=' ' * indent_width), out_mime_type
286

287

288
class FilterBy(Enum):
15✔
289
    ATTRIBUTE = 1
15✔
290
    TAG = 2
15✔
291

292

293
class ElementsBy(HTMLParser, ABC):
15✔
294
    def __init__(self, filter_by: FilterBy, name: str, value: Any = None) -> None:  # noqa: ANN401 Dynamically typed expressions Any are disallowed
15✔
295
        super().__init__()
15✔
296

297
        self._filter_by = filter_by
15✔
298
        if self._filter_by == FilterBy.ATTRIBUTE:
15✔
299
            self._attributes = {name: value}
15✔
300
        else:
301
            # FilterBy.TAG
302
            self._name = name
15✔
303

304
        self._result: list[str] = []
15✔
305
        self._inside: bool = False
15✔
306
        self._elts: list[str] = []
15✔
307

308
    def get_html(self) -> str:
15✔
309
        return ''.join(self._result)
15✔
310

311
    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
15✔
312
        ad = dict(attrs)
15✔
313

314
        if (self._filter_by == FilterBy.ATTRIBUTE and all(ad.get(k) == v for k, v in self._attributes.items())) or (
15✔
315
            self._filter_by == FilterBy.TAG and tag == self._name
316
        ):
317
            self._inside = True
15✔
318

319
        if self._inside:
15✔
320
            self._result.append(f'<{tag}{" " if attrs else ""}%s>' % ' '.join(f'{k}="{v}"' for k, v in attrs))
15✔
321
            self._elts.append(tag)
15✔
322

323
    def handle_endtag(self, tag: str) -> None:
15✔
324
        if self._inside:
15✔
325
            self._result.append(f'</{tag}>')
15✔
326
            if tag in self._elts:
15!
327
                t = self._elts.pop()
15✔
328
                while t != tag and self._elts:
15!
329
                    t = self._elts.pop()
×
330
            if not self._elts:
15✔
331
                self._inside = False
15✔
332

333
    def handle_data(self, data: str) -> None:
15✔
334
        if self._inside:
15✔
335
            self._result.append(data)
15✔
336

337

338
class ElementByIdFilter(FilterBase):
    """Get all HTML elements matching an ID."""

    __kind__ = 'element-by-id'

    __supported_subfilters__: dict[str, str] = {
        'id': 'ID of the element to filter for (required)',
    }

    __default_subfilter__ = 'id'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Return the markup of elements whose ``id`` attribute equals the requested value.

        :param data: The data to be filtered (processed); must be a string.
        :param subfilter: The subfilter information; 'id' is required.
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        if not isinstance(data, str):
            raise ValueError
        if 'id' not in subfilter:
            raise ValueError(
                f"The 'element-by-id' filter needs an id for filtering. ({self.job.get_indexed_location()})"
            )

        parser = ElementsBy(FilterBy.ATTRIBUTE, 'id', subfilter['id'])
        parser.feed(data)
        return parser.get_html(), mime_type
360

361

362
class ElementByClassFilter(FilterBase):
    """Get all HTML elements matching a class."""

    __kind__ = 'element-by-class'

    __supported_subfilters__: dict[str, str] = {
        'class': 'HTML class attribute to filter for (required)',
    }

    __default_subfilter__ = 'class'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Return the markup of elements whose ``class`` attribute equals the requested value.

        :param data: The data to be filtered (processed); must be a string.
        :param subfilter: The subfilter information; 'class' is required.
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        if not isinstance(data, str):
            raise ValueError
        if 'class' not in subfilter:
            raise ValueError(
                f"The 'element-by-class' filter needs a class for filtering. ({self.job.get_indexed_location()})"
            )

        parser = ElementsBy(FilterBy.ATTRIBUTE, 'class', subfilter['class'])
        parser.feed(data)
        return parser.get_html(), mime_type
384

385

386
class ElementByStyleFilter(FilterBase):
    """Get all HTML elements matching a style."""

    __kind__ = 'element-by-style'

    __supported_subfilters__: dict[str, str] = {
        'style': 'HTML style attribute value to filter for (required)',
    }

    __default_subfilter__ = 'style'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Return the markup of elements whose ``style`` attribute equals the requested value.

        :param data: The data to be filtered (processed); must be a string.
        :param subfilter: The subfilter information; 'style' is required.
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        if not isinstance(data, str):
            raise ValueError
        if 'style' not in subfilter:
            raise ValueError(
                f"The 'element-by-style' filter needs a style for filtering. ({self.job.get_indexed_location()})"
            )

        parser = ElementsBy(FilterBy.ATTRIBUTE, 'style', subfilter['style'])
        parser.feed(data)
        return parser.get_html(), mime_type
408

409

410
class ElementByTagFilter(FilterBase):
    """Get all HTML elements matching a tag."""

    __kind__ = 'element-by-tag'

    __supported_subfilters__: dict[str, str] = {
        'tag': 'HTML tag name to filter for (required)',
    }

    __default_subfilter__ = 'tag'

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Return the markup of all elements with the requested tag name.

        :param data: The data to be filtered (processed); must be a string.
        :param subfilter: The subfilter information; 'tag' is required.
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        if not isinstance(data, str):
            raise ValueError
        if 'tag' not in subfilter:
            raise ValueError(
                f"The 'element-by-tag' filter needs a tag for filtering. ({self.job.get_indexed_location()})"
            )

        parser = ElementsBy(FilterBy.TAG, subfilter['tag'])
        parser.feed(data)
        return parser.get_html(), mime_type
432

433

434
class LxmlParser:
    """Shared engine for the 'css' and 'xpath' filters: parses HTML/XML with lxml, applies a selector
    expression (and optional exclusions), and serializes the matched elements back to text.
    """

    # Human-readable expression names used in error messages, keyed by filter kind.
    EXPR_NAMES: dict[str, str] = {
        'css': 'a CSS selector',
        'xpath': 'an XPath expression',
    }

    expression: str
    method: str
    namespaces: dict[str, str] | None
    parser: etree._FeedParser
    skip: int

    def __init__(
        self,
        filter_kind: str,
        subfilter: dict[str, Any],
        expr_key: str,
        job: JobBase,
    ) -> None:
        """Validate the subfilter and capture its settings.

        :param filter_kind: 'css' or 'xpath'.
        :param subfilter: The subfilter information.
        :param expr_key: The subfilter key holding the selector/expression (e.g. 'selector' or 'path').
        :param job: The job, used only for error-message context.
        :raises ValueError: If the method is invalid, the expression is missing, or namespaces are used
            with 'method: html'.
        """
        self.filter_kind = filter_kind
        self.method = subfilter.get('method', 'html')
        if self.method not in {'html', 'xml'}:
            raise ValueError(
                f"The '{filter_kind}' filter's method must be 'html' or 'xml', got '{self.method}'. "
                f'({job.get_indexed_location()})'
            )
        if expr_key not in subfilter:
            raise ValueError(
                f"The '{filter_kind}' filter needs {self.EXPR_NAMES[filter_kind]} for filtering. "
                f'({job.get_indexed_location()})'
            )
        self.expression = subfilter[expr_key]
        self.exclude = subfilter.get('exclude')
        self.namespaces = subfilter.get('namespaces')
        self.skip = int(subfilter.get('skip', 0))
        self.sort_items = bool(subfilter.get('sort', False))
        self.maxitems = int(subfilter.get('maxitems', 0))
        if self.method == 'html' and self.namespaces:
            # lxml's HTML parser has no namespace support; only 'method: xml' can use prefixes.
            raise ValueError(
                f"The '{filter_kind}' filter's namespace prefixes are only supported with 'method: xml'. "
                f'({job.get_indexed_location()})'
            )
        self.data = ''

    def feed(self, data: str) -> None:
        """Accumulate input text to be parsed by get_filtered_data()."""
        self.data += data

    @staticmethod
    def _to_string(element: etree._Element | str, method: str) -> str:
        """Serialize a matched element (or pass through a plain string result) stripped of whitespace."""
        # Handle "/text()" selector, which returns lxml.etree._ElementUnicodeResult
        # (https://github.com/thp/urlwatch/issues/282)
        if isinstance(element, str):
            return element

        return etree.tostring(element, encoding='unicode', method=method, pretty_print=True, with_tail=False).strip()

    @staticmethod
    def _remove_element(element: etree._Element) -> None:
        """Remove an excluded node from the tree, preserving surrounding text (tails)."""
        parent = element.getparent()
        if parent is None:
            # Do not exclude root element
            return
        if isinstance(element, etree._ElementUnicodeResult):
            # Text-like results are cleared on the parent rather than removed as nodes.
            if element.is_tail:
                parent.tail = None
            elif element.is_text:
                parent.text = None
            elif element.is_attribute:
                del parent.attrib[element.attrname]
        else:
            previous = element.getprevious()
            if element.tail is not None:
                # Reattach the element's tail text so it isn't lost when the element is removed.
                if previous is not None:
                    previous.tail = previous.tail + element.tail if previous.tail else element.tail
                else:
                    parent.text = parent.text + element.tail if parent.text else element.tail
            parent.remove(element)

    def _reevaluate(self, element: etree._Element) -> etree._Element | str | None:
        """Re-fetch a previously-selected node after exclusions ran; None if it no longer exists."""
        if self._orphaned(element):
            return None
        if isinstance(element, etree._ElementUnicodeResult):
            # Text-like results may have been changed by _remove_element; read the current value.
            parent = element.getparent()
            if parent is None:
                return element
            if element.is_tail:
                return parent.tail
            if element.is_text:
                return parent.text
            if element.is_attribute:
                return parent.attrib.get(element.attrname)
            return element
        return element

    def _orphaned(self, element: etree._Element) -> bool:
        """Return True when the node was detached from the tree by an exclusion."""
        if isinstance(element, etree._ElementUnicodeResult):
            parent = element.getparent()
            if (
                (element.is_tail and parent.tail is None)  # ty:ignore[unresolved-attribute]
                or (element.is_text and parent.text is None)  # ty:ignore[unresolved-attribute]
                or (element.is_attribute and parent.attrib.get(element.attrname) is None)  # ty:ignore[unresolved-attribute]
            ):
                return True
            # Check attachment via the owning element instead of the text result.
            element = parent  # ty:ignore[invalid-assignment]
        try:
            # A node is still attached iff resolving its own path in the tree yields it back.
            tree = element.getroottree()
            path = tree.getpath(element)
            return element is not tree.xpath(path, namespaces=self.namespaces)[0]  # ty:ignore[not-subscriptable]
        except (ValueError, IndexError):
            return True

    def _get_filtered_elements(
        self,
        job_index_number: int | None = None,
    ) -> list[etree._Element | str]:
        """Parse the accumulated data and return matched nodes after applying exclusions.

        :param job_index_number: Job index used only in error messages.
        :raises RuntimeError: If parsing fails (e.g. wrong 'method' for the data).
        :raises ValueError: If the selector/expression is invalid.
        """
        if self.method == 'xml' and isinstance(self.data, str):
            # see https://lxml.de/FAQ.html#why-can-t-lxml-parse-my-xml-from-unicode-strings
            data: str | bytes = self.data.encode(errors='xmlcharrefreplace')
        elif self.method == 'html' and self.data.startswith('<?xml'):
            # handle legacy https://stackoverflow.com/questions/37592045/
            data = self.data.split('>', maxsplit=1)[1]
        else:
            data = self.data
        try:
            root = etree.XML(data) if self.method == 'xml' else etree.HTML(data)
        except ValueError as e:
            args = (
                f"Filter '{self.filter_kind}' encountered the following error when parsing the data. Check that "
                f"'method: {self.method}' is the correct one.\n    {type(e).__name__}: {e}"
            )
            raise RuntimeError(args) from None
        if root is None:
            return []
        selected_elems: list[etree._Element] | None = None
        excluded_elems: list[etree._Element] | None = None
        try:
            if self.filter_kind == 'css':
                selected_elems = CSSSelector(self.expression, namespaces=self.namespaces)(root)  # ty:ignore[invalid-assignment]
                excluded_elems = CSSSelector(self.exclude, namespaces=self.namespaces)(root) if self.exclude else None  # ty:ignore[invalid-assignment]

            elif self.filter_kind == 'xpath':
                selected_elems = root.xpath(self.expression, namespaces=self.namespaces)  # ty:ignore[invalid-assignment]
                excluded_elems = root.xpath(self.exclude, namespaces=self.namespaces) if self.exclude else None  # ty:ignore[invalid-assignment]
        except (etree.ParserError, etree.XMLSchemaError, etree.XPathError) as e:
            raise ValueError(f'Job {job_index_number} {type(e).__name__}: {e} {self.expression}') from e
        if excluded_elems is not None:
            for el in excluded_elems:
                self._remove_element(el)
        if isinstance(selected_elems, str):
            # XPath expressions like string(...) return a single string, not a node list.
            return [selected_elems]
        if selected_elems is not None:
            # Re-evaluate each selection since exclusions may have detached or altered nodes.
            return [el for el in map(self._reevaluate, selected_elems) if el is not None]
        return []

    def get_filtered_data(self, job_index_number: int | None = None) -> str:
        """Return the matched elements serialized and joined by newlines, after skip/maxitems/sort."""
        elements = self._get_filtered_elements(job_index_number)
        if self.skip:
            elements = elements[self.skip :]
        if self.maxitems:
            elements = elements[: self.maxitems]
        elementstrs = (self._to_string(element, self.method) for element in elements)
        return '\n'.join(sorted(elementstrs) if self.sort_items else elementstrs)
596

597

598
# Subfilter documentation shared by the 'css' and 'xpath' filters, both of which delegate to LxmlParser.
LXML_PARSER_COMMON_SUBFILTERS = {
    'method': 'The method (html or xml) used for parsing',
    'exclude': 'Elements to remove from the final result',
    'namespaces': 'Mapping of XML namespaces for matching',
    'skip': 'Number of elements to skip from the beginning (default: 0)',
    'maxitems': 'Maximum number of items to return (default: all)',
    'sort': 'Sort matched items after filtering (default: False)',
}
606

607

608
class CSSFilter(FilterBase):
    """Filter XML/HTML using CSS selectors."""

    __kind__ = 'css'

    __supported_subfilters__: dict[str, str] = {
        'selector': 'The CSS selector to use for filtering (required)',
        **LXML_PARSER_COMMON_SUBFILTERS,
    }

    __default_subfilter__ = 'selector'

    EXPR_NAMES: dict[str, str]
    expression: str
    exclude: str
    namespaces: dict[str, str]
    skip: int
    maxitems: int

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Apply the CSS selector (via LxmlParser) to the data.

        :param data: The data to be filtered (processed); must be a string.
        :param subfilter: The subfilter information; 'selector' is required.
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        if not isinstance(data, str):
            raise ValueError
        parser = LxmlParser('css', subfilter, 'selector', self.job)
        parser.feed(data)
        return parser.get_filtered_data(self.job.index_number), mime_type
633

634

635
class XPathFilter(FilterBase):
    """Filter XML/HTML using XPath expressions."""

    __kind__ = 'xpath'

    __supported_subfilters__: dict[str, str] = {
        'path': 'The XPath to use for filtering (required)',
        **LXML_PARSER_COMMON_SUBFILTERS,
    }

    __default_subfilter__ = 'path'

    EXPR_NAMES: dict[str, str]
    expression: str
    exclude: str
    namespaces: dict[str, str]
    skip: int
    maxitems: int

    def filter(self, data: str | bytes, mime_type: str, subfilter: dict[str, Any]) -> tuple[str | bytes, str]:
        """Apply the XPath expression (via LxmlParser) to the data.

        :param data: The data to be filtered (processed); must be a string.
        :param subfilter: The subfilter information; 'path' is required.
        :returns: The data and media type (fka MIME type) of the data after the filter has been applied.
        """
        if not isinstance(data, str):
            raise ValueError
        parser = LxmlParser('xpath', subfilter, 'path', self.job)
        parser.feed(data)
        return parser.get_filtered_data(self.job.index_number), mime_type
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc