• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mborsetti / webchanges / 16871375955

11 Aug 2025 05:06AM UTC coverage: 72.561% (-1.9%) from 74.431%
16871375955

push

github

mborsetti
Version 3.31.1rc0

1749 of 2772 branches covered (63.1%)

Branch coverage included in aggregate %.

4574 of 5942 relevant lines covered (76.98%)

5.96 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

71.62
/webchanges/differs.py
1
"""Differs."""
2

3
# The code below is subject to the license contained in the LICENSE file, which is part of the source code.
4

5
from __future__ import annotations
8✔
6

7
import base64
8✔
8
import difflib
8✔
9
import html
8✔
10
import logging
8✔
11
import math
8✔
12
import os
8✔
13
import re
8✔
14
import shlex
8✔
15
import subprocess
8✔
16
import sys
8✔
17
import tempfile
8✔
18
import traceback
8✔
19
import urllib.parse
8✔
20
import warnings
8✔
21
from base64 import b64encode
8✔
22
from concurrent.futures import ThreadPoolExecutor
8✔
23
from datetime import datetime
8✔
24
from io import BytesIO
8✔
25
from pathlib import Path
8✔
26
from typing import TYPE_CHECKING, Any, Iterator, Literal, TypedDict
8✔
27
from xml.parsers.expat import ExpatError
8✔
28
from zoneinfo import ZoneInfo
8✔
29

30
import html2text
8✔
31
import yaml
8✔
32

33
from webchanges.jobs import JobBase
8✔
34
from webchanges.util import TrackSubClasses, linkify, mark_to_html
8✔
35

36
try:
8✔
37
    from deepdiff import DeepDiff
8✔
38
    from deepdiff.model import DiffLevel
8✔
39
except ImportError as e:  # pragma: no cover
40
    DeepDiff = str(e)  # type: ignore[assignment,misc]
41

42
try:
8✔
43
    import httpx
8✔
44
except ImportError:  # pragma: no cover
45
    httpx = None  # type: ignore[assignment]
46
if httpx is not None:
8!
47
    try:
8✔
48
        import h2
8✔
49
    except ImportError:  # pragma: no cover
50
        h2 = None  # type: ignore[assignment]
51

52
try:
8✔
53
    import numpy as np
8✔
54
except ImportError as e:  # pragma: no cover
55
    np = str(e)  # type: ignore[assignment]
56

57
try:
8✔
58
    from PIL import Image, ImageChops, ImageEnhance, ImageStat
8✔
59
except ImportError as e:  # pragma: no cover
60
    Image = str(e)  # type: ignore[assignment]
61

62
# https://stackoverflow.com/questions/712791
63
try:
8✔
64
    import simplejson as jsonlib
8✔
65
except ImportError:  # pragma: no cover
66
    import json as jsonlib  # type: ignore[no-redef]
67

68
try:
8✔
69
    import xmltodict
8✔
70
except ImportError as e:  # pragma: no cover
71
    xmltodict = str(e)  # type: ignore[assignment]
72

73
# https://stackoverflow.com/questions/39740632
74
if TYPE_CHECKING:
75
    from webchanges.handler import JobState
76
    from webchanges.storage import _ConfigDifferDefaults
77

78

79
logger = logging.getLogger(__name__)
8✔
80

81
AiGoogleDirectives = TypedDict(
8✔
82
    'AiGoogleDirectives',
83
    {
84
        'model': str,
85
        'additions_only': str,
86
        'system_instructions': str,
87
        'prompt': str,
88
        'prompt_ud_context_lines': int,
89
        'timeout': int,
90
        'max_output_tokens': int | None,
91
        'temperature': float | None,
92
        'top_p': float | None,
93
        'top_k': float | None,
94
        'thinking_budget': float | None,
95
        'tools': list[Any],
96
    },
97
    total=False,
98
)
99

100

101
class DifferBase(metaclass=TrackSubClasses):
    """The base class for differs.

    A concrete differ sets ``__kind__`` (the name it is looked up by in ``__subclasses__``), declares the
    sub-directives it accepts in ``__supported_directives__`` and implements :meth:`differ`.
    """

    # Differ classes keyed by their ``__kind__``; read by normalize_differ() and process().
    # NOTE(review): presumably populated by the TrackSubClasses metaclass -- confirm in webchanges.util.
    __subclasses__: dict[str, type[DifferBase]] = {}
    __anonymous_subclasses__: list[type[DifferBase]] = []

    __kind__: str = ''

    __supported_directives__: dict[str, str] = {}  # this must be present, even if empty

    # Inline CSS used when rendering diffs as HTML.
    css_added_style = 'background-color:#d1ffd1;color:#082b08;'
    css_deltd_style = 'background-color:#fff0f0;color:#9c1c1c;text-decoration:line-through;'
    css_remvd_style = 'text-decoration:line-through;'

    def __init__(self, state: JobState) -> None:
        """
        :param state: the JobState.
        """
        self.job = state.job
        self.state = state

    @classmethod
    def differ_documentation(cls) -> str:
        """Generates simple differ documentation for use in the --features command line argument.

        :returns: A string to display.
        """
        result: list[str] = []
        for sc in TrackSubClasses.sorted_by_kind(cls):
            result.append(f'  * {sc.__kind__} - {sc.__doc__}')
            if hasattr(sc, '__supported_directives__'):
                result.extend(f'      {key} ... {doc}' for key, doc in sc.__supported_directives__.items())
        result.append('\n[] ... Parameter can be supplied as unnamed value\n')
        return '\n'.join(result)

    @staticmethod
    def debugger_attached() -> bool:
        """Checks if the code is currently running within an external debugger (e.g. IDE).

        The check is whether ``sys.breakpointhook`` has been replaced by a hook defined outside the ``sys`` module.

        :returns: True if an external debugger is attached, False otherwise.
        """
        return sys.breakpointhook.__module__ != 'sys'

    @classmethod
    def normalize_differ(
        cls,
        differ_spec: dict[str, Any] | None,
        job_index_number: int | None = None,
        differ_defaults: _ConfigDifferDefaults | None = None,
    ) -> tuple[str, dict[str, Any]]:
        """Checks the differ_spec for its validity and applies default values.

        :param differ_spec: The differ as entered by the user; use "unified" if empty.
        :param job_index_number: The job index number (only used in error messages).
        :param differ_defaults: The differ defaults from the configuration, keyed by differ name; if None no
           defaults are applied.
        :returns: A validated differ_kind, directives tuple.
        :raises ValueError: If the differ has no name, the name is unknown, or it is given a sub-directive the
           differ does not support.
        """

        def directives_with_defaults(
            kind: str, directives: dict[str, Any], differ_defaults: _ConfigDifferDefaults | None = None
        ) -> dict[str, Any]:
            """Obtain differ subdirectives that also contains defaults from the configuration.

            :param kind: The name of the differ.
            :param directives: The differ directives as stated in the job (updated in place at the top level).
            :param differ_defaults: The differ defaults from the configuration.
            :returns: directives inclusive of configuration defaults.
            """
            if differ_defaults is None:
                logger.info('No configuration object found to look for differ defaults')
                return directives

            differ_default = differ_defaults.get(kind, {})
            if not isinstance(differ_default, dict):
                return directives

            # merge defaults from configuration (including dicts) into differ directives without overwriting them
            for key, value in differ_default.items():
                if key not in directives or directives[key] is None:
                    directives[key] = value
                elif isinstance(value, dict) and isinstance(directives[key], dict):
                    # ``directives`` is only a shallow copy of the user's spec, so the nested dict is still shared
                    # with the caller: merge into a fresh dict instead of mutating it in place.
                    merged = dict(directives[key])
                    for subkey, subvalue in value.items():
                        merged.setdefault(subkey, subvalue)
                    directives[key] = merged
            return directives

        differ_spec = differ_spec or {'name': 'unified'}
        directives = differ_spec.copy()
        differ_kind = directives.pop('name', '')
        if not differ_kind:
            # a spec consisting of just a 'command' sub-directive is shorthand for the command differ
            if list(directives) == ['command']:
                differ_kind = 'command'
            else:
                raise ValueError(
                    f"Job {job_index_number}: Differ directive must have a 'name' sub-directive: {differ_spec}."
                )

        differcls: type[DifferBase] | None = cls.__subclasses__.get(differ_kind, None)  # type: ignore[assignment]
        if not differcls:
            raise ValueError(f'Job {job_index_number}: No differ named {differ_kind}.')

        directives = directives_with_defaults(differ_kind, directives, differ_defaults)

        if hasattr(differcls, '__supported_directives__'):
            allowed_keys = set(differcls.__supported_directives__)
            unknown_keys = set(directives).difference(allowed_keys)
            if unknown_keys and '<any>' not in allowed_keys:
                # sorted so that the message is deterministic
                unknown_text = ', '.join(sorted(unknown_keys))
                allowed_text = ', '.join(sorted(allowed_keys))
                raise ValueError(
                    f'Job {job_index_number}: Differ {differ_kind} does not support sub-directive(s) '
                    f'{unknown_text} (supported: {allowed_text}).'
                )

        return differ_kind, directives

    @classmethod
    def process(
        cls,
        differ_kind: str,
        directives: dict[str, Any],
        job_state: JobState,
        report_kind: Literal['text', 'markdown', 'html'] = 'text',
        tz: ZoneInfo | None = None,
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Process the differ.

        If the differ raises, the latest snapshot is deleted from ``job_state``, the exception and traceback are
        recorded on it, and an error report in all three formats is returned instead (unless a debugger is
        attached, in which case the exception propagates).

        :param differ_kind: The name of the differ.
        :param directives: The directives.
        :param job_state: The JobState.
        :param report_kind: The report kind required.
        :param tz: The timezone of the report.
        :param _unfiltered_diff: Any previous diffs generated by the same filter, who can be used to generate a diff
           for a different report_kind.
        :returns: The output of the differ or an error message with traceback if it fails; an empty dict if no
           differ is registered under ``differ_kind``.
        """
        logger.info(f'Job {job_state.job.index_number}: Applying differ {differ_kind}, directives {directives}')
        differcls: type[DifferBase] | None = cls.__subclasses__.get(differ_kind)  # type: ignore[assignment]
        if not differcls:
            return {}

        try:
            return differcls(job_state).differ(directives, report_kind, _unfiltered_diff, tz)
        except Exception as e:
            # Differ failed
            if cls.debugger_attached():
                raise
            logger.info(
                f'Job {job_state.job.index_number}: Differ {differ_kind} with {directives=} encountered error {e}'
            )
            # Undo saving of new data since user won't see the diff
            job_state.delete_latest()

            job_state.exception = e
            job_state.traceback = job_state.job.format_error(e, traceback.format_exc())
            directives_text = (
                ', '.join(f'{key}={value}' for key, value in directives.items()) if directives else 'None'
            )
            # The traceback and directive values are arbitrary text: escape them so that '<', '>' and '&' cannot
            # break (or inject markup into) the HTML report.
            return {
                'text': (
                    f'Differ {differ_kind} with directive(s) {directives_text} encountered an '
                    f'error:\n\n{job_state.traceback}'
                ),
                'markdown': (
                    f'## Differ {differ_kind} with directive(s) {directives_text} '
                    f'encountered an error:\n```\n{job_state.traceback}\n```\n'
                ),
                'html': (
                    f'<span style="color:red;font-weight:bold">Differ {html.escape(differ_kind)} with directive(s) '
                    f'{html.escape(directives_text)} encountered an error:<br>\n<br>\n'
                    f'<span style="font-family:monospace;white-space:pre-wrap;">'
                    f'{html.escape(str(job_state.traceback))}'
                    f'</span></span>'
                ),
            }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Generate a formatted diff representation of data changes.

        Creates a diff representation in one or more output formats (text, markdown, or HTML).
        At minimum, this function must return output in the format specified by 'report_kind'.
        As results are memoized for performance optimization, it can generate up to all three formats simultaneously.

        The data to compare is taken from ``self.state``. Must be overridden by subclasses.

        :param directives: The directives.
        :param report_kind: The report_kind for which a diff must be generated (at a minimum).
        :param _unfiltered_diff: Any previous diffs generated by the same filter, who can be used to generate a diff
           for a different report_kind.
        :param tz: The timezone of the report.
        :returns: An empty dict if there is no change, otherwise a dict with report_kind as key and diff as value
           (as a minimum for the report_kind requested).
        :raises RuntimeError: If the external diff tool returns an error.
        :raises NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    @staticmethod
    def make_timestamp(
        timestamp: float,
        tz: ZoneInfo | None = None,
    ) -> str:
        """Format a timestamp as an RFC 5322 compliant datetime string.

        Converts a numeric timestamp to a formatted datetime string following the RFC 5322 (email) standard. When a
        timezone is provided, its full name, if known, is appended.

        :param timestamp: The timestamp.
        :param tz: The IANA timezone of the report.
        :returns: A datetime string in RFC 5322 (email) format or 'NEW' if timestamp is 0.
        """
        if not timestamp:
            return 'NEW'

        dt = datetime.fromtimestamp(timestamp).astimezone(tz=tz)
        tz_name = dt.strftime('%Z')
        # add timezone name if known, i.e. unless it merely repeats the start of the numeric offset (e.g. '+03')
        cfws = f' ({tz_name})' if tz_name != dt.strftime('%z')[:3] else ''
        return dt.strftime('%a, %d %b %Y %H:%M:%S %z') + cfws

    @staticmethod
    def html2text(data: str) -> str:
        """Converts html to text.

        :param data: the string in html format.
        :returns: the string in text format, with trailing whitespace stripped from every line.
        """
        # ``html2text`` here resolves to the module: class attributes are not visible from a method body
        parser = html2text.HTML2Text()
        parser.unicode_snob = True
        parser.body_width = 0
        parser.ignore_images = True
        parser.single_line_break = True
        parser.wrap_links = False
        return '\n'.join(line.rstrip() for line in parser.handle(data).splitlines())

    def raise_import_error(self, package_name: str, error_message: str) -> None:
        """Raise ImportError for missing package.

        :param package_name: The name of the module/package that could not be imported.
        :param error_message: The error message from ImportError.

        :raises: ImportError.
        """
        raise ImportError(
            f"Job {self.job.index_number}: Python package '{package_name}' is not installed; cannot use "
            f"'differ: {self.__kind__}' ({self.job.get_location()})\n{error_message}"
        )
363

364

365
class UnifiedDiffer(DifferBase):
    """(Default) Generates a unified diff."""

    __kind__ = 'unified'

    __supported_directives__: dict[str, str] = {
        'context_lines': 'the number of context lines (default: 3)',
        'range_info': 'include range information lines (default: true)',
        'additions_only': 'keep only addition lines (default: false)',
        'deletions_only': 'keep only deletion lines (default: false)',
    }

    def unified_diff_to_html(self, diff: str) -> Iterator[str]:
        """
        Generates a colorized HTML table from unified diff, applying styles and processing based on job values.

        :param diff: the unified diff
        :returns: An iterator over the lines of HTML (the opening ``<table>`` tag, one ``<tr>`` per diff line and
           the closing tag).
        :raises RuntimeError: If a diff line (past the two header lines) is empty or starts with an unexpected
           character.
        """

        def process_line(line: str, line_num: int, is_markdown: bool, monospace_style: str) -> str:
            """
            Processes each line for HTML output, handling special cases and styles.

            :param line: The line to analyze.
            :param line_num: The line number in the document.
            :param is_markdown: Whether the data is markdown (the line is then rendered with mark_to_html).
            :param monospace_style: Additional style string for monospace text.

            :returns: The line processed into an HTML table row string.
            """
            # Sliced rather than indexed so that an empty line ends in the RuntimeError below, not an IndexError.
            marker = line[:1]

            # The style= string (or empty string) to add to an HTML tag.
            if line_num == 0:
                style = 'font-family:monospace;color:darkred;'
            elif line_num == 1:
                style = 'font-family:monospace;color:darkgreen;'
            elif marker == '+':  # addition
                style = f'{monospace_style}{self.css_added_style}'
            elif marker == '-':  # deletion
                style = f'{monospace_style}{self.css_deltd_style}'
            elif marker == ' ':  # context line
                style = monospace_style
            elif marker == '@':  # range information
                style = 'font-family:monospace;background-color:#fbfbfb;'
            elif marker == '/':  # informational header added by additions_only or deletions_only filters
                style = 'background-color:lightyellow;'
            else:
                raise RuntimeError('Unified Diff does not conform to standard!')
            style = f' style="{style}"' if style else ''

            if line_num > 1 and marker != '@':  # don't apply to headers or range information
                if is_markdown or marker == '/':  # our informational header
                    line = mark_to_html(line[1:], self.job.markdown_padded_tables)
                else:
                    line = linkify(line[1:])
            return f'<tr><td{style}>{line}</td></tr>'

        table_style = (
            ' style="border-collapse:collapse;font-family:monospace;white-space:pre-wrap;"'
            if self.job.monospace
            else ' style="border-collapse:collapse;"'
        )
        yield f'<table{table_style}>'
        is_markdown = self.state.is_markdown()
        monospace_style = 'font-family:monospace;' if self.job.monospace else ''
        for i, line in enumerate(diff.splitlines()):
            yield process_line(line, i, is_markdown, monospace_style)
        yield '</table>'

    @staticmethod
    def _keep_only(diff: list[str], marker: str) -> list[str]:
        """Reduce a unified diff to the lines starting with ``marker`` plus the range information lines.

        Range information ('@') lines that are left without any content, i.e. directly followed by another such
        line or at the very end, are dropped.

        :param diff: The lines of the unified diff, including the two header lines.
        :param marker: '+' to keep additions (and the '+++' header), '-' to keep deletions (and the '---' header).
        :returns: The filtered lines, the first one being the header line matching ``marker``.
        """
        kept = [line for line in diff if line.startswith((marker, '@'))]
        kept = [
            line
            for line, following in zip(kept, [*kept[1:], ''], strict=False)
            if not (line.startswith('@') and following.startswith('@'))
        ]
        if kept[-1].startswith('@'):
            kept = kept[:-1]
        return kept

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Generate a unified diff between the old and the new data in ``self.state``.

        :param directives: The directives (``context_lines``, ``range_info``, ``additions_only``,
           ``deletions_only``); ``additions_only``, ``deletions_only`` and ``context_lines`` fall back to the
           job's values.
        :param report_kind: The report_kind for which a diff must be generated; 'html' is only produced when asked
           for.
        :param _unfiltered_diff: Any previously generated diffs; its 'text' entry is reused for an 'html' report.
        :param tz: The timezone of the report.
        :returns: A dict with the diff under 'text' and 'markdown' (when computed here) and 'html' (when
           requested). If there is nothing to report, ``self.state.verb`` is set to 'changed,no_report' and all
           three values are empty strings.
        """
        additions_only = directives.get('additions_only') or self.job.additions_only
        deletions_only = directives.get('deletions_only') or self.job.deletions_only
        out_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
        if report_kind == 'html' and _unfiltered_diff is not None and 'text' in _unfiltered_diff:
            diff_text = _unfiltered_diff['text']
        else:
            empty_return: dict[Literal['text', 'markdown', 'html'], str] = {'text': '', 'markdown': '', 'html': ''}
            contextlines = directives.get('context_lines', self.job.contextlines)
            if contextlines is None:
                contextlines = 0 if additions_only or deletions_only else 3
            diff = list(
                difflib.unified_diff(
                    str(self.state.old_data).splitlines(),
                    str(self.state.new_data).splitlines(),
                    fromfile='@',
                    tofile='@',
                    fromfiledate=self.make_timestamp(self.state.old_timestamp, tz),
                    tofiledate=self.make_timestamp(self.state.new_timestamp, tz),
                    n=contextlines,
                    lineterm='',
                )
            )
            if not diff:
                self.state.verb = 'changed,no_report'
                return empty_return
            # replace tabs in header lines
            diff[0] = diff[0].replace('\t', ' ')
            diff[1] = diff[1].replace('\t', ' ')

            if additions_only:
                if len(self.state.old_data) and len(self.state.new_data) / len(self.state.old_data) <= 0.25:
                    # new data is at most a quarter of the old: keep the full diff and flag it
                    diff = [
                        *diff[:2],
                        '/**Comparison type: Additions only**',
                        '/**Deletions are being shown as 75% or more of the content has been deleted**',
                        *diff[2:],
                    ]
                else:
                    head = '---' + diff[0][3:]
                    diff = self._keep_only(diff, '+')
                    # only the header is left, or the header plus one other line with content: nothing to show
                    if len(diff) == 1 or sum(1 for line in diff if line.removeprefix('+').rstrip()) == 2:
                        self.state.verb = 'changed,no_report'
                        return empty_return
                    diff = [head, diff[0], '/**Comparison type: Additions only**', *diff[1:]]
            elif deletions_only:
                # NOTE(review): diff[1] is the '+++ @ ...' header, so this yields '--- @ @ ...' as the second
                # header line -- confirm this is intended.
                head = '--- @' + diff[1][3:]
                diff = self._keep_only(diff, '-')
                if len(diff) == 1 or sum(1 for line in diff if line.removeprefix('-').rstrip()) == 2:
                    self.state.verb = 'changed,no_report'
                    return empty_return
                diff = [diff[0], head, '/**Comparison type: Deletions only**', *diff[1:]]

            # remove range info lines if needed
            range_info = directives.get('range_info')
            if range_info is False or (
                range_info is None and additions_only and (len(diff) < 4 or diff[3][0] != '/')
            ):
                diff = [line for line in diff if not line.startswith('@@ ')]

            diff_text = '\n'.join(diff)

            out_diff.update(
                {
                    'text': diff_text,
                    'markdown': diff_text,
                }
            )

        if report_kind == 'html':
            out_diff['html'] = '\n'.join(self.unified_diff_to_html(diff_text))

        return out_diff
8✔
525

526

527
class TableDiffer(DifferBase):
    """Generates a Python HTML table diff."""

    __kind__ = 'table'

    __supported_directives__: dict[str, str] = {
        'tabsize': 'tab stop spacing (default: 8)',
    }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Generate a side-by-side HTML table diff (``difflib.HtmlDiff``) of the old and the new data.

        :param directives: The directives (``tabsize``).
        :param report_kind: The report_kind for which a diff must be generated; for 'text' and 'markdown' the
           HTML table is converted to text.
        :param _unfiltered_diff: Any previously generated diffs; its 'html' entry is reused for a 'text' or
           'markdown' report.
        :param tz: The timezone of the report.
        :returns: A dict with 'html' (when the table is computed here) and 'text' and 'markdown' (when one of these
           is requested).
        """
        out_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
        if report_kind in {'text', 'markdown'} and _unfiltered_diff is not None and 'html' in _unfiltered_diff:
            table = _unfiltered_diff['html']
        else:
            tabsize = int(directives.get('tabsize', 8))
            table = difflib.HtmlDiff(tabsize=tabsize).make_table(
                str(self.state.old_data).splitlines(keepends=True),
                str(self.state.new_data).splitlines(keepends=True),
                fromdesc=self.make_timestamp(self.state.old_timestamp, tz),
                todesc=self.make_timestamp(self.state.new_timestamp, tz),
                context=True,
                numlines=3,
            )
            # fix table formatting: HtmlDiff relies on CSS classes from its own stylesheet, which is not part of
            # the report, so replace them with inline styles (applied in this order)
            replacements = (
                ('<th ', '<th style="font-family:monospace" '),
                ('<td ', '<td style="font-family:monospace" '),
                (' nowrap="nowrap"', ''),
                ('<a ', '<a style="font-family:monospace;color:inherit" '),
                ('<span class="diff_add"', '<span style="color:green;background-color:lightgreen"'),
                # 'lightred' is not a CSS color name (the declaration would be ignored); use a valid light red
                ('<span class="diff_sub"', '<span style="color:red;background-color:mistyrose"'),
                ('<span class="diff_chg"', '<span style="color:orange;background-color:lightyellow"'),
            )
            for old, new in replacements:
                table = table.replace(old, new)
            out_diff['html'] = table

        if report_kind in {'text', 'markdown'}:
            diff_text = self.html2text(table)
            out_diff.update(
                {
                    'text': diff_text,
                    'markdown': diff_text,
                }
            )

        return out_diff
8✔
577

578

579
class CommandDiffer(DifferBase):
    """Runs an external command to generate the diff.

    The command is invoked with two additional arguments: the paths of two temporary files holding, respectively,
    the old and the new data. Its outcome is interpreted as follows:

    * exit code 0: nothing to report (the job's verb is set to ``changed,no_report``);
    * exit code 1 with nothing on stderr: stdout is the diff;
    * any output on stderr, or an exit code greater than 1: failure (a ``RuntimeError`` is raised).
    """

    __kind__ = 'command'

    __supported_directives__: dict[str, str] = {
        'command': 'The command to execute',
        'is_html': 'Whether the output of the command is HTML',
    }

    re_ptags = re.compile(r'^<p>|</p>$')
    re_htags = re.compile(r'<(/?)h\d>')
    re_tagend = re.compile(r'<(?!.*<).*>+$')
    # Markdown links of the form [anchor](target); compiled once here instead of on every call to differ().
    re_markdown_links = re.compile(r'\[(.*?)][(](.*?)[)]')

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Generate the diff by running the external command given in ``directives['command']``.

        :param directives: The differ's directives; ``command`` is required, ``is_html`` is optional.
        :param report_kind: The kind of report being generated.
        :param _unfiltered_diff: Diffs already generated for this job, if any. When ``report_kind`` is ``html`` and
           this contains a ``text`` entry, that text (minus its two header lines) is converted to HTML and the
           command is not run again.
        :param tz: The timezone passed to ``make_timestamp`` for the header timestamps.

        :returns: A dict of diffs keyed by report kind; all values are empty strings when the command exits with 0.
        :raises RuntimeError: If the command writes to stderr or exits with a code greater than 1.
        """
        old_stamp = self.make_timestamp(self.state.old_timestamp, tz)
        new_stamp = self.make_timestamp(self.state.new_timestamp, tz)
        if self.job.monospace:
            # The outer <span> is left open on purpose: it is closed at the very end, after the diff is appended.
            head_html = '\n'.join(
                [
                    '<span style="font-family:monospace;white-space:pre-wrap;">',
                    f'<span style="color:darkred;">--- @ {old_stamp}</span>',
                    f'<span style="color:darkgreen;">+++ @ {new_stamp}</span>',
                    '',  # ends the header with a newline so the diff starts on its own line
                ]
            )
        else:
            head_html = '<br>\n'.join(
                [
                    '<span style="font-family:monospace;">',
                    f'<span style="color:darkred;">--- @ {old_stamp}</span>',
                    f'<span style="color:darkgreen;">+++ @ {new_stamp}</span>',
                    '</span>',
                ]
            )

        out_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
        command = directives['command']
        if report_kind == 'html' and _unfiltered_diff is not None and 'text' in _unfiltered_diff:
            # Reuse the text diff already produced, dropping its two header lines ('--- @ ...' and '+++ @ ...').
            diff_text = ''.join(_unfiltered_diff['text'].splitlines(keepends=True)[2:])
        else:
            old_data = self.state.old_data
            new_data = self.state.new_data
            is_markdown = self.state.is_markdown()
            if is_markdown:
                # protect the link anchor from being split (won't work)
                old_data = self.re_markdown_links.sub(
                    lambda x: f'[{urllib.parse.quote(x.group(1))}]({x.group(2)})', str(old_data)
                )
                new_data = self.re_markdown_links.sub(
                    lambda x: f'[{urllib.parse.quote(x.group(1))}]({x.group(2)})', str(new_data)
                )

            # External diff tool
            with tempfile.TemporaryDirectory() as tmp_dir:
                tmp_path = Path(tmp_dir)
                old_file_path = tmp_path.joinpath('old_file')
                new_file_path = tmp_path.joinpath('new_file')
                # NOTE(review): write_text() and text=True below both rely on the platform default encoding --
                # confirm this is intended for non-ASCII data.
                if isinstance(old_data, str):
                    old_file_path.write_text(old_data)
                else:
                    old_file_path.write_bytes(old_data)
                if isinstance(new_data, str):
                    new_file_path.write_text(new_data)
                else:
                    new_file_path.write_bytes(new_data)
                cmdline = [*shlex.split(command), str(old_file_path), str(new_file_path)]
                proc = subprocess.run(cmdline, capture_output=True, text=True)  # noqa: S603 subprocess call
            if proc.stderr or proc.returncode > 1:
                raise RuntimeError(
                    f"Job {self.job.index_number}: External differ '{directives}' returned '{proc.stderr.strip()}' "
                    f'({self.job.get_location()})'
                ) from subprocess.CalledProcessError(proc.returncode, cmdline)
            if proc.returncode == 0:
                self.state.verb = 'changed,no_report'
                logger.info(
                    f"Job {self.job.index_number}: Command in differ 'command' returned 0 (no report) "
                    f'({self.job.get_location()})'
                )
                return {'text': '', 'markdown': '', 'html': ''}
            head_text = '\n'.join(
                [
                    f'\033[91m--- @ {old_stamp}\033[0m',
                    f'\033[92m+++ @ {new_stamp}\033[0m',
                    '',
                ]
            )
            diff = proc.stdout
            if is_markdown:
                # undo the protection of the link anchor from being split
                diff = self.re_markdown_links.sub(
                    lambda x: f'[{urllib.parse.unquote(x.group(1))}]({x.group(2)})', diff
                )
            if command.startswith('wdiff') and self.job.contextlines == 0:
                # remove lines that don't have any changes
                wdiff_markers = ('{+', '+}', '[-', '-]')
                diff = ''.join(
                    line
                    for line in diff.splitlines(keepends=True)
                    if any(marker in line for marker in wdiff_markers)
                )
            if directives.get('is_html'):
                # The command produced HTML: use it as is for the html report and convert it for the others.
                diff_text = self.html2text(diff)
                out_diff.update(
                    {
                        'text': head_text + diff_text,
                        'markdown': head_text + diff_text,
                        'html': head_html + diff,
                    }
                )
            else:
                diff_text = diff
                out_diff.update(
                    {
                        'text': head_text + diff_text,
                        'markdown': head_text + diff_text,
                    }
                )

        if report_kind == 'html' and 'html' not in out_diff:
            if command.startswith('wdiff'):
                # colorize output of wdiff
                out_diff['html'] = head_html + self.wdiff_to_html(diff_text)
            else:
                out_diff['html'] = head_html + html.escape(diff_text)

        if self.job.monospace and 'html' in out_diff:
            # close the outer <span> opened in the monospace header
            out_diff['html'] += '</span>'

        return out_diff

    def wdiff_to_html(self, diff: str) -> str:
        """
        Colorize output of wdiff.

        The diff is HTML-escaped, then every ``{+...+}`` (addition) and ``[-...-]`` (deletion) is replaced with a
        ``<span>`` carrying inline styling. For markdown data, additions or deletions spanning multiple lines are
        first closed at the end of each line and reopened at the start of the next, and lines are joined with
        ``<br>``.

        :param diff: The output of the wdiff command.
        :returns: The colorized HTML output.
        """
        html_diff = html.escape(diff)
        if self.state.is_markdown():
            # detect and fix multiline additions or deletions
            is_add = False
            is_del = False
            new_diff = []
            for line in html_diff.splitlines():
                # Reopen a marker left open by the previous line.
                if is_add:
                    line = '{+' + line
                    is_add = False
                elif is_del:
                    line = '[-' + line
                    is_del = False
                # Track which marker (if any) is still open once the line has been scanned.
                for marker in re.findall(r'\[-|-]|{\+|\+}', line):
                    if marker == '[-':
                        is_del = True
                    elif marker == '-]':
                        is_del = False
                    elif marker == '{+':
                        is_add = True
                    else:  # '+}'
                        is_add = False
                # Close a marker still open at the end of the line.
                if is_add:
                    line += '+}'
                elif is_del:
                    line += '-]'
                new_diff.append(line)
            html_diff = '<br>\n'.join(new_diff)

        # wdiff colorization (cannot be done with global CSS class as Gmail overrides it)
        html_diff = re.sub(
            r'\{\+(.*?)\+}',
            lambda x: f'<span style="{self.css_added_style}">{x.group(1)}</span>',
            html_diff,
            flags=re.DOTALL,
        )
        html_diff = re.sub(
            r'\[-(.*?)-]',
            lambda x: f'<span style="{self.css_deltd_style}">{x.group(1)}</span>',
            html_diff,
            flags=re.DOTALL,
        )
        if self.job.monospace:
            return f'<span style="font-family:monospace;white-space:pre-wrap">{html_diff}</span>'
        return html_diff
766

767

768
class DeepdiffDiffer(DifferBase):
    """Compares structured data (JSON, YAML or XML) using the deepdiff library."""

    __kind__ = 'deepdiff'

    __supported_directives__: dict[str, str] = {
        'data_type': "either 'json' (default), 'yaml', or 'xml'",
        'ignore_order': 'Whether to ignore the order in which the items have appeared (default: false)',
        'ignore_string_case': 'Whether to be case-sensitive or not when comparing strings (default: false)',
        'significant_digits': (
            'The number of digits AFTER the decimal point to be used in the comparison (default: no limit)'
        ),
        'compact': 'Whether to output a compact representation that also ignores changes of types (default: false)',
    }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Deserialize the old and new data and report their differences as computed by DeepDiff.

        :param directives: The differ's directives (see ``__supported_directives__``).
        :param report_kind: The kind of report being generated.
        :param _unfiltered_diff: Not used by this differ.
        :param tz: The timezone passed to DeepDiff as ``default_timezone`` and used for the header timestamps.

        :returns: A dict of diffs keyed by report kind. If either data fails to parse, the dict contains an error
           message for every report kind. If there are no differences, all values are empty strings and the job's
           verb is set to ``changed,no_report``.
        """
        if isinstance(DeepDiff, str):  # pragma: no cover
            self.raise_import_error('deepdiff', DeepDiff)
            raise RuntimeError()  # for type checker

        span_added = f'<span style="{self.css_added_style}">'
        span_deltd = f'<span style="{self.css_deltd_style}">'
        span_remvd = f'<span style="{self.css_remvd_style}">'

        def _pretty_deepdiff(
            ddiff: DeepDiff,
            report_kind: Literal['text', 'markdown', 'html'],
            compact: bool,
        ) -> str:
            """
            Customized version of deepdiff.serialization.SerializationMixin.pretty method, edited to include the
            values deleted or added and an option for colorized HTML output. The pretty human-readable string
            output for the diff object regardless of what view was used to generate the diff.
            """
            # Edited strings originally in deepdiff.serialization._get_pretty_form_text
            # See https://github.com/seperman/deepdiff/blob/master/deepdiff/serialization.py
            # NOTE(review): values and paths are inserted into the HTML templates below without html.escape() --
            # confirm whether they are sanitized elsewhere before being rendered.
            if compact:
                root = '⊤'  # noqa: RUF001 DOWN TACK
                if report_kind == 'html':
                    pretty_form_texts = {
                        'type_changes': (
                            f'{{diff_path}}: {span_deltd}{{val_t1}}</span> ⮕ {span_added}{{val_t2}}</span>'
                        ),
                        'values_changed': (
                            f'{{diff_path}}: {span_deltd}{{val_t1}}</span> ⮕ {span_added}{{val_t2}}</span>'
                        ),
                        'dictionary_item_added': f'{{diff_path}}: {span_added}{{val_t2}}</span>',
                        'dictionary_item_removed': f'{span_deltd}{{diff_path}}: {{val_t1}}</span>',
                        'iterable_item_added': f'{{diff_path}}: {span_added}{{val_t2}}</span>',
                        'iterable_item_removed': f'{span_deltd}{{diff_path}}: {{val_t1}}</span>',
                        'attribute_added': f'{{diff_path}}: {span_added}{{val_t2}}</span>',
                        'attribute_removed': f'{span_remvd}{{diff_path}}</span>: {span_deltd}{{val_t1}}</span>',
                        'set_item_added': f'⊤[{{val_t2}}]: {span_added}{{val_t1}}</span>',  # noqa: RUF001 DOWN TACK
                        'set_item_removed': (
                            f'{span_remvd}⊤[{{val_t1}}]</span>: {span_deltd}{{val_t2}}</span>'  # noqa: RUF001
                        ),
                        'repetition_change': (
                            f'{{diff_path}}: repetition change {span_deltd}{{val_t1}}</span> ⮕ '
                            f'{span_added}{{val_t2}}</span>'
                        ),
                    }
                else:
                    pretty_form_texts = {
                        'type_changes': '{diff_path}: {val_t1} → {val_t2}',
                        'values_changed': '{diff_path}: {val_t1} → {val_t2}',
                        'dictionary_item_added': '{diff_path}: new {val_t2}',
                        'dictionary_item_removed': '{diff_path}: removed {val_t1}',
                        'iterable_item_added': '{diff_path}: new {val_t2}',
                        'iterable_item_removed': '{diff_path}: removed {val_t1}',
                        'attribute_added': '{diff_path}: new {val_t2}',
                        'attribute_removed': '{diff_path}: removed {val_t1}',
                        'set_item_added': '⊤[{val_t2}]: new {val_t1}',  # noqa: RUF001 DOWN TACK
                        'set_item_removed': '⊤[{val_t1}]: removed {val_t2}',  # noqa: RUF001 DOWN TACK
                        'repetition_change': '{diff_path}: repetition change {val_t1} → {val_t2}',
                    }
            else:  # not compact
                root = 'root'
                if report_kind == 'html':
                    pretty_form_texts = {
                        'type_changes': (
                            'Type of {diff_path} changed from {type_t1} to {type_t2} and value changed '
                            f'from {span_deltd}{{val_t1}}</span> to {span_added}{{val_t2}}</span>.'
                        ),
                        'values_changed': (
                            f'Value of {{diff_path}} changed from {span_deltd}{{val_t1}}</span> to {span_added}'
                            '{val_t2}</span>.'
                        ),
                        'dictionary_item_added': (
                            f'Item {{diff_path}} added to dictionary as {span_added}{{val_t2}}</span>.'
                        ),
                        'dictionary_item_removed': (
                            f'Item {{diff_path}} removed from dictionary (was {span_deltd}{{val_t1}}</span>).'
                        ),
                        'iterable_item_added': (
                            f'Item {{diff_path}} added to iterable as {span_added}{{val_t2}}</span>.'
                        ),
                        'iterable_item_removed': (
                            f'Item {{diff_path}} removed from iterable (was {span_deltd}{{val_t1}}</span>).'
                        ),
                        'attribute_added': f'Attribute {{diff_path}} added as {span_added}{{val_t2}}</span>.',
                        'attribute_removed': f'Attribute {{diff_path}} removed (was {span_deltd}{{val_t1}}</span>).',
                        'set_item_added': f'Item root[{{val_t2}}] added to set as {span_added}{{val_t1}}</span>.',
                        'set_item_removed': (
                            f'Item root[{{val_t1}}] removed from set (was {span_deltd}{{val_t2}}</span>).'
                        ),
                        'repetition_change': (
                            f'Repetition change for item {{diff_path}} ({span_deltd}{{val_t2}}</span>).'
                        ),
                    }
                else:
                    pretty_form_texts = {
                        'type_changes': (
                            'Type of {diff_path} changed from {type_t1} to {type_t2} and value changed '
                            'from {val_t1} to {val_t2}.'
                        ),
                        'values_changed': 'Value of {diff_path} changed from {val_t1} to {val_t2}.',
                        'dictionary_item_added': 'Item {diff_path} added to dictionary as {val_t2}.',
                        'dictionary_item_removed': 'Item {diff_path} removed from dictionary (was {val_t1}).',
                        'iterable_item_added': 'Item {diff_path} added to iterable as {val_t2}.',
                        'iterable_item_removed': 'Item {diff_path} removed from iterable (was {val_t1}).',
                        'attribute_added': 'Attribute {diff_path} added as {val_t2}.',
                        'attribute_removed': 'Attribute {diff_path} removed (was {val_t1}).',
                        'set_item_added': 'Item root[{val_t2}] added to set as {val_t1}.',
                        'set_item_removed': 'Item root[{val_t1}] removed from set (was {val_t2}).',
                        'repetition_change': 'Repetition change for item {diff_path} ({val_t2}).',
                    }

            def _pretty_print_diff(ddiff: DiffLevel) -> str:
                """
                Customized version of deepdiff.serialization.pretty_print_diff() function, edited to include the
                values deleted or added.
                """

                def stringify_value(value: Any, type_name: str) -> str:
                    """Render a value for the report, based on the name of its Python type.

                    :param value: The value to render.
                    :param type_name: The ``__name__`` of the value's type.

                    :returns: Scalars are quoted (single quotes if compact, double quotes otherwise); dicts and
                       lists are dumped as indented YAML (compact) or JSON; anything else is passed to ``str``.
                    """
                    if type_name in {'str', 'int', 'float'}:
                        return f"'{value}'" if compact else f'"{value}"'
                    if type_name in {'dict', 'list'}:
                        if not compact:
                            return jsonlib.dumps(value, ensure_ascii=False, indent=2)
                        value_string = yaml.safe_dump(
                            value,
                            default_flow_style=False,
                            width=999,
                            allow_unicode=True,
                            sort_keys=False,
                        )
                        value_list = value_string.splitlines(keepends=True)
                        if len(value_list) < 2:
                            return value_string
                        # Multi-line dump: start on a new line and indent every line by four spaces.
                        return ('\n    ' + '    '.join(value_list)).rstrip()
                    return str(value)

                type_t1 = type(ddiff.t1).__name__
                val_t1 = stringify_value(ddiff.t1, type_t1)
                type_t2 = type(ddiff.t2).__name__
                val_t2 = stringify_value(ddiff.t2, type_t2)

                diff_path = ddiff.path(root=root)  # type: ignore[no-untyped-call]
                # A report type without a template yields just the bullet.
                return '• ' + pretty_form_texts.get(
                    ddiff.report_type,
                    '',
                ).format(
                    diff_path=diff_path,
                    type_t1=type_t1,
                    type_t2=type_t2,
                    val_t1=val_t1,
                    val_t2=val_t2,
                )

            return '\n'.join(
                _pretty_print_diff(item_key) for tree_item in ddiff.tree.values() for item_key in tree_item
            )

        def _parse_failure(
            e: Exception,
            format_name: str,
            data: str | bytes,
            data_label: Literal['Old', 'New'],
        ) -> tuple[None, dict[Literal['text', 'markdown', 'html'], str]]:
            """Record and log a deserialization error and build the error report.

            Must be called from within the ``except`` block handling ``e``, as it formats the active traceback.

            :param e: The exception raised by the parser.
            :param format_name: The name of the format that failed to parse ('JSON', 'YAML' or 'XML').
            :param data: The data that could not be parsed (logged at INFO level).
            :param data_label: Either old or new, used for error reporting.

            :returns: ``None`` (no data) and the error messages keyed by report kind.
            """
            self.state.exception = e
            self.state.traceback = self.job.format_error(e, traceback.format_exc())
            logger.error(
                f'Job {self.job.index_number}: {data_label} data is invalid {format_name}: {e} '
                f'({self.job.get_location()})'
            )
            logger.info(f'Job {self.job.index_number}: {data!r}')
            return None, {
                'text': f'Differ {self.__kind__} ERROR: {data_label} data is invalid {format_name}\n{e}',
                'markdown': f'Differ {self.__kind__} **ERROR: {data_label} data is invalid {format_name}**\n{e}',
                'html': f'Differ {self.__kind__} <b>ERROR: {data_label} data is invalid {format_name}</b>\n{e}',
            }

        def deserialize_data(
            data: str | bytes, mime_type: str | None, data_type: str | None, data_label: Literal['Old', 'New']
        ) -> tuple[Any, dict | None]:
            """Deserializes the stored data.

            :param data: The stored data.
            :param mime_type: The MIME type of the data.
            :param data_type: The value of the data_type sub-parameter (overrides MIME type)
            :param data_label: Either old or new, used for error reporting

            :returns: The deserialized data, any errors
            """
            if not data:
                return data, None
            if data_type is None:
                if mime_type:
                    # Reduce e.g. 'application/x-yaml' or 'application/ld+json' to 'yaml' or 'json'.
                    media_subtype = mime_type.split('/')[-1].split('+')[-1].split('x-')[-1]
                    if media_subtype in ('yaml', 'yml'):
                        data_type = 'yaml'
                    elif media_subtype == 'xml':
                        data_type = 'xml'
                    elif media_subtype == 'json':
                        data_type = 'json'
                    else:
                        logger.info(
                            f'Differ {self.__kind__} could not determine data type of {data_label} data from media '
                            f"type {mime_type}; defaulting to 'json'"
                        )
                        data_type = 'json'
                else:
                    logger.info(
                        f"Differ {self.__kind__} data_type for {data_label} data defaulted to 'json' as media type is "
                        'missing'
                    )
                    data_type = 'json'
            # NOTE(review): a data_type other than 'json', 'yaml' or 'xml' falls through every branch below and
            # yields an empty string without any error -- confirm this is intended.
            parsed_data: Any = ''
            if data_type == 'json':
                try:
                    parsed_data = jsonlib.loads(data)
                except jsonlib.JSONDecodeError as e:
                    return _parse_failure(e, 'JSON', data, data_label)
            elif data_type == 'yaml':
                try:
                    parsed_data = yaml.safe_load(data)
                except yaml.YAMLError as e:
                    return _parse_failure(e, 'YAML', data, data_label)
            elif data_type == 'xml':
                if isinstance(xmltodict, str):  # pragma: no cover
                    self.raise_import_error('xmltodict', xmltodict)
                    raise RuntimeError()  # for type checker
                try:
                    parsed_data = xmltodict.parse(data)
                except ExpatError as e:
                    return _parse_failure(e, 'XML', data, data_label)
            return parsed_data, None

        old_data, err = deserialize_data(
            self.state.old_data,
            self.state.old_mime_type,
            directives.get('data_type'),
            'Old',
        )
        if err:
            return err
        new_data, err = deserialize_data(
            self.state.new_data,
            self.state.new_mime_type,
            directives.get('data_type'),
            'New',
        )
        if err:
            return err
        ignore_order = bool(directives.get('ignore_order'))
        ignore_string_case = bool(directives.get('ignore_string_case'))
        significant_digits = directives.get('significant_digits')
        compact = bool(directives.get('compact'))
        ddiff = DeepDiff(
            old_data,
            new_data,
            cache_size=500,
            cache_purge_level=0,
            cache_tuning_sample_size=500,
            default_timezone=tz,
            ignore_order=ignore_order,
            ignore_string_type_changes=True,
            ignore_numeric_type_changes=True,
            ignore_string_case=ignore_string_case,
            significant_digits=significant_digits,
            # Derived from the logger's effective level and clamped to the range 0-2.
            verbose_level=min(2, max(0, math.ceil(3 - logger.getEffectiveLevel() / 10))),
        )
        diff_text = _pretty_deepdiff(ddiff, report_kind, compact)
        if not diff_text:
            self.state.verb = 'changed,no_report'
            return {'text': '', 'markdown': '', 'html': ''}

        self.job.set_to_monospace()
        if report_kind == 'html':
            html_diff = (
                f'<span style="font-family:monospace;white-space:pre-wrap;">'
                f'<span style="color:darkred;">--- @ {self.make_timestamp(self.state.old_timestamp, tz)}</span>\n'
                f'<span style="color:darkgreen;">+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}</span>\n'
                # allow long paths such as root['a']['b'] to wrap between keys
                + diff_text.replace('][', ']<wbr>[')
                + '</span>'
            )
            return {'html': html_diff}

        text_diff = (
            f'--- @ {self.make_timestamp(self.state.old_timestamp, tz)}\n'
            f'+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}\n'
            f'{diff_text}'
        )
        return {'text': text_diff, 'markdown': text_diff}
1118

1119

1120
class ImageDiffer(DifferBase):
    """Compares two images providing an image outlining areas that have changed.

    The old and new data are strings that identify or encode an image (a URL, a filename, or Base64 / Ascii85
    data, as selected by the ``data_type`` directive). The HTML report embeds the new image, a difference image
    (changes highlighted in yellow over a darkened greyscale copy of the old image) and the old image. The text and
    Markdown reports only carry a notice (and, when requested, a Generative AI summary).
    """

    __kind__ = 'image'

    __supported_directives__: dict[str, str] = {
        'data_type': (
            "'url' (to retrieve an image), 'ascii85' (Ascii85 data), 'base64' (Base64 data) or 'filename' (the path "
            "to an image file) (default: 'url')"
        ),
        'mse_threshold': (
            'the minimum mean squared error (MSE) between two images to consider them changed, if numpy in installed '
            '(default: 2.5)'
        ),
        'ai_google': 'Generative AI summary of changes (BETA)',
    }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Build the image diff for all report kinds.

        Sets ``self.state.verb`` to ``'unchanged'`` when the two images compare equal, and to
        ``'changed,no_report'`` when the mean squared error is below ``mse_threshold``; in both cases empty
        strings are returned for every report kind.

        :param directives: The differ's directives ('data_type', 'mse_threshold', 'ai_google').
        :param report_kind: The kind of report requested (not used here: all three kinds are always returned).
        :param _unfiltered_diff: Not used by this differ.
        :param tz: The timezone passed to ``make_timestamp`` for the report header.
        :returns: A dict with the 'text', 'markdown' and 'html' diffs.
        :raises ValueError: If the old or the new data is not a string.
        """
        warnings.warn(
            f'Job {self.job.index_number}: Using differ {self.__kind__}, which is BETA, may have bugs, and may '
            f'change in the future. Please report any problems or suggestions at '
            f'https://github.com/mborsetti/webchanges/discussions.',
            RuntimeWarning,
            stacklevel=1,
        )
        if isinstance(Image, str):  # pragma: no cover
            self.raise_import_error('pillow', Image)
            raise RuntimeError()  # for type checker
        # httpx is set to None (not to an error string) when its import fails, so test for both.
        if httpx is None or isinstance(httpx, str):  # pragma: no cover
            self.raise_import_error('httpx', httpx if isinstance(httpx, str) else "No module named 'httpx'")
            raise RuntimeError()  # for type checker

        def load_image_from_web(url: str) -> Image.Image:
            """Fetches the image from an url."""
            logger.debug(f'Retrieving image from {url}')
            with httpx.stream('GET', url, timeout=10) as response:
                response.raise_for_status()
                return Image.open(BytesIO(b''.join(response.iter_bytes())))

        def load_image_from_file(filename: str) -> Image.Image:
            """Load an image from a file."""
            logger.debug(f'Reading image from {filename}')
            return Image.open(filename)

        def load_image_from_base64(base_64: str) -> Image.Image:
            """Load an image from a Base64-encoded string."""
            logger.debug('Retrieving image from a base64 string')
            return Image.open(BytesIO(base64.b64decode(base_64)))

        def load_image_from_ascii85(ascii85: str) -> Image.Image:
            """Load an image from an Ascii85-encoded string."""
            logger.debug('Retrieving image from an ascii85 string')
            return Image.open(BytesIO(base64.a85decode(ascii85)))

        def compute_diff_image(img1: Image.Image, img2: Image.Image) -> tuple[Image.Image, np.float64 | None]:
            """Compute the difference between two images.

            :returns: The difference image (in the format of img2) and the mean squared error, or None for the
               latter if numpy is not available.
            """
            # Compute the absolute value of the pixel-by-pixel difference between the two images.
            diff_image = ImageChops.difference(img1, img2)

            # Compute the mean squared error between the images
            if not isinstance(np, str):
                diff_array = np.array(diff_image)
                mse_value = np.mean(np.square(diff_array))
            else:  # pragma: no cover
                mse_value = None

            # Create the diff image by overlaying this difference on a darkened greyscale background
            back_image = img1.convert('L')
            back_image_brightness = ImageStat.Stat(back_image).rms[0]
            back_image = ImageEnhance.Brightness(back_image).enhance(back_image_brightness / 225)

            # Convert the 'L' image to 'RGB' using a matrix that applies to yellow tint
            # The matrix has 12 elements: 4 for Red, 4 for Green, and 4 for Blue.
            # For yellow, we want Red and Green to copy the L values (1.0) and Blue to be zero.
            # The matrix is: [R, G, B, A] for each of the three output channels
            yellow_tint_matrix = (
                1.0,
                0.0,
                0.0,
                0.0,  # Red = 100% of the grayscale value
                1.0,
                0.0,
                0.0,
                0.0,  # Green = 100% of the grayscale value
                0.0,
                0.0,
                0.0,
                0.0,  # Blue = 0% of the grayscale value
            )

            # Apply the conversion
            diff_colored = diff_image.convert('RGB').convert('RGB', matrix=yellow_tint_matrix)

            final_img = ImageChops.add(back_image.convert('RGB'), diff_colored)
            final_img.format = img2.format

            return final_img, mse_value

        def ai_google(
            old_image: Image.Image,
            new_image: Image.Image,
            diff_image: Image.Image,
            directives: AiGoogleDirectives,
        ) -> tuple[str, str]:
            """Summarize changes in image using Generative AI (ALPHA).  Returns summary and model name.

            The old and the new image are uploaded to Google and referenced in the prompt; diff_image is currently
            not uploaded. On a configuration or upload error, an error message is returned in lieu of the summary
            together with an empty model name.
            """
            logger.info(f'Job {self.job.index_number}: Running ai_google for {self.__kind__} differ')
            warnings.warn(
                f'Job {self.job.index_number}: Using differ {self.__kind__} with ai_google, which is ALPHA, '
                f'may have bugs, and may change in the future. Please report any problems or suggestions at '
                f'https://github.com/mborsetti/webchanges/discussions.',
                RuntimeWarning,
                stacklevel=1,
            )

            api_version = '1beta'
            # GOOGLE_AI_API_KEY deprecated end of 2025
            gemini_api_key = os.environ.get('GEMINI_API_KEY', '').rstrip()
            if not gemini_api_key:
                gemini_api_key = os.environ.get('GOOGLE_AI_API_KEY', '').rstrip()
                if gemini_api_key:
                    warnings.warn(
                        'The environment variable GOOGLE_AI_API_KEY is deprecated; please use GEMINI_API_KEY instead.',
                        DeprecationWarning,
                        stacklevel=1,
                    )
            if len(gemini_api_key) != 39:
                logger.error(
                    f'Job {self.job.index_number}: Environment variable GEMINI_API_KEY not found or is of the '
                    f'incorrect length {len(gemini_api_key)} ({self.job.get_location()})'
                )
                return (
                    f'## ERROR in summarizing changes using Google AI:\n'
                    f'Environment variable GEMINI_API_KEY not found or is of the incorrect length '
                    f'{len(gemini_api_key)}.\n',
                    '',
                )

            def _load_image(img_data: tuple[str, Image.Image]) -> dict[str, dict[str, str] | Exception | str]:
                """Upload one (name, image) pair to Google and return its 'file_data' part.

                On an HTTP client error, returns a dict with the keys 'error' and 'img_name' instead. Uses
                ``client``, which is bound by the ``with`` statement below before any upload is started.
                """
                img_name, image = img_data
                # Convert image to bytes
                img_byte_arr = BytesIO()
                image.save(img_byte_arr, format=image.format)
                image_data = img_byte_arr.getvalue()
                mime_type = f'image/{image.format.lower()}'  # type: ignore[union-attr]

                logger.info(
                    f'Job {self.job.index_number}: Loading {img_name} ({image.format}) to Google AI '
                    f'({len(image_data) / 1024:,.0f} kbytes)'
                )

                # Initial resumable upload request
                headers = {
                    'X-Goog-Upload-Protocol': 'resumable',
                    'X-Goog-Upload-Command': 'start',
                    'X-Goog-Upload-Header-Content-Length': str(len(image_data)),
                    'X-Goog-Upload-Header-Content-Type': mime_type,
                    'Content-Type': 'application/json',
                }
                data = {'file': {'display_name': 'TEXT'}}

                try:
                    response = client.post(
                        f'https://generativelanguage.googleapis.com/upload/v{api_version}/files?key={gemini_api_key}',
                        headers=headers,
                        json=data,
                    )
                except httpx.HTTPError as e:
                    return {'error': e, 'img_name': img_name}
                # NOTE(review): the HTTP status is not checked; an error response without this header raises
                # KeyError here.
                upload_url = response.headers['X-Goog-Upload-Url']

                # Upload the image data
                headers = {
                    'Content-Length': str(len(image_data)),
                    'X-Goog-Upload-Offset': '0',
                    'X-Goog-Upload-Command': 'upload, finalize',
                }
                try:
                    response = client.post(upload_url, headers=headers, content=image_data)
                except httpx.HTTPError as e:
                    return {'error': e, 'img_name': img_name}

                # Extract file URI from response
                file_info = response.json()
                file_uri = file_info['file']['uri']
                logger.info(f'Job {self.job.index_number}: {img_name.capitalize()} loaded to {file_uri}')

                return {
                    'file_data': {
                        'mime_type': mime_type,
                        'file_uri': file_uri,
                    }
                }

            # upload to Google; both the HTTP client and the thread pool are released once the uploads are done
            # (or as soon as one of them fails).
            additional_parts: list[dict[str, dict[str, str]]] = []
            with httpx.Client(http2=True, timeout=self.job.timeout) as client, ThreadPoolExecutor() as executor:
                # executor.map yields results in input order: index 0 is the old image, index 1 the new one.
                for additional_part in executor.map(
                    _load_image,
                    (
                        ('old image', old_image),
                        ('new image', new_image),
                        # ('differences image', diff_image),
                    ),
                ):
                    if 'error' not in additional_part:
                        additional_parts.append(additional_part)  # type: ignore[arg-type]
                    else:
                        logger.error(
                            f'Job {self.job.index_number}: ai_google for {self.__kind__} HTTP Client error '
                            f'{type(additional_part["error"])} when loading {additional_part["img_name"]} to Google '
                            f'AI: {additional_part["error"]}'
                        )
                        return (
                            f'HTTP Client error {type(additional_part["error"])} when loading '
                            f'{additional_part["img_name"]} to Google AI: {additional_part["error"]}',
                            '',
                        )

            system_instructions = (
                'You are a meticulous visual comparison agent. Your task is to analyze two images: an "old '
                'version" and a "new version". Your entire focus is on identifying and listing the concrete, '
                'factual differences between them.'
            )
            # additional_parts[0] is the old image and additional_parts[1] is the new image (see upload order above).
            model_prompt = (
                '**Instructions:**\n'
                '\n'
                f'1.  **Identify Changes:** Directly compare the "new version" '
                f'{additional_parts[1]["file_data"]["file_uri"]} to the "old version" '
                f'{additional_parts[0]["file_data"]["file_uri"]} and identify all additions, removals, and alterations '
                'of visual elements.\n'
                '\n'
                '2.  **Filter for Significance:** From your initial list of changes, you must filter out any that '
                'are minor or cosmetic. A difference is only significant if it alters the core subject matter or '
                'the main message of the image.\n'
                '    *   **IGNORE:** Minor shifts in layout, small changes in color saturation or brightness, or '
                'other cosmetic adjustments that do not change what the image is depicting.\n'
                '    *   **FOCUS ON:** Tangible changes such as added objects, removed people, or altered text.\n'
                '\n'
                '3.  **Summarize the Differences:**\n'
                '    *   Present the significant differences as a bulleted list under the heading "Summary of '
                'Changes".\n'
                '    *   For each point, state the difference factually and concisely (e.g., "An apple was added '
                "to the table,\" \"The text on the sign was changed from 'Open' to 'Closed'\").\n"
                '    *   Only if a change directly and clearly alters the primary message or interpretation of the '
                'image, you may add a brief, one-sentence explanation of this shift. Do not speculate on deeper '
                'meanings.\n'
                '\n'
                '4.  **No Differences Found:** If you analyze both images and find no significant differences '
                'according to the criteria above, you must respond with only the phrase: "No significant '
                'differences were found between the two images." Do not attempt to find minor differences to report.\n'
                '\n'
                '5.  **Grounding:** Your entire analysis must be based solely on the visual information present in '
                'the two images. Do not make assumptions or introduce any external information.'
            )
            directives['thinking_budget'] = directives.get('thinking_budget', 24576)
            summary, model_version = AIGoogleDiffer._send_to_model(
                self.job,
                system_instructions,
                model_prompt,
                additional_parts=additional_parts,  # type: ignore[arg-type]
                directives=directives,
            )

            return summary, model_version

        data_type = directives.get('data_type', 'url')
        mse_threshold = directives.get('mse_threshold', 2.5)
        if not isinstance(self.state.old_data, str):
            raise ValueError('old_data is not a string')
        if not isinstance(self.state.new_data, str):
            raise ValueError('new_data is not a string')
        # The data comes from the monitored resource: escape it before embedding it in HTML attributes.
        if data_type == 'url':
            old_image = load_image_from_web(self.state.old_data)
            new_image = load_image_from_web(self.state.new_data)
            old_data = f' (<a href="{html.escape(self.state.old_data)}" target="_blank">Old image</a>)'
            new_data = f' (<a href="{html.escape(self.state.new_data)}" target="_blank">New image</a>)'
        elif data_type == 'ascii85':
            old_image = load_image_from_ascii85(self.state.old_data)
            new_image = load_image_from_ascii85(self.state.new_data)
            old_data = ''
            new_data = ''
        elif data_type == 'base64':
            old_image = load_image_from_base64(self.state.old_data)
            new_image = load_image_from_base64(self.state.new_data)
            old_data = ''
            new_data = ''
        else:  # 'filename'
            old_image = load_image_from_file(self.state.old_data)
            new_image = load_image_from_file(self.state.new_data)
            old_data = f' (<a href="file://{html.escape(self.state.old_data)}" target="_blank">Old image</a>)'
            new_data = f' (<a href="file://{html.escape(self.state.new_data)}" target="_blank">New image</a>)'

        # Check formats  TODO: is it needed? under which circumstances?
        # if new_image.format != old_image.format:
        #     logger.info(f'Image formats do not match: {old_image.format} vs {new_image.format}')
        # else:
        #     logger.debug(f'image format is {old_image.format}')

        # Convert the images to a base64 object for HTML (before shrinking etc.)
        output_stream = BytesIO()
        old_image.save(output_stream, format=old_image.format)
        encoded_old = b64encode(output_stream.getvalue()).decode()
        if data_type == 'url':
            # The HTML report links to the new image's URL directly, so no embedded copy is needed.
            encoded_new = ''
        else:
            output_stream = BytesIO()
            new_image.save(output_stream, format=new_image.format)
            encoded_new = b64encode(output_stream.getvalue()).decode()

        # If needed, shrink the larger image (resize() drops the format attribute, so it is restored afterwards)
        if new_image.size != old_image.size:
            if new_image.size > old_image.size:
                logger.debug(f'Job {self.job.index_number}: Shrinking the new image')
                img_format = new_image.format
                new_image = new_image.resize(old_image.size, Image.Resampling.LANCZOS)
                new_image.format = img_format

            else:
                logger.debug(f'Job {self.job.index_number}: Shrinking the old image')
                img_format = old_image.format
                old_image = old_image.resize(new_image.size, Image.Resampling.LANCZOS)
                old_image.format = img_format

        if old_image == new_image:
            logger.info(f'Job {self.job.index_number}: New image is identical to the old one')
            self.state.verb = 'unchanged'
            return {'text': '', 'markdown': '', 'html': ''}

        diff_image, mse_value = compute_diff_image(old_image, new_image)
        # mse_value is None only when numpy is not available; 0.0 is a legitimate value (identical pixels) and
        # must be compared against the threshold too.
        if mse_value is not None:
            logger.debug(f'Job {self.job.index_number}: MSE value {mse_value:.2f}')

        if mse_value is not None and mse_value < mse_threshold:
            logger.info(
                f'Job {self.job.index_number}: MSE value {mse_value:.2f} below the threshold of {mse_threshold}; '
                f'considering changes not worthy of a report'
            )
            self.state.verb = 'changed,no_report'
            return {'text': '', 'markdown': '', 'html': ''}

        # prepare AI summary
        summary = ''
        model_version = ''
        if 'ai_google' in directives:
            # An empty 'ai_google:' directive is read as None; treat it as "no sub-directives".
            summary, model_version = ai_google(old_image, new_image, diff_image, directives.get('ai_google') or {})

        # Prepare HTML output
        htm = [
            f'<span style="font-family:monospace">'
            # f'Differ: {self.__kind__} for {data_type}',
            f'<span style="color:darkred;">--- @ {self.make_timestamp(self.state.old_timestamp, tz)}{old_data}</span>',
            f'<span style="color:darkgreen;">+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}{new_data}'
            '</span>',
            '</span>',
            'New image:',
        ]
        if data_type == 'url':
            htm.append(f'<img src="{html.escape(self.state.new_data)}" style="max-width: 100%; display: block;">')
        else:
            htm.append(
                f'<img src="data:image/{(new_image.format or "").lower()};base64,{encoded_new}" '
                'style="max-width: 100%; display: block;">'
            )
        # Convert the difference image to a base64 object
        output_stream = BytesIO()
        diff_image.save(output_stream, format=diff_image.format)
        encoded_diff = b64encode(output_stream.getvalue()).decode()
        htm.extend(
            [
                'Differences from old (in yellow):',
                f'<img src="data:image/{(diff_image.format or "").lower()};base64,{encoded_diff}" '
                'style="max-width: 100%; display: block;">',
                'Old image:',
                f'<img src="data:image/{(old_image.format or "").lower()};base64,{encoded_old}" '
                'style="max-width: 100%; display: block;">',
            ]
        )
        changed_text = 'The image has changed; please see an HTML report for the visualization.'
        if not summary:
            return {
                'text': changed_text,
                'markdown': changed_text,
                'html': '<br>\n'.join(htm),
            }

        newline = '\n'  # For Python < 3.12 f-string {} compatibility
        back_n = '\\n'  # For Python < 3.12 f-string {} compatibility
        # NOTE(review): these are the image differ's directives (which include the 'ai_google' dict as a single
        # entry), not the ai_google sub-directives themselves -- confirm that this is the intended footer content.
        directives_for_str = {key: value for key, value in directives.items() if key != 'model'}
        if 'prompt' in directives_for_str:
            directives_for_str['prompt'] = '«custom»'
        directives_text = (
            (
                ' (ai_google directive(s): '
                + ', '.join(f'{key}={str(value).replace(newline, back_n)}' for key, value in directives_for_str.items())
                + ')'
            )
            if directives_for_str
            else ''
        )
        footer = f"Summary by Google Generative AI's model {model_version}{directives_text}."
        return {
            'text': (
                f'{summary}\n\n\nA visualization of differences is available in {__package__} HTML reports.'
                f'\n------------\n{footer}'
            ),
            'markdown': (
                f'{summary}\n\n\nA visualization of differences is available in {__package__} HTML reports.'
                f'\n* * *\n{footer}'
            ),
            'html': '<br>\n'.join(
                [
                    mark_to_html(summary, extras={'tables'}).replace('<h2>', '<h3>').replace('</h2>', '</h3>'),
                    '',
                    *htm,
                    '-----',
                    f'<i><small>{footer}</small></i>',
                ]
            ),
        }
1571

1572

1573
class AIGoogleDiffer(DifferBase):
    """(Default) Generates a summary using Google Generative AI (Gemini models).

    Calls Google Gemini APIs; documentation at https://ai.google.dev/api/rest and tutorial at
    https://ai.google.dev/tutorials/rest_quickstart

    The API key is read from the environment variable GEMINI_API_KEY (or, deprecated, GOOGLE_AI_API_KEY).
    """

    __kind__ = 'ai_google'

    __supported_directives__: dict[str, str] = {
        'model': 'model name from https://ai.google.dev/gemini-api/docs/models/gemini (default: gemini-2.0-flash)',
        'system_instructions': (
            'Optional tone and style instructions for the model (default: see documentation at '
            'https://webchanges.readthedocs.io/en/stable/differs.html#ai-google-diff)'
        ),
        'prompt': 'a custom prompt - {unified_diff}, {unified_diff_new}, {old_text} and {new_text} will be replaced',
        'additions_only': 'summarizes only added lines (including as a result of a change)',
        'prompt_ud_context_lines': 'the number of context lines for {unified_diff} (default: 9999)',
        'timeout': 'the number of seconds before timing out the API call (default: 300)',
        'max_output_tokens': "the maximum number of tokens returned by the model (default: None, i.e. model's default)",
        'temperature': "the model's Temperature parameter (default: 0.0)",
        'top_p': (
            "the model's TopP parameter (default: 1.0 if temperature is 0.0, otherwise None, i.e. model's default)"
        ),
        'top_k': "the model's TopK parameter (default: None, i.e. model's default)",
        'thinking_budget': "the model's thinkingBudget parameter (default: None, i.e. model's default)",
        'tools': "data passed on to the API's 'tools' field (default: None)",
        'unified': 'directives passed to the unified differ (default: None)',
    }
    __default_directive__ = 'model'

    @staticmethod
    def _send_to_model(
        job: JobBase,
        system_instructions: str,
        model_prompt: str,
        additional_parts: list[dict[str, str | dict[str, str]]] | None = None,
        directives: AiGoogleDirectives | None = None,
    ) -> tuple[str, str]:
        """Creates the summary request to the model; returns the summary and the version of the actual model used.

        Failures (missing API key, HTTP errors, empty model output) are not raised: they are returned as a Markdown
        error text in place of the summary.

        :param job: The job being processed (used for log messages).
        :param system_instructions: Text sent in the request's 'system_instruction' field.
        :param model_prompt: Text sent as the first part of the request's 'contents'.
        :param additional_parts: Optional extra parts appended after the prompt in the request's 'contents'.
        :param directives: The differ's directives (model, timeout, generation parameters, tools, etc.).
        :returns: A tuple of the summary (or error text) and the model version; the model version is '' when the API
           key is invalid, and falls back to the requested model name if the API does not report one.
        """
        api_version = '1beta'
        if directives is None:
            directives = {}
        model = directives.get('model', 'gemini-2.0-flash')
        timeout = directives.get('timeout', 300)
        max_output_tokens = directives.get('max_output_tokens')
        temperature = directives.get('temperature', 0.0)
        top_p = directives.get('top_p', 1.0 if temperature == 0.0 else None)
        top_k = directives.get('top_k')
        # GOOGLE_AI_API_KEY deprecated end of 2025
        gemini_api_key = os.environ.get('GEMINI_API_KEY', '').rstrip()
        if not gemini_api_key:
            gemini_api_key = os.environ.get('GOOGLE_AI_API_KEY', '').rstrip()
            if gemini_api_key:
                warnings.warn(
                    'The environment variable GOOGLE_AI_API_KEY is deprecated; please use GEMINI_API_KEY instead.',
                    DeprecationWarning,
                    stacklevel=1,
                )
        if len(gemini_api_key) != 39:
            logger.error(
                f'Job {job.index_number}: Environment variable GEMINI_API_KEY not found or is of the '
                f'incorrect length {len(gemini_api_key)} ({job.get_location()})'
            )
            return (
                f'## ERROR in summarizing changes using Google AI:\n'
                f'Environment variable GEMINI_API_KEY not found or is of the incorrect length '
                f'{len(gemini_api_key)}.',
                '',
            )

        def api_error_message(response: httpx.Response) -> str:
            """Extracts the API's error message from a response, tolerating bodies that are not the expected JSON."""
            try:
                return response.json().get('error', {}).get('message') or ''
            except (ValueError, AttributeError):  # not JSON, or JSON that is not an object
                return ''

        data: dict[str, Any] = {
            'system_instruction': {'parts': [{'text': system_instructions}]},
            'contents': [{'parts': [{'text': model_prompt}]}],
            'generationConfig': {
                'maxOutputTokens': max_output_tokens,
                'temperature': temperature,
                'topP': top_p,
                'topK': top_k,
            },
        }
        if additional_parts:
            data['contents'][0]['parts'].extend(additional_parts)
        if directives.get('tools'):
            data['tools'] = directives['tools']
        # Test against None (not truthiness) so that an explicit budget of 0 is passed on to the API.
        if directives.get('thinking_budget') is not None:
            data['generationConfig'].update({'thinkingConfig': {'thinkingBudget': directives['thinking_budget']}})
        logger.info(f'Job {job.index_number}: Making the content generation request to Google AI model {model}')
        model_version = model  # default
        try:
            # The context manager closes the client's connections even when the request raises.
            # HTTP/2 is only requested when the optional 'h2' package is installed.
            with httpx.Client(http2=h2 is not None) as client:
                r = client.post(
                    f'https://generativelanguage.googleapis.com/v{api_version}/models/{model}:generateContent',
                    json=data,
                    # The key is sent as a header rather than in the URL so that it does not end up in logged URLs.
                    headers={'Content-Type': 'application/json', 'x-goog-api-key': gemini_api_key},
                    timeout=timeout,
                )
            if r.is_success:
                result = r.json()
                # A response may carry no candidates at all; fall through to the error text below in that case.
                candidate = (result.get('candidates') or [{}])[0]
                finish_reason = candidate.get('finishReason', 'unknown')
                model_version = result.get('modelVersion', model)
                usage = result.get('usageMetadata', {})
                logger.info(f'Job {job.index_number}: AI generation finished by {finish_reason} using {model_version}')
                logger.debug(
                    f'Job {job.index_number}: Used {usage.get("totalTokenCount", 0):,} tokens, '
                    f'{usage.get("promptTokenCount", 0):,} of which for the prompt.'
                )
                if 'content' in candidate:
                    if 'parts' in candidate['content']:
                        summary: str = candidate['content']['parts'][0]['text'].rstrip()
                    else:
                        summary = (
                            f'## ERROR in summarizing changes using Google AI:\n'
                            f'Model did not return any candidate output:\n'
                            f'finishReason={finish_reason}\n'
                            f'{jsonlib.dumps(usage, ensure_ascii=True, indent=2)}'
                        )
                else:
                    summary = (
                        f'## ERROR in summarizing changes using Google AI:\n'
                        f'Model did not return any candidate output:\n'
                        f'{jsonlib.dumps(result, ensure_ascii=True, indent=2)}'
                    )

            elif r.status_code == 400:
                summary = (
                    f'## ERROR in summarizing changes using Google AI:\n'
                    f'Received error from {r.url.host}: '
                    f'{api_error_message(r)}'
                )
            else:
                summary = (
                    f'## ERROR in summarizing changes using Google AI:\n'
                    f'Received error {r.status_code} {r.reason_phrase} from '
                    f'{r.url.host}'
                )
                if r.content:
                    summary += f': {api_error_message(r)}'

        except httpx.HTTPError as e:
            summary = (
                f'## ERROR in summarizing changes using Google AI:\n'
                f'HTTP client error: {e} when requesting data from '
                f'{e.request.url.host}'
            )

        return summary, model_version

    def differ(
        self,
        directives: AiGoogleDirectives,  # type: ignore[override]
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Builds a report made of an AI-generated summary of the changes followed by a unified diff.

        :param directives: The differ's directives (see __supported_directives__).
        :param report_kind: The kind of report requested (all three kinds are returned regardless).
        :param _unfiltered_diff: Not used by this differ.
        :param tz: The timezone passed on to the unified differ.
        :returns: A dict with the 'text', 'markdown' and 'html' reports; all are empty strings (and the state's verb
           is set to 'changed,no_report') when there is nothing to summarize.
        """
        logger.info(f'Job {self.job.index_number}: Running the {self.__kind__} differ')
        warnings.warn(
            f'Job {self.job.index_number}: Using differ {self.__kind__}, which is BETA, may have bugs, and may '
            f'change in the future. Please report any problems or suggestions at '
            f'https://github.com/mborsetti/webchanges/discussions.',
            RuntimeWarning,
            stacklevel=1,
        )

        def get_ai_summary(prompt: str, system_instructions: str) -> tuple[str, str]:
            """Generate AI summary from unified diff, or an error message, plus the model version."""
            # The key is validated here too so that no diff is computed when the request cannot be made.
            # GOOGLE_AI_API_KEY deprecated end of 2025
            gemini_api_key = os.environ.get('GEMINI_API_KEY', '').rstrip()
            if not gemini_api_key:
                gemini_api_key = os.environ.get('GOOGLE_AI_API_KEY', '').rstrip()
                if gemini_api_key:
                    warnings.warn(
                        'The environment variable GOOGLE_AI_API_KEY is deprecated; please use GEMINI_API_KEY instead.',
                        DeprecationWarning,
                        stacklevel=1,
                    )
            if len(gemini_api_key) != 39:
                logger.error(
                    f'Job {self.job.index_number}: Environment variable GEMINI_API_KEY not found or is of the '
                    f'incorrect length {len(gemini_api_key)} ({self.job.get_location()})'
                )
                return (
                    f'## ERROR in summarizing changes using Google AI:\n'
                    f'Environment variable GEMINI_API_KEY not found or is of the incorrect length '
                    f'{len(gemini_api_key)}.\n',
                    '',
                )

            unified_diff_lines: list[str] = []
            if '{unified_diff' in prompt:  # matches unified_diff or unified_diff_new
                default_context_lines = 9999 if '{unified_diff}' in prompt else 0  # none if only unified_diff_new
                context_lines = directives.get('prompt_ud_context_lines', default_context_lines)
                unified_diff_lines = list(
                    difflib.unified_diff(
                        str(self.state.old_data).splitlines(),
                        str(self.state.new_data).splitlines(),
                        n=context_lines,
                        # The inputs have no trailing newlines, so the control lines must not have any either.
                        lineterm='',
                    )
                )
                if not unified_diff_lines:
                    # no changes
                    return '', ''
            unified_diff = '\n'.join(unified_diff_lines)

            if '{unified_diff_new}' in prompt:
                # The first two lines are the '---' and '+++' file headers, not content: skip them so that the
                # '+++' header is not mistaken for an added line.
                unified_diff_new = '\n'.join(line[1:] for line in unified_diff_lines[2:] if line.startswith('+'))
            else:
                unified_diff_new = ''

            # check if data is different (same data is sent during testing)
            if '{old_text}' in prompt and '{new_text}' in prompt and self.state.old_data == self.state.new_data:
                return '', ''

            model_prompt = prompt.format(
                unified_diff=unified_diff,
                unified_diff_new=unified_diff_new,
                old_text=self.state.old_data,
                new_text=self.state.new_data,
            )

            summary, model_version = self._send_to_model(
                self.job,
                system_instructions,
                model_prompt,
                directives=directives,
            )

            return summary, model_version

        default_system_instructions = ''
        if directives.get('additions_only') or self.job.additions_only:
            default_prompt = '\n'.join(
                (
                    'You are an expert analyst AI, specializing in the meticulous summarization of change documents. '
                    'Your task is to summarize the provided unified diff in a clear and concise manner with 100% '
                    'fidelity. Restrict your analysis and summary *only* to the diff provided. Do not introduce any '
                    'external information or assumptions.',
                    '',
                    'Format your summary using Markdown. Use headings, bullet points, and other Markdown elements '
                    'where appropriate to create a well-structured and easily readable summary.',
                    '',
                    '{unified_diff_new}',
                )
            )
        else:
            default_prompt = '\n'.join(
                (
                    'You are an expert analyst AI, specializing in the meticulous comparison of documents. Your task '
                    'is to identify and summarize only the substantive differences between two versions of a text. '
                    'Your audience is already familiar with the original document and needs a concise summary of the '
                    'most significant changes in meaning or information.',
                    '',
                    '**Instructions:**',
                    '',
                    '1.  **Analyze the Texts:** Carefully review the document provided in the `<old_version>` and '
                    '`</old_version>` tags and the one in the `<new_version>` and `</new_version>` tags.',
                    '',
                    '2.  **Identify Substantive Changes:** Compare the two versions to identify all substantive '
                    'changes. A "substantive change" is defined as any modification that alters the core meaning, '
                    'intent, instructions, or factual information presented in the text. This includes, but is not '
                    'limited to:',
                    '*   Additions of new concepts, data, or requirements.',
                    '*   Deletions of existing information, arguments, or clauses.',
                    '*   Alterations to definitions, conclusions, instructions, or key takeaways.',
                    '',
                    '3.  **Exclude Non-Substantive Changes:** You must disregard any changes that are purely cosmetic, '
                    'typographical, or structural and do not alter the substantive meaning of the document. Explicitly '
                    'ignore the following:',
                    '*   Changes in page numbers, section/chapter numbering, or paragraph numbering.',
                    '*   Corrections of spelling, punctuation, or grammatical errors.',
                    '*   Modifications in formatting, layout, or font.',
                    '*   Rewording or rephrasing that does not change the underlying meaning or intent.',
                    '',
                    '4.  **Summarize Material Differences:** Create a summary of the identified substantive changes '
                    'with 100% fidelity. For each change, provide:',
                    '*   A clear heading identifying the relevant section (e.g., "Section 4: User Guidelines" or '
                    '"Chapter on Methodology").',
                    '*   A concise description of the modification, explaining whether it is an addition, deletion, or '
                    'alteration.',
                    '*   A brief analysis of how the change impacts the overall message or instructions, if not '
                    'immediately obvious.',
                    '',
                    '5.  **Output Format:**',
                    '*   Use Markdown for clear and structured presentation (e.g., headings and bullet points).',
                    '*   If no substantive changes are found, state this clearly.',
                    '*   If the changes consist only of additions, summarize the new content.',
                    '',
                    '6.  **Scope Limitation:** Base your analysis strictly on the provided text excerpts. Do not '
                    'infer or introduce any external context or information.',
                    '',
                    '<old_version>',
                    '{old_text}',
                    '</old_version>',
                    '',
                    '<new_version>',
                    '{new_text}',
                    '</new_version>',
                )
            )

        system_instructions = directives.get('system_instructions', default_system_instructions)
        # A literal backslash-n typed in a custom prompt is turned into a real newline.
        prompt = directives.get('prompt', default_prompt).replace('\\n', '\n')
        summary, model_version = get_ai_summary(prompt, system_instructions)
        if not summary:
            self.state.verb = 'changed,no_report'
            return {'text': '', 'markdown': '', 'html': ''}
        newline = '\n'  # For Python < 3.12 f-string {} compatibility
        back_n = '\\n'  # For Python < 3.12 f-string {} compatibility
        # The model is reported separately (as the version actually used) and a custom prompt is not echoed.
        directives_for_str = {key: value for key, value in directives.items() if key != 'model'}
        if 'prompt' in directives_for_str:
            directives_for_str['prompt'] = '«custom»'
        directives_text = (
            (
                ' (differ directive(s): '
                + ', '.join(f'{key}={str(value).replace(newline, back_n)}' for key, value in directives_for_str.items())
                + ')'
            )
            if directives_for_str
            else ''
        )
        footer = (
            f"Summary by Google Generative AI's model {model_version}{directives_text}."
            if model_version or directives_text
            else ''
        )
        temp_unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
        # NOTE(review): only the result of the last call is kept, yet its 'text', 'markdown' and 'html' entries are
        # all read below; this presumably relies on DifferBase.process() accumulating report kinds across calls
        # (e.g. through temp_unfiltered_diff) -- confirm against DifferBase.process().
        for rep_kind in ['text', 'html']:  # markdown is same as text
            unified_report = DifferBase.process(
                'unified',
                directives.get('unified') or {},  # type: ignore[arg-type]
                self.state,
                rep_kind,  # type: ignore[arg-type]
                tz,
                temp_unfiltered_diff,
            )
        return {
            'text': (f'{summary}\n\n{unified_report["text"]}' + (f'\n------------\n{footer}' if footer else '')),
            'markdown': (f'{summary}\n\n{unified_report["markdown"]}' + (f'\n* * *\n{footer}' if footer else '')),
            'html': '\n'.join(
                [
                    mark_to_html(summary, extras={'tables'}).replace('<h2>', '<h3>').replace('</h2>', '</h3>'),
                    '<br>',
                    '<br>',
                    unified_report['html'],
                ]
                + (['-----<br>', f'<i><small>{footer}</small></i>'] if footer else [])
            ),
        }
1927

1928

1929
class WdiffDiffer(DifferBase):
    """Word-by-word differ highlighting additions and deletions inline (ANSI colors for text, <span>s for HTML).

    WORK IN PROGRESS: a RuntimeWarning is emitted on every use because the results are known to be incorrect.
    """

    __kind__ = 'wdiff'

    __supported_directives__: dict[str, str] = {
        'context_lines': 'the number of context lines (default: 3)',
        'range_info': 'include range information lines (default: true)',
    }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Builds the word-level diff reports.

        :param directives: The differ's directives ('context_lines' and 'range_info').
        :param report_kind: The kind of report requested (all three kinds are returned regardless).
        :param _unfiltered_diff: Not used by this differ.
        :param tz: The timezone used for the timestamps in the report's header.
        :returns: A dict with the 'text', 'markdown' and 'html' reports ('markdown' is identical to 'text').
        :raises ValueError: If the old or the new data is not a string.
        """
        warnings.warn(
            f'Job {self.job.index_number}: Differ {self.__kind__} is WORK IN PROGRESS and has KNOWN bugs which '
            "are being worked on. DO NOT USE AS THE RESULTS WON'T BE CORRECT.",
            RuntimeWarning,
            stacklevel=1,
        )
        if not isinstance(self.state.old_data, str):
            raise ValueError(f'Differ {self.__kind__} requires the old data to be a string.')
        if not isinstance(self.state.new_data, str):
            raise ValueError(f'Differ {self.__kind__} requires the new data to be a string.')

        # Split the texts into words tokenizing newline
        if self.state.is_markdown():
            # Don't split spaces in link text, tokenize space as </s>
            old_data = re.sub(r'\[(.*?)\]', lambda x: '[' + x.group(1).replace(' ', '</s>') + ']', self.state.old_data)
            words1 = old_data.replace('\n', ' <\\n> ').split(' ')
            new_data = re.sub(r'\[(.*?)\]', lambda x: '[' + x.group(1).replace(' ', '</s>') + ']', self.state.new_data)
            words2 = new_data.replace('\n', ' <\\n> ').split(' ')
        else:
            words1 = self.state.old_data.replace('\n', ' <\\n> ').split(' ')
            words2 = self.state.new_data.replace('\n', ' <\\n> ').split(' ')

        # Create a Differ object (difflib is imported at module level)
        d = difflib.Differ()

        # Generate a difference list; each entry is a 2-character code ('+ ', '- ', '  ' or '? ') followed by a word
        diff = list(d.compare(words1, words2))

        add_html = '<span style="background-color:#d1ffd1;color:#082b08;">'
        rem_html = '<span style="background-color:#fff0f0;color:#9c1c1c;text-decoration:line-through;">'

        head_text = '\n'.join(
            [
                # f'Differ: wdiff',
                f'\033[91m--- @ {self.make_timestamp(self.state.old_timestamp, tz)}\033[0m',
                f'\033[92m+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}\033[0m',
                '',
            ]
        )
        # Adjacent string literals without a comma in between are deliberately concatenated: the outer monospace
        # <span> opens before the first header line and closes after the second one.
        head_html = '<br>\n'.join(
            [
                '<span style="font-family:monospace;">'
                # 'Differ: wdiff',
                f'<span style="color:darkred;">--- @ {self.make_timestamp(self.state.old_timestamp, tz)}</span>',
                f'<span style="color:darkgreen;">+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}</span>'
                f'</span>',
                '',
            ]
        )
        # Process the diff output to make it more wdiff-like
        # Each word is held back by one iteration (in prev_word_*) so that a closing marker can still be appended to
        # it once the following word shows that a run of additions or deletions has ended.
        result_text = []
        result_html = []
        prev_word_text = ''
        prev_word_html = ''
        next_text = ''  # opening marker deferred to the next word when a run starts on a newline token
        next_html = ''
        add = False  # currently inside a run of additions
        rem = False  # currently inside a run of deletions

        for word_text in [*diff, '  ']:  # the extra '  ' is an empty unchanged word that flushes the last real word
            if word_text[0] == '?':  # additional context line
                continue
            word_html = word_text
            pre_text = [next_text] if next_text else []
            pre_html = [next_html] if next_html else []
            next_text = ''
            next_html = ''

            if word_text[0] == '+' and not add:  # Beginning of additions
                if rem:
                    prev_word_html += '</span>'
                    rem = False
                if word_text[2:] == '<\\n>':
                    next_text = '\033[92m'
                    next_html = add_html
                else:
                    pre_text.append('\033[92m')
                    pre_html.append(add_html)
                add = True
            elif word_text[0] == '-' and not rem:  # Beginning of deletions
                if add:
                    prev_word_html += '</span>'
                    add = False
                if word_text[2:] == '<\\n>':
                    next_text = '\033[91m'
                    next_html = rem_html
                else:
                    pre_text.append('\033[91m')
                    pre_html.append(rem_html)
                rem = True
            elif word_text[0] == ' ' and (add or rem):  # Unchanged word
                if prev_word_text == '<\\n>':
                    prev_word_text = '\033[0m<\\n>'
                    prev_word_html = '</span><\\n>'
                else:
                    prev_word_text += '\033[0m'
                    prev_word_html += '</span>'
                add = False
                rem = False
            elif word_text[2:] == '<\\n>':  # New line
                if add:
                    word_text = '  \033[0m<\\n>'
                    word_html = '  </span><\\n>'
                    add = False
                elif rem:
                    word_text = '  \033[0m<\\n>'
                    word_html = '  </span><\\n>'
                    rem = False

            result_text.append(prev_word_text)
            result_html.append(prev_word_html)
            pre_text.append(word_text[2:])
            pre_html.append(word_html[2:])
            prev_word_text = ''.join(pre_text)
            prev_word_html = ''.join(pre_html)
        if add or rem:
            result_text[-1] += '\033[0m'
            result_html[-1] += '</span>'

        # rebuild the text from words, replacing the newline token
        # (the first element is the initial empty prev_word and is dropped)
        diff_text = ' '.join(result_text[1:]).replace('<\\n> ', '\n').replace('<\\n>', '\n')
        diff_html = ' '.join(result_html[1:]).replace('<\\n> ', '\n').replace('<\\n>', '\n')

        # build contextlines
        contextlines = directives.get('context_lines', self.job.contextlines)
        # contextlines = 999
        if contextlines is None:
            contextlines = 3
        range_info = directives.get('range_info', True)
        diff_text_lines = diff_text.splitlines()
        if contextlines < len(diff_text_lines):
            # A line counts as changed when it contains the start of an ANSI color sequence
            lines_with_changes = [i for i, line in enumerate(diff_text_lines) if '\033[9' in line]
            if contextlines:
                lines_to_keep: set[int] = set()
                for i in lines_with_changes:
                    lines_to_keep.update(range(i - contextlines, i + contextlines + 1))
            else:
                lines_to_keep = set(lines_with_changes)
            new_diff_text = []
            new_diff_html = []
            last_line = 0
            skip = False
            i = 0
            for i, (line_text, line_html) in enumerate(zip(diff_text_lines, diff_html.splitlines(), strict=False)):
                if i in lines_to_keep:
                    if range_info and skip:
                        # range (1-based) of the lines that were left out just before this one
                        new_diff_text.append(f'@@ {last_line + 1}...{i} @@')
                        new_diff_html.append(f'@@ {last_line + 1}...{i} @@')
                        skip = False
                    new_diff_text.append(line_text)
                    new_diff_html.append(line_html)
                    last_line = i + 1
                else:
                    skip = True
            if (i + 1) != last_line:
                # trailing lines were left out
                if range_info and skip:
                    new_diff_text.append(f'@@ {last_line + 1}...{i + 1} @@')
                    new_diff_html.append(f'@@ {last_line + 1}...{i + 1} @@')
            diff_text = '\n'.join(new_diff_text)
            diff_html = '\n'.join(new_diff_html)

        if self.state.is_markdown():
            # restore the spaces protected inside link text
            diff_text = diff_text.replace('</s>', ' ')
            diff_html = diff_html.replace('</s>', ' ')
            diff_html = mark_to_html(diff_html, self.job.markdown_padded_tables).replace('<p>', '').replace('</p>', '')

        if self.job.monospace:
            diff_html = f'<span style="font-family:monospace;white-space:pre-wrap">{diff_html}</span>'
        else:
            diff_html = diff_html.replace('\n', '<br>\n')

        return {
            'text': head_text + diff_text,
            'markdown': head_text + diff_text,
            'html': head_html + diff_html,
        }
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc