• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

mborsetti / webchanges / 17710149774

14 Sep 2025 10:49AM UTC coverage: 71.376% (-3.1%) from 74.434%
17710149774

push

github

mborsetti
Version 3.31.1.post2

1383 of 2314 branches covered (59.77%)

Branch coverage included in aggregate %.

4614 of 6088 relevant lines covered (75.79%)

5.87 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

70.07
/webchanges/differs.py
1
"""Differs."""
2

3
# The code below is subject to the license contained in the LICENSE file, which is part of the source code.
4

5
from __future__ import annotations
8✔
6

7
import base64
8✔
8
import difflib
8✔
9
import html
8✔
10
import logging
8✔
11
import math
8✔
12
import os
8✔
13
import re
8✔
14
import shlex
8✔
15
import subprocess
8✔
16
import sys
8✔
17
import tempfile
8✔
18
import traceback
8✔
19
import urllib.parse
8✔
20
import warnings
8✔
21
from base64 import b64encode
8✔
22
from concurrent.futures import ThreadPoolExecutor
8✔
23
from datetime import datetime
8✔
24
from io import BytesIO
8✔
25
from pathlib import Path
8✔
26
from typing import TYPE_CHECKING, Any, Iterator, Literal, TypedDict
8✔
27
from xml.parsers.expat import ExpatError
8✔
28
from zoneinfo import ZoneInfo
8✔
29

30
import html2text
8✔
31
import yaml
8✔
32

33
from webchanges.jobs import JobBase
8✔
34
from webchanges.util import TrackSubClasses, linkify, mark_to_html
8✔
35

36
try:
8✔
37
    from deepdiff import DeepDiff
8✔
38
    from deepdiff.model import DiffLevel
8✔
39
except ImportError as e:  # pragma: no cover
40
    DeepDiff = str(e)  # type: ignore[assignment,misc]
41

42
try:
8✔
43
    import httpx
8✔
44
except ImportError:  # pragma: no cover
45
    httpx = None  # type: ignore[assignment]
46
if httpx is not None:
8!
47
    try:
8✔
48
        import h2
8✔
49
    except ImportError:  # pragma: no cover
50
        h2 = None  # type: ignore[assignment]
51

52
try:
8✔
53
    import numpy as np
8✔
54
except ImportError as e:  # pragma: no cover
55
    np = str(e)  # type: ignore[assignment]
56

57
try:
8✔
58
    from PIL import Image, ImageChops, ImageEnhance, ImageStat
8✔
59
except ImportError as e:  # pragma: no cover
60
    Image = str(e)  # type: ignore[assignment]
61

62
# https://stackoverflow.com/questions/712791
63
try:
8✔
64
    import simplejson as jsonlib
8✔
65
except ImportError:  # pragma: no cover
66
    import json as jsonlib
67

68
try:
8✔
69
    import xmltodict
8✔
70
except ImportError as e:  # pragma: no cover
71
    xmltodict = str(e)  # type: ignore[assignment]
72

73
# https://stackoverflow.com/questions/39740632
74
if TYPE_CHECKING:
75
    from webchanges.handler import JobState
76
    from webchanges.storage import _ConfigDifferDefaults
77

78

79
logger = logging.getLogger(__name__)
8✔
80

81
# Typed description of a directives dict for a Google AI based differ.
# NOTE(review): the differ consuming this type is not visible in this part of the file; the meaning of the individual
# keys below is therefore not documented here -- confirm against that differ's ``__supported_directives__``.
# The functional TypedDict syntax is used, so the value types are evaluated at import time.
AiGoogleDirectives = TypedDict(
    'AiGoogleDirectives',
    {
        'model': str,
        'additions_only': str,
        'system_instructions': str,
        'prompt': str,
        'prompt_ud_context_lines': int,
        'timeout': int,
        'max_output_tokens': int | None,
        'temperature': float | None,
        'top_p': float | None,
        'top_k': float | None,
        'thinking_budget': float | None,
        'tools': list[Any],
    },
    total=False,  # every key is optional
)
99

100

101
class DifferBase(metaclass=TrackSubClasses):
    """The base class for differs."""

    # NOTE: class docstrings of differs are emitted verbatim by differ_documentation(); keep them short.

    # Registry of differ classes keyed by their ``__kind__``; normalize_differ() and process() look differs up here.
    # (presumably populated by the TrackSubClasses metaclass -- confirm in webchanges.util)
    __subclasses__: dict[str, type[DifferBase]] = {}
    __anonymous_subclasses__: list[type[DifferBase]] = []

    # The name by which the differ is selected in a job's ``differ`` directive.
    __kind__: str = ''

    # Maps each accepted sub-directive to its help text; the special key '<any>' disables validation.
    __supported_directives__: dict[str, str] = {}  # this must be present, even if empty

    # Inline CSS snippets shared by the differs when generating HTML.
    css_added_style = 'background-color:#d1ffd1;color:#082b08;'
    css_deltd_style = 'background-color:#fff0f0;color:#9c1c1c;text-decoration:line-through;'
    css_remvd_style = 'text-decoration:line-through;'

    def __init__(self, state: JobState) -> None:
        """Store the job state (and its job) the differ operates on.

        :param state: the JobState.
        """
        self.job = state.job
        self.state = state

    @classmethod
    def differ_documentation(cls) -> str:
        """Generates simple differ documentation for use in the --features command line argument.

        :returns: A string to display.
        """
        result: list[str] = []
        for sc in TrackSubClasses.sorted_by_kind(cls):
            result.append(f'  * {sc.__kind__} - {sc.__doc__}')
            if hasattr(sc, '__supported_directives__'):
                result.extend(f'      {key} ... {doc}' for key, doc in sc.__supported_directives__.items())
        result.append('\n[] ... Parameter can be supplied as unnamed value\n')
        return '\n'.join(result)

    @staticmethod
    def debugger_attached() -> bool:
        """Checks if the code is currently running within an external debugger (e.g. IDE).

        The check is whether ``sys.breakpointhook`` has been replaced by a function from a module other than ``sys``.

        :returns: True if an external debugger is attached, False otherwise.
        """
        return sys.breakpointhook.__module__ != 'sys'

    @classmethod
    def normalize_differ(
        cls,
        differ_spec: dict[str, Any] | None,
        job_index_number: int | None = None,
        differ_defaults: _ConfigDifferDefaults | None = None,
    ) -> tuple[str, dict[str, Any]]:
        """Checks the differ_spec for its validity and applies default values.

        :param differ_spec: The differ as entered by the user; use "unified" if empty.
        :param job_index_number: The job index number (only used in error messages).
        :param differ_defaults: The differ defaults from the configuration, keyed by differ kind, if any.
        :returns: A validated differ_kind, directives tuple.
        :raises ValueError: If the differ has no name, is unknown, or is given unsupported sub-directives.
        """

        def directives_with_defaults(
            differ_kind: str, directives: dict[str, Any], differ_defaults: _ConfigDifferDefaults | None = None
        ) -> dict[str, Any]:
            """Obtain differ subdirectives that also contains defaults from the configuration.

            :param differ_kind: The name of the differ whose defaults are looked up.
            :param directives: The differ directives as stated in the job (updated in place at the top level).
            :param differ_defaults: The differ defaults from the configuration.
            :returns: directives inclusive of configuration defaults.
            """
            if differ_defaults is None:
                logger.info('No configuration object found to look for differ defaults')
                return directives

            differ_default = differ_defaults.get(differ_kind, {})
            if not isinstance(differ_default, dict):
                return directives

            # merge defaults from configuration (including dicts) into differ directives without overwriting them
            for key, value in differ_default.items():
                if key not in directives or directives[key] is None:
                    directives[key] = value
                elif isinstance(value, dict) and isinstance(directives[key], dict):
                    # Merge into a copy: 'directives' is only a shallow copy of the user's differ_spec, so writing
                    # into the nested dict would silently modify the job's own data.
                    merged = dict(directives[key])
                    for subkey, subvalue in value.items():
                        merged.setdefault(subkey, subvalue)
                    directives[key] = merged

            return directives

        differ_spec = differ_spec or {'name': 'unified'}
        directives = differ_spec.copy()
        differ_kind = directives.pop('name', '')
        if not differ_kind:
            # shorthand: a spec consisting only of 'command' selects the command differ
            if list(directives) != ['command']:
                raise ValueError(
                    f"Job {job_index_number}: Differ directive must have a 'name' sub-directive: {differ_spec}."
                )
            differ_kind = 'command'

        differcls: type[DifferBase] | None = cls.__subclasses__.get(differ_kind, None)  # type: ignore[assignment]
        if not differcls:
            raise ValueError(f'Job {job_index_number}: No differ named {differ_kind}.')

        directives = directives_with_defaults(differ_kind, directives, differ_defaults)

        if hasattr(differcls, '__supported_directives__'):
            allowed_keys = set(differcls.__supported_directives__)
            unknown_keys = set(directives) - allowed_keys
            if unknown_keys and '<any>' not in allowed_keys:
                # sorted so that the message is deterministic
                raise ValueError(
                    f'Job {job_index_number}: Differ {differ_kind} does not support sub-directive(s) '
                    f'{", ".join(sorted(map(str, unknown_keys)))} (supported: {", ".join(sorted(allowed_keys))}).'
                )

        return differ_kind, directives

    @classmethod
    def process(
        cls,
        differ_kind: str,
        directives: dict[str, Any],
        job_state: JobState,
        report_kind: Literal['text', 'markdown', 'html'] = 'text',
        tz: ZoneInfo | None = None,
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Process the differ.

        If the differ raises, the latest saved snapshot is deleted, the exception and traceback are recorded on
        ``job_state`` and an error message is returned in all three formats instead (unless a debugger is attached,
        in which case the exception is re-raised).

        :param differ_kind: The name of the differ.
        :param directives: The directives.
        :param job_state: The JobState.
        :param report_kind: The report kind required.
        :param tz: The timezone of the report.
        :param _unfiltered_diff: Any previous diffs generated by the same filter, which can be used to generate a diff
           for a different report_kind.
        :returns: The output of the differ or an error message with traceback if it fails; an empty dict if no differ
           is registered under ``differ_kind``.
        """
        logger.info(f'Job {job_state.job.index_number}: Applying differ {differ_kind}, directives {directives}')
        differcls: type[DifferBase] | None = cls.__subclasses__.get(differ_kind)  # type: ignore[assignment]
        if not differcls:
            return {}

        try:
            return differcls(job_state).differ(directives, report_kind, _unfiltered_diff, tz)
        except Exception as e:
            # Differ failed
            if cls.debugger_attached():
                raise
            logger.info(
                f'Job {job_state.job.index_number}: Differ {differ_kind} with {directives=} encountered error {e}'
            )
            # Undo saving of new data since user won't see the diff
            job_state.delete_latest()

            job_state.exception = e
            job_state.traceback = job_state.job.format_error(e, traceback.format_exc())
            directives_text = (
                ', '.join(f'{key}={value}' for key, value in directives.items()) if directives else 'None'
            )
            # Tracebacks routinely contain text such as '<module>' or '<lambda>', and directive values are
            # user-supplied: both must be escaped before being embedded in HTML.
            html_directives_text = html.escape(directives_text)
            html_traceback = html.escape(str(job_state.traceback))
            return {
                'text': (
                    f'Differ {differ_kind} with directive(s) {directives_text} encountered an '
                    f'error:\n\n{job_state.traceback}'
                ),
                'markdown': (
                    f'## Differ {differ_kind} with directive(s) {directives_text} '
                    f'encountered an error:\n```\n{job_state.traceback}\n```\n'
                ),
                'html': (
                    f'<span style="color:red;font-weight:bold">Differ {differ_kind} with directive(s) '
                    f'{html_directives_text} encountered an error:<br>\n<br>\n'
                    f'<span style="font-family:monospace;white-space:pre-wrap;">{html_traceback}'
                    f'</span></span>'
                ),
            }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Generate a formatted diff representation of data changes.

        Creates a diff representation in one or more output formats (text, markdown, or HTML).
        At minimum, this function must return output in the format specified by 'report_kind'.
        As results are memoized for performance optimization, it can generate up to all three formats simultaneously.

        The data to compare is taken from ``self.state``. This base implementation must be overridden.

        :param directives: The directives.
        :param report_kind: The report_kind for which a diff must be generated (at a minimum).
        :param _unfiltered_diff: Any previous diffs generated by the same filter, which can be used to generate a diff
           for a different report_kind.
        :param tz: The timezone of the report.
        :returns: An empty dict if there is no change, otherwise a dict with report_kind as key and diff as value
           (as a minimum for the report_kind requested).
        :raises NotImplementedError: Always, in this base class.
        :raises RuntimeError: (In implementations) if the external diff tool returns an error.
        """
        raise NotImplementedError()

    @staticmethod
    def make_timestamp(
        timestamp: float,
        tz: ZoneInfo | None = None,
    ) -> str:
        """Format a timestamp as an RFC 5322 compliant datetime string.

        Converts a numeric timestamp to a formatted datetime string following the RFC 5322 (email) standard. When a
        timezone is provided, its full name, if known, is appended.

        :param timestamp: The timestamp.
        :param tz: The IANA timezone of the report.
        :returns: A datetime string in RFC 5322 (email) format or 'NEW' if timestamp is 0.
        """
        if not timestamp:
            return 'NEW'

        dt = datetime.fromtimestamp(timestamp).astimezone(tz=tz)
        tz_name = dt.strftime('%Z')
        # add timezone name if known, i.e. when %Z is not merely the numeric offset (e.g. '+03')
        cfws = f' ({tz_name})' if tz_name != dt.strftime('%z')[:3] else ''
        return dt.strftime('%a, %d %b %Y %H:%M:%S %z') + cfws

    @staticmethod
    def html2text(data: str) -> str:
        """Converts html to text.

        Uses the html2text package and strips trailing whitespace from every line of its output.

        :param data: the string in html format.
        :returns: the string in text format.
        """
        # inside the function body 'html2text' resolves to the imported module, not to this method
        parser = html2text.HTML2Text()
        parser.unicode_snob = True
        parser.body_width = 0
        parser.ignore_images = True
        parser.single_line_break = True
        parser.wrap_links = False
        return '\n'.join(line.rstrip() for line in parser.handle(data).splitlines())

    def raise_import_error(self, package_name: str, error_message: str) -> None:
        """Raise ImportError for missing package.

        :param package_name: The name of the module/package that could not be imported.
        :param error_message: The error message from ImportError.

        :raises: ImportError.
        """
        raise ImportError(
            f"Job {self.job.index_number}: Python package '{package_name}' is not installed; cannot use "
            f"'differ: {self.__kind__}' ({self.job.get_location()})\n{error_message}"
        )
363

364

365
class UnifiedDiffer(DifferBase):
    """(Default) Generates a unified diff."""

    __kind__ = 'unified'

    __supported_directives__: dict[str, str] = {
        'context_lines': 'the number of context lines (default: 3)',
        'range_info': 'include range information lines (default: true)',
        'additions_only': 'keep only addition lines (default: false)',
        'deletions_only': 'keep only deletion lines (default: false)',
    }

    def unified_diff_to_html(self, diff: str) -> Iterator[str]:
        """
        Generates a colorized HTML table from unified diff, applying styles and processing based on job values.

        :param diff: the unified diff
        :returns: An iterator over the lines of HTML: the opening table tag, one table row per diff line, and the
           closing table tag.
        """

        def process_line(line: str, line_num: int, is_markdown: bool, monospace_style: str) -> str:
            """
            Processes each line for HTML output, handling special cases and styles.

            :param line: The line to analyze.
            :param line_num: The line number in the document.
            :param is_markdown: Whether the line's content is to be rendered as Markdown.
            :param monospace_style: Additional style string for monospace text.

            :returns: The line processed into an HTML table row string.
            :raises RuntimeError: If a line past the two header lines does not start with a known marker.
            """
            # Slice instead of indexing so that an empty line reaches the RuntimeError below rather than raising
            # an unrelated IndexError.
            marker = line[:1]

            # The style= string (or empty string) to add to an HTML tag.
            if line_num == 0:
                style = 'font-family:monospace;color:darkred;'
            elif line_num == 1:
                style = 'font-family:monospace;color:darkgreen;'
            elif marker == '+':  # addition
                style = f'{monospace_style}{self.css_added_style}'
            elif marker == '-':  # deletion
                style = f'{monospace_style}{self.css_deltd_style}'
            elif marker == ' ':  # context line
                style = monospace_style
            elif marker == '@':  # range information
                style = 'font-family:monospace;background-color:#fbfbfb;'
            elif marker == '/':  # informational header added by additions_only or deletions_only filters
                style = 'background-color:lightyellow;'
            else:
                raise RuntimeError('Unified Diff does not comform to standard!')
            style = f' style="{style}"' if style else ''

            if line_num > 1 and marker != '@':  # don't apply to headers or range information
                if is_markdown or marker == '/':  # our informational header
                    line = mark_to_html(line[1:], self.job.markdown_padded_tables)
                else:
                    line = linkify(line[1:])
            return f'<tr><td{style}>{line}</td></tr>'

        yield '<table style="border-collapse:collapse;">'
        is_markdown = self.state.is_markdown()
        monospace_style = 'font-family:monospace;' if self.job.monospace else ''
        for i, line in enumerate(diff.splitlines()):
            yield process_line(line, i, is_markdown, monospace_style)
        yield '</table>'

    @staticmethod
    def _keep_marked_lines(diff: list[str], marker: str) -> list[str]:
        """Keep only the lines starting with ``marker`` plus the range lines that still introduce such lines.

        :param diff: The lines of the unified diff (headers included).
        :param marker: '+' to keep additions, '-' to keep deletions.
        :returns: The filtered lines; the header line starting with ``marker`` is therefore retained as first line.
        """
        kept = [line for line in diff if line.startswith((marker, '@'))]
        # a range line directly followed by another range line has lost all its content: drop it
        kept = [
            line
            for line, next_line in zip(kept, [*kept[1:], ''])
            if not (line.startswith('@') and next_line.startswith('@'))
        ]
        # same for a range line left dangling at the end
        if kept and kept[-1].startswith('@'):
            kept.pop()
        return kept

    @staticmethod
    def _nothing_to_report(diff: list[str], marker: str) -> bool:
        """Tell whether a filtered diff holds no reportable line.

        :param diff: The lines returned by :meth:`_keep_marked_lines`.
        :param marker: The marker used for filtering.
        :returns: True if only the header is left, or if exactly two lines (e.g. header and one range line) are
           non-blank once the marker is removed.
        """
        return len(diff) == 1 or sum(1 for line in diff if line.removeprefix(marker).rstrip()) == 2

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Generate a unified diff between the old and the new data of the job state.

        :param directives: The directives ('context_lines', 'range_info', 'additions_only', 'deletions_only').
        :param report_kind: The report_kind for which a diff must be generated (at a minimum).
        :param _unfiltered_diff: Any previously generated diffs; its 'text' entry is reused when HTML is requested.
        :param tz: The timezone of the report.
        :returns: A dict with the 'text' and 'markdown' diff (unless reused from _unfiltered_diff) and, if
           report_kind is 'html', the 'html' diff. If there is nothing to report, all three values are empty
           strings and the state's verb is set to 'changed,no_report'.
        """
        additions_only = directives.get('additions_only') or self.job.additions_only
        deletions_only = directives.get('deletions_only') or self.job.deletions_only
        out_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
        if report_kind == 'html' and _unfiltered_diff is not None and 'text' in _unfiltered_diff:
            diff_text = _unfiltered_diff['text']
        else:
            empty_return: dict[Literal['text', 'markdown', 'html'], str] = {'text': '', 'markdown': '', 'html': ''}
            contextlines = directives.get('context_lines', self.job.contextlines)
            if contextlines is None:
                # context lines are pointless when only one side of the change is shown
                contextlines = 0 if additions_only or deletions_only else 3
            diff = list(
                difflib.unified_diff(
                    str(self.state.old_data).splitlines(),
                    str(self.state.new_data).splitlines(),
                    '@',
                    '@',
                    self.make_timestamp(self.state.old_timestamp, tz),
                    self.make_timestamp(self.state.new_timestamp, tz),
                    contextlines,
                    lineterm='',
                )
            )
            if not diff:
                self.state.verb = 'changed,no_report'
                return empty_return
            # replace tabs in header lines
            diff[0] = diff[0].replace('\t', ' ')
            diff[1] = diff[1].replace('\t', ' ')

            if additions_only:
                if len(self.state.old_data) and len(self.state.new_data) / len(self.state.old_data) <= 0.25:
                    diff = [
                        *diff[:2],
                        '/**Comparison type: Additions only**',
                        '/**Deletions are being shown as 75% or more of the content has been deleted**',
                        *diff[2:],
                    ]
                else:
                    # the '---' header is filtered out below: keep a copy to put it back
                    head = '---' + diff[0][3:]
                    diff = self._keep_marked_lines(diff, '+')
                    if self._nothing_to_report(diff, '+'):
                        self.state.verb = 'changed,no_report'
                        return empty_return
                    diff = [head, diff[0], '/**Comparison type: Additions only**', *diff[1:]]
            elif deletions_only:
                # The '+++' header is filtered out below: keep a copy to put it back as the second header line.
                # (Previously rebuilt as '--- @' + diff[1][3:], which yielded a malformed '--- @ @ <timestamp>'.)
                head = '+++' + diff[1][3:]
                diff = self._keep_marked_lines(diff, '-')
                if self._nothing_to_report(diff, '-'):
                    self.state.verb = 'changed,no_report'
                    return empty_return
                diff = [diff[0], head, '/**Comparison type: Deletions only**', *diff[1:]]

            # remove range info lines if needed
            if directives.get('range_info') is False or (
                directives.get('range_info') is None and additions_only and (len(diff) < 4 or diff[3][0] != '/')
            ):
                diff = [line for line in diff if not line.startswith('@@ ')]

            diff_text = '\n'.join(diff)

            out_diff.update(
                {
                    'text': diff_text,
                    'markdown': diff_text,
                }
            )

        if report_kind == 'html':
            out_diff['html'] = '\n'.join(self.unified_diff_to_html(diff_text))

        return out_diff
8✔
526

527

528
class TableDiffer(DifferBase):
    """Generates a Python HTML table diff."""

    __kind__ = 'table'

    __supported_directives__: dict[str, str] = {
        'tabsize': 'tab stop spacing (default: 8)',
    }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Generate a side-by-side table diff with :class:`difflib.HtmlDiff`.

        :param directives: The directives ('tabsize').
        :param report_kind: The report_kind for which a diff must be generated (at a minimum).
        :param _unfiltered_diff: Any previously generated diffs; its 'html' entry is reused when text or markdown is
           requested.
        :param tz: The timezone of the report.
        :returns: A dict with the 'html' table (unless reused from _unfiltered_diff) and, if report_kind is 'text' or
           'markdown', its conversion to text under both the 'text' and 'markdown' keys.
        """
        out_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
        wants_text = report_kind in {'text', 'markdown'}
        if wants_text and _unfiltered_diff is not None and 'html' in _unfiltered_diff:
            table = _unfiltered_diff['html']
        else:
            # a directive explicitly set to None (e.g. an empty YAML value) falls back to the default
            tabsize = directives.get('tabsize')
            tabsize = 8 if tabsize is None else int(tabsize)
            html_diff = difflib.HtmlDiff(tabsize=tabsize)
            table = html_diff.make_table(
                str(self.state.old_data).splitlines(keepends=True),
                str(self.state.new_data).splitlines(keepends=True),
                self.make_timestamp(self.state.old_timestamp, tz),
                self.make_timestamp(self.state.new_timestamp, tz),
                context=True,
                numlines=3,
            )
            # fix table formatting: replace the class-based styling with inline styles
            table = table.replace('<th ', '<th style="font-family:monospace" ')
            table = table.replace('<td ', '<td style="font-family:monospace" ')
            table = table.replace(' nowrap="nowrap"', '')
            table = table.replace('<a ', '<a style="font-family:monospace;color:inherit" ')
            table = table.replace('<span class="diff_add"', '<span style="color:green;background-color:lightgreen"')
            # 'lightred' is not a CSS color name and would be ignored; use the background difflib's own stylesheet
            # assigns to .diff_sub
            table = table.replace('<span class="diff_sub"', '<span style="color:red;background-color:#ffaaaa"')
            table = table.replace('<span class="diff_chg"', '<span style="color:orange;background-color:lightyellow"')
            out_diff['html'] = table

        if wants_text:
            diff_text = self.html2text(table)
            out_diff.update(
                {
                    'text': diff_text,
                    'markdown': diff_text,
                }
            )

        return out_diff
8✔
578

579

580
class CommandDiffer(DifferBase):
    """Runs an external command to generate the diff.

    The command is invoked with two extra arguments appended: the paths of two temporary files holding the old and
    the new data. A return code of 0 is interpreted as "nothing to report" and a return code of 1 as "differences
    found" (the convention used by ``diff`` and ``wdiff``); any other return code, or any output on stderr, is
    treated as a failure of the command.
    """

    __kind__ = 'command'

    __supported_directives__: dict[str, str] = {
        'command': 'The command to execute',
        'is_html': 'Whether the output of the command is HTML',
    }

    re_ptags = re.compile(r'^<p>|</p>$')
    re_htags = re.compile(r'<(/?)h\d>')
    re_tagend = re.compile(r'<(?!.*<).*>+$')
    # Markdown link: group 1 is the anchor text, group 2 is the target. Compiled once for the whole class.
    _re_markdown_links = re.compile(r'\[(.*?)][(](.*?)[)]')
    # Markers wdiff uses to delimit insertions ({+ ... +}) and deletions ([- ... -]).
    _wdiff_markers = ('{+', '+}', '[-', '-]')

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Generate the diff by running the external command given in the 'command' directive.

        :param directives: The differ directives; 'command' is required, 'is_html' is optional.
        :param report_kind: The kind of report the diff is being generated for.
        :param _unfiltered_diff: Diffs already generated for this job, if any. When an HTML report is requested and a
           'text' diff is present, that text is reused instead of running the command again.
        :param tz: The timezone passed on to make_timestamp for the header lines.

        :returns: A dict of diffs keyed by report kind. All values are empty strings when the command returns 0.
        :raises RuntimeError: If the command writes to stderr or exits with a return code other than 0 or 1
           (including negative codes, i.e. termination by a signal).
        """
        if self.job.monospace:
            # The outer <span> is intentionally left open here: it is closed at the very end of this method so that
            # it wraps the whole diff.
            head_html = '\n'.join(
                [
                    '<span style="font-family:monospace;white-space:pre-wrap;">',
                    f'<span style="color:darkred;">--- @ {self.make_timestamp(self.state.old_timestamp, tz)}</span>',
                    f'<span style="color:darkgreen;">+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}</span>',
                ]
            )
        else:
            head_html = '<br>\n'.join(
                [
                    '<span style="font-family:monospace;">',
                    f'<span style="color:darkred;">--- @ {self.make_timestamp(self.state.old_timestamp, tz)}</span>',
                    f'<span style="color:darkgreen;">+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}</span>',
                    '</span>',
                ]
            )

        out_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
        command = directives['command']
        if report_kind == 'html' and _unfiltered_diff is not None and 'text' in _unfiltered_diff:
            # Reuse the text diff generated earlier, dropping its two header lines (an HTML header is added below).
            diff_text = ''.join(_unfiltered_diff['text'].splitlines(keepends=True)[2:])
        else:
            old_data = self.state.old_data
            new_data = self.state.new_data
            if self.state.is_markdown():
                # protect the link anchor from being split (won't work)
                old_data = self._re_markdown_links.sub(
                    lambda x: f'[{urllib.parse.quote(x.group(1))}]({x.group(2)})', str(old_data)
                )
                new_data = self._re_markdown_links.sub(
                    lambda x: f'[{urllib.parse.quote(x.group(1))}]({x.group(2)})', str(new_data)
                )

            # External diff tool: write both versions to temporary files and pass their paths to the command
            with tempfile.TemporaryDirectory() as tmp_dir:
                tmp_path = Path(tmp_dir)
                old_file_path = tmp_path.joinpath('old_file')
                new_file_path = tmp_path.joinpath('new_file')
                if isinstance(old_data, str):
                    old_file_path.write_text(old_data)
                else:
                    old_file_path.write_bytes(old_data)
                if isinstance(new_data, str):
                    new_file_path.write_text(new_data)
                else:
                    new_file_path.write_bytes(new_data)
                cmdline = [*shlex.split(command), str(old_file_path), str(new_file_path)]
                proc = subprocess.run(cmdline, capture_output=True, text=True)  # noqa: S603 subprocess call
            # Only 0 and 1 are acceptable return codes. A negative return code means the process was terminated by a
            # signal (POSIX), in which case stdout cannot be trusted to be a complete diff.
            if proc.stderr or proc.returncode not in (0, 1):
                raise RuntimeError(
                    f"Job {self.job.index_number}: External differ '{directives}' returned '{proc.stderr.strip()}' "
                    f'({self.job.get_location()})'
                ) from subprocess.CalledProcessError(proc.returncode, cmdline)
            if proc.returncode == 0:
                self.state.verb = 'changed,no_report'
                logger.info(
                    f"Job {self.job.index_number}: Command in differ 'command' returned 0 (no report) "
                    f'({self.job.get_location()})'
                )
                return {'text': '', 'markdown': '', 'html': ''}
            # Header for text reports, colorized with ANSI escape sequences (91 = bright red, 92 = bright green)
            head_text = '\n'.join(
                [
                    f'\033[91m--- @ {self.make_timestamp(self.state.old_timestamp, tz)}\033[0m',
                    f'\033[92m+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}\033[0m',
                    '',
                ]
            )
            diff = proc.stdout
            if self.state.is_markdown():
                # undo the protection of the link anchor from being split
                diff = self._re_markdown_links.sub(
                    lambda x: f'[{urllib.parse.unquote(x.group(1))}]({x.group(2)})', diff
                )
            if command.startswith('wdiff') and self.job.contextlines == 0:
                # remove lines that don't have any changes
                diff = ''.join(
                    line
                    for line in diff.splitlines(keepends=True)
                    if any(marker in line for marker in self._wdiff_markers)
                )
            if directives.get('is_html'):
                # The command produced HTML: use it as is for the HTML report and convert it for the text ones
                diff_text = self.html2text(diff)
                out_diff.update(
                    {
                        'text': head_text + diff_text,
                        'markdown': head_text + diff_text,
                        'html': head_html + diff,
                    }
                )
            else:
                diff_text = diff
                out_diff.update(
                    {
                        'text': head_text + diff_text,
                        'markdown': head_text + diff_text,
                    }
                )

        if report_kind == 'html' and 'html' not in out_diff:
            if command.startswith('wdiff'):
                # colorize output of wdiff
                out_diff['html'] = head_html + self.wdiff_to_html(diff_text)
            else:
                out_diff['html'] = head_html + html.escape(diff_text)

        if self.job.monospace and 'html' in out_diff:
            # close the outer <span> opened in head_html
            out_diff['html'] += '</span>'

        return out_diff

    def wdiff_to_html(self, diff: str) -> str:
        """
        Colorize output of wdiff.

        :param diff: The output of the wdiff command.
        :returns: The colorized HTML output.
        """
        html_diff = html.escape(diff)
        if self.state.is_markdown():
            # detect and fix multiline additions or deletions: a marker left open at the end of a line is closed
            # there and re-opened at the start of the next line, so that every line is self-contained
            is_add = False
            is_del = False
            new_diff = []
            for line in html_diff.splitlines():
                if is_add:
                    line = '{+' + line
                    is_add = False
                elif is_del:
                    line = '[-' + line
                    is_del = False
                for match in re.findall(r'\[-|-]|{\+|\+}', line):
                    if match == '[-':
                        is_del = True
                    if match == '-]':
                        is_del = False
                    if match == '{+':
                        is_add = True
                    if match == '+}':
                        is_add = False
                if is_add:
                    line += '+}'
                elif is_del:
                    line += '-]'
                new_diff.append(line)
            html_diff = '<br>\n'.join(new_diff)

        # wdiff colorization (cannot be done with global CSS class as Gmail overrides it)
        html_diff = re.sub(
            r'\{\+(.*?)\+}',
            lambda x: f'<span style="{self.css_added_style}">{x.group(1)}</span>',
            html_diff,
            flags=re.DOTALL,
        )
        html_diff = re.sub(
            r'\[-(.*?)-]',
            lambda x: f'<span style="{self.css_deltd_style}">{x.group(1)}</span>',
            html_diff,
            flags=re.DOTALL,
        )
        if self.job.monospace:
            return f'<span style="font-family:monospace;white-space:pre-wrap">{html_diff}</span>'
        else:
            return html_diff
8✔
767

768

769
class DeepdiffDiffer(DifferBase):
    """Compares structured data (JSON, YAML or XML) using the deepdiff library."""

    __kind__ = 'deepdiff'

    __supported_directives__: dict[str, str] = {
        'data_type': "either 'json' (default), 'yaml', or 'xml'",
        'ignore_order': 'Whether to ignore the order in which the items have appeared (default: false)',
        'ignore_string_case': 'Whether to be case-sensitive or not when comparing strings (default: false)',
        'significant_digits': (
            'The number of digits AFTER the decimal point to be used in the comparison (default: no limit)'
        ),
        'compact': 'Whether to output a compact representation that also ignores changes of types (default: false)',
    }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Deserialize the old and new data, compare them with DeepDiff and format the differences found.

        :param directives: The differ directives (see __supported_directives__).
        :param report_kind: The kind of report the diff is being generated for.
        :param _unfiltered_diff: Not used by this differ.
        :param tz: The timezone passed on to DeepDiff and to make_timestamp for the header lines.

        :returns: A dict of diffs keyed by report kind: only 'html' for an HTML report, 'text' and 'markdown'
           otherwise. If either version of the data cannot be parsed, a dict with an error message for each report
           kind is returned instead. If no differences are found, all values are empty strings.
        """
        if isinstance(DeepDiff, str):  # pragma: no cover
            self.raise_import_error('deepdiff', DeepDiff)
            raise RuntimeError()  # for type checker

        span_added = f'<span style="{self.css_added_style}">'
        span_deltd = f'<span style="{self.css_deltd_style}">'
        span_remvd = f'<span style="{self.css_remvd_style}">'

        def _pretty_deepdiff(
            ddiff: DeepDiff,
            report_kind: Literal['text', 'markdown', 'html'],
            compact: bool,
        ) -> str:
            """
            Customized version of deepdiff.serialization.SerializationMixin.pretty method, edited to include the
            values deleted or added and an option for colorized HTML output. The pretty human-readable string
            output for the diff object regardless of what view was used to generate the diff.

            :param ddiff: The diff object.
            :param report_kind: The report kind.
            :param compact: Whether to return diff text in compact mode.
            :returns: One line (starting with a bullet) per difference, joined by newlines; empty if no differences.
            """
            # Edited strings originally in deepdiff.serialization._get_pretty_form_text
            # See https://github.com/seperman/deepdiff/blob/master/deepdiff/serialization.py
            # The doubled braces in the f-strings below survive as placeholders for the later str.format() call.
            if compact:
                root = '⊤'  # noqa: RUF001 DOWN TACK
                if report_kind == 'html':
                    pretty_form_texts = {
                        'type_changes': (
                            f'{{diff_path}}: {span_deltd}{{val_t1}}</span> ⮕ {span_added}{{val_t2}}</span>'
                        ),
                        'values_changed': (
                            f'{{diff_path}}: {span_deltd}{{val_t1}}</span> ⮕ {span_added}{{val_t2}}</span>'
                        ),
                        'dictionary_item_added': f'{{diff_path}}: {span_added}{{val_t2}}</span>',
                        'dictionary_item_removed': f'{span_deltd}{{diff_path}}: {{val_t1}}</span>',
                        'iterable_item_added': f'{{diff_path}}: {span_added}{{val_t2}}</span>',
                        'iterable_item_removed': f'{span_deltd}{{diff_path}}: {{val_t1}}</span>',
                        'attribute_added': f'{{diff_path}}: {span_added}{{val_t2}}</span>',
                        'attribute_removed': f'{span_remvd}{{diff_path}}</span>: {span_deltd}{{val_t1}}</span>',
                        'set_item_added': f'⊤[{{val_t2}}]: {span_added}{{val_t1}}</span>',  # noqa: RUF001 DOWN TACK
                        'set_item_removed': (
                            f'{span_remvd}⊤[{{val_t1}}]</span>: {span_deltd}{{val_t2}}</span>'  # noqa: RUF001
                        ),
                        'repetition_change': (
                            f'{{diff_path}}: repetition change {span_deltd}{{val_t1}}</span> ⮕ '
                            f'{span_added}{{val_t2}}</span>'
                        ),
                    }
                else:
                    pretty_form_texts = {
                        'type_changes': '{diff_path}: {val_t1} → {val_t2}',
                        'values_changed': '{diff_path}: {val_t1} → {val_t2}',
                        'dictionary_item_added': '{diff_path}: new {val_t2}',
                        'dictionary_item_removed': '{diff_path}: removed {val_t1}',
                        'iterable_item_added': '{diff_path}: new {val_t2}',
                        'iterable_item_removed': '{diff_path}: removed {val_t1}',
                        'attribute_added': '{diff_path}: new {val_t2}',
                        'attribute_removed': '{diff_path}: removed {val_t1}',
                        'set_item_added': '⊤[{val_t2}]: new {val_t1}',  # noqa: RUF001 DOWN TACK
                        'set_item_removed': '⊤[{val_t1}]: removed {val_t2}',  # noqa: RUF001 DOWN TACK
                        'repetition_change': '{diff_path}: repetition change {val_t1} → {val_t2}',
                    }
            else:  # not compact
                root = 'root'
                if report_kind == 'html':
                    pretty_form_texts = {
                        'type_changes': (
                            'Type of {diff_path} changed from {type_t1} to {type_t2} and value changed '
                            f'from {span_deltd}{{val_t1}}</span> to {span_added}{{val_t2}}</span>.'
                        ),
                        'values_changed': (
                            f'Value of {{diff_path}} changed from {span_deltd}{{val_t1}}</span> to {span_added}'
                            '{val_t2}</span>.'
                        ),
                        'dictionary_item_added': (
                            f'Item {{diff_path}} added to dictionary as {span_added}{{val_t2}}</span>.'
                        ),
                        'dictionary_item_removed': (
                            f'Item {{diff_path}} removed from dictionary (was {span_deltd}{{val_t1}}</span>).'
                        ),
                        'iterable_item_added': (
                            f'Item {{diff_path}} added to iterable as {span_added}{{val_t2}}</span>.'
                        ),
                        'iterable_item_removed': (
                            f'Item {{diff_path}} removed from iterable (was {span_deltd}{{val_t1}}</span>).'
                        ),
                        'attribute_added': f'Attribute {{diff_path}} added as {span_added}{{val_t2}}</span>.',
                        'attribute_removed': f'Attribute {{diff_path}} removed (was {span_deltd}{{val_t1}}</span>).',
                        'set_item_added': f'Item root[{{val_t2}}] added to set as {span_added}{{val_t1}}</span>.',
                        'set_item_removed': (
                            f'Item root[{{val_t1}}] removed from set (was {span_deltd}{{val_t2}}</span>).'
                        ),
                        'repetition_change': (
                            f'Repetition change for item {{diff_path}} ({span_deltd}{{val_t2}}</span>).'
                        ),
                    }
                else:
                    pretty_form_texts = {
                        'type_changes': (
                            'Type of {diff_path} changed from {type_t1} to {type_t2} and value changed '
                            'from {val_t1} to {val_t2}.'
                        ),
                        'values_changed': 'Value of {diff_path} changed from {val_t1} to {val_t2}.',
                        'dictionary_item_added': 'Item {diff_path} added to dictionary as {val_t2}.',
                        'dictionary_item_removed': 'Item {diff_path} removed from dictionary (was {val_t1}).',
                        'iterable_item_added': 'Item {diff_path} added to iterable as {val_t2}.',
                        'iterable_item_removed': 'Item {diff_path} removed from iterable (was {val_t1}).',
                        'attribute_added': 'Attribute {diff_path} added as {val_t2}.',
                        'attribute_removed': 'Attribute {diff_path} removed (was {val_t1}).',
                        'set_item_added': 'Item root[{val_t2}] added to set as {val_t1}.',
                        'set_item_removed': 'Item root[{val_t1}] removed from set (was {val_t2}).',
                        'repetition_change': 'Repetition change for item {diff_path} ({val_t2}).',
                    }

            def _pretty_print_diff(ddiff: DiffLevel) -> str:
                """
                Customized version of deepdiff.serialization.pretty_print_diff() function, edited to include the
                values deleted or added.
                """

                def stringify_value(value: Any, type_name: str) -> str:
                    """Return the string representation of value, chosen by the name of its type."""
                    if type_name in {'str', 'int', 'float'}:
                        if compact:
                            return f"'{value}'"
                        else:
                            return f'"{value}"'
                    elif type_name in {'dict', 'list'}:
                        if compact:
                            value_string = yaml.safe_dump(
                                value,
                                default_flow_style=False,
                                width=999,
                                allow_unicode=True,
                                sort_keys=False,
                            )
                            value_list = value_string.splitlines(keepends=True)
                            if len(value_list) < 2:
                                return value_string
                            # multi-line YAML: start on a new line and indent every line by four spaces
                            value_string = '\n    ' + '    '.join(value_list)
                            return value_string.rstrip()
                        else:
                            return jsonlib.dumps(value, ensure_ascii=False, indent=2)  # type: ignore[no-any-return]
                    else:
                        return str(value)

                type_t1 = type(ddiff.t1).__name__
                val_t1 = stringify_value(ddiff.t1, type_t1)
                type_t2 = type(ddiff.t2).__name__
                val_t2 = stringify_value(ddiff.t2, type_t2)

                diff_path = ddiff.path(root=root)
                # NOTE(review): for HTML reports the values are inserted into the HTML templates without escaping
                # here -- confirm that this is intended for data containing '<' or '&'.
                return '• ' + pretty_form_texts.get(
                    ddiff.report_type or '',
                    '',
                ).format(
                    diff_path=diff_path,
                    type_t1=type_t1,
                    type_t2=type_t2,
                    val_t1=val_t1,
                    val_t2=val_t2,
                )

            def _pretty_print_diff_markdown_to_html(ddiff: DiffLevel) -> str:
                """
                Customized version of deepdiff.serialization.pretty_print_diff() function, edited to include the
                values deleted or added and to convert markdown into html.
                """

                def stringify_value(value: Any, type_name: str) -> str:
                    """Return the representation of value, chosen by the name of its type, passed to mark_to_html."""
                    if type_name in {'str', 'int', 'float'}:
                        return f"'{mark_to_html(str(value))}'"
                    elif type_name in {'dict', 'list'}:
                        if compact:
                            value_string = yaml.safe_dump(
                                value,
                                default_flow_style=False,
                                width=999,
                                allow_unicode=True,
                                sort_keys=False,
                            )
                            value_list = value_string.splitlines(keepends=True)
                            if len(value_list) < 2:
                                return value_string
                            value_string = mark_to_html('\n    ' + '    '.join(value_list))
                            return value_string.rstrip()
                        else:
                            return mark_to_html(jsonlib.dumps(value, ensure_ascii=False, indent=2))
                    else:
                        return mark_to_html(str(value))

                type_t1 = type(ddiff.t1).__name__
                val_t1 = stringify_value(ddiff.t1, type_t1)
                type_t2 = type(ddiff.t2).__name__
                val_t2 = stringify_value(ddiff.t2, type_t2)

                diff_path = ddiff.path(root=root)
                return '• ' + pretty_form_texts.get(
                    ddiff.report_type or '',
                    '',
                ).format(
                    diff_path=diff_path,
                    type_t1=type_t1,
                    type_t2=type_t2,
                    val_t1=val_t1,
                    val_t2=val_t2,
                )

            # Markdown data in an HTML report needs its values converted to HTML; everything else is printed as is.
            if report_kind == 'html' and self.state.is_markdown():
                print_item = _pretty_print_diff_markdown_to_html
            else:
                print_item = _pretty_print_diff

            return '\n'.join(print_item(item_key) for tree_item in ddiff.tree.values() for item_key in tree_item)

        def deserialize_data(
            data: str | bytes, mime_type: str | None, data_type: str | None, data_label: Literal['Old', 'New']
        ) -> tuple[Any, dict | None]:
            """Deserializes the stored data.

            :param data: The stored data.
            :param mime_type: The MIME type of the data.
            :param data_type: The value of the data_type sub-parameter (overrides MIME type)
            :param data_label: Either old or new, used for error reporting

            :returns: The deserialized data, any errors
            """
            if not data:
                return data, None
            if data_type is None:
                if mime_type:
                    # e.g. 'application/ld+json' -> 'json', 'application/x-yaml' -> 'yaml'
                    media_subtype = mime_type.split('/')[-1].split('+')[-1].split('x-')[-1]
                    if media_subtype in ('yaml', 'yml'):
                        data_type = 'yaml'
                    elif media_subtype == 'xml':
                        data_type = 'xml'
                    elif media_subtype == 'json':
                        data_type = 'json'
                    else:
                        logger.info(
                            f'Differ {self.__kind__} could not determine data type of {data_label} data from media '
                            f"type {mime_type}; defaulting to 'json'"
                        )
                        data_type = 'json'
                else:
                    logger.info(
                        f"Differ {self.__kind__} data_type for {data_label} data defaulted to 'json' as media type is "
                        'missing'
                    )
                    data_type = 'json'
            # An unrecognized data_type falls through all branches below and yields this empty string
            parsed_data: Any = ''
            if data_type == 'json':
                try:
                    parsed_data = jsonlib.loads(data)
                except jsonlib.JSONDecodeError as e:
                    self.state.exception = e
                    self.state.traceback = self.job.format_error(e, traceback.format_exc())
                    logger.error(
                        f'Job {self.job.index_number}: {data_label} data is invalid JSON: {e} '
                        f'({self.job.get_location()})'
                    )
                    logger.info(f'Job {self.job.index_number}: {data!r}')
                    return None, {
                        'text': f'Differ {self.__kind__} ERROR: {data_label} data is invalid JSON\n{e}',
                        'markdown': f'Differ {self.__kind__} **ERROR: {data_label} data is invalid JSON**\n{e}',
                        'html': f'Differ {self.__kind__} <b>ERROR: {data_label} data is invalid JSON</b>\n{e}',
                    }
            elif data_type == 'yaml':
                try:
                    parsed_data = yaml.safe_load(data)
                except yaml.YAMLError as e:
                    self.state.exception = e
                    self.state.traceback = self.job.format_error(e, traceback.format_exc())
                    logger.error(
                        f'Job {self.job.index_number}: {data_label} data is invalid YAML: {e} '
                        f'({self.job.get_location()})'
                    )
                    logger.info(f'Job {self.job.index_number}: {data!r}')
                    return None, {
                        'text': f'Differ {self.__kind__} ERROR: {data_label} data is invalid YAML\n{e}',
                        'markdown': f'Differ {self.__kind__} **ERROR: {data_label} data is invalid YAML**\n{e}',
                        'html': f'Differ {self.__kind__} <b>ERROR: {data_label} data is invalid YAML</b>\n{e}',
                    }
            elif data_type == 'xml':
                if isinstance(xmltodict, str):  # pragma: no cover
                    self.raise_import_error('xmltodict', xmltodict)
                    raise RuntimeError()  # for type checker
                try:
                    parsed_data = xmltodict.parse(data)
                except ExpatError as e:
                    self.state.exception = e
                    self.state.traceback = self.job.format_error(e, traceback.format_exc())
                    logger.error(
                        f'Job {self.job.index_number}: {data_label} data is invalid XML: {e} '
                        f'({self.job.get_location()})'
                    )
                    logger.info(f'Job {self.job.index_number}: {data!r}')
                    return None, {
                        'text': f'Differ {self.__kind__} ERROR: {data_label} data is invalid XML\n{e}',
                        'markdown': f'Differ {self.__kind__} **ERROR: {data_label} data is invalid XML**\n{e}',
                        'html': f'Differ {self.__kind__} <b>ERROR: {data_label} data is invalid XML</b>\n{e}',
                    }
            return parsed_data, None

        old_data, err = deserialize_data(
            self.state.old_data,
            self.state.old_mime_type,
            directives.get('data_type'),
            'Old',
        )
        if err:
            return err
        new_data, err = deserialize_data(
            self.state.new_data,
            self.state.new_mime_type,
            directives.get('data_type'),
            'New',
        )
        if err:
            return err
        ignore_order = bool(directives.get('ignore_order'))
        ignore_string_case = bool(directives.get('ignore_string_case'))
        significant_digits = directives.get('significant_digits')
        compact = bool(directives.get('compact'))
        ddiff = DeepDiff(
            old_data,
            new_data,
            cache_size=500,
            cache_purge_level=0,
            cache_tuning_sample_size=500,
            default_timezone=tz,
            ignore_order=ignore_order,
            ignore_string_type_changes=True,
            ignore_numeric_type_changes=True,
            ignore_string_case=ignore_string_case,
            significant_digits=significant_digits,
            # derived from the logger's effective level and clamped to the range 0-2: the lower the logging level
            # (i.e. the more verbose the logging), the higher the verbose_level
            verbose_level=min(2, max(0, math.ceil(3 - logger.getEffectiveLevel() / 10))),
        )
        diff_text = _pretty_deepdiff(ddiff, report_kind, compact)
        if not diff_text:
            self.state.verb = 'changed,no_report'
            return {'text': '', 'markdown': '', 'html': ''}

        self.job.set_to_monospace()
        if report_kind == 'html':
            html_diff = (
                f'<span style="font-family:monospace;white-space:pre-wrap;">'
                f'<span style="color:darkred;">--- @ {self.make_timestamp(self.state.old_timestamp, tz)}</span>\n'
                f'<span style="color:darkgreen;">+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}</span>\n'
                # allow long paths such as root['a']['b'] to wrap between the bracketed keys
                + diff_text.replace('][', ']<wbr>[')
                + '</span>'
            )
            return {'html': html_diff}
        else:
            text_diff = (
                f'--- @ {self.make_timestamp(self.state.old_timestamp, tz)}\n'
                f'+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}\n'
                f'{diff_text}'
            )
            return {'text': text_diff, 'markdown': text_diff}
8✔
1156

1157

1158
class ImageDiffer(DifferBase):
    """Compares two images providing an image outlining areas that have changed.

    The old and new data held in ``self.state`` are strings which, depending on the ``data_type`` directive, are
    interpreted as URLs, Ascii85 data, Base64 data or paths to image files.
    """

    __kind__ = 'image'

    __supported_directives__: dict[str, str] = {
        'data_type': (
            "'url' (to retrieve an image), 'ascii85' (Ascii85 data), 'base64' (Base64 data) or 'filename' (the path "
            "to an image file) (default: 'url')"
        ),
        'mse_threshold': (
            'the minimum mean squared error (MSE) between two images to consider them changed, if numpy in installed '
            '(default: 2.5)'
        ),
        'ai_google': 'Generative AI summary of changes (BETA)',
    }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Build a report of the visual differences between the old and the new image.

        Sets ``self.state.verb`` to ``'unchanged'`` when the two (size-matched) images compare equal and to
        ``'changed,no_report'`` when the mean squared error is below the ``mse_threshold`` directive; in both cases
        all returned strings are empty.

        :param directives: The differ's directives; the keys read here are ``data_type``, ``mse_threshold`` and
            ``ai_google``.
        :param report_kind: Not used by this differ: all three report kinds are always returned.
        :param _unfiltered_diff: Not used by this differ.
        :param tz: Passed on to ``self.make_timestamp`` for the timestamps in the HTML header.
        :returns: A dict with the keys ``'text'``, ``'markdown'`` and ``'html'``.
        :raises ValueError: If the old or the new data in ``self.state`` is not a string.
        """
        warnings.warn(
            f'Job {self.job.index_number}: Using differ {self.__kind__}, which is BETA, may have bugs, and may '
            f'change in the future. Please report any problems or suggestions at '
            f'https://github.com/mborsetti/webchanges/discussions.',
            RuntimeWarning,
            stacklevel=1,
        )
        if isinstance(Image, str):  # pragma: no cover
            self.raise_import_error('pillow', Image)
            raise RuntimeError()  # for type checker
        # The module-level import guard sets httpx to None (not to an error string) when the package is missing, so
        # None must be checked for explicitly; the str check is kept in case the guard's convention changes.
        if httpx is None or isinstance(httpx, str):  # pragma: no cover
            # NOTE(review): the second argument is assumed to be the import error message, as for pillow above.
            self.raise_import_error('httpx', httpx if isinstance(httpx, str) else "No module named 'httpx'")
            raise RuntimeError()  # for type checker

        def load_image_from_web(url: str) -> Image.Image:
            """Fetches the image from an url."""
            logger.debug(f'Retrieving image from {url}')
            with httpx.stream('GET', url, timeout=10) as response:
                response.raise_for_status()
                return Image.open(BytesIO(b''.join(response.iter_bytes())))

        def load_image_from_file(filename: str) -> Image.Image:
            """Load an image from a file."""
            logger.debug(f'Reading image from {filename}')
            return Image.open(filename)

        def load_image_from_base64(base_64: str) -> Image.Image:
            """Load an image from a Base64-encoded string."""
            logger.debug('Retrieving image from a base64 string')
            return Image.open(BytesIO(base64.b64decode(base_64)))

        def load_image_from_ascii85(ascii85: str) -> Image.Image:
            """Load an image from an Ascii85-encoded string."""
            logger.debug('Retrieving image from an ascii85 string')
            return Image.open(BytesIO(base64.a85decode(ascii85)))

        def compute_diff_image(img1: Image.Image, img2: Image.Image) -> tuple[Image.Image, np.float64 | None]:
            """Compute the difference between two images.

            :returns: The visualization image (the differences overlaid on a darkened greyscale copy of ``img1``)
                and the mean squared error of the pixel-by-pixel difference, or None if numpy is not available.
            """
            # Compute the absolute value of the pixel-by-pixel difference between the two images.
            diff_image = ImageChops.difference(img1, img2)

            # Compute the mean squared error between the images
            if not isinstance(np, str):
                # Convert to float before squaring: squaring the raw 8-bit array wraps around modulo 256, which
                # corrupts the MSE whenever a channel differs by 16 or more.
                diff_array = np.array(diff_image, dtype=np.float64)
                mse_value = np.mean(np.square(diff_array))
            else:  # pragma: no cover
                mse_value = None

            # Create the diff image by overlaying this difference on a darkened greyscale background
            back_image = img1.convert('L')
            back_image_brightness = ImageStat.Stat(back_image).rms[0]
            back_image = ImageEnhance.Brightness(back_image).enhance(back_image_brightness / 225)

            # Convert the 'L' image to 'RGB' using a matrix that applies to yellow tint
            # The matrix has 12 elements: 4 for Red, 4 for Green, and 4 for Blue.
            # For yellow, we want Red and Green to copy the L values (1.0) and Blue to be zero.
            # The matrix is: [R, G, B, A] for each of the three output channels
            yellow_tint_matrix = (
                1.0,
                0.0,
                0.0,
                0.0,  # Red = 100% of the grayscale value
                1.0,
                0.0,
                0.0,
                0.0,  # Green = 100% of the grayscale value
                0.0,
                0.0,
                0.0,
                0.0,  # Blue = 0% of the grayscale value
            )

            # Apply the conversion
            diff_colored = diff_image.convert('RGB').convert('RGB', matrix=yellow_tint_matrix)

            final_img = ImageChops.add(back_image.convert('RGB'), diff_colored)
            # Images created by ImageChops carry no format; reuse the new image's so that it can be saved later.
            final_img.format = img2.format

            return final_img, mse_value

        def ai_google(
            old_image: Image.Image,
            new_image: Image.Image,
            diff_image: Image.Image,
            directives: AiGoogleDirectives,
        ) -> tuple[str, str]:
            """Summarize changes in image using Generative AI (ALPHA).  Returns summary and model name.

            ``diff_image`` is currently not uploaded (see the commented-out entry in the upload list below). On
            failure, the first element returned is an error message and the second an empty string.
            """
            logger.info(f'Job {self.job.index_number}: Running ai_google for {self.__kind__} differ')
            warnings.warn(
                f'Job {self.job.index_number}: Using differ {self.__kind__} with ai_google, which is ALPHA, '
                f'may have bugs, and may change in the future. Please report any problems or suggestions at '
                f'https://github.com/mborsetti/webchanges/discussions.',
                RuntimeWarning,
                stacklevel=1,
            )

            api_version = '1beta'
            # GOOGLE_AI_API_KEY deprecated end of 2025
            gemini_api_key = os.environ.get('GEMINI_API_KEY', '').rstrip()
            if not gemini_api_key:
                gemini_api_key = os.environ.get('GOOGLE_AI_API_KEY', '').rstrip()
                if gemini_api_key:
                    warnings.warn(
                        'The environment variable GOOGLE_AI_API_KEY is deprecated; please use GEMINI_API_KEY instead.',
                        DeprecationWarning,
                        stacklevel=1,
                    )
            if len(gemini_api_key) != 39:
                logger.error(
                    f'Job {self.job.index_number}: Environment variable GEMINI_API_KEY not found or is of the '
                    f'incorrect length {len(gemini_api_key)} ({self.job.get_location()})'
                )
                return (
                    f'## ERROR in summarizing changes using Google AI:\n'
                    f'Environment variable GEMINI_API_KEY not found or is of the incorrect length '
                    f'{len(gemini_api_key)}.\n',
                    '',
                )

            def _load_image(img_data: tuple[str, Image.Image]) -> dict[str, dict[str, str] | Exception | str]:
                """Upload one image to Google AI using the two-step resumable upload.

                Uses ``client`` from the enclosing scope, which is bound by the ``with`` statement below before
                this function is called.

                :param img_data: A tuple of the image's descriptive name (for logs) and the image itself.
                :returns: ``{'file_data': {'mime_type': ..., 'file_uri': ...}}`` on success, or
                    ``{'error': <exception>, 'img_name': <name>}`` if an HTTP request fails.
                """
                img_name, image = img_data
                # Convert image to bytes
                img_byte_arr = BytesIO()
                image.save(img_byte_arr, format=image.format)
                image_data = img_byte_arr.getvalue()
                mime_type = f'image/{image.format.lower()}'  # type: ignore[union-attr]

                logger.info(
                    f'Job {self.job.index_number}: Loading {img_name} ({image.format}) to Google AI '
                    f'({len(image_data) / 1024:,.0f} kbytes)'
                )

                # Initial resumable upload request
                headers = {
                    'X-Goog-Upload-Protocol': 'resumable',
                    'X-Goog-Upload-Command': 'start',
                    'X-Goog-Upload-Header-Content-Length': str(len(image_data)),
                    'X-Goog-Upload-Header-Content-Type': mime_type,
                    'Content-Type': 'application/json',
                }
                data = {'file': {'display_name': 'TEXT'}}

                try:
                    response = client.post(
                        f'https://generativelanguage.googleapis.com/upload/v{api_version}/files?key={gemini_api_key}',
                        headers=headers,
                        json=data,
                    )
                    # Route 4xx/5xx responses to the error path instead of failing on a missing header below.
                    response.raise_for_status()
                except httpx.HTTPError as e:
                    return {'error': e, 'img_name': img_name}
                upload_url = response.headers['X-Goog-Upload-Url']

                # Upload the image data
                headers = {
                    'Content-Length': str(len(image_data)),
                    'X-Goog-Upload-Offset': '0',
                    'X-Goog-Upload-Command': 'upload, finalize',
                }
                try:
                    response = client.post(upload_url, headers=headers, content=image_data)
                    response.raise_for_status()
                except httpx.HTTPError as e:
                    return {'error': e, 'img_name': img_name}

                # Extract file URI from response
                file_info = response.json()
                file_uri = file_info['file']['uri']
                logger.info(f'Job {self.job.index_number}: {img_name.capitalize()} loaded to {file_uri}')

                return {
                    'file_data': {
                        'mime_type': mime_type,
                        'file_uri': file_uri,
                    }
                }

            # upload to Google; executor.map yields results in input order, so index 0 is the old image and index 1
            # is the new image. The context managers close the HTTP client and shut down the worker threads.
            additional_parts: list[dict[str, dict[str, str]]] = []
            with httpx.Client(http2=True, timeout=self.job.timeout) as client, ThreadPoolExecutor() as executor:
                for additional_part in executor.map(
                    _load_image,
                    (
                        ('old image', old_image),
                        ('new image', new_image),
                        # ('differences image', diff_image),
                    ),
                ):
                    if 'error' not in additional_part:
                        additional_parts.append(additional_part)  # type: ignore[arg-type]
                    else:
                        logger.error(
                            f'Job {self.job.index_number}: ai_google for {self.__kind__} HTTP Client error '
                            f'{type(additional_part["error"])} when loading {additional_part["img_name"]} to '
                            f'Google AI: {additional_part["error"]}'
                        )
                        return (
                            f'HTTP Client error {type(additional_part["error"])} when loading '
                            f'{additional_part["img_name"]} to Google AI: {additional_part["error"]}',
                            '',
                        )

            system_instructions = (
                'You are a meticulous visual comparison agent. Your task is to analyze two images: an "old '
                'version" and a "new version". Your entire focus is on identifying and listing the concrete, '
                'factual differences between them.'
            )
            model_prompt = (
                '**Instructions:**\n'
                '\n'
                # additional_parts[1] is the new image and additional_parts[0] the old one (upload order above).
                f'1.  **Identify Changes:** Directly compare the "new version" '
                f'{additional_parts[1]["file_data"]["file_uri"]} to the "old version" '
                f'{additional_parts[0]["file_data"]["file_uri"]} and identify all additions, removals, and alterations '
                'of visual elements.\n'
                '\n'
                '2.  **Filter for Significance:** From your initial list of changes, you must filter out any that '
                'are minor or cosmetic. A difference is only significant if it alters the core subject matter or '
                'the main message of the image.\n'
                '    *   **IGNORE:** Minor shifts in layout, small changes in color saturation or brightness, or '
                'other cosmetic adjustments that do not change what the image is depicting.\n'
                '    *   **FOCUS ON:** Tangible changes such as added objects, removed people, or altered text.\n'
                '\n'
                '3.  **Summarize the Differences:**\n'
                '    *   Present the significant differences as a bulleted list under the heading "Summary of '
                'Changes".\n'
                '    *   For each point, state the difference factually and concisely (e.g., "An apple was added '
                "to the table,\" \"The text on the sign was changed from 'Open' to 'Closed'\").\n"
                '    *   Only if a change directly and clearly alters the primary message or interpretation of the '
                'image, you may add a brief, one-sentence explanation of this shift. Do not speculate on deeper '
                'meanings.\n'
                '\n'
                '4.  **No Differences Found:** If you analyze both images and find no significant differences '
                'according to the criteria above, you must respond with only the phrase: "No significant '
                'differences were found between the two images." Do not attempt to find minor differences to report.\n'
                '\n'
                '5.  **Grounding:** Your entire analysis must be based solely on the visual information present in '
                'the two images. Do not make assumptions or introduce any external information.'
            )
            directives['thinking_budget'] = directives.get('thinking_budget', 24576)
            summary, model_version = AIGoogleDiffer._send_to_model(
                self.job,
                system_instructions,
                model_prompt,
                additional_parts=additional_parts,  # type: ignore[arg-type]
                directives=directives,
            )

            return summary, model_version

        data_type = directives.get('data_type', 'url')
        mse_threshold = directives.get('mse_threshold', 2.5)
        if not isinstance(self.state.old_data, str):
            raise ValueError('old_data is not a string')
        if not isinstance(self.state.new_data, str):
            raise ValueError('new_data is not a string')
        # Load both images; old_data / new_data are the HTML link snippets appended to the report's header lines.
        if data_type == 'url':
            old_image = load_image_from_web(self.state.old_data)
            new_image = load_image_from_web(self.state.new_data)
            old_data = f' (<a href="{self.state.old_data}" target="_blank">Old image</a>)'
            new_data = f' (<a href="{self.state.new_data}" target="_blank">New image</a>)'
        elif data_type == 'ascii85':
            old_image = load_image_from_ascii85(self.state.old_data)
            new_image = load_image_from_ascii85(self.state.new_data)
            old_data = ''
            new_data = ''
        elif data_type == 'base64':
            old_image = load_image_from_base64(self.state.old_data)
            new_image = load_image_from_base64(self.state.new_data)
            old_data = ''
            new_data = ''
        else:  # 'filename'
            old_image = load_image_from_file(self.state.old_data)
            new_image = load_image_from_file(self.state.new_data)
            old_data = f' (<a href="file://{self.state.old_data}" target="_blank">Old image</a>)'
            new_data = f' (<a href="file://{self.state.new_data}" target="_blank">New image</a>)'

        # Check formats  TODO: is it needed? under which circumstances?
        # if new_image.format != old_image.format:
        #     logger.info(f'Image formats do not match: {old_image.format} vs {new_image.format}')
        # else:
        #     logger.debug(f'image format is {old_image.format}')

        # Convert the images to a base64 object for HTML (before shrinking etc.)
        output_stream = BytesIO()
        old_image.save(output_stream, format=old_image.format)
        encoded_old = b64encode(output_stream.getvalue()).decode()
        if data_type == 'url':
            # The report links directly to the new image's URL, so no embedded copy is needed.
            encoded_new = ''
        else:
            output_stream = BytesIO()
            new_image.save(output_stream, format=new_image.format)
            encoded_new = b64encode(output_stream.getvalue()).decode()

        # If needed, shrink the larger image
        if new_image.size != old_image.size:
            # resize() returns a new image without a format, so the format is carried over by hand.
            if new_image.size > old_image.size:
                logger.debug(f'Job {self.job.index_number}: Shrinking the new image')
                img_format = new_image.format
                new_image = new_image.resize(old_image.size, Image.Resampling.LANCZOS)
                new_image.format = img_format

            else:
                logger.debug(f'Job {self.job.index_number}: Shrinking the old image')
                img_format = old_image.format
                old_image = old_image.resize(new_image.size, Image.Resampling.LANCZOS)
                old_image.format = img_format

        if old_image == new_image:
            logger.info(f'Job {self.job.index_number}: New image is identical to the old one')
            self.state.verb = 'unchanged'
            return {'text': '', 'markdown': '', 'html': ''}

        diff_image, mse_value = compute_diff_image(old_image, new_image)
        # mse_value is None only when numpy is unavailable; an MSE of exactly 0.0 is a valid value and must be
        # compared against the threshold like any other.
        if mse_value is not None:
            logger.debug(f'Job {self.job.index_number}: MSE value {mse_value:.2f}')
            if mse_value < mse_threshold:
                logger.info(
                    f'Job {self.job.index_number}: MSE value {mse_value:.2f} below the threshold of {mse_threshold}; '
                    f'considering changes not worthy of a report'
                )
                self.state.verb = 'changed,no_report'
                return {'text': '', 'markdown': '', 'html': ''}

        # prepare AI summary
        summary = ''
        model_version = ''
        if 'ai_google' in directives:
            summary, model_version = ai_google(old_image, new_image, diff_image, directives.get('ai_google', {}))

        # Prepare HTML output
        htm = [
            f'<span style="font-family:monospace">'
            # f'Differ: {self.__kind__} for {data_type}',
            f'<span style="color:darkred;">--- @ {self.make_timestamp(self.state.old_timestamp, tz)}{old_data}</span>',
            f'<span style="color:darkgreen;">+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}{new_data}'
            '</span>',
            '</span>',
            'New image:',
        ]
        if data_type == 'url':
            htm.append(f'<img src="{self.state.new_data}" style="max-width: 100%; display: block;">')
        else:
            htm.append(
                f'<img src="data:image/{(new_image.format or "").lower()};base64,{encoded_new}" '
                'style="max-width: 100%; display: block;">'
            )
        # Convert the difference image to a base64 object
        output_stream = BytesIO()
        diff_image.save(output_stream, format=diff_image.format)
        encoded_diff = b64encode(output_stream.getvalue()).decode()
        htm.extend(
            [
                'Differences from old (in yellow):',
                f'<img src="data:image/{(diff_image.format or "").lower()};base64,{encoded_diff}" '
                'style="max-width: 100%; display: block;">',
                'Old image:',
                f'<img src="data:image/{(old_image.format or "").lower()};base64,{encoded_old}" '
                'style="max-width: 100%; display: block;">',
            ]
        )
        changed_text = 'The image has changed; please see an HTML report for the visualization.'
        if not summary:
            return {
                'text': changed_text,
                'markdown': changed_text,
                'html': '<br>\n'.join(htm),
            }

        newline = '\n'  # For Python < 3.12 f-string {} compatibility
        back_n = '\\n'  # For Python < 3.12 f-string {} compatibility
        directives_for_str = {key: value for key, value in directives.items() if key != 'model'}
        if 'prompt' in directives_for_str:
            directives_for_str['prompt'] = '«custom»'
        directives_text = (
            (
                ' (ai_google directive(s): '
                + ', '.join(f'{key}={str(value).replace(newline, back_n)}' for key, value in directives_for_str.items())
                + ')'
            )
            if directives_for_str
            else ''
        )
        footer = f"Summary by Google Generative AI's model {model_version}{directives_text}."
        return {
            'text': (
                f'{summary}\n\n\nA visualization of differences is available in {__package__} HTML reports.'
                f'\n------------\n{footer}'
            ),
            'markdown': (
                f'{summary}\n\n\nA visualization of differences is available in {__package__} HTML reports.'
                f'\n* * *\n{footer}'
            ),
            'html': '<br>\n'.join(
                [
                    mark_to_html(summary, extras={'tables'}).replace('<h2>', '<h3>').replace('</h2>', '</h3>'),
                    '',
                    *htm,
                    '-----',
                    f'<i><small>{footer}</small></i>',
                ]
            ),
        }
1609

1610

1611
class AIGoogleDiffer(DifferBase):
    """(Default) Generates a summary using Google Generative AI (Gemini models).

    Calls Google Gemini APIs; documentation at https://ai.google.dev/api/rest and tutorial at
    https://ai.google.dev/tutorials/rest_quickstart

    """

    __kind__ = 'ai_google'

    # NOTE(review): 'thinking_budget' is read by _send_to_model() but is not listed here -- confirm whether it
    # should be a supported directive.
    __supported_directives__: dict[str, str] = {
        'model': ('model name from https://ai.google.dev/gemini-api/docs/models/gemini (default: gemini-2.0-flash)'),
        'system_instructions': (
            'Optional tone and style instructions for the model (default: see documentation at '
            'https://webchanges.readthedocs.io/en/stable/differs.html#ai-google-diff)'
        ),
        'prompt': 'a custom prompt - {unified_diff}, {unified_diff_new}, {old_text} and {new_text} will be replaced',
        'additions_only': 'summarizes only added lines (including as a result of a change)',
        'prompt_ud_context_lines': 'the number of context lines for {unified_diff} (default: 9999)',
        'timeout': 'the number of seconds before timing out the API call (default: 300)',
        'max_output_tokens': "the maximum number of tokens returned by the model (default: None, i.e. model's default)",
        'temperature': "the model's Temperature parameter (default: 0.0)",
        'top_p': (
            "the model's TopP parameter (default: 1.0 if temperature is 0.0, otherwise None, i.e. model's default)"
        ),
        'top_k': "the model's TopK parameter (default: None, i.e. model's default)",
        'tools': "data passed on to the API's 'tools' field (default: None)",
        'unified': 'directives passed to the unified differ (default: None)',
    }
    __default_directive__ = 'model'

    @staticmethod
    def _send_to_model(
        job: JobBase,
        system_instructions: str,
        model_prompt: str,
        additional_parts: list[dict[str, str | dict[str, str]]] | None = None,
        directives: AiGoogleDirectives | None = None,
    ) -> tuple[str, str]:
        """Creates the summary request to the model; returns the summary and the version of the actual model used.

        :param job: The job being processed (used for log messages).
        :param system_instructions: Text sent as the request's ``system_instruction``.
        :param model_prompt: Text sent as the first part of the request's ``contents``.
        :param additional_parts: Optional extra parts appended after the prompt in ``contents``.
        :param directives: The differ's directives (model, timeout, generation parameters, tools, etc.).
        :returns: A tuple of (summary or Markdown-formatted error message, model version). The model version is an
           empty string if the API key is missing/invalid, and falls back to the requested model name if the API
           does not report one.
        """

        def api_error_message(response: Any) -> str:
            """Extracts the API's error message, tolerating bodies that are not the expected JSON object."""
            try:
                return response.json().get('error', {}).get('message') or ''
            except (ValueError, AttributeError):  # body is not JSON (e.g. a proxy's HTML page) or not a dict
                return response.text.strip()

        api_version = '1beta'
        if directives is None:
            directives = {}
        model = directives.get('model', 'gemini-2.0-flash')
        timeout = directives.get('timeout', 300)
        max_output_tokens = directives.get('max_output_tokens')
        temperature = directives.get('temperature', 0.0)
        top_p = directives.get('top_p', 1.0 if temperature == 0.0 else None)
        top_k = directives.get('top_k')
        # GOOGLE_AI_API_KEY deprecated end of 2025
        gemini_api_key = os.environ.get('GEMINI_API_KEY', '').rstrip()
        if not gemini_api_key:
            gemini_api_key = os.environ.get('GOOGLE_AI_API_KEY', '').rstrip()
            if gemini_api_key:
                warnings.warn(
                    'The environment variable GOOGLE_AI_API_KEY is deprecated; please use GEMINI_API_KEY instead.',
                    DeprecationWarning,
                    stacklevel=1,
                )
        if len(gemini_api_key) != 39:
            logger.error(
                f'Job {job.index_number}: Environment variable GEMINI_API_KEY not found or is of the '
                f'incorrect length {len(gemini_api_key)} ({job.get_location()})'
            )
            return (
                f'## ERROR in summarizing changes using Google AI:\n'
                f'Environment variable GEMINI_API_KEY not found or is of the incorrect length '
                f'{len(gemini_api_key)}.',
                '',
            )

        data: dict[str, Any] = {
            'system_instruction': {'parts': [{'text': system_instructions}]},
            'contents': [{'parts': [{'text': model_prompt}]}],
            'generationConfig': {
                'maxOutputTokens': max_output_tokens,
                'temperature': temperature,
                'topP': top_p,
                'topK': top_k,
            },
        }
        if additional_parts:
            data['contents'][0]['parts'].extend(additional_parts)
        if directives.get('tools'):
            data['tools'] = directives['tools']
        if directives.get('thinking_budget'):
            data['generationConfig'].update({'thinkingConfig': {'thinkingBudget': directives['thinking_budget']}})
        logger.info(f'Job {job.index_number}: Making the content generation request to Google AI model {model}')
        model_version = model  # default
        try:
            # The context manager closes the client's connection pool once the (fully read) response is back.
            with httpx.Client(http2=True) as client:
                r = client.post(
                    f'https://generativelanguage.googleapis.com/v{api_version}/models/{model}:generateContent?'
                    f'key={gemini_api_key}',
                    json=data,
                    headers={'Content-Type': 'application/json'},
                    timeout=timeout,
                )
            if r.is_success:
                result = r.json()
                # A successful response may carry no candidates; fall through to the error summary below
                # instead of raising.
                candidates = result.get('candidates') or []
                candidate = candidates[0] if candidates else {}
                finish_reason = candidate.get('finishReason')
                model_version = result.get('modelVersion', model)
                usage = result.get('usageMetadata', {})
                logger.info(f'Job {job.index_number}: AI generation finished by {finish_reason} using {model_version}')
                logger.debug(
                    f'Job {job.index_number}: Used {usage.get("totalTokenCount", 0):,} tokens, '
                    f'{usage.get("promptTokenCount", 0):,} of which for the prompt.'
                )
                if 'content' in candidate:
                    if 'parts' in candidate['content']:
                        summary: str = candidate['content']['parts'][0]['text'].rstrip()
                    else:
                        summary = (
                            f'## ERROR in summarizing changes using Google AI:\n'
                            f'Model did not return any candidate output:\n'
                            f'finishReason={finish_reason}\n'
                            f'{jsonlib.dumps(usage, ensure_ascii=True, indent=2)}'
                        )
                else:
                    summary = (
                        f'## ERROR in summarizing changes using Google AI:\n'
                        f'Model did not return any candidate output:\n'
                        f'{jsonlib.dumps(result, ensure_ascii=True, indent=2)}'
                    )

            elif r.status_code == 400:
                summary = (
                    f'## ERROR in summarizing changes using Google AI:\n'
                    f'Received error from {r.url.host}: '
                    f'{api_error_message(r)}'
                )
            else:
                summary = (
                    f'## ERROR in summarizing changes using Google AI:\n'
                    f'Received error {r.status_code} {r.reason_phrase} from '
                    f'{r.url.host}'
                )
                if r.content:
                    summary += f': {api_error_message(r)}'

        except httpx.HTTPError as e:
            summary = (
                f'## ERROR in summarizing changes using Google AI:\n'
                f'HTTP client error: {e} when requesting data from '
                f'{e.request.url.host}'
            )

        return summary, model_version

    def differ(
        self,
        directives: AiGoogleDirectives,  # type: ignore[override]
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Builds a report made of an AI-generated summary of the changes followed by a unified diff.

        :param directives: The differ's directives (see ``__supported_directives__``).
        :param report_kind: The kind of report requested (all three kinds are returned regardless).
        :param _unfiltered_diff: Not used by this differ.
        :param tz: Timezone passed on to the unified differ.
        :returns: A dict with the 'text', 'markdown' and 'html' versions of the report; all three are empty strings
           (and ``self.state.verb`` is set to 'changed,no_report') when no summary is produced.
        """
        logger.info(f'Job {self.job.index_number}: Running the {self.__kind__} differ from hooks.py')
        warnings.warn(
            f'Job {self.job.index_number}: Using differ {self.__kind__}, which is BETA, may have bugs, and may '
            f'change in the future. Please report any problems or suggestions at '
            f'https://github.com/mborsetti/webchanges/discussions.',
            RuntimeWarning,
            stacklevel=1,
        )

        def get_ai_summary(prompt: str, system_instructions: str) -> tuple[str, str]:
            """Generate AI summary from unified diff, or an error message, plus the model version."""
            # GOOGLE_AI_API_KEY deprecated end of 2025
            gemini_api_key = os.environ.get('GEMINI_API_KEY', '').rstrip()
            if not gemini_api_key:
                gemini_api_key = os.environ.get('GOOGLE_AI_API_KEY', '').rstrip()
                if gemini_api_key:
                    warnings.warn(
                        'The environment variable GOOGLE_AI_API_KEY is deprecated; please use GEMINI_API_KEY instead.',
                        DeprecationWarning,
                        stacklevel=1,
                    )
            if len(gemini_api_key) != 39:
                logger.error(
                    f'Job {self.job.index_number}: Environment variable GEMINI_API_KEY not found or is of the '
                    f'incorrect length {len(gemini_api_key)} ({self.job.get_location()})'
                )
                return (
                    f'## ERROR in summarizing changes using Google AI:\n'
                    f'Environment variable GEMINI_API_KEY not found or is of the incorrect length '
                    f'{len(gemini_api_key)}.\n',
                    '',
                )

            diff_lines: list[str] = []
            if '{unified_diff' in prompt:  # matches unified_diff or unified_diff_new
                default_context_lines = 9999 if '{unified_diff}' in prompt else 0  # none if only unified_diff_new
                context_lines = directives.get('prompt_ud_context_lines', default_context_lines)
                diff_lines = list(
                    difflib.unified_diff(
                        str(self.state.old_data).splitlines(),
                        str(self.state.new_data).splitlines(),
                        n=context_lines,
                    )
                )
                if not diff_lines:
                    # no changes
                    return '', ''
                unified_diff = '\n'.join(diff_lines)
            else:
                unified_diff = ''

            if '{unified_diff_new}' in prompt:
                # A non-empty unified diff always starts with the '---' and '+++' file header lines; skip them so
                # that the '+++' header is not mistaken for an added line.
                unified_diff_new = '\n'.join(line[1:] for line in diff_lines[2:] if line.startswith('+'))
            else:
                unified_diff_new = ''

            # check if data is different (same data is sent during testing)
            if '{old_text}' in prompt and '{new_text}' in prompt and self.state.old_data == self.state.new_data:
                return '', ''

            model_prompt = prompt.format(
                unified_diff=unified_diff,
                unified_diff_new=unified_diff_new,
                old_text=self.state.old_data,
                new_text=self.state.new_data,
            )

            summary, model_version = self._send_to_model(
                self.job,
                system_instructions,
                model_prompt,
                directives=directives,
            )

            return summary, model_version

        default_system_instructions = ''
        if directives.get('additions_only') or self.job.additions_only:
            default_prompt = '\n'.join(
                (
                    'You are an expert analyst AI, specializing in the meticulous summarization of change documents. '
                    'Your task is to summarize the provided unified diff in a clear and concise manner with 100% '
                    'fidelity. Restrict your analysis and summary *only* to the diff provided. Do not introduce any '
                    'external information or assumptions.',
                    '',
                    'Format your summary using Markdown. Use headings, bullet points, and other Markdown elements '
                    'where appropriate to create a well-structured and easily readable summary.',
                    '',
                    '{unified_diff_new}',
                )
            )
        else:
            default_prompt = '\n'.join(
                (
                    'You are an expert analyst AI, specializing in the meticulous comparison of documents. Your task '
                    'is to identify and summarize only the substantive differences between two versions of a text. '
                    'Your audience is already familiar with the original document and needs a concise summary of the '
                    'most significant changes in meaning or information.',
                    '',
                    '**Instructions:**',
                    '',
                    '1.  **Analyze the Texts:** Carefully review the document provided in the `<old_version>` and '
                    '`</old_version>` tags and the one in the `<new_version>` and `</new_version>` tags.',
                    '',
                    '2.  **Identify Substantive Changes:** Compare the two versions to identify all substantive '
                    'changes. A "substantive change" is defined as any modification that alters the core meaning, '
                    'intent, instructions, or factual information presented in the text. This includes, but is not '
                    'limited to:',
                    '*   Additions of new concepts, data, or requirements.',
                    '*   Deletions of existing information, arguments, or clauses.',
                    '*   Alterations to definitions, conclusions, instructions, or key takeaways.',
                    '',
                    '3.  **Exclude Non-Substantive Changes:** You must disregard any changes that are purely cosmetic, '
                    'typographical, or structural and do not alter the substantive meaning of the document. Explicitly '
                    'ignore the following:',
                    '*   Changes in page numbers, section/chapter numbering, or paragraph numbering.',
                    '*   Corrections of spelling, punctuation, or grammatical errors.',
                    '*   Modifications in formatting, layout, or font.',
                    '*   Rewording or rephrasing that does not change the underlying meaning or intent.',
                    '',
                    '4.  **Summarize Material Differences:** Create a summary of the identified substantive changes '
                    'with 100% fidelity. For each change, provide:',
                    '*   A clear heading identifying the relevant section (e.g., "Section 4: User Guidelines" or '
                    '"Chapter on Methodology").',
                    '*   A concise description of the modification, explaining whether it is an addition, deletion, or '
                    'alteration.',
                    '*   A brief analysis of how the change impacts the overall message or instructions, if not '
                    'immediately obvious.',
                    '',
                    '5.  **Output Format:**',
                    '*   Use Markdown for clear and structured presentation (e.g., headings and bullet points).',
                    '*   If no substantive changes are found, state this clearly.',
                    '*   If the changes consist only of additions, summarize the new content.',
                    '',
                    '6.  **Scope Limitation:** Base your analysis strictly on the provided text excerpts. Do not '
                    'infer or introduce any external context or information.',
                    '',
                    '<old_version>',
                    '{old_text}',
                    '</old_version>',
                    '',
                    '<new_version>',
                    '{new_text}',
                    '</new_version>',
                )
            )

        system_instructions = directives.get('system_instructions', default_system_instructions)
        # A literal backslash-n typed in a custom prompt is turned into a real newline.
        prompt = directives.get('prompt', default_prompt).replace('\\n', '\n')
        summary, model_version = get_ai_summary(prompt, system_instructions)
        if not summary:
            self.state.verb = 'changed,no_report'
            return {'text': '', 'markdown': '', 'html': ''}
        newline = '\n'  # For Python < 3.12 f-string {} compatibility
        back_n = '\\n'  # For Python < 3.12 f-string {} compatibility
        # The model is already named in the footer, and a custom prompt is too long to be echoed back.
        directives_for_str = {key: value for key, value in directives.items() if key != 'model'}
        if 'prompt' in directives_for_str:
            directives_for_str['prompt'] = '«custom»'
        directives_text = (
            (
                ' (differ directive(s): '
                + ', '.join(f'{key}={str(value).replace(newline, back_n)}' for key, value in directives_for_str.items())
                + ')'
            )
            if directives_for_str
            else ''
        )
        footer = (
            f"Summary by Google Generative AI's model {model_version}{directives_text}."
            if model_version or directives_text
            else ''
        )
        temp_unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] = {}
        # The report returned by the last call is the one used below for all three report kinds.
        for rep_kind in ['text', 'html']:  # markdown is same as text
            unified_report = DifferBase.process(
                'unified',
                directives.get('unified') or {},  # type: ignore[arg-type]
                self.state,
                rep_kind,  # type: ignore[arg-type]
                tz,
                temp_unfiltered_diff,
            )
        return {
            'text': (f'{summary}\n\n{unified_report["text"]}' + (f'\n------------\n{footer}' if footer else '')),
            'markdown': (f'{summary}\n\n{unified_report["markdown"]}' + (f'\n* * *\n{footer}' if footer else '')),
            'html': '\n'.join(
                [
                    mark_to_html(summary, extras={'tables'}).replace('<h2>', '<h3>').replace('</h2>', '</h3>'),
                    '<br>',
                    '<br>',
                    unified_report['html'],
                ]
                + (['-----<br>', f'<i><small>{footer}</small></i>'] if footer else [])
            ),
        }
1965

1966

1967
class WdiffDiffer(DifferBase):
    # Word-by-word differ: words added are highlighted and words removed are struck through (ANSI colors in
    # text/markdown reports, styled <span> elements in HTML reports). Work in progress; see the warning issued
    # by differ().

    __kind__ = 'wdiff'

    __supported_directives__: dict[str, str] = {
        'context_lines': 'the number of context lines (default: 3)',
        'range_info': 'include range information lines (default: true)',
    }

    def differ(
        self,
        directives: dict[str, Any],
        report_kind: Literal['text', 'markdown', 'html'],
        _unfiltered_diff: dict[Literal['text', 'markdown', 'html'], str] | None = None,
        tz: ZoneInfo | None = None,
    ) -> dict[Literal['text', 'markdown', 'html'], str]:
        """Builds a word-level diff of the old and new data.

        :param directives: The differ's directives ('context_lines' and 'range_info').
        :param report_kind: The kind of report requested (all three kinds are returned regardless).
        :param _unfiltered_diff: Not used by this differ.
        :param tz: Timezone used for the timestamps in the diff's header.
        :returns: A dict with the 'text', 'markdown' and 'html' versions of the diff.
        :raises ValueError: If the old or the new data is not a string.
        """
        warnings.warn(
            f'Job {self.job.index_number}: Differ {self.__kind__} is WORK IN PROGRESS and has KNOWN bugs which '
            "are being worked on. DO NOT USE AS THE RESULTS WON'T BE CORRECT.",
            RuntimeWarning,
            stacklevel=1,
        )
        if not isinstance(self.state.old_data, str):
            raise ValueError(
                f'Job {self.job.index_number}: Differ {self.__kind__} requires the old data to be a string, not '
                f'{type(self.state.old_data).__name__}.'
            )
        if not isinstance(self.state.new_data, str):
            raise ValueError(
                f'Job {self.job.index_number}: Differ {self.__kind__} requires the new data to be a string, not '
                f'{type(self.state.new_data).__name__}.'
            )

        # Split the texts into words tokenizing newline
        if self.state.is_markdown():
            # Don't split spaces in link text, tokenize space as </s>
            old_data = re.sub(r'\[(.*?)\]', lambda x: '[' + x.group(1).replace(' ', '</s>') + ']', self.state.old_data)
            words1 = old_data.replace('\n', ' <\\n> ').split(' ')
            new_data = re.sub(r'\[(.*?)\]', lambda x: '[' + x.group(1).replace(' ', '</s>') + ']', self.state.new_data)
            words2 = new_data.replace('\n', ' <\\n> ').split(' ')
        else:
            words1 = self.state.old_data.replace('\n', ' <\\n> ').split(' ')
            words2 = self.state.new_data.replace('\n', ' <\\n> ').split(' ')

        # Generate a difference list; each entry is a 2-character code ('  ', '+ ', '- ' or '? ') followed by the word
        diff = list(difflib.Differ().compare(words1, words2))

        add_html = '<span style="background-color:#d1ffd1;color:#082b08;">'
        rem_html = '<span style="background-color:#fff0f0;color:#9c1c1c;text-decoration:line-through;">'

        head_text = '\n'.join(
            [
                # f'Differ: wdiff',
                f'\033[91m--- @ {self.make_timestamp(self.state.old_timestamp, tz)}\033[0m',
                f'\033[92m+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}\033[0m',
                '',
            ]
        )
        # The opening monospace <span> is concatenated with the '---' line and the closing </span> with the '+++'
        # line (adjacent string literals), so that the wrapper encloses both header lines.
        head_html = '<br>\n'.join(
            [
                '<span style="font-family:monospace;">'
                # 'Differ: wdiff',
                f'<span style="color:darkred;">--- @ {self.make_timestamp(self.state.old_timestamp, tz)}</span>',
                f'<span style="color:darkgreen;">+++ @ {self.make_timestamp(self.state.new_timestamp, tz)}</span>'
                f'</span>',
                '',
            ]
        )
        # Process the diff output to make it more wdiff-like
        # Each word is emitted one iteration late (as prev_word_*) so that a closing marker can still be appended
        # to it once the following word shows that a run of additions or deletions has ended.
        result_text = []
        result_html = []
        prev_word_text = ''
        prev_word_html = ''
        next_text = ''  # opening marker deferred to the next word (when a run starts on a newline token)
        next_html = ''
        add = False  # inside a run of added words
        rem = False  # inside a run of removed words

        for word_text in [*diff, '  ']:  # the trailing '  ' sentinel flushes the last real word
            if word_text[0] == '?':  # additional context line
                continue
            word_html = word_text
            pre_text = [next_text] if next_text else []
            pre_html = [next_html] if next_html else []
            next_text = ''
            next_html = ''

            if word_text[0] == '+' and not add:  # Beginning of additions
                if rem:
                    prev_word_html += '</span>'
                    rem = False
                if word_text[2:] == '<\\n>':
                    next_text = '\033[92m'
                    next_html = add_html
                else:
                    pre_text.append('\033[92m')
                    pre_html.append(add_html)
                add = True
            elif word_text[0] == '-' and not rem:  # Beginning of deletions
                if add:
                    prev_word_html += '</span>'
                    add = False
                if word_text[2:] == '<\\n>':
                    next_text = '\033[91m'
                    next_html = rem_html
                else:
                    pre_text.append('\033[91m')
                    pre_html.append(rem_html)
                rem = True
            elif word_text[0] == ' ' and (add or rem):  # Unchanged word
                if prev_word_text == '<\\n>':
                    prev_word_text = '\033[0m<\\n>'
                    prev_word_html = '</span><\\n>'
                else:
                    prev_word_text += '\033[0m'
                    prev_word_html += '</span>'
                add = False
                rem = False
            elif word_text[2:] == '<\\n>':  # New line
                if add:
                    word_text = '  \033[0m<\\n>'
                    word_html = '  </span><\\n>'
                    add = False
                elif rem:
                    word_text = '  \033[0m<\\n>'
                    word_html = '  </span><\\n>'
                    rem = False

            result_text.append(prev_word_text)
            result_html.append(prev_word_html)
            pre_text.append(word_text[2:])
            pre_html.append(word_html[2:])
            prev_word_text = ''.join(pre_text)
            prev_word_html = ''.join(pre_html)
        if add or rem:
            result_text[-1] += '\033[0m'
            result_html[-1] += '</span>'

        # rebuild the text from words, replacing the newline token
        # (the first element is the empty initial value of prev_word_* and is dropped)
        diff_text = ' '.join(result_text[1:]).replace('<\\n> ', '\n').replace('<\\n>', '\n')
        diff_html = ' '.join(result_html[1:]).replace('<\\n> ', '\n').replace('<\\n>', '\n')

        # build contextlines
        contextlines = directives.get('context_lines', self.job.contextlines)
        # contextlines = 999
        if contextlines is None:
            contextlines = 3
        range_info = directives.get('range_info', True)
        text_lines = diff_text.splitlines()
        if contextlines < len(text_lines):
            # A line has changes if it carries an ANSI color marker (91 = removed, 92 = added)
            lines_with_changes = [i for i, line in enumerate(text_lines) if '\033[9' in line]
            if contextlines:
                lines_to_keep: set[int] = set()
                for i in lines_with_changes:
                    lines_to_keep.update(range(i - contextlines, i + contextlines + 1))
            else:
                lines_to_keep = set(lines_with_changes)
            new_diff_text = []
            new_diff_html = []
            last_line = 0  # 1-based number of the last line kept
            skip = False  # whether lines have been omitted since the last line kept
            i = 0
            for i, (line_text, line_html) in enumerate(zip(text_lines, diff_html.splitlines(), strict=False)):
                if i in lines_to_keep:
                    if range_info and skip:
                        new_diff_text.append(f'@@ {last_line + 1}...{i} @@')
                        new_diff_html.append(f'@@ {last_line + 1}...{i} @@')
                        skip = False
                    new_diff_text.append(line_text)
                    new_diff_html.append(line_html)
                    last_line = i + 1
                else:
                    skip = True
            # Range information for any lines omitted at the end
            if (i + 1) != last_line and range_info and skip:
                new_diff_text.append(f'@@ {last_line + 1}...{i + 1} @@')
                new_diff_html.append(f'@@ {last_line + 1}...{i + 1} @@')
            diff_text = '\n'.join(new_diff_text)
            diff_html = '\n'.join(new_diff_html)

        if self.state.is_markdown():
            # Restore the spaces in link text that were tokenized as </s> before splitting into words
            diff_text = diff_text.replace('</s>', ' ')
            diff_html = diff_html.replace('</s>', ' ')
            diff_html = mark_to_html(diff_html, self.job.markdown_padded_tables).replace('<p>', '').replace('</p>', '')

        if self.job.monospace:
            diff_html = f'<span style="font-family:monospace;white-space:pre-wrap">{diff_html}</span>'
        else:
            diff_html = diff_html.replace('\n', '<br>\n')

        return {
            'text': head_text + diff_text,
            'markdown': head_text + diff_text,
            'html': head_html + diff_html,
        }
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc