• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

pantsbuild / pants / 19015773527

02 Nov 2025 05:33PM UTC coverage: 17.872% (-62.4%) from 80.3%
19015773527

Pull #22816

github

web-flow
Merge a12d75757 into 6c024e162
Pull Request #22816: Update Pants internal Python to 3.14

4 of 5 new or added lines in 3 files covered. (80.0%)

28452 existing lines in 683 files now uncovered.

9831 of 55007 relevant lines covered (17.87%)

0.18 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

45.88
/src/python/pants/util/strutil.py
1
# Copyright 2014 Pants project contributors (see CONTRIBUTORS.md).
2
# Licensed under the Apache License, Version 2.0 (see LICENSE).
3

4
from __future__ import annotations
1✔
5

6
import dataclasses
1✔
7
import hashlib
1✔
8
import json
1✔
9
import re
1✔
10
import shlex
1✔
11
import textwrap
1✔
12
from collections import abc
1✔
13
from collections.abc import Callable, Iterable, Mapping
1✔
14
from logging import Logger
1✔
15
from typing import Any, TypeVar
1✔
16

17
import colors
1✔
18
from typing_extensions import ParamSpec
1✔
19

20
from pants.engine.internals.native_engine import Digest
1✔
21
from pants.util.ordered_set import FrozenOrderedSet, OrderedSet
1✔
22

23

24
def ensure_binary(text_or_binary: bytes | str) -> bytes:
1✔
UNCOV
25
    if isinstance(text_or_binary, bytes):
×
26
        return text_or_binary
×
UNCOV
27
    elif isinstance(text_or_binary, str):
×
UNCOV
28
        return text_or_binary.encode("utf8")
×
29
    else:
UNCOV
30
        raise TypeError(f"Argument is neither text nor binary type.({type(text_or_binary)})")
×
31

32

33
def ensure_text(text_or_binary: bytes | str) -> str:
1✔
UNCOV
34
    if isinstance(text_or_binary, bytes):
×
UNCOV
35
        return text_or_binary.decode()
×
UNCOV
36
    elif isinstance(text_or_binary, str):
×
UNCOV
37
        return text_or_binary
×
38
    else:
UNCOV
39
        raise TypeError(f"Argument is neither text nor binary type ({type(text_or_binary)})")
×
40

41

42
def safe_shlex_split(text_or_binary: bytes | str) -> list[str]:
    """Split a string using shell-like syntax.

    Safe even on python versions whose shlex.split() method doesn't accept unicode.
    """
    return shlex.split(ensure_text(text_or_binary))
49

50

51
# `_shell_unsafe_chars_pattern` and `shell_quote` are modified from the CPython 3.6 source:
# https://github.com/python/cpython/blob/142e3c08a40c75b5788474b0defe7d5c0671f675/Lib/shlex.py#L308
# NOTE(review): unlike stdlib `shlex.quote`, this pattern is compiled without `re.ASCII`, so
# non-ASCII word characters count as shell-safe — presumably intentional; confirm.
_shell_unsafe_chars_pattern = re.compile(r"[^\w@%+=:,./-]").search
54

55

56
def shell_quote(s: str) -> str:
    """Return a shell-escaped version of the string *s*."""
    if not s:
        return "''"
    if _shell_unsafe_chars_pattern(s) is None:
        # Every character is shell-safe; no quoting needed.
        return s

    # use single quotes, and put single quotes into double quotes
    # the string $'b is then quoted as '$'"'"'b'
    escaped = s.replace("'", "'\"'\"'")
    return f"'{escaped}'"
66

67

68
def safe_shlex_join(arg_list: Iterable[str]) -> str:
    """Join a list of strings into a shlex-able string.

    Shell-quotes each argument with `shell_quote()`.
    """
    quoted = [shell_quote(arg) for arg in arg_list]
    return " ".join(quoted)
74

75

76
def pluralize(count: int, item_type: str, include_count: bool = True) -> str:
    """Pluralizes the item_type if the count does not equal one.

    For example `pluralize(1, 'apple')` returns '1 apple',
    while `pluralize(0, 'apple') returns '0 apples'.

    When `include_count=False` does not add the count in front of the pluralized `item_type`.

    :return The count and inflected item_type together as a string
    """
    if count == 1:
        inflected = item_type
    elif item_type.endswith("s"):
        inflected = item_type + "es"
    elif item_type.endswith("y"):
        inflected = item_type[:-1] + "ies"
    else:
        inflected = item_type + "s"

    return f"{count} {inflected}" if include_count else inflected
101

102

103
def comma_separated_list(items: Iterable[str]) -> str:
1✔
UNCOV
104
    items = list(items)
×
UNCOV
105
    if len(items) == 0:
×
UNCOV
106
        return ""
×
UNCOV
107
    if len(items) == 1:
×
UNCOV
108
        return items[0]
×
UNCOV
109
    if len(items) == 2:
×
UNCOV
110
        return f"{items[0]} and {items[1]}"
×
111
    # For 3+ items, employ the oxford comma.
UNCOV
112
    return f"{', '.join(items[0:-1])}, and {items[-1]}"
×
113

114

115
def strip_prefix(string: str, prefix: str) -> str:
    """Returns a copy of the string from which the multi-character prefix has been stripped.

    Use strip_prefix() instead of lstrip() to remove a substring (instead of individual characters)
    from the beginning of a string, if the substring is present.  lstrip() does not match substrings
    but rather treats a substring argument as a set of characters.

    :param string: The string from which to strip the specified prefix.
    :param prefix: The substring to strip from the left of string, if present.
    :return: The string with prefix stripped from the left, if present.
    """
    # str.removeprefix (Python 3.9+) implements exactly this contract.
    return string.removeprefix(prefix)
130

131

132
# NB: We allow bytes because `ProcessResult.std{err,out}` uses bytes.
133
def strip_v2_chroot_path(v: bytes | str) -> str:
1✔
134
    """Remove all instances of the chroot tmpdir path from the str so that it only uses relative
135
    paths.
136

137
    This is useful when a tool that is run with the V2 engine outputs absolute paths. It is
138
    confusing for the user to see the absolute path in the final output because it is an
139
    implementation detail that Pants copies their source code into a chroot.
140
    """
UNCOV
141
    if isinstance(v, bytes):
×
UNCOV
142
        v = v.decode()
×
UNCOV
143
    return re.sub(r"/[a-zA-Z0-9-_\/]*/pants-sandbox-[a-zA-Z0-9]+/", "", v)
×
144

145

146
@dataclasses.dataclass(frozen=True)
1✔
147
class Simplifier:
1✔
148
    """Helper for options for conditionally simplifying a string."""
149

150
    # it's only rarely useful to show a chroot path to a user, hence they're stripped by default
151
    strip_chroot_path: bool = True
1✔
152
    """remove all instances of the chroot tmpdir path"""
1✔
153
    strip_formatting: bool = False
1✔
154
    """remove ANSI formatting commands (colors, bold, etc)"""
1✔
155

156
    def simplify(self, v: bytes | str) -> str:
1✔
UNCOV
157
        chroot = (
×
158
            strip_v2_chroot_path(v)
159
            if self.strip_chroot_path
160
            else v.decode()
161
            if isinstance(v, bytes)
162
            else v
163
        )
UNCOV
164
        formatting = colors.strip_color(chroot) if self.strip_formatting else chroot
×
UNCOV
165
        assert isinstance(formatting, str)
×
166

UNCOV
167
        return formatting
×
168

169

170
def hard_wrap(s: str, *, indent: int = 0, width: int = 96) -> list[str]:
    """Hard wrap a string while still preserving any prior hard wrapping (new lines).

    This works well when the input uses soft wrapping, e.g. via Python's implicit string
    concatenation.

    Usually, you will want to join the lines together with "\n".join().
    """
    pad = " " * indent
    result: list[str] = []
    for line in s.splitlines():
        # wrap() returns [] for an empty line, but we want to emit those, hence the `or [line]`.
        wrapped = textwrap.wrap(line, width=width - indent) or [line]
        result.extend(pad + piece for piece in wrapped)
    return result
184

185

186
def bullet_list(elements: Iterable[str], max_elements: int = -1) -> str:
    """Format a bullet list with padding.

    Callers should normally use `\n\n` before and (if relevant) after this so that the bullets
    appear as a distinct section.

    The `max_elements` may be used to limit the number of bullet rows to output, and instead leave a
    last bullet item with "* ... and N more".
    """
    if not elements:
        return ""

    rows = tuple(elements)
    if 0 < max_elements < len(rows):
        # Replace the tail with a single "... and N more" marker row.
        omitted = len(rows) - max_elements + 1
        rows = rows[: max_elements - 1] + (f"... and {omitted} more",)

    return "  * " + "\n  * ".join(rows)
207

208

209
def first_paragraph(s: str) -> str:
    """Get the first paragraph, where paragraphs are separated by blank lines."""
    paragraph: list[str] = []
    for line in s.splitlines():
        if not line.strip():
            break
        paragraph.append(line)
    return " ".join(paragraph)
216

217

218
# This is more conservative than it necessarily need be. In practice POSIX filesystems
# support any printable character except the path separator (forward slash), but it's
# better to be over-cautious.

# TODO: <> may not be safe in Windows paths. When we support Windows we will probably
#  want to detect that here and be more restrictive on that platform. But we do want
#  to support <> where possible, because they appear in target partition descriptions
#  (e.g., "CPython>=2.7,<3") and those are sometimes converted to paths.
# Matches any single character that is NOT in the allowed set below (used by `path_safe`).
_non_path_safe_re = re.compile(r"[^a-zA-Z0-9_\-.()<>,= ]")
227

228

229
def path_safe(s: str) -> str:
    """Return *s* with every character outside `_non_path_safe_re`'s allowed set replaced by "_"."""
    return _non_path_safe_re.sub("_", s)
231

232

233
# TODO: This may be a bit too eager. Some strings might want to preserve multiple spaces in them
# (e.g. a Python code block which has a comment in it would have 2 spaces before the "#", which
# would be squashed by this eager regex). The challenge is that there's some overlap between prose
# (which shouldn't need multiple spaces) and code (which might) for non-alphanumeric characters.
# We can tighten as necessary.
# Two-or-more spaces sandwiched between non-whitespace characters; softwrap() squashes them to one.
_super_space_re = re.compile(r"(\S)  +(\S)")
# Three or more consecutive newlines (2+ blank lines); softwrap() collapses them to exactly two.
_more_than_2_newlines = re.compile(r"\n{2}\n+")
# Captures the leading-space run of a line that has content; softwrap() uses search(), so it
# effectively measures the margin of the first content line.
_leading_whitespace_re = re.compile(r"(^[ ]*)(?:[^ \n])", re.MULTILINE)
241

242

243
def softwrap(text: str) -> str:
    """Turns a multiline-ish string into a softwrapped string.

    This is primarily used to turn strings in source code, which often have a single paragraph
    span multiple source lines, into consistently formatted blocks for hardwrapping later.

    Applies the following rules:
        - Dedents the text (you also don't need to start your string with a backslash)
            (The algorithm used for dedention simply looks at the first indented line and
            unambiguously tries to strip that much indentation from every indented line thereafter.)
        - Replaces all occurrences of multiple spaces in a sentence with a single space
        - Replaces all occurrences of multiple newlines with exactly 2 newlines
        - Replaces singular newlines with a space (to turn a paragraph into one long line)
            - Unless the following line is indented, or begins with a `* ` (to indicate an item in a list),
              in which case the newline and indentation are preserved.
        - Double-newlines are preserved
        - Extra indentation is preserved, and also preserves the indented line's ending
            (If your indented line needs to be continued due to it being longer than the suggested
            width, use trailing backlashes to line-continue the line. Because we squash multiple
            spaces, this will "just work".)

    To keep the numbered or bullet lists indented without converting to a code block,
    make sure to use 2 spaces (and not 4).
    """
    if not text:
        return text
    # If callers didn't use a leading "\" that's OK.
    if text[0] == "\n":
        text = text[1:]

    # Collapse runs of 3+ newlines down to exactly one blank line.
    text = _more_than_2_newlines.sub("\n\n", text)
    # Dedent: strip the first content line's leading-space margin from the start of every line
    # (a no-op when that margin is empty).
    margin = _leading_whitespace_re.search(text)
    if margin:
        text = re.sub(r"(?m)^" + margin[1], "", text)

    lines = text.splitlines(keepends=True)
    # NB: collecting a list of strs and `"".join` is more performant than calling `+=` repeatedly.
    result_strs: list[str] = []
    for i, line in enumerate(lines):
        # Squash interior runs of multiple spaces to a single space.
        line = _super_space_re.sub(r"\1 \2", line)
        next_line = lines[i + 1] if i + 1 < len(lines) else ""
        if (
            # Keep the hard break when this or the next line is blank (with keepends a blank
            # line is exactly "\n"), when either line is indented, or when this line is a
            # "* " list item.
            "\n" in (line, next_line)
            or line.startswith(" ")
            or next_line.startswith(" ")
            or line.lstrip().startswith("* ")
        ):
            result_strs.append(line)
        else:
            # Otherwise soft-wrap: join this line onto the next with a single space.
            result_strs.append(line.rstrip())
            result_strs.append(" ")

    return "".join(result_strs).rstrip()
296

297

298
_MEMORY_UNITS = ["B", "KiB", "MiB", "GiB"]


def fmt_memory_size(value: int, *, units: Iterable[str] = _MEMORY_UNITS) -> str:
    """Formats a numeric value as amount of bytes alongside the biggest byte-based unit from the
    list that represents the same amount without using decimals.

    :param value: the number of bytes.
    :param units: unit suffixes, each representing 1024x the previous; defaults to B..GiB.
    :return: e.g. ``fmt_memory_size(2048)`` -> ``"2KiB"``; if `units` is empty, just `str(value)`.
    """
    if not units:
        return str(value)

    units = tuple(units)
    amount = value
    unit_idx = 0
    # Climb to larger units only while the amount stays an exact multiple of 1024.
    while amount >= 1024 and amount % 1024 == 0 and unit_idx < len(units) - 1:
        # Floor division keeps this exact: `int(amount / 1024)` goes through a float and
        # can lose precision for values beyond 2**53.
        amount //= 1024
        unit_idx += 1

    return f"{amount}{units[unit_idx]}"
317

318

319
def strval(val: str | Callable[[], str]) -> str:
1✔
320
    return val if isinstance(val, str) else val()
1✔
321

322

323
def help_text(val: str | Callable[[], str]) -> str | Callable[[], str]:
1✔
324
    """Convenience method for defining an optionally lazy-evaluated softwrapped help string.
325

326
    This exists because `mypy` does not respect the type hints defined on base `Field` and `Target`
327
    classes.
328
    """
329
    # This can go away when https://github.com/python/mypy/issues/14702 is fixed
330
    if isinstance(val, str):
1✔
331
        return softwrap(val)
1✔
332
    else:
333
        return lambda: softwrap(val())
1✔
334

335

336
# Generic typing helpers for signature-preserving decorators (see `docstring` in this file):
# P captures the decorated callable's parameters, R its return type.
P = ParamSpec("P")
R = TypeVar("R")
338

339

340
def docstring(doc: str | Callable[[], str]) -> Callable[[Callable[P, R]], Callable[P, R]]:
    """Use this decorator to provide a dynamic doc-string to a function."""

    def _decorate(func: Callable[P, R]) -> Callable[P, R]:
        # Accepts either the string itself or a thunk that lazily produces it.
        func.__doc__ = strval(doc)
        return func

    return _decorate
348

349

350
class _JsonEncoder(json.JSONEncoder):
    """JSON encoder that additionally serializes mappings, sequences/ordered sets, dataclasses,
    and `Digest`s.

    Anything else falls through to `json.JSONEncoder.default`, which raises TypeError.
    """

    def default(self, o):
        """Return a serializable object for o."""
        # NB: branch order matters — a Mapping must be handled before the Sequence check.
        if isinstance(o, abc.Mapping):
            return dict(o)
        if isinstance(o, (abc.Sequence, OrderedSet, FrozenOrderedSet)):
            return list(o)

        # NB: A quick way to embed the type in the hash so that two objects with the same data but
        # different types produce different hashes.
        classname = o.__class__.__name__
        if dataclasses.is_dataclass(o):
            return {"__class__.__name__": classname, **dataclasses.asdict(o)}
        if isinstance(o, (Digest,)):
            return {"__class__.__name__": classname, "fingerprint": o.fingerprint}
        return super().default(o)
369

370

371
def stable_hash(value: Any, *, name: str = "sha256") -> str:
    """Attempts to return a stable hash of the value stable across processes.

    "Stable" here means that if `value` is equivalent in multiple invocations (across multiple
    processes), it should produce the same hash. To that end, what values are accepted are limited
    in scope.
    """
    # Canonical form: no whitespace, sorted keys, so equivalent values serialize identically.
    serialized = json.dumps(
        value, indent=None, separators=(",", ":"), sort_keys=True, cls=_JsonEncoder
    )
    return hashlib.new(name, serialized.encode("utf-8")).hexdigest()
384

385

386
# NB: If an OS string is not valid UTF-8, Python encodes the non-decodable bytes
#  as lone surrogates (see https://peps.python.org/pep-0383/).
#  However when we pass these to Rust, we will fail to decode as strict UTF-8.
#  So we perform a lossy re-encoding to prevent this.
def strict_utf8(s: str) -> str:
    """Lossily re-encode *s* so the result round-trips as strict UTF-8."""
    as_bytes = s.encode("utf-8", errors="replace")
    return as_bytes.decode("utf-8")
392

393

394
def get_strict_env(env: Mapping[str, str], logger: Logger) -> Mapping[str, str]:
    """Return a copy of `env` with any entry whose name or value is not valid UTF-8 dropped.

    Dropped entries are reported via `logger.warning`.
    """
    strict_env: dict[str, str] = {}
    for key, val in sorted(env.items()):
        strict_key = strict_utf8(key)
        if strict_key != key:
            # We can only log strict_key, because logging will choke on non-UTF-8.
            # But the reader will know what we mean.
            logger.warning(f"Environment variable with non-UTF-8 name ignored: {strict_key}")
            continue
        if strict_utf8(val) != val:
            logger.warning(f"Environment variable with non-UTF-8 value ignored: {key}")
            continue
        strict_env[key] = val
    return strict_env
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc