
hasgeek / coaster · build 9244418196 (push, via GitHub)

26 May 2024 03:39PM UTC · coverage: 84.467% (-4.8%) from 89.263%

Add async support for Quart+Flask (#470)

This commit bumps the version number from 0.7 to 0.8, as it carries extensive changes:

* Ruff replaces Black, isort and flake8 for linting and formatting
* All decorators now support async functions and provide async wrapper implementations (see the sketch after this list)
* Some obsolete modules have been removed
* Pagination from Flask-SQLAlchemy is now bundled, removing that dependency (though Flask-SQLAlchemy is still used in tests)
* A new `compat` module provides wrappers for both Quart and Flask and is used by all other modules
* Some tests now run under Quart. The vast majority of tests have not been upgraded, and there are no tests for the async decorators yet, so overall line coverage has dropped significantly. Comprehensive test coverage is still pending; for now, Funnel's tests serve as the extended test suite.
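
As an illustration of the decorator change above: a decorator can branch on `asyncio.iscoroutinefunction` to return a matching sync or async wrapper. This is a minimal sketch of the pattern, not Coaster's actual implementation; the decorator name is hypothetical:

    import asyncio
    from functools import wraps

    def log_calls(func):
        """Hypothetical decorator that supports both sync and async callables."""
        if asyncio.iscoroutinefunction(func):
            @wraps(func)
            async def async_wrapper(*args, **kwargs):
                print(f'calling {func.__name__}')  # shared pre-call logic
                return await func(*args, **kwargs)
            return async_wrapper

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            print(f'calling {func.__name__}')  # shared pre-call logic
            return func(*args, **kwargs)
        return sync_wrapper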

648 of 1023 new or added lines in 29 files covered. (63.34%)

138 existing lines in 17 files now uncovered.

3948 of 4674 relevant lines covered (84.47%)

3.38 hits per line

Source file: /src/coaster/utils/text.py (93.86% of lines covered)
"""Text processing utilities."""

from __future__ import annotations

import re
import string
from functools import partial
from html import unescape
from typing import Optional, Union
from xml.etree.ElementTree import Element  # nosec B405

import html5lib
from bleach.linkifier import DEFAULT_CALLBACKS, LinkifyFilter
from bleach.sanitizer import Cleaner
from markupsafe import Markup

__all__ = [
    'VALID_TAGS',
    'LINKIFY_SKIP_TAGS',
    'LINKIFY_CALLBACKS',
    'compress_whitespace',
    'deobfuscate_email',
    'normalize_spaces',
    'normalize_spaces_multiline',
    'sanitize_html',
    'simplify_text',
    'text_blocks',
    'ulstrip',
    'unicode_extended_whitespace',
    'urstrip',
    'ustrip',
]


#: Unicode's list of whitespace characters is missing some that were previously
#: classified as whitespace but are now considered format characters. These are
#: invisible and usually arrive via copy-paste, so we include them here as characters
#: to be replaced with spaces and stripped from the ends of text.
unicode_format_whitespace = (
    '\x85'  # NEXT LINE (NEL)
    '\xa0'  # NO-BREAK SPACE (NBSP)
    '\u1680'  # OGHAM SPACE MARK
    '\u180e'  # MONGOLIAN VOWEL SEPARATOR
    '\u2000'  # EN QUAD
    '\u2001'  # EM QUAD
    '\u2002'  # EN SPACE
    '\u2003'  # EM SPACE
    '\u2004'  # THREE-PER-EM SPACE
    '\u2005'  # FOUR-PER-EM SPACE
    '\u2006'  # SIX-PER-EM SPACE
    '\u2007'  # FIGURE SPACE
    '\u2008'  # PUNCTUATION SPACE
    '\u2009'  # THIN SPACE
    '\u200a'  # HAIR SPACE
    '\u200b'  # ZERO WIDTH SPACE (format)
    '\u200c'  # ZERO WIDTH NON-JOINER (format)
    '\u200d'  # ZERO WIDTH JOINER (format)
    '\u2028'  # LINE SEPARATOR
    '\u2029'  # PARAGRAPH SEPARATOR
    '\u202f'  # NARROW NO-BREAK SPACE (NNBSP)
    '\u205f'  # MEDIUM MATHEMATICAL SPACE (MMSP)
    '\u2060'  # WORD JOINER (format)
    '\u3000'  # IDEOGRAPHIC SPACE
    '\ufeff'  # ZERO WIDTH NO-BREAK SPACE (format)
)

ascii_whitespace = '\t\n\x0b\x0c\r\x1c\x1d\x1e\x1f '
# ASCII whitespace minus every character that str.splitlines treats as a line break
ascii_whitespace_without_newline = '\t\x1f '
unicode_extended_whitespace = ascii_whitespace + unicode_format_whitespace

re_singleline_spaces = re.compile(
    '[' + unicode_extended_whitespace + ']', re.UNICODE | re.MULTILINE
)
re_multiline_spaces = re.compile(
    # This is missing \u2028 and \u2029 (separators)
    '[' + ascii_whitespace_without_newline + unicode_format_whitespace + ']',
    re.UNICODE | re.MULTILINE,
)
re_compress_spaces = re.compile(
    r'[\s' + unicode_format_whitespace + ']+', re.UNICODE | re.MULTILINE
)

VALID_TAGS: dict[str, list[str]] = {
    'a': ['href', 'title', 'target', 'rel'],
    'abbr': ['title'],
    'b': [],
    'br': [],
    'blockquote': [],
    'cite': [],
    'code': [],
    'dd': [],
    'del': [],
    'dl': [],
    'dt': [],
    'em': [],
    'h3': [],
    'h4': [],
    'h5': [],
    'h6': [],
    'hr': [],
    'i': [],
    'img': ['src', 'width', 'height', 'align', 'alt'],
    'ins': [],
    'li': [],
    'mark': [],
    'p': [],
    'pre': [],
    'ol': ['start'],
    'strong': [],
    'sup': [],
    'sub': [],
    'ul': [],
}

LINKIFY_SKIP_TAGS = ['pre', 'code', 'kbd', 'samp', 'var']

# Attrs is described in the Linkify source as {(namespace, name): value}, but the code
# that calls us sets it as `attrs = {(None, "href"): href, "_text": url}`
LinkifyAttrsType = Optional[dict[Union[tuple[Optional[str], str], str], str]]


# Adapted from https://bleach.readthedocs.io/en/latest/linkify.html#preventing-links
def dont_linkify_filenames(
    attrs: LinkifyAttrsType,
    new: bool = False,
) -> LinkifyAttrsType:
    # This is an existing link, so leave it be
    if not new:
        return attrs
    if attrs is None:
        return attrs
    # If the TLD is '.py', make sure it starts with http: or https:.
    # Use _text because that's the original text
    link_text = attrs['_text']
    if link_text.endswith('.py') and not link_text.startswith(('http:', 'https:')):
        # This looks like a Python file, not a URL. Don't make a link.
        return None
    # Everything checks out, keep going to the next callback.
    return attrs


LINKIFY_CALLBACKS = [*DEFAULT_CALLBACKS, dont_linkify_filenames]  # type: ignore[list-item]
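# Illustrative use of the callback above (a sketch, not part of this module;
# it assumes the `bleach` package is importable, as it is for this file):
#
#     import bleach
#     bleach.linkify('Check out setup.py', callbacks=LINKIFY_CALLBACKS)
#     # 'setup.py' is left unlinked: it ends in '.py' with no http(s) scheme
#     bleach.linkify('See https://example.com/app.py', callbacks=LINKIFY_CALLBACKS)
#     # linked as usual: the link text starts with 'https:'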

def sanitize_html(
    value: str,
    valid_tags: Optional[dict[str, list[str]]] = None,
    strip: bool = True,
    linkify: bool = False,
) -> Markup:
    """Strip unwanted markup out of HTML."""
    if valid_tags is None:
        valid_tags = VALID_TAGS
    if linkify:
        filters = [
            partial(
                LinkifyFilter, callbacks=LINKIFY_CALLBACKS, skip_tags=LINKIFY_SKIP_TAGS
            )
        ]
    else:
        filters = []
    cleaner = Cleaner(
        tags=list(valid_tags.keys()),
        attributes=valid_tags,
        filters=filters,
        strip=strip,
    )
    return Markup(cleaner.clean(value))
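# Expected behaviour, shown here as a sketch rather than a verified doctest:
#
#     sanitize_html('<p onclick="alert(1)">Hello <em>world</em></p>')
#     # -> Markup('<p>Hello <em>world</em></p>'): 'p' and 'em' are in
#     # VALID_TAGS, but 'onclick' is not a whitelisted attribute and is dropped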

blockish_tags = {
    'address',
    'article',
    'aside',
    'audio',
    'blockquote',
    'canvas',
    'dd',
    'div',
    'dl',
    'dt',
    'fieldset',
    'figcaption',
    'figure',
    'footer',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'header',
    'hgroup',
    'hr',
    'li',
    'noscript',
    'ol',
    'output',
    'p',
    'pre',
    'section',
    'table',
    'td',
    'tfoot',
    'th',
    'tr',
    'ul',
    'video',
}

def text_blocks(html_text: str, skip_pre: bool = True) -> list[str]:
    """Extract a list of paragraphs from a given HTML string."""
    doc = html5lib.parseFragment(html_text)
    blocks = []

    def subloop(
        parent_tag: Optional[str], element: Element, lastchild: bool = False
    ) -> None:
        if callable(
            element.tag
        ):  # Comments have a callable tag. TODO: find out if anything else does
            tag = '<!-->'
            text = ''
            tail = element.tail or ''
        else:
            # Extract tag from namespace: "{http://www.w3.org/1999/xhtml}html"
            tag = element.tag.split('}')[-1]
            text = element.text or ''
            tail = element.tail or ''

        if tag == 'pre' and skip_pre:
            text = ''

        if tag in blockish_tags or tag == 'DOCUMENT_FRAGMENT':
            text = text.lstrip()  # Leading whitespace is insignificant in a block tag
            if len(element) == 0:
                # No children? Then trailing whitespace is insignificant
                text = text.rstrip()
            # If there's text, add it.
            # If there's no text but the next element is not a block tag, add a blank
            # anyway (unless it's a pre tag and we want to skip_pre, in which case
            # ignore it again).
            if text:
                blocks.append(text)
            elif (
                len(element)
                and isinstance(element[0].tag, str)
                and element[0].tag.split('}')[-1] not in blockish_tags
                and not (skip_pre and tag == 'pre')
            ):
                blocks.append('')
        elif not blocks:
            if text:
                blocks.append(text)
        else:
            blocks[-1] += text

        if len(element) > 0 and not (skip_pre and tag == 'pre'):
            for child in element[:-1]:
                subloop(tag, child)
            subloop(tag, element[-1], lastchild=True)

        if tag in blockish_tags:
            # Leading whitespace is insignificant after a block tag
            tail = tail.lstrip()
            if tail:
                blocks.append(tail)
        else:
            if parent_tag in blockish_tags and lastchild:
                # Trailing whitespace is insignificant before a block tag end
                tail = tail.rstrip()
            if not blocks:
                if tail:
                    blocks.append(tail)
            elif tag == 'br' and tail:
                blocks[-1] += '\n' + tail
            else:
                blocks[-1] += tail

    subloop(None, doc)
    # Replace &nbsp; with ' '
    return [t.replace('\xa0', ' ') for t in blocks]
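# Expected behaviour, shown here as a sketch rather than a verified doctest:
#
#     text_blocks('<p>First para</p><p>Second <em>para</em></p>')
#     # -> ['First para', 'Second para']
#     text_blocks('<pre>code</pre><p>text</p>')
#     # -> ['text']: contents of <pre> are skipped when skip_pre=True (default)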

def normalize_spaces(text: str) -> str:
    """Replace whitespace characters with regular spaces."""
    return re_singleline_spaces.sub(' ', text)


def normalize_spaces_multiline(text: str) -> str:
    """
    Replace whitespace characters with regular spaces, in multiline text.

    Line break characters like newlines are not considered whitespace.
    """
    return re_multiline_spaces.sub(' ', text)


def ulstrip(text: str) -> str:
    """Strip Unicode extended whitespace from the left side of a string."""
    return text.lstrip(unicode_extended_whitespace)


def urstrip(text: str) -> str:
    """Strip Unicode extended whitespace from the right side of a string."""
    return text.rstrip(unicode_extended_whitespace)


def ustrip(text: str) -> str:
    """Strip Unicode extended whitespace from a string."""
    return text.strip(unicode_extended_whitespace)


def compress_whitespace(text: str) -> str:
    """Reduce all space-like characters into single spaces and strip from ends."""
    return ustrip(re_compress_spaces.sub(' ', text))
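# Expected behaviour, shown here as a sketch rather than a verified doctest:
#
#     compress_whitespace('  Hello\u00a0\u200b  world\n')
#     # -> 'Hello world': NBSP, zero-width space and whitespace runs collapse
#     # to single spaces, and both ends are stripped
#     normalize_spaces_multiline('a\u00a0b\nc')
#     # -> 'a b\nc': the NBSP becomes a space but the newline is preserved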

# Based on http://jasonpriem.org/obfuscation-decoder/
_deobfuscate_dot1_re = re.compile(r'\W+\.\W+|\W+dot\W+|\W+d0t\W+', re.U | re.I)
_deobfuscate_dot2_re = re.compile(r'([a-z0-9])DOT([a-z0-9])')
_deobfuscate_dot3_re = re.compile(r'([A-Z0-9])dot([A-Z0-9])')
_deobfuscate_at1_re = re.compile(r'\W*@\W*|\W+at\W+', re.U | re.I)
_deobfuscate_at2_re = re.compile(r'([a-z0-9])AT([a-z0-9])')
_deobfuscate_at3_re = re.compile(r'([A-Z0-9])at([A-Z0-9])')


def deobfuscate_email(text: str) -> str:
    """Deobfuscate email addresses in provided text."""
    text = unescape(text)
    # Find the "dot"
    text = _deobfuscate_dot1_re.sub('.', text)
    text = _deobfuscate_dot2_re.sub(r'\1.\2', text)
    text = _deobfuscate_dot3_re.sub(r'\1.\2', text)
    # Find the "at"
    text = _deobfuscate_at1_re.sub('@', text)
    text = _deobfuscate_at2_re.sub(r'\1@\2', text)
    return _deobfuscate_at3_re.sub(r'\1@\2', text)
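# Expected behaviour, shown here as a sketch rather than a verified doctest:
#
#     deobfuscate_email('user at example dot com')
#     # -> 'user@example.com'
#     deobfuscate_email('userATexampleDOTcom')
#     # -> 'user@example.com'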

def simplify_text(text: str) -> str:
    """
    Simplify text to allow comparison.

    >>> simplify_text("Awesome Coder wanted at Awesome Company")
    'awesome coder wanted at awesome company'
    >>> simplify_text("Awesome Coder, wanted  at Awesome Company! ")
    'awesome coder wanted at awesome company'
    >>> simplify_text("Awesome Coder, wanted  at Awesome Company! ") == (
    ...     'awesome coder wanted at awesome company'
    ... )
    True
    """
    text = text.translate(text.maketrans('', '', string.punctuation)).lower()
    return ' '.join(text.split())