• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

snarfed / bridgy-fed / 41d10851-5bfa-4684-9acc-e042d794f291

20 Sep 2024 07:12PM UTC coverage: 92.811% (+0.04%) from 92.776%
41d10851-5bfa-4684-9acc-e042d794f291

push

circleci

snarfed
refactor Web.poll_feed_task, add better error handling

pull out an internal poll method, wrap it in a mostly-catch-all exception handler, make sure we more reliably create the next task even if fetching or parsing fails

fixes:
https://console.cloud.google.com/errors/detail/CICLhIKMm6nSHw;locations=global;time=P30D?project=bridgy-federated
https://console.cloud.google.com/errors/detail/CJCrlNfNn6n3FQ;locations=global;time=P30D?project=bridgy-federated
https://console.cloud.google.com/errors/detail/CPnSx-yemJ-hlAE;locations=global;time=P30D?project=bridgy-federated
etc

45 of 46 new or added lines in 2 files covered. (97.83%)

19 existing lines in 1 file now uncovered.

4196 of 4521 relevant lines covered (92.81%)

0.93 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.87
/web.py
1
"""Webmention protocol with microformats2 in HTML, aka the IndieWeb stack."""
2
from datetime import timedelta, timezone
1✔
3
import difflib
1✔
4
import logging
1✔
5
import re
1✔
6
import statistics
1✔
7
import urllib.parse
1✔
8
from urllib.parse import quote, urlencode, urljoin, urlparse
1✔
9
from xml.etree import ElementTree
1✔
10

11
import brevity
1✔
12
from flask import redirect, render_template, request
1✔
13
from google.cloud import ndb
1✔
14
from google.cloud.ndb import ComputedProperty
1✔
15
from granary import as1, as2, atom, microformats2, rss
1✔
16
import mf2util
1✔
17
from oauth_dropins.webutil import flask_util, util
1✔
18
from oauth_dropins.webutil.appengine_config import tasks_client
1✔
19
from oauth_dropins.webutil import appengine_info
1✔
20
from oauth_dropins.webutil.flask_util import cloud_tasks_only, error, flash
1✔
21
from oauth_dropins.webutil.util import domain_from_link, json_dumps, json_loads
1✔
22
from oauth_dropins.webutil import webmention
1✔
23
from requests import HTTPError, RequestException
1✔
24
from requests.auth import HTTPBasicAuth
1✔
25
from werkzeug.exceptions import BadGateway, BadRequest, HTTPException, NotFound
1✔
26

27
import common
1✔
28
from common import (
1✔
29
    CACHE_CONTROL,
30
    DOMAIN_RE,
31
    PRIMARY_DOMAIN,
32
    PROTOCOL_DOMAINS,
33
    SUPERDOMAIN,
34
)
35
from flask_app import app
1✔
36
from ids import normalize_user_id, translate_object_id, translate_user_id
1✔
37
from models import Follower, Object, PROTOCOLS, Target, User
1✔
38
from protocol import Protocol
1✔
39

40
logger = logging.getLogger(__name__)
1✔
41

42
# https://github.com/snarfed/bridgy-fed/issues/314
43
WWW_DOMAINS = frozenset((
1✔
44
    'www.jvt.me',
45
))
46

47
FEED_TYPES = {
1✔
48
    atom.CONTENT_TYPE.split(';')[0]: 'atom',
49
    rss.CONTENT_TYPE.split(';')[0]: 'rss',
50
    # https://stackoverflow.com/questions/4832357/whats-the-difference-between-text-xml-vs-application-xml-for-webservice-respons
51
    'application/xml': 'xml',
52
    'text/xml': 'xml',
53
}
54
MIN_FEED_POLL_PERIOD = timedelta(hours=2)
1✔
55
MAX_FEED_POLL_PERIOD = timedelta(days=1)
1✔
56
MAX_FEED_PROPERTY_SIZE = 500 * 1000  # Object.atom/rss
1✔
57
MAX_FEED_ITEMS_PER_POLL = 10
1✔
58

59
# populated into Web.redirects_error
60
OWNS_WEBFINGER = 'This site serves its own Webfinger, and likely ActivityPub too.'
1✔
61

62

63
def is_valid_domain(domain, allow_internal=True):
    """Checks whether a domain is one we can use.

    A usable domain matches ``DOMAIN_RE``, isn't blocklisted, and ends in a
    TLD that ``brevity`` knows about.

    Args:
      domain (str):
      allow_internal (bool): passed through to :meth:`Web.is_blocklisted`;
        whether internal domains like ``fed.brid.gy``, ``bsky.brid.gy``, etc
        count as valid

    Returns:
      bool: True if the domain is usable, False otherwise
    """
    well_formed = bool(domain) and re.match(DOMAIN_RE, domain)
    if not well_formed:
        logger.debug(f"{domain} doesn't look like a domain")
        return False

    if Web.is_blocklisted(domain, allow_internal=allow_internal):
        logger.info(f'{domain} is blocklisted')
        return False

    # the TLD is whatever follows the last dot
    *_, tld = domain.split('.')
    if tld in brevity.TLDS:
        return True

    logger.info(f"{domain} looks like a domain but {tld} isn't a TLD")
    return False
87

88

89
class Web(User, Protocol):
1✔
90
    """Web user and webmention protocol implementation.
91

92
    The key name is the domain.
93
    """
94
    ABBREV = 'web'
1✔
95
    PHRASE = 'the web'
1✔
96
    OTHER_LABELS = ('webmention',)
1✔
97
    LOGO_HTML = '🌐'  # used to be 🕸️
1✔
98
    CONTENT_TYPE = common.CONTENT_TYPE_HTML
1✔
99
    DEFAULT_ENABLED_PROTOCOLS = ('activitypub',)
1✔
100
    SUPPORTED_AS1_TYPES = (
1✔
101
        tuple(as1.ACTOR_TYPES)
102
        + tuple(as1.POST_TYPES)
103
        + tuple(as1.CRUD_VERBS)
104
        + ('audio', 'bookmark', 'event', 'image', 'video')
105
        + ('follow', 'like', 'share', 'stop-following')
106
    )
107

108
    has_redirects = ndb.BooleanProperty()
1✔
109
    redirects_error = ndb.TextProperty()
1✔
110
    has_hcard = ndb.BooleanProperty()
1✔
111
    last_webmention_in = ndb.DateTimeProperty(tzinfo=timezone.utc)
1✔
112
    last_polled_feed = ndb.DateTimeProperty(tzinfo=timezone.utc)
1✔
113
    feed_last_item = ndb.StringProperty()  # id (URL)
1✔
114
    feed_etag = ndb.StringProperty()
1✔
115
    feed_last_modified = ndb.StringProperty()
1✔
116

117
    # only used by protocol bot users in Bluesky, for polling their chat
118
    # messages with chat.bsky.convo.getLog
119
    atproto_last_chat_log_cursor = ndb.StringProperty()
1✔
120

121
    # Originally, BF served Web users' AP actor ids on fed.brid.gy, eg
122
    # https://fed.brid.gy/snarfed.org . When we started adding new protocols, we
123
    # switched to per-protocol subdomains, eg https://web.brid.gy/snarfed.org .
124
    # However, we need to preserve the old users' actor ids as is.
125
    #
126
    # Also, our per-protocol bot accounts in ActivityPub are on their own
127
    # subdomains, eg @bsky.brid.gy@bsky.brid.gy.
128
    #
129
    # So, this property tracks which subdomain a given Web user's AP actor uses.
130
    ap_subdomain = ndb.StringProperty(
1✔
131
        choices=['ap', 'bsky', 'fed', 'web', 'fake', 'other', 'eefake'],
132
        default='web')
133

134
    # OLD. some stored entities still have these; do not reuse.
135
    # superfeedr_subscribed = ndb.DateTimeProperty(tzinfo=timezone.utc)
136
    # superfeedr_subscribed_feed = ndb.StringProperty()
137

138
    @classmethod
    def _get_kind(cls):
        """Returns the datastore kind name that this model is stored under.

        NOTE(review): ``MagicKey`` is presumably a legacy kind name kept so
        that existing stored entities still load; confirm before changing.
        """
        kind = 'MagicKey'
        return kind
141

142
    def _pre_put_hook(self):
        """Validates the key id before this entity is stored.

        The id must pass :func:`is_valid_domain` and must not contain any
        upper case characters.

        Raises:
          AssertionError: if either check fails
        """
        super()._pre_put_hook()

        domain = self.key.id()
        assert is_valid_domain(domain), domain
        assert domain == domain.lower(), \
            f'upper case is not allowed in Web key id: {domain}'
148

149
    @classmethod
    def get_or_create(cls, id, allow_opt_out=False, verify=None, **kwargs):
        """Normalizes the domain, then defers to :meth:`User.get_or_create`.

        Normalization happens in :meth:`key_for`: lower casing and removing
        leading and trailing dots. Domains on or under ``SUPERDOMAIN`` are only
        looked up, never created. Users that didn't already exist also get a
        ``poll-feed`` task enqueued.

        Args:
          id (str): domain or home page URL
          allow_opt_out (bool): passed through to :meth:`key_for` and
            :meth:`User.get_or_create`
          verify (bool): whether to call :meth:`verify` to load h-card, check
            redirects, etc. Defaults to calling it only if the user is new.
          kwargs: passed through to :meth:`User.get_or_create`

        Returns:
          Web: the user, or None if there's no usable key for ``id``, eg
          because the user is opted out
        """
        key = cls.key_for(id, allow_opt_out=allow_opt_out)
        if not key:
            return None  # opted out

        domain = key.id()
        our_domains = [SUPERDOMAIN.strip('.')]
        if util.domain_or_parent_in(domain, our_domains):
            # internal domain: only ever look up, don't create
            return super().get_by_id(domain)

        user = super().get_or_create(domain, allow_opt_out=allow_opt_out, **kwargs)
        if not user:
            return None

        should_verify = verify if verify is not None else not user.existing
        if should_verify:
            # verify may hand back a different user, eg the root domain's
            user = user.verify()

        if not user.existing:
            common.create_task(queue='poll-feed', domain=user.key.id())

        return user
179

180
    @ndb.ComputedProperty
    def handle(self):
        """Returns this user's chosen username or domain, eg ``user.com``.

        When :meth:`username` is just the key id, it's returned untouched.
        Otherwise it's run through ``domain_from_link`` to prettify it.
        """
        username = self.username()
        if username == self.key.id():
            return username

        return domain_from_link(username, minimize=False)
188

189
    def handle_as(self, to_proto):
        """Returns this user's handle in another protocol.

        ActivityPub is special cased: users with Webfinger redirects get
        ``@[username]@[domain]``, others get ``@[domain]@[ap_subdomain]``
        followed by ``SUPERDOMAIN``. All other protocols defer to the
        superclass.

        Args:
          to_proto (str or Protocol): protocol label, abbreviation, or class
        """
        if to_proto not in ('activitypub', 'ap', PROTOCOLS['ap']):
            return super().handle_as(to_proto)

        if self.has_redirects:
            return f'@{self.username()}@{self.key.id()}'

        return f'@{self.key.id()}@{self.ap_subdomain}{SUPERDOMAIN}'
196

197
    def id_as(self, to_proto):
        """Returns this user's id in another protocol.

        ActivityPub is special cased: the translated id's ``fed.brid.gy`` or
        ``web.brid.gy`` host is swapped for this user's ``ap_subdomain``.

        Args:
          to_proto (str or Protocol): protocol label or class

        Returns:
          str:
        """
        proto = PROTOCOLS[to_proto] if isinstance(to_proto, str) else to_proto
        converted = translate_user_id(id=self.key.id(), from_=self, to=proto)

        if proto.LABEL != 'activitypub':
            return converted

        # swap whichever of fed/web this user *doesn't* use for the one it does
        other = 'fed' if self.ap_subdomain != 'fed' else 'web'
        return converted.replace(f'https://{other}.brid.gy/',
                                 f'https://{self.ap_subdomain}.brid.gy/')
211

212
    web_url = User.profile_id
1✔
213

214
    def is_web_url(self, url):
        """Defers to :meth:`User.is_web_url`, always with ``ignore_www=True``."""
        matches = super().is_web_url(url, ignore_www=True)
        return matches
216

217
    def user_page_path(self, rest=None):
        """Returns the path to this user's page, always based on the domain.

        Args:
          rest (str): optional extra path or query string to append. A ``/``
            separator is added unless it starts with ``?``.

        Returns:
          str: eg ``/web/user.com`` or ``/web/user.com/followers``
        """
        base = f'/{self.ABBREV}/{self.key.id()}'
        if not rest:
            return base

        separator = '' if rest.startswith('?') else '/'
        return base + separator + rest.lstrip('/')
227

228
    def username(self):
        """Returns the user's preferred username.

        For direct users with a stored profile object, looks through the
        profile's ``url`` and ``urls`` for an ``acct:`` URI on this user's own
        domain and uses its username part. Falls back to the key id.

        Returns:
          str:
        """
        id = self.key.id()

        if not (self.obj and self.obj.as1 and self.direct):
            return id

        candidates = (util.get_list(self.obj.as1, 'url') +
                      util.get_list(self.obj.as1, 'urls'))
        for candidate in candidates:
            if isinstance(candidate, dict):
                candidate = candidate.get('value')
            if not candidate or not candidate.startswith('acct:'):
                continue

            try:
                urluser, urldomain = util.parse_acct_uri(candidate)
            except ValueError:
                # malformed acct: URI, try the next one
                continue

            if urldomain == id:
                logger.info(f'Found custom username: {urluser}')
                return urluser

        return id
253

254
    @ndb.ComputedProperty
    def status(self):
        """Returns ``blocked`` for sites that serve their own Webfinger.

        Otherwise defers to the superclass's ``status``.
        """
        # looks like this site is already its own fediverse server
        owns_webfinger = self.redirects_error == OWNS_WEBFINGER
        return 'blocked' if owns_webfinger else super().status
261

262
    def verify(self):
        """Fetches site a couple ways to check for redirects and h-card.

        Runs three checks, in order:

        1. If the domain starts with ``www.`` (and isn't in ``WWW_DOMAINS``)
           and the root domain serves ok, marks this user as ``use_instead``
           the root domain's user and verifies that user instead.
        2. Fetches the site's Webfinger URL and checks whether it redirects to
           us. Sets ``has_redirects`` and ``redirects_error``.
        3. Loads the home page. Sets ``obj`` and ``has_hcard``.

        Stores this user before returning.

        Returns:
          web.Web: user that was verified. May be different than self! eg if
          self's domain started with www and we switch to the root domain.
        """
        domain = self.key.id()
        logger.info(f'Verifying {domain}')

        if domain.startswith('www.') and domain not in WWW_DOMAINS:
            # if root domain serves ok, use it instead
            # https://github.com/snarfed/bridgy-fed/issues/314
            root = domain.removeprefix('www.')
            root_site = f'https://{root}/'
            try:
                resp = util.requests_get(root_site, gateway=False)
                if resp.ok and self.is_web_url(resp.url):
                    logger.info(f'{root_site} serves ok ; using {root} instead')
                    root_user = Web.get_or_create(
                        root,
                        enabled_protocols=self.enabled_protocols,
                        direct=self.direct)
                    self.use_instead = root_user.key
                    self.put()
                    return root_user.verify()
            except RequestException as e:
                logger.info(f"Couldn't fetch {root_site} : {e}")
                logger.info(f"Continuing with {domain}")

        # check webfinger redirect
        path = f'/.well-known/webfinger?resource=acct:{domain}@{domain}'
        self.has_redirects = False
        self.redirects_error = None
        try:
            url = urljoin(self.web_url(), path)
            resp = util.requests_get(url, gateway=False)
            our_bases = ([f'https://{ours}/' for ours in common.DOMAINS] +
                         [common.host_url()])
            expected = [urljoin(base, path) for base in our_bases]
            if resp.url:
                got = urllib.parse.unquote(resp.url)
                if got in expected:
                    self.has_redirects = True
                else:
                    # check host-meta to see if they serve their own Webfinger
                    host_meta = util.requests_get(
                        urljoin(self.web_url(), '/.well-known/host-meta'),
                        gateway=False)
                    if (host_meta.status_code == 200
                            and domain_from_link(host_meta.url) not in common.DOMAINS):
                        logger.info(f"{domain} serves Webfinger! probably a fediverse server")
                        self.redirects_error = OWNS_WEBFINGER
                    else:
                        diff = '\n'.join(difflib.Differ().compare([got], [expected[0]]))
                        self.redirects_error = f'Current vs expected:<pre>{diff}</pre>'
            else:
                # NOTE(review): this branch only runs when resp.url is empty,
                # so the old "redirected to: [resp.url]" line here could never
                # be added and has been dropped. If the outer check was meant
                # to be on the response status instead, revisit this.
                lines = [url, f'  returned HTTP {resp.status_code}']
                self.redirects_error = '<pre>' + '\n'.join(lines) + '</pre>'
        except RequestException as e:
            # best effort: leave has_redirects False and redirects_error unset,
            # but record why so that failures are debuggable
            logger.info(f"Couldn't check Webfinger redirect for {domain} : {e}")

        # check home page
        self.obj = None
        self.has_hcard = False
        try:
            self.obj = Web.load(self.web_url(), remote=True, gateway=True)
            if self.obj:
                self.has_hcard = True
        except (BadRequest, NotFound):
            # home page couldn't be loaded or has no usable h-card
            pass

        self.put()
        return self
339

340
    @classmethod
    def key_for(cls, id, allow_opt_out=False):
        """Returns the :class:`ndb.Key` for a given id.

        The id is lower cased and stripped of leading and trailing dots. If
        it's a URL with an empty or ``/`` path, ie a home page, it's replaced
        with the URL's host. The result must then be a valid domain, internal
        domains included; otherwise, returns None.

        Args:
          id (str)
          allow_opt_out (bool): whether to allow users who are currently opted out

        Returns:
          ndb.Key or None:
        """
        if not id:
            return None

        normalized = id.lower().strip('.')
        if util.is_web(normalized):
            url_parts = urlparse(normalized)
            if url_parts.path in ('', '/'):
                normalized = url_parts.netloc

        if not is_valid_domain(normalized, allow_internal=True):
            return None

        return super().key_for(normalized, allow_opt_out=allow_opt_out)
368

369
    @classmethod
    def owns_id(cls, id):
        """Returns True on domains and internal URLs, None on other URLs.

        All web pages are http(s) URLs, but not all http(s) URLs are web pages.

        Returns:
          bool or None: True for valid domains and for URLs on
          ``PRIMARY_DOMAIN`` or ``PROTOCOL_DOMAINS``, None for URLs on other
          valid domains, False for everything else
        """
        if not id:
            return False
        if is_valid_domain(id, allow_internal=True):
            return True

        domain = domain_from_link(id)
        if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
            return True

        # we allowed internal domains for protocol bot actors above, but we
        # don't want to allow non-homepage URLs on those domains, eg
        # https://bsky.brid.gy/foo, so don't allow internal here
        external_url = (util.is_web(id)
                        and is_valid_domain(domain, allow_internal=False))
        return None if external_url else False
391

392
    @classmethod
    def owns_handle(cls, handle, allow_internal=False):
        """Returns whether a handle could belong to this protocol.

        Args:
          handle (str)
          allow_internal (bool): passed through to :func:`is_valid_domain`

        Returns:
          bool or None: True for ``PRIMARY_DOMAIN`` and ``PROTOCOL_DOMAINS``,
          False if the handle isn't a valid domain, None otherwise
        """
        if handle == PRIMARY_DOMAIN or handle in PROTOCOL_DOMAINS:
            return True

        if not is_valid_domain(handle, allow_internal=allow_internal):
            return False

        return None
398

399
    @classmethod
    def handle_to_id(cls, handle):
        """Returns the id for a handle, which for web users is the handle itself.

        Raises:
          AssertionError: if :meth:`owns_handle` returns False for the handle
        """
        owned = cls.owns_handle(handle)
        assert owned is not False
        return handle
403

404
    @classmethod
    def target_for(cls, obj, shared=False):
        """Returns `obj`'s id, as a URL webmention target.

        Args:
          obj (models.Object)
          shared (bool): unused here

        Returns:
          str or None: the object's id, or None if it isn't a URL
        """
        # TODO: we have entities in prod that fail this, eg
        # https://indieweb.social/users/bismark has source_protocol webmention
        # assert obj.source_protocol in (cls.LABEL, cls.ABBREV, 'ui', None), str(obj)

        id = obj.key.id()
        if util.is_web(id):
            return id

        logger.warning(f"{id} is source_protocol web but id isn't a URL!")
        return None
416

417
    @classmethod
    def send(to_cls, obj, url, from_user=None, orig_obj=None, **kwargs):
        """Sends a webmention to a given target URL.

        See :meth:`Protocol.send` for details.

        Returns False without sending if ``url`` isn't one of the object's
        targets, if it's blocklisted, or if it doesn't advertise a webmention
        endpoint.
        https://fed.brid.gy/docs#error-handling
        """
        targets = as1.targets(obj.as1)
        if url not in targets:
            # homepage, check domain too
            is_homepage = urlparse(url).path.strip('/') == ''
            if not (is_homepage and domain_from_link(url) in targets):
                logger.debug(f'Skipping sending to {url} , not a target in the object')
                return False

        if to_cls.is_blocklisted(url):
            logger.info(f'Skipping sending to blocklisted {url}')
            return False

        from_proto = PROTOCOLS[obj.source_protocol]
        source_id = translate_object_id(id=obj.key.id(), from_=from_proto, to=Web)
        source_url = quote(source_id, safe=':/%+')
        logger.info(f'Sending webmention from {source_url} to {url}')

        # we only send webmentions for responses. for sending normal posts etc
        # to followers, we just update our stored objects (elsewhere) and web
        # users consume them via feeds.
        endpoint = common.webmention_discover(url).endpoint
        if endpoint:
            webmention.send(endpoint, source_url, url)
            return True

        return False
453

454
    @classmethod
    def load(cls, id, **kwargs):
        """Wraps :meth:`Protocol.load` to convert domains to homepage URLs.

        Args:
          id (str): domain or URL. Ids that match ``DOMAIN_RE`` are converted
            to ``https://[domain]/``.
          kwargs: passed through to :meth:`Protocol.load`
        """
        url = f'https://{id}/' if re.match(DOMAIN_RE, id) else id
        return super().load(url, **kwargs)
461

462
    @classmethod
    def fetch(cls, obj, gateway=False, check_backlink=False,
              authorship_fetch_mf2=True, metaformats=None, **kwargs):
        """Fetches a URL over HTTP and extracts its microformats2.

        Follows redirects, but doesn't change the original URL in ``obj``'s id!
        :class:`google.cloud.ndb.model.Model` doesn't allow that anyway, but more
        importantly, we want to preserve that original URL because other objects
        may refer to it instead of the final redirect destination URL.

        Home pages on ``PRIMARY_DOMAIN`` and ``PROTOCOL_DOMAINS`` aren't
        fetched. Their profiles are read from local ``[domain].as2.json`` files
        and stored in ``obj.as2`` instead.

        See :meth:`Protocol.fetch` for other background.

        Args:
          obj (models.Object): its key id is the URL to fetch. On success, the
            extracted microformats2 item is stored in its ``mf2`` property.
          gateway (bool): passed through to
            :func:`oauth_dropins.webutil.util.fetch_mf2`
          check_backlink (bool): optional, whether to require a link to Bridgy
            Fed. Ignored if the URL is a homepage, ie has no path.
          authorship_fetch_mf2 (bool): optional, when running the authorship
            algorithm, fetch author URL if necessary
          metaformats (bool): optional, passed through to
            :func:`oauth_dropins.webutil.util.fetch_mf2`. Defaults to True for
            home pages, False for everything else.
          kwargs: ignored

        Returns:
          bool: True if ``obj`` was populated, False if its id isn't a URL, an
          internal domain has no profile file, or the page has no
          microformats2. Other failures are reported via
          :func:`oauth_dropins.webutil.flask_util.error`.
        """
        url = obj.key.id()
        if not util.is_web(url):
            logger.info(f'{url} is not a URL')
            return False

        is_homepage = urlparse(url).path.strip('/') == ''
        if is_homepage:
            domain = domain_from_link(url)
            if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
                # our own bot users: serve the profile we ship with the app
                profile = util.read(f'{domain}.as2.json')
                if profile:
                    obj.as2 = json_loads(profile)
                    return True
                return False

        require_backlink = (common.host_url().rstrip('/')
                            if check_backlink and not is_homepage
                            else None)
        if metaformats is None:
            # default to only for homepages
            metaformats = urlparse(url).path in ('', '/')

        try:
            parsed = util.fetch_mf2(url, gateway=gateway, metaformats=metaformats,
                                    require_backlink=require_backlink)
        except ValueError as e:
            error(str(e))

        if parsed is None:
            error(f'id {urlparse(url).fragment} not found in {url}')
        elif not parsed.get('items'):
            logger.info(f'No microformats2 found in {url}')
            return False

        # find mf2 item
        if is_homepage:
            logger.info(f"{url} is user's web url")
            entry = mf2util.representative_hcard(parsed, parsed['url'])
            if not entry:
                error(f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-hcard-parsing) on {parsed['url']}")
            logger.info('Found representative h-card')
        else:
            entry = mf2util.find_first_entry(parsed, ['h-entry'])
            if not entry:
                error(f'No microformats2 h-entry found in {url}')

        # discard uid if set; we use URL as id
        props = entry.setdefault('properties', {})
        if 'uid' in props:
            logger.info(f'Discarding uid property: {props["uid"]}')
            props.pop('uid')

        # store final URL in mf2 object
        if is_homepage:
            entry.setdefault('rel-urls', {}).update(parsed.get('rel-urls', {}))
            entry.setdefault('type', ['h-card'])
        if parsed['url']:
            entry['url'] = parsed['url']
        logger.info(f'Extracted microformats2 entry: {json_dumps(entry, indent=2)}')

        if not is_homepage:
            # default actor/author to home page URL
            authors = props.setdefault('author', [])
            if not microformats2.get_string_urls(authors):
                homepage = urljoin(parsed.get('url') or url, '/')
                logger.info(f'Defaulting author URL to {homepage}')
                if authors and isinstance(authors[0], dict):
                    # tolerate author items without a properties dict, the
                    # same way we do for the entry itself above
                    authors[0].setdefault('properties', {})['url'] = [homepage]
                else:
                    authors.insert(0, homepage)

            # run full authorship algorithm if necessary:
            # https://indieweb.org/authorship
            # duplicated in microformats2.json_to_object
            author = util.get_first(props, 'author')
            if not isinstance(author, dict):
                logger.info(f'Fetching full authorship for author {author}')
                fetch_fn = util.fetch_mf2 if authorship_fetch_mf2 else None
                author = mf2util.find_author({'items': [entry]}, hentry=entry,
                                             fetch_mf2_func=fetch_fn)
                logger.info(f'Got: {author}')
                if author:
                    props['author'] = util.trim_nulls([{
                        "type": ["h-card"],
                        'properties': {
                            field: [author[field]] if author.get(field) else []
                            for field in ('name', 'photo', 'url')
                        },
                    }])

        obj.mf2 = entry
        return True
575

576
    @classmethod
    def _convert(cls, obj, from_user=None):
        """Converts a :class:`Object` to HTML.

        If the object's ``author`` or ``actor`` is only an id, and the object's
        source protocol is known, loads that actor and fills it in before
        rendering. Also adds an HTML meta refresh redirect to the object's URL.

        Args:
          obj (models.Object)
          from_user (models.User): user (actor) this activity/object is from

        Returns:
          str: the HTML, or the empty string if ``obj`` has no AS1
        """
        # imported here, and only the function, since this method's HTML is
        # built from object data that may come from remote sites
        from html import escape

        if not obj or not obj.as1:
            return ''

        obj_as1 = obj.as1
        if from_user and not from_user.is_enabled(cls):
            error(f'{from_user.key.id()} => {cls.LABEL} not enabled')

        from_proto = PROTOCOLS.get(obj.source_protocol)
        if from_proto:
            # fill in author/actor if available
            for field in 'author', 'actor':
                val = as1.get_object(obj.as1, field)
                if val.keys() == {'id'} and val['id']:
                    loaded = from_proto.load(val['id'])
                    if loaded and loaded.as1:
                        obj_as1 = {**obj_as1, field: loaded.as1}
        else:
            logger.debug(f'Not hydrating actor or author due to source_protocol {obj.source_protocol}')

        page = microformats2.activities_to_html([cls.translate_ids(obj_as1)])

        # add HTML meta redirect to source page. should trigger for end users in
        # browsers but not for webmention receivers (hopefully).
        url = util.get_url(obj_as1) or obj_as1.get('id') or obj.key.id()
        if util.is_web(url):
            utf8 = '<meta charset="utf-8">'
            # escape the URL so that quotes, angle brackets, etc in it can't
            # break out of the content attribute
            refresh = (f'<meta http-equiv="refresh" '
                       f'content="0;url={escape(url, quote=True)}">')
            page = page.replace(utf8, utf8 + '\n' + refresh)

        return page
617

618

619
@app.get('/web-site')
@flask_util.headers(CACHE_CONTROL)
def enter_web_site():
    """Renders the ``enter_web_site.html`` template."""
    page = render_template('enter_web_site.html')
    return page
623

624

625
@app.post('/web-site')
def check_web_site():
    """Validates a submitted web site and creates or updates its user.

    Reads the ``url`` request parameter. On success, redirects to the user's
    page. Otherwise, flashes a message and re-renders the
    ``enter_web_site.html`` form, with HTTP 400 for invalid or unsupported
    sites.
    """
    def reject(message):
        """Flashes a message and re-renders the form with HTTP 400."""
        flash(message)
        return render_template('enter_web_site.html'), 400

    logger.info(f'Params: {list(request.form.items())}')

    url = request.values['url']

    # this normalizes and lower cases domain
    try:
        domain = normalize_user_id(id=url, proto=Web)
    except (ValueError, AssertionError):
        logger.info(f'bad web id? {url}', exc_info=True)
        domain = None

    if not (domain and is_valid_domain(domain, allow_internal=False)):
        return reject(f'{url} is not a valid or supported web site')

    if util.is_web(url) and urlparse(url).path.strip('/'):
        return reject('Only top-level web sites and domains are supported.')

    try:
        user = Web.get_or_create(domain, enabled_protocols=['atproto'],
                                 propagate=True, direct=True, verify=True)
    except BaseException as e:
        code, body = util.interpret_http_exception(e)
        if not code:
            # not an HTTP error that we know how to report, let it propagate
            raise
        flash(f"Couldn't connect to {url}: {e}")
        return render_template('enter_web_site.html')

    if not user:  # opted out
        return reject(f'{url} is not a valid or supported web site')

    user.put()

    if user.redirects_error == OWNS_WEBFINGER:
        return reject(f'{url} looks like a fediverse server! Try a normal web site.')

    return redirect(user.user_page_path())
667

668

669
@app.post('/webmention')
def webmention_external():
    """Handles inbound webmention, enqueue task to process.

    Use a task queue to deliver to followers because we send to each inbox in
    serial, which can take a long time with many followers/instances.

    Requires the ``source`` parameter to be an https URL on the domain of an
    existing :class:`Web` user, and records the time in that user's
    ``last_webmention_in``.
    """
    logger.info(f'Params: {list(request.form.items())}')

    source = flask_util.get_required_param('source').strip()
    if Web.owns_id(source) is False:
        error(f'Bad URL {source}')
    elif urlparse(source).scheme != 'https':
        error('source URLs must be https (with SSL)')

    domain = domain_from_link(source, minimize=False)
    if not domain:
        error(f'Bad source URL {source}')

    user = Web.get_by_id(domain)
    if not user:
        error(f'No user found for domain {domain}')

    user.last_webmention_in = util.now()
    user.put()

    task = common.create_task('webmention', **request.form)
    return task
696

697

698
def poll_feed(user, feed_url, rel_type):
    """Fetches a :class:`Web` site's feed and delivers new/updated posts.

    Sends a conditional GET using the user's stored ``ETag`` and
    ``Last-Modified`` values, converts the feed's entries to AS1 activities,
    stores an :class:`Object` for each one, and enqueues a ``receive`` task for
    it. Stops early when it reaches the item that was first in the feed at the
    previous poll.

    Args:
      user (Web)
      feed_url (str)
      rel_type (str): feed link's top-level rel type in home page HTML, usually
        either ``atom`` or ``rss``

    Returns:
      list of dict: AS1 activities parsed from the feed, truncated to
      ``MAX_FEED_ITEMS_PER_POLL``. Empty if the server returned 304.

    Raises:
      ValueError: if the response's content type isn't a known feed type.
        Exceptions from fetching or parsing the feed are not caught here.
    """
    # fetch feed
    headers = {}
    if user.feed_etag:
        headers['If-None-Match'] = user.feed_etag
    if user.feed_last_modified:
        headers['If-Modified-Since'] = user.feed_last_modified
    resp = util.requests_get(feed_url, headers=headers, gateway=True)

    # update user
    user.last_polled_feed = util.now()
    user.feed_etag = resp.headers.get('ETag')
    user.feed_last_modified = resp.headers.get('Last-Modified')

    # parse feed
    content_type = resp.headers.get('Content-Type') or ''
    feed_type = FEED_TYPES.get(content_type.split(';')[0])
    if resp.status_code == 304:
        logger.info('Feed is unchanged since last poll')
        user.put()
        return []
    elif feed_type == 'atom' or (feed_type == 'xml' and rel_type == 'atom'):
        activities = atom.atom_to_activities(resp.text)
        obj_feed_prop = {'atom': resp.text[:MAX_FEED_PROPERTY_SIZE]}
    elif feed_type == 'rss' or (feed_type == 'xml' and rel_type == 'rss'):
        activities = rss.to_activities(resp.text)
        obj_feed_prop = {'rss': resp.text[:MAX_FEED_PROPERTY_SIZE]}
    else:
        raise ValueError(f'Unknown feed type {content_type}')

    if len(activities) > MAX_FEED_ITEMS_PER_POLL:
        logger.info(f'Got {len(activities)} feed items, only processing the first {MAX_FEED_ITEMS_PER_POLL}')
        activities = activities[:MAX_FEED_ITEMS_PER_POLL]

    # Remember the marker from the previous poll *before* the loop overwrites
    # it with this poll's first item. Comparing against the already-updated
    # user.feed_last_item would never match an item we saw last time.
    prev_last_item = user.feed_last_item

    # create Objects and receive tasks
    for i, activity in enumerate(activities):
        # default actor and author to user
        activity.setdefault('actor', {}).setdefault('id', user.profile_id())
        obj = activity.setdefault('object', {})
        obj.setdefault('author', {}).setdefault('id', user.profile_id())

        # use URL as id since some feeds use non-URL (eg tag URI) ids
        for elem in obj, activity:
            if url := elem.get('url'):
                elem['id'] = url

        logger.debug(f'Converted to AS1: {json_dumps(activity, indent=2)}')

        obj_id = Object(our_as1=activity).as1.get('id')
        if not obj_id:
            logger.warning('No id or URL!')
            continue

        if i == 0:
            logger.info(f'Setting feed_last_item to {obj_id}')
            user.feed_last_item = obj_id
        elif obj_id == prev_last_item:
            logger.info(f'Already seen {obj_id}, skipping rest of feed')
            break

        if Web.owns_id(obj_id) is False:
            logger.warning(f'Skipping bad id {obj_id}')
            continue

        if not obj.get('image'):
            # fetch and check the post itself
            logger.info(f'No image in {obj_id} , trying metaformats')
            post = Web.load(obj_id, metaformats=True, authorship_fetch_mf2=False)
            if post and post.as1:
                # don't reuse the user's own profile picture(s) as a post image
                profile_images = (as1.get_ids(user.obj.as1, 'image')
                                  if user.obj.as1 else [])
                obj['image'] = [img for img in as1.get_ids(post.as1, 'image')
                                if img not in profile_images]

        activity['feed_index'] = i
        obj = Object.get_or_create(id=obj_id, authed_as=user.key.id(),
                                   our_as1=activity, status='new',
                                   source_protocol=Web.ABBREV,
                                   users=[user.key], **obj_feed_prop)
        common.create_task(queue='receive', obj=obj.key.urlsafe(),
                           authed_as=user.key.id())

    user.put()
    return activities
792

793

794
@app.post('/queue/poll-feed')
@cloud_tasks_only
def poll_feed_task():
    """Task handler that polls a :class:`Web` user's feed, then reschedules.

    Finds the feed URL among the ``alternate`` links in the user's stored mf2,
    polls it with :func:`poll_feed`, and enqueues the next ``poll-feed`` task.
    The next task's delay is the mean gap between the entries' published
    times if there are any, otherwise the time since the last poll (or since
    the user was created), clamped to
    ``[MIN_FEED_POLL_PERIOD, MAX_FEED_POLL_PERIOD]``.

    Parse failures, HTTP errors, and connection failures are logged and
    reported with status 304; the next task is still created. Any other
    exception propagates.

    Params:
      ``domain`` (str): key id of the :class:`Web` user
    """
    domain = flask_util.get_required_param('domain')
    logger.info(f'Polling feed for {domain}')

    user = Web.get_by_id(domain)
    if not (user and user.obj and user.obj.mf2):
        error(f'No Web user or object found for domain {domain}', status=304)
    elif user.last_webmention_in:
        logger.info('Dropping since last_webmention_in is set')
        return 'OK'

    # discover feed URL: first rel-url that's an alternate with a known type
    rel_urls = user.obj.mf2.get('rel-urls', {})
    for feed_url, link in rel_urls.items():
        mime_type = link.get('type', '').split(';')[0]
        rel_type = FEED_TYPES.get(mime_type)
        if 'alternate' in link.get('rels', []) and rel_type:
            break
    else:
        msg = f"User {user.key.id()} has no feed URL, can't fetch feed"
        logger.info(msg)
        return msg

    # go go go!
    status = 200
    activities = []
    try:
        activities = poll_feed(user, feed_url, rel_type)
    except (ValueError, ElementTree.ParseError) as e:
        logger.error(f"Couldn't parse feed: {e}")
        status = 304
    except BaseException as e:
        code, _ = util.interpret_http_exception(e)
        if not (code or util.is_connection_failure(e)):
            raise
        logger.error(f"Couldn't fetch feed: {e}")
        status = 304

    # determine posting frequency: gaps between consecutive published times
    gaps = []
    previous = None
    for activity in activities:
        current = activity['object'].get('published')
        if current and previous:
            gap = util.parse_iso8601(current) - util.parse_iso8601(previous)
            gaps.append(abs(gap))
        previous = current

    # create next poll task
    if gaps:
        mean_seconds = statistics.mean(gap.total_seconds() for gap in gaps)
        delay = timedelta(seconds=mean_seconds)
    else:
        now = util.now()
        last = user.last_polled_feed or user.created.replace(tzinfo=timezone.utc)
        delay = now - last

    # clamp into the allowed polling window
    delay = max(min(delay, MAX_FEED_POLL_PERIOD), MIN_FEED_POLL_PERIOD)

    common.create_task(queue='poll-feed', domain=user.key.id(), delay=delay)
    return 'OK', status
862

863

864
@app.post('/queue/webmention')
@cloud_tasks_only
def webmention_task():
    """Handles inbound webmention task.

    Loads the :class:`Web` user for the source URL's domain, force-fetches the
    source page, and passes the resulting object to :meth:`Web.receive`. If
    the fetch fails with HTTP 404 or 410 and Bridgy Fed has a completed
    ``#bridgy-fed-create`` object for the source, a delete activity is
    synthesized and received instead. For ``h-entry`` posts, the author is
    defaulted to, or overridden with, the user's web URL.

    Params:
      ``source`` (str): URL
    """
    logger.info(f'Params: {list(request.form.items())}')

    # load user
    source = flask_util.get_required_param('source').strip()
    domain = domain_from_link(source, minimize=False)
    logger.info(f'webmention from {domain}')

    if domain in common.DOMAINS:
        error(f'URL not supported: {source}')

    user = Web.get_by_id(domain)
    if not user:
        error(f'No user found for domain {domain}', status=304)
    logger.info(f'User: {user.key.id()}')

    # fetch source page
    try:
        # remote=True to force fetch, local=True to populate new/changed attrs
        obj = Web.load(source, local=True, remote=True,
                       check_backlink=not appengine_info.LOCAL_SERVER)
    except BadRequest as e:
        error(str(e.description), status=304)
    except HTTPError as e:
        # Compare the response against None explicitly. Assuming this is
        # requests' HTTPError, its Response is falsy for 4xx/5xx statuses, so
        # a truthiness check would always drop the body from the message
        # below, and an exception without a response would crash on
        # .status_code.
        resp = e.response
        resp_code = resp.status_code if resp is not None else None
        if resp_code not in (410, 404):
            error(f'{e} ; {resp.text if resp is not None else ""}', status=502)

        create_id = f'{source}#bridgy-fed-create'
        logger.info(f'Interpreting as Delete. Looking for {create_id}')
        create = Object.get_by_id(create_id)
        if not create or create.status != 'complete':
            error(f"Bridgy Fed hasn't successfully published {source}", status=304)

        delete_id = f'{source}#bridgy-fed-delete'
        obj = Object(id=delete_id, status='new', our_as1={
            'id': delete_id,
            'objectType': 'activity',
            'verb': 'delete',
            'actor': user.web_url(),
            'object': source,
        })

    if not obj or (not obj.mf2 and obj.type != 'delete'):
        error(f"Couldn't load {source} as microformats2 HTML", status=304)
    elif obj.mf2 and 'h-entry' in obj.mf2.get('type', []):
        # make sure the post's first author is this user
        authors = obj.mf2['properties'].setdefault('author', [])
        author_urls = microformats2.get_string_urls(authors)
        if not author_urls:
            authors.append(user.web_url())
        elif not user.is_web_url(author_urls[0]):
            logger.info(f'Overriding author {author_urls[0]} with {user.web_url()}')
            if isinstance(authors[0], dict):
                authors[0]['properties']['url'] = [user.web_url()]
            else:
                authors[0] = user.web_url()

    try:
        return Web.receive(obj, authed_as=user.key.id())
    except ValueError as e:
        logger.warning(e, exc_info=True)
        error(e, status=304)
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc