• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

snarfed / bridgy-fed / e56ba082-42b2-46a6-8b61-b5e23d282f33

29 Aug 2025 06:00AM UTC coverage: 92.631% (-0.06%) from 92.692%
e56ba082-42b2-46a6-8b61-b5e23d282f33

push

circleci

snarfed
ids methods: if from protocol doesn't own id, return id unchanged

in translate_user_id, translate_object_id, profile_id

for:
https://console.cloud.google.com/errors/detail/CMDC_cirnMT0FQ;time=P1D;locations=global?project=bridgy-federated
https://console.cloud.google.com/errors/detail/CIaI78mDj7DucA;time=P1D;locations=global?project=bridgy-federated

5 of 6 new or added lines in 1 file covered. (83.33%)

3 existing lines in 1 file now uncovered.

5669 of 6120 relevant lines covered (92.63%)

0.93 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.09
/web.py
1
"""Webmention protocol with microformats2 in HTML, aka the IndieWeb stack."""
2
from datetime import timedelta, timezone
1✔
3
import difflib
1✔
4
import logging
1✔
5
import re
1✔
6
import statistics
1✔
7
import urllib.parse
1✔
8
from urllib.parse import quote, urlencode, urljoin, urlparse
1✔
9
from xml.etree import ElementTree
1✔
10

11
import brevity
1✔
12
from flask import redirect, request
1✔
13
from google.cloud import ndb
1✔
14
from google.cloud.ndb import ComputedProperty
1✔
15
from granary import as1, as2, atom, microformats2, rss
1✔
16
import mf2util
1✔
17
from oauth_dropins.webutil import flask_util, util
1✔
18
from oauth_dropins.webutil.appengine_config import tasks_client
1✔
19
from oauth_dropins.webutil import appengine_info
1✔
20
from oauth_dropins.webutil.flask_util import cloud_tasks_only, error, flash
1✔
21
from oauth_dropins.webutil.util import domain_from_link, json_dumps, json_loads
1✔
22
from oauth_dropins.webutil import webmention
1✔
23
from requests import HTTPError, RequestException
1✔
24
from requests.auth import HTTPBasicAuth
1✔
25
from werkzeug.exceptions import BadGateway, BadRequest, HTTPException, NotFound
1✔
26

27
import common
1✔
28
from common import (
1✔
29
    CACHE_CONTROL,
30
    DOMAIN_RE,
31
    DOMAINS,
32
    PRIMARY_DOMAIN,
33
    PROTOCOL_DOMAINS,
34
    render_template,
35
    SUPERDOMAIN,
36
)
37
from flask_app import app
1✔
38
from ids import normalize_user_id, translate_object_id, translate_user_id
1✔
39
import memcache
1✔
40
from models import Follower, Object, PROTOCOLS, Target, User
1✔
41
from protocol import Protocol
1✔
42

43
logger = logging.getLogger(__name__)

# Domains that keep their leading "www."; Web.verify() does not try to switch
# these to the root domain.
# https://github.com/snarfed/bridgy-fed/issues/314
WWW_DOMAINS = frozenset(['www.jvt.me'])

# Maps a bare MIME type (any ";parameter" suffix removed) to a feed type label.
FEED_TYPES = {
    atom.CONTENT_TYPE.partition(';')[0]: 'atom',
    rss.CONTENT_TYPE.partition(';')[0]: 'rss',
    # https://stackoverflow.com/questions/4832357/whats-the-difference-between-text-xml-vs-application-xml-for-webservice-respons
    'application/xml': 'xml',
    'text/xml': 'xml',
}
MIN_FEED_POLL_PERIOD = timedelta(hours=2)
MAX_FEED_POLL_PERIOD = timedelta(days=1)
MAX_FEED_ITEMS_PER_POLL = 10

# populated into Web.redirects_error
OWNS_WEBFINGER = 'This site serves its own Webfinger, and likely ActivityPub too.'

# domains Web.fetch refuses to fetch, in addition to common.DOMAIN_BLOCKLIST
FETCH_BLOCKLIST = ('bsky.app',)
68

69

70
def is_valid_domain(domain, allow_internal=True):
    """Checks whether a string is a domain we can use.

    A domain passes when it matches ``DOMAIN_RE``, isn't blocklisted according
    to :meth:`Web.is_blocklisted`, and its last dot-separated label is in
    ``brevity.TLDS``.

    Args:
      domain (str): candidate domain
      allow_internal (bool): passed through to :meth:`Web.is_blocklisted`;
        whether internal domains like ``fed.brid.gy``, ``bsky.brid.gy``, etc
        count as valid

    Returns:
      bool: True if the domain is usable, False otherwise
    """
    looks_like_domain = bool(domain) and re.match(DOMAIN_RE, domain)
    if not looks_like_domain:
        return False

    if Web.is_blocklisted(domain, allow_internal=allow_internal):
        return False

    # the final label has to be a known TLD
    return domain.rsplit('.', 1)[-1] in brevity.TLDS
1✔
94

95

96
class Web(User, Protocol):
1✔
97
    """Web user and webmention protocol implementation.
98

99
    The key name is the domain.
100
    """
101
    ABBREV = 'web'
1✔
102
    ''
1✔
103
    PHRASE = 'the web'
1✔
104
    ''
1✔
105
    OTHER_LABELS = ('webmention',)
1✔
106
    ''
1✔
107
    LOGO_EMOJI = '🌐'  # used to be 🕸️
1✔
108
    ''
1✔
109
    CONTENT_TYPE = common.CONTENT_TYPE_HTML
1✔
110
    ''
1✔
111
    DEFAULT_ENABLED_PROTOCOLS = ('activitypub',)
1✔
112
    ''
1✔
113
    DEFAULT_SERVE_USER_PAGES = True
1✔
114
    ''
1✔
115
    SUPPORTED_AS1_TYPES = (
1✔
116
        tuple(as1.ACTOR_TYPES)
117
        + tuple(as1.POST_TYPES)
118
        + tuple(as1.CRUD_VERBS)
119
        + ('audio', 'bookmark', 'event', 'image', 'video')
120
        + ('follow', 'like', 'share', 'stop-following')
121
    )
122
    ''
1✔
123
    USES_OBJECT_FEED = True
1✔
124
    ''
1✔
125
    HTML_PROFILES = True
1✔
126
    ''
1✔
127

128
    has_redirects = ndb.BooleanProperty()
1✔
129
    ''
1✔
130
    redirects_error = ndb.TextProperty()
1✔
131
    ''
1✔
132
    has_hcard = ndb.BooleanProperty()
1✔
133
    'Currently unused, and I think now always ends up as ``True``. TODO: remove?'
1✔
134
    last_webmention_in = ndb.DateTimeProperty(tzinfo=timezone.utc)
1✔
135
    ''
1✔
136
    last_polled_feed = ndb.DateTimeProperty(tzinfo=timezone.utc)
1✔
137
    ''
1✔
138
    feed_last_item = ndb.StringProperty()
1✔
139
    """str: feed item id (URL)"""
1✔
140
    feed_etag = ndb.StringProperty()
1✔
141
    ''
1✔
142
    feed_last_modified = ndb.StringProperty()
1✔
143
    ''
1✔
144

145
    atproto_last_chat_log_cursor = ndb.StringProperty()
1✔
146
    """Only used by protocol bot users in Bluesky, for polling their chat
1✔
147
    messages with ``chat.bsky.convo.getLog``.
148
    """
149

150
    ap_subdomain = ndb.StringProperty(
1✔
151
        choices=['ap', 'bsky', 'efake', 'fa', 'fed', 'nostr', 'other', 'web'],
152
        default='web')
153
    """Originally, BF served Web users' AP actor ids on fed.brid.gy, eg
1✔
154
    https://fed.brid.gy/snarfed.org . When we started adding new protocols, we
155
    switched to per-protocol subdomains, eg https://web.brid.gy/snarfed.org .
156
    However, we need to preserve the old users' actor ids as is.
157

158
    Also, our per-protocol bot accounts in ActivityPub are on their own
159
    subdomains, eg @bsky.brid.gy@bsky.brid.gy.
160

161
    So, this property tracks which subdomain a given Web user's AP actor uses.
162
    """
163

164
    # OLD. some stored entities still have these; do not reuse.
165
    # superfeedr_subscribed = ndb.DateTimeProperty(tzinfo=timezone.utc)
166
    # superfeedr_subscribed_feed = ndb.StringProperty()
167

168
    @classmethod
1✔
169
    def _get_kind(cls):
1✔
170
        return 'MagicKey'
1✔
171

172
    def _pre_put_hook(self):
        """Rejects key ids that aren't valid, all-lower-case domains.

        Raises:
          AssertionError: if the key id fails :func:`is_valid_domain` or
            contains any upper case characters
        """
        super()._pre_put_hook()
        domain = self.key.id()
        assert is_valid_domain(domain), domain
        assert domain == domain.lower(), f'upper case is not allowed in Web key id: {domain}'
1✔
178

179
    @classmethod
    def get_or_create(cls, id, allow_opt_out=False, verify=None, **kwargs):
        """Normalize domain, then pass through to :meth:`User.get_or_create`.

        Normalizing is delegated to :meth:`key_for`, which currently lower
        cases and removes leading and trailing dots.

        Args:
          id (str): domain, or anything else :meth:`key_for` can reduce to one
          allow_opt_out (bool): whether to return users whose ``status`` is set
          verify (bool): whether to call :meth:`verify` to load h-card, check
            redirects, etc. Defaults to calling it only if the user is new.
          kwargs: passed through to :meth:`User.get_or_create`

        Returns:
          Web or None: None if ``id`` isn't a usable domain, if the superclass
          returns no user, or if the user has a ``status`` and
          ``allow_opt_out`` is False
        """
        # normalize id (domain). key_for returns None for ids that aren't
        # usable domains; bail out instead of crashing on None.id()
        key = cls.key_for(id, allow_opt_out=True)
        if key is None:
            logger.info(f'{id} is not a valid Web user id')
            return None
        domain = key.id()

        if (util.domain_or_parent_in(domain, [SUPERDOMAIN.strip('.')])
                and not appengine_info.DEBUG):
            # outside of debug mode, users on our own (sub)domains are only
            # looked up, never created
            return super().get_by_id(domain)

        user = super().get_or_create(domain, allow_opt_out=True, **kwargs)
        if not user:
            return None

        if verify or (verify is None and not user.existing):
            # verify() may hand back a different user, eg root domain
            # instead of www
            user = user.verify()

        if not allow_opt_out and user.status:
            return None

        if not user.existing:
            common.create_task(queue='poll-feed', domain=user.key.id())

        return user
1✔
210

211
    @ndb.ComputedProperty
    def handle(self):
        """Returns this user's handle: their username or domain, eg ``user.com``.

        When :meth:`username` is just the key id (the domain), it's returned
        as is. Otherwise the username is passed through
        :func:`domain_from_link` with ``minimize=False``.
        """
        name = self.username()
        if name == self.key.id():
            return name
        return domain_from_link(name, minimize=False)
1✔
219

220
    def handle_as(self, to_proto, short=False):
        """Returns this user's handle in another protocol.

        ActivityPub is special cased: with redirects set up, the handle is
        ``@[username]@[domain]``; without, it's ``@[domain]@[subdomain].brid.gy``
        using ``ap_subdomain``. Everything else defers to the superclass.

        Args:
          to_proto (str or Protocol): target protocol
          short (bool): if True, drop the trailing ``@[host]`` part of the
            ActivityPub handle
        """
        if to_proto not in ('activitypub', 'ap', PROTOCOLS['ap']):
            return super().handle_as(to_proto, short=short)

        if self.has_redirects:
            user, host = self.username(), self.key.id()
        else:
            user, host = self.key.id(), f'{self.ap_subdomain}{SUPERDOMAIN}'

        handle = f'@{user}@{host}'
        return handle.rsplit('@', maxsplit=1)[0] if short else handle
1✔
230

231
    def id_as(self, to_proto):
        """Returns this user's id in another protocol.

        For ActivityPub, the translated id's ``fed.brid.gy`` or ``web.brid.gy``
        prefix is rewritten to use this user's ``ap_subdomain``.

        Args:
          to_proto (str or Protocol): target protocol, or its key in
            ``PROTOCOLS``

        Returns:
          str: translated id
        """
        proto = PROTOCOLS[to_proto] if isinstance(to_proto, str) else to_proto
        translated = translate_user_id(id=self.key.id(), from_=self, to=proto)

        if proto.LABEL != 'activitypub':
            return translated

        # swap whichever of the web/fed subdomains isn't ours for the one that is
        stale = 'web' if self.ap_subdomain == 'fed' else 'fed'
        return translated.replace(f'https://{stale}.brid.gy/',
                                  f'https://{self.ap_subdomain}.brid.gy/')
1✔
245

246
    web_url = User.profile_id
1✔
247

248
    def id_uri(self):
        """Returns this user's id as a URI, which for web users is :meth:`web_url`."""
        return self.web_url()
1✔
250

251
    def is_web_url(self, url):
        """Delegates to the superclass's ``is_web_url`` with ``ignore_www=True``.

        Args:
          url (str)
        """
        return super().is_web_url(url, ignore_www=True)
1✔
253

254
    def user_page_path(self, rest=None, **kwargs):
        """Delegates to the superclass, always forcing ``prefer_id=True``.

        Any ``prefer_id`` the caller passes is overridden, so web users' pages
        are always addressed by domain (id).
        """
        return super().user_page_path(rest=rest, **{**kwargs, 'prefer_id': True})
1✔
258

259
    def username(self):
        """Returns the user's preferred username.

        Looks through the stored profile's ``url`` and ``urls`` for an
        ``acct:`` URI whose domain is this user's key id, and returns its user
        part. Falls back to the key id (domain) if there isn't one.

        Returns:
          str:
        """
        domain = self.key.id()

        profile = self.obj.as1 if self.obj else None
        if profile:
            candidates = (util.get_list(profile, 'url')
                          + util.get_list(profile, 'urls'))
            for candidate in candidates:
                if isinstance(candidate, dict):
                    candidate = candidate.get('value')
                if not candidate or not candidate.startswith('acct:'):
                    continue

                try:
                    acct_user, acct_domain = util.parse_acct_uri(candidate)
                except ValueError:
                    # malformed acct: URI, try the next one
                    continue

                if acct_domain == domain:
                    logger.info(f'Found custom username: {acct_user}')
                    return acct_user

        return domain
1✔
284

285
    @ndb.ComputedProperty
    def status(self):
        """Returns this user's status, or None if there's nothing to report.

        Returns:
          str or None: None for our own domains in ``common.DOMAINS``;
          ``owns-webfinger`` if :meth:`verify` found that the site serves its
          own Webfinger; ``no-feed-or-webmention`` if the site has no feed, no
          webmention endpoint, no inbound webmention so far, and no redirects;
          otherwise whatever the superclass returns
        """
        if self.key.id() in common.DOMAINS:
            return None

        if self.redirects_error == OWNS_WEBFINGER:
            # looks like this site is already its own fediverse server
            return 'owns-webfinger'

        feed, _ = self.feed_url()
        reachable = (feed or self.webmention_endpoint()
                     or self.last_webmention_in or self.has_redirects)
        if not reachable:
            return 'no-feed-or-webmention'

        return super().status
1✔
300

301
    def verify(self):
        """Fetches site a couple ways to check for redirects and h-card.

        Steps:

        1. If the domain starts with ``www.`` and isn't in ``WWW_DOMAINS``,
           tries the root domain; if that serves ok, switches to it.
        2. Fetches the site's Webfinger URL to see whether it redirects to us,
           and sets ``has_redirects`` and ``redirects_error`` accordingly.
        3. Sets ``has_hcard`` based on the stored profile object.

        Stores this user before returning.

        Returns:
          web.Web: user that was verified. May be different than self! eg if
          self's domain started with www and we switch to the root domain.
        """
        domain = self.key.id()
        logger.info(f'Verifying {domain}')

        if domain.startswith('www.') and domain not in WWW_DOMAINS:
            # if root domain serves ok, use it instead
            # https://github.com/snarfed/bridgy-fed/issues/314
            root = domain.removeprefix('www.')
            root_site = f'https://{root}/'
            try:
                resp = util.requests_get(root_site, gateway=False)
                if resp.ok and self.is_web_url(resp.url):
                    logger.info(f'{root_site} serves ok ; using {root} instead')
                    root_user = Web.get_or_create(
                        root, enabled_protocols=self.enabled_protocols,
                        allow_opt_out=True)
                    # get_or_create can return None; only switch if we
                    # actually got a root user, otherwise keep verifying self
                    if root_user:
                        self.use_instead = root_user.key
                        self.put()
                        return root_user.verify()
                    logger.info(f"Couldn't get or create {root} ; continuing with {domain}")
            except RequestException as e:
                logger.info(f"Couldn't fetch {root_site} : {e}")
                logger.info(f"Continuing with {domain}")

        # check webfinger redirect
        path = f'/.well-known/webfinger?resource=acct:{domain}@{domain}'
        self.has_redirects = False
        self.redirects_error = None
        try:
            url = urljoin(self.web_url(), path)
            resp = util.requests_get(url, gateway=False)
            our_sites = ([f'https://{ours}/' for ours in common.DOMAINS] +
                         [common.host_url()])
            expected = [urljoin(site, path) for site in our_sites]
            if resp.url:
                got = urllib.parse.unquote(resp.url)
                if got in expected:
                    self.has_redirects = True
                else:
                    # check host-meta to see if they serve their own Webfinger
                    resp = util.requests_get(
                        urljoin(self.web_url(), '/.well-known/host-meta'),
                        gateway=False)
                    if (resp.status_code == 200
                            and domain_from_link(resp.url) not in common.DOMAINS):
                        logger.info(f"{domain} serves Webfinger! probably a fediverse server")
                        self.redirects_error = OWNS_WEBFINGER
                    else:
                        diff = '\n'.join(difflib.Differ().compare([got], [expected[0]]))
                        self.redirects_error = f'Current vs expected:<pre>{diff}</pre>'
            else:
                # no final URL at all, so there's no redirect to report either
                self.redirects_error = (
                    f'<pre>{url}\n  returned HTTP {resp.status_code}</pre>')
        except RequestException as e:
            # best effort: has_redirects stays False, redirects_error stays
            # whatever it was set to above
            logger.info(f"Couldn't check Webfinger redirect for {domain} : {e}")

        # check home page
        self.has_hcard = False
        # reload the profile unless existing is explicitly False, ie skip it
        # for users that were just created
        if getattr(self, 'existing', None) != False:
            self.reload_profile(gateway=True, raise_=False)
        if self.obj and self.obj.as1:
            self.has_hcard = True

        self.put()
        return self
1✔
374

375
    @classmethod
    def key_for(cls, id, allow_opt_out=False):
        """Returns the :class:`ndb.Key` for a given id.

        The id is lower cased and stripped of leading and trailing dots. If
        it's then a web URL with an empty or ``/`` path, eg a home page URL,
        it's reduced to its host. The result has to pass
        :func:`is_valid_domain`, internal domains included.

        Args:
          id (str)
          allow_opt_out (bool): whether to allow users who are currently opted out

        Returns:
          ndb.Key or None: None if the id is empty or isn't a valid domain
        """
        if not id:
            return None

        domain = id.lower().strip('.')
        if util.is_web(domain):
            parts = urlparse(domain)
            if parts.path in ('', '/'):
                domain = parts.netloc

        if not is_valid_domain(domain, allow_internal=True):
            return None

        return super().key_for(domain, allow_opt_out=allow_opt_out)
1✔
403

404
    @classmethod
    def owns_id(cls, id):
        """Returns True on domains and internal URLs, None on other URLs.

        All web pages are http(s) URLs, but not all http(s) URLs are web pages.

        Returns:
          bool or None: True if ``id`` is a valid domain or a URL on
          ``PRIMARY_DOMAIN`` or one of ``PROTOCOL_DOMAINS``; None, ie maybe,
          if it's a URL on any other valid domain; False otherwise
        """
        if not id:
            return False
        if is_valid_domain(id, allow_internal=True):
            return True
        if not util.is_web(id):
            return False

        host = domain_from_link(id)
        if host == PRIMARY_DOMAIN or host in PROTOCOL_DOMAINS:
            return True

        # we allowed internal domains for protocol bot actors above, but we
        # don't want to allow non-homepage URLs on those domains, eg
        # https://bsky.brid.gy/foo, so don't allow internal here
        return None if is_valid_domain(host, allow_internal=False) else False
1✔
429

430
    @classmethod
    def owns_handle(cls, handle, allow_internal=False):
        """Returns whether a handle could belong to a web user.

        Args:
          handle (str)
          allow_internal (bool): passed through to :func:`is_valid_domain`

        Returns:
          bool or None: True for ``PRIMARY_DOMAIN`` and ``PROTOCOL_DOMAINS``,
          False if the handle isn't a valid domain, None otherwise
        """
        if handle == PRIMARY_DOMAIN or handle in PROTOCOL_DOMAINS:
            return True
        if not is_valid_domain(handle, allow_internal=allow_internal):
            return False
        return None
1✔
436

437
    @classmethod
    def handle_to_id(cls, handle):
        """Returns the id for a handle. For web users, they're the same.

        Raises:
          AssertionError: if :meth:`owns_handle` returns False for ``handle``
        """
        assert cls.owns_handle(handle) is not False
        return handle
1✔
441

442
    @classmethod
    def target_for(cls, obj, shared=False):
        """Returns `obj`'s id, as a URL webmention target.

        Args:
          obj (models.Object)
          shared (bool): unused here

        Returns:
          str or None: None if the object's id isn't a web URL
        """
        # TODO: we have entities in prod that fail this, eg
        # https://indieweb.social/users/bismark has source_protocol webmention
        # assert obj.source_protocol in (cls.LABEL, cls.ABBREV, 'ui', None), str(obj)

        target = obj.key.id()
        if util.is_web(target):
            return target

        logger.warning(f"{target} is source_protocol web but id isn't a URL!")
        return None
1✔
454

455
    def feed_url(self):
        """Returns this web site's RSS or Atom feed URL and type, if any.

        Uses the first entry in the stored profile's mf2 ``rel-urls`` that has
        rel ``alternate`` and a MIME type in ``FEED_TYPES``.

        Returns:
          (str, str) or (None, None): feed URL and its ``FEED_TYPES`` value
        """
        mf2 = self.obj.mf2 if self.obj else None
        if mf2:
            for url, info in mf2.get('rel-urls', {}).items():
                # drop any MIME type parameters, eg "; charset=utf-8"
                mime = info.get('type', '').split(';')[0]
                feed_type = FEED_TYPES.get(mime)
                if feed_type and 'alternate' in info.get('rels', []):
                    return url, feed_type

        return None, None
1✔
468

469
    def webmention_endpoint(self):
        """Returns this web site's webmention endpoint, if any.

        Uses the first entry in the stored profile's mf2 ``rel-urls`` that has
        rel ``webmention``.

        Returns:
          str or None: webmention endpoint URL
        """
        mf2 = self.obj.mf2 if self.obj else None
        if not mf2:
            return None

        return next((url for url, info in mf2.get('rel-urls', {}).items()
                     if 'webmention' in info.get('rels', [])),
                    None)
1✔
479

480
    @classmethod
    def send(to_cls, obj, target, from_user=None, orig_obj_id=None, **kwargs):
        """Sends a webmention to a given webmention target URL.

        See :meth:`Protocol.send` for details.

        Returns False, without sending, if ``target`` isn't one of the
        object's targets, is blocklisted, or doesn't advertise a webmention
        endpoint.
        https://fed.brid.gy/docs#error-handling

        Args:
          obj (models.Object): activity to send; its translated id is the
            webmention source
          target (str): webmention target URL
          from_user, orig_obj_id, kwargs: unused here

        Returns:
          bool: True if the webmention was sent, False otherwise
        """
        targets = as1.targets(obj.as1)
        # a homepage target may also appear in the object as just its domain
        if (target not in targets
                and (urlparse(target).path.strip('/') != ''
                     or domain_from_link(target) not in targets)):
            logger.debug(f'Skipping sending to {target} , not a target in the object')
            return False

        if to_cls.is_blocklisted(target):
            logger.info(f'Skipping sending to blocklisted {target}')
            return False

        from_proto = PROTOCOLS[obj.source_protocol]
        source_id = translate_object_id(id=obj.key.id(), from_=from_proto, to=Web)
        source_url = quote(source_id, safe=':/%+')
        logger.info(f'Sending webmention from {source_url} to {target}')

        # we only send webmentions for responses. for sending normal posts etc
        # to followers, we just update our stored objects (elsewhere) and web
        # users consume them via feeds.
        endpoint = webmention_discover(target).endpoint
        if endpoint:
            webmention.send(endpoint, source_url, target)
            return True

        return False
1✔
516

517
    @classmethod
    def load(cls, id, **kwargs):
        """Wraps :meth:`Protocol.load` to convert domains to homepage URLs.

        Args:
          id (str): if it matches ``DOMAIN_RE``, it's loaded as
            ``https://[id]/``; anything else is passed through unchanged
          kwargs: passed through to :meth:`Protocol.load`
        """
        url = f'https://{id}/' if re.match(DOMAIN_RE, id) else id
        return super().load(url, **kwargs)
1✔
524

525
    @classmethod
    def fetch(cls, obj, gateway=False, check_backlink=False,
              authorship_fetch_mf2=True, metaformats=None, **kwargs):
        """Fetches a URL over HTTP and extracts its microformats2.

        Follows redirects, but doesn't change the original URL in ``obj``'s id!
        :class:`google.cloud.ndb.model.Model` doesn't allow that anyway, but more
        importantly, we want to preserve that original URL because other objects
        may refer to it instead of the final redirect destination URL.

        Homepages of our own domains (``PRIMARY_DOMAIN``, ``PROTOCOL_DOMAINS``)
        aren't fetched; their profiles are read from local ``[domain].as2.json``
        files into ``obj.as2`` instead.

        See :meth:`Protocol.fetch` for other background.

        Args:
          obj (models.Object): its key id is the URL to fetch. On success, its
            ``mf2`` (or ``as2``, see above) is populated.
          gateway (bool): passed through to
            :func:`oauth_dropins.webutil.util.fetch_mf2`
          check_backlink (bool): optional, whether to require a link to Bridgy
            Fed. Ignored if the URL is a homepage, ie has no path.
          authorship_fetch_mf2 (bool): optional, when running the authorship
            algorithm, fetch author URL if necessary
          metaformats (bool): optional, passed through to
            :func:`oauth_dropins.webutil.util.fetch_mf2`. Defaults to True for
            homepages, False for everything else.
          kwargs: ignored

        Returns:
          bool: True if ``obj`` was populated, False if the id isn't a
          fetchable URL, is blocklisted, or the page has no microformats2
        """
        url = obj.key.id()

        if not util.is_web(url) or not util.is_url(url):
            logger.info(f'{url} is not a URL')
            return False

        if (cls.is_blocklisted(url, allow_internal=True)
                or util.domain_or_parent_in(url, FETCH_BLOCKLIST)):
            return False

        is_homepage = urlparse(url).path.strip('/') == ''
        if is_homepage:
            domain = domain_from_link(url)
            if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
                # one of our own domains; use its local profile file
                profile = util.read(f'{domain}.as2.json')
                if profile:
                    obj.as2 = json_loads(profile)
                    return True
                return False

        require_backlink = (common.host_url().rstrip('/')
                            if check_backlink and not is_homepage
                            else None)
        if metaformats is None:
            # default to only for homepages
            metaformats = is_homepage

        try:
            parsed = util.fetch_mf2(url, gateway=gateway, metaformats=metaformats,
                                    require_backlink=require_backlink)
        except ValueError as e:
            # NOTE(review): this relies on error() always raising; otherwise
            # parsed would be unbound below
            error(str(e))

        if parsed is None or not parsed.get('items'):
            if parsed:
                # we got valid HTML. save the Object so that we know this URL is web
                obj.source_protocol = 'web'
                obj.put()
            logger.info(f'No microformats2 found in {url}')
            return False

        # find mf2 item
        if is_homepage:
            logger.info(f"{url} is user's web url")
            parsed_url = (parsed['url'] or '').rstrip('/')
            # try both with and without trailing slash
            entry = (mf2util.representative_hcard(parsed, parsed_url)
                     or mf2util.representative_hcard(parsed, parsed_url + '/'))
            if not entry:
                error(f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-h-card-parsing) on {parsed['url']}")
            logger.info('Found representative h-card')
            # handle when eg https://user.com/ redirects to https://www.user.com/
            # we need to store this as https://user.com/
            if parsed['url'] != url:
                logger.info(f'overriding {parsed["url"]} with {url}')
                # setdefault, not indexing, so that sparse mf2 without
                # properties or rel-urls doesn't raise KeyError
                entry.setdefault('properties', {}).setdefault('url', []).insert(0, url)
                rel_urls = parsed.setdefault('rel-urls', {})
                if rel_url := rel_urls.pop(parsed['url'], None):
                    rel_urls[url] = rel_url
                parsed['url'] = url

        else:
            entry = mf2util.find_first_entry(parsed, ['h-entry'])
            if not entry:
                error(f'No microformats2 h-entry found in {url}')

        # discard uid if set; we use URL as id
        props = entry.setdefault('properties', {})
        if 'uid' in props:
            logger.info(f'Discarding uid property: {props["uid"]}')
            props.pop('uid')

        # store final URL in mf2 object
        if is_homepage:
            entry.setdefault('rel-urls', {}).update(parsed.get('rel-urls', {}))
            entry.setdefault('type', ['h-card'])
        if parsed['url']:
            entry['url'] = parsed['url']
        logger.info(f'Extracted microformats2 entry: {json_dumps(entry)[:500]}')

        if not is_homepage:
            # default actor/author to home page URL
            authors = props.setdefault('author', [])
            if not microformats2.get_string_urls(authors):
                homepage = urljoin(parsed.get('url') or url, '/')
                logger.info(f'Defaulting author URL to {homepage}')
                if authors and isinstance(authors[0], dict):
                    # author may be a dict without properties; don't KeyError
                    authors[0].setdefault('properties', {})['url'] = [homepage]
                else:
                    authors.insert(0, homepage)

            # run full authorship algorithm if necessary:
            # https://indieweb.org/authorship
            # duplicated in microformats2.json_to_object
            author = util.get_first(props, 'author')
            if not isinstance(author, dict):
                logger.info(f'Fetching full authorship for author {author}')
                fetch_fn = util.fetch_mf2 if authorship_fetch_mf2 else None
                try:
                    author = mf2util.find_author({'items': [entry]}, hentry=entry,
                                                 fetch_mf2_func=fetch_fn)
                except (ValueError, TypeError) as e:
                    logger.warning(e)
                    author = None
                logger.debug(f'Got: {author}')
                if author:
                    props['author'] = util.trim_nulls([{
                        "type": ["h-card"],
                        'properties': {
                            field: [author[field]] if author.get(field) else []
                            for field in ('name', 'photo', 'url')
                        },
                    }])

        obj.mf2 = entry
        return True
1✔
661

662
    @classmethod
    def _convert(cls, obj, from_user=None):
        """Converts a :class:`Object` to HTML.

        If the object's ``author`` or ``actor`` is just an id, it's loaded via
        the source protocol and inlined before rendering. The rendered page
        also gets an HTML meta refresh that redirects to the object's URL.

        Args:
          obj (models.Object)
          from_user (models.User): user (actor) this activity/object is from

        Returns:
          str: HTML, or the empty string if ``obj`` is missing or has no AS1
        """
        # function-scope import: the file's top-level imports don't include it
        from html import escape

        if not obj or not obj.as1:
            return ''

        obj_as1 = obj.as1
        if from_user and not from_user.is_enabled(cls):
            error(f'{from_user.key.id()} => {cls.LABEL} not enabled')

        from_proto = PROTOCOLS.get(obj.source_protocol)
        if from_proto:
            # fill in author/actor if available
            for field in 'author', 'actor':
                val = as1.get_object(obj_as1, field)
                if val.keys() == {'id'} and val['id']:
                    loaded = from_proto.load(val['id'], raise_=False)
                    if loaded and loaded.as1:
                        # copy instead of mutating the stored object's AS1
                        obj_as1 = {**obj_as1, field: loaded.as1}
        else:
            logger.debug(f'Not hydrating actor or author due to source_protocol {obj.source_protocol}')

        rendered = microformats2.activities_to_html([cls.translate_ids(obj_as1)])

        # add HTML meta redirect to source page. should trigger for end users in
        # browsers but not for webmention receivers (hopefully).
        url = util.get_url(obj_as1) or obj_as1.get('id') or obj.key.id()
        if util.is_web(url):
            utf8 = '<meta charset="utf-8">'
            # the URL comes from object data, so escape it before putting it
            # inside an HTML attribute
            refresh = f'<meta http-equiv="refresh" content="0;url={escape(url, quote=True)}">'
            rendered = rendered.replace(utf8, utf8 + '\n' + refresh)

        return rendered
1✔
703

704

705
@app.get('/web-site')
@flask_util.headers(CACHE_CONTROL)
def enter_web_site():
    """Serves the form for entering a web site, ``enter_web_site.html``."""
    return render_template('enter_web_site.html')
×
709

710

711
@app.post('/web-site')
def check_web_site():
    """Handles the web site form: validates the site, then creates its user.

    Reads the ``url`` request parameter. Re-renders the form with a flashed
    message and HTTP 400 if it isn't a valid, supported, top-level site, if
    the user is opted out, or if the site looks like a fediverse server.
    Otherwise creates or reloads the :class:`Web` user, enqueues a
    ``poll-feed`` task, and redirects to the user's page.
    """
    # function-scope import: the file's top-level imports don't include it
    from html import escape

    common.log_request()
    url = request.values['url']

    # this normalizes and lower cases domain
    try:
        domain = normalize_user_id(id=url, proto=Web)
    except (ValueError, AssertionError):
        logger.info(f'bad web id? {url}', exc_info=True)
        domain = None

    # url is raw user input and this message is flashed with escape=False, so
    # escape it here to keep it from injecting markup into the page
    invalid_msg = util.linkify(f'{escape(url)} is not a <a href="/docs#web-get-started">valid or supported web site</a>', pretty=True)
    if not domain or not is_valid_domain(domain, allow_internal=False):
        flash(invalid_msg, escape=False)
        return render_template('enter_web_site.html'), 400

    if util.is_web(url) and urlparse(url).path.strip('/'):
        flash('Only top-level web sites and domains are supported.')
        return render_template('enter_web_site.html'), 400

    try:
        user = Web.get_or_create(domain, enabled_protocols=['atproto'],
                                 propagate=True, reload=True, verify=True)
    except Exception as e:
        # only errors that map to an HTTP status are reported to the user;
        # anything else is re-raised
        code, _ = util.interpret_http_exception(e)
        if code:
            flash(f"Couldn't connect to {url}: {e}")
            return render_template('enter_web_site.html')
        raise

    if not user:  # opted out
        flash(invalid_msg, escape=False)
        return render_template('enter_web_site.html'), 400

    user.put()

    if user.redirects_error == OWNS_WEBFINGER:
        flash(f'{url} looks like a fediverse server! Try a normal web site.')
        return render_template('enter_web_site.html'), 400

    common.create_task(queue='poll-feed', domain=domain)
    return redirect(user.user_page_path())
1✔
754

755

756
@app.post('/webmention')
def webmention_external():
    """Receives an inbound webmention and enqueues a task to process it.

    Use a task queue to deliver to followers because we send to each inbox in
    serial, which can take a long time with many followers/instances.

    Params:
      ``source`` (str): URL, required, must be https
    """
    common.log_request()

    source = flask_util.get_required_param('source').strip()

    # validate the source URL
    if Web.owns_id(source) is False:
        error(f'Bad URL {source}')
    if urlparse(source).scheme != 'https':
        error('source URLs must be https (with SSL)')

    site = domain_from_link(source, minimize=False)
    if not site:
        error(f'Bad source URL {source}')

    # the source's domain must belong to an existing user
    site_user = Web.get_by_id(site)
    if not site_user:
        error(f'No user found for domain {site}')

    # record when this user last sent us a webmention
    site_user.last_webmention_in = util.now()
    site_user.put()

    task = common.create_task('webmention', **request.form)
    return task
1✔
783

784

785
def poll_feed(user, feed_url, rel_type):
    """Fetches a :class:`Web` site's feed and delivers new/updated posts.

    Updates ``user``'s ``last_polled_feed``, ``feed_etag``,
    ``feed_last_modified``, and ``feed_last_item`` attributes. Only stores
    ``user`` itself when the feed is unchanged (HTTP 304); otherwise storing it
    is left to the caller.

    Creates one ``receive`` task per delivered feed item. Stops at the first
    item, after the first one, that matches the ``feed_last_item`` recorded by
    the previous poll.

    Args:
      user (Web)
      feed_url (str)
      rel_type (str): feed link's top-level rel type in home page HTML, usually
        either ``atom`` or ``rss``

    Returns:
      list of dict AS1 activities: all activities parsed from the feed, capped
      at ``MAX_FEED_ITEMS_PER_POLL``, including ones that were skipped

    Raises:
      ValueError: if the response's content type isn't a known feed type
    """
    user.last_polled_feed = util.now()

    # fetch feed, conditionally if we have validators from the last poll
    headers = {}
    if user.feed_etag:
        headers['If-None-Match'] = user.feed_etag
    if user.feed_last_modified:
        headers['If-Modified-Since'] = user.feed_last_modified
    resp = util.requests_get(feed_url, headers=headers, gateway=True)

    # update user
    user.feed_etag = resp.headers.get('ETag')
    user.feed_last_modified = resp.headers.get('Last-Modified')

    # parse feed
    content_type = resp.headers.get('Content-Type') or ''
    feed_type = FEED_TYPES.get(content_type.split(';')[0])
    if resp.status_code == 304:
        logger.info('Feed is unchanged since last poll')
        user.put()
        return []
    elif feed_type == 'atom' or (feed_type == 'xml' and rel_type == 'atom'):
        activities = atom.atom_to_activities(resp.text)
    elif feed_type == 'rss' or (feed_type == 'xml' and rel_type == 'rss'):
        activities = rss.to_activities(resp.text)
    else:
        raise ValueError(f'Unknown feed type {content_type}')

    if len(activities) > MAX_FEED_ITEMS_PER_POLL:
        logger.info(f'Got {len(activities)} feed items, only processing the first {MAX_FEED_ITEMS_PER_POLL}')
        activities = activities[:MAX_FEED_ITEMS_PER_POLL]

    # remember the newest item from the previous poll *before* the loop
    # overwrites user.feed_last_item with this poll's first item. comparing
    # against the overwritten value would only ever match a duplicate of the
    # first item, so we'd never detect that we'd reached already seen posts.
    prev_last_item = user.feed_last_item

    # create receive tasks
    for i, activity in enumerate(activities):
        # default actor and author to user
        activity.setdefault('actor', {}).setdefault('id', user.profile_id())
        obj = activity.setdefault('object', {})
        obj.setdefault('author', {}).setdefault('id', user.profile_id())

        # use URL as id since some feeds use non-URL (eg tag URI) ids
        for elem in obj, activity:
            if url := elem.get('url'):
                elem['id'] = url

        logger.debug(f'Converted to AS1: {json_dumps(activity, indent=2)}')

        post_id = Object(our_as1=activity).as1.get('id')
        if not post_id:
            logger.warning('No id or URL!')
            continue

        # the first item is always processed, so that it's redelivered if it
        # was updated
        if i == 0:
            logger.info(f'Setting feed_last_item to {post_id}')
            user.feed_last_item = post_id
        elif post_id == prev_last_item:
            logger.info(f'Already seen {post_id}, skipping rest of feed')
            break

        if Web.owns_id(post_id) is False:
            logger.warning(f'Skipping bad id {post_id}')
            continue

        if not obj.get('image'):
            # fetch and check the post itself
            logger.info(f'No image in {post_id} , trying metaformats')
            post = Object(id=post_id)
            try:
                fetched = Web.fetch(post, metaformats=True, authorship_fetch_mf2=False)
            except (RequestException, HTTPException):
                fetched = False
            if fetched and post.as1:
                # don't use the user's own profile image(s) as the post's image
                profile_images = (as1.get_ids(user.obj.as1, 'image')
                                  if user.obj.as1 else [])
                obj['image'] = [img for img in as1.get_ids(post.as1, 'image')
                                if img not in profile_images]

        common.create_task(queue='receive', id=post_id, our_as1=activity,
                           source_protocol=Web.ABBREV, authed_as=user.key.id(),
                           received_at=util.now().isoformat())

    return activities
1✔
878

879

880
@app.post('/queue/poll-feed')
@cloud_tasks_only(log=None)
def poll_feed_task():
    """Task handler that polls a :class:`Web` user's feed.

    After polling, enqueues the next ``poll-feed`` task for the user, delayed
    by the mean time between the feed's entries' published times if available,
    clamped to between ``MIN_FEED_POLL_PERIOD`` and ``MAX_FEED_POLL_PERIOD``.

    Params:
      ``domain`` (str): key id of the :class:`Web` user
      ``last_polled`` (str): should match the user's ``last_polled_feed``. Used to detect duplicate poll tasks for the same user.
    """
    common.log_request()

    domain = flask_util.get_required_param('domain')
    logger.info(f'Polling feed for {domain}')

    user = Web.get_by_id(domain)
    if not user or not user.obj or not user.obj.mf2:
        error(f'No Web user or object found for domain {domain}', status=304)
    if user.last_webmention_in:
        logger.info('Dropping since last_webmention_in is set')
        return 'OK'

    logger.info(f'Last poll: {user.last_polled_feed}')
    task_last_polled = request.form.get('last_polled')
    if task_last_polled and user.last_polled_feed:
        if task_last_polled < user.last_polled_feed.isoformat():
            logger.warning('duplicate poll feed task! deferring to other task')
            return '', 204

    # discover feed URL
    feed, rel_type = user.feed_url()
    if not feed:
        no_feed_msg = f"User {user.key.id()} has no feed URL, can't fetch feed"
        logger.info(no_feed_msg)
        return no_feed_msg

    # go go go!
    status = 200
    activities = []
    try:
        activities = poll_feed(user, feed, rel_type)
    except (ValueError, ElementTree.ParseError) as err:
        logger.error(f"Couldn't parse feed: {err}")
        status = 204
    except BaseException as err:
        http_code, _ = util.interpret_http_exception(err)
        if not (http_code or util.is_connection_failure(err)):
            raise
        logger.error(f"Couldn't fetch feed: {err}")
        status = 204

    user.put()

    # determine posting frequency: collect the entries' published times, then
    # the timedeltas between each consecutive pair
    timestamps = []
    for activity in activities:
        try:
            parsed = util.parse_iso8601(activity['object']['published'])
            timestamps.append(parsed.astimezone(timezone.utc))
        except (KeyError, ValueError):
            continue

    gaps = [abs(later - earlier)
            for earlier, later in zip(timestamps, timestamps[1:])]

    # create next poll task
    def bounded(period):
        return max(MIN_FEED_POLL_PERIOD, min(period, MAX_FEED_POLL_PERIOD))

    if gaps:
        mean_secs = statistics.mean(gap.total_seconds() for gap in gaps)
        delay = bounded(timedelta(seconds=mean_secs))
    else:
        if user.last_polled_feed and activities:
            since = user.last_polled_feed
        else:
            since = user.created.replace(tzinfo=timezone.utc)
        delay = bounded(util.now() - since)

    common.create_task(queue='poll-feed', delay=delay, domain=user.key.id(),
                       last_polled=user.last_polled_feed.isoformat())
    return 'OK', status
1✔
962

963

964
@app.post('/queue/webmention')
@cloud_tasks_only(log=None)
def webmention_task():
    """Handles inbound webmention task.

    Allows source URLs on brid.gy subdomains if the ``Authorization`` header matches
    the Flask secret key.

    If fetching the source fails with HTTP 404 or 410, and the source isn't one
    of the user's own web URLs, the webmention is handled as a delete of the
    source.

    Params:
      ``source`` (str): URL
    """
    common.log_request()

    # load user
    source = flask_util.get_required_param('source').strip()
    domain = domain_from_link(source, minimize=False)
    logger.info(f'webmention from {domain}')

    internal = request.headers.get('Authorization') == app.config['SECRET_KEY']
    if domain in common.DOMAINS and not internal:
        error(f'URL not supported: {source}')

    user = Web.get_by_id(domain)
    if not user:
        error(f'No user found for domain {domain}', status=304)
    logger.info(f'User: {user.key.id()}')

    # fetch source page
    try:
        # remote=True to force fetch, local=True to populate new/changed attrs
        obj = Web.load(source, local=True, remote=True,
                       check_backlink=not appengine_info.LOCAL_SERVER)
    except BadRequest as e:
        error(str(e.description), status=304)
    except RequestException as e:
        code, _ = util.interpret_http_exception(e)
        if code not in ('410', '404') or user.is_web_url(source):
            # requests.Response is falsy for 4xx/5xx statuses, so we have to
            # compare against None to actually include the error response body
            resp_text = e.response.text if e.response is not None else ''
            error(f'{e} ; {resp_text}', status=502)

        # source is gone, synthesize a delete activity for it
        delete_id = f'{source}#bridgy-fed-delete'
        obj = Object(id=delete_id, our_as1={
            'id': delete_id,
            'objectType': 'activity',
            'verb': 'delete',
            'actor': user.web_url(),
            'object': source,
        })

    if not obj or (not obj.mf2 and obj.type != 'delete'):
        error(f"Couldn't load {source} as microformats2 HTML", status=304)
    elif obj.mf2 and 'h-entry' in obj.mf2.get('type', []):
        # default the h-entry's author to the user, and override it if it's
        # someone else
        authors = obj.mf2['properties'].setdefault('author', [])
        author_urls = microformats2.get_string_urls(authors)
        if not author_urls:
            authors.append(user.web_url())
        elif not user.is_web_url(author_urls[0]):
            logger.info(f'Overriding author {author_urls[0]} with {user.web_url()}')
            if isinstance(authors[0], dict):
                authors[0]['properties']['url'] = [user.web_url()]
            else:
                authors[0] = user.web_url()
            if obj.our_as1:
                field = ('actor' if obj.our_as1.get('objectType') == 'activity'
                         else 'author')
                obj.our_as1[field] = user.web_url()

    try:
        return Web.receive(obj, authed_as=user.key.id(), internal=internal)
    except ValueError as e:
        logger.warning(e, exc_info=True)
        error(e, status=304)
×
1035

1036

1037
def webmention_endpoint_cache_key(url):
    """Generates the cache key for a URL's cached webmention endpoint.

    The key is the URL's domain (netloc). For home pages, ie when the path is
    empty or ``/``, `` /`` is appended, so that home pages' webmention
    endpoints are cached separately from other pages'.
    https://github.com/snarfed/bridgy/issues/701

    Example: ``snarfed.org /``

    https://github.com/snarfed/bridgy-fed/issues/423

    Adapted from ``bridgy/util.py``.
    """
    parts = urllib.parse.urlparse(url)
    is_home_page = parts.path in ('', '/')
    key = parts.netloc + ' /' if is_home_page else parts.netloc

    logger.debug(f'wm cache key {key}')
    return key
1✔
1058

1059

1060
@memcache.memoize(expire=timedelta(hours=2), key=webmention_endpoint_cache_key)
def webmention_discover(url, **kwargs):
    """Memoized wrapper for :func:`oauth_dropins.webutil.webmention.discover`.

    Returns the discovery result with its ``response`` field set to None.
    """
    discovered = webmention.discover(url, **kwargs)
    # we don't use the response, and it's occasionally too big for memcache, so
    # drop it
    return discovered._replace(response=None)
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc