snarfed / bridgy-fed: /web.py

"""Webmention protocol with microformats2 in HTML, aka the IndieWeb stack."""
from datetime import timedelta, timezone
import difflib
import logging
import re
import statistics
import urllib.parse
from urllib.parse import quote, urlencode, urljoin, urlparse
from xml.etree import ElementTree

import brevity
from flask import redirect, request
from google.cloud import ndb
from google.cloud.ndb import ComputedProperty
from granary import as1, as2, atom, microformats2, rss
import mf2util
from oauth_dropins.webutil import flask_util, util
from oauth_dropins.webutil.appengine_config import tasks_client
from oauth_dropins.webutil import appengine_info
from oauth_dropins.webutil.flask_util import cloud_tasks_only, error, flash
from oauth_dropins.webutil.util import domain_from_link, json_dumps, json_loads
from oauth_dropins.webutil import webmention
from requests import HTTPError, RequestException
from requests.auth import HTTPBasicAuth
from werkzeug.exceptions import BadGateway, BadRequest, HTTPException, NotFound

import common
from common import (
    CACHE_CONTROL,
    DOMAIN_RE,
    DOMAINS,
    PRIMARY_DOMAIN,
    PROTOCOL_DOMAINS,
    render_template,
    SUPERDOMAIN,
)
from flask_app import app
from ids import normalize_user_id, translate_object_id, translate_user_id
import memcache
from models import Follower, Object, PROTOCOLS, Target, User
from protocol import Protocol

logger = logging.getLogger(__name__)

# https://github.com/snarfed/bridgy-fed/issues/314
WWW_DOMAINS = frozenset((
    'www.jvt.me',
))

FEED_TYPES = {
    atom.CONTENT_TYPE.split(';')[0]: 'atom',
    rss.CONTENT_TYPE.split(';')[0]: 'rss',
    # https://stackoverflow.com/questions/4832357/whats-the-difference-between-text-xml-vs-application-xml-for-webservice-respons
    'application/xml': 'xml',
    'text/xml': 'xml',
}
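# Note: the granary constants above are presumably full content types, eg
# 'application/atom+xml; charset=utf-8', so splitting on ';' keys this dict by
# the bare media type, eg 'application/atom+xml'.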
MIN_FEED_POLL_PERIOD = timedelta(hours=2)
MAX_FEED_POLL_PERIOD = timedelta(days=1)
MAX_FEED_ITEMS_PER_POLL = 10

# populated into Web.redirects_error
OWNS_WEBFINGER = 'This site serves its own Webfinger, and likely ActivityPub too.'

# in addition to common.DOMAIN_BLOCKLIST
FETCH_BLOCKLIST = (
    'bsky.app',
)


def is_valid_domain(domain, allow_internal=True):
    """Returns True if this is a valid domain we can use, False otherwise.

    Args:
      domain (str):
      allow_internal (bool): whether to return True for internal domains
        like ``fed.brid.gy``, ``bsky.brid.gy``, etc

    Valid means TLD is ok, not blacklisted, etc.
    """
    if not domain or not re.match(DOMAIN_RE, domain):
        # logger.debug(f"{domain} doesn't look like a domain")
        return False

    if Web.is_blocklisted(domain, allow_internal=allow_internal):
        # logger.debug(f'{domain} is blocklisted')
        return False

    tld = domain.split('.')[-1]
    if tld not in brevity.TLDS:
        # logger.info(f"{domain} looks like a domain but {tld} isn't a TLD")
        return False

    return True
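
# Illustrative examples, assuming example.com isn't blocklisted and both 'com'
# and 'gy' are in brevity.TLDS:
#   is_valid_domain('example.com')                        => True
#   is_valid_domain('example.notarealtld')                => False (unknown TLD)
#   is_valid_domain('fed.brid.gy')                        => True
#   is_valid_domain('fed.brid.gy', allow_internal=False)  => False (internal)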


class Web(User, Protocol):
    """Web user and webmention protocol implementation.

    The key name is the domain.
    """
    ABBREV = 'web'
    ''
    PHRASE = 'the web'
    ''
    OTHER_LABELS = ('webmention',)
    ''
    LOGO_HTML = '🌐'  # used to be 🕸️
    ''
    CONTENT_TYPE = common.CONTENT_TYPE_HTML
    ''
    DEFAULT_ENABLED_PROTOCOLS = ('activitypub',)
    ''
    DEFAULT_SERVE_USER_PAGES = True
    ''
    SUPPORTED_AS1_TYPES = (
        tuple(as1.ACTOR_TYPES)
        + tuple(as1.POST_TYPES)
        + tuple(as1.CRUD_VERBS)
        + ('audio', 'bookmark', 'event', 'image', 'video')
        + ('follow', 'like', 'share', 'stop-following')
    )
    ''
    USES_OBJECT_FEED = True
    ''

    has_redirects = ndb.BooleanProperty()
    ''
    redirects_error = ndb.TextProperty()
    ''
    has_hcard = ndb.BooleanProperty()
    'Currently unused, and I think now always ends up as ``True``. TODO: remove?'
    last_webmention_in = ndb.DateTimeProperty(tzinfo=timezone.utc)
    ''
    last_polled_feed = ndb.DateTimeProperty(tzinfo=timezone.utc)
    ''
    feed_last_item = ndb.StringProperty()
    """str: feed item id (URL)"""
    feed_etag = ndb.StringProperty()
    ''
    feed_last_modified = ndb.StringProperty()
    ''

    atproto_last_chat_log_cursor = ndb.StringProperty()
    """Only used by protocol bot users in Bluesky, for polling their chat
    messages with ``chat.bsky.convo.getLog``.
    """

    ap_subdomain = ndb.StringProperty(
        choices=['ap', 'bsky', 'efake', 'fake', 'fed', 'nostr', 'other', 'web'],
        default='web')
    """Originally, BF served Web users' AP actor ids on fed.brid.gy, eg
    https://fed.brid.gy/snarfed.org . When we started adding new protocols, we
    switched to per-protocol subdomains, eg https://web.brid.gy/snarfed.org .
    However, we need to preserve the old users' actor ids as is.

    Also, our per-protocol bot accounts in ActivityPub are on their own
    subdomains, eg @bsky.brid.gy@bsky.brid.gy.

    So, this property tracks which subdomain a given Web user's AP actor uses.
    """

    # OLD. some stored entities still have these; do not reuse.
    # superfeedr_subscribed = ndb.DateTimeProperty(tzinfo=timezone.utc)
    # superfeedr_subscribed_feed = ndb.StringProperty()

    @classmethod
    def _get_kind(cls):
        return 'MagicKey'

    def _pre_put_hook(self):
        """Validate domain id, don't allow upper case or invalid characters."""
        super()._pre_put_hook()
        id = self.key.id()
        assert is_valid_domain(id), id
        assert id.lower() == id, f'upper case is not allowed in Web key id: {id}'

    @classmethod
    def get_or_create(cls, id, allow_opt_out=False, verify=None, **kwargs):
        """Normalize domain, then pass through to :meth:`User.get_or_create`.

        Normalizing currently consists of lower casing and removing leading and
        trailing dots.

        Args:
          verify (bool): whether to call :meth:`verify` to load h-card, check
            redirects, etc. Defaults to calling it only if the user is new.
        """
        # normalize id (domain)
        domain = cls.key_for(id, allow_opt_out=True).id()
        if (util.domain_or_parent_in(domain, [SUPERDOMAIN.strip('.')])
                and not appengine_info.DEBUG):
            return super().get_by_id(domain)

        user = super().get_or_create(domain, allow_opt_out=True, **kwargs)
        if not user:
            return None

        if verify or (verify is None and not user.existing):
            user = user.verify()

        if not allow_opt_out and user.status:
            return None

        if not user.existing:
            common.create_task(queue='poll-feed', domain=user.key.id())

        return user

    @ndb.ComputedProperty
    def handle(self):
        """Returns this user's chosen username or domain, eg ``user.com``."""
        # prettify if domain, noop if username
        username = self.username()
        if username != self.key.id():
            return domain_from_link(username, minimize=False)
        return username

    def handle_as(self, to_proto):
        """Special case ActivityPub to use custom username."""
        if to_proto in ('activitypub', 'ap', PROTOCOLS['ap']):
            return (f'@{self.username()}@{self.key.id()}' if self.has_redirects
                    else f'@{self.key.id()}@{self.ap_subdomain}{SUPERDOMAIN}')

        return super().handle_as(to_proto)
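
    # Illustrative examples: for a Web user with key id 'user.com' whose
    # ap_subdomain is 'web', and assuming SUPERDOMAIN is '.brid.gy', the
    # ActivityPub handle is
    #   '@user.com@web.brid.gy'  if the site has no webfinger redirects, or
    #   '@me@user.com'           if it does and its h-card advertises
    #                            acct:me@user.com as a custom username.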

    def id_as(self, to_proto):
        """Special case ActivityPub to use ``ap_subdomain``."""
        if isinstance(to_proto, str):
            to_proto = PROTOCOLS[to_proto]

        converted = translate_user_id(id=self.key.id(), from_=self,
                                      to=to_proto)

        if to_proto.LABEL == 'activitypub':
            other = 'web' if self.ap_subdomain == 'fed' else 'fed'
            converted = converted.replace(f'https://{other}.brid.gy/',
                                          f'https://{self.ap_subdomain}.brid.gy/')

        return converted

    web_url = User.profile_id

    def id_uri(self):
        return self.web_url()

    def is_web_url(self, url):
        return super().is_web_url(url, ignore_www=True)

    def user_page_path(self, rest=None, **kwargs):
        """Always prefer domain (id)."""
        kwargs['prefer_id'] = True
        return super().user_page_path(rest=rest, **kwargs)

    def username(self):
        """Returns the user's preferred username.

        Uses stored representative h-card if available, falls back to id.

        Returns:
          str:
        """
        id = self.key.id()

        if self.obj and self.obj.as1:
            for url in (util.get_list(self.obj.as1, 'url') +
                        util.get_list(self.obj.as1, 'urls')):
                url = url.get('value') if isinstance(url, dict) else url
                if url and url.startswith('acct:'):
                    try:
                        urluser, urldomain = util.parse_acct_uri(url)
                    except ValueError as e:
                        continue
                    if urldomain == id:
                        logger.info(f'Found custom username: {urluser}')
                        return urluser

        # logger.debug(f'Defaulting username to key id {id}')
        return id

    @ndb.ComputedProperty
    def status(self):
        if self.key.id() in common.DOMAINS:
            return None

        if self.redirects_error == OWNS_WEBFINGER:
            # looks like this site is already its own fediverse server
            return 'owns-webfinger'

        url, _ = self.feed_url()
        if (not url and not self.webmention_endpoint() and not self.last_webmention_in
                and not self.has_redirects):
            return 'no-feed-or-webmention'

        return super().status

    def verify(self):
        """Fetches the site a couple of ways to check for redirects and h-card.

        Returns:
          web.Web: user that was verified. May be different than self! eg if
          self's domain started with www and we switch to the root domain.
        """
        domain = self.key.id()
        logger.info(f'Verifying {domain}')

        if domain.startswith('www.') and domain not in WWW_DOMAINS:
            # if root domain serves ok, use it instead
            # https://github.com/snarfed/bridgy-fed/issues/314
            root = domain.removeprefix('www.')
            root_site = f'https://{root}/'
            try:
                resp = util.requests_get(root_site, gateway=False)
                if resp.ok and self.is_web_url(resp.url):
                    logger.info(f'{root_site} serves ok ; using {root} instead')
                    root_user = Web.get_or_create(
                        root, enabled_protocols=self.enabled_protocols,
                        allow_opt_out=True)
                    self.use_instead = root_user.key
                    self.put()
                    return root_user.verify()
            except RequestException as e:
                logger.info(f"Couldn't fetch {root_site} : {e}")
                logger.info(f"Continuing with {domain}")
                pass

        # check webfinger redirect
        path = f'/.well-known/webfinger?resource=acct:{domain}@{domain}'
        self.has_redirects = False
        self.redirects_error = None
        try:
            url = urljoin(self.web_url(), path)
            resp = util.requests_get(url, gateway=False)
            domain_urls = ([f'https://{domain}/' for domain in common.DOMAINS] +
                           [common.host_url()])
            expected = [urljoin(url, path) for url in domain_urls]
            if resp.url:
                got = urllib.parse.unquote(resp.url)
                if got in expected:
                    self.has_redirects = True
                else:
                    # check host-meta to see if they serve their own Webfinger
                    resp = util.requests_get(
                        urljoin(self.web_url(), '/.well-known/host-meta'),
                        gateway=False)
                    if (resp.status_code == 200
                            and domain_from_link(resp.url) not in common.DOMAINS):
                        logger.info(f"{domain} serves Webfinger! probably a fediverse server")
                        self.redirects_error = OWNS_WEBFINGER
                    else:
                        diff = '\n'.join(difflib.Differ().compare([got], [expected[0]]))
                        self.redirects_error = f'Current vs expected:<pre>{diff}</pre>'
            else:
                lines = [url, f'  returned HTTP {resp.status_code}']
                if resp.url and resp.url != url:
                    lines[1:1] = ['  redirected to:', resp.url]
                self.redirects_error = '<pre>' + '\n'.join(lines) + '</pre>'
        except RequestException:
            pass

        # check home page
        self.has_hcard = False
        if not getattr(self, 'existing', None) == False:  # ie this is a new user
            self.reload_profile(gateway=True, raise_=False)
        if self.obj and self.obj.as1:
            self.has_hcard = True

        self.put()
        return self

    @classmethod
    def key_for(cls, id, allow_opt_out=False):
        """Returns the :class:`ndb.Key` for a given id.

        If id is a domain, uses it as is. If it's a home page URL or fed.brid.gy
        or web.brid.gy AP actor URL, extracts the domain and uses that.
        Otherwise, returns None.

        Args:
          id (str)
          allow_opt_out (bool): whether to allow users who are currently opted out

        Returns:
          ndb.Key or None:
        """
        if not id:
            return None

        id = id.lower().strip('.')
        if util.is_web(id):
            parsed = urlparse(id)
            if parsed.path in ('', '/'):
                id = parsed.netloc

        if is_valid_domain(id, allow_internal=True):
            return super().key_for(id, allow_opt_out=allow_opt_out)

        return None
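
    # Illustrative examples of how key_for normalizes ids, assuming user.com is
    # a valid, non-blocklisted domain and the user isn't opted out:
    #   key_for('User.com.')              => key with id 'user.com'
    #   key_for('https://user.com/')      => key with id 'user.com'
    #   key_for('https://user.com/post')  => None (not a domain or home page)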

    @classmethod
    def owns_id(cls, id):
        """Returns True on domains and internal URLs, None on other URLs.

        All web pages are http(s) URLs, but not all http(s) URLs are web pages.
        """
        if not id:
            return False
        elif is_valid_domain(id, allow_internal=True):
            return True

        if not util.is_web(id):
            return False

        domain = domain_from_link(id)
        if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
            return True

        # we allowed internal domains for protocol bot actors above, but we
        # don't want to allow non-homepage URLs on those domains, eg
        # https://bsky.brid.gy/foo, so don't allow internal here
        if is_valid_domain(domain, allow_internal=False):
            return None

        return False
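
    # Illustrative examples of the tri-state return value, assuming example.com
    # is a valid, non-blocklisted domain:
    #   owns_id('example.com')               => True (bare domain)
    #   owns_id('https://example.com/post')  => None (might be a web page)
    #   owns_id('not a url')                 => False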

    @classmethod
    def owns_handle(cls, handle, allow_internal=False):
        if handle == PRIMARY_DOMAIN or handle in PROTOCOL_DOMAINS:
            return True
        elif not is_valid_domain(handle, allow_internal=allow_internal):
            return False

    @classmethod
    def handle_to_id(cls, handle):
        assert cls.owns_handle(handle) is not False
        return handle

    @classmethod
    def target_for(cls, obj, shared=False):
        """Returns `obj`'s id, as a URL webmention target."""
        # TODO: we have entities in prod that fail this, eg
        # https://indieweb.social/users/bismark has source_protocol webmention
        # assert obj.source_protocol in (cls.LABEL, cls.ABBREV, 'ui', None), str(obj)

        if not util.is_web(obj.key.id()):
            logger.warning(f"{obj.key.id()} is source_protocol web but id isn't a URL!")
            return None

        return obj.key.id()

    def feed_url(self):
        """Returns this web site's RSS or Atom feed URL and type, if any.

        Returns:
          (str, type) or (None, None):
        """
        if self.obj and self.obj.mf2:
            for url, info in self.obj.mf2.get('rel-urls', {}).items():
                type = FEED_TYPES.get(info.get('type', '').split(';')[0])
                if 'alternate' in info.get('rels', []) and type:
                    return url, type

        return None, None

    def webmention_endpoint(self):
        """Returns this web site's webmention endpoint, if any.

        Returns:
          str: webmention endpoint URL
        """
        if self.obj and self.obj.mf2:
            for url, info in self.obj.mf2.get('rel-urls', {}).items():
                if 'webmention' in info.get('rels', []):
                    return url

    @classmethod
    def send(to_cls, obj, target, from_user=None, orig_obj_id=None, **kwargs):
        """Sends a webmention to a given webmention target URL.

        See :meth:`Protocol.send` for details.

        Returns False if the target URL doesn't advertise a webmention endpoint,
        or if webmention/microformats2 don't support the activity type.
        https://fed.brid.gy/docs#error-handling
        """
        targets = as1.targets(obj.as1)
        if not (target in targets or
                # homepage, check domain too
                (urlparse(target).path.strip('/') == ''
                 and domain_from_link(target) in targets)):
            logger.debug(f'Skipping sending to {target} , not a target in the object')
            return False

        if to_cls.is_blocklisted(target):
            logger.info(f'Skipping sending to blocklisted {target}')
            return False

        source_id = translate_object_id(
            id=obj.key.id(), from_=PROTOCOLS[obj.source_protocol], to=Web)
        source_url = quote(source_id, safe=':/%+')
        logger.info(f'Sending webmention from {source_url} to {target}')

        # we only send webmentions for responses. for sending normal posts etc
        # to followers, we just update our stored objects (elsewhere) and web
        # users consume them via feeds.
        endpoint = webmention_discover(target).endpoint
        if not endpoint:
            return False

        webmention.send(endpoint, source_url, target)
        return True

    @classmethod
    def load(cls, id, **kwargs):
        """Wrap :meth:`Protocol.load` to convert domains to homepage URLs."""
        if re.match(DOMAIN_RE, id):
            id = f'https://{id}/'

        return super().load(id, **kwargs)

    @classmethod
    def fetch(cls, obj, gateway=False, check_backlink=False,
              authorship_fetch_mf2=True, metaformats=None, **kwargs):
        """Fetches a URL over HTTP and extracts its microformats2.

        Follows redirects, but doesn't change the original URL in ``obj``'s id!
        :class:`google.cloud.ndb.model.Model` doesn't allow that anyway, but more
        importantly, we want to preserve that original URL becase other objects
528
        may refer to it instead of the final redirect destination URL.
529

530
        See :meth:`Protocol.fetch` for other background.
531

532
        Args:
533
          gateway (bool): passed through to
534
            :func:`oauth_dropins.webutil.util.fetch_mf2`
535
          check_backlink (bool): optional, whether to require a link to Bridgy
536
            Fed. Ignored if the URL is a homepage, ie has no path.
537
          authorship_fetch_mf2 (bool): optional, when running the authorship
538
            algorithm, fetch author URL if necessary
539
          kwargs: ignored
540
        """
541
        url = obj.key.id()
1✔
542

543
        if not util.is_web(url) or not util.is_url(url):
1✔
544
            logger.info(f'{url} is not a URL')
1✔
545
            return False
1✔
546

547
        if (cls.is_blocklisted(url, allow_internal=True)
1✔
548
                or util.domain_or_parent_in(url, FETCH_BLOCKLIST)):
549
            return False
1✔
550

551
        is_homepage = urlparse(url).path.strip('/') == ''
1✔
552
        if is_homepage:
1✔
553
            domain = domain_from_link(url)
1✔
554
            if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
1✔
555
                profile = util.read(f'{domain}.as2.json')
1✔
556
                if profile:
1✔
557
                    obj.as2 = json_loads(profile)
1✔
558
                    return True
1✔
559
                return False
×
560

561
        require_backlink = (common.host_url().rstrip('/')
1✔
562
                            if check_backlink and not is_homepage
563
                            else None)
564
        if metaformats is None:
1✔
565
            # default to only for homepages
566
            metaformats = is_homepage
1✔
567

568
        try:
1✔
569
            parsed = util.fetch_mf2(url, gateway=gateway, metaformats=metaformats,
1✔
570
                                    require_backlink=require_backlink)
571
        except ValueError as e:
1✔
572
            error(str(e))
1✔
573

574
        if parsed is None or not parsed.get('items'):
1✔
575
            if parsed:
1✔
576
                # we got valid HTML. save the Object so that we know this URL is web
577
                obj.source_protocol = 'web'
1✔
578
                obj.put()
1✔
579
            logger.info(f'No microformats2 found in {url}')
1✔
580
            return False
1✔
581

582
        # find mf2 item
583
        if is_homepage:
1✔
584
            logger.info(f"{url} is user's web url")
1✔
585
            parsed_url = (parsed['url'] or '').rstrip('/')
1✔
586
            # try both with and without trailing slash
587
            entry = (mf2util.representative_hcard(parsed, parsed_url)
1✔
588
                     or mf2util.representative_hcard(parsed, parsed_url + '/'))
589
            if not entry:
1✔
590
                error(f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-h-card-parsing) on {parsed['url']}")
1✔
591
            logger.info(f'Found representative h-card')
1✔
592
            # handle when eg https://user.com/ redirects to https://www.user.com/
593
            # we need to store this as https://user.com/
594
            if parsed['url'] != url:
1✔
595
                logger.info(f'overriding {parsed["url"]} with {url}')
1✔
596
                entry['properties'].setdefault('url', []).insert(0, url)
1✔
597
                if rel_url := parsed['rel-urls'].pop(parsed['url'], None):
1✔
598
                    parsed['rel-urls'][url] = rel_url
1✔
599
                parsed['url'] = url
1✔
600

601
        else:
602
            entry = mf2util.find_first_entry(parsed, ['h-entry'])
1✔
603
            if not entry:
1✔
604
                error(f'No microformats2 h-entry found in {url}')
×
605

606
        # discard uid if set; we use URL as id
607
        props = entry.setdefault('properties', {})
1✔
608
        if 'uid' in props:
1✔
609
            logger.info(f'Discarding uid property: {props["uid"]}')
1✔
610
            props.pop('uid')
1✔
611

612
        # store final URL in mf2 object
613
        if is_homepage:
1✔
614
            entry.setdefault('rel-urls', {}).update(parsed.get('rel-urls', {}))
1✔
615
            entry.setdefault('type', ['h-card'])
1✔
616
        if parsed['url']:
1✔
617
            entry['url'] = parsed['url']
1✔
618
        logger.info(f'Extracted microformats2 entry: {json_dumps(entry)[:500]}')
1✔
619

620
        if not is_homepage:
1✔
621
            # default actor/author to home page URL
622
            authors = props.setdefault('author', [])
1✔
623
            if not microformats2.get_string_urls(authors):
1✔
624
                homepage = urljoin(parsed.get('url') or url, '/')
1✔
625
                logger.info(f'Defaulting author URL to {homepage}')
1✔
626
                if authors and isinstance(authors[0], dict):
1✔
627
                    authors[0]['properties']['url'] = [homepage]
1✔
628
                else:
629
                    authors.insert(0, homepage)
1✔
630

631
            # run full authorship algorithm if necessary:
632
            # https://indieweb.org/authorship
633
            # duplicated in microformats2.json_to_object
634
            author = util.get_first(props, 'author')
1✔
635
            if not isinstance(author, dict):
1✔
636
                logger.info(f'Fetching full authorship for author {author}')
1✔
637
                fetch_fn = util.fetch_mf2 if authorship_fetch_mf2 else None
1✔
638
                try:
1✔
639
                    author = mf2util.find_author({'items': [entry]}, hentry=entry,
1✔
640
                                                 fetch_mf2_func=fetch_fn)
641
                except (ValueError, TypeError) as e:
1✔
642
                    logger.warning(e)
1✔
643
                    author = None
1✔
644
                logger.debug(f'Got: {author}')
1✔
645
                if author:
1✔
646
                    props['author'] = util.trim_nulls([{
1✔
647
                        "type": ["h-card"],
648
                        'properties': {
649
                            field: [author[field]] if author.get(field) else []
650
                            for field in ('name', 'photo', 'url')
651
                        },
652
                    }])
653

654
        obj.mf2 = entry
1✔
655
        return True
1✔
656

657
    @classmethod
1✔
658
    def _convert(cls, obj, from_user=None):
1✔
659
        """Converts a :class:`Object` to HTML.
660

661
        Args:
662
          obj (models.Object)
663
          from_user (models.User): user (actor) this activity/object is from
664

665
        Returns:
666
          str:
667
        """
668
        if not obj or not obj.as1:
1✔
669
            return ''
×
670

671
        obj_as1 = obj.as1
1✔
672
        if from_user and not from_user.is_enabled(cls):
1✔
673
            error(f'{from_user.key.id()} => {cls.LABEL} not enabled')
×
674

675
        from_proto = PROTOCOLS.get(obj.source_protocol)
1✔
676
        if from_proto:
1✔
677
            # fill in author/actor if available
678
            for field in 'author', 'actor':
1✔
679
                val = as1.get_object(obj_as1, field)
1✔
680
                if val.keys() == set(['id']) and val['id']:
1✔
681
                    loaded = from_proto.load(val['id'], raise_=False)
1✔
682
                    if loaded and loaded.as1:
1✔
683
                        obj_as1 = {**obj_as1, field: loaded.as1}
1✔
684
        else:
685
            logger.debug(f'Not hydrating actor or author due to source_protocol {obj.source_protocol}')
1✔
686

687
        html = microformats2.activities_to_html([cls.translate_ids(obj_as1)])
1✔
688

689
        # add HTML meta redirect to source page. should trigger for end users in
690
        # browsers but not for webmention receivers (hopefully).
691
        url = util.get_url(obj_as1) or obj_as1.get('id') or obj.key.id()
1✔
692
        if util.is_web(url):
1✔
693
            utf8 = '<meta charset="utf-8">'
1✔
694
            refresh = f'<meta http-equiv="refresh" content="0;url={url}">'
1✔
695
            html = html.replace(utf8, utf8 + '\n' + refresh)
1✔
696

697
        return html
1✔
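
    # Illustrative example of the meta refresh above: if the rendered HTML
    # contains '<meta charset="utf-8">' and the object's URL is
    # https://user.com/post, the output gains
    #   <meta http-equiv="refresh" content="0;url=https://user.com/post">
    # right after the charset tag, so browsers bounce to the original post while
    # webmention receivers still see the microformats2 markup.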


@app.get('/web-site')
@flask_util.headers(CACHE_CONTROL)
def enter_web_site():
    return render_template('enter_web_site.html')


@app.post('/web-site')
def check_web_site():
    common.log_request()
    url = request.values['url']

    # this normalizes and lower cases domain
    try:
        domain = normalize_user_id(id=url, proto=Web)
    except (ValueError, AssertionError):
        logger.info(f'bad web id? {url}', exc_info=True)
        domain = None

    invalid_msg = util.linkify(f'{url} is not a <a href="/docs#web-get-started">valid or supported web site</a>', pretty=True)
    if not domain or not is_valid_domain(domain, allow_internal=False):
        flash(invalid_msg)
        return render_template('enter_web_site.html'), 400

    if util.is_web(url) and urlparse(url).path.strip('/'):
        flash('Only top-level web sites and domains are supported.')
        return render_template('enter_web_site.html'), 400

    try:
        user = Web.get_or_create(domain, enabled_protocols=['atproto'],
                                 propagate=True, reload=True, verify=True)
    except BaseException as e:
        code, body = util.interpret_http_exception(e)
        if code:
            flash(util.linkify(f"Couldn't connect to {url}: {e}", pretty=True))
            return render_template('enter_web_site.html')
        raise

    if not user:  # opted out
        flash(invalid_msg)
        return render_template('enter_web_site.html'), 400

    user.put()

    if user.redirects_error == OWNS_WEBFINGER:
        flash(f'{url} looks like a fediverse server! Try a normal web site.')
        return render_template('enter_web_site.html'), 400

    common.create_task(queue='poll-feed', domain=domain)
    return redirect(user.user_page_path())


@app.post('/webmention')
def webmention_external():
    """Handles an inbound webmention, enqueues a task to process it.

    Use a task queue to deliver to followers because we send to each inbox in
    serial, which can take a long time with many followers/instances.
    """
    common.log_request()

    source = flask_util.get_required_param('source').strip()
    if Web.owns_id(source) is False:
        error(f'Bad URL {source}')
    elif urlparse(source).scheme != 'https':
        error('source URLs must be https (with SSL)')

    domain = domain_from_link(source, minimize=False)
    if not domain:
        error(f'Bad source URL {source}')

    user = Web.get_by_id(domain)
    if not user:
        error(f'No user found for domain {domain}')

    user.last_webmention_in = util.now()
    user.put()

    return common.create_task('webmention', **request.form)


def poll_feed(user, feed_url, rel_type):
    """Fetches a :class:`Web` site's feed and delivers new/updated posts.

    Args:
      user (Web)
      feed_url (str)
      rel_type (str): feed link's top-level rel type in home page HTML, usually
        either ``atom`` or ``rss``

    Returns:
      list of dict AS1 activities:
    """
    user.last_polled_feed = util.now()

    # fetch feed
    headers = {}
    if user.feed_etag:
        headers['If-None-Match'] = user.feed_etag
    if user.feed_last_modified:
        headers['If-Modified-Since'] = user.feed_last_modified
    resp = util.requests_get(feed_url, headers=headers, gateway=True)

    # update user
    user.feed_etag = resp.headers.get('ETag')
    user.feed_last_modified = resp.headers.get('Last-Modified')

    # parse feed
    content_type = resp.headers.get('Content-Type') or ''
    type = FEED_TYPES.get(content_type.split(';')[0])
    if resp.status_code == 304:
        logger.info('Feed is unchanged since last poll')
        user.put()
        return []
    elif type == 'atom' or (type == 'xml' and rel_type == 'atom'):
        activities = atom.atom_to_activities(resp.text)
    elif type == 'rss' or (type == 'xml' and rel_type == 'rss'):
        activities = rss.to_activities(resp.text)
    else:
        raise ValueError(f'Unknown feed type {content_type}')

    if len(activities) > MAX_FEED_ITEMS_PER_POLL:
        logger.info(f'Got {len(activities)} feed items, only processing the first {MAX_FEED_ITEMS_PER_POLL}')
        activities = activities[:MAX_FEED_ITEMS_PER_POLL]

    # create receive tasks
    for i, activity in enumerate(activities):
        # default actor and author to user
        activity.setdefault('actor', {}).setdefault('id', user.profile_id())
        obj = activity.setdefault('object', {})
        obj.setdefault('author', {}).setdefault('id', user.profile_id())

        # use URL as id since some feeds use non-URL (eg tag URI) ids
        for elem in obj, activity:
            if url := elem.get('url'):
                elem['id'] = elem['url']

        logger.debug(f'Converted to AS1: {json_dumps(activity, indent=2)}')

        id = Object(our_as1=activity).as1.get('id')
        if not id:
            logger.warning('No id or URL!')
            continue

        if i == 0:
            logger.info(f'Setting feed_last_item to {id}')
            user.feed_last_item = id
        elif id == user.feed_last_item:
            logger.info(f'Already seen {id}, skipping rest of feed')
            break

        if Web.owns_id(id) is False:
            logger.warning(f'Skipping bad id {id}')
            continue

        if not obj.get('image'):
            # fetch and check the post itself
            logger.info(f'No image in {id} , trying metaformats')
            post = Object(id=id)
            try:
                fetched = Web.fetch(post, metaformats=True, authorship_fetch_mf2=False)
            except (RequestException, HTTPException):
                fetched = False
            if fetched and post.as1:
                profile_images = (as1.get_ids(user.obj.as1, 'image')
                                  if user.obj.as1 else [])
                obj['image'] = [img for img in as1.get_ids(post.as1, 'image')
                                if img not in profile_images]

        common.create_task(queue='receive', id=id, our_as1=activity,
                           source_protocol=Web.ABBREV, authed_as=user.key.id(),
                           received_at=util.now().isoformat())

    return activities


@app.post(f'/queue/poll-feed')
@cloud_tasks_only(log=None)
def poll_feed_task():
    """Task handler for polling a :class:`Web` user's feed.

    Params:
      ``domain`` (str): key id of the :class:`Web` user
      ``last_polled`` (str): should match the user's ``last_polled_feed``. Used to detect duplicate poll tasks for the same user.
    """
    common.log_request()

    domain = flask_util.get_required_param('domain')
    logger.info(f'Polling feed for {domain}')

    user = Web.get_by_id(domain)
    if not (user and user.obj and user.obj.mf2):
        error(f'No Web user or object found for domain {domain}', status=304)
    elif user.last_webmention_in:
        logger.info(f'Dropping since last_webmention_in is set')
        return 'OK'

    logger.info(f'Last poll: {user.last_polled_feed}')
    last_polled = request.form.get('last_polled')
    if (last_polled and user.last_polled_feed
            and last_polled < user.last_polled_feed.isoformat()):
        logger.warning('duplicate poll feed task! deferring to other task')
        return '', 204

    # discover feed URL
    url, rel_type = user.feed_url()
    if not url:
        msg = f"User {user.key.id()} has no feed URL, can't fetch feed"
        logger.info(msg)
        return msg

    # go go go!
    activities = []
    status = 200
    try:
        activities = poll_feed(user, url, rel_type)
    except (ValueError, ElementTree.ParseError) as e:
        logger.error(f"Couldn't parse feed: {e}")
        status = 204
    except BaseException as e:
        code, _ = util.interpret_http_exception(e)
        if code or util.is_connection_failure(e):
            logger.error(f"Couldn't fetch feed: {e}")
            status = 204
        else:
            raise

    user.put()

    # determine posting frequency
    published_last = None
    published_deltas = []  # timedeltas between entry published times
    for activity in activities:
        try:
            published = util.parse_iso8601(activity['object']['published'])\
                            .astimezone(timezone.utc)
        except (KeyError, ValueError):
            continue

        if published_last:
            published_deltas.append(abs(published - published_last))
        published_last = published

    # create next poll task
    def clamp(delay):
        return max(min(delay, MAX_FEED_POLL_PERIOD), MIN_FEED_POLL_PERIOD)

    if published_deltas:
        delay = clamp(timedelta(seconds=statistics.mean(
            t.total_seconds() for t in published_deltas)))
    else:
        delay = clamp(util.now() -
                      (user.last_polled_feed if user.last_polled_feed and activities
                       else user.created.replace(tzinfo=timezone.utc)))

    common.create_task(queue='poll-feed', delay=delay, domain=user.key.id(),
                       last_polled=user.last_polled_feed.isoformat())
    return 'OK', status
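
# Worked example of the next-poll delay above: three feed entries published six
# hours apart give published_deltas of [6h, 6h], so the next poll-feed task runs
# in six hours. Entries ten minutes apart get clamped up to MIN_FEED_POLL_PERIOD
# (2 hours), and a feed with no parseable published times falls back to the time
# since the last poll (or since the user was created), capped at
# MAX_FEED_POLL_PERIOD (1 day).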


@app.post('/queue/webmention')
@cloud_tasks_only(log=None)
def webmention_task():
    """Handles inbound webmention task.

    Allows source URLs on brid.gy subdomains if the ``Authorization`` header matches
    the Flask secret key.

    Params:
      ``source`` (str): URL
    """
    common.log_request()

    # load user
    source = flask_util.get_required_param('source').strip()
    domain = domain_from_link(source, minimize=False)
    logger.info(f'webmention from {domain}')

    internal = request.headers.get('Authorization') == app.config['SECRET_KEY']
    if domain in common.DOMAINS and not internal:
        error(f'URL not supported: {source}')

    user = Web.get_by_id(domain)
    if not user:
        error(f'No user found for domain {domain}', status=304)
    logger.info(f'User: {user.key.id()}')

    # fetch source page
    try:
        # remote=True to force fetch, local=True to populate new/changed attrs
        obj = Web.load(source, local=True, remote=True,
                       check_backlink=not appengine_info.LOCAL_SERVER)
    except BadRequest as e:
        error(str(e.description), status=304)
    except RequestException as e:
        code, body = util.interpret_http_exception(e)
        if code not in ('410', '404') or user.is_web_url(source):
            error(f'{e} ; {e.response.text if e.response else ""}', status=502)

        id = f'{source}#bridgy-fed-delete'
        obj = Object(id=id, our_as1={
            'id': id,
            'objectType': 'activity',
            'verb': 'delete',
            'actor': user.web_url(),
            'object': source,
        })

    if not obj or (not obj.mf2 and obj.type != 'delete'):
        error(f"Couldn't load {source} as microformats2 HTML", status=304)
    elif obj.mf2 and 'h-entry' in obj.mf2.get('type', []):
        authors = obj.mf2['properties'].setdefault('author', [])
        author_urls = microformats2.get_string_urls(authors)
        if not author_urls:
            authors.append(user.web_url())
        elif not user.is_web_url(author_urls[0]):
            logger.info(f'Overriding author {author_urls[0]} with {user.web_url()}')
            if isinstance(authors[0], dict):
                authors[0]['properties']['url'] = [user.web_url()]
            else:
                authors[0] = user.web_url()
            if obj.our_as1:
                field = ('actor' if obj.our_as1.get('objectType') == 'activity'
                         else 'author')
                obj.our_as1[field] = user.web_url()

    try:
        return Web.receive(obj, authed_as=user.key.id(), internal=internal)
    except ValueError as e:
        logger.warning(e, exc_info=True)
        error(e, status=304)


def webmention_endpoint_cache_key(url):
    """Returns cache key for a cached webmention endpoint for a given URL.

    Just the domain by default. If the URL is the home page, ie path is ``/``,
    the key includes a ``/`` at the end, so that we cache webmention endpoints
    for home pages separate from other pages.
    https://github.com/snarfed/bridgy/issues/701

    Example: ``snarfed.org /``

    https://github.com/snarfed/bridgy-fed/issues/423

    Adapted from ``bridgy/util.py``.
    """
    parsed = urllib.parse.urlparse(url)
    key = parsed.netloc
    if parsed.path in ('', '/'):
        key += ' /'

    logger.debug(f'wm cache key {key}')
    return key


@memcache.memoize(expire=timedelta(hours=2), key=webmention_endpoint_cache_key)
def webmention_discover(url, **kwargs):
    """Thin cache around :func:`oauth_dropins.webutil.webmention.discover`."""
    # discard the response since we don't use it and it's occasionally too big for
    # memcache
    return webmention.discover(url, **kwargs)._replace(response=None)