• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

snarfed / bridgy-fed / 5b5c512e-4bd7-4108-8832-23f68575bca7

31 Jan 2025 10:57PM UTC coverage: 93.282% (+0.05%) from 93.228%
5b5c512e-4bd7-4108-8832-23f68575bca7

push

circleci

snarfed
drop Object.labels

3 of 3 new or added lines in 2 files covered. (100.0%)

144 existing lines in 5 files now uncovered.

4610 of 4942 relevant lines covered (93.28%)

0.93 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

93.68
/web.py
1
"""Webmention protocol with microformats2 in HTML, aka the IndieWeb stack."""
2
from datetime import timedelta, timezone
1✔
3
import difflib
1✔
4
import logging
1✔
5
import re
1✔
6
import statistics
1✔
7
import urllib.parse
1✔
8
from urllib.parse import quote, urlencode, urljoin, urlparse
1✔
9
from xml.etree import ElementTree
1✔
10

11
import brevity
1✔
12
from flask import redirect, render_template, request
1✔
13
from google.cloud import ndb
1✔
14
from google.cloud.ndb import ComputedProperty
1✔
15
from granary import as1, as2, atom, microformats2, rss
1✔
16
import mf2util
1✔
17
from oauth_dropins.webutil import flask_util, util
1✔
18
from oauth_dropins.webutil.appengine_config import tasks_client
1✔
19
from oauth_dropins.webutil import appengine_info
1✔
20
from oauth_dropins.webutil.flask_util import cloud_tasks_only, error, flash
1✔
21
from oauth_dropins.webutil.util import domain_from_link, json_dumps, json_loads
1✔
22
from oauth_dropins.webutil import webmention
1✔
23
from requests import HTTPError, RequestException
1✔
24
from requests.auth import HTTPBasicAuth
1✔
25
from werkzeug.exceptions import BadGateway, BadRequest, HTTPException, NotFound
1✔
26

27
import common
1✔
28
from common import (
1✔
29
    CACHE_CONTROL,
30
    DOMAIN_RE,
31
    DOMAINS,
32
    PRIMARY_DOMAIN,
33
    PROTOCOL_DOMAINS,
34
    SUPERDOMAIN,
35
)
36
from flask_app import app
1✔
37
from ids import normalize_user_id, translate_object_id, translate_user_id
1✔
38
import memcache
1✔
39
from models import Follower, Object, PROTOCOLS, Target, User
1✔
40
from protocol import Protocol
1✔
41

42
logger = logging.getLogger(__name__)
1✔
43

44
# www. domains that verify() keeps as is instead of switching to the root
# domain. https://github.com/snarfed/bridgy-fed/issues/314
WWW_DOMAINS = frozenset({'www.jvt.me'})

# Maps a feed's MIME type, without parameters, to its short feed type name.
FEED_TYPES = {
    atom.CONTENT_TYPE.partition(';')[0]: 'atom',
    rss.CONTENT_TYPE.partition(';')[0]: 'rss',
    # https://stackoverflow.com/questions/4832357/whats-the-difference-between-text-xml-vs-application-xml-for-webservice-respons
    'application/xml': 'xml',
    'text/xml': 'xml',
}
MIN_FEED_POLL_PERIOD = timedelta(hours=2)
MAX_FEED_POLL_PERIOD = timedelta(days=1)
MAX_FEED_ITEMS_PER_POLL = 10

# stored in Web.redirects_error; Web.status compares against this value
OWNS_WEBFINGER = 'This site serves its own Webfinger, and likely ActivityPub too.'

# domains Web.fetch refuses to fetch, in addition to common.DOMAIN_BLOCKLIST
FETCH_BLOCKLIST = ('bsky.app',)
67

68

69
def is_valid_domain(domain, allow_internal=True):
    """Checks whether a domain is one we're willing to use.

    A domain passes when it matches ``DOMAIN_RE``, isn't blocklisted, and its
    last dot-separated label is a known TLD.

    Args:
      domain (str):
      allow_internal (bool): whether to return True for internal domains
        like ``fed.brid.gy``, ``bsky.brid.gy``, etc. Passed through to
        :meth:`Web.is_blocklisted`.

    Returns:
      bool:
    """
    looks_like_domain = bool(domain) and re.match(DOMAIN_RE, domain)
    if not looks_like_domain:
        return False

    if Web.is_blocklisted(domain, allow_internal=allow_internal):
        return False

    # the TLD is whatever follows the last dot
    *_, tld = domain.split('.')
    return tld in brevity.TLDS
93

94

95
class Web(User, Protocol):
    """A user on the web, and the webmention protocol implementation.

    Entities are keyed by domain.
    """
    ABBREV = 'web'
    ''
    PHRASE = 'the web'
    ''
    OTHER_LABELS = ('webmention',)
    ''
    LOGO_HTML = '🌐'  # used to be 🕸️
    ''
    CONTENT_TYPE = common.CONTENT_TYPE_HTML
    ''
    DEFAULT_ENABLED_PROTOCOLS = ('activitypub',)
    ''
    DEFAULT_SERVE_USER_PAGES = True
    ''
    SUPPORTED_AS1_TYPES = (
        *as1.ACTOR_TYPES,
        *as1.POST_TYPES,
        *as1.CRUD_VERBS,
        'audio', 'bookmark', 'event', 'image', 'video',
        'follow', 'like', 'share', 'stop-following',
    )
    ''

    has_redirects = ndb.BooleanProperty()
    ''
    redirects_error = ndb.TextProperty()
    ''
    has_hcard = ndb.BooleanProperty()
    ''
    last_webmention_in = ndb.DateTimeProperty(tzinfo=timezone.utc)
    ''
    last_polled_feed = ndb.DateTimeProperty(tzinfo=timezone.utc)
    ''
    feed_last_item = ndb.StringProperty()
    """str: id (URL) of a feed item"""
    feed_etag = ndb.StringProperty()
    ''
    feed_last_modified = ndb.StringProperty()
    ''

    atproto_last_chat_log_cursor = ndb.StringProperty()
    """Cursor for polling chat messages with ``chat.bsky.convo.getLog``. Only
    used by protocol bot users in Bluesky.
    """

    ap_subdomain = ndb.StringProperty(
        choices=['ap', 'bsky', 'fed', 'web', 'fake', 'other', 'efake'],
        default='web',
    )
    """Which subdomain this Web user's ActivityPub actor lives on.

    Web users' AP actor ids were originally served on fed.brid.gy, eg
    https://fed.brid.gy/snarfed.org . Once more protocols were added, new
    users moved to per-protocol subdomains, eg
    https://web.brid.gy/snarfed.org , but existing users' actor ids have to
    stay as they were.

    The per-protocol bot accounts in ActivityPub also live on their own
    subdomains, eg @bsky.brid.gy@bsky.brid.gy.
    """

    # OLD. some stored entities still have these; do not reuse.
    # superfeedr_subscribed = ndb.DateTimeProperty(tzinfo=timezone.utc)
    # superfeedr_subscribed_feed = ndb.StringProperty()
162

163
    @classmethod
    def _get_kind(cls):
        """Returns ``MagicKey``, the kind name used for this model.

        Returns:
          str:
        """
        return 'MagicKey'
166

167
    def _pre_put_hook(self):
        """Checks the key id before storing.

        The id must pass :func:`is_valid_domain` and be all lower case.

        Raises:
          AssertionError: if either check fails
        """
        super()._pre_put_hook()

        domain = self.key.id()
        assert is_valid_domain(domain), domain
        assert domain == domain.lower(), \
            f'upper case is not allowed in Web key id: {domain}'
173

174
    @classmethod
    def get_or_create(cls, id, allow_opt_out=False, verify=None, **kwargs):
        """Normalize domain, then pass through to :meth:`User.get_or_create`.

        Normalizing currently consists of lower casing and removing leading and
        trailing dots.

        Args:
          id (str): domain or home page URL
          allow_opt_out (bool): whether to return users who currently have a
            status set, ie are opted out
          verify (bool): whether to call :meth:`verify` to load h-card, check
            redirects, etc. Defaults to calling it only if the user is new.
          kwargs: passed through to :meth:`User.get_or_create`

        Returns:
          Web: the user, or None if the id isn't a valid domain, the user
          couldn't be created, or the user is opted out and ``allow_opt_out``
          is False
        """
        # normalize id (domain). key_for returns None for ids that aren't
        # valid domains, so check before dereferencing it.
        key = cls.key_for(id, allow_opt_out=True)
        if key is None:
            logger.info(f"Can't get or create Web user, invalid id {id}")
            return None

        domain = key.id()
        if util.domain_or_parent_in(domain, [SUPERDOMAIN.strip('.')]):
            # internal domain: only look up, never create
            return super().get_by_id(domain)

        # always fetch opted out users here; allow_opt_out is applied below,
        # after verify has had a chance to update the user's status
        user = super().get_or_create(domain, allow_opt_out=True, **kwargs)
        if not user:
            return None

        if verify or (verify is None and not user.existing):
            user = user.verify()

        if not allow_opt_out and user.status:
            return None

        if not user.existing:
            common.create_task(queue='poll-feed', domain=user.key.id())

        return user
204

205
    @ndb.ComputedProperty
    def handle(self):
        """This user's chosen username or domain, eg ``user.com``.

        Returns:
          str:
        """
        name = self.username()
        if name == self.key.id():
            # no custom username, use the domain as is
            return name

        # custom username: run it through domain_from_link
        return domain_from_link(name, minimize=False)
213

214
    def handle_as(self, to_proto):
        """Returns this user's handle in another protocol.

        ActivityPub is special cased: users with redirects get
        ``@[username]@[domain]``, others get ``@[domain]@[ap_subdomain]``
        followed by ``SUPERDOMAIN``. Everything else defers to the superclass.

        Args:
          to_proto (str or Protocol)

        Returns:
          str:
        """
        if to_proto not in ('activitypub', 'ap', PROTOCOLS['ap']):
            return super().handle_as(to_proto)

        if self.has_redirects:
            return f'@{self.username()}@{self.key.id()}'

        return f'@{self.key.id()}@{self.ap_subdomain}{SUPERDOMAIN}'
221

222
    def id_as(self, to_proto):
        """Returns this user's id in another protocol.

        Special cases ActivityPub to use ``ap_subdomain``.

        Args:
          to_proto (str or Protocol): a str is looked up in ``PROTOCOLS``

        Returns:
          str:
        """
        if isinstance(to_proto, str):
            to_proto = PROTOCOLS[to_proto]

        converted = translate_user_id(id=self.key.id(), from_=self, to=to_proto)
        if to_proto.LABEL != 'activitypub':
            return converted

        # swap the web <=> fed subdomain for this user's own ap_subdomain
        other = 'fed' if self.ap_subdomain != 'fed' else 'web'
        return converted.replace(f'https://{other}.brid.gy/',
                                 f'https://{self.ap_subdomain}.brid.gy/')
236

237
    web_url = User.profile_id
1✔
238

239
    def id_uri(self):
        """Returns this user's web URL, ie the result of :meth:`web_url`."""
        return self.web_url()
241

242
    def is_web_url(self, url):
        """Delegates to the superclass's ``is_web_url`` with ``ignore_www=True``.

        Args:
          url (str)
        """
        return super().is_web_url(url, ignore_www=True)
244

245
    def user_page_path(self, rest=None):
        """Returns this user's user page path. Always uses the domain.

        Args:
          rest (str): optional sub-path or query string to append. A ``/``
            separator is added unless ``rest`` starts with ``?``.

        Returns:
          str:
        """
        base = f'/{self.ABBREV}/{self.key.id()}'
        if not rest:
            return base

        separator = '' if rest.startswith('?') else '/'
        return base + separator + rest.lstrip('/')
255

256
    def username(self):
        """Returns the user's preferred username.

        Looks in the stored profile object's ``url`` and ``urls`` fields for
        an ``acct:`` URI on this user's domain and returns its user part.
        Falls back to the key id.

        Returns:
          str:
        """
        domain = self.key.id()

        profile = self.obj.as1 if self.obj else None
        if not profile:
            return domain

        candidates = (util.get_list(profile, 'url') +
                      util.get_list(profile, 'urls'))
        for candidate in candidates:
            if isinstance(candidate, dict):
                candidate = candidate.get('value')
            if not candidate or not candidate.startswith('acct:'):
                continue

            try:
                acct_user, acct_domain = util.parse_acct_uri(candidate)
            except ValueError:
                # malformed acct: URI, try the next one
                continue

            if acct_domain == domain:
                logger.info(f'Found custom username: {acct_user}')
                return acct_user

        # logger.debug(f'Defaulting username to key id {id}')
        return domain
281

282
    @ndb.ComputedProperty
    def status(self):
        """This user's status.

        Returns:
          str or None: None for our own domains in ``common.DOMAINS``,
          ``owns-webfinger`` if the site serves its own Webfinger,
          ``no-feed-or-webmention`` if it has no feed, webmention endpoint,
          inbound webmention, or redirects, otherwise the superclass's status
        """
        if self.key.id() in common.DOMAINS:
            return None

        if self.redirects_error == OWNS_WEBFINGER:
            # looks like this site is already its own fediverse server
            return 'owns-webfinger'

        feed, _ = self.feed_url()
        usable = (feed or self.webmention_endpoint() or self.last_webmention_in
                  or self.has_redirects)
        if not usable:
            return 'no-feed-or-webmention'

        return super().status
297

298
    def verify(self):
        """Fetches site a couple ways to check for redirects and h-card.

        Sets ``has_redirects``, ``redirects_error``, and ``has_hcard``, then
        stores this user.

        Returns:
          web.Web: user that was verified. May be different than self! eg if
          self's domain started with www and we switch to the root domain.
        """
        domain = self.key.id()
        logger.info(f'Verifying {domain}')

        if domain.startswith('www.') and domain not in WWW_DOMAINS:
            # if root domain serves ok, use it instead
            # https://github.com/snarfed/bridgy-fed/issues/314
            root = domain.removeprefix('www.')
            root_site = f'https://{root}/'
            try:
                resp = util.requests_get(root_site, gateway=False)
                if resp.ok and self.is_web_url(resp.url):
                    logger.info(f'{root_site} serves ok ; using {root} instead')
                    root_user = Web.get_or_create(
                        root, enabled_protocols=self.enabled_protocols,
                        allow_opt_out=True)
                    self.use_instead = root_user.key
                    self.put()
                    return root_user.verify()
            except RequestException as e:
                logger.info(f"Couldn't fetch {root_site} : {e}")
                logger.info(f"Continuing with {domain}")

        # check webfinger redirect
        path = f'/.well-known/webfinger?resource=acct:{domain}@{domain}'
        self.has_redirects = False
        self.redirects_error = None
        try:
            url = urljoin(self.web_url(), path)
            resp = util.requests_get(url, gateway=False)
            domain_urls = ([f'https://{ours}/' for ours in common.DOMAINS] +
                           [common.host_url()])
            expected = [urljoin(base, path) for base in domain_urls]
            if resp.url:
                got = urllib.parse.unquote(resp.url)
                if got in expected:
                    self.has_redirects = True
                else:
                    # check host-meta to see if they serve their own Webfinger
                    resp = util.requests_get(
                        urljoin(self.web_url(), '/.well-known/host-meta'),
                        gateway=False)
                    if (resp.status_code == 200
                            and domain_from_link(resp.url) not in common.DOMAINS):
                        logger.info(f"{domain} serves Webfinger! probably a fediverse server")
                        self.redirects_error = OWNS_WEBFINGER
                    else:
                        diff = '\n'.join(difflib.Differ().compare([got], [expected[0]]))
                        self.redirects_error = f'Current vs expected:<pre>{diff}</pre>'
            else:
                # resp.url is empty here, so there's no redirect destination
                # to report, just the status code
                lines = [url, f'  returned HTTP {resp.status_code}']
                self.redirects_error = '<pre>' + '\n'.join(lines) + '</pre>'
        except RequestException as e:
            # best effort: leave has_redirects False and redirects_error unset
            logger.info(f"Couldn't check Webfinger redirect for {domain} : {e}")

        # check home page
        self.has_hcard = False
        # existing is False only on a user that get_or_create just created;
        # reload the profile for everyone else
        if getattr(self, 'existing', None) is not False:
            self.reload_profile(gateway=True, raise_=False)
        if self.obj and self.obj.as1:
            self.has_hcard = True

        self.put()
        return self
371

372
    @classmethod
    def key_for(cls, id, allow_opt_out=False):
        """Returns the :class:`ndb.Key` for a given id.

        The id is lower cased and stripped of leading and trailing dots. If
        it's a URL whose path is empty or ``/``, its host is used. If the
        result is a valid domain, it's passed on to the superclass's
        ``key_for``. Otherwise, returns None.

        Args:
          id (str)
          allow_opt_out (bool): whether to allow users who are currently opted out

        Returns:
          ndb.Key or None:
        """
        if not id:
            return None

        normalized = id.lower().strip('.')
        if util.is_web(normalized):
            parts = urlparse(normalized)
            if parts.path in ('', '/'):
                # home page URL, reduce to its host
                normalized = parts.netloc

        if not is_valid_domain(normalized, allow_internal=True):
            return None

        return super().key_for(normalized, allow_opt_out=allow_opt_out)
400

401
    @classmethod
    def owns_id(cls, id):
        """Returns True on domains and internal URLs, None on other URLs.

        All web pages are http(s) URLs, but not all http(s) URLs are web pages.

        Returns:
          bool or None: True for valid domains and URLs on ``PRIMARY_DOMAIN``
          or ``PROTOCOL_DOMAINS``, None for other URLs on valid non-internal
          domains, False for everything else
        """
        if not id:
            return False

        if is_valid_domain(id, allow_internal=True):
            return True

        if not util.is_web(id):
            return False

        domain = domain_from_link(id)
        if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
            return True

        # we allowed internal domains for protocol bot actors above, but we
        # don't want to allow non-homepage URLs on those domains, eg
        # https://bsky.brid.gy/foo, so don't allow internal here
        return None if is_valid_domain(domain, allow_internal=False) else False
426

427
    @classmethod
    def owns_handle(cls, handle, allow_internal=False):
        """Returns whether a handle could belong to this protocol.

        Args:
          handle (str)
          allow_internal (bool): passed through to :func:`is_valid_domain`

        Returns:
          bool or None: True for ``PRIMARY_DOMAIN`` and ``PROTOCOL_DOMAINS``,
          False if the handle isn't a valid domain, None otherwise
        """
        if handle == PRIMARY_DOMAIN or handle in PROTOCOL_DOMAINS:
            return True

        if not is_valid_domain(handle, allow_internal=allow_internal):
            return False

        return None
433

434
    @classmethod
    def handle_to_id(cls, handle):
        """Returns the id for a handle, which for web users is the handle itself.

        Args:
          handle (str)

        Returns:
          str:

        Raises:
          AssertionError: if :meth:`owns_handle` returns False for ``handle``
        """
        assert cls.owns_handle(handle) is not False
        return handle
438

439
    @classmethod
    def target_for(cls, obj, shared=False):
        """Returns `obj`'s id, as a URL webmention target.

        Args:
          obj (models.Object)
          shared (bool): unused here

        Returns:
          str or None: ``obj``'s id if it's a URL, otherwise None
        """
        # TODO: we have entities in prod that fail this, eg
        # https://indieweb.social/users/bismark has source_protocol webmention
        # assert obj.source_protocol in (cls.LABEL, cls.ABBREV, 'ui', None), str(obj)

        obj_id = obj.key.id()
        if util.is_web(obj_id):
            return obj_id

        logger.warning(f"{obj_id} is source_protocol web but id isn't a URL!")
        return None
451

452
    def feed_url(self):
        """Returns this web site's RSS or Atom feed URL and type, if any.

        Uses the first ``rel-urls`` entry in the stored profile's mf2 that has
        rel ``alternate`` and a content type in ``FEED_TYPES``.

        Returns:
          (str, str) or (None, None): URL and ``FEED_TYPES`` value
        """
        mf2 = self.obj.mf2 if self.obj else None
        if not mf2:
            return None, None

        for url, info in mf2.get('rel-urls', {}).items():
            # drop any parameters, eg "; charset=utf-8"
            mime_type = info.get('type', '').split(';')[0]
            feed_type = FEED_TYPES.get(mime_type)
            if feed_type and 'alternate' in info.get('rels', []):
                return url, feed_type

        return None, None
465

466
    def webmention_endpoint(self):
        """Returns this web site's webmention endpoint, if any.

        Uses the first ``rel-urls`` entry in the stored profile's mf2 that has
        rel ``webmention``.

        Returns:
          str or None: webmention endpoint URL
        """
        mf2 = self.obj.mf2 if self.obj else None
        rel_urls = mf2.get('rel-urls', {}) if mf2 else {}
        return next((url for url, info in rel_urls.items()
                     if 'webmention' in info.get('rels', [])),
                    None)
476

477
    @classmethod
    def send(to_cls, obj, url, from_user=None, orig_obj_id=None, **kwargs):
        """Sends a webmention to a given target URL.

        See :meth:`Protocol.send` for details.

        Returns False if the URL isn't one of the object's targets, is
        blocklisted, or doesn't advertise a webmention endpoint.
        https://fed.brid.gy/docs#error-handling

        Args:
          obj (models.Object): source of the webmention
          url (str): webmention target
          from_user, orig_obj_id, kwargs: unused here

        Returns:
          bool: True if the webmention was sent, False otherwise
        """
        targets = as1.targets(obj.as1)
        is_target = url in targets
        if not is_target and urlparse(url).path.strip('/') == '':
            # homepage, check domain too
            is_target = domain_from_link(url) in targets

        if not is_target:
            logger.debug(f'Skipping sending to {url} , not a target in the object')
            return False

        if to_cls.is_blocklisted(url):
            logger.info(f'Skipping sending to blocklisted {url}')
            return False

        from_proto = PROTOCOLS[obj.source_protocol]
        source_id = translate_object_id(id=obj.key.id(), from_=from_proto, to=Web)
        source_url = quote(source_id, safe=':/%+')
        logger.info(f'Sending webmention from {source_url} to {url}')

        # we only send webmentions for responses. for sending normal posts etc
        # to followers, we just update our stored objects (elsewhere) and web
        # users consume them via feeds.
        endpoint = webmention_discover(url).endpoint
        if endpoint:
            webmention.send(endpoint, source_url, url)
            return True

        return False
513

514
    @classmethod
    def load(cls, id, **kwargs):
        """Wraps :meth:`Protocol.load`, converting domains to homepage URLs.

        Args:
          id (str): URL, or a domain matching ``DOMAIN_RE``, which is loaded
            as ``https://[domain]/``
          kwargs: passed through to :meth:`Protocol.load`
        """
        url = f'https://{id}/' if re.match(DOMAIN_RE, id) else id
        return super().load(url, **kwargs)
521

522
    @classmethod
    def fetch(cls, obj, gateway=False, check_backlink=False,
              authorship_fetch_mf2=True, metaformats=None, **kwargs):
        """Fetches a URL over HTTP and extracts its microformats2.

        Follows redirects, but doesn't change the original URL in ``obj``'s id!
        :class:`google.cloud.ndb.model.Model` doesn't allow that anyway, but more
        importantly, we want to preserve that original URL because other objects
        may refer to it instead of the final redirect destination URL.

        Home pages on ``PRIMARY_DOMAIN`` and ``PROTOCOL_DOMAINS`` aren't
        fetched; their profile is read from a local ``[domain].as2.json`` file
        into ``obj.as2`` instead.

        See :meth:`Protocol.fetch` for other background.

        Args:
          obj (models.Object): its key id is the URL to fetch. On success, its
            ``mf2`` is populated.
          gateway (bool): passed through to
            :func:`oauth_dropins.webutil.util.fetch_mf2`
          check_backlink (bool): optional, whether to require a link to Bridgy
            Fed. Ignored if the URL is a homepage, ie has no path.
          authorship_fetch_mf2 (bool): optional, when running the authorship
            algorithm, fetch author URL if necessary
          metaformats (bool): passed through to
            :func:`oauth_dropins.webutil.util.fetch_mf2`. Defaults to True for
            home pages, False for everything else.
          kwargs: ignored

        Returns:
          bool: True if ``obj`` was populated, False if the id isn't a URL,
          is blocklisted, or the page has no microformats2 items

        Raises:
          whatever :func:`error` raises, if fetching raises :class:`ValueError`
          or the expected h-card or h-entry isn't found
        """
        url = obj.key.id()
        if not util.is_web(url):
            logger.info(f'{url} is not a URL')
            return False
        elif (cls.is_blocklisted(url, allow_internal=True)
              or util.domain_or_parent_in(domain_from_link(url), FETCH_BLOCKLIST)):
            return False

        is_homepage = urlparse(url).path.strip('/') == ''
        if is_homepage:
            domain = domain_from_link(url)
            if domain == PRIMARY_DOMAIN or domain in PROTOCOL_DOMAINS:
                # one of our own domains; use its bundled AS2 profile
                profile = util.read(f'{domain}.as2.json')
                if profile:
                    obj.as2 = json_loads(profile)
                    return True
                return False

        require_backlink = (common.host_url().rstrip('/')
                            if check_backlink and not is_homepage
                            else None)
        if metaformats is None:
            # default to only for homepages
            metaformats = is_homepage

        try:
            parsed = util.fetch_mf2(url, gateway=gateway, metaformats=metaformats,
                                    require_backlink=require_backlink)
        except ValueError as e:
            # NOTE(review): relies on error() raising; otherwise parsed would
            # be unbound below
            error(str(e))

        if parsed is None or not parsed.get('items'):
            if parsed:
                # we got valid HTML. save the Object so that we know this URL is web
                obj.source_protocol = 'web'
                obj.put()
            logger.info(f'No microformats2 found in {url}')
            return False

        # find mf2 item
        if is_homepage:
            logger.info(f"{url} is user's web url")
            parsed_url = (parsed['url'] or '').rstrip('/')
            # try both with and without trailing slash
            entry = (mf2util.representative_hcard(parsed, parsed_url)
                     or mf2util.representative_hcard(parsed, parsed_url + '/'))
            if not entry:
                error(f"Couldn't find a representative h-card (http://microformats.org/wiki/representative-h-card-parsing) on {parsed['url']}")
            logger.info(f'Found representative h-card')
            # handle when eg https://user.com/ redirects to https://www.user.com/
            # we need to store this as https://user.com/
            if parsed['url'] != url:
                logger.info(f'overriding {parsed["url"]} with {url}')
                entry['properties']['url'] = [url]
                # re-key the final URL's rel-urls entry, if any, under the
                # original URL too
                if rel_url := parsed['rel-urls'].pop(parsed['url'], None):
                    parsed['rel-urls'][url] = rel_url
                parsed['url'] = url

        else:
            entry = mf2util.find_first_entry(parsed, ['h-entry'])
            if not entry:
                error(f'No microformats2 h-entry found in {url}')

        # discard uid if set; we use URL as id
        props = entry.setdefault('properties', {})
        if 'uid' in props:
            logger.info(f'Discarding uid property: {props["uid"]}')
            props.pop('uid')

        # store final URL in mf2 object
        if is_homepage:
            # feed_url and webmention_endpoint read rel-urls from the stored mf2
            entry.setdefault('rel-urls', {}).update(parsed.get('rel-urls', {}))
            entry.setdefault('type', ['h-card'])
        if parsed['url']:
            entry['url'] = parsed['url']
        logger.info(f'Extracted microformats2 entry: {json_dumps(entry)[:500]}')

        if not is_homepage:
            # default actor/author to home page URL
            authors = props.setdefault('author', [])
            if not microformats2.get_string_urls(authors):
                homepage = urljoin(parsed.get('url') or url, '/')
                logger.info(f'Defaulting author URL to {homepage}')
                if authors and isinstance(authors[0], dict):
                    authors[0]['properties']['url'] = [homepage]
                else:
                    authors.insert(0, homepage)

            # run full authorship algorithm if necessary:
            # https://indieweb.org/authorship
            # duplicated in microformats2.json_to_object
            author = util.get_first(props, 'author')
            if not isinstance(author, dict):
                logger.info(f'Fetching full authorship for author {author}')
                fetch_fn = util.fetch_mf2 if authorship_fetch_mf2 else None
                try:
                    author = mf2util.find_author({'items': [entry]}, hentry=entry,
                                                 fetch_mf2_func=fetch_fn)
                except (ValueError, TypeError) as e:
                    # best effort; keep the author property as is
                    logger.warning(e)
                    author = None
                logger.debug(f'Got: {author}')
                if author:
                    # replace the author with a minimal embedded h-card
                    props['author'] = util.trim_nulls([{
                        "type": ["h-card"],
                        'properties': {
                            field: [author[field]] if author.get(field) else []
                            for field in ('name', 'photo', 'url')
                        },
                    }])

        obj.mf2 = entry
        return True
656

657
    @classmethod
    def _convert(cls, obj, from_user=None):
        """Converts a :class:`Object` to HTML.

        If the object's ``author`` or ``actor`` is only an id, it's loaded via
        the object's source protocol and inlined. If the object has a web
        URL, a meta refresh redirect to it is added after the charset tag.

        Args:
          obj (models.Object)
          from_user (models.User): user (actor) this activity/object is from

        Returns:
          str: HTML, or the empty string if ``obj`` is empty or has no AS1
        """
        # local import: the local variable html below would shadow the module
        from html import escape

        if not obj or not obj.as1:
            return ''

        obj_as1 = obj.as1
        if from_user and not from_user.is_enabled(cls):
            error(f'{from_user.key.id()} => {cls.LABEL} not enabled')

        from_proto = PROTOCOLS.get(obj.source_protocol)
        if from_proto:
            # fill in author/actor if available
            for field in 'author', 'actor':
                val = as1.get_object(obj.as1, field)
                if val.keys() == {'id'} and val['id']:
                    loaded = from_proto.load(val['id'], raise_=False)
                    if loaded and loaded.as1:
                        obj_as1 = {**obj_as1, field: loaded.as1}
        else:
            logger.debug(f'Not hydrating actor or author due to source_protocol {obj.source_protocol}')

        html = microformats2.activities_to_html([cls.translate_ids(obj_as1)])

        # add HTML meta redirect to source page. should trigger for end users in
        # browsers but not for webmention receivers (hopefully).
        url = util.get_url(obj_as1) or obj_as1.get('id') or obj.key.id()
        if util.is_web(url):
            utf8 = '<meta charset="utf-8">'
            # the URL comes from the object's data, so escape it before putting
            # it inside the attribute value
            refresh = f'<meta http-equiv="refresh" content="0;url={escape(url, quote=True)}">'
            html = html.replace(utf8, utf8 + '\n' + refresh)

        return html
698

699

700
@app.get('/web-site')
@flask_util.headers(CACHE_CONTROL)
def enter_web_site():
    """Renders the ``enter_web_site.html`` template."""
    return render_template('enter_web_site.html')
704

705

706
@app.post('/web-site')
def check_web_site():
    """Handles the enter web site form: creates or updates a :class:`Web` user.

    Reads the ``url`` request parameter, validates it, gets or creates and
    verifies the user, enqueues a ``poll-feed`` task, and redirects to the
    user's page. On failure, flashes a message and re-renders the form.
    """
    common.log_request()
    url = request.values['url']

    # this normalizes and lower cases domain
    try:
        domain = normalize_user_id(id=url, proto=Web)
    except (ValueError, AssertionError):
        logger.info(f'bad web id? {url}', exc_info=True)
        domain = None

    invalid_msg = util.linkify(f'{url} is not a <a href="/docs#web-get-started">valid or supported web site</a>', pretty=True)
    if not domain or not is_valid_domain(domain, allow_internal=False):
        flash(invalid_msg)
        return render_template('enter_web_site.html'), 400

    if util.is_web(url) and urlparse(url).path.strip('/'):
        flash('Only top-level web sites and domains are supported.')
        return render_template('enter_web_site.html'), 400

    try:
        user = Web.get_or_create(domain, enabled_protocols=['atproto'],
                                 propagate=True, reload=True, verify=True)
    except Exception as e:
        # only handle exceptions that map to an HTTP error; re-raise the rest
        code, _ = util.interpret_http_exception(e)
        if code:
            flash(util.linkify(f"Couldn't connect to {url}: {e}", pretty=True))
            return render_template('enter_web_site.html')
        raise

    if not user:  # opted out
        flash(invalid_msg)
        return render_template('enter_web_site.html'), 400

    user.put()

    if user.redirects_error == OWNS_WEBFINGER:
        flash(f'{url} looks like a fediverse server! Try a normal web site.')
        return render_template('enter_web_site.html'), 400

    common.create_task(queue='poll-feed', domain=domain)
    return redirect(user.user_page_path())
749

750

751
@app.post('/webmention')
def webmention_external():
    """Receives an inbound webmention and enqueues a task to process it.

    The actual processing happens in a task queue because delivery goes to
    each follower inbox serially, which can take a long time with many
    followers/instances.

    Params:
      ``source`` (str): URL of the page sending the webmention. Must be
        ``https`` and belong to an existing :class:`Web` user's domain.
    """
    common.log_request()

    source_url = flask_util.get_required_param('source').strip()
    owned = Web.owns_id(source_url)
    if owned is False:
        error(f'Bad URL {source_url}')
    elif urlparse(source_url).scheme != 'https':
        error('source URLs must be https (with SSL)')

    source_domain = domain_from_link(source_url, minimize=False)
    if not source_domain:
        error(f'Bad source URL {source_url}')

    site = Web.get_by_id(source_domain)
    if not site:
        error(f'No user found for domain {source_domain}')

    # record that this site sends us webmentions
    site.last_webmention_in = util.now()
    site.put()

    # hand the original form params on to the webmention task queue
    return common.create_task('webmention', **request.form)
1✔
778

779

780
def poll_feed(user, feed_url, rel_type):
    """Fetches a :class:`Web` site's feed and delivers new/updated posts.

    Sends a conditional GET using the user's stored ``ETag`` and
    ``Last-Modified`` values, converts the feed to AS1 activities, and
    enqueues a ``receive`` task for each usable item. Stops once it reaches
    the item that was first in the feed at the previous poll.

    Updates ``user``'s feed attributes (``last_polled_feed``, ``feed_etag``,
    ``feed_last_modified``, ``feed_last_item``) in memory. ``user`` is only
    stored here when the feed is unchanged (HTTP 304); otherwise storing it is
    left to the caller.

    Args:
      user (Web)
      feed_url (str)
      rel_type (str): feed link's top-level rel type in home page HTML, usually
        either ``atom`` or ``rss``

    Returns:
      list of dict: AS1 activities parsed from the feed, truncated to
      ``MAX_FEED_ITEMS_PER_POLL``. Empty if the feed is unchanged.

    Raises:
      ValueError: if the response's content type isn't a supported feed type
    """
    user.last_polled_feed = util.now()

    # Snapshot the first item from the previous poll *before* the loop below
    # overwrites user.feed_last_item; otherwise the "already seen" check would
    # compare against this poll's own first item and never match.
    prev_last_item = user.feed_last_item

    # fetch feed
    headers = {}
    if user.feed_etag:
        headers['If-None-Match'] = user.feed_etag
    if user.feed_last_modified:
        headers['If-Modified-Since'] = user.feed_last_modified
    resp = util.requests_get(feed_url, headers=headers, gateway=True)

    # update user
    user.feed_etag = resp.headers.get('ETag')
    user.feed_last_modified = resp.headers.get('Last-Modified')

    # parse feed
    content_type = resp.headers.get('Content-Type') or ''
    feed_type = FEED_TYPES.get(content_type.split(';')[0])
    if resp.status_code == 304:
        logger.info('Feed is unchanged since last poll')
        user.put()
        return []
    elif feed_type == 'atom' or (feed_type == 'xml' and rel_type == 'atom'):
        activities = atom.atom_to_activities(resp.text)
    elif feed_type == 'rss' or (feed_type == 'xml' and rel_type == 'rss'):
        activities = rss.to_activities(resp.text)
    else:
        raise ValueError(f'Unknown feed type {content_type}')

    if len(activities) > MAX_FEED_ITEMS_PER_POLL:
        logger.info(f'Got {len(activities)} feed items, only processing the first {MAX_FEED_ITEMS_PER_POLL}')
        activities = activities[:MAX_FEED_ITEMS_PER_POLL]

    # create receive tasks
    for i, activity in enumerate(activities):
        # default actor and author to user
        activity.setdefault('actor', {}).setdefault('id', user.profile_id())
        obj = activity.setdefault('object', {})
        obj.setdefault('author', {}).setdefault('id', user.profile_id())

        # use URL as id since some feeds use non-URL (eg tag URI) ids
        for elem in obj, activity:
            if url := elem.get('url'):
                elem['id'] = url

        logger.debug(f'Converted to AS1: {json_dumps(activity, indent=2)}')

        item_id = Object(our_as1=activity).as1.get('id')
        if not item_id:
            logger.warning('No id or URL!')
            continue

        if i == 0:
            # the first item is always processed and becomes the new marker
            logger.info(f'Setting feed_last_item to {item_id}')
            user.feed_last_item = item_id
        elif item_id == prev_last_item:
            logger.info(f'Already seen {item_id}, skipping rest of feed')
            break

        if Web.owns_id(item_id) is False:
            logger.warning(f'Skipping bad id {item_id}')
            continue

        if not obj.get('image'):
            # fetch and check the post itself
            logger.info(f'No image in {item_id} , trying metaformats')
            post = Object(id=item_id)
            try:
                fetched = Web.fetch(post, metaformats=True, authorship_fetch_mf2=False)
            except (RequestException, HTTPException):
                fetched = False
            if fetched and post.as1:
                # don't attach the user's own profile image(s) to the post
                profile_images = (as1.get_ids(user.obj.as1, 'image')
                                  if user.obj.as1 else [])
                obj['image'] = [img for img in as1.get_ids(post.as1, 'image')
                                if img not in profile_images]

        common.create_task(queue='receive', id=item_id, our_as1=activity,
                           source_protocol=Web.ABBREV, authed_as=user.key.id(),
                           received_at=util.now().isoformat())

    return activities
1✔
873

874

875
@app.post('/queue/poll-feed')
@cloud_tasks_only(log=None)
def poll_feed_task():
    """Task handler that polls one :class:`Web` user's feed.

    Polls the feed with :func:`poll_feed`, stores the user, then enqueues the
    next ``poll-feed`` task for the same user. The next task's delay is the
    mean gap between the feed entries' published times when available,
    otherwise the time since the last poll or since the user was created. It
    is always clamped between ``MIN_FEED_POLL_PERIOD`` and
    ``MAX_FEED_POLL_PERIOD``.

    Params:
      ``domain`` (str): key id of the :class:`Web` user
      ``last_polled`` (str): should match the user's ``last_polled_feed``.
        Used to detect duplicate poll tasks for the same user.
    """
    common.log_request()

    domain = flask_util.get_required_param('domain')
    logger.info(f'Polling feed for {domain}')

    user = Web.get_by_id(domain)
    if not user or not user.obj or not user.obj.mf2:
        error(f'No Web user or object found for domain {domain}', status=304)
    elif user.last_webmention_in:
        logger.info('Dropping since last_webmention_in is set')
        return 'OK'

    logger.info(f'Last poll: {user.last_polled_feed}')
    task_last_polled = request.form.get('last_polled')
    if task_last_polled and user.last_polled_feed:
        # ISO 8601 strings are compared lexicographically here
        if task_last_polled < user.last_polled_feed.isoformat():
            logger.warning('duplicate poll feed task! deferring to other task')
            return '', 204

    # discover feed URL
    feed_url, rel_type = user.feed_url()
    if not feed_url:
        no_feed = f"User {user.key.id()} has no feed URL, can't fetch feed"
        logger.info(no_feed)
        return no_feed

    # go go go!
    status = 200
    activities = []
    try:
        activities = poll_feed(user, feed_url, rel_type)
    except (ValueError, ElementTree.ParseError) as err:
        logger.error(f"Couldn't parse feed: {err}")
        status = 204
    except BaseException as err:
        http_code, _ = util.interpret_http_exception(err)
        if not (http_code or util.is_connection_failure(err)):
            raise
        logger.error(f"Couldn't fetch feed: {err}")
        status = 204

    user.put()

    # determine posting frequency: collect the time gaps between consecutive
    # entries that have a parseable published time
    gaps = []
    prev_published = None
    for activity in activities:
        try:
            published = util.parse_iso8601(activity['object']['published'])
        except (KeyError, ValueError):
            continue

        if prev_published:
            gaps.append(abs(published - prev_published))
        prev_published = published

    # create next poll task
    def bounded(period):
        capped = min(period, MAX_FEED_POLL_PERIOD)
        return max(capped, MIN_FEED_POLL_PERIOD)

    if gaps:
        mean_secs = statistics.mean(gap.total_seconds() for gap in gaps)
        delay = bounded(timedelta(seconds=mean_secs))
    else:
        now = util.now()
        if user.last_polled_feed and activities:
            since = user.last_polled_feed
        else:
            since = user.created.replace(tzinfo=timezone.utc)
        delay = bounded(now - since)

    common.create_task(queue='poll-feed', delay=delay, domain=user.key.id(),
                       last_polled=user.last_polled_feed.isoformat())
    return 'OK', status
1✔
956

957

958
@app.post('/queue/webmention')
@cloud_tasks_only(log=None)
def webmention_task():
    """Handles inbound webmention task.

    Loads the :class:`Web` user for the source URL's domain, fetches the
    source page, and passes the resulting object to :meth:`Web.receive`. If
    the fetch fails with HTTP 404 or 410 and the source isn't one of the
    user's own web URLs, a synthetic ``delete`` activity for the source is
    passed on instead.

    For ``h-entry`` sources, the first author is forced to be the user: it is
    added if there is no author, or overridden if it is someone else.

    Params:
      ``source`` (str): URL
    """
    common.log_request()

    # load user
    source = flask_util.get_required_param('source').strip()
    domain = domain_from_link(source, minimize=False)
    logger.info(f'webmention from {domain}')

    if domain in common.DOMAINS:
        error(f'URL not supported: {source}')

    user = Web.get_by_id(domain)
    if not user:
        error(f'No user found for domain {domain}', status=304)
    logger.info(f'User: {user.key.id()}')

    # fetch source page
    try:
        # remote=True to force fetch, local=True to populate new/changed attrs
        obj = Web.load(source, local=True, remote=True,
                       check_backlink=not appengine_info.LOCAL_SERVER)
    except BadRequest as e:
        error(str(e.description), status=304)
    except RequestException as e:
        code, _ = util.interpret_http_exception(e)
        if code not in ('410', '404') or user.is_web_url(source):
            # requests.Response is falsy for 4xx/5xx statuses, so compare
            # against None explicitly or error bodies are never included
            resp_text = e.response.text if e.response is not None else ''
            error(f'{e} ; {resp_text}', status=502)

        # source is gone: synthesize a delete activity for it
        delete_id = f'{source}#bridgy-fed-delete'
        obj = Object(id=delete_id, our_as1={
            'id': delete_id,
            'objectType': 'activity',
            'verb': 'delete',
            'actor': user.web_url(),
            'object': source,
        })

    if not obj or (not obj.mf2 and obj.type != 'delete'):
        error(f"Couldn't load {source} as microformats2 HTML", status=304)
    elif obj.mf2 and 'h-entry' in obj.mf2.get('type', []):
        # make sure the post is attributed to this user
        authors = obj.mf2['properties'].setdefault('author', [])
        author_urls = microformats2.get_string_urls(authors)
        if not author_urls:
            authors.append(user.web_url())
        elif not user.is_web_url(author_urls[0]):
            logger.info(f'Overriding author {author_urls[0]} with {user.web_url()}')
            if isinstance(authors[0], dict):
                authors[0]['properties']['url'] = [user.web_url()]
            else:
                authors[0] = user.web_url()

    try:
        return Web.receive(obj, authed_as=user.key.id())
    except ValueError as e:
        logger.warning(e, exc_info=True)
        error(e, status=304)
×
1021

1022

1023
def webmention_endpoint_cache_key(url):
    """Returns the cache key for a URL's cached webmention endpoint.

    The key is normally just the URL's network location (domain). For a home
    page, ie when the path is empty or ``/``, `` /`` is appended so that home
    pages' webmention endpoints are cached separately from other pages'.
    https://github.com/snarfed/bridgy/issues/701

    Example: ``snarfed.org /``

    https://github.com/snarfed/bridgy-fed/issues/423

    Adapted from ``bridgy/util.py``.
    """
    parts = urllib.parse.urlparse(url)
    is_home_page = parts.path in ('', '/')
    key = parts.netloc + ' /' if is_home_page else parts.netloc

    logger.debug(f'wm cache key {key}')
    return key
1✔
1044

1045

1046
@memcache.memoize(expire=timedelta(hours=2), key=webmention_endpoint_cache_key)
def webmention_discover(url, **kwargs):
    """Memoized wrapper for :func:`oauth_dropins.webutil.webmention.discover`.

    Results are memoized for two hours, keyed by
    :func:`webmention_endpoint_cache_key`. All arguments are passed through
    unchanged.
    """
    discovered = webmention.discover(url, **kwargs)
    return discovered
1✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2026 Coveralls, Inc