mastodon: suppress backfeed from blocked accounts

for #895
snarfed committed on Oct 30, 2019
commit 73e8e91 · 1 parent 9913721
Show file tree

Hide file tree

Showing 5 changed files with 37 additions and 33 deletions.
diff --git a/mastodon.py b/mastodon.py
@@ -41,6 +41,7 @@ class Mastodon(models.Source):
   OAUTH_START_HANDLER = StartHandler
   SHORT_NAME = 'mastodon'
   CAN_PUBLISH = True
+  HAS_BLOCKS = True
   TYPE_LABELS = {
     'post': 'toot',
     'comment': 'reply',

diff --git a/models.py b/models.py
@@ -31,6 +31,12 @@
 
 REFETCH_HFEED_TRIGGER = datetime.datetime.utcfromtimestamp(-1)
 
+BLOCKLIST_CACHE_TIME = 60 * 60 * 2  # 2h
+# limit size of cached block lists to try to stay under memcache 1MB value limit:
+# https://github.com/snarfed/bridgy/issues/764
+# https://cloud.google.com/appengine/docs/standard/python/memcache/#limits
+BLOCKLIST_MAX_IDS = 35000
+
 # maps string short name to Source subclass. populated by SourceMeta.
 sources = {}
 
@@ -110,7 +116,8 @@ class Source(with_metaclass(SourceMeta, StringIdModel)):
   RATE_LIMIT_HTTP_CODES = ('429',)
   DISABLE_HTTP_CODES = ('401',)
   TRANSIENT_ERROR_HTTP_CODES = ()
-
+  # whether granary supports fetching block lists
+  HAS_BLOCKS = False
   # whether to require a u-syndication link for backfeed
   BACKFEED_REQUIRES_SYNDICATION_LINK = False
 
@@ -174,6 +181,10 @@ class Source(with_metaclass(SourceMeta, StringIdModel)):
   # datastore transactionally. set this to {} before beginning.
   updates = None
 
+  # manually cache id blocklist (returned by granary's get_blocklist_ids()) in
+  # this attribute, per instance. set and used by is_blocked().
+  blocked_ids = None
+
   # gr_source is *not* set to None by default here, since it needs to be unset
   # for __getattr__ to run when it's accessed.
 
@@ -749,8 +760,28 @@ def is_blocked(self, obj):
     """Returns True if an object's author is being blocked.
 
     ...ie they're in this user's block list.
+
+    Note that this method is tested in test_twitter.py, not test_models.py, for
+    historical reasons.
     """
-    return False
+    if not self.HAS_BLOCKS:
+      return False
+
+    if self.blocked_ids is None:
+      cache_key = 'B %s' % self.bridgy_path()
+      self.blocked_ids = memcache.get(cache_key)
+      if self.blocked_ids is None:
+        try:
+          ids = self.gr_source.get_blocklist_ids()
+        except gr_source.RateLimited as e:
+          ids = e.partial or []
+        self.blocked_ids = ids[:BLOCKLIST_MAX_IDS]
+        memcache.set(cache_key, self.blocked_ids, time=BLOCKLIST_CACHE_TIME)
+
+    for o in [obj] + util.get_list(obj, 'object'):
+      for field in 'author', 'actor':
+        if o.get(field, {}).get('numeric_id') in self.blocked_ids:
+          return True
 
 
 class Webmentions(StringIdModel):

diff --git a/templates/about.html b/templates/about.html
@@ -507,7 +507,7 @@ <h3 id="listen">Pulling back responses</h3>
   </ul>
 <li>Instagram likes are even worse than Twitter likes. They <a href="https://github.com/snarfed/bridgy/issues/722">only show the first 10 individual likes per photo on the web</a>, and a total count after that, so Bridgy can only backfeed the first 10 likes.
 </li>
-<li>Have you blocked the author inside the silo? If so, Bridgy won't send you any of their responses. (This is <a href="https://github.com/snarfed/bridgy/issues/764">best effort only for Twitter</a>, due to <a href="https://developer.twitter.com/en/docs/accounts-and-users/mute-block-report-users/api-reference/get-blocks-ids#resource-information">their API's rate limits</a>. In practice, Bridgy will only filter out responses from the first 40,000 users on your Twitter block list.)
+<li>Have you blocked the author inside the silo? If so, Bridgy won't send you any of their responses. (This is <a href="https://github.com/snarfed/bridgy/issues/764">best effort only for Twitter</a>, due to <a href="https://developer.twitter.com/en/docs/accounts-and-users/mute-block-report-users/api-reference/get-blocks-ids#resource-information">their API's rate limits</a>. In practice, Bridgy will only filter out responses from the first 35,000 users on your Twitter block list.)
 </li>
 <li>Currently, GitHub responses require a <a href="#link">syndication link on the original post on your web site</a> (<a href="https://chat.indieweb.org/dev/2018-02-26#t1519610412751400">background</a>). They're also currently best effort only. <a href="https://developer.github.com/v3/">Their API</a> is almost entirely broken down by user and repository, so there's no good way to ask for all recent responses to your posts. We get close by using their <a href="https://developer.github.com/v3/activity/notifications/">Notifications API</a>, which is good, but not always comprehensive. Apologies!
 </li>

diff --git a/tests/test_twitter.py b/tests/test_twitter.py
@@ -200,7 +200,7 @@ def test_is_blocked_rate_limited(self):
 
   def test_is_blocked_size_limit(self):
     """Test that we cap block list sizes in memcache."""
-    self.mox.stubs.Set(twitter, 'BLOCKLIST_MAX_IDS', 2)
+    self.mox.stubs.Set(models, 'BLOCKLIST_MAX_IDS', 2)
     api_url = gr_twitter.API_BASE + gr_twitter.API_BLOCK_IDS % '-1'
     self.expect_urlopen(api_url, json_dumps({
       'ids': ['1', '2', '3'],

diff --git a/twitter.py b/twitter.py
@@ -22,12 +22,6 @@
 import models
 import util
 
-BLOCKLIST_CACHE_TIME = 60 * 60 * 2  # 2h
-# limit size of cached block lists to try to stay under memcache 1MB value limit:
-# https://github.com/snarfed/bridgy/issues/764
-# https://cloud.google.com/appengine/docs/standard/python/memcache/#limits
-BLOCKLIST_MAX_IDS = 35000
-
 
 class Twitter(models.Source):
   """A Twitter account.
@@ -45,11 +39,10 @@ class Twitter(models.Source):
   }
   TRANSIENT_ERROR_HTTP_CODES = ('404',)
   CAN_PUBLISH = True
+  HAS_BLOCKS = True
   URL_CANONICALIZER = gr_twitter.Twitter.URL_CANONICALIZER
   URL_CANONICALIZER.headers = util.REQUEST_HEADERS
 
-  blocked_ids = None
-
   @staticmethod
   def new(handler, auth_entity=None, **kwargs):
     """Creates and returns a :class:`Twitter` entity.
@@ -156,27 +149,6 @@ def canonicalize_url(self, url, activity=None, **kwargs):
     url = url.replace('/statuses/', '/status/')
     return super(Twitter, self).canonicalize_url(url, **kwargs)
 
-  def is_blocked(self, obj):
-    """Returns True if an object's author is being blocked.
-
-    ...ie they're in this user's block list."""
-
-    if self.blocked_ids is None:
-      cache_key = 'B %s' % self.bridgy_path()
-      self.blocked_ids = memcache.get(cache_key)
-      if self.blocked_ids is None:
-        try:
-          ids = self.gr_source.get_blocklist_ids()
-        except gr_source.RateLimited as e:
-          ids = e.partial or []
-        self.blocked_ids = ids[:BLOCKLIST_MAX_IDS]
-        memcache.set(cache_key, self.blocked_ids, time=BLOCKLIST_CACHE_TIME)
-
-    for o in [obj] + util.get_list(obj, 'object'):
-      for field in 'author', 'actor':
-        if o.get(field, {}).get('numeric_id') in self.blocked_ids:
-          return True
-
 
 class AuthHandler(util.Handler):
   """Base OAuth handler class."""