Skip to content
This repository has been archived by the owner on Apr 26, 2024. It is now read-only.

Fix URL preview bugs (type error when loading cache from db, content-type including quotes) #4157

Merged
merged 6 commits into from
Nov 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changelog.d/4157.bugfix
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Loading URL previews from the DB cache on Postgres will no longer cause Unicode type errors when responding to the request, and URL previews will no longer fail if the remote server returns a Content-Type header with the chartype in quotes.
8 changes: 4 additions & 4 deletions synapse/http/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,13 +468,13 @@ def set_cors_headers(request):
Args:
request (twisted.web.http.Request): The http request to add CORs to.
"""
request.setHeader("Access-Control-Allow-Origin", "*")
request.setHeader(b"Access-Control-Allow-Origin", b"*")
request.setHeader(
"Access-Control-Allow-Methods", "GET, POST, PUT, DELETE, OPTIONS"
b"Access-Control-Allow-Methods", b"GET, POST, PUT, DELETE, OPTIONS"
)
request.setHeader(
"Access-Control-Allow-Headers",
"Origin, X-Requested-With, Content-Type, Accept, Authorization"
b"Access-Control-Allow-Headers",
b"Origin, X-Requested-With, Content-Type, Accept, Authorization"
)


Expand Down
22 changes: 16 additions & 6 deletions synapse/rest/media/v1/preview_url_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import cgi
import datetime
import errno
Expand All @@ -24,6 +25,7 @@
import sys
import traceback

import six
from six import string_types
from six.moves import urllib_parse as urlparse

Expand Down Expand Up @@ -98,7 +100,7 @@ def _async_render_GET(self, request):
# XXX: if get_user_by_req fails, what should we do in an async render?
requester = yield self.auth.get_user_by_req(request)
url = parse_string(request, "url")
if "ts" in request.args:
if b"ts" in request.args:
ts = parse_integer(request, "ts")
else:
ts = self.clock.time_msec()
Expand Down Expand Up @@ -180,7 +182,12 @@ def _do_preview(self, url, user, ts):
cache_result["expires_ts"] > ts and
cache_result["response_code"] / 100 == 2
):
defer.returnValue(cache_result["og"])
# It may be stored as text in the database, not as bytes (such as
# PostgreSQL). If so, encode it back before handing it on.
og = cache_result["og"]
if isinstance(og, six.text_type):
og = og.encode('utf8')
defer.returnValue(og)
return

media_info = yield self._download_url(url, user)
Expand Down Expand Up @@ -213,14 +220,17 @@ def _do_preview(self, url, user, ts):
elif _is_html(media_info['media_type']):
# TODO: somehow stop a big HTML tree from exploding synapse's RAM

file = open(media_info['filename'])
body = file.read()
file.close()
with open(media_info['filename'], 'rb') as file:
body = file.read()

# clobber the encoding from the content-type, or default to utf-8
# XXX: this overrides any <meta/> or XML charset headers in the body
# which may pose problems, but so far seems to work okay.
match = re.match(r'.*; *charset=(.*?)(;|$)', media_info['media_type'], re.I)
match = re.match(
r'.*; *charset="?(.*?)"?(;|$)',
media_info['media_type'],
re.I
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this just a style change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No - the bugfix is the "?s on either side which will strip quotes from charset="utf-8", which used to break us. I had to reformat it as the line got too long.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aaaaaaaaah, I'm blind. Cool 👍

encoding = match.group(1) if match else "utf-8"

og = decode_and_calc_og(body, media_info['uri'], encoding)
Expand Down
164 changes: 164 additions & 0 deletions tests/rest/media/v1/test_url_preview.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
# -*- coding: utf-8 -*-
# Copyright 2018 New Vector Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

from mock import Mock

from twisted.internet.defer import Deferred

from synapse.config.repository import MediaStorageProviderConfig
from synapse.util.module_loader import load_module

from tests import unittest


class URLPreviewTests(unittest.HomeserverTestCase):

hijack_auth = True
user_id = "@test:user"

def make_homeserver(self, reactor, clock):

self.storage_path = self.mktemp()
os.mkdir(self.storage_path)

config = self.default_config()
config.url_preview_enabled = True
config.max_spider_size = 9999999
config.url_preview_url_blacklist = []
config.media_store_path = self.storage_path

provider_config = {
"module": "synapse.rest.media.v1.storage_provider.FileStorageProviderBackend",
"store_local": True,
"store_synchronous": False,
"store_remote": True,
"config": {"directory": self.storage_path},
}

loaded = list(load_module(provider_config)) + [
MediaStorageProviderConfig(False, False, False)
]

config.media_storage_providers = [loaded]

hs = self.setup_test_homeserver(config=config)

return hs

def prepare(self, reactor, clock, hs):

self.fetches = []

def get_file(url, output_stream, max_size):
"""
Returns tuple[int,dict,str,int] of file length, response headers,
absolute URI, and response code.
"""

def write_to(r):
data, response = r
output_stream.write(data)
return response

d = Deferred()
d.addCallback(write_to)
self.fetches.append((d, url))
return d

client = Mock()
client.get_file = get_file

self.media_repo = hs.get_media_repository_resource()
preview_url = self.media_repo.children[b'preview_url']
preview_url.client = client
self.preview_url = preview_url

def test_cache_returns_correct_type(self):

request, channel = self.make_request(
"GET", "url_preview?url=matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()

# We've made one fetch
self.assertEqual(len(self.fetches), 1)

end_content = (
b'<html><head>'
b'<meta property="og:title" content="~matrix~" />'
b'<meta property="og:description" content="hi" />'
b'</head></html>'
)

self.fetches[0][0].callback(
(
end_content,
(
len(end_content),
{
b"Content-Length": [b"%d" % (len(end_content))],
b"Content-Type": [b'text/html; charset="utf8"'],
},
"https://example.com",
200,
),
)
)

self.pump()
self.assertEqual(channel.code, 200)
self.assertEqual(
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
)

# Check the cache returns the correct response
request, channel = self.make_request(
"GET", "url_preview?url=matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()

# Only one fetch, still, since we'll lean on the cache
self.assertEqual(len(self.fetches), 1)

# Check the cache response has the same content
self.assertEqual(channel.code, 200)
self.assertEqual(
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
)

# Clear the in-memory cache
self.assertIn("matrix.org", self.preview_url._cache)
self.preview_url._cache.pop("matrix.org")
self.assertNotIn("matrix.org", self.preview_url._cache)

# Check the database cache returns the correct response
request, channel = self.make_request(
"GET", "url_preview?url=matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()

# Only one fetch, still, since we'll lean on the cache
self.assertEqual(len(self.fetches), 1)

# Check the cache response has the same content
self.assertEqual(channel.code, 200)
self.assertEqual(
channel.json_body, {"og:title": "~matrix~", "og:description": "hi"}
)
2 changes: 2 additions & 0 deletions tests/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def writeHeaders(self, version, code, reason, headers):
self.result["headers"] = headers

def write(self, content):
assert isinstance(content, bytes), "Should be bytes! " + repr(content)

if "body" not in self.result:
self.result["body"] = b""

Expand Down