diff --git a/telegram_export/downloader.py b/telegram_export/downloader.py index bf41cc3..b3ca70f 100755 --- a/telegram_export/downloader.py +++ b/telegram_export/downloader.py @@ -16,14 +16,12 @@ __log__ = logging.getLogger(__name__) - VALID_TYPES = { 'photo', 'document', 'video', 'audio', 'sticker', 'voice', 'chatphoto' } BAR_FORMAT = "{l_bar}{bar}| {n_fmt}/{total_fmt} " \ "[{elapsed}<{remaining}, {rate_noinv_fmt}{postfix}]" - QUEUE_TIMEOUT = 5 DOWNLOAD_PART_SIZE = 256 * 1024 @@ -40,6 +38,7 @@ class Downloader: Download dialogs and their associated data, and dump them. Make Telegram API requests and sleep for the appropriate time. """ + def __init__(self, client, config, dumper, loop): self.client = client self.loop = loop or asyncio.get_event_loop() @@ -82,9 +81,18 @@ def _check_media(self, media): """ if not media or not self.max_size: return False + if not self.types: return True - return export_utils.get_media_type(media) in self.types + + _, size = export_utils.get_file_location(media) + if export_utils.get_media_type(media) in self.types: + if size and size > self.max_size: + return False + else: + return True + + return False def _dump_full_entity(self, entity): """ @@ -221,6 +229,11 @@ async def _download_media(self, media_id, context_id, sender_id, date, 'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name, Size ' 'FROM Media WHERE ID = ?', (media_id,) ).fetchone() + + file_size = media_row[6] + if file_size is None or file_size > self.max_size: + return + # Documents have attributes and they're saved under the "document" # namespace so we need to split it before actually comparing. media_type = media_row[3].split('.') @@ -254,6 +267,9 @@ async def _download_media(self, media_id, context_id, sender_id, date, if not ext: ext = export_utils.get_extension(media_row[4]) + if isinstance(filename, str): + filename = export_utils.format_filename(filename) + # Apply the date to the user format string and then replace the map formatter['filename'] = filename filename = date.strftime(self.media_fmt).format_map(formatter) @@ -295,6 +311,7 @@ def progress(saved, total): bar.total += media_row[6] self._incomplete_download = filename + await self.client.download_file( location, file=filename, file_size=media_row[6], part_size_kb=DOWNLOAD_PART_SIZE // 1024, @@ -441,9 +458,9 @@ async def start(self, target_id): ) can_get_participants = ( - isinstance(target_in, types.InputPeerChat) - or (isinstance(target, types.Channel) - and (target.megagroup or target.admin_rights is not None)) + isinstance(target_in, types.InputPeerChat) + or (isinstance(target, types.Channel) + and (target.megagroup or target.admin_rights is not None)) ) if can_get_participants: try: @@ -515,7 +532,7 @@ async def start(self, target_id): # the highest ID ("closest" bound we need to reach), stop. if count < req.limit or req.offset_id <= stop_at: __log__.debug('Received less messages than limit, done.') - max_id = self.dumper.get_max_message_id(target_id) or 0 # can't have NULL + max_id = self.dumper.get_max_message_id(target_id) or 0 # can't have NULL self.dumper.save_resume(target_id, stop_at=max_id) break diff --git a/telegram_export/utils.py b/telegram_export/utils.py index 851c888..05c030c 100644 --- a/telegram_export/utils.py +++ b/telegram_export/utils.py @@ -1,5 +1,6 @@ """Utility functions for telegram-export which aren't specific to one purpose""" import mimetypes +import string from telethon.tl import types from urllib.parse import urlparse @@ -259,3 +260,20 @@ def parse_proxy_str(proxy_str): else: proxy = (proxy_type, host, port) return proxy + + +def format_filename(s): + """Take a string and return a valid filename constructed from the string. +Uses a whitelist approach: any characters not present in valid_chars are +removed. Also spaces are replaced with underscores. + +Note: this method may produce invalid filenames such as ``, `.` or `..` +When I use this method I prepend a date string like '2009_01_15_19_46_32_' +and append a file extension like '.txt', so I avoid the potential of using +an invalid filename. + +""" + valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) + filename = ''.join(c for c in s if c in valid_chars) + filename = filename.replace(' ', '_') # I don't like spaces in filenames. + return filename \ No newline at end of file