expectocode · parasteh · Dec 5, 2018 · Dec 5, 2018 · Dec 5, 2018 · Dec 6, 2018
diff --git a/telegram_export/downloader.py b/telegram_export/downloader.py
@@ -16,14 +16,12 @@
 
 __log__ = logging.getLogger(__name__)
 
-
 VALID_TYPES = {
     'photo', 'document', 'video', 'audio', 'sticker', 'voice', 'chatphoto'
 }
 BAR_FORMAT = "{l_bar}{bar}| {n_fmt}/{total_fmt} " \
              "[{elapsed}<{remaining}, {rate_noinv_fmt}{postfix}]"
 
-
 QUEUE_TIMEOUT = 5
 DOWNLOAD_PART_SIZE = 256 * 1024
 
@@ -40,6 +38,7 @@ class Downloader:
     Download dialogs and their associated data, and dump them.
     Make Telegram API requests and sleep for the appropriate time.
     """
+
     def __init__(self, client, config, dumper, loop):
         self.client = client
         self.loop = loop or asyncio.get_event_loop()
@@ -82,9 +81,18 @@ def _check_media(self, media):
         """
         if not media or not self.max_size:
             return False
+
         if not self.types:
             return True
-        return export_utils.get_media_type(media) in self.types
+
+        _, size = export_utils.get_file_location(media)
+        if export_utils.get_media_type(media) in self.types:
+            if size and size > self.max_size:
+                return False
+            else:
+                return True
+
+        return False
 
     def _dump_full_entity(self, entity):
         """
@@ -221,6 +229,11 @@ async def _download_media(self, media_id, context_id, sender_id, date,
             'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name, Size '
             'FROM Media WHERE ID = ?', (media_id,)
         ).fetchone()
+
+        file_size =  media_row[6]
+        if file_size is None or file_size > self.max_size:
+            return
+
         # Documents have attributes and they're saved under the "document"
         # namespace so we need to split it before actually comparing.
         media_type = media_row[3].split('.')
@@ -254,6 +267,9 @@ async def _download_media(self, media_id, context_id, sender_id, date,
         if not ext:
             ext = export_utils.get_extension(media_row[4])
 
+        if isinstance(filename, str):
+            filename = export_utils.format_filename(filename)
+
         # Apply the date to the user format string and then replace the map
         formatter['filename'] = filename
         filename = date.strftime(self.media_fmt).format_map(formatter)
@@ -295,6 +311,7 @@ def progress(saved, total):
             bar.total += media_row[6]
 
         self._incomplete_download = filename
+
         await self.client.download_file(
             location, file=filename, file_size=media_row[6],
             part_size_kb=DOWNLOAD_PART_SIZE // 1024,
@@ -441,9 +458,9 @@ async def start(self, target_id):
             )
 
             can_get_participants = (
-                isinstance(target_in, types.InputPeerChat)
-                or (isinstance(target, types.Channel)
-                    and (target.megagroup or target.admin_rights is not None))
+                    isinstance(target_in, types.InputPeerChat)
+                    or (isinstance(target, types.Channel)
+                        and (target.megagroup or target.admin_rights is not None))
             )
             if can_get_participants:
                 try:
@@ -515,7 +532,7 @@ async def start(self, target_id):
                 # the highest ID ("closest" bound we need to reach), stop.
                 if count < req.limit or req.offset_id <= stop_at:
                     __log__.debug('Received less messages than limit, done.')
-                    max_id = self.dumper.get_max_message_id(target_id) or 0 # can't have NULL
+                    max_id = self.dumper.get_max_message_id(target_id) or 0  # can't have NULL
                     self.dumper.save_resume(target_id, stop_at=max_id)
                     break
 

diff --git a/telegram_export/utils.py b/telegram_export/utils.py
@@ -1,5 +1,6 @@
 """Utility functions for telegram-export which aren't specific to one purpose"""
 import mimetypes
+import string
 
 from telethon.tl import types
 from urllib.parse import urlparse
@@ -259,3 +260,20 @@ def parse_proxy_str(proxy_str):
     else:
         proxy = (proxy_type, host, port)
     return proxy
+
+
+def format_filename(s):
+    """Return a filesystem-safe name derived from the string ``s``.
+
+    Only ASCII letters, digits, spaces and the characters ``-_.()`` are
+    kept; every other character is dropped.  Spaces that survive the
+    filter are then turned into underscores.
+
+    Note: the result is not guaranteed to be usable on its own: inputs
+    made up solely of dropped characters or dots can yield an empty
+    string, ``.`` or ``..``, so callers should combine it with other
+    path components such as a prefix or a file extension.
+    """
+    allowed = "-_.() %s%s" % (string.ascii_letters, string.digits)
+    kept = [ch for ch in s if ch in allowed]
+    return ''.join(kept).replace(' ', '_')