-
Notifications
You must be signed in to change notification settings - Fork 95
MAX_SIZE minor bug fix #99
base: master
Are you sure you want to change the base?
Changes from all commits
abe7caf
b40ec53
716c248
874af06
f2869c3
0b0e3c7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,14 +16,12 @@ | |
|
||
__log__ = logging.getLogger(__name__) | ||
|
||
|
||
VALID_TYPES = { | ||
'photo', 'document', 'video', 'audio', 'sticker', 'voice', 'chatphoto' | ||
} | ||
BAR_FORMAT = "{l_bar}{bar}| {n_fmt}/{total_fmt} " \ | ||
"[{elapsed}<{remaining}, {rate_noinv_fmt}{postfix}]" | ||
|
||
|
||
QUEUE_TIMEOUT = 5 | ||
DOWNLOAD_PART_SIZE = 256 * 1024 | ||
|
||
|
@@ -40,6 +38,7 @@ class Downloader: | |
Download dialogs and their associated data, and dump them. | ||
Make Telegram API requests and sleep for the appropriate time. | ||
""" | ||
|
||
def __init__(self, client, config, dumper, loop): | ||
self.client = client | ||
self.loop = loop or asyncio.get_event_loop() | ||
|
@@ -82,9 +81,18 @@ def _check_media(self, media): | |
""" | ||
if not media or not self.max_size: | ||
return False | ||
|
||
if not self.types: | ||
return True | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This bypasses the maximum size check. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes it is if you check the config file, it says "Setting to "0" will not download any media." There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No, that's not what I meant. I mean, if there are no types (i.e. "all media is valid"), the maximum size is not checked at all. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's right, so the file size should be checked before this line! |
||
return export_utils.get_media_type(media) in self.types | ||
|
||
_, size = export_utils.get_file_location(media) | ||
if export_utils.get_media_type(media) in self.types: | ||
if size and size > self.max_size: | ||
return False | ||
else: | ||
return True | ||
|
||
return False | ||
|
||
def _dump_full_entity(self, entity): | ||
""" | ||
|
@@ -221,6 +229,11 @@ async def _download_media(self, media_id, context_id, sender_id, date, | |
'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name, Size ' | ||
'FROM Media WHERE ID = ?', (media_id,) | ||
).fetchone() | ||
|
||
file_size = media_row[6] | ||
if file_size is None or file_size > self.max_size: | ||
return | ||
|
||
# Documents have attributes and they're saved under the "document" | ||
# namespace so we need to split it before actually comparing. | ||
media_type = media_row[3].split('.') | ||
|
@@ -254,6 +267,9 @@ async def _download_media(self, media_id, context_id, sender_id, date, | |
if not ext: | ||
ext = export_utils.get_extension(media_row[4]) | ||
|
||
if isinstance(filename, str): | ||
filename = export_utils.format_filename(filename) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change is irrelevant to the pull request and should not belong here. If you want to make this change use a new pull request. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please let me know how to do this, I think I have to create a new branch and commit this part of code there and then create a new pull request? right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Making a new branch is a bit harder because you have to revert the changes you have made in
But you can look up online how to do it without deleting the fork if you want. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
|
||
# Apply the date to the user format string and then replace the map | ||
formatter['filename'] = filename | ||
filename = date.strftime(self.media_fmt).format_map(formatter) | ||
|
@@ -295,6 +311,7 @@ def progress(saved, total): | |
bar.total += media_row[6] | ||
|
||
self._incomplete_download = filename | ||
|
||
await self.client.download_file( | ||
location, file=filename, file_size=media_row[6], | ||
part_size_kb=DOWNLOAD_PART_SIZE // 1024, | ||
|
@@ -441,9 +458,9 @@ async def start(self, target_id): | |
) | ||
|
||
can_get_participants = ( | ||
isinstance(target_in, types.InputPeerChat) | ||
or (isinstance(target, types.Channel) | ||
and (target.megagroup or target.admin_rights is not None)) | ||
isinstance(target_in, types.InputPeerChat) | ||
or (isinstance(target, types.Channel) | ||
and (target.megagroup or target.admin_rights is not None)) | ||
) | ||
if can_get_participants: | ||
try: | ||
|
@@ -515,7 +532,7 @@ async def start(self, target_id): | |
# the highest ID ("closest" bound we need to reach), stop. | ||
if count < req.limit or req.offset_id <= stop_at: | ||
__log__.debug('Received less messages than limit, done.') | ||
max_id = self.dumper.get_max_message_id(target_id) or 0 # can't have NULL | ||
max_id = self.dumper.get_max_message_id(target_id) or 0 # can't have NULL | ||
self.dumper.save_resume(target_id, stop_at=max_id) | ||
break | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,6 @@ | ||
"""Utility functions for telegram-export which aren't specific to one purpose""" | ||
import mimetypes | ||
import string | ||
|
||
from telethon.tl import types | ||
from urllib.parse import urlparse | ||
|
@@ -259,3 +260,20 @@ def parse_proxy_str(proxy_str): | |
else: | ||
proxy = (proxy_type, host, port) | ||
return proxy | ||
|
||
|
||
def format_filename(s): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This change is irrelevant to the pull request and should not belong here. If you want to make this change use a new pull request. |
||
"""Take a string and return a valid filename constructed from the string. | ||
Uses a whitelist approach: any characters not present in valid_chars are | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This formatting is terrible and not consistent with the rest at all. A simple: """Removes invalid file characters from the input string""" Would be enough. |
||
removed. Also spaces are replaced with underscores. | ||
|
||
Note: this method may produce invalid filenames such as ``, `.` or `..` | ||
When I use this method I prepend a date string like '2009_01_15_19_46_32_' | ||
and append a file extension like '.txt', so I avoid the potential of using | ||
an invalid filename. | ||
|
||
""" | ||
valid_chars = "-_.() %s%s" % (string.ascii_letters, string.digits) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What about people using Chinese, Japanese, Persian or any other language that doesn't have ASCII characters in the file name? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are right, I should have considered that. Thank you for pointing it out; I will fix it. |
||
filename = ''.join(c for c in s if c in valid_chars) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A far better option is to use a regex and replace all "invalid" characters with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That code is much better, I would not normally use those characters in any filename either way. But, I still think this should be a separate pull request. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Me neither. I think for some reason it uses the time for the file name, so the ':' in the time format caused the exception for me. The code I have submitted is working, and using it I downloaded around 12000 docs from two separate groups. |
||
filename = filename.replace(' ', '_') # I don't like spaces in filenames. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We don't care about personal opinions in the code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I just used the code I referred to, so I will write my own code to fix it. Thank you for your comments :) |
||
return filename | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No newline at the end of the file. |
Uh oh!
There was an error while loading. Please reload this page.