Skip to content

Commit 4033e9e

Browse files
Merge pull request #455 from TeamMsgExtractor/next-release
Version 0.53.2
2 parents 207bfb9 + 4cd0e7b commit 4033e9e

File tree

9 files changed

+60
-16
lines changed

9 files changed

+60
-16
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1+
**v0.53.2**
2+
* [[TeamMsgExtractor #452](https://github.com/TeamMsgExtractor/msg-extractor/issues/452)] Adjusted code to allow html encoding to be cached to try to speed up `bs4` operations.
3+
* [[TeamMsgExtractor #453](https://github.com/TeamMsgExtractor/msg-extractor/issues/453)] Fixed the handler for overly large filetimes so that a filetime too large to be represented no longer breaks the handler.
4+
* Fixed a bug that would cause an error in task objects due to a lack of `enumerate`.
5+
* Fix `TOCEntry` not initializing `DVTargetDevice` correctly.
6+
* Add temporary properties for `ContentID` to `SignedAttachment`. AFAIK these can't ever be set, but this prevents errors in some places.
7+
18
**v0.53.1**
29
* Expanded allowable range for `red-black-tree-mod`.
310
* Fix issue with `MessageBase.asEmailMessage()` that prevented embedded MSG files from being attached.

README.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,8 +260,8 @@ your access to the newest major version of extract-msg.
260260
.. |License: GPL v3| image:: https://img.shields.io/badge/License-GPLv3-blue.svg
261261
:target: LICENSE.txt
262262

263-
.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.1-blue.svg
264-
:target: https://pypi.org/project/extract-msg/0.53.1/
263+
.. |PyPI3| image:: https://img.shields.io/badge/pypi-0.53.2-blue.svg
264+
:target: https://pypi.org/project/extract-msg/0.53.2/
265265

266266
.. |PyPI2| image:: https://img.shields.io/badge/python-3.8+-brightgreen.svg
267267
:target: https://www.python.org/downloads/release/python-3810/

extract_msg/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727
# along with this program. If not, see <http://www.gnu.org/licenses/>.
2828

2929
__author__ = 'Destiny Peterson & Matthew Walker'
30-
__date__ = '2025-02-05'
31-
__version__ = '0.53.1'
30+
__date__ = '2025-03-14'
31+
__version__ = '0.53.2'
3232

3333
__all__ = [
3434
# Modules:

extract_msg/attachments/signed_att.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def __init__(self, msg, data: bytes, name: str, mimetype: str, node: email.messa
4545
self.__node = node
4646
self.__treePath = msg.treePath + [makeWeakRef(self)]
4747

48-
self.__data = None
48+
self.__data = b''
4949
# To add support for embedded MSG files, we are going to completely
5050
# ignore the mimetype and just do a few simple checks to see if we can
5151
# use the bytes as an embedded file.
@@ -59,7 +59,7 @@ def __init__(self, msg, data: bytes, name: str, mimetype: str, node: email.messa
5959
except Exception:
6060
logger.exception('Signed message was an OLE file, but could not be read as an MSG file due to an exception.')
6161

62-
if self.__data is None:
62+
if not self.__data:
6363
self.__data = data
6464

6565
def _handleFnc(self, _zip, filename, customPath: pathlib.Path, kwargs) -> pathlib.Path:
@@ -205,6 +205,12 @@ def saveEmbededMessage(self, **kwargs) -> constants.SAVE_TYPE:
205205
def asBytes(self) -> bytes:
206206
return self.__asBytes
207207

208+
@property
209+
def contentID(self) -> None:
210+
return None
211+
212+
cid = contentID
213+
208214
@property
209215
def data(self) -> Union[bytes, MSGFile]:
210216
"""

extract_msg/msg_classes/message_base.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ def __init__(self, path, **kwargs):
9696
except Exception as e:
9797
# Prevent an error in the body from preventing opening.
9898
logger.exception('Critical error accessing the body. File opened but accessing the body will throw an exception.')
99+
self._htmlEncoding = None
99100
except:
100101
try:
101102
self.close()
@@ -142,6 +143,16 @@ def _genRecipient(self, recipientStr: str, recipientType: RecipientType) -> Opti
142143

143144
return value
144145

146+
def _getHtmlEncoding(self, soup: bs4.BeautifulSoup) -> None:
147+
"""
148+
Helper function to set the html encoding.
149+
"""
150+
if not self._htmlEncoding:
151+
try:
152+
self._htmlEncoding = cast(Optional[str], soup.original_encoding or soup.declared_html_encoding)
153+
except AttributeError:
154+
pass
155+
145156
def asEmailMessage(self) -> EmailMessage:
146157
"""
147158
Returns an instance of EmailMessage used to represent the contents of
@@ -380,7 +391,8 @@ def getSaveHtmlBody(self, preparedHtml: bool = False, charset: str = 'utf-8', **
380391

381392
# If we are preparing the HTML, then we should make sure it has a Content-Type meta tag.
382393
if preparedHtml and charset:
383-
bs = bs4.BeautifulSoup(data, features = 'html.parser')
394+
bs = bs4.BeautifulSoup(data, features = 'html.parser', from_encoding = self._htmlEncoding)
395+
self._getHtmlEncoding(bs)
384396
if not bs.find('meta', {'http-equiv': 'Content-Type'}):
385397
# Setup the attributes for the tag.
386398
tagAttrs = {
@@ -405,7 +417,7 @@ def getSaveHtmlBody(self, preparedHtml: bool = False, charset: str = 'utf-8', **
405417

406418
return data
407419
else:
408-
return self.htmlBody
420+
return self.htmlBody or b''
409421

410422
def getSavePdfBody(self, wkPath = None, wkOptions = None, **kwargs) -> bytes:
411423
"""
@@ -501,7 +513,7 @@ def injectHtmlHeader(self, prepared: bool = False) -> bytes:
501513
body = self.htmlBody
502514

503515
# Validate the HTML.
504-
if not validateHtml(body):
516+
if not validateHtml(body, self._htmlEncoding):
505517
logger.warning('HTML body failed to validate. Code will attempt to correct it.')
506518

507519
# If we are here, then we need to do what we can to fix the HTML
@@ -511,7 +523,8 @@ def injectHtmlHeader(self, prepared: bool = False) -> bytes:
511523
# the <html> and <body> tag are missing, we determine where to put
512524
# the body tag (around everything if there is no <head> tag,
513525
# otherwise at the end) and then wrap it all in the <html> tag.
514-
parser = bs4.BeautifulSoup(body, features = 'html.parser')
526+
parser = bs4.BeautifulSoup(body, features = 'html.parser', from_encoding = self._htmlEncoding)
527+
self._getHtmlEncoding(parser)
515528
if not parser.find('html') and not parser.find('body'):
516529
if parser.find('head') or parser.find('footer'):
517530
# Create the parser we will be using for the corrections.
@@ -1186,7 +1199,8 @@ def htmlBodyPrepared(self) -> Optional[bytes]:
11861199
return self.htmlBody
11871200

11881201
# Create the BeautifulSoup instance to use.
1189-
soup = bs4.BeautifulSoup(self.htmlBody, 'html.parser')
1202+
soup = bs4.BeautifulSoup(self.htmlBody, 'html.parser', from_encoding = self._htmlEncoding)
1203+
self._getHtmlEncoding(soup)
11901204

11911205
# Get a list of image tags to see if we can inject into. If the source
11921206
# of an image starts with "cid:" that means it is one of the attachments

extract_msg/msg_classes/task_request.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def taskObject(self) -> Optional[Task]:
6363
# The task object MUST be the first attachment, but we will be
6464
# lenient and allow it to be in any position. It not existing,
6565
# however, will not be tolerated.
66-
task = next(((index, att) for index, att in self.attachments if isinstance(att.data, Task)), None)
66+
task = next(((index, att) for index, att in enumerate(self.attachments) if isinstance(att.data, Task)), None)
6767

6868
if task is None:
6969
if ErrorBehavior.STANDARDS_VIOLATION in self.errorBehavior:

extract_msg/structures/toc_entry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ def __init__(self, reader: Optional[Union[bytes, BytesReader]] = None):
2020
self.__lindex = 0
2121
self.__tymed = 0
2222
self.__advf = 0
23-
self.__targetDevice = DVTargetDevice()
23+
self.__targetDevice = DVTargetDevice(None)
2424
return
2525

2626
if isinstance(reader, bytes):

extract_msg/utils.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -295,7 +295,14 @@ def filetimeToDatetime(rawTime: int) -> datetime.datetime:
295295
# Just make null dates from all of these time stamps.
296296
from .null_date import NullDate
297297
date = NullDate(1970, 1, 1, 1)
298-
date += datetime.timedelta(seconds = filetimeToUtc(rawTime))
298+
try:
299+
date += datetime.timedelta(seconds = filetimeToUtc(rawTime))
300+
except OverflowError:
301+
# Time value is so large we physically can't represent it, so
302+
# let's just modify the date to its highest possible value and
303+
# call it a day.
304+
m = date.max
305+
date = NullDate(m.year, m.month, m.day, m.hour, m.minute, m.second, m.microsecond)
299306
date.filetime = rawTime
300307

301308
return date
@@ -1241,14 +1248,14 @@ def unwrapMultipart(mp: Union[bytes, str, email.message.Message]) -> Dict:
12411248
}
12421249

12431250

1244-
def validateHtml(html: bytes) -> bool:
1251+
def validateHtml(html: bytes, encoding: Optional[str]) -> bool:
12451252
"""
12461253
Checks whether the HTML is considered valid.
12471254
12481255
To be valid, the HTML must, at minimum, contain an ``<html>`` tag, a
12491256
``<body>`` tag, and closing tags for each.
12501257
"""
1251-
bs = bs4.BeautifulSoup(html, 'html.parser')
1258+
bs = bs4.BeautifulSoup(html, 'html.parser', from_encoding = encoding)
12521259
if not bs.find('html') or not bs.find('body'):
12531260
return False
12541261
return True

extract_msg_tests/prop_tests.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,16 @@
207207
PropertyFlags.MANDATORY,
208208
NULL_DATE
209209
),
210+
(
211+
'Null Time 4',
212+
b'\x40\x00\x1C\x30\x06\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f',
213+
b'\x40\x00\x1C\x30\x06\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\x7f',
214+
FixedLengthProp,
215+
'301C0040',
216+
0x0040,
217+
PropertyFlags.READABLE | PropertyFlags.WRITABLE,
218+
NULL_DATE
219+
),
210220
# Variable Length Props.
211221
(
212222
'Object',

0 commit comments

Comments
 (0)