1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import re
import pytz
import email
import codecs
import mailbox
import datetime
from email.iterators import typed_subpart_iterator
import chardet
def get_charset(message, default='ASCII'):
    """Return the name of a codec that can be used to decode *message*.

    The charset from ``message.get_content_charset()`` is preferred, then
    the one from ``message.get_charset()``.  When neither is set, or when
    the name found is not known to :mod:`codecs`, *default* is returned.

    :param message: object exposing ``get_content_charset()`` and
        ``get_charset()`` (e.g. an ``email.message.Message``).
    :param default: value returned when no usable charset is found.
    :returns: a codec name, or *default*.
    """
    charset = message.get_content_charset() or message.get_charset()
    if not charset:
        return default

    try:
        # ``get_charset()`` yields an ``email.charset.Charset`` object while
        # ``codecs.lookup`` only accepts a codec name; ``str()`` of a
        # ``Charset`` is its codec name.
        charset = str(charset)
        codecs.lookup(charset)
    except (LookupError, TypeError, ValueError):
        # LookupError: unknown codec.  TypeError / ValueError: a name that
        # cannot even be looked up (e.g. embedded NUL, non-ASCII text).
        return default

    return charset
class Message(mailbox.mboxMessage):
    """An mbox message with accessors that return decoded header/body text.

    The accessors turn raw header and payload bytes into unicode text and
    return empty/``None`` results for missing headers instead of raising.
    """

    # Splits a ``Received`` header value on newlines and semicolons.
    RECEIVED_DELIMITER = re.compile('\n|;')

    @staticmethod
    def _to_text(raw, encoding, errors='strict'):
        """Return *raw* as text, decoding it with *encoding* if it is bytes.

        ``None`` becomes an empty string; values that are not byte strings
        are returned unchanged.
        """
        if raw is None:
            return u''
        if isinstance(raw, bytes):
            return raw.decode(encoding, errors)
        return raw

    @classmethod
    def _decode_header_chunk(cls, chunk, encoding, fallback):
        """Decode one ``decode_header`` chunk, trying ever laxer options.

        Order: *encoding*, then *fallback* (both strict), then whatever
        ``chardet`` detects, and finally *fallback* with undecodable bytes
        replaced.
        """
        for candidate in (encoding, fallback):
            try:
                return cls._to_text(chunk, candidate)
            except (UnicodeDecodeError, LookupError):
                continue

        # ``chardet`` may report no encoding at all, or a wrong one.
        detected = chardet.detect(chunk)['encoding']
        if detected:
            try:
                return cls._to_text(chunk, detected)
            except (UnicodeDecodeError, LookupError):
                pass

        return cls._to_text(chunk, fallback, 'replace')

    def get_subject(self):
        """Return the ``Subject`` header decoded to unicode text.

        Each chunk produced by ``email.header.decode_header`` is decoded
        with its declared encoding, falling back to the message charset and
        then to ``chardet`` detection.  A missing header yields ``u''``.
        """
        raw_subject = self['Subject']
        if raw_subject is None:
            # Without this guard the missing header would be decoded as the
            # literal text 'None'.
            return u''

        message_charset = get_charset(self)
        decoded_chunks = [
            self._decode_header_chunk(chunk,
                                      encoding or message_charset,
                                      message_charset)
            for chunk, encoding in email.header.decode_header(raw_subject)
        ]
        return u''.join(decoded_chunks)

    def get_body(self):
        """Return the message body as stripped unicode text.

        For multipart messages only the ``text/plain`` parts are used,
        joined with newlines.  Bytes that cannot be decoded with the
        part's charset are replaced; a missing payload counts as empty.
        """
        message_charset = get_charset(self)

        if not self.is_multipart():
            # Not multipart: the payload is the message body itself.
            payload = self.get_payload(decode=True)
            return self._to_text(payload, message_charset, "replace").strip()

        # Get the plain text version only.
        body = [
            self._to_text(part.get_payload(decode=True),
                          get_charset(part, message_charset),
                          "replace")
            for part in typed_subpart_iterator(self, 'text', 'plain')
        ]
        return u"\n".join(body).strip()

    def get_received_datetime(self):
        """Return the date of the ``Received`` header as an aware datetime.

        :returns: a ``datetime`` in UTC (``pytz.utc``), or ``None`` when
            the header is missing or no date can be parsed from it.
        """
        received = self['Received']
        if received is None:
            return None

        # The time received should be the last element of the header.  Try
        # everything after the final ';' first so that a date folded over
        # several lines stays in one piece, then the last newline/semicolon
        # delimited segment.
        candidates = (
            received.rpartition(';')[2],
            self.RECEIVED_DELIMITER.split(received)[-1],
        )
        for candidate in candidates:
            date_tuple = email.utils.parsedate_tz(candidate.strip())
            if date_tuple is None:
                continue
            utc_timestamp = email.utils.mktime_tz(date_tuple)
            return datetime.datetime.fromtimestamp(utc_timestamp, pytz.utc)

        return None

    def get_from_addr(self):
        """Return ``(real_name, address)`` parsed from the ``From`` header.

        *real_name* is decoded to unicode text with undecodable bytes
        replaced; both items are empty when the header is missing.
        """
        real_name_raw, from_ = email.utils.parseaddr(self['From'] or '')

        # NOTE(review): only the first decoded chunk is used, so a name that
        # mixes encoded and plain words is truncated - confirm whether
        # callers need the full name.
        name_chunk, encoding = email.header.decode_header(real_name_raw)[0]
        try:
            real_name = self._to_text(name_chunk, encoding or 'ascii',
                                      'replace')
        except LookupError:
            # The header declared a charset Python does not know.
            real_name = self._to_text(name_chunk, 'ascii', 'replace')

        return real_name, from_