import re

from django.utils.html import strip_tags
from html2text import html2text

# Characters stripped from a line before deciding whether it carries any
# content worth classifying (see EmailBlockParser.context_switch).
EXTENDED_PUNCTUATION = '!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

# Matches text that opens with an HTML tag and ends with a closing tag.
# NOTE(review): the closing-tag part of this pattern was missing in the
# previous revision (it read ``.*]*>$``); it has been reconstructed here --
# confirm against version control.
RE_WRAPPED_BY_HTML = re.compile(
    r'^<[a-z]+[^>]*>.*</[a-z]+[^>]*>$',
    re.MULTILINE | re.IGNORECASE | re.DOTALL,
)

# Captures http/https URLs up to the next whitespace character or "<".
# The named group is required: a bare ``(?P`` is not a valid regex and made
# the module fail at import time.
RE_LINKS = re.compile(r'(?P<link>https?://[^ \t\r\n\<]+)')

# Replacement template for RE_LINKS. The backslash is escaped explicitly so
# the literal is valid on both Python 2 and Python 3.
# NOTE(review): the original anchor attributes were lost; confirm whether
# extra attributes (e.g. ``target``) are expected by the templates.
# NOTE(review): the matched URL is inserted unescaped and may contain a
# double quote -- verify callers sanitise the rendered text.
LINK_MARKUP = u'<a href="\\g<link>">\\g<link></a>'

# Leading quote markers of a reply line, e.g. "> > text".
RE_REPLY_LINE = re.compile(r'^[\s\t>]*>[\s\t]*')

# Any spelling of a <br> tag (<br>, <br/>, </br>, with optional spaces).
RE_BR_TO_LINEBREAK = re.compile(r'<\s*/?\s*br\s*/?\s*>')


class EmailBlock(list):
    """A list of consecutive email lines sharing one reply state.

    The items are the raw line strings; the ``text`` property joins and
    renders them.
    """

    def __init__(self, is_reply=False, mark_links=True, html2text=True):
        """Create an empty block.

        :param is_reply: whether the lines in this block are quoted text.
        :param mark_links: if true, ``text`` wraps URLs with LINK_MARKUP.
        :param html2text: if true, ``text`` converts HTML to plain text
            first. (The parameter name shadows the imported function only
            inside this method.)
        """
        super(EmailBlock, self).__init__()
        self.is_reply = is_reply
        self.mark_links = mark_links
        self.html2text = html2text

    def _html2text(self, text):
        """Return ``text`` with its HTML markup removed.

        Text fully wrapped in an HTML element is handed to the
        ``html2text`` library; otherwise ``<br>`` tags become newlines and
        the remaining tags are stripped.
        """
        if RE_WRAPPED_BY_HTML.match(text.strip()):
            return html2text(text)

        with_linebreaks = RE_BR_TO_LINEBREAK.sub('\n', text)
        return strip_tags(with_linebreaks)

    def _mark_links(self, text):
        """Return ``text`` with every URL replaced using LINK_MARKUP."""
        return RE_LINKS.sub(LINK_MARKUP, text)

    @property
    def text(self):
        """The block's lines joined and rendered per the instance flags."""
        rendered = u''.join(self)

        # Order matters: tags are removed before links are marked up, so
        # the anchors added by _mark_links are not stripped again.
        if self.html2text:
            rendered = self._html2text(rendered)

        if self.mark_links:
            rendered = self._mark_links(rendered)

        return rendered

    def __unicode__(self):
        return self.text

    def __str__(self):
        return self.text


class EmailBlockParser(list):
    """Split an email body into a list of EmailBlock objects.

    Consecutive blocks alternate between non-reply and reply state; the
    first block is always created as non-reply.
    """

    def __init__(self, email):
        """Parse ``email.body`` line by line into blocks.

        :param email: object exposing ``body``, ``received_time`` and
            ``thread.message_set`` (presumably a Django model instance --
            verify against the caller).
        """
        super(EmailBlockParser, self).__init__()
        self.email = email
        self.thread_emails = email.thread.message_set

        current = EmailBlock()
        for line in email.body.split('\n'):
            if self.context_switch(line, current):
                # Close the running block and open one of the opposite kind.
                self.append(current)
                current = EmailBlock(is_reply=not current.is_reply)
            current.append(line + '\n')

        self.append(current)

    def context_switch(self, line, block):
        """Return True if ``line`` belongs to the opposite kind of block.

        Lines made only of punctuation/whitespace never cause a switch;
        they stay in whatever block is currently open.
        """
        if not line.strip(EXTENDED_PUNCTUATION):
            return False

        return self.is_reply(line) != bool(block.is_reply)

    def is_reply(self, line):
        """Return True if ``line`` looks like quoted text.

        A line is a reply when it starts with a ``>`` marker, or when its
        text is contained in the body of a message in the same thread that
        was received earlier than this email.
        """
        stripped_line = line.strip()
        if stripped_line.startswith('>') or RE_REPLY_LINE.match(line):
            return True

        clean_line = RE_REPLY_LINE.sub('', stripped_line)
        # NOTE: this issues one lookup per unmarked line with content.
        earlier_matches = self.thread_emails.filter(
            received_time__lt=self.email.received_time,
            body__contains=clean_line,
        ).order_by('-received_time')

        return bool(earlier_matches[:1])