Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 21 additions & 19 deletions email_reply_parser/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ class EmailReplyParser(object):
""" Represents a email message that is parsed.
"""
def __init__(self, language='en'):
    """ Loads the per-language keyword table and stores the language.

    language: key used to index the table loaded from
    languages_support.json (defaults to 'en').
    """
    # Resolve the data file relative to this module inside the method, so
    # the lookup does not rely on a ``dir_path`` name being defined in some
    # outer scope (none is visible here) or on the working directory.
    json_path = os.path.join(os.path.dirname(__file__), "languages_support.json")
    # Load once per parser; the table is handed to every EmailMessage so
    # the file is not re-read for each parsed message.
    with open(json_path, "r") as read_file:
        self.words_map = json.load(read_file)
    self.language = language

def read(self, text):
Expand All @@ -21,7 +23,7 @@ def read(self, text):

Returns an EmailMessage instance
"""
return EmailMessage(text, self.language).read()
return EmailMessage(text, self.language, self.words_map).read()

def parse_reply(self, text):
""" Provides the reply portion of email.
Expand All @@ -36,7 +38,7 @@ def parse_reply(self, text):
class EmailMessage(object):
""" An email message represents a parsed email body.
"""
def __init__(self, text, language):
def __init__(self, text, language, words_map):
self.fragments = []
self.fragment = None
self.text = text.replace('\r\n', '\n')
Expand All @@ -49,36 +51,36 @@ def __init__(self, text, language):
self.MULTI_QUOTE_HDR_REGEX = None
self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None
dir_path = os.path.dirname(__file__)
with open(dir_path + "/languages_support.json", "r") as read_file:
self.words_diff_languages = json.load(read_file)
self.words_map = words_map
self.language = language
self.set_regex()

def default_quoted_header(self):
    """ Installs the generic quoted-line and header-line patterns.

    Sets self.QUOTED_REGEX (one or more '>') and self.HEADER_REGEX, which
    matches an optionally '*'-wrapped 'From' / 'Sent' / 'To' / 'Subject'
    keyword (taken from self.words_map for self.language) followed by
    ':' , a space and at least one more character.

    Raises KeyError if the language or one of the keywords is missing
    from self.words_map.
    """
    words = self.words_map[self.language]
    self.QUOTED_REGEX = re.compile(r'(>+)')
    header_words = '|'.join(
        words[key] for key in ('From', 'Sent', 'To', 'Subject')
    )
    # Raw strings throughout: '\*' in a plain literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    self.HEADER_REGEX = re.compile(r'^\*?(' + header_words + r'):\*? .+')

def nl_support(self):
    """ Installs the Dutch-specific patterns.

    Sets self.SIG_REGEX (signature delimiters or the 'Sent from' phrase
    from self.words_map followed by one to three words),
    self.QUOTE_HDR_REGEX ('Op ... schreef ... >:' at end of line) and the
    uncompiled self._MULTI_QUOTE_HDR_REGEX source string, and delegates
    the quoted/header patterns to self.default_quoted_header().
    """
    sent_from = self.words_map[self.language]['Sent from']
    # The tail must be a raw string: '\w' and '\s' are not valid escapes
    # in a plain string literal.
    self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + sent_from + r'(\w+\s*){1,3})')
    self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$')
    self.default_quoted_header()
    self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)'

def de_support(self):
    """ Installs the German-specific patterns.

    Sets self.SIG_REGEX, self.QUOTE_HDR_REGEX ('Am ... schrieb ... >:' at
    end of line), self.QUOTED_REGEX, self.HEADER_REGEX and the uncompiled
    self._MULTI_QUOTE_HDR_REGEX source string. The header pattern accepts
    the 'From' / 'Sent' / 'To' / 'Subject' keywords from self.words_map
    for self.language, plus the literal 'mailto'.
    """
    words = self.words_map[self.language]
    # Raw tail: '\w' / '\s' are invalid escapes in a plain string literal.
    self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + words['Sent from'] + r'(\w+\s*){1,3})')
    self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$')
    self.QUOTED_REGEX = re.compile(r'(>+)')
    # Build the alternation explicitly instead of relying on implicit
    # adjacent-literal concatenation to glue 'mailto' to the closing part.
    header_words = [words['From'], words['Sent'], words['To'], words['Subject'], 'mailto']
    self.HEADER_REGEX = re.compile(r'^\*?(' + '|'.join(header_words) + r'):\*? .+')
    self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)'
Expand All @@ -87,18 +89,18 @@ def en_support(self):
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
self.QUOTED_REGEX = re.compile(r'(>+)')
self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject|mailto):\*? .+')
self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'

def set_regex(self):
    """ Selects and compiles the patterns for self.language.

    If the instance has a '<language>_support' method it is called;
    otherwise generic patterns are built from the 'Sent from' and 'wrote'
    entries of self.words_map for that language and
    self.default_quoted_header() supplies the quoted/header patterns.
    In both cases self._MULTI_QUOTE_HDR_REGEX is then compiled into
    self.MULTI_QUOTE_HDR_REGEX and self.MULTI_QUOTE_HDR_REGEX_MULTILINE.
    """
    support_name = self.language + "_support"
    if hasattr(self, support_name):
        getattr(self, support_name)()
    else:
        words = self.words_map[self.language]
        wrote = words['wrote']
        # Raw fragments: '\w' / '\s' are invalid escapes in plain literals.
        self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + words['Sent from'] + r'(\w+\s*){1,3})')
        self.QUOTE_HDR_REGEX = re.compile('.*' + wrote + ':$')
        self.default_quoted_header()
        # NOTE(review): the leading 'On' is hard-coded even for languages
        # handled by this generic branch -- confirm this is intended.
        self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + wrote + r':)(On\s(.+?)' + wrote + ':)'
    # The language-specific methods only set the pattern source string, so
    # compilation has to happen after either branch.
    self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
    self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL)

Expand Down