Skip to content

Commit 54ef8f5

Browse files
authored
Merge pull request #1 from atc0m/language-support
language support
2 parents 0c0b73a + 2517a41 commit 54ef8f5

File tree

2 files changed

+230
-17
lines changed

2 files changed

+230
-17
lines changed

email_reply_parser/__init__.py

Lines changed: 68 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,105 @@
11
"""
22
email_reply_parser is a python library port of GitHub's Email Reply Parser.
33
4-
For more information, visit https://github.com/zapier/email-reply-parser
4+
For more information, visit https://github.com/zapier/email_reply_parser
55
"""
66

77
import re
8+
import json
89

910

1011
class EmailReplyParser(object):
1112
""" Represents an email message that is parsed.
1213
"""
14+
def __init__(self, language='en'):
15+
self.language = language
1316

14-
@staticmethod
15-
def read(text):
17+
def read(self, text):
1618
""" Factory method that splits email into list of fragments
1719
1820
text - A string email body
1921
2022
Returns an EmailMessage instance
2123
"""
22-
return EmailMessage(text).read()
24+
return EmailMessage(text, self.language).read()
2325

24-
@staticmethod
25-
def parse_reply(text):
26+
def parse_reply(self, text):
2627
""" Provides the reply portion of email.
2728
2829
text - A string email body
2930
3031
Returns reply body message
3132
"""
32-
return EmailReplyParser.read(text).reply
33+
return self.read(text).reply
3334

3435

3536
class EmailMessage(object):
3637
""" An email message represents a parsed email body.
3738
"""
38-
39-
SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
40-
QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
41-
QUOTED_REGEX = re.compile(r'(>+)')
42-
HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
43-
_MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
44-
MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
45-
MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL)
46-
47-
def __init__(self, text):
39+
def __init__(self, text, language):
4840
self.fragments = []
4941
self.fragment = None
5042
self.text = text.replace('\r\n', '\n')
5143
self.found_visible = False
44+
self.SIG_REGEX = None
45+
self.QUOTE_HDR_REGEX = None
46+
self.QUOTED_REGEX = None
47+
self.HEADER_REGEX = None
48+
self._MULTI_QUOTE_HDR_REGEX = None
49+
self.MULTI_QUOTE_HDR_REGEX = None
50+
self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None
51+
with open("languages_support.json", "r") as read_file:
52+
self.words_diff_languages = json.load(read_file)
53+
self.language = language
54+
self.set_regex()
55+
56+
def default_quoted_header(self):
57+
self.QUOTED_REGEX = re.compile(r'(>+)')
58+
self.HEADER_REGEX = re.compile(
59+
r'^\*?(' + self.words_diff_languages[self.language]['From'] +
60+
'|' + self.words_diff_languages[self.language]['Sent'] +
61+
'|' + self.words_diff_languages[self.language]['To'] +
62+
'|' + self.words_diff_languages[self.language]['Subject'] +
63+
'):\*? .+'
64+
)
65+
66+
def nl_support(self):
67+
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})')
68+
self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$')
69+
self.default_quoted_header()
70+
self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)'
71+
72+
def de_support(self):
73+
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})')
74+
self.QUOTE_HDR_REGEX = re.compile('Am.*schrieb.*>:$')
75+
self.QUOTED_REGEX = re.compile(r'(>+)')
76+
self.HEADER_REGEX = re.compile(
77+
r'^\*?(' + self.words_diff_languages[self.language]['From'] +
78+
'|' + self.words_diff_languages[self.language]['Sent'] +
79+
'|' + self.words_diff_languages[self.language]['To'] +
80+
'|' + self.words_diff_languages[self.language]['Subject'] +
81+
'):\*? .+'
82+
)
83+
self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*>:)(Am\s(.+?)schrieb.*>:)'
84+
85+
def en_support(self):
86+
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})')
87+
self.QUOTE_HDR_REGEX = re.compile('On.*wrote:$')
88+
self.QUOTED_REGEX = re.compile(r'(>+)')
89+
self.HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*? .+')
90+
self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)'
91+
92+
def set_regex(self):
93+
if hasattr(self, self.language+"_support"):
94+
getattr(self, self.language+"_support")()
95+
else:
96+
self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_diff_languages[self.language]['Sent from'] + '(\w+\s*){1,3})')
97+
self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_diff_languages[self.language]['wrote'] + ':$')
98+
self.default_quoted_header()
99+
self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?' + self.words_diff_languages[self.language]['wrote'] + \
100+
':)(On\s(.+?)' + self.words_diff_languages[self.language]['wrote'] + ':)'
101+
self.MULTI_QUOTE_HDR_REGEX = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE)
102+
self.MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(self._MULTI_QUOTE_HDR_REGEX, re.DOTALL)
52103

53104
def read(self):
54105
""" Creates new fragment for each line

support/languages_support.json

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
{
2+
"vi": {
3+
"Sent from": "\u0110\u01b0\u1ee3c g\u1eedi t\u1eeb",
4+
"From": "T\u1eeb",
5+
"To": "\u0110\u1ebfn",
6+
"wrote": "\u0111\u00e3 vi\u1ebft",
7+
"Sent": "G\u1edfi",
8+
"Subject": "Ch\u1ee7 \u0111\u1ec1"
9+
},
10+
"ru": {
11+
"Sent from": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e \u0438\u0437",
12+
"From": "\u041e\u0442",
13+
"To": "\u043a",
14+
"wrote": "\u043f\u0438\u0441\u0430\u043b",
15+
"Sent": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e",
16+
"Subject": "\u0422\u0435\u043c\u0430"
17+
},
18+
"fr": {
19+
"Sent from": "Envoy\u00e9 depuis",
20+
"From": "De",
21+
"To": "\u00c0",
22+
"wrote": "a \u00e9crit",
23+
"Sent": "Envoy\u00e9",
24+
"Subject": "Objet"
25+
},
26+
"en": {
27+
"Sent from": "Sent from",
28+
"From": "From",
29+
"To": "To",
30+
"wrote": "wrote",
31+
"Sent": "Sent",
32+
"Subject": "Subject"
33+
},
34+
"nl": {
35+
"Sent from": "Verzonden met",
36+
"From": "Van",
37+
"To": "Aan",
38+
"wrote": "schreef",
39+
"Sent": "Verzonden",
40+
"Subject": "Onderwerp"
41+
},
42+
"pt": {
43+
"Sent from": "Enviado de",
44+
"From": "De",
45+
"To": "Para",
46+
"wrote": "escreveu",
47+
"Sent": "Enviado",
48+
"Subject": "Assunto"
49+
},
50+
"ko": {
51+
"Sent from": "\ubd80\ud130 \ubcf4\ub0b4\uc9c4",
52+
"From": "\uc5d0\uc11c",
53+
"To": "\uc5d0",
54+
"wrote": "\uc4f4",
55+
"Sent": "\uc804\uc1a1 \ub428",
56+
"Subject": "\uc81c\ubaa9"
57+
},
58+
"de": {
59+
"Sent from": "Gesendet von",
60+
"From": "Von",
61+
"To": "An",
62+
"wrote": "schrieb",
63+
"Sent": "Gesendet",
64+
"Subject": "Betreff"
65+
},
66+
"tr": {
67+
"Sent from": "Den g\u00f6nderildi",
68+
"From": "itibaren",
69+
"To": "i\u00e7in",
70+
"wrote": "yazd\u0131",
71+
"Sent": "G\u00f6nderilen",
72+
"Subject": "Konu"
73+
},
74+
"it": {
75+
"Sent from": "Inviato da",
76+
"From": "Da",
77+
"To": "A",
78+
"wrote": "ha scritto",
79+
"Sent": "Inviato",
80+
"Subject": "Oggetto"
81+
},
82+
"id": {
83+
"Sent from": "Dikirim dari",
84+
"From": "Dari",
85+
"To": "Untuk",
86+
"wrote": "menulis",
87+
"Sent": "Terkirim",
88+
"Subject": "Subyek"
89+
},
90+
"sk": {
91+
"Sent from": "Odoslan\u00e9 od",
92+
"From": "z",
93+
"To": "na",
94+
"wrote": "nap\u00edsal",
95+
"Sent": "odoslan\u00e9",
96+
"Subject": "Predmet"
97+
},
98+
"ar": {
99+
"Sent from": "\u0627\u0631\u0633\u0644\u062a \u0645\u0646",
100+
"From": "\u0645\u0646 \u0639\u0646\u062f",
101+
"To": "\u0625\u0644\u0649",
102+
"wrote": "\u0643\u062a\u0628",
103+
"Sent": "\u0623\u0631\u0633\u0644\u062a",
104+
"Subject": "\u0645\u0648\u0636\u0648\u0639"
105+
},
106+
"es": {
107+
"Sent from": "Enviado desde",
108+
"From": "De",
109+
"To": "Para",
110+
"wrote": "escribi\u00f3",
111+
"Sent": "Enviado",
112+
"Subject": "Asunto"
113+
},
114+
"th": {
115+
"Sent from": "\u0e2a\u0e48\u0e07\u0e08\u0e32\u0e01",
116+
"From": "\u0e08\u0e32\u0e01",
117+
"To": "\u0e44\u0e1b\u0e22\u0e31\u0e07",
118+
"wrote": "\u0e40\u0e02\u0e35\u0e22\u0e19",
119+
"Sent": "\u0e2a\u0e48\u0e07",
120+
"Subject": "\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07"
121+
},
122+
"fi": {
123+
"Sent from": "L\u00e4hetetty",
124+
"From": "alkaen",
125+
"To": "jotta",
126+
"wrote": "kirjoitti",
127+
"Sent": "L\u00e4hetetyt",
128+
"Subject": "Aihe"
129+
},
130+
"zh": {
131+
"Sent from": "\u6765\u81ea",
132+
"From": "\u4ece",
133+
"To": "\u81f3",
134+
"wrote": "\u5199",
135+
"Sent": "\u53d1\u9001",
136+
"Subject": "\u4e3b\u9898"
137+
},
138+
"ja": {
139+
"Sent from": "\u9001\u4fe1\u5143",
140+
"From": "\u304b\u3089",
141+
"To": "\u306b",
142+
"wrote": "\u66f8\u304d\u307e\u3057\u305f",
143+
"Sent": "\u9001\u4fe1\u6e08\u307f",
144+
"Subject": "\u4ef6\u540d"
145+
},
146+
"pl": {
147+
"Sent from": "Wys\u0142ane z",
148+
"From": "Z",
149+
"To": "Do",
150+
"wrote": "napisa\u0142",
151+
"Sent": "Wys\u0142ane",
152+
"Subject": "Temat"
153+
},
154+
"he": {
155+
"Sent from": "\u05e0\u05e9\u05dc\u05d7 \u05de",
156+
"From": "\u05de",
157+
"To": "\u05dc",
158+
"wrote": "\u05db\u05ea\u05d1\u05ea\u05d9",
159+
"Sent": "\u05e0\u05e9\u05dc\u05d7",
160+
"Subject": "\u05e0\u05d5\u05e9\u05d0"
161+
}
162+
}

0 commit comments

Comments
 (0)