diff --git a/.gitignore b/.gitignore index 71f7c64..afc0a75 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,10 @@ tests/.DS_Store .DS_Store *.egg-info .project +env/ +venv/ dist/ dist/* - +*.csv +__pycache__/ +*.json diff --git a/README.md b/README.md index 8011821..a6c1b36 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,28 @@ # Email Reply Parser for Python -A port of GitHub's Email Reply Parser library, by the fine folks at [Zapier](https://zapier.com/). +A port of GitHub's Email Reply Parser library, by the fine folks at [Zapier](https://zapier.com/), with added language support. + +Currently supported languages: + +Arabic +German +English +Spanish +Finnish +French +Hebrew +Indonesian +Italian +Japanese +Korean +Dutch +Polish +Portuguese +Russian +Slovak +Thai +Turkish +Vietnamese +Chinese ## Summary @@ -45,7 +68,8 @@ from email_reply_parser import EmailReplyParser Step 2: Provide email message as type String ```python -EmailReplyParser.read(email_message) +parser = EmailReplyParser(language='en') +parser.read(email_message) ``` ### How to only retrieve the reply message @@ -56,10 +80,9 @@ Step 1: Import email reply parser package from email_reply_parser import EmailReplyParser ``` -Step 2: Provide email message as type string using parse_reply class method. +Step 2: Provide email message as type string using parse_reply. ```python +parser = EmailReplyParser(language='en') -EmailReplyParser.parse_reply(email_message) +parser.parse_reply(email_message) ``` - - diff --git a/email_reply_parser/__init__.py b/email_reply_parser/__init__.py index 063f65b..dc6385a 100644 --- a/email_reply_parser/__init__.py +++ b/email_reply_parser/__init__.py @@ -1,80 +1,202 @@ """ - email_reply_parser is a python library port of GitHub's Email Reply Parser. - - For more information, visit https://github.com/zapier/email-reply-parser +email_reply_parser is a python library port of GitHub's Email Reply Parser.
+For more information, visit https://github.com/zapier/email-reply-parser """ - +import os import re +import json class EmailReplyParser(object): """ Represents a email message that is parsed. """ - @staticmethod - def read(text): - """ Factory method that splits email into list of fragments + def __init__(self, language='en'): + dir_path = os.path.dirname(__file__) + with open(dir_path + "/languages_support.json", "r") as read_file: + self.words_map = json.load(read_file) + if language in self.words_map: + self.language = language + else: + self.language = 'en' + def read(self, text): + """ Factory method that splits email into list of fragments text - A string email body - Returns an EmailMessage instance """ - return EmailMessage(text).read() + return EmailMessage(text, self.language, self.words_map).read() - @staticmethod - def parse_reply(text): + def parse_reply(self, text): """ Provides the reply portion of email. - text - A string email body - Returns reply body message """ - return EmailReplyParser.read(text).reply + a = self.read(text).reply + return a + + def find_contacts(self, text): + """Provides a list of From To emails and the dates of these emails""" + contacts_dict = EmailContacts(text, self.language, self.words_map).contacts() + return contacts_dict class EmailMessage(object): """ An email message represents a parsed email body. """ - SIG_REGEX = re.compile(r'(--|__|-\w)|(^Sent from my (\w+\s*){1,3})') - QUOTE_HDR_REGEX = re.compile('On.*wrote:$') - QUOTED_REGEX = re.compile(r'(>+)') - HEADER_REGEX = re.compile(r'^\*?(From|Sent|To|Subject):\*?
.+') - _MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote:)(On\s(.+?)wrote:)' - MULTI_QUOTE_HDR_REGEX = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL | re.MULTILINE) - MULTI_QUOTE_HDR_REGEX_MULTILINE = re.compile(_MULTI_QUOTE_HDR_REGEX, re.DOTALL) - - def __init__(self, text): + def __init__(self, text, language, words_map): self.fragments = [] self.fragment = None - self.text = text.replace('\r\n', '\n') + self.text = text.replace('\r\n', '\n').replace('\r', '\n') self.found_visible = False + self.SIG_REGEX = None + self.QUOTE_HDR_REGEX = None + self.QUOTED_REGEX = None + self.HEADER_REGEX = None + self._MULTI_QUOTE_HDR_REGEX = None + self.MULTI_QUOTE_HDR_REGEX = None + self.MULTI_QUOTE_HDR_REGEX_MULTILINE = None + self.WARNING_REGEX = None + self.words_map = words_map + self.language = language + self.default_language = 'en' + self.set_regex() + + def default_quoted_header(self): + self.QUOTED_REGEX = re.compile(r'(>+)') + self.HEADER_REGEX = re.compile( + r'^[* ]*(' + self.words_map[self.language]['From'] + + '|' + self.words_map[self.language]['Sent'] + + '|' + self.words_map[self.language]['To'] + + ')\s*:[\s\n\*]*.*' + ) + + def warnings(self): + dot = '\u200b' + single_space = f'[ {dot}\xA0\t]' + space = f'[,()]?{single_space}{{0,3}}[\n\r]?{single_space}{{0,3}}[,()]?' 
+ sentence_start = f'(?:[\n\r.!?]|^){single_space}{{0,3}}' + confidential_variations = f'(privileged|confidential|private|sensitive|{space}(/|and|or|and{space}/{space}or|,){space}){{1,3}}' + message_variations = f'(electronic{space}|e[\-]?mail{space}|message{space}|communication{space}|transmission{space}){{1,3}}' + self.WARNING_REGEX = re.compile( + f'(CAUTION:|NOTICE:|Disclaimer:|Warning:|{confidential_variations}{space}Notice:|Please{space}do{space}not{space}reply' + f'|{confidential_variations}{space}information' + f'|{sentence_start}(The|This){space}information{space}(provided|transmitted|contained)?{space}(with)?in{space}this{space}{message_variations}' + f'|{sentence_start}(The|This){space}information{space}(may also be|is){space}legally' + f'|{sentence_start}(The|This){space}content[s]?{space}of{space}this{space}{message_variations}' + f'|{sentence_start}(The|This){space}{message_variations}{space}' + f'(may{space}contain|(and|or|and{space}/{space}or)?{space}(any|all)?{space}(files{space}transmitted|the{space}information{space}(contained|it{space}contains)|attach|associated)' + f'|[(]?including{space}(any|all)?{space}attachments[)]?|(is|are|contains){space}{confidential_variations}' + f'|is{space}for{space}the{space}recipients|is{space}intended{space}only|is{space}for{space}the{space}sole{space}user|has{space}been{space}scanned|with{space}its{space}contents' + f')|{sentence_start}(The|This){space}publication,{space}copying' + f'|{sentence_start}(The|This){space}sender{space}(cannot{space}guarantee|believes{space}that{space}this{space}{message_variations})' + f'|{sentence_start}If{space}you{space}have{space}received{space}this{space}{message_variations}{space}in{space}error' + f'|{sentence_start}The{space}contents{space}are{space}{confidential_variations}' + f'|{sentence_start}(Under|According to){space}(the)?{space}(General{space}Data{space}Protection{space}Regulation|GDPR)' + f'|{sentence_start}Click{space}here{space}to' + 
f'|{sentence_start}Copyright{space}' + f'|{sentence_start}Was{space}this{space}email{space}helpful\?' + f'|{sentence_start}For{space}Your{space}Information:' + f'|{sentence_start}Emails{space}are{space}not{space}secure' + f'|{sentence_start}To make{space}sure{space}you{space}continue{space}to{space}receive' + f'|{sentence_start}Please{space}choose{space}one{space}of{space}the{space}options{space}below' + f'|{sentence_start}Please{space}consider{space}the{space}environment{space}before{space}printing{space}this{space}{message_variations}' + f'|{sentence_start}This{space}e-mail{space}and{space}any{space}attachments{space}are{space}confidential' + f')[a-zA-Z0-9:;.,?!<>()@&/\'\"\“\” {dot}\xA0\t\-]*', + re.IGNORECASE + ) + + def nl_support(self): + self.SIG_REGEX = re.compile( + r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})' + ) + self.QUOTE_HDR_REGEX = re.compile('Op.*schreef.*>:$') + self._MULTI_QUOTE_HDR_REGEX = r'(?!Op.*Op\s.+?schreef.*>:)(Op\s(.+?)schreef.*>:)' + + def de_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] + '(\w+\s*){1,3})') + self.QUOTE_HDR_REGEX = re.compile('[a-zA-Z]{2,5}.*schrieb.*:$') + self._MULTI_QUOTE_HDR_REGEX = r'(?!Am.*Am\s.+?schrieb.*:)(Am\s(.+?)schrieb.*:)' + + def fr_support(self): + self.SIG_REGEX = re.compile( + r'(--|__|-\w)|(^' + self.words_map[self.language]['Sent from'] \ + + '(\w+\s*){1,3})|(.*(cordialement|bonne r[ée]ception|salutations' + r'|cdlt|cdt|crdt|regards|best regard|bonne journ[ée]e))', + re.IGNORECASE + ) + self.QUOTE_HDR_REGEX = re.compile('Le.*a écrit.*[> ]:$') + self._MULTI_QUOTE_HDR_REGEX = r'(?!Le.*Le\s.+?a écrit[a-zA-Z0-9.:;<>()&@ -]*:)(Le\s(.+?)a écrit[a-zA-Z0-9.:;<>()&@ -]*:)' + + def en_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^(sent from|get outlook)\s(\w+\s*){1,6})|(Best regards|Kind Regards|Thanks,|Thank you,|Best,|All the best|regards,)', flags=re.IGNORECASE) + self.QUOTE_HDR_REGEX = 
re.compile('\s*On.*wrote\s*:$') + self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') + self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' + + def es_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Enviado desde (\w+\s*){1,6})') + self.QUOTE_HDR_REGEX = re.compile('\s*El.*escribió\s*:$') + self._MULTI_QUOTE_HDR_REGEX = r'(?!El.*El\s.+?escribió\s*:)(El\s(.+?)escribió\s*:)' + + def ja_support(self): + self.SIG_REGEX = re.compile(r'--|__|-\w') + self.QUOTE_HDR_REGEX = re.compile( + r'[0-9]*年[0-9]*月[0-9]*日[\u3000-\u303F\u3040-\u309F\u30A0-\u30FF\uFF00-\uFFEF\u4E00-\u9FAF\u2605-\u2606\u2190-\u2195\u203Ba-zA-Z0-9.:;<>()&@ -]*:?$' + ) + self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') + self._MULTI_QUOTE_HDR_REGEX = r'(?!On.*On\s.+?wrote\s*:)(On\s(.+?)wrote\s*:)' # Dummy multiline: doesn't work for Japanese due to BeautifulSoup inserting new lines before ":" character + + def fi_support(self): + self.SIG_REGEX = re.compile(r'(--|__|-\w)|(^Lähetetty (\w+\s*){1,3})|(^Hanki Outlook for.*)') + self.QUOTE_HDR_REGEX = re.compile('(.+?kirjoitti(.+?kello.+?)?:)') + self.QUOTED_REGEX = re.compile(r'(>+)|((>)+)') + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?kirjoitti.+?kirjoitti[a-zA-Z0-9.:;<>()&@ -]*:$)((.+?)kirjoitti[a-zA-Z0-9.:;<>()&@ -]*:$)' + + def set_regex(self): + if hasattr(self, self.language + "_support"): + getattr(self, self.language + "_support")() + self.default_quoted_header() + else: + self.SIG_REGEX = re.compile( + r'(--|__|-\w)|(^(' + self.words_map[self.language]['Sent from'] + + '|' + self.words_map[self.default_language]['Sent from'] + + ')(\w+\s*){1,3})' + ) + self.QUOTE_HDR_REGEX = re.compile('.*' + self.words_map[self.language]['wrote'] + '\s?:$') + self.default_quoted_header() + self._MULTI_QUOTE_HDR_REGEX = r'(?!.+?'
+ self.words_map[self.language]['wrote'] \ + + '\s*:\s*)(On\s(.+?)' + self.words_map[self.language]['wrote'] + ':)' + self.warnings() + self.FOLLOW_UP_HDR_REGEX = re.compile(r'(?* ]*' + self.words_map[self.language]['From'] + '[ ]*:(.*)\n' + + '[>* ]*(?:' + self.words_map[self.language]['Sent'] + '|Date)[ ]*:(.*)\n' + + '[>* ]*' + self.words_map[self.language]['To'] + '[ ]*:(.*)\n' + + ')' + ) + EMAIL = re.compile(r'([a-zA-Z0-9_\-\.]+@[a-zA-Z0-9_\-\.]+\.[a-zA-Z]{2,5})') + headers = HEADER_BLOCK.findall(self.text) + json = [] + for header in headers: + contact = {'from': '', 'to': '', 'date': ''} + from_email = EMAIL.search(header[1]) + if from_email: + contact['from'] = from_email.groups()[0] + contact['date'] = header[2] + to_email = EMAIL.search(header[3]) + if to_email: + contact['to'] = to_email.groups()[0] + json.append(contact) + return json + + class Fragment(object): """ A Fragment is a part of an Email Message, labeling each part. diff --git a/email_reply_parser/languages_support.json b/email_reply_parser/languages_support.json new file mode 100644 index 0000000..4f3962c --- /dev/null +++ b/email_reply_parser/languages_support.json @@ -0,0 +1,162 @@ +{ + "vi": { + "Sent from": "\u0110\u01b0\u1ee3c g\u1eedi t\u1eeb", + "From": "T\u1eeb", + "To": "\u0110\u1ebfn", + "wrote": "\u0111\u00e3 vi\u1ebft", + "Sent": "G\u1edfi", + "Subject": "M\u00f4n h\u1ecdc" + }, + "ru": { + "Sent from": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e \u0438\u0437", + "From": "\u041e\u0442", + "To": "\u043a", + "wrote": "\u043f\u0438\u0441\u0430\u043b", + "Sent": "\u041e\u0442\u043f\u0440\u0430\u0432\u043b\u0435\u043d\u043e", + "Subject": "\u041f\u0440\u0435\u0434\u043c\u0435\u0442" + }, + "fr": { + "Sent from": "Envoy\u00e9 depuis", + "From": "De", + "To": "\u00c0", + "wrote": "a \u00e9crit", + "Sent": "Envoy\u00e9", + "Subject": "Objet" + }, + "en": { + "Sent from": "Sent from", + "From": "From", + "To": "To", + "wrote": "wrote", + "Sent": "Sent", + "Subject": 
"Subject" + }, + "nl": { + "Sent from": "Verstuurd vanaf", + "From": "Van", + "To": "Aan", + "wrote": "schreef", + "Sent": "Verzonden", + "Subject": "Onderwerp" + }, + "pt": { + "Sent from": "Enviado de", + "From": "De", + "To": "Para", + "wrote": "escrevi", + "Sent": "Enviei", + "Subject": "Sujeito" + }, + "ko": { + "Sent from": "\ubd80\ud130 \ubcf4\ub0b4\uc9c4", + "From": "\uc5d0\uc11c", + "To": "\uc5d0", + "wrote": "\uc4f4", + "Sent": "\uc804\uc1a1 \ub428", + "Subject": "\uc81c\ubaa9" + }, + "de": { + "Sent from": "Gesendet von", + "From": "Von", + "To": "An", + "wrote": "schrieb", + "Sent": "geschickt", + "Subject": "Betreff" + }, + "tr": { + "Sent from": "Den g\u00f6nderildi", + "From": "itibaren", + "To": "i\u00e7in", + "wrote": "yazd\u0131", + "Sent": "G\u00f6nderilen", + "Subject": "konu" + }, + "it": { + "Sent from": "Inviato da", + "From": "Da", + "To": "A", + "wrote": "ha scritto", + "Sent": "Inviato", + "Subject": "Oggetto" + }, + "id": { + "Sent from": "Dikirim dari", + "From": "Dari", + "To": "Untuk", + "wrote": "menulis", + "Sent": "Terkirim", + "Subject": "Subyek" + }, + "sk": { + "Sent from": "Odoslan\u00e9 od", + "From": "z", + "To": "na", + "wrote": "nap\u00edsal", + "Sent": "odoslan\u00e9", + "Subject": "predmet" + }, + "ar": { + "Sent from": "\u0627\u0631\u0633\u0644\u062a \u0645\u0646", + "From": "\u0645\u0646 \u0639\u0646\u062f", + "To": "\u0625\u0644\u0649", + "wrote": "\u0643\u062a\u0628", + "Sent": "\u0623\u0631\u0633\u0644\u062a", + "Subject": "\u0645\u0648\u0636\u0648\u0639" + }, + "es": { + "Sent from": "Enviado desde", + "From": "De", + "To": "Para", + "wrote": "escribi\u00f3", + "Sent": "Expedido", + "Subject": "Asunto" + }, + "th": { + "Sent from": "\u0e2a\u0e48\u0e07\u0e08\u0e32\u0e01", + "From": "\u0e08\u0e32\u0e01", + "To": "\u0e44\u0e1b\u0e22\u0e31\u0e07", + "wrote": "\u0e40\u0e02\u0e35\u0e22\u0e19", + "Sent": "\u0e2a\u0e48\u0e07", + "Subject": "\u0e40\u0e23\u0e37\u0e48\u0e2d\u0e07" + }, + "fi": { + "Sent from": "L\u00e4hetetty", 
+ "From": "L\u00e4hett\u00e4j\u00e4", + "To": "Vastaanottaja", + "wrote": "kirjoitti", + "Sent": "L\u00e4hetetty", + "Subject": "Aihe" + }, + "zh": { + "Sent from": "\u83b7\u53d6", + "From": "\u4ece|\u53d1\u4ef6\u4eba", + "To": "\u81f3|\u6536\u4ef6\u4eba", + "wrote": "\u5199|\u5199\u9053", + "Sent": "\u53d1\u9001", + "Subject": "\u5b66\u79d1" + }, + "ja": { + "Sent from": "\u9001\u4fe1\u5143", + "From": "\u304b\u3089", + "To": "\u306b", + "wrote": "\u66f8\u304d\u307e\u3057\u305f", + "Sent": "\u9001\u4fe1\u6e08\u307f", + "Subject": "\u4ef6\u540d" + }, + "pl": { + "Sent from": "Wys\u0142ane z", + "From": "Z", + "To": "Do", + "wrote": "napisa\u0142", + "Sent": "Wys\u0142ane", + "Subject": "Przedmiot" + }, + "he": { + "Sent from": "\u05e0\u05e9\u05dc\u05d7 \u05de", + "From": "\u05de", + "To": "\u05dc", + "wrote": "\u05db\u05ea\u05d1\u05ea\u05d9", + "Sent": "\u05e0\u05e9\u05dc\u05d7", + "Subject": "\u05e0\u05d5\u05e9\u05d0" + } +} diff --git a/setup.py b/setup.py index 5d3078e..6f25115 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,10 @@ version=version.VERSION, description='Email reply parser', packages=['email_reply_parser'], - package_data={'email_reply_parser': ['../VERSION']}, + package_data={ + 'email_reply_parser': ['../VERSION'], + '': ['./languages_support.json'] + }, author='Royce Haynes', author_email='royce.haynes@gmail.com', url='https://github.com/zapier/email-reply-parser', @@ -32,4 +35,4 @@ "Programming Language :: Python :: 3.3", "Programming Language :: Python :: 3.4", ] -) \ No newline at end of file +) diff --git a/test/emails/caution.txt b/test/emails/caution.txt new file mode 100644 index 0000000..67c69bb --- /dev/null +++ b/test/emails/caution.txt @@ -0,0 +1,50 @@ +CAUTION: This email originated from outside of this company. Do not click links or open attachments unless you recognize the sender and know the content is safe. 
+ + + +Hi lads's Team, + + +Below THIS and THAT file for 31st August 2019 is still not available on "lads.lads.com" could you please check and advise us ASAP. + +pfg.Zip +pfg02.Zip + + +part01_07.Zip +_part02_07.Zip +_part03_07.Zip +_part04_07.Zip +_part05_07.Zip +_part06_07.Zip +_part07_07.Zip + + +job at our end. + + +Thanks, +___________________ +HAHA LOLO +Markets Application Production Services - Reference Data +ROLE +Bank of LADS - LADS LADS +BUILDING 5B,HAHA - THIS THAT, COUNTRY +Direct: (+00)00-000-0000 Mobile: (+00)000000 +______________________________________________________________________________ + +To report an issue or request for technical assistance with Product Reference Data applications, please send email to LADS SUPPORT. This is the only OO being monitored by the Product Reference Data support team. No other DGs or Mailboxes are being actively monitored. +Please make a note of this to avoid any delays. + +Escalation: LADS MANAGEMENT +___________________________________________________________ + +---------------------------------------------------------------------- +This message w/attachments (message) is intended solely for the use of the intended recipient(s) and may contain information that is privileged, confidential or proprietary. If you are not an intended recipient, please notify the sender, and then please delete and destroy all copies and attachments, and be advised that any review or dissemination of, or the taking of any action in reliance on, the information contained in or attached to this message is prohibited. +Unless specifically indicated, this message is not an offer to sell or a solicitation of any investment products or other financial product or service, an official confirmation of any transaction, or an official statement of Sender. 
Subject to applicable law, Sender may intercept, monitor, review and retain e-communications (EC) traveling through its networks/systems and may produce any such EC to regulators, law enforcement, in litigation and as required by law. +The laws of the country of each sender/recipient may impact the handling of EC, and EC may be archived, supervised and produced in countries other than the country in which you are located. This message cannot be guaranteed to be secure or free of errors or viruses. Attachments that are part of this EC may have additional important disclosures and disclaimers, which you should read. By messaging with Sender you consent to the foregoing. +----------------------------------------- + +lads's monitors email communications through its networks for regulatory compliance purposes and to protect its customers, employees and business and where allowed to do so by applicable law. The information contained in this e-mail message, and any attachment thereto, is confidential and may not be disclosed without our express permission. If you are not the intended recipient or an employee or agent responsible for delivering this message to the intended recipient, you are hereby notified that you have received this message in error and that any review, dissemination, distribution or copying of this message, or any attachment thereto, in whole or in part, is strictly prohibited. If you have received this message in error, please immediately notify us by telephone, fax or e-mail and delete the message and all of its attachments. Every effort is made to keep our network free from viruses. You should, however, review this e-mail message, as well as any attachment thereto, for viruses. We take no responsibility and have no liability for any computer virus which may be transferred via this e-mail message. 
+ +----------------------------------------- diff --git a/test/emails/caution2.txt b/test/emails/caution2.txt new file mode 100644 index 0000000..8624943 --- /dev/null +++ b/test/emails/caution2.txt @@ -0,0 +1,12 @@ +Hello, + +I am trying to place an order and it keeps tell me my order cannot be processed at this time. I tried using two different consultants and it still won't work. I am not sure if it's a technical problem. My order is time sensitive. Thank you for your help! + +Person Person. + + +Disclaimer + +The information contained in this communication from the sender is confidential. It is intended solely for use by the recipient and others authorized to receive it. If you are not the recipient, you are hereby notified that any disclosure, copying, distribution or taking action in relation of the contents of this information is strictly prohibited and may be unlawful. + +This email has been scanned for viruses and malware, and may have been automatically archived by Thing Ltd, an innovator in Software as a Service (SaaS) for business. Providing a safer and more useful place for your human generated data. Specializing in; Security, archiving and compliance. To find out more Click Here (http://www.thisthat.com/things/) . diff --git a/test/emails/chinese.txt b/test/emails/chinese.txt new file mode 100644 index 0000000..340b62e --- /dev/null +++ b/test/emails/chinese.txt @@ -0,0 +1,22 @@ +Hi, I want to cancel my order, could you recall the package and return it? Thank you + +发件人: ""xyz.com"" +日期: 2020年3月6日 星期五 上午1:04 +收件人: ""zys@hotmail.com"" +主题: xyz.com - Order <111111111> has been shipped + +Shipping Confirmation + +Dear XYZ ZUS, + +Thank you for shopping at ZYS! + +Your order has been shipped. Your shipment details are shown below. + +Please note that it may take up to 24 hours for the shipping carrier to update the tracking information. 
+ +If you have any questions you can contact us at + +ORDER + +SHIPPING diff --git a/test/emails/chinese2.txt b/test/emails/chinese2.txt new file mode 100644 index 0000000..fe6efa6 --- /dev/null +++ b/test/emails/chinese2.txt @@ -0,0 +1,18 @@ +Dear bbb +This is X. I just brought one sneaker and two slippers . Can I cancel my order please thanks + +XYZ Customer Care 于2020年3月9日 周一下午9:10写道: + +Thank you for shopping at SSENSE. Please allow us up to two business days to process your order. You’ll find a copy of your receipt and order information. + +MY ACCOUNT () CUSTOMER SUPPORT () + +Order Confirmation + +Dear XYZ XYZ, + +Thank you for shopping at ZYZ. Please allow us up to two business days to process your order. Once it’s been processed, you’ll receive a shipment confirmation email with your order’s tracking number. + +Below, you’ll find a copy of your receipt and order information. Please keep it for your records. + +\ diff --git a/test/emails/emails.json b/test/emails/emails.json new file mode 100644 index 0000000..fe51488 --- /dev/null +++ b/test/emails/emails.json @@ -0,0 +1 @@ +[] diff --git a/test/emails/forward.txt b/test/emails/forward.txt new file mode 100644 index 0000000..a715ce3 --- /dev/null +++ b/test/emails/forward.txt @@ -0,0 +1,6 @@ +FW: YYY Arrival Notice XYZ - YYY ELA/XYZ ETA: 2020-06-08 +This is a follow-up to your previous request #12345 "RE: XYZ and Manifest amendm..." +Hello team, +Can I get 1 Arrival Notice without PU# and invoice? +Thank you. +Best regards diff --git a/test/emails/multi_header.txt b/test/emails/multi_header.txt new file mode 100644 index 0000000..8eeded0 --- /dev/null +++ b/test/emails/multi_header.txt @@ -0,0 +1,41 @@ +No problem. I’ll just start a new order. + +On May 30, 2020, at 4:24 PM, XYZ wrote: + + +Hi XYZ, + +Unfortunately, we are unable to add items to your order, but if you would like we can cancel your order and issue a full refund so that you may order again with your preferred selection of pastries. 
This may result in a later delivery date, but please let us know if you would like us to cancel your order and we will set that up for you. + + + +On May 30, 2020, 4:15 PM XYZ xyz@xyz.com wrote: + +No worries and thank you. I wanted to add a couple of new items to the same shipment. Would that Be possible or should I just order thru the website? + +On May 30, 2020, at 3:54 PM, XYZ wrote: + + +Hi XYZ, + +We apologize for the incorrect product and for any inconvenience this may have caused. + +We have placed a replacement order of 1 X for delivery on June 4, 2020. You will be receiving an email confirmation for this new order and your tracking number will be emailed 1-2 nights before the delivery date. + +Thank you for your patience! + + + +On May 30, 2020, 10:01 AM XYZ xyz@xyz.com wrote: + +Hello XYZ, + +Thank you for emailing the Team! + +We are writing to you to confirm that we have received your email. + +We apologize for any inconvenience and assure you that we will find a solution for any question, concern, or comment you may have. + +We appreciate your patience during these times. + +-Team diff --git a/test/emails/outlook.txt b/test/emails/outlook.txt new file mode 100644 index 0000000..4111b57 --- /dev/null +++ b/test/emails/outlook.txt @@ -0,0 +1,9 @@ +Dear Team, + +Hello + +What is the best way to clear a Riak bucket of all key, values after +running a test? 
+ +My number is: +00 0000 000 000 +Get Outlook for iOS diff --git a/test/performance.py b/test/performance.py new file mode 100644 index 0000000..67d067d --- /dev/null +++ b/test/performance.py @@ -0,0 +1,80 @@ +import pandas as pd +import numpy as np  # required by profile(), which calls np.argmax +import json +import time +from bs4 import BeautifulSoup # requires lxml +from email_reply_parser import EmailReplyParser + + +def profile(): + df = pd.read_csv('test.csv') + ground = time.time() + content = df.content.values[np.argmax([len(d) for d in df.content.values])] + start = time.time() + parser = EmailReplyParser(language='fr') + print(str(time.time() - start) + 'init parser') + start = time.time() + res = parser.parse_reply(content) + print(str(time.time() - start) + 'parse') + start = time.time() + soup = BeautifulSoup(res, 'lxml') + text = soup.getText(' ') + print(str(time.time() - start) + 'soup') + print(f'Total time: {time.time() - ground}') + + +def verify(): + parser = EmailReplyParser(language='fi') + texts = json.load(open('test/emails/emails.json')) + texts = list(filter(lambda d: isinstance(d, str), texts)) + parsed = [] + for text in texts: + print('-'*100) + soup = BeautifulSoup(text, 'lxml') + text = soup.getText('\n') + text = parser.parse_reply(text) + parsed.append(text) + print(text) + + +def parse_df(): + parser = EmailReplyParser(language='en') + path = 'test/emails/zipwrotetest.csv' + df = pd.read_csv(path) + parsed = [] + for text in df.sentence.values: + soup = BeautifulSoup(text, 'lxml') + text = soup.getText('\n') + text = parser.parse_reply(text) + parsed.append(text) + df = df.assign(clean=parsed) + df.to_csv(path) + import code + code.interact(local=locals()) + + +def parse_json(): + parser = EmailReplyParser(language='en') + with open('english.json', 'rb') as fl: + messages = json.load(fl) + parsed = [] + for text in messages: + soup = BeautifulSoup(text, 'lxml') + text = soup.getText('\n') + text = parser.parse_reply(text) + parsed.append(text) + import code +
code.interact(local=locals()) + + +def parse_text(): + parser = EmailReplyParser(language='en') + with open('test/emails/caution.txt', 'r') as fl: + message = fl.read() + text = parser.parse_reply(message) + print(text) + + +if __name__ == '__main__': + parse_text() + # parse_text() diff --git a/test/test_email_reply_parser.py b/test/test_email_reply_parser.py index 8d2849b..7e586ab 100644 --- a/test/test_email_reply_parser.py +++ b/test/test_email_reply_parser.py @@ -70,7 +70,6 @@ def test_reads_inline_replies(self): def test_reads_top_post(self): message = self.get_email('email_1_3') - self.assertEqual(5, len(message.fragments)) def test_multiline_reply_headers(self): @@ -125,30 +124,31 @@ def test_reply_is_parsed(self): def test_reply_from_gmail(self): with open('test/emails/email_gmail.txt') as f: self.assertEqual('This is a test for inbox replying to a github message.', - EmailReplyParser.parse_reply(f.read())) + EmailReplyParser().parse_reply(f.read())) def test_parse_out_just_top_for_outlook_reply(self): with open('test/emails/email_2_1.txt') as f: - self.assertEqual("Outlook with a reply", EmailReplyParser.parse_reply(f.read())) + self.assertEqual("Outlook with a reply", EmailReplyParser().parse_reply(f.read())) def test_parse_out_just_top_for_outlook_with_reply_directly_above_line(self): with open('test/emails/email_2_2.txt') as f: - self.assertEqual("Outlook with a reply directly above line", EmailReplyParser.parse_reply(f.read())) + self.assertEqual("Outlook with a reply directly above line", EmailReplyParser().parse_reply(f.read())) def test_parse_out_just_top_for_outlook_with_unusual_headers_format(self): with open('test/emails/email_2_3.txt') as f: self.assertEqual( "Outlook with a reply above headers using unusual format", - EmailReplyParser.parse_reply(f.read())) + EmailReplyParser().parse_reply(f.read())) def test_sent_from_iphone(self): with open('test/emails/email_iPhone.txt') as email: - self.assertTrue("Sent from my iPhone" not in 
EmailReplyParser.parse_reply(email.read())) + + self.assertTrue("Sent from my iPhone" not in EmailReplyParser().parse_reply(email.read())) def test_email_one_is_not_on(self): with open('test/emails/email_one_is_not_on.txt') as email: self.assertTrue( - "On Oct 1, 2012, at 11:55 PM, Dave Tapley wrote:" not in EmailReplyParser.parse_reply(email.read())) + "On Oct 1, 2012, at 11:55 PM, Dave Tapley wrote:" not in EmailReplyParser().parse_reply(email.read())) def test_partial_quote_header(self): message = self.get_email('email_partial_quote_header') @@ -194,7 +194,7 @@ def get_email(self, name): """ with open('test/emails/%s.txt' % name) as f: text = f.read() - return EmailReplyParser.read(text) + return EmailReplyParser().read(text) if __name__ == '__main__':