Pacific-AI-Corp · ArshaanNazir · Jun 1, 2023 · May 31, 2023 · May 31, 2023 · May 31, 2023
diff --git a/.github/workflows/requirements.txt b/.github/workflows/requirements.txt
@@ -13,6 +13,5 @@ jsonlines
 openai
 langchain
 evaluate
-inflect
 rouge_score
 typing-extensions < 4.6.0
diff --git a/nlptest/transform/robustness.py b/nlptest/transform/robustness.py
@@ -1,12 +1,12 @@
 import asyncio
 import random
 import re
-from inflect import engine
 from abc import ABC, abstractmethod
 from typing import Dict, List, Optional
 from nlptest.modelhandler.modelhandler import ModelFactory
 from .utils import (CONTRACTION_MAP, TYPO_FREQUENCY, default_user_prompt ,ocr_typo_dict, abbreviation_dict)
 from ..utils.custom_types import Sample, Span, Transformation
+from ..utils.number_to_word import engine
 from typing import List
 import string
 from ..utils.SoundsLikeFunctions import Search
@@ -622,15 +622,14 @@ def convert_numbers(regex, text):
                 token = match.group()
                 words = NumberToWord.infEng.number_to_words(
                     token, wantlist=True)
-                token_len = len(token) - 1
-                new_words_len = len(' '.join(words)) - 1
+                new_words_len = len(' '.join(words))
                 trans.append(text[start_offset:match.start()])
                 trans.append(' '.join(words))
                 start_offset = match.end()  
                 transformations.append(
                         Transformation(
                             original_span=Span(
-                                start=match.start(), end=match.end()-1, word=token),
+                                start=match.start(), end=match.end(), word=token),
                             new_span=Span(start=match.start(), end=match.start(
                             )+new_words_len, word=' '.join(words)),
                             ignore=False
@@ -644,17 +643,14 @@ def convert_numbers(regex, text):
 
         for idx, sample in enumerate(sample_list):
             if isinstance(sample, str):
-                sample_list[idx] = convert_numbers(
-                    r'(?<!\S)\d+(\.\d+)?(\.)?(?=(\s|\n|$))', sample)
+                sample_list[idx], _ = convert_numbers(r'(?<!\S)(\d+(\.\d+)?)(?=(\s|\n|$))', sample)  # str input: keep only the converted text, not the (text, transformations) pair
             else:
-                sample.test_case, transformations = convert_numbers(
-                    r'(?<!\S)\d+(\.\d+)?(\.)?(?=(\s|\n|$))', sample.original)
+                sample.test_case, transformations = convert_numbers(r'(?<!\S)(\d+(\.\d+)?)(?=(\s|\n|$))', sample.original)
                 if sample.task in ("ner", "text-classification"):
                     sample.transformations = transformations
                 sample.category = "robustness"
         return sample_list
 
-
 class AddOcrTypo(BaseRobustness):
     alias_name = "add_ocr_typo"
 
@@ -758,25 +754,19 @@ def insert_abbreviation(text):
                                         ignore=False
                                     )
                                 ) 
-            sample.category = "robustness"
-            if sample.task in ("ner", "text-classification"):
-                sample.transformations = transformations 
-            return perturbed_text
-
-        for sample in sample_list:
-            if sample.task == 'question-answering':
-                sample.perturbed_question = insert_abbreviation(sample.original_question)
-
-                if "perturbed_context" in sample.__annotations__:
-                    sample.perturbed_context = insert_abbreviation(sample.original_context)
+            return perturbed_text, transformations
 
+        for idx, sample in enumerate(sample_list):
+            if isinstance(sample, str):
+                sample_list[idx], _ = insert_abbreviation(sample)  # str input: keep only the text from (perturbed_text, transformations)
             else:
-                sample.test_case = insert_abbreviation(sample.original)
-            sample.category = "robustness"
-
-        return sample_list 
-
-
+                sample.test_case, transformations = insert_abbreviation(sample.original)
+                if sample.task in ("ner", "text-classification"):
+                    sample.transformations = transformations
+                sample.category = "robustness"
+
+        return sample_list
+
 class AddSpeechToTextTypo(BaseRobustness):
     alias_name = "add_speech_to_text_typo"
 

diff --git a/nlptest/transform/utils.py b/nlptest/transform/utils.py
@@ -1,6 +1,6 @@
 from collections import defaultdict
 from typing import Dict, List
-
+import re
 import pandas as pd
 
 from nlptest.utils.custom_types import NERPrediction, Sample, SequenceLabel, NEROutput, SequenceClassificationOutput
@@ -7200,6 +7200,88 @@ def get_entity_representation_proportions(entity_representation):
     "xsum": "You are an intelligent Context summarizer. Please read the following context carefully. After understanding its content, create a concise summary, capturing the essential themes and key details. Please ensure that the summary does not end abruptly and remains within the max_tokens word limit. Context: {context}\n\n Summary: "
 }
 
+nth = {  # integer -> numeric ordinal suffix ("st", "nd", "rd", "th"); 11-13 are listed explicitly as "th"
+    0: "th",
+    1: "st",
+    2: "nd",
+    3: "rd",
+    4: "th",
+    5: "th",
+    6: "th",
+    7: "th",
+    8: "th",
+    9: "th",
+    11: "th",
+    12: "th",
+    13: "th",}
+
+ordinal = dict(  # cardinal word (or the "ty" ending) -> its ordinal spelling
+    ty="tieth",
+    one="first",
+    two="second",
+    three="third",
+    five="fifth",
+    eight="eighth",
+    nine="ninth",
+    twelve="twelfth",
+)
+
+unit = ["", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]  # words for 1-9 by index; index 0 is the empty string
+
+teen = [  # words for 10-19; index 0 is "ten"
+    "ten",
+    "eleven",
+    "twelve",
+    "thirteen",
+    "fourteen",
+    "fifteen",
+    "sixteen",
+    "seventeen",
+    "eighteen",
+    "nineteen",
+]
+ten = [  # tens words by index ("twenty" at 2 ... "ninety" at 9); indices 0 and 1 are empty strings
+    "",
+    "",
+    "twenty",
+    "thirty",
+    "forty",
+    "fifty",
+    "sixty",
+    "seventy",
+    "eighty",
+    "ninety",
+]
+mill = [  # magnitude words, each with a leading space; index 0 is a single space (presumably indexed by three-digit group - confirm against the consumer)
+    " ",
+    " thousand",
+    " million",
+    " billion",
+    " trillion",
+    " quadrillion",
+    " quintillion",
+    " sextillion",
+    " septillion",
+    " octillion",
+    " nonillion",
+    " decillion",
+]
+
+nth_suff = set(nth.values())  # the distinct suffix strings that appear in nth
+ordinal_suff = re.compile(fr"({'|'.join(ordinal)})\Z")  # matches any key of `ordinal` at the very end of a string
+
+NON_DIGIT = re.compile(r"\D")  # any single non-digit character
+WHITESPACES_COMMA = re.compile(r"\s+,")  # a whitespace run immediately followed by a comma
+COMMA_WORD = re.compile(r", (\S+)\s+\Z")  # ", <token>" then trailing whitespace at end of string; captures the token
+WHITESPACES = re.compile(r"\s+")  # one or more whitespace characters
+DIGIT_GROUP = re.compile(r"(\d)")  # one digit, captured
+TWO_DIGITS = re.compile(r"(\d)(\d)")  # two consecutive digits, captured separately
+THREE_DIGITS = re.compile(r"(\d)(\d)(\d)")  # three consecutive digits, captured separately
+THREE_DIGITS_WORD = re.compile(r"(\d)(\d)(\d)(?=\D*\Z)")  # three digits followed only by non-digits up to end of string
+TWO_DIGITS_WORD = re.compile(r"(\d)(\d)(?=\D*\Z)")  # two digits followed only by non-digits up to end of string
+ONE_DIGIT_WORD = re.compile(r"(\d)(?=\D*\Z)")  # one digit followed only by non-digits up to end of string
+FOUR_DIGIT_COMMA = re.compile(r"(\d)(\d{3}(?:,|\Z))")  # a digit, then three digits ending in a comma or at end of string
+
 qa_prompt_template ="""
 You are a distinguished professor known for your expertise in meticulously grading students' answers to questions. Your extensive knowledge and experience make you the go-to authority in your field.
 You have been entrusted with the evaluation of the following question: