diff --git a/nlptest/transform/robustness.py b/nlptest/transform/robustness.py index b1c96115e..277a61dba 100644 --- a/nlptest/transform/robustness.py +++ b/nlptest/transform/robustness.py @@ -6,7 +6,7 @@ from nlptest.modelhandler.modelhandler import ModelFactory from .utils import (CONTRACTION_MAP, TYPO_FREQUENCY, default_user_prompt ,ocr_typo_dict,abbreviation_dict,Slang_Nouns, Slang_Adverbs, Slang_Adjectives) from ..utils.custom_types import Sample, Span, Transformation -from ..utils.number_to_word import engine +from ..utils.number_to_word import ConvertNumberToWord from typing import List import string from ..utils.SoundsLikeFunctions import Search @@ -599,7 +599,7 @@ def search_contraction(text): class NumberToWord(BaseRobustness): alias_name = "number_to_word" - infEng = engine() + num = ConvertNumberToWord() @staticmethod def transform(sample_list: List[Sample]) -> List[Sample]: @@ -620,8 +620,7 @@ def convert_numbers(regex, text): for match in re.finditer(regex, text): token = match.group() - words = NumberToWord.infEng.number_to_words( - token, wantlist=True) + words = NumberToWord.num.number_to_words(token, wantlist=True) new_words_len = len(' '.join(words)) trans.append(text[start_offset:match.start()]) trans.append(' '.join(words)) @@ -630,20 +629,18 @@ def convert_numbers(regex, text): Transformation( original_span=Span( start=match.start(), end=match.end(), word=token), - new_span=Span(start=match.start(), end=match.start( - )+new_words_len, word=' '.join(words)), + new_span=Span(start=match.start(), end=match.start()+new_words_len, word=' '.join(words)), ignore=False ) ) - + trans.append(text[start_offset:]) results.append(''.join(trans)) - return ''.join(results), transformations for idx, sample in enumerate(sample_list): if isinstance(sample, str): - sample_list[idx] = convert_numbers(r'(? None: + if STDOUT_ON: + print(txt) class BadChunkingOptionError(Exception): pass @@ -26,14 +31,59 @@ class BadChunkingOptionError(Exception): class NumOutOfRangeError(Exception): pass -STDOUT_ON = False -def print3(txt: str) -> None: - if STDOUT_ON: - print(txt) - -class engine: +class ConvertNumberToWord: def __init__(self) -> None: self.mill_count = 0 + + def millfn(self, ind: int = 0) -> str: + if ind >= len(mill): + print3("Number out of range") + raise NumOutOfRangeError + return mill[ind] + + def tenfn(self, tens: int, units: int, mindex: int = 0) -> str: + if tens != 1: + tens_part = ten[tens] + hyphen = "-" if tens and units else "" + unit_part = unit[units] + mill_part = self.millfn(mindex) + return f"{tens_part}{hyphen}{unit_part}{mill_part}" + return f"{teen[units]}{mill[mindex]}" + + def group1sub(self, mo: Match) -> str: + units = int(mo.group(1)) + if units == 1: + return f" {self.number_args['one']}, " + elif units: + return f"{unit[units]}, " + else: + return f" {self.number_args['zero']}, " + + def group1bsub(self, mo: Match) -> str: + units = int(mo.group(1)) + if units: + return f"{unit[units]}, " + else: + return f" {self.number_args['zero']}, " + + def group2sub(self, mo: Match) -> str: + tens = int(mo.group(1)) + units = int(mo.group(2)) + if tens: + return f"{self.tenfn(tens, units)}, " + if units: + return f" {self.number_args['zero']} {unit[units]}, " + return f" {self.number_args['zero']} {self.number_args['zero']}, " + + def group3sub(self, mo: Match) -> str: + hundreds = int(mo.group(1)) + tens = int(mo.group(2)) + units = int(mo.group(3)) + number_args = self.number_args + hunword = f" {number_args['one']}" if hundreds == 1 else str(unit[hundreds]) if hundreds else f" {number_args['zero']}" + tenword = self.tenfn(tens, units) if tens else f" {number_args['zero']} {unit[units]}" if units else f" {number_args['zero']} {number_args['zero']}" + return f"{hunword} {tenword}, " + def number_to_words( self, num: Union[Number, Word], @@ -47,145 +97,117 @@ def number_to_words( threshold: Optional[int] = None, ) -> Union[str, List[str]]: """ - Return a number in words. - - group = 1, 2 or 3 to group numbers before turning into words - comma: define comma + Return a number in words. - andword: - word for 'and'. Can be set to ''. - e.g. "one hundred and one" vs "one hundred one" + Args: + num (Union[Number, Word]): The number to convert to words. + wantlist (bool, optional): Whether to return a list of words instead of a single string. Defaults to False. + group (int, optional): The grouping option for numbers. Can be 1, 2, or 3. Defaults to 0. + comma (Union[Falsish, str], optional): The comma separator. Defaults to ",". + andword (str, optional): The word for 'and'. Can be set to '' for no 'and'. Defaults to "and". + zero (str, optional): The word for '0'. Defaults to "zero". + one (str, optional): The word for '1'. Defaults to "one". + decimal (Union[Falsish, str], optional): The word for the decimal point. Defaults to "point". + threshold (Optional[int], optional): Numbers above this threshold will not be turned into words. Defaults to None. - zero: word for '0' - one: word for '1' - decimal: word for decimal point - threshold: numbers above threshold not turned into words + Returns: + Union[str, List[str]]: The number in words as a string or a list of words. - parameters not remembered from last call. Departure from Perl version. - """ - self._number_args = {"andword": andword, "zero": zero, "one": one} + Notes: + - Parameters are not remembered from the last call (departure from Perl version). + """ + self.number_args = {"andword": andword, "zero": zero, "one": one} num = str(num) # Handle "stylistic" conversions (up to a given threshold)... if threshold is not None and float(num) > threshold: spnum = num.split(".", 1) while comma: - (spnum[0], n) = FOUR_DIGIT_COMMA.subn(r"\1,\2", spnum[0]) + spnum[0], n = FOUR_DIGIT_COMMA.subn(r"\1,\2", spnum[0]) if n == 0: break try: return f"{spnum[0]}.{spnum[1]}" except IndexError: return str(spnum[0]) - + if group < 0 or group > 3: raise BadChunkingOptionError + nowhite = num.lstrip() - if nowhite[0] == "+": - sign = "plus" - elif nowhite[0] == "-": - sign = "minus" - else: - sign = "" + sign = "plus" if nowhite.startswith("+") else "minus" if nowhite.startswith("-") else "" if num in nth_suff: num = zero - myord = num[-2:] in nth_suff + myord = num.endswith(tuple(nth_suff)) if myord: num = num[:-2] + finalpoint = False + if decimal: - if group != 0: - chunks = num.split(".") - else: - chunks = num.split(".", 1) - if chunks[-1] == "": # remove blank string if nothing after decimal - chunks = chunks[:-1] - finalpoint = True # add 'point' to end of output + chunks = num.split(".", 1) if group == 0 else num.split(".") + finalpoint = chunks[-1] == "" + if finalpoint: + chunks.pop() + else: chunks = [num] - first: Union[int, str, bool] = 1 - loopstart = 0 - - if chunks[0] == "": - first = 0 - if len(chunks) > 1: - loopstart = 1 + first = 1 if chunks[0] else 0 + loopstart = 1 if not first and len(chunks) > 1 else 0 for i in range(loopstart, len(chunks)): - chunk = chunks[i] - # remove all non numeric \D - chunk = NON_DIGIT.sub("", chunk) - if chunk == "": - chunk = "0" + chunk = NON_DIGIT.sub("", chunks[i] or "0") - if group == 0 and (first == 0 or first == ""): + if group == 0 and (not first or first == ""): chunk = self.enword(chunk, 1) else: chunk = self.enword(chunk, group) - if chunk[-2:] == ", ": - chunk = chunk[:-2] + chunk = chunk.rstrip(", ") chunk = WHITESPACES_COMMA.sub(",", chunk) - if group == 0 and first: chunk = COMMA_WORD.sub(f" {andword} \\1", chunk) chunk = WHITESPACES.sub(" ", chunk) - # chunk = re.sub(r"(\A\s|\s\Z)", self.blankfn, chunk) chunk = chunk.strip() + if first: first = "" + chunks[i] = chunk numchunks = [] if first != 0: numchunks = chunks[0].split(f"{comma} ") - if myord and numchunks: - # TODO: can this be just one re as it is in perl? - mo = ordinal_suff.search(numchunks[-1]) - if mo: - numchunks[-1] = ordinal_suff.sub(ordinal[mo.group(1)], numchunks[-1]) - else: - numchunks[-1] += "th" - - for chunk in chunks[1:]: - numchunks.append(decimal) - numchunks.extend(chunk.split(f"{comma} ")) - - if finalpoint: - numchunks.append(decimal) - - # wantlist: Perl list context. can explicitly specify in Python - if wantlist: - if sign: - numchunks = [sign] + numchunks - return numchunks - elif group: - signout = f"{sign} " if sign else "" - return f"{signout}{', '.join(numchunks)}" - else: - signout = f"{sign} " if sign else "" - num = f"{signout}{numchunks.pop(0)}" - if decimal is None: - first = True + if myord and numchunks: + last_chunk = numchunks[-1] + mo = ordinal_suff.search(last_chunk) + numchunks[-1] = ordinal_suff.sub(ordinal.get(mo.group(1), ""), last_chunk) if mo else last_chunk + "th" + + for chunk in chunks[1:]: + numchunks.append(decimal) + numchunks.extend(chunk.split(f"{comma} ")) + + if finalpoint: + numchunks.append(decimal) + + if wantlist: + numchunks = [sign] + numchunks if sign else numchunks + return numchunks + elif group: + signout = f"{sign} " if sign else "" + return f"{signout}{', '.join(numchunks)}" else: - first = not num.endswith(decimal) - for nc in numchunks: - if nc == decimal: - num += f" {nc}" - first = 0 - elif first: - num += f"{comma} {nc}" - else: - num += f" {nc}" - return num + signout = f"{sign} " if sign else "" + num = f"{signout}{numchunks.pop(0)}" + first = True if decimal is None else not num.endswith(decimal) + num += "".join([f" {nc}" if not first else f"{comma} {nc}" if nc == decimal else f" {nc}" for nc in numchunks]) + return num + def enword(self, num: str, group: int) -> str: - # import pdb - # pdb.set_trace() - if group == 1: num = DIGIT_GROUP.sub(self.group1sub, num) elif group == 2: @@ -196,19 +218,21 @@ def enword(self, num: str, group: int) -> str: num = TWO_DIGITS.sub(self.group2sub, num, 1) num = DIGIT_GROUP.sub(self.group1sub, num, 1) elif int(num) == 0: - num = self._number_args["zero"] + return self.number_args["zero"] elif int(num) == 1: - num = self._number_args["one"] - else: - num = num.lstrip().lstrip("0") - self.mill_count = 0 - # surely there's a better way to do the next bit + return self.number_args["one"] + + num = num.lstrip().lstrip("0") + self.mill_count = 0 + while True: mo = THREE_DIGITS_WORD.search(num) - while mo: - num = THREE_DIGITS_WORD.sub(self.hundsub, num, 1) - mo = THREE_DIGITS_WORD.search(num) - num = TWO_DIGITS_WORD.sub(self.tensub, num, 1) - num = ONE_DIGIT_WORD.sub(self.unitsub, num, 1) + if not mo: + break + num = THREE_DIGITS_WORD.sub(self.hundsub, num, 1) + + num = TWO_DIGITS_WORD.sub(self.tensub, num, 1) + num = ONE_DIGIT_WORD.sub(self.unitsub, num, 1) + return num def hundsub(self, mo: Match) -> str: ret = self.hundfn( @@ -216,6 +240,7 @@ def hundsub(self, mo: Match) -> str: ) self.mill_count += 1 return ret + def millfn(self, ind: int = 0) -> str: if ind > len(mill) - 1: print3("number out of range") @@ -225,13 +250,10 @@ def millfn(self, ind: int = 0) -> str: def unitfn(self, units: int, mindex: int = 0) -> str: return f"{unit[units]}{self.millfn(mindex)}" - def tenfn(self, tens, units, mindex=0) -> str: + def tenfn(self, tens: int, units: int, mindex: int = 0) -> str: if tens != 1: tens_part = ten[tens] - if tens and units: - hyphen = "-" - else: - hyphen = "" + hyphen = "-" if tens and units else "" unit_part = unit[units] mill_part = self.millfn(mindex) return f"{tens_part}{hyphen}{unit_part}{mill_part}" @@ -239,8 +261,7 @@ def tenfn(self, tens, units, mindex=0) -> str: def hundfn(self, hundreds: int, tens: int, units: int, mindex: int) -> str: if hundreds: - andword = f" {self._number_args['andword']} " if tens or units else "" - # use unit not unitfn as simpler + andword = f" {self.number_args['andword']} " if tens or units else "" return ( f"{unit[hundreds]} hundred{andword}" f"{self.tenfn(tens, units)}{self.millfn(mindex)}, " @@ -248,7 +269,8 @@ def hundfn(self, hundreds: int, tens: int, units: int, mindex: int) -> str: if tens or units: return f"{self.tenfn(tens, units)}{self.millfn(mindex)}, " return "" + def tensub(self, mo: Match) -> str: return f"{self.tenfn(int(mo.group(1)), int(mo.group(2)), self.mill_count)}, " def unitsub(self, mo: Match) -> str: - return f"{self.unitfn(int(mo.group(1)), self.mill_count)}, " + return f"{self.unitfn(int(mo.group(1)), self.mill_count)}, " \ No newline at end of file diff --git a/tests/test_robustness.py b/tests/test_robustness.py index a4f978ab6..ccad14660 100644 --- a/tests/test_robustness.py +++ b/tests/test_robustness.py @@ -156,7 +156,6 @@ def test_number_to_word(self) -> None: transformed_samples = NumberToWord.transform(self.number_sentences) self.assertIsInstance(transformed_samples, list) - def test_add_ocr_typo(self) -> None: """""" expected_corrected_sentences = [ "Tbis organization's a^rt c^an w^in tougb acts.", @@ -171,8 +170,7 @@ def test_add_ocr_typo(self) -> None: def test_abbreviation_insertion(self) -> None: """""" transformed_samples = AbbreviationInsertion.transform(self.abbreviation_sentences) - self.assertIsInstance(transformed_samples, list) - + self.assertIsInstance(transformed_samples, list) def test_add_speech_to_text_typo(self) -> None: """"""