Skip to content
1 change: 0 additions & 1 deletion .github/workflows/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,5 @@ jsonlines
openai
langchain
evaluate
inflect
rouge_score
typing-extensions < 4.6.0
42 changes: 16 additions & 26 deletions nlptest/transform/robustness.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
import asyncio
import random
import re
from inflect import engine
from abc import ABC, abstractmethod
from typing import Dict, List, Optional
from nlptest.modelhandler.modelhandler import ModelFactory
from .utils import (CONTRACTION_MAP, TYPO_FREQUENCY, default_user_prompt ,ocr_typo_dict, abbreviation_dict)
from ..utils.custom_types import Sample, Span, Transformation
from ..utils.number_to_word import engine
from typing import List
import string
from ..utils.SoundsLikeFunctions import Search
Expand Down Expand Up @@ -622,15 +622,14 @@ def convert_numbers(regex, text):
token = match.group()
words = NumberToWord.infEng.number_to_words(
token, wantlist=True)
token_len = len(token) - 1
new_words_len = len(' '.join(words)) - 1
new_words_len = len(' '.join(words))
trans.append(text[start_offset:match.start()])
trans.append(' '.join(words))
start_offset = match.end()
transformations.append(
Transformation(
original_span=Span(
start=match.start(), end=match.end()-1, word=token),
start=match.start(), end=match.end(), word=token),
new_span=Span(start=match.start(), end=match.start(
)+new_words_len, word=' '.join(words)),
ignore=False
Expand All @@ -644,17 +643,14 @@ def convert_numbers(regex, text):

for idx, sample in enumerate(sample_list):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't refactor this section; it handles the case where the function is given a list of either samples or plain strings.

if isinstance(sample, str):
sample_list[idx] = convert_numbers(
r'(?<!\S)\d+(\.\d+)?(\.)?(?=(\s|\n|$))', sample)
sample_list[idx] = convert_numbers(r'(?<!\S)(\d+(\.\d+)?)(?=(\s|\n|$))', sample)
else:
sample.test_case, transformations = convert_numbers(
r'(?<!\S)\d+(\.\d+)?(\.)?(?=(\s|\n|$))', sample.original)
sample.test_case, transformations = convert_numbers(r'(?<!\S)(\d+(\.\d+)?)(?=(\s|\n|$))', sample.original)
if sample.task in ("ner", "text-classification"):
sample.transformations = transformations
sample.category = "robustness"
return sample_list


class AddOcrTypo(BaseRobustness):
alias_name = "add_ocr_typo"

Expand Down Expand Up @@ -758,25 +754,19 @@ def insert_abbreviation(text):
ignore=False
)
)
sample.category = "robustness"
if sample.task in ("ner", "text-classification"):
sample.transformations = transformations
return perturbed_text

for sample in sample_list:
if sample.task == 'question-answering':
sample.perturbed_question = insert_abbreviation(sample.original_question)

if "perturbed_context" in sample.__annotations__:
sample.perturbed_context = insert_abbreviation(sample.original_context)
return perturbed_text, transformations

for idx, sample in enumerate(sample_list):
if isinstance(sample, str):
sample_list[idx] = insert_abbreviation(sample)
else:
sample.test_case = insert_abbreviation(sample.original)
sample.category = "robustness"

return sample_list


sample.test_case, transformations = insert_abbreviation(sample.original)
if sample.task in ("ner", "text-classification"):
sample.transformations = transformations
sample.category = "robustness"

return sample_list

class AddSpeechToTextTypo(BaseRobustness):
alias_name = "add_speech_to_text_typo"

Expand Down
84 changes: 83 additions & 1 deletion nlptest/transform/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from collections import defaultdict
from typing import Dict, List

import re
import pandas as pd

from nlptest.utils.custom_types import NERPrediction, Sample, SequenceLabel, NEROutput, SequenceClassificationOutput
Expand Down Expand Up @@ -7200,6 +7200,88 @@ def get_entity_representation_proportions(entity_representation):
"xsum": "You are an intelligent Context summarizer. Please read the following context carefully. After understanding its content, create a concise summary, capturing the essential themes and key details. Please ensure that the summary does not end abruptly and remains within the max_tokens word limit. Context: {context}\n\n Summary: "
}

# Ordinal suffix lookup keyed by integer: the single digits 0-9 plus the
# 11/12/13 exceptions, which take "th" rather than "st"/"nd"/"rd".
# NOTE(review): there is deliberately no entry for 10 -- presumably callers
# reduce the number modulo 100 and then modulo 10 before the lookup; confirm
# against the number-to-word code that consumes this table.
nth = {
    **dict.fromkeys(range(10), "th"),
    1: "st",
    2: "nd",
    3: "rd",
    **dict.fromkeys((11, 12, 13), "th"),
}

# Cardinal word endings whose ordinal form is irregular, mapped to that
# ordinal form. Insertion order is kept as-is because the keys are joined,
# in order, into a regex alternation elsewhere in this module.
ordinal = {
    "ty": "tieth",
    "one": "first",
    "two": "second",
    "three": "third",
    "five": "fifth",
    "eight": "eighth",
    "nine": "ninth",
    "twelve": "twelfth",
}

# Words for the digits 0-9, indexed by digit; zero maps to the empty string.
unit = [""] + "one two three four five six seven eight nine".split()

# Words for 10-19, indexed by the units digit (teen[0] == "ten").
teen = (
    "ten eleven twelve thirteen fourteen "
    "fifteen sixteen seventeen eighteen nineteen"
).split()

# Words for the multiples of ten, indexed by the tens digit. Slots 0 and 1
# are empty strings, so the first real entry is ten[2] == "twenty".
ten = ["", ""] + (
    "twenty thirty forty fifty sixty seventy eighty ninety"
).split()

# Scale words, one per successive group of three digits. Index 0 is a single
# space (no scale word); every other entry carries a leading space.
mill = [" "] + [
    f" {scale}"
    for scale in (
        "thousand",
        "million",
        "billion",
        "trillion",
        "quadrillion",
        "quintillion",
        "sextillion",
        "septillion",
        "octillion",
        "nonillion",
        "decillion",
    )
]

# Distinct ordinal suffix strings that occur as values of `nth`.
nth_suff = {*nth.values()}
# Matches any key of `ordinal` sitting at the very end of the string and
# captures it in group 1.
ordinal_suff = re.compile(r"(%s)\Z" % "|".join(ordinal))

# -- Whitespace / punctuation helpers ---------------------------------------
# A single non-digit character.
NON_DIGIT = re.compile(r"\D")
# A run of whitespace immediately followed by a comma.
WHITESPACES_COMMA = re.compile(r"\s+,")
# ", <word>" followed by trailing whitespace at the end of the string;
# group 1 is the word.
COMMA_WORD = re.compile(r", (\S+)\s+\Z")
# One or more consecutive whitespace characters.
WHITESPACES = re.compile(r"\s+")

# -- Digit grouping: one, two or three adjacent digits, each captured -------
DIGIT_GROUP = re.compile(r"(\d)")
TWO_DIGITS = re.compile(r"(\d)(\d)")
THREE_DIGITS = re.compile(r"(\d)(\d)(\d)")

# -- Same groupings, but only where nothing except non-digits follows up to
# the end of the string (i.e. the final digits in the text) ----------------
THREE_DIGITS_WORD = re.compile(r"(\d)(\d)(\d)(?=\D*\Z)")
TWO_DIGITS_WORD = re.compile(r"(\d)(\d)(?=\D*\Z)")
ONE_DIGIT_WORD = re.compile(r"(\d)(?=\D*\Z)")

# A digit (group 1) followed by three digits that are closed by a comma or
# the end of the string (group 2, including the comma when present).
FOUR_DIGIT_COMMA = re.compile(r"(\d)(\d{3}(?:,|\Z))")

qa_prompt_template ="""
You are a distinguished professor known for your expertise in meticulously grading students' answers to questions. Your extensive knowledge and experience make you the go-to authority in your field.
You have been entrusted with the evaluation of the following question:
Expand Down
Loading