
Commit c447fc8

Revert "Refactor kwargs and configs" (#299)
Revert "Refactor kwargs and configs (#188)". This reverts commit e4a2724.
1 parent e4a2724 commit c447fc8

74 files changed: 450 additions and 1,358 deletions

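Every file diff below follows the same pattern: the `Config` dataclasses introduced by the kwargs-and-configs refactor are deleted, `_info(self, config)` goes back to `_info(self)` without a `config=` field, and per-call options return to being keyword arguments of `compute()`. As orientation, here is a minimal sketch of the post-revert module shape; the class name, option, and returned key are hypothetical placeholders, not code from this repository.

```python
import datasets
import evaluate


class SketchMeasurement(evaluate.Measurement):
    """Hypothetical module illustrating the kwargs-based shape restored by this revert."""

    def _info(self):
        # After the revert: no `config` parameter and no `config=` field.
        return evaluate.MeasurementInfo(
            module_type="measurement",
            description="Toy placeholder measurement.",
            citation="",
            inputs_description="",
            features=datasets.Features({"data": datasets.Value("string")}),
        )

    def _compute(self, data, lowercase=False):
        # Options arrive as plain keyword arguments of compute(), not via self.config.
        if lowercase:
            data = [d.lower() for d in data]
        return {"unique_ratio": len(set(data)) / len(data)}
```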

comparisons/exact_match/exact_match.py

Lines changed: 1 addition & 2 deletions

@@ -46,13 +46,12 @@

 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class ExactMatch(evaluate.Comparison):
-    def _info(self, config):
+    def _info(self):
         return evaluate.ComparisonInfo(
             module_type="comparison",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            config=config,
             features=datasets.Features(
                 {
                     "predictions1": datasets.Value("int64"),

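For the comparison modules (`exact_match` above, `mcnemar` and `wilcoxon` below), only the `_info` signature changes; the call pattern stays keyword-based. A hedged usage sketch, assuming the two-prediction-list interface suggested by the `predictions1` feature in the diff:

```python
import evaluate

# Assumed interface: two lists of integer predictions compared element-wise.
exact_match = evaluate.load("exact_match", module_type="comparison")
results = exact_match.compute(predictions1=[1, 1, 1], predictions2=[1, 1, 0])
print(results)  # expected to report the share of positions where the two runs agree
```
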
comparisons/mcnemar/mcnemar.py

Lines changed: 1 addition & 2 deletions

@@ -62,13 +62,12 @@

 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class McNemar(evaluate.Comparison):
-    def _info(self, config):
+    def _info(self):
         return evaluate.ComparisonInfo(
             module_type="comparison",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            config=config,
             features=datasets.Features(
                 {
                     "predictions1": datasets.Value("int64"),

comparisons/wilcoxon/wilcoxon.py

Lines changed: 1 addition & 2 deletions

@@ -55,13 +55,12 @@

 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class Wilcoxon(evaluate.Comparison):
-    def _info(self, config):
+    def _info(self):
         return evaluate.ComparisonInfo(
             module_type="comparison",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            config=config,
             features=datasets.Features(
                 {
                     "predictions1": datasets.Value("float"),

docs/source/a_quick_tour.mdx

Lines changed: 0 additions & 18 deletions

@@ -65,7 +65,6 @@ All evalution modules come with a range of useful attributes that help to use a
 |---|---|
 |`description`|A short description of the evaluation module.|
 |`citation`|A BibTex string for citation when available.|
-|`config` | A `dataclass` containing the settings of the module. |
 |`features`|A `Features` object defining the input format.|
 |`inputs_description`|This is equivalent to the modules docstring.|
 |`homepage`|The homepage of the module.|

@@ -179,23 +178,6 @@ A common way to overcome this issue is to fallback on single process evaluation.

 This solution allows 🤗 Evaluate to perform distributed predictions, which is important for evaluation speed in distributed settings. At the same time, you can also use complex non-additive metrics without wasting valuable GPU or CPU memory.

-## Configuration
-
-Some metrics can be configured with additional settings. For example, `accuracy` has an extra `normalize` setting which returns the fraction of correctly classified samples and is set to `True` by default. To change it you have two options: pass it as a keyword argument with `load()` or during `compute()`. With `load()`, the setting is changed permanently for the module, while passing it to `compute()` only changes it for the duration of the `compute()` call.
-
-```python
-
->>> metric = evaluate.load("accuracy", normalize=False)
->>> refs, preds = [1, 1], [1, 0]
->>> acc_1 = metric.compute(references=refs, predictions=preds)["accuracy"]
->>> acc_2 = metric.compute(references=refs, predictions=preds, normalize=True)["accuracy"]
->>> acc_3 = metric.compute(references=refs, predictions=preds)["accuracy"]
->>> print((acc_1, acc_2, acc_3))
-(1.0, 0.5, 1.0)
-```
-
-This is also useful for the following `combine()` method since it allows to load modules with specific settings before combining them.
-
 ## Combining several evaluations

 Often one wants to not only evaluate a single metric but a range of different metrics capturing different aspects of a model. E.g. for classification it is usually a good idea to compute F1-score, recall, and precision in addition to accuracy to get a better picture of model performance. Naturally, you can load a bunch of metrics and call them sequentially. However, a more convenient way is to use the [`~evaluate.combine`] function to bundle them together:
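
With the Configuration section removed from the quick tour, the documented way to change a setting such as `accuracy`'s `normalize` is again to pass it to `compute()` on each call. A short sketch of that post-revert usage; the expected values mirror the example the deleted section reported:

```python
>>> import evaluate
>>> metric = evaluate.load("accuracy")
>>> refs, preds = [1, 1], [1, 0]
>>> metric.compute(references=refs, predictions=preds)["accuracy"]
0.5
>>> metric.compute(references=refs, predictions=preds, normalize=False)["accuracy"]
1.0
```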

measurements/label_distribution/label_distribution.py

Lines changed: 1 addition & 2 deletions

@@ -70,13 +70,12 @@

 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class LabelDistribution(evaluate.Measurement):
-    def _info(self, config):
+    def _info(self):
         return evaluate.MeasurementInfo(
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            config=config,
             features=[
                 datasets.Features({"data": datasets.Value("int32")}),
                 datasets.Features({"data": datasets.Value("string")}),

measurements/perplexity/perplexity.py

Lines changed: 10 additions & 29 deletions

@@ -13,9 +13,6 @@
 # limitations under the License.
 """Perplexity Metric."""

-from dataclasses import dataclass
-from typing import Optional
-
 import datasets
 import numpy as np
 import torch

@@ -87,29 +84,14 @@
 """


-@dataclass
-class PerplexityConfig(evaluate.info.Config):
-
-    name: str = "default"
-
-    batch_size: int = 16
-    model_id: str = "gpt2"
-    add_start_token: bool = True
-    device: Optional[str] = None
-
-
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class Perplexity(evaluate.Measurement):
-    CONFIG_CLASS = PerplexityConfig
-    ALLOWED_CONFIG_NAMES = ["default"]
-
-    def _info(self, config):
+    def _info(self):
         return evaluate.MeasurementInfo(
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            config=config,
             features=datasets.Features(
                 {
                     "data": datasets.Value("string"),

@@ -118,25 +100,24 @@ def _info(self, config):
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )

-    def _compute(self, data):
+    def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):

-        device = self.config.device
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
             if device == "gpu":
                 device = "cuda"
         else:
             device = "cuda" if torch.cuda.is_available() else "cpu"

-        model = AutoModelForCausalLM.from_pretrained(self.config.model_id)
+        model = AutoModelForCausalLM.from_pretrained(model_id)
         model = model.to(device)

-        tokenizer = AutoTokenizer.from_pretrained(self.config.model_id)
+        tokenizer = AutoTokenizer.from_pretrained(model_id)

         # if batch_size > 1 (which generally leads to padding being required), and
         # if there is not an already assigned pad_token, assign an existing
         # special token to also be the padding token
-        if tokenizer.pad_token is None and self.config.batch_size > 1:
+        if tokenizer.pad_token is None and batch_size > 1:
             existing_special_tokens = list(tokenizer.special_tokens_map_extended.values())
             # check that the model already has at least one special token defined
             assert (

@@ -145,7 +126,7 @@ def _compute(self, data):
             # assign one of the special tokens to also be the pad token
             tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

-        if self.config.add_start_token:
+        if add_start_token:
             # leave room for <BOS> token to be added:
             assert (
                 tokenizer.bos_token is not None

@@ -168,7 +149,7 @@
         attn_masks = encodings["attention_mask"]

         # check that each input is long enough:
-        if self.config.add_start_token:
+        if add_start_token:
             assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
         else:
             assert torch.all(

@@ -178,12 +159,12 @@
         ppls = []
         loss_fct = CrossEntropyLoss(reduction="none")

-        for start_index in logging.tqdm(range(0, len(encoded_texts), self.config.batch_size)):
-            end_index = min(start_index + self.config.batch_size, len(encoded_texts))
+        for start_index in logging.tqdm(range(0, len(encoded_texts), batch_size)):
+            end_index = min(start_index + batch_size, len(encoded_texts))
             encoded_batch = encoded_texts[start_index:end_index]
             attn_mask = attn_masks[start_index:end_index]

-            if self.config.add_start_token:
+            if add_start_token:
                 bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_batch.size(dim=0)).to(device)
                 encoded_batch = torch.cat([bos_tokens_tensor, encoded_batch], dim=1)
                 attn_mask = torch.cat(
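
Given the reverted `_compute(self, data, model_id, batch_size=16, add_start_token=True, device=None)` signature above, a hedged sketch of the kwargs-based call (requires `torch` and a model download; the exact output keys are not asserted here):

```python
import evaluate

perplexity = evaluate.load("perplexity", module_type="measurement")
results = perplexity.compute(
    data=["lorem ipsum", "Happy Birthday!"],
    model_id="gpt2",          # passed per call instead of via a PerplexityConfig
    batch_size=2,
    add_start_token=True,
)
print(results)  # expected to contain per-text perplexities and an aggregate
```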

measurements/regard/regard.py

Lines changed: 6 additions & 19 deletions

@@ -15,10 +15,8 @@
 """ Regard measurement. """

 from collections import defaultdict
-from dataclasses import dataclass
 from operator import itemgetter
 from statistics import mean
-from typing import Optional

 import datasets
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

@@ -117,28 +115,16 @@ def regard(group, regard_classifier):
     return group_regard, dict(group_scores)


-@dataclass
-class RegardConfig(evaluate.info.Config):
-
-    name: str = "default"
-
-    aggregation: Optional[str] = None
-
-
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class Regard(evaluate.Measurement):
-    CONFIG_CLASS = RegardConfig
-    ALLOWED_CONFIG_NAMES = ["default", "compare"]
-
-    def _info(self, config):
+    def _info(self):
         if self.config_name not in ["compare", "default"]:
             raise KeyError("You should supply a configuration name selected in " '["config", "default"]')
         return evaluate.MeasurementInfo(
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            config=config,
             features=datasets.Features(
                 {
                     "data": datasets.Value("string", id="sequence"),

@@ -164,6 +150,7 @@ def _compute(
         self,
         data,
         references=None,
+        aggregation=None,
     ):
         if self.config_name == "compare":
             pred_scores, pred_regard = regard(data, self.regard_classifier)

@@ -172,22 +159,22 @@ def _compute(
             pred_max = {k: max(v) for k, v in pred_regard.items()}
             ref_mean = {k: mean(v) for k, v in ref_regard.items()}
             ref_max = {k: max(v) for k, v in ref_regard.items()}
-            if self.config.aggregation == "maximum":
+            if aggregation == "maximum":
                 return {
                     "max_data_regard": pred_max,
                     "max_references_regard": ref_max,
                 }
-            elif self.config.aggregation == "average":
+            elif aggregation == "average":
                 return {"average_data_regard": pred_mean, "average_references_regard": ref_mean}
             else:
                 return {"regard_difference": {key: pred_mean[key] - ref_mean.get(key, 0) for key in pred_mean}}
         else:
             pred_scores, pred_regard = regard(data, self.regard_classifier)
             pred_mean = {k: mean(v) for k, v in pred_regard.items()}
             pred_max = {k: max(v) for k, v in pred_regard.items()}
-            if self.config.aggregation == "maximum":
+            if aggregation == "maximum":
                 return {"max_regard": pred_max}
-            elif self.config.aggregation == "average":
+            elif aggregation == "average":
                 return {"average_regard": pred_mean}
             else:
                 return {"regard": pred_scores}
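
Matching the reverted `_compute` above, `aggregation` is once more a `compute()` keyword argument, and the output keys (`max_regard`, `average_regard`, `regard`) come straight from the return statements in the diff. A hedged usage sketch:

```python
import evaluate

regard = evaluate.load("regard", module_type="measurement")
results = regard.compute(
    data=["she was a doctor", "he was a nurse"],
    aggregation="maximum",  # or "average"; omit it to get the raw per-text scores
)
print(results["max_regard"])
```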

measurements/text_duplicates/text_duplicates.py

Lines changed: 4 additions & 16 deletions

@@ -14,7 +14,6 @@

 import hashlib
 from collections import Counter
-from dataclasses import dataclass

 import datasets


@@ -58,29 +57,18 @@ def get_hash(example):
     return hashlib.md5(example.strip().encode("utf-8")).hexdigest()


-@dataclass
-class TextDuplicatesConfig(evaluate.info.Config):
-
-    name: str = "default"
-
-    list_duplicates: bool = False
-
-
 @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
 class TextDuplicates(evaluate.Measurement):
     """This measurement returns the duplicate strings contained in the input(s)."""

-    CONFIG_CLASS = TextDuplicatesConfig
-    ALLOWED_CONFIG_NAMES = ["default"]
-
-    def _info(self, config):
+    def _info(self):
+        # TODO: Specifies the evaluate.MeasurementInfo object
         return evaluate.MeasurementInfo(
             # This is the description that will appear on the modules page.
             module_type="measurement",
             description=_DESCRIPTION,
             citation=_CITATION,
             inputs_description=_KWARGS_DESCRIPTION,
-            config=config,
             # This defines the format of each prediction and reference
             features=datasets.Features(
                 {

@@ -89,9 +77,9 @@ def _info(self, config):
             ),
         )

-    def _compute(self, data):
+    def _compute(self, data, list_duplicates=False):
         """Returns the duplicates contained in the input data and the number of times they are repeated."""
-        if self.config.list_duplicates == True:
+        if list_duplicates == True:
             logger.warning("This functionality can be memory-intensive for large datasets!")
             n_dedup = len(set([get_hash(d) for d in data]))
             c = Counter(data)
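
Likewise for `text_duplicates`: per the reverted `_compute(self, data, list_duplicates=False)` signature, the flag is passed at call time. A hedged usage sketch:

```python
import evaluate

duplicates = evaluate.load("text_duplicates", module_type="measurement")
results = duplicates.compute(
    data=["hello", "hello", "general kenobi"],
    list_duplicates=True,  # the module warns this can be memory-intensive on large datasets
)
print(results)
```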

measurements/toxicity/README.md

Lines changed: 7 additions & 7 deletions

@@ -30,7 +30,7 @@ The model should be compatible with the AutoModelForSequenceClassification class
 For more information, see [the AutoModelForSequenceClassification documentation]( https://huggingface.co/docs/transformers/master/en/model_doc/auto#transformers.AutoModelForSequenceClassification).

 Args:
-    `data` (list of str): prediction/candidate sentences
+    `predictions` (list of str): prediction/candidate sentences
     `toxic_label` (str) (optional): the toxic label that you want to detect, depending on the labels that the model has been trained on.
     This can be found using the `id2label` function, e.g.:
     ```python

@@ -47,7 +47,7 @@ Args:

 ## Output values

-`toxicity`: a list of toxicity scores, one for each sentence in `data` (default behavior)
+`toxicity`: a list of toxicity scores, one for each sentence in `predictions` (default behavior)

 `max_toxicity`: the maximum toxicity over all scores (if `aggregation` = `maximum`)


@@ -62,31 +62,31 @@ Args:
 ```python
 >>> toxicity = evaluate.load("toxicity", module_type="measurement")
 >>> input_texts = ["she went to the library", "he is a douchebag"]
->>> results = toxicity.compute(data=input_texts)
+>>> results = toxicity.compute(predictions=input_texts)
 >>> print([round(s, 4) for s in results["toxicity"]])
 [0.0002, 0.8564]
 ```
 Example 2 (returns ratio of toxic sentences):
 ```python
 >>> toxicity = evaluate.load("toxicity", module_type="measurement")
 >>> input_texts = ["she went to the library", "he is a douchebag"]
->>> results = toxicity.compute(data=input_texts, aggregation="ratio")
+>>> results = toxicity.compute(predictions=input_texts, aggregation="ratio")
 >>> print(results['toxicity_ratio'])
 0.5
 ```
 Example 3 (returns the maximum toxicity score):
 ```python
 >>> toxicity = evaluate.load("toxicity", module_type="measurement")
 >>> input_texts = ["she went to the library", "he is a douchebag"]
->>> results = toxicity.compute(data=input_texts, aggregation="maximum")
+>>> results = toxicity.compute(predictions=input_texts, aggregation="maximum")
 >>> print(round(results['max_toxicity'], 4))
 0.8564
 ```
 Example 4 (uses a custom model):
 ```python
->>> toxicity = evaluate.load("toxicity", model_name='DaNLP/da-electra-hatespeech-detection')
+>>> toxicity = evaluate.load("toxicity", 'DaNLP/da-electra-hatespeech-detection')
 >>> input_texts = ["she went to the library", "he is a douchebag"]
->>> results = toxicity.compute(data=input_texts, toxic_label='offensive')
+>>> results = toxicity.compute(predictions=input_texts, toxic_label='offensive')
 >>> print([round(s, 4) for s in results["toxicity"]])
 [0.0176, 0.0203]
 ```
