Update codeparrot data preprocessing (#16944)

loubnabnl · lvwerra · Loubna ben allal · web-flow · commit e730e1256732 · 2022-05-16T14:43:25.000+02:00
* add new preprocessing arguments

* add new filters

* add new filters to readme

* fix config and test count, update function names and docstrings

* reformat code

* update readme

* Update readme

* rename config_test filter

Co-authored-by: Leandro von Werra &lt;lvwerra@users.noreply.github.com&gt;

* rename few_assignments filter

Co-authored-by: Leandro von Werra &lt;lvwerra@users.noreply.github.com&gt;

* rename tokenizer in arguments

Co-authored-by: Leandro von Werra &lt;lvwerra@users.noreply.github.com&gt;

* rename functions and add limit_line argument for config_test filter

* update threshold for config_test filter

Co-authored-by: Leandro von Werra &lt;lvwerra@users.noreply.github.com&gt;
Co-authored-by: Loubna ben allal &lt;loubnabenallal@gmail.com&gt;
diff --git a/examples/research_projects/codeparrot/README.md b/examples/research_projects/codeparrot/README.md
@@ -37,20 +37,25 @@ Additionally, sure you have git-lfs installed. You can find instructions for how
 The source of the dataset is the GitHub dump available on Google's [BigQuery](https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code). The database was queried for all Python files with less than 1MB in size resulting in a 180GB dataset with over 20M files. The dataset is available on the Hugging Face Hub [here](https://huggingface.co/datasets/transformersbook/codeparrot).
 
 ### Preprocessing
-The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374):
+The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374) and some new ones:
 
 - exact deduplication using each file's hash
 - filtering files with max line length > 1000
 - filtering files with mean line length > 100
 - fraction of alphanumeric characters < 0.25
 - containing the word "auto-generated" or similar in the first 5 lines
+- filtering with a probability of 0.7 of files with a mention of "test file" or "configuration file" or similar in the first 5 lines
+- filtering with a probability of 0.7 of files with a high occurrence of the keywords "test " or "config"
+- filtering with a probability of 0.7 of files without a mention of the keywords `def`, `for`, `while` and `class`
+- filtering files that use the assignment operator `=` fewer than 5 times
+- filtering files with ratio between number of characters and number of tokens after tokenization < 1.5 (the average ratio is 3.6)
 
-The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/lvwerra/codeparrot-clean-train) and [validation](https://huggingface.co/datasets/lvwerra/codeparrot-clean-valid) splits are also available on the Hub if you want to skip this step or use the data for another project.
+The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-train-v2) and [validation](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-valid-v2) splits are also available on the Hub if you want to skip this step or use the data for another project.
 
 To execute the preprocessing run the following command:
 ```bash
 python scripts/preprocessing.py \
---dataset_name lvwerra/codeparrot \
+--dataset_name transformersbook/codeparrot \
 --output_dir codeparrot-clean
 ```
 During preprocessing the dataset is downloaded and stored locally as well as caches of the computations. Make sure you have more than 500GB free disk space to execute it.
diff --git a/examples/research_projects/codeparrot/scripts/arguments.py b/examples/research_projects/codeparrot/scripts/arguments.py
@@ -133,7 +133,7 @@ class PreprocessingArguments:
         },
     )
     dataset_name: Optional[str] = field(
-        default="codeparrot", metadata={"help": "Folder or name of dataset to process."}
+        default="transformersbook/codeparrot", metadata={"help": "Folder or name of dataset to process."}
     )
     output_dir: Optional[str] = field(
         default="codeparrot-clean", metadata={"help": "Folder to save processed processed dataset."}
@@ -151,6 +151,16 @@ class PreprocessingArguments:
     alpha_frac: Optional[float] = field(
         default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."}
     )
+    min_token_ratio: Optional[float] = field(
+        default=1.5, metadata={"help": "Minimum character token ratio for the file, otherwise file is filtered."}
+    )
+    filter_proba: Optional[float] = field(
+        default=0.7, metadata={"help": "Probability for filtering config, test and uncommon files."}
+    )
+    tokenizer: Optional[str] = field(
+        default="lvwerra/codeparrot",
+        metadata={"help": "Name or path to the tokenizer."},
+    )
 
 
 @dataclass
diff --git a/examples/research_projects/codeparrot/scripts/preprocessing.py b/examples/research_projects/codeparrot/scripts/preprocessing.py
@@ -9,7 +9,7 @@
 from datasets import load_dataset
 
 from arguments import PreprocessingArguments
-from transformers import HfArgumentParser
+from transformers import AutoTokenizer, HfArgumentParser
 
 
 def get_hash(example):
@@ -50,18 +50,77 @@ def is_autogenerated(example, scan_width=5):
         return {"autogenerated": False}
 
 
+def is_config_or_test(example, scan_width=5, coeff=0.05):
+    """Flag a file as a configuration file or a unit test (returns {"config_or_test": bool}) when either:
+    1- one of the keyword phrases appears (case-insensitively) in the first `scan_width` lines of the file, or
+    2- the number of occurrences of 'config' or of 'test' exceeds int(`coeff` * number of newlines in the file).
+    """
+
+    keywords = ["unit tests", "test file", "configuration file"]
+    lines = example["content"].splitlines()
+    count_config = 0
+    count_test = 0
+    # first test: look for a keyword phrase in the first `scan_width` lines only
+    for _, line in zip(range(scan_width), lines):
+        for keyword in keywords:
+            if keyword in line.lower():
+                return {"config_or_test": True}
+    # second test: running substring counts compared against a threshold proportional to the newline count
+    nlines = example["content"].count("\n")
+    threshold = int(coeff * nlines)
+    for line in lines:
+        count_config += line.lower().count("config")
+        count_test += line.lower().count("test")
+        if count_config > threshold or count_test > threshold:
+            return {"config_or_test": True}
+    return {"config_or_test": False}
+
+
+def has_no_keywords(example):
+    """Check if a python file has none of the keywords for: function, class, for loop, while loop."""
+    keywords = ["def ", "class ", "for ", "while "]
+    lines = example["content"].splitlines()
+    for line in lines:
+        for keyword in keywords:
+            if keyword in line.lower():  # plain substring match on the lowercased line
+                return {"has_no_keywords": False}
+    return {"has_no_keywords": True}
+
+
+def has_few_assignments(example, minimum=4):
+    """Check if file uses symbol '=' at most `minimum` times, i.e. fewer than `minimum` + 1 times."""
+    lines = example["content"].splitlines()
+    counter = 0
+    for line in lines:
+        counter += line.lower().count("=")  # counts every '=' character, so '==' adds 2
+        if counter > minimum:
+            return {"has_few_assignments": False}
+    return {"has_few_assignments": True}
+
+
+def char_token_ratio(example):
+    """Compute character/token ratio of the file with tokenizer (0.0 when the file yields no tokens)."""
+    input_ids = tokenizer(example["content"], truncation=False)["input_ids"]
+    ratio = len(example["content"]) / len(input_ids) if input_ids else 0.0  # avoid ZeroDivisionError
+    return {"ratio": ratio}
+
+
 def preprocess(example):
     """Chain all preprocessing steps into one function to not fill cache."""
     results = dict()
     results.update(get_hash(example))
     results.update(line_stats(example))
     results.update(alpha_stats(example))
+    results.update(char_token_ratio(example))  # adds "ratio"
     results.update(is_autogenerated(example))
+    results.update(is_config_or_test(example))  # adds "config_or_test"
+    results.update(has_no_keywords(example))  # adds "has_no_keywords"
+    results.update(has_few_assignments(example))  # adds "has_few_assignments"
     return results
 
 
 def filter(example, uniques, args):
-    """Filter dataset with heuristics."""
+    """Filter dataset with heuristics. Config, test and has_no_keywords files are removed with a given probability."""
     if not check_uniques(example, uniques):
         return False
     elif example["autogenerated"]:
@@ -72,6 +131,14 @@ def filter(example, uniques, args):
         return False
     elif example["alpha_frac"] < args.alpha_frac:
         return False
+    elif example["ratio"] < args.min_token_ratio:
+        return False
+    elif example["config_or_test"] and np.random.rand() <= args.filter_proba:
+        return False
+    elif example["has_no_keywords"] and np.random.rand() <= args.filter_proba:
+        return False
+    elif example["has_few_assignments"]:
+        return False
     else:
         return True
 
@@ -89,6 +156,7 @@ def compress_file(file_path):
 args = parser.parse_args()
 if args.num_workers is None:
     args.num_workers = multiprocessing.cpu_count()
+tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)  # module-level: read by char_token_ratio()
 
 # Load dataset
 t_start = time.time()