Skip to content
11 changes: 8 additions & 3 deletions examples/research_projects/codeparrot/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,20 +37,25 @@ Additionally, sure you have git-lfs installed. You can find instructions for how
The source of the dataset is the GitHub dump available on Google's [BigQuery](https://cloud.google.com/blog/topics/public-datasets/github-on-bigquery-analyze-all-the-open-source-code). The database was queried for all Python files with less than 1MB in size resulting in a 180GB dataset with over 20M files. The dataset is available on the Hugging Face Hub [here](https://huggingface.co/datasets/transformersbook/codeparrot).

### Preprocessing
The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374):
The raw dataset contains many duplicates. We deduplicated and filtered the dataset using the heuristics proposed in OpenAI's Codex [paper](https://arxiv.org/abs/2107.03374) and some new ones:

- exact deduplication using each file's hash
- filtering files with max line length > 1000
- filtering files with mean line length > 100
- fraction of alphanumeric characters < 0.25
- containing the word "auto-generated" or similar in the first 5 lines
- filtering with a probability of 0.7 of files with a mention of "test file" or "configuration file" or similar in the first 5 lines
- filtering with a probability of 0.7 of files with high occurence of the keywords "test " or "config"
- filtering with a probability of 0.7 of files without a mention of the keywords `def` , `for`, `while` and `class`
- filtering files that use the assignment operator `=` less than 5 times
- filtering files with ratio between number of characters and number of tokens after tokenization < 1.5 (the average ratio is 3.6)

The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/lvwerra/codeparrot-clean-train) and [validation](https://huggingface.co/datasets/lvwerra/codeparrot-clean-valid) splits are also available on the Hub if you want to skip this step or use the data for another project.
The script to process the full dataset can be found in `scripts/preprocessing.py`. Executing the script on 16 vCPUs takes roughly 3h and removes 70% of the original dataset. The cleaned [train](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-train-v2) and [validation](https://huggingface.co/datasets/loubnabnl/codeparrot-clean-valid-v2) splits are also available on the Hub if you want to skip this step or use the data for another project.

To execute the preprocessing run the following command:
```bash
python scripts/preprocessing.py \
--dataset_name lvwerra/codeparrot \
--dataset_name transformersbook/codeparrot \
--output_dir codeparrot-clean
```
During preprocessing the dataset is downloaded and stored locally as well as caches of the computations. Make sure you have more than 500GB free disk space to execute it.
Expand Down
12 changes: 11 additions & 1 deletion examples/research_projects/codeparrot/scripts/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ class PreprocessingArguments:
},
)
dataset_name: Optional[str] = field(
default="codeparrot", metadata={"help": "Folder or name of dataset to process."}
default="transformersbook/codeparrot", metadata={"help": "Folder or name of dataset to process."}
)
output_dir: Optional[str] = field(
default="codeparrot-clean", metadata={"help": "Folder to save processed processed dataset."}
Expand All @@ -148,6 +148,16 @@ class PreprocessingArguments:
alpha_frac: Optional[float] = field(
default=0.25, metadata={"help": "Maximum fraction of non-alphanumeric characters, otherwise file is filtered."}
)
min_token_ratio: Optional[float] = field(
default=1.5, metadata={"help": "Minimum character token ratio for the file, otherwise file is filtered."}
)
filter_proba: Optional[float] = field(
default=0.7, metadata={"help": "Probability for filtering config, test and uncommon files."}
)
tokenizer: Optional[str] = field(
default="lvwerra/codeparrot",
metadata={"help": "Name or path to the tokenizer."},
)


@dataclass
Expand Down
72 changes: 70 additions & 2 deletions examples/research_projects/codeparrot/scripts/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from datasets import load_dataset

from arguments import PreprocessingArguments
from transformers import HfArgumentParser
from transformers import AutoTokenizer, HfArgumentParser


def get_hash(example):
Expand Down Expand Up @@ -50,18 +50,77 @@ def is_autogenerated(example, scan_width=5):
return {"autogenerated": False}


def is_config_or_test(example, scan_width=5, coeff=0.05):
"""Check if file is a configuration file or a unit test by :
1- looking for keywords in the first few lines of the file.
2- counting number of occurence of the words 'config' and 'test' with respect to number of lines.
"""

keywords = ["unit tests", "test file", "configuration file"]
lines = example["content"].splitlines()
count_config = 0
count_test = 0
# first test
for _, line in zip(range(scan_width), lines):
for keyword in keywords:
if keyword in line.lower():
return {"config_or_test": True}
# second test
nlines = example["content"].count("\n")
threshold = int(coeff * nlines)
for line in lines:
count_config += line.lower().count("config")
count_test += line.lower().count("test")
if count_config > threshold or count_test > threshold:
return {"config_or_test": True}
return {"config_or_test": False}


def has_no_keywords(example):
"""Check if a python file has none of the keywords for: funcion, class, for loop, while loop."""
keywords = ["def ", "class ", "for ", "while "]
lines = example["content"].splitlines()
for line in lines:
for keyword in keywords:
if keyword in line.lower():
return {"has_no_keywords": False}
return {"has_no_keywords": True}


def has_few_assignments(example, minimum=4):
"""Check if file uses symbol '=' less than `minimum` times."""
lines = example["content"].splitlines()
counter = 0
for line in lines:
counter += line.lower().count("=")
if counter > minimum:
return {"has_few_assignments": False}
return {"has_few_assignments": True}


def char_token_ratio(example):
"""Compute character/token ratio of the file with tokenizer."""
input_ids = tokenizer(example["content"], truncation=False)["input_ids"]
ratio = len(example["content"]) / len(input_ids)
return {"ratio": ratio}


def preprocess(example):
"""Chain all preprocessing steps into one function to not fill cache."""
results = dict()
results.update(get_hash(example))
results.update(line_stats(example))
results.update(alpha_stats(example))
results.update(char_token_ratio(example))
results.update(is_autogenerated(example))
results.update(is_config_or_test(example))
results.update(has_no_keywords(example))
results.update(has_few_assignments(example))
return results


def filter(example, uniques, args):
"""Filter dataset with heuristics."""
"""Filter dataset with heuristics. Config, test and has_no_keywords files are removed with a given probability."""
if not check_uniques(example, uniques):
return False
elif example["autogenerated"]:
Expand All @@ -72,6 +131,14 @@ def filter(example, uniques, args):
return False
elif example["alpha_frac"] < args.alpha_frac:
return False
elif example["ratio"] < args.min_token_ratio:
return False
elif example["config_or_test"] and np.random.rand() <= args.filter_proba:
return False
elif example["has_no_keywords"] and np.random.rand() <= args.filter_proba:
return False
elif example["has_few_assignments"]:
return False
else:
return True

Expand All @@ -89,6 +156,7 @@ def compress_file(file_path):
args = parser.parse_args()
if args.num_workers is None:
args.num_workers = multiprocessing.cpu_count()
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_dir)

# Load dataset
t_start = time.time()
Expand Down