From f60d45b76ed9cf52198b88443e593598efe8ff3e Mon Sep 17 00:00:00 2001 From: asolergi-nv Date: Wed, 26 Nov 2025 15:32:25 +0000 Subject: [PATCH 1/3] Add Common Crawl tutorial Signed-off-by: asolergi-nv --- tutorials/data/common-crawl/README.md | 74 ++++++++++ .../data/common-crawl/prepare_commoncrawl.py | 129 ++++++++++++++++++ 2 files changed, 203 insertions(+) create mode 100644 tutorials/data/common-crawl/README.md create mode 100644 tutorials/data/common-crawl/prepare_commoncrawl.py diff --git a/tutorials/data/common-crawl/README.md b/tutorials/data/common-crawl/README.md new file mode 100644 index 000000000..352cd1a22 --- /dev/null +++ b/tutorials/data/common-crawl/README.md @@ -0,0 +1,74 @@ +# Common Crawl Data Preprocessing Tutorial W/ NeMo Curator + +This guide explains how to prepare the Common Crawl dataset for language model pretraining. + +**Common Crawl** is a non-profit organization that maintains an open repository of web crawl data. The dataset consists of petabytes of raw web page data, metadata extracts, and text extracts collected monthly since 2008. Each crawl contains billions of web pages from across the internet, making it one of the largest openly available datasets for training language models. + +Dataset source: [Common Crawl](https://commoncrawl.org/) + +## Dataset overview + +The Common Crawl dataset is organized by **snapshots**, with each snapshot representing a complete web crawl: + +- **Main crawls**: General purpose web crawl covering diverse websites across the internet. Released approximately monthly (format: `CC-MAIN-YYYY-WW`, e.g., `CC-MAIN-2025-30`) +- **News crawls**: Focused crawl of news websites and articles, useful for domain-specific training on journalism and current events. 
Released monthly (format: `YYYY-MM`, e.g., `2025-08`) + +Each `CC-MAIN` snapshot contains multiple WARC (Web ARChive) files, with each file approximately **1 GB compressed**, alongside a `warc.paths.gz` file that lists all WARC files in that snapshot. A typical main crawl snapshot includes around **80,000 WARC files**, totaling approximately 70-100 TB compressed. Each WARC file contains raw HTTP response data (HTML pages, headers, metadata) that requires extraction and filtering for language model training. + +## Requirements + +Install NeMo Curator directly from the GitHub repository with the CPU text processing extension: + +```bash +pip install "nemo-curator[text_cpu] @ git+https://github.com/NVIDIA-NeMo/Curator.git@main" +``` + +For more information about NeMo Curator, visit the [official repository](https://github.com/NVIDIA-NeMo/Curator). +For detailed installation instructions, see the [installation guide](https://docs.nvidia.com/nemo/curator/latest/admin/installation.html). + +## Usage + +To **download, uncompress, filter, and tokenize** Common Crawl data, simply run the `prepare_commoncrawl.py` script. +All stages of the pipeline are handled within a single script thanks to **NeMo Curator**, which provides modular processing stages for downloading, cleaning, and tokenizing large-scale text datasets. + +You will need to specify a few configuration options when running the script: + +* **`--output_dir`** + Path where processed data will be stored. + + * Raw files will be placed in: `output_dir/raw` + * Tokenized files will be placed in: `output_dir/tokens` + +* **`--tokenizer-model`** + The tokenizer model identifier from the Hugging Face Hub used to tokenize the dataset. + +* **`--append-eod`** + When set, appends an End-of-Document (EOD) token to every sample. + +* **`--filter-data`** + Enables NeMo Curator’s filtering stages to remove short documents, documents with excessive URLs, and documents with highly repetitive n-grams. 
+ +* **`--use-aws-to-download`** + Enables downloading Common Crawl snapshots from the Common Crawl S3 bucket using `s5cmd`. + +* **`--start-snapshot`** and **`--end-snapshot`** + Specify the range of snapshots to download, in `YYYY-WW` form (both default to `2025-30`). + +* **`--url-limit`** + Maximum number of WARC files to download (default: `2`). Each file is approximately **1 GB**. + +### Example + +```bash +python3 prepare_commoncrawl.py \ + --output_dir $DATASETS_PATH/CommonCrawl \ + --tokenizer-model nvidia/NVIDIA-Nemotron-Nano-12B-v2 \ + --append-eod \ + --filter-data \ + --url-limit 5 +``` + +--- + +When the script completes, it will automatically generate a **`dataset-prefixes.txt`** file in the output directory. +This file contains the dataset file prefixes required by **Megatron-LM** and **Megatron-Bridge** via the `--data-args-path` configuration. diff --git a/tutorials/data/common-crawl/prepare_commoncrawl.py b/tutorials/data/common-crawl/prepare_commoncrawl.py new file mode 100644 index 000000000..dfe57e2de --- /dev/null +++ b/tutorials/data/common-crawl/prepare_commoncrawl.py @@ -0,0 +1,129 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +import time +from pathlib import Path + +from nemo_curator.core.client import RayClient +from nemo_curator.pipeline import Pipeline +from nemo_curator.stages.text.download import CommonCrawlDownloadExtractStage +from nemo_curator.stages.text.filters import RepeatingTopNGramsFilter, UrlsFilter, WordCountFilter +from nemo_curator.stages.text.io.writer import MegatronTokenizerWriter +from nemo_curator.stages.text.modules import ScoreFilter + + +def main(args: argparse.Namespace) -> None: + # Initialize and start the Ray client + ray_client = RayClient() + ray_client.start() + + raw_dir = os.path.join(args.output_dir, "raw") + tokens_dir = os.path.join(args.output_dir, "tokens") + + os.makedirs(raw_dir, exist_ok=True) + os.makedirs(tokens_dir, exist_ok=True) + + print("Filtering and tokenizing Common Crawl data for Megatron-LM training") + print(f" The raw dataset will be written to '{raw_dir}'") + print(f" The filtered and tokenized dataset will be written to '{tokens_dir}'") + + # Create a pipeline with the stages + pipeline = Pipeline( + name="commoncrawl-filter-and-tokenize", + description="Filter and tokenize Common Crawl data for Megatron-LM training.", + ) + # Download and extract the Common Crawl data + pipeline.add_stage( + CommonCrawlDownloadExtractStage( + start_snapshot=args.start_snapshot, + end_snapshot=args.end_snapshot, + download_dir=raw_dir, + crawl_type="main", + use_aws_to_download=args.use_aws_to_download, + verbose=True, + url_limit=args.url_limit, + ) + ) + + # Filter the data + if args.filter_data: + ## Filter short documents + pipeline.add_stage(ScoreFilter(filter_obj=WordCountFilter(min_words=50))) + ## Filter documents with too many URLs + pipeline.add_stage(ScoreFilter(filter_obj=UrlsFilter(max_url_to_text_ratio=0.1))) + ## Filter documents with too many repeating n-grams + pipeline.add_stage(ScoreFilter(filter_obj=RepeatingTopNGramsFilter())) + + # Tokenize the data + pipeline.add_stage( + 
MegatronTokenizerWriter( + path=tokens_dir, + model_identifier=args.tokenizer_model, + append_eod=args.append_eod, + ) + ) + + print("Starting the filtering and tokenization pipeline") + start_time = time.time() + # Run the pipeline + results = pipeline.run() + end_time = time.time() + execution_time = end_time - start_time + # Report the elapsed time and the data returned by the pipeline + print(f"\n\nFiltering and tokenization pipeline finished (took {execution_time} seconds)") + print(f"The results were written to '{[result.data for result in results]}'") + + # Stop the Ray client + ray_client.stop() + + # Create --data-args-path file + data_args_path = os.path.join(args.output_dir, "dataset-prefixes.txt") + file_prefixes = sorted(str(file)[:-4] for file in Path(tokens_dir).glob("**/*.bin")) + with open(data_args_path, "w", encoding="utf-8") as f: + for file_prefix in file_prefixes: + f.write(file_prefix + "\n") + + print(f"The --data-args-path file was written to '{data_args_path}'") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + group = parser.add_argument_group(title="input data") + group.add_argument("--output_dir", type=str, required=True, help="Path to output directory") + group.add_argument( + "--tokenizer-model", type=str, required=True, help="Hugging Face model identifier for the tokenizer" + ) + group.add_argument("--append-eod", action="store_true", help="Append an <eod> token to the end of each sample.") + group.add_argument( + "--filter-data", + action="store_true", + help="Filter short documents, documents with too many URLs, and documents with too many repeating n-grams using NeMo Curator's filters", + ) + group.add_argument( + "--use-aws-to-download", + action="store_true", + help="Use the s5cmd command to download from the Common Crawl's S3 bucket", + ) + group.add_argument("--start-snapshot", type=str, default="2025-30", help="Start snapshot to download") + group.add_argument("--end-snapshot", type=str, default="2025-30", help="End snapshot to download") + group.add_argument( + 
"--url-limit", + type=int, + default=2, + help="Limit the number of URLs/WARC files to download. Each WARC file is ~1GB", + ) + args = parser.parse_args() + main(args) From 75af8d399200a1898732936a2424249f155754a8 Mon Sep 17 00:00:00 2001 From: asolergi-nv Date: Wed, 26 Nov 2025 16:14:37 +0000 Subject: [PATCH 2/3] Cite tutorial from NeMo Curator Signed-off-by: asolergi-nv --- tutorials/data/common-crawl/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tutorials/data/common-crawl/README.md b/tutorials/data/common-crawl/README.md index 352cd1a22..16dca3fc6 100644 --- a/tutorials/data/common-crawl/README.md +++ b/tutorials/data/common-crawl/README.md @@ -72,3 +72,5 @@ python3 prepare_commoncrawl.py \ When the script completes, it will automatically generate a **`dataset-prefixes.txt`** file in the output directory. This file contains the dataset file prefixes required by **Megatron-LM** and **Megatron-Bridge** via the `--data-args-path` configuration. + +For more details about the new `MegatronTokenizerWriter` stage, refer to the ["megatron-tokenizer" tutorial in NeMo Curator](https://github.com/NVIDIA-NeMo/Curator/tree/main/tutorials/text/megatron-tokenizer). 
From 75c0b764c60d1d7c29e9a235428218d0b79596d9 Mon Sep 17 00:00:00 2001 From: asolergi-nv Date: Wed, 26 Nov 2025 16:20:42 +0000 Subject: [PATCH 3/3] lint Signed-off-by: asolergi-nv --- tutorials/data/common-crawl/prepare_commoncrawl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tutorials/data/common-crawl/prepare_commoncrawl.py b/tutorials/data/common-crawl/prepare_commoncrawl.py index dfe57e2de..6f19ca633 100644 --- a/tutorials/data/common-crawl/prepare_commoncrawl.py +++ b/tutorials/data/common-crawl/prepare_commoncrawl.py @@ -25,7 +25,7 @@ from nemo_curator.stages.text.modules import ScoreFilter -def main(args: argparse.Namespace) -> None: +def main(args: argparse.Namespace) -> None: # noqa: D103 # Initialize and start the Ray client ray_client = RayClient() ray_client.start()