
Commit 9773efd: fix openwebtext

1 parent 1fb8eb8 · commit 9773efd
5 files changed: 19 additions & 21 deletions

scripts/datasets/pretrain_corpus/README.md
Lines changed: 7 additions & 7 deletions

````diff
@@ -2,24 +2,24 @@
 
 We provide a series of shared scripts for downloading/preparing the text corpus for pretraining NLP models.
 This helps create a unified text corpus for studying the performance of different pretraining algorithms.
-When releasing the datasets, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/),
-i.e., the dataset needs to be findable, accessible, interoperable, and reusable.
+When releasing the datasets, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/),
+i.e., the dataset needs to be findable, accessible, interoperable, and reusable.
 
 ## BookCorpus
 Unfortunately, we are unable to provide the original [Toronto BookCorpus dataset](https://yknzhu.wixsite.com/mbweb) due to licensing issues.
 
 There are some open source efforts for reproducing the dataset, e.g.,
-using [soskek/bookcorpus](https://github.com/soskek/bookcorpus) or directly downloading the [preprocessed version](https://drive.google.com/file/d/16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z/view).
-
+using [soskek/bookcorpus](https://github.com/soskek/bookcorpus) or directly downloading the [preprocessed version](https://drive.google.com/file/d/16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z/view).
+
 Nevertheless, we utilize the [Project Gutenberg](https://www.gutenberg.org/) as an alternative to Toronto BookCorpus.
 
-You can use the following command to download and prepare the Gutenberg dataset.
+You can use the following command to download and prepare the Gutenberg dataset.
 
 ```bash
 python prepare_bookcorpus.py --dataset gutenberg
 ```
 
-Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License) for using the data.
+Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License) for using the data.
 
 ## Wikipedia
 
@@ -43,7 +43,7 @@ You can download the OpenWebText from [link](https://skylion007.github.io/OpenWe
 After downloading and extracting the OpenWebText (i.e., `tar xf openwebtext.tar.xz`), you can use the following command to preprocess the dataset.
 
 ```bash
-python prepare_openwebtext.py --input openwebtext/ --output prepared_owt
+python prepare_openwebtext.py --input openwebtext/ --output prepared_owt --shuffle
 ```
 
 In this step, the archived txt are directly read without decompressing.
````

(Here and elsewhere in this commit, paired `-`/`+` lines that look identical appear to differ only in trailing whitespace, which is invisible here.)
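For context on the new `--shuffle` flag: judging from the `extract_files(full_name, output_dir, shuffle=False)` signature visible in the hunk headers of the next file, the flag plausibly randomizes the order of the text members read from each `.xz` archive before they are written out. A minimal sketch of that assumed behavior (the archive name is hypothetical):

```python
import random
import tarfile

def iter_member_names(archive_path, shuffle=False):
    """Yield the member names of a tar archive, optionally in random order."""
    with tarfile.open(archive_path) as t:  # tarfile reads .xz transparently
        names = t.getnames()
        if shuffle:
            # assumed effect of --shuffle: randomize document order
            random.shuffle(names)
        yield from names

# Hypothetical member archive; the real ones live under openwebtext/
for name in iter_member_names('openwebtext/urlsf_subset00-1_data.xz', shuffle=True):
    print(name)
```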

scripts/datasets/pretrain_corpus/prepare_openwebtext.py
Lines changed: 2 additions & 2 deletions

```diff
@@ -51,7 +51,7 @@ def extract_files(full_name, output_dir, shuffle=False):
     """
     if not full_name.endswith(".xz"):
         return
-    file_prefix = re.split('\.|/',full_name)[1]
+    file_prefix = re.split('\.|/', full_name)[-2]
     with open("{}.txt".format(os.path.join(output_dir, file_prefix)),"w") as fp:
         with tarfile.open(full_name) as t:
             txt_names = t.getnames()
@@ -65,7 +65,7 @@ def extract_files(full_name, output_dir, shuffle=False):
                     if line:
                         fp.write(line.decode()+'\n')
                # Two extra line break to mark the document separation
                fp.write('\n\n')
+                fp.write('\n')
 
 
 @DATA_MAIN_REGISTRY.register('prepare_openwebtext')
```
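Why `[-2]` rather than `[1]`: `re.split('\.|/', full_name)` splits the path on both dots and slashes, so a fixed index from the front only names the archive when the input path has exactly one directory component, while indexing from the end is robust to `./` prefixes, nested directories, and absolute paths. A quick illustration (the file names are hypothetical):

```python
import re

paths = [
    'openwebtext/urlsf_subset00-1_data.xz',        # one directory component
    './openwebtext/urlsf_subset00-1_data.xz',      # './' prefix
    '/data/openwebtext/urlsf_subset00-1_data.xz',  # absolute path
]
for p in paths:
    parts = re.split(r'\.|/', p)
    print(repr(parts[1]), repr(parts[-2]))
# 'urlsf_subset00-1_data' 'urlsf_subset00-1_data'
# '' 'urlsf_subset00-1_data'       <- old index breaks
# 'data' 'urlsf_subset00-1_data'   <- old index breaks
```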

scripts/pretraining/README.md
Lines changed: 1 addition & 1 deletion

````diff
@@ -3,7 +3,7 @@
 Following the instruction of [Prepare OpenWebTextCorpus](../datasets/pretrain_corpus#openwebtext), download and prepare the dataset, obtaining a total of 20610 text files in the folder `prepared_owt`.
 
 ```bash
-python preprocesse_owt.py --input prepared_owt --output preprocessed_owt --shuffle
+python preprocesse_owt.py --input prepared_owt --output preprocessed_owt --max_seq_length 128
 ```
 The above command allows us to generate the preprocessed Numpy features saved in `.npz`.
 # Pretrain Model
````
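The two README edits in this commit are consistent with each other: shuffling now happens once, in the earlier `prepare_openwebtext.py` step, so the `--shuffle` flag is dropped here, and the command instead spells out `--max_seq_length 128`, which matches the argparse default shown in the next file's diff.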

scripts/pretraining/preprocesse_owt.py
Lines changed: 7 additions & 6 deletions

```diff
@@ -8,8 +8,11 @@
 import multiprocessing
 
 from pretraining_utils import get_all_features
+from gluonnlp.base import get_repo_model_zoo_url
+from gluonnlp.utils.misc import download
 from gluonnlp.data.tokenizers import HuggingFaceWordPieceTokenizer
 
+VOCAB_PATH = 'google_electra_small/vocab-e6d2b21d.json'
 
 def get_parser():
     parser = argparse.ArgumentParser(description=__doc__)
@@ -19,9 +22,6 @@ def get_parser():
                         help="directory for preprocessed features")
     parser.add_argument("--num_process", type=int, default=8,
                         help="number of processes for multiprocessing")
-    parser.add_argument("--vocab_file", default="vocab-c3b41053.json",
-                        help="vocabulary file of HuggingFaceWordPieceTokenizer"
-                        " for electra small model")
     parser.add_argument("--max_seq_length", type=int, default=128,
                         help="the maximum length of the pretraining sequence")
     parser.add_argument("--num_out_files", type=int, default=1000,
@@ -40,10 +40,11 @@ def get_parser():
 
 def main(args):
     num_process = min(multiprocessing.cpu_count(), args.num_process)
-    assert os.path.isfile(args.vocab_file), 'Cannot find vocab file'
-    # TODO(zheyuye), download the vocab_file from zoos and check it with sha1 hash.
+    vocab_file = os.path.join(os.getcwd(), 'vocab-e6d2b21d.json')
+    download(get_repo_model_zoo_url() + VOCAB_PATH, vocab_file,
+             sha1_hash='e6d2b21d910ccb356aa18f27a1c7d70660edc058')
     tokenizer = HuggingFaceWordPieceTokenizer(
-        vocab_file=args.vocab_file,
+        vocab_file=vocab_file,
         unk_token='[UNK]',
         pad_token='[PAD]',
         cls_token='[CLS]',
```
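With the `--vocab_file` argument removed, the script resolves the ELECTRA-small vocabulary itself, pinning the download to a SHA-1 hash (this also settles the removed TODO). A minimal sketch of the equivalent standalone fetch, assuming `gluonnlp`'s `download` returns the local path and skips the transfer when a file with a matching hash is already present:

```python
from gluonnlp.base import get_repo_model_zoo_url
from gluonnlp.utils.misc import download

# Same model zoo path and hash as in the diff above; the file lands in
# the current working directory, mirroring what main() now does.
vocab_file = download(
    get_repo_model_zoo_url() + 'google_electra_small/vocab-e6d2b21d.json',
    'vocab-e6d2b21d.json',
    sha1_hash='e6d2b21d910ccb356aa18f27a1c7d70660edc058')
```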

scripts/pretraining/pretraining_utils.py
Lines changed: 2 additions & 5 deletions

```diff
@@ -41,17 +41,14 @@ def tokenize_lines_to_ids(lines, tokenizer):
     """
     results = []
     # tag line delimiters or doc delimiters
-    line_delimiters = False
     for line in lines:
         if not line:
             break
         line = line.strip()
         # Single empty lines are used as line delimiters
         # Double empty lines are used as document delimiters
         if not line:
-            if not line_delimiters:
-                results.append([])
-            line_delimiters = not line_delimiters
+            results.append([])
         else:
             token_ids = tokenizer.encode(line, int)
             if token_ids:
@@ -125,7 +122,7 @@ def process_a_text(text_file, tokenizer, max_seq_length, short_seq_prob=0.05):
     for tokenized_line in tokenized_lines:
         current_sentences.append(tokenized_line)
         current_length += len(tokenized_line)
-        # Create feature when meets the empty line or reaches the target length
+        # Create feature when meets the empty line or reaches the target length
         if (not tokenized_line and current_length != 0) or (current_length >= target_seq_length):
             first_segment, second_segment = \
                 sentenceize(current_sentences, max_seq_length, target_seq_length)
```
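This simplification pairs with the `fp.write('\n')` change in `prepare_openwebtext.py`: every content line is already written with its own trailing newline, so one extra newline produces a single blank line between documents. The reader therefore no longer needs the `line_delimiters` toggle that told single blank lines apart from double ones; every blank line now appends one `[]` sentinel, which `process_a_text` treats as a boundary for cutting features. A toy rendering of the simplified loop (the character-code encoder is a stand-in for the real tokenizer):

```python
def tokenize_lines_to_ids_sketch(lines, encode):
    """Simplified version of the updated tokenize_lines_to_ids."""
    results = []
    for line in lines:
        if not line:          # '' signals end of input
            break
        line = line.strip()
        if not line:          # every blank line is now a document delimiter
            results.append([])
        else:
            ids = encode(line)
            if ids:
                results.append(ids)
    return results

print(tokenize_lines_to_ids_sketch(
    ['First document.\n', '\n', 'Second document.\n'],
    lambda s: [ord(c) for c in s[:3]]))  # stand-in encoder
# [[70, 105, 114], [], [83, 101, 99]]
```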
