From 00a28dfd0d21a05458a075d1c6b8786030e816a2 Mon Sep 17 00:00:00 2001 From: richardyy1188 Date: Sun, 7 Jun 2020 21:42:39 +0900 Subject: [PATCH 1/4] add Tornoto BooksCorpus --- datasets/bookscorpus/bookscorpus.py | 131 ++++++++++++++++++++++++ datasets/bookscorpus/dataset_infos.json | 1 + 2 files changed, 132 insertions(+) create mode 100644 datasets/bookscorpus/bookscorpus.py create mode 100644 datasets/bookscorpus/dataset_infos.json diff --git a/datasets/bookscorpus/bookscorpus.py b/datasets/bookscorpus/bookscorpus.py new file mode 100644 index 00000000000..739c3f7626b --- /dev/null +++ b/datasets/bookscorpus/bookscorpus.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace NLP Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Toronto BooksCorpus dataset.""" + +from __future__ import absolute_import, division, print_function + +import glob +import os +import re + +import requests + +from tqdm import tqdm + +import nlp + +_DESCRIPTION = """\ +Books are a rich source of both fine-grained information, how a character, \ +an object or a scene looks like, as well as high-level semantics, what \ +someone is thinking, feeling and how these states evolve through a story.\ +This work aims to align books to their movie releases in order to provide\ +rich descriptive explanations for visual content that go semantically far\ +beyond the captions available in current datasets. \ +""" + +_CITATION = """\ +@InProceedings{Zhu_2015_ICCV, + title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books}, + author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja}, + booktitle = {The IEEE International Conference on Computer Vision (ICCV)}, + month = {December}, + year = {2015} +} +""" + +_GDRIVE_FILE_ID = "16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z" + +class BookscorpusConfig(nlp.BuilderConfig): + """BuilderConfig for BooksCorpus.""" + + def __init__(self, **kwargs): + """BuilderConfig for BooksCorpus. + Args: + **kwargs: keyword arguments forwarded to super. + """ + super(BookscorpusConfig, self).__init__( + version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs + ) + +class Bookscorpus(nlp.GeneratorBasedBuilder): + """BooksCorpus dataset.""" + + BUILDER_CONFIGS = [BookscorpusConfig(name="plain_text", description="Plain text",)] + + def _info(self): + return nlp.DatasetInfo( + description=_DESCRIPTION, + features=nlp.Features( + {"text": nlp.Value("string"),} + ), + supervised_keys=None, + homepage="https://yknzhu.wixsite.com/mbweb", + citation=_CITATION, + ) + + def _vocab_text_gen(self, archive): + for _, ex in self._generate_examples(archive): + yield ex["text"] + + def _split_generators(self, dl_manager): + downloaded_path_or_paths = dl_manager.download_custom(_GDRIVE_FILE_ID, download_file_from_google_drive) + arch_path = dl_manager.extract(downloaded_path_or_paths) + + return [ + nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"directory": arch_path}), + ] + + def _generate_examples(self, directory): + files = [os.path.join(directory, 'books_large_p1.txt'), + os.path.join(directory, 'books_large_p2.txt'),] + _id = 0 + for txt_file in files: + with open(txt_file, mode="r") as f: + for line in f: + yield _id, {'text': line.strip()} + _id += 1 + +def download_file_from_google_drive(id, destination): + def get_confirm_token(response): + for key, value in response.cookies.items(): + if key.startswith('download_warning'): + return value + + return None + + def save_response_content(response, destination): + CHUNK_SIZE = 32768 + + with open(destination, "wb") as f: + with tqdm(unit='B', unit_scale=True, unit_divisor=1024, leave=False) as bar: + for chunk in response.iter_content(CHUNK_SIZE): + if chunk: # filter out keep-alive new chunks + f.write(chunk) + bar.update(CHUNK_SIZE) + + URL = "https://docs.google.com/uc?export=download" + + session = requests.Session() + + response = session.get(URL, params = { 'id' : id }, stream = True) + token = get_confirm_token(response) + + if token: + params = { 'id' : id, 'confirm' : token } + response = session.get(URL, params = params, stream = True) + + save_response_content(response, destination) diff --git a/datasets/bookscorpus/dataset_infos.json b/datasets/bookscorpus/dataset_infos.json new file mode 100644 index 00000000000..f1a6b6a569d --- /dev/null +++ b/datasets/bookscorpus/dataset_infos.json @@ -0,0 +1 @@ +{"plain_text": {"description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.This work aims to align books to their movie releases in order to providerich descriptive explanations for visual content that go semantically farbeyond the captions available in current datasets. ", "citation": "@InProceedings{Zhu_2015_ICCV,\n title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},\n author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},\n booktitle = {The IEEE International Conference on Computer Vision (ICCV)},\n month = {December},\n year = {2015}\n}\n", "homepage": "https://yknzhu.wixsite.com/mbweb", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "bookscorpus", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4853859824, "num_examples": 74004228, "dataset_name": "bookscorpus"}}, "download_checksums": {"16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z": {"num_bytes": 1179510242, "checksum": "03a29333b2b35e98a375b6ad7d2f1651835655cd348fb89c864136bce69a964c"}}, "download_size": 1179510242, "dataset_size": 4853859824, "size_in_bytes": 6033370066}} \ No newline at end of file From 4bcc7b4c7dfdfd39070c592f07da3f34bc0ecd6a Mon Sep 17 00:00:00 2001 From: richardyy1188 Date: Sun, 7 Jun 2020 22:08:20 +0900 Subject: [PATCH 2/4] add forgot dummy data for Toronto BooksCorpus --- .../dummy/plain_text/1.0.0/dummy_data.zip | Bin 0 -> 864 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 datasets/bookscorpus/dummy/plain_text/1.0.0/dummy_data.zip diff --git a/datasets/bookscorpus/dummy/plain_text/1.0.0/dummy_data.zip b/datasets/bookscorpus/dummy/plain_text/1.0.0/dummy_data.zip new file mode 100644 index 0000000000000000000000000000000000000000..a57e36450c7f3059366f4da18b1aafb7f07f9973 GIT binary patch literal 864 zcmWIWW@h1H00HNX#{<9&D8bDj!;n&%n_C&5l30?c9~#2Rz^wQDPMi}EmsW5yFtYq) zW?*1>2_yo5hKK;QaWEJI4cWO#hGzwk_Xmh&Q4LAT&(AK7&q*vwPmM1y)GMhd0UIj> zG!lf-j7{y|$alzq$K`w7(cIV{d;-kE9%nLNn*~bmE|OXF&(>2i(*J$UdH#ndWp8Ad zxg1)sNvM3i&&s3aBGVJ%RQ2UlXS7eZ{cve-<|BDw{(w7g8!pcX@Zj9EUE1z7r-=BP zFoW)-$^5bZCY`d1d(Edf^Kwfi}0|B?rwV6#DxqAOCDX!l2n+f54P#8YsPc)eU^!if*-Zz4K9~7z}jOKmU zK5vNkEswt0;eK@4?>GfjKjEuem)1n>urQ1rhCo-FMx3PH%TV zwaL+6%i@IZ-i2yR!8y|kRfW!Q96im~KZ$+0p!RCE^6TCcScB7Q)6D60#MT ik%Vl83(zJcH=|eqjWS%mU}Xb&m>CGgfrc0}fp`E-Ar+ng literal 0 HcmV?d00001 From a126256b8af0c84f6a520916a255de453e81aea1 Mon Sep 17 00:00:00 2001 From: richardyy1188 Date: Tue, 9 Jun 2020 09:25:34 +0900 Subject: [PATCH 3/4] Use download manager to download google drive files, thanks for Quentin Lhoest's advice. --- datasets/bookscorpus/bookscorpus.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/datasets/bookscorpus/bookscorpus.py b/datasets/bookscorpus/bookscorpus.py index 739c3f7626b..c7310dcf203 100644 --- a/datasets/bookscorpus/bookscorpus.py +++ b/datasets/bookscorpus/bookscorpus.py @@ -47,7 +47,7 @@ } """ -_GDRIVE_FILE_ID = "16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z" +URL = "https://drive.google.com/uc?export=download&id=16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z" class BookscorpusConfig(nlp.BuilderConfig): """BuilderConfig for BooksCorpus.""" @@ -82,8 +82,7 @@ def _vocab_text_gen(self, archive): yield ex["text"] def _split_generators(self, dl_manager): - downloaded_path_or_paths = dl_manager.download_custom(_GDRIVE_FILE_ID, download_file_from_google_drive) - arch_path = dl_manager.extract(downloaded_path_or_paths) + arch_path = dl_manager.download_and_extract(URL) return [ nlp.SplitGenerator(name=nlp.Split.TRAIN, gen_kwargs={"directory": arch_path}), From 5093963c5d085adb01f4f4cf7445b3066e2ecae6 Mon Sep 17 00:00:00 2001 From: richardyy1188 Date: Fri, 12 Jun 2020 01:11:50 +0900 Subject: [PATCH 4/4] [BookCorpus] rename bookscorpus to bookcorpus and use file from gs --- .../bookcorpus.py} | 53 +++--------------- .../dataset_infos.json | 2 +- .../dummy/plain_text/1.0.0/dummy_data.zip | Bin 3 files changed, 10 insertions(+), 45 deletions(-) rename datasets/{bookscorpus/bookscorpus.py => bookcorpus/bookcorpus.py} (65%) rename datasets/{bookscorpus => bookcorpus}/dataset_infos.json (59%) rename datasets/{bookscorpus => bookcorpus}/dummy/plain_text/1.0.0/dummy_data.zip (100%) diff --git a/datasets/bookscorpus/bookscorpus.py b/datasets/bookcorpus/bookcorpus.py similarity index 65% rename from datasets/bookscorpus/bookscorpus.py rename to datasets/bookcorpus/bookcorpus.py index c7310dcf203..c21927a232e 100644 --- a/datasets/bookscorpus/bookscorpus.py +++ b/datasets/bookcorpus/bookcorpus.py @@ -14,7 +14,7 @@ # limitations under the License. # Lint as: python3 -"""Toronto BooksCorpus dataset.""" +"""The BookCorpus dataset.""" from __future__ import absolute_import, division, print_function @@ -22,10 +22,6 @@ import os import re -import requests - -from tqdm import tqdm - import nlp _DESCRIPTION = """\ @@ -47,24 +43,24 @@ } """ -URL = "https://drive.google.com/uc?export=download&id=16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z" +URL = "https://storage.googleapis.com/huggingface-nlp/datasets/bookcorpus/bookcorpus.tar.bz2" -class BookscorpusConfig(nlp.BuilderConfig): - """BuilderConfig for BooksCorpus.""" +class BookcorpusConfig(nlp.BuilderConfig): + """BuilderConfig for BookCorpus.""" def __init__(self, **kwargs): - """BuilderConfig for BooksCorpus. + """BuilderConfig for BookCorpus. Args: **kwargs: keyword arguments forwarded to super. """ - super(BookscorpusConfig, self).__init__( + super(BookcorpusConfig, self).__init__( version=nlp.Version("1.0.0", "New split API (https://tensorflow.org/datasets/splits)"), **kwargs ) -class Bookscorpus(nlp.GeneratorBasedBuilder): - """BooksCorpus dataset.""" +class Bookcorpus(nlp.GeneratorBasedBuilder): + """BookCorpus dataset.""" - BUILDER_CONFIGS = [BookscorpusConfig(name="plain_text", description="Plain text",)] + BUILDER_CONFIGS = [BookcorpusConfig(name="plain_text", description="Plain text",)] def _info(self): return nlp.DatasetInfo( @@ -97,34 +93,3 @@ def _generate_examples(self, directory): for line in f: yield _id, {'text': line.strip()} _id += 1 - -def download_file_from_google_drive(id, destination): - def get_confirm_token(response): - for key, value in response.cookies.items(): - if key.startswith('download_warning'): - return value - - return None - - def save_response_content(response, destination): - CHUNK_SIZE = 32768 - - with open(destination, "wb") as f: - with tqdm(unit='B', unit_scale=True, unit_divisor=1024, leave=False) as bar: - for chunk in response.iter_content(CHUNK_SIZE): - if chunk: # filter out keep-alive new chunks - f.write(chunk) - bar.update(CHUNK_SIZE) - - URL = "https://docs.google.com/uc?export=download" - - session = requests.Session() - - response = session.get(URL, params = { 'id' : id }, stream = True) - token = get_confirm_token(response) - - if token: - params = { 'id' : id, 'confirm' : token } - response = session.get(URL, params = params, stream = True) - - save_response_content(response, destination) diff --git a/datasets/bookscorpus/dataset_infos.json b/datasets/bookcorpus/dataset_infos.json similarity index 59% rename from datasets/bookscorpus/dataset_infos.json rename to datasets/bookcorpus/dataset_infos.json index f1a6b6a569d..53dfff3042d 100644 --- a/datasets/bookscorpus/dataset_infos.json +++ b/datasets/bookcorpus/dataset_infos.json @@ -1 +1 @@ -{"plain_text": {"description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.This work aims to align books to their movie releases in order to providerich descriptive explanations for visual content that go semantically farbeyond the captions available in current datasets. ", "citation": "@InProceedings{Zhu_2015_ICCV,\n title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},\n author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},\n booktitle = {The IEEE International Conference on Computer Vision (ICCV)},\n month = {December},\n year = {2015}\n}\n", "homepage": "https://yknzhu.wixsite.com/mbweb", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "bookscorpus", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4853859824, "num_examples": 74004228, "dataset_name": "bookscorpus"}}, "download_checksums": {"16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z": {"num_bytes": 1179510242, "checksum": "03a29333b2b35e98a375b6ad7d2f1651835655cd348fb89c864136bce69a964c"}}, "download_size": 1179510242, "dataset_size": 4853859824, "size_in_bytes": 6033370066}} \ No newline at end of file +{"plain_text": {"description": "Books are a rich source of both fine-grained information, how a character, an object or a scene looks like, as well as high-level semantics, what someone is thinking, feeling and how these states evolve through a story.This work aims to align books to their movie releases in order to providerich descriptive explanations for visual content that go semantically farbeyond the captions available in current datasets. ", "citation": "@InProceedings{Zhu_2015_ICCV,\n title = {Aligning Books and Movies: Towards Story-Like Visual Explanations by Watching Movies and Reading Books},\n author = {Zhu, Yukun and Kiros, Ryan and Zemel, Rich and Salakhutdinov, Ruslan and Urtasun, Raquel and Torralba, Antonio and Fidler, Sanja},\n booktitle = {The IEEE International Conference on Computer Vision (ICCV)},\n month = {December},\n year = {2015}\n}\n", "homepage": "https://yknzhu.wixsite.com/mbweb", "license": "", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "bookcorpus", "config_name": "plain_text", "version": {"version_str": "1.0.0", "description": "New split API (https://tensorflow.org/datasets/splits)", "nlp_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4853859824, "num_examples": 74004228, "dataset_name": "bookcorpus"}}, "download_checksums": {"https://storage.googleapis.com/huggingface-nlp/datasets/bookcorpus/bookcorpus.tar.bz2": {"num_bytes": 1179510242, "checksum": "03a29333b2b35e98a375b6ad7d2f1651835655cd348fb89c864136bce69a964c"}}, "download_size": 1179510242, "dataset_size": 4853859824, "size_in_bytes": 6033370066}} \ No newline at end of file diff --git a/datasets/bookscorpus/dummy/plain_text/1.0.0/dummy_data.zip b/datasets/bookcorpus/dummy/plain_text/1.0.0/dummy_data.zip similarity index 100% rename from datasets/bookscorpus/dummy/plain_text/1.0.0/dummy_data.zip rename to datasets/bookcorpus/dummy/plain_text/1.0.0/dummy_data.zip