diff --git a/datasets/cats_vs_dogs/README.md b/datasets/cats_vs_dogs/README.md new file mode 100644 index 00000000000..93c4de3bc06 --- /dev/null +++ b/datasets/cats_vs_dogs/README.md @@ -0,0 +1,177 @@ +--- +annotations_creators: +- crowdsourced +language_creators: +- crowdsourced +languages: +- en +licenses: +- unknown +multilinguality: +- monolingual +pretty_name: Cats Vs. Dogs +size_categories: +- 10K List[datasets.SplitGenerator]: + images_path = Path(dl_manager.download_and_extract(_URL)) / "PetImages" + return [ + datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"images_path": images_path}), + ] + + def _generate_examples(self, images_path): + logger.info("generating examples from = %s", images_path) + for i, filepath in enumerate(images_path.glob("**/*.jpg")): + with filepath.open("rb") as f: + if b"JFIF" in f.peek(10): + yield str(i), { + "image_file_path": str(filepath), + "labels": filepath.parent.name.lower(), + } + continue diff --git a/datasets/cats_vs_dogs/dataset_infos.json b/datasets/cats_vs_dogs/dataset_infos.json new file mode 100644 index 00000000000..4d80d7f6881 --- /dev/null +++ b/datasets/cats_vs_dogs/dataset_infos.json @@ -0,0 +1 @@ +{"default": {"description": "A large set of images of cats and dogs. There are 1738 corrupted images that are dropped.", "citation": "@Inproceedings (Conference){asirra-a-captcha-that-exploits-interest-aligned-manual-image-categorization,\n author = {Elson, Jeremy and Douceur, John (JD) and Howell, Jon and Saul, Jared},\n title = {Asirra: A CAPTCHA that Exploits Interest-Aligned Manual Image Categorization},\n booktitle = {Proceedings of 14th ACM Conference on Computer and Communications Security (CCS)},\n year = {2007},\n month = {October},\n publisher = {Association for Computing Machinery, Inc.},\n url = {https://www.microsoft.com/en-us/research/publication/asirra-a-captcha-that-exploits-interest-aligned-manual-image-categorization/},\n edition = {Proceedings of 14th ACM Conference on Computer and Communications Security (CCS)},\n}\n", "homepage": "https://www.microsoft.com/en-us/download/details.aspx?id=54765", "license": "", "features": {"image_file_path": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"num_classes": 2, "names": ["cat", "dog"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": {"input": "image_file_path", "output": "labels"}, "task_templates": [{"task": "image-classification", "image_file_path_column": "image_file_path", "label_column": "labels", "labels": ["cat", "dog"]}], "builder_name": "cats_vs_dogs", "config_name": "default", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 3701417, "num_examples": 23410, "dataset_name": "cats_vs_dogs"}}, "download_checksums": {"https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_3367a.zip": {"num_bytes": 824894548, "checksum": "f9553e426bd725354ed3a27e3c6920caadb55c835d1ebd880d2e56d3f1fbb22b"}}, "download_size": 824894548, "post_processing_size": null, "dataset_size": 3701417, "size_in_bytes": 828595965}} \ No newline at end of file diff --git a/datasets/cats_vs_dogs/dummy/0.0.0/dummy_data.zip b/datasets/cats_vs_dogs/dummy/0.0.0/dummy_data.zip new file mode 100644 index 00000000000..8c038efb6a2 Binary files /dev/null and b/datasets/cats_vs_dogs/dummy/0.0.0/dummy_data.zip differ