From 7e5c02c7c99cf11b2d3b6b33ebf07c9960d83c47 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Fri, 13 Aug 2021 12:35:28 +0300
Subject: [PATCH 01/23] add cedr dataset

---
 datasets/cedr/README.md                       | 181 +++++++++++++++++
 datasets/cedr/cedr.py                         | 184 ++++++++++++++++++
 datasets/cedr/dataset_infos.json              | 146 ++++++++++++++
 .../cedr/dummy/enriched/0.1.1/dummy_data.zip  | Bin 0 -> 3940 bytes
 datasets/cedr/dummy/main/0.1.1/dummy_data.zip | Bin 0 -> 2386 bytes
 5 files changed, 511 insertions(+)
 create mode 100644 datasets/cedr/README.md
 create mode 100644 datasets/cedr/cedr.py
 create mode 100644 datasets/cedr/dataset_infos.json
 create mode 100644 datasets/cedr/dummy/enriched/0.1.1/dummy_data.zip
 create mode 100644 datasets/cedr/dummy/main/0.1.1/dummy_data.zip

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
new file mode 100644
index 00000000000..6399d7844b7
--- /dev/null
+++ b/datasets/cedr/README.md
@@ -0,0 +1,181 @@
+---
+YAML tags:
+annotations_creators:
+- crowdsourced
+language_creators:
+- found
+languages:
+- ru-RU
+licenses:
+- apache-2.0
+multilinguality:
+- monolingual
+pretty_name: CEDR
+size_categories:
+- unknown
+source_datasets:
+- original
+task_categories:
+- text-classification
+task_ids:
+- multi-class-classification
+- emotion-classification
+- sentiment-classification
+---
+
+# Dataset Card for [Dataset Name]
+
+## Table of Contents
+- [Table of Contents](#table-of-contents)
+- [Dataset Description](#dataset-description)
+  - [Dataset Summary](#dataset-summary)
+  - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
+  - [Languages](#languages)
+- [Dataset Structure](#dataset-structure)
+  - [Data Instances](#data-instances)
+  - [Data Fields](#data-fields)
+  - [Data Splits](#data-splits)
+- [Dataset Creation](#dataset-creation)
+  - [Curation Rationale](#curation-rationale)
+  - [Source Data](#source-data)
+  - [Annotations](#annotations)
+  - [Personal and Sensitive Information](#personal-and-sensitive-information)
+- [Considerations for Using the Data](#considerations-for-using-the-data)
+  - [Social Impact of Dataset](#social-impact-of-dataset)
+  - [Discussion of Biases](#discussion-of-biases)
+  - [Other Known Limitations](#other-known-limitations)
+- [Additional Information](#additional-information)
+  - [Dataset Curators](#dataset-curators)
+  - [Licensing Information](#licensing-information)
+  - [Citation Information](#citation-information)
+  - [Contributions](#contributions)
+
+## Dataset Description
+
+- **Homepage:** https://github.com/sag111/CEDR
+- **Repository:** https://github.com/sag111/CEDR
+- **Paper:** https://www.sciencedirect.com/science/article/pii/S1877050921013247
+- **Leaderboard:**
+- **Point of Contact:** sag111@mail.ru
+
+### Dataset Summary
+
+The Corpus for Emotions Detecting in Russian-language text sentences of different social sources (CEDR) contains 9410 comments labeled for 5 emotion categories (joy, sadness, surprise, fear, and anger).
+
+Here are 2 dataset configurations:
+- "main" - contains "text", "labels", and "source" features;
+- "enriched" - includes all "main" features and "sentences".
+
+Dataset with predefined train/test splits.
+
+### Supported Tasks and Leaderboards
+
+This dataset is intended for multi-class, multi-label emotion classification.
+
+### Languages
+
+The data is in Russian.
+
+## Dataset Structure
+
+### Data Instances
+
+Each instance is a text sentence in Russian from several sources with one or more emotion annotations.
+
+### Data Fields
+
+The main configuration includes:
+- text: the text of the sentence;
+- labels: the emotion annotations;
+- source: the tag name of the corresponding source
+
+In addition to the above, the "enriched" configuration includes:
+- sentences: text tokenized and lemmatized with [udpipe](https://ufal.mff.cuni.cz/udpipe)
+  - 'forma': the original word form;
+  - 'lemma': the lemma of this word
+
+### Data Splits
+
+The dataset includes predefined train/test splits
+with 7528 and 1882 examples, respectively.
+
+## Dataset Creation
+
+### Curation Rationale
+
+The formed dataset of examples consists of sentences in Russian from several sources (blogs, microblogs, news), which allows creating methods to analyse various types of texts. The created methodology for building the dataset based on applying a crowdsourcing service can be used to expand the number of examples to improve the accuracy of supervised classifiers.
+
+### Source Data
+
+#### Initial Data Collection and Normalization
+
+Data was collected from several sources: posts of the Live Journal social network, texts of the online news agency Lenta.ru, and Twitter microblog posts.
+
+Only those sentences were selected that contained marker words from the dictionary of [the emotive vocabulary of the Russian language](http://lexrus.ru/default.aspx?p=2876). We manually formed a list of marker words for each emotion by choosing words from different categories of the dictionary.
+
+In total, 3098 sentences were selected from LiveJournal posts, 2847 sentences from Lenta.Ru, and 3465 sentences from Twitter. After selection, sentences were offered to annotators for labeling.
+
+#### Who are the source language producers?
+
+Russian-speaking LiveJournal and Twitter users, and authors of news articles on the site lenta.ru.
+
+### Annotations
+
+#### Annotation process
+
+Annotating sentences with labels of their emotions was performed with the help of [a crowdsourcing platform](https://yandex.ru/support/toloka/index.html?lang=en).
+
+The annotators’ task was: “What emotions did the author express in the sentence?”. The annotators were allowed to put an arbitrary number of the following emotion labels: "joy", "sadness", "anger", "fear", and "surprise"
+
+Sentences were split into tasks and assigned to annotators so that each sentence was annotated at least three times.
+
+#### Who are the annotators?
+
+Only those of the 30% of the best-performing active users (by the platform’s internal rating) who spoke Russian and were over 18 years old were allowed into the annotation process.
+
+### Personal and Sensitive Information
+
+The text of the sentences may contain profanity.
+
+## Considerations for Using the Data
+
+### Social Impact of Dataset
+
+[More Information Needed]
+
+### Discussion of Biases
+
+[More Information Needed]
+
+### Other Known Limitations
+
+[More Information Needed]
+
+## Additional Information
+
+### Dataset Curators
+
+Researchers at AI technology lab at NRC "Kurchatov Institute". See the author [list](https://www.sciencedirect.com/science/article/pii/S1877050921013247).
+
+### Licensing Information
+
+The GitHub repository which houses this dataset has an Apache License 2.0.
+
+### Citation Information
+
+If you have found our results helpful in your work, feel free to cite our publication and this repository as:
+```
+@article{sboev2021data,
+  title={Data-Driven Model for Emotion Detection in Russian Texts},
+  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},
+  journal={Procedia Computer Science},
+  volume={190},
+  pages={637--642},
+  year={2021},
+  publisher={Elsevier}
+}
+```
+
+### Contributions
+
+Thanks to [@naumov-al](https://github.com/naumov-al) for adding this dataset.
\ No newline at end of file
diff --git a/datasets/cedr/cedr.py b/datasets/cedr/cedr.py
new file mode 100644
index 00000000000..07c60205b31
--- /dev/null
+++ b/datasets/cedr/cedr.py
@@ -0,0 +1,184 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Lint as: python3
+"""CEDR dataset"""
+
+import json
+import os
+
+import datasets
+
+
+# TODO: Add BibTeX citation
+# Find for instance the citation on arxiv or on the dataset repo/website
+_CITATION = """\
+@article{sboev2021data,
+  title={Data-Driven Model for Emotion Detection in Russian Texts},
+  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},
+  journal={Procedia Computer Science},
+  volume={190},
+  pages={637--642},
+  year={2021},
+  publisher={Elsevier}
+}
+"""
+
+_LICENSE = """http://www.apache.org/licenses/LICENSE-2.0"""
+
+# TODO: Add description of the dataset here
+# You can copy an official description
+_DESCRIPTION = """\
+This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in
+Russian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion
+categories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news
+agency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant
+is include tokenization and lemmatization. Dataset with predefined train/test splits.
+"""
+
+# TODO: Add a link to an official homepage for the dataset here
+_HOMEPAGE = "https://github.com/sag111/CEDR"
+
+# TODO: Add link to the official dataset URLs here
+# The HuggingFace dataset library don't host the datasets but only point to the original files
+# This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
+_URLs = {
+    "main": "https://drive.google.com/u/0/uc?id=14W03yKPvsTW6_pHkSw7vWi7T80wLAkJL&export=download",
+    "enriched": "https://drive.google.com/u/0/uc?id=1PsSr0PkIhXNQxXBjfPJ9RoPklH0zDnRv&export=download",
+}
+
+
+# TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
+class cedr(datasets.GeneratorBasedBuilder):
+    """This dataset is designed to solve emotion recognition task for text data in Russian."""
+
+    VERSION = datasets.Version("0.1.1")
+
+    # This is an example of a dataset with multiple configurations.
+    # If you don't want/need to define several sub-sets in your dataset,
+    # just remove the BUILDER_CONFIG_CLASS and the BUILDER_CONFIGS attributes.
+
+    # If you need to make complex sub-parts in the datasets with configurable options
+    # You can create your own builder configuration class to store attribute, inheriting from datasets.BuilderConfig
+    # BUILDER_CONFIG_CLASS = MyBuilderConfig
+
+    # You will be able to load one or the other configurations in the following list with
+    # data = datasets.load_dataset('my_dataset', 'first_domain')
+    # data = datasets.load_dataset('my_dataset', 'second_domain')
+    BUILDER_CONFIGS = [
+        datasets.BuilderConfig(
+            name="main", version=VERSION, description="This part of CEDR dataset covers a main version"
+        ),
+        datasets.BuilderConfig(
+            name="enriched", version=VERSION, description="This part of CEDR dataset covers a enriched version"
+        ),
+    ]
+
+    DEFAULT_CONFIG_NAME = "main"  # It's not mandatory to have a default configuration. Just use one if it make sense.
+
+    def _info(self):
+        # TODO: This method specifies the datasets.DatasetInfo object which contains informations and typings for the dataset
+        if self.config.name == "main":  # This is the name of the configuration selected in BUILDER_CONFIGS above
+            features = datasets.Features(
+                {
+                    "text": datasets.Value("string"),
+                    "labels": datasets.features.Sequence(datasets.Value("string")),
+                    "source": datasets.Value("string"),
+                    # These are the features of your dataset like images, labels ...
+                }
+            )
+        else:  # This is an example to show how to have different features for "first_domain" and "second_domain"
+            features = datasets.Features(
+                {
+                    "text": datasets.Value("string"),
+                    "labels": datasets.features.Sequence(datasets.Value("string")),
+                    "source": datasets.Value("string"),
+                    "sentences": [
+                        [
+                            {
+                                "forma": datasets.Value("string"),
+                                "lemma": datasets.Value("string"),
+                            }
+                        ]
+                    ]
+                    # These are the features of your dataset like images, labels ...
+                }
+            )
+        return datasets.DatasetInfo(
+            # This is the description that will appear on the datasets page.
+            description=_DESCRIPTION,
+            # This defines the different columns of the dataset and their types
+            features=features,  # Here we define them above because they are different between the two configurations
+            # If there's a common (input, target) tuple from the features,
+            # specify them here. They'll be used if as_supervised=True in
+            # builder.as_dataset.
+            supervised_keys=None,
+            # Homepage of the dataset for documentation
+            homepage=_HOMEPAGE,
+            # License for the dataset if available
+            license=_LICENSE,
+            # Citation for the dataset
+            citation=_CITATION,
+        )
+
+    def _split_generators(self, dl_manager):
+        """Returns SplitGenerators."""
+        # TODO: This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
+        # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name
+
+        # dl_manager is a datasets.download.DownloadManager that can be used to download and extract URLs
+        # It can accept any type or nested list/dict and will give back the same structure with the url replaced with path to local files.
+        # By default the archives will be extracted and a path to a cached folder where they are extracted is returned instead of the archive
+        my_urls = _URLs[self.config.name]
+        data_dir = dl_manager.download_and_extract(my_urls)
+        return [
+            datasets.SplitGenerator(
+                name=datasets.Split.TRAIN,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, self.config.name, "train.jsonl"),
+                    "split": "train",
+                },
+            ),
+            datasets.SplitGenerator(
+                name=datasets.Split.TEST,
+                # These kwargs will be passed to _generate_examples
+                gen_kwargs={"filepath": os.path.join(data_dir, self.config.name, "test.jsonl"), "split": "test"},
+            ),
+        ]
+
+    def _generate_examples(
+        self, filepath, split  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
+    ):
+        """Yields examples as (key, example) tuples."""
+        # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
+        # The `key` is here for legacy reason (tfds) and is not important in itself.
+
+        with open(filepath, encoding="utf-8") as f:
+            for id_, row in enumerate(f):
+                data = json.loads(row)
+                if self.config.name == "main":
+                    yield id_, {
+                        "text": data["text"],
+                        "source": data["source"],
+                        "labels": data["labels"],
+                    }
+                else:
+                    yield id_, {
+                        "text": data["text"],
+                        "source": data["source"],
+                        "sentences": data["sentences"],
+                        "labels": data["labels"],
+                    }
diff --git a/datasets/cedr/dataset_infos.json b/datasets/cedr/dataset_infos.json
new file mode 100644
index 00000000000..126f4438dc7
--- /dev/null
+++ b/datasets/cedr/dataset_infos.json
@@ -0,0 +1,146 @@
+{
+  "main": {
+    "description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n",
+    "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n",
+    "homepage": "https://github.com/sag111/CEDR",
+    "license": "http://www.apache.org/licenses/LICENSE-2.0",
+    "features": {
+      "text": {
+        "dtype": "string",
+        "id": null,
+        "_type": "Value"
+      },
+      "labels": {
+        "feature": {
+          "dtype": "string",
+          "id": null,
+          "_type": "Value"
+        },
+        "length": -1,
+        "id": null,
+        "_type": "Sequence"
+      },
+      "source": {
+        "dtype": "string",
+        "id": null,
+        "_type": "Value"
+      }
+    },
+    "post_processed": null,
+    "supervised_keys": null,
+    "task_templates": null,
+    "builder_name": "cedr",
+    "config_name": "main",
+    "version": {
+      "version_str": "0.1.1",
+      "description": null,
+      "major": 0,
+      "minor": 1,
+      "patch": 1
+    },
+    "splits": {
+      "train": {
+        "name": "train",
+        "num_bytes": 1423761,
+        "num_examples": 7528,
+        "dataset_name": "cedr"
+      },
+      "test": {
+        "name": "test",
+        "num_bytes": 351850,
+        "num_examples": 1882,
+        "dataset_name": "cedr"
+      }
+    },
+    "download_checksums": {
+      "https://drive.google.com/u/0/uc?id=14W03yKPvsTW6_pHkSw7vWi7T80wLAkJL&export=download": {
+        "num_bytes": 693022,
+        "checksum": "27c23ac9c2ea836daf3f87f128130643b3a8b1ed5dae23794acaa9cdb374bf09"
+      }
+    },
+    "download_size": 693022,
+    "post_processing_size": null,
+    "dataset_size": 1775611,
+    "size_in_bytes": 2468633
+  },
+  "enriched": {
+    "description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n",
+    "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n",
+    "homepage": "https://github.com/sag111/CEDR",
+    "license": "http://www.apache.org/licenses/LICENSE-2.0",
+    "features": {
+      "text": {
+        "dtype": "string",
+        "id": null,
+        "_type": "Value"
+      },
+      "labels": {
+        "feature": {
+          "dtype": "string",
+          "id": null,
+          "_type": "Value"
+        },
+        "length": -1,
+        "id": null,
+        "_type": "Sequence"
+      },
+      "source": {
+        "dtype": "string",
+        "id": null,
+        "_type": "Value"
+      },
+      "sentences": [
+        [
+          {
+            "forma": {
+              "dtype": "string",
+              "id": null,
+              "_type": "Value"
+            },
+            "lemma": {
+              "dtype": "string",
+              "id": null,
+              "_type": "Value"
+            }
+          }
+        ]
+      ]
+    },
+    "post_processed": null,
+    "supervised_keys": null,
+    "task_templates": null,
+    "builder_name": "cedr",
+    "config_name": "enriched",
+    "version": {
+      "version_str": "0.1.1",
+      "description": null,
+      "major": 0,
+      "minor": 1,
+      "patch": 1
+    },
+    "splits": {
+      "train": {
+        "name": "train",
+        "num_bytes": 4797772,
+        "num_examples": 7528,
+        "dataset_name": "cedr"
+      },
+      "test": {
+        "name": "test",
+        "num_bytes": 1183918,
+        "num_examples": 1882,
+        "dataset_name": "cedr"
+      }
+    },
+    "download_checksums": {
+      "https://drive.google.com/u/0/uc?id=1PsSr0PkIhXNQxXBjfPJ9RoPklH0zDnRv&export=download": {
+        "num_bytes": 1822528,
+        "checksum": "ba194fe6446c639ddf70d2c776d41c50b7a2b6f2547f3cdb300bc5cc7ab89c89"
+      }
+    },
+    "download_size": 1822528,
+    "post_processing_size": null,
+    "dataset_size": 5981690,
+    "size_in_bytes": 7804218
+  }
+}
\ No newline at end of file
diff --git a/datasets/cedr/dummy/enriched/0.1.1/dummy_data.zip b/datasets/cedr/dummy/enriched/0.1.1/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..5255ad2f629635ed9fefcf440d218f26dacf138e
GIT binary patch
literal 3940
zcmcJScTiLL7RN&%Are9p0g<jiC{h+gs`MJeh9bQLAxKLCgf1XOim-GHMUW~@I${v%
zNN1%;FCxVN3J4-bLE-7{+qb(jyYHX(_S~8KJ9p-s`F`i#nfsj&3PDZ73iw_}jxdv7
znt$J*-%clAEH=>I$-&1#67{!(h(8bdItt3@xH<{SXi1|8Cf-sgcYPNNL!$r-4L4_$
zfr2p}<?f*;6{LkT_7nU;4in&s_xAZV?1cBndEgzK{)>?5|5pfu^LBM~!ThcW`Q;rz
z^zRJ<!2ke<BmiLl4=MV1JGkN`+z5D_$D|b!zbePnN-<|r><MF;N#z=9U8{%JP8X7}
z3`FrpOQ*|G+xZtGzEI`_8bgb+mSd*jbMYSf!QIQApPZv7to=5Q6`fSGx2C59uZbH@
zS8!GDzi8Zwc9&LmjXrUg`9hyC7}WH*dlJa`shtNMIqFTloeP0lGfcl(V7yXpMaXz9
zbn~U`;{I{jdTHP4;_++WX+PnrJk`9HJ5Ot3cHHM-uq&;YuR}XSLRgfh@M4BcWn5(I
zy7TxF#WH$xFL#CJYvJ(ogV@Ni<DJ;dr(0k5_P)kkenSBL<d}A@_7V5U$1g9RN9Lxz
z=LsTV6{#;HoKtr^9#{0NvrRQ**@fj|RjcZZVHO3^i9yjEUYCdc2MIq`Pb-w+*YR2B
zc6&5jpMuH}59jC<b$xG_e4UUt6rR5!Gvrk}Rk6D?H(EX=_4L)20%Jwz2LiNg_Stg!
zH7QDq$qH|)gOF7i-?|&Q6gi)j#QSB~do3NAX%kEFW+|xIT<O=}x3Q{VucmGbSOrZ=
z<-K?vJo|oSt~j3QL#$Oz%R6{xWAdnbBNH&H*cN!%liRVw#I2qGI_^Sl>THZVQ9@vH
z-K5D7Iwy^mmem;(A{o$M-IDBRjia~$$aOLTXyS8KqvD#+HFVI*4Ak&$bq1fs##1uO
zTfn|8XI_g<rs%|~WdT#axgkXDNM0Ky;25AbAN)nliVOcDtvl76&Tm8ILR`}uL~SNv
z6a=s-QWKr=s;QS``WZMNeX*r6_rqvYxNx=^PGnxTY|ijZ72W*;4Np(tMefs2XV}Gj
z)OMfDxq3SC6P)>Cv>0tm+*WX-L^ONq<DU-K#3iee(bi&Y+>Y_h%L%W}T4#0v4=0^_
zu7yQ{??T<Cjq046SaOs3k}1}o?7j0Y%TSZt(<4F?jG8aK$nud{fRi73)sn->W3$J;
zl)fF;CCp}H(7>H?`pMcvCi!}ElQ;6csBsr6`jftiEC>rkQBBKABqdz{s$6mx{3wN}
z{AQxGIF1Lfy-I{M-4!QxR76Pjzv`dmJe8f~9n8NWwkto4aASr0nRXqpD=A*J6*St@
zv=(GWoAGF|N^2Ynv$=)KlWpy3n8dG~M19svDg-8#+-b`9f%a(P70v@q$xChx@SujN
z!+Gefv)QQ%0f=$9x|hv`2ZAl{1ZUW1NzsA#iHs*lfoA)n*kdZG`i{FTa6$Y&`w^YR
zY}r2h@#tHtbEiYN`!DKA1bRh#IUI_xzAKLqnp0*<VKjDV4r?M0Vuoo30-u5H@U``P
zeqJD<8X5E2p!J`Oh|JfB2wy4jG{w?P?<l38H#=$d3Tm$#;AZu#E3cC+=$5P9VOA?~
z{x{_s^f7y%U%%5g^7^#D>AB(8IVgbZ%9PaZqQMI_JP%24Gqb$k1i7K!wij!JDMzr+
z=nN~J<{Go7Dtoo&p-Uk@%Mq>u>RDWvd#pem!)RS-mi?IjXNTM!W?X^W7;-@#X*V)g
z?j0AaR(!gt8GO$aEzd$dEZOc)Uf&^=eoZl)5#q$jqmeRTyuTsB8`3x2zX9X(a^%vn
zUJkSiU$Wxc;!*008X?8^>x<oQIfXScjcTIHfUPtME>dV~ADC9-9Kd9zIvf4ioj6{_
z-EB5uwoHy9#aW4(D5N46FK+zQ>C*XRUxa-rPCv;S(tZ9W)=P}IzMFYmNx1sM416z)
z!~!mPpj-9==);ouqdLNPmQ9|u%&ho)qgc+fzAA4)OZqPOCRcd`L~-#2+xe7a=Ov!Q
z%QhM|tBamv9R|sF<dYuwrsPBf3<?!+r9<1GXN?g`EfF~(I|n1A!hW+O3j`>}K<nVP
zi5u%DIn7bry~21V=$9bou+ZMO5wU)8N^NG1Cg&4Na>2<X5f{0W!ycr9u@406H`NXG
zolHOjA{f)HHO<7ZI!nZY0ZU=SEirwdO<repv&Q6_`plHsy<`8R#$qQO6<Pj-iHl#<
zj*GBsVzr21Sy9rsK`?Ae{($<Jn4It02czdw${2WJH$3gf&ny~WF;LV?<V3p>HkHzE
z0e(x?U*jigN`DOq0swGA0D#>;6+ak)&#&pzYBi6Wm!c0@GUo^tL$kKQk@fbQhiE0a
z+5+}`WF3|ENyr`T5nZ8j1m~6@;{2ZI`{crpDZDYNeP3=8PXZ*7zVf%^H08dU<f$Sr
zhtzC!eL%Qo%x@++A6_xcd;P&RaI0i8MIL#Q$?4Y!?Jgk=ndy*fFIn@=<j758Fk3IF
zr7rj?CA+I0!IgrBLdO<AKp78P)z$qR+)rjAxNZIJv0EO`_;s(&50p|EOWR-Cy|h1C
z-dgUnU!No`f0kFKbU#`?KF@1!mp6Anb=4!f6jPAJekI;RyJ)`e$%Zwk&4_jwqm_K=
z+!%wpy+a<4&QErW%^9D-D@g>(Y0tCkyS83gp)5XLEB6q786bPQfsR%&>|xU0-Q)n9
zRE?`k_j*+8k?Gwz?^^gq<;$OokY2qw$XZaG$M28C#Ejs1Sw6DB6W~MH*=K0d0PSlp
zIc;I-3?65D-!p_{UX0P9oe39ZnCB++@l4EkO>DENm)zfWLDA;(6RlCr;5wM6c!q3F
zCl9^9PxJ;~J@Z3`_Y&cmiAA?7HLGow$BE@&tZauPGzjARSbJ9d977zlDO&&J=WJSa
zQ^vt9wJg}D0uX*iQ`#TB34T7?ToM7IVWq`MGl+FhbXO7BY|2yVI4(&-&Pq6OmuAR4
zX`zcLV{%VGAtyMwGF}@9PQ{A6DmLJVppy{M;_LTJjwam3@FogJrg`7j7YAM%S68}~
z;6oD9O`$GOE?mGM-y-(R>cpcLg9|%w=^qdPLN$`6PMB81SN$y*1RSWp9np0$UVzv<
z678Rtz+>(<J1&3}hB#gIp4`82HzSu3DS+pju5Pe-;|q%2t0MY_Hg}nYI~oY*JH+Tz
z&NELN!p|UtpXT4H={%ZB<XH!)JaRFd!Yy-^Cn-ra8fekXMCQ&dfCLseZEjFhggZU2
zYdZ7gclajeOe+ima~<v8E?lV}9K*D+g!L9*Xa_HL3Td%3uuyX*Tyf{W6Yaaw#B)H&
zD}CMoBjsyo1yG;ZW3EY70#$mS%w*FD7*g@(s!dBPcubO&!+Y4#HhQev!7e5fW!i%k
zvCHwSQ9+I@ddMo3z0`q>La2l>3q72(N^UcQ`Cz=RuzW$L@)_jRqYYS)&WMhp1~yh!
zl2#&npo!d`s?)G<3y`EiV&TN+8{>Pk;0uA0ZQvnl5I=y+Pe9(XSjAt;<A+S$GYt@3
z6~p#$3L=O5`fj>~a`|n*EY<rmJE#=Xsr{T-|K2n4H6gt#{Zcg%!_MX|r`s3bA1Q0!
zklPAmSN}|MTvDhNO^}3z(PAS`73jsCz0jt!_hX_i3|(XyV633hao5!FhEqu8%bnGF
zUDuleccOrIb7EV9wHtODubTIJR7r&<lppeN02sGbQ2hJaI3UogeC*klO&5WLNRe=6
z>*vsxUC@`=dFb$PN-iY4as7TC!-{m8og|dp9xGSu$LkbC4{5PnE%SiC+D-tnc29hM
z$2^sBb64Q?L_&yd0p-Ww^Tj*Rp~I4n-mdGE{+>~n$Ed-ww8-uX7xL<88yUB$e0&_J
z%uv(0Vo5-y@r!&cS<i|5m|3YZKNM54Sc^8%;?xaG;A9&tI)&vn5>3_*rSBm+KQw$b
z*yP2rki$~izZR`;FdwKM!czp8=IJ0~8lOR!`Bk&Ea)@b0R<1iMhK0sWvG-Ch!HNkq
z!PSs1^k;Syf(pn={r3p;?HBy5m;hhZzn^~sqQ77KjzquK_eGI!?H?BZcR>1k;qQp_
zYke0^{nq{;h5rCdzd878Wctm+qi`Ckf5yQd0P6P-_}2mYyO~JZf5^;#;uM7deIrKz
Q0QBub0s#Oj*!QP@0n%ugX8-^I

literal 0
HcmV?d00001

diff --git a/datasets/cedr/dummy/main/0.1.1/dummy_data.zip b/datasets/cedr/dummy/main/0.1.1/dummy_data.zip
new file mode 100644
index 0000000000000000000000000000000000000000..11b721df8d4a10932e05a8d96486c5aeb759f8f6
GIT binary patch
literal 2386
zcmWIWW@Zs#0D)i0yun}wl;8%^DW$o&mGLQwC5ie0I2C%MDlAP_HFnEPQ8jikGzm8_
zuJjHlD-H=ai!boV4lXw@3(qtUu`nq2am@DeQ8hA4ttiMZDgi1^$uG~#$xlqdV^IKg
zEy_*I%!Av;0rr1u|MaFkObiSsc^DX?X<%DP5ztP(tm6E<oVC+@`!752w7vh+73deO
z+5O4z=8~8l3M%V5qzg_Q7umHc;>t3nEsyqJmi51y{)lCl&~#5D$=mDSUjJsJw)nTD
z+IpW)FK66Ts*lQjdHqeM?%a6p{g;1!-};fienrloTfakDtoE$mJ0m=5n}vKxN5s?c
z>WznGZ_l3YnKdc)_RhNxTRuO2e);;thdbWi6@GQM_S5g9$!}a3S#q-;v?)mH{Xgni
zxMNRqq2*&9JFTZZyDf66-*w+`Yc+IlIpUY^vf5EVO5&DA=jJ=TUOtr@Mb@M>=ucc9
zRD3b7yLB`3mN2HZZH0$sTJD<3rk)@)`>xi~%|fAjym^?n_#JJ%$<2}_@XLoqYO2Mu
zGfLlYWgO}64OBd}P$S^%p4{s6dwYLchu+=3@oJ84sV<L!?9sk6>jaj~iRjMW@v<c5
zY-_;7?Mfdymv2u>X+A7lHZ6cv$s_KK0c#d_tdU0Zf&dP=qpo#V&fJ*tS)Ik4?McU*
zYhCJBizl}59}_C<{B=^&e-+o!P5Y_>y_0y39P5`%<MBvj-0dC7`S+gkhK0U=<U4vN
zX?q^>Xo_8Yyn3Z&ei+;Q>@3CgH@{aIpG`SYuw7tZ+qqfS3lr7bAOHJZS6y(@et95Y
zUfZ>aipCe`WPaS8+O#N(w{+F%t!$~rM|)p!ZSwjUF)hqR{N}1<6Zw*|W#30N%1l{q
zdve0WK$%&*0h96<{#>udxvTz%>B+M{lCAsl)(c-@SQ-1M@#lxDGrzw({jK2ALMegW
z=Cd!(+>rR<+Ju5O(IagW89hzE-ug5t=!2TuMr$c|4er>ii#MLQra#jOjXV_@)KO$L
zN!L@xslJ;hWkKePrz(Q)XIB}qZPhRiD>L~pdnTLv{~Hg|gtp5#r<(oyd#A8YZcd%|
zwRy=rM^9vme!H$|-Ti6O^?2*YH-frVmdVUWTertS()V)Hx&(%^vD)i*hW%xPl?9MY
zBY*JWLO)=pN#|l<h@wTNNi8mc<e9nCF80kf5OBT!TeM@l*9s#8mP_%OzZ+zgw{}Ho
zo?bOI?Du=$4+?!VmuCHTQrMRMZf|k=gQasGuiyD6V#@sabFWN4zP0!0r^`0$9!>vk
zrK|IQ#}*qs{<lT;y5ZYnf_oAp=bBvlq|95^8-2iR$<rx^^AndIZ+y3JW#QH(d&OBo
zcF0v;HvJ>oKjBdR4b>dOFaM`?UUHf*SM}k;(e_PRr(VB!{PpF!m&^6PFJCTye!GA?
zSN#5|>)*%S6N;JW)3bDm_Eol7+JD+Y3XBeGsXgf|>Gl;7J;CfHR%TFgzQHhJ<=<%@
zv!=Rx^H?latZvAgu+7zF`^?m}1|8`$n0a0%cx;tQwwAe8WMR=8sGBf-&&^4T72Nn`
z@=dOvxO1#}I<wHN(60Trk6&=(VlZa#VV8AVF|kWY-QaD&a~0oS)^D-bu6{|>Ws}>!
zF80Iux0f4SorG^JX5>m$eY3FrRE6OU8SNfhkt|)Nvsy9c!J4bJmM^)qV7I~AppcJ}
z5pzzz=#Bg=UfP!W)<Yp|LK)ZG4V`M+&h@DHo^_jH_>Omm_OIukj=%q0VdAiR@6tCV
zK6}+%R?OnHVDR?7p*%(SSnmdltqYH&ov%61d22S?Cyh5*OH!WJ?#{S5yOzoE>^09s
zcAs3Yay_4;M^|*sc^e>m_79_58N1B6YbUG>*^2Y6pCz=tOn0`aYCqV`<x}y){?GL%
zn=K|7@1B+49Px1L>O(>I-V|#T+Ba_%`%t^_XM9whT*m#4^I~1yD=U<}Cx7z4+E^F)
z^ReIBqOQkl&zR1xubcb4LOZ^mIl!BdNrV}9<p)g4Kp?>I*AYacRE4-Tf+|A@0BO_&
zGLbY=UNz#j7*shz0LbD=KqeI}hE|m@+p$-cF#ESOGB8r#cBHBk5pmcHAcR9qS!m!8
WcyScq&B_L{ixmhRfd$}iFb@FPQ>W+v

literal 0
HcmV?d00001


From 59876f2d3cb2ac8f6668dffc0b155e74f5afff2d Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Fri, 13 Aug 2021 13:24:36 +0300
Subject: [PATCH 02/23] fix yaml tags

---
 datasets/cedr/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index 6399d7844b7..e6e556c8094 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -10,7 +10,8 @@ licenses:
 - apache-2.0
 multilinguality:
 - monolingual
-pretty_name: CEDR
+paperswithcode_id: null
+pretty_name: cedr
 size_categories:
 - unknown
 source_datasets:
@@ -19,7 +20,6 @@ task_categories:
 - text-classification
 task_ids:
 - multi-class-classification
-- emotion-classification
 - sentiment-classification
 ---
 
@@ -178,4 +178,4 @@ If you have found our results helpful in your work, feel free to cite our public
 
 ### Contributions
 
-Thanks to [@naumov-al](https://github.com/naumov-al) for adding this dataset.
\ No newline at end of file
+Thanks to [@naumov-al](https://github.com/naumov-al) for adding this dataset.

From 24767ad36065bce7c7051814a20f7ce6cfee58ad Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Fri, 13 Aug 2021 13:54:28 +0300
Subject: [PATCH 03/23] fix yaml tags

---
 datasets/cedr/README.md | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index e6e556c8094..36038415e07 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -1,5 +1,4 @@
 ---
-YAML tags:
 annotations_creators:
 - crowdsourced
 language_creators:
@@ -10,8 +9,7 @@ licenses:
 - apache-2.0
 multilinguality:
 - monolingual
-paperswithcode_id: null
-pretty_name: cedr
+pretty_name: The Corpus for Emotions Detecting in Russian-language text sentences
 size_categories:
 - unknown
 source_datasets:
@@ -19,11 +17,11 @@ source_datasets:
 task_categories:
 - text-classification
 task_ids:
-- multi-class-classification
-- sentiment-classification
+- emotion-classification
+- multi-label-classification
 ---
 
-# Dataset Card for [Dataset Name]
+# Dataset Card for [cedr]
 
 ## Table of Contents
 - [Table of Contents](#table-of-contents)

From 1104c703d05eb202e8bbacc27efc6283f6d8717d Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Fri, 13 Aug 2021 14:03:05 +0300
Subject: [PATCH 04/23] final fix yaml tags

---
 datasets/cedr/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index 36038415e07..366cf12dd91 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -17,8 +17,9 @@ source_datasets:
 task_categories:
 - text-classification
 task_ids:
-- emotion-classification
+- sentiment-classification
 - multi-label-classification
+- text-classification-other-emotion-classification
 ---
 
 # Dataset Card for [cedr]

From 292549e69411ba0db093157b5477714af4bda4fc Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Fri, 13 Aug 2021 14:33:21 +0300
Subject: [PATCH 05/23] add some info about annotations in readme

---
 datasets/cedr/README.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index 366cf12dd91..dab4117ee47 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -69,7 +69,7 @@ Dataset with predefined train/test splits.
 
 ### Supported Tasks and Leaderboards
 
-This dataset is intended for multi-class, multi-label emotion classification.
+This dataset is intended for multi-label emotion classification.
 
 ### Languages
 
@@ -124,13 +124,15 @@ Russian-speaking LiveJournal and Tweeter users, and authors of news articles on
 
 Annotating sentences with labels of their emotions was performed with the help of [a crowdsourcing platform](https://yandex.ru/support/toloka/index.html?lang=en).
 
-The annotators’ task was: “What emotions did the author express in the sentence?”. The annotators were allowed to put an arbitrary number of the following emotion labels: "joy", "sadness", "anger", "fear", and "surprise"
+The annotators’ task was: “What emotions did the author express in the sentence?”. The annotators were allowed to put an arbitrary number of the following emotion labels: "joy", "sadness", "anger", "fear", and "surprise".
 
-Sentences were split into tasks and assigned to annotators so that each sentence was annotated at least three times.
+If the accuracy of an annotator on the control sentences (including the trial run) became less than 70%, or if the accuracy was less than 66% over the last six control samples, the annotator was dismissed. 
+
+Sentences were split into tasks and assigned to annotators so that each sentence was annotated at least three times. A label of a specific emotion was assigned to a sentence if put by more than half of the annotators.
 
 #### Who are the annotators?
 
-Only those of the 30% of the best-performing active users (by the platform’s internal rating) who spoke Russian and were over 18 years old were allowed into the annotation process.
+Only those of the 30% of the best-performing active users (by the platform’s internal rating) who spoke Russian and were over 18 years old were allowed into the annotation process. Moreover, before a platform user could be employed as an annotator, they underwent a training task, after which they were to mark 25 trial samples with more than 80% agreement compared to the annotation that we had performed ourselves.
 
 ### Personal and Sensitive Information
 
@@ -163,7 +165,7 @@ The GitHub repository which houses this dataset has an Apache License 2.0.
 ### Citation Information
 
 If you have found our results helpful in your work, feel free to cite our publication and this repository as
-'''
+```
 @article{sboev2021data,
   title={Data-Driven Model for Emotion Detection in Russian Texts},
   author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},
@@ -173,7 +175,7 @@ If you have found our results helpful in your work, feel free to cite our public
   year={2021},
   publisher={Elsevier}
 }
-'''
+```
 
 ### Contributions
 

From 99746c1c7f4b34f4efc32ea4541ade6125d93305 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Fri, 13 Aug 2021 17:38:54 +0300
Subject: [PATCH 06/23] update citation Information

---
 datasets/cedr/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index dab4117ee47..070083f2096 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -163,8 +163,7 @@ Researchers at AI technology lab at NRC "Kurchatov Institute". See the author [l
 The GitHub repository which houses this dataset has an Apache License 2.0.
 
 ### Citation Information
-
-If you have found our results helpful in your work, feel free to cite our publication and this repository as
+If you have found our results helpful in your work, feel free to cite our publication. This is an updated version of the dataset, the collection and preparation of which is described here:
 ```
 @article{sboev2021data,
   title={Data-Driven Model for Emotion Detection in Russian Texts},

From 5182d554ea5bcfd9941a86293b732eac46106d49 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Wed, 18 Aug 2021 14:21:23 +0300
Subject: [PATCH 07/23] fix data urls

---
 datasets/cedr/cedr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasets/cedr/cedr.py b/datasets/cedr/cedr.py
index 07c60205b31..6437fd7ec3a 100644
--- a/datasets/cedr/cedr.py
+++ b/datasets/cedr/cedr.py
@@ -55,8 +55,8 @@
 # The HuggingFace dataset library don't host the datasets but only point to the original files
 # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
 _URLs = {
-    "main": "https://drive.google.com/u/0/uc?id=14W03yKPvsTW6_pHkSw7vWi7T80wLAkJL&export=download",
-    "enriched": "https://drive.google.com/u/0/uc?id=1PsSr0PkIhXNQxXBjfPJ9RoPklH0zDnRv&export=download",
+    "main": "https://drive.google.com/u/0/uc?id=1flukWj075o7InAlBZm7htXEqZM6D-LfA&export=download",
+    "enriched": "https://drive.google.com/u/0/uc?id=1w9fJf9SwxtEABy5uhougn_2ZlxKBbZHV&export=download",
 }
 
 
From 845df318e30829b4ff7e22c72d82054734a63ccb Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Wed, 18 Aug 2021 15:26:00 +0300
Subject: [PATCH 08/23] update dataset

---
 datasets/cedr/cedr.py                         |  14 +-
 datasets/cedr/dataset_infos.json              | 147 +-----------------
 .../cedr/dummy/enriched/0.1.1/dummy_data.zip  | Bin 3940 -> 3940 bytes
 datasets/cedr/dummy/main/0.1.1/dummy_data.zip | Bin 2386 -> 2386 bytes
 4 files changed, 3 insertions(+), 158 deletions(-)

diff --git a/datasets/cedr/cedr.py b/datasets/cedr/cedr.py
index 6437fd7ec3a..a0450a5bea9 100644
--- a/datasets/cedr/cedr.py
+++ b/datasets/cedr/cedr.py
@@ -105,14 +105,7 @@ def _info(self):
                     "text": datasets.Value("string"),
                     "labels": datasets.features.Sequence(datasets.Value("string")),
                     "source": datasets.Value("string"),
-                    "sentences": [
-                        [
-                            {
-                                "forma": datasets.Value("string"),
-                                "lemma": datasets.Value("string"),
-                            }
-                        ]
-                    ]
+                    "sentences": [[{"forma": datasets.Value("string"), "lemma": datasets.Value("string"),}]]
                     # These are the features of your dataset like images, labels ...
                 }
             )
@@ -147,10 +140,7 @@ def _split_generators(self, dl_manager):
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 # These kwargs will be passed to _generate_examples
-                gen_kwargs={
-                    "filepath": os.path.join(data_dir, self.config.name, "train.jsonl"),
-                    "split": "train",
-                },
+                gen_kwargs={"filepath": os.path.join(data_dir, self.config.name, "train.jsonl"), "split": "train",},
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,
diff --git a/datasets/cedr/dataset_infos.json b/datasets/cedr/dataset_infos.json
index 126f4438dc7..1778adc3733 100644
--- a/datasets/cedr/dataset_infos.json
+++ b/datasets/cedr/dataset_infos.json
@@ -1,146 +1 @@
-{
-  "main": {
-    "description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n",
-    "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n",
-    "homepage": "https://github.com/sag111/CEDR",
-    "license": "http://www.apache.org/licenses/LICENSE-2.0",
-    "features": {
-      "text": {
-        "dtype": "string",
-        "id": null,
-        "_type": "Value"
-      },
-      "labels": {
-        "feature": {
-          "dtype": "string",
-          "id": null,
-          "_type": "Value"
-        },
-        "length": -1,
-        "id": null,
-        "_type": "Sequence"
-      },
-      "source": {
-        "dtype": "string",
-        "id": null,
-        "_type": "Value"
-      }
-    },
-    "post_processed": null,
-    "supervised_keys": null,
-    "task_templates": null,
-    "builder_name": "cedr",
-    "config_name": "main",
-    "version": {
-      "version_str": "0.1.1",
-      "description": null,
-      "major": 0,
-      "minor": 1,
-      "patch": 1
-    },
-    "splits": {
-      "train": {
-        "name": "train",
-        "num_bytes": 1423761,
-        "num_examples": 7528,
-        "dataset_name": "cedr"
-      },
-      "test": {
-        "name": "test",
-        "num_bytes": 351850,
-        "num_examples": 1882,
-        "dataset_name": "cedr"
-      }
-    },
-    "download_checksums": {
-      "https://drive.google.com/u/0/uc?id=14W03yKPvsTW6_pHkSw7vWi7T80wLAkJL&export=download": {
-        "num_bytes": 693022,
-        "checksum": "27c23ac9c2ea836daf3f87f128130643b3a8b1ed5dae23794acaa9cdb374bf09"
-      }
-    },
-    "download_size": 693022,
-    "post_processing_size": null,
-    "dataset_size": 1775611,
-    "size_in_bytes": 2468633
-  },
-  "enriched": {
-    "description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n",
-    "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n",
-    "homepage": "https://github.com/sag111/CEDR",
-    "license": "http://www.apache.org/licenses/LICENSE-2.0",
-    "features": {
-      "text": {
-        "dtype": "string",
-        "id": null,
-        "_type": "Value"
-      },
-      "labels": {
-        "feature": {
-          "dtype": "string",
-          "id": null,
-          "_type": "Value"
-        },
-        "length": -1,
-        "id": null,
-        "_type": "Sequence"
-      },
-      "source": {
-        "dtype": "string",
-        "id": null,
-        "_type": "Value"
-      },
-      "sentences": [
-        [
-          {
-            "forma": {
-              "dtype": "string",
-              "id": null,
-              "_type": "Value"
-            },
-            "lemma": {
-              "dtype": "string",
-              "id": null,
-              "_type": "Value"
-            }
-          }
-        ]
-      ]
-    },
-    "post_processed": null,
-    "supervised_keys": null,
-    "task_templates": null,
-    "builder_name": "cedr",
-    "config_name": "enriched",
-    "version": {
-      "version_str": "0.1.1",
-      "description": null,
-      "major": 0,
-      "minor": 1,
-      "patch": 1
-    },
-    "splits": {
-      "train": {
-        "name": "train",
-        "num_bytes": 4797772,
-        "num_examples": 7528,
-        "dataset_name": "cedr"
-      },
-      "test": {
-        "name": "test",
-        "num_bytes": 1183918,
-        "num_examples": 1882,
-        "dataset_name": "cedr"
-      }
-    },
-    "download_checksums": {
-      "https://drive.google.com/u/0/uc?id=1PsSr0PkIhXNQxXBjfPJ9RoPklH0zDnRv&export=download": {
-        "num_bytes": 1822528,
-        "checksum": "ba194fe6446c639ddf70d2c776d41c50b7a2b6f2547f3cdb300bc5cc7ab89c89"
-      }
-    },
-    "download_size": 1822528,
-    "post_processing_size": null,
-    "dataset_size": 5981690,
-    "size_in_bytes": 7804218
-  }
-}
\ No newline at end of file
+{"main": {"description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n", "homepage": "https://github.com/sag111/CEDR", "license": "http://www.apache.org/licenses/LICENSE-2.0", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "cedr", "config_name": "main", "version": {"version_str": "0.1.1", "description": null, "major": 0, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1423880, "num_examples": 7528, "dataset_name": "cedr"}, "test": {"name": "test", "num_bytes": 351868, "num_examples": 1882, "dataset_name": "cedr"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=1flukWj075o7InAlBZm7htXEqZM6D-LfA&export=download": {"num_bytes": 693026, "checksum": "d81e6d19679a903773b8776c4c0f68755d55596e6b34866fbaa9d39d2e385bd3"}}, "download_size": 693026, "post_processing_size": null, "dataset_size": 1775748, 
"size_in_bytes": 2468774}, "enriched": {"description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n", "homepage": "https://github.com/sag111/CEDR", "license": "http://www.apache.org/licenses/LICENSE-2.0", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "sentences": [[{"forma": {"dtype": "string", "id": null, "_type": "Value"}, "lemma": {"dtype": "string", "id": null, "_type": "Value"}}]]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "cedr", "config_name": "enriched", "version": {"version_str": "0.1.1", "description": null, "major": 0, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4797891, "num_examples": 7528, "dataset_name": "cedr"}, "test": {"name": "test", "num_bytes": 1183936, "num_examples": 1882, "dataset_name": "cedr"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=1w9fJf9SwxtEABy5uhougn_2ZlxKBbZHV&export=download": {"num_bytes": 1822522, 
"checksum": "3b0ee43108ca6a52ce21037d35c99538a4a80e9dba5bd3d02b3ff17d4d89b2b7"}}, "download_size": 1822522, "post_processing_size": null, "dataset_size": 5981827, "size_in_bytes": 7804349}}
\ No newline at end of file
diff --git a/datasets/cedr/dummy/enriched/0.1.1/dummy_data.zip b/datasets/cedr/dummy/enriched/0.1.1/dummy_data.zip
index 5255ad2f629635ed9fefcf440d218f26dacf138e..8b34b08f2df44ad9b6e0809657ba805266c98a81 100644
GIT binary patch
delta 431
zcmaDN_e732z?+#xgaHKZRtZhyRbvKGX6g#%mT6vTmciu}C9aN6m8PW``K9T3@kUWO
z72ZxsQ66EF=kbbjBWs;cgr1pv2n#tt7ETsqBubm+=0-*yHjvvUPhclXhy3Pm?Bc9o
z9Rj>&VA_+{lo)R*f)yVkMlsMo`~lvKOd`ynAcXkHg^vhZX7ZVWZMjICV)@Cg{PF;z
C7>WP@

delta 431
zcmaDN_e732z?+#xgaHJ8C-Y9^RbvKGX6gz7#lb}e0ok4z5q^Oc5l&fY0bZ6t`2pEE
z9tKq|c|m29=kbbjBWs;cgr1pv2n#tt7ETsqBubm+=0-*yHjvvUPhclXhy3Pm?Bc9o
z9Rj>&VA_+{lo)R*f)yVkMlsMo`~lvKOd`ynAcXkHg^vhZX7ZVWZMjICV)@Cg{PF;o
CyoY=M

diff --git a/datasets/cedr/dummy/main/0.1.1/dummy_data.zip b/datasets/cedr/dummy/main/0.1.1/dummy_data.zip
index 11b721df8d4a10932e05a8d96486c5aeb759f8f6..19236e6d5a4f0c39b0fb8f7666bdfa5fa26e46e1 100644
GIT binary patch
delta 439
zcmca4bV-Oez?+#xgaHKZR0&PwRbvKGX6g!QIi=a*SqA2&`R1N^jyX<Ix#k%q5w3+%
zzGg1EK534V1vx~~G|wkWmkc+=LJp9HlX)45(j>JxkMSZC$ZeCIScufIm?Z_GgNxk^
ztkap@lqhdaUdzq{*06^-4d5`FY{fy8=~9#P7`gcaycwB9m_gwV35LVOYBu6j006rL
Bf4cwx

delta 439
zcmca4bV-Oez?+#xgaHJ8CG$??RbvKGX6gzi;ReQ)-T`IBA>n561s>VK<>qDKndTuD
z2IW4E*<L=A1vx~~G|wkWmkc+=LJp9HlX)45(j>JxkMSZC$ZeCIScufIm?Z_GgNxk^
ztkap@lqhdaUdzq{*06^-4d5`FY{fy8=~9#P7`gcaycwB9m_gwV35LVOYBu6j003(w
Bf0qCN


From 2ff3d20bf295359b103a1be7aefe76ab32d7822e Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:25:44 +0300
Subject: [PATCH 09/23] update pretty_name in readme

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index 070083f2096..2f5c8944b3d 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -9,7 +9,7 @@ licenses:
 - apache-2.0
 multilinguality:
 - monolingual
-pretty_name: The Corpus for Emotions Detecting in Russian-language text sentences
+pretty_name: The Corpus for Emotions Detecting in Russian-language text sentences (CEDR)
 size_categories:
 - unknown
 source_datasets:

From 78ca79bca98acdc1097cb475d79323649bc777d9 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:27:26 +0300
Subject: [PATCH 10/23] add size_categories

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index 2f5c8944b3d..a154c76f69c 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -11,7 +11,7 @@ multilinguality:
 - monolingual
 pretty_name: The Corpus for Emotions Detecting in Russian-language text sentences (CEDR)
 size_categories:
-- unknown
+- 1K<n<10K
 source_datasets:
 - original
 task_categories:

From 3b3e052a2f623f748f6716c046fc8088c1776918 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:27:56 +0300
Subject: [PATCH 11/23] Update point of contact

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index a154c76f69c..5b0414f3d89 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -55,7 +55,7 @@ task_ids:
 - **Repository:** https://github.com/sag111/CEDR
 - **Paper:** https://www.sciencedirect.com/science/article/pii/S1877050921013247
 - **Leaderboard:**
-- **Point of Contact:** sag111@mail.ru
+- **Point of Contact:** [@sag111](mailto:sag111@mail.ru)
 
 ### Dataset Summary
 

From a9b6ce1bb3ea14f26d8887dda062b8edf8ebdd0f Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:28:23 +0300
Subject: [PATCH 12/23] update links

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index 5b0414f3d89..cf115c07dd9 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -51,9 +51,9 @@ task_ids:
 
 ## Dataset Description
 
-- **Homepage:** https://github.com/sag111/CEDR
-- **Repository:** https://github.com/sag111/CEDR
-- **Paper:** https://www.sciencedirect.com/science/article/pii/S1877050921013247
+- **Homepage:** [GitHub](https://github.com/sag111/CEDR)
+- **Repository:** [GitHub](https://github.com/sag111/CEDR)
+- **Paper:** [ScienceDirect](https://www.sciencedirect.com/science/article/pii/S1877050921013247)
 - **Leaderboard:**
 - **Point of Contact:** [@sag111](mailto:sag111@mail.ru)
 

From 01c375899b3d0c158490e2483f44a4a126111ea2 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:31:39 +0300
Subject: [PATCH 13/23] update class name

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/cedr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/cedr.py b/datasets/cedr/cedr.py
index a0450a5bea9..83cf2d9a868 100644
--- a/datasets/cedr/cedr.py
+++ b/datasets/cedr/cedr.py
@@ -61,7 +61,7 @@
 
 
 # TODO: Name of the dataset usually match the script name with CamelCase instead of snake_case
-class cedr(datasets.GeneratorBasedBuilder):
+class Cedr(datasets.GeneratorBasedBuilder):
     """This dataset is designed to solve emotion recognition task for text data in Russian."""
 
     VERSION = datasets.Version("0.1.1")

From 33f58297d9904a6fff3167340cd802b2ce95448c Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:32:49 +0300
Subject: [PATCH 14/23] update readme text

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index cf115c07dd9..41297236267 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -110,7 +110,7 @@ The formed dataset of examples consists of sentences in Russian from several sou
 
 Data was collected from several sources: posts of the Live Journal social network, texts of the online news agency Lenta.ru, and Twitter microblog posts.
 
-Only those sentences were selected that contained marker words from the dictionary of [the emotive vocabulary of the Russian language](http://lexrus.ru/default.aspx?p=2876). We manually formed a list of marker words for each emotion by choosing words from different categories of the dictionary.
+Only those sentences were selected that contained marker words from the dictionary of [the emotive vocabulary of the Russian language](http://lexrus.ru/default.aspx?p=2876). The authors manually formed a list of marker words for each emotion by choosing words from different categories of the dictionary.
 
 In total, 3098 sentences were selected from LiveJournal posts, 2847 sentences from Lenta.Ru, and 3465 sentencesfrom Twitter. After selection, sentences were offered to annotators for labeling.
 

From 37bea9534ece1483e8d769ab1ddff3eb99d9ee4c Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:33:02 +0300
Subject: [PATCH 15/23] update readme text

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index 41297236267..d1c218f9600 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -132,7 +132,7 @@ Sentences were split into tasks and assigned to annotators so that each sentence
 
 #### Who are the annotators?
 
-Only those of the 30% of the best-performing active users (by the platform’s internal rating) who spoke Russian and were over 18 years old were allowed into the annotation process. Moreover, before a platform user could be employed as an annotator, they underwent a training task, after which they were to mark 25 trial samples with more than 80% agreement compared to the annotation that we had performed ourselves.
+Only those of the 30% of the best-performing active users (by the platform’s internal rating) who spoke Russian and were over 18 years old were allowed into the annotation process. Moreover, before a platform user could be employed as an annotator, they underwent a training task, after which they were to mark 25 trial samples with more than 80% agreement compared to the annotation that the authors had performed themselves.
 
 ### Personal and Sensitive Information
 

From 2470532cfda1911e67d21511cc1c96fa76aa92a4 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:37:51 +0300
Subject: [PATCH 16/23] add ClassLabel

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/cedr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/cedr.py b/datasets/cedr/cedr.py
index 83cf2d9a868..8c11279290c 100644
--- a/datasets/cedr/cedr.py
+++ b/datasets/cedr/cedr.py
@@ -94,7 +94,7 @@ def _info(self):
             features = datasets.Features(
                 {
                     "text": datasets.Value("string"),
-                    "labels": datasets.features.Sequence(datasets.Value("string")),
+                    "labels": datasets.features.Sequence(datasets.ClassLabel(names=["joy," "sadness", "surprise", "fear", "anger"])),
                     "source": datasets.Value("string"),
                     # These are the features of your dataset like images, labels ...
                 }

From ebccf7b54e88ee665a285c6bb7d19b2b93a862ef Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:38:07 +0300
Subject: [PATCH 17/23] add ClassLabel

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/cedr.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/cedr.py b/datasets/cedr/cedr.py
index 8c11279290c..7d5634b4993 100644
--- a/datasets/cedr/cedr.py
+++ b/datasets/cedr/cedr.py
@@ -103,7 +103,7 @@ def _info(self):
             features = datasets.Features(
                 {
                     "text": datasets.Value("string"),
-                    "labels": datasets.features.Sequence(datasets.Value("string")),
+                    "labels": datasets.features.Sequence(datasets.ClassLabel(names=["joy," "sadness", "surprise", "fear", "anger"])),
                     "source": datasets.Value("string"),
                     "sentences": [[{"forma": datasets.Value("string"), "lemma": datasets.Value("string"),}]]
                     # These are the features of your dataset like images, labels ...

From f4b9317081e147caf959bcbbcd96c63db04a5b8a Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:38:29 +0300
Subject: [PATCH 18/23] update language tag

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
---
 datasets/cedr/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index d1c218f9600..33a10584eae 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -4,7 +4,7 @@ annotations_creators:
 language_creators:
 - found
 languages:
-- ru-RU
+- ru
 licenses:
 - apache-2.0
 multilinguality:

From 17eaa47981edce8981c4ec20974b2bc3f9c51169 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 14:56:11 +0300
Subject: [PATCH 19/23] fix typo

---
 datasets/cedr/cedr.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/datasets/cedr/cedr.py b/datasets/cedr/cedr.py
index 7d5634b4993..d3aabd2ef08 100644
--- a/datasets/cedr/cedr.py
+++ b/datasets/cedr/cedr.py
@@ -94,7 +94,7 @@ def _info(self):
             features = datasets.Features(
                 {
                     "text": datasets.Value("string"),
-                    "labels": datasets.features.Sequence(datasets.ClassLabel(names=["joy," "sadness", "surprise", "fear", "anger"])),
+                    "labels": datasets.features.Sequence(datasets.ClassLabel(names=["joy", "sadness", "surprise", "fear", "anger"])),
                     "source": datasets.Value("string"),
                     # These are the features of your dataset like images, labels ...
                 }
@@ -103,7 +103,7 @@ def _info(self):
             features = datasets.Features(
                 {
                     "text": datasets.Value("string"),
-                    "labels": datasets.features.Sequence(datasets.ClassLabel(names=["joy," "sadness", "surprise", "fear", "anger"])),
+                    "labels": datasets.features.Sequence(datasets.ClassLabel(names=["joy", "sadness", "surprise", "fear", "anger"])),
                     "source": datasets.Value("string"),
                     "sentences": [[{"forma": datasets.Value("string"), "lemma": datasets.Value("string"),}]]
                     # These are the features of your dataset like images, labels ...

From 1d36b846076d1de2cc43f15337e513ba55208fc5 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 15:29:52 +0300
Subject: [PATCH 20/23] add example from the dataset

---
 datasets/cedr/README.md | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/datasets/cedr/README.md b/datasets/cedr/README.md
index 33a10584eae..eb4d3921ea9 100644
--- a/datasets/cedr/README.md
+++ b/datasets/cedr/README.md
@@ -79,7 +79,33 @@ The data is in Russian.
 
 ### Data Instances
 
-Each instance is a text sentence in Russian from several sources with one or more emotion annotations.
+Each instance is a text sentence in Russian from several sources with one or more emotion annotations (or no emotion at all).
+
+An example of an instance from the dataset is shown below:
+```
+{
+  'text': 'Забавно как люди в возрасте удивляются входящим звонкам на мобильник)',
+  'labels': [0],
+  'source': 'twitter',
+  'sentences': [
+    [
+      {'forma': 'Забавно', 'lemma': 'Забавно'},
+      {'forma': 'как', 'lemma': 'как'},
+      {'forma': 'люди', 'lemma': 'человек'},
+      {'forma': 'в', 'lemma': 'в'},
+      {'forma': 'возрасте', 'lemma': 'возраст'},
+      {'forma': 'удивляются', 'lemma': 'удивляться'},
+      {'forma': 'входящим', 'lemma': 'входить'},
+      {'forma': 'звонкам', 'lemma': 'звонок'},
+      {'forma': 'на', 'lemma': 'на'},
+      {'forma': 'мобильник', 'lemma': 'мобильник'},
+      {'forma': ')', 'lemma': ')'}
+    ]
+  ]
+}
+```
+
+Emotion label codes: {0: "joy", 1: "sadness", 2: "surprise", 3: "fear", 4: "anger"}
 
 ### Data Fields
 
@@ -112,7 +138,7 @@ Data was collected from several sources: posts of the Live Journal social networ
 
 Only those sentences were selected that contained marker words from the dictionary of [the emotive vocabulary of the Russian language](http://lexrus.ru/default.aspx?p=2876). The authors manually formed a list of marker words for each emotion by choosing words from different categories of the dictionary.
 
-In total, 3098 sentences were selected from LiveJournal posts, 2847 sentences from Lenta.Ru, and 3465 sentencesfrom Twitter. After selection, sentences were offered to annotators for labeling.
+In total, 3069 sentences were selected from LiveJournal posts, 2851 sentences from Lenta.Ru, and 3490 sentences from Twitter. After selection, sentences were offered to annotators for labeling.
 
 #### Who are the source language producers?
 

From ffd81542cf19f7a16851636f3589d4bb54ac23f2 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 15:30:53 +0300
Subject: [PATCH 21/23] update host in urls

---
 datasets/cedr/cedr.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/datasets/cedr/cedr.py b/datasets/cedr/cedr.py
index d3aabd2ef08..4bcdcf7c90a 100644
--- a/datasets/cedr/cedr.py
+++ b/datasets/cedr/cedr.py
@@ -55,8 +55,8 @@
 # The HuggingFace dataset library don't host the datasets but only point to the original files
 # This can be an arbitrary nested dict/list of URLs (see below in `_split_generators` method)
 _URLs = {
-    "main": "https://drive.google.com/u/0/uc?id=1flukWj075o7InAlBZm7htXEqZM6D-LfA&export=download",
-    "enriched": "https://drive.google.com/u/0/uc?id=1w9fJf9SwxtEABy5uhougn_2ZlxKBbZHV&export=download",
+    "main": "https://sagteam.ru/cedr/main.zip",
+    "enriched": "https://sagteam.ru/cedr/enriched.zip",
 }
 
 
@@ -94,7 +94,9 @@ def _info(self):
             features = datasets.Features(
                 {
                     "text": datasets.Value("string"),
-                    "labels": datasets.features.Sequence(datasets.ClassLabel(names=["joy", "sadness", "surprise", "fear", "anger"])),
+                    "labels": datasets.features.Sequence(
+                        datasets.ClassLabel(names=["joy", "sadness", "surprise", "fear", "anger"])
+                    ),
                     "source": datasets.Value("string"),
                     # These are the features of your dataset like images, labels ...
                 }
@@ -103,9 +105,18 @@ def _info(self):
             features = datasets.Features(
                 {
                     "text": datasets.Value("string"),
-                    "labels": datasets.features.Sequence(datasets.ClassLabel(names=["joy", "sadness", "surprise", "fear", "anger"])),
+                    "labels": datasets.features.Sequence(
+                        datasets.ClassLabel(names=["joy", "sadness", "surprise", "fear", "anger"])
+                    ),
                     "source": datasets.Value("string"),
-                    "sentences": [[{"forma": datasets.Value("string"), "lemma": datasets.Value("string"),}]]
+                    "sentences": [
+                        [
+                            {
+                                "forma": datasets.Value("string"),
+                                "lemma": datasets.Value("string"),
+                            }
+                        ]
+                    ]
                     # These are the features of your dataset like images, labels ...
                 }
             )
@@ -140,7 +151,10 @@ def _split_generators(self, dl_manager):
             datasets.SplitGenerator(
                 name=datasets.Split.TRAIN,
                 # These kwargs will be passed to _generate_examples
-                gen_kwargs={"filepath": os.path.join(data_dir, self.config.name, "train.jsonl"), "split": "train",},
+                gen_kwargs={
+                    "filepath": os.path.join(data_dir, self.config.name, "train.jsonl"),
+                    "split": "train",
+                },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split.TEST,

From a0b49410eb3eebcf105594fc282b081003d7e8ed Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 15:31:14 +0300
Subject: [PATCH 22/23] update info

---
 datasets/cedr/dataset_infos.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasets/cedr/dataset_infos.json b/datasets/cedr/dataset_infos.json
index 1778adc3733..e072d0a3c76 100644
--- a/datasets/cedr/dataset_infos.json
+++ b/datasets/cedr/dataset_infos.json
@@ -1 +1 @@
-{"main": {"description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n", "homepage": "https://github.com/sag111/CEDR", "license": "http://www.apache.org/licenses/LICENSE-2.0", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "cedr", "config_name": "main", "version": {"version_str": "0.1.1", "description": null, "major": 0, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1423880, "num_examples": 7528, "dataset_name": "cedr"}, "test": {"name": "test", "num_bytes": 351868, "num_examples": 1882, "dataset_name": "cedr"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=1flukWj075o7InAlBZm7htXEqZM6D-LfA&export=download": {"num_bytes": 693026, "checksum": "d81e6d19679a903773b8776c4c0f68755d55596e6b34866fbaa9d39d2e385bd3"}}, "download_size": 693026, "post_processing_size": null, "dataset_size": 1775748, 
"size_in_bytes": 2468774}, "enriched": {"description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n", "homepage": "https://github.com/sag111/CEDR", "license": "http://www.apache.org/licenses/LICENSE-2.0", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "sentences": [[{"forma": {"dtype": "string", "id": null, "_type": "Value"}, "lemma": {"dtype": "string", "id": null, "_type": "Value"}}]]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "cedr", "config_name": "enriched", "version": {"version_str": "0.1.1", "description": null, "major": 0, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4797891, "num_examples": 7528, "dataset_name": "cedr"}, "test": {"name": "test", "num_bytes": 1183936, "num_examples": 1882, "dataset_name": "cedr"}}, "download_checksums": {"https://drive.google.com/u/0/uc?id=1w9fJf9SwxtEABy5uhougn_2ZlxKBbZHV&export=download": {"num_bytes": 1822522, 
"checksum": "3b0ee43108ca6a52ce21037d35c99538a4a80e9dba5bd3d02b3ff17d4d89b2b7"}}, "download_size": 1822522, "post_processing_size": null, "dataset_size": 5981827, "size_in_bytes": 7804349}}
\ No newline at end of file
+{"main": {"description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n", "homepage": "https://github.com/sag111/CEDR", "license": "http://www.apache.org/licenses/LICENSE-2.0", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"num_classes": 5, "names": ["joy", "sadness", "surprise", "fear", "anger"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "cedr", "config_name": "main", "version": {"version_str": "0.1.1", "description": null, "major": 0, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1418355, "num_examples": 7528, "dataset_name": "cedr"}, "test": {"name": "test", "num_bytes": 350275, "num_examples": 1882, "dataset_name": "cedr"}}, "download_checksums": {"https://sagteam.ru/cedr/main.zip": {"num_bytes": 693026, "checksum": "d81e6d19679a903773b8776c4c0f68755d55596e6b34866fbaa9d39d2e385bd3"}}, "download_size": 693026, "post_processing_size": null, 
"dataset_size": 1768630, "size_in_bytes": 2461656}, "enriched": {"description": "This new dataset is designed to solve emotion recognition task for text data in Russian. The Corpus for Emotions Detecting in\nRussian-language text sentences of different social sources (CEDR) contains 9410 sentences in Russian labeled for 5 emotion\ncategories. The data collected from different sources: posts of the LiveJournal social network, texts of the online news\nagency Lenta.ru, and Twitter microblog posts. There are two variants of the corpus: main and enriched. The enriched variant\nis include tokenization and lemmatization. Dataset with predefined train/test splits.\n", "citation": "@article{sboev2021data,\n  title={Data-Driven Model for Emotion Detection in Russian Texts},\n  author={Sboev, Alexander and Naumov, Aleksandr and Rybka, Roman},\n  journal={Procedia Computer Science},\n  volume={190},\n  pages={637--642},\n  year={2021},\n  publisher={Elsevier}\n}\n", "homepage": "https://github.com/sag111/CEDR", "license": "http://www.apache.org/licenses/LICENSE-2.0", "features": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "labels": {"feature": {"num_classes": 5, "names": ["joy", "sadness", "surprise", "fear", "anger"], "names_file": null, "id": null, "_type": "ClassLabel"}, "length": -1, "id": null, "_type": "Sequence"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "sentences": [[{"forma": {"dtype": "string", "id": null, "_type": "Value"}, "lemma": {"dtype": "string", "id": null, "_type": "Value"}}]]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "cedr", "config_name": "enriched", "version": {"version_str": "0.1.1", "description": null, "major": 0, "minor": 1, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4792366, "num_examples": 7528, "dataset_name": "cedr"}, "test": {"name": "test", "num_bytes": 1182343, "num_examples": 1882, "dataset_name": "cedr"}}, "download_checksums": 
{"https://sagteam.ru/cedr/enriched.zip": {"num_bytes": 1822522, "checksum": "3b0ee43108ca6a52ce21037d35c99538a4a80e9dba5bd3d02b3ff17d4d89b2b7"}}, "download_size": 1822522, "post_processing_size": null, "dataset_size": 5974709, "size_in_bytes": 7797231}}
\ No newline at end of file

From e826ae6e8481a23ccb6b40a7b51788b2a66a58f9 Mon Sep 17 00:00:00 2001
From: Aleksandr <sanya-naumov@mail.ru>
Date: Thu, 19 Aug 2021 15:31:31 +0300
Subject: [PATCH 23/23] update dummy

---
 .../cedr/dummy/enriched/0.1.1/dummy_data.zip  | Bin 3940 -> 3532 bytes
 datasets/cedr/dummy/main/0.1.1/dummy_data.zip | Bin 2386 -> 1946 bytes
 2 files changed, 0 insertions(+), 0 deletions(-)

diff --git a/datasets/cedr/dummy/enriched/0.1.1/dummy_data.zip b/datasets/cedr/dummy/enriched/0.1.1/dummy_data.zip
index 8b34b08f2df44ad9b6e0809657ba805266c98a81..14fb2f45504faea34e219512b685529a4f6a452f 100644
GIT binary patch
delta 530
zcmaDNcSf2wz?+#xgaHJCs)Z-=s?`IjU@#v_NC4@S(%jt2_>{zwME%sfqRixs)D*p{
z%mRHB%_zDQ@#umX0oTp}vOAc=bM`4FHlRiceFg>{-1;}?Fe<WXgLJfB^L)a@%D|AP
z#=xMBN5gA&C01sjhRFiFW?20P)C&Xx41XOpfi!O4GBSxU!vhP$zhFbaro`hm#UCDm
zK<~rMLk<9h{~_ivFl=d@hugf#Yxxu{u>}S~_bWD>ewu8>ufWyF$iQIA15ytF&NqF(

delta 944
zcmX>j{X~v8z?+#xgaHKZRtZhyRjUV5!C*d=@CMQ;rMbD4@hOQViTb6<s>W`aDXPXU
zhUJ!NUTK!W<rO8aj!u=Pr5X99>3Q)+Q8^XfPDxQ7VX8)EsTBqJMI}JRDf#7jIr)hx
z`Y86HSQG-bsGe#zrREi7CTFA~tmFWBF_^=1_9-Sd1_lXz28MV{zgbh@xs8_<+2TNU
zv|jUk!o<qJkf+AL5Qk|81y*d9XIEln2HG*%g~v<~Bd#WQ^T^aw9z~2yBFykafe}eS
z4*`Jy!(YcqKpG=$)RUh$kfIFc1niLpb^(ys(#XPy?gSf3(&uCkUPW*gK#xL%jm_-n
oHd11tKHSeGsl_FFS;hHz;yD4{tiTk?z`zQGzDx`ZjJzNo03W>$ssI20

diff --git a/datasets/cedr/dummy/main/0.1.1/dummy_data.zip b/datasets/cedr/dummy/main/0.1.1/dummy_data.zip
index 19236e6d5a4f0c39b0fb8f7666bdfa5fa26e46e1..c5e218d1be523917af6de216748d0e01f07a6231 100644
GIT binary patch
delta 491
zcmca4G>e}%z?+#xgaHJCs)Z-=s?`IjU@#v_fFx5&b8{=>QxZ!O^>Y(5^Yp4R3-nPm
zqUeysp#!89u8jj^UvO*x^rk&b3=AiE7#LK*+5|Cd+ML07ok<;}TK?d}g?>QQ>0As9
z%GfR0JeMVli5aMPG8em<AckKi2eQl5BYA<5NrV~hdkjwitpEZ6hQE%{KpI;hK)nPr
z2D_)grT~d8jgvse;54VC2o$zi#rb(T0p6%aqWc_S<OF7{E}N{wDZ`b@$iN`a22u+E
Ds>N|c

delta 980
zcmbQme@Tcpz?+#xgaHKZR0&PwRjUV5!C*d=@CMQ;rMbD4@hOQViTb6<s>W`aDXPXU
zhG{va+2L6R=BD}Po_UTrPEon$86^>}g;Bm{F1kKxj;cmxsTBqJMI}JRDf#7jIr)hx
z`Y86HSQG%ZsGe#z<tAq4!L8!}c`vxNe|pm%CI*I+JPZuc*!^WrrpGoOzRnZ}vP1sh
z!-al8JJPur7^1M-L8cX(4>D&lF$3+GtjA`CHJ*Uh0f7L+Uq?+KP1RUqWD;S9rxRej
z)q|}96QGEL0Fcd-fJ`dd42?yY_3(JCM~O(71GY3WFjC+8k|JPI(#tB&&&vt$hPtGl
m0pb+&=tj82l!XQ^Ni8mcI7BSKo0Sb@87mMv0@K@XFb@DUnGziU