ShakeHakobyan · ShakeHakobyan · Feb 6, 2023 · Mar 6, 2023 · Mar 7, 2023
diff --git a/stanza/utils/datasets/ner/convert_hy_armtdp.py b/stanza/utils/datasets/ner/convert_hy_armtdp.py
@@ -0,0 +1,140 @@
+"""
+Convert the ArmTDP-NER dataset to BIOES format
+
+The dataset is here:
+
+https://github.com/myavrum/ArmTDP-NER.git
+"""
+
+import argparse
+import os
+import json
+import re
+import stanza
+import random
+from tqdm import tqdm
+# Armenian tokenization pipeline used by format_sentences.
+# NOTE(review): this is constructed when the module is imported, and
+# prepare_ner_dataset imports this module unconditionally, so presumably the
+# pipeline gets built even when a different dataset is being prepared --
+# confirm, and consider constructing it lazily.
+nlp_hy = stanza.Pipeline(lang='hy', processors='tokenize')
+
+
+def read_data(path: str) -> list:
+    """
+    Reads the Armenian named entity recognition dataset
+
+    The file is expected to hold one JSON object per line.  It is always
+    decoded as UTF-8, so the Armenian text is read the same way regardless
+    of the platform's default encoding.  Lines containing only whitespace
+    are skipped instead of being handed to the JSON parser.
+
+    path: location of the JSON-lines file
+
+    Returns a list of dictionaries.
+    Each dictionary contains information
+    about a paragraph (text, labels, etc.)
+
+    Raises json.JSONDecodeError if a non-blank line is not valid JSON.
+    """
+    with open(path, 'r', encoding='utf-8') as file:
+        paragraphs = [json.loads(line) for line in file if line.strip()]
+    return paragraphs
+
+
+def filter_unicode_broken_characters(text: str) -> str:
+    """
+    Strips leftover escape sequences from the text
+
+    Every literal backslash-u followed by four ASCII letters or digits
+    (for example the six characters \\u200b) is deleted.  All other
+    characters are returned unchanged.
+    """
+    broken_escape = re.compile(r'\\u[A-Za-z0-9]{4}')
+    return broken_escape.sub('', text)
+
+
+def get_label(tok_start_char: int, tok_end_char: int, labels: list) -> list:
+    """
+    Looks up the annotation that covers a token
+
+    Each label is indexed as (start, end, ...).  The first label, in list
+    order, whose start..end range contains the whole token is returned.
+    An empty list is returned when no label covers the token.
+    """
+    covering = (span for span in labels
+                if not (span[0] > tok_start_char or span[1] < tok_end_char))
+    return next(covering, [])
+
+
+def format_sentences(paragraphs: list) -> list:
+    """
+    Takes a list of paragraphs and returns a list of sentences,
+    where each sentence is a list of tokens along with their respective entity tags.
+
+    Each paragraph is a dict with a 'text' string and a 'labels' list whose
+    items are indexed as (start_char, end_char, tag).  The text is cleaned
+    with filter_unicode_broken_characters and then split into sentences and
+    tokens by the module-level nlp_hy pipeline.
+
+    Returns one list per sentence.  Its items are dicts of the form
+    {'tokens': [...], 'tag': ...}: either a single token tagged 'O', or the
+    tokens collected for one labelled span.  Every token of a sentence ends
+    up in exactly one item, in sentence order.
+    """
+    sentences = []
+    for paragraph in tqdm(paragraphs):
+        # NOTE(review): token offsets refer to the filtered text, while the
+        # labels presumably index the original paragraph['text'] -- confirm
+        # the two stay aligned when characters are removed.
+        doc = nlp_hy(filter_unicode_broken_characters(paragraph['text']))
+        for sentence in doc.sentences:
+            sentence_ents = []
+            entity = []
+            entity_label = None
+            for token in sentence.tokens:
+                label = get_label(token.start_char, token.end_char, paragraph['labels'])
+                if entity and label != entity_label:
+                    # The pending entity never reached the end offset of its
+                    # label (tokenization and annotation disagree).  Emit it
+                    # now so its tokens are neither lost nor merged into the
+                    # following span under a different tag.
+                    sentence_ents.append({'tokens': entity,
+                                          'tag': entity_label[2]})
+                    entity = []
+                if label:
+                    entity.append(token.text)
+                    entity_label = label
+                    if token.end_char == label[1]:
+                        sentence_ents.append({'tokens': entity,
+                                              'tag': label[2]})
+                        entity = []
+                else:
+                    sentence_ents.append({'tokens': [token.text],
+                                          'tag': 'O'})
+            if entity:
+                # The sentence ended in the middle of a labelled span; keep
+                # the tokens gathered so far instead of dropping them.
+                sentence_ents.append({'tokens': entity,
+                                      'tag': entity_label[2]})
+            sentences.append(sentence_ents)
+    return sentences
+
+
+def convert_to_bioes(sentences: list) -> list:
+    """
+    Renders tagged sentences as BIOES text
+
+    Each input sentence is a list of {'tokens': [...], 'tag': ...} items.
+    The result holds one string per sentence, made of "token<TAB>tag" lines
+    that are each terminated by a newline.  An 'O' item contributes its first
+    token.  A one-token entity is tagged S-, and a longer entity is tagged
+    B- on its first token, I- on the inner tokens and E- on its last token.
+    """
+    rendered = []
+    for sentence in tqdm(sentences):
+        rows = []
+        for chunk in sentence:
+            tag = chunk['tag']
+            words = chunk['tokens']
+            if tag == 'O':
+                rows.append(words[0] + '\tO' + '\n')
+            elif len(words) == 1:
+                rows.append(words[0] + '\tS-' + tag + '\n')
+            else:
+                rows.append(words[0] + '\tB-' + tag + '\n')
+                rows.extend(word + '\tI-' + tag + '\n' for word in words[1:-1])
+                rows.append(words[-1] + '\tE-' + tag + '\n')
+        rendered.append(''.join(rows))
+    return rendered
+
+
+def write_sentences_to_file(sents, filename):
+    """
+    Writes the sentences to filename, each followed by two newline characters
+
+    The file is always written as UTF-8, so the Armenian tokens are stored
+    the same way regardless of the platform's default encoding.
+
+    sents: list of sentence strings
+    filename: path of the file to create or overwrite
+    """
+    print(f"Writing {len(sents)} sentences to {filename}")
+    with open(filename, 'w', encoding='utf-8') as outfile:
+        for sent in sents:
+            outfile.write(sent + '\n\n')
+
+
+def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15):
+    """
+    Splits a list of sentences into training, dev, and test sets,
+    and writes each set to a separate file with write_sentences_to_file
+
+    sents: list of sentence strings; the caller's list is left untouched
+    base_output_path: directory that receives the three files
+    short_name: prefix of the files, which are named
+      {short_name}.train.tsv, {short_name}.dev.tsv and {short_name}.test.tsv
+    train_fraction, dev_fraction: share of the sentences for train and dev;
+      whatever remains goes to test
+
+    Raises ValueError if train_fraction + dev_fraction is larger than 1.
+    """
+    # Validate first so that bad arguments fail before anything is shuffled or written
+    if train_fraction + dev_fraction > 1.0:
+        raise ValueError(
+            "Train and dev fractions added up to more than 1: {} {}".format(train_fraction, dev_fraction))
+
+    num = len(sents)
+    train_num = int(num * train_fraction)
+    dev_num = int(num * dev_fraction)
+
+    # Shuffle a copy so the list passed in by the caller is not reordered
+    shuffled = list(sents)
+    random.shuffle(shuffled)
+    train_sents = shuffled[:train_num]
+    dev_sents = shuffled[train_num:train_num + dev_num]
+    test_sents = shuffled[train_num + dev_num:]
+    batches = [train_sents, dev_sents, test_sents]
+    filenames = [f'{short_name}.train.tsv', f'{short_name}.dev.tsv', f'{short_name}.test.tsv']
+    for batch, filename in zip(batches, filenames):
+        write_sentences_to_file(batch, os.path.join(base_output_path, filename))
+
+
+def convert_dataset(base_input_path, base_output_path, short_name):
+    """
+    Runs the whole conversion
+
+    Reads ArmNER-HY.json1 from base_input_path, tags the tokenized sentences,
+    renders them as BIOES, and writes the train/dev/test files for
+    short_name into base_output_path.
+    """
+    source_file = os.path.join(base_input_path, 'ArmNER-HY.json1')
+    bioes_sentences = convert_to_bioes(format_sentences(read_data(source_file)))
+    train_test_dev_split(bioes_sentences, base_output_path, short_name)
+
+
+if __name__ == '__main__':
+    # Script entry point: declare the three string options, then run the conversion
+    cli_options = (
+        ('--input_path', "/armtdp/ArmTDP-NER", "Path to input file"),
+        ('--output_path', "/armtdp/ArmTDP-NER/data", "Path to the output directory"),
+        ('--short_name', "hy_armtdp", "Name to identify the dataset and the model"),
+    )
+    parser = argparse.ArgumentParser()
+    for flag, default_value, description in cli_options:
+        parser.add_argument(flag, type=str, default=default_value, help=description)
+    cli = parser.parse_args()
+
+    convert_dataset(cli.input_path, cli.output_path, cli.short_name)
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -317,6 +317,18 @@
   https://github.com/stanfordnlp/stanza-train
   this is not meant for any kind of actual NER use
 
+ArmTDP-NER is an Armenian NER dataset
+  - https://github.com/myavrum/ArmTDP-NER.git
+    ArmTDP-NER: The corpus was developed by the ArmTDP team led by Marat M. Yavrumyan
+    at the Yerevan State University by the collaboration of "Armenia National SDG Innovation Lab"
+    and "UC Berkeley's Armenian Linguists' network".
+  - in $NERBASE, make an "armtdp" directory, then git clone the repo there
+    mkdir -p $NERBASE/armtdp
+    cd $NERBASE/armtdp
+    git clone https://github.com/myavrum/ArmTDP-NER.git
+  - Then run
+    python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp
+
 """
 
 import glob
@@ -349,8 +361,10 @@
 import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
 import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob
 import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob
+import stanza.utils.datasets.ner.convert_hy_armtdp as convert_hy_armtdp
 from stanza.utils.datasets.ner.utils import convert_bio_to_json, get_tags, read_tsv, write_dataset
 
+
 SHARDS = ('train', 'dev', 'test')
 
 class UnknownDatasetError(ValueError):
@@ -938,6 +952,17 @@ def process_masakhane(paths, dataset_name):
 
 def process_toy_dataset(paths, short_name):
     convert_bio_to_json(os.path.join(paths["NERBASE"], "English-SAMPLE"), paths["NER_DATA_DIR"], short_name)
+def process_armtdp(paths, short_name):
+    """
+    Builds the hy_armtdp dataset
+
+    Converts the ArmTDP-NER checkout under $NERBASE/armtdp/ArmTDP-NER into
+    .tsv shards in NER_DATA_DIR, then turns each shard into a .json file
+    with prepare_ner_file.process_dataset.
+
+    Raises FileNotFoundError if a shard's .tsv file is missing after the conversion.
+    """
+    assert short_name == 'hy_armtdp'
+    repo_dir = os.path.join(paths["NERBASE"], "armtdp", "ArmTDP-NER")
+    output_dir = paths["NER_DATA_DIR"]
+    convert_hy_armtdp.convert_dataset(repo_dir, output_dir, short_name)
+    for shard in SHARDS:
+        tsv_path = os.path.join(output_dir, f'{short_name}.{shard}.tsv')
+        if os.path.exists(tsv_path):
+            json_path = os.path.join(output_dir, '%s.%s.json' % (short_name, shard))
+            prepare_ner_file.process_dataset(tsv_path, json_path)
+        else:
+            raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, tsv_path))
 
 DATASET_MAPPING = {
     "bn_daffodil":       process_bn_daffodil,
@@ -960,6 +985,7 @@ def process_toy_dataset(paths, short_name):
     "sv_suc3shuffle":    process_sv_suc3shuffle,
     "tr_starlang":       process_starlang,
     "th_lst20":          process_lst20,
+    "hy_armtdp":         process_armtdp,
 }
 
 def main(dataset_name):

diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py
@@ -123,6 +123,12 @@ class Mode(Enum):
     # herbert-large-cased (dev/test): 92.25/91.62
     # sdadas/polish-roberta-large-v2 (dev/test): 92.66/91.22
     "pl": "allegro/herbert-base-cased",
+
+    # https://huggingface.co/xlm-roberta-base
+    # Scores by entity on 18 labels:
+    # no bert : 86.68
+    # xlm-roberta-base : 89.31
+    "hy": "xlm-roberta-base",
 }
 
 def build_argparse():