From c64e48ca83b15c29e6969cb61fe1c09b441b9578 Mon Sep 17 00:00:00 2001
From: ShakeHakobyan <shake.hakobyan23@gmail.com>
Date: Mon, 6 Feb 2023 19:21:46 +0400
Subject: [PATCH 1/3] Added code for hy_armtdp

---
 .../utils/datasets/ner/convert_hy_armtdp.py   | 111 ++++++++++++++++++
 .../utils/datasets/ner/prepare_ner_dataset.py |  25 ++++
 stanza/utils/training/common.py               |   6 +
 3 files changed, 142 insertions(+)
 create mode 100644 stanza/utils/datasets/ner/convert_hy_armtdp.py

diff --git a/stanza/utils/datasets/ner/convert_hy_armtdp.py b/stanza/utils/datasets/ner/convert_hy_armtdp.py
new file mode 100644
index 0000000000..a34557acd2
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_hy_armtdp.py
@@ -0,0 +1,111 @@
+"""
+Convert an ArmTDP-NER dataset to BIOES format
+
+The dataset is here:
+
+https://github.com/myavrum/ArmTDP-NER.git
+"""
+
+import os
+import json
+import re
+import stanza
+import random
+nlp = stanza.Pipeline(lang='hy', processors='tokenize')
+
+
+def read_data(path: str) -> list:
+    """
+    Reads Armenian data file
+
+    Returns list of dictionaries, where each dictionary represents
+    a paragraph's information (text, labels, etc.)
+    """
+    with open(path, 'r') as file:
+        paragraphs = [json.loads(line) for line in file]
+    return paragraphs
+
+
+def filter_unicode_broken_characters(paragraphs: list) -> list:
+    """
+    Removes all '\u202c' unicode characters in texts
+    TODO: why?
+    """
+    for paragraph in paragraphs:
+        paragraph['text'] = re.sub('\u202c', '', paragraph['text'])
+
+
+def format_sentence_as_beios(sentence, labels) -> list:
+    sentence_toc = ''
+    current_label = []
+    for token in sentence.tokens:
+        if current_label:
+            tag = current_label[2]
+            if token.end_char == current_label[1]:
+                sentence_toc += token.text + '\tE-' + tag + '\n'
+                current_label = []
+            else:
+                sentence_toc += token.text + '\tI-' + tag + '\n'
+        else:
+            current_label = get_label(token.start_char, labels)
+            if current_label:
+                tag = current_label[2]
+                if token.start_char == current_label[0] and token.end_char == current_label[1]:
+                    sentence_toc += token.text + '\tS-' + tag + '\n'
+                    current_label = []
+                elif token.start_char == current_label[0]:
+                    sentence_toc += token.text + '\tB-' + tag + '\n'
+            else:
+                sentence_toc += token.text + '\tO' + '\n'
+                current_label = []
+    return sentence_toc[:-1]
+
+
+def get_label(tok_start_char: int, labels: list) -> list:
+    for label in labels:
+        if label[0] == tok_start_char:
+            return label
+    return []
+
+
+def convert_to_bioes(paragraphs):
+    beios_sents = []
+    for paragraph in paragraphs:
+        doc = nlp(paragraph['text'])
+        for sentence in doc.sentences:
+            beios_sents.append(format_sentence_as_beios(sentence, paragraph['labels']))
+    return beios_sents
+
+
+def write_sentences_to_file_(sents, filename):
+    print(f"Writing {len(sents)} sentences to {filename}")
+    with open(filename, 'w') as outfile:
+        for sent in sents:
+            outfile.write(sent + '\n\n')
+
+
+def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15):
+    num = len(sents)
+    train_num = int(num * train_fraction)
+    dev_num = int(num * dev_fraction)
+    if train_fraction + dev_fraction > 1.0:  # whatever is left after train and dev becomes the test split
+        raise ValueError(
+            "Train and dev fractions added up to more than 1: {} {}".format(train_fraction, dev_fraction))
+
+    random.shuffle(sents)  # shuffles the caller's list in place; no seed is set in this function
+    train_sents = sents[:train_num]
+    dev_sents = sents[train_num:train_num + dev_num]
+    test_sents = sents[train_num + dev_num:]
+    batches = [train_sents, dev_sents, test_sents]
+    filenames = [f'{short_name}.train.tsv', f'{short_name}.dev.tsv', f'{short_name}.test.tsv']
+    for batch, filename in zip(batches, filenames):
+        write_sentences_to_file_(batch, os.path.join(base_output_path, filename))
+
+
+def convert_hy_armtdp(base_input_path, base_output_path, short_name):
+    paragraphs = read_data(os.path.join(base_input_path, 'ArmNER-HY.json1'))
+    filter_unicode_broken_characters(paragraphs)
+    beios_sentences = convert_to_bioes(paragraphs)
+    train_test_dev_split(beios_sentences, base_output_path, short_name)
+
+
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index bfb75ce143..cd7e787914 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -317,6 +317,18 @@
   https://github.com/stanfordnlp/stanza-train
   this is not meant for any kind of actual NER use
 
+ArmTDP-NER is an Armenian NER dataset
+  - https://github.com/myavrum/ArmTDP-NER.git
+    ArmTDP-NER: The corpus was developed by the ArmTDP team led by Marat M. Yavrumyan
+    at the Yerevan State University by the collaboration of "Armenia National SDG Innovation Lab"
+    and "UC Berkeley's Armenian Linguists' network".
+  - in $NERBASE, make an "armtdp" directory, then git clone the repo there
+    mkdir -p $NERBASE/armtdp
+    cd $NERBASE/armtdp
+    git clone https://github.com/myavrum/ArmTDP-NER.git
+  - Then run
+    python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp
+
 """
 
 import glob
@@ -350,6 +362,7 @@
 import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob
 import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob
 from stanza.utils.datasets.ner.utils import convert_bio_to_json, get_tags, read_tsv, write_dataset
+from stanza.utils.datasets.ner.convert_hy_armtdp import convert_hy_armtdp
 
 SHARDS = ('train', 'dev', 'test')
 
@@ -938,6 +951,17 @@ def process_masakhane(paths, dataset_name):
 
 def process_toy_dataset(paths, short_name):
     convert_bio_to_json(os.path.join(paths["NERBASE"], "English-SAMPLE"), paths["NER_DATA_DIR"], short_name)
+def process_armtdp(paths, short_name):
+    assert short_name == 'hy_armtdp'
+    base_input_path = os.path.join(paths["NERBASE"], "armtdp", "ArmTDP-NER")
+    base_output_path = paths["NER_DATA_DIR"]
+    convert_hy_armtdp(base_input_path, base_output_path, short_name)
+    for shard in SHARDS:
+        input_filename = os.path.join(base_output_path, f'{short_name}.{shard}.tsv')
+        if not os.path.exists(input_filename):
+            raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, input_filename))
+        output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+        prepare_ner_file.process_dataset(input_filename, output_filename)
 
 DATASET_MAPPING = {
     "bn_daffodil":       process_bn_daffodil,
@@ -960,6 +984,7 @@ def process_toy_dataset(paths, short_name):
     "sv_suc3shuffle":    process_sv_suc3shuffle,
     "tr_starlang":       process_starlang,
     "th_lst20":          process_lst20,
+    "hy_armtdp":         process_armtdp,
 }
 
 def main(dataset_name):
diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py
index ef2e5a387d..16b6d197c9 100644
--- a/stanza/utils/training/common.py
+++ b/stanza/utils/training/common.py
@@ -123,6 +123,12 @@ class Mode(Enum):
     # herbert-large-cased (dev/test): 92.25/91.62
     # sdadas/polish-roberta-large-v2 (dev/test): 92.66/91.22
     "pl": "allegro/herbert-base-cased",
+
+    # https://huggingface.co/xlm-roberta-base
+    # Scores by entity on 18 labels:
+    # no bert : 86.68
+    # xlm-roberta-base : 89.31
+    "hy": "xlm-roberta-base",
 }
 
 def build_argparse():

From d00b46141fb46a5271df939f162fb84afe1f74be Mon Sep 17 00:00:00 2001
From: Shake Hakobyan <shake@cast.shake>
Date: Mon, 6 Mar 2023 18:11:52 +0400
Subject: [PATCH 2/3] Updated after review

---
 .../utils/datasets/ner/convert_hy_armtdp.py   | 118 +++++++++++-------
 .../utils/datasets/ner/prepare_ner_dataset.py |   5 +-
 2 files changed, 76 insertions(+), 47 deletions(-)

diff --git a/stanza/utils/datasets/ner/convert_hy_armtdp.py b/stanza/utils/datasets/ner/convert_hy_armtdp.py
index a34557acd2..46e69f07fe 100644
--- a/stanza/utils/datasets/ner/convert_hy_armtdp.py
+++ b/stanza/utils/datasets/ner/convert_hy_armtdp.py
@@ -6,17 +6,19 @@
 https://github.com/myavrum/ArmTDP-NER.git
 """
 
+import argparse
 import os
 import json
 import re
 import stanza
 import random
-nlp = stanza.Pipeline(lang='hy', processors='tokenize')
+from tqdm import tqdm
+nlp_hy = stanza.Pipeline(lang='hy', processors='tokenize')
 
 
 def read_data(path: str) -> list:
     """
-    Reads Armenian data file
+    Takes a full path to the Armenian ner dataset
 
     Returns list of dictionaries, where each dictionary represents
     a paragraph's information (text, labels, etc.)
@@ -26,58 +28,72 @@ def read_data(path: str) -> list:
     return paragraphs
 
 
-def filter_unicode_broken_characters(paragraphs: list) -> list:
+def filter_unicode_broken_characters(text: str) -> str:
     """
-    Removes all '\u202c' unicode characters in texts
-    TODO: why?
+    Removes literal backslash-u escape sequences (a backslash, 'u', then four alphanumerics) from text
     """
-    for paragraph in paragraphs:
-        paragraph['text'] = re.sub('\u202c', '', paragraph['text'])
-
-
-def format_sentence_as_beios(sentence, labels) -> list:
-    sentence_toc = ''
-    current_label = []
-    for token in sentence.tokens:
-        if current_label:
-            tag = current_label[2]
-            if token.end_char == current_label[1]:
-                sentence_toc += token.text + '\tE-' + tag + '\n'
-                current_label = []
-            else:
-                sentence_toc += token.text + '\tI-' + tag + '\n'
-        else:
-            current_label = get_label(token.start_char, labels)
-            if current_label:
-                tag = current_label[2]
-                if token.start_char == current_label[0] and token.end_char == current_label[1]:
-                    sentence_toc += token.text + '\tS-' + tag + '\n'
-                    current_label = []
-                elif token.start_char == current_label[0]:
-                    sentence_toc += token.text + '\tB-' + tag + '\n'
-            else:
-                sentence_toc += token.text + '\tO' + '\n'
-                current_label = []
-    return sentence_toc[:-1]
+    return re.sub(r'\\u[A-Za-z0-9]{4}', '', text)
 
 
-def get_label(tok_start_char: int, labels: list) -> list:
+def get_label(tok_start_char: int, tok_end_char: int, labels: list) -> list:
+    """
+    Returns the label that corresponds to the token
+    """
     for label in labels:
-        if label[0] == tok_start_char:
+        if label[0] <= tok_start_char and label[1] >= tok_end_char:
             return label
     return []
 
 
-def convert_to_bioes(paragraphs):
-    beios_sents = []
-    for paragraph in paragraphs:
-        doc = nlp(paragraph['text'])
+def format_sentences(paragraphs: list) -> list:
+    """
+    Takes a list of paragraphs and returns a list of sentences,
+    where each sentence is a list of tokens along with their respective entity tags.
+    """
+    sentences = []
+    for paragraph in tqdm(paragraphs):
+        doc = nlp_hy(filter_unicode_broken_characters(paragraph['text']))
         for sentence in doc.sentences:
-            beios_sents.append(format_sentence_as_beios(sentence, paragraph['labels']))
+            sentence_ents = []
+            entity = []
+            for token in sentence.tokens:
+                label = get_label(token.start_char, token.end_char, paragraph['labels'])
+                if label:
+                    entity.append(token.text)
+                    if token.end_char == label[1]:
+                        sentence_ents.append({'tokens': entity,
+                                              'tag': label[2]})
+                        entity = []
+                else:
+                    sentence_ents.append({'tokens': [token.text],
+                                          'tag': 'O'})
+            sentences.append(sentence_ents)
+    return sentences
+
+
+def convert_to_bioes(sentences: list) -> list:
+    """
+    Рeturns a list of strings where each string represents a sentence in BIOES format
+    """
+    beios_sents = []
+    for sentence in tqdm(sentences):
+        sentence_toc = ''
+        for ent in sentence:
+            if ent['tag'] == 'O':
+                sentence_toc += ent['tokens'][0] + '\tO' + '\n'
+            else:
+                if len(ent['tokens']) == 1:
+                    sentence_toc += ent['tokens'][0] + '\tS-' + ent['tag'] + '\n'
+                else:
+                    sentence_toc += ent['tokens'][0] + '\tB-' + ent['tag'] + '\n'
+                    for token in ent['tokens'][1:-1]:
+                        sentence_toc += token + '\tI-' + ent['tag'] + '\n'
+                    sentence_toc += ent['tokens'][-1] + '\tE-' + ent['tag'] + '\n'
+        beios_sents.append(sentence_toc)
     return beios_sents
 
 
-def write_sentences_to_file_(sents, filename):
+def write_sentences_to_file(sents, filename):
     print(f"Writing {len(sents)} sentences to {filename}")
     with open(filename, 'w') as outfile:
         for sent in sents:
@@ -85,6 +101,10 @@ def write_sentences_to_file_(sents, filename):
 
 
 def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15):
+    """
+    Takes in a list of sentences and splits them into training, dev, and test sets
+    Writes each set to a separate file with write_sentences_to_file
+    """
     num = len(sents)
     train_num = int(num * train_fraction)
     dev_num = int(num * dev_fraction)
@@ -99,13 +119,21 @@ def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7
     batches = [train_sents, dev_sents, test_sents]
     filenames = [f'{short_name}.train.tsv', f'{short_name}.dev.tsv', f'{short_name}.test.tsv']
     for batch, filename in zip(batches, filenames):
-        write_sentences_to_file_(batch, os.path.join(base_output_path, filename))
+        write_sentences_to_file(batch, os.path.join(base_output_path, filename))
 
 
-def convert_hy_armtdp(base_input_path, base_output_path, short_name):
+def convert_dataset(base_input_path, base_output_path, short_name):
     paragraphs = read_data(os.path.join(base_input_path, 'ArmNER-HY.json1'))
-    filter_unicode_broken_characters(paragraphs)
-    beios_sentences = convert_to_bioes(paragraphs)
+    taged_sentences = format_sentences(paragraphs)
+    beios_sentences = convert_to_bioes(taged_sentences)
     train_test_dev_split(beios_sentences, base_output_path, short_name)
 
 
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_path', type=str, default="/armtdp/ArmTDP-NER", help="Where to find input file")
+    parser.add_argument('--output_path', type=str, default="/armtdp/ArmTDP-NER/data", help="Where to output the results")
+    parser.add_argument('--short_name', type=str, default="hy_armtdp", help="Language and dataset identifier")
+    args = parser.parse_args()
+
+    convert_dataset(args.input_path, args.output_path, args.short_name)
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index cd7e787914..99668128c5 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -361,8 +361,9 @@
 import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
 import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob
 import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob
+import stanza.utils.datasets.ner.convert_hy_armtdp as convert_hy_armtdp
 from stanza.utils.datasets.ner.utils import convert_bio_to_json, get_tags, read_tsv, write_dataset
-from stanza.utils.datasets.ner.convert_hy_armtdp import convert_hy_armtdp
+
 
 SHARDS = ('train', 'dev', 'test')
 
@@ -955,7 +956,7 @@ def process_armtdp(paths, short_name):
     assert short_name == 'hy_armtdp'
     base_input_path = os.path.join(paths["NERBASE"], "armtdp", "ArmTDP-NER")
     base_output_path = paths["NER_DATA_DIR"]
-    convert_hy_armtdp(base_input_path, base_output_path, short_name)
+    convert_hy_armtdp.convert_dataset(base_input_path, base_output_path, short_name)
     for shard in SHARDS:
         input_filename = os.path.join(base_output_path, f'{short_name}.{shard}.tsv')
         if not os.path.exists(input_filename):

From e4a8813e66ad88241d7035e8fae0c1c3f0ca6fb3 Mon Sep 17 00:00:00 2001
From: Shake Hakobyan <shake@cast.shake>
Date: Tue, 7 Mar 2023 13:43:49 +0400
Subject: [PATCH 3/3] Updated after review

---
 .../utils/datasets/ner/convert_hy_armtdp.py   | 21 ++++++++++---------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/stanza/utils/datasets/ner/convert_hy_armtdp.py b/stanza/utils/datasets/ner/convert_hy_armtdp.py
index 46e69f07fe..2f9a7c5da2 100644
--- a/stanza/utils/datasets/ner/convert_hy_armtdp.py
+++ b/stanza/utils/datasets/ner/convert_hy_armtdp.py
@@ -18,10 +18,11 @@
 
 def read_data(path: str) -> list:
     """
-    Takes a full path to the Armenian ner dataset
+    Reads the Armenian named entity recognition dataset
 
-    Returns list of dictionaries, where each dictionary represents
-    a paragraph's information (text, labels, etc.)
+    Returns a list of dictionaries.
+    Each dictionary contains information
+    about a paragraph (text, labels, etc.)
     """
     with open(path, 'r') as file:
         paragraphs = [json.loads(line) for line in file]
@@ -37,7 +38,7 @@ def filter_unicode_broken_characters(text: str) -> str:
 
 def get_label(tok_start_char: int, tok_end_char: int, labels: list) -> list:
     """
-    Returns the label that corresponds to the token
+    Returns the label that corresponds to the given token
     """
     for label in labels:
         if label[0] <= tok_start_char and label[1] >= tok_end_char:
@@ -73,7 +74,7 @@ def format_sentences(paragraphs: list) -> list:
 
 def convert_to_bioes(sentences: list) -> list:
     """
-    Рeturns a list of strings where each string represents a sentence in BIOES format
+    Returns a list of strings where each string represents a sentence in BIOES format
     """
     beios_sents = []
     for sentence in tqdm(sentences):
@@ -102,8 +103,8 @@ def write_sentences_to_file(sents, filename):
 
 def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15):
     """
-    Takes in a list of sentences and splits them into training, dev, and test sets
-    Writes each set to a separate file with write_sentences_to_file
+    Splits a list of sentences into training, dev, and test sets,
+    and writes each set to a separate file with write_sentences_to_file
     """
     num = len(sents)
     train_num = int(num * train_fraction)
@@ -131,9 +132,9 @@ def convert_dataset(base_input_path, base_output_path, short_name):
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
-    parser.add_argument('--input_path', type=str, default="/armtdp/ArmTDP-NER", help="Where to find input file")
-    parser.add_argument('--output_path', type=str, default="/armtdp/ArmTDP-NER/data", help="Where to output the results")
-    parser.add_argument('--short_name', type=str, default="hy_armtdp", help="Language and dataset identifier")
+    parser.add_argument('--input_path', type=str, default="/armtdp/ArmTDP-NER", help="Path to input file")
+    parser.add_argument('--output_path', type=str, default="/armtdp/ArmTDP-NER/data", help="Path to the output directory")
+    parser.add_argument('--short_name', type=str, default="hy_armtdp", help="Name to identify the dataset and the model")
     args = parser.parse_args()
 
     convert_dataset(args.input_path, args.output_path, args.short_name)