diff --git a/stanza/utils/datasets/ner/convert_hy_armtdp.py b/stanza/utils/datasets/ner/convert_hy_armtdp.py
new file mode 100644
index 0000000000..2f9a7c5da2
--- /dev/null
+++ b/stanza/utils/datasets/ner/convert_hy_armtdp.py
@@ -0,0 +1,183 @@
+"""
+Convert the ArmTDP-NER dataset to BIOES-tagged TSV files
+
+The dataset is here:
+
+https://github.com/myavrum/ArmTDP-NER.git
+"""
+
+import argparse
+import os
+import json
+import re
+import stanza
+import random
+from tqdm import tqdm
+
+
+def read_data(path: str) -> list:
+    """
+    Reads the Armenian named entity recognition dataset
+
+    The file is expected to hold one JSON object per line.
+    Blank lines are skipped.
+
+    Returns a list of dictionaries.
+    Each dictionary contains information
+    about a paragraph (text, labels, etc.)
+    """
+    # the corpus is Armenian text, so do not depend on the locale's default encoding
+    with open(path, 'r', encoding='utf-8') as file:
+        paragraphs = [json.loads(line) for line in file if line.strip()]
+    return paragraphs
+
+
+def filter_unicode_broken_characters(text: str) -> str:
+    """
+    Removes leftover escape sequences from the text
+
+    An escape sequence here is a literal backslash, the letter u,
+    and four alphanumeric characters.  Real unicode characters are kept.
+
+    NOTE(review): removing characters shifts the offsets of the text after them.
+    Presumably the label offsets refer to the filtered text - confirm.
+    """
+    return re.sub(r'\\u[A-Za-z0-9]{4}', '', text)
+
+
+def get_label(tok_start_char: int, tok_end_char: int, labels: list) -> list:
+    """
+    Returns the first label whose span covers the given token
+
+    Each label is indexed as (start_char, end_char, tag).
+    Returns an empty list if no label covers the token.
+    """
+    for label in labels:
+        if label[0] <= tok_start_char and label[1] >= tok_end_char:
+            return label
+    return []
+
+
+def format_sentences(paragraphs: list, nlp_hy=None) -> list:
+    """
+    Takes a list of paragraphs and returns a list of sentences,
+    where each sentence is a list of tokens along with their respective entity tags.
+
+    nlp_hy is the pipeline used to split the text into sentences and tokens.
+    If it is None, an Armenian tokenize pipeline is built here, so that
+    importing this module does not load a model.
+    """
+    if nlp_hy is None:
+        nlp_hy = stanza.Pipeline(lang='hy', processors='tokenize')
+
+    sentences = []
+    for paragraph in tqdm(paragraphs):
+        doc = nlp_hy(filter_unicode_broken_characters(paragraph['text']))
+        for sentence in doc.sentences:
+            sentence_ents = []
+            entity = []
+            entity_label = None
+            for token in sentence.tokens:
+                label = get_label(token.start_char, token.end_char, paragraph['labels'])
+                if entity and label != entity_label:
+                    # the open entity never reached the end of its label,
+                    # emit what was collected instead of dropping or mixing it
+                    sentence_ents.append({'tokens': entity,
+                                          'tag': entity_label[2]})
+                    entity = []
+                if not label:
+                    sentence_ents.append({'tokens': [token.text],
+                                          'tag': 'O'})
+                    continue
+                entity.append(token.text)
+                entity_label = label
+                if token.end_char == label[1]:
+                    sentence_ents.append({'tokens': entity,
+                                          'tag': label[2]})
+                    entity = []
+            if entity:
+                # the sentence ended in the middle of a label
+                sentence_ents.append({'tokens': entity,
+                                      'tag': entity_label[2]})
+            sentences.append(sentence_ents)
+    return sentences
+
+
+def convert_to_bioes(sentences: list) -> list:
+    """
+    Returns a list of strings where each string represents a sentence in BIOES format
+
+    Each token is written as one "token<TAB>tag" line.
+    """
+    beios_sents = []
+    for sentence in tqdm(sentences):
+        lines = []
+        for ent in sentence:
+            tokens, tag = ent['tokens'], ent['tag']
+            if tag == 'O':
+                lines.append(tokens[0] + '\tO')
+            elif len(tokens) == 1:
+                lines.append(tokens[0] + '\tS-' + tag)
+            else:
+                lines.append(tokens[0] + '\tB-' + tag)
+                lines.extend(token + '\tI-' + tag for token in tokens[1:-1])
+                lines.append(tokens[-1] + '\tE-' + tag)
+        beios_sents.append(''.join(line + '\n' for line in lines))
+    return beios_sents
+
+
+def write_sentences_to_file(sents, filename):
+    """
+    Writes the sentences to filename, each one followed by blank lines
+    """
+    print(f"Writing {len(sents)} sentences to {filename}")
+    with open(filename, 'w', encoding='utf-8') as outfile:
+        for sent in sents:
+            outfile.write(sent + '\n\n')
+
+
+def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15):
+    """
+    Splits a list of sentences into training, dev, and test sets,
+    and writes each set to a separate file with write_sentences_to_file
+
+    Raises ValueError if the train and dev fractions add up to more than 1
+    """
+    if train_fraction + dev_fraction > 1.0:
+        raise ValueError(
+            "Train and dev fractions added up to more than 1: {} {}".format(train_fraction, dev_fraction))
+
+    num = len(sents)
+    train_num = int(num * train_fraction)
+    dev_num = int(num * dev_fraction)
+
+    # shuffle a copy so the caller's list is left untouched
+    sents = list(sents)
+    random.shuffle(sents)
+    train_sents = sents[:train_num]
+    dev_sents = sents[train_num:train_num + dev_num]
+    test_sents = sents[train_num + dev_num:]
+    batches = [train_sents, dev_sents, test_sents]
+    filenames = [f'{short_name}.train.tsv', f'{short_name}.dev.tsv', f'{short_name}.test.tsv']
+    for batch, filename in zip(batches, filenames):
+        write_sentences_to_file(batch, os.path.join(base_output_path, filename))
+
+
+def convert_dataset(base_input_path, base_output_path, short_name):
+    """
+    Reads ArmNER-HY.json1 from base_input_path and writes the train/dev/test splits to base_output_path
+    """
+    paragraphs = read_data(os.path.join(base_input_path, 'ArmNER-HY.json1'))
+    taged_sentences = format_sentences(paragraphs)
+    beios_sentences = convert_to_bioes(taged_sentences)
+    train_test_dev_split(beios_sentences, base_output_path, short_name)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_path', type=str, default="/armtdp/ArmTDP-NER", help="Path to the directory with ArmNER-HY.json1")
+    parser.add_argument('--output_path', type=str, default="/armtdp/ArmTDP-NER/data", help="Path to the output directory")
+    parser.add_argument('--short_name', type=str, default="hy_armtdp", help="Name to identify the dataset and the model")
+    args = parser.parse_args()
+
+    convert_dataset(args.input_path, args.output_path, args.short_name)
diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py
index bfb75ce143..99668128c5 100644
--- a/stanza/utils/datasets/ner/prepare_ner_dataset.py
+++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py
@@ -317,6 +317,18 @@
   https://github.com/stanfordnlp/stanza-train
   this is not meant for any kind of actual NER use
 
+ArmTDP-NER is an Armenian NER dataset
+  - https://github.com/myavrum/ArmTDP-NER.git
+    ArmTDP-NER: The corpus was developed by the ArmTDP team led by Marat M. Yavrumyan
+    at the Yerevan State University by the collaboration of "Armenia National SDG Innovation Lab"
+    and "UC Berkeley's Armenian Linguists' network".
+  - in $NERBASE, make an "armtdp" directory, then git clone the repo there
+      mkdir -p $NERBASE/armtdp
+      cd $NERBASE/armtdp
+      git clone https://github.com/myavrum/ArmTDP-NER.git
+  - Then run
+      python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp
+
 """
 
 import glob
@@ -349,8 +361,10 @@
 import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
 import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob
 import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob
+import stanza.utils.datasets.ner.convert_hy_armtdp as convert_hy_armtdp
 from stanza.utils.datasets.ner.utils import convert_bio_to_json, get_tags, read_tsv, write_dataset
+
 
 SHARDS = ('train', 'dev', 'test')
 
 class UnknownDatasetError(ValueError):
@@ -938,6 +952,21 @@ def process_masakhane(paths, dataset_name):
 
 def process_toy_dataset(paths, short_name):
     convert_bio_to_json(os.path.join(paths["NERBASE"], "English-SAMPLE"), paths["NER_DATA_DIR"], short_name)
+
+def process_armtdp(paths, short_name):
+    """
+    Converts ArmTDP-NER to .tsv shards, then passes each shard to prepare_ner_file to write the .json file
+    """
+    assert short_name == 'hy_armtdp'
+    base_input_path = os.path.join(paths["NERBASE"], "armtdp", "ArmTDP-NER")
+    base_output_path = paths["NER_DATA_DIR"]
+    convert_hy_armtdp.convert_dataset(base_input_path, base_output_path, short_name)
+    for shard in SHARDS:
+        input_filename = os.path.join(base_output_path, f'{short_name}.{shard}.tsv')
+        if not os.path.exists(input_filename):
+            raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, input_filename))
+        output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard))
+        prepare_ner_file.process_dataset(input_filename, output_filename)
 
 DATASET_MAPPING = {
     "bn_daffodil": process_bn_daffodil,
@@ -960,6 +989,7 @@ def process_toy_dataset(paths, short_name):
     "sv_suc3shuffle": process_sv_suc3shuffle,
     "tr_starlang": process_starlang,
     "th_lst20": process_lst20,
+    "hy_armtdp": process_armtdp,
 }
 
 def main(dataset_name):
diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py
index ef2e5a387d..16b6d197c9 100644
--- a/stanza/utils/training/common.py
+++ b/stanza/utils/training/common.py
@@ -123,6 +123,12 @@ class Mode(Enum):
     # herbert-large-cased (dev/test): 92.25/91.62
     # sdadas/polish-roberta-large-v2 (dev/test): 92.66/91.22
     "pl": "allegro/herbert-base-cased",
+
+    # https://huggingface.co/xlm-roberta-base
+    # Scores by entity on 18 labels:
+    # no bert : 86.68
+    # xlm-roberta-base : 89.31
+    "hy": "xlm-roberta-base",
 }
 
 def build_argparse():