From c64e48ca83b15c29e6969cb61fe1c09b441b9578 Mon Sep 17 00:00:00 2001 From: ShakeHakobyan Date: Mon, 6 Feb 2023 19:21:46 +0400 Subject: [PATCH 1/3] Added code for hy_armtdp --- .../utils/datasets/ner/convert_hy_armtdp.py | 111 ++++++++++++++++++ .../utils/datasets/ner/prepare_ner_dataset.py | 25 ++++ stanza/utils/training/common.py | 6 + 3 files changed, 142 insertions(+) create mode 100644 stanza/utils/datasets/ner/convert_hy_armtdp.py diff --git a/stanza/utils/datasets/ner/convert_hy_armtdp.py b/stanza/utils/datasets/ner/convert_hy_armtdp.py new file mode 100644 index 0000000000..a34557acd2 --- /dev/null +++ b/stanza/utils/datasets/ner/convert_hy_armtdp.py @@ -0,0 +1,111 @@ +""" +Convert an ArmTDP-NER dataset to BIOES format + +The dataset is here: + +https://github.com/myavrum/ArmTDP-NER.git +""" + +import os +import json +import re +import stanza +import random +nlp = stanza.Pipeline(lang='hy', processors='tokenize') + + +def read_data(path: str) -> list: + """ + Reads Armenian data file + + Returns list of dictionaries, where each dictionary represents + a paragraph's information (text, labels, etc.) + """ + with open(path, 'r') as file: + paragraphs = [json.loads(line) for line in file] + return paragraphs + + +def filter_unicode_broken_characters(paragraphs: list) -> list: + """ + Removes all '\u202c' unicode characters in texts + TODO: why? 
+ """ + for paragraph in paragraphs: + paragraph['text'] = re.sub('\u202c', '', paragraph['text']) + + +def format_sentence_as_beios(sentence, labels) -> list: + sentence_toc = '' + current_label = [] + for token in sentence.tokens: + if current_label: + tag = current_label[2] + if token.end_char == current_label[1]: + sentence_toc += token.text + '\tE-' + tag + '\n' + current_label = [] + else: + sentence_toc += token.text + '\tI-' + tag + '\n' + else: + current_label = get_label(token.start_char, labels) + if current_label: + tag = current_label[2] + if token.start_char == current_label[0] and token.end_char == current_label[1]: + sentence_toc += token.text + '\tS-' + tag + '\n' + current_label = [] + elif token.start_char == current_label[0]: + sentence_toc += token.text + '\tB-' + tag + '\n' + else: + sentence_toc += token.text + '\tO' + '\n' + current_label = [] + return sentence_toc[:-1] + + +def get_label(tok_start_char: int, labels: list) -> list: + for label in labels: + if label[0] == tok_start_char: + return label + return [] + + +def convert_to_bioes(paragraphs): + beios_sents = [] + for paragraph in paragraphs: + doc = nlp(paragraph['text']) + for sentence in doc.sentences: + beios_sents.append(format_sentence_as_beios(sentence, paragraph['labels'])) + return beios_sents + + +def write_sentences_to_file_(sents, filename): + print(f"Writing {len(sents)} sentences to {filename}") + with open(filename, 'w') as outfile: + for sent in sents: + outfile.write(sent + '\n\n') + + +def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15): + num = len(sents) + train_num = int(num * train_fraction) + dev_num = int(num * dev_fraction) + if train_fraction + dev_fraction > 1.0: + raise ValueError( + "Train and dev fractions added up to more than 1: {} {}".format(train_fraction, dev_fraction)) + + random.shuffle(sents) + train_sents = sents[:train_num] + dev_sents = sents[train_num:train_num + dev_num] + test_sents = 
sents[train_num + dev_num:] + batches = [train_sents, dev_sents, test_sents] + filenames = [f'{short_name}.train.tsv', f'{short_name}.dev.tsv', f'{short_name}.test.tsv'] + for batch, filename in zip(batches, filenames): + write_sentences_to_file_(batch, os.path.join(base_output_path, filename)) + + +def convert_hy_armtdp(base_input_path, base_output_path, short_name): + paragraphs = read_data(os.path.join(base_input_path, 'ArmNER-HY.json1')) + filter_unicode_broken_characters(paragraphs) + beios_sentences = convert_to_bioes(paragraphs) + train_test_dev_split(beios_sentences, base_output_path, short_name) + + diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py index bfb75ce143..cd7e787914 100644 --- a/stanza/utils/datasets/ner/prepare_ner_dataset.py +++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py @@ -317,6 +317,18 @@ https://github.com/stanfordnlp/stanza-train this is not meant for any kind of actual NER use +ArmTDP-NER is an Armenian NER dataset + - https://github.com/myavrum/ArmTDP-NER.git + ArmTDP-NER: The corpus was developed by the ArmTDP team led by Marat M. Yavrumyan + at the Yerevan State University by the collaboration of "Armenia National SDG Innovation Lab" + and "UC Berkeley's Armenian Linguists' network". 
+ - in $NERBASE, make a "armtdp" directory, then git clone the repo there + mkdir -p $NERBASE/armtdp + cd $NERBASE/armtdp + git clone https://github.com/myavrum/ArmTDP-NER.git + - Then run + python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp + """ import glob @@ -350,6 +362,7 @@ import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob from stanza.utils.datasets.ner.utils import convert_bio_to_json, get_tags, read_tsv, write_dataset +from stanza.utils.datasets.ner.convert_hy_armtdp import convert_hy_armtdp SHARDS = ('train', 'dev', 'test') @@ -938,6 +951,17 @@ def process_masakhane(paths, dataset_name): def process_toy_dataset(paths, short_name): convert_bio_to_json(os.path.join(paths["NERBASE"], "English-SAMPLE"), paths["NER_DATA_DIR"], short_name) +def process_armtdp(paths, short_name): + assert short_name == 'hy_armtdp' + base_input_path = os.path.join(paths["NERBASE"], "armtdp", "ArmTDP-NER") + base_output_path = paths["NER_DATA_DIR"] + convert_hy_armtdp(base_input_path, base_output_path, short_name) + for shard in SHARDS: + input_filename = os.path.join(base_output_path, f'{short_name}.{shard}.tsv') + if not os.path.exists(input_filename): + raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, input_filename)) + output_filename = os.path.join(base_output_path, '%s.%s.json' % (short_name, shard)) + prepare_ner_file.process_dataset(input_filename, output_filename) DATASET_MAPPING = { "bn_daffodil": process_bn_daffodil, @@ -960,6 +984,7 @@ def process_toy_dataset(paths, short_name): "sv_suc3shuffle": process_sv_suc3shuffle, "tr_starlang": process_starlang, "th_lst20": process_lst20, + "hy_armtdp": process_armtdp, } def main(dataset_name): diff --git a/stanza/utils/training/common.py b/stanza/utils/training/common.py index ef2e5a387d..16b6d197c9 100644 --- a/stanza/utils/training/common.py +++ b/stanza/utils/training/common.py @@ -123,6 +123,12 @@ 
class Mode(Enum): # herbert-large-cased (dev/test): 92.25/91.62 # sdadas/polish-roberta-large-v2 (dev/test): 92.66/91.22 "pl": "allegro/herbert-base-cased", + + # https://huggingface.co/xlm-roberta-base + # Scores by entity on 18 labels: + # no bert : 86.68 + # xlm-roberta-base : 89.31 + "hy": "xlm-roberta-base", } def build_argparse(): From d00b46141fb46a5271df939f162fb84afe1f74be Mon Sep 17 00:00:00 2001 From: Shake Hakobyan Date: Mon, 6 Mar 2023 18:11:52 +0400 Subject: [PATCH 2/3] Updated after review --- .../utils/datasets/ner/convert_hy_armtdp.py | 118 +++++++++++------- .../utils/datasets/ner/prepare_ner_dataset.py | 5 +- 2 files changed, 76 insertions(+), 47 deletions(-) diff --git a/stanza/utils/datasets/ner/convert_hy_armtdp.py b/stanza/utils/datasets/ner/convert_hy_armtdp.py index a34557acd2..46e69f07fe 100644 --- a/stanza/utils/datasets/ner/convert_hy_armtdp.py +++ b/stanza/utils/datasets/ner/convert_hy_armtdp.py @@ -6,17 +6,19 @@ https://github.com/myavrum/ArmTDP-NER.git """ +import argparse import os import json import re import stanza import random -nlp = stanza.Pipeline(lang='hy', processors='tokenize') +from tqdm import tqdm +nlp_hy = stanza.Pipeline(lang='hy', processors='tokenize') def read_data(path: str) -> list: """ - Reads Armenian data file + Takes a full path to the Armenian ner dataset Returns list of dictionaries, where each dictionary represents a paragraph's information (text, labels, etc.) @@ -26,58 +28,72 @@ def read_data(path: str) -> list: return paragraphs -def filter_unicode_broken_characters(paragraphs: list) -> list: +def filter_unicode_broken_characters(text: str) -> str: """ - Removes all '\u202c' unicode characters in texts - TODO: why? 
+ Removes all unicode characters in text """ - for paragraph in paragraphs: - paragraph['text'] = re.sub('\u202c', '', paragraph['text']) - - -def format_sentence_as_beios(sentence, labels) -> list: - sentence_toc = '' - current_label = [] - for token in sentence.tokens: - if current_label: - tag = current_label[2] - if token.end_char == current_label[1]: - sentence_toc += token.text + '\tE-' + tag + '\n' - current_label = [] - else: - sentence_toc += token.text + '\tI-' + tag + '\n' - else: - current_label = get_label(token.start_char, labels) - if current_label: - tag = current_label[2] - if token.start_char == current_label[0] and token.end_char == current_label[1]: - sentence_toc += token.text + '\tS-' + tag + '\n' - current_label = [] - elif token.start_char == current_label[0]: - sentence_toc += token.text + '\tB-' + tag + '\n' - else: - sentence_toc += token.text + '\tO' + '\n' - current_label = [] - return sentence_toc[:-1] + return re.sub(r'\\u[A-Za-z0-9]{4}', '', text) -def get_label(tok_start_char: int, labels: list) -> list: +def get_label(tok_start_char: int, tok_end_char: int, labels: list) -> list: + """ + Returns the label that corresponds to the token + """ for label in labels: - if label[0] == tok_start_char: + if label[0] <= tok_start_char and label[1] >= tok_end_char: return label return [] -def convert_to_bioes(paragraphs): - beios_sents = [] - for paragraph in paragraphs: - doc = nlp(paragraph['text']) +def format_sentences(paragraphs: list) -> list: + """ + Takes a list of paragraphs and returns a list of sentences, + where each sentence is a list of tokens along with their respective entity tags. 
+ """ + sentences = [] + for paragraph in tqdm(paragraphs): + doc = nlp_hy(filter_unicode_broken_characters(paragraph['text'])) for sentence in doc.sentences: - beios_sents.append(format_sentence_as_beios(sentence, paragraph['labels'])) + sentence_ents = [] + entity = [] + for token in sentence.tokens: + label = get_label(token.start_char, token.end_char, paragraph['labels']) + if label: + entity.append(token.text) + if token.end_char == label[1]: + sentence_ents.append({'tokens': entity, + 'tag': label[2]}) + entity = [] + else: + sentence_ents.append({'tokens': [token.text], + 'tag': 'O'}) + sentences.append(sentence_ents) + return sentences + + +def convert_to_bioes(sentences: list) -> list: + """ + Рeturns a list of strings where each string represents a sentence in BIOES format + """ + beios_sents = [] + for sentence in tqdm(sentences): + sentence_toc = '' + for ent in sentence: + if ent['tag'] == 'O': + sentence_toc += ent['tokens'][0] + '\tO' + '\n' + else: + if len(ent['tokens']) == 1: + sentence_toc += ent['tokens'][0] + '\tS-' + ent['tag'] + '\n' + else: + sentence_toc += ent['tokens'][0] + '\tB-' + ent['tag'] + '\n' + for token in ent['tokens'][1:-1]: + sentence_toc += token + '\tI-' + ent['tag'] + '\n' + sentence_toc += ent['tokens'][-1] + '\tE-' + ent['tag'] + '\n' + beios_sents.append(sentence_toc) return beios_sents -def write_sentences_to_file_(sents, filename): +def write_sentences_to_file(sents, filename): print(f"Writing {len(sents)} sentences to {filename}") with open(filename, 'w') as outfile: for sent in sents: @@ -85,6 +101,10 @@ def write_sentences_to_file_(sents, filename): def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15): + """ + Takes in a list of sentences and splits them into training, dev, and test sets + Writes each set to a separate file with write_sentences_to_file + """ num = len(sents) train_num = int(num * train_fraction) dev_num = int(num * dev_fraction) @@ -99,13 +119,21 @@ 
def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7 batches = [train_sents, dev_sents, test_sents] filenames = [f'{short_name}.train.tsv', f'{short_name}.dev.tsv', f'{short_name}.test.tsv'] for batch, filename in zip(batches, filenames): - write_sentences_to_file_(batch, os.path.join(base_output_path, filename)) + write_sentences_to_file(batch, os.path.join(base_output_path, filename)) -def convert_hy_armtdp(base_input_path, base_output_path, short_name): +def convert_dataset(base_input_path, base_output_path, short_name): paragraphs = read_data(os.path.join(base_input_path, 'ArmNER-HY.json1')) - filter_unicode_broken_characters(paragraphs) - beios_sentences = convert_to_bioes(paragraphs) + taged_sentences = format_sentences(paragraphs) + beios_sentences = convert_to_bioes(taged_sentences) train_test_dev_split(beios_sentences, base_output_path, short_name) +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--input_path', type=str, default="/armtdp/ArmTDP-NER", help="Where to find input file") + parser.add_argument('--output_path', type=str, default="/armtdp/ArmTDP-NER/data", help="Where to output the results") + parser.add_argument('--short_name', type=str, default="hy_armtdp", help="Language and dataset identifier") + args = parser.parse_args() + + convert_dataset(args.input_path, args.output_path, args.short_name) diff --git a/stanza/utils/datasets/ner/prepare_ner_dataset.py b/stanza/utils/datasets/ner/prepare_ner_dataset.py index cd7e787914..99668128c5 100644 --- a/stanza/utils/datasets/ner/prepare_ner_dataset.py +++ b/stanza/utils/datasets/ner/prepare_ner_dataset.py @@ -361,8 +361,9 @@ import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob +import stanza.utils.datasets.ner.convert_hy_armtdp as convert_hy_armtdp from stanza.utils.datasets.ner.utils import 
convert_bio_to_json, get_tags, read_tsv, write_dataset -from stanza.utils.datasets.ner.convert_hy_armtdp import convert_hy_armtdp + SHARDS = ('train', 'dev', 'test') @@ -955,7 +956,7 @@ def process_armtdp(paths, short_name): assert short_name == 'hy_armtdp' base_input_path = os.path.join(paths["NERBASE"], "armtdp", "ArmTDP-NER") base_output_path = paths["NER_DATA_DIR"] - convert_hy_armtdp(base_input_path, base_output_path, short_name) + convert_hy_armtdp.convert_dataset(base_input_path, base_output_path, short_name) for shard in SHARDS: input_filename = os.path.join(base_output_path, f'{short_name}.{shard}.tsv') if not os.path.exists(input_filename): From e4a8813e66ad88241d7035e8fae0c1c3f0ca6fb3 Mon Sep 17 00:00:00 2001 From: Shake Hakobyan Date: Tue, 7 Mar 2023 13:43:49 +0400 Subject: [PATCH 3/3] Updated after review --- .../utils/datasets/ner/convert_hy_armtdp.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/stanza/utils/datasets/ner/convert_hy_armtdp.py b/stanza/utils/datasets/ner/convert_hy_armtdp.py index 46e69f07fe..2f9a7c5da2 100644 --- a/stanza/utils/datasets/ner/convert_hy_armtdp.py +++ b/stanza/utils/datasets/ner/convert_hy_armtdp.py @@ -18,10 +18,11 @@ def read_data(path: str) -> list: """ - Takes a full path to the Armenian ner dataset + Reads the Armenian named entity recognition dataset - Returns list of dictionaries, where each dictionary represents - a paragraph's information (text, labels, etc.) + Returns a list of dictionaries. + Each dictionary contains information + about a paragraph (text, labels, etc.) 
""" with open(path, 'r') as file: paragraphs = [json.loads(line) for line in file] @@ -37,7 +38,7 @@ def filter_unicode_broken_characters(text: str) -> str: def get_label(tok_start_char: int, tok_end_char: int, labels: list) -> list: """ - Returns the label that corresponds to the token + Returns the label that corresponds to the given token """ for label in labels: if label[0] <= tok_start_char and label[1] >= tok_end_char: @@ -73,7 +74,7 @@ def format_sentences(paragraphs: list) -> list: def convert_to_bioes(sentences: list) -> list: """ - Рeturns a list of strings where each string represents a sentence in BIOES format + Returns a list of strings where each string represents a sentence in BIOES format """ beios_sents = [] for sentence in tqdm(sentences): @@ -102,8 +103,8 @@ def write_sentences_to_file(sents, filename): def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15): """ - Takes in a list of sentences and splits them into training, dev, and test sets - Writes each set to a separate file with write_sentences_to_file + Splits a list of sentences into training, dev, and test sets, + and writes each set to a separate file with write_sentences_to_file """ num = len(sents) train_num = int(num * train_fraction) @@ -131,9 +132,9 @@ def convert_dataset(base_input_path, base_output_path, short_name): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--input_path', type=str, default="/armtdp/ArmTDP-NER", help="Where to find input file") - parser.add_argument('--output_path', type=str, default="/armtdp/ArmTDP-NER/data", help="Where to output the results") - parser.add_argument('--short_name', type=str, default="hy_armtdp", help="Language and dataset identifier") + parser.add_argument('--input_path', type=str, default="/armtdp/ArmTDP-NER", help="Path to input file") + parser.add_argument('--output_path', type=str, default="/armtdp/ArmTDP-NER/data", help="Path to the output directory") + 
parser.add_argument('--short_name', type=str, default="hy_armtdp", help="Name to identify the dataset and the model") args = parser.parse_args() convert_dataset(args.input_path, args.output_path, args.short_name)