Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions stanza/utils/datasets/ner/convert_hy_armtdp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
"""
Convert the ArmTDP-NER dataset to BIOES format

The dataset is here:

https://github.com/myavrum/ArmTDP-NER.git
"""

import argparse
import os
import json
import re
import stanza
import random
from tqdm import tqdm
# Armenian tokenize-only pipeline used by format_sentences().
# NOTE: this is built at module import time, not on first use, so merely
# importing this module constructs the pipeline.
nlp_hy = stanza.Pipeline(lang='hy', processors='tokenize')


def read_data(path: str) -> list:
    """
    Reads the Armenian named entity recognition dataset

    The file is read as JSON Lines: one JSON object per line.
    Blank lines are ignored.

    Args:
        path: location of the JSON Lines file

    Returns a list of dictionaries.
    Each dictionary contains information
    about a paragraph (text, labels, etc.)
    """
    # Decode explicitly as UTF-8: the corpus is Armenian text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(path, 'r', encoding='utf-8') as file:
        # A blank line (e.g. a stray trailing newline) is not a JSON
        # document, so skip it instead of letting json.loads raise.
        paragraphs = [json.loads(line) for line in file if line.strip()]
    return paragraphs


def filter_unicode_broken_characters(text: str) -> str:
    """
    Strips literal, un-decoded unicode escapes from text

    Every backslash that is followed by "u" and four ASCII letters or
    digits (for instance the six characters of a stray "\\u200b") is
    deleted.  Everything else, real non-ASCII characters included,
    is returned untouched.
    """
    broken_escape = re.compile(r'\\u[A-Za-z0-9]{4}')
    return broken_escape.sub('', text)


def get_label(tok_start_char: int, tok_end_char: int, labels: list) -> list:
    """
    Finds the first label whose character span fully contains the token

    Each label is indexed as (start, end, ...).  The matching label is
    returned as is; an empty list is returned when no label covers the
    span from tok_start_char to tok_end_char.
    """
    covering = (
        span for span in labels
        if not (span[0] > tok_start_char or span[1] < tok_end_char)
    )
    return next(covering, [])


def format_sentences(paragraphs: list) -> list:
    """
    Takes a list of paragraphs and returns a list of sentences,
    where each sentence is a list of chunks with their entity tags.

    Each paragraph is a dict with a 'text' string and a 'labels' list
    whose items are indexed as (start_char, end_char, tag).  The text is
    tokenized and sentence-split with the module level nlp_hy pipeline.

    Each chunk is a dict {'tokens': [token texts], 'tag': tag}.  Tokens
    not covered by any label become single-token chunks tagged 'O'.

    An entity chunk is closed when a token ends exactly at the label's
    end offset.  A chunk which is still open when the label changes, an
    unlabeled token shows up, or the sentence ends is closed right there
    with its own tag, so that no token is dropped from the output or
    glued onto a later entity.
    """
    sentences = []
    for paragraph in tqdm(paragraphs):
        # NOTE(review): label offsets are compared with token offsets of the
        # *filtered* text.  If the filter removes anything in front of a
        # label, the two presumably no longer line up - confirm on the data.
        doc = nlp_hy(filter_unicode_broken_characters(paragraph['text']))
        labels = paragraph['labels']
        for sentence in doc.sentences:
            sentence_ents = []
            entity = []          # token texts of the entity currently being built
            entity_label = None  # the label those tokens belong to
            for token in sentence.tokens:
                label = get_label(token.start_char, token.end_char, labels)
                # The open entity ends here if this token is unlabeled or
                # belongs to a different label: flush it before going on.
                if entity and label != entity_label:
                    sentence_ents.append({'tokens': entity,
                                          'tag': entity_label[2]})
                    entity = []
                if not label:
                    sentence_ents.append({'tokens': [token.text],
                                          'tag': 'O'})
                    continue
                entity.append(token.text)
                entity_label = label
                if token.end_char == label[1]:
                    sentence_ents.append({'tokens': entity,
                                          'tag': label[2]})
                    entity = []
            # An entity cut off by the sentence boundary (or whose end offset
            # falls inside a token) would otherwise be silently lost.
            if entity:
                sentence_ents.append({'tokens': entity,
                                      'tag': entity_label[2]})
            sentences.append(sentence_ents)
    return sentences


def convert_to_bioes(sentences: list) -> list:
    """
    Renders each sentence as one string of "token<TAB>tag" lines in BIOES format

    Chunks tagged 'O' contribute their first token with the tag O.
    Single-token entities get S-, longer ones get B- on the first
    token, I- on the inner tokens and E- on the last token.
    Every line, including the last one, ends with a newline.
    """
    rendered = []
    for chunks in tqdm(sentences):
        lines = []
        for chunk in chunks:
            tag = chunk['tag']
            words = chunk['tokens']
            if tag == 'O':
                lines.append(words[0] + '\tO' + '\n')
                continue
            if len(words) == 1:
                lines.append(words[0] + '\tS-' + tag + '\n')
                continue
            lines.append(words[0] + '\tB-' + tag + '\n')
            lines.extend(inner + '\tI-' + tag + '\n' for inner in words[1:-1])
            lines.append(words[-1] + '\tE-' + tag + '\n')
        rendered.append(''.join(lines))
    return rendered


def write_sentences_to_file(sents, filename):
    """
    Writes the sentences to filename, each followed by two newlines

    Args:
        sents: list of already formatted sentence strings
        filename: path of the file to create or overwrite
    """
    print(f"Writing {len(sents)} sentences to {filename}")
    # Encode explicitly as UTF-8: the sentences hold Armenian text, which a
    # non-UTF-8 platform default encoding may be unable to represent.
    with open(filename, 'w', encoding='utf-8') as outfile:
        outfile.writelines(sent + '\n\n' for sent in sents)


def train_test_dev_split(sents, base_output_path, short_name, train_fraction=0.7, dev_fraction=0.15):
    """
    Splits a list of sentences into training, dev, and test sets,
    and writes each set to a separate file with write_sentences_to_file

    The sentences are shuffled first.  The first train_fraction of them go
    to {short_name}.train.tsv, the next dev_fraction to {short_name}.dev.tsv
    and whatever is left to {short_name}.test.tsv, all in base_output_path.
    The list passed in is not modified.

    Raises:
        ValueError: if train_fraction + dev_fraction is larger than 1
    """
    # Validate up front, before anything is shuffled or written.
    if train_fraction + dev_fraction > 1.0:
        raise ValueError(
            "Train and dev fractions added up to more than 1: {} {}".format(train_fraction, dev_fraction))

    num = len(sents)
    train_num = int(num * train_fraction)
    dev_num = int(num * dev_fraction)

    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(sents)
    random.shuffle(shuffled)
    train_sents = shuffled[:train_num]
    dev_sents = shuffled[train_num:train_num + dev_num]
    test_sents = shuffled[train_num + dev_num:]
    batches = [train_sents, dev_sents, test_sents]
    filenames = [f'{short_name}.train.tsv', f'{short_name}.dev.tsv', f'{short_name}.test.tsv']
    for batch, filename in zip(batches, filenames):
        write_sentences_to_file(batch, os.path.join(base_output_path, filename))


def convert_dataset(base_input_path, base_output_path, short_name):
    """
    Runs the whole conversion for the ArmNER-HY.json1 file in base_input_path

    The paragraphs are read, split into tagged sentences, rendered as
    BIOES strings and finally split into train/dev/test files which
    are written to base_output_path using short_name as the file prefix.
    """
    source_file = os.path.join(base_input_path, 'ArmNER-HY.json1')
    tagged = format_sentences(read_data(source_file))
    rendered = convert_to_bioes(tagged)
    train_test_dev_split(rendered, base_output_path, short_name)


if __name__ == '__main__':
    # Command line entry point: every option is a string with a default.
    cli = argparse.ArgumentParser()
    for flag, fallback, description in (
        ('--input_path', "/armtdp/ArmTDP-NER", "Path to input file"),
        ('--output_path', "/armtdp/ArmTDP-NER/data", "Path to the output directory"),
        ('--short_name', "hy_armtdp", "Name to identify the dataset and the model"),
    ):
        cli.add_argument(flag, type=str, default=fallback, help=description)
    options = cli.parse_args()

    convert_dataset(options.input_path, options.output_path, options.short_name)
26 changes: 26 additions & 0 deletions stanza/utils/datasets/ner/prepare_ner_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,18 @@
https://github.com/stanfordnlp/stanza-train
this is not meant for any kind of actual NER use

ArmTDP-NER is an Armenian NER dataset
- https://github.com/myavrum/ArmTDP-NER.git
ArmTDP-NER: The corpus was developed by the ArmTDP team led by Marat M. Yavrumyan
at Yerevan State University in collaboration with the "Armenia National SDG Innovation Lab"
and "UC Berkeley's Armenian Linguists' network".
- in $NERBASE, make an "armtdp" directory, then git clone the repo there
mkdir -p $NERBASE/armtdp
cd $NERBASE/armtdp
git clone https://github.com/myavrum/ArmTDP-NER.git
- Then run
python3 -m stanza.utils.datasets.ner.prepare_ner_dataset hy_armtdp

"""

import glob
Expand Down Expand Up @@ -349,8 +361,10 @@
import stanza.utils.datasets.ner.prepare_ner_file as prepare_ner_file
import stanza.utils.datasets.ner.suc_to_iob as suc_to_iob
import stanza.utils.datasets.ner.suc_conll_to_iob as suc_conll_to_iob
import stanza.utils.datasets.ner.convert_hy_armtdp as convert_hy_armtdp
from stanza.utils.datasets.ner.utils import convert_bio_to_json, get_tags, read_tsv, write_dataset


SHARDS = ('train', 'dev', 'test')

class UnknownDatasetError(ValueError):
Expand Down Expand Up @@ -938,6 +952,17 @@ def process_masakhane(paths, dataset_name):

def process_toy_dataset(paths, short_name):
    """
    Passes the English-SAMPLE directory under NERBASE to convert_bio_to_json

    NER_DATA_DIR and short_name are forwarded as the remaining arguments
    (presumably the output directory and dataset name - see convert_bio_to_json).
    """
    sample_dir = os.path.join(paths["NERBASE"], "English-SAMPLE")
    convert_bio_to_json(sample_dir, paths["NER_DATA_DIR"], short_name)
def process_armtdp(paths, short_name):
    """
    Prepares the Armenian ArmTDP-NER dataset, which must be named hy_armtdp

    The corpus in NERBASE/armtdp/ArmTDP-NER is first converted by
    convert_hy_armtdp into one .tsv file per shard in NER_DATA_DIR.
    Each .tsv file is then handed to prepare_ner_file.process_dataset
    together with the matching .json filename.

    Raises FileNotFoundError if a shard's .tsv file is missing after the conversion.
    """
    assert short_name == 'hy_armtdp'
    corpus_dir = os.path.join(paths["NERBASE"], "armtdp", "ArmTDP-NER")
    data_dir = paths["NER_DATA_DIR"]
    convert_hy_armtdp.convert_dataset(corpus_dir, data_dir, short_name)
    for shard in SHARDS:
        tsv_path = os.path.join(data_dir, f'{short_name}.{shard}.tsv')
        if os.path.exists(tsv_path):
            json_path = os.path.join(data_dir, '%s.%s.json' % (short_name, shard))
            prepare_ner_file.process_dataset(tsv_path, json_path)
        else:
            raise FileNotFoundError('Cannot find %s component of %s in %s' % (shard, short_name, tsv_path))

DATASET_MAPPING = {
"bn_daffodil": process_bn_daffodil,
Expand All @@ -960,6 +985,7 @@ def process_toy_dataset(paths, short_name):
"sv_suc3shuffle": process_sv_suc3shuffle,
"tr_starlang": process_starlang,
"th_lst20": process_lst20,
"hy_armtdp": process_armtdp,
}

def main(dataset_name):
Expand Down
6 changes: 6 additions & 0 deletions stanza/utils/training/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ class Mode(Enum):
# herbert-large-cased (dev/test): 92.25/91.62
# sdadas/polish-roberta-large-v2 (dev/test): 92.66/91.22
"pl": "allegro/herbert-base-cased",

# https://huggingface.co/xlm-roberta-base
# Scores by entity on 18 labels:
# no bert : 86.68
# xlm-roberta-base : 89.31
"hy": "xlm-roberta-base",
}

def build_argparse():
Expand Down