diff --git a/.gitignore b/.gitignore
index ecefa7f..3612c56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,19 @@ python/eflomal/bin
 # Debug files
 *.dSYM/
 *.su
+
+# ctags
+tags
+*.TAG
+
+# Generated
+python/eflomal/eflomal.c
+
+# Virtualenv and build
+pyvenv.cfg
+bin
+build
+lib
+lib64
+**/*.egg-info
+**/__pycache__
diff --git a/devscripts/ctags.sh b/devscripts/ctags.sh
new file mode 100644
index 0000000..a3cd85d
--- /dev/null
+++ b/devscripts/ctags.sh
@@ -0,0 +1 @@
+ctags --exclude=@.gitignore -R python/ src/
diff --git a/devscripts/curl_server.sh b/devscripts/curl_server.sh
new file mode 100755
index 0000000..99bbcf8
--- /dev/null
+++ b/devscripts/curl_server.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env sh
+
+echo "Run eflomal-server in this directory, so it picks up server_config.json, then perform the sample request:"
+
+curl -X POST "$@" localhost:5000/api/align/v1 -H 'Content-type: application/json' -H 'Accept: application/json' -d '{"aligner": "my-align", "samplers": 3, "iters": {"1":64, "2": 32, "3": 8}, "trust_sents": false, "sents":[{"s":"The cow and grass", "t":"Die Kuh und das Gras"}]}'
+
+# Approximate response:
+# {"aligns":[{"fwd":"0-0 0-1 1-2 2-3 3-4","norm_score_fwd":3.954102087565899,"norm_score_rev":2.250345638880109,"rev":"0-0 1-1 2-3 3-4","score_fwd":5.56354,"score_rev":3.63664}]}
diff --git a/devscripts/server_config.json b/devscripts/server_config.json
new file mode 100644
index 0000000..52a21b3
--- /dev/null
+++ b/devscripts/server_config.json
@@ -0,0 +1,8 @@
+{
+    "aligners": [
+        { "name": "my-align",
+          "priors": "testdata/my-align.pri"
+        }
+    ],
+    "log_level": "debug"
+}
diff --git a/devscripts/testdata/create-prior.sh b/devscripts/testdata/create-prior.sh
new file mode 100755
index 0000000..91eb82f
--- /dev/null
+++ b/devscripts/testdata/create-prior.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env sh
+eflomal-align -i my-align.txt -f my-align.fwd -r my-align.rev
+eflomal-makepriors -i my-align.txt -f my-align.fwd -r my-align.rev -p my-align.pri
diff --git 
a/devscripts/testdata/my-align.fwd b/devscripts/testdata/my-align.fwd new file mode 100644 index 0000000..c97e1a8 --- /dev/null +++ b/devscripts/testdata/my-align.fwd @@ -0,0 +1,6 @@ +1-0 2-2 3-3 +0-0 1-1 2-2 4-3 6-4 +1-0 2-2 3-3 +0-0 2-1 4-2 +0-0 2-1 3-2 4-3 +1-0 2-2 3-3 4-4 5-5 6-6 diff --git a/devscripts/testdata/my-align.pri b/devscripts/testdata/my-align.pri new file mode 100644 index 0000000..93af577 --- /dev/null +++ b/devscripts/testdata/my-align.pri @@ -0,0 +1,56 @@ +LEX Auf There 1 +LEX Die cow 3 +LEX Freunde friends 1 +LEX Gras grass 1 +LEX Häschen rabbit 2 +LEX Lass Let 2 +LEX Wiese a 1 +LEX das the 2 +LEX der is 1 +LEX die cow 1 +LEX eine grass 1 +LEX frisst eats 1 +LEX ist is 1 +LEX mich see 2 +LEX schläfrig sleepy 1 +LEX sind are 1 +LEX steht on 1 +LEX und and 1 +FERF Let 1 2 +FERF There 1 1 +FERF a 1 1 +FERF and 1 1 +FERF are 1 1 +FERF cow 1 4 +FERF eats 1 1 +FERF friends 1 1 +FERF grass 1 2 +FERF is 1 2 +FERF on 1 1 +FERF rabbit 1 2 +FERF see 1 2 +FERF sleepy 1 1 +FERF the 1 2 +FERR Auf 1 1 +FERR Die 1 3 +FERR Freunde 1 1 +FERR Gras 1 1 +FERR Häschen 1 2 +FERR Kuh 1 5 +FERR Lass 1 2 +FERR Wiese 1 1 +FERR das 1 2 +FERR der 1 1 +FERR die 1 1 +FERR eine 1 1 +FERR frisst 1 1 +FERR ist 1 1 +FERR mich 1 2 +FERR schläfrig 1 1 +FERR sind 1 1 +FERR steht 1 1 +FERR und 1 1 +HMMF 1 22 +HMMF 2 8 +HMMR 1 33 +HMMR 2 2 diff --git a/devscripts/testdata/my-align.rev b/devscripts/testdata/my-align.rev new file mode 100644 index 0000000..85f02b9 --- /dev/null +++ b/devscripts/testdata/my-align.rev @@ -0,0 +1,6 @@ +0-0 1-1 2-2 3-3 +0-0 1-1 2-2 3-3 4-4 5-5 +0-0 1-1 2-2 3-3 +0-0 1-1 3-2 4-3 +0-0 1-1 3-2 4-3 +0-0 1-1 2-2 3-3 4-4 5-5 6-6 diff --git a/devscripts/testdata/my-align.txt b/devscripts/testdata/my-align.txt new file mode 100644 index 0000000..c55f1d2 --- /dev/null +++ b/devscripts/testdata/my-align.txt @@ -0,0 +1,6 @@ +The cow eats grass ||| Die Kuh frisst Gras +There is a cow on the grass ||| Auf der Wiese steht eine Kuh +The cow is sleepy ||| Die Kuh ist 
schläfrig +Let me see the cow ||| Lass mich die Kuh sehen +Let me see the rabbit ||| Lass mich das Häschen sehen +The cow and the rabbit are friends ||| Die Kuh und das Häschen sind Freunde diff --git a/python/eflomal/__init__.py b/python/eflomal/__init__.py index e0fb3a5..1ef5399 100644 --- a/python/eflomal/__init__.py +++ b/python/eflomal/__init__.py @@ -6,11 +6,11 @@ from tempfile import NamedTemporaryFile from .cython import align, read_text, write_text - +import time +#import shutil logger = logging.getLogger(__name__) - class Aligner: """Aligner class""" @@ -27,12 +27,80 @@ def __init__(self, model=3, score_model=0, self.null_prior = null_prior self.source_prefix_len = source_prefix_len self.source_suffix_len = source_suffix_len + self.source_lowercase = True self.target_prefix_len = target_prefix_len self.target_suffix_len = target_suffix_len + self.target_lowercase = True + # + self._preloaded_priors = None + # Note(preloaded-priors,development): Set to True when developing to + # ensure consistency between normal and preloaded priors. + self._assert_preloaded_prior_eq = False + + def preload_priors(self, priors_input): + """ + Preloads the priors into quick to index structures. Useful in server + mode, where individual requests typically use a small part of the + prior words, so iterating the full prior would be wasteful. + + Note that the preprocessing performs the same text transform operations + that the sentence word transformer would do. So the preprocessed prior + is already in terms of transformed words, and so is only suitable to + use with sentence words using the same transformation (which, for the + same Aligner, is always true). 
+ + """ + t0 = time.time() + priors = read_priors(priors_input) + priors_list, hmmf_priors, hmmr_priors, ferf_priors, ferr_priors = priors + src_tf = TextIndex({}, self.source_prefix_len, self.source_suffix_len, + self.source_lowercase) + trg_tf = TextIndex({}, self.target_prefix_len, self.target_suffix_len, + self.target_lowercase) + + priors_tree = {} + # TODO(NULL): is not supported. Could. + for src_word, trg_word, alpha in priors_list: + src_word = src_tf.transform(src_word) + trg_word = trg_tf.transform(trg_word) + + if src_word not in priors_tree: + priors_tree[src_word] = {} + trg_tree = priors_tree[src_word] + + trg_tree[trg_word] = trg_tree.get(trg_word, 0.0) + alpha + + ferf_map = {} + for src_word, fert, alpha in ferf_priors: + # Note(preloaded-priors,development): for example comment following + # line to trigger an orig vs preloaded prior difference check. + src_word = src_tf.transform(src_word) + + if src_word not in ferf_map: + ferf_map[src_word] = {} + smap = ferf_map[src_word] + smap[fert] = smap.get(fert, 0.0) + alpha + + ferr_map = {} + for trg_word, fert, alpha in ferr_priors: + trg_word = trg_tf.transform(trg_word) + + if trg_word not in ferr_map: + ferr_map[trg_word] = {} + smap = ferr_map[trg_word] + smap[fert] = smap.get(fert, 0.0) + alpha + + dt = time.time() - t0 + logger.info(f"Prior preprocessing took {dt} seconds") + + preloaded = (priors_tree, ferf_map, ferr_map) + + self._preloaded_priors = (priors, preloaded) def prepare_files(self, src_input_file, src_output_file, trg_input_file, trg_output_file, - priors_input_file, priors_output_file): + priors_input_file, + priors_output_file, orig_priors_output_file=None): """Convert text files to formats used by eflomal Inputs should be file objects or any iterables over lines. 
Outputs @@ -51,7 +119,18 @@ def prepare_files(self, src_input_file, src_output_file, n_src_sents, n_trg_sents) raise ValueError('Mismatched file sizes') logger.info('Prepared %d sentences for alignment', n_src_sents) - if priors_input_file: + if self._preloaded_priors: + t0 = time.time() + (priors, _) = self._preloaded_priors + preloaded_to_eflomal_priors_file(self._preloaded_priors, src_index, + trg_index, priors_output_file) + dt = time.time() - t0 + logger.info(f"Prior calculation took {dt} seconds using preloaded") + if orig_priors_output_file is not None: + # output normal processing-based priors for comparison + to_eflomal_priors_file( + priors, src_index, trg_index, orig_priors_output_file) + elif priors_input_file: logger.info('Reading lexical priors...') priors = read_priors(priors_input_file) to_eflomal_priors_file( @@ -60,50 +139,93 @@ def prepare_files(self, src_input_file, src_output_file, def align(self, src_input, trg_input, links_filename_fwd=None, links_filename_rev=None, scores_filename_fwd=None, scores_filename_rev=None, - priors_input=None, quiet=True, use_gdb=False): + priors_input=None, trust_sents=True, + quiet=True, use_gdb=False): """Run alignment for the input""" with NamedTemporaryFile('wb') as srcf, \ NamedTemporaryFile('wb') as trgf, \ - NamedTemporaryFile('w', encoding='utf-8') as priorsf: - # Write input files for the eflomal binary - self.prepare_files( - src_input, srcf, trg_input, trgf, priors_input, priorsf) + NamedTemporaryFile('w', encoding='utf-8', + delete_on_close=False) as priorsf: + + use_prior = self._preloaded_priors or priors_input + if self._preloaded_priors and self._assert_preloaded_prior_eq: + with NamedTemporaryFile('w', encoding='utf-8', + delete_on_close=False) as orig_priorsf: + self.prepare_files( + src_input, srcf, trg_input, trgf, priors_input, + priorsf, orig_priorsf) + # Note: opening NamedTemporaryFile-s is safe as long as + # 1) happens using context-manager, and 2) delete_on_close + # was set to False, as 
above. + with open(orig_priorsf.name, 'r') as of, \ + open(priorsf.name, 'r') as f: + orig = of.read() + pre = f.read() + if orig != pre: + #shutil.copy(orig_priorsf.name, "/tmp/prior.orig") + #shutil.copy(priorsf.name, "/tmp/prior.preloaded") + raise Exception("===== ERROR! Preloaded prior leads to differing processed prior! ======") + else: + # Write input files for the eflomal binary + # + # Note(preloaded-priors): if priors were preloaded, then + # priors_input is not used at this point (but then likely they + # are not passed either). + # + self.prepare_files( + src_input, srcf, trg_input, trgf, priors_input, priorsf) + # Run wrapper for the eflomal binary + t0 = time.time() align(srcf.name, trgf.name, links_filename_fwd=links_filename_fwd, links_filename_rev=links_filename_rev, statistics_filename=None, scores_filename_fwd=scores_filename_fwd, scores_filename_rev=scores_filename_rev, - priors_filename=(None if priors_input is None - else priorsf.name), + priors_filename=(priorsf.name if use_prior else None), model=self.model, score_model=self.score_model, n_iterations=self.n_iterations, n_samplers=self.n_samplers, + n_clean=-1 if trust_sents else 0, quiet=quiet, rel_iterations=self.rel_iterations, null_prior=self.null_prior, use_gdb=use_gdb) + dt = time.time() - t0 + logger.info(f"Align call took {dt} seconds") class TextIndex: - """Word to index mapping with lowercasing and prefix/suffix removal""" + """ + Word to index mapping with lowercasing and prefix/suffix removal. + + Note that the returned indices are one larger than the indices in the + passed-in index, due to reserving output index 0 to the token. 
+ + """ - def __init__(self, index, prefix_len=0, suffix_len=0): + def __init__(self, index, prefix_len=0, suffix_len=0, lowercase=True): self.index = index self.prefix_len = prefix_len self.suffix_len = suffix_len + self.lowercase = lowercase def __len__(self): return len(self.index) - def __getitem__(self, word): - word = word.lower() + def transform(self, word): + if self.lowercase: + word = word.lower() if self.prefix_len != 0: word = word[:self.prefix_len] if self.suffix_len != 0: word = word[-self.suffix_len:] + return word + + def __getitem__(self, word): + word = self.transform(word) e = self.index.get(word) if e is not None: e = e + 1 @@ -152,6 +274,10 @@ def calculate_priors(src_sentences, trg_sentences, If `reverse` is True, compute priors for the opposite alignment direction. + + Note: stored priors are agnostic of the word transform used during + alignment, and is in terms of the original sentence words. + """ priors = Counter() hmmf_priors = Counter() @@ -160,6 +286,8 @@ def calculate_priors(src_sentences, trg_sentences, ferr_priors = Counter() for lineno, (src_sent, trg_sent, fwd_line, rev_line) in enumerate( zip(src_sentences, trg_sentences, fwd_alignments, rev_alignments)): + if lineno % 10000 == 0: + logger.info('processing line #%d', lineno) src_sent = src_sent.strip().split() trg_sent = trg_sent.strip().split() fwd_links = [tuple(map(int, s.split('-'))) for s in fwd_line.split()] @@ -169,7 +297,9 @@ def calculate_priors(src_sentences, trg_sentences, logger.error('alignment out of bounds in line %d: ' '(%d, %d)', lineno + 1, i, j) raise ValueError('Invalid input on line %d' % lineno + 1) - priors[(src_sent[i], trg_sent[j])] += 1 + s, t = src_sent[i], trg_sent[j] + k = (t,s) if rev_alignments else (s,t) + priors[k] += 1 last_j = -1 last_i = -1 @@ -315,3 +445,68 @@ def to_eflomal_priors_file(priors, src_index, trg_index, outfile): for (f, fert), alpha in sorted(ferr_indexed.items()): print('%d %d %g' % (f, fert, alpha), file=outfile) 
outfile.flush() + +def preloaded_to_eflomal_priors_file(pp, src_index, trg_index, outfile): + """Write priors to a file read by eflomal binary + + Arguments: + + priors - tuple of priors (priors_list, hmmf_priors, hmmr_priors, + ferf_priors, ferr_priors) + src_index - vocabulary index for source text + tgt_index - vocabulary index for target text + outfile - file object for output + + """ + (priors, preloaded_priors) = pp + priors_list, hmmf_priors, hmmr_priors, ferf_priors, ferr_priors = priors + (priors_tree, ferf_map, ferr_map) = preloaded_priors + + priors_indexed = {} + # TODO(NULL): not yet supported. + for src_word, e in src_index.index.items(): + e = e + 1 + trg_tree = priors_tree.get(src_word) + if trg_tree is None: continue + for trg_word, f in trg_index.index.items(): + f = f + 1 + alpha = trg_tree.get(trg_word) + if alpha is not None: + priors_indexed[(e, f)] = priors_indexed.get((e, f), 0.0) + alpha + + logger.info('%d (of %d) pairs of lexical priors used', + len(priors_indexed), len(priors_list)) + + ferf_indexed = {} + for src_word, e in src_index.index.items(): + e = e + 1 + falphas = ferf_map.get(src_word) + if falphas is None: continue + for fert, alpha in falphas.items(): + ferf_indexed[(e, fert)] = ferf_indexed.get((e, fert), 0.0) + alpha + + ferr_indexed = {} + for trg_word, f in trg_index.index.items(): + f = f + 1 + falphas = ferr_map.get(trg_word) + if falphas is None: continue + for fert, alpha in falphas.items(): + ferr_indexed[(f, fert)] = ferr_indexed.get((f, fert), 0.0) + alpha + + print('%d %d %d %d %d %d %d' % ( + len(src_index)+1, len(trg_index)+1, len(priors_indexed), + len(hmmf_priors), len(hmmr_priors), + len(ferf_indexed), len(ferr_indexed)), + file=outfile) + for (e, f), alpha in sorted(priors_indexed.items()): + print('%d %d %g' % (e, f, alpha), file=outfile) + for jump, alpha in sorted(hmmf_priors.items()): + print('%d %g' % (jump, alpha), file=outfile) + for jump, alpha in sorted(hmmr_priors.items()): + print('%d %g' % (jump, 
alpha), file=outfile) + for (e, fert), alpha in sorted(ferf_indexed.items()): + print('%d %d %g' % (e, fert, alpha), file=outfile) + for (f, fert), alpha in sorted(ferr_indexed.items()): + print('%d %d %g' % (f, fert, alpha), file=outfile) + outfile.flush() + diff --git a/python/eflomal/eflomal.pyx b/python/eflomal/eflomal.pyx index f759184..f396661 100644 --- a/python/eflomal/eflomal.pyx +++ b/python/eflomal/eflomal.pyx @@ -7,7 +7,6 @@ import os import sys import math import subprocess -from tempfile import NamedTemporaryFile import numpy as np @@ -61,6 +60,9 @@ cpdef tuple read_text(pyfile, bool lowercase, int prefix_len, int suffix_len): cpdef write_text(pyfile, tuple sents, int voc_size): """Write a sequence of sentences in the format expected by eflomal + NOTE(token-limit): if more than 1024 tokens are in a sentence, an empty + sentence is written instead of that sentence. + Arguments: pyfile -- Python file object to write to sents -- tuple of sentences, each encoded as np.ndarray(uint32) @@ -74,6 +76,7 @@ cpdef write_text(pyfile, tuple sents, int voc_size): fprintf(f, '%d %d\n', len(sents), voc_size) for sent in sents: n = len(sent) + # NOTE(token-limit). if n < 0x400: i = 0 fprintf(f, '%d', n) @@ -100,6 +103,7 @@ def align( int score_model=0, tuple n_iterations=None, int n_samplers=1, + int n_clean=-1, bool quiet=True, double rel_iterations=1.0, double null_prior=0.2, @@ -120,6 +124,8 @@ def align( not given the numbers will be computed automatically based on rel_iterations n_samplers -- number of independent samplers to run + n_clean -- number of first N sentences to deem clean and use for stats + update (-1 = all). 
quiet -- if True, suppress output rel_iterations -- number of iterations relative to the default """ @@ -144,6 +150,7 @@ def align( '-s', source_filename, '-t', target_filename, '-n', str(n_samplers), + '-c', str(n_clean), '-N', str(null_prior), '-1', str(n_iterations[0])] if quiet: args.append('-q') @@ -160,3 +167,4 @@ def align( if use_gdb: args = ['gdb', '-ex=run', '--args'] + args subprocess.run(args, check=True) + diff --git a/python/eflomal/server.py b/python/eflomal/server.py new file mode 100644 index 0000000..7d95311 --- /dev/null +++ b/python/eflomal/server.py @@ -0,0 +1,166 @@ +from flask import Flask, request, make_response + +import json +import os +import functools +import time +import math + +from eflomal import Aligner, sentences_from_joint_file +from tempfile import TemporaryDirectory + +import logging +logger = logging.getLogger(__name__) + + +DEFAULT_LOG_FORMAT = "[%(asctime)s] [%(process)d] [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s" + +ACCEPT_LOG_LEVELS = ["error", "info", "debug"] + + +class InputFormatException(Exception): + def __init__(self, msg): + self.msg = msg + + +def create_app(): + app = Flask(__name__, instance_relative_config=True) # why? 
+ + app_config_path = os.environ.get('FLASK_APP_CONFIG') + with open(app_config_path) as f: + cfg = json.load(f) + + log_format = cfg.get("log_format", DEFAULT_LOG_FORMAT) + + log_level = cfg.get("log_level", "info") + if not log_level in ACCEPT_LOG_LEVELS: + raise Exception(f"log_level not one of {ACCEPT_LOG_LEVELS}") + + ll = None + if log_level == "error": + ll = logging.ERROR + elif log_level == "info": + ll = logging.INFO + elif log_level == "debug": + ll = logging.DEBUG + + logging.basicConfig( level=ll, format=log_format) + + logger.info("Read application config: %s", cfg) + + aligners = {} + for acfg in cfg["aligners"]: + name = acfg["name"] + pri = acfg["priors"] + logger.info(f"Loading aligner {name} with priors {pri}") + aligners[name] = create_aligner(pri) + + @app.route('/api/align/v1', methods=['POST']) + def alignV1(): + req = request.get_json() + aligner = aligners[req['aligner']] + + iters = [32, 32, 32] + if 'iters' in req and req['iters']: + req_iters = req['iters'] + if "1" in req_iters: iters[0] = req_iters["1"] + if "2" in req_iters: iters[1] = req_iters["2"] + if "3" in req_iters: iters[2] = req_iters["3"] + iters = tuple(iters) + + trust_sents = True + if 'trust_sents' in req: + f = req['trust_sents'] + if type(f) == bool: + trust_sents = f + else: + raise InputFormatException("trust_sents should be bool") + + samplers = 3 # copied default + if 'samplers' in req: + samplers = int(req['samplers']) + + scoring = True + if 'scoring' in req: + f = req['scoring'] + if type(f) == bool: + scoring = f + else: + raise InputFormatException("scoring should be bool") + + num_sents = len(req['sents']) + sent_stoks = [0] * num_sents + sent_ttoks = [0] * num_sents + def input_iter(field, toks): + for n, sent in enumerate(req['sents']): + f = sent[field] + if type(f) == list: + toks[n] = len(f) + f = ' '.join(f) + elif type(f) == str: + toks[n] = len(f.split()) + else: + raise InputFormatException("Sentence should be string or list of strings") + yield f + 
src_iter = input_iter("s", sent_stoks) + trg_iter = input_iter("t", sent_ttoks) + + t10 = time.time() + with TemporaryDirectory() as td: + fwd_fp = os.path.join(td, "req.fwd") + rev_fp = os.path.join(td, "req.rev") + fsc_fp = os.path.join(td, "rsc.fwd") if scoring else None + rsc_fp = os.path.join(td, "rsc.rev") if scoring else None + + aligner.n_iterations = iters + aligner.n_samplers = samplers + try: + aligner.align(src_iter, trg_iter, + links_filename_fwd=fwd_fp, + links_filename_rev=rev_fp, + scores_filename_fwd=fsc_fp, + scores_filename_rev=rsc_fp, + trust_sents=trust_sents, + quiet=log_level != "debug") + except InputFormatException as e: + return make_response(e.msg, 400) + + scores = [] + if scoring: + with open(fsc_fp, 'r') as fscf, open(rsc_fp, 'r') as rscf: + for fs, rs in zip(fscf, rscf): + scores.append((float(fs), float(rs))) + + with open(fwd_fp, 'r') as fwdf, open(rev_fp, 'r') as revf: + fr_pairs = [] + for n, (f, r) in enumerate(zip(fwdf, revf)): + res = { "fwd": f.strip(), "rev": r.strip() } + if scoring: + fs, rs = scores[n] + res["score_fwd"] = fs + res["score_rev"] = rs + res["norm_score_fwd"] = fs - math.log(sent_ttoks[n]) + res["norm_score_rev"] = rs - math.log(sent_stoks[n]) + fr_pairs.append(res) + if len(fr_pairs) != num_sents: + raise Exception(f'Number of alignments differ from inputs: {len(fr_pairs)} != {num_sents}') + res = { "aligns": fr_pairs } + return res + + # Don't forget this. 
+ return app + +def create_aligner(prior_path): + # TODO(config) more config if needed + aligner = Aligner() + with open(prior_path, 'r', encoding='utf-8') as priors_input: + aligner.preload_priors(priors_input) + return aligner + + +def main(): + app = create_app() + + +if __name__ == '__main__': + main() diff --git a/python/scripts/eflomal-align b/python/scripts/eflomal-align index 873b480..5a1f4a6 100755 --- a/python/scripts/eflomal-align +++ b/python/scripts/eflomal-align @@ -11,6 +11,32 @@ import sys, argparse, os logger = logging.getLogger(__name__) +class LineSkipIterator: + def __init__(self, wrapped, skip=None, limit=None): + self._wrapped = wrapped + self._left = limit + if skip: + for i in range(0, skip): next(wrapped) + + def __iter__(self): + return self + + def __next__(self): + if self._left is not None: + if self._left <= 0: + raise StopIteration + else: + self._left -= 1 + return next(self._wrapped) + + def __enter__(self): + self._ctx = self._wrapped.__enter__() + return self + + def __exit__(self, et, ev, tb): + return self._wrapped.__exit__(et, ev, tb) + + def main(): parser = argparse.ArgumentParser( description='eflomal: efficient low-memory aligner') @@ -70,6 +96,14 @@ def main(): parser.add_argument( '-i', '--input', dest='joint_filename', type=str, metavar='filename', help='fast_align style ||| separated file') + parser.add_argument( + '-k', '--skip-lines', dest='skip_lines', default=None, metavar='X', + type=int, + help='Number of initial lines to skip in input') + parser.add_argument( + '-n', '--n-lines', dest='keep_lines', default=None, metavar='X', + type=int, + help='Number of lines to process at most (after optional skipping)') parser.add_argument( '-f', '--forward-links', dest='links_filename_fwd', type=str, metavar='filename', @@ -137,17 +171,28 @@ def main(): logger.info('Reading source/target sentences from %s...', args.joint_filename) src_in_f = stack.enter_context( - open(args.joint_filename, 'r', encoding='utf-8')) + 
LineSkipIterator( + open(args.joint_filename, 'r', encoding='utf-8'), + skip = args.skip_lines, + limit = args.keep_lines)) src_input = sentences_from_joint_file(src_in_f, 0) trg_in_f = stack.enter_context( - open(args.joint_filename, 'r', encoding='utf-8')) + LineSkipIterator( + open(args.joint_filename, 'r', encoding='utf-8'), + skip = args.skip_lines, + limit = args.keep_lines)) trg_input = sentences_from_joint_file(trg_in_f, 1) else: src_input = stack.enter_context( - open(args.source_filename, 'r', encoding='utf-8')) + LineSkipIterator( + open(args.source_filename, 'r', encoding='utf-8'), + skip = args.skip_lines, + limit = args.keep_lines)) trg_input = stack.enter_context( - open(args.target_filename, 'r', encoding='utf-8')) - + LineSkipIterator( + open(args.target_filename, 'r', encoding='utf-8'), + skip = args.skip_lines, + limit = args.keep_lines)) aligner.align(src_input, trg_input, links_filename_fwd=args.links_filename_fwd, links_filename_rev=args.links_filename_rev, diff --git a/python/scripts/eflomal-server b/python/scripts/eflomal-server new file mode 100755 index 0000000..56d4146 --- /dev/null +++ b/python/scripts/eflomal-server @@ -0,0 +1,9 @@ +#!/bin/env bash +set -eu + +FLASK_APP_CONFIG=${FLASK_APP_CONFIG:-$(realpath server_config.json)} gunicorn \ + "eflomal.server:create_app()" \ + -b ${FLASK_HOST:-127.0.0.1}:${FLASK_PORT:-5000} \ + --access-logfile \ + - \ + --workers=${WORKERS:-2} diff --git a/server_config.json.example b/server_config.json.example new file mode 100644 index 0000000..48faaf9 --- /dev/null +++ b/server_config.json.example @@ -0,0 +1,8 @@ +{ + "aligners": [ + { "name": "some-name" + , "priors": "/path/to/prior" + } + ], + "log_level": "info" +} diff --git a/setup.py b/setup.py index 284ccd4..fc6b5a6 100755 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def run(self): setup( name='eflomal', - version='1.0.0-beta2', + version='1.0.1', author='Robert Östling', url='https://github.com/robertostling/eflomal', 
license='GNU GPLv3', @@ -36,6 +36,7 @@ def run(self): long_description_content_type='text/markdown', install_requires=install_requires, tests_require=tests_require, + python_requires='>=3.12', extras_require={'test': tests_require}, packages=['eflomal'], package_dir={'': 'python'}, @@ -43,6 +44,6 @@ def run(self): 'eflomal': ['bin/eflomal'] }, ext_modules=cythonize(cyalign_ext, language_level='3'), - scripts=['python/scripts/eflomal-align', 'python/scripts/eflomal-makepriors'], + scripts=['python/scripts/eflomal-align', 'python/scripts/eflomal-makepriors', 'python/scripts/eflomal-server'], cmdclass={'build_py': build_py} ) diff --git a/src/eflomal.c b/src/eflomal.c index bb504d3..dac532d 100644 --- a/src/eflomal.c +++ b/src/eflomal.c @@ -100,7 +100,7 @@ struct text_alignment { // this number of sentences contain clean parallel data and should // contribute to the statistics (anything after this should still be // aligned, but don't trust the statistics): - size_t n_clean; // 0 (the default) means all sentences should be used + int32_t n_clean; // -1 (the default) means all sentences should be used count null_prior; }; @@ -230,7 +230,7 @@ void text_alignment_sample( count *jump_counts = ta->jump_counts; count *fert_counts = ta->fert_counts; const size_t n_sentences = - ta->n_clean? ta->n_clean: ta->target->n_sentences; + ta->n_clean >= 0 ? ta->n_clean: ta->target->n_sentences; // the fertility distributions (unlike the jump and lexical distributions) // are sampled explicitly, and the categorical distributions are fixed @@ -683,7 +683,7 @@ void text_alignment_make_counts(struct text_alignment *ta) { } } const size_t n_sentences = - ta->n_clean? ta->n_clean: ta->target->n_sentences; + ta->n_clean >= 0 ? 
ta->n_clean: ta->target->n_sentences; for (size_t sent=0; sentsentence_links[sent]; if (links == NULL) continue; @@ -971,7 +971,7 @@ struct text_alignment *text_alignment_create( ta->model = 1; ta->source = source; ta->target = target; - ta->n_clean = 0; + ta->n_clean = -1; // These should be initialized with text_alignment_load_priors() ta->source_prior = NULL; @@ -1136,6 +1136,20 @@ struct text* text_read(const char *filename) { return text; } +void check_openmp() { + int n_threads = 0; +#pragma omp parallel + { +#pragma omp atomic + n_threads += 1; + } + if (n_threads > 1) { + fprintf(stderr, "OpenMP is active! Number of threads: %d\n", n_threads); + } else { + fprintf(stderr, "Running without OpenMP concurrency?\n"); + } +} + static void align( int reverse, const struct text *source, @@ -1144,6 +1158,7 @@ static void align( int score_model, double null_prior, int n_samplers, + int n_clean, int quiet, const int *n_iters, const char *links_filename, @@ -1152,6 +1167,7 @@ static void align( const char *priors_filename) { double t0; + const char fr = reverse ? 
'R' : 'F'; random_state state; struct text_alignment *tas[n_samplers]; @@ -1161,6 +1177,7 @@ static void align( for (int i=0; in_clean = n_clean; tas[i]->null_prior = null_prior; if (priors_filename != NULL) { // TODO: since read-only, could use the pointer from tas[0] @@ -1174,8 +1191,8 @@ static void align( } } if (!quiet) - fprintf(stderr, "Created alignment structures: %.3f s\n", - seconds() - t0); + fprintf(stderr, "[%c] Created alignment structures: %.3f s\n", + fr, seconds() - t0); t0 = seconds(); #pragma omp parallel for @@ -1188,13 +1205,15 @@ static void align( text_alignment_randomize(tas[i], &local_state); } if (!quiet) - fprintf(stderr, "Randomized alignment: %.3f s\n", seconds() - t0); + fprintf(stderr, "[%c] Randomized alignment: %.3f s\n", fr, + seconds() - t0); for (int m=1; m<=model; m++) { if (n_iters[m-1]) { if (!quiet) - fprintf(stderr, "Aligning with model %d (%d iterations)\n", - m, n_iters[m-1]); + fprintf(stderr, + "[%c] Aligning with model %d (%d iterations)\n", fr, m, + n_iters[m-1]); t0 = seconds(); #pragma omp parallel for @@ -1210,23 +1229,28 @@ static void align( for (int j=0; jtarget->n_sentences); FILE *file = (!strcmp(links_filename, "-"))? stdout : fopen(links_filename, "w"); @@ -1253,10 +1278,11 @@ static void align( FILE *file = (!strcmp(scores_filename, "-"))? 
stdout : fopen(scores_filename, "w"); + t0 = seconds(); if (!quiet) fprintf(stderr, - "Computing scores using model %d for %Zu sentences\n", - score_model, ta->source->n_sentences); + "[%c] Computing scores using model %d for %Zu sentences\n", + fr, score_model, ta->source->n_sentences); // Switch to whatever model is specified for scoring ta->model = score_model; @@ -1267,6 +1293,8 @@ static void align( if (file != stdout) fclose(file); free(scores); + if (!quiet) + fprintf(stderr, "[%c] Scoring took: %.3f s\n", fr, seconds() - t0); } @@ -1292,14 +1320,14 @@ int main(int argc, char *argv[]) { *stats_filename = NULL, *scores_filename_fwd = NULL, *scores_filename_rev = NULL; int n_iters[3]; - int n_samplers = 1, quiet = 0, model = -1, score_model = -1; + int n_samplers = 1, n_clean = -1, quiet = 0, model = -1, score_model = -1; double null_prior = 0.2; n_iters[0] = 1; n_iters[1] = 1; n_iters[2] = 1; omp_set_nested(1); - while ((opt = getopt(argc, argv, "s:t:p:f:r:S:F:R:1:2:3:n:qm:M:N:h")) + while ((opt = getopt(argc, argv, "s:t:p:f:r:S:F:R:1:2:3:n:c:qm:M:N:h")) != -1) { switch(opt) { @@ -1315,6 +1343,7 @@ int main(int argc, char *argv[]) { case '2': n_iters[1] = atoi(optarg); break; case '3': n_iters[2] = atoi(optarg); break; case 'n': n_samplers = atoi(optarg); break; + case 'c': n_clean = atoi(optarg); break; case 'q': quiet = 1; break; case 'm': model = atoi(optarg); if (model < 1 || model > 3) { @@ -1342,6 +1371,10 @@ int main(int argc, char *argv[]) { return 1; } + if (!quiet) { + check_openmp(); + } + if (score_model == -1) score_model = model; t0 = seconds(); @@ -1371,7 +1404,7 @@ int main(int argc, char *argv[]) { (!reverse && links_filename_fwd == NULL && links_filename_rev == NULL)) align(reverse, source, target, model, score_model, null_prior, - n_samplers, + n_samplers, n_clean, quiet, n_iters, links_filename, stats_filename, scores_filename, priors_filename); }