diff --git a/.gitignore b/.gitignore index 3c02613..1c9424a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ build __pycache__ .history-mahmoudi SLTev.egg-info +elitr-testset/ \ No newline at end of file diff --git a/README.md b/README.md index 0b5fa3d..bd92b26 100644 --- a/README.md +++ b/README.md @@ -194,8 +194,26 @@ Demo example: ``` ASReval -i sample-data/sample.en.en.asrt sample-data/sample.en.OSt sample-data/sample.en.OStt -f asrt ost ostt ``` +#### Parsing index files +See `SLTev/index_parser.py` for detailed description. Structure of the index file: +``` +# SRC -> *. +# REF -> *. +# ALIGN -> *. +PATH_TO_DIRECTORY +PATH_TO_ANOTHER_DIRECTORY_WITH_SAME_EXTENSIONS + +# SRC -> *. +# REF -> *. +PATH_TO_DIRECTORY_WITH_DIFFERENT_EXTENSIONS +``` +`SRC` and `REF` annotations are mandatory. Specifying a `SRC` annotation "clears" the rest of the annotations. +Usage: +``` +SLTIndexParser path_to_index_file path_to_dataset +``` #### Notes 1. *.asrt and *.slt files have timestamps and, *.mt and *.asr do not have them. 2. For using ``MTeval``, ``SLTeval``, ``ASReval`` commands, you do not need to follow naming templates, it is the ``-f`` parameter that specifies the use of the file. diff --git a/SLTev/index_parser.py b/SLTev/index_parser.py new file mode 100644 index 0000000..ad48b13 --- /dev/null +++ b/SLTev/index_parser.py @@ -0,0 +1,66 @@ +import os +import sys +import glob +import re +import json + +""" +Read an index file with meta-annotations (SRC, REF, ALIGNMENT...) + +Meta-annotation format: +# NAME -> *. + +Return an iterable of dicts containing paths to the specified files +If invoked on the command line, return a JSON of the list of dicts +Multiple directories can share the same meta-annotations, as long as there isn't a blank line between them +SRC line resets the meta-annotations +SRC and REF are mandatory annotations + +Example: + +# SRC -> *. +# REF -> *. +PATH_TO_DIRECTORY +PATH_TO_ANOTHER_DIRECTORY_WITH_SAME_PREFIXES + +# SRC -> *. +# REF -> *. 
# Matches a meta-annotation line such as "# SRC -> *.ext": a "#", a name, the
# "->" separator and a glob-style extension, with flexible whitespace between.
_ANNOTATION_RE = re.compile(r"^#\s*(\S+)\s*->\s*(\S+)$")


def _parseAnnotation(line):
    """Split a "# NAME -> *EXT" annotation line into (name, extension)."""
    match = _ANNOTATION_RE.match(line)
    if match is None:
        raise ValueError(f"{line} -- malformed annotation, expected '# NAME -> *EXTENSION'")
    fileType, extension = match.groups()
    if not extension.startswith("*"):
        raise ValueError(f"{line} -- extension must start with a *")
    return fileType, extension


def _collectEntries(directory, fileExtensions):
    """Yield one {annotation name: real path} dict per SRC file found in directory."""
    sourceExtension = fileExtensions["SRC"]
    sourceSuffix = sourceExtension[1:]  # extension without the leading "*"

    # glob returns matches in arbitrary order; sort so the output is reproducible.
    for source in sorted(glob.glob(f"{directory}/{sourceExtension}")):
        # Strip the suffix literally. Building a regex from the extension would
        # treat ".", "+", "(" ... as metacharacters and mis-strip or crash.
        if source.endswith(sourceSuffix):
            stem = source[: len(source) - len(sourceSuffix)]
        else:
            stem = source

        evalEntry = {}
        for name, extension in fileExtensions.items():
            # The SRC file itself comes from glob; every other annotated file
            # must exist next to it under the same stem.
            matchingFileName = stem + extension[1:]
            if not os.path.exists(matchingFileName):
                raise FileNotFoundError(f"{name} {extension} -- {matchingFileName} does not exist")
            evalEntry[name] = os.path.realpath(matchingFileName)
        yield evalEntry


def parseIndexFile(indexFilePath, testsetPath):
    """Parse an index file and yield the file sets it describes.

    The index file mixes two kinds of lines:

    * Meta-annotations of the form ``# NAME -> *EXTENSION`` (for example SRC,
      REF). A SRC annotation discards all previously collected annotations;
      any other annotation is added to the current set. Lines starting with
      ``#`` that do not contain ``->`` are ignored as comments.
    * Directory lines, interpreted relative to ``testsetPath``. Every file in
      that directory matching the SRC extension produces one entry.

    Blank lines are ignored; annotations stay in effect until the next SRC
    annotation.

    Args:
        indexFilePath: Path of the index file to read.
        testsetPath: Directory that the directory lines are relative to.

    Yields:
        dict mapping each annotation name (e.g. "SRC", "REF") to the real path
        of the corresponding file, one dict per source file, in sorted order
        of the source paths within each directory.

    Raises:
        ValueError: If an annotation line is malformed, its extension does not
            start with ``*``, or a directory line appears before both SRC and
            REF have been specified.
        FileNotFoundError: If a file required by an annotation does not exist
            for some source file.
    """
    fileExtensions = {}  # annotation name -> glob-style extension
    with open(indexFilePath) as indexFile:
        for rawLine in indexFile:
            line = rawLine.strip()
            if not line:
                continue

            if line.startswith("#"):
                if "->" in line:
                    fileType, extension = _parseAnnotation(line)
                    if fileType == "SRC":
                        fileExtensions = {}
                    fileExtensions[fileType] = extension
                continue

            if "SRC" not in fileExtensions or "REF" not in fileExtensions:
                raise ValueError(f"{line} -- SRC or REF not specified")
            yield from _collectEntries(f"{testsetPath}/{line}", fileExtensions)


def main():
    """Command-line entry point: print the parsed index as a JSON list.

    Registered in setup.py as the console script
    ``SLTIndexParser = SLTev.index_parser:main``.
    """
    if len(sys.argv) < 3:
        sys.exit("Usage: SLTIndexParser path_to_index_file path_to_dataset")
    paths = list(parseIndexFile(sys.argv[1], sys.argv[2]))
    print(json.dumps(paths))


if __name__ == "__main__":
    main()