diff --git a/ovos_number_parser/numbers_ca.py b/ovos_number_parser/numbers_ca.py index 4ba2a78..9f4bf21 100644 --- a/ovos_number_parser/numbers_ca.py +++ b/ovos_number_parser/numbers_ca.py @@ -1,4 +1,5 @@ from typing import List +import collections from ovos_number_parser.util import (convert_to_mixed_fraction, look_for_fractions, is_numeric, tokenize, Token) @@ -122,6 +123,140 @@ 90: 'noranta' } +# source: https://ca.wikipedia.org/wiki/Escales_curta_i_llarga +_LONG_SCALE_CA = collections.OrderedDict([ + (100, 'cent'), + (1000, 'mil'), + (1000000, 'milions'), + (1e9, "miliards"), + (1e12, "bilions"), + (1e15, "biliards"), + (1e18, "trilions"), + (1e21, "triliards"), + (1e24, "quadrilions"), + (1e27, "quadriliards"), + (1e30, "quintilions"), + (1e33, "quintiliards"), + (1e36, "sextilió"), + (1e39, "sextiliard"), + (1e42, "septilió"), + (1e45, "septiliard"), + (1e48, "octilió"), + (1e51, "octiliards"), + (1e54, "nonilió"), + (1e57, "noniliards"), + (1e60, "decilió"), + (1e63, "deciliard"), + (1e66, "undecilió"), + (1e69, "undeciliard"), + (1e72, "duodecilió"), + (1e75, "duodeciliard"), + (1e78, "tredecilió"), + (1e81, "tredeciliard"), + (1e84, "quattuordecilió"), + (1e87, "quattuordeciliard"), + (1e90, "quindecilió"), + (1e93, "quindeciliard"), + (1e96, "sexdecilió"), + (1e99, "sexdeciliard"), + (1e102, "septendecilió"), + (1e105, "septendeciliard"), + (1e108, "octodecilió"), + (1e111, "octodeciliard"), + (1e114, "novemdecilió"), + (1e117, "novemdeciliard"), + (1e120, "vigintilions"), + (1e123, "vigintiliard"), + + (1e306, "unquinquagintilió"), + (1e312, "duoquinquagintilió"), + (1e360, "sexagintilió"), + (1e363, "sexagintiliard"), + (1e420, "septuagintilió"), + (1e423, "septuagintiliard"), + (1e480, "octogintilió"), + (1e483, "octogintilliard"), + (1e540, "nonagintilió"), + (1e543, "nonagintiliard"), + (1e600, "centilió"), + (1e603, "centiliard") +]) + +_SHORT_SCALE_CA = collections.OrderedDict([ + (100, 'cent'), + (1000, 'mil'), + (1000000, 'milions'), + (1e9, 'bilions'), + (1e12, "trilions"), + (1e15, "quadrilions"), + (1e18, "quintilions"), + (1e21, "sextilions"), + (1e24, "septilions"), + (1e27, "octilions"), + (1e30, "nonilions"), + (1e33, "decilions"), + (1e36, "undecilions"), + (1e39, "duodecilions"), + (1e42, "tredecilions"), + (1e45, "quattordecilions"), + (1e48, "quindecilions"), + (1e51, "sexdecilions"), + (1e54, "septendecilions"), + (1e57, "octodecilions"), + (1e60, "novemdecilions"), + (1e63, "vigintilions"), + (1e66, "unvigintilions"), + (1e69, "duovigintilions"), + (1e72, "tresvigintilions"), + (1e75, "quattuorvigintilions"), + (1e78, "quinquavigintilions"), + (1e81, "sexvigintilions"), + (1e84, "septemvigintilions"), + (1e87, "octovigintilions"), + (1e90, "novemvigintilions"), + (1e93, "trigintilions"), + (1e96, "untrigintilions"), + (1e99, "duotrigintilions"), + (1e102, "trestrigintilions"), + (1e105, "quattuortrigintilions"), + (1e108, "quinquatrigintilions"), + (1e111, "sestrigintilions"), + (1e114, "septentrigintilions"), + (1e117, "octotrigintilions"), + (1e120, "noventrigintilions"), + (1e123, "quadragintilions"), + (1e153, "quinquagintilions"), + (1e183, "sexagintilions"), + (1e213, "septuagintilions"), + (1e243, "octogintilions"), + (1e273, "nonagintilions"), + (1e303, "centilions"), + (1e306, "uncentilions"), + (1e309, "duocentilions"), + (1e312, "trescentilions"), + (1e333, "decicentilions"), + (1e336, "undecicentilions"), + (1e363, "viginticentilions"), + (1e366, "unviginticentilions"), + (1e393, "trigintacentilions"), + (1e423, "quadragintacentilions"), + (1e453, "quinquagintacentilions"), + (1e483, "sexagintacentilions"), + (1e513, "septuagintacentilions"), + (1e543, "octogintacentilions"), + (1e573, "nonagintacentilions"), + (1e603, "ducentilions"), + (1e903, "trecentilions"), + (1e1203, "quadringentilions"), + (1e1503, "quingentilions"), + (1e1803, "sescentilions"), + (1e2103, "septingentilions"), + (1e2403, "octingentilions"), + (1e2703, "nongentilions"), + (1e3003, "milinilions") +]) + + _TENS_CA = { "vint": 20, "trenta": 30, @@ -207,52 +342,155 @@ def nice_number_ca(number, speech, denominators=range(1, 21)): # un desè return_string = 'un {}'.format(den_str) else: - # tres mig + # quatre cinquens return_string = '{} {}'.format(num, den_str) - # inteiros >10 - elif num == 1: - # trenta-un - return_string = '{}-{}'.format(whole, den_str) - # inteiros >10 com fracções + else: # vint i 3 desens return_string = '{} i {} {}'.format(whole, num, den_str) # plural if num > 1: - return_string += 's' + if return_string[-1] == "è": + return_string = return_string[:-1] + "ens" + else: + return_string += 's' return return_string -def pronounce_number_ca(number, places=2): +def pronounce_number_ca(number, places=2, short_scale=False, scientific=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'cinc coma dos' Args: - number(float or int): the number to pronounce (under 100) + number(float or int): the number to pronounce places(int): maximum decimal places to speak Returns: (str): The pronounced number """ - if abs(number) >= 100: - # TODO: Support n > 100 - return str(number) - + result = "" if number < 0: result = "menys " number = abs(number) - if number >= 20: - tens = int(number - int(number) % 10) - ones = int(number - tens) - result += _NUM_STRING_CA[tens] - if ones > 0: - if tens == 20: - result += "-i-" + _NUM_STRING_CA[ones] - else: - result += "-" + _NUM_STRING_CA[ones] + if number == float("inf"): + return "infinit" + elif number == float("-inf"): + return "menys infinit" + + number_names = _NUM_STRING_CA.copy() + + if short_scale: + number_names.update(_SHORT_SCALE_CA) else: - result += _NUM_STRING_CA[int(number)] + number_names.update(_LONG_SCALE_CA) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] + + if short_scale==True: + hundreds = [_SHORT_SCALE_CA[n] for n in _SHORT_SCALE_CA.keys()] + else: + hundreds = [_LONG_SCALE_CA[n] for n in _LONG_SCALE_CA.keys()] + + + + if number in number_names: # check for a direct match + result += number_names[number] + else: + def _sub_thousand(n): + assert 0 <= n <= 999 + if n <= 19: + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + _deci = tens[q - 1] + _unit = r + _partial = _deci + if _unit > 0: + if _deci == "vint": + _partial = _partial + "-i-" + number_names[_unit] + else: + _partial = _partial + "-" + number_names[_unit] + return _partial + else: + q, r = divmod(n, 100) + if q == 1: + _partial = "cent" + else: + _partial = digits[q] + "-cents" + _partial += ( + " " + _sub_thousand(r) if r else "") # separa centenars + return _partial + + def _short_scale(n): + if n >= max(_SHORT_SCALE_CA.keys()): + return "número extremadament gran" + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z) + if i: + number += " " # separa ordres de magnitud + number += hundreds[i] + if number == "un mil": + number = "mil" + res.append(number) + + return " ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + def _long_scale(n): + if n >= max(_LONG_SCALE_CA.keys()): + return "número extremadament gran" + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000000)): + if not z: + continue + number = pronounce_number_ca(z, places, True, scientific) + # strip off the comma after the thousand + if i: + # plus one as we skip 'thousand' + # (and 'hundred', but this is excluded by index value) + number = number.replace(',', '') + number += " " + hundreds[i + 1] + res.append(number) + return " ".join(reversed(res)) + + if short_scale: + result += _short_scale(number) + else: + result += _long_scale(number) + + big_nums = [_LONG_SCALE_CA[a] for a in _LONG_SCALE_CA] + if result in big_nums: + if result[-3:] == "rds": + result = "un " + result[:-1] + elif result[-3:] == "ons": + result = "un " + result[:-3] + "ó" + if len(result.split(" ")) > 1 and result.split(" ")[0] == "un": + big_num = result.split(" ")[1] + if big_num in big_nums: + new_big_num = big_num + if big_num[-3:] == "rds": + new_big_num = big_num[:-1] + + elif big_num[-3:] == "ons": + new_big_num = big_num[:-3] + "ó" + result = result.replace(big_num, new_big_num) # Deal with decimal part, in Catalan is commonly used the comma # instead the dot. Decimal part can be written both with comma diff --git a/ovos_number_parser/numbers_es.py b/ovos_number_parser/numbers_es.py index fb77dbb..f9660ff 100644 --- a/ovos_number_parser/numbers_es.py +++ b/ovos_number_parser/numbers_es.py @@ -8,7 +8,7 @@ _NUM_STRING_ES = { 0: 'cero', - 1: 'uno', + 1: 'uno', 2: 'dos', 3: 'tres', 4: 'cuatro', @@ -24,7 +24,7 @@ 14: 'catorce', 15: 'quince', 16: 'dieciséis', - 17: 'diecisete', + 17: 'diecisiete', 18: 'dieciocho', 19: 'diecinueve', 20: 'veinte', @@ -126,107 +126,107 @@ # https://www.grobauer.at/es_eur/zahlnamen.php _LONG_SCALE_ES = OrderedDict([ - (100, 'centena'), - (1000, 'millar'), - (1000000, 'millón'), - (1e9, "millardo"), - (1e12, "billón"), - (1e18, 'trillón'), - (1e24, "cuatrillón"), - (1e30, "quintillón"), - (1e36, "sextillón"), - (1e42, "septillón"), - (1e48, "octillón"), - (1e54, "nonillón"), - (1e60, "decillón"), - (1e66, "undecillón"), - (1e72, "duodecillón"), - (1e78, "tredecillón"), - (1e84, "cuatrodecillón"), - (1e90, "quindecillón"), - (1e96, "sexdecillón"), - (1e102, "septendecillón"), - (1e108, "octodecillón"), - (1e114, "novendecillón"), - (1e120, "vigintillón"), - (1e306, "unquinquagintillón"), - (1e312, "duoquinquagintillón"), - (1e336, "sexquinquagintillón"), - (1e366, "unsexagintillón") + (100, 'cien'), + (1000, 'mil'), + (1000000, 'millones'), + (1e9, "millardos"), + (1e12, "billones"), + (1e18, 'trillones'), + (1e24, "cuatrillones"), + (1e30, "quintillones"), + (1e36, "sextillones"), + (1e42, "septillones"), + (1e48, "octillones"), + (1e54, "nonillones"), + (1e60, "decillones"), + (1e66, "undecillones"), + (1e72, "duodecillones"), + (1e78, "tredecillones"), + (1e84, "cuatrodecillones"), + (1e90, "quindecillones"), + (1e96, "sexdecillones"), + (1e102, "septendecillones"), + (1e108, "octodecillones"), + (1e114, "novendecillones"), + (1e120, "vigintillones"), + (1e306, "unquinquagintillones"), + (1e312, "duoquinquagintillones"), + (1e336, "sexquinquagintillones"), + (1e366, "unsexagintillones") ]) _SHORT_SCALE_ES = OrderedDict([ - (100, 'centena'), - (1000, 'millar'), - (1000000, 'millón'), - (1e9, "billón"), - (1e12, 'trillón'), - (1e15, "cuatrillón"), - (1e18, "quintillón"), - (1e21, "sextillón"), - (1e24, "septillón"), - (1e27, "octillón"), - (1e30, "nonillón"), - (1e33, "decillón"), - (1e36, "undecillón"), - (1e39, "duodecillón"), - (1e42, "tredecillón"), - (1e45, "cuatrodecillón"), - (1e48, "quindecillón"), - (1e51, "sexdecillón"), - (1e54, "septendecillón"), - (1e57, "octodecillón"), - (1e60, "novendecillón"), - (1e63, "vigintillón"), - (1e66, "unvigintillón"), - (1e69, "uuovigintillón"), - (1e72, "tresvigintillón"), - (1e75, "quattuorvigintillón"), - (1e78, "quinquavigintillón"), - (1e81, "qesvigintillón"), - (1e84, "septemvigintillón"), - (1e87, "octovigintillón"), - (1e90, "novemvigintillón"), - (1e93, "trigintillón"), - (1e96, "untrigintillón"), - (1e99, "duotrigintillón"), - (1e102, "trestrigintillón"), - (1e105, "quattuortrigintillón"), - (1e108, "quinquatrigintillón"), - (1e111, "sestrigintillón"), - (1e114, "septentrigintillón"), - (1e117, "octotrigintillón"), - (1e120, "noventrigintillón"), - (1e123, "quadragintillón"), - (1e153, "quinquagintillón"), - (1e183, "sexagintillón"), - (1e213, "septuagintillón"), - (1e243, "octogintillón"), - (1e273, "nonagintillón"), - (1e303, "centillón"), - (1e306, "uncentillón"), - (1e309, "duocentillón"), - (1e312, "trescentillón"), - (1e333, "decicentillón"), - (1e336, "undecicentillón"), - (1e363, "viginticentillón"), - (1e366, "unviginticentillón"), - (1e393, "trigintacentillón"), - (1e423, "quadragintacentillón"), - (1e453, "quinquagintacentillón"), - (1e483, "sexagintacentillón"), - (1e513, "septuagintacentillón"), - (1e543, "ctogintacentillón"), - (1e573, "nonagintacentillón"), - (1e603, "ducentillón"), - (1e903, "trecentillón"), - (1e1203, "quadringentillón"), - (1e1503, "quingentillón"), - (1e1803, "sexcentillón"), - (1e2103, "septingentillón"), - (1e2403, "octingentillón"), - (1e2703, "nongentillón"), - (1e3003, "millinillón") + (100, 'cien'), + (1000, 'mil'), + (1000000, 'millones'), + (1e9, "billones"), + (1e12, 'trillones'), + (1e15, "cuatrillones"), + (1e18, "quintillones"), + (1e21, "sextillones"), + (1e24, "septillones"), + (1e27, "octillones"), + (1e30, "nonillones"), + (1e33, "decillones"), + (1e36, "undecillones"), + (1e39, "duodecillones"), + (1e42, "tredecillones"), + (1e45, "cuatrodecillones"), + (1e48, "quindecillones"), + (1e51, "sexdecillones"), + (1e54, "septendecillones"), + (1e57, "octodecillones"), + (1e60, "novendecillones"), + (1e63, "vigintillones"), + (1e66, "unvigintillones"), + (1e69, "uuovigintillones"), + (1e72, "tresvigintillones"), + (1e75, "quattuorvigintillones"), + (1e78, "quinquavigintillones"), + (1e81, "qesvigintillones"), + (1e84, "septemvigintillones"), + (1e87, "octovigintillones"), + (1e90, "novemvigintillones"), + (1e93, "trigintillones"), + (1e96, "untrigintillones"), + (1e99, "duotrigintillones"), + (1e102, "trestrigintillones"), + (1e105, "quattuortrigintillones"), + (1e108, "quinquatrigintillones"), + (1e111, "sestrigintillones"), + (1e114, "septentrigintillones"), + (1e117, "octotrigintillones"), + (1e120, "noventrigintillones"), + (1e123, "quadragintillones"), + (1e153, "quinquagintillones"), + (1e183, "sexagintillones"), + (1e213, "septuagintillones"), + (1e243, "octogintillones"), + (1e273, "nonagintillones"), + (1e303, "centillones"), + (1e306, "uncentillones"), + (1e309, "duocentillones"), + (1e312, "trescentillones"), + (1e333, "decicentillones"), + (1e336, "undecicentillones"), + (1e363, "viginticentillones"), + (1e366, "unviginticentillones"), + (1e393, "trigintacentillones"), + (1e423, "quadragintacentillones"), + (1e453, "quinquagintacentillones"), + (1e483, "sexagintacentillones"), + (1e513, "septuagintacentillones"), + (1e543, "octogintacentillones"), + (1e573, "nonagintacentillones"), + (1e603, "ducentillones"), + (1e903, "trecentillones"), + (1e1203, "quadringentillones"), + (1e1503, "quingentillones"), + (1e1803, "sexcentillones"), + (1e2103, "septingentillones"), + (1e2403, "octingentillones"), + (1e2703, "nongentillones"), + (1e3003, "millinillones") ]) # TODO: female forms. @@ -608,53 +608,164 @@ def nice_number_es(number, speech=True, denominators=range(1, 21)): return strNumber -def pronounce_number_es(number, places=2): +def pronounce_number_es(number, places=2, short_scale=False): """ Convert a number to it's spoken equivalent For example, '5.2' would return 'cinco coma dos' Args: - num(float or int): the number to pronounce (under 100) + num(float or int): the number to pronounce places(int): maximum decimal places to speak Returns: (str): The pronounced number """ - if abs(number) >= 100: - # TODO: Soporta a números por encima de 100 - return str(number) - + result = "" if number < 0: result = "menos " number = abs(number) - # del 21 al 29 tienen una pronunciación especial - if 20 <= number <= 29: - tens = int(number - int(number) % 10) - ones = int(number - tens) - result += _NUM_STRING_ES[tens] - if ones > 0: - result = result[:-1] - # a veinte le quitamos la "e" final para construir los - # números del 21 - 29. Pero primero tenemos en cuenta - # las excepciones: 22, 23 y 26, que llevan tilde. - if ones == 2: - result += "idós" - elif ones == 3: - result += "itrés" - elif ones == 6: - result += "iséis" - else: - result += "i" + _NUM_STRING_ES[ones] - elif number >= 30: # de 30 en adelante - tens = int(number - int(number) % 10) - ones = int(number - tens) - result += _NUM_STRING_ES[tens] - if ones > 0: - result += " y " + _NUM_STRING_ES[ones] + number_names = _NUM_STRING_ES.copy() + + if short_scale: + number_names.update(_SHORT_SCALE_ES) + else: + number_names.update(_LONG_SCALE_ES) + + digits = [number_names[n] for n in range(0, 20)] + + tens = [number_names[n] for n in range(10, 100, 10)] + + if short_scale==True: + hundreds = [_SHORT_SCALE_ES[n] for n in _SHORT_SCALE_ES.keys()] + else: + hundreds = [_LONG_SCALE_ES[n] for n in _LONG_SCALE_ES.keys()] + + + + if number in number_names: # check for a direct match + result += number_names[number] else: - result += _NUM_STRING_ES[int(number)] + def _sub_thousand(n): + assert 0 <= n <= 999 + if n <= 19: + return digits[n] + elif n <= 99: + q, r = divmod(n, 10) + _deci = tens[q - 1] + _unit = r + _partial = _deci + if _unit > 0: + if q == 2: + _partial = _partial[:-1] + if r == 2: + _partial += "idós" + elif r == 3: + _partial += "itrés" + elif r == 6: + _partial += "iséis" + else: + _partial += "i" + number_names[_unit] + else: + _partial = _partial + " y " + number_names[_unit] + return _partial + else: + q, r = divmod(n, 100) + if q == 1: + _partial = "ciento" + elif q == 5: + _partial = "quinientos" + elif q == 7: + _partial = "setecientos" + elif q == 9: + _partial = "novecientos" + else: + _partial = digits[q] + "cientos" + _partial += ( + " " + _sub_thousand(r) if r else "") # separa centenars + + return _partial + + def _un_uno(number): + if number[-9:] == "veintiuno": + number = number.replace("veintiuno", "veintiún") + elif number[-3:] == "uno": + number = number[:-1] + return number + + def _short_scale(n): + if n >= max(_SHORT_SCALE_ES.keys()): + return "número exageradamente grande" + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000)): + if not z: + continue + number = _sub_thousand(z) + if i > 0: + number = _un_uno(number) + if i: + number += " " # separa ordres de magnitud + number += hundreds[i] + if number == "un mil": + number = "mil" + res.append(number) + return " ".join(reversed(res)) + + def _split_by(n, split=1000): + assert 0 <= n + res = [] + while n: + n, r = divmod(n, split) + res.append(r) + return res + + def _long_scale(n): + if n >= max(_LONG_SCALE_ES.keys()): + return "número exageradamente grande" + n = int(n) + assert 0 <= n + res = [] + for i, z in enumerate(_split_by(n, 1000000)): + if not z: + continue + number = pronounce_number_es(z, places, True) + + + # strip off the comma after the thousand + if i: + # plus one as we skip 'thousand' + # (and 'hundred', but this is excluded by index value) + number = number.replace(',', '') + number = _un_uno(number) + number += " " + hundreds[i + 1] + res.append(number) + return " ".join(reversed(res)) + + if short_scale: + result += _short_scale(number) + else: + result += _long_scale(number) + + big_nums = [_LONG_SCALE_ES[a] for a in _LONG_SCALE_ES] + if result in big_nums: + + if result[-4:] == "rdos" or result[-4:] == "ones": + result = "un " + result[:-1] + + if len(result.split(" ")) > 1 and result.split(" ")[0] in ["un", "uno"]: + big_num = result.split(" ")[1] + if big_num in big_nums: + new_big_num = big_num + if big_num[-4:] == "rdos": + new_big_num = big_num[:-1] + + elif big_num[-4:] == "ones": + new_big_num = big_num[:-4] + "ón" + result = result.replace(big_num, new_big_num) + # Deal with decimal part, in spanish is commonly used the comma # instead the dot. Decimal part can be written both with comma