diff --git a/src/alexp.py b/src/alexp.py index 3567cb2..4c29151 100644 --- a/src/alexp.py +++ b/src/alexp.py @@ -31,7 +31,7 @@ import re,nltk, time, random from os.path import expanduser from os import environ, path from Aelius.Extras import carrega -from Aelius import AnotaCorpus +from Aelius import AnotaCorpus, Toqueniza from unicodedata import normalize sentenca_anotada="" @@ -42,7 +42,7 @@ def toqueniza(s): """ regex = re.compile('[%s]' % re.escape('“”')) decodificada=regex.sub('"',s.replace("–", "-").replace("—", "-")).decode("utf-8") - return AnotaCorpus.TOK_PORT.tokenize(decodificada) + return Toqueniza.TOK_PORT.tokenize(decodificada) def getAnaliseMorfologica(): return sentenca_anotada @@ -60,9 +60,14 @@ def etiquetaSentenca(s): anotada_corrigida = [] for x in anotada: if x[1] not in tag_punctuation: - if x[1] == "NUM" and x[1].isdigit(): - anotada_corrigida.append(x) - continue + if x[1] == "NUM": + try: + float(x[0].replace(',', '.')) + anotada_corrigida.append(x) + continue + except: + pass + tupla = [regex.sub('',x[0]).lower(),x[1]] if tupla[0] != "": anotada_corrigida.append(tupla) else: -- libgit2 0.21.2