From 62051edaeccbf00b5ec4f061a590dc3ce913edd4 Mon Sep 17 00:00:00 2001 From: Erickson Silva Date: Mon, 10 Oct 2016 19:02:20 -0300 Subject: [PATCH] Altera tokenizador para manter hífens --- src/ClassificaSentencas.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ClassificaSentencas.py b/src/ClassificaSentencas.py index bc43707..3392920 100644 --- a/src/ClassificaSentencas.py +++ b/src/ClassificaSentencas.py @@ -49,9 +49,8 @@ class ClassificaSentencas(object): decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8") except: decodificada = s.decode("utf-8") - - return Toqueniza.TOK_PORT.tokenize(decodificada) - + return Toqueniza.TOK_PORT_LX.tokenize(decodificada) + def obter_classificacao_morfologica(self): return self.sentenca_anotada @@ -63,7 +62,7 @@ class ClassificaSentencas(object): while (anotada[0][1] is None): time.sleep(random.choice(sleep_times)) anotada = AnotaCorpus.anota_sentencas([s],etiquetador,"hunpos")[0] - regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,-./:;<=>?@[\\]^_`{|}~')) + regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,./:;<=>?@[\\]^_`{|}~')) tag_punctuation = [".",",","QT","("] anotada_corrigida = [] for x in anotada: -- libgit2 0.21.2