From 408ce64883cb450ab0acbf9694e90e75d3952b46 Mon Sep 17 00:00:00 2001 From: Erickson Silva Date: Fri, 29 Jan 2016 15:48:11 -0300 Subject: [PATCH] Remove caractere especial 'marcador' ao classificar sentença. --- src/ClassificaSentencas.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ClassificaSentencas.py b/src/ClassificaSentencas.py index e4ad7d9..70d4129 100644 --- a/src/ClassificaSentencas.py +++ b/src/ClassificaSentencas.py @@ -50,6 +50,7 @@ class ClassificaSentencas(object): decodificada = regex2.sub('',regex.sub('"',s.replace("–", "-").replace("—", "-"))).decode("utf-8") except: decodificada = s.decode("utf-8") + return Toqueniza.TOK_PORT.tokenize(decodificada) def obter_classificacao_morfologica(self): @@ -63,7 +64,7 @@ class ClassificaSentencas(object): while (anotada[0][1] is None): time.sleep(random.choice(sleep_times)) anotada = AnotaCorpus.anota_sentencas([s],etiquetador,"hunpos")[0] - regex = re.compile('[%s]' % re.escape('!"#&\'()*+,-./:;<=>?@[\\]^_`{|}~')) + regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,-./:;<=>?@[\\]^_`{|}~')) tag_punctuation = [".",",","QT","("] anotada_corrigida = [] for x in anotada: -- libgit2 0.21.2