From e86fa40a66b65131a4a8a1aa1172e1747e279eb0 Mon Sep 17 00:00:00 2001 From: Erickson Silva Date: Mon, 25 Jul 2016 10:14:56 -0300 Subject: [PATCH] Remove caracteres especiais --- src/ClassificaSentencas.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/ClassificaSentencas.py b/src/ClassificaSentencas.py index f7440e3..5d11232 100644 --- a/src/ClassificaSentencas.py +++ b/src/ClassificaSentencas.py @@ -34,7 +34,6 @@ from Aelius.Extras import carrega from Aelius import AnotaCorpus, Toqueniza from unicodedata import normalize - class ClassificaSentencas(object): def __init__(self): @@ -44,10 +43,10 @@ class ClassificaSentencas(object): def toqueniza(self, s): """Decodifica string utilizando utf-8, retornando uma lista de tokens em unicode. """ - regex = re.compile('[%s]' % re.escape('“”')) - regex2 = re.compile('[%s]' % re.escape('«»')) try: - decodificada = regex2.sub('',regex.sub('"',s.replace("–", "-").replace("—", "-"))).decode("utf-8") + decodificada = s.translate(None, "“”«»’‘º").decode("utf-8") + except UnicodeDecodeError: + decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8") except: decodificada = s.decode("utf-8") -- libgit2 0.21.2