Commit e86fa40a66b65131a4a8a1aa1172e1747e279eb0

Authored by Erickson Silva
1 parent be54a507
Exists in devel

Remove caracteres especiais

Showing 1 changed file with 3 additions and 4 deletions   Show diff stats
src/ClassificaSentencas.py
@@ -34,7 +34,6 @@ from Aelius.Extras import carrega @@ -34,7 +34,6 @@ from Aelius.Extras import carrega
34 from Aelius import AnotaCorpus, Toqueniza 34 from Aelius import AnotaCorpus, Toqueniza
35 from unicodedata import normalize 35 from unicodedata import normalize
36 36
37 -  
38 class ClassificaSentencas(object): 37 class ClassificaSentencas(object):
39 38
40 def __init__(self): 39 def __init__(self):
@@ -44,10 +43,10 @@ class ClassificaSentencas(object): @@ -44,10 +43,10 @@ class ClassificaSentencas(object):
44 def toqueniza(self, s): 43 def toqueniza(self, s):
45 """Decodifica string utilizando utf-8, retornando uma lista de tokens em unicode. 44 """Decodifica string utilizando utf-8, retornando uma lista de tokens em unicode.
46 """ 45 """
47 - regex = re.compile('[%s]' % re.escape('“”'))  
48 - regex2 = re.compile('[%s]' % re.escape('«»'))  
49 try: 46 try:
50 - decodificada = regex2.sub('',regex.sub('"',s.replace("–", "-").replace("—", "-"))).decode("utf-8") 47 + decodificada = s.translate(None, "“”«»’‘º").decode("utf-8")
  48 + except UnicodeDecodeError:
  49 + decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8")
51 except: 50 except:
52 decodificada = s.decode("utf-8") 51 decodificada = s.decode("utf-8")
53 52