Commit e86fa40a66b65131a4a8a1aa1172e1747e279eb0

Authored by Erickson Silva
1 parent be54a507
Exists in devel

Remove caracteres especiais

Showing 1 changed file with 3 additions and 4 deletions
src/ClassificaSentencas.py
... ... @@ -34,7 +34,6 @@ from Aelius.Extras import carrega
34 34 from Aelius import AnotaCorpus, Toqueniza
35 35 from unicodedata import normalize
36 36  
37   -
38 37 class ClassificaSentencas(object):
39 38  
40 39 def __init__(self):
... ... @@ -44,10 +43,10 @@ class ClassificaSentencas(object):
44 43 def toqueniza(self, s):
45 44 """Decodifica string utilizando utf-8, retornando uma lista de tokens em unicode.
46 45 """
47   - regex = re.compile('[%s]' % re.escape('“”'))
48   - regex2 = re.compile('[%s]' % re.escape('«»'))
49 46 try:
50   - decodificada = regex2.sub('',regex.sub('"',s.replace("–", "-").replace("—", "-"))).decode("utf-8")
  47 + decodificada = s.translate(None, "“”«»’‘º").decode("utf-8")
  48 + except UnicodeDecodeError:
  49 + decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8")
51 50 except:
52 51 decodificada = s.decode("utf-8")
53 52  
... ...