Commit e86fa40a66b65131a4a8a1aa1172e1747e279eb0
1 parent: be54a507
Exists in: devel
Remove caracteres especiais
Showing 1 changed file with 3 additions and 4 deletions
Show diff stats
src/ClassificaSentencas.py
... | ... | @@ -34,7 +34,6 @@ from Aelius.Extras import carrega |
34 | 34 | from Aelius import AnotaCorpus, Toqueniza |
35 | 35 | from unicodedata import normalize |
36 | 36 | |
37 | - | |
38 | 37 | class ClassificaSentencas(object): |
39 | 38 | |
40 | 39 | def __init__(self): |
... | ... | @@ -44,10 +43,10 @@ class ClassificaSentencas(object): |
44 | 43 | def toqueniza(self, s): |
45 | 44 | """Decodifica string utilizando utf-8, retornando uma lista de tokens em unicode. |
46 | 45 | """ |
47 | - regex = re.compile('[%s]' % re.escape('“”')) | |
48 | - regex2 = re.compile('[%s]' % re.escape('«»')) | |
49 | 46 | try: |
50 | - decodificada = regex2.sub('',regex.sub('"',s.replace("–", "-").replace("—", "-"))).decode("utf-8") | |
47 | + decodificada = s.translate(None, "“”«»’‘º").decode("utf-8") | |
48 | + except UnicodeDecodeError: | |
49 | + decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8") | |
51 | 50 | except: |
52 | 51 | decodificada = s.decode("utf-8") |
53 | 52 | ... | ... |