Commit e86fa40a66b65131a4a8a1aa1172e1747e279eb0
1 parent: be54a507
Exists in: devel
Remove caracteres especiais
Showing 1 changed file with 3 additions and 4 deletions
Show diff stats
src/ClassificaSentencas.py
@@ -34,7 +34,6 @@ from Aelius.Extras import carrega | @@ -34,7 +34,6 @@ from Aelius.Extras import carrega | ||
34 | from Aelius import AnotaCorpus, Toqueniza | 34 | from Aelius import AnotaCorpus, Toqueniza |
35 | from unicodedata import normalize | 35 | from unicodedata import normalize |
36 | 36 | ||
37 | - | ||
38 | class ClassificaSentencas(object): | 37 | class ClassificaSentencas(object): |
39 | 38 | ||
40 | def __init__(self): | 39 | def __init__(self): |
@@ -44,10 +43,10 @@ class ClassificaSentencas(object): | @@ -44,10 +43,10 @@ class ClassificaSentencas(object): | ||
44 | def toqueniza(self, s): | 43 | def toqueniza(self, s): |
45 | """Decodifica string utilizando utf-8, retornando uma lista de tokens em unicode. | 44 | """Decodifica string utilizando utf-8, retornando uma lista de tokens em unicode. |
46 | """ | 45 | """ |
47 | - regex = re.compile('[%s]' % re.escape('“”')) | ||
48 | - regex2 = re.compile('[%s]' % re.escape('«»')) | ||
49 | try: | 46 | try: |
50 | - decodificada = regex2.sub('',regex.sub('"',s.replace("–", "-").replace("—", "-"))).decode("utf-8") | 47 | + decodificada = s.translate(None, "“”«»’‘º").decode("utf-8") |
48 | + except UnicodeDecodeError: | ||
49 | + decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8") | ||
51 | except: | 50 | except: |
52 | decodificada = s.decode("utf-8") | 51 | decodificada = s.decode("utf-8") |
53 | 52 |