Commit 548a8ac1ff799d65fff40e18b6cdc001ca3e8e1a
1 parent
20777b90
Exists in
master
and in
1 other branch
Mantém vírgula quando é um float
Showing
1 changed file
with
10 additions
and
5 deletions
Show diff stats
src/alexp.py
| ... | ... | @@ -31,7 +31,7 @@ import re,nltk, time, random |
| 31 | 31 | from os.path import expanduser |
| 32 | 32 | from os import environ, path |
| 33 | 33 | from Aelius.Extras import carrega |
| 34 | -from Aelius import AnotaCorpus | |
| 34 | +from Aelius import AnotaCorpus, Toqueniza | |
| 35 | 35 | from unicodedata import normalize |
| 36 | 36 | |
| 37 | 37 | sentenca_anotada="" |
| ... | ... | @@ -42,7 +42,7 @@ def toqueniza(s): |
| 42 | 42 | """ |
| 43 | 43 | regex = re.compile('[%s]' % re.escape('“”')) |
| 44 | 44 | decodificada=regex.sub('"',s.replace("–", "-").replace("—", "-")).decode("utf-8") |
| 45 | - return AnotaCorpus.TOK_PORT.tokenize(decodificada) | |
| 45 | + return Toqueniza.TOK_PORT.tokenize(decodificada) | |
| 46 | 46 | |
| 47 | 47 | def getAnaliseMorfologica(): |
| 48 | 48 | return sentenca_anotada |
| ... | ... | @@ -60,9 +60,14 @@ def etiquetaSentenca(s): |
| 60 | 60 | anotada_corrigida = [] |
| 61 | 61 | for x in anotada: |
| 62 | 62 | if x[1] not in tag_punctuation: |
| 63 | - if x[1] == "NUM" and x[1].isdigit(): | |
| 64 | - anotada_corrigida.append(x) | |
| 65 | - continue | |
| 63 | + if x[1] == "NUM": | |
| 64 | + try: | |
| 65 | + float(x[0].replace(',', '.')) | |
| 66 | + anotada_corrigida.append(x) | |
| 67 | + continue | |
| 68 | + except: | |
| 69 | + pass | |
| 70 | + | |
| 66 | 71 | tupla = [regex.sub('',x[0]).lower(),x[1]] |
| 67 | 72 | if tupla[0] != "": anotada_corrigida.append(tupla) |
| 68 | 73 | else: | ... | ... |