Commit 548a8ac1ff799d65fff40e18b6cdc001ca3e8e1a
1 parent
20777b90
Exists in
master
and in
1 other branch
Mantém vírgula quando é um float
Showing
1 changed file
with
10 additions
and
5 deletions
Show diff stats
src/alexp.py
... | ... | @@ -31,7 +31,7 @@ import re,nltk, time, random |
31 | 31 | from os.path import expanduser |
32 | 32 | from os import environ, path |
33 | 33 | from Aelius.Extras import carrega |
34 | -from Aelius import AnotaCorpus | |
34 | +from Aelius import AnotaCorpus, Toqueniza | |
35 | 35 | from unicodedata import normalize |
36 | 36 | |
37 | 37 | sentenca_anotada="" |
... | ... | @@ -42,7 +42,7 @@ def toqueniza(s): |
42 | 42 | """ |
43 | 43 | regex = re.compile('[%s]' % re.escape('“”')) |
44 | 44 | decodificada=regex.sub('"',s.replace("–", "-").replace("—", "-")).decode("utf-8") |
45 | - return AnotaCorpus.TOK_PORT.tokenize(decodificada) | |
45 | + return Toqueniza.TOK_PORT.tokenize(decodificada) | |
46 | 46 | |
47 | 47 | def getAnaliseMorfologica(): |
48 | 48 | return sentenca_anotada |
... | ... | @@ -60,9 +60,14 @@ def etiquetaSentenca(s): |
60 | 60 | anotada_corrigida = [] |
61 | 61 | for x in anotada: |
62 | 62 | if x[1] not in tag_punctuation: |
63 | - if x[1] == "NUM" and x[1].isdigit(): | |
64 | - anotada_corrigida.append(x) | |
65 | - continue | |
63 | + if x[1] == "NUM": | |
64 | + try: | |
65 | + float(x[0].replace(',', '.')) | |
66 | + anotada_corrigida.append(x) | |
67 | + continue | |
68 | + except: | |
69 | + pass | |
70 | + | |
66 | 71 | tupla = [regex.sub('',x[0]).lower(),x[1]] |
67 | 72 | if tupla[0] != "": anotada_corrigida.append(tupla) |
68 | 73 | else: | ... | ... |