Commit 548a8ac1ff799d65fff40e18b6cdc001ca3e8e1a
1 parent
20777b90
Exists in
master
and in
1 other branch
Mantém vírgula quando é um float
Showing
1 changed file
with
10 additions
and
5 deletions
Show diff stats
src/alexp.py
@@ -31,7 +31,7 @@ import re,nltk, time, random | @@ -31,7 +31,7 @@ import re,nltk, time, random | ||
31 | from os.path import expanduser | 31 | from os.path import expanduser |
32 | from os import environ, path | 32 | from os import environ, path |
33 | from Aelius.Extras import carrega | 33 | from Aelius.Extras import carrega |
34 | -from Aelius import AnotaCorpus | 34 | +from Aelius import AnotaCorpus, Toqueniza |
35 | from unicodedata import normalize | 35 | from unicodedata import normalize |
36 | 36 | ||
37 | sentenca_anotada="" | 37 | sentenca_anotada="" |
@@ -42,7 +42,7 @@ def toqueniza(s): | @@ -42,7 +42,7 @@ def toqueniza(s): | ||
42 | """ | 42 | """ |
43 | regex = re.compile('[%s]' % re.escape('“”')) | 43 | regex = re.compile('[%s]' % re.escape('“”')) |
44 | decodificada=regex.sub('"',s.replace("–", "-").replace("—", "-")).decode("utf-8") | 44 | decodificada=regex.sub('"',s.replace("–", "-").replace("—", "-")).decode("utf-8") |
45 | - return AnotaCorpus.TOK_PORT.tokenize(decodificada) | 45 | + return Toqueniza.TOK_PORT.tokenize(decodificada) |
46 | 46 | ||
47 | def getAnaliseMorfologica(): | 47 | def getAnaliseMorfologica(): |
48 | return sentenca_anotada | 48 | return sentenca_anotada |
@@ -60,9 +60,14 @@ def etiquetaSentenca(s): | @@ -60,9 +60,14 @@ def etiquetaSentenca(s): | ||
60 | anotada_corrigida = [] | 60 | anotada_corrigida = [] |
61 | for x in anotada: | 61 | for x in anotada: |
62 | if x[1] not in tag_punctuation: | 62 | if x[1] not in tag_punctuation: |
63 | - if x[1] == "NUM" and x[1].isdigit(): | ||
64 | - anotada_corrigida.append(x) | ||
65 | - continue | 63 | + if x[1] == "NUM": |
64 | + try: | ||
65 | + float(x[0].replace(',', '.')) | ||
66 | + anotada_corrigida.append(x) | ||
67 | + continue | ||
68 | + except: | ||
69 | + pass | ||
70 | + | ||
66 | tupla = [regex.sub('',x[0]).lower(),x[1]] | 71 | tupla = [regex.sub('',x[0]).lower(),x[1]] |
67 | if tupla[0] != "": anotada_corrigida.append(tupla) | 72 | if tupla[0] != "": anotada_corrigida.append(tupla) |
68 | else: | 73 | else: |