Commit 548a8ac1ff799d65fff40e18b6cdc001ca3e8e1a

Authored by Erickson Silva
1 parent 20777b90
Exists in master and in 1 other branch devel

Mantém a vírgula quando o token é um float

Showing 1 changed file with 10 additions and 5 deletions   Show diff stats
@@ -31,7 +31,7 @@ import re,nltk, time, random @@ -31,7 +31,7 @@ import re,nltk, time, random
31 from os.path import expanduser 31 from os.path import expanduser
32 from os import environ, path 32 from os import environ, path
33 from Aelius.Extras import carrega 33 from Aelius.Extras import carrega
34 -from Aelius import AnotaCorpus 34 +from Aelius import AnotaCorpus, Toqueniza
35 from unicodedata import normalize 35 from unicodedata import normalize
36 36
37 sentenca_anotada="" 37 sentenca_anotada=""
@@ -42,7 +42,7 @@ def toqueniza(s): @@ -42,7 +42,7 @@ def toqueniza(s):
42 """ 42 """
43 regex = re.compile('[%s]' % re.escape('“”')) 43 regex = re.compile('[%s]' % re.escape('“”'))
44 decodificada=regex.sub('"',s.replace("–", "-").replace("—", "-")).decode("utf-8") 44 decodificada=regex.sub('"',s.replace("–", "-").replace("—", "-")).decode("utf-8")
45 - return AnotaCorpus.TOK_PORT.tokenize(decodificada) 45 + return Toqueniza.TOK_PORT.tokenize(decodificada)
46 46
47 def getAnaliseMorfologica(): 47 def getAnaliseMorfologica():
48 return sentenca_anotada 48 return sentenca_anotada
@@ -60,9 +60,14 @@ def etiquetaSentenca(s): @@ -60,9 +60,14 @@ def etiquetaSentenca(s):
60 anotada_corrigida = [] 60 anotada_corrigida = []
61 for x in anotada: 61 for x in anotada:
62 if x[1] not in tag_punctuation: 62 if x[1] not in tag_punctuation:
63 - if x[1] == "NUM" and x[1].isdigit():  
64 - anotada_corrigida.append(x)  
65 - continue 63 + if x[1] == "NUM":
  64 + try:
  65 + float(x[0].replace(',', '.'))
  66 + anotada_corrigida.append(x)
  67 + continue
  68 + except:
  69 + pass
  70 +
66 tupla = [regex.sub('',x[0]).lower(),x[1]] 71 tupla = [regex.sub('',x[0]).lower(),x[1]]
67 if tupla[0] != "": anotada_corrigida.append(tupla) 72 if tupla[0] != "": anotada_corrigida.append(tupla)
68 else: 73 else: