Commit 62051edaeccbf00b5ec4f061a590dc3ce913edd4

Authored by Erickson Silva
1 parent 4d22eb93
Exists in devel

Altera tokenizador para manter hífens

Showing 1 changed file with 3 additions and 4 deletions   Show diff stats
src/ClassificaSentencas.py
... ... @@ -49,9 +49,8 @@ class ClassificaSentencas(object):
49 49 decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8")
50 50 except:
51 51 decodificada = s.decode("utf-8")
52   -
53   - return Toqueniza.TOK_PORT.tokenize(decodificada)
54   -
  52 + return Toqueniza.TOK_PORT_LX.tokenize(decodificada)
  53 +
55 54 def obter_classificacao_morfologica(self):
56 55 return self.sentenca_anotada
57 56  
... ... @@ -63,7 +62,7 @@ class ClassificaSentencas(object):
63 62 while (anotada[0][1] is None):
64 63 time.sleep(random.choice(sleep_times))
65 64 anotada = AnotaCorpus.anota_sentencas([s],etiquetador,"hunpos")[0]
66   - regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,-./:;<=>?@[\]^_`{|}~'))
  65 + regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,./:;<=>?@[\]^_`{|}~'))
67 66 tag_punctuation = [".",",","QT","("]
68 67 anotada_corrigida = []
69 68 for x in anotada:
... ...