Commit 62051edaeccbf00b5ec4f061a590dc3ce913edd4

Authored by Erickson Silva
1 parent 4d22eb93
Exists in devel

Altera tokenizador para manter hífens

Showing 1 changed file with 3 additions and 4 deletions   Show diff stats
src/ClassificaSentencas.py
@@ -49,9 +49,8 @@ class ClassificaSentencas(object): @@ -49,9 +49,8 @@ class ClassificaSentencas(object):
49 decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8") 49 decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8")
50 except: 50 except:
51 decodificada = s.decode("utf-8") 51 decodificada = s.decode("utf-8")
52 -  
53 - return Toqueniza.TOK_PORT.tokenize(decodificada)  
54 - 52 + return Toqueniza.TOK_PORT_LX.tokenize(decodificada)
  53 +
55 def obter_classificacao_morfologica(self): 54 def obter_classificacao_morfologica(self):
56 return self.sentenca_anotada 55 return self.sentenca_anotada
57 56
@@ -63,7 +62,7 @@ class ClassificaSentencas(object): @@ -63,7 +62,7 @@ class ClassificaSentencas(object):
63 while (anotada[0][1] is None): 62 while (anotada[0][1] is None):
64 time.sleep(random.choice(sleep_times)) 63 time.sleep(random.choice(sleep_times))
65 anotada = AnotaCorpus.anota_sentencas([s],etiquetador,"hunpos")[0] 64 anotada = AnotaCorpus.anota_sentencas([s],etiquetador,"hunpos")[0]
66 - regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,-./:;<=>?@[\]^_`{|}~')) 65 + regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,./:;<=>?@[\]^_`{|}~'))
67 tag_punctuation = [".",",","QT","("] 66 tag_punctuation = [".",",","QT","("]
68 anotada_corrigida = [] 67 anotada_corrigida = []
69 for x in anotada: 68 for x in anotada: