Commit 62051edaeccbf00b5ec4f061a590dc3ce913edd4
1 parent: 4d22eb93
Exists in: devel
Altera tokenizador para manter hífens
Showing 1 changed file with 3 additions and 4 deletions
Show diff stats
src/ClassificaSentencas.py
... | ... | @@ -49,9 +49,8 @@ class ClassificaSentencas(object): |
49 | 49 | decodificada = s.replace("“","").replace("”","").replace("«","").replace("»","").replace("’","").replace("‘","").replace("º","").decode("utf-8") |
50 | 50 | except: |
51 | 51 | decodificada = s.decode("utf-8") |
52 | - | |
53 | - return Toqueniza.TOK_PORT.tokenize(decodificada) | |
54 | - | |
52 | + return Toqueniza.TOK_PORT_LX.tokenize(decodificada) | |
53 | + | |
55 | 54 | def obter_classificacao_morfologica(self): |
56 | 55 | return self.sentenca_anotada |
57 | 56 | |
... | ... | @@ -63,7 +62,7 @@ class ClassificaSentencas(object): |
63 | 62 | while (anotada[0][1] is None): |
64 | 63 | time.sleep(random.choice(sleep_times)) |
65 | 64 | anotada = AnotaCorpus.anota_sentencas([s],etiquetador,"hunpos")[0] |
66 | - regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,-./:;<=>?@[\]^_`{|}~')) | |
65 | + regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&\'()*+,./:;<=>?@[\]^_`{|}~')) | |
67 | 66 | tag_punctuation = [".",",","QT","("] |
68 | 67 | anotada_corrigida = [] |
69 | 68 | for x in anotada: | ... | ... |