Commit 408ce64883cb450ab0acbf9694e90e75d3952b46
1 parent
a5237893
Exists in
master
and in
1 other branch
Remove caractere especial 'marcador' ao classificar sentença.
Showing
1 changed file
with
2 additions
and
1 deletion
Show diff stats
src/ClassificaSentencas.py
@@ -50,6 +50,7 @@ class ClassificaSentencas(object): | @@ -50,6 +50,7 @@ class ClassificaSentencas(object): | ||
50 | decodificada = regex2.sub('',regex.sub('"',s.replace("–", "-").replace("—", "-"))).decode("utf-8") | 50 | decodificada = regex2.sub('',regex.sub('"',s.replace("–", "-").replace("—", "-"))).decode("utf-8") |
51 | except: | 51 | except: |
52 | decodificada = s.decode("utf-8") | 52 | decodificada = s.decode("utf-8") |
53 | + | ||
53 | return Toqueniza.TOK_PORT.tokenize(decodificada) | 54 | return Toqueniza.TOK_PORT.tokenize(decodificada) |
54 | 55 | ||
55 | def obter_classificacao_morfologica(self): | 56 | def obter_classificacao_morfologica(self): |
@@ -63,7 +64,7 @@ class ClassificaSentencas(object): | @@ -63,7 +64,7 @@ class ClassificaSentencas(object): | ||
63 | while (anotada[0][1] is None): | 64 | while (anotada[0][1] is None): |
64 | time.sleep(random.choice(sleep_times)) | 65 | time.sleep(random.choice(sleep_times)) |
65 | anotada = AnotaCorpus.anota_sentencas([s],etiquetador,"hunpos")[0] | 66 | anotada = AnotaCorpus.anota_sentencas([s],etiquetador,"hunpos")[0] |
66 | - regex = re.compile('[%s]' % re.escape('!"#&()*+,-./:;<=>?@[\]^_`{|}~')) | 67 | + regex = re.compile('[%s]' % re.escape(u'\u2022''!"#&()*+,-./:;<=>?@[\]^_`{|}~')) |
67 | tag_punctuation = [".",",","QT","("] | 68 | tag_punctuation = [".",",","QT","("] |
68 | anotada_corrigida = [] | 69 | anotada_corrigida = [] |
69 | for x in anotada: | 70 | for x in anotada: |