Commit 8630d9ec982edca0b1c9402348adb9c94c1ad276
1 parent
f367a715
Exists in
master
and in
1 other branch
Remove tgrep e adiciona como dependencia
Showing
3 changed files
with
0 additions
and
975 deletions
Show diff stats
src/new/AplicaRegras.py
... | ... | @@ -1,336 +0,0 @@ |
1 | -#!/usr/bin/python | |
2 | -# -*- coding: utf-8 -*- | |
3 | - | |
4 | -#Autor: Erickson Silva | |
5 | -#Email: <erickson.silva@lavid.ufpb.br> <ericksonsilva@live.com> | |
6 | - | |
7 | -#LAViD - Laboratório de Aplicações de Vídeo Digital | |
8 | - | |
9 | -from collections import deque | |
10 | -import xml.etree.ElementTree as ET | |
11 | -from os.path import expanduser | |
12 | -import platform | |
13 | -from LerDicionarios import * | |
14 | -from Iterator import * | |
15 | -from StringAux import * | |
16 | -from ConverteExtenso import * | |
17 | - | |
class AplicaRegras(object):
    """Applies morphological translation rules to a tagged token list.

    Rules are loaded from ``regras.xml`` under the user's
    ``vlibras-translate/data`` directory.  Tokens are ``[word, tag]``
    pairs throughout.  Written for Python 2 (print statement,
    list-returning ``filter``).
    """

    # Initialise the rule tree root and the dictionary helpers.
    def __init__(self):

        self.__root = self.getRoot()
        self.__dicionarios = LeitorDicionarios()

    # Parse regras.xml from the user's home directory and return its root
    # element; the path separator depends on the host OS.
    def getRoot(self):

        so = platform.system()
        if so == 'Windows':
            return ET.parse(expanduser("~")+'\\vlibras-translate\data\\regras.xml').getroot()
        else:
            return ET.parse(expanduser("~")+'/vlibras-translate/data/regras.xml').getroot()

    # Apply the morphological rules to `lista` (list of [word, tag]).
    # Returns the transformed list, or the original list when no rule
    # matched at all.
    def aplicarRegrasMorfo(self, lista):

        # Maps <specific> codes from the XML to verification callbacks;
        # codes mapped to the string "zero" have no callback.
        self.__especificos = {"advt" : self.verificarAdvTempo, "v" : self.verificarVbInfinitivo, "x" : self.verificarPrepos, "c" : self.verificarSubs2Generos, "a" : self.verificarArtigo, "l" : self.verificarVbLigacao, "i": self.verificarAdvIntensidade, "vbi":"zero", "n":"zero", "abmn":"zero", "adji":"zero", "adjn":"zero", "advi":"zero"}
        self.pularIteracoes = 0  # tokens still to skip after a multi-token rule fired
        self.__tAux = []         # output token list being built
        it = Iterator()
        it.load(lista)

        # NOTE(review): assumes it.hasNext() advances the iterator each
        # call -- otherwise the `continue` below would spin forever;
        # confirm against the Iterator implementation.
        while(it.hasNext()):
            # skip tokens already consumed by a multi-token rule
            if self.pularIteracoes > 0:
                self.pularIteracoes-=1
                continue

            for morpho in self.__root.findall('morphological'):
                self.hasRule = False
                for rule in morpho.findall('rule'): # scan every <rule> tag
                    # candidate rule: active, and its first tag matches
                    # the current token's tag
                    if rule.find('active').text == "true" and rule.get('name').split("_")[0] == it.getAtualT():
                        count = int(rule.find('count').text)
                        self.listaIter = []
                        if count == 1:
                            self.listaIter = [it.getToken()]
                        else:
                            try:
                                self.listaIter = it.getInterval(count)
                                self.pularIteracoes = count-1
                            except:
                                # not enough tokens left for this rule
                                continue


                        self.nomeRegra = self.gerarNomeDaRegra(self.listaIter)
                        # the rule applies only when its full name matches
                        # the tag sequence of the token window
                        if rule.get('name') == self.nomeRegra:
                            print "Achou regra: " + self.nomeRegra
                            self.hasRule = True
                            self.listaTmp = count * [None]  # slots for reordered tokens
                            self.countListaIter = -1
                            for classe in rule.iter('class'): # one <class> per token in the window
                                title = classe.find('title')
                                newpos = classe.find('newpos')
                                newprop = classe.find('newprop')
                                newtoken = classe.find('newtoken')
                                newtokenpos = classe.find('newtokenpos')
                                self.specific = classe.find('specific')

                                self.countListaIter += 1
                                token = self.listaIter[self.countListaIter]

                                if self.specific is not None:
                                    # run the specific verification; the result may be
                                    # a replacement string, True/False, or None
                                    self.specific = self.__especificos[self.specific.text](token[0])
                                    if newprop is not None and type(self.specific) != bool:
                                        self.__tAux.append([self.specific,newprop.text])

                                if newpos is not None:
                                    if newpos.text != "-1":
                                        if type(self.specific) == bool:
                                            self.listaTmp[int(newpos.text)] = token
                                    else:
                                        # newpos == -1: emit the specific value tagged
                                        # with the rule's <title>
                                        self.__tAux.append([self.specific, title.text])

                                if newtoken is not None:
                                    self.listaTmp[int(newtokenpos.text)] = [newtoken.text, "NEWTOKEN"]

                            # drop unfilled slots and flush into the output
                            # (Python 2 filter returns a list here)
                            self.listaTmp = filter(None, self.listaTmp)
                            for i in self.listaTmp:
                                self.__tAux.append(i)

                            break

            # no rule fired for this token: copy it through unchanged
            if (self.hasRule == False): self.__tAux.append(it.getToken())
        if self.__tAux: return self.__tAux
        return lista # return the list unchanged (no applicable rule)


    # Build a rule name by joining the tags of the token window with "_".
    def gerarNomeDaRegra(self, lista):
        self.__nomeRegra = []
        for t in lista:
            self.__nomeRegra.append(t[1])
        return "_".join(self.__nomeRegra)

    # True when any ADV* token in `lista` is a temporal adverb.
    def verificarAdvTempo(self, lista):
        for i in lista:
            if i[1][:3] == "ADV":
                if (self.__dicionarios.hasTempoVerbal(i[0])):
                    return True
        return False

    # Return the infinitive form of `token`, or False when none is known.
    def verificarVbInfinitivo(self, token):
        if self.__dicionarios.hasVerboInfinitivo(token): # is there an infinitive for this token?
            return self.__dicionarios.getVerboInfinitivo(token)
        return False

    #TODO: preposition handling not implemented yet
    def verificarPrepos(self, token):
        return None

    # True when `token` is a common-gender ("2 generos") noun.
    def verificarSubs2Generos(self, token):
        return self.__dicionarios.hasSubst2Genero(token)

    #TODO: article handling not implemented yet
    def verificarArtigo(self, token):
        return None

    #TODO: linking-verb handling not implemented yet
    def verificarVbLigacao(self, token):
        return None

    #TODO: intensity-adverb handling not implemented yet
    def verificarAdvIntensidade(self, token):
        return None

    # Pre-processing pass: drops ignored words (articles/prepositions),
    # converts verbs to the infinitive, marks gender for common-gender
    # nouns, normalises tense, plurals and written-out numbers.
    def inicializar(self, texto):
        it = Iterator()
        it.load(texto)
        self.__ts = []
        self.__verb = False
        self.__adv = False
        self.__num = False
        self.__plural = False
        self.__countVerb = 0
        self.__countAdv = 0
        while(it.hasNext()):
            token = it.getAtualW()
            tag = it.getAtualT()
            self.__b = False  # set when some rule below already emitted the token

            if self.__dicionarios.hasPalavraIgnorada(tag) == False: # skip articles/prepositions

                if tag == "NUM":
                    self.__num = True

                if tag[-2:] == "-P":
                    self.__plural = True

                # temporal adverb found: tense is explicit, no marker needed
                if tag[:3] == "ADV":
                    if (self.__dicionarios.hasTempoVerbal(token)):
                        self.__adv = True

                if tag[:2] == "VB":

                    # replace the verb with its infinitive form when known
                    if self.__dicionarios.hasVerboInfinitivo(token):
                        verboInfinitivo = self.__dicionarios.getVerboInfinitivo(token)
                        self.__ts.append([verboInfinitivo,tag])
                        self.__b = True

                    # count tensed verbs (present/past/future)
                    if tag == "VB-P" or tag == "VB-D" or tag == "VB-R":
                        self.__verb = True
                        self.__countVerb += 1

                # common-gender noun: prefix an explicit gender marker,
                # inferred from the preceding (ignored) word's tag
                if self.__dicionarios.hasSubst2Genero(token):
                    lenTicket = len(it.getAntT())
                    if ((self.__dicionarios.hasPalavraIgnorada(it.getAntT())) and (it.getAntT()[lenTicket-1:] == "F") or (it.getAntT()[lenTicket-3:] == "F-P")):
                        self.__ts.append(["MULHER ", "2GEN"])
                        self.__ts.append([token,tag])
                    else:
                        self.__ts.append(["HOMEM ", "2GEN"])
                        self.__ts.append([token,tag])
                    self.__b = True

                # no rule applied: keep the original token
                if self.__b == False:
                    self.__ts.append([token,tag])

        # a tensed verb with no temporal adverb: re-analyse to add a marker
        if self.__verb == True and self.__adv == False:
            self.__ts = self.verbalAnalysis(self.__ts)

        # singularise plural forms
        if self.__plural:
            self.__ts = self.hasPlural(self.__ts)

        # collapse written-out numbers into numerals
        if self.__num: return self.converteExtenso(self.__ts)

        return self.__ts


    # Convert a Roman numeral to int; returns the tag unchanged on failure.
    def auxConvert(self, tag):
        try:
            return roman_to_int(tag)
        except:
            return tag

    # Tense normalisation: keep only the last tensed verb and append a
    # FUTURO/PASSADO marker.  A VB-P when it is the only tensed verb
    # aborts and returns the original list unchanged.
    def verbalAnalysis(self, lista):
        lv = []
        it = Iterator()
        it.load(lista)
        hasFut = False
        hasPas = False
        count = 0
        while(it.hasNext()):
            token = it.getAtualW().upper()
            tag = it.getAtualT()

            if(tag == "VB-P"):
                if (self.__countVerb > 1):
                    count += 1
                    # only the last tensed verb is kept
                    if(count == self.__countVerb):
                        lv.append([token,tag])
                else:
                    # lone present-tense verb: nothing to normalise
                    it.reset()
                    return lista
            elif(tag == "VB-D"):
                count += 1
                hasPas = True
                if(count == self.__countVerb):
                    lv.append([token,tag])
            elif(tag == "VB-R"):
                count += 1
                hasFut = True
                if(count == self.__countVerb):
                    lv.append([token,tag])
            else:
                lv.append([token,tag])
        # future wins over past when both tenses appeared
        if (hasFut):
            lv.append(["FUTURO", "T-VB"])
        elif (hasPas):
            lv.append(["PASSADO", "T-VB"])
        it.reset()
        return lv


    # Singularise every token whose tag carries the plural suffix "-P".
    # NOTE: mutates the [word, tag] sublists of `lista` in place.
    def hasPlural(self, lista):

        tmp = lista
        for e in tmp:
            if e[1][-2:] == "-P":
                e[0] = self.analisarPlural(e[0])

        return tmp


    # Heuristic Portuguese plural -> singular conversion on an
    # upper-cased word; falls through to the word unchanged.
    def analisarPlural(self, word):

        if(word[-3:] == "OES" or word[-2:] == "AES" or word[-2:] == "AOS"):
            return word[0:-3]+"AO"
        elif(word[-3:] == "RES" or word[-2:] == "ZES" or word[-2:] == "NES"):
            return word[0:-2]
        elif(word[-3:] == "SES"):
            #TODO: some words carry accents in the singular root, e.g. Gas - Gases
            return word[0:-2]
        elif(word[-2:] == "NS"):
            return word[0:-2]+"M"
        elif(word[-3:] == "EIS"):
            return word[0:-3]+"IL"
        elif(word[-2:] == "IS"):
            if(word[-3] == "A" or word[-3] == "E" or word[-3] == "O" or word[-3] == "U"):
                return word[0:-2]+"L"
            else:
                return word
        elif(word[-1] == "S"):
            #TODO: paroxytone/proparoxytone words ending in S, e.g. lapis, virus, onibus
            return word[0:-1]
        else:
            return word


    # Merge consecutive NUM tokens (optionally joined by a CONJ between
    # two NUMs) into a single numeral produced by extenso(), then delete
    # the consumed positions from the list.
    def converteExtenso(self, lista):

        listAux = []   # per numeral run: [start index, [number words]]
        indexDel = []  # indices of tokens absorbed into a run
        count = 0
        isRunning = False

        for i in range(0, len(lista)):
            token = lista[i][0]
            tag = lista[i][1]
            if (tag == "NUM"):
                if (isRunning == False and len(listAux) == count):
                    listAux.append([i,[token]])
                    isRunning = True
                else:
                    listAux[count][1].append(token)
                    indexDel.append(i)
            elif (isRunning == True):
                # a conjunction flanked by NUMs stays inside the run
                if ((lista[i-1][1] == "NUM") and (lista[i+1][1] == "NUM") and (tag == "CONJ")):
                    indexDel.append(i)
                else:
                    isRunning = False
                    count += 1

        for i in listAux:
            ext = extenso(' '.join(i[1]))
            lista[i[0]] = [ext, "NUM"]

        # pop absorbed indices highest-first so positions stay valid;
        # deque(maxlen=0) just drains the generator for its side effects
        deque((list.pop(lista, i) for i in sorted(indexDel, reverse=True)), maxlen=0)

        return lista
src/new/TraduzSentencas.py
... | ... | @@ -1,42 +0,0 @@ |
1 | -#!/usr/bin/python | |
2 | -# -*- coding: utf-8 -*- | |
3 | - | |
4 | -#Autor: Erickson Silva | |
5 | -#Email: <erickson.silva@lavid.ufpb.br> <ericksonsilva@live.com> | |
6 | - | |
7 | -#LAViD - Laboratório de Aplicações de Vídeo Digital | |
8 | - | |
9 | -import alexp | |
10 | -from AplicaSinonimos import * | |
11 | -from AplicaRegras import * | |
12 | - | |
13 | - | |
def iniciar_traducao(texto):
    """Split *texto* on periods and translate every non-empty sentence,
    returning the translations joined by single spaces (or "" when the
    join fails, e.g. a translation came back as a non-string)."""
    traducoes = [gerar_analise(frase)
                 for frase in texto.split(".")
                 if len(frase) > 0 and frase != " "]
    try:
        return " ".join(traducoes)
    except:
        return ""
24 | - | |
def gerar_analise(sentenca):
    """Translate a single sentence.

    Runs the parser (``alexp.run``); on success the syntactic tree is fed
    through the syntactic rules, otherwise the morphological analysis is
    fed through the morphological rules.  The result is passed through
    the synonym substitution step and returned.
    """
    sinonimos = AplicaSinonimos()
    regras = AplicaRegras()
    analise = None

    try:
        analise = alexp.run(sentenca)
    except ValueError:
        # TODO: allow accented characters in the sentence
        analise = None

    if analise is None:
        # parsing failed: fall back to the morphological analysis
        morfologica = alexp.getAnaliseMorfologica()
        analise = regras.aplicar_regras_morfo(morfologica)
    else:
        # BUG FIX: the original referenced the undefined name
        # `arvoreSintatica` (guaranteed NameError on this branch);
        # the syntactic tree is the value alexp.run() returned.
        analise = regras.aplicar_regras_sint(analise)

    return sinonimos.aplicar_sinonimos(analise)
43 | 0 | \ No newline at end of file |
src/new/tgrep.py
... | ... | @@ -1,597 +0,0 @@ |
1 | -#!/usr/bin/env python | |
2 | -# -*- coding: utf-8 -*- | |
3 | -# | |
4 | -# Permission is hereby granted, free of charge, to any person | |
5 | -# obtaining a copy of this software and associated documentation files | |
6 | -# (the "Software"), to deal in the Software without restriction, | |
7 | -# including without limitation the rights to use, copy, modify, merge, | |
8 | -# publish, distribute, sublicense, and/or sell copies of the Software, | |
9 | -# and to permit persons to whom the Software is furnished to do so, | |
10 | -# subject to the following conditions: | |
11 | -# | |
12 | -# The above copyright notice and this permission notice shall be | |
13 | -# included in all copies or substantial portions of the Software. | |
14 | -# | |
15 | -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | -# SOFTWARE. | |
23 | - | |
24 | -''' | |
25 | -TGrep search implementation for NTLK trees. | |
26 | - | |
27 | -(c) 16 March, 2013 Will Roberts <wildwilhelm@gmail.com>. | |
28 | - | |
29 | -This module supports TGrep2 syntax for matching parts of NLTK Trees. | |
30 | -Note that many tgrep operators require the tree passed to be a | |
31 | -ParentedTree. | |
32 | - | |
33 | -Tgrep tutorial: | |
34 | -http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html | |
35 | -Tgrep2 manual: | |
36 | -http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf | |
37 | -Tgrep2 source: | |
38 | -http://tedlab.mit.edu/~dr/Tgrep2/ | |
39 | -''' | |
40 | - | |
41 | -from builtins import bytes, range, str | |
42 | -import nltk.tree | |
43 | -import pyparsing | |
44 | -import re | |
45 | - | |
def ancestors(node):
    '''
    Return every tree node dominating ``node``, ordered from the
    immediate parent upward.

    Leaf nodes carry no parent pointer, so they produce an empty list.
    '''
    chain = []
    try:
        cursor = node.parent()
    except AttributeError:
        # a leaf: there is no way to recover its parent
        return chain
    while cursor:
        chain.append(cursor)
        cursor = cursor.parent()
    return chain
62 | - | |
def unique_ancestors(node):
    '''
    Return the dominating nodes of ``node`` along a single path of
    descent, i.e. stop as soon as an ancestor has more than one child.
    '''
    chain = []
    try:
        cursor = node.parent()
    except AttributeError:
        # a leaf: there is no way to recover its parent
        return chain
    while cursor and len(cursor) == 1:
        chain.append(cursor)
        cursor = cursor.parent()
    return chain
78 | - | |
def _descendants(node):
    '''
    Return every node below ``node`` in the tree, in treeposition
    order; non-tree objects (leaves) yield an empty list.
    '''
    try:
        positions = node.treepositions()
    except AttributeError:
        return []
    # positions[0] is () -- the node itself -- so skip it
    return [node[pos] for pos in positions[1:]]
89 | - | |
def _leftmost_descendants(node):
    '''
    Return the nodes reachable from ``node`` by following only
    left (index-0) branches.
    '''
    try:
        positions = node.treepositions()
    except AttributeError:
        return []
    # a purely-left path is a treeposition made of zeros only
    return [node[pos] for pos in positions[1:]
            if all(step == 0 for step in pos)]
100 | - | |
def _rightmost_descendants(node):
    '''
    Return the nodes reachable from ``node`` by following only
    right-most branches, from shallowest to deepest.
    '''
    try:
        # the largest treeposition is the right-most leaf
        last_leaf = max(node.treepositions())
    except AttributeError:
        return []
    result = []
    for depth in range(1, len(last_leaf) + 1):
        result.append(node[last_leaf[:depth]])
    return result
111 | - | |
def _istree(obj):
    '''Predicate: True when `obj` is an nltk.tree.Tree (leaf strings are not).'''
    return isinstance(obj, nltk.tree.Tree)
115 | - | |
def _unique_descendants(node):
    '''
    Return the chain of nodes below ``node`` along a single path of
    descent: descend while each node is a tree with exactly one child.
    '''
    chain = []
    cursor = node
    while cursor and _istree(cursor) and len(cursor) == 1:
        cursor = cursor[0]
        chain.append(cursor)
    return chain
127 | - | |
def _before(node):
    '''
    Return every node in the tree whose material occurs strictly
    before that of ``node``.
    '''
    try:
        pos = node.treeposition()
        root = node.root()
    except AttributeError:
        return []
    # p precedes pos when, over their common prefix length, p sorts lower
    return [root[p] for p in root.treepositions()
            if p[:len(pos)] < pos[:len(p)]]
139 | - | |
def _immediately_before(node):
    '''
    Return the nodes that immediately precede ``node``.

    Tree node A immediately precedes node B when the last terminal
    symbol (word) produced by A immediately precedes the first
    terminal symbol produced by B.
    '''
    try:
        pos = node.treeposition()
        tree = node.root()
    except AttributeError:
        return []
    # climb from pos until there is a sibling to step to on the left
    idx = len(pos) - 1
    while idx >= 0 and pos[idx] == 0:
        idx -= 1
    if idx < 0:
        # node is on the left edge of the tree: nothing precedes it
        return []
    target = list(pos[:idx + 1])
    target[-1] -= 1
    prev_node = tree[target]
    return [prev_node] + _rightmost_descendants(prev_node)
164 | - | |
def _after(node):
    '''
    Return every node in the tree whose material occurs strictly
    after that of ``node``.
    '''
    try:
        pos = node.treeposition()
        root = node.root()
    except AttributeError:
        return []
    # p follows pos when, over their common prefix length, p sorts higher
    return [root[p] for p in root.treepositions()
            if p[:len(pos)] > pos[:len(p)]]
176 | - | |
def _immediately_after(node):
    '''
    Return the nodes that immediately follow ``node``.

    Tree node A immediately follows node B when the first terminal
    symbol (word) produced by A immediately follows the last
    terminal symbol produced by B.
    '''
    try:
        pos = node.treeposition()
        tree = node.root()
        current = node.parent()
    except AttributeError:
        return []
    # climb from pos until there is a sibling to step to on the right
    idx = len(pos) - 1
    while idx >= 0 and pos[idx] == len(current) - 1:
        idx -= 1
        current = current.parent()
    if idx < 0:
        # node is on the right edge of the tree: nothing follows it
        return []
    target = list(pos[:idx + 1])
    target[-1] += 1
    next_node = tree[target]
    return [next_node] + _leftmost_descendants(next_node)
204 | - | |
def _tgrep_node_literal_value(node):
    '''
    Gets the string value of a given parse tree node, for comparison
    using the tgrep node literal predicates: the label for tree nodes,
    the string form for leaves.
    '''
    return (node.label() if _istree(node) else str(node))
211 | - | |
def _tgrep_node_action(_s, _l, tokens):
    '''
    Builds a lambda function representing a predicate on a tree node
    depending on the name of its node.

    Handles: disjunctions (a|b|c), the wildcard (* or __), quoted
    literals ("..."), regexes (/.../), case-insensitive literals
    (i@...) and plain literals.  `_s` and `_l` are the unused
    pyparsing parse-action arguments.
    '''
    # print 'node tokens: ', tokens
    if tokens[0] == u"'":
        # strip initial apostrophe (tgrep2 print command)
        tokens = tokens[1:]
    if len(tokens) > 1:
        # disjunctive definition of a node name
        assert list(set(tokens[1::2])) == [u'|']
        # recursively call self to interpret each node name definition
        tokens = [_tgrep_node_action(None, None, [node])
                  for node in tokens[::2]]
        # capture tokens and return the disjunction
        return (lambda t: lambda n: any(f(n) for f in t))(tokens)
    else:
        if hasattr(tokens[0], u'__call__'):
            # this is a previously interpreted parenthetical node
            # definition (lambda function)
            return tokens[0]
        elif tokens[0] == u'*' or tokens[0] == u'__':
            # wildcard: matches any node
            return lambda n: True
        elif tokens[0].startswith(u'"'):
            # quoted literal: exact string match
            return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip(u'"'))
        elif tokens[0].startswith(u'/'):
            # regular expression match on the node's literal value
            return (lambda r: lambda n:
                        r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip(u'/')))
        elif tokens[0].startswith(u'i@'):
            # case-insensitive literal match
            return (lambda s: lambda n:
                        _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower())
        else:
            # plain literal: exact string match
            return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0])
246 | - | |
def _tgrep_parens_action(_s, _l, tokens):
    '''
    Unwrap a parenthesised tgrep expression, returning the predicate
    (lambda) it encloses.  `_s` and `_l` are the unused pyparsing
    parse-action arguments.
    '''
    assert len(tokens) == 3
    opener, inner, closer = tokens
    assert opener == u'('
    assert closer == u')'
    return inner
257 | - | |
def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
    '''
    Build a predicate that is true only for the node located at the
    literal tree position spelled out in ``tokens`` (e.g. ``N(0 1)``).
    '''
    # recover the position tuple from the parsed digit tokens
    position = tuple(int(tok) for tok in tokens if tok.isdigit())

    def _at_position(n, _pos=position):
        # nodes without a treeposition (plain leaves) never match
        return hasattr(n, u'treeposition') and n.treeposition() == _pos

    return _at_position
269 | - | |
def _tgrep_relation_action(_s, _l, tokens):
    '''
    Builds a lambda function representing a predicate on a tree node
    depending on its relation to other nodes in the tree.

    ``tokens`` is either ``['!']?['[', predicate, ']']`` (a possibly
    negated bracketed sub-expression) or ``['!']?[operator, predicate]``.
    Every branch below captures loop variables via an immediately
    applied outer lambda so the closures bind by value, not by name.
    `_s` and `_l` are the unused pyparsing parse-action arguments.
    '''
    # print 'relation tokens: ', tokens
    # process negation first if needed
    negated = False
    if tokens[0] == u'!':
        negated = True
        tokens = tokens[1:]
    if tokens[0] == u'[':
        # process square-bracketed relation expressions
        assert len(tokens) == 3
        assert tokens[2] == u']'
        retval = tokens[1]
    else:
        # process operator-node relation expressions
        assert len(tokens) == 2
        operator, predicate = tokens
        # A < B       A is the parent of (immediately dominates) B.
        if operator == u'<':
            retval = lambda n: (_istree(n) and
                                any(predicate(x) for x in n))
        # A > B       A is the child of B.
        elif operator == u'>':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                predicate(n.parent()))
        # A <, B      Synonymous with A <1 B.
        elif operator == u'<,' or operator == u'<1':
            retval = lambda n: (_istree(n) and
                                bool(list(n)) and
                                predicate(n[0]))
        # A >, B      Synonymous with A >1 B.
        elif operator == u'>,' or operator == u'>1':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                (n is n.parent()[0]) and
                                predicate(n.parent()))
        # A <N B      B is the Nth child of A (the first child is <1).
        elif operator[0] == u'<' and operator[1:].isdigit():
            idx = int(operator[1:])
            # capture the index parameter (converted to 0-based)
            retval = (lambda i: lambda n: (_istree(n) and
                                           bool(list(n)) and
                                           0 <= i < len(n) and
                                           predicate(n[i])))(idx - 1)
        # A >N B      A is the Nth child of B (the first child is >1).
        elif operator[0] == u'>' and operator[1:].isdigit():
            idx = int(operator[1:])
            # capture the index parameter (converted to 0-based)
            retval = (lambda i: lambda n: (hasattr(n, u'parent') and
                                           bool(n.parent()) and
                                           0 <= i < len(n.parent()) and
                                           (n is n.parent()[i]) and
                                           predicate(n.parent())))(idx - 1)
        # A <' B      B is the last child of A (also synonymous with A <-1 B).
        # A <- B      B is the last child of A (synonymous with A <-1 B).
        elif operator == u'<\'' or operator == u'<-' or operator == u'<-1':
            retval = lambda n: (_istree(n) and bool(list(n))
                                and predicate(n[-1]))
        # A >' B      A is the last child of B (also synonymous with A >-1 B).
        # A >- B      A is the last child of B (synonymous with A >-1 B).
        elif operator == u'>\'' or operator == u'>-' or operator == u'>-1':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                (n is n.parent()[-1]) and
                                predicate(n.parent()))
        # A <-N B     B is the Nth-to-last child of A (the last child is <-1).
        elif operator[:2] == u'<-' and operator[2:].isdigit():
            idx = -int(operator[2:])
            # capture the (negative) index parameter
            retval = (lambda i: lambda n: (_istree(n) and
                                           bool(list(n)) and
                                           0 <= (i + len(n)) < len(n) and
                                           predicate(n[i + len(n)])))(idx)
        # A >-N B     A is the Nth-to-last child of B (the last child is >-1).
        elif operator[:2] == u'>-' and operator[2:].isdigit():
            idx = -int(operator[2:])
            # capture the (negative) index parameter
            retval = (lambda i: lambda n:
                          (hasattr(n, u'parent') and
                           bool(n.parent()) and
                           0 <= (i + len(n.parent())) < len(n.parent()) and
                           (n is n.parent()[i + len(n.parent())]) and
                           predicate(n.parent())))(idx)
        # A <: B      B is the only child of A.
        elif operator == u'<:':
            retval = lambda n: (_istree(n) and
                                len(n) == 1 and
                                predicate(n[0]))
        # A >: B      A is the only child of B.
        elif operator == u'>:':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                len(n.parent()) == 1 and
                                predicate(n.parent()))
        # A << B      A dominates B (A is an ancestor of B).
        elif operator == u'<<':
            retval = lambda n: (_istree(n) and
                                any(predicate(x) for x in _descendants(n)))
        # A >> B      A is dominated by B (A is a descendant of B).
        elif operator == u'>>':
            retval = lambda n: any(predicate(x) for x in ancestors(n))
        # A <<, B     B is a left-most descendant of A.
        elif operator == u'<<,' or operator == u'<<1':
            retval = lambda n: (_istree(n) and
                                any(predicate(x)
                                    for x in _leftmost_descendants(n)))
        # A >>, B     A is a left-most descendant of B.
        elif operator == u'>>,':
            retval = lambda n: any((predicate(x) and
                                    n in _leftmost_descendants(x))
                                   for x in ancestors(n))
        # A <<' B     B is a right-most descendant of A.
        elif operator == u'<<\'':
            retval = lambda n: (_istree(n) and
                                any(predicate(x)
                                    for x in _rightmost_descendants(n)))
        # A >>' B     A is a right-most descendant of B.
        elif operator == u'>>\'':
            retval = lambda n: any((predicate(x) and
                                    n in _rightmost_descendants(x))
                                   for x in ancestors(n))
        # A <<: B     There is a single path of descent from A and B is on it.
        elif operator == u'<<:':
            retval = lambda n: (_istree(n) and
                                any(predicate(x)
                                    for x in _unique_descendants(n)))
        # A >>: B     There is a single path of descent from B and A is on it.
        elif operator == u'>>:':
            retval = lambda n: any(predicate(x) for x in unique_ancestors(n))
        # A . B       A immediately precedes B.
        elif operator == u'.':
            retval = lambda n: any(predicate(x)
                                   for x in _immediately_after(n))
        # A , B       A immediately follows B.
        elif operator == u',':
            retval = lambda n: any(predicate(x)
                                   for x in _immediately_before(n))
        # A .. B      A precedes B.
        elif operator == u'..':
            retval = lambda n: any(predicate(x) for x in _after(n))
        # A ,, B      A follows B.
        elif operator == u',,':
            retval = lambda n: any(predicate(x) for x in _before(n))
        # A $ B       A is a sister of B (and A != B).
        elif operator == u'$' or operator == u'%':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                any(predicate(x)
                                    for x in n.parent() if x is not n))
        # A $. B      A is a sister of and immediately precedes B.
        elif operator == u'$.' or operator == u'%.':
            retval = lambda n: (hasattr(n, u'right_sibling') and
                                bool(n.right_sibling()) and
                                predicate(n.right_sibling()))
        # A $, B      A is a sister of and immediately follows B.
        elif operator == u'$,' or operator == u'%,':
            retval = lambda n: (hasattr(n, u'left_sibling') and
                                bool(n.left_sibling()) and
                                predicate(n.left_sibling()))
        # A $.. B     A is a sister of and precedes B.
        elif operator == u'$..' or operator == u'%..':
            retval = lambda n: (hasattr(n, u'parent') and
                                hasattr(n, u'parent_index') and
                                bool(n.parent()) and
                                any(predicate(x) for x in
                                    n.parent()[n.parent_index() + 1:]))
        # A $,, B     A is a sister of and follows B.
        elif operator == u'$,,' or operator == u'%,,':
            retval = lambda n: (hasattr(n, u'parent') and
                                hasattr(n, u'parent_index') and
                                bool(n.parent()) and
                                any(predicate(x) for x in
                                    n.parent()[:n.parent_index()]))
        else:
            assert False, u'cannot interpret tgrep operator "{0}"'.format(
                operator)
    # now return the built function
    if negated:
        return (lambda r: (lambda n: not r(n)))(retval)
    else:
        return retval
455 | - | |
456 | -def _tgrep_rel_conjunction_action(_s, _l, tokens): | |
457 | - ''' | |
458 | - Builds a lambda function representing a predicate on a tree node | |
459 | - from the conjunction of several other such lambda functions. | |
460 | - ''' | |
461 | - # filter out the ampersand | |
462 | - tokens = [x for x in tokens if x != u'&'] | |
463 | - # print 'relation conjunction tokens: ', tokens | |
464 | - if len(tokens) == 1: | |
465 | - return tokens[0] | |
466 | - elif len(tokens) == 2: | |
467 | - return (lambda a, b: lambda n: a(n) and b(n))(tokens[0], tokens[1]) | |
468 | - | |
469 | -def _tgrep_rel_disjunction_action(_s, _l, tokens): | |
470 | - ''' | |
471 | - Builds a lambda function representing a predicate on a tree node | |
472 | - from the disjunction of several other such lambda functions. | |
473 | - ''' | |
474 | - # filter out the pipe | |
475 | - tokens = [x for x in tokens if x != u'|'] | |
476 | - # print 'relation disjunction tokens: ', tokens | |
477 | - if len(tokens) == 1: | |
478 | - return tokens[0] | |
479 | - elif len(tokens) == 2: | |
480 | - return (lambda a, b: lambda n: a(n) or b(n))(tokens[0], tokens[1]) | |
481 | - | |
def _build_tgrep_parser(set_parse_actions = True):
    '''
    Builds a pyparsing-based parser object for tokenizing and
    interpreting tgrep search strings.

    :param set_parse_actions: when True, attach the ``_tgrep_*_action``
        parse actions so that parsing yields a compiled predicate; when
        False, the parser only tokenizes (used by ``tgrep_tokenize``).
    :return: the ``tgrep_expr`` pyparsing element for a full expression.
    '''
    # A relation operator: optional '!' negation followed by an operator
    # spelled from the tgrep punctuation set (e.g. '<<,', '>>:', '$..').
    tgrep_op = (pyparsing.Optional(u'!') +
                pyparsing.Regex(u'[$%,.<>][%,.<>0-9-\':]*'))
    # Node labels come in three forms: a double-quoted string literal,
    tgrep_qstring = pyparsing.QuotedString(quoteChar=u'"', escChar=u'\\',
                                           unquoteResults=False)
    # a /regex/ delimited by slashes,
    tgrep_node_regex = pyparsing.QuotedString(quoteChar=u'/', escChar=u'\\',
                                              unquoteResults=False)
    # or a bare literal: any run of characters free of tgrep punctuation
    # and whitespace.
    tgrep_node_literal = pyparsing.Regex(u'[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
    # Forward declarations for the mutually recursive grammar rules.
    tgrep_expr = pyparsing.Forward()
    tgrep_relations = pyparsing.Forward()
    tgrep_parens = pyparsing.Literal(u'(') + tgrep_expr + u')'
    # 'N(i,j,...)' selects a node by explicit NLTK tree position; the
    # nested Optionals also admit 'N()' and a trailing comma.
    tgrep_nltk_tree_pos = (
        pyparsing.Literal(u'N(') +
        pyparsing.Optional(pyparsing.Word(pyparsing.nums) + u',' +
                           pyparsing.Optional(pyparsing.delimitedList(
                    pyparsing.Word(pyparsing.nums), delim=u',') +
                                              pyparsing.Optional(u','))) + u')')
    # A node expression is any label form above, or '*' (match any node).
    tgrep_node_expr = (tgrep_qstring |
                       tgrep_node_regex |
                       u'*' |
                       tgrep_node_literal)
    # A node is a parenthesized subexpression, an explicit tree position,
    # or a (possibly "'"-prefixed) node expression with '|' alternatives.
    tgrep_node = (tgrep_parens |
                  tgrep_nltk_tree_pos |
                  (pyparsing.Optional(u"'") +
                   tgrep_node_expr +
                   pyparsing.ZeroOrMore(u"|" + tgrep_node_expr)))
    # NOTE(review): this Forward is immediately shadowed by the
    # reassignment two lines below; it is never resolved with '<<'.
    tgrep_relation = pyparsing.Forward()
    # '[...]' groups relations; a leading '!' negates the whole group.
    tgrep_brackets = pyparsing.Optional(u'!') + u'[' + tgrep_relations + u']'
    tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node
    # Conjunction binds tighter than disjunction; the '&' between
    # conjoined relations is optional (juxtaposition also conjoins).
    tgrep_rel_conjunction = pyparsing.Forward()
    tgrep_rel_conjunction << (tgrep_relation +
                              pyparsing.ZeroOrMore(pyparsing.Optional(u'&') +
                                                   tgrep_rel_conjunction))
    # Disjunction of conjunctions, separated by '|'.
    tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
        u"|" + tgrep_relations)
    # A full expression is a node followed by optional relations on it.
    tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
    if set_parse_actions:
        tgrep_node.setParseAction(_tgrep_node_action)
        tgrep_parens.setParseAction(_tgrep_parens_action)
        tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
        tgrep_relation.setParseAction(_tgrep_relation_action)
        tgrep_rel_conjunction.setParseAction(_tgrep_rel_conjunction_action)
        tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
        # the whole expression is also the conjunction of two
        # predicates: the first node predicate, and the remaining
        # relation predicates
        tgrep_expr.setParseAction(_tgrep_rel_conjunction_action)
    return tgrep_expr
534 | - | |
def tgrep_tokenize(tgrep_string):
    '''
    Tokenizes a TGrep search string into separate tokens.

    Accepts either ``str`` or ``bytes`` input; bytes are decoded first.
    '''
    # Normalize bytes input to text before handing it to the parser.
    if isinstance(tgrep_string, bytes):
        tgrep_string = tgrep_string.decode()
    # A parser built without parse actions only splits into tokens.
    tokenizer = _build_tgrep_parser(False)
    return list(tokenizer.parseString(tgrep_string))
543 | - | |
def tgrep_compile(tgrep_string):
    '''
    Parses (and tokenizes, if necessary) a TGrep search string into a
    lambda function.

    Accepts either ``str`` or ``bytes`` input; bytes are decoded first.
    '''
    # Normalize bytes input to text before handing it to the parser.
    if isinstance(tgrep_string, bytes):
        tgrep_string = tgrep_string.decode()
    # With parse actions attached, parsing produces compiled predicates;
    # parseAll=True rejects trailing garbage after the expression.
    compiler = _build_tgrep_parser(True)
    results = compiler.parseString(tgrep_string, parseAll=True)
    return list(results)[0]
553 | - | |
def treepositions_no_leaves(tree):
    '''
    Returns all the tree positions in the given tree which are not
    leaf nodes.

    A position is a leaf exactly when it is not a proper prefix of any
    other position returned by ``tree.treepositions()``.
    '''
    positions = tree.treepositions()
    # Collect every proper prefix of every position; a position that
    # appears here has descendants and is therefore an internal node.
    internal = {pos[:cut] for pos in positions for cut in range(len(pos))}
    return [pos for pos in positions if pos in internal]
567 | - | |
def tgrep_positions(tree, tgrep_string, search_leaves = True):
    '''
    Return all tree positions in the given tree which match the given
    `tgrep_string`.

    If `search_leaves` is False, the method will not return any
    results in leaf positions.

    ``tgrep_string`` may be a string/bytes pattern (compiled here) or an
    already-compiled predicate callable.
    '''
    # Objects without treepositions() are not trees; match nothing.
    try:
        if search_leaves:
            candidates = tree.treepositions()
        else:
            candidates = treepositions_no_leaves(tree)
    except AttributeError:
        return []
    # Compile lazily so callers may pass a prebuilt predicate.
    if isinstance(tgrep_string, (bytes, str)):
        predicate = tgrep_compile(tgrep_string)
    else:
        predicate = tgrep_string
    return [pos for pos in candidates if predicate(tree[pos])]
587 | - | |
def tgrep_nodes(tree, tgrep_string, search_leaves = True):
    '''
    Return all tree nodes in the given tree which match the given
    `tgrep_string`.

    If `search_leaves` is False, the method will not return any
    results in leaf positions.
    '''
    # Resolve matching positions first, then index each one in the tree.
    matches = tgrep_positions(tree, tgrep_string, search_leaves)
    return [tree[pos] for pos in matches]