Commit 8630d9ec982edca0b1c9402348adb9c94c1ad276

Authored by Erickson Silva
1 parent f367a715
Exists in master and in 1 other branch devel

Remove tgrep e adiciona como dependencia

src/new/AplicaRegras.py
@@ -1,336 +0,0 @@ @@ -1,336 +0,0 @@
1 -#!/usr/bin/python  
2 -# -*- coding: utf-8 -*-  
3 -  
4 -#Autor: Erickson Silva  
5 -#Email: <erickson.silva@lavid.ufpb.br> <ericksonsilva@live.com>  
6 -  
7 -#LAViD - Laboratório de Aplicações de Vídeo Digital  
8 -  
9 -from collections import deque  
10 -import xml.etree.ElementTree as ET  
11 -from os.path import expanduser  
12 -import platform  
13 -from LerDicionarios import *  
14 -from Iterator import *  
15 -from StringAux import *  
16 -from ConverteExtenso import *  
17 -  
class AplicaRegras(object):
    '''
    Applies translation rules to tagged sentences.

    Rules are read from ``~/vlibras-translate/data/regras.xml``; word
    lookups (infinitives, ignored words, common-gender nouns, verb
    tenses, ...) are delegated to ``LeitorDicionarios``.  Tokens are
    handled as ``[word, tag]`` pairs throughout.
    '''

    # Load the XML rule tree and the dictionary helpers once.
    def __init__(self):

        self.__root = self.getRoot()
        self.__dicionarios = LeitorDicionarios()

    def getRoot(self):
        '''Parse regras.xml (path differs per OS) and return its root XML element.'''
        so = platform.system()
        if so == 'Windows':
            return ET.parse(expanduser("~")+'\\vlibras-translate\data\\regras.xml').getroot()
        else:
            return ET.parse(expanduser("~")+'/vlibras-translate/data/regras.xml').getroot()

    def aplicarRegrasMorfo(self, lista):
        '''
        Apply the <morphological> rules of regras.xml to `lista` (a list
        of [word, tag] pairs).  Returns the transformed token list, or
        `lista` unchanged when no rule ever produced output.
        '''
        # <specific> tag -> handler method; the "zero"-valued keys are
        # placeholders for handlers that do not exist yet
        self.__especificos = {"advt" : self.verificarAdvTempo, "v" : self.verificarVbInfinitivo, "x" : self.verificarPrepos, "c" : self.verificarSubs2Generos, "a" : self.verificarArtigo, "l" : self.verificarVbLigacao, "i": self.verificarAdvIntensidade, "vbi":"zero", "n":"zero", "abmn":"zero", "adji":"zero", "adjn":"zero", "advi":"zero"}
        self.pularIteracoes = 0          # iterations to skip after a multi-token rule fired
        self.__tAux = []                 # output accumulator
        it = Iterator()
        it.load(lista)

        while(it.hasNext()):
            if self.pularIteracoes > 0:
                self.pularIteracoes-=1
                # NOTE(review): assumes hasNext()/the getters advance the
                # iterator position; if they do not, this `continue`
                # never moves forward -- confirm against Iterator.
                continue

            for morpho in self.__root.findall('morphological'):
                self.hasRule = False
                for rule in morpho.findall('rule'): # walk every <rule> element
                    # candidate rule: active, and its first tag matches the current token's tag
                    if rule.find('active').text == "true" and rule.get('name').split("_")[0] == it.getAtualT():
                        count = int(rule.find('count').text)
                        self.listaIter = []
                        if count == 1:
                            self.listaIter = [it.getToken()]
                        else:
                            try:
                                self.listaIter = it.getInterval(count)
                                self.pularIteracoes = count-1
                            except:
                                # not enough tokens left for this rule's window
                                continue

                        self.nomeRegra = self.gerarNomeDaRegra(self.listaIter)
                        if rule.get('name') == self.nomeRegra: # full tag sequence matches: apply the rule
                            print "Achou regra: " + self.nomeRegra
                            #subIter = Iterator()
                            #subIter.load(self.listaIter)
                            #while(subIter.hasNext()):
                            self.hasRule = True
                            self.listaTmp = count * [None]   # slots for reordered tokens
                            self.countListaIter = -1
                            for classe in rule.iter('class'): # one <class> element per token in the window
                                title = classe.find('title')
                                newpos = classe.find('newpos')
                                newprop = classe.find('newprop')
                                newtoken = classe.find('newtoken')
                                newtokenpos = classe.find('newtokenpos')
                                self.specific = classe.find('specific')

                                self.countListaIter += 1
                                token = self.listaIter[self.countListaIter]

                                if self.specific is not None:
                                    # run the specific handler on the word
                                    # NOTE(review): "advt" maps to verificarAdvTempo,
                                    # which iterates its argument as a token list but
                                    # receives a single word here -- confirm intended.
                                    self.specific = self.__especificos[self.specific.text](token[0])
                                    if newprop is not None and type(self.specific) != bool:
                                        self.__tAux.append([self.specific,newprop.text])

                                if newpos is not None:
                                    if newpos.text != "-1":   # newpos == -1 drops the token
                                        if type(self.specific) == bool:
                                            self.listaTmp[int(newpos.text)] = token
                                        else:
                                            self.__tAux.append([self.specific, title.text])

                                if newtoken is not None:
                                    self.listaTmp[int(newtokenpos.text)] = [newtoken.text, "NEWTOKEN"]

                            # drop unfilled slots, then flush to the output
                            self.listaTmp = filter(None, self.listaTmp)
                            for i in self.listaTmp:
                                self.__tAux.append(i)

                            break

            # no rule matched this token: keep the original pair
            if (self.hasRule == False): self.__tAux.append(it.getToken())
        if self.__tAux: return self.__tAux
        return lista # unchanged: no rule exists for this sentence


    def gerarNomeDaRegra(self, lista):
        '''Join the tags of `lista` with underscores, e.g. "ADV_VB-P".'''
        self.__nomeRegra = []
        for t in lista:
            self.__nomeRegra.append(t[1])
        return "_".join(self.__nomeRegra)

    def verificarAdvTempo(self, lista):
        '''Return True when any ADV* token of `lista` is a temporal adverb.'''
        for i in lista:
            if i[1][:3] == "ADV":
                if (self.__dicionarios.hasTempoVerbal(i[0])):
                    return True
        return False

    def verificarVbInfinitivo(self, token):
        '''Return the infinitive form of `token` if the dictionary has one, else False.'''
        if self.__dicionarios.hasVerboInfinitivo(token): # does an infinitive exist for this token?
            return self.__dicionarios.getVerboInfinitivo(token)
        return False

    #TODO: not implemented yet
    def verificarPrepos(self, token):
        return None

    def verificarSubs2Generos(self, token):
        '''True when `token` is a common-gender noun (same form for both genders).'''
        return self.__dicionarios.hasSubst2Genero(token)

    #TODO: not implemented yet
    def verificarArtigo(self, token):
        return None

    #TODO: not implemented yet
    def verificarVbLigacao(self, token):
        return None

    #TODO: not implemented yet
    def verificarAdvIntensidade(self, token):
        return None

    # strips articles/prepositions, reduces verbs to the infinitive and
    # checks synonyms / gender / number / tense
    def inicializar(self, texto):
        '''
        Pre-process a tagged sentence (`texto`: list of [word, tag]
        pairs): drop ignored words, normalise verbs, flag common-gender
        nouns, then run the tense/plural/number post-passes.
        '''
        it = Iterator()
        it.load(texto)
        self.__ts = []
        self.__verb = False      # saw a tensed verb
        self.__adv = False       # saw a temporal adverb
        self.__num = False       # saw a numeral
        self.__plural = False    # saw a plural tag
        self.__countVerb = 0
        self.__countAdv = 0
        while(it.hasNext()):
            token = it.getAtualW()
            tag = it.getAtualT()
            self.__b = False     # did anything rewrite this token?

            if self.__dicionarios.hasPalavraIgnorada(tag) == False: # skip articles/prepositions

                if tag == "NUM":
                    self.__num = True

                if tag[-2:] == "-P":
                    self.__plural = True

                # temporal adverb? remember it for the tense pass below
                if tag[:3] == "ADV":
                    if (self.__dicionarios.hasTempoVerbal(token)):
                        self.__adv = True

                if tag[:2] == "VB":

                    # verb with a known infinitive: emit the infinitive instead
                    if self.__dicionarios.hasVerboInfinitivo(token):
                        verboInfinitivo = self.__dicionarios.getVerboInfinitivo(token)
                        self.__ts.append([verboInfinitivo,tag])
                        self.__b = True

                    # tensed verb: count it for verbalAnalysis below
                    if tag == "VB-P" or tag == "VB-D" or tag == "VB-R":
                        self.__verb = True
                        self.__countVerb += 1

                # common-gender noun: prefix HOMEM/MULHER based on the
                # gender suffix of the preceding (ignored) word's tag
                if self.__dicionarios.hasSubst2Genero(token):
                    #del self.__ts[-1]
                    lenTicket = len(it.getAntT())
                    if ((self.__dicionarios.hasPalavraIgnorada(it.getAntT())) and (it.getAntT()[lenTicket-1:] == "F") or (it.getAntT()[lenTicket-3:] == "F-P")):
                        self.__ts.append(["MULHER ", "2GEN"])
                        self.__ts.append([token,tag])
                    else:
                        self.__ts.append(["HOMEM ", "2GEN"])
                        self.__ts.append([token,tag])
                    self.__b = True

                # nothing rewrote the token: keep the original pair
                if self.__b == False:
                    self.__ts.append([token,tag])

        # tensed verb present but no temporal adverb: add an explicit tense marker
        if self.__verb == True and self.__adv == False:
            self.__ts = self.verbalAnalysis(self.__ts)

        # undo plural inflection
        if self.__plural:
            self.__ts = self.hasPlural(self.__ts)

        # spelled-out numbers -> digits
        if self.__num: return self.converteExtenso(self.__ts)

        return self.__ts


    # roman numeral -> integer; returns the tag untouched when conversion fails
    def auxConvert(self, tag):
        try:
            return roman_to_int(tag)
        except:
            return tag

    def verbalAnalysis(self, lista):
        '''
        Append a FUTURO/PASSADO tense marker after the last tensed verb
        (earlier tensed verbs are dropped, only the final one is kept).
        A lone present-tense verb (VB-P with __countVerb == 1) leaves
        the sentence unchanged.
        '''
        lv = []
        it = Iterator()
        it.load(lista)
        hasFut = False
        hasPas = False
        count = 0
        while(it.hasNext()):
            token = it.getAtualW().upper()
            tag = it.getAtualT()

            if(tag == "VB-P"):
                if (self.__countVerb > 1):
                    count += 1
                    #print "VB-P: Incrementou"
                    if(count == self.__countVerb):
                        #print "VB-P Adicionou " + token
                        lv.append([token,tag])
                else:
                    #print "VB-P: retornou lista original"
                    it.reset()
                    return lista
            elif(tag == "VB-D"):
                count += 1
                hasPas = True
                #print "VB-D: Incrementou"
                if(count == self.__countVerb):
                    #print "VB-D Adicionou " + token
                    lv.append([token,tag])
            elif(tag == "VB-R"):
                count += 1
                hasFut = True
                #print "VB-R: Incrementou"
                if(count == self.__countVerb):
                    #print "VB-R Adicionou " + token
                    lv.append([token,tag])
            else:
                lv.append([token,tag])
        # future wins over past when both occurred
        if (hasFut):
            lv.append(["FUTURO", "T-VB"])
        elif (hasPas):
            lv.append(["PASSADO", "T-VB"])
        it.reset()
        return lv


    def hasPlural(self, lista):
        '''Singularise every token whose tag carries the plural suffix "-P" (mutates in place).'''
        tmp = lista
        for e in tmp:
            if e[1][-2:] == "-P":
                e[0] = self.analisarPlural(e[0])

        return tmp


    def analisarPlural(self, word):
        '''
        Heuristically reduce an uppercase Portuguese plural `word` to a
        singular form, e.g. CORACOES -> CORACAO, ANIMAIS -> ANIMAL.
        '''
        # NOTE(review): the AES/AOS branches match two characters but cut
        # three below -- confirm this is the intended behavior.
        if(word[-3:] == "OES" or word[-2:] == "AES" or word[-2:] == "AOS"):
            return word[0:-3]+"AO"
        elif(word[-3:] == "RES" or word[-2:] == "ZES" or word[-2:] == "NES"):
            return word[0:-2]
        elif(word[-3:] == "SES"):
            #TODO: some words change their root spelling in the singular, e.g. Gás -> Gases
            return word[0:-2]
        elif(word[-2:] == "NS"):
            return word[0:-2]+"M"
        elif(word[-3:] == "EIS"):
            return word[0:-3]+"IL"
        elif(word[-2:] == "IS"):
            if(word[-3] == "A" or word[-3] == "E" or word[-3] == "O" or word[-3] == "U"):
                return word[0:-2]+"L"
            else:
                return word
        elif(word[-1] == "S"):
            #TODO: paroxytone/proparoxytone words ending in S, e.g. lápis, vírus, ônibus
            return word[0:-1]
        else:
            return word


    def converteExtenso(self, lista):
        '''
        Collapse runs of spelled-out number tokens (tag NUM), possibly
        joined by a CONJ standing between two NUMs, into a single
        numeric token produced by ``extenso``.
        '''
        listAux = []     # one [start index, [number words]] entry per run
        indexDel = []    # positions merged into a run, to delete afterwards
        count = 0
        isRunning = False

        for i in range(0, len(lista)):
            token = lista[i][0]
            tag = lista[i][1]
            if (tag == "NUM"):
                if (isRunning == False and len(listAux) == count):
                    listAux.append([i,[token]])
                    isRunning = True
                else:
                    listAux[count][1].append(token)
                    indexDel.append(i)
            elif (isRunning == True):
                # a conjunction between two numerals stays inside the run
                # NOTE(review): lista[i+1] raises IndexError when a CONJ is
                # the final token of the sentence -- confirm inputs.
                if ((lista[i-1][1] == "NUM") and (lista[i+1][1] == "NUM") and (tag == "CONJ")):
                    indexDel.append(i)
                else:
                    isRunning = False
                    count += 1

        # replace the first token of each run with the converted number
        for i in listAux:
            ext = extenso(' '.join(i[1]))
            lista[i[0]] = [ext, "NUM"]

        # pop the merged positions right-to-left; the zero-length deque
        # just drives the generator, discarding the popped values
        deque((list.pop(lista, i) for i in sorted(indexDel, reverse=True)), maxlen=0)

        return lista
src/new/TraduzSentencas.py
@@ -1,42 +0,0 @@ @@ -1,42 +0,0 @@
1 -#!/usr/bin/python  
2 -# -*- coding: utf-8 -*-  
3 -  
4 -#Autor: Erickson Silva  
5 -#Email: <erickson.silva@lavid.ufpb.br> <ericksonsilva@live.com>  
6 -  
7 -#LAViD - Laboratório de Aplicações de Vídeo Digital  
8 -  
9 -import alexp  
10 -from AplicaSinonimos import *  
11 -from AplicaRegras import *  
12 -  
13 -  
def iniciar_traducao(texto):
    '''
    Split `texto` into sentences at '.' and translate each non-empty
    one, returning the translations joined by single spaces.  Returns
    "" when the join fails (e.g. a translation was not a string).
    '''
    traduzidas = []
    for trecho in texto.split("."):
        if len(trecho) > 0 and trecho != " ":
            traduzidas.append(gerar_analise(trecho))
    try:
        return " ".join(traduzidas)
    except:
        return ""
24 -  
def gerar_analise(sentenca):
    '''
    Parse `sentenca` and post-process the analysis.

    Runs the syntactic parser (alexp.run); when it fails -- either by
    returning None or by raising ValueError on accented input -- falls
    back to the morphological analysis and applies the morphological
    rules.  Otherwise the syntactic rules are applied to the parse
    tree.  Synonym substitution is applied to the result in both cases.
    '''
    sinonimos = AplicaSinonimos()
    regras = AplicaRegras()

    try:
        analise = alexp.run(sentenca)
    except ValueError:
        # TODO: allow accented characters in the sentence
        analise = None

    if analise is None:
        morfologica = alexp.getAnaliseMorfologica()
        analise = regras.aplicar_regras_morfo(morfologica)
    else:
        # Bug fix: the original passed the undefined name
        # `arvoreSintatica` (NameError); the parse tree is `analise`.
        analise = regras.aplicar_regras_sint(analise)

    return sinonimos.aplicar_sinonimos(analise)
43 \ No newline at end of file 0 \ No newline at end of file
src/new/tgrep.py
@@ -1,597 +0,0 @@ @@ -1,597 +0,0 @@
1 -#!/usr/bin/env python  
2 -# -*- coding: utf-8 -*-  
3 -#  
4 -# Permission is hereby granted, free of charge, to any person  
5 -# obtaining a copy of this software and associated documentation files  
6 -# (the "Software"), to deal in the Software without restriction,  
7 -# including without limitation the rights to use, copy, modify, merge,  
8 -# publish, distribute, sublicense, and/or sell copies of the Software,  
9 -# and to permit persons to whom the Software is furnished to do so,  
10 -# subject to the following conditions:  
11 -#  
12 -# The above copyright notice and this permission notice shall be  
13 -# included in all copies or substantial portions of the Software.  
14 -#  
15 -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,  
16 -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF  
17 -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND  
18 -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS  
19 -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN  
20 -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN  
21 -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE  
22 -# SOFTWARE.  
23 -  
24 -'''  
25 -TGrep search implementation for NTLK trees.  
26 -  
27 -(c) 16 March, 2013 Will Roberts <wildwilhelm@gmail.com>.  
28 -  
29 -This module supports TGrep2 syntax for matching parts of NLTK Trees.  
30 -Note that many tgrep operators require the tree passed to be a  
31 -ParentedTree.  
32 -  
33 -Tgrep tutorial:  
34 -http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html  
35 -Tgrep2 manual:  
36 -http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf  
37 -Tgrep2 source:  
38 -http://tedlab.mit.edu/~dr/Tgrep2/  
39 -'''  
40 -  
41 -from builtins import bytes, range, str  
42 -import nltk.tree  
43 -import pyparsing  
44 -import re  
45 -  
def ancestors(node):
    '''
    Return every node dominating `node`, ordered from the immediate
    parent up to the root.  Leaf nodes cannot report a parent, so an
    empty list is returned for them.
    '''
    chain = []
    try:
        walker = node.parent()
    except AttributeError:
        # leaves have no parent() method
        return chain
    while walker:
        chain.append(walker)
        walker = walker.parent()
    return chain
62 -  
def unique_ancestors(node):
    '''
    Return the chain of nodes dominating `node` along which there is
    only a single path of descent (each ancestor has exactly one
    child).  The chain stops at the first branching ancestor.
    '''
    chain = []
    try:
        walker = node.parent()
    except AttributeError:
        # leaves have no parent() method
        return chain
    while walker and len(walker) == 1:
        chain.append(walker)
        walker = walker.parent()
    return chain
78 -  
def _descendants(node):
    '''
    Return every node dominated by `node`, in tree-position order.
    Objects without a ``treepositions`` method (leaves) yield [].
    '''
    try:
        positions = node.treepositions()
    except AttributeError:
        return []
    # position () is the node itself, so skip it
    return [node[p] for p in positions[1:]]
89 -  
def _leftmost_descendants(node):
    '''
    Return the nodes reachable from `node` by following only the
    left-most (index 0) branch at every level.
    '''
    try:
        positions = node.treepositions()
    except AttributeError:
        return []
    # an all-zero tree position means the first child was taken at every step
    return [node[p] for p in positions[1:] if all(step == 0 for step in p)]
100 -  
def _rightmost_descendants(node):
    '''
    Return the nodes on the right edge of `node`: every prefix of the
    maximal tree position, from `node`'s last child down to the last
    leaf.
    '''
    try:
        deepest = max(node.treepositions())
    except AttributeError:
        return []
    return [node[deepest[:depth]] for depth in range(1, len(deepest) + 1)]
111 -  
def _istree(obj):
    '''Predicate to check whether `obj` is a nltk.tree.Tree (i.e. a non-leaf node).'''
    return isinstance(obj, nltk.tree.Tree)
115 -  
def _unique_descendants(node):
    '''
    Return the chain of nodes below `node` reachable while every level
    has exactly one child (a single path of descent), ending at the
    first branching node or leaf.
    '''
    chain = []
    cursor = node
    while cursor and _istree(cursor) and len(cursor) == 1:
        cursor = cursor[0]
        chain.append(cursor)
    return chain
127 -  
def _before(node):
    '''
    Return every node in the tree whose material lies entirely to the
    left of `node` (its span ends before `node` begins).
    '''
    try:
        pos = node.treeposition()
        root = node.root()
    except AttributeError:
        return []
    return [root[p] for p in root.treepositions()
            if p[:len(pos)] < pos[:len(p)]]
139 -  
def _immediately_before(node):
    '''
    Returns the set of all nodes that are immediately before the given
    node.

    Tree node A immediately precedes node B if the last terminal
    symbol (word) produced by A immediately precedes the first
    terminal symbol produced by B.
    '''
    try:
        pos = node.treeposition()
        tree = node.root()
    except AttributeError:
        # leaf nodes carry neither a tree position nor a root pointer
        return []
    # go "upwards" from pos until there is a place we can go to the left
    idx = len(pos) - 1
    while 0 <= idx and pos[idx] == 0:
        idx -= 1
    if idx < 0:
        # node sits on the left-most path of the tree: nothing precedes it
        return []
    pos = list(pos[:idx + 1])
    pos[-1] -= 1
    before = tree[pos]
    # the preceding sibling subtree plus everything on its right edge
    # ends exactly where `node` begins
    return [before] + _rightmost_descendants(before)
164 -  
def _after(node):
    '''
    Return every node in the tree whose material lies entirely to the
    right of `node` (its span starts after `node` ends).
    '''
    try:
        pos = node.treeposition()
        root = node.root()
    except AttributeError:
        return []
    return [root[p] for p in root.treepositions()
            if p[:len(pos)] > pos[:len(p)]]
176 -  
def _immediately_after(node):
    '''
    Returns the set of all nodes that are immediately after the given
    node.

    Tree node A immediately follows node B if the first terminal
    symbol (word) produced by A immediately follows the last
    terminal symbol produced by B.
    '''
    try:
        pos = node.treeposition()
        tree = node.root()
        current = node.parent()
    except AttributeError:
        # leaf nodes: treeposition/root/parent are unavailable
        return []
    # go "upwards" from pos until there is a place we can go to the
    # right
    idx = len(pos) - 1
    while 0 <= idx and pos[idx] == len(current) - 1:
        idx -= 1
        current = current.parent()
    if idx < 0:
        # node sits on the right-most path of the tree: nothing follows it
        return []
    pos = list(pos[:idx + 1])
    pos[-1] += 1
    after = tree[pos]
    # the following sibling subtree plus everything on its left edge
    # begins exactly where `node` ends
    return [after] + _leftmost_descendants(after)
204 -  
def _tgrep_node_literal_value(node):
    '''
    String used when comparing a parse-tree node against tgrep node
    literals: the label for tree nodes, the token itself for leaves.
    '''
    if _istree(node):
        return node.label()
    return str(node)
211 -  
def _tgrep_node_action(_s, _l, tokens):
    '''
    Builds a lambda function representing a predicate on a tree node
    depending on the name of its node.

    Handles disjunctions (``A|B``), wildcards (``*``/``__``), quoted
    literals, ``/regex/`` patterns and case-insensitive ``i@`` forms.
    '''
    # print 'node tokens: ', tokens
    if tokens[0] == u"'":
        # strip initial apostrophe (tgrep2 print command)
        tokens = tokens[1:]
    if len(tokens) > 1:
        # disjunctive definition of a node name
        assert list(set(tokens[1::2])) == [u'|']
        # recursively call self to interpret each node name definition
        tokens = [_tgrep_node_action(None, None, [node])
                  for node in tokens[::2]]
        # capture tokens and return the disjunction
        return (lambda t: lambda n: any(f(n) for f in t))(tokens)
    else:
        if hasattr(tokens[0], u'__call__'):
            # this is a previously interpreted parenthetical node
            # definition (lambda function)
            return tokens[0]
        elif tokens[0] == u'*' or tokens[0] == u'__':
            # wildcard: matches any node
            return lambda n: True
        elif tokens[0].startswith(u'"'):
            # exact match against a double-quoted literal
            return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip(u'"'))
        elif tokens[0].startswith(u'/'):
            # regular-expression match (re.match anchors at the start)
            return (lambda r: lambda n:
                    r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip(u'/')))
        elif tokens[0].startswith(u'i@'):
            # case-insensitive literal match
            return (lambda s: lambda n:
                    _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower())
        else:
            # bare node label: exact match
            return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0])
246 -  
def _tgrep_parens_action(_s, _l, tokens):
    '''
    Collapse a parsed parenthetical group ``( pred )`` down to the
    inner predicate (the lambda built for the wrapped expression).
    '''
    # print 'parenthetical tokens: ', tokens
    assert len(tokens) == 3
    opener, inner, closer = tokens
    assert opener == u'('
    assert closer == u')'
    return inner
257 -  
def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
    '''
    Builds a predicate that is true only for the node located at the
    specific tree position spelled out in the parsed ``N(...)`` form.
    '''
    # recover the position tuple from the parsed digit tokens
    wanted = tuple(int(tok) for tok in tokens if tok.isdigit())

    def _at_position(n):
        # closure captures the target tree position
        return hasattr(n, u'treeposition') and n.treeposition() == wanted

    return _at_position
269 -  
def _tgrep_relation_action(_s, _l, tokens):
    '''
    Builds a lambda function representing a predicate on a tree node
    depending on its relation to other nodes in the tree.

    `tokens` is one of ``['!', ...]`` (negation), ``['[', pred, ']']``
    (bracketed sub-expression) or ``[operator, predicate]`` where
    `predicate` tests the node on the far side of the tgrep operator.
    '''
    # print 'relation tokens: ', tokens
    # process negation first if needed
    negated = False
    if tokens[0] == u'!':
        negated = True
        tokens = tokens[1:]
    if tokens[0] == u'[':
        # process square-bracketed relation expressions
        assert len(tokens) == 3
        assert tokens[2] == u']'
        retval = tokens[1]
    else:
        # process operator-node relation expressions
        assert len(tokens) == 2
        operator, predicate = tokens
        # A < B       A is the parent of (immediately dominates) B.
        if operator == u'<':
            retval = lambda n: (_istree(n) and
                                any(predicate(x) for x in n))
        # A > B       A is the child of B.
        elif operator == u'>':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                predicate(n.parent()))
        # A <, B      Synonymous with A <1 B.
        elif operator == u'<,' or operator == u'<1':
            retval = lambda n: (_istree(n) and
                                bool(list(n)) and
                                predicate(n[0]))
        # A >, B      Synonymous with A >1 B.
        elif operator == u'>,' or operator == u'>1':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                (n is n.parent()[0]) and
                                predicate(n.parent()))
        # A <N B      B is the Nth child of A (the first child is <1).
        elif operator[0] == u'<' and operator[1:].isdigit():
            idx = int(operator[1:])
            # capture the index parameter (1-based in tgrep, 0-based here)
            retval = (lambda i: lambda n: (_istree(n) and
                                           bool(list(n)) and
                                           0 <= i < len(n) and
                                           predicate(n[i])))(idx - 1)
        # A >N B      A is the Nth child of B (the first child is >1).
        elif operator[0] == u'>' and operator[1:].isdigit():
            idx = int(operator[1:])
            # capture the index parameter
            retval = (lambda i: lambda n: (hasattr(n, u'parent') and
                                           bool(n.parent()) and
                                           0 <= i < len(n.parent()) and
                                           (n is n.parent()[i]) and
                                           predicate(n.parent())))(idx - 1)
        # A <' B      B is the last child of A (also synonymous with A <-1 B).
        # A <- B      B is the last child of A (synonymous with A <-1 B).
        elif operator == u'<\'' or operator == u'<-' or operator == u'<-1':
            retval = lambda n: (_istree(n) and bool(list(n))
                                and predicate(n[-1]))
        # A >' B      A is the last child of B (also synonymous with A >-1 B).
        # A >- B      A is the last child of B (synonymous with A >-1 B).
        elif operator == u'>\'' or operator == u'>-' or operator == u'>-1':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                (n is n.parent()[-1]) and
                                predicate(n.parent()))
        # A <-N B     B is the N th-to-last child of A (the last child is <-1).
        elif operator[:2] == u'<-' and operator[2:].isdigit():
            idx = -int(operator[2:])
            # capture the (negative) index parameter
            retval = (lambda i: lambda n: (_istree(n) and
                                           bool(list(n)) and
                                           0 <= (i + len(n)) < len(n) and
                                           predicate(n[i + len(n)])))(idx)
        # A >-N B     A is the N th-to-last child of B (the last child is >-1).
        elif operator[:2] == u'>-' and operator[2:].isdigit():
            idx = -int(operator[2:])
            # capture the (negative) index parameter
            retval = (lambda i: lambda n:
                          (hasattr(n, u'parent') and
                           bool(n.parent()) and
                           0 <= (i + len(n.parent())) < len(n.parent()) and
                           (n is n.parent()[i + len(n.parent())]) and
                           predicate(n.parent())))(idx)
        # A <: B      B is the only child of A
        elif operator == u'<:':
            retval = lambda n: (_istree(n) and
                                len(n) == 1 and
                                predicate(n[0]))
        # A >: B      A is the only child of B.
        elif operator == u'>:':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                len(n.parent()) == 1 and
                                predicate(n.parent()))
        # A << B      A dominates B (A is an ancestor of B).
        elif operator == u'<<':
            retval = lambda n: (_istree(n) and
                                any(predicate(x) for x in _descendants(n)))
        # A >> B      A is dominated by B (A is a descendant of B).
        elif operator == u'>>':
            retval = lambda n: any(predicate(x) for x in ancestors(n))
        # A <<, B     B is a left-most descendant of A.
        elif operator == u'<<,' or operator == u'<<1':
            retval = lambda n: (_istree(n) and
                                any(predicate(x)
                                    for x in _leftmost_descendants(n)))
        # A >>, B     A is a left-most descendant of B.
        elif operator == u'>>,':
            retval = lambda n: any((predicate(x) and
                                    n in _leftmost_descendants(x))
                                   for x in ancestors(n))
        # A <<' B     B is a right-most descendant of A.
        elif operator == u'<<\'':
            retval = lambda n: (_istree(n) and
                                any(predicate(x)
                                    for x in _rightmost_descendants(n)))
        # A >>' B     A is a right-most descendant of B.
        elif operator == u'>>\'':
            retval = lambda n: any((predicate(x) and
                                    n in _rightmost_descendants(x))
                                   for x in ancestors(n))
        # A <<: B     There is a single path of descent from A and B is on it.
        elif operator == u'<<:':
            retval = lambda n: (_istree(n) and
                                any(predicate(x)
                                    for x in _unique_descendants(n)))
        # A >>: B     There is a single path of descent from B and A is on it.
        elif operator == u'>>:':
            retval = lambda n: any(predicate(x) for x in unique_ancestors(n))
        # A . B       A immediately precedes B.
        elif operator == u'.':
            retval = lambda n: any(predicate(x)
                                   for x in _immediately_after(n))
        # A , B       A immediately follows B.
        elif operator == u',':
            retval = lambda n: any(predicate(x)
                                   for x in _immediately_before(n))
        # A .. B      A precedes B.
        elif operator == u'..':
            retval = lambda n: any(predicate(x) for x in _after(n))
        # A ,, B      A follows B.
        elif operator == u',,':
            retval = lambda n: any(predicate(x) for x in _before(n))
        # A $ B       A is a sister of B (and A != B).
        elif operator == u'$' or operator == u'%':
            retval = lambda n: (hasattr(n, u'parent') and
                                bool(n.parent()) and
                                any(predicate(x)
                                    for x in n.parent() if x is not n))
        # A $. B      A is a sister of and immediately precedes B.
        elif operator == u'$.' or operator == u'%.':
            retval = lambda n: (hasattr(n, u'right_sibling') and
                                bool(n.right_sibling()) and
                                predicate(n.right_sibling()))
        # A $, B      A is a sister of and immediately follows B.
        elif operator == u'$,' or operator == u'%,':
            retval = lambda n: (hasattr(n, u'left_sibling') and
                                bool(n.left_sibling()) and
                                predicate(n.left_sibling()))
        # A $.. B     A is a sister of and precedes B.
        elif operator == u'$..' or operator == u'%..':
            retval = lambda n: (hasattr(n, u'parent') and
                                hasattr(n, u'parent_index') and
                                bool(n.parent()) and
                                any(predicate(x) for x in
                                    n.parent()[n.parent_index() + 1:]))
        # A $,, B     A is a sister of and follows B.
        elif operator == u'$,,' or operator == u'%,,':
            retval = lambda n: (hasattr(n, u'parent') and
                                hasattr(n, u'parent_index') and
                                bool(n.parent()) and
                                any(predicate(x) for x in
                                    n.parent()[:n.parent_index()]))
        else:
            assert False, u'cannot interpret tgrep operator "{0}"'.format(
                operator)
    # now return the built function, wrapping it when negated
    if negated:
        return (lambda r: (lambda n: not r(n)))(retval)
    else:
        return retval
455 -  
def _tgrep_rel_conjunction_action(_s, _l, tokens):
    '''
    Combine parsed relation predicates joined by '&' (or plain
    adjacency) into one predicate that is their conjunction.
    '''
    # drop the explicit ampersand separators
    preds = [tok for tok in tokens if tok != u'&']
    # print 'relation conjunction tokens: ', preds
    if len(preds) == 1:
        return preds[0]
    elif len(preds) == 2:
        first, second = preds
        return lambda n: first(n) and second(n)
468 -  
def _tgrep_rel_disjunction_action(_s, _l, tokens):
    '''
    Combine parsed relation predicates joined by '|' into one
    predicate that is their disjunction.
    '''
    # drop the explicit pipe separators
    preds = [tok for tok in tokens if tok != u'|']
    # print 'relation disjunction tokens: ', preds
    if len(preds) == 1:
        return preds[0]
    elif len(preds) == 2:
        first, second = preds
        return lambda n: first(n) or second(n)
481 -  
def _build_tgrep_parser(set_parse_actions = True):
    '''
    Builds a pyparsing-based parser object for tokenizing and
    interpreting tgrep search strings.

    With `set_parse_actions` False the grammar only tokenizes; with it
    True each production is wired to the _tgrep_*_action interpreters
    so parsing yields a single predicate function.
    '''
    # an operator is an optional negation plus the tgrep operator glyphs
    tgrep_op = (pyparsing.Optional(u'!') +
                pyparsing.Regex(u'[$%,.<>][%,.<>0-9-\':]*'))
    # "literal" and /regex/ node forms keep their quotes for the actions
    tgrep_qstring = pyparsing.QuotedString(quoteChar=u'"', escChar=u'\\',
                                           unquoteResults=False)
    tgrep_node_regex = pyparsing.QuotedString(quoteChar=u'/', escChar=u'\\',
                                              unquoteResults=False)
    tgrep_node_literal = pyparsing.Regex(u'[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
    tgrep_expr = pyparsing.Forward()
    tgrep_relations = pyparsing.Forward()
    tgrep_parens = pyparsing.Literal(u'(') + tgrep_expr + u')'
    # N(i, j, ...) — match the node at an explicit tree position
    tgrep_nltk_tree_pos = (
        pyparsing.Literal(u'N(') +
        pyparsing.Optional(pyparsing.Word(pyparsing.nums) + u',' +
                           pyparsing.Optional(pyparsing.delimitedList(
                               pyparsing.Word(pyparsing.nums), delim=u',') +
                                              pyparsing.Optional(u','))) + u')')
    tgrep_node_expr = (tgrep_qstring |
                       tgrep_node_regex |
                       u'*' |
                       tgrep_node_literal)
    # a node: parenthesized expr, tree position, or a (possibly
    # quoted, possibly disjunctive) node expression
    tgrep_node = (tgrep_parens |
                  tgrep_nltk_tree_pos |
                  (pyparsing.Optional(u"'") +
                   tgrep_node_expr +
                   pyparsing.ZeroOrMore(u"|" + tgrep_node_expr)))
    # NOTE(review): this Forward is immediately overwritten by the
    # assignment below, so it is never resolved -- confirm intended.
    tgrep_relation = pyparsing.Forward()
    tgrep_brackets = pyparsing.Optional(u'!') + u'[' + tgrep_relations + u']'
    tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node
    tgrep_rel_conjunction = pyparsing.Forward()
    tgrep_rel_conjunction << (tgrep_relation +
                              pyparsing.ZeroOrMore(pyparsing.Optional(u'&') +
                                                   tgrep_rel_conjunction))
    tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
        u"|" + tgrep_relations)
    tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
    if set_parse_actions:
        tgrep_node.setParseAction(_tgrep_node_action)
        tgrep_parens.setParseAction(_tgrep_parens_action)
        tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
        tgrep_relation.setParseAction(_tgrep_relation_action)
        tgrep_rel_conjunction.setParseAction(_tgrep_rel_conjunction_action)
        tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
        # the whole expression is also the conjunction of two
        # predicates: the first node predicate, and the remaining
        # relation predicates
        tgrep_expr.setParseAction(_tgrep_rel_conjunction_action)
    return tgrep_expr
534 -  
def tgrep_tokenize(tgrep_string):
    '''
    Splits a TGrep search string into its component tokens.

    :param tgrep_string: the query, as ``str`` or ``bytes``.
    :return: a list of raw tokens, in source order.
    '''
    # Normalize bytes input to text before handing it to pyparsing.
    if isinstance(tgrep_string, bytes):
        tgrep_string = tgrep_string.decode()
    # Build the grammar without parse actions: we want tokens, not a
    # compiled predicate.
    tokenizer = _build_tgrep_parser(False)
    return list(tokenizer.parseString(tgrep_string))
543 -  
def tgrep_compile(tgrep_string):
    '''
    Parses (and tokenizes, if necessary) a TGrep search string into a
    lambda function.

    :param tgrep_string: the query, as ``str`` or ``bytes``.
    :return: a predicate callable taking a tree node.
    '''
    # Normalize bytes input to text before parsing.
    if isinstance(tgrep_string, bytes):
        tgrep_string = tgrep_string.decode()
    # Build the grammar with parse actions attached so the parse result
    # is an executable predicate; parseAll ensures the whole query is
    # consumed (trailing garbage raises a parse error).
    compiler = _build_tgrep_parser(True)
    results = compiler.parseString(tgrep_string, parseAll=True)
    return list(results)[0]
553 -  
def treepositions_no_leaves(tree):
    '''
    Returns all the tree positions in the given tree which are not
    leaf nodes.

    :param tree: an object exposing ``treepositions()`` returning
        position tuples (an NLTK-style tree).
    :return: the internal positions, in ``treepositions()`` order.
    '''
    all_positions = tree.treepositions()
    # A position is internal exactly when it is a proper prefix of some
    # other position; collect every proper prefix occurring in the tree.
    internal = {pos[:cut]
                for pos in all_positions
                for cut in range(len(pos))}
    return [pos for pos in all_positions if pos in internal]
567 -  
def tgrep_positions(tree, tgrep_string, search_leaves = True):
    '''
    Return all tree positions in the given tree which match the given
    `tgrep_string`.

    If `search_leaves` is False, the method will not return any
    results in leaf positions.

    :param tree: an NLTK-style tree (anything without ``treepositions``
        yields an empty result).
    :param tgrep_string: a query string/bytes, or an already-compiled
        predicate callable.
    :param search_leaves: whether leaf positions are candidates.
    :return: the matching positions, in tree order.
    '''
    try:
        candidates = (tree.treepositions() if search_leaves
                      else treepositions_no_leaves(tree))
    except AttributeError:
        # Not a tree-like object: nothing can match.
        return []
    predicate = tgrep_string
    # Compile lazily so callers may pass either a query or a predicate.
    if isinstance(predicate, (bytes, str)):
        predicate = tgrep_compile(predicate)
    matches = []
    for candidate in candidates:
        if predicate(tree[candidate]):
            matches.append(candidate)
    return matches
587 -  
def tgrep_nodes(tree, tgrep_string, search_leaves = True):
    '''
    Return all tree nodes in the given tree which match the given
    `tgrep_string`.

    If `search_leaves` is False, the method will not return any
    results in leaf positions.

    :param tree: an NLTK-style tree.
    :param tgrep_string: a query string/bytes, or a compiled predicate.
    :param search_leaves: whether leaf positions are candidates.
    :return: the matching nodes, in tree order.
    '''
    # Delegate matching to tgrep_positions, then index the tree once
    # per matching position.
    matching = tgrep_positions(tree, tgrep_string, search_leaves)
    return [tree[pos] for pos in matching]