Commit 4dbd58071f11754174c4a1d563abf3e944f71969

Authored by Erickson Silva
1 parent f367a715
Exists in master and in 1 other branch devel

Remove tgrep e adiciona como dependência

src/new/AplicaRegras.py
... ... @@ -1,336 +0,0 @@
1   -#!/usr/bin/python
2   -# -*- coding: utf-8 -*-
3   -
4   -#Autor: Erickson Silva
5   -#Email: <erickson.silva@lavid.ufpb.br> <ericksonsilva@live.com>
6   -
7   -#LAViD - Laboratório de Aplicações de Vídeo Digital
8   -
9   -from collections import deque
10   -import xml.etree.ElementTree as ET
11   -from os.path import expanduser
12   -import platform
13   -from LerDicionarios import *
14   -from Iterator import *
15   -from StringAux import *
16   -from ConverteExtenso import *
17   -
18   -class AplicaRegras(object):
19   -
20   - # inicializa todos as variaveis
21   - def __init__(self):
22   -
23   - self.__root = self.getRoot()
24   - self.__dicionarios = LeitorDicionarios()
25   -
26   - def getRoot(self):
27   -
28   - so = platform.system()
29   - if so == 'Windows':
30   - return ET.parse(expanduser("~")+'\\vlibras-translate\data\\regras.xml').getroot()
31   - else:
32   - return ET.parse(expanduser("~")+'/vlibras-translate/data/regras.xml').getroot()
33   -
34   - def aplicarRegrasMorfo(self, lista):
35   -
36   - self.__especificos = {"advt" : self.verificarAdvTempo, "v" : self.verificarVbInfinitivo, "x" : self.verificarPrepos, "c" : self.verificarSubs2Generos, "a" : self.verificarArtigo, "l" : self.verificarVbLigacao, "i": self.verificarAdvIntensidade, "vbi":"zero", "n":"zero", "abmn":"zero", "adji":"zero", "adjn":"zero", "advi":"zero"}
37   - self.pularIteracoes = 0
38   - self.__tAux = []
39   - it = Iterator()
40   - it.load(lista)
41   -
42   - while(it.hasNext()):
43   - if self.pularIteracoes > 0:
44   - self.pularIteracoes-=1
45   - continue
46   -
47   - for morpho in self.__root.findall('morphological'):
48   - self.hasRule = False
49   - for rule in morpho.findall('rule'): # procura a tag rule
50   - if rule.find('active').text == "true" and rule.get('name').split("_")[0] == it.getAtualT():
51   - count = int(rule.find('count').text)
52   - self.listaIter = []
53   - if count == 1:
54   - self.listaIter = [it.getToken()]
55   - else:
56   - try:
57   - self.listaIter = it.getInterval(count)
58   - self.pularIteracoes = count-1
59   - except:
60   - continue
61   -
62   -
63   - self.nomeRegra = self.gerarNomeDaRegra(self.listaIter)
64   - if rule.get('name') == self.nomeRegra: # verifica se a regra é aplicavel e a mesma esta ativa
65   - print "Achou regra: " + self.nomeRegra
66   - #subIter = Iterator()
67   - #subIter.load(self.listaIter)
68   - #while(subIter.hasNext()):
69   - self.hasRule = True
70   - self.listaTmp = count * [None]
71   - self.countListaIter = -1
72   - for classe in rule.iter('class'): # for nas tags class
73   - title = classe.find('title')
74   - newpos = classe.find('newpos')
75   - newprop = classe.find('newprop')
76   - newtoken = classe.find('newtoken')
77   - newtokenpos = classe.find('newtokenpos')
78   - self.specific = classe.find('specific')
79   -
80   - self.countListaIter += 1
81   - token = self.listaIter[self.countListaIter]
82   -
83   - if self.specific is not None:
84   - self.specific = self.__especificos[self.specific.text](token[0])
85   - if newprop is not None and type(self.specific) != bool:
86   - self.__tAux.append([self.specific,newprop.text])
87   -
88   - if newpos is not None:
89   - if newpos.text != "-1":
90   - if type(self.specific) == bool:
91   - self.listaTmp[int(newpos.text)] = token
92   - else:
93   - self.__tAux.append([self.specific, title.text])
94   -
95   - if newtoken is not None:
96   - self.listaTmp[int(newtokenpos.text)] = [newtoken.text, "NEWTOKEN"]
97   -
98   - self.listaTmp = filter(None, self.listaTmp)
99   - for i in self.listaTmp:
100   - self.__tAux.append(i)
101   -
102   - break
103   -
104   - if (self.hasRule == False): self.__tAux.append(it.getToken())
105   - if self.__tAux: return self.__tAux
106   - return lista # retorna a lista sem alteracoes (nao existe regra)
107   -
108   -
109   - def gerarNomeDaRegra(self, lista):
110   - self.__nomeRegra = []
111   - for t in lista:
112   - self.__nomeRegra.append(t[1])
113   - return "_".join(self.__nomeRegra)
114   -
115   - def verificarAdvTempo(self, lista):
116   - for i in lista:
117   - if i[1][:3] == "ADV":
118   - if (self.__dicionarios.hasTempoVerbal(i[0])):
119   - return True
120   - return False
121   -
122   - def verificarVbInfinitivo(self, token):
123   - if self.__dicionarios.hasVerboInfinitivo(token): # verifica se ha um verbo infinitivo desse token
124   - return self.__dicionarios.getVerboInfinitivo(token)
125   - return False
126   -
127   - #TODO
128   - def verificarPrepos(self, token):
129   - return None
130   -
131   - def verificarSubs2Generos(self, token):
132   - return self.__dicionarios.hasSubst2Genero(token)
133   -
134   - #TODO
135   - def verificarArtigo(self, token):
136   - return None
137   -
138   - #TODO
139   - def verificarVbLigacao(self, token):
140   - return None
141   -
142   - #TODO
143   - def verificarAdvIntensidade(self, token):
144   - return None
145   -
146   - # retira artigos e preposicoes; passa verbos para infinitivo e verificar se há sinonimos
147   - def inicializar(self, texto):
148   - it = Iterator()
149   - it.load(texto)
150   - self.__ts = []
151   - self.__verb = False
152   - self.__adv = False
153   - self.__num = False
154   - self.__plural = False
155   - self.__countVerb = 0
156   - self.__countAdv = 0
157   - while(it.hasNext()):
158   - token = it.getAtualW()
159   - tag = it.getAtualT()
160   - self.__b = False
161   -
162   - if self.__dicionarios.hasPalavraIgnorada(tag) == False: # verifica se nao eh artigo/preposicao
163   -
164   - if tag == "NUM":
165   - self.__num = True
166   -
167   - if tag[-2:] == "-P":
168   - self.__plural = True
169   -
170   - #VERIFICA SE É ADVERBIO E CONTA A QUANTIDADE
171   - if tag[:3] == "ADV":
172   - if (self.__dicionarios.hasTempoVerbal(token)):
173   - self.__adv = True
174   -
175   - if tag[:2] == "VB":
176   -
177   - #VERIFICA SE É VERBO NO INFINITIVO
178   - if self.__dicionarios.hasVerboInfinitivo(token): # verifica se ha um verbo infinitivo desse token
179   - verboInfinitivo = self.__dicionarios.getVerboInfinitivo(token) # se sim, adiciona numa string aux
180   - self.__ts.append([verboInfinitivo,tag]) # caso contrario, adiciona so o verbo infinitivo msm
181   - self.__b = True
182   -
183   - #VERIFICA SE É VERBO DE TEMPO E CONTA A QUANTIDADE
184   - if tag == "VB-P" or tag == "VB-D" or tag == "VB-R":
185   - self.__verb = True
186   - self.__countVerb += 1
187   -
188   - #VERIFICA SE É SUBTANTIVO COMUM DOS 2 GENEROS
189   - if self.__dicionarios.hasSubst2Genero(token):
190   - #del self.__ts[-1]
191   - lenTicket = len(it.getAntT())
192   - if ((self.__dicionarios.hasPalavraIgnorada(it.getAntT())) and (it.getAntT()[lenTicket-1:] == "F") or (it.getAntT()[lenTicket-3:] == "F-P")):
193   - self.__ts.append(["MULHER ", "2GEN"])
194   - self.__ts.append([token,tag])
195   - else:
196   - self.__ts.append(["HOMEM ", "2GEN"])
197   - self.__ts.append([token,tag])
198   - self.__b = True
199   -
200   - #SE NÃO HOUVE NENHUM ALTERAÇÃO, OU SEJA, NÃO APLICOU NENHUMA REGRA, ADICIONA O TOKEN ORIGINAL
201   - if self.__b == False: # verifica se nao encontrou nem verbo infinito ou sinonimo
202   - self.__ts.append([token,tag])
203   -
204   - #SE ENCONTROU VERBO, ENTÃO ANALISA a SENTENCA NOVAMENTE (again?)
205   - if self.__verb == True and self.__adv == False:
206   - self.__ts = self.verbalAnalysis(self.__ts)
207   -
208   - #VERIFICA SE É PLURAL
209   - if self.__plural:
210   - self.__ts = self.hasPlural(self.__ts)
211   -
212   - #CONVERTE EXTENSO PARA NUMERO
213   - if self.__num: return self.converteExtenso(self.__ts)
214   -
215   - return self.__ts
216   -
217   -
218   - # converte romano para numero
219   - def auxConvert(self, tag):
220   - try:
221   - return roman_to_int(tag)
222   - except:
223   - return tag
224   -
225   - def verbalAnalysis(self, lista):
226   - lv = []
227   - it = Iterator()
228   - it.load(lista)
229   - hasFut = False
230   - hasPas = False
231   - count = 0
232   - while(it.hasNext()):
233   - token = it.getAtualW().upper()
234   - tag = it.getAtualT()
235   -
236   - if(tag == "VB-P"):
237   - if (self.__countVerb > 1):
238   - count += 1
239   - #print "VB-P: Incrementou"
240   - if(count == self.__countVerb):
241   - #print "VB-P Adicionou " + token
242   - lv.append([token,tag])
243   - else:
244   - #print "VB-P: retornou lista original"
245   - it.reset()
246   - return lista
247   - elif(tag == "VB-D"):
248   - count += 1
249   - hasPas = True
250   - #print "VB-D: Incrementou"
251   - if(count == self.__countVerb):
252   - #print "VB-D Adicionou " + token
253   - lv.append([token,tag])
254   - elif(tag == "VB-R"):
255   - count += 1
256   - hasFut = True
257   - #print "VB-R: Incrementou"
258   - if(count == self.__countVerb):
259   - #print "VB-R Adicionou " + token
260   - lv.append([token,tag])
261   - else:
262   - lv.append([token,tag])
263   - if (hasFut):
264   - lv.append(["FUTURO", "T-VB"])
265   - elif (hasPas):
266   - lv.append(["PASSADO", "T-VB"])
267   - it.reset()
268   - return lv
269   -
270   -
271   - def hasPlural(self, lista):
272   -
273   - tmp = lista
274   - for e in tmp:
275   - if e[1][-2:] == "-P":
276   - e[0] = self.analisarPlural(e[0])
277   -
278   - return tmp
279   -
280   -
281   - def analisarPlural(self, word):
282   -
283   - if(word[-3:] == "OES" or word[-2:] == "AES" or word[-2:] == "AOS"):
284   - return word[0:-3]+"AO"
285   - elif(word[-3:] == "RES" or word[-2:] == "ZES" or word[-2:] == "NES"):
286   - return word[0:-2]
287   - elif(word[-3:] == "SES"):
288   - #TODO: Algumas palavras possuem marcações gráficas na raiz singular. Ex: Gás – Gases
289   - return word[0:-2]
290   - elif(word[-2:] == "NS"):
291   - return word[0:-2]+"M"
292   - elif(word[-3:] == "EIS"):
293   - return word[0:-3]+"IL"
294   - elif(word[-2:] == "IS"):
295   - if(word[-3] == "A" or word[-3] == "E" or word[-3] == "O" or word[-3] == "U"):
296   - return word[0:-2]+"L"
297   - else:
298   - return word
299   - elif(word[-1] == "S"):
300   - #TODO: Palavras paroxítonas ou proparoxítonas terminadas em S. Ex: lápis, vírus, tagênis, ônibus, etc
301   - return word[0:-1]
302   - else:
303   - return word
304   -
305   -
306   - def converteExtenso(self, lista):
307   -
308   - listAux = []
309   - indexDel = []
310   - count = 0
311   - isRunning = False
312   -
313   - for i in range(0, len(lista)):
314   - token = lista[i][0]
315   - tag = lista[i][1]
316   - if (tag == "NUM"):
317   - if (isRunning == False and len(listAux) == count):
318   - listAux.append([i,[token]])
319   - isRunning = True
320   - else:
321   - listAux[count][1].append(token)
322   - indexDel.append(i)
323   - elif (isRunning == True):
324   - if ((lista[i-1][1] == "NUM") and (lista[i+1][1] == "NUM") and (tag == "CONJ")):
325   - indexDel.append(i)
326   - else:
327   - isRunning = False
328   - count += 1
329   -
330   - for i in listAux:
331   - ext = extenso(' '.join(i[1]))
332   - lista[i[0]] = [ext, "NUM"]
333   -
334   - deque((list.pop(lista, i) for i in sorted(indexDel, reverse=True)), maxlen=0)
335   -
336   - return lista
src/new/TraduzSentencas.py
... ... @@ -23,20 +23,20 @@ def iniciar_traducao(texto):
23 23 return ""
24 24  
25 25 def gerar_analise(sentenca):
26   - sinonimos = AplicaSinonimos()
27   - regras = AplicaRegras()
28   - analise = None
  26 + aplic_sinonimos = AplicaSinonimos()
  27 + aplic_regras = AplicaRegras()
29 28  
  29 + analise = None
30 30 try:
31 31 analise = alexp.run(sentenca)
32 32 except ValueError:
33 33 # TODO: Permitir acentos na sentença
34 34 analise = None
35 35  
  36 + morfologica = alexp.getAnaliseMorfologica()
36 37 if (isinstance(analise,type(None))):
37   - morfologica = alexp.getAnaliseMorfologica()
38   - analise = regras.aplicar_regras_morfo(morfologica)
  38 + analise = aplic_regras.aplicar_regras_morfo(morfologica)
39 39 else:
40   - analise = regras.aplicar_regras_sint(arvoreSintatica)
41   -
42   - return sinonimos.aplicar_sinonimos(analise)
43 40 \ No newline at end of file
  41 + analise = aplic_regras.aplicar_regras_sint(morfologica, analise)
  42 + analise = aplic_regras.simplificar_sentenca(analise)
  43 + return aplic_sinonimos.aplicar_sinonimos(analise)
44 44 \ No newline at end of file
... ...
src/new/tgrep.py
... ... @@ -1,597 +0,0 @@
1   -#!/usr/bin/env python
2   -# -*- coding: utf-8 -*-
3   -#
4   -# Permission is hereby granted, free of charge, to any person
5   -# obtaining a copy of this software and associated documentation files
6   -# (the "Software"), to deal in the Software without restriction,
7   -# including without limitation the rights to use, copy, modify, merge,
8   -# publish, distribute, sublicense, and/or sell copies of the Software,
9   -# and to permit persons to whom the Software is furnished to do so,
10   -# subject to the following conditions:
11   -#
12   -# The above copyright notice and this permission notice shall be
13   -# included in all copies or substantial portions of the Software.
14   -#
15   -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16   -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17   -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18   -# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19   -# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20   -# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21   -# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22   -# SOFTWARE.
23   -
24   -'''
25   -TGrep search implementation for NTLK trees.
26   -
27   -(c) 16 March, 2013 Will Roberts <wildwilhelm@gmail.com>.
28   -
29   -This module supports TGrep2 syntax for matching parts of NLTK Trees.
30   -Note that many tgrep operators require the tree passed to be a
31   -ParentedTree.
32   -
33   -Tgrep tutorial:
34   -http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html
35   -Tgrep2 manual:
36   -http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf
37   -Tgrep2 source:
38   -http://tedlab.mit.edu/~dr/Tgrep2/
39   -'''
40   -
41   -from builtins import bytes, range, str
42   -import nltk.tree
43   -import pyparsing
44   -import re
45   -
46   -def ancestors(node):
47   - '''
48   - Returns the list of all nodes dominating the given tree node.
49   - This method will not work with leaf nodes, since there is no way
50   - to recover the parent.
51   - '''
52   - results = []
53   - try:
54   - current = node.parent()
55   - except AttributeError:
56   - # if node is a leaf, we cannot retrieve its parent
57   - return results
58   - while current:
59   - results.append(current)
60   - current = current.parent()
61   - return results
62   -
63   -def unique_ancestors(node):
64   - '''
65   - Returns the list of all nodes dominating the given node, where
66   - there is only a single path of descent.
67   - '''
68   - results = []
69   - try:
70   - current = node.parent()
71   - except AttributeError:
72   - # if node is a leaf, we cannot retrieve its parent
73   - return results
74   - while current and len(current) == 1:
75   - results.append(current)
76   - current = current.parent()
77   - return results
78   -
79   -def _descendants(node):
80   - '''
81   - Returns the list of all nodes which are descended from the given
82   - tree node in some way.
83   - '''
84   - try:
85   - treepos = node.treepositions()
86   - except AttributeError:
87   - return []
88   - return [node[x] for x in treepos[1:]]
89   -
90   -def _leftmost_descendants(node):
91   - '''
92   - Returns the set of all nodes descended in some way through
93   - left branches from this node.
94   - '''
95   - try:
96   - treepos = node.treepositions()
97   - except AttributeError:
98   - return []
99   - return [node[x] for x in treepos[1:] if all(y == 0 for y in x)]
100   -
101   -def _rightmost_descendants(node):
102   - '''
103   - Returns the set of all nodes descended in some way through
104   - right branches from this node.
105   - '''
106   - try:
107   - rightmost_leaf = max(node.treepositions())
108   - except AttributeError:
109   - return []
110   - return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)]
111   -
112   -def _istree(obj):
113   - '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
114   - return isinstance(obj, nltk.tree.Tree)
115   -
116   -def _unique_descendants(node):
117   - '''
118   - Returns the list of all nodes descended from the given node, where
119   - there is only a single path of descent.
120   - '''
121   - results = []
122   - current = node
123   - while current and _istree(current) and len(current) == 1:
124   - current = current[0]
125   - results.append(current)
126   - return results
127   -
128   -def _before(node):
129   - '''
130   - Returns the set of all nodes that are before the given node.
131   - '''
132   - try:
133   - pos = node.treeposition()
134   - tree = node.root()
135   - except AttributeError:
136   - return []
137   - return [tree[x] for x in tree.treepositions()
138   - if x[:len(pos)] < pos[:len(x)]]
139   -
140   -def _immediately_before(node):
141   - '''
142   - Returns the set of all nodes that are immediately before the given
143   - node.
144   -
145   - Tree node A immediately precedes node B if the last terminal
146   - symbol (word) produced by A immediately precedes the first
147   - terminal symbol produced by B.
148   - '''
149   - try:
150   - pos = node.treeposition()
151   - tree = node.root()
152   - except AttributeError:
153   - return []
154   - # go "upwards" from pos until there is a place we can go to the left
155   - idx = len(pos) - 1
156   - while 0 <= idx and pos[idx] == 0:
157   - idx -= 1
158   - if idx < 0:
159   - return []
160   - pos = list(pos[:idx + 1])
161   - pos[-1] -= 1
162   - before = tree[pos]
163   - return [before] + _rightmost_descendants(before)
164   -
165   -def _after(node):
166   - '''
167   - Returns the set of all nodes that are after the given node.
168   - '''
169   - try:
170   - pos = node.treeposition()
171   - tree = node.root()
172   - except AttributeError:
173   - return []
174   - return [tree[x] for x in tree.treepositions()
175   - if x[:len(pos)] > pos[:len(x)]]
176   -
177   -def _immediately_after(node):
178   - '''
179   - Returns the set of all nodes that are immediately after the given
180   - node.
181   -
182   - Tree node A immediately follows node B if the first terminal
183   - symbol (word) produced by A immediately follows the last
184   - terminal symbol produced by B.
185   - '''
186   - try:
187   - pos = node.treeposition()
188   - tree = node.root()
189   - current = node.parent()
190   - except AttributeError:
191   - return []
192   - # go "upwards" from pos until there is a place we can go to the
193   - # right
194   - idx = len(pos) - 1
195   - while 0 <= idx and pos[idx] == len(current) - 1:
196   - idx -= 1
197   - current = current.parent()
198   - if idx < 0:
199   - return []
200   - pos = list(pos[:idx + 1])
201   - pos[-1] += 1
202   - after = tree[pos]
203   - return [after] + _leftmost_descendants(after)
204   -
205   -def _tgrep_node_literal_value(node):
206   - '''
207   - Gets the string value of a given parse tree node, for comparison
208   - using the tgrep node literal predicates.
209   - '''
210   - return (node.label() if _istree(node) else str(node))
211   -
212   -def _tgrep_node_action(_s, _l, tokens):
213   - '''
214   - Builds a lambda function representing a predicate on a tree node
215   - depending on the name of its node.
216   - '''
217   - # print 'node tokens: ', tokens
218   - if tokens[0] == u"'":
219   - # strip initial apostrophe (tgrep2 print command)
220   - tokens = tokens[1:]
221   - if len(tokens) > 1:
222   - # disjunctive definition of a node name
223   - assert list(set(tokens[1::2])) == [u'|']
224   - # recursively call self to interpret each node name definition
225   - tokens = [_tgrep_node_action(None, None, [node])
226   - for node in tokens[::2]]
227   - # capture tokens and return the disjunction
228   - return (lambda t: lambda n: any(f(n) for f in t))(tokens)
229   - else:
230   - if hasattr(tokens[0], u'__call__'):
231   - # this is a previously interpreted parenthetical node
232   - # definition (lambda function)
233   - return tokens[0]
234   - elif tokens[0] == u'*' or tokens[0] == u'__':
235   - return lambda n: True
236   - elif tokens[0].startswith(u'"'):
237   - return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip(u'"'))
238   - elif tokens[0].startswith(u'/'):
239   - return (lambda r: lambda n:
240   - r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip(u'/')))
241   - elif tokens[0].startswith(u'i@'):
242   - return (lambda s: lambda n:
243   - _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower())
244   - else:
245   - return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0])
246   -
247   -def _tgrep_parens_action(_s, _l, tokens):
248   - '''
249   - Builds a lambda function representing a predicate on a tree node
250   - from a parenthetical notation.
251   - '''
252   - # print 'parenthetical tokens: ', tokens
253   - assert len(tokens) == 3
254   - assert tokens[0] == u'('
255   - assert tokens[2] == u')'
256   - return tokens[1]
257   -
258   -def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
259   - '''
260   - Builds a lambda function representing a predicate on a tree node
261   - which returns true if the node is located at a specific tree
262   - position.
263   - '''
264   - # recover the tuple from the parsed sting
265   - node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
266   - # capture the node's tree position
267   - return (lambda i: lambda n: (hasattr(n, u'treeposition') and
268   - n.treeposition() == i))(node_tree_position)
269   -
270   -def _tgrep_relation_action(_s, _l, tokens):
271   - '''
272   - Builds a lambda function representing a predicate on a tree node
273   - depending on its relation to other nodes in the tree.
274   - '''
275   - # print 'relation tokens: ', tokens
276   - # process negation first if needed
277   - negated = False
278   - if tokens[0] == u'!':
279   - negated = True
280   - tokens = tokens[1:]
281   - if tokens[0] == u'[':
282   - # process square-bracketed relation expressions
283   - assert len(tokens) == 3
284   - assert tokens[2] == u']'
285   - retval = tokens[1]
286   - else:
287   - # process operator-node relation expressions
288   - assert len(tokens) == 2
289   - operator, predicate = tokens
290   - # A < B A is the parent of (immediately dominates) B.
291   - if operator == u'<':
292   - retval = lambda n: (_istree(n) and
293   - any(predicate(x) for x in n))
294   - # A > B A is the child of B.
295   - elif operator == u'>':
296   - retval = lambda n: (hasattr(n, u'parent') and
297   - bool(n.parent()) and
298   - predicate(n.parent()))
299   - # A <, B Synonymous with A <1 B.
300   - elif operator == u'<,' or operator == u'<1':
301   - retval = lambda n: (_istree(n) and
302   - bool(list(n)) and
303   - predicate(n[0]))
304   - # A >, B Synonymous with A >1 B.
305   - elif operator == u'>,' or operator == u'>1':
306   - retval = lambda n: (hasattr(n, u'parent') and
307   - bool(n.parent()) and
308   - (n is n.parent()[0]) and
309   - predicate(n.parent()))
310   - # A <N B B is the Nth child of A (the first child is <1).
311   - elif operator[0] == u'<' and operator[1:].isdigit():
312   - idx = int(operator[1:])
313   - # capture the index parameter
314   - retval = (lambda i: lambda n: (_istree(n) and
315   - bool(list(n)) and
316   - 0 <= i < len(n) and
317   - predicate(n[i])))(idx - 1)
318   - # A >N B A is the Nth child of B (the first child is >1).
319   - elif operator[0] == u'>' and operator[1:].isdigit():
320   - idx = int(operator[1:])
321   - # capture the index parameter
322   - retval = (lambda i: lambda n: (hasattr(n, u'parent') and
323   - bool(n.parent()) and
324   - 0 <= i < len(n.parent()) and
325   - (n is n.parent()[i]) and
326   - predicate(n.parent())))(idx - 1)
327   - # A <' B B is the last child of A (also synonymous with A <-1 B).
328   - # A <- B B is the last child of A (synonymous with A <-1 B).
329   - elif operator == u'<\'' or operator == u'<-' or operator == u'<-1':
330   - retval = lambda n: (_istree(n) and bool(list(n))
331   - and predicate(n[-1]))
332   - # A >' B A is the last child of B (also synonymous with A >-1 B).
333   - # A >- B A is the last child of B (synonymous with A >-1 B).
334   - elif operator == u'>\'' or operator == u'>-' or operator == u'>-1':
335   - retval = lambda n: (hasattr(n, u'parent') and
336   - bool(n.parent()) and
337   - (n is n.parent()[-1]) and
338   - predicate(n.parent()))
339   - # A <-N B B is the N th-to-last child of A (the last child is <-1).
340   - elif operator[:2] == u'<-' and operator[2:].isdigit():
341   - idx = -int(operator[2:])
342   - # capture the index parameter
343   - retval = (lambda i: lambda n: (_istree(n) and
344   - bool(list(n)) and
345   - 0 <= (i + len(n)) < len(n) and
346   - predicate(n[i + len(n)])))(idx)
347   - # A >-N B A is the N th-to-last child of B (the last child is >-1).
348   - elif operator[:2] == u'>-' and operator[2:].isdigit():
349   - idx = -int(operator[2:])
350   - # capture the index parameter
351   - retval = (lambda i: lambda n:
352   - (hasattr(n, u'parent') and
353   - bool(n.parent()) and
354   - 0 <= (i + len(n.parent())) < len(n.parent()) and
355   - (n is n.parent()[i + len(n.parent())]) and
356   - predicate(n.parent())))(idx)
357   - # A <: B B is the only child of A
358   - elif operator == u'<:':
359   - retval = lambda n: (_istree(n) and
360   - len(n) == 1 and
361   - predicate(n[0]))
362   - # A >: B A is the only child of B.
363   - elif operator == u'>:':
364   - retval = lambda n: (hasattr(n, u'parent') and
365   - bool(n.parent()) and
366   - len(n.parent()) == 1 and
367   - predicate(n.parent()))
368   - # A << B A dominates B (A is an ancestor of B).
369   - elif operator == u'<<':
370   - retval = lambda n: (_istree(n) and
371   - any(predicate(x) for x in _descendants(n)))
372   - # A >> B A is dominated by B (A is a descendant of B).
373   - elif operator == u'>>':
374   - retval = lambda n: any(predicate(x) for x in ancestors(n))
375   - # A <<, B B is a left-most descendant of A.
376   - elif operator == u'<<,' or operator == u'<<1':
377   - retval = lambda n: (_istree(n) and
378   - any(predicate(x)
379   - for x in _leftmost_descendants(n)))
380   - # A >>, B A is a left-most descendant of B.
381   - elif operator == u'>>,':
382   - retval = lambda n: any((predicate(x) and
383   - n in _leftmost_descendants(x))
384   - for x in ancestors(n))
385   - # A <<' B B is a right-most descendant of A.
386   - elif operator == u'<<\'':
387   - retval = lambda n: (_istree(n) and
388   - any(predicate(x)
389   - for x in _rightmost_descendants(n)))
390   - # A >>' B A is a right-most descendant of B.
391   - elif operator == u'>>\'':
392   - retval = lambda n: any((predicate(x) and
393   - n in _rightmost_descendants(x))
394   - for x in ancestors(n))
395   - # A <<: B There is a single path of descent from A and B is on it.
396   - elif operator == u'<<:':
397   - retval = lambda n: (_istree(n) and
398   - any(predicate(x)
399   - for x in _unique_descendants(n)))
400   - # A >>: B There is a single path of descent from B and A is on it.
401   - elif operator == u'>>:':
402   - retval = lambda n: any(predicate(x) for x in unique_ancestors(n))
403   - # A . B A immediately precedes B.
404   - elif operator == u'.':
405   - retval = lambda n: any(predicate(x)
406   - for x in _immediately_after(n))
407   - # A , B A immediately follows B.
408   - elif operator == u',':
409   - retval = lambda n: any(predicate(x)
410   - for x in _immediately_before(n))
411   - # A .. B A precedes B.
412   - elif operator == u'..':
413   - retval = lambda n: any(predicate(x) for x in _after(n))
414   - # A ,, B A follows B.
415   - elif operator == u',,':
416   - retval = lambda n: any(predicate(x) for x in _before(n))
417   - # A $ B A is a sister of B (and A != B).
418   - elif operator == u'$' or operator == u'%':
419   - retval = lambda n: (hasattr(n, u'parent') and
420   - bool(n.parent()) and
421   - any(predicate(x)
422   - for x in n.parent() if x is not n))
423   - # A $. B A is a sister of and immediately precedes B.
424   - elif operator == u'$.' or operator == u'%.':
425   - retval = lambda n: (hasattr(n, u'right_sibling') and
426   - bool(n.right_sibling()) and
427   - predicate(n.right_sibling()))
428   - # A $, B A is a sister of and immediately follows B.
429   - elif operator == u'$,' or operator == u'%,':
430   - retval = lambda n: (hasattr(n, u'left_sibling') and
431   - bool(n.left_sibling()) and
432   - predicate(n.left_sibling()))
433   - # A $.. B A is a sister of and precedes B.
434   - elif operator == u'$..' or operator == u'%..':
435   - retval = lambda n: (hasattr(n, u'parent') and
436   - hasattr(n, u'parent_index') and
437   - bool(n.parent()) and
438   - any(predicate(x) for x in
439   - n.parent()[n.parent_index() + 1:]))
440   - # A $,, B A is a sister of and follows B.
441   - elif operator == u'$,,' or operator == u'%,,':
442   - retval = lambda n: (hasattr(n, u'parent') and
443   - hasattr(n, u'parent_index') and
444   - bool(n.parent()) and
445   - any(predicate(x) for x in
446   - n.parent()[:n.parent_index()]))
447   - else:
448   - assert False, u'cannot interpret tgrep operator "{0}"'.format(
449   - operator)
450   - # now return the built function
451   - if negated:
452   - return (lambda r: (lambda n: not r(n)))(retval)
453   - else:
454   - return retval
455   -
456   -def _tgrep_rel_conjunction_action(_s, _l, tokens):
457   - '''
458   - Builds a lambda function representing a predicate on a tree node
459   - from the conjunction of several other such lambda functions.
460   - '''
461   - # filter out the ampersand
462   - tokens = [x for x in tokens if x != u'&']
463   - # print 'relation conjunction tokens: ', tokens
464   - if len(tokens) == 1:
465   - return tokens[0]
466   - elif len(tokens) == 2:
467   - return (lambda a, b: lambda n: a(n) and b(n))(tokens[0], tokens[1])
468   -
469   -def _tgrep_rel_disjunction_action(_s, _l, tokens):
470   - '''
471   - Builds a lambda function representing a predicate on a tree node
472   - from the disjunction of several other such lambda functions.
473   - '''
474   - # filter out the pipe
475   - tokens = [x for x in tokens if x != u'|']
476   - # print 'relation disjunction tokens: ', tokens
477   - if len(tokens) == 1:
478   - return tokens[0]
479   - elif len(tokens) == 2:
480   - return (lambda a, b: lambda n: a(n) or b(n))(tokens[0], tokens[1])
481   -
482   -def _build_tgrep_parser(set_parse_actions = True):
483   - '''
484   - Builds a pyparsing-based parser object for tokenizing and
485   - interpreting tgrep search strings.
486   - '''
487   - tgrep_op = (pyparsing.Optional(u'!') +
488   - pyparsing.Regex(u'[$%,.<>][%,.<>0-9-\':]*'))
489   - tgrep_qstring = pyparsing.QuotedString(quoteChar=u'"', escChar=u'\\',
490   - unquoteResults=False)
491   - tgrep_node_regex = pyparsing.QuotedString(quoteChar=u'/', escChar=u'\\',
492   - unquoteResults=False)
493   - tgrep_node_literal = pyparsing.Regex(u'[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
494   - tgrep_expr = pyparsing.Forward()
495   - tgrep_relations = pyparsing.Forward()
496   - tgrep_parens = pyparsing.Literal(u'(') + tgrep_expr + u')'
497   - tgrep_nltk_tree_pos = (
498   - pyparsing.Literal(u'N(') +
499   - pyparsing.Optional(pyparsing.Word(pyparsing.nums) + u',' +
500   - pyparsing.Optional(pyparsing.delimitedList(
501   - pyparsing.Word(pyparsing.nums), delim=u',') +
502   - pyparsing.Optional(u','))) + u')')
503   - tgrep_node_expr = (tgrep_qstring |
504   - tgrep_node_regex |
505   - u'*' |
506   - tgrep_node_literal)
507   - tgrep_node = (tgrep_parens |
508   - tgrep_nltk_tree_pos |
509   - (pyparsing.Optional(u"'") +
510   - tgrep_node_expr +
511   - pyparsing.ZeroOrMore(u"|" + tgrep_node_expr)))
512   - tgrep_relation = pyparsing.Forward()
513   - tgrep_brackets = pyparsing.Optional(u'!') + u'[' + tgrep_relations + u']'
514   - tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node
515   - tgrep_rel_conjunction = pyparsing.Forward()
516   - tgrep_rel_conjunction << (tgrep_relation +
517   - pyparsing.ZeroOrMore(pyparsing.Optional(u'&') +
518   - tgrep_rel_conjunction))
519   - tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
520   - u"|" + tgrep_relations)
521   - tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
522   - if set_parse_actions:
523   - tgrep_node.setParseAction(_tgrep_node_action)
524   - tgrep_parens.setParseAction(_tgrep_parens_action)
525   - tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
526   - tgrep_relation.setParseAction(_tgrep_relation_action)
527   - tgrep_rel_conjunction.setParseAction(_tgrep_rel_conjunction_action)
528   - tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
529   - # the whole expression is also the conjunction of two
530   - # predicates: the first node predicate, and the remaining
531   - # relation predicates
532   - tgrep_expr.setParseAction(_tgrep_rel_conjunction_action)
533   - return tgrep_expr
534   -
535   -def tgrep_tokenize(tgrep_string):
536   - '''
537   - Tokenizes a TGrep search string into separate tokens.
538   - '''
539   - parser = _build_tgrep_parser(False)
540   - if isinstance(tgrep_string, bytes):
541   - tgrep_string = tgrep_string.decode()
542   - return list(parser.parseString(tgrep_string))
543   -
544   -def tgrep_compile(tgrep_string):
545   - '''
546   - Parses (and tokenizes, if necessary) a TGrep search string into a
547   - lambda function.
548   - '''
549   - parser = _build_tgrep_parser(True)
550   - if isinstance(tgrep_string, bytes):
551   - tgrep_string = tgrep_string.decode()
552   - return list(parser.parseString(tgrep_string, parseAll=True))[0]
553   -
554   -def treepositions_no_leaves(tree):
555   - '''
556   - Returns all the tree positions in the given tree which are not
557   - leaf nodes.
558   - '''
559   - treepositions = tree.treepositions()
560   - # leaves are treeposition tuples that are not prefixes of any
561   - # other treeposition
562   - prefixes = set()
563   - for pos in treepositions:
564   - for length in range(len(pos)):
565   - prefixes.add(pos[:length])
566   - return [pos for pos in treepositions if pos in prefixes]
567   -
568   -def tgrep_positions(tree, tgrep_string, search_leaves = True):
569   - '''
570   - Return all tree positions in the given tree which match the given
571   - `tgrep_string`.
572   -
573   - If `search_leaves` is False, the method will not return any
574   - results in leaf positions.
575   - '''
576   - try:
577   - if search_leaves:
578   - search_positions = tree.treepositions()
579   - else:
580   - search_positions = treepositions_no_leaves(tree)
581   - except AttributeError:
582   - return []
583   - if isinstance(tgrep_string, (bytes, str)):
584   - tgrep_string = tgrep_compile(tgrep_string)
585   - return [position for position in search_positions
586   - if tgrep_string(tree[position])]
587   -
588   -def tgrep_nodes(tree, tgrep_string, search_leaves = True):
589   - '''
590   - Return all tree nodes in the given tree which match the given
591   - `tgrep_ string`.
592   -
593   - If `search_leaves` is False, the method will not return any
594   - results in leaf positions.
595   - '''
596   - return [tree[position] for position in tgrep_positions(tree, tgrep_string,
597   - search_leaves)]