Commit 9ba27f8347a6d6a430526782cb96d8e69e0c58c3
1 parent
f806b639
Exists in
master
and in
1 other branch
Adiciona classe (tgrep) para procurar nós na arvore NLTK
Showing
1 changed file
with
575 additions
and
0 deletions
Show diff stats
... | ... | @@ -0,0 +1,575 @@ |
1 | +#!/usr/bin/env python | |
2 | +# -*- coding: utf-8 -*- | |
3 | +# | |
4 | +# Permission is hereby granted, free of charge, to any person | |
5 | +# obtaining a copy of this software and associated documentation files | |
6 | +# (the "Software"), to deal in the Software without restriction, | |
7 | +# including without limitation the rights to use, copy, modify, merge, | |
8 | +# publish, distribute, sublicense, and/or sell copies of the Software, | |
9 | +# and to permit persons to whom the Software is furnished to do so, | |
10 | +# subject to the following conditions: | |
11 | +# | |
12 | +# The above copyright notice and this permission notice shall be | |
13 | +# included in all copies or substantial portions of the Software. | |
14 | +# | |
15 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
16 | +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |
17 | +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
18 | +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | |
19 | +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | |
20 | +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | |
21 | +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
22 | +# SOFTWARE. | |
23 | + | |
24 | +''' | |
25 | +TGrep search implementation for NLTK trees. | |
26 | + | |
27 | +(c) 16 March, 2013 Will Roberts <wildwilhelm@gmail.com>. | |
28 | + | |
29 | +This module supports TGrep2 syntax for matching parts of NLTK Trees. | |
30 | +Note that many tgrep operators require the tree passed to be a | |
31 | +ParentedTree. | |
32 | + | |
33 | +Tgrep tutorial: | |
34 | +http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html | |
35 | +Tgrep2 manual: | |
36 | +http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf | |
37 | +Tgrep2 source: | |
38 | +http://tedlab.mit.edu/~dr/Tgrep2/ | |
39 | +''' | |
40 | + | |
41 | +import nltk.tree | |
42 | +import pyparsing | |
43 | +import re | |
44 | + | |
def ancestors(node):
    '''
    Collects every node that dominates the given tree node, starting
    with its direct parent and walking upwards.

    Leaf nodes carry no `parent` method, so their ancestors cannot be
    recovered and an empty list is returned for them.
    '''
    chain = []
    # only nodes exposing `parent` can be walked upwards
    if hasattr(node, 'parent'):
        above = node.parent()
        while above:
            chain.append(above)
            above = above.parent()
    return chain
60 | + | |
def unique_ancestors(node):
    '''
    Collects the nodes dominating the given node for as long as each
    of them has exactly one child, i.e. while there is only a single
    path of descent.  The walk stops at the first ancestor whose
    length is not 1.

    Returns an empty list for nodes without a `parent` method.
    '''
    lineage = []
    # leaves have no parent pointer, so nothing can be recovered
    if hasattr(node, 'parent'):
        holder = node.parent()
        while holder and len(holder) == 1:
            lineage.append(holder)
            holder = holder.parent()
    return lineage
75 | + | |
76 | +def _descendants(node): | |
77 | + ''' | |
78 | + Returns the list of all nodes which are descended from the given | |
79 | + tree node in some way. | |
80 | + ''' | |
81 | + if not hasattr(node, 'treepositions'): | |
82 | + return [] | |
83 | + return [node[x] for x in node.treepositions()[1:]] | |
84 | + | |
85 | +def _leftmost_descendants(node): | |
86 | + ''' | |
87 | + Returns the set of all nodes descended in some way through | |
88 | + left branches from this node. | |
89 | + ''' | |
90 | + if not hasattr(node, 'treepositions'): | |
91 | + return [] | |
92 | + return [node[x] for x in node.treepositions()[1:] if all(y == 0 for y in x)] | |
93 | + | |
94 | +def _rightmost_descendants(node): | |
95 | + ''' | |
96 | + Returns the set of all nodes descended in some way through | |
97 | + right branches from this node. | |
98 | + ''' | |
99 | + if not hasattr(node, 'treepositions'): | |
100 | + return [] | |
101 | + rightmost_leaf = max(node.treepositions()) | |
102 | + return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] | |
103 | + | |
104 | +def _unique_descendants(node): | |
105 | + ''' | |
106 | + Returns the list of all nodes descended from the given node, where | |
107 | + there is only a single path of descent. | |
108 | + ''' | |
109 | + results = [] | |
110 | + current = node | |
111 | + while current and isinstance(current, nltk.tree.Tree) and len(current) == 1: | |
112 | + current = current[0] | |
113 | + results.append(current) | |
114 | + return results | |
115 | + | |
116 | +def _before(node): | |
117 | + ''' | |
118 | + Returns the set of all nodes that are before the given node. | |
119 | + ''' | |
120 | + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | |
121 | + return [] | |
122 | + pos = node.treeposition() | |
123 | + tree = node.root() | |
124 | + return [tree[x] for x in tree.treepositions() | |
125 | + if x[:len(pos)] < pos[:len(x)]] | |
126 | + | |
127 | +def _immediately_before(node): | |
128 | + ''' | |
129 | + Returns the set of all nodes that are immediately before the given | |
130 | + node. | |
131 | + | |
132 | + Tree node A immediately precedes node B if the last terminal | |
133 | + symbol (word) produced by A immediately precedes the first | |
134 | + terminal symbol produced by B. | |
135 | + ''' | |
136 | + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | |
137 | + return [] | |
138 | + pos = node.treeposition() | |
139 | + # go "upwards" from pos until there is a place we can go to the left | |
140 | + idx = len(pos) - 1 | |
141 | + while 0 <= idx and pos[idx] == 0: | |
142 | + idx -= 1 | |
143 | + if idx < 0: | |
144 | + return [] | |
145 | + pos = list(pos[:idx + 1]) | |
146 | + pos[-1] -= 1 | |
147 | + before = node.root()[pos] | |
148 | + return [before] + _rightmost_descendants(before) | |
149 | + | |
150 | +def _after(node): | |
151 | + ''' | |
152 | + Returns the set of all nodes that are after the given node. | |
153 | + ''' | |
154 | + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | |
155 | + return [] | |
156 | + pos = node.treeposition() | |
157 | + tree = node.root() | |
158 | + return [tree[x] for x in tree.treepositions() | |
159 | + if x[:len(pos)] > pos[:len(x)]] | |
160 | + | |
161 | +def _immediately_after(node): | |
162 | + ''' | |
163 | + Returns the set of all nodes that are immediately after the given | |
164 | + node. | |
165 | + | |
166 | + Tree node A immediately follows node B if the first terminal | |
167 | + symbol (word) produced by A immediately follows the last | |
168 | + terminal symbol produced by B. | |
169 | + ''' | |
170 | + if (not hasattr(node, 'root') or not hasattr(node, 'treeposition') or | |
171 | + not hasattr(node, 'parent')): | |
172 | + return [] | |
173 | + pos = node.treeposition() | |
174 | + # go "upwards" from pos until there is a place we can go to the | |
175 | + # right | |
176 | + idx = len(pos) - 1 | |
177 | + current = node.parent() | |
178 | + while 0 <= idx and pos[idx] == len(current) - 1: | |
179 | + idx -= 1 | |
180 | + current = current.parent() | |
181 | + if idx < 0: | |
182 | + return [] | |
183 | + pos = list(pos[:idx + 1]) | |
184 | + pos[-1] += 1 | |
185 | + after = node.root()[pos] | |
186 | + return [after] + _leftmost_descendants(after) | |
187 | + | |
def _tgrep_node_literal_value(node):
    '''
    Gets the string value of a given parse tree node, for comparison
    using the tgrep node literal predicates.

    For an `nltk.tree.Tree` this is the node's label; anything else
    (a leaf) is converted to text.
    '''
    if isinstance(node, nltk.tree.Tree):
        return node.label()
    # `unicode` only exists on Python 2; on Python 3 the builtin `str`
    # is the text type, and referencing `unicode` raises NameError
    try:
        text_type = unicode
    except NameError:
        text_type = str
    return text_type(node)
194 | + | |
195 | +def _tgrep_node_action(_s, _l, tokens): | |
196 | + ''' | |
197 | + Builds a lambda function representing a predicate on a tree node | |
198 | + depending on the name of its node. | |
199 | + ''' | |
200 | + # print 'node tokens: ', tokens | |
201 | + if tokens[0] == "'": | |
202 | + # strip initial apostrophe (tgrep2 print command) | |
203 | + tokens = tokens[1:] | |
204 | + if len(tokens) > 1: | |
205 | + # disjunctive definition of a node name | |
206 | + assert list(set(tokens[1::2])) == ['|'] | |
207 | + # recursively call self to interpret each node name definition | |
208 | + tokens = [_tgrep_node_action(None, None, [node]) | |
209 | + for node in tokens[::2]] | |
210 | + # capture tokens and return the disjunction | |
211 | + return (lambda t: lambda n: any(f(n) for f in t))(tokens) | |
212 | + else: | |
213 | + if hasattr(tokens[0], '__call__'): | |
214 | + # this is a previously interpreted parenthetical node | |
215 | + # definition (lambda function) | |
216 | + return tokens[0] | |
217 | + elif tokens[0] == '*' or tokens[0] == '__': | |
218 | + return lambda n: True | |
219 | + elif tokens[0].startswith('"'): | |
220 | + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip('"')) | |
221 | + elif tokens[0].startswith('/'): | |
222 | + return (lambda r: lambda n: | |
223 | + r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip('/'))) | |
224 | + elif tokens[0].startswith('i@'): | |
225 | + return (lambda s: lambda n: | |
226 | + _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower()) | |
227 | + else: | |
228 | + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0]) | |
229 | + | |
230 | +def _tgrep_parens_action(_s, _l, tokens): | |
231 | + ''' | |
232 | + Builds a lambda function representing a predicate on a tree node | |
233 | + from a parenthetical notation. | |
234 | + ''' | |
235 | + # print 'parenthetical tokens: ', tokens | |
236 | + assert len(tokens) == 3 | |
237 | + assert tokens[0] == '(' | |
238 | + assert tokens[2] == ')' | |
239 | + return tokens[1] | |
240 | + | |
241 | +def _tgrep_nltk_tree_pos_action(_s, _l, tokens): | |
242 | + ''' | |
243 | + Builds a lambda function representing a predicate on a tree node | |
244 | + which returns true if the node is located at a specific tree | |
245 | + position. | |
246 | + ''' | |
247 | + # recover the tuple from the parsed sting | |
248 | + node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) | |
249 | + # capture the node's tree position | |
250 | + return (lambda i: lambda n: (hasattr(n, 'treeposition') and | |
251 | + n.treeposition() == i))(node_tree_position) | |
252 | + | |
253 | +def _tgrep_relation_action(_s, _l, tokens): | |
254 | + ''' | |
255 | + Builds a lambda function representing a predicate on a tree node | |
256 | + depending on its relation to other nodes in the tree. | |
257 | + ''' | |
258 | + # print 'relation tokens: ', tokens | |
259 | + # process negation first if needed | |
260 | + negated = False | |
261 | + if tokens[0] == '!': | |
262 | + negated = True | |
263 | + tokens = tokens[1:] | |
264 | + if tokens[0] == '[': | |
265 | + # process square-bracketed relation expressions | |
266 | + assert len(tokens) == 3 | |
267 | + assert tokens[2] == ']' | |
268 | + retval = tokens[1] | |
269 | + else: | |
270 | + # process operator-node relation expressions | |
271 | + assert len(tokens) == 2 | |
272 | + operator, predicate = tokens | |
273 | + # A < B A is the parent of (immediately dominates) B. | |
274 | + if operator == '<': | |
275 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
276 | + any(predicate(x) for x in n)) | |
277 | + # A > B A is the child of B. | |
278 | + elif operator == '>': | |
279 | + retval = lambda n: (hasattr(n, 'parent') and | |
280 | + bool(n.parent()) and | |
281 | + predicate(n.parent())) | |
282 | + # A <, B Synonymous with A <1 B. | |
283 | + elif operator == '<,' or operator == '<1': | |
284 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
285 | + bool(list(n)) and | |
286 | + predicate(n[0])) | |
287 | + # A >, B Synonymous with A >1 B. | |
288 | + elif operator == '>,' or operator == '>1': | |
289 | + retval = lambda n: (hasattr(n, 'parent') and | |
290 | + bool(n.parent()) and | |
291 | + (n is n.parent()[0]) and | |
292 | + predicate(n.parent())) | |
293 | + # A <N B B is the Nth child of A (the first child is <1). | |
294 | + elif operator[0] == '<' and operator[1:].isdigit(): | |
295 | + idx = int(operator[1:]) | |
296 | + # capture the index parameter | |
297 | + retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and | |
298 | + bool(list(n)) and | |
299 | + 0 <= i < len(n) and | |
300 | + predicate(n[i])))(idx - 1) | |
301 | + # A >N B A is the Nth child of B (the first child is >1). | |
302 | + elif operator[0] == '>' and operator[1:].isdigit(): | |
303 | + idx = int(operator[1:]) | |
304 | + # capture the index parameter | |
305 | + retval = (lambda i: lambda n: (hasattr(n, 'parent') and | |
306 | + bool(n.parent()) and | |
307 | + 0 <= i < len(n.parent()) and | |
308 | + (n is n.parent()[i]) and | |
309 | + predicate(n.parent())))(idx - 1) | |
310 | + # A <' B B is the last child of A (also synonymous with A <-1 B). | |
311 | + # A <- B B is the last child of A (synonymous with A <-1 B). | |
312 | + elif operator == '<\'' or operator == '<-' or operator == '<-1': | |
313 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and bool(list(n)) | |
314 | + and predicate(n[-1])) | |
315 | + # A >' B A is the last child of B (also synonymous with A >-1 B). | |
316 | + # A >- B A is the last child of B (synonymous with A >-1 B). | |
317 | + elif operator == '>\'' or operator == '>-' or operator == '>-1': | |
318 | + retval = lambda n: (hasattr(n, 'parent') and | |
319 | + bool(n.parent()) and | |
320 | + (n is n.parent()[-1]) and | |
321 | + predicate(n.parent())) | |
322 | + # A <-N B B is the N th-to-last child of A (the last child is <-1). | |
323 | + elif operator[:2] == '<-' and operator[2:].isdigit(): | |
324 | + idx = -int(operator[2:]) | |
325 | + # capture the index parameter | |
326 | + retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and | |
327 | + bool(list(n)) and | |
328 | + 0 <= (i + len(n)) < len(n) and | |
329 | + predicate(n[i + len(n)])))(idx) | |
330 | + # A >-N B A is the N th-to-last child of B (the last child is >-1). | |
331 | + elif operator[:2] == '>-' and operator[2:].isdigit(): | |
332 | + idx = -int(operator[2:]) | |
333 | + # capture the index parameter | |
334 | + retval = (lambda i: lambda n: | |
335 | + (hasattr(n, 'parent') and | |
336 | + bool(n.parent()) and | |
337 | + 0 <= (i + len(n.parent())) < len(n.parent()) and | |
338 | + (n is n.parent()[i + len(n.parent())]) and | |
339 | + predicate(n.parent())))(idx) | |
340 | + # A <: B B is the only child of A | |
341 | + elif operator == '<:': | |
342 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
343 | + len(n) == 1 and | |
344 | + predicate(n[0])) | |
345 | + # A >: B A is the only child of B. | |
346 | + elif operator == '>:': | |
347 | + retval = lambda n: (hasattr(n, 'parent') and | |
348 | + bool(n.parent()) and | |
349 | + len(n.parent()) == 1 and | |
350 | + predicate(n.parent())) | |
351 | + # A << B A dominates B (A is an ancestor of B). | |
352 | + elif operator == '<<': | |
353 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
354 | + any(predicate(x) for x in _descendants(n))) | |
355 | + # A >> B A is dominated by B (A is a descendant of B). | |
356 | + elif operator == '>>': | |
357 | + retval = lambda n: any(predicate(x) for x in ancestors(n)) | |
358 | + # A <<, B B is a left-most descendant of A. | |
359 | + elif operator == '<<,' or operator == '<<1': | |
360 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
361 | + any(predicate(x) | |
362 | + for x in _leftmost_descendants(n))) | |
363 | + # A >>, B A is a left-most descendant of B. | |
364 | + elif operator == '>>,': | |
365 | + retval = lambda n: any((predicate(x) and | |
366 | + n in _leftmost_descendants(x)) | |
367 | + for x in ancestors(n)) | |
368 | + # A <<' B B is a right-most descendant of A. | |
369 | + elif operator == '<<\'': | |
370 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
371 | + any(predicate(x) | |
372 | + for x in _rightmost_descendants(n))) | |
373 | + # A >>' B A is a right-most descendant of B. | |
374 | + elif operator == '>>\'': | |
375 | + retval = lambda n: any((predicate(x) and | |
376 | + n in _rightmost_descendants(x)) | |
377 | + for x in ancestors(n)) | |
378 | + # A <<: B There is a single path of descent from A and B is on it. | |
379 | + elif operator == '<<:': | |
380 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
381 | + any(predicate(x) | |
382 | + for x in _unique_descendants(n))) | |
383 | + # A >>: B There is a single path of descent from B and A is on it. | |
384 | + elif operator == '>>:': | |
385 | + retval = lambda n: any(predicate(x) for x in unique_ancestors(n)) | |
386 | + # A . B A immediately precedes B. | |
387 | + elif operator == '.': | |
388 | + retval = lambda n: any(predicate(x) | |
389 | + for x in _immediately_after(n)) | |
390 | + # A , B A immediately follows B. | |
391 | + elif operator == ',': | |
392 | + retval = lambda n: any(predicate(x) | |
393 | + for x in _immediately_before(n)) | |
394 | + # A .. B A precedes B. | |
395 | + elif operator == '..': | |
396 | + retval = lambda n: any(predicate(x) for x in _after(n)) | |
397 | + # A ,, B A follows B. | |
398 | + elif operator == ',,': | |
399 | + retval = lambda n: any(predicate(x) for x in _before(n)) | |
400 | + # A $ B A is a sister of B (and A != B). | |
401 | + elif operator == '$' or operator == '%': | |
402 | + retval = lambda n: (hasattr(n, 'parent') and | |
403 | + bool(n.parent()) and | |
404 | + any(predicate(x) | |
405 | + for x in n.parent() if x is not n)) | |
406 | + # A $. B A is a sister of and immediately precedes B. | |
407 | + elif operator == '$.' or operator == '%.': | |
408 | + retval = lambda n: (hasattr(n, 'right_sibling') and | |
409 | + bool(n.right_sibling()) and | |
410 | + predicate(n.right_sibling())) | |
411 | + # A $, B A is a sister of and immediately follows B. | |
412 | + elif operator == '$,' or operator == '%,': | |
413 | + retval = lambda n: (hasattr(n, 'left_sibling') and | |
414 | + bool(n.left_sibling()) and | |
415 | + predicate(n.left_sibling())) | |
416 | + # A $.. B A is a sister of and precedes B. | |
417 | + elif operator == '$..' or operator == '%..': | |
418 | + retval = lambda n: (hasattr(n, 'parent') and | |
419 | + hasattr(n, 'parent_index') and | |
420 | + bool(n.parent()) and | |
421 | + any(predicate(x) for x in | |
422 | + n.parent()[n.parent_index() + 1:])) | |
423 | + # A $,, B A is a sister of and follows B. | |
424 | + elif operator == '$,,' or operator == '%,,': | |
425 | + retval = lambda n: (hasattr(n, 'parent') and | |
426 | + hasattr(n, 'parent_index') and | |
427 | + bool(n.parent()) and | |
428 | + any(predicate(x) for x in | |
429 | + n.parent()[:n.parent_index()])) | |
430 | + else: | |
431 | + assert False, 'cannot interpret tgrep operator "{0}"'.format( | |
432 | + operator) | |
433 | + # now return the built function | |
434 | + if negated: | |
435 | + return (lambda r: (lambda n: not r(n)))(retval) | |
436 | + else: | |
437 | + return retval | |
438 | + | |
439 | +def _tgrep_rel_conjunction_action(_s, _l, tokens): | |
440 | + ''' | |
441 | + Builds a lambda function representing a predicate on a tree node | |
442 | + from the conjunction of several other such lambda functions. | |
443 | + ''' | |
444 | + # filter out the ampersand | |
445 | + tokens = [x for x in tokens if x != '&'] | |
446 | + # print 'relation conjunction tokens: ', tokens | |
447 | + if len(tokens) == 1: | |
448 | + return tokens[0] | |
449 | + elif len(tokens) == 2: | |
450 | + return (lambda a, b: lambda n: a(n) and b(n))(tokens[0], tokens[1]) | |
451 | + | |
452 | +def _tgrep_rel_disjunction_action(_s, _l, tokens): | |
453 | + ''' | |
454 | + Builds a lambda function representing a predicate on a tree node | |
455 | + from the disjunction of several other such lambda functions. | |
456 | + ''' | |
457 | + # filter out the pipe | |
458 | + tokens = [x for x in tokens if x != '|'] | |
459 | + # print 'relation disjunction tokens: ', tokens | |
460 | + if len(tokens) == 1: | |
461 | + return tokens[0] | |
462 | + elif len(tokens) == 2: | |
463 | + return (lambda a, b: lambda n: a(n) or b(n))(tokens[0], tokens[1]) | |
464 | + | |
def _build_tgrep_parser(set_parse_actions = True):
    '''
    Builds a pyparsing-based parser object for tokenizing and
    interpreting tgrep search strings.

    When `set_parse_actions` is true, the `_tgrep_*_action` functions
    of this module are attached to the grammar elements; when it is
    false, no parse actions are attached.

    Returns the top-level grammar element (`tgrep_expr`).
    '''
    # relation operator: optional '!' followed by an operator string
    tgrep_op = (pyparsing.Optional('!') +
                pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*'))
    # "quoted" node names and /regex/ node patterns; both keep their
    # delimiters in the token (unquoteResults=False)
    tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\',
                                           unquoteResults=False)
    tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\',
                                              unquoteResults=False)
    # bare node name: a run of characters excluding whitespace and the
    # punctuation listed in the character class
    tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+')
    # forward declarations for the recursive parts of the grammar;
    # their definitions are filled in with `<<` further down
    tgrep_expr = pyparsing.Forward()
    tgrep_relations = pyparsing.Forward()
    # parenthesised sub-expression
    tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
    # tree position literal: 'N(' ... ')' enclosing an optional
    # comma-separated sequence of numbers
    tgrep_nltk_tree_pos = (
        pyparsing.Literal('N(') +
        pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' +
                           pyparsing.Optional(pyparsing.delimitedList(
                               pyparsing.Word(pyparsing.nums), delim=',') +
                                              pyparsing.Optional(','))) + ')')
    # a single node description
    tgrep_node_expr = (tgrep_qstring |
                       tgrep_node_regex |
                       '*' |
                       tgrep_node_literal)
    # a node: parenthesised expression, tree position, or one or more
    # node descriptions joined by '|' with an optional leading apostrophe
    tgrep_node = (tgrep_parens |
                  tgrep_nltk_tree_pos |
                  (pyparsing.Optional("'") +
                   tgrep_node_expr +
                   pyparsing.ZeroOrMore("|" + tgrep_node_expr)))
    # NOTE(review): this Forward is never given a definition; the name
    # is rebound two lines below before anything refers to it
    tgrep_relation = pyparsing.Forward()
    # optionally negated, square-bracketed group of relations
    tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
    # a relation: bracketed group, or operator followed by a node
    tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node
    # conjunction: relations in sequence, '&' between them is optional
    tgrep_rel_conjunction = pyparsing.Forward()
    tgrep_rel_conjunction << (tgrep_relation +
                              pyparsing.ZeroOrMore(pyparsing.Optional('&') +
                                                   tgrep_rel_conjunction))
    # disjunction: conjunctions separated by '|'
    tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
        "|" + tgrep_relations)
    # whole expression: a node optionally followed by relations
    tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
    if set_parse_actions:
        tgrep_node.setParseAction(_tgrep_node_action)
        tgrep_parens.setParseAction(_tgrep_parens_action)
        tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
        tgrep_relation.setParseAction(_tgrep_relation_action)
        tgrep_rel_conjunction.setParseAction(_tgrep_rel_conjunction_action)
        tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
        # the whole expression is also the conjunction of two
        # predicates: the first node predicate, and the remaining
        # relation predicates
        tgrep_expr.setParseAction(_tgrep_rel_conjunction_action)
    return tgrep_expr
517 | + | |
def tgrep_tokenize(tgrep_string):
    '''
    Splits a TGrep search string into a list of tokens, using the
    tgrep grammar built without parse actions.
    '''
    tokenizer = _build_tgrep_parser(False)
    parsed = tokenizer.parseString(tgrep_string)
    return list(parsed)
524 | + | |
def tgrep_compile(tgrep_string):
    '''
    Parses a TGrep search string with the parse actions enabled and
    returns the first parse result, a predicate function on tree
    nodes.  The whole string must be consumed by the grammar
    (`parseAll=True`).
    '''
    compiler = _build_tgrep_parser(True)
    results = compiler.parseString(tgrep_string, parseAll=True)
    return list(results)[0]
532 | + | |
def treepositions_no_leaves(tree):
    '''
    Returns the tree positions of the given tree that are not leaf
    positions, in the order reported by `tree.treepositions()`.
    '''
    every_position = tree.treepositions()
    # a position is interior exactly when it is a proper prefix of
    # some other position; leaves are never a prefix of anything
    interior = set(position[:cut]
                   for position in every_position
                   for cut in range(len(position)))
    return [position for position in every_position if position in interior]
546 | + | |
def tgrep_positions(tree, tgrep_string, search_leaves = True):
    '''
    Return all tree positions in the given tree which match the given
    `tgrep_string`.

    `tgrep_string` may be a tgrep search string, which is compiled
    first, or an already compiled predicate function, which is used
    as is.

    If `search_leaves` is False, the method will not return any
    results in leaf positions.

    Returns an empty list when `tree` has no `treepositions` method.
    '''
    if not hasattr(tree, 'treepositions'):
        return []
    # `basestring` only exists on Python 2; on Python 3 the builtin
    # `str` is the string type, and referencing `basestring` raises
    # NameError
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(tgrep_string, string_types):
        tgrep_string = tgrep_compile(tgrep_string)
    if search_leaves:
        search_positions = tree.treepositions()
    else:
        search_positions = treepositions_no_leaves(tree)
    return [position for position in search_positions
            if tgrep_string(tree[position])]
565 | + | |
def tgrep_nodes(tree, tgrep_string, search_leaves = True):
    '''
    Return all tree nodes in the given tree which match the given
    `tgrep_string`, by looking up each position reported by
    `tgrep_positions`.

    If `search_leaves` is False, the method will not return any
    results in leaf positions.
    '''
    matching_positions = tgrep_positions(tree, tgrep_string, search_leaves)
    matched = []
    for where in matching_positions:
        matched.append(tree[where])
    return matched