Commit 9ba27f8347a6d6a430526782cb96d8e69e0c58c3
1 parent
f806b639
Exists in
master
and in
1 other branch
Adiciona classe (tgrep) para procurar nós na arvore NLTK
Showing
1 changed file
with
575 additions
and
0 deletions
Show diff stats
@@ -0,0 +1,575 @@ | @@ -0,0 +1,575 @@ | ||
1 | +#!/usr/bin/env python | ||
2 | +# -*- coding: utf-8 -*- | ||
3 | +# | ||
4 | +# Permission is hereby granted, free of charge, to any person | ||
5 | +# obtaining a copy of this software and associated documentation files | ||
6 | +# (the "Software"), to deal in the Software without restriction, | ||
7 | +# including without limitation the rights to use, copy, modify, merge, | ||
8 | +# publish, distribute, sublicense, and/or sell copies of the Software, | ||
9 | +# and to permit persons to whom the Software is furnished to do so, | ||
10 | +# subject to the following conditions: | ||
11 | +# | ||
12 | +# The above copyright notice and this permission notice shall be | ||
13 | +# included in all copies or substantial portions of the Software. | ||
14 | +# | ||
15 | +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||
16 | +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||
17 | +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||
18 | +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||
19 | +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||
20 | +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||
21 | +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
22 | +# SOFTWARE. | ||
23 | + | ||
24 | +''' | ||
25 | +TGrep search implementation for NLTK trees. | |
26 | + | ||
27 | +(c) 16 March, 2013 Will Roberts <wildwilhelm@gmail.com>. | ||
28 | + | ||
29 | +This module supports TGrep2 syntax for matching parts of NLTK Trees. | ||
30 | +Note that many tgrep operators require the tree passed to be a | ||
31 | +ParentedTree. | ||
32 | + | ||
33 | +Tgrep tutorial: | ||
34 | +http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html | ||
35 | +Tgrep2 manual: | ||
36 | +http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf | ||
37 | +Tgrep2 source: | ||
38 | +http://tedlab.mit.edu/~dr/Tgrep2/ | ||
39 | +''' | ||
40 | + | ||
41 | +import nltk.tree | ||
42 | +import pyparsing | ||
43 | +import re | ||
44 | + | ||
def ancestors(node):
    '''
    Collects every node dominating the given tree node, ordered from
    the immediate parent outwards.

    A node without a `parent` accessor (e.g. a leaf) yields an empty
    list, because its parent cannot be recovered.
    '''
    if not hasattr(node, 'parent'):
        # leaves carry no link back to their parent
        return []
    chain = []
    link = node
    while True:
        link = link.parent()
        if not link:
            # walked past the top of the tree
            break
        chain.append(link)
    return chain
60 | + | ||
def unique_ancestors(node):
    '''
    Collects the nodes dominating the given node for as long as each
    of them has exactly one child, i.e. while there is only a single
    path of descent.  The walk stops at the first ancestor with any
    other number of children.

    A node without a `parent` accessor (e.g. a leaf) yields an empty
    list.
    '''
    if not hasattr(node, 'parent'):
        # leaves carry no link back to their parent
        return []
    chain = []
    link = node
    while True:
        link = link.parent()
        if not link or len(link) != 1:
            # ran off the top of the tree, or the path branches here
            break
        chain.append(link)
    return chain
75 | + | ||
76 | +def _descendants(node): | ||
77 | + ''' | ||
78 | + Returns the list of all nodes which are descended from the given | ||
79 | + tree node in some way. | ||
80 | + ''' | ||
81 | + if not hasattr(node, 'treepositions'): | ||
82 | + return [] | ||
83 | + return [node[x] for x in node.treepositions()[1:]] | ||
84 | + | ||
85 | +def _leftmost_descendants(node): | ||
86 | + ''' | ||
87 | + Returns the set of all nodes descended in some way through | ||
88 | + left branches from this node. | ||
89 | + ''' | ||
90 | + if not hasattr(node, 'treepositions'): | ||
91 | + return [] | ||
92 | + return [node[x] for x in node.treepositions()[1:] if all(y == 0 for y in x)] | ||
93 | + | ||
94 | +def _rightmost_descendants(node): | ||
95 | + ''' | ||
96 | + Returns the set of all nodes descended in some way through | ||
97 | + right branches from this node. | ||
98 | + ''' | ||
99 | + if not hasattr(node, 'treepositions'): | ||
100 | + return [] | ||
101 | + rightmost_leaf = max(node.treepositions()) | ||
102 | + return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] | ||
103 | + | ||
104 | +def _unique_descendants(node): | ||
105 | + ''' | ||
106 | + Returns the list of all nodes descended from the given node, where | ||
107 | + there is only a single path of descent. | ||
108 | + ''' | ||
109 | + results = [] | ||
110 | + current = node | ||
111 | + while current and isinstance(current, nltk.tree.Tree) and len(current) == 1: | ||
112 | + current = current[0] | ||
113 | + results.append(current) | ||
114 | + return results | ||
115 | + | ||
116 | +def _before(node): | ||
117 | + ''' | ||
118 | + Returns the set of all nodes that are before the given node. | ||
119 | + ''' | ||
120 | + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | ||
121 | + return [] | ||
122 | + pos = node.treeposition() | ||
123 | + tree = node.root() | ||
124 | + return [tree[x] for x in tree.treepositions() | ||
125 | + if x[:len(pos)] < pos[:len(x)]] | ||
126 | + | ||
127 | +def _immediately_before(node): | ||
128 | + ''' | ||
129 | + Returns the set of all nodes that are immediately before the given | ||
130 | + node. | ||
131 | + | ||
132 | + Tree node A immediately precedes node B if the last terminal | ||
133 | + symbol (word) produced by A immediately precedes the first | ||
134 | + terminal symbol produced by B. | ||
135 | + ''' | ||
136 | + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | ||
137 | + return [] | ||
138 | + pos = node.treeposition() | ||
139 | + # go "upwards" from pos until there is a place we can go to the left | ||
140 | + idx = len(pos) - 1 | ||
141 | + while 0 <= idx and pos[idx] == 0: | ||
142 | + idx -= 1 | ||
143 | + if idx < 0: | ||
144 | + return [] | ||
145 | + pos = list(pos[:idx + 1]) | ||
146 | + pos[-1] -= 1 | ||
147 | + before = node.root()[pos] | ||
148 | + return [before] + _rightmost_descendants(before) | ||
149 | + | ||
150 | +def _after(node): | ||
151 | + ''' | ||
152 | + Returns the set of all nodes that are after the given node. | ||
153 | + ''' | ||
154 | + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | ||
155 | + return [] | ||
156 | + pos = node.treeposition() | ||
157 | + tree = node.root() | ||
158 | + return [tree[x] for x in tree.treepositions() | ||
159 | + if x[:len(pos)] > pos[:len(x)]] | ||
160 | + | ||
161 | +def _immediately_after(node): | ||
162 | + ''' | ||
163 | + Returns the set of all nodes that are immediately after the given | ||
164 | + node. | ||
165 | + | ||
166 | + Tree node A immediately follows node B if the first terminal | ||
167 | + symbol (word) produced by A immediately follows the last | ||
168 | + terminal symbol produced by B. | ||
169 | + ''' | ||
170 | + if (not hasattr(node, 'root') or not hasattr(node, 'treeposition') or | ||
171 | + not hasattr(node, 'parent')): | ||
172 | + return [] | ||
173 | + pos = node.treeposition() | ||
174 | + # go "upwards" from pos until there is a place we can go to the | ||
175 | + # right | ||
176 | + idx = len(pos) - 1 | ||
177 | + current = node.parent() | ||
178 | + while 0 <= idx and pos[idx] == len(current) - 1: | ||
179 | + idx -= 1 | ||
180 | + current = current.parent() | ||
181 | + if idx < 0: | ||
182 | + return [] | ||
183 | + pos = list(pos[:idx + 1]) | ||
184 | + pos[-1] += 1 | ||
185 | + after = node.root()[pos] | ||
186 | + return [after] + _leftmost_descendants(after) | ||
187 | + | ||
def _tgrep_node_literal_value(node):
    '''
    Gets the string value of a given parse tree node, for comparison
    using the tgrep node literal predicates.

    For an `nltk.tree.Tree` this is the node's label; anything else
    (a leaf) is converted to text.
    '''
    if isinstance(node, nltk.tree.Tree):
        return node.label()
    # `unicode` only exists on Python 2; fall back to `str` on Python 3
    # instead of failing with a NameError on every leaf
    try:
        text_type = unicode
    except NameError:
        text_type = str
    return text_type(node)
194 | + | ||
195 | +def _tgrep_node_action(_s, _l, tokens): | ||
196 | + ''' | ||
197 | + Builds a lambda function representing a predicate on a tree node | ||
198 | + depending on the name of its node. | ||
199 | + ''' | ||
200 | + # print 'node tokens: ', tokens | ||
201 | + if tokens[0] == "'": | ||
202 | + # strip initial apostrophe (tgrep2 print command) | ||
203 | + tokens = tokens[1:] | ||
204 | + if len(tokens) > 1: | ||
205 | + # disjunctive definition of a node name | ||
206 | + assert list(set(tokens[1::2])) == ['|'] | ||
207 | + # recursively call self to interpret each node name definition | ||
208 | + tokens = [_tgrep_node_action(None, None, [node]) | ||
209 | + for node in tokens[::2]] | ||
210 | + # capture tokens and return the disjunction | ||
211 | + return (lambda t: lambda n: any(f(n) for f in t))(tokens) | ||
212 | + else: | ||
213 | + if hasattr(tokens[0], '__call__'): | ||
214 | + # this is a previously interpreted parenthetical node | ||
215 | + # definition (lambda function) | ||
216 | + return tokens[0] | ||
217 | + elif tokens[0] == '*' or tokens[0] == '__': | ||
218 | + return lambda n: True | ||
219 | + elif tokens[0].startswith('"'): | ||
220 | + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip('"')) | ||
221 | + elif tokens[0].startswith('/'): | ||
222 | + return (lambda r: lambda n: | ||
223 | + r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip('/'))) | ||
224 | + elif tokens[0].startswith('i@'): | ||
225 | + return (lambda s: lambda n: | ||
226 | + _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower()) | ||
227 | + else: | ||
228 | + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0]) | ||
229 | + | ||
230 | +def _tgrep_parens_action(_s, _l, tokens): | ||
231 | + ''' | ||
232 | + Builds a lambda function representing a predicate on a tree node | ||
233 | + from a parenthetical notation. | ||
234 | + ''' | ||
235 | + # print 'parenthetical tokens: ', tokens | ||
236 | + assert len(tokens) == 3 | ||
237 | + assert tokens[0] == '(' | ||
238 | + assert tokens[2] == ')' | ||
239 | + return tokens[1] | ||
240 | + | ||
241 | +def _tgrep_nltk_tree_pos_action(_s, _l, tokens): | ||
242 | + ''' | ||
243 | + Builds a lambda function representing a predicate on a tree node | ||
244 | + which returns true if the node is located at a specific tree | ||
245 | + position. | ||
246 | + ''' | ||
247 | + # recover the tuple from the parsed sting | ||
248 | + node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) | ||
249 | + # capture the node's tree position | ||
250 | + return (lambda i: lambda n: (hasattr(n, 'treeposition') and | ||
251 | + n.treeposition() == i))(node_tree_position) | ||
252 | + | ||
253 | +def _tgrep_relation_action(_s, _l, tokens): | ||
254 | + ''' | ||
255 | + Builds a lambda function representing a predicate on a tree node | ||
256 | + depending on its relation to other nodes in the tree. | ||
257 | + ''' | ||
258 | + # print 'relation tokens: ', tokens | ||
259 | + # process negation first if needed | ||
260 | + negated = False | ||
261 | + if tokens[0] == '!': | ||
262 | + negated = True | ||
263 | + tokens = tokens[1:] | ||
264 | + if tokens[0] == '[': | ||
265 | + # process square-bracketed relation expressions | ||
266 | + assert len(tokens) == 3 | ||
267 | + assert tokens[2] == ']' | ||
268 | + retval = tokens[1] | ||
269 | + else: | ||
270 | + # process operator-node relation expressions | ||
271 | + assert len(tokens) == 2 | ||
272 | + operator, predicate = tokens | ||
273 | + # A < B A is the parent of (immediately dominates) B. | ||
274 | + if operator == '<': | ||
275 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | ||
276 | + any(predicate(x) for x in n)) | ||
277 | + # A > B A is the child of B. | ||
278 | + elif operator == '>': | ||
279 | + retval = lambda n: (hasattr(n, 'parent') and | ||
280 | + bool(n.parent()) and | ||
281 | + predicate(n.parent())) | ||
282 | + # A <, B Synonymous with A <1 B. | ||
283 | + elif operator == '<,' or operator == '<1': | ||
284 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | ||
285 | + bool(list(n)) and | ||
286 | + predicate(n[0])) | ||
287 | + # A >, B Synonymous with A >1 B. | ||
288 | + elif operator == '>,' or operator == '>1': | ||
289 | + retval = lambda n: (hasattr(n, 'parent') and | ||
290 | + bool(n.parent()) and | ||
291 | + (n is n.parent()[0]) and | ||
292 | + predicate(n.parent())) | ||
293 | + # A <N B B is the Nth child of A (the first child is <1). | ||
294 | + elif operator[0] == '<' and operator[1:].isdigit(): | ||
295 | + idx = int(operator[1:]) | ||
296 | + # capture the index parameter | ||
297 | + retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and | ||
298 | + bool(list(n)) and | ||
299 | + 0 <= i < len(n) and | ||
300 | + predicate(n[i])))(idx - 1) | ||
301 | + # A >N B A is the Nth child of B (the first child is >1). | ||
302 | + elif operator[0] == '>' and operator[1:].isdigit(): | ||
303 | + idx = int(operator[1:]) | ||
304 | + # capture the index parameter | ||
305 | + retval = (lambda i: lambda n: (hasattr(n, 'parent') and | ||
306 | + bool(n.parent()) and | ||
307 | + 0 <= i < len(n.parent()) and | ||
308 | + (n is n.parent()[i]) and | ||
309 | + predicate(n.parent())))(idx - 1) | ||
310 | + # A <' B B is the last child of A (also synonymous with A <-1 B). | ||
311 | + # A <- B B is the last child of A (synonymous with A <-1 B). | ||
312 | + elif operator == '<\'' or operator == '<-' or operator == '<-1': | ||
313 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and bool(list(n)) | ||
314 | + and predicate(n[-1])) | ||
315 | + # A >' B A is the last child of B (also synonymous with A >-1 B). | ||
316 | + # A >- B A is the last child of B (synonymous with A >-1 B). | ||
317 | + elif operator == '>\'' or operator == '>-' or operator == '>-1': | ||
318 | + retval = lambda n: (hasattr(n, 'parent') and | ||
319 | + bool(n.parent()) and | ||
320 | + (n is n.parent()[-1]) and | ||
321 | + predicate(n.parent())) | ||
322 | + # A <-N B B is the N th-to-last child of A (the last child is <-1). | ||
323 | + elif operator[:2] == '<-' and operator[2:].isdigit(): | ||
324 | + idx = -int(operator[2:]) | ||
325 | + # capture the index parameter | ||
326 | + retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and | ||
327 | + bool(list(n)) and | ||
328 | + 0 <= (i + len(n)) < len(n) and | ||
329 | + predicate(n[i + len(n)])))(idx) | ||
330 | + # A >-N B A is the N th-to-last child of B (the last child is >-1). | ||
331 | + elif operator[:2] == '>-' and operator[2:].isdigit(): | ||
332 | + idx = -int(operator[2:]) | ||
333 | + # capture the index parameter | ||
334 | + retval = (lambda i: lambda n: | ||
335 | + (hasattr(n, 'parent') and | ||
336 | + bool(n.parent()) and | ||
337 | + 0 <= (i + len(n.parent())) < len(n.parent()) and | ||
338 | + (n is n.parent()[i + len(n.parent())]) and | ||
339 | + predicate(n.parent())))(idx) | ||
340 | + # A <: B B is the only child of A | ||
341 | + elif operator == '<:': | ||
342 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | ||
343 | + len(n) == 1 and | ||
344 | + predicate(n[0])) | ||
345 | + # A >: B A is the only child of B. | ||
346 | + elif operator == '>:': | ||
347 | + retval = lambda n: (hasattr(n, 'parent') and | ||
348 | + bool(n.parent()) and | ||
349 | + len(n.parent()) == 1 and | ||
350 | + predicate(n.parent())) | ||
351 | + # A << B A dominates B (A is an ancestor of B). | ||
352 | + elif operator == '<<': | ||
353 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | ||
354 | + any(predicate(x) for x in _descendants(n))) | ||
355 | + # A >> B A is dominated by B (A is a descendant of B). | ||
356 | + elif operator == '>>': | ||
357 | + retval = lambda n: any(predicate(x) for x in ancestors(n)) | ||
358 | + # A <<, B B is a left-most descendant of A. | ||
359 | + elif operator == '<<,' or operator == '<<1': | ||
360 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | ||
361 | + any(predicate(x) | ||
362 | + for x in _leftmost_descendants(n))) | ||
363 | + # A >>, B A is a left-most descendant of B. | ||
364 | + elif operator == '>>,': | ||
365 | + retval = lambda n: any((predicate(x) and | ||
366 | + n in _leftmost_descendants(x)) | ||
367 | + for x in ancestors(n)) | ||
368 | + # A <<' B B is a right-most descendant of A. | ||
369 | + elif operator == '<<\'': | ||
370 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | ||
371 | + any(predicate(x) | ||
372 | + for x in _rightmost_descendants(n))) | ||
373 | + # A >>' B A is a right-most descendant of B. | ||
374 | + elif operator == '>>\'': | ||
375 | + retval = lambda n: any((predicate(x) and | ||
376 | + n in _rightmost_descendants(x)) | ||
377 | + for x in ancestors(n)) | ||
378 | + # A <<: B There is a single path of descent from A and B is on it. | ||
379 | + elif operator == '<<:': | ||
380 | + retval = lambda n: (isinstance(n, nltk.tree.Tree) and | ||
381 | + any(predicate(x) | ||
382 | + for x in _unique_descendants(n))) | ||
383 | + # A >>: B There is a single path of descent from B and A is on it. | ||
384 | + elif operator == '>>:': | ||
385 | + retval = lambda n: any(predicate(x) for x in unique_ancestors(n)) | ||
386 | + # A . B A immediately precedes B. | ||
387 | + elif operator == '.': | ||
388 | + retval = lambda n: any(predicate(x) | ||
389 | + for x in _immediately_after(n)) | ||
390 | + # A , B A immediately follows B. | ||
391 | + elif operator == ',': | ||
392 | + retval = lambda n: any(predicate(x) | ||
393 | + for x in _immediately_before(n)) | ||
394 | + # A .. B A precedes B. | ||
395 | + elif operator == '..': | ||
396 | + retval = lambda n: any(predicate(x) for x in _after(n)) | ||
397 | + # A ,, B A follows B. | ||
398 | + elif operator == ',,': | ||
399 | + retval = lambda n: any(predicate(x) for x in _before(n)) | ||
400 | + # A $ B A is a sister of B (and A != B). | ||
401 | + elif operator == '$' or operator == '%': | ||
402 | + retval = lambda n: (hasattr(n, 'parent') and | ||
403 | + bool(n.parent()) and | ||
404 | + any(predicate(x) | ||
405 | + for x in n.parent() if x is not n)) | ||
406 | + # A $. B A is a sister of and immediately precedes B. | ||
407 | + elif operator == '$.' or operator == '%.': | ||
408 | + retval = lambda n: (hasattr(n, 'right_sibling') and | ||
409 | + bool(n.right_sibling()) and | ||
410 | + predicate(n.right_sibling())) | ||
411 | + # A $, B A is a sister of and immediately follows B. | ||
412 | + elif operator == '$,' or operator == '%,': | ||
413 | + retval = lambda n: (hasattr(n, 'left_sibling') and | ||
414 | + bool(n.left_sibling()) and | ||
415 | + predicate(n.left_sibling())) | ||
416 | + # A $.. B A is a sister of and precedes B. | ||
417 | + elif operator == '$..' or operator == '%..': | ||
418 | + retval = lambda n: (hasattr(n, 'parent') and | ||
419 | + hasattr(n, 'parent_index') and | ||
420 | + bool(n.parent()) and | ||
421 | + any(predicate(x) for x in | ||
422 | + n.parent()[n.parent_index() + 1:])) | ||
423 | + # A $,, B A is a sister of and follows B. | ||
424 | + elif operator == '$,,' or operator == '%,,': | ||
425 | + retval = lambda n: (hasattr(n, 'parent') and | ||
426 | + hasattr(n, 'parent_index') and | ||
427 | + bool(n.parent()) and | ||
428 | + any(predicate(x) for x in | ||
429 | + n.parent()[:n.parent_index()])) | ||
430 | + else: | ||
431 | + assert False, 'cannot interpret tgrep operator "{0}"'.format( | ||
432 | + operator) | ||
433 | + # now return the built function | ||
434 | + if negated: | ||
435 | + return (lambda r: (lambda n: not r(n)))(retval) | ||
436 | + else: | ||
437 | + return retval | ||
438 | + | ||
439 | +def _tgrep_rel_conjunction_action(_s, _l, tokens): | ||
440 | + ''' | ||
441 | + Builds a lambda function representing a predicate on a tree node | ||
442 | + from the conjunction of several other such lambda functions. | ||
443 | + ''' | ||
444 | + # filter out the ampersand | ||
445 | + tokens = [x for x in tokens if x != '&'] | ||
446 | + # print 'relation conjunction tokens: ', tokens | ||
447 | + if len(tokens) == 1: | ||
448 | + return tokens[0] | ||
449 | + elif len(tokens) == 2: | ||
450 | + return (lambda a, b: lambda n: a(n) and b(n))(tokens[0], tokens[1]) | ||
451 | + | ||
452 | +def _tgrep_rel_disjunction_action(_s, _l, tokens): | ||
453 | + ''' | ||
454 | + Builds a lambda function representing a predicate on a tree node | ||
455 | + from the disjunction of several other such lambda functions. | ||
456 | + ''' | ||
457 | + # filter out the pipe | ||
458 | + tokens = [x for x in tokens if x != '|'] | ||
459 | + # print 'relation disjunction tokens: ', tokens | ||
460 | + if len(tokens) == 1: | ||
461 | + return tokens[0] | ||
462 | + elif len(tokens) == 2: | ||
463 | + return (lambda a, b: lambda n: a(n) or b(n))(tokens[0], tokens[1]) | ||
464 | + | ||
def _build_tgrep_parser(set_parse_actions=True):
    '''
    Builds a pyparsing-based parser object for tokenizing and
    interpreting tgrep search strings.

    When `set_parse_actions` is true, the `_tgrep_*_action` functions
    are attached to the grammar; when it is false, no parse actions
    are attached.

    Returns the grammar element for a complete tgrep expression.
    '''
    # an operator: optional negation, then one leading operator
    # character followed by any number of modifier characters
    tgrep_op = (pyparsing.Optional('!') +
                pyparsing.Regex(r"[$%,.<>][%,.<>0-9-':]*"))
    # quoted forms keep their delimiters (unquoteResults=False)
    tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\',
                                           unquoteResults=False)
    tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\',
                                              unquoteResults=False)
    # a bare node name: any run of characters not reserved by the
    # grammar (brackets are escaped to avoid a "nested set" warning)
    tgrep_node_literal = pyparsing.Regex(
        r"[^\]\[ \r\t\n;:.,&|<>()$!@%'^=]+")
    tgrep_expr = pyparsing.Forward()
    tgrep_relations = pyparsing.Forward()
    tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')'
    # N(...) with a comma-separated list of digits
    tgrep_nltk_tree_pos = (
        pyparsing.Literal('N(') +
        pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' +
                           pyparsing.Optional(pyparsing.delimitedList(
                               pyparsing.Word(pyparsing.nums), delim=',') +
                                              pyparsing.Optional(','))) + ')')
    tgrep_node_expr = (tgrep_qstring |
                       tgrep_node_regex |
                       '*' |
                       tgrep_node_literal)
    tgrep_node = (tgrep_parens |
                  tgrep_nltk_tree_pos |
                  (pyparsing.Optional("'") +
                   tgrep_node_expr +
                   pyparsing.ZeroOrMore("|" + tgrep_node_expr)))
    tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']'
    tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node
    tgrep_rel_conjunction = pyparsing.Forward()
    # the "&" between conjoined relations is optional
    tgrep_rel_conjunction << (tgrep_relation +
                              pyparsing.ZeroOrMore(pyparsing.Optional('&') +
                                                   tgrep_rel_conjunction))
    tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
        "|" + tgrep_relations)
    tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
    if set_parse_actions:
        tgrep_node.setParseAction(_tgrep_node_action)
        tgrep_parens.setParseAction(_tgrep_parens_action)
        tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action)
        tgrep_relation.setParseAction(_tgrep_relation_action)
        tgrep_rel_conjunction.setParseAction(_tgrep_rel_conjunction_action)
        tgrep_relations.setParseAction(_tgrep_rel_disjunction_action)
        # the whole expression is also the conjunction of two
        # predicates: the first node predicate, and the remaining
        # relation predicates
        tgrep_expr.setParseAction(_tgrep_rel_conjunction_action)
    return tgrep_expr
517 | + | ||
def tgrep_tokenize(tgrep_string):
    '''
    Splits a TGrep search string into its separate tokens and returns
    them as a list, using the grammar without any parse actions.
    '''
    tokenizer = _build_tgrep_parser(False)
    parsed = tokenizer.parseString(tgrep_string)
    return list(parsed)
524 | + | ||
def tgrep_compile(tgrep_string):
    '''
    Turns a TGrep search string into a lambda function by parsing it
    with the grammar that has its parse actions attached.

    The whole string must be consumed by the parse (`parseAll=True`);
    the first element of the parse result is returned.
    '''
    compiler = _build_tgrep_parser(True)
    parsed = compiler.parseString(tgrep_string, parseAll=True)
    return list(parsed)[0]
532 | + | ||
def treepositions_no_leaves(tree):
    '''
    Lists the tree positions of the given tree that do not belong to
    leaf nodes, in the order `tree.treepositions()` reports them.
    '''
    positions = tree.treepositions()
    # a position is a leaf exactly when it is not a proper prefix of
    # any other position, so gather every proper prefix first
    interior = set(position[:cut]
                   for position in positions
                   for cut in range(len(position)))
    kept = []
    for position in positions:
        if position in interior:
            kept.append(position)
    return kept
546 | + | ||
def tgrep_positions(tree, tgrep_string, search_leaves = True):
    '''
    Return all tree positions in the given tree which match the given
    `tgrep_string`.

    `tgrep_string` may be a TGrep search string, which is compiled
    first, or an already compiled predicate taking a node.

    If `search_leaves` is False, the method will not return any
    results in leaf positions.

    An object without a `treepositions` method yields an empty list.
    '''
    if not hasattr(tree, 'treepositions'):
        return []
    # `basestring` only exists on Python 2; fall back to `str` on
    # Python 3 instead of failing with a NameError
    try:
        string_types = basestring
    except NameError:
        string_types = str
    if isinstance(tgrep_string, string_types):
        tgrep_string = tgrep_compile(tgrep_string)
    if search_leaves:
        search_positions = tree.treepositions()
    else:
        search_positions = treepositions_no_leaves(tree)
    return [position for position in search_positions
            if tgrep_string(tree[position])]
565 | + | ||
def tgrep_nodes(tree, tgrep_string, search_leaves = True):
    '''
    Return all tree nodes in the given tree which match the given
    `tgrep_string`.

    The matching positions are obtained from `tgrep_positions` and
    each one is then looked up in `tree`.

    If `search_leaves` is False, the method will not return any
    results in leaf positions.
    '''
    matches = tgrep_positions(tree, tgrep_string, search_leaves)
    nodes = []
    for position in matches:
        nodes.append(tree[position])
    return nodes