Commit 3dc55945f34e948398d13b7346524af6a001348d
1 parent
4fd9f50a
Exists in
master
and in
1 other branch
Adiciona nova versao tgrep
Showing
1 changed file
with
144 additions
and
122 deletions
Show diff stats
src/new/tgrep.py
... | ... | @@ -38,6 +38,7 @@ Tgrep2 source: |
38 | 38 | http://tedlab.mit.edu/~dr/Tgrep2/ |
39 | 39 | ''' |
40 | 40 | |
41 | +from builtins import bytes, range, str | |
41 | 42 | import nltk.tree |
42 | 43 | import pyparsing |
43 | 44 | import re |
... | ... | @@ -48,11 +49,12 @@ def ancestors(node): |
48 | 49 | This method will not work with leaf nodes, since there is no way |
49 | 50 | to recover the parent. |
50 | 51 | ''' |
51 | - # if node is a leaf, we cannot retrieve its parent | |
52 | - if not hasattr(node, 'parent'): | |
53 | - return [] | |
54 | 52 | results = [] |
55 | - current = node.parent() | |
53 | + try: | |
54 | + current = node.parent() | |
55 | + except AttributeError: | |
56 | + # if node is a leaf, we cannot retrieve its parent | |
57 | + return results | |
56 | 58 | while current: |
57 | 59 | results.append(current) |
58 | 60 | current = current.parent() |
... | ... | @@ -63,11 +65,12 @@ def unique_ancestors(node): |
63 | 65 | Returns the list of all nodes dominating the given node, where |
64 | 66 | there is only a single path of descent. |
65 | 67 | ''' |
66 | - # if node is a leaf, we cannot retrieve its parent | |
67 | - if not hasattr(node, 'parent'): | |
68 | - return [] | |
69 | 68 | results = [] |
70 | - current = node.parent() | |
69 | + try: | |
70 | + current = node.parent() | |
71 | + except AttributeError: | |
72 | + # if node is a leaf, we cannot retrieve its parent | |
73 | + return results | |
71 | 74 | while current and len(current) == 1: |
72 | 75 | results.append(current) |
73 | 76 | current = current.parent() |
... | ... | @@ -78,29 +81,38 @@ def _descendants(node): |
78 | 81 | Returns the list of all nodes which are descended from the given |
79 | 82 | tree node in some way. |
80 | 83 | ''' |
81 | - if not hasattr(node, 'treepositions'): | |
84 | + try: | |
85 | + treepos = node.treepositions() | |
86 | + except AttributeError: | |
82 | 87 | return [] |
83 | - return [node[x] for x in node.treepositions()[1:]] | |
88 | + return [node[x] for x in treepos[1:]] | |
84 | 89 | |
85 | 90 | def _leftmost_descendants(node): |
86 | 91 | ''' |
87 | 92 | Returns the set of all nodes descended in some way through |
88 | 93 | left branches from this node. |
89 | 94 | ''' |
90 | - if not hasattr(node, 'treepositions'): | |
95 | + try: | |
96 | + treepos = node.treepositions() | |
97 | + except AttributeError: | |
91 | 98 | return [] |
92 | - return [node[x] for x in node.treepositions()[1:] if all(y == 0 for y in x)] | |
99 | + return [node[x] for x in treepos[1:] if all(y == 0 for y in x)] | |
93 | 100 | |
94 | 101 | def _rightmost_descendants(node): |
95 | 102 | ''' |
96 | 103 | Returns the set of all nodes descended in some way through |
97 | 104 | right branches from this node. |
98 | 105 | ''' |
99 | - if not hasattr(node, 'treepositions'): | |
106 | + try: | |
107 | + rightmost_leaf = max(node.treepositions()) | |
108 | + except AttributeError: | |
100 | 109 | return [] |
101 | - rightmost_leaf = max(node.treepositions()) | |
102 | 110 | return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] |
103 | 111 | |
112 | +def _istree(obj): | |
113 | + '''Predicate to check whether `obj` is a nltk.tree.Tree.''' | |
114 | + return isinstance(obj, nltk.tree.Tree) | |
115 | + | |
104 | 116 | def _unique_descendants(node): |
105 | 117 | ''' |
106 | 118 | Returns the list of all nodes descended from the given node, where |
... | ... | @@ -108,7 +120,7 @@ def _unique_descendants(node): |
108 | 120 | ''' |
109 | 121 | results = [] |
110 | 122 | current = node |
111 | - while current and isinstance(current, nltk.tree.Tree) and len(current) == 1: | |
123 | + while current and _istree(current) and len(current) == 1: | |
112 | 124 | current = current[0] |
113 | 125 | results.append(current) |
114 | 126 | return results |
... | ... | @@ -117,10 +129,11 @@ def _before(node): |
117 | 129 | ''' |
118 | 130 | Returns the set of all nodes that are before the given node. |
119 | 131 | ''' |
120 | - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | |
132 | + try: | |
133 | + pos = node.treeposition() | |
134 | + tree = node.root() | |
135 | + except AttributeError: | |
121 | 136 | return [] |
122 | - pos = node.treeposition() | |
123 | - tree = node.root() | |
124 | 137 | return [tree[x] for x in tree.treepositions() |
125 | 138 | if x[:len(pos)] < pos[:len(x)]] |
126 | 139 | |
... | ... | @@ -133,9 +146,11 @@ def _immediately_before(node): |
133 | 146 | symbol (word) produced by A immediately precedes the first |
134 | 147 | terminal symbol produced by B. |
135 | 148 | ''' |
136 | - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | |
149 | + try: | |
150 | + pos = node.treeposition() | |
151 | + tree = node.root() | |
152 | + except AttributeError: | |
137 | 153 | return [] |
138 | - pos = node.treeposition() | |
139 | 154 | # go "upwards" from pos until there is a place we can go to the left |
140 | 155 | idx = len(pos) - 1 |
141 | 156 | while 0 <= idx and pos[idx] == 0: |
... | ... | @@ -144,17 +159,18 @@ def _immediately_before(node): |
144 | 159 | return [] |
145 | 160 | pos = list(pos[:idx + 1]) |
146 | 161 | pos[-1] -= 1 |
147 | - before = node.root()[pos] | |
162 | + before = tree[pos] | |
148 | 163 | return [before] + _rightmost_descendants(before) |
149 | 164 | |
150 | 165 | def _after(node): |
151 | 166 | ''' |
152 | 167 | Returns the set of all nodes that are after the given node. |
153 | 168 | ''' |
154 | - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | |
169 | + try: | |
170 | + pos = node.treeposition() | |
171 | + tree = node.root() | |
172 | + except AttributeError: | |
155 | 173 | return [] |
156 | - pos = node.treeposition() | |
157 | - tree = node.root() | |
158 | 174 | return [tree[x] for x in tree.treepositions() |
159 | 175 | if x[:len(pos)] > pos[:len(x)]] |
160 | 176 | |
... | ... | @@ -167,14 +183,15 @@ def _immediately_after(node): |
167 | 183 | symbol (word) produced by A immediately follows the last |
168 | 184 | terminal symbol produced by B. |
169 | 185 | ''' |
170 | - if (not hasattr(node, 'root') or not hasattr(node, 'treeposition') or | |
171 | - not hasattr(node, 'parent')): | |
186 | + try: | |
187 | + pos = node.treeposition() | |
188 | + tree = node.root() | |
189 | + current = node.parent() | |
190 | + except AttributeError: | |
172 | 191 | return [] |
173 | - pos = node.treeposition() | |
174 | 192 | # go "upwards" from pos until there is a place we can go to the |
175 | 193 | # right |
176 | 194 | idx = len(pos) - 1 |
177 | - current = node.parent() | |
178 | 195 | while 0 <= idx and pos[idx] == len(current) - 1: |
179 | 196 | idx -= 1 |
180 | 197 | current = current.parent() |
... | ... | @@ -182,7 +199,7 @@ def _immediately_after(node): |
182 | 199 | return [] |
183 | 200 | pos = list(pos[:idx + 1]) |
184 | 201 | pos[-1] += 1 |
185 | - after = node.root()[pos] | |
202 | + after = tree[pos] | |
186 | 203 | return [after] + _leftmost_descendants(after) |
187 | 204 | |
188 | 205 | def _tgrep_node_literal_value(node): |
... | ... | @@ -190,7 +207,7 @@ def _tgrep_node_literal_value(node): |
190 | 207 | Gets the string value of a given parse tree node, for comparison |
191 | 208 | using the tgrep node literal predicates. |
192 | 209 | ''' |
193 | - return (node.label() if isinstance(node, nltk.tree.Tree) else unicode(node)) | |
210 | + return (node.label() if _istree(node) else str(node)) | |
194 | 211 | |
195 | 212 | def _tgrep_node_action(_s, _l, tokens): |
196 | 213 | ''' |
... | ... | @@ -198,30 +215,30 @@ def _tgrep_node_action(_s, _l, tokens): |
198 | 215 | depending on the name of its node. |
199 | 216 | ''' |
200 | 217 | # print 'node tokens: ', tokens |
201 | - if tokens[0] == "'": | |
218 | + if tokens[0] == u"'": | |
202 | 219 | # strip initial apostrophe (tgrep2 print command) |
203 | 220 | tokens = tokens[1:] |
204 | 221 | if len(tokens) > 1: |
205 | 222 | # disjunctive definition of a node name |
206 | - assert list(set(tokens[1::2])) == ['|'] | |
223 | + assert list(set(tokens[1::2])) == [u'|'] | |
207 | 224 | # recursively call self to interpret each node name definition |
208 | 225 | tokens = [_tgrep_node_action(None, None, [node]) |
209 | 226 | for node in tokens[::2]] |
210 | 227 | # capture tokens and return the disjunction |
211 | 228 | return (lambda t: lambda n: any(f(n) for f in t))(tokens) |
212 | 229 | else: |
213 | - if hasattr(tokens[0], '__call__'): | |
230 | + if hasattr(tokens[0], u'__call__'): | |
214 | 231 | # this is a previously interpreted parenthetical node |
215 | 232 | # definition (lambda function) |
216 | 233 | return tokens[0] |
217 | - elif tokens[0] == '*' or tokens[0] == '__': | |
234 | + elif tokens[0] == u'*' or tokens[0] == u'__': | |
218 | 235 | return lambda n: True |
219 | - elif tokens[0].startswith('"'): | |
220 | - return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip('"')) | |
221 | - elif tokens[0].startswith('/'): | |
236 | + elif tokens[0].startswith(u'"'): | |
237 | + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip(u'"')) | |
238 | + elif tokens[0].startswith(u'/'): | |
222 | 239 | return (lambda r: lambda n: |
223 | - r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip('/'))) | |
224 | - elif tokens[0].startswith('i@'): | |
240 | + r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip(u'/'))) | |
241 | + elif tokens[0].startswith(u'i@'): | |
225 | 242 | return (lambda s: lambda n: |
226 | 243 | _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower()) |
227 | 244 | else: |
... | ... | @@ -234,8 +251,8 @@ def _tgrep_parens_action(_s, _l, tokens): |
234 | 251 | ''' |
235 | 252 | # print 'parenthetical tokens: ', tokens |
236 | 253 | assert len(tokens) == 3 |
237 | - assert tokens[0] == '(' | |
238 | - assert tokens[2] == ')' | |
254 | + assert tokens[0] == u'(' | |
255 | + assert tokens[2] == u')' | |
239 | 256 | return tokens[1] |
240 | 257 | |
241 | 258 | def _tgrep_nltk_tree_pos_action(_s, _l, tokens): |
... | ... | @@ -247,7 +264,7 @@ def _tgrep_nltk_tree_pos_action(_s, _l, tokens): |
247 | 264 | # recover the tuple from the parsed string |
248 | 265 | node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) |
249 | 266 | # capture the node's tree position |
250 | - return (lambda i: lambda n: (hasattr(n, 'treeposition') and | |
267 | + return (lambda i: lambda n: (hasattr(n, u'treeposition') and | |
251 | 268 | n.treeposition() == i))(node_tree_position) |
252 | 269 | |
253 | 270 | def _tgrep_relation_action(_s, _l, tokens): |
... | ... | @@ -258,177 +275,177 @@ def _tgrep_relation_action(_s, _l, tokens): |
258 | 275 | # print 'relation tokens: ', tokens |
259 | 276 | # process negation first if needed |
260 | 277 | negated = False |
261 | - if tokens[0] == '!': | |
278 | + if tokens[0] == u'!': | |
262 | 279 | negated = True |
263 | 280 | tokens = tokens[1:] |
264 | - if tokens[0] == '[': | |
281 | + if tokens[0] == u'[': | |
265 | 282 | # process square-bracketed relation expressions |
266 | 283 | assert len(tokens) == 3 |
267 | - assert tokens[2] == ']' | |
284 | + assert tokens[2] == u']' | |
268 | 285 | retval = tokens[1] |
269 | 286 | else: |
270 | 287 | # process operator-node relation expressions |
271 | 288 | assert len(tokens) == 2 |
272 | 289 | operator, predicate = tokens |
273 | 290 | # A < B A is the parent of (immediately dominates) B. |
274 | - if operator == '<': | |
275 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
291 | + if operator == u'<': | |
292 | + retval = lambda n: (_istree(n) and | |
276 | 293 | any(predicate(x) for x in n)) |
277 | 294 | # A > B A is the child of B. |
278 | - elif operator == '>': | |
279 | - retval = lambda n: (hasattr(n, 'parent') and | |
295 | + elif operator == u'>': | |
296 | + retval = lambda n: (hasattr(n, u'parent') and | |
280 | 297 | bool(n.parent()) and |
281 | 298 | predicate(n.parent())) |
282 | 299 | # A <, B Synonymous with A <1 B. |
283 | - elif operator == '<,' or operator == '<1': | |
284 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
300 | + elif operator == u'<,' or operator == u'<1': | |
301 | + retval = lambda n: (_istree(n) and | |
285 | 302 | bool(list(n)) and |
286 | 303 | predicate(n[0])) |
287 | 304 | # A >, B Synonymous with A >1 B. |
288 | - elif operator == '>,' or operator == '>1': | |
289 | - retval = lambda n: (hasattr(n, 'parent') and | |
305 | + elif operator == u'>,' or operator == u'>1': | |
306 | + retval = lambda n: (hasattr(n, u'parent') and | |
290 | 307 | bool(n.parent()) and |
291 | 308 | (n is n.parent()[0]) and |
292 | 309 | predicate(n.parent())) |
293 | 310 | # A <N B B is the Nth child of A (the first child is <1). |
294 | - elif operator[0] == '<' and operator[1:].isdigit(): | |
311 | + elif operator[0] == u'<' and operator[1:].isdigit(): | |
295 | 312 | idx = int(operator[1:]) |
296 | 313 | # capture the index parameter |
297 | - retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and | |
314 | + retval = (lambda i: lambda n: (_istree(n) and | |
298 | 315 | bool(list(n)) and |
299 | 316 | 0 <= i < len(n) and |
300 | 317 | predicate(n[i])))(idx - 1) |
301 | 318 | # A >N B A is the Nth child of B (the first child is >1). |
302 | - elif operator[0] == '>' and operator[1:].isdigit(): | |
319 | + elif operator[0] == u'>' and operator[1:].isdigit(): | |
303 | 320 | idx = int(operator[1:]) |
304 | 321 | # capture the index parameter |
305 | - retval = (lambda i: lambda n: (hasattr(n, 'parent') and | |
322 | + retval = (lambda i: lambda n: (hasattr(n, u'parent') and | |
306 | 323 | bool(n.parent()) and |
307 | 324 | 0 <= i < len(n.parent()) and |
308 | 325 | (n is n.parent()[i]) and |
309 | 326 | predicate(n.parent())))(idx - 1) |
310 | 327 | # A <' B B is the last child of A (also synonymous with A <-1 B). |
311 | 328 | # A <- B B is the last child of A (synonymous with A <-1 B). |
312 | - elif operator == '<\'' or operator == '<-' or operator == '<-1': | |
313 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and bool(list(n)) | |
329 | + elif operator == u'<\'' or operator == u'<-' or operator == u'<-1': | |
330 | + retval = lambda n: (_istree(n) and bool(list(n)) | |
314 | 331 | and predicate(n[-1])) |
315 | 332 | # A >' B A is the last child of B (also synonymous with A >-1 B). |
316 | 333 | # A >- B A is the last child of B (synonymous with A >-1 B). |
317 | - elif operator == '>\'' or operator == '>-' or operator == '>-1': | |
318 | - retval = lambda n: (hasattr(n, 'parent') and | |
334 | + elif operator == u'>\'' or operator == u'>-' or operator == u'>-1': | |
335 | + retval = lambda n: (hasattr(n, u'parent') and | |
319 | 336 | bool(n.parent()) and |
320 | 337 | (n is n.parent()[-1]) and |
321 | 338 | predicate(n.parent())) |
322 | 339 | # A <-N B B is the N th-to-last child of A (the last child is <-1). |
323 | - elif operator[:2] == '<-' and operator[2:].isdigit(): | |
340 | + elif operator[:2] == u'<-' and operator[2:].isdigit(): | |
324 | 341 | idx = -int(operator[2:]) |
325 | 342 | # capture the index parameter |
326 | - retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and | |
343 | + retval = (lambda i: lambda n: (_istree(n) and | |
327 | 344 | bool(list(n)) and |
328 | 345 | 0 <= (i + len(n)) < len(n) and |
329 | 346 | predicate(n[i + len(n)])))(idx) |
330 | 347 | # A >-N B A is the N th-to-last child of B (the last child is >-1). |
331 | - elif operator[:2] == '>-' and operator[2:].isdigit(): | |
348 | + elif operator[:2] == u'>-' and operator[2:].isdigit(): | |
332 | 349 | idx = -int(operator[2:]) |
333 | 350 | # capture the index parameter |
334 | 351 | retval = (lambda i: lambda n: |
335 | - (hasattr(n, 'parent') and | |
352 | + (hasattr(n, u'parent') and | |
336 | 353 | bool(n.parent()) and |
337 | 354 | 0 <= (i + len(n.parent())) < len(n.parent()) and |
338 | 355 | (n is n.parent()[i + len(n.parent())]) and |
339 | 356 | predicate(n.parent())))(idx) |
340 | 357 | # A <: B B is the only child of A |
341 | - elif operator == '<:': | |
342 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
358 | + elif operator == u'<:': | |
359 | + retval = lambda n: (_istree(n) and | |
343 | 360 | len(n) == 1 and |
344 | 361 | predicate(n[0])) |
345 | 362 | # A >: B A is the only child of B. |
346 | - elif operator == '>:': | |
347 | - retval = lambda n: (hasattr(n, 'parent') and | |
363 | + elif operator == u'>:': | |
364 | + retval = lambda n: (hasattr(n, u'parent') and | |
348 | 365 | bool(n.parent()) and |
349 | 366 | len(n.parent()) == 1 and |
350 | 367 | predicate(n.parent())) |
351 | 368 | # A << B A dominates B (A is an ancestor of B). |
352 | - elif operator == '<<': | |
353 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
369 | + elif operator == u'<<': | |
370 | + retval = lambda n: (_istree(n) and | |
354 | 371 | any(predicate(x) for x in _descendants(n))) |
355 | 372 | # A >> B A is dominated by B (A is a descendant of B). |
356 | - elif operator == '>>': | |
373 | + elif operator == u'>>': | |
357 | 374 | retval = lambda n: any(predicate(x) for x in ancestors(n)) |
358 | 375 | # A <<, B B is a left-most descendant of A. |
359 | - elif operator == '<<,' or operator == '<<1': | |
360 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
376 | + elif operator == u'<<,' or operator == u'<<1': | |
377 | + retval = lambda n: (_istree(n) and | |
361 | 378 | any(predicate(x) |
362 | 379 | for x in _leftmost_descendants(n))) |
363 | 380 | # A >>, B A is a left-most descendant of B. |
364 | - elif operator == '>>,': | |
381 | + elif operator == u'>>,': | |
365 | 382 | retval = lambda n: any((predicate(x) and |
366 | 383 | n in _leftmost_descendants(x)) |
367 | 384 | for x in ancestors(n)) |
368 | 385 | # A <<' B B is a right-most descendant of A. |
369 | - elif operator == '<<\'': | |
370 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
386 | + elif operator == u'<<\'': | |
387 | + retval = lambda n: (_istree(n) and | |
371 | 388 | any(predicate(x) |
372 | 389 | for x in _rightmost_descendants(n))) |
373 | 390 | # A >>' B A is a right-most descendant of B. |
374 | - elif operator == '>>': | |
391 | + elif operator == u'>>': | |
375 | 392 | retval = lambda n: any((predicate(x) and |
376 | 393 | n in _rightmost_descendants(x)) |
377 | 394 | for x in ancestors(n)) |
378 | 395 | # A <<: B There is a single path of descent from A and B is on it. |
379 | - elif operator == '<<:': | |
380 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | |
396 | + elif operator == u'<<:': | |
397 | + retval = lambda n: (_istree(n) and | |
381 | 398 | any(predicate(x) |
382 | 399 | for x in _unique_descendants(n))) |
383 | 400 | # A >>: B There is a single path of descent from B and A is on it. |
384 | - elif operator == '>>:': | |
401 | + elif operator == u'>>:': | |
385 | 402 | retval = lambda n: any(predicate(x) for x in unique_ancestors(n)) |
386 | 403 | # A . B A immediately precedes B. |
387 | - elif operator == '.': | |
404 | + elif operator == u'.': | |
388 | 405 | retval = lambda n: any(predicate(x) |
389 | 406 | for x in _immediately_after(n)) |
390 | 407 | # A , B A immediately follows B. |
391 | - elif operator == ',': | |
408 | + elif operator == u',': | |
392 | 409 | retval = lambda n: any(predicate(x) |
393 | 410 | for x in _immediately_before(n)) |
394 | 411 | # A .. B A precedes B. |
395 | - elif operator == '..': | |
412 | + elif operator == u'..': | |
396 | 413 | retval = lambda n: any(predicate(x) for x in _after(n)) |
397 | 414 | # A ,, B A follows B. |
398 | - elif operator == ',,': | |
415 | + elif operator == u',,': | |
399 | 416 | retval = lambda n: any(predicate(x) for x in _before(n)) |
400 | 417 | # A $ B A is a sister of B (and A != B). |
401 | - elif operator == '$' or operator == '%': | |
402 | - retval = lambda n: (hasattr(n, 'parent') and | |
418 | + elif operator == u'$' or operator == u'%': | |
419 | + retval = lambda n: (hasattr(n, u'parent') and | |
403 | 420 | bool(n.parent()) and |
404 | 421 | any(predicate(x) |
405 | 422 | for x in n.parent() if x is not n)) |
406 | 423 | # A $. B A is a sister of and immediately precedes B. |
407 | - elif operator == '$.' or operator == '%.': | |
408 | - retval = lambda n: (hasattr(n, 'right_sibling') and | |
424 | + elif operator == u'$.' or operator == u'%.': | |
425 | + retval = lambda n: (hasattr(n, u'right_sibling') and | |
409 | 426 | bool(n.right_sibling()) and |
410 | 427 | predicate(n.right_sibling())) |
411 | 428 | # A $, B A is a sister of and immediately follows B. |
412 | - elif operator == '$,' or operator == '%,': | |
413 | - retval = lambda n: (hasattr(n, 'left_sibling') and | |
429 | + elif operator == u'$,' or operator == u'%,': | |
430 | + retval = lambda n: (hasattr(n, u'left_sibling') and | |
414 | 431 | bool(n.left_sibling()) and |
415 | 432 | predicate(n.left_sibling())) |
416 | 433 | # A $.. B A is a sister of and precedes B. |
417 | - elif operator == '$..' or operator == '%..': | |
418 | - retval = lambda n: (hasattr(n, 'parent') and | |
419 | - hasattr(n, 'parent_index') and | |
434 | + elif operator == u'$..' or operator == u'%..': | |
435 | + retval = lambda n: (hasattr(n, u'parent') and | |
436 | + hasattr(n, u'parent_index') and | |
420 | 437 | bool(n.parent()) and |
421 | 438 | any(predicate(x) for x in |
422 | 439 | n.parent()[n.parent_index() + 1:])) |
423 | 440 | # A $,, B A is a sister of and follows B. |
424 | - elif operator == '$,,' or operator == '%,,': | |
425 | - retval = lambda n: (hasattr(n, 'parent') and | |
426 | - hasattr(n, 'parent_index') and | |
441 | + elif operator == u'$,,' or operator == u'%,,': | |
442 | + retval = lambda n: (hasattr(n, u'parent') and | |
443 | + hasattr(n, u'parent_index') and | |
427 | 444 | bool(n.parent()) and |
428 | 445 | any(predicate(x) for x in |
429 | 446 | n.parent()[:n.parent_index()])) |
430 | 447 | else: |
431 | - assert False, 'cannot interpret tgrep operator "{0}"'.format( | |
448 | + assert False, u'cannot interpret tgrep operator "{0}"'.format( | |
432 | 449 | operator) |
433 | 450 | # now return the built function |
434 | 451 | if negated: |
... | ... | @@ -442,7 +459,7 @@ def _tgrep_rel_conjunction_action(_s, _l, tokens): |
442 | 459 | from the conjunction of several other such lambda functions. |
443 | 460 | ''' |
444 | 461 | # filter out the ampersand |
445 | - tokens = [x for x in tokens if x != '&'] | |
462 | + tokens = [x for x in tokens if x != u'&'] | |
446 | 463 | # print 'relation conjunction tokens: ', tokens |
447 | 464 | if len(tokens) == 1: |
448 | 465 | return tokens[0] |
... | ... | @@ -455,7 +472,7 @@ def _tgrep_rel_disjunction_action(_s, _l, tokens): |
455 | 472 | from the disjunction of several other such lambda functions. |
456 | 473 | ''' |
457 | 474 | # filter out the pipe |
458 | - tokens = [x for x in tokens if x != '|'] | |
475 | + tokens = [x for x in tokens if x != u'|'] | |
459 | 476 | # print 'relation disjunction tokens: ', tokens |
460 | 477 | if len(tokens) == 1: |
461 | 478 | return tokens[0] |
... | ... | @@ -467,40 +484,40 @@ def _build_tgrep_parser(set_parse_actions = True): |
467 | 484 | Builds a pyparsing-based parser object for tokenizing and |
468 | 485 | interpreting tgrep search strings. |
469 | 486 | ''' |
470 | - tgrep_op = (pyparsing.Optional('!') + | |
471 | - pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*')) | |
472 | - tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\', | |
487 | + tgrep_op = (pyparsing.Optional(u'!') + | |
488 | + pyparsing.Regex(u'[$%,.<>][%,.<>0-9-\':]*')) | |
489 | + tgrep_qstring = pyparsing.QuotedString(quoteChar=u'"', escChar=u'\\', | |
473 | 490 | unquoteResults=False) |
474 | - tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\', | |
491 | + tgrep_node_regex = pyparsing.QuotedString(quoteChar=u'/', escChar=u'\\', | |
475 | 492 | unquoteResults=False) |
476 | - tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%^=]+') | |
493 | + tgrep_node_literal = pyparsing.Regex(u'[^][ \r\t\n;:.,&|<>()$!@%^=]+') | |
477 | 494 | tgrep_expr = pyparsing.Forward() |
478 | 495 | tgrep_relations = pyparsing.Forward() |
479 | - tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')' | |
496 | + tgrep_parens = pyparsing.Literal(u'(') + tgrep_expr + u')' | |
480 | 497 | tgrep_nltk_tree_pos = ( |
481 | - pyparsing.Literal('N(') + | |
482 | - pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' + | |
498 | + pyparsing.Literal(u'N(') + | |
499 | + pyparsing.Optional(pyparsing.Word(pyparsing.nums) + u',' + | |
483 | 500 | pyparsing.Optional(pyparsing.delimitedList( |
484 | - pyparsing.Word(pyparsing.nums), delim=',') + | |
485 | - pyparsing.Optional(','))) + ')') | |
501 | + pyparsing.Word(pyparsing.nums), delim=u',') + | |
502 | + pyparsing.Optional(u','))) + u')') | |
486 | 503 | tgrep_node_expr = (tgrep_qstring | |
487 | 504 | tgrep_node_regex | |
488 | - '*' | | |
505 | + u'*' | | |
489 | 506 | tgrep_node_literal) |
490 | 507 | tgrep_node = (tgrep_parens | |
491 | 508 | tgrep_nltk_tree_pos | |
492 | - (pyparsing.Optional("'") + | |
509 | + (pyparsing.Optional(u"'") + | |
493 | 510 | tgrep_node_expr + |
494 | - pyparsing.ZeroOrMore("|" + tgrep_node_expr))) | |
511 | + pyparsing.ZeroOrMore(u"|" + tgrep_node_expr))) | |
495 | 512 | tgrep_relation = pyparsing.Forward() |
496 | - tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']' | |
513 | + tgrep_brackets = pyparsing.Optional(u'!') + u'[' + tgrep_relations + u']' | |
497 | 514 | tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node |
498 | 515 | tgrep_rel_conjunction = pyparsing.Forward() |
499 | 516 | tgrep_rel_conjunction << (tgrep_relation + |
500 | - pyparsing.ZeroOrMore(pyparsing.Optional('&') + | |
517 | + pyparsing.ZeroOrMore(pyparsing.Optional(u'&') + | |
501 | 518 | tgrep_rel_conjunction)) |
502 | 519 | tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( |
503 | - "|" + tgrep_relations) | |
520 | + u"|" + tgrep_relations) | |
504 | 521 | tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) |
505 | 522 | if set_parse_actions: |
506 | 523 | tgrep_node.setParseAction(_tgrep_node_action) |
... | ... | @@ -520,6 +537,8 @@ def tgrep_tokenize(tgrep_string): |
520 | 537 | Tokenizes a TGrep search string into separate tokens. |
521 | 538 | ''' |
522 | 539 | parser = _build_tgrep_parser(False) |
540 | + if isinstance(tgrep_string, bytes): | |
541 | + tgrep_string = tgrep_string.decode() | |
523 | 542 | return list(parser.parseString(tgrep_string)) |
524 | 543 | |
525 | 544 | def tgrep_compile(tgrep_string): |
... | ... | @@ -528,6 +547,8 @@ def tgrep_compile(tgrep_string): |
528 | 547 | lambda function. |
529 | 548 | ''' |
530 | 549 | parser = _build_tgrep_parser(True) |
550 | + if isinstance(tgrep_string, bytes): | |
551 | + tgrep_string = tgrep_string.decode() | |
531 | 552 | return list(parser.parseString(tgrep_string, parseAll=True))[0] |
532 | 553 | |
533 | 554 | def treepositions_no_leaves(tree): |
... | ... | @@ -552,14 +573,15 @@ def tgrep_positions(tree, tgrep_string, search_leaves = True): |
552 | 573 | If `search_leaves` is False, the method will not return any |
553 | 574 | results in leaf positions. |
554 | 575 | ''' |
555 | - if not hasattr(tree, 'treepositions'): | |
576 | + try: | |
577 | + if search_leaves: | |
578 | + search_positions = tree.treepositions() | |
579 | + else: | |
580 | + search_positions = treepositions_no_leaves(tree) | |
581 | + except AttributeError: | |
556 | 582 | return [] |
557 | - if isinstance(tgrep_string, basestring): | |
583 | + if isinstance(tgrep_string, (bytes, str)): | |
558 | 584 | tgrep_string = tgrep_compile(tgrep_string) |
559 | - if search_leaves: | |
560 | - search_positions = tree.treepositions() | |
561 | - else: | |
562 | - search_positions = treepositions_no_leaves(tree) | |
563 | 585 | return [position for position in search_positions |
564 | 586 | if tgrep_string(tree[position])] |
565 | 587 | ... | ... |