Commit 3dc55945f34e948398d13b7346524af6a001348d
1 parent
4fd9f50a
Exists in
master
and in
1 other branch
Adiciona nova versao tgrep
Showing
1 changed file
with
144 additions
and
122 deletions
Show diff stats
src/new/tgrep.py
@@ -38,6 +38,7 @@ Tgrep2 source: | @@ -38,6 +38,7 @@ Tgrep2 source: | ||
38 | http://tedlab.mit.edu/~dr/Tgrep2/ | 38 | http://tedlab.mit.edu/~dr/Tgrep2/ |
39 | ''' | 39 | ''' |
40 | 40 | ||
41 | +from builtins import bytes, range, str | ||
41 | import nltk.tree | 42 | import nltk.tree |
42 | import pyparsing | 43 | import pyparsing |
43 | import re | 44 | import re |
@@ -48,11 +49,12 @@ def ancestors(node): | @@ -48,11 +49,12 @@ def ancestors(node): | ||
48 | This method will not work with leaf nodes, since there is no way | 49 | This method will not work with leaf nodes, since there is no way |
49 | to recover the parent. | 50 | to recover the parent. |
50 | ''' | 51 | ''' |
51 | - # if node is a leaf, we cannot retrieve its parent | ||
52 | - if not hasattr(node, 'parent'): | ||
53 | - return [] | ||
54 | results = [] | 52 | results = [] |
55 | - current = node.parent() | 53 | + try: |
54 | + current = node.parent() | ||
55 | + except AttributeError: | ||
56 | + # if node is a leaf, we cannot retrieve its parent | ||
57 | + return results | ||
56 | while current: | 58 | while current: |
57 | results.append(current) | 59 | results.append(current) |
58 | current = current.parent() | 60 | current = current.parent() |
@@ -63,11 +65,12 @@ def unique_ancestors(node): | @@ -63,11 +65,12 @@ def unique_ancestors(node): | ||
63 | Returns the list of all nodes dominating the given node, where | 65 | Returns the list of all nodes dominating the given node, where |
64 | there is only a single path of descent. | 66 | there is only a single path of descent. |
65 | ''' | 67 | ''' |
66 | - # if node is a leaf, we cannot retrieve its parent | ||
67 | - if not hasattr(node, 'parent'): | ||
68 | - return [] | ||
69 | results = [] | 68 | results = [] |
70 | - current = node.parent() | 69 | + try: |
70 | + current = node.parent() | ||
71 | + except AttributeError: | ||
72 | + # if node is a leaf, we cannot retrieve its parent | ||
73 | + return results | ||
71 | while current and len(current) == 1: | 74 | while current and len(current) == 1: |
72 | results.append(current) | 75 | results.append(current) |
73 | current = current.parent() | 76 | current = current.parent() |
@@ -78,29 +81,38 @@ def _descendants(node): | @@ -78,29 +81,38 @@ def _descendants(node): | ||
78 | Returns the list of all nodes which are descended from the given | 81 | Returns the list of all nodes which are descended from the given |
79 | tree node in some way. | 82 | tree node in some way. |
80 | ''' | 83 | ''' |
81 | - if not hasattr(node, 'treepositions'): | 84 | + try: |
85 | + treepos = node.treepositions() | ||
86 | + except AttributeError: | ||
82 | return [] | 87 | return [] |
83 | - return [node[x] for x in node.treepositions()[1:]] | 88 | + return [node[x] for x in treepos[1:]] |
84 | 89 | ||
85 | def _leftmost_descendants(node): | 90 | def _leftmost_descendants(node): |
86 | ''' | 91 | ''' |
87 | Returns the set of all nodes descended in some way through | 92 | Returns the set of all nodes descended in some way through |
88 | left branches from this node. | 93 | left branches from this node. |
89 | ''' | 94 | ''' |
90 | - if not hasattr(node, 'treepositions'): | 95 | + try: |
96 | + treepos = node.treepositions() | ||
97 | + except AttributeError: | ||
91 | return [] | 98 | return [] |
92 | - return [node[x] for x in node.treepositions()[1:] if all(y == 0 for y in x)] | 99 | + return [node[x] for x in treepos[1:] if all(y == 0 for y in x)] |
93 | 100 | ||
94 | def _rightmost_descendants(node): | 101 | def _rightmost_descendants(node): |
95 | ''' | 102 | ''' |
96 | Returns the set of all nodes descended in some way through | 103 | Returns the set of all nodes descended in some way through |
97 | right branches from this node. | 104 | right branches from this node. |
98 | ''' | 105 | ''' |
99 | - if not hasattr(node, 'treepositions'): | 106 | + try: |
107 | + rightmost_leaf = max(node.treepositions()) | ||
108 | + except AttributeError: | ||
100 | return [] | 109 | return [] |
101 | - rightmost_leaf = max(node.treepositions()) | ||
102 | return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] | 110 | return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] |
103 | 111 | ||
112 | +def _istree(obj): | ||
113 | + '''Predicate to check whether `obj` is a nltk.tree.Tree.''' | ||
114 | + return isinstance(obj, nltk.tree.Tree) | ||
115 | + | ||
104 | def _unique_descendants(node): | 116 | def _unique_descendants(node): |
105 | ''' | 117 | ''' |
106 | Returns the list of all nodes descended from the given node, where | 118 | Returns the list of all nodes descended from the given node, where |
@@ -108,7 +120,7 @@ def _unique_descendants(node): | @@ -108,7 +120,7 @@ def _unique_descendants(node): | ||
108 | ''' | 120 | ''' |
109 | results = [] | 121 | results = [] |
110 | current = node | 122 | current = node |
111 | - while current and isinstance(current, nltk.tree.Tree) and len(current) == 1: | 123 | + while current and _istree(current) and len(current) == 1: |
112 | current = current[0] | 124 | current = current[0] |
113 | results.append(current) | 125 | results.append(current) |
114 | return results | 126 | return results |
@@ -117,10 +129,11 @@ def _before(node): | @@ -117,10 +129,11 @@ def _before(node): | ||
117 | ''' | 129 | ''' |
118 | Returns the set of all nodes that are before the given node. | 130 | Returns the set of all nodes that are before the given node. |
119 | ''' | 131 | ''' |
120 | - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | 132 | + try: |
133 | + pos = node.treeposition() | ||
134 | + tree = node.root() | ||
135 | + except AttributeError: | ||
121 | return [] | 136 | return [] |
122 | - pos = node.treeposition() | ||
123 | - tree = node.root() | ||
124 | return [tree[x] for x in tree.treepositions() | 137 | return [tree[x] for x in tree.treepositions() |
125 | if x[:len(pos)] < pos[:len(x)]] | 138 | if x[:len(pos)] < pos[:len(x)]] |
126 | 139 | ||
@@ -133,9 +146,11 @@ def _immediately_before(node): | @@ -133,9 +146,11 @@ def _immediately_before(node): | ||
133 | symbol (word) produced by A immediately precedes the first | 146 | symbol (word) produced by A immediately precedes the first |
134 | terminal symbol produced by B. | 147 | terminal symbol produced by B. |
135 | ''' | 148 | ''' |
136 | - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | 149 | + try: |
150 | + pos = node.treeposition() | ||
151 | + tree = node.root() | ||
152 | + except AttributeError: | ||
137 | return [] | 153 | return [] |
138 | - pos = node.treeposition() | ||
139 | # go "upwards" from pos until there is a place we can go to the left | 154 | # go "upwards" from pos until there is a place we can go to the left |
140 | idx = len(pos) - 1 | 155 | idx = len(pos) - 1 |
141 | while 0 <= idx and pos[idx] == 0: | 156 | while 0 <= idx and pos[idx] == 0: |
@@ -144,17 +159,18 @@ def _immediately_before(node): | @@ -144,17 +159,18 @@ def _immediately_before(node): | ||
144 | return [] | 159 | return [] |
145 | pos = list(pos[:idx + 1]) | 160 | pos = list(pos[:idx + 1]) |
146 | pos[-1] -= 1 | 161 | pos[-1] -= 1 |
147 | - before = node.root()[pos] | 162 | + before = tree[pos] |
148 | return [before] + _rightmost_descendants(before) | 163 | return [before] + _rightmost_descendants(before) |
149 | 164 | ||
150 | def _after(node): | 165 | def _after(node): |
151 | ''' | 166 | ''' |
152 | Returns the set of all nodes that are after the given node. | 167 | Returns the set of all nodes that are after the given node. |
153 | ''' | 168 | ''' |
154 | - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): | 169 | + try: |
170 | + pos = node.treeposition() | ||
171 | + tree = node.root() | ||
172 | + except AttributeError: | ||
155 | return [] | 173 | return [] |
156 | - pos = node.treeposition() | ||
157 | - tree = node.root() | ||
158 | return [tree[x] for x in tree.treepositions() | 174 | return [tree[x] for x in tree.treepositions() |
159 | if x[:len(pos)] > pos[:len(x)]] | 175 | if x[:len(pos)] > pos[:len(x)]] |
160 | 176 | ||
@@ -167,14 +183,15 @@ def _immediately_after(node): | @@ -167,14 +183,15 @@ def _immediately_after(node): | ||
167 | symbol (word) produced by A immediately follows the last | 183 | symbol (word) produced by A immediately follows the last |
168 | terminal symbol produced by B. | 184 | terminal symbol produced by B. |
169 | ''' | 185 | ''' |
170 | - if (not hasattr(node, 'root') or not hasattr(node, 'treeposition') or | ||
171 | - not hasattr(node, 'parent')): | 186 | + try: |
187 | + pos = node.treeposition() | ||
188 | + tree = node.root() | ||
189 | + current = node.parent() | ||
190 | + except AttributeError: | ||
172 | return [] | 191 | return [] |
173 | - pos = node.treeposition() | ||
174 | # go "upwards" from pos until there is a place we can go to the | 192 | # go "upwards" from pos until there is a place we can go to the |
175 | # right | 193 | # right |
176 | idx = len(pos) - 1 | 194 | idx = len(pos) - 1 |
177 | - current = node.parent() | ||
178 | while 0 <= idx and pos[idx] == len(current) - 1: | 195 | while 0 <= idx and pos[idx] == len(current) - 1: |
179 | idx -= 1 | 196 | idx -= 1 |
180 | current = current.parent() | 197 | current = current.parent() |
@@ -182,7 +199,7 @@ def _immediately_after(node): | @@ -182,7 +199,7 @@ def _immediately_after(node): | ||
182 | return [] | 199 | return [] |
183 | pos = list(pos[:idx + 1]) | 200 | pos = list(pos[:idx + 1]) |
184 | pos[-1] += 1 | 201 | pos[-1] += 1 |
185 | - after = node.root()[pos] | 202 | + after = tree[pos] |
186 | return [after] + _leftmost_descendants(after) | 203 | return [after] + _leftmost_descendants(after) |
187 | 204 | ||
188 | def _tgrep_node_literal_value(node): | 205 | def _tgrep_node_literal_value(node): |
@@ -190,7 +207,7 @@ def _tgrep_node_literal_value(node): | @@ -190,7 +207,7 @@ def _tgrep_node_literal_value(node): | ||
190 | Gets the string value of a given parse tree node, for comparison | 207 | Gets the string value of a given parse tree node, for comparison |
191 | using the tgrep node literal predicates. | 208 | using the tgrep node literal predicates. |
192 | ''' | 209 | ''' |
193 | - return (node.label() if isinstance(node, nltk.tree.Tree) else unicode(node)) | 210 | + return (node.label() if _istree(node) else str(node)) |
194 | 211 | ||
195 | def _tgrep_node_action(_s, _l, tokens): | 212 | def _tgrep_node_action(_s, _l, tokens): |
196 | ''' | 213 | ''' |
@@ -198,30 +215,30 @@ def _tgrep_node_action(_s, _l, tokens): | @@ -198,30 +215,30 @@ def _tgrep_node_action(_s, _l, tokens): | ||
198 | depending on the name of its node. | 215 | depending on the name of its node. |
199 | ''' | 216 | ''' |
200 | # print 'node tokens: ', tokens | 217 | # print 'node tokens: ', tokens |
201 | - if tokens[0] == "'": | 218 | + if tokens[0] == u"'": |
202 | # strip initial apostrophe (tgrep2 print command) | 219 | # strip initial apostrophe (tgrep2 print command) |
203 | tokens = tokens[1:] | 220 | tokens = tokens[1:] |
204 | if len(tokens) > 1: | 221 | if len(tokens) > 1: |
205 | # disjunctive definition of a node name | 222 | # disjunctive definition of a node name |
206 | - assert list(set(tokens[1::2])) == ['|'] | 223 | + assert list(set(tokens[1::2])) == [u'|'] |
207 | # recursively call self to interpret each node name definition | 224 | # recursively call self to interpret each node name definition |
208 | tokens = [_tgrep_node_action(None, None, [node]) | 225 | tokens = [_tgrep_node_action(None, None, [node]) |
209 | for node in tokens[::2]] | 226 | for node in tokens[::2]] |
210 | # capture tokens and return the disjunction | 227 | # capture tokens and return the disjunction |
211 | return (lambda t: lambda n: any(f(n) for f in t))(tokens) | 228 | return (lambda t: lambda n: any(f(n) for f in t))(tokens) |
212 | else: | 229 | else: |
213 | - if hasattr(tokens[0], '__call__'): | 230 | + if hasattr(tokens[0], u'__call__'): |
214 | # this is a previously interpreted parenthetical node | 231 | # this is a previously interpreted parenthetical node |
215 | # definition (lambda function) | 232 | # definition (lambda function) |
216 | return tokens[0] | 233 | return tokens[0] |
217 | - elif tokens[0] == '*' or tokens[0] == '__': | 234 | + elif tokens[0] == u'*' or tokens[0] == u'__': |
218 | return lambda n: True | 235 | return lambda n: True |
219 | - elif tokens[0].startswith('"'): | ||
220 | - return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip('"')) | ||
221 | - elif tokens[0].startswith('/'): | 236 | + elif tokens[0].startswith(u'"'): |
237 | + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip(u'"')) | ||
238 | + elif tokens[0].startswith(u'/'): | ||
222 | return (lambda r: lambda n: | 239 | return (lambda r: lambda n: |
223 | - r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip('/'))) | ||
224 | - elif tokens[0].startswith('i@'): | 240 | + r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip(u'/'))) |
241 | + elif tokens[0].startswith(u'i@'): | ||
225 | return (lambda s: lambda n: | 242 | return (lambda s: lambda n: |
226 | _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower()) | 243 | _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower()) |
227 | else: | 244 | else: |
@@ -234,8 +251,8 @@ def _tgrep_parens_action(_s, _l, tokens): | @@ -234,8 +251,8 @@ def _tgrep_parens_action(_s, _l, tokens): | ||
234 | ''' | 251 | ''' |
235 | # print 'parenthetical tokens: ', tokens | 252 | # print 'parenthetical tokens: ', tokens |
236 | assert len(tokens) == 3 | 253 | assert len(tokens) == 3 |
237 | - assert tokens[0] == '(' | ||
238 | - assert tokens[2] == ')' | 254 | + assert tokens[0] == u'(' |
255 | + assert tokens[2] == u')' | ||
239 | return tokens[1] | 256 | return tokens[1] |
240 | 257 | ||
241 | def _tgrep_nltk_tree_pos_action(_s, _l, tokens): | 258 | def _tgrep_nltk_tree_pos_action(_s, _l, tokens): |
@@ -247,7 +264,7 @@ def _tgrep_nltk_tree_pos_action(_s, _l, tokens): | @@ -247,7 +264,7 @@ def _tgrep_nltk_tree_pos_action(_s, _l, tokens): | ||
247 | # recover the tuple from the parsed string | 264 | # recover the tuple from the parsed string |
248 | node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) | 265 | node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) |
249 | # capture the node's tree position | 266 | # capture the node's tree position |
250 | - return (lambda i: lambda n: (hasattr(n, 'treeposition') and | 267 | + return (lambda i: lambda n: (hasattr(n, u'treeposition') and |
251 | n.treeposition() == i))(node_tree_position) | 268 | n.treeposition() == i))(node_tree_position) |
252 | 269 | ||
253 | def _tgrep_relation_action(_s, _l, tokens): | 270 | def _tgrep_relation_action(_s, _l, tokens): |
@@ -258,177 +275,177 @@ def _tgrep_relation_action(_s, _l, tokens): | @@ -258,177 +275,177 @@ def _tgrep_relation_action(_s, _l, tokens): | ||
258 | # print 'relation tokens: ', tokens | 275 | # print 'relation tokens: ', tokens |
259 | # process negation first if needed | 276 | # process negation first if needed |
260 | negated = False | 277 | negated = False |
261 | - if tokens[0] == '!': | 278 | + if tokens[0] == u'!': |
262 | negated = True | 279 | negated = True |
263 | tokens = tokens[1:] | 280 | tokens = tokens[1:] |
264 | - if tokens[0] == '[': | 281 | + if tokens[0] == u'[': |
265 | # process square-bracketed relation expressions | 282 | # process square-bracketed relation expressions |
266 | assert len(tokens) == 3 | 283 | assert len(tokens) == 3 |
267 | - assert tokens[2] == ']' | 284 | + assert tokens[2] == u']' |
268 | retval = tokens[1] | 285 | retval = tokens[1] |
269 | else: | 286 | else: |
270 | # process operator-node relation expressions | 287 | # process operator-node relation expressions |
271 | assert len(tokens) == 2 | 288 | assert len(tokens) == 2 |
272 | operator, predicate = tokens | 289 | operator, predicate = tokens |
273 | # A < B A is the parent of (immediately dominates) B. | 290 | # A < B A is the parent of (immediately dominates) B. |
274 | - if operator == '<': | ||
275 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | 291 | + if operator == u'<': |
292 | + retval = lambda n: (_istree(n) and | ||
276 | any(predicate(x) for x in n)) | 293 | any(predicate(x) for x in n)) |
277 | # A > B A is the child of B. | 294 | # A > B A is the child of B. |
278 | - elif operator == '>': | ||
279 | - retval = lambda n: (hasattr(n, 'parent') and | 295 | + elif operator == u'>': |
296 | + retval = lambda n: (hasattr(n, u'parent') and | ||
280 | bool(n.parent()) and | 297 | bool(n.parent()) and |
281 | predicate(n.parent())) | 298 | predicate(n.parent())) |
282 | # A <, B Synonymous with A <1 B. | 299 | # A <, B Synonymous with A <1 B. |
283 | - elif operator == '<,' or operator == '<1': | ||
284 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | 300 | + elif operator == u'<,' or operator == u'<1': |
301 | + retval = lambda n: (_istree(n) and | ||
285 | bool(list(n)) and | 302 | bool(list(n)) and |
286 | predicate(n[0])) | 303 | predicate(n[0])) |
287 | # A >, B Synonymous with A >1 B. | 304 | # A >, B Synonymous with A >1 B. |
288 | - elif operator == '>,' or operator == '>1': | ||
289 | - retval = lambda n: (hasattr(n, 'parent') and | 305 | + elif operator == u'>,' or operator == u'>1': |
306 | + retval = lambda n: (hasattr(n, u'parent') and | ||
290 | bool(n.parent()) and | 307 | bool(n.parent()) and |
291 | (n is n.parent()[0]) and | 308 | (n is n.parent()[0]) and |
292 | predicate(n.parent())) | 309 | predicate(n.parent())) |
293 | # A <N B B is the Nth child of A (the first child is <1). | 310 | # A <N B B is the Nth child of A (the first child is <1). |
294 | - elif operator[0] == '<' and operator[1:].isdigit(): | 311 | + elif operator[0] == u'<' and operator[1:].isdigit(): |
295 | idx = int(operator[1:]) | 312 | idx = int(operator[1:]) |
296 | # capture the index parameter | 313 | # capture the index parameter |
297 | - retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and | 314 | + retval = (lambda i: lambda n: (_istree(n) and |
298 | bool(list(n)) and | 315 | bool(list(n)) and |
299 | 0 <= i < len(n) and | 316 | 0 <= i < len(n) and |
300 | predicate(n[i])))(idx - 1) | 317 | predicate(n[i])))(idx - 1) |
301 | # A >N B A is the Nth child of B (the first child is >1). | 318 | # A >N B A is the Nth child of B (the first child is >1). |
302 | - elif operator[0] == '>' and operator[1:].isdigit(): | 319 | + elif operator[0] == u'>' and operator[1:].isdigit(): |
303 | idx = int(operator[1:]) | 320 | idx = int(operator[1:]) |
304 | # capture the index parameter | 321 | # capture the index parameter |
305 | - retval = (lambda i: lambda n: (hasattr(n, 'parent') and | 322 | + retval = (lambda i: lambda n: (hasattr(n, u'parent') and |
306 | bool(n.parent()) and | 323 | bool(n.parent()) and |
307 | 0 <= i < len(n.parent()) and | 324 | 0 <= i < len(n.parent()) and |
308 | (n is n.parent()[i]) and | 325 | (n is n.parent()[i]) and |
309 | predicate(n.parent())))(idx - 1) | 326 | predicate(n.parent())))(idx - 1) |
310 | # A <' B B is the last child of A (also synonymous with A <-1 B). | 327 | # A <' B B is the last child of A (also synonymous with A <-1 B). |
311 | # A <- B B is the last child of A (synonymous with A <-1 B). | 328 | # A <- B B is the last child of A (synonymous with A <-1 B). |
312 | - elif operator == '<\'' or operator == '<-' or operator == '<-1': | ||
313 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and bool(list(n)) | 329 | + elif operator == u'<\'' or operator == u'<-' or operator == u'<-1': |
330 | + retval = lambda n: (_istree(n) and bool(list(n)) | ||
314 | and predicate(n[-1])) | 331 | and predicate(n[-1])) |
315 | # A >' B A is the last child of B (also synonymous with A >-1 B). | 332 | # A >' B A is the last child of B (also synonymous with A >-1 B). |
316 | # A >- B A is the last child of B (synonymous with A >-1 B). | 333 | # A >- B A is the last child of B (synonymous with A >-1 B). |
317 | - elif operator == '>\'' or operator == '>-' or operator == '>-1': | ||
318 | - retval = lambda n: (hasattr(n, 'parent') and | 334 | + elif operator == u'>\'' or operator == u'>-' or operator == u'>-1': |
335 | + retval = lambda n: (hasattr(n, u'parent') and | ||
319 | bool(n.parent()) and | 336 | bool(n.parent()) and |
320 | (n is n.parent()[-1]) and | 337 | (n is n.parent()[-1]) and |
321 | predicate(n.parent())) | 338 | predicate(n.parent())) |
322 | # A <-N B B is the N th-to-last child of A (the last child is <-1). | 339 | # A <-N B B is the N th-to-last child of A (the last child is <-1). |
323 | - elif operator[:2] == '<-' and operator[2:].isdigit(): | 340 | + elif operator[:2] == u'<-' and operator[2:].isdigit(): |
324 | idx = -int(operator[2:]) | 341 | idx = -int(operator[2:]) |
325 | # capture the index parameter | 342 | # capture the index parameter |
326 | - retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and | 343 | + retval = (lambda i: lambda n: (_istree(n) and |
327 | bool(list(n)) and | 344 | bool(list(n)) and |
328 | 0 <= (i + len(n)) < len(n) and | 345 | 0 <= (i + len(n)) < len(n) and |
329 | predicate(n[i + len(n)])))(idx) | 346 | predicate(n[i + len(n)])))(idx) |
330 | # A >-N B A is the N th-to-last child of B (the last child is >-1). | 347 | # A >-N B A is the N th-to-last child of B (the last child is >-1). |
331 | - elif operator[:2] == '>-' and operator[2:].isdigit(): | 348 | + elif operator[:2] == u'>-' and operator[2:].isdigit(): |
332 | idx = -int(operator[2:]) | 349 | idx = -int(operator[2:]) |
333 | # capture the index parameter | 350 | # capture the index parameter |
334 | retval = (lambda i: lambda n: | 351 | retval = (lambda i: lambda n: |
335 | - (hasattr(n, 'parent') and | 352 | + (hasattr(n, u'parent') and |
336 | bool(n.parent()) and | 353 | bool(n.parent()) and |
337 | 0 <= (i + len(n.parent())) < len(n.parent()) and | 354 | 0 <= (i + len(n.parent())) < len(n.parent()) and |
338 | (n is n.parent()[i + len(n.parent())]) and | 355 | (n is n.parent()[i + len(n.parent())]) and |
339 | predicate(n.parent())))(idx) | 356 | predicate(n.parent())))(idx) |
340 | # A <: B B is the only child of A | 357 | # A <: B B is the only child of A |
341 | - elif operator == '<:': | ||
342 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | 358 | + elif operator == u'<:': |
359 | + retval = lambda n: (_istree(n) and | ||
343 | len(n) == 1 and | 360 | len(n) == 1 and |
344 | predicate(n[0])) | 361 | predicate(n[0])) |
345 | # A >: B A is the only child of B. | 362 | # A >: B A is the only child of B. |
346 | - elif operator == '>:': | ||
347 | - retval = lambda n: (hasattr(n, 'parent') and | 363 | + elif operator == u'>:': |
364 | + retval = lambda n: (hasattr(n, u'parent') and | ||
348 | bool(n.parent()) and | 365 | bool(n.parent()) and |
349 | len(n.parent()) == 1 and | 366 | len(n.parent()) == 1 and |
350 | predicate(n.parent())) | 367 | predicate(n.parent())) |
351 | # A << B A dominates B (A is an ancestor of B). | 368 | # A << B A dominates B (A is an ancestor of B). |
352 | - elif operator == '<<': | ||
353 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | 369 | + elif operator == u'<<': |
370 | + retval = lambda n: (_istree(n) and | ||
354 | any(predicate(x) for x in _descendants(n))) | 371 | any(predicate(x) for x in _descendants(n))) |
355 | # A >> B A is dominated by B (A is a descendant of B). | 372 | # A >> B A is dominated by B (A is a descendant of B). |
356 | - elif operator == '>>': | 373 | + elif operator == u'>>': |
357 | retval = lambda n: any(predicate(x) for x in ancestors(n)) | 374 | retval = lambda n: any(predicate(x) for x in ancestors(n)) |
358 | # A <<, B B is a left-most descendant of A. | 375 | # A <<, B B is a left-most descendant of A. |
359 | - elif operator == '<<,' or operator == '<<1': | ||
360 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | 376 | + elif operator == u'<<,' or operator == u'<<1': |
377 | + retval = lambda n: (_istree(n) and | ||
361 | any(predicate(x) | 378 | any(predicate(x) |
362 | for x in _leftmost_descendants(n))) | 379 | for x in _leftmost_descendants(n))) |
363 | # A >>, B A is a left-most descendant of B. | 380 | # A >>, B A is a left-most descendant of B. |
364 | - elif operator == '>>,': | 381 | + elif operator == u'>>,': |
365 | retval = lambda n: any((predicate(x) and | 382 | retval = lambda n: any((predicate(x) and |
366 | n in _leftmost_descendants(x)) | 383 | n in _leftmost_descendants(x)) |
367 | for x in ancestors(n)) | 384 | for x in ancestors(n)) |
368 | # A <<' B B is a right-most descendant of A. | 385 | # A <<' B B is a right-most descendant of A. |
369 | - elif operator == '<<\'': | ||
370 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | 386 | + elif operator == u'<<\'': |
387 | + retval = lambda n: (_istree(n) and | ||
371 | any(predicate(x) | 388 | any(predicate(x) |
372 | for x in _rightmost_descendants(n))) | 389 | for x in _rightmost_descendants(n))) |
373 | # A >>' B A is a right-most descendant of B. | 390 | # A >>' B A is a right-most descendant of B. |
374 | - elif operator == '>>': | 391 | + elif operator == u'>>': |
375 | retval = lambda n: any((predicate(x) and | 392 | retval = lambda n: any((predicate(x) and |
376 | n in _rightmost_descendants(x)) | 393 | n in _rightmost_descendants(x)) |
377 | for x in ancestors(n)) | 394 | for x in ancestors(n)) |
378 | # A <<: B There is a single path of descent from A and B is on it. | 395 | # A <<: B There is a single path of descent from A and B is on it. |
379 | - elif operator == '<<:': | ||
380 | - retval = lambda n: (isinstance(n, nltk.tree.Tree) and | 396 | + elif operator == u'<<:': |
397 | + retval = lambda n: (_istree(n) and | ||
381 | any(predicate(x) | 398 | any(predicate(x) |
382 | for x in _unique_descendants(n))) | 399 | for x in _unique_descendants(n))) |
383 | # A >>: B There is a single path of descent from B and A is on it. | 400 | # A >>: B There is a single path of descent from B and A is on it. |
384 | - elif operator == '>>:': | 401 | + elif operator == u'>>:': |
385 | retval = lambda n: any(predicate(x) for x in unique_ancestors(n)) | 402 | retval = lambda n: any(predicate(x) for x in unique_ancestors(n)) |
386 | # A . B A immediately precedes B. | 403 | # A . B A immediately precedes B. |
387 | - elif operator == '.': | 404 | + elif operator == u'.': |
388 | retval = lambda n: any(predicate(x) | 405 | retval = lambda n: any(predicate(x) |
389 | for x in _immediately_after(n)) | 406 | for x in _immediately_after(n)) |
390 | # A , B A immediately follows B. | 407 | # A , B A immediately follows B. |
391 | - elif operator == ',': | 408 | + elif operator == u',': |
392 | retval = lambda n: any(predicate(x) | 409 | retval = lambda n: any(predicate(x) |
393 | for x in _immediately_before(n)) | 410 | for x in _immediately_before(n)) |
394 | # A .. B A precedes B. | 411 | # A .. B A precedes B. |
395 | - elif operator == '..': | 412 | + elif operator == u'..': |
396 | retval = lambda n: any(predicate(x) for x in _after(n)) | 413 | retval = lambda n: any(predicate(x) for x in _after(n)) |
397 | # A ,, B A follows B. | 414 | # A ,, B A follows B. |
398 | - elif operator == ',,': | 415 | + elif operator == u',,': |
399 | retval = lambda n: any(predicate(x) for x in _before(n)) | 416 | retval = lambda n: any(predicate(x) for x in _before(n)) |
400 | # A $ B A is a sister of B (and A != B). | 417 | # A $ B A is a sister of B (and A != B). |
401 | - elif operator == '$' or operator == '%': | ||
402 | - retval = lambda n: (hasattr(n, 'parent') and | 418 | + elif operator == u'$' or operator == u'%': |
419 | + retval = lambda n: (hasattr(n, u'parent') and | ||
403 | bool(n.parent()) and | 420 | bool(n.parent()) and |
404 | any(predicate(x) | 421 | any(predicate(x) |
405 | for x in n.parent() if x is not n)) | 422 | for x in n.parent() if x is not n)) |
406 | # A $. B A is a sister of and immediately precedes B. | 423 | # A $. B A is a sister of and immediately precedes B. |
407 | - elif operator == '$.' or operator == '%.': | ||
408 | - retval = lambda n: (hasattr(n, 'right_sibling') and | 424 | + elif operator == u'$.' or operator == u'%.': |
425 | + retval = lambda n: (hasattr(n, u'right_sibling') and | ||
409 | bool(n.right_sibling()) and | 426 | bool(n.right_sibling()) and |
410 | predicate(n.right_sibling())) | 427 | predicate(n.right_sibling())) |
411 | # A $, B A is a sister of and immediately follows B. | 428 | # A $, B A is a sister of and immediately follows B. |
412 | - elif operator == '$,' or operator == '%,': | ||
413 | - retval = lambda n: (hasattr(n, 'left_sibling') and | 429 | + elif operator == u'$,' or operator == u'%,': |
430 | + retval = lambda n: (hasattr(n, u'left_sibling') and | ||
414 | bool(n.left_sibling()) and | 431 | bool(n.left_sibling()) and |
415 | predicate(n.left_sibling())) | 432 | predicate(n.left_sibling())) |
416 | # A $.. B A is a sister of and precedes B. | 433 | # A $.. B A is a sister of and precedes B. |
417 | - elif operator == '$..' or operator == '%..': | ||
418 | - retval = lambda n: (hasattr(n, 'parent') and | ||
419 | - hasattr(n, 'parent_index') and | 434 | + elif operator == u'$..' or operator == u'%..': |
435 | + retval = lambda n: (hasattr(n, u'parent') and | ||
436 | + hasattr(n, u'parent_index') and | ||
420 | bool(n.parent()) and | 437 | bool(n.parent()) and |
421 | any(predicate(x) for x in | 438 | any(predicate(x) for x in |
422 | n.parent()[n.parent_index() + 1:])) | 439 | n.parent()[n.parent_index() + 1:])) |
423 | # A $,, B A is a sister of and follows B. | 440 | # A $,, B A is a sister of and follows B. |
424 | - elif operator == '$,,' or operator == '%,,': | ||
425 | - retval = lambda n: (hasattr(n, 'parent') and | ||
426 | - hasattr(n, 'parent_index') and | 441 | + elif operator == u'$,,' or operator == u'%,,': |
442 | + retval = lambda n: (hasattr(n, u'parent') and | ||
443 | + hasattr(n, u'parent_index') and | ||
427 | bool(n.parent()) and | 444 | bool(n.parent()) and |
428 | any(predicate(x) for x in | 445 | any(predicate(x) for x in |
429 | n.parent()[:n.parent_index()])) | 446 | n.parent()[:n.parent_index()])) |
430 | else: | 447 | else: |
431 | - assert False, 'cannot interpret tgrep operator "{0}"'.format( | 448 | + assert False, u'cannot interpret tgrep operator "{0}"'.format( |
432 | operator) | 449 | operator) |
433 | # now return the built function | 450 | # now return the built function |
434 | if negated: | 451 | if negated: |
@@ -442,7 +459,7 @@ def _tgrep_rel_conjunction_action(_s, _l, tokens): | @@ -442,7 +459,7 @@ def _tgrep_rel_conjunction_action(_s, _l, tokens): | ||
442 | from the conjunction of several other such lambda functions. | 459 | from the conjunction of several other such lambda functions. |
443 | ''' | 460 | ''' |
444 | # filter out the ampersand | 461 | # filter out the ampersand |
445 | - tokens = [x for x in tokens if x != '&'] | 462 | + tokens = [x for x in tokens if x != u'&'] |
446 | # print 'relation conjunction tokens: ', tokens | 463 | # print 'relation conjunction tokens: ', tokens |
447 | if len(tokens) == 1: | 464 | if len(tokens) == 1: |
448 | return tokens[0] | 465 | return tokens[0] |
@@ -455,7 +472,7 @@ def _tgrep_rel_disjunction_action(_s, _l, tokens): | @@ -455,7 +472,7 @@ def _tgrep_rel_disjunction_action(_s, _l, tokens): | ||
455 | from the disjunction of several other such lambda functions. | 472 | from the disjunction of several other such lambda functions. |
456 | ''' | 473 | ''' |
457 | # filter out the pipe | 474 | # filter out the pipe |
458 | - tokens = [x for x in tokens if x != '|'] | 475 | + tokens = [x for x in tokens if x != u'|'] |
459 | # print 'relation disjunction tokens: ', tokens | 476 | # print 'relation disjunction tokens: ', tokens |
460 | if len(tokens) == 1: | 477 | if len(tokens) == 1: |
461 | return tokens[0] | 478 | return tokens[0] |
@@ -467,40 +484,40 @@ def _build_tgrep_parser(set_parse_actions = True): | @@ -467,40 +484,40 @@ def _build_tgrep_parser(set_parse_actions = True): | ||
467 | Builds a pyparsing-based parser object for tokenizing and | 484 | Builds a pyparsing-based parser object for tokenizing and |
468 | interpreting tgrep search strings. | 485 | interpreting tgrep search strings. |
469 | ''' | 486 | ''' |
470 | - tgrep_op = (pyparsing.Optional('!') + | ||
471 | - pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*')) | ||
472 | - tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\', | 487 | + tgrep_op = (pyparsing.Optional(u'!') + |
488 | + pyparsing.Regex(u'[$%,.<>][%,.<>0-9-\':]*')) | ||
489 | + tgrep_qstring = pyparsing.QuotedString(quoteChar=u'"', escChar=u'\\', | ||
473 | unquoteResults=False) | 490 | unquoteResults=False) |
474 | - tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\', | 491 | + tgrep_node_regex = pyparsing.QuotedString(quoteChar=u'/', escChar=u'\\', |
475 | unquoteResults=False) | 492 | unquoteResults=False) |
476 | - tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%^=]+') | 493 | + tgrep_node_literal = pyparsing.Regex(u'[^][ \r\t\n;:.,&|<>()$!@%^=]+') |
477 | tgrep_expr = pyparsing.Forward() | 494 | tgrep_expr = pyparsing.Forward() |
478 | tgrep_relations = pyparsing.Forward() | 495 | tgrep_relations = pyparsing.Forward() |
479 | - tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')' | 496 | + tgrep_parens = pyparsing.Literal(u'(') + tgrep_expr + u')' |
480 | tgrep_nltk_tree_pos = ( | 497 | tgrep_nltk_tree_pos = ( |
481 | - pyparsing.Literal('N(') + | ||
482 | - pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' + | 498 | + pyparsing.Literal(u'N(') + |
499 | + pyparsing.Optional(pyparsing.Word(pyparsing.nums) + u',' + | ||
483 | pyparsing.Optional(pyparsing.delimitedList( | 500 | pyparsing.Optional(pyparsing.delimitedList( |
484 | - pyparsing.Word(pyparsing.nums), delim=',') + | ||
485 | - pyparsing.Optional(','))) + ')') | 501 | + pyparsing.Word(pyparsing.nums), delim=u',') + |
502 | + pyparsing.Optional(u','))) + u')') | ||
486 | tgrep_node_expr = (tgrep_qstring | | 503 | tgrep_node_expr = (tgrep_qstring | |
487 | tgrep_node_regex | | 504 | tgrep_node_regex | |
488 | - '*' | | 505 | + u'*' | |
489 | tgrep_node_literal) | 506 | tgrep_node_literal) |
490 | tgrep_node = (tgrep_parens | | 507 | tgrep_node = (tgrep_parens | |
491 | tgrep_nltk_tree_pos | | 508 | tgrep_nltk_tree_pos | |
492 | - (pyparsing.Optional("'") + | 509 | + (pyparsing.Optional(u"'") + |
493 | tgrep_node_expr + | 510 | tgrep_node_expr + |
494 | - pyparsing.ZeroOrMore("|" + tgrep_node_expr))) | 511 | + pyparsing.ZeroOrMore(u"|" + tgrep_node_expr))) |
495 | tgrep_relation = pyparsing.Forward() | 512 | tgrep_relation = pyparsing.Forward() |
496 | - tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']' | 513 | + tgrep_brackets = pyparsing.Optional(u'!') + u'[' + tgrep_relations + u']' |
497 | tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node | 514 | tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node |
498 | tgrep_rel_conjunction = pyparsing.Forward() | 515 | tgrep_rel_conjunction = pyparsing.Forward() |
499 | tgrep_rel_conjunction << (tgrep_relation + | 516 | tgrep_rel_conjunction << (tgrep_relation + |
500 | - pyparsing.ZeroOrMore(pyparsing.Optional('&') + | 517 | + pyparsing.ZeroOrMore(pyparsing.Optional(u'&') + |
501 | tgrep_rel_conjunction)) | 518 | tgrep_rel_conjunction)) |
502 | tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( | 519 | tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( |
503 | - "|" + tgrep_relations) | 520 | + u"|" + tgrep_relations) |
504 | tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) | 521 | tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) |
505 | if set_parse_actions: | 522 | if set_parse_actions: |
506 | tgrep_node.setParseAction(_tgrep_node_action) | 523 | tgrep_node.setParseAction(_tgrep_node_action) |
@@ -520,6 +537,8 @@ def tgrep_tokenize(tgrep_string): | @@ -520,6 +537,8 @@ def tgrep_tokenize(tgrep_string): | ||
520 | Tokenizes a TGrep search string into separate tokens. | 537 | Tokenizes a TGrep search string into separate tokens. |
521 | ''' | 538 | ''' |
522 | parser = _build_tgrep_parser(False) | 539 | parser = _build_tgrep_parser(False) |
540 | + if isinstance(tgrep_string, bytes): | ||
541 | + tgrep_string = tgrep_string.decode() | ||
523 | return list(parser.parseString(tgrep_string)) | 542 | return list(parser.parseString(tgrep_string)) |
524 | 543 | ||
525 | def tgrep_compile(tgrep_string): | 544 | def tgrep_compile(tgrep_string): |
@@ -528,6 +547,8 @@ def tgrep_compile(tgrep_string): | @@ -528,6 +547,8 @@ def tgrep_compile(tgrep_string): | ||
528 | lambda function. | 547 | lambda function. |
529 | ''' | 548 | ''' |
530 | parser = _build_tgrep_parser(True) | 549 | parser = _build_tgrep_parser(True) |
550 | + if isinstance(tgrep_string, bytes): | ||
551 | + tgrep_string = tgrep_string.decode() | ||
531 | return list(parser.parseString(tgrep_string, parseAll=True))[0] | 552 | return list(parser.parseString(tgrep_string, parseAll=True))[0] |
532 | 553 | ||
533 | def treepositions_no_leaves(tree): | 554 | def treepositions_no_leaves(tree): |
@@ -552,14 +573,15 @@ def tgrep_positions(tree, tgrep_string, search_leaves = True): | @@ -552,14 +573,15 @@ def tgrep_positions(tree, tgrep_string, search_leaves = True): | ||
552 | If `search_leaves` is False, the method will not return any | 573 | If `search_leaves` is False, the method will not return any |
553 | results in leaf positions. | 574 | results in leaf positions. |
554 | ''' | 575 | ''' |
555 | - if not hasattr(tree, 'treepositions'): | 576 | + try: |
577 | + if search_leaves: | ||
578 | + search_positions = tree.treepositions() | ||
579 | + else: | ||
580 | + search_positions = treepositions_no_leaves(tree) | ||
581 | + except AttributeError: | ||
556 | return [] | 582 | return [] |
557 | - if isinstance(tgrep_string, basestring): | 583 | + if isinstance(tgrep_string, (bytes, str)): |
558 | tgrep_string = tgrep_compile(tgrep_string) | 584 | tgrep_string = tgrep_compile(tgrep_string) |
559 | - if search_leaves: | ||
560 | - search_positions = tree.treepositions() | ||
561 | - else: | ||
562 | - search_positions = treepositions_no_leaves(tree) | ||
563 | return [position for position in search_positions | 585 | return [position for position in search_positions |
564 | if tgrep_string(tree[position])] | 586 | if tgrep_string(tree[position])] |
565 | 587 |