Commit 3dc55945f34e948398d13b7346524af6a001348d

Authored by Erickson Silva
1 parent 4fd9f50a
Exists in master and in 1 other branch devel

Adiciona nova versao tgrep

Showing 1 changed file with 144 additions and 122 deletions   Show diff stats
src/new/tgrep.py
@@ -38,6 +38,7 @@ Tgrep2 source: @@ -38,6 +38,7 @@ Tgrep2 source:
38 http://tedlab.mit.edu/~dr/Tgrep2/ 38 http://tedlab.mit.edu/~dr/Tgrep2/
39 ''' 39 '''
40 40
  41 +from builtins import bytes, range, str
41 import nltk.tree 42 import nltk.tree
42 import pyparsing 43 import pyparsing
43 import re 44 import re
@@ -48,11 +49,12 @@ def ancestors(node): @@ -48,11 +49,12 @@ def ancestors(node):
48 This method will not work with leaf nodes, since there is no way 49 This method will not work with leaf nodes, since there is no way
49 to recover the parent. 50 to recover the parent.
50 ''' 51 '''
51 - # if node is a leaf, we cannot retrieve its parent  
52 - if not hasattr(node, 'parent'):  
53 - return []  
54 results = [] 52 results = []
55 - current = node.parent() 53 + try:
  54 + current = node.parent()
  55 + except AttributeError:
  56 + # if node is a leaf, we cannot retrieve its parent
  57 + return results
56 while current: 58 while current:
57 results.append(current) 59 results.append(current)
58 current = current.parent() 60 current = current.parent()
@@ -63,11 +65,12 @@ def unique_ancestors(node): @@ -63,11 +65,12 @@ def unique_ancestors(node):
63 Returns the list of all nodes dominating the given node, where 65 Returns the list of all nodes dominating the given node, where
64 there is only a single path of descent. 66 there is only a single path of descent.
65 ''' 67 '''
66 - # if node is a leaf, we cannot retrieve its parent  
67 - if not hasattr(node, 'parent'):  
68 - return []  
69 results = [] 68 results = []
70 - current = node.parent() 69 + try:
  70 + current = node.parent()
  71 + except AttributeError:
  72 + # if node is a leaf, we cannot retrieve its parent
  73 + return results
71 while current and len(current) == 1: 74 while current and len(current) == 1:
72 results.append(current) 75 results.append(current)
73 current = current.parent() 76 current = current.parent()
@@ -78,29 +81,38 @@ def _descendants(node): @@ -78,29 +81,38 @@ def _descendants(node):
78 Returns the list of all nodes which are descended from the given 81 Returns the list of all nodes which are descended from the given
79 tree node in some way. 82 tree node in some way.
80 ''' 83 '''
81 - if not hasattr(node, 'treepositions'): 84 + try:
  85 + treepos = node.treepositions()
  86 + except AttributeError:
82 return [] 87 return []
83 - return [node[x] for x in node.treepositions()[1:]] 88 + return [node[x] for x in treepos[1:]]
84 89
85 def _leftmost_descendants(node): 90 def _leftmost_descendants(node):
86 ''' 91 '''
87 Returns the set of all nodes descended in some way through 92 Returns the set of all nodes descended in some way through
88 left branches from this node. 93 left branches from this node.
89 ''' 94 '''
90 - if not hasattr(node, 'treepositions'): 95 + try:
  96 + treepos = node.treepositions()
  97 + except AttributeError:
91 return [] 98 return []
92 - return [node[x] for x in node.treepositions()[1:] if all(y == 0 for y in x)] 99 + return [node[x] for x in treepos[1:] if all(y == 0 for y in x)]
93 100
94 def _rightmost_descendants(node): 101 def _rightmost_descendants(node):
95 ''' 102 '''
96 Returns the set of all nodes descended in some way through 103 Returns the set of all nodes descended in some way through
97 right branches from this node. 104 right branches from this node.
98 ''' 105 '''
99 - if not hasattr(node, 'treepositions'): 106 + try:
  107 + rightmost_leaf = max(node.treepositions())
  108 + except AttributeError:
100 return [] 109 return []
101 - rightmost_leaf = max(node.treepositions())  
102 return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] 110 return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)]
103 111
  112 +def _istree(obj):
  113 + '''Predicate to check whether `obj` is a nltk.tree.Tree.'''
  114 + return isinstance(obj, nltk.tree.Tree)
  115 +
104 def _unique_descendants(node): 116 def _unique_descendants(node):
105 ''' 117 '''
106 Returns the list of all nodes descended from the given node, where 118 Returns the list of all nodes descended from the given node, where
@@ -108,7 +120,7 @@ def _unique_descendants(node): @@ -108,7 +120,7 @@ def _unique_descendants(node):
108 ''' 120 '''
109 results = [] 121 results = []
110 current = node 122 current = node
111 - while current and isinstance(current, nltk.tree.Tree) and len(current) == 1: 123 + while current and _istree(current) and len(current) == 1:
112 current = current[0] 124 current = current[0]
113 results.append(current) 125 results.append(current)
114 return results 126 return results
@@ -117,10 +129,11 @@ def _before(node): @@ -117,10 +129,11 @@ def _before(node):
117 ''' 129 '''
118 Returns the set of all nodes that are before the given node. 130 Returns the set of all nodes that are before the given node.
119 ''' 131 '''
120 - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): 132 + try:
  133 + pos = node.treeposition()
  134 + tree = node.root()
  135 + except AttributeError:
121 return [] 136 return []
122 - pos = node.treeposition()  
123 - tree = node.root()  
124 return [tree[x] for x in tree.treepositions() 137 return [tree[x] for x in tree.treepositions()
125 if x[:len(pos)] < pos[:len(x)]] 138 if x[:len(pos)] < pos[:len(x)]]
126 139
@@ -133,9 +146,11 @@ def _immediately_before(node): @@ -133,9 +146,11 @@ def _immediately_before(node):
133 symbol (word) produced by A immediately precedes the first 146 symbol (word) produced by A immediately precedes the first
134 terminal symbol produced by B. 147 terminal symbol produced by B.
135 ''' 148 '''
136 - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): 149 + try:
  150 + pos = node.treeposition()
  151 + tree = node.root()
  152 + except AttributeError:
137 return [] 153 return []
138 - pos = node.treeposition()  
139 # go "upwards" from pos until there is a place we can go to the left 154 # go "upwards" from pos until there is a place we can go to the left
140 idx = len(pos) - 1 155 idx = len(pos) - 1
141 while 0 <= idx and pos[idx] == 0: 156 while 0 <= idx and pos[idx] == 0:
@@ -144,17 +159,18 @@ def _immediately_before(node): @@ -144,17 +159,18 @@ def _immediately_before(node):
144 return [] 159 return []
145 pos = list(pos[:idx + 1]) 160 pos = list(pos[:idx + 1])
146 pos[-1] -= 1 161 pos[-1] -= 1
147 - before = node.root()[pos] 162 + before = tree[pos]
148 return [before] + _rightmost_descendants(before) 163 return [before] + _rightmost_descendants(before)
149 164
150 def _after(node): 165 def _after(node):
151 ''' 166 '''
152 Returns the set of all nodes that are after the given node. 167 Returns the set of all nodes that are after the given node.
153 ''' 168 '''
154 - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): 169 + try:
  170 + pos = node.treeposition()
  171 + tree = node.root()
  172 + except AttributeError:
155 return [] 173 return []
156 - pos = node.treeposition()  
157 - tree = node.root()  
158 return [tree[x] for x in tree.treepositions() 174 return [tree[x] for x in tree.treepositions()
159 if x[:len(pos)] > pos[:len(x)]] 175 if x[:len(pos)] > pos[:len(x)]]
160 176
@@ -167,14 +183,15 @@ def _immediately_after(node): @@ -167,14 +183,15 @@ def _immediately_after(node):
167 symbol (word) produced by A immediately follows the last 183 symbol (word) produced by A immediately follows the last
168 terminal symbol produced by B. 184 terminal symbol produced by B.
169 ''' 185 '''
170 - if (not hasattr(node, 'root') or not hasattr(node, 'treeposition') or  
171 - not hasattr(node, 'parent')): 186 + try:
  187 + pos = node.treeposition()
  188 + tree = node.root()
  189 + current = node.parent()
  190 + except AttributeError:
172 return [] 191 return []
173 - pos = node.treeposition()  
174 # go "upwards" from pos until there is a place we can go to the 192 # go "upwards" from pos until there is a place we can go to the
175 # right 193 # right
176 idx = len(pos) - 1 194 idx = len(pos) - 1
177 - current = node.parent()  
178 while 0 <= idx and pos[idx] == len(current) - 1: 195 while 0 <= idx and pos[idx] == len(current) - 1:
179 idx -= 1 196 idx -= 1
180 current = current.parent() 197 current = current.parent()
@@ -182,7 +199,7 @@ def _immediately_after(node): @@ -182,7 +199,7 @@ def _immediately_after(node):
182 return [] 199 return []
183 pos = list(pos[:idx + 1]) 200 pos = list(pos[:idx + 1])
184 pos[-1] += 1 201 pos[-1] += 1
185 - after = node.root()[pos] 202 + after = tree[pos]
186 return [after] + _leftmost_descendants(after) 203 return [after] + _leftmost_descendants(after)
187 204
188 def _tgrep_node_literal_value(node): 205 def _tgrep_node_literal_value(node):
@@ -190,7 +207,7 @@ def _tgrep_node_literal_value(node): @@ -190,7 +207,7 @@ def _tgrep_node_literal_value(node):
190 Gets the string value of a given parse tree node, for comparison 207 Gets the string value of a given parse tree node, for comparison
191 using the tgrep node literal predicates. 208 using the tgrep node literal predicates.
192 ''' 209 '''
193 - return (node.label() if isinstance(node, nltk.tree.Tree) else unicode(node)) 210 + return (node.label() if _istree(node) else str(node))
194 211
195 def _tgrep_node_action(_s, _l, tokens): 212 def _tgrep_node_action(_s, _l, tokens):
196 ''' 213 '''
@@ -198,30 +215,30 @@ def _tgrep_node_action(_s, _l, tokens): @@ -198,30 +215,30 @@ def _tgrep_node_action(_s, _l, tokens):
198 depending on the name of its node. 215 depending on the name of its node.
199 ''' 216 '''
200 # print 'node tokens: ', tokens 217 # print 'node tokens: ', tokens
201 - if tokens[0] == "'": 218 + if tokens[0] == u"'":
202 # strip initial apostrophe (tgrep2 print command) 219 # strip initial apostrophe (tgrep2 print command)
203 tokens = tokens[1:] 220 tokens = tokens[1:]
204 if len(tokens) > 1: 221 if len(tokens) > 1:
205 # disjunctive definition of a node name 222 # disjunctive definition of a node name
206 - assert list(set(tokens[1::2])) == ['|'] 223 + assert list(set(tokens[1::2])) == [u'|']
207 # recursively call self to interpret each node name definition 224 # recursively call self to interpret each node name definition
208 tokens = [_tgrep_node_action(None, None, [node]) 225 tokens = [_tgrep_node_action(None, None, [node])
209 for node in tokens[::2]] 226 for node in tokens[::2]]
210 # capture tokens and return the disjunction 227 # capture tokens and return the disjunction
211 return (lambda t: lambda n: any(f(n) for f in t))(tokens) 228 return (lambda t: lambda n: any(f(n) for f in t))(tokens)
212 else: 229 else:
213 - if hasattr(tokens[0], '__call__'): 230 + if hasattr(tokens[0], u'__call__'):
214 # this is a previously interpreted parenthetical node 231 # this is a previously interpreted parenthetical node
215 # definition (lambda function) 232 # definition (lambda function)
216 return tokens[0] 233 return tokens[0]
217 - elif tokens[0] == '*' or tokens[0] == '__': 234 + elif tokens[0] == u'*' or tokens[0] == u'__':
218 return lambda n: True 235 return lambda n: True
219 - elif tokens[0].startswith('"'):  
220 - return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip('"'))  
221 - elif tokens[0].startswith('/'): 236 + elif tokens[0].startswith(u'"'):
  237 + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip(u'"'))
  238 + elif tokens[0].startswith(u'/'):
222 return (lambda r: lambda n: 239 return (lambda r: lambda n:
223 - r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip('/')))  
224 - elif tokens[0].startswith('i@'): 240 + r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip(u'/')))
  241 + elif tokens[0].startswith(u'i@'):
225 return (lambda s: lambda n: 242 return (lambda s: lambda n:
226 _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower()) 243 _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower())
227 else: 244 else:
@@ -234,8 +251,8 @@ def _tgrep_parens_action(_s, _l, tokens): @@ -234,8 +251,8 @@ def _tgrep_parens_action(_s, _l, tokens):
234 ''' 251 '''
235 # print 'parenthetical tokens: ', tokens 252 # print 'parenthetical tokens: ', tokens
236 assert len(tokens) == 3 253 assert len(tokens) == 3
237 - assert tokens[0] == '('  
238 - assert tokens[2] == ')' 254 + assert tokens[0] == u'('
  255 + assert tokens[2] == u')'
239 return tokens[1] 256 return tokens[1]
240 257
241 def _tgrep_nltk_tree_pos_action(_s, _l, tokens): 258 def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
@@ -247,7 +264,7 @@ def _tgrep_nltk_tree_pos_action(_s, _l, tokens): @@ -247,7 +264,7 @@ def _tgrep_nltk_tree_pos_action(_s, _l, tokens):
247 # recover the tuple from the parsed string 264 # recover the tuple from the parsed string
248 node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) 265 node_tree_position = tuple(int(x) for x in tokens if x.isdigit())
249 # capture the node's tree position 266 # capture the node's tree position
250 - return (lambda i: lambda n: (hasattr(n, 'treeposition') and 267 + return (lambda i: lambda n: (hasattr(n, u'treeposition') and
251 n.treeposition() == i))(node_tree_position) 268 n.treeposition() == i))(node_tree_position)
252 269
253 def _tgrep_relation_action(_s, _l, tokens): 270 def _tgrep_relation_action(_s, _l, tokens):
@@ -258,177 +275,177 @@ def _tgrep_relation_action(_s, _l, tokens): @@ -258,177 +275,177 @@ def _tgrep_relation_action(_s, _l, tokens):
258 # print 'relation tokens: ', tokens 275 # print 'relation tokens: ', tokens
259 # process negation first if needed 276 # process negation first if needed
260 negated = False 277 negated = False
261 - if tokens[0] == '!': 278 + if tokens[0] == u'!':
262 negated = True 279 negated = True
263 tokens = tokens[1:] 280 tokens = tokens[1:]
264 - if tokens[0] == '[': 281 + if tokens[0] == u'[':
265 # process square-bracketed relation expressions 282 # process square-bracketed relation expressions
266 assert len(tokens) == 3 283 assert len(tokens) == 3
267 - assert tokens[2] == ']' 284 + assert tokens[2] == u']'
268 retval = tokens[1] 285 retval = tokens[1]
269 else: 286 else:
270 # process operator-node relation expressions 287 # process operator-node relation expressions
271 assert len(tokens) == 2 288 assert len(tokens) == 2
272 operator, predicate = tokens 289 operator, predicate = tokens
273 # A < B A is the parent of (immediately dominates) B. 290 # A < B A is the parent of (immediately dominates) B.
274 - if operator == '<':  
275 - retval = lambda n: (isinstance(n, nltk.tree.Tree) and 291 + if operator == u'<':
  292 + retval = lambda n: (_istree(n) and
276 any(predicate(x) for x in n)) 293 any(predicate(x) for x in n))
277 # A > B A is the child of B. 294 # A > B A is the child of B.
278 - elif operator == '>':  
279 - retval = lambda n: (hasattr(n, 'parent') and 295 + elif operator == u'>':
  296 + retval = lambda n: (hasattr(n, u'parent') and
280 bool(n.parent()) and 297 bool(n.parent()) and
281 predicate(n.parent())) 298 predicate(n.parent()))
282 # A <, B Synonymous with A <1 B. 299 # A <, B Synonymous with A <1 B.
283 - elif operator == '<,' or operator == '<1':  
284 - retval = lambda n: (isinstance(n, nltk.tree.Tree) and 300 + elif operator == u'<,' or operator == u'<1':
  301 + retval = lambda n: (_istree(n) and
285 bool(list(n)) and 302 bool(list(n)) and
286 predicate(n[0])) 303 predicate(n[0]))
287 # A >, B Synonymous with A >1 B. 304 # A >, B Synonymous with A >1 B.
288 - elif operator == '>,' or operator == '>1':  
289 - retval = lambda n: (hasattr(n, 'parent') and 305 + elif operator == u'>,' or operator == u'>1':
  306 + retval = lambda n: (hasattr(n, u'parent') and
290 bool(n.parent()) and 307 bool(n.parent()) and
291 (n is n.parent()[0]) and 308 (n is n.parent()[0]) and
292 predicate(n.parent())) 309 predicate(n.parent()))
293 # A <N B B is the Nth child of A (the first child is <1). 310 # A <N B B is the Nth child of A (the first child is <1).
294 - elif operator[0] == '<' and operator[1:].isdigit(): 311 + elif operator[0] == u'<' and operator[1:].isdigit():
295 idx = int(operator[1:]) 312 idx = int(operator[1:])
296 # capture the index parameter 313 # capture the index parameter
297 - retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and 314 + retval = (lambda i: lambda n: (_istree(n) and
298 bool(list(n)) and 315 bool(list(n)) and
299 0 <= i < len(n) and 316 0 <= i < len(n) and
300 predicate(n[i])))(idx - 1) 317 predicate(n[i])))(idx - 1)
301 # A >N B A is the Nth child of B (the first child is >1). 318 # A >N B A is the Nth child of B (the first child is >1).
302 - elif operator[0] == '>' and operator[1:].isdigit(): 319 + elif operator[0] == u'>' and operator[1:].isdigit():
303 idx = int(operator[1:]) 320 idx = int(operator[1:])
304 # capture the index parameter 321 # capture the index parameter
305 - retval = (lambda i: lambda n: (hasattr(n, 'parent') and 322 + retval = (lambda i: lambda n: (hasattr(n, u'parent') and
306 bool(n.parent()) and 323 bool(n.parent()) and
307 0 <= i < len(n.parent()) and 324 0 <= i < len(n.parent()) and
308 (n is n.parent()[i]) and 325 (n is n.parent()[i]) and
309 predicate(n.parent())))(idx - 1) 326 predicate(n.parent())))(idx - 1)
310 # A <' B B is the last child of A (also synonymous with A <-1 B). 327 # A <' B B is the last child of A (also synonymous with A <-1 B).
311 # A <- B B is the last child of A (synonymous with A <-1 B). 328 # A <- B B is the last child of A (synonymous with A <-1 B).
312 - elif operator == '<\'' or operator == '<-' or operator == '<-1':  
313 - retval = lambda n: (isinstance(n, nltk.tree.Tree) and bool(list(n)) 329 + elif operator == u'<\'' or operator == u'<-' or operator == u'<-1':
  330 + retval = lambda n: (_istree(n) and bool(list(n))
314 and predicate(n[-1])) 331 and predicate(n[-1]))
315 # A >' B A is the last child of B (also synonymous with A >-1 B). 332 # A >' B A is the last child of B (also synonymous with A >-1 B).
316 # A >- B A is the last child of B (synonymous with A >-1 B). 333 # A >- B A is the last child of B (synonymous with A >-1 B).
317 - elif operator == '>\'' or operator == '>-' or operator == '>-1':  
318 - retval = lambda n: (hasattr(n, 'parent') and 334 + elif operator == u'>\'' or operator == u'>-' or operator == u'>-1':
  335 + retval = lambda n: (hasattr(n, u'parent') and
319 bool(n.parent()) and 336 bool(n.parent()) and
320 (n is n.parent()[-1]) and 337 (n is n.parent()[-1]) and
321 predicate(n.parent())) 338 predicate(n.parent()))
322 # A <-N B B is the N th-to-last child of A (the last child is <-1). 339 # A <-N B B is the N th-to-last child of A (the last child is <-1).
323 - elif operator[:2] == '<-' and operator[2:].isdigit(): 340 + elif operator[:2] == u'<-' and operator[2:].isdigit():
324 idx = -int(operator[2:]) 341 idx = -int(operator[2:])
325 # capture the index parameter 342 # capture the index parameter
326 - retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and 343 + retval = (lambda i: lambda n: (_istree(n) and
327 bool(list(n)) and 344 bool(list(n)) and
328 0 <= (i + len(n)) < len(n) and 345 0 <= (i + len(n)) < len(n) and
329 predicate(n[i + len(n)])))(idx) 346 predicate(n[i + len(n)])))(idx)
330 # A >-N B A is the N th-to-last child of B (the last child is >-1). 347 # A >-N B A is the N th-to-last child of B (the last child is >-1).
331 - elif operator[:2] == '>-' and operator[2:].isdigit(): 348 + elif operator[:2] == u'>-' and operator[2:].isdigit():
332 idx = -int(operator[2:]) 349 idx = -int(operator[2:])
333 # capture the index parameter 350 # capture the index parameter
334 retval = (lambda i: lambda n: 351 retval = (lambda i: lambda n:
335 - (hasattr(n, 'parent') and 352 + (hasattr(n, u'parent') and
336 bool(n.parent()) and 353 bool(n.parent()) and
337 0 <= (i + len(n.parent())) < len(n.parent()) and 354 0 <= (i + len(n.parent())) < len(n.parent()) and
338 (n is n.parent()[i + len(n.parent())]) and 355 (n is n.parent()[i + len(n.parent())]) and
339 predicate(n.parent())))(idx) 356 predicate(n.parent())))(idx)
340 # A <: B B is the only child of A 357 # A <: B B is the only child of A
341 - elif operator == '<:':  
342 - retval = lambda n: (isinstance(n, nltk.tree.Tree) and 358 + elif operator == u'<:':
  359 + retval = lambda n: (_istree(n) and
343 len(n) == 1 and 360 len(n) == 1 and
344 predicate(n[0])) 361 predicate(n[0]))
345 # A >: B A is the only child of B. 362 # A >: B A is the only child of B.
346 - elif operator == '>:':  
347 - retval = lambda n: (hasattr(n, 'parent') and 363 + elif operator == u'>:':
  364 + retval = lambda n: (hasattr(n, u'parent') and
348 bool(n.parent()) and 365 bool(n.parent()) and
349 len(n.parent()) == 1 and 366 len(n.parent()) == 1 and
350 predicate(n.parent())) 367 predicate(n.parent()))
351 # A << B A dominates B (A is an ancestor of B). 368 # A << B A dominates B (A is an ancestor of B).
352 - elif operator == '<<':  
353 - retval = lambda n: (isinstance(n, nltk.tree.Tree) and 369 + elif operator == u'<<':
  370 + retval = lambda n: (_istree(n) and
354 any(predicate(x) for x in _descendants(n))) 371 any(predicate(x) for x in _descendants(n)))
355 # A >> B A is dominated by B (A is a descendant of B). 372 # A >> B A is dominated by B (A is a descendant of B).
356 - elif operator == '>>': 373 + elif operator == u'>>':
357 retval = lambda n: any(predicate(x) for x in ancestors(n)) 374 retval = lambda n: any(predicate(x) for x in ancestors(n))
358 # A <<, B B is a left-most descendant of A. 375 # A <<, B B is a left-most descendant of A.
359 - elif operator == '<<,' or operator == '<<1':  
360 - retval = lambda n: (isinstance(n, nltk.tree.Tree) and 376 + elif operator == u'<<,' or operator == u'<<1':
  377 + retval = lambda n: (_istree(n) and
361 any(predicate(x) 378 any(predicate(x)
362 for x in _leftmost_descendants(n))) 379 for x in _leftmost_descendants(n)))
363 # A >>, B A is a left-most descendant of B. 380 # A >>, B A is a left-most descendant of B.
364 - elif operator == '>>,': 381 + elif operator == u'>>,':
365 retval = lambda n: any((predicate(x) and 382 retval = lambda n: any((predicate(x) and
366 n in _leftmost_descendants(x)) 383 n in _leftmost_descendants(x))
367 for x in ancestors(n)) 384 for x in ancestors(n))
368 # A <<' B B is a right-most descendant of A. 385 # A <<' B B is a right-most descendant of A.
369 - elif operator == '<<\'':  
370 - retval = lambda n: (isinstance(n, nltk.tree.Tree) and 386 + elif operator == u'<<\'':
  387 + retval = lambda n: (_istree(n) and
371 any(predicate(x) 388 any(predicate(x)
372 for x in _rightmost_descendants(n))) 389 for x in _rightmost_descendants(n)))
373 # A >>' B A is a right-most descendant of B. 390 # A >>' B A is a right-most descendant of B.
374 - elif operator == '>>': 391 + elif operator == u'>>':
375 retval = lambda n: any((predicate(x) and 392 retval = lambda n: any((predicate(x) and
376 n in _rightmost_descendants(x)) 393 n in _rightmost_descendants(x))
377 for x in ancestors(n)) 394 for x in ancestors(n))
378 # A <<: B There is a single path of descent from A and B is on it. 395 # A <<: B There is a single path of descent from A and B is on it.
379 - elif operator == '<<:':  
380 - retval = lambda n: (isinstance(n, nltk.tree.Tree) and 396 + elif operator == u'<<:':
  397 + retval = lambda n: (_istree(n) and
381 any(predicate(x) 398 any(predicate(x)
382 for x in _unique_descendants(n))) 399 for x in _unique_descendants(n)))
383 # A >>: B There is a single path of descent from B and A is on it. 400 # A >>: B There is a single path of descent from B and A is on it.
384 - elif operator == '>>:': 401 + elif operator == u'>>:':
385 retval = lambda n: any(predicate(x) for x in unique_ancestors(n)) 402 retval = lambda n: any(predicate(x) for x in unique_ancestors(n))
386 # A . B A immediately precedes B. 403 # A . B A immediately precedes B.
387 - elif operator == '.': 404 + elif operator == u'.':
388 retval = lambda n: any(predicate(x) 405 retval = lambda n: any(predicate(x)
389 for x in _immediately_after(n)) 406 for x in _immediately_after(n))
390 # A , B A immediately follows B. 407 # A , B A immediately follows B.
391 - elif operator == ',': 408 + elif operator == u',':
392 retval = lambda n: any(predicate(x) 409 retval = lambda n: any(predicate(x)
393 for x in _immediately_before(n)) 410 for x in _immediately_before(n))
394 # A .. B A precedes B. 411 # A .. B A precedes B.
395 - elif operator == '..': 412 + elif operator == u'..':
396 retval = lambda n: any(predicate(x) for x in _after(n)) 413 retval = lambda n: any(predicate(x) for x in _after(n))
397 # A ,, B A follows B. 414 # A ,, B A follows B.
398 - elif operator == ',,': 415 + elif operator == u',,':
399 retval = lambda n: any(predicate(x) for x in _before(n)) 416 retval = lambda n: any(predicate(x) for x in _before(n))
400 # A $ B A is a sister of B (and A != B). 417 # A $ B A is a sister of B (and A != B).
401 - elif operator == '$' or operator == '%':  
402 - retval = lambda n: (hasattr(n, 'parent') and 418 + elif operator == u'$' or operator == u'%':
  419 + retval = lambda n: (hasattr(n, u'parent') and
403 bool(n.parent()) and 420 bool(n.parent()) and
404 any(predicate(x) 421 any(predicate(x)
405 for x in n.parent() if x is not n)) 422 for x in n.parent() if x is not n))
406 # A $. B A is a sister of and immediately precedes B. 423 # A $. B A is a sister of and immediately precedes B.
407 - elif operator == '$.' or operator == '%.':  
408 - retval = lambda n: (hasattr(n, 'right_sibling') and 424 + elif operator == u'$.' or operator == u'%.':
  425 + retval = lambda n: (hasattr(n, u'right_sibling') and
409 bool(n.right_sibling()) and 426 bool(n.right_sibling()) and
410 predicate(n.right_sibling())) 427 predicate(n.right_sibling()))
411 # A $, B A is a sister of and immediately follows B. 428 # A $, B A is a sister of and immediately follows B.
412 - elif operator == '$,' or operator == '%,':  
413 - retval = lambda n: (hasattr(n, 'left_sibling') and 429 + elif operator == u'$,' or operator == u'%,':
  430 + retval = lambda n: (hasattr(n, u'left_sibling') and
414 bool(n.left_sibling()) and 431 bool(n.left_sibling()) and
415 predicate(n.left_sibling())) 432 predicate(n.left_sibling()))
416 # A $.. B A is a sister of and precedes B. 433 # A $.. B A is a sister of and precedes B.
417 - elif operator == '$..' or operator == '%..':  
418 - retval = lambda n: (hasattr(n, 'parent') and  
419 - hasattr(n, 'parent_index') and 434 + elif operator == u'$..' or operator == u'%..':
  435 + retval = lambda n: (hasattr(n, u'parent') and
  436 + hasattr(n, u'parent_index') and
420 bool(n.parent()) and 437 bool(n.parent()) and
421 any(predicate(x) for x in 438 any(predicate(x) for x in
422 n.parent()[n.parent_index() + 1:])) 439 n.parent()[n.parent_index() + 1:]))
423 # A $,, B A is a sister of and follows B. 440 # A $,, B A is a sister of and follows B.
424 - elif operator == '$,,' or operator == '%,,':  
425 - retval = lambda n: (hasattr(n, 'parent') and  
426 - hasattr(n, 'parent_index') and 441 + elif operator == u'$,,' or operator == u'%,,':
  442 + retval = lambda n: (hasattr(n, u'parent') and
  443 + hasattr(n, u'parent_index') and
427 bool(n.parent()) and 444 bool(n.parent()) and
428 any(predicate(x) for x in 445 any(predicate(x) for x in
429 n.parent()[:n.parent_index()])) 446 n.parent()[:n.parent_index()]))
430 else: 447 else:
431 - assert False, 'cannot interpret tgrep operator "{0}"'.format( 448 + assert False, u'cannot interpret tgrep operator "{0}"'.format(
432 operator) 449 operator)
433 # now return the built function 450 # now return the built function
434 if negated: 451 if negated:
@@ -442,7 +459,7 @@ def _tgrep_rel_conjunction_action(_s, _l, tokens): @@ -442,7 +459,7 @@ def _tgrep_rel_conjunction_action(_s, _l, tokens):
442 from the conjunction of several other such lambda functions. 459 from the conjunction of several other such lambda functions.
443 ''' 460 '''
444 # filter out the ampersand 461 # filter out the ampersand
445 - tokens = [x for x in tokens if x != '&'] 462 + tokens = [x for x in tokens if x != u'&']
446 # print 'relation conjunction tokens: ', tokens 463 # print 'relation conjunction tokens: ', tokens
447 if len(tokens) == 1: 464 if len(tokens) == 1:
448 return tokens[0] 465 return tokens[0]
@@ -455,7 +472,7 @@ def _tgrep_rel_disjunction_action(_s, _l, tokens): @@ -455,7 +472,7 @@ def _tgrep_rel_disjunction_action(_s, _l, tokens):
455 from the disjunction of several other such lambda functions. 472 from the disjunction of several other such lambda functions.
456 ''' 473 '''
457 # filter out the pipe 474 # filter out the pipe
458 - tokens = [x for x in tokens if x != '|'] 475 + tokens = [x for x in tokens if x != u'|']
459 # print 'relation disjunction tokens: ', tokens 476 # print 'relation disjunction tokens: ', tokens
460 if len(tokens) == 1: 477 if len(tokens) == 1:
461 return tokens[0] 478 return tokens[0]
@@ -467,40 +484,40 @@ def _build_tgrep_parser(set_parse_actions = True): @@ -467,40 +484,40 @@ def _build_tgrep_parser(set_parse_actions = True):
467 Builds a pyparsing-based parser object for tokenizing and 484 Builds a pyparsing-based parser object for tokenizing and
468 interpreting tgrep search strings. 485 interpreting tgrep search strings.
469 ''' 486 '''
470 - tgrep_op = (pyparsing.Optional('!') +  
471 - pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*'))  
472 - tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\', 487 + tgrep_op = (pyparsing.Optional(u'!') +
  488 + pyparsing.Regex(u'[$%,.<>][%,.<>0-9-\':]*'))
  489 + tgrep_qstring = pyparsing.QuotedString(quoteChar=u'"', escChar=u'\\',
473 unquoteResults=False) 490 unquoteResults=False)
474 - tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\', 491 + tgrep_node_regex = pyparsing.QuotedString(quoteChar=u'/', escChar=u'\\',
475 unquoteResults=False) 492 unquoteResults=False)
476 - tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%^=]+') 493 + tgrep_node_literal = pyparsing.Regex(u'[^][ \r\t\n;:.,&|<>()$!@%^=]+')
477 tgrep_expr = pyparsing.Forward() 494 tgrep_expr = pyparsing.Forward()
478 tgrep_relations = pyparsing.Forward() 495 tgrep_relations = pyparsing.Forward()
479 - tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')' 496 + tgrep_parens = pyparsing.Literal(u'(') + tgrep_expr + u')'
480 tgrep_nltk_tree_pos = ( 497 tgrep_nltk_tree_pos = (
481 - pyparsing.Literal('N(') +  
482 - pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' + 498 + pyparsing.Literal(u'N(') +
  499 + pyparsing.Optional(pyparsing.Word(pyparsing.nums) + u',' +
483 pyparsing.Optional(pyparsing.delimitedList( 500 pyparsing.Optional(pyparsing.delimitedList(
484 - pyparsing.Word(pyparsing.nums), delim=',') +  
485 - pyparsing.Optional(','))) + ')') 501 + pyparsing.Word(pyparsing.nums), delim=u',') +
  502 + pyparsing.Optional(u','))) + u')')
486 tgrep_node_expr = (tgrep_qstring | 503 tgrep_node_expr = (tgrep_qstring |
487 tgrep_node_regex | 504 tgrep_node_regex |
488 - '*' | 505 + u'*' |
489 tgrep_node_literal) 506 tgrep_node_literal)
490 tgrep_node = (tgrep_parens | 507 tgrep_node = (tgrep_parens |
491 tgrep_nltk_tree_pos | 508 tgrep_nltk_tree_pos |
492 - (pyparsing.Optional("'") + 509 + (pyparsing.Optional(u"'") +
493 tgrep_node_expr + 510 tgrep_node_expr +
494 - pyparsing.ZeroOrMore("|" + tgrep_node_expr))) 511 + pyparsing.ZeroOrMore(u"|" + tgrep_node_expr)))
495 tgrep_relation = pyparsing.Forward() 512 tgrep_relation = pyparsing.Forward()
496 - tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']' 513 + tgrep_brackets = pyparsing.Optional(u'!') + u'[' + tgrep_relations + u']'
497 tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node 514 tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node
498 tgrep_rel_conjunction = pyparsing.Forward() 515 tgrep_rel_conjunction = pyparsing.Forward()
499 tgrep_rel_conjunction << (tgrep_relation + 516 tgrep_rel_conjunction << (tgrep_relation +
500 - pyparsing.ZeroOrMore(pyparsing.Optional('&') + 517 + pyparsing.ZeroOrMore(pyparsing.Optional(u'&') +
501 tgrep_rel_conjunction)) 518 tgrep_rel_conjunction))
502 tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( 519 tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore(
503 - "|" + tgrep_relations) 520 + u"|" + tgrep_relations)
504 tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) 521 tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations)
505 if set_parse_actions: 522 if set_parse_actions:
506 tgrep_node.setParseAction(_tgrep_node_action) 523 tgrep_node.setParseAction(_tgrep_node_action)
@@ -520,6 +537,8 @@ def tgrep_tokenize(tgrep_string): @@ -520,6 +537,8 @@ def tgrep_tokenize(tgrep_string):
520 Tokenizes a TGrep search string into separate tokens. 537 Tokenizes a TGrep search string into separate tokens.
521 ''' 538 '''
522 parser = _build_tgrep_parser(False) 539 parser = _build_tgrep_parser(False)
  540 + if isinstance(tgrep_string, bytes):
  541 + tgrep_string = tgrep_string.decode()
523 return list(parser.parseString(tgrep_string)) 542 return list(parser.parseString(tgrep_string))
524 543
525 def tgrep_compile(tgrep_string): 544 def tgrep_compile(tgrep_string):
@@ -528,6 +547,8 @@ def tgrep_compile(tgrep_string): @@ -528,6 +547,8 @@ def tgrep_compile(tgrep_string):
528 lambda function. 547 lambda function.
529 ''' 548 '''
530 parser = _build_tgrep_parser(True) 549 parser = _build_tgrep_parser(True)
  550 + if isinstance(tgrep_string, bytes):
  551 + tgrep_string = tgrep_string.decode()
531 return list(parser.parseString(tgrep_string, parseAll=True))[0] 552 return list(parser.parseString(tgrep_string, parseAll=True))[0]
532 553
533 def treepositions_no_leaves(tree): 554 def treepositions_no_leaves(tree):
@@ -552,14 +573,15 @@ def tgrep_positions(tree, tgrep_string, search_leaves = True): @@ -552,14 +573,15 @@ def tgrep_positions(tree, tgrep_string, search_leaves = True):
552 If `search_leaves` is False, the method will not return any 573 If `search_leaves` is False, the method will not return any
553 results in leaf positions. 574 results in leaf positions.
554 ''' 575 '''
555 - if not hasattr(tree, 'treepositions'): 576 + try:
  577 + if search_leaves:
  578 + search_positions = tree.treepositions()
  579 + else:
  580 + search_positions = treepositions_no_leaves(tree)
  581 + except AttributeError:
556 return [] 582 return []
557 - if isinstance(tgrep_string, basestring): 583 + if isinstance(tgrep_string, (bytes, str)):
558 tgrep_string = tgrep_compile(tgrep_string) 584 tgrep_string = tgrep_compile(tgrep_string)
559 - if search_leaves:  
560 - search_positions = tree.treepositions()  
561 - else:  
562 - search_positions = treepositions_no_leaves(tree)  
563 return [position for position in search_positions 585 return [position for position in search_positions
564 if tgrep_string(tree[position])] 586 if tgrep_string(tree[position])]
565 587