diff --git a/src/new/tgrep.py b/src/new/tgrep.py index 800774a..c374bf6 100644 --- a/src/new/tgrep.py +++ b/src/new/tgrep.py @@ -38,6 +38,7 @@ Tgrep2 source: http://tedlab.mit.edu/~dr/Tgrep2/ ''' +from builtins import bytes, range, str import nltk.tree import pyparsing import re @@ -48,11 +49,12 @@ def ancestors(node): This method will not work with leaf nodes, since there is no way to recover the parent. ''' - # if node is a leaf, we cannot retrieve its parent - if not hasattr(node, 'parent'): - return [] results = [] - current = node.parent() + try: + current = node.parent() + except AttributeError: + # if node is a leaf, we cannot retrieve its parent + return results while current: results.append(current) current = current.parent() @@ -63,11 +65,12 @@ def unique_ancestors(node): Returns the list of all nodes dominating the given node, where there is only a single path of descent. ''' - # if node is a leaf, we cannot retrieve its parent - if not hasattr(node, 'parent'): - return [] results = [] - current = node.parent() + try: + current = node.parent() + except AttributeError: + # if node is a leaf, we cannot retrieve its parent + return results while current and len(current) == 1: results.append(current) current = current.parent() @@ -78,29 +81,38 @@ def _descendants(node): Returns the list of all nodes which are descended from the given tree node in some way. ''' - if not hasattr(node, 'treepositions'): + try: + treepos = node.treepositions() + except AttributeError: return [] - return [node[x] for x in node.treepositions()[1:]] + return [node[x] for x in treepos[1:]] def _leftmost_descendants(node): ''' Returns the set of all nodes descended in some way through left branches from this node. ''' - if not hasattr(node, 'treepositions'): + try: + treepos = node.treepositions() + except AttributeError: return [] - return [node[x] for x in node.treepositions()[1:] if all(y == 0 for y in x)] + return [node[x] for x in treepos[1:] if all(y == 0 for y in x)] def _rightmost_descendants(node): ''' Returns the set of all nodes descended in some way through right branches from this node. ''' - if not hasattr(node, 'treepositions'): + try: + rightmost_leaf = max(node.treepositions()) + except AttributeError: return [] - rightmost_leaf = max(node.treepositions()) return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] +def _istree(obj): + '''Predicate to check whether `obj` is a nltk.tree.Tree.''' + return isinstance(obj, nltk.tree.Tree) + def _unique_descendants(node): ''' Returns the list of all nodes descended from the given node, where @@ -108,7 +120,7 @@ def _unique_descendants(node): ''' results = [] current = node - while current and isinstance(current, nltk.tree.Tree) and len(current) == 1: + while current and _istree(current) and len(current) == 1: current = current[0] results.append(current) return results @@ -117,10 +129,11 @@ def _before(node): ''' Returns the set of all nodes that are before the given node. ''' - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): + try: + pos = node.treeposition() + tree = node.root() + except AttributeError: return [] - pos = node.treeposition() - tree = node.root() return [tree[x] for x in tree.treepositions() if x[:len(pos)] < pos[:len(x)]] @@ -133,9 +146,11 @@ def _immediately_before(node): symbol (word) produced by A immediately precedes the first terminal symbol produced by B. ''' - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): + try: + pos = node.treeposition() + tree = node.root() + except AttributeError: return [] - pos = node.treeposition() # go "upwards" from pos until there is a place we can go to the left idx = len(pos) - 1 while 0 <= idx and pos[idx] == 0: @@ -144,17 +159,18 @@ def _immediately_before(node): return [] pos = list(pos[:idx + 1]) pos[-1] -= 1 - before = node.root()[pos] + before = tree[pos] return [before] + _rightmost_descendants(before) def _after(node): ''' Returns the set of all nodes that are after the given node. ''' - if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): + try: + pos = node.treeposition() + tree = node.root() + except AttributeError: return [] - pos = node.treeposition() - tree = node.root() return [tree[x] for x in tree.treepositions() if x[:len(pos)] > pos[:len(x)]] @@ -167,14 +183,15 @@ def _immediately_after(node): symbol (word) produced by A immediately follows the last terminal symbol produced by B. ''' - if (not hasattr(node, 'root') or not hasattr(node, 'treeposition') or - not hasattr(node, 'parent')): + try: + pos = node.treeposition() + tree = node.root() + current = node.parent() + except AttributeError: return [] - pos = node.treeposition() # go "upwards" from pos until there is a place we can go to the # right idx = len(pos) - 1 - current = node.parent() while 0 <= idx and pos[idx] == len(current) - 1: idx -= 1 current = current.parent() @@ -182,7 +199,7 @@ def _immediately_after(node): return [] pos = list(pos[:idx + 1]) pos[-1] += 1 - after = node.root()[pos] + after = tree[pos] return [after] + _leftmost_descendants(after) def _tgrep_node_literal_value(node): @@ -190,7 +207,7 @@ def _tgrep_node_literal_value(node): Gets the string value of a given parse tree node, for comparison using the tgrep node literal predicates. ''' - return (node.label() if isinstance(node, nltk.tree.Tree) else unicode(node)) + return (node.label() if _istree(node) else str(node)) def _tgrep_node_action(_s, _l, tokens): ''' @@ -198,30 +215,30 @@ def _tgrep_node_action(_s, _l, tokens): depending on the name of its node. ''' # print 'node tokens: ', tokens - if tokens[0] == "'": + if tokens[0] == u"'": # strip initial apostrophe (tgrep2 print command) tokens = tokens[1:] if len(tokens) > 1: # disjunctive definition of a node name - assert list(set(tokens[1::2])) == ['|'] + assert list(set(tokens[1::2])) == [u'|'] # recursively call self to interpret each node name definition tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]] # capture tokens and return the disjunction return (lambda t: lambda n: any(f(n) for f in t))(tokens) else: - if hasattr(tokens[0], '__call__'): + if hasattr(tokens[0], u'__call__'): # this is a previously interpreted parenthetical node # definition (lambda function) return tokens[0] - elif tokens[0] == '*' or tokens[0] == '__': + elif tokens[0] == u'*' or tokens[0] == u'__': return lambda n: True - elif tokens[0].startswith('"'): - return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip('"')) - elif tokens[0].startswith('/'): + elif tokens[0].startswith(u'"'): + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip(u'"')) + elif tokens[0].startswith(u'/'): return (lambda r: lambda n: - r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip('/'))) - elif tokens[0].startswith('i@'): + r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip(u'/'))) + elif tokens[0].startswith(u'i@'): return (lambda s: lambda n: _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower()) else: @@ -234,8 +251,8 @@ def _tgrep_parens_action(_s, _l, tokens): ''' # print 'parenthetical tokens: ', tokens assert len(tokens) == 3 - assert tokens[0] == '(' - assert tokens[2] == ')' + assert tokens[0] == u'(' + assert tokens[2] == u')' return tokens[1] def _tgrep_nltk_tree_pos_action(_s, _l, tokens): @@ -247,7 +264,7 @@ def _tgrep_nltk_tree_pos_action(_s, _l, tokens): # recover the tuple from the parsed sting node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) # capture the node's tree position - return (lambda i: lambda n: (hasattr(n, 'treeposition') and + return (lambda i: lambda n: (hasattr(n, u'treeposition') and n.treeposition() == i))(node_tree_position) def _tgrep_relation_action(_s, _l, tokens): @@ -258,177 +275,177 @@ def _tgrep_relation_action(_s, _l, tokens): # print 'relation tokens: ', tokens # process negation first if needed negated = False - if tokens[0] == '!': + if tokens[0] == u'!': negated = True tokens = tokens[1:] - if tokens[0] == '[': + if tokens[0] == u'[': # process square-bracketed relation expressions assert len(tokens) == 3 - assert tokens[2] == ']' + assert tokens[2] == u']' retval = tokens[1] else: # process operator-node relation expressions assert len(tokens) == 2 operator, predicate = tokens # A < B A is the parent of (immediately dominates) B. - if operator == '<': - retval = lambda n: (isinstance(n, nltk.tree.Tree) and + if operator == u'<': + retval = lambda n: (_istree(n) and any(predicate(x) for x in n)) # A > B A is the child of B. - elif operator == '>': - retval = lambda n: (hasattr(n, 'parent') and + elif operator == u'>': + retval = lambda n: (hasattr(n, u'parent') and bool(n.parent()) and predicate(n.parent())) # A <, B Synonymous with A <1 B. - elif operator == '<,' or operator == '<1': - retval = lambda n: (isinstance(n, nltk.tree.Tree) and + elif operator == u'<,' or operator == u'<1': + retval = lambda n: (_istree(n) and bool(list(n)) and predicate(n[0])) # A >, B Synonymous with A >1 B. - elif operator == '>,' or operator == '>1': - retval = lambda n: (hasattr(n, 'parent') and + elif operator == u'>,' or operator == u'>1': + retval = lambda n: (hasattr(n, u'parent') and bool(n.parent()) and (n is n.parent()[0]) and predicate(n.parent())) # A N B A is the Nth child of B (the first child is >1). - elif operator[0] == '>' and operator[1:].isdigit(): + elif operator[0] == u'>' and operator[1:].isdigit(): idx = int(operator[1:]) # capture the index parameter - retval = (lambda i: lambda n: (hasattr(n, 'parent') and + retval = (lambda i: lambda n: (hasattr(n, u'parent') and bool(n.parent()) and 0 <= i < len(n.parent()) and (n is n.parent()[i]) and predicate(n.parent())))(idx - 1) # A <' B B is the last child of A (also synonymous with A <-1 B). # A <- B B is the last child of A (synonymous with A <-1 B). - elif operator == '<\'' or operator == '<-' or operator == '<-1': - retval = lambda n: (isinstance(n, nltk.tree.Tree) and bool(list(n)) + elif operator == u'<\'' or operator == u'<-' or operator == u'<-1': + retval = lambda n: (_istree(n) and bool(list(n)) and predicate(n[-1])) # A >' B A is the last child of B (also synonymous with A >-1 B). # A >- B A is the last child of B (synonymous with A >-1 B). - elif operator == '>\'' or operator == '>-' or operator == '>-1': - retval = lambda n: (hasattr(n, 'parent') and + elif operator == u'>\'' or operator == u'>-' or operator == u'>-1': + retval = lambda n: (hasattr(n, u'parent') and bool(n.parent()) and (n is n.parent()[-1]) and predicate(n.parent())) # A <-N B B is the N th-to-last child of A (the last child is <-1). - elif operator[:2] == '<-' and operator[2:].isdigit(): + elif operator[:2] == u'<-' and operator[2:].isdigit(): idx = -int(operator[2:]) # capture the index parameter - retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and + retval = (lambda i: lambda n: (_istree(n) and bool(list(n)) and 0 <= (i + len(n)) < len(n) and predicate(n[i + len(n)])))(idx) # A >-N B A is the N th-to-last child of B (the last child is >-1). - elif operator[:2] == '>-' and operator[2:].isdigit(): + elif operator[:2] == u'>-' and operator[2:].isdigit(): idx = -int(operator[2:]) # capture the index parameter retval = (lambda i: lambda n: - (hasattr(n, 'parent') and + (hasattr(n, u'parent') and bool(n.parent()) and 0 <= (i + len(n.parent())) < len(n.parent()) and (n is n.parent()[i + len(n.parent())]) and predicate(n.parent())))(idx) # A <: B B is the only child of A - elif operator == '<:': - retval = lambda n: (isinstance(n, nltk.tree.Tree) and + elif operator == u'<:': + retval = lambda n: (_istree(n) and len(n) == 1 and predicate(n[0])) # A >: B A is the only child of B. - elif operator == '>:': - retval = lambda n: (hasattr(n, 'parent') and + elif operator == u'>:': + retval = lambda n: (hasattr(n, u'parent') and bool(n.parent()) and len(n.parent()) == 1 and predicate(n.parent())) # A << B A dominates B (A is an ancestor of B). - elif operator == '<<': - retval = lambda n: (isinstance(n, nltk.tree.Tree) and + elif operator == u'<<': + retval = lambda n: (_istree(n) and any(predicate(x) for x in _descendants(n))) # A >> B A is dominated by B (A is a descendant of B). - elif operator == '>>': + elif operator == u'>>': retval = lambda n: any(predicate(x) for x in ancestors(n)) # A <<, B B is a left-most descendant of A. - elif operator == '<<,' or operator == '<<1': - retval = lambda n: (isinstance(n, nltk.tree.Tree) and + elif operator == u'<<,' or operator == u'<<1': + retval = lambda n: (_istree(n) and any(predicate(x) for x in _leftmost_descendants(n))) # A >>, B A is a left-most descendant of B. - elif operator == '>>,': + elif operator == u'>>,': retval = lambda n: any((predicate(x) and n in _leftmost_descendants(x)) for x in ancestors(n)) # A <<' B B is a right-most descendant of A. - elif operator == '<<\'': - retval = lambda n: (isinstance(n, nltk.tree.Tree) and + elif operator == u'<<\'': + retval = lambda n: (_istree(n) and any(predicate(x) for x in _rightmost_descendants(n))) # A >>' B A is a right-most descendant of B. - elif operator == '>>\'': + elif operator == u'>>\'': retval = lambda n: any((predicate(x) and n in _rightmost_descendants(x)) for x in ancestors(n)) # A <<: B There is a single path of descent from A and B is on it. - elif operator == '<<:': - retval = lambda n: (isinstance(n, nltk.tree.Tree) and + elif operator == u'<<:': + retval = lambda n: (_istree(n) and any(predicate(x) for x in _unique_descendants(n))) # A >>: B There is a single path of descent from B and A is on it. - elif operator == '>>:': + elif operator == u'>>:': retval = lambda n: any(predicate(x) for x in unique_ancestors(n)) # A . B A immediately precedes B. - elif operator == '.': + elif operator == u'.': retval = lambda n: any(predicate(x) for x in _immediately_after(n)) # A , B A immediately follows B. - elif operator == ',': + elif operator == u',': retval = lambda n: any(predicate(x) for x in _immediately_before(n)) # A .. B A precedes B. - elif operator == '..': + elif operator == u'..': retval = lambda n: any(predicate(x) for x in _after(n)) # A ,, B A follows B. - elif operator == ',,': + elif operator == u',,': retval = lambda n: any(predicate(x) for x in _before(n)) # A $ B A is a sister of B (and A != B). - elif operator == '$' or operator == '%': - retval = lambda n: (hasattr(n, 'parent') and + elif operator == u'$' or operator == u'%': + retval = lambda n: (hasattr(n, u'parent') and bool(n.parent()) and any(predicate(x) for x in n.parent() if x is not n)) # A $. B A is a sister of and immediately precedes B. - elif operator == '$.' or operator == '%.': - retval = lambda n: (hasattr(n, 'right_sibling') and + elif operator == u'$.' or operator == u'%.': + retval = lambda n: (hasattr(n, u'right_sibling') and bool(n.right_sibling()) and predicate(n.right_sibling())) # A $, B A is a sister of and immediately follows B. - elif operator == '$,' or operator == '%,': - retval = lambda n: (hasattr(n, 'left_sibling') and + elif operator == u'$,' or operator == u'%,': + retval = lambda n: (hasattr(n, u'left_sibling') and bool(n.left_sibling()) and predicate(n.left_sibling())) # A $.. B A is a sister of and precedes B. - elif operator == '$..' or operator == '%..': - retval = lambda n: (hasattr(n, 'parent') and - hasattr(n, 'parent_index') and + elif operator == u'$..' or operator == u'%..': + retval = lambda n: (hasattr(n, u'parent') and + hasattr(n, u'parent_index') and bool(n.parent()) and any(predicate(x) for x in n.parent()[n.parent_index() + 1:])) # A $,, B A is a sister of and follows B. - elif operator == '$,,' or operator == '%,,': - retval = lambda n: (hasattr(n, 'parent') and - hasattr(n, 'parent_index') and + elif operator == u'$,,' or operator == u'%,,': + retval = lambda n: (hasattr(n, u'parent') and + hasattr(n, u'parent_index') and bool(n.parent()) and any(predicate(x) for x in n.parent()[:n.parent_index()])) else: - assert False, 'cannot interpret tgrep operator "{0}"'.format( + assert False, u'cannot interpret tgrep operator "{0}"'.format( operator) # now return the built function if negated: @@ -442,7 +459,7 @@ def _tgrep_rel_conjunction_action(_s, _l, tokens): from the conjunction of several other such lambda functions. ''' # filter out the ampersand - tokens = [x for x in tokens if x != '&'] + tokens = [x for x in tokens if x != u'&'] # print 'relation conjunction tokens: ', tokens if len(tokens) == 1: return tokens[0] @@ -455,7 +472,7 @@ def _tgrep_rel_disjunction_action(_s, _l, tokens): from the disjunction of several other such lambda functions. ''' # filter out the pipe - tokens = [x for x in tokens if x != '|'] + tokens = [x for x in tokens if x != u'|'] # print 'relation disjunction tokens: ', tokens if len(tokens) == 1: return tokens[0] @@ -467,40 +484,40 @@ def _build_tgrep_parser(set_parse_actions = True): Builds a pyparsing-based parser object for tokenizing and interpreting tgrep search strings. ''' - tgrep_op = (pyparsing.Optional('!') + - pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*')) - tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\', + tgrep_op = (pyparsing.Optional(u'!') + + pyparsing.Regex(u'[$%,.<>][%,.<>0-9-\':]*')) + tgrep_qstring = pyparsing.QuotedString(quoteChar=u'"', escChar=u'\\', unquoteResults=False) - tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\', + tgrep_node_regex = pyparsing.QuotedString(quoteChar=u'/', escChar=u'\\', unquoteResults=False) - tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+') + tgrep_node_literal = pyparsing.Regex(u'[^][ \r\t\n;:.,&|<>()$!@%\'^=]+') tgrep_expr = pyparsing.Forward() tgrep_relations = pyparsing.Forward() - tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')' + tgrep_parens = pyparsing.Literal(u'(') + tgrep_expr + u')' tgrep_nltk_tree_pos = ( - pyparsing.Literal('N(') + - pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' + + pyparsing.Literal(u'N(') + + pyparsing.Optional(pyparsing.Word(pyparsing.nums) + u',' + pyparsing.Optional(pyparsing.delimitedList( - pyparsing.Word(pyparsing.nums), delim=',') + - pyparsing.Optional(','))) + ')') + pyparsing.Word(pyparsing.nums), delim=u',') + + pyparsing.Optional(u','))) + u')') tgrep_node_expr = (tgrep_qstring | tgrep_node_regex | - '*' | + u'*' | tgrep_node_literal) tgrep_node = (tgrep_parens | tgrep_nltk_tree_pos | - (pyparsing.Optional("'") + + (pyparsing.Optional(u"'") + tgrep_node_expr + - pyparsing.ZeroOrMore("|" + tgrep_node_expr))) + pyparsing.ZeroOrMore(u"|" + tgrep_node_expr))) tgrep_relation = pyparsing.Forward() - tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']' + tgrep_brackets = pyparsing.Optional(u'!') + u'[' + tgrep_relations + u']' tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node tgrep_rel_conjunction = pyparsing.Forward() tgrep_rel_conjunction << (tgrep_relation + - pyparsing.ZeroOrMore(pyparsing.Optional('&') + + pyparsing.ZeroOrMore(pyparsing.Optional(u'&') + tgrep_rel_conjunction)) tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( - "|" + tgrep_relations) + u"|" + tgrep_relations) tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) if set_parse_actions: tgrep_node.setParseAction(_tgrep_node_action) @@ -520,6 +537,8 @@ def tgrep_tokenize(tgrep_string): Tokenizes a TGrep search string into separate tokens. ''' parser = _build_tgrep_parser(False) + if isinstance(tgrep_string, bytes): + tgrep_string = tgrep_string.decode() return list(parser.parseString(tgrep_string)) def tgrep_compile(tgrep_string): @@ -528,6 +547,8 @@ def tgrep_compile(tgrep_string): lambda function. ''' parser = _build_tgrep_parser(True) + if isinstance(tgrep_string, bytes): + tgrep_string = tgrep_string.decode() return list(parser.parseString(tgrep_string, parseAll=True))[0] def treepositions_no_leaves(tree): @@ -552,14 +573,15 @@ def tgrep_positions(tree, tgrep_string, search_leaves = True): If `search_leaves` is False, the method will not return any results in leaf positions. ''' - if not hasattr(tree, 'treepositions'): + try: + if search_leaves: + search_positions = tree.treepositions() + else: + search_positions = treepositions_no_leaves(tree) + except AttributeError: return [] - if isinstance(tgrep_string, basestring): + if isinstance(tgrep_string, (bytes, str)): tgrep_string = tgrep_compile(tgrep_string) - if search_leaves: - search_positions = tree.treepositions() - else: - search_positions = treepositions_no_leaves(tree) return [position for position in search_positions if tgrep_string(tree[position])] -- libgit2 0.21.2