diff --git a/src/new/tgrep.py b/src/new/tgrep.py new file mode 100644 index 0000000..800774a --- /dev/null +++ b/src/new/tgrep.py @@ -0,0 +1,575 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation files +# (the "Software"), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, +# publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +''' +TGrep search implementation for NTLK trees. + +(c) 16 March, 2013 Will Roberts . + +This module supports TGrep2 syntax for matching parts of NLTK Trees. +Note that many tgrep operators require the tree passed to be a +ParentedTree. + +Tgrep tutorial: +http://www.stanford.edu/dept/linguistics/corpora/cas-tut-tgrep.html +Tgrep2 manual: +http://tedlab.mit.edu/~dr/Tgrep2/tgrep2.pdf +Tgrep2 source: +http://tedlab.mit.edu/~dr/Tgrep2/ +''' + +import nltk.tree +import pyparsing +import re + +def ancestors(node): + ''' + Returns the list of all nodes dominating the given tree node. + This method will not work with leaf nodes, since there is no way + to recover the parent. + ''' + # if node is a leaf, we cannot retrieve its parent + if not hasattr(node, 'parent'): + return [] + results = [] + current = node.parent() + while current: + results.append(current) + current = current.parent() + return results + +def unique_ancestors(node): + ''' + Returns the list of all nodes dominating the given node, where + there is only a single path of descent. + ''' + # if node is a leaf, we cannot retrieve its parent + if not hasattr(node, 'parent'): + return [] + results = [] + current = node.parent() + while current and len(current) == 1: + results.append(current) + current = current.parent() + return results + +def _descendants(node): + ''' + Returns the list of all nodes which are descended from the given + tree node in some way. + ''' + if not hasattr(node, 'treepositions'): + return [] + return [node[x] for x in node.treepositions()[1:]] + +def _leftmost_descendants(node): + ''' + Returns the set of all nodes descended in some way through + left branches from this node. + ''' + if not hasattr(node, 'treepositions'): + return [] + return [node[x] for x in node.treepositions()[1:] if all(y == 0 for y in x)] + +def _rightmost_descendants(node): + ''' + Returns the set of all nodes descended in some way through + right branches from this node. + ''' + if not hasattr(node, 'treepositions'): + return [] + rightmost_leaf = max(node.treepositions()) + return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] + +def _unique_descendants(node): + ''' + Returns the list of all nodes descended from the given node, where + there is only a single path of descent. + ''' + results = [] + current = node + while current and isinstance(current, nltk.tree.Tree) and len(current) == 1: + current = current[0] + results.append(current) + return results + +def _before(node): + ''' + Returns the set of all nodes that are before the given node. + ''' + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): + return [] + pos = node.treeposition() + tree = node.root() + return [tree[x] for x in tree.treepositions() + if x[:len(pos)] < pos[:len(x)]] + +def _immediately_before(node): + ''' + Returns the set of all nodes that are immediately before the given + node. + + Tree node A immediately precedes node B if the last terminal + symbol (word) produced by A immediately precedes the first + terminal symbol produced by B. + ''' + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): + return [] + pos = node.treeposition() + # go "upwards" from pos until there is a place we can go to the left + idx = len(pos) - 1 + while 0 <= idx and pos[idx] == 0: + idx -= 1 + if idx < 0: + return [] + pos = list(pos[:idx + 1]) + pos[-1] -= 1 + before = node.root()[pos] + return [before] + _rightmost_descendants(before) + +def _after(node): + ''' + Returns the set of all nodes that are after the given node. + ''' + if not hasattr(node, 'root') or not hasattr(node, 'treeposition'): + return [] + pos = node.treeposition() + tree = node.root() + return [tree[x] for x in tree.treepositions() + if x[:len(pos)] > pos[:len(x)]] + +def _immediately_after(node): + ''' + Returns the set of all nodes that are immediately after the given + node. + + Tree node A immediately follows node B if the first terminal + symbol (word) produced by A immediately follows the last + terminal symbol produced by B. + ''' + if (not hasattr(node, 'root') or not hasattr(node, 'treeposition') or + not hasattr(node, 'parent')): + return [] + pos = node.treeposition() + # go "upwards" from pos until there is a place we can go to the + # right + idx = len(pos) - 1 + current = node.parent() + while 0 <= idx and pos[idx] == len(current) - 1: + idx -= 1 + current = current.parent() + if idx < 0: + return [] + pos = list(pos[:idx + 1]) + pos[-1] += 1 + after = node.root()[pos] + return [after] + _leftmost_descendants(after) + +def _tgrep_node_literal_value(node): + ''' + Gets the string value of a given parse tree node, for comparison + using the tgrep node literal predicates. + ''' + return (node.label() if isinstance(node, nltk.tree.Tree) else unicode(node)) + +def _tgrep_node_action(_s, _l, tokens): + ''' + Builds a lambda function representing a predicate on a tree node + depending on the name of its node. + ''' + # print 'node tokens: ', tokens + if tokens[0] == "'": + # strip initial apostrophe (tgrep2 print command) + tokens = tokens[1:] + if len(tokens) > 1: + # disjunctive definition of a node name + assert list(set(tokens[1::2])) == ['|'] + # recursively call self to interpret each node name definition + tokens = [_tgrep_node_action(None, None, [node]) + for node in tokens[::2]] + # capture tokens and return the disjunction + return (lambda t: lambda n: any(f(n) for f in t))(tokens) + else: + if hasattr(tokens[0], '__call__'): + # this is a previously interpreted parenthetical node + # definition (lambda function) + return tokens[0] + elif tokens[0] == '*' or tokens[0] == '__': + return lambda n: True + elif tokens[0].startswith('"'): + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0].strip('"')) + elif tokens[0].startswith('/'): + return (lambda r: lambda n: + r.match(_tgrep_node_literal_value(n)))(re.compile(tokens[0].strip('/'))) + elif tokens[0].startswith('i@'): + return (lambda s: lambda n: + _tgrep_node_literal_value(n).lower() == s)(tokens[0][2:].lower()) + else: + return (lambda s: lambda n: _tgrep_node_literal_value(n) == s)(tokens[0]) + +def _tgrep_parens_action(_s, _l, tokens): + ''' + Builds a lambda function representing a predicate on a tree node + from a parenthetical notation. + ''' + # print 'parenthetical tokens: ', tokens + assert len(tokens) == 3 + assert tokens[0] == '(' + assert tokens[2] == ')' + return tokens[1] + +def _tgrep_nltk_tree_pos_action(_s, _l, tokens): + ''' + Builds a lambda function representing a predicate on a tree node + which returns true if the node is located at a specific tree + position. + ''' + # recover the tuple from the parsed sting + node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) + # capture the node's tree position + return (lambda i: lambda n: (hasattr(n, 'treeposition') and + n.treeposition() == i))(node_tree_position) + +def _tgrep_relation_action(_s, _l, tokens): + ''' + Builds a lambda function representing a predicate on a tree node + depending on its relation to other nodes in the tree. + ''' + # print 'relation tokens: ', tokens + # process negation first if needed + negated = False + if tokens[0] == '!': + negated = True + tokens = tokens[1:] + if tokens[0] == '[': + # process square-bracketed relation expressions + assert len(tokens) == 3 + assert tokens[2] == ']' + retval = tokens[1] + else: + # process operator-node relation expressions + assert len(tokens) == 2 + operator, predicate = tokens + # A < B A is the parent of (immediately dominates) B. + if operator == '<': + retval = lambda n: (isinstance(n, nltk.tree.Tree) and + any(predicate(x) for x in n)) + # A > B A is the child of B. + elif operator == '>': + retval = lambda n: (hasattr(n, 'parent') and + bool(n.parent()) and + predicate(n.parent())) + # A <, B Synonymous with A <1 B. + elif operator == '<,' or operator == '<1': + retval = lambda n: (isinstance(n, nltk.tree.Tree) and + bool(list(n)) and + predicate(n[0])) + # A >, B Synonymous with A >1 B. + elif operator == '>,' or operator == '>1': + retval = lambda n: (hasattr(n, 'parent') and + bool(n.parent()) and + (n is n.parent()[0]) and + predicate(n.parent())) + # A N B A is the Nth child of B (the first child is >1). + elif operator[0] == '>' and operator[1:].isdigit(): + idx = int(operator[1:]) + # capture the index parameter + retval = (lambda i: lambda n: (hasattr(n, 'parent') and + bool(n.parent()) and + 0 <= i < len(n.parent()) and + (n is n.parent()[i]) and + predicate(n.parent())))(idx - 1) + # A <' B B is the last child of A (also synonymous with A <-1 B). + # A <- B B is the last child of A (synonymous with A <-1 B). + elif operator == '<\'' or operator == '<-' or operator == '<-1': + retval = lambda n: (isinstance(n, nltk.tree.Tree) and bool(list(n)) + and predicate(n[-1])) + # A >' B A is the last child of B (also synonymous with A >-1 B). + # A >- B A is the last child of B (synonymous with A >-1 B). + elif operator == '>\'' or operator == '>-' or operator == '>-1': + retval = lambda n: (hasattr(n, 'parent') and + bool(n.parent()) and + (n is n.parent()[-1]) and + predicate(n.parent())) + # A <-N B B is the N th-to-last child of A (the last child is <-1). + elif operator[:2] == '<-' and operator[2:].isdigit(): + idx = -int(operator[2:]) + # capture the index parameter + retval = (lambda i: lambda n: (isinstance(n, nltk.tree.Tree) and + bool(list(n)) and + 0 <= (i + len(n)) < len(n) and + predicate(n[i + len(n)])))(idx) + # A >-N B A is the N th-to-last child of B (the last child is >-1). + elif operator[:2] == '>-' and operator[2:].isdigit(): + idx = -int(operator[2:]) + # capture the index parameter + retval = (lambda i: lambda n: + (hasattr(n, 'parent') and + bool(n.parent()) and + 0 <= (i + len(n.parent())) < len(n.parent()) and + (n is n.parent()[i + len(n.parent())]) and + predicate(n.parent())))(idx) + # A <: B B is the only child of A + elif operator == '<:': + retval = lambda n: (isinstance(n, nltk.tree.Tree) and + len(n) == 1 and + predicate(n[0])) + # A >: B A is the only child of B. + elif operator == '>:': + retval = lambda n: (hasattr(n, 'parent') and + bool(n.parent()) and + len(n.parent()) == 1 and + predicate(n.parent())) + # A << B A dominates B (A is an ancestor of B). + elif operator == '<<': + retval = lambda n: (isinstance(n, nltk.tree.Tree) and + any(predicate(x) for x in _descendants(n))) + # A >> B A is dominated by B (A is a descendant of B). + elif operator == '>>': + retval = lambda n: any(predicate(x) for x in ancestors(n)) + # A <<, B B is a left-most descendant of A. + elif operator == '<<,' or operator == '<<1': + retval = lambda n: (isinstance(n, nltk.tree.Tree) and + any(predicate(x) + for x in _leftmost_descendants(n))) + # A >>, B A is a left-most descendant of B. + elif operator == '>>,': + retval = lambda n: any((predicate(x) and + n in _leftmost_descendants(x)) + for x in ancestors(n)) + # A <<' B B is a right-most descendant of A. + elif operator == '<<\'': + retval = lambda n: (isinstance(n, nltk.tree.Tree) and + any(predicate(x) + for x in _rightmost_descendants(n))) + # A >>' B A is a right-most descendant of B. + elif operator == '>>\'': + retval = lambda n: any((predicate(x) and + n in _rightmost_descendants(x)) + for x in ancestors(n)) + # A <<: B There is a single path of descent from A and B is on it. + elif operator == '<<:': + retval = lambda n: (isinstance(n, nltk.tree.Tree) and + any(predicate(x) + for x in _unique_descendants(n))) + # A >>: B There is a single path of descent from B and A is on it. + elif operator == '>>:': + retval = lambda n: any(predicate(x) for x in unique_ancestors(n)) + # A . B A immediately precedes B. + elif operator == '.': + retval = lambda n: any(predicate(x) + for x in _immediately_after(n)) + # A , B A immediately follows B. + elif operator == ',': + retval = lambda n: any(predicate(x) + for x in _immediately_before(n)) + # A .. B A precedes B. + elif operator == '..': + retval = lambda n: any(predicate(x) for x in _after(n)) + # A ,, B A follows B. + elif operator == ',,': + retval = lambda n: any(predicate(x) for x in _before(n)) + # A $ B A is a sister of B (and A != B). + elif operator == '$' or operator == '%': + retval = lambda n: (hasattr(n, 'parent') and + bool(n.parent()) and + any(predicate(x) + for x in n.parent() if x is not n)) + # A $. B A is a sister of and immediately precedes B. + elif operator == '$.' or operator == '%.': + retval = lambda n: (hasattr(n, 'right_sibling') and + bool(n.right_sibling()) and + predicate(n.right_sibling())) + # A $, B A is a sister of and immediately follows B. + elif operator == '$,' or operator == '%,': + retval = lambda n: (hasattr(n, 'left_sibling') and + bool(n.left_sibling()) and + predicate(n.left_sibling())) + # A $.. B A is a sister of and precedes B. + elif operator == '$..' or operator == '%..': + retval = lambda n: (hasattr(n, 'parent') and + hasattr(n, 'parent_index') and + bool(n.parent()) and + any(predicate(x) for x in + n.parent()[n.parent_index() + 1:])) + # A $,, B A is a sister of and follows B. + elif operator == '$,,' or operator == '%,,': + retval = lambda n: (hasattr(n, 'parent') and + hasattr(n, 'parent_index') and + bool(n.parent()) and + any(predicate(x) for x in + n.parent()[:n.parent_index()])) + else: + assert False, 'cannot interpret tgrep operator "{0}"'.format( + operator) + # now return the built function + if negated: + return (lambda r: (lambda n: not r(n)))(retval) + else: + return retval + +def _tgrep_rel_conjunction_action(_s, _l, tokens): + ''' + Builds a lambda function representing a predicate on a tree node + from the conjunction of several other such lambda functions. + ''' + # filter out the ampersand + tokens = [x for x in tokens if x != '&'] + # print 'relation conjunction tokens: ', tokens + if len(tokens) == 1: + return tokens[0] + elif len(tokens) == 2: + return (lambda a, b: lambda n: a(n) and b(n))(tokens[0], tokens[1]) + +def _tgrep_rel_disjunction_action(_s, _l, tokens): + ''' + Builds a lambda function representing a predicate on a tree node + from the disjunction of several other such lambda functions. + ''' + # filter out the pipe + tokens = [x for x in tokens if x != '|'] + # print 'relation disjunction tokens: ', tokens + if len(tokens) == 1: + return tokens[0] + elif len(tokens) == 2: + return (lambda a, b: lambda n: a(n) or b(n))(tokens[0], tokens[1]) + +def _build_tgrep_parser(set_parse_actions = True): + ''' + Builds a pyparsing-based parser object for tokenizing and + interpreting tgrep search strings. + ''' + tgrep_op = (pyparsing.Optional('!') + + pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*')) + tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\', + unquoteResults=False) + tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\', + unquoteResults=False) + tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+') + tgrep_expr = pyparsing.Forward() + tgrep_relations = pyparsing.Forward() + tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')' + tgrep_nltk_tree_pos = ( + pyparsing.Literal('N(') + + pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' + + pyparsing.Optional(pyparsing.delimitedList( + pyparsing.Word(pyparsing.nums), delim=',') + + pyparsing.Optional(','))) + ')') + tgrep_node_expr = (tgrep_qstring | + tgrep_node_regex | + '*' | + tgrep_node_literal) + tgrep_node = (tgrep_parens | + tgrep_nltk_tree_pos | + (pyparsing.Optional("'") + + tgrep_node_expr + + pyparsing.ZeroOrMore("|" + tgrep_node_expr))) + tgrep_relation = pyparsing.Forward() + tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']' + tgrep_relation = tgrep_brackets | tgrep_op + tgrep_node + tgrep_rel_conjunction = pyparsing.Forward() + tgrep_rel_conjunction << (tgrep_relation + + pyparsing.ZeroOrMore(pyparsing.Optional('&') + + tgrep_rel_conjunction)) + tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( + "|" + tgrep_relations) + tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) + if set_parse_actions: + tgrep_node.setParseAction(_tgrep_node_action) + tgrep_parens.setParseAction(_tgrep_parens_action) + tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action) + tgrep_relation.setParseAction(_tgrep_relation_action) + tgrep_rel_conjunction.setParseAction(_tgrep_rel_conjunction_action) + tgrep_relations.setParseAction(_tgrep_rel_disjunction_action) + # the whole expression is also the conjunction of two + # predicates: the first node predicate, and the remaining + # relation predicates + tgrep_expr.setParseAction(_tgrep_rel_conjunction_action) + return tgrep_expr + +def tgrep_tokenize(tgrep_string): + ''' + Tokenizes a TGrep search string into separate tokens. + ''' + parser = _build_tgrep_parser(False) + return list(parser.parseString(tgrep_string)) + +def tgrep_compile(tgrep_string): + ''' + Parses (and tokenizes, if necessary) a TGrep search string into a + lambda function. + ''' + parser = _build_tgrep_parser(True) + return list(parser.parseString(tgrep_string, parseAll=True))[0] + +def treepositions_no_leaves(tree): + ''' + Returns all the tree positions in the given tree which are not + leaf nodes. + ''' + treepositions = tree.treepositions() + # leaves are treeposition tuples that are not prefixes of any + # other treeposition + prefixes = set() + for pos in treepositions: + for length in range(len(pos)): + prefixes.add(pos[:length]) + return [pos for pos in treepositions if pos in prefixes] + +def tgrep_positions(tree, tgrep_string, search_leaves = True): + ''' + Return all tree positions in the given tree which match the given + `tgrep_string`. + + If `search_leaves` is False, the method will not return any + results in leaf positions. + ''' + if not hasattr(tree, 'treepositions'): + return [] + if isinstance(tgrep_string, basestring): + tgrep_string = tgrep_compile(tgrep_string) + if search_leaves: + search_positions = tree.treepositions() + else: + search_positions = treepositions_no_leaves(tree) + return [position for position in search_positions + if tgrep_string(tree[position])] + +def tgrep_nodes(tree, tgrep_string, search_leaves = True): + ''' + Return all tree nodes in the given tree which match the given + `tgrep_ string`. + + If `search_leaves` is False, the method will not return any + results in leaf positions. + ''' + return [tree[position] for position in tgrep_positions(tree, tgrep_string, + search_leaves)] -- libgit2 0.21.2