stanza/models/common/chuliu_edmonds.py (148 changes: 40 additions & 108 deletions)
@@ -46,75 +46,46 @@ def maybe_pop_cycle(i):
             stack.pop()
             onstack[i] = False
             cycle[i] = True
-            if cycle.sum() > 1:
+            if np.count_nonzero(cycle) > 1:
                 cycles.append(cycle)

     def initialize_strong_connect(i):
         _index[0] += 1
-        index = _index[-1]
+        index = _index[0]
         indices[i] = lowlinks[i] = index - 1
         stack.append(i)
         onstack[i] = True

     def strong_connect(i):
-        # this ridiculous atrocity is because somehow people keep
-        # coming up with graphs which overflow python's call stack
-        # so instead we make our own call stack and turn the recursion
-        # into a loop
-        # see for example
-        # https://github.com/stanfordnlp/stanza/issues/962
-        # https://github.com/spraakbanken/sparv-pipeline/issues/166
-        # in an ideal world this block of code would look like this
-        # initialize_strong_connect(i)
-        # dependents = iter(np.where(np.equal(tree, i))[0])
-        # for j in dependents:
-        #     if indices[j] == -1:
-        #         strong_connect(j)
-        #         lowlinks[i] = min(lowlinks[i], lowlinks[j])
-        #     elif onstack[j]:
-        #         lowlinks[i] = min(lowlinks[i], indices[j])
-        #
-        # maybe_pop_cycle(i)
         call_stack = [(i, None, None)]
-        while len(call_stack) > 0:
+        while call_stack:
             i, dependents_iterator, j = call_stack.pop()
-            if dependents_iterator is None: # first time getting here for this i
+            if dependents_iterator is None:
                 initialize_strong_connect(i)
-                dependents_iterator = iter(np.where(np.equal(tree, i))[0])
-            else: # been here before. j was the dependent we were just considering
+                dependents_arr = np.where(tree == i)[0]
+                n_dependents = len(dependents_arr)
+                # Faster than building and using iterator repeatedly
+                dependents_iterator = (dependents_arr, 0)
+            else:
+                dependents_arr, idx = dependents_iterator
                 lowlinks[i] = min(lowlinks[i], lowlinks[j])
-            for j in dependents_iterator:
+
+            dependents_arr, idx = dependents_iterator
+
+            while idx < len(dependents_arr):
+                j = dependents_arr[idx]
+                idx += 1
                 if indices[j] == -1:
-                    # have to remember where we were...
-                    # put the current iterator & its state on the "call stack"
-                    # we will come back to it later
-                    call_stack.append((i, dependents_iterator, j))
-                    # also, this is what we do next...
+                    # push current iterator & state to call_stack
+                    call_stack.append((i, (dependents_arr, idx), j))
                     call_stack.append((j, None, None))
-                    # this will break this iterator for now
-                    # the next time through, we will continue progressing this iterator
                     break
                 elif onstack[j]:
                     lowlinks[i] = min(lowlinks[i], indices[j])
             else:
-                # this is an intended use of for/else
-                # please stop filing git issues on obscure language features
-                # we finished iterating without a break
-                # and can finally resolve any possible cycles
+                # If completed, maybe cycle
                 maybe_pop_cycle(i)
-            # at this point, there are two cases:
-            #
-            # we iterated all the way through an iterator (the else in the for/else)
-            # and have resolved any possible cycles. can then proceed to the previous
-            # iterator we were considering (or finish, if there are no others)
-            # OR
-            # we have hit a break in the iteration over the dependents
-            # for a node
-            # and we need to dig deeper into the graph and resolve the dependent's dependents
-            # before we can continue the previous node
-            #
-            # either way, we check to see if there are unfinished subtrees
-            # when that is finally done, we can return
+            # If break, will revisit as above

     #-------------------------------------------------------------
     for i in range(len(tree)):
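Note: the frames on call_stack stand in for recursive calls; each frame remembers how far through a node's dependents it got, so popping it resumes exactly where it left off. A minimal standalone sketch of that save-and-resume pattern on the same tree-as-head-array representation (the function name and the plain depth-first walk are illustrative only, not the Tarjan logic above):

import numpy as np

def iterative_dfs_sketch(tree, start):
    """Toy depth-first walk over a head array, showing the explicit-stack trick.

    tree[d] is the head of d, so the dependents of i are np.where(tree == i)[0].
    Illustrative only; the real strong_connect also tracks indices/lowlinks.
    """
    visited = {start}
    finish_order = []
    # each frame: (node, its dependents, index of the next dependent to try)
    call_stack = [(start, np.where(tree == start)[0], 0)]
    while call_stack:
        node, deps, pos = call_stack.pop()
        descended = False
        while pos < len(deps):
            child = deps[pos]
            pos += 1
            if child not in visited:
                visited.add(child)
                # remember where we were in this node's dependents...
                call_stack.append((node, deps, pos))
                # ...and handle the child first, exactly like a recursive call would
                call_stack.append((child, np.where(tree == child)[0], 0))
                descended = True
                break
        if not descended:
            # all dependents handled: this is the point where maybe_pop_cycle(i) fires above
            finish_order.append(node)
    return finish_order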
@@ -126,38 +97,27 @@ def process_cycle(tree, cycle, scores):
"""
Build a subproblem with one cycle broken
"""
# indices of cycle in original tree; (c) in t
cycle_locs = np.where(cycle)[0]
# heads of cycle in original tree; (c) in t
cycle_subtree = tree[cycle]
# scores of cycle in original tree; (c) in R
cycle_scores = scores[cycle, cycle_subtree]
# total score of cycle; () in R
cycle_score = cycle_scores.sum()
cycle_score = np.sum(cycle_scores)

# locations of noncycle; (t) in [0,1]
noncycle = np.logical_not(cycle)
# indices of noncycle in original tree; (n) in t
noncycle_locs = np.where(noncycle)[0]
#print(cycle_locs, noncycle_locs)

# scores of cycle's potential heads; (c x n) - (c) + () -> (n x c) in R
metanode_head_scores = scores[cycle][:,noncycle] - cycle_scores[:,None] + cycle_score
# scores of cycle's potential dependents; (n x c) in R
metanode_dep_scores = scores[noncycle][:,cycle]
# best noncycle head for each cycle dependent; (n) in c
# Reduce allocations: np.ix_ is faster than double-index for repeated use
metanode_head_scores = scores[np.ix_(cycle, noncycle)] - cycle_scores[:, None] + cycle_score
metanode_dep_scores = scores[np.ix_(noncycle, cycle)]

metanode_heads = np.argmax(metanode_head_scores, axis=0)
# best cycle head for each noncycle dependent; (n) in c
metanode_deps = np.argmax(metanode_dep_scores, axis=1)

# scores of noncycle graph; (n x n) in R
subscores = scores[noncycle][:,noncycle]
# pad to contracted graph; (n+1 x n+1) in R
subscores = np.pad(subscores, ( (0,1) , (0,1) ), 'constant')
# set the contracted graph scores of cycle's potential heads; (c x n)[:, (n) in n] in R -> (n) in R
subscores = scores[np.ix_(noncycle, noncycle)]
subscores = np.pad(subscores, ((0, 1), (0, 1)), 'constant')

subscores[-1, :-1] = metanode_head_scores[metanode_heads, np.arange(len(noncycle_locs))]
# set the contracted graph scores of cycle's potential dependents; (n x c)[(n) in n] in R-> (n) in R
subscores[:-1,-1] = metanode_dep_scores[np.arange(len(noncycle_locs)), metanode_deps]
subscores[:-1, -1] = metanode_dep_scores[np.arange(len(noncycle_locs)), metanode_deps]

return subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps


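Note: the np.ix_ rewrite above selects the wanted rows and columns in one indexing operation instead of chaining two fancy-indexing passes, and the result is identical. A quick equivalence check with a made-up scores matrix (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
scores = rng.normal(size=(6, 6))
cycle = np.array([False, True, True, False, False, True])
noncycle = ~cycle

# chained fancy indexing: copies the selected rows first, then selects columns
a = scores[cycle][:, noncycle]
# np.ix_ builds an open mesh, so rows and columns are picked in a single pass
b = scores[np.ix_(cycle, noncycle)]
assert np.array_equal(a, b)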
@@ -166,42 +126,31 @@ def expand_contracted_tree(tree, contracted_tree, cycle_locs, noncycle_locs, met
     Given a partially solved tree with a cycle and a solved subproblem
     for the cycle, build a larger solution without the cycle
     """
-    # head of the cycle; () in n
-    #print(contracted_tree)
     cycle_head = contracted_tree[-1]
-    # fixed tree: (n) in n+1
     contracted_tree = contracted_tree[:-1]
-    # initialize new tree; (t) in 0
+
     new_tree = -np.ones_like(tree)
-    #print(0, new_tree)
-    # fixed tree with no heads coming from the cycle: (n) in [0,1]
     contracted_subtree = contracted_tree < len(contracted_tree)
-    # add the nodes to the new tree (t)[(n)[(n) in [0,1]] in t] in t = (n)[(n)[(n) in [0,1]] in n] in t
+
+    # Assign: non-cycle node heads from non-cycle parents
     new_tree[noncycle_locs[contracted_subtree]] = noncycle_locs[contracted_tree[contracted_subtree]]
-    #print(1, new_tree)
-    # fixed tree with heads coming from the cycle: (n) in [0,1]
-    contracted_subtree = np.logical_not(contracted_subtree)
-    # add the nodes to the tree (t)[(n)[(n) in [0,1]] in t] in t = (c)[(n)[(n) in [0,1]] in c] in t
-    new_tree[noncycle_locs[contracted_subtree]] = cycle_locs[metanode_deps[contracted_subtree]]
-    #print(2, new_tree)
-    # add the old cycle to the tree; (t)[(c) in t] in t = (t)[(c) in t] in t
+    # Assign: non-cycle node heads from cycle parents
+    contracted_not_subtree = ~contracted_subtree
+    new_tree[noncycle_locs[contracted_not_subtree]] = cycle_locs[metanode_deps[contracted_not_subtree]]
+    # Cycle part: assign original cycle tree relationships
     new_tree[cycle_locs] = tree[cycle_locs]
-    #print(3, new_tree)
-    # root of the cycle; (n)[() in n] in c = () in c
+
     cycle_root = metanode_heads[cycle_head]
-    # add the root of the cycle to the new tree; (t)[(c)[() in c] in t] = (c)[() in c]
     new_tree[cycle_locs[cycle_root]] = noncycle_locs[cycle_head]
-    #print(4, new_tree)
     return new_tree

 def prepare_scores(scores):
     """
     Alter the scores matrix to avoid self loops and handle the root
     """
-    # prevent self-loops, set up the root location
     np.fill_diagonal(scores, -float('inf')) # prevent self-loops
     scores[0] = -float('inf')
-    scores[0,0] = 0
+    scores[0, 0] = 0

 def chuliu_edmonds(scores):
     subtree_stack = []
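Note: because prepare_scores masks the matrix in place, a plain row-wise argmax already respects "no token heads itself" and "node 0 is the root". A tiny worked example with a made-up 3x3 matrix (illustrative only):

import numpy as np

scores = np.array([[1.0, 5.0, 2.0],
                   [9.0, 7.0, 3.0],
                   [4.0, 8.0, 6.0]])
np.fill_diagonal(scores, -float('inf'))  # no token may head itself
scores[0] = -float('inf')                # nothing may be the root's head...
scores[0, 0] = 0                         # ...except the root itself
tree = np.argmax(scores, axis=1)
print(tree)  # [0 0 1]: node 0 points at itself, word 1 attaches to the root, word 2 to word 1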
@@ -210,32 +159,15 @@ def chuliu_edmonds(scores):
     tree = np.argmax(scores, axis=1)
     cycles = tarjan(tree)
-
-    #print(scores)
-    #print(cycles)
-
-    # recursive implementation:
-    #if cycles:
-    #    # t = len(tree); c = len(cycle); n = len(noncycle)
-    #    # cycles.pop(): locations of cycle; (t) in [0,1]
-    #    subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps = process_cycle(tree, cycles.pop(), scores)
-    #    # MST with contraction; (n+1) in n+1
-    #    contracted_tree = chuliu_edmonds(subscores)
-    #    tree = expand_contracted_tree(tree, contracted_tree, cycle_locs, noncycle_locs, metanode_heads, metanode_deps)
-    # unfortunately, while the recursion is simpler to understand, it can get too deep for python's stack limit
-    # so instead we make our own recursion, with blackjack and (you know how it goes)
-
     while cycles:
-        # t = len(tree); c = len(cycle); n = len(noncycle)
-        # cycles.pop(): locations of cycle; (t) in [0,1]
         subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps = process_cycle(tree, cycles.pop(), scores)
         subtree_stack.append((tree, cycles, scores, subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps))

         scores = subscores
         prepare_scores(scores)
         tree = np.argmax(scores, axis=1)
         cycles = tarjan(tree)

-    while len(subtree_stack) > 0:
+    while subtree_stack:
         contracted_tree = tree
         (tree, cycles, scores, subscores, cycle_locs, noncycle_locs, metanode_heads, metanode_deps) = subtree_stack.pop()
         tree = expand_contracted_tree(tree, contracted_tree, cycle_locs, noncycle_locs, metanode_heads, metanode_deps)
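Note: an end-to-end sketch of calling this module on a small made-up scores matrix, where scores[dep, head] holds attachment scores and index 0 is reserved for the artificial root. The import path is assumed from the file location above, and the printed result assumes chuliu_edmonds returns the head vector as in the stanza source:

import numpy as np
# Assumed import path; in the stanza repo this module lives at
# stanza/models/common/chuliu_edmonds.py.
from stanza.models.common.chuliu_edmonds import chuliu_edmonds

# Made-up scores for a 3-word sentence plus the root at index 0.
scores = np.array([
    [0.0, 0.0, 0.0, 0.0],
    [9.0, 0.0, 6.0, 3.0],
    [2.0, 8.0, 0.0, 5.0],
    [1.0, 2.0, 7.0, 0.0],
])
tree = chuliu_edmonds(scores.copy())  # prepare_scores mutates its argument in place
print(tree)  # expected [0 0 1 2] here: word 1 attaches to the root, 2 to 1, 3 to 2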