I was trying to solve a programming problem that involved a lot of insertions into a list. The specifics of the problem are not relevant to the question, however.
To solve it, I ended up writing some kind of a rope data structure, trying to piece an implementation together from the scarce resources I was able to find on the internet.
I ended up writing this:
from dataclasses import dataclass
import typing as t
from hypothesis import given, strategies as st
T = t.TypeVar("T")
@dataclass
class Branch(t.Generic[T]):
weight: int
left: t.Union[t.List[T], "Branch[T]"]
right: t.Union[t.List[T], "Branch[T]"]
Rope = t.Union[t.List[T], Branch[T]]
def insert(rope: Rope, index: int, item: T) -> Rope:
if isinstance(rope, list): # leaf
if index == len(rope): # at the end, just append
rope.append(item)
return rope
# split
left, right = rope[:index], [item, *rope[index:]]
return Branch(len(left), left, right)
else: # branch
if rope.right and rope.weight <= index:
rope.right = insert(rope.right, index - rope.weight, item)
else:
rope.weight += 1
rope.left = insert(rope.left, index, item)
return rope
def straighten(rope: Rope) -> t.Iterator[T]:
if isinstance(rope, list):
yield from rope
else:
yield from straighten(rope.left)
yield from straighten(rope.right)
@st.composite
def instructions(draw, elements=st.integers(0, 255)):
items = draw(st.lists(elements, min_size=1))
result = []
for i, item in enumerate(items):
insertion_point = draw(st.integers(min_value=0, max_value=i))
result.append((insertion_point, item))
return result
@given(instructions())
def test_correctness(seq: t.List[t.Tuple[int, int]]) -> None:
manual: t.List[int] = []
for (i, item) in seq:
manual.insert(i, item)
rope: Rope = []
for (i, item) in seq:
rope = insert(rope, i, item)
assert manual == list(
straighten(rope)
), f"{manual!r} != {rope!r} => {list(straighten(rope))!r}"
if __name__ == "__main__":
test_correctness()
print("All good")
As you can see, the code passes the hypothesis and, therefore, it solves the problem.
I am unable to understand however why this code I wrote is correct, as trying to rewrite it utilizing just Branches and leaves (that is to say, without using lists) yields an incorrect implementation:
from dataclasses import dataclass
import typing as t
from hypothesis import given, strategies as st
T = t.TypeVar("T")
@dataclass
class Branch(t.Generic[T]):
weight: int
left: t.Union[T, "Branch[T]"]
right: t.Union[T, "Branch[T]"]
Rope = t.Union[T, Branch[T]]
def insert(rope: Rope, index: int, item: T) -> Rope:
if isinstance(rope, Branch): # leaf
if rope.right and rope.weight <= index:
rope.right = insert(rope.right, index - rope.weight, item)
else:
rope.weight += 1
rope.left = insert(rope.left, index, item)
return rope
if index == 0:
left, right = item, rope
else:
left, right = rope, item
return Branch(1, left, right)
def straighten(rope: Rope) -> t.Iterator[T]:
if isinstance(rope, Branch):
yield from straighten(rope.left)
yield from straighten(rope.right)
else:
yield rope
@st.composite
def instructions(draw, elements=st.integers(0, 255)):
items = draw(st.lists(elements, min_size=1))
result = []
for i, item in enumerate(items):
insertion_point = draw(st.integers(min_value=0, max_value=i))
result.append((insertion_point, item))
return result
@given(instructions())
def test_correctness(seq: t.List[t.Tuple[int, int]]) -> None:
it = iter(seq)
_, head = next(it)
rope: Rope = head
manual = [head]
for (i, item) in it:
manual.insert(i, item)
rope = insert(rope, i, item)
straight_rope: t.List[int] = list(straighten(rope))
assert manual == straight_rope, f"{manual!r} != {straight_rope!r} ({rope!r})"
if __name__ == "__main__":
test_correctness()
print("All good")
So I'd appreciate if anybody with more insight into this data structure would tell me the difference between the two implementations and why one works while the other doesn't. Thank you ina dvance.
Your code fails because in if rope.right and rope.weight <= index:
, the condition evaluates to False
when rope.right == 0
. In your second code this happens when the right node is a leaf with value 0, and will cause the value to be inserted into the wrong subtree.
There is no reason for checking rope.right
at all, so this check should simply be if rope.weight <= index:
.
There is another potential issue in both of your rope implementations: You are mutating nodes, while rope implementations normally treat nodes as immutable and create new nodes to make any changes. This doesn't affect your tests because you only have one rope instance at a time. An example where it would cause issues:
r = create_some_rope()
r2 = r
r = insert(r, idx, val)
# now r2 may or may not have changed, depending on its structure.
It will also cause issues if you ever add rope concatenation. In that case, if you concatenate a rope to itself (or to a subtree of itself), then any further operations may inadvertently affect multiple parts of the rope.