I'm implementing a B-tree insert function.
But it gets into problems. For inserting 50 elements and below it seems to work fine, but inserting 100 elements yields the error:
vector subscript out of range
Here is the code and the debug outputs that I tried to implement in my program:
#include <iostream>
#include <vector>
#include <algorithm>
#include <queue>
#include <stack>
#include <ctime>
using namespace std;
struct Node {
vector<int>key;
vector<Node*>children;
Node* parent = NULL;
int index = 0;
bool is_leaf()
{
if (children.size() == 0)
{
return true;
}
return false;
}
Node(const vector<int>©vect)
{
key = copyvect;
}
Node(int data)
{
key.push_back(data);
}
Node()
{
}
};
int binarySearch(vector<int>&arr, int x)
{
int l = 0;
int r = arr.size() - 1;
if (arr.size() == 0)
{
return -1;
}
int mid = 0;
while (l <= r)
{
mid = (l + r) / 2;
if (arr[mid] < x)
{
l = mid + 1;
}
else if (arr[mid] > x)
{
r = mid - 1;
}
else {
return mid;
}
}
return mid;
}
Node* insert_node(Node* root, int key, int order)
{
int minkey = order / 2 - 1;
int maxkey = order - 1;
int maxchild = order;
if (root->key.size() == 0)
{
root->key.push_back(key);
return root;
}
else
{
Node* p = root;
int searchindex = 0;
while (!p->is_leaf())
{
searchindex = binarySearch(p->key, key);
cout << "searchindex: " << searchindex << endl;
if (p->key[searchindex] < key)
{
p->children[searchindex + 1]->index = searchindex + 1;
p = p->children[searchindex + 1];
}
else
{
p->children[searchindex]->index = searchindex;
p = p->children[searchindex];
}
}
int searchindex2 = binarySearch(p->key, key);
cout << "searchindex2: " << searchindex2 << endl;
if (p->key[searchindex2] < key)
{
p->key.insert(p->key.begin() + searchindex2 + 1, key);
}
else
{
p->key.insert(p->key.begin() + searchindex2, key);
}
while (p != NULL)
{
if (p->key.size() == maxkey)
{
int medianIndex = p->key.size() / 2;
int mediankey = p->key[medianIndex];
Node* newNode = new Node;
newNode->parent = p->parent;
newNode->key.assign(p->key.begin(), p->key.begin() + medianIndex);
cout << "newnode key size: " << newNode->key.size() << endl;
p->key.erase(p->key.begin(), p->key.begin() + medianIndex + 1);
cout << "p->key size after erase: " << p->key.size() << endl;
if (!p->is_leaf())
{
newNode->children.assign(p->children.begin(), p->children.begin() + p->children.size() / 2 + 1);
cout << "newNode children size: " << newNode->children.size() << endl;
p->children.erase(p->children.begin(), p->children.begin() + p->children.size() / 2 + 1);
cout << "p children size after erase: " << p->children.size() << endl;
}
if (p->parent != NULL)
{
cout << "index is: " << p->index << endl;
p->parent->children.insert(p->parent->children.begin() + p->index, newNode);
cout << "p->parent->children size: " << p->parent->children.size() << endl;
p->parent->key.insert(p->parent->key.begin() + p->index, mediankey);
cout << "p->parent->key size: " << p->parent->key.size() << endl;
}
else
{
Node* newParent = new Node(mediankey);
newParent->children.push_back(newNode);
newParent->children.push_back(p);
newNode->parent = newParent;
p->parent = newParent;
return newParent;
}
}
if (p->parent == NULL)
{
return p;
}
p = p->parent;
}
return root;
}
}
void levelorder(Node* root)
{
if (root == nullptr)
return;
queue<Node*> q;
q.push(root);
while (!q.empty())
{
int countNode = q.size();
while (countNode > 0)
{
Node* current = q.front();
q.pop();
for (int i = 0; i < current->key.size(); i++)
{
cout << current->key[i] << " ";
if (!current->is_leaf())
{
q.push(current->children[i]);
if (i == current->key.size() - 1)
{
q.push(current->children.back());
}
}
}
cout << " ";
countNode--;
}
cout << '\n';
}
}
int main()
{
srand(time(NULL));
int n;
cin >> n;
Node* root = new Node;
for (int i = 0; i < n; i++)
{
root = insert_node(root, rand(), 6);
}
cout << '\n';
levelorder(root);
}
I tried debugging it by printing all the things that could potentially cause this error. But upon examining it, I didn't know why it is acting like that.
Here is the output:
searchindex2: 0
searchindex2: 1
searchindex2: 2
searchindex2: 3
newnode key size: 2
p->key size after erase: 2
searchindex: 0
searchindex2: 1
searchindex: 0
searchindex2: 2
searchindex: 0
searchindex2: 0
newnode key size: 2
p->key size after erase: 2
index is: 1
p->parent->children size: 3
p->parent->key size: 2
searchindex: 1
searchindex2: 0
searchindex: 1
searchindex2: 1
searchindex: 1
searchindex2: 2
searchindex: 1
searchindex2: 2
searchindex: 1
searchindex2: 2
newnode key size: 2
p->key size after erase: 2
index is: 2
p->parent->children size: 4
p->parent->key size: 3
searchindex: 0
searchindex2: 3
newnode key size: 2
p->key size after erase: 2
index is: 1
p->parent->children size: 5
p->parent->key size: 4
searchindex: 3
searchindex2: 1
searchindex: 3
searchindex2: 1
searchindex: 0
searchindex2: 0
searchindex: 3
searchindex2: 2
searchindex: 2
searchindex2: 0
searchindex: 0
searchindex2: 1
searchindex: 3
searchindex2: 3
newnode key size: 2
p->key size after erase: 2
index is: 3
p->parent->children size: 6
p->parent->key size: 5
newnode key size: 2
p->key size after erase: 2
newNode children size: 4
p children size after erase: 2
searchindex: 0
searchindex: 1
What am I doing wrong?
There are several issues, but before getting into them, I would like to make some suggestions to debug more efficiently:
To make the debugging output more "visual", I would write a function that can print the tree in a way that indicates which keys appear in which nodes and which nodes are children of which other nodes. The output generated by levelOrder
leaves some doubt about those aspects as soon as you get a tree with three or more layers. You get easily lost in that output. Yet you would need to see what the structure is. An easy output format is printing the tree 90° rotated, so that the root node appears at the left (column), its children at the right of it, ...etc. For instance, imagine the following output:
98
96
95
94
89
86
77
76
75
74
73
72
71
69
67
66
64
63
56
55
50
42
41
40
24
20
19
18
17
15
So here the root node has two keys: 50 and 73. It has three children. The first child has the keys 19 and 40. The second child has keys 63 and 67, and the last child of the root has keys 77 and 95. You get the idea...
Here is a function for that:
// Helper function that gets a second argument that helps indenting the output
void printTree(Node *current, string tab) {
for (int i = current->key.size(); i >=0; i--) {
if (!current->is_leaf()) {
printTree(current->children[i], tab + " ");
}
if (i == 0) break;
cout << tab << current->key[i-1] << "\n";
}
}
void printTree(Node *root) {
cout << "TREE:\n";
printTree(root, "");
cout << "\n";
}
Secondly, you'll want to check the consistency of your tree after every insertion. You could check things like:
If you call such a function at a moment that the tree should be in a consistent state, you'll detect errors early.
Here is a function you could use for that purpose:
#include <climits>
// Helper function that gets the window of allowed keys and expected parent
void verifyTree(Node *current, int low, int high, Node *parent) {
// Verify parent back-reference
if (current->parent != parent) {
cout << "parent inconsistency at " << current->key[0] << " and its parent\n";
exit(1);
}
bool isLeaf = current->is_leaf();
int numKeys = current->key.size();
// Verify this node has the expected number of children
int expectedChildren = isLeaf ? 0 : numKeys + 1;
if (expectedChildren != current->children.size()) {
cout << "inconcisitency in number of children at parent of "
<< current->key[0] << ". Expected " << expectedChildren
<< " but got " << current->children.size() << "\n";
exit(1);
}
// Verify the order of the keys
int prevKey = low;
for (int i = 0; i < numKeys; i++) {
int key = current->key[i];
if (key < prevKey) {
cout << "wrong key order: " << prevKey << " and " << key << "\n";
exit(1);
}
if (!isLeaf) {
verifyTree(current->children[i], prevKey, key, current);
}
prevKey = key;
}
if (!isLeaf) {
verifyTree(current->children[numKeys], prevKey, high, current);
}
}
void verifyTree(Node *root) {
verifyTree(root, INT_MIN, INT_MAX, nullptr);
}
Finally, to find smaller inputs for which things go wrong, don't pass 6 as value for the order
parameter, but 4. And don't produce such large integers, but limited to two digits (using rand() % 100
).
With this configuration it soon turned out that there were inputs with only 9 values that would trigger an inconsistency, for example with these input values:
6, 3, 5, 4, 1, 8, 7, 2, 9
With the above functions it becomes much easier to pinpoint where things go wrong:
When an internal node splits, the number of children your code moves into the new node is wrong (when p->children.size()
is even), moving one child too many. To correct this bug, change these statements:
newNode->children.assign(p->children.begin(), p->children.begin() + p->children.size() / 2 + 1);
p->children.erase(p->children.begin(), p->children.begin() + p->children.size() / 2 + 1);
to:
newNode->children.assign(p->children.begin(), p->children.begin() + medianIndex + 1);
p->children.erase(p->children.begin(), p->children.begin() + medianIndex + 1);
At the same part of the code as above, you've introduced a child-parent inconsistency, because the children that are moved to newNode
, still have their previous parent as parent
reference. These should be updated. So add this piece of code just below the above mentioned code:
for (auto child : newNode->children) {
child->parent = newNode;
}
These bugfixes will be enough to make it run correctly.
Still, there are some other remarks:
binarySearch
is a bit strange, because it can return only the indexes of the given vector (0..𝑛−1), which forces you to still check whether the key at the returned index is less or not than the key you want to insert and to choose either the child at that index or the next child. It would be more useful to make binarySearch
return a value that can possibly be 𝑛, always ensuring that index can be used for choosing the child. This is easy to do: just replace the final return m;
with return l;
and then you can simplify the code you have just after the call of binarySearch
.
The variables minkey
and maxchild
are defined but never used. These could be removed.
insert_node
is a misleading name for what it does. It inserts a key, which may or may not lead to the insertion of one or more new nodes.
Your code performs a split when p->key.size() == maxkey
, but this means you are actually building a tree of order-1
. A node should only be split when p->key.size() > maxkey
.
To store the "path" of your tree search in the node's index
field is not that elegant: these indices have a temporary nature, yet they are left available in the nodes, without any use.
It is not really needed to maintain a parent
field in each node. If you implement the insertion (and deletion) algorithms using recursion, you can use the call stack to maintain these relations. This will also be a solution for keeping track of the indices that were used along the path (previous point).