Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extended balance_root to handle root underflow case #621

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
215 changes: 144 additions & 71 deletions core/storage/btree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -903,7 +903,7 @@ impl BTreeCursor {
}

if !self.stack.has_parent() {
self.balance_root();
self.balance_root()?;
return Ok(CursorResult::Ok(()));
}
debug!("Balancing leaf. leaf={}", current_page.get().id);
Expand Down Expand Up @@ -989,7 +989,7 @@ impl BTreeCursor {
let (page_type, current_idx) = {
let current_page = self.stack.top();
let contents = current_page.get().contents.as_ref().unwrap();
(contents.page_type().clone(), current_page.get().id)
(contents.page_type(), current_page.get().id)
};

parent.set_dirty();
Expand All @@ -1005,8 +1005,8 @@ impl BTreeCursor {
.cell_get(
cell_idx,
self.pager.clone(),
self.payload_overflow_threshold_max(page_type.clone()),
self.payload_overflow_threshold_min(page_type.clone()),
self.payload_overflow_threshold_max(page_type),
self.payload_overflow_threshold_min(page_type),
self.usable_space(),
)
.unwrap();
Expand All @@ -1019,8 +1019,8 @@ impl BTreeCursor {
if found {
let (start, _len) = parent_contents.cell_get_raw_region(
cell_idx,
self.payload_overflow_threshold_max(page_type.clone()),
self.payload_overflow_threshold_min(page_type.clone()),
self.payload_overflow_threshold_max(page_type),
self.payload_overflow_threshold_min(page_type),
self.usable_space(),
);
right_pointer = start;
Expand Down Expand Up @@ -1191,82 +1191,154 @@ impl BTreeCursor {
}
}

/// Balance the root page.
/// This is done when the root page overflows, and we need to create a new root page.
/// See e.g. https://en.wikipedia.org/wiki/B-tree
fn balance_root(&mut self) {
/* todo: balance deeper, create child and copy contents of root there. Then split root */
/* if we are in root page then we just need to create a new root and push key there */
/// Returns `true` when `page.overflow_cells` is non-empty.
///
/// Intended to be used only from within `balance_root`.
fn is_overflow(&self, page: &PageContent) -> bool {
    let pending_cells = &page.overflow_cells;
    !pending_cells.is_empty()
}

let is_page_1 = {
let current_root = self.stack.top();
current_root.get().id == 1
/// Root-only underflow check; intended to be used only from within `balance_root`.
///
/// Returns `true` only when the page on top of the stack is the root
/// (`self.root_page`) and `page` has zero cells while not being a leaf,
/// i.e. an empty interior root. Any non-root page yields `false`.
fn is_underflow(&self, page: &PageContent) -> bool {
    let top_is_root = self.stack.top().get().id == self.root_page;
    top_is_root && page.cell_count() == 0 && !page.is_leaf()
}

/// Copies the b-tree page content of `src` into `dst`.
///
/// Two regions are copied: the cell content area (same byte offsets in both buffers) and the
/// page header plus cell pointer array (read at `src_contents.offset`, written at
/// `dst_header_offset`). Returns `Ok(())` on the visible path; panics if either page has no
/// `contents` (the `unwrap` calls below).
fn copy_node_content(&self, src: PageRef, dst: PageRef) -> Result<()> {
let src_contents = src.get().contents.as_ref().unwrap();
let dst_contents = dst.get().contents.as_mut().unwrap();

let src_buf = src_contents.as_ptr();
let dst_buf = dst_contents.as_ptr();

// A destination with page id 1 gets its header written after the first DATABASE_HEADER_SIZE
// bytes (presumably the database file header lives there — confirm).
let dst_header_offset = if dst.get().id == 1 {
DATABASE_HEADER_SIZE
} else {
0
};

// NOTE(review): the next three lines use `is_page_1`, which is not defined in this function,
// and open a `{` with no visible partner; they look like removed-side lines of the diff view
// (old `balance_root`). Verify against the real file.
let offset = if is_page_1 { DATABASE_HEADER_SIZE } else { 0 };
let new_root_page = self.allocate_page(PageType::TableInterior, offset);
{
// Copy content area
// The range runs from the source's cell content area start up to `usable_space()`, and the
// same offsets are used in the destination buffer.
let content_start = src_contents.cell_content_area() as usize;
let content_size = self.usable_space() - content_start;
dst_buf[content_start..content_start + content_size]
.copy_from_slice(&src_buf[content_start..content_start + content_size]);

// Copy header and cell pointer array
// Source is read starting at `src_contents.offset`; destination starts at `dst_header_offset`.
let header_and_pointers_size =
src_contents.header_size() + src_contents.cell_pointer_array_size();
dst_buf[dst_header_offset..dst_header_offset + header_and_pointers_size].copy_from_slice(
&src_buf[src_contents.offset..src_contents.offset + header_and_pointers_size],
);

Ok(())
}

// ** Resets a page's key header fields and clears its overflow cells.
// ** In SQLite the corresponding function is called from balance_nonroot(), balance_deeper()
// (root overflow balancing), newDatabase(), btreeCreateTable() and clearDatabase().
// ** After calling zero_page, the caller must still set the flag byte
// (PAGE_HEADER_OFFSET_PAGE_TYPE) correctly; refer to the SQLite source for its usage.
fn zero_page(&self, page: &PageRef) {
// Panics if the page has no `contents` loaded.
let contents = page.get().contents.as_mut().unwrap();

// Written as a u32 rather than a u16; per the review note below, this mirrors SQLite's
// 4-byte memset at this location.
contents.write_u32(PAGE_HEADER_OFFSET_FIRST_FREEBLOCK, 0);
// NOTE(review): the lines from here down to the SQLite link are a pasted PR review comment
// from the diff view, not code.
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pereman2 I've written write_u32 here instead of write_u16 because SQLite does memset to 4 bytes: https://github.com/sqlite/sqlite/blob/c2e400af042bdd7d21e159a41fcf34c05398044c/src/btree.c#L2277

contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
// An empty page's cell content area starts at the end of the usable space.
// NOTE(review): `as u16` truncates if `usable_space()` exceeds u16::MAX — confirm this is intended.
contents.write_u16(
PAGE_HEADER_OFFSET_CELL_CONTENT_AREA,
self.usable_space() as u16,
);
contents.write_u8(PAGE_HEADER_OFFSET_FRAGMENTED_BYTES_COUNT, 0);
contents.overflow_cells.clear();
}

/// Rebalances the root page for the overflow and underflow cases.
///
/// Overflow (`is_overflow`): allocates a child page of the root's original type, copies the
/// root's content and overflow cells into it, zeroes the root, marks the root as
/// `TableInterior`, and points the root's rightmost pointer at the child. The page stack is
/// then reset to `[root, child]`.
///
/// Underflow (`is_underflow`): reads the root's rightmost child, defragments it, and copies its
/// content into the root when the root has enough free space; otherwise the root is left empty.
///
/// Any other state hits `unreachable!`.
// NOTE(review): this listing comes from a diff view without +/- markers, so lines of the previous
// implementation (the `new_root_page` / `std::mem::swap` / `put_loaded_page` logic) appear
// interleaved with the new code. Verify against the real file before relying on it.
fn balance_root(&mut self) -> Result<CursorResult<()>> {
let current_root = self.stack.top();
let contents = current_root.get().contents.as_mut().unwrap();
let is_page_1 = current_root.get().id == 1;

if self.is_overflow(contents) {
let current_root = self.stack.top();
let contents = current_root.get().contents.as_mut().unwrap();
assert!(!contents.overflow_cells.is_empty());

// Remember the root's page type before it is rewritten below; the child inherits it.
let original_type = contents.page_type();
current_root.set_dirty();

let child_page = self.allocate_page(original_type, 0);
let child_page_id = child_page.get().id;

// Copy all root content to child
self.copy_node_content(current_root.clone(), child_page.clone())?;

// Copy overflow cells to child from root page
let child_contents = child_page.get().contents.as_mut().unwrap();
let current_root_contents = current_root.get().contents.as_ref().unwrap();
child_contents.overflow_cells = current_root_contents.overflow_cells.clone();

// zero out root page to make it empty.
self.zero_page(&current_root);

// Set root page_type as TableInterior
// Initially root page is of type TableLeaf and as the btree grows deeper the
// root page changes its type to TableInterior since it now points to a child page.
contents.write_u8(PAGE_HEADER_OFFSET_PAGE_TYPE, PageType::TableInterior as u8);

// Set child as root rightmost pointer
contents.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, child_page_id as u32);

let new_root_page_id = new_root_page.get().id;
let new_root_page_contents = new_root_page.get().contents.as_mut().unwrap();
if is_page_1 {
// Copy header
let current_root_buf = current_root_contents.as_ptr();
let new_root_buf = new_root_page_contents.as_ptr();
new_root_buf[0..DATABASE_HEADER_SIZE]
.copy_from_slice(&current_root_buf[0..DATABASE_HEADER_SIZE]);
// Update pager's tracking
self.pager.add_dirty(child_page_id);

// Update page stack to include both root and child
self.stack.clear();
self.stack.push(current_root.clone());
self.stack.push(child_page.clone());

Ok(CursorResult::Ok(()))
} else if self.is_underflow(contents) {
let child_page_id = contents.rightmost_pointer().unwrap();
let child_page = self.pager.read_page(child_page_id as usize)?;

// If the child is not loaded yet, request the load and return `CursorResult::IO`.
return_if_locked!(child_page);
if !child_page.is_loaded() {
self.pager.load_page(child_page.clone())?;
return Ok(CursorResult::IO);
}
// point new root right child to previous root
new_root_page_contents
.write_u32(PAGE_HEADER_OFFSET_RIGHTMOST_PTR, new_root_page_id as u32);
new_root_page_contents.write_u16(PAGE_HEADER_OFFSET_CELL_COUNT, 0);
}

/* swap splitted page buffer with new root buffer so we don't have to update page idx */
{
let (root_id, child_id, child) = {
let page_ref = self.stack.top();
let child = page_ref.clone();

// Swap the entire Page structs
std::mem::swap(&mut child.get().id, &mut new_root_page.get().id);
// TODO:: shift bytes by offset to left on child because now child has offset 100
// and header bytes
// Also change the offset of page
//
if is_page_1 {
// Remove header from child and set offset to 0
let contents = child.get().contents.as_mut().unwrap();
let (cell_pointer_offset, _) = contents.cell_pointer_array_offset_and_size();
// change cell pointers
for cell_idx in 0..contents.cell_count() {
let cell_pointer_offset = cell_pointer_offset + (2 * cell_idx) - offset;
let pc = contents.read_u16(cell_pointer_offset);
contents.write_u16(cell_pointer_offset, pc - offset as u16);
}
let child_contents = child_page.get().contents.as_mut().unwrap();

contents.offset = 0;
let buf = contents.as_ptr();
buf.copy_within(DATABASE_HEADER_SIZE.., 0);
}
// Defragment child before inserting its cells into root, because root is smaller than child due to it
// containing header.
self.defragment_page(child_contents, self.database_header.borrow());

self.pager.add_dirty(new_root_page.get().id);
self.pager.add_dirty(child.get().id);
(new_root_page.get().id, child.get().id, child)
let root_free_space =
self.compute_free_space(contents, self.database_header.borrow()) as usize;
// NOTE(review): unsigned subtraction — this panics in debug builds (wraps in release) if
// `root_free_space` is smaller than DATABASE_HEADER_SIZE; confirm whether
// `compute_free_space` already accounts for the page-1 header.
let available_space = if is_page_1 {
root_free_space - DATABASE_HEADER_SIZE
} else {
root_free_space
};

debug!("Balancing root. root={}, rightmost={}", root_id, child_id);
let root = new_root_page.clone();
// If root can't consume child, leave it empty temporarily and handle it in balance_nonroot step
let child_free_space =
self.compute_free_space(child_contents, self.database_header.borrow()) as usize;
let child_used_space = self.usable_space() - child_free_space;

self.root_page = root_id;
self.stack.clear();
self.stack.push(root.clone());
self.stack.push(child.clone());
// If root doesn't have enough free space for child's content, leave it empty for balance_leaf
if available_space < child_used_space {
return Ok(CursorResult::Ok(()));
}

// The child's content fits: pull it up into the root page.
self.copy_node_content(child_page.clone(), current_root.clone())?;

self.pager.put_loaded_page(root_id, root);
self.pager.put_loaded_page(child_id, child);
// TODO: free the child page by adding it to the free list. Functionality has yet to be added to pager.rs first.

Ok(CursorResult::Ok(()))
} else {
unreachable!("balance_root was called where we didn't have any overflow or underflow");
}
}

Expand Down Expand Up @@ -1529,7 +1601,7 @@ impl BTreeCursor {
write_varint_to_vec(record_buf.len() as u64, cell_payload);
}

let payload_overflow_threshold_max = self.payload_overflow_threshold_max(page_type.clone());
let payload_overflow_threshold_max = self.payload_overflow_threshold_max(page_type);
log::debug!(
"fill_cell_payload(record_size={}, payload_overflow_threshold_max={})",
record_buf.len(),
Expand Down Expand Up @@ -1645,13 +1717,14 @@ impl BTreeCursor {
fn find_cell(&self, page: &PageContent, int_key: u64) -> usize {
let mut cell_idx = 0;
let cell_count = page.cell_count();
let page_type = page.page_type();
while cell_idx < cell_count {
match page
.cell_get(
cell_idx,
self.pager.clone(),
self.payload_overflow_threshold_max(page.page_type()),
self.payload_overflow_threshold_min(page.page_type()),
self.payload_overflow_threshold_max(page_type),
self.payload_overflow_threshold_min(page_type),
self.usable_space(),
)
.unwrap()
Expand Down
2 changes: 1 addition & 1 deletion core/storage/sqlite3_ondisk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ fn write_header_to_buf(buf: &mut [u8], header: &DatabaseHeader) {
}

#[repr(u8)]
#[derive(Debug, PartialEq, Clone)]
#[derive(Debug, PartialEq, Clone, Copy)]
pub enum PageType {
IndexInterior = 2,
TableInterior = 5,
Expand Down