Part 9 - Binary Search and Duplicate Keys

Part 8 - B-Tree Leaf Node Format

Part 10 - Splitting a Leaf Node

Last time we noted that we’re still storing keys in unsorted order. We’re going to fix that problem, plus detect and reject duplicate keys.

Right now, our execute_insert() function always chooses to insert at the end of the table. Instead, we should search the table for the correct place to insert, then insert there. If the key already exists there, return an error.

  1. ExecuteResult execute_insert(Statement* statement, Table* table) {
  2. void* node = get_page(table->pager, table->root_page_num);
  3. - if ((*leaf_node_num_cells(node) >= LEAF_NODE_MAX_CELLS)) {
  4. + uint32_t num_cells = (*leaf_node_num_cells(node));
  5. + if (num_cells >= LEAF_NODE_MAX_CELLS) {
  6. return EXECUTE_TABLE_FULL;
  7. }
  8. Row* row_to_insert = &(statement->row_to_insert);
  9. - Cursor* cursor = table_end(table);
  10. + uint32_t key_to_insert = row_to_insert->id;
  11. + Cursor* cursor = table_find(table, key_to_insert);
  12. +
  13. + if (cursor->cell_num < num_cells) {
  14. + uint32_t key_at_index = *leaf_node_key(node, cursor->cell_num);
  15. + if (key_at_index == key_to_insert) {
  16. + return EXECUTE_DUPLICATE_KEY;
  17. + }
  18. + }
  19. leaf_node_insert(cursor, row_to_insert->id, row_to_insert);

We don’t need the table_end() function anymore.

  1. -Cursor* table_end(Table* table) {
  2. - Cursor* cursor = malloc(sizeof(Cursor));
  3. - cursor->table = table;
  4. - cursor->page_num = table->root_page_num;
  5. -
  6. - void* root_node = get_page(table->pager, table->root_page_num);
  7. - uint32_t num_cells = *leaf_node_num_cells(root_node);
  8. - cursor->cell_num = num_cells;
  9. - cursor->end_of_table = true;
  10. -
  11. - return cursor;
  12. -}

We’ll replace it with a method that searches the tree for a given key.

  1. +/*
  2. +Return the position of the given key.
  3. +If the key is not present, return the position
  4. +where it should be inserted
  5. +*/
  6. +Cursor* table_find(Table* table, uint32_t key) {
  7. + uint32_t root_page_num = table->root_page_num;
  8. + void* root_node = get_page(table->pager, root_page_num);
  9. +
  10. + if (get_node_type(root_node) == NODE_LEAF) {
  11. + return leaf_node_find(table, root_page_num, key);
  12. + } else {
  13. + printf("Need to implement searching an internal node\n");
  14. + exit(EXIT_FAILURE);
  15. + }
  16. +}

I’m stubbing out the branch for internal nodes because we haven’t implemented internal nodes yet. We can search the leaf node with binary search.

  1. +Cursor* leaf_node_find(Table* table, uint32_t page_num, uint32_t key) {
  2. + void* node = get_page(table->pager, page_num);
  3. + uint32_t num_cells = *leaf_node_num_cells(node);
  4. +
  5. + Cursor* cursor = malloc(sizeof(Cursor));
  6. + cursor->table = table;
  7. + cursor->page_num = page_num;
  8. +
  9. + // Binary search
  10. + uint32_t min_index = 0;
  11. + uint32_t one_past_max_index = num_cells;
  12. + while (one_past_max_index != min_index) {
  13. + uint32_t index = (min_index + one_past_max_index) / 2;
  14. + uint32_t key_at_index = *leaf_node_key(node, index);
  15. + if (key == key_at_index) {
  16. + cursor->cell_num = index;
  17. + return cursor;
  18. + }
  19. + if (key < key_at_index) {
  20. + one_past_max_index = index;
  21. + } else {
  22. + min_index = index + 1;
  23. + }
  24. + }
  25. +
  26. + cursor->cell_num = min_index;
  27. + return cursor;
  28. +}

This will either return

  • the position of the key,
  • the position of another key that we’ll need to move if we want to insert the new key, or
  • the position one past the last key
    Since we’re now checking node type, we need functions to get and set that value in a node.
  1. +NodeType get_node_type(void* node) {
  2. + uint8_t value = *((uint8_t*)(node + NODE_TYPE_OFFSET));
  3. + return (NodeType)value;
  4. +}
  5. +
  6. +void set_node_type(void* node, NodeType type) {
  7. + uint8_t value = type;
  8. + *((uint8_t*)(node + NODE_TYPE_OFFSET)) = value;
  9. +}

We have to cast to uint8_t first to ensure it’s serialized as a single byte.

We also need to initialize node type.

  1. -void initialize_leaf_node(void* node) { *leaf_node_num_cells(node) = 0; }
  2. +void initialize_leaf_node(void* node) {
  3. + set_node_type(node, NODE_LEAF);
  4. + *leaf_node_num_cells(node) = 0;
  5. +}

Lastly, we need to make and handle a new error code.

  1. -enum ExecuteResult_t { EXECUTE_SUCCESS, EXECUTE_TABLE_FULL };
  2. +enum ExecuteResult_t {
  3. + EXECUTE_SUCCESS,
  4. + EXECUTE_DUPLICATE_KEY,
  5. + EXECUTE_TABLE_FULL
  6. +};
  1. case (EXECUTE_SUCCESS):
  2. printf("Executed.\n");
  3. break;
  4. + case (EXECUTE_DUPLICATE_KEY):
  5. + printf("Error: Duplicate key.\n");
  6. + break;
  7. case (EXECUTE_TABLE_FULL):
  8. printf("Error: Table full.\n");
  9. break;

With these changes, our test can change to check for sorted order:

  1. "db > Executed.",
  2. "db > Tree:",
  3. "leaf (size 3)",
  4. - " - 0 : 3",
  5. - " - 1 : 1",
  6. - " - 2 : 2",
  7. + " - 0 : 1",
  8. + " - 1 : 2",
  9. + " - 2 : 3",
  10. "db > "
  11. ])
  12. end

And we can add a new test for duplicate keys:

  1. + it 'prints an error message if there is a duplicate id' do
  2. + script = [
  3. + "insert 1 user1 person1@example.com",
  4. + "insert 1 user1 person1@example.com",
  5. + "select",
  6. + ".exit",
  7. + ]
  8. + result = run_script(script)
  9. + expect(result).to match_array([
  10. + "db > Executed.",
  11. + "db > Error: Duplicate key.",
  12. + "db > (1, user1, person1@example.com)",
  13. + "Executed.",
  14. + "db > ",
  15. + ])
  16. + end

That’s it! Next up: implement splitting leaf nodes and creating internal nodes.

Part 8 - B-Tree Leaf Node Format

Part 10 - Splitting a Leaf Node

原文: https://cstack.github.io/db_tutorial/parts/part9.html