Part 3 - An In-Memory, Append-Only, Single-Table Database

Part 2 - World's Simplest SQL Compiler and Virtual Machine

Part 4 - Our First Tests (and Bugs)

We’re going to start small by putting a lot of limitations on our database. For now, it will:

  • support two operations: inserting a row and printing all rows
  • reside only in memory (no persistence to disk)
  • support a single, hard-coded table
    Our hard-coded table is going to store users and look like this:
column type
id integer
username varchar(32)
email varchar(255)

This is a simple schema, but it gets us to support multiple data types and multiple sizes of text data types.

insert statements are now going to look like this:

  1. insert 1 cstack foo@bar.com

That means we need to upgrade our prepare_statement function to parse arguments

  1. if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
  2. statement->type = STATEMENT_INSERT;
  3. + int args_assigned = sscanf(
  4. + input_buffer->buffer, "insert %d %s %s", &(statement->row_to_insert.id),
  5. + statement->row_to_insert.username, statement->row_to_insert.email);
  6. + if (args_assigned < 3) {
  7. + return PREPARE_SYNTAX_ERROR;
  8. + }
  9. return PREPARE_SUCCESS;
  10. }
  11. if (strcmp(input_buffer->buffer, "select") == 0) {

We store those parsed arguments into a new Row data structure inside the statement object:

  1. +const uint32_t COLUMN_USERNAME_SIZE = 32;
  2. +const uint32_t COLUMN_EMAIL_SIZE = 255;
  3. +struct Row_t {
  4. + uint32_t id;
  5. + char username[COLUMN_USERNAME_SIZE];
  6. + char email[COLUMN_EMAIL_SIZE];
  7. +};
  8. +typedef struct Row_t Row;
  9. +
  10. struct Statement_t {
  11. StatementType type;
  12. + Row row_to_insert; // only used by insert statement
  13. };
  14. typedef struct Statement_t Statement;

Now we need to copy that data into some data structure representing the table. SQLite uses a B-tree for fast lookups, inserts and deletes. We’ll start with something simpler. Like a B-tree, it will group rows into pages, but instead of arranging those pages as a tree it will arrange them as an array.

Here’s my plan:

  • Store rows in blocks of memory called pages
  • Each page stores as many rows as it can fit
  • Rows are serialized into a compact representation with each page
  • Pages are only allocated as needed
  • Keep a fixed-size array of pointers to pages
    First we’ll define the compact representation of a row:
  1. +#define size_of_attribute(Struct, Attribute) sizeof(((Struct*)0)->Attribute)
  2. +
  3. +const uint32_t ID_SIZE = size_of_attribute(Row, id);
  4. +const uint32_t USERNAME_SIZE = size_of_attribute(Row, username);
  5. +const uint32_t EMAIL_SIZE = size_of_attribute(Row, email);
  6. +const uint32_t ID_OFFSET = 0;
  7. +const uint32_t USERNAME_OFFSET = ID_OFFSET + ID_SIZE;
  8. +const uint32_t EMAIL_OFFSET = USERNAME_OFFSET + USERNAME_SIZE;
  9. +const uint32_t ROW_SIZE = ID_SIZE + USERNAME_SIZE + EMAIL_SIZE;

This means the layout of a serialized row will look like this:

column size (bytes) offset
id 4 0
username 32 4
email 255 36
total 291

We also need code to convert to and from the compact representation.

  1. +void serialize_row(Row* source, void* destination) {
  2. + memcpy(destination + ID_OFFSET, &(source->id), ID_SIZE);
  3. + memcpy(destination + USERNAME_OFFSET, &(source->username), USERNAME_SIZE);
  4. + memcpy(destination + EMAIL_OFFSET, &(source->email), EMAIL_SIZE);
  5. +}
  6. +
  7. +void deserialize_row(void* source, Row* destination) {
  8. + memcpy(&(destination->id), source + ID_OFFSET, ID_SIZE);
  9. + memcpy(&(destination->username), source + USERNAME_OFFSET, USERNAME_SIZE);
  10. + memcpy(&(destination->email), source + EMAIL_OFFSET, EMAIL_SIZE);
  11. +}

Next, a Table structure that points to pages of rows and keeps track of how many rows there are:

  1. +const uint32_t PAGE_SIZE = 4096;
  2. +const uint32_t TABLE_MAX_PAGES = 100;
  3. +const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
  4. +const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;
  5. +
  6. +struct Table_t {
  7. + void* pages[TABLE_MAX_PAGES];
  8. + uint32_t num_rows;
  9. +};
  10. +typedef struct Table_t Table;

I’m making our page size 4 kilobytes because it’s the same size as a page used in the virtual memory systems of most computer architectures. This means one page in our database corresponds to one page used by the operating system. The operating system will move pages in and out of memory as whole units instead of breaking them up.

I’m setting an arbitrary limit of 100 pages that we will allocate. When we switch to a tree structure, our database’s maximum size will only be limited by the maximum size of a file. (Although we’ll still limit how many pages we keep in memory at once)

Rows should not cross page boundaries. Since pages probably won’t exist next to each other in memory, this assumption makes it easier to read/write rows.

Speaking of which, here is how we figure out where to read/write in memory for a particular row:

  1. +void* row_slot(Table* table, uint32_t row_num) {
  2. + uint32_t page_num = row_num / ROWS_PER_PAGE;
  3. + void* page = table->pages[page_num];
  4. + if (!page) {
  5. + // Allocate memory only when we try to access page
  6. + page = table->pages[page_num] = malloc(PAGE_SIZE);
  7. + }
  8. + uint32_t row_offset = row_num % ROWS_PER_PAGE;
  9. + uint32_t byte_offset = row_offset * ROW_SIZE;
  10. + return page + byte_offset;
  11. +}

Now we can make execute_statement read/write from our table structure:

  1. -void execute_statement(Statement* statement) {
  2. +ExecuteResult execute_insert(Statement* statement, Table* table) {
  3. + if (table->num_rows >= TABLE_MAX_ROWS) {
  4. + return EXECUTE_TABLE_FULL;
  5. + }
  6. +
  7. + Row* row_to_insert = &(statement->row_to_insert);
  8. +
  9. + serialize_row(row_to_insert, row_slot(table, table->num_rows));
  10. + table->num_rows += 1;
  11. +
  12. + return EXECUTE_SUCCESS;
  13. +}
  14. +
  15. +ExecuteResult execute_select(Statement* statement, Table* table) {
  16. + Row row;
  17. + for (uint32_t i = 0; i < table->num_rows; i++) {
  18. + deserialize_row(row_slot(table, i), &row);
  19. + print_row(&row);
  20. + }
  21. + return EXECUTE_SUCCESS;
  22. +}
  23. +
  24. +ExecuteResult execute_statement(Statement* statement, Table* table) {
  25. switch (statement->type) {
  26. case (STATEMENT_INSERT):
  27. - printf("This is where we would do an insert.\n");
  28. - break;
  29. + return execute_insert(statement, table);
  30. case (STATEMENT_SELECT):
  31. - printf("This is where we would do a select.\n");
  32. - break;
  33. + return execute_select(statement, table);
  34. }
  35. }

Lastly, we need to initialize the table and handle a few more error cases:

  1. + Table* new_table() {
  2. + Table* table = malloc(sizeof(Table));
  3. + table->num_rows = 0;
  4. +
  5. + return table;
  6. +}
  1. int main(int argc, char* argv[]) {
  2. + Table* table = new_table();
  3. InputBuffer* input_buffer = new_input_buffer();
  4. while (true) {
  5. print_prompt();
  6. @@ -105,13 +203,22 @@ int main(int argc, char* argv[]) {
  7. switch (prepare_statement(input_buffer, &statement)) {
  8. case (PREPARE_SUCCESS):
  9. break;
  10. + case (PREPARE_SYNTAX_ERROR):
  11. + printf("Syntax error. Could not parse statement.\n");
  12. + continue;
  13. case (PREPARE_UNRECOGNIZED_STATEMENT):
  14. printf("Unrecognized keyword at start of '%s'.\n",
  15. input_buffer->buffer);
  16. continue;
  17. }
  18. - execute_statement(&statement);
  19. - printf("Executed.\n");
  20. + switch (execute_statement(&statement, table)) {
  21. + case (EXECUTE_SUCCESS):
  22. + printf("Executed.\n");
  23. + break;
  24. + case (EXECUTE_TABLE_FULL):
  25. + printf("Error: Table full.\n");
  26. + break;
  27. + }
  28. }
  29. }

With those changes we can actually save data in our database!

  1. ~ ./db
  2. db > insert 1 cstack foo@bar.com
  3. Executed.
  4. db > insert 2 bob bob@example.com
  5. Executed.
  6. db > select
  7. (1, cstack, foo@bar.com)
  8. (2, bob, bob@example.com)
  9. Executed.
  10. db > insert foo bar 1
  11. Syntax error. Could not parse statement.
  12. db > .exit
  13. ~

Now would be a great time to write some tests, for a couple reasons:

  • We’re planning to dramatically change the data structure storing our table, and tests would catch regressions.
  • There are a couple edge cases we haven’t tested manually (e.g. filling up the table)
    We’ll address those issues in the next part. For now, here’s the complete diff from this part:
  1. typedef struct InputBuffer_t InputBuffer;
  2. +enum ExecuteResult_t { EXECUTE_SUCCESS, EXECUTE_TABLE_FULL };
  3. +typedef enum ExecuteResult_t ExecuteResult;
  4. +
  5. enum MetaCommandResult_t {
  6. META_COMMAND_SUCCESS,
  7. META_COMMAND_UNRECOGNIZED_COMMAND
  8. };
  9. typedef enum MetaCommandResult_t MetaCommandResult;
  10. -enum PrepareResult_t { PREPARE_SUCCESS, PREPARE_UNRECOGNIZED_STATEMENT };
  11. +enum PrepareResult_t {
  12. + PREPARE_SUCCESS,
  13. + PREPARE_SYNTAX_ERROR,
  14. + PREPARE_UNRECOGNIZED_STATEMENT
  15. +};
  16. typedef enum PrepareResult_t PrepareResult;
  17. enum StatementType_t { STATEMENT_INSERT, STATEMENT_SELECT };
  18. typedef enum StatementType_t StatementType;
  19. +const uint32_t COLUMN_USERNAME_SIZE = 32;
  20. +const uint32_t COLUMN_EMAIL_SIZE = 255;
  21. +struct Row_t {
  22. + uint32_t id;
  23. + char username[COLUMN_USERNAME_SIZE];
  24. + char email[COLUMN_EMAIL_SIZE];
  25. +};
  26. +typedef struct Row_t Row;
  27. +
  28. struct Statement_t {
  29. StatementType type;
  30. + Row row_to_insert; // only used by insert statement
  31. };
  32. typedef struct Statement_t Statement;
  33. +#define size_of_attribute(Struct, Attribute) sizeof(((Struct*)0)->Attribute)
  34. +
  35. +const uint32_t ID_SIZE = size_of_attribute(Row, id);
  36. +const uint32_t USERNAME_SIZE = size_of_attribute(Row, username);
  37. +const uint32_t EMAIL_SIZE = size_of_attribute(Row, email);
  38. +const uint32_t ID_OFFSET = 0;
  39. +const uint32_t USERNAME_OFFSET = ID_OFFSET + ID_SIZE;
  40. +const uint32_t EMAIL_OFFSET = USERNAME_OFFSET + USERNAME_SIZE;
  41. +const uint32_t ROW_SIZE = ID_SIZE + USERNAME_SIZE + EMAIL_SIZE;
  42. +
  43. +const uint32_t PAGE_SIZE = 4096;
  44. +const uint32_t TABLE_MAX_PAGES = 100;
  45. +const uint32_t ROWS_PER_PAGE = PAGE_SIZE / ROW_SIZE;
  46. +const uint32_t TABLE_MAX_ROWS = ROWS_PER_PAGE * TABLE_MAX_PAGES;
  47. +
  48. +struct Table_t {
  49. + void* pages[TABLE_MAX_PAGES];
  50. + uint32_t num_rows;
  51. +};
  52. +typedef struct Table_t Table;
  53. +
  54. +void print_row(Row* row) {
  55. + printf("(%d, %s, %s)\n", row->id, row->username, row->email);
  56. +}
  57. +
  58. +void serialize_row(Row* source, void* destination) {
  59. + memcpy(destination + ID_OFFSET, &(source->id), ID_SIZE);
  60. + memcpy(destination + USERNAME_OFFSET, &(source->username), USERNAME_SIZE);
  61. + memcpy(destination + EMAIL_OFFSET, &(source->email), EMAIL_SIZE);
  62. +}
  63. +
  64. +void deserialize_row(void* source, Row* destination) {
  65. + memcpy(&(destination->id), source + ID_OFFSET, ID_SIZE);
  66. + memcpy(&(destination->username), source + USERNAME_OFFSET, USERNAME_SIZE);
  67. + memcpy(&(destination->email), source + EMAIL_OFFSET, EMAIL_SIZE);
  68. +}
  69. +
  70. +void* row_slot(Table* table, uint32_t row_num) {
  71. + uint32_t page_num = row_num / ROWS_PER_PAGE;
  72. + void* page = table->pages[page_num];
  73. + if (!page) {
  74. + // Allocate memory only when we try to access page
  75. + page = table->pages[page_num] = malloc(PAGE_SIZE);
  76. + }
  77. + uint32_t row_offset = row_num % ROWS_PER_PAGE;
  78. + uint32_t byte_offset = row_offset * ROW_SIZE;
  79. + return page + byte_offset;
  80. +}
  81. +
  82. +Table* new_table() {
  83. + Table* table = malloc(sizeof(Table));
  84. + table->num_rows = 0;
  85. +
  86. + return table;
  87. +}
  88. +
  89. InputBuffer* new_input_buffer() {
  90. InputBuffer* input_buffer = malloc(sizeof(InputBuffer));
  91. input_buffer->buffer = NULL;
  92. @@ -64,6 +137,12 @@ PrepareResult prepare_statement(InputBuffer* input_buffer,
  93. Statement* statement) {
  94. if (strncmp(input_buffer->buffer, "insert", 6) == 0) {
  95. statement->type = STATEMENT_INSERT;
  96. + int args_assigned = sscanf(
  97. + input_buffer->buffer, "insert %d %s %s", &(statement->row_to_insert.id),
  98. + statement->row_to_insert.username, statement->row_to_insert.email);
  99. + if (args_assigned < 3) {
  100. + return PREPARE_SYNTAX_ERROR;
  101. + }
  102. return PREPARE_SUCCESS;
  103. }
  104. if (strcmp(input_buffer->buffer, "select") == 0) {
  105. @@ -74,18 +153,39 @@ PrepareResult prepare_statement(InputBuffer* input_buffer,
  106. return PREPARE_UNRECOGNIZED_STATEMENT;
  107. }
  108. -void execute_statement(Statement* statement) {
  109. +ExecuteResult execute_insert(Statement* statement, Table* table) {
  110. + if (table->num_rows >= TABLE_MAX_ROWS) {
  111. + return EXECUTE_TABLE_FULL;
  112. + }
  113. +
  114. + Row* row_to_insert = &(statement->row_to_insert);
  115. +
  116. + serialize_row(row_to_insert, row_slot(table, table->num_rows));
  117. + table->num_rows += 1;
  118. +
  119. + return EXECUTE_SUCCESS;
  120. +}
  121. +
  122. +ExecuteResult execute_select(Statement* statement, Table* table) {
  123. + Row row;
  124. + for (uint32_t i = 0; i < table->num_rows; i++) {
  125. + deserialize_row(row_slot(table, i), &row);
  126. + print_row(&row);
  127. + }
  128. + return EXECUTE_SUCCESS;
  129. +}
  130. +
  131. +ExecuteResult execute_statement(Statement* statement, Table* table) {
  132. switch (statement->type) {
  133. case (STATEMENT_INSERT):
  134. - printf("This is where we would do an insert.\n");
  135. - break;
  136. + return execute_insert(statement, table);
  137. case (STATEMENT_SELECT):
  138. - printf("This is where we would do a select.\n");
  139. - break;
  140. + return execute_select(statement, table);
  141. }
  142. }
  143. int main(int argc, char* argv[]) {
  144. + Table* table = new_table();
  145. InputBuffer* input_buffer = new_input_buffer();
  146. while (true) {
  147. print_prompt();
  148. @@ -105,13 +205,22 @@ int main(int argc, char* argv[]) {
  149. switch (prepare_statement(input_buffer, &statement)) {
  150. case (PREPARE_SUCCESS):
  151. break;
  152. + case (PREPARE_SYNTAX_ERROR):
  153. + printf("Syntax error. Could not parse statement.\n");
  154. + continue;
  155. case (PREPARE_UNRECOGNIZED_STATEMENT):
  156. printf("Unrecognized keyword at start of '%s'.\n",
  157. input_buffer->buffer);
  158. continue;
  159. }
  160. - execute_statement(&statement);
  161. - printf("Executed.\n");
  162. + switch (execute_statement(&statement, table)) {
  163. + case (EXECUTE_SUCCESS):
  164. + printf("Executed.\n");
  165. + break;
  166. + case (EXECUTE_TABLE_FULL):
  167. + printf("Error: Table full.\n");
  168. + break;
  169. + }
  170. }
  171. }

Part 2 - World's Simplest SQL Compiler and Virtual Machine

Part 4 - Our First Tests (and Bugs)

原文: https://cstack.github.io/db_tutorial/parts/part3.html