第三节 node节点的相关操作

在开始分析node节点之前,我们先看一下官方对node节点的描述

node represents an in-memory, deserialized page

一个node节点,既可能是叶子节点,也可能是根节点,也可能是分支节点。是物理磁盘上读取进来的页page的内存表现形式。

3.3.1 node节点的定义

  1. // node represents an in-memory, deserialized page.
  2. type node struct {
  3. bucket *Bucket // 关联一个桶
  4. isLeaf bool
  5. unbalanced bool // 值为true的话,需要考虑页合并
  6. spilled bool // 值为true的话,需要考虑页分裂
  7. key []byte // 对于分支节点的话,保留的是最小的key
  8. pgid pgid // 分支节点关联的页id
  9. parent *node // 该节点的parent
  10. children nodes // 该节点的孩子节点
  11. inodes inodes // 该节点上保存的索引数据
  12. }
  13. // inode represents an internal node inside of a node.
  14. // It can be used to point to elements in a page or point
  15. // to an element which hasn't been added to a page yet.
  16. type inode struct {
  17. // 表示是否是子桶叶子节点还是普通叶子节点。如果flags值为1表示子桶叶子节点,否则为普通叶子节点
  18. flags uint32
  19. // 当inode为分支元素时,pgid才有值,为叶子元素时,则没值
  20. pgid pgid
  21. key []byte
  22. // 当inode为分支元素时,value为空,为叶子元素时,才有值
  23. value []byte
  24. }
  25. type inodes []inode

3.3.2 node节点和page转换

在node对象上有两个方法,read(page)、write(page),其中read(page)方法是用来通过page构建一个node节点;而write(page)方法则是将当前的node节点写入到page中,我们在前面他提到了node节点和page节点的相互转换,此处为了保证内容完整性,我们还是再补充下,同时也给大家加深下影响,展示下同样的数据在磁盘上如何组织的,在内存中又是如何组织的。

node->page

  1. // write writes the items onto one or more pages.
  2. // 将node转为page
  3. func (n *node) write(p *page) {
  4. // Initialize page.
  5. // 判断是否是叶子节点还是非叶子节点
  6. if n.isLeaf {
  7. p.flags |= leafPageFlag
  8. } else {
  9. p.flags |= branchPageFlag
  10. }
  11. // 这儿叶子节点不可能溢出,因为溢出时,会分裂
  12. if len(n.inodes) >= 0xFFFF {
  13. panic(fmt.Sprintf("inode overflow: %d (pgid=%d)", len(n.inodes), p.id))
  14. }
  15. p.count = uint16(len(n.inodes))
  16. // Stop here if there are no items to write.
  17. if p.count == 0 {
  18. return
  19. }
  20. // Loop over each item and write it to the page.
  21. // b指向的指针为提逃过所有item头部的位置
  22. b := (*[maxAllocSize]byte)(unsafe.Pointer(&p.ptr))[n.pageElementSize()*len(n.inodes):]
  23. for i, item := range n.inodes {
  24. _assert(len(item.key) > 0, "write: zero-length inode key")
  25. // Write the page element.
  26. // 写入叶子节点数据
  27. if n.isLeaf {
  28. elem := p.leafPageElement(uint16(i))
  29. elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
  30. elem.flags = item.flags
  31. elem.ksize = uint32(len(item.key))
  32. elem.vsize = uint32(len(item.value))
  33. } else {
  34. // 写入分支节点数据
  35. elem := p.branchPageElement(uint16(i))
  36. elem.pos = uint32(uintptr(unsafe.Pointer(&b[0])) - uintptr(unsafe.Pointer(elem)))
  37. elem.ksize = uint32(len(item.key))
  38. elem.pgid = item.pgid
  39. _assert(elem.pgid != p.id, "write: circular dependency occurred")
  40. }
  41. // If the length of key+value is larger than the max allocation size
  42. // then we need to reallocate the byte array pointer.
  43. //
  44. // See: https://github.com/boltdb/bolt/pull/335
  45. klen, vlen := len(item.key), len(item.value)
  46. if len(b) < klen+vlen {
  47. b = (*[maxAllocSize]byte)(unsafe.Pointer(&b[0]))[:]
  48. }
  49. // Write data for the element to the end of the page.
  50. copy(b[0:], item.key)
  51. b = b[klen:]
  52. copy(b[0:], item.value)
  53. b = b[vlen:]
  54. }
  55. // DEBUG ONLY: n.dump()
  56. }

page->node

  1. // 根据page来初始化node
  2. // read initializes the node from a page.
  3. func (n *node) read(p *page) {
  4. n.pgid = p.id
  5. n.isLeaf = ((p.flags & leafPageFlag) != 0)
  6. // 一个inodes对应一个xxxPageElement对象
  7. n.inodes = make(inodes, int(p.count))
  8. for i := 0; i < int(p.count); i++ {
  9. inode := &n.inodes[i]
  10. if n.isLeaf {
  11. // 获取第i个叶子节点
  12. elem := p.leafPageElement(uint16(i))
  13. inode.flags = elem.flags
  14. inode.key = elem.key()
  15. inode.value = elem.value()
  16. } else {
  17. // 树枝节点
  18. elem := p.branchPageElement(uint16(i))
  19. inode.pgid = elem.pgid
  20. inode.key = elem.key()
  21. }
  22. _assert(len(inode.key) > 0, "read: zero-length inode key")
  23. }
  24. // Save first key so we can find the node in the parent when we spill.
  25. if len(n.inodes) > 0 {
  26. // 保存第1个元素的值
  27. n.key = n.inodes[0].key
  28. _assert(len(n.key) > 0, "read: zero-length node key")
  29. } else {
  30. n.key = nil
  31. }
  32. }

3.3.3 node节点的增删改查

put(k,v)

  1. // put inserts a key/value.
  2. // 如果put的是一个key、value的话,不需要指定pgid。
  3. // 如果put的一个树枝节点,则需要指定pgid,不需要指定value
  4. func (n *node) put(oldKey, newKey, value []byte, pgid pgid, flags uint32) {
  5. if pgid >= n.bucket.tx.meta.pgid {
  6. panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", pgid, n.bucket.tx.meta.pgid))
  7. } else if len(oldKey) <= 0 {
  8. panic("put: zero-length old key")
  9. } else if len(newKey) <= 0 {
  10. panic("put: zero-length new key")
  11. }
  12. // Find insertion index.
  13. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, oldKey) != -1 })
  14. // Add capacity and shift nodes if we don't have an exact match and need to insert.
  15. exact := (len(n.inodes) > 0 && index < len(n.inodes) && bytes.Equal(n.inodes[index].key, oldKey))
  16. if !exact {
  17. n.inodes = append(n.inodes, inode{})
  18. copy(n.inodes[index+1:], n.inodes[index:])
  19. }
  20. inode := &n.inodes[index]
  21. inode.flags = flags
  22. inode.key = newKey
  23. inode.value = value
  24. inode.pgid = pgid
  25. _assert(len(inode.key) > 0, "put: zero-length inode key")
  26. }

get(k)

在node中,没有get(k)的方法,其本质是在Cursor中就返回了get的数据。大家可以看看Cursor中的keyValue()方法。

del(k)

  1. // del removes a key from the node.
  2. func (n *node) del(key []byte) {
  3. // Find index of key.
  4. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, key) != -1 })
  5. // Exit if the key isn't found.
  6. if index >= len(n.inodes) || !bytes.Equal(n.inodes[index].key, key) {
  7. return
  8. }
  9. // Delete inode from the node.
  10. n.inodes = append(n.inodes[:index], n.inodes[index+1:]...)
  11. // Mark the node as needing rebalancing.
  12. n.unbalanced = true
  13. }

nextSibling()、prevSibling()

  1. // nextSibling returns the next node with the same parent.
  2. // 返回下一个兄弟节点
  3. func (n *node) nextSibling() *node {
  4. if n.parent == nil {
  5. return nil
  6. }
  7. index := n.parent.childIndex(n)
  8. if index >= n.parent.numChildren()-1 {
  9. return nil
  10. }
  11. return n.parent.childAt(index + 1)
  12. }
  13. // prevSibling returns the previous node with the same parent.
  14. // 返回上一个兄弟节点
  15. func (n *node) prevSibling() *node {
  16. if n.parent == nil {
  17. return nil
  18. }
  19. // 首先找下标
  20. index := n.parent.childIndex(n)
  21. if index == 0 {
  22. return nil
  23. }
  24. // 然后返回
  25. return n.parent.childAt(index - 1)
  26. }
  27. // childIndex returns the index of a given child node.
  28. func (n *node) childIndex(child *node) int {
  29. index := sort.Search(len(n.inodes), func(i int) bool { return bytes.Compare(n.inodes[i].key, child.key) != -1 })
  30. return index
  31. }
  32. // childAt returns the child node at a given index.
  33. // 只有树枝节点才有孩子
  34. func (n *node) childAt(index int) *node {
  35. if n.isLeaf {
  36. panic(fmt.Sprintf("invalid childAt(%d) on a leaf node", index))
  37. }
  38. return n.bucket.node(n.inodes[index].pgid, n)
  39. }
  40. // node creates a node from a page and associates it with a given parent.
  41. // 根据pgid创建一个node
  42. func (b *Bucket) node(pgid pgid, parent *node) *node {
  43. _assert(b.nodes != nil, "nodes map expected")
  44. // Retrieve node if it's already been created.
  45. if n := b.nodes[pgid]; n != nil {
  46. return n
  47. }
  48. // Otherwise create a node and cache it.
  49. n := &node{bucket: b, parent: parent}
  50. if parent == nil {
  51. b.rootNode = n
  52. } else {
  53. parent.children = append(parent.children, n)
  54. }
  55. // Use the inline page if this is an inline bucket.
  56. // 如果第二次进来,b.page不为空
  57. // 此处的pgid和b.page只会有一个是有值的。
  58. var p = b.page
  59. // 说明不是内联桶
  60. if p == nil {
  61. p = b.tx.page(pgid)
  62. }
  63. // Read the page into the node and cache it.
  64. n.read(p)
  65. // 缓存
  66. b.nodes[pgid] = n
  67. // Update statistics.
  68. b.tx.stats.NodeCount++
  69. return n
  70. }

3.3.4 node节点的分裂和合并

上面我们看了对node节点的操作,包括put和del方法。经过这些操作后,可能会导致当前的page填充度过高或者过低。因此就引出了node节点的分裂和合并。下面简单介绍下什么是分裂和合并。

分裂: 当一个node中的数据过多时,最简单就是当超过了page的填充度时,就需要将当前的node拆分成两个,也就是底层会将一页数据拆分存放到两页中。

合并: 当删除了一个或者一批对象时,此时可能会导致一页数据的填充度过低,此时空间可能会浪费比较多。所以就需要考虑对页之间进行数据合并。

有了大概的了解,下面我们就看一下对一个node分裂和合并的实现过程。

分裂spill()

spill writes the nodes to dirty pages and splits nodes as it goes. Returns an error if dirty pages cannot be allocated.

  1. // spill writes the nodes to dirty pages and splits nodes as it goes.
  2. // Returns an error if dirty pages cannot be allocated.
  3. func (n *node) spill() error {
  4. var tx = n.bucket.tx
  5. if n.spilled {
  6. return nil
  7. }
  8. // Spill child nodes first. Child nodes can materialize sibling nodes in
  9. // the case of split-merge so we cannot use a range loop. We have to check
  10. // the children size on every loop iteration.
  11. sort.Sort(n.children)
  12. for i := 0; i < len(n.children); i++ {
  13. if err := n.children[i].spill(); err != nil {
  14. return err
  15. }
  16. }
  17. // We no longer need the child list because it's only used for spill tracking.
  18. n.children = nil
  19. // Split nodes into appropriate sizes. The first node will always be n.
  20. // 将当前的node进行拆分成多个node
  21. var nodes = n.split(tx.db.pageSize)
  22. for _, node := range nodes {
  23. // Add node's page to the freelist if it's not new.
  24. if node.pgid > 0 {
  25. tx.db.freelist.free(tx.meta.txid, tx.page(node.pgid))
  26. node.pgid = 0
  27. }
  28. // Allocate contiguous space for the node.
  29. p, err := tx.allocate((node.size() / tx.db.pageSize) + 1)
  30. if err != nil {
  31. return err
  32. }
  33. // Write the node.
  34. if p.id >= tx.meta.pgid {
  35. // 不可能发生
  36. panic(fmt.Sprintf("pgid (%d) above high water mark (%d)", p.id, tx.meta.pgid))
  37. }
  38. node.pgid = p.id
  39. node.write(p)
  40. // 已经拆分过了
  41. node.spilled = true
  42. // Insert into parent inodes.
  43. if node.parent != nil {
  44. var key = node.key
  45. if key == nil {
  46. key = node.inodes[0].key
  47. }
  48. // 放入父亲节点中
  49. node.parent.put(key, node.inodes[0].key, nil, node.pgid, 0)
  50. node.key = node.inodes[0].key
  51. _assert(len(node.key) > 0, "spill: zero-length node key")
  52. }
  53. // Update the statistics.
  54. tx.stats.Spill++
  55. }
  56. // If the root node split and created a new root then we need to spill that
  57. // as well. We'll clear out the children to make sure it doesn't try to respill.
  58. if n.parent != nil && n.parent.pgid == 0 {
  59. n.children = nil
  60. return n.parent.spill()
  61. }
  62. return nil
  63. }
  64. // split breaks up a node into multiple smaller nodes, if appropriate.
  65. // This should only be called from the spill() function.
  66. func (n *node) split(pageSize int) []*node {
  67. var nodes []*node
  68. node := n
  69. for {
  70. // Split node into two.
  71. a, b := node.splitTwo(pageSize)
  72. nodes = append(nodes, a)
  73. // If we can't split then exit the loop.
  74. if b == nil {
  75. break
  76. }
  77. // Set node to b so it gets split on the next iteration.
  78. node = b
  79. }
  80. return nodes
  81. }
  82. // splitTwo breaks up a node into two smaller nodes, if appropriate.
  83. // This should only be called from the split() function.
  84. func (n *node) splitTwo(pageSize int) (*node, *node) {
  85. // Ignore the split if the page doesn't have at least enough nodes for
  86. // two pages or if the nodes can fit in a single page.
  87. // 太小的话,就不拆分了
  88. if len(n.inodes) <= (minKeysPerPage*2) || n.sizeLessThan(pageSize) {
  89. return n, nil
  90. }
  91. // Determine the threshold before starting a new node.
  92. var fillPercent = n.bucket.FillPercent
  93. if fillPercent < minFillPercent {
  94. fillPercent = minFillPercent
  95. } else if fillPercent > maxFillPercent {
  96. fillPercent = maxFillPercent
  97. }
  98. threshold := int(float64(pageSize) * fillPercent)
  99. // Determine split position and sizes of the two pages.
  100. splitIndex, _ := n.splitIndex(threshold)
  101. // Split node into two separate nodes.
  102. // If there's no parent then we'll need to create one.
  103. if n.parent == nil {
  104. n.parent = &node{bucket: n.bucket, children: []*node{n}}
  105. }
  106. // Create a new node and add it to the parent.
  107. // 拆分出一个新节点
  108. next := &node{bucket: n.bucket, isLeaf: n.isLeaf, parent: n.parent}
  109. n.parent.children = append(n.parent.children, next)
  110. // Split inodes across two nodes.
  111. next.inodes = n.inodes[splitIndex:]
  112. n.inodes = n.inodes[:splitIndex]
  113. // Update the statistics.
  114. n.bucket.tx.stats.Split++
  115. return n, next
  116. }
  117. // splitIndex finds the position where a page will fill a given threshold.
  118. // It returns the index as well as the size of the first page.
  119. // This is only be called from split().
  120. // 找到合适的index
  121. func (n *node) splitIndex(threshold int) (index, sz int) {
  122. sz = pageHeaderSize
  123. // Loop until we only have the minimum number of keys required for the second page.
  124. for i := 0; i < len(n.inodes)-minKeysPerPage; i++ {
  125. index = i
  126. inode := n.inodes[i]
  127. elsize := n.pageElementSize() + len(inode.key) + len(inode.value)
  128. // If we have at least the minimum number of keys and adding another
  129. // node would put us over the threshold then exit and return.
  130. if i >= minKeysPerPage && sz+elsize > threshold {
  131. break
  132. }
  133. // Add the element size to the total size.
  134. sz += elsize
  135. }
  136. return
  137. }

合并rebalance()

rebalance attempts to combine the node with sibling nodes if the node fill size is below a threshold or if there are not enough keys.

页合并有点复杂,虽然能看懂,但要自己写感觉还是挺难写出bug free的

  1. // rebalance attempts to combine the node with sibling nodes if the node fill
  2. // size is below a threshold or if there are not enough keys.
  3. // 填充率太低或者没有足够的key时,进行页合并
  4. func (n *node) rebalance() {
  5. if !n.unbalanced {
  6. return
  7. }
  8. n.unbalanced = false
  9. // Update statistics.
  10. n.bucket.tx.stats.Rebalance++
  11. // Ignore if node is above threshold (25%) and has enough keys.
  12. var threshold = n.bucket.tx.db.pageSize / 4
  13. if n.size() > threshold && len(n.inodes) > n.minKeys() {
  14. return
  15. }
  16. // Root node has special handling.
  17. if n.parent == nil {
  18. // If root node is a branch and only has one node then collapse it.
  19. if !n.isLeaf && len(n.inodes) == 1 {
  20. // Move root's child up.
  21. child := n.bucket.node(n.inodes[0].pgid, n)
  22. n.isLeaf = child.isLeaf
  23. n.inodes = child.inodes[:]
  24. n.children = child.children
  25. // Reparent all child nodes being moved.
  26. for _, inode := range n.inodes {
  27. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  28. child.parent = n
  29. }
  30. }
  31. // Remove old child.
  32. child.parent = nil
  33. delete(n.bucket.nodes, child.pgid)
  34. child.free()
  35. }
  36. return
  37. }
  38. // If node has no keys then just remove it.
  39. if n.numChildren() == 0 {
  40. n.parent.del(n.key)
  41. n.parent.removeChild(n)
  42. delete(n.bucket.nodes, n.pgid)
  43. n.free()
  44. n.parent.rebalance()
  45. return
  46. }
  47. _assert(n.parent.numChildren() > 1, "parent must have at least 2 children")
  48. // Destination node is right sibling if idx == 0, otherwise left sibling.
  49. var target *node
  50. // 判断当前node是否是parent的第一个孩子节点,是的话,就要找它的下一个兄弟节点,否则的话,就找上一个兄弟节点
  51. var useNextSibling = (n.parent.childIndex(n) == 0)
  52. if useNextSibling {
  53. target = n.nextSibling()
  54. } else {
  55. target = n.prevSibling()
  56. }
  57. // If both this node and the target node are too small then merge them.
  58. // 合并当前node和target,target合到node
  59. if useNextSibling {
  60. // Reparent all child nodes being moved.
  61. for _, inode := range target.inodes {
  62. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  63. // 之前的父亲移除该孩子
  64. child.parent.removeChild(child)
  65. // 重新指定父亲节点
  66. child.parent = n
  67. // 父亲节点指当前孩子
  68. child.parent.children = append(child.parent.children, child)
  69. }
  70. }
  71. // Copy over inodes from target and remove target.
  72. n.inodes = append(n.inodes, target.inodes...)
  73. n.parent.del(target.key)
  74. n.parent.removeChild(target)
  75. delete(n.bucket.nodes, target.pgid)
  76. target.free()
  77. } else {
  78. // node合到target
  79. // Reparent all child nodes being moved.
  80. for _, inode := range n.inodes {
  81. if child, ok := n.bucket.nodes[inode.pgid]; ok {
  82. child.parent.removeChild(child)
  83. child.parent = target
  84. child.parent.children = append(child.parent.children, child)
  85. }
  86. }
  87. // Copy over inodes to target and remove node.
  88. target.inodes = append(target.inodes, n.inodes...)
  89. n.parent.del(n.key)
  90. n.parent.removeChild(n)
  91. delete(n.bucket.nodes, n.pgid)
  92. n.free()
  93. }
  94. // Either this node or the target node was deleted from the parent so rebalance it.
  95. n.parent.rebalance()
  96. }