MySQL · 源码分析 · btr_cur_search_to_nth_level 函数分析

本文基于MySQL Community 8.0.25 Version

概述

btr_cur_search_to_nth_level 函数实现innodb中btree cursor的定位,从btree顶层的root到具体的位置,同时在该函数实现了btree的并发访问控制。

btr_cur_search_to_nth_level函数有1000多行的代码,阅读起来不太直观方便,本文对该函数的分析是基于mysql-8.0.25 源码,通过给关键函数模块框架加注释,同时删除一些非核心的代码来进行分析,目的是让阅读本文后,能对该函数有快速直观的了解,方便后续更深入的理解。

源码分析

  1. void btr_cur_search_to_nth_level(
  2. dict_index_t *index, /*!< in: index */
  3. ulint level, /*!< in: the tree level of search */
  4. const dtuple_t *tuple, /*!< in: data tuple; NOTE: n_fields_cmp in
  5. tuple must be set so that it cannot get
  6. compared to the node ptr page number field! */
  7. page_cur_mode_t mode, /*!< in: PAGE_CUR_L, ...;
  8. Inserts should always be made using
  9. PAGE_CUR_LE to search the position! */
  10. ...)
  11. {
  12. // 变量定义及一些基本的assertion
  13. page_t *page = nullptr; /* remove warning */
  14. buf_block_t *block;
  15. ...
  16. // 使用ahi查询,查询成功返回
  17. if (btr_search_guess_on_hash(index, info, tuple, mode, latch_mode, cursor,
  18. has_search_latch, mtr)) {
  19. /* Search using the hash index succeeded */
  20. return;
  21. }
  22. // 给index 加latch
  23. switch (latch_mode) {
  24. case BTR_MODIFY_TREE:
  25. if (...) {
  26. mtr_x_lock(dict_index_get_lock(index), mtr);
  27. } else {
  28. mtr_sx_lock(dict_index_get_lock(index), mtr);
  29. }
  30. upper_rw_latch = RW_X_LATCH;
  31. break;
  32. case BTR_CONT_MODIFY_TREE:
  33. case BTR_CONT_SEARCH_TREE:
  34. if (...) {
  35. upper_rw_latch = RW_X_LATCH;
  36. } else {
  37. upper_rw_latch = RW_NO_LATCH;
  38. }
  39. break;
  40. default:
  41. if (!srv_read_only_mode) {
  42. if (...) {
  43. mtr_s_lock(dict_index_get_lock(index), mtr);
  44. } else {
  45. /* BTR_MODIFY_EXTERNAL needs to be excluded */
  46. mtr_sx_lock(dict_index_get_lock(index), mtr);
  47. }
  48. upper_rw_latch = RW_S_LATCH;
  49. } else {
  50. upper_rw_latch = RW_NO_LATCH;
  51. }
  52. }
  53. // 初始化root page_id
  54. const space_id_t space = dict_index_get_space(index);
  55. const page_size_t page_size(dict_table_page_size(index->table));
  56. /* Start with the root page. */
  57. page_id_t page_id(space, dict_index_get_page(index)); // root page id
  58. // 非叶子节点调整search mode
  59. switch (mode) {
  60. case PAGE_CUR_GE:
  61. page_mode = PAGE_CUR_L;
  62. break;
  63. case PAGE_CUR_G:
  64. page_mode = PAGE_CUR_LE;
  65. break;
  66. default:
  67. page_mode = mode;
  68. break;
  69. }
  70. // 循环、逐层的查找,直至达到传入的层数 level,一般是0(即叶子节点)
  71. // 此处的分析忽略Spatial index的部分
  72. // 从Buffer Pool或磁盘中得到索引页
  73. search_loop:
  74. // 更新rw_latch for page latch mode
  75. if (height != 0) {
  76. if ((latch_mode != BTR_MODIFY_TREE || height == level) &&
  77. !retrying_for_search_prev) {
  78. rw_latch = RW_SX_LATCH;
  79. } else {
  80. rw_latch = upper_rw_latch;
  81. }
  82. }
  83. } else if (latch_mode <= BTR_MODIFY_LEAF) {
  84. rw_latch = latch_mode;
  85. // 如果判断应该操作change buffer,则更新fetch mode为从buffer pool中获取
  86. if (btr_op != BTR_NO_OP &&
  87. ibuf_should_try(index, btr_op != BTR_INSERT_OP)) {
  88. /* Try to buffer the operation if the leaf
  89. page is not in the buffer pool. */
  90. fetch = btr_op == BTR_DELETE_OP ? Page_fetch::IF_IN_POOL_OR_WATCH
  91. : Page_fetch::IF_IN_POOL;
  92. }
  93. }
  94. // 获取page
  95. retry_page_get:
  96. block = buf_page_get_gen(page_id, page_size, rw_latch, guess, fetch, file, line, mtr);
  97. // 当block为null时,尝试change buffer操作
  98. if (block == nullptr) {
  99. switch (btr_op) {
  100. case BTR_INSERT_OP:
  101. case BTR_INSERT_IGNORE_UNIQUE_OP:
  102. if (ibuf_insert(IBUF_OP_INSERT, tuple, index, page_id, page_size,
  103. cursor->thr)) {
  104. cursor->flag = BTR_CUR_INSERT_TO_IBUF;
  105. goto func_exit;
  106. }
  107. break;
  108. case BTR_DELMARK_OP:
  109. if (ibuf_insert(IBUF_OP_DELETE_MARK, tuple, index, page_id, page_size,
  110. cursor->thr)) {
  111. cursor->flag = BTR_CUR_DEL_MARK_IBUF;
  112. goto func_exit;
  113. }
  114. break;
  115. case BTR_DELETE_OP:
  116. if (!row_purge_poss_sec(cursor->purge_node, index, tuple)) {
  117. /* The record cannot be purged yet. */
  118. cursor->flag = BTR_CUR_DELETE_REF;
  119. } else if (ibuf_insert(IBUF_OP_DELETE, tuple, index, page_id, page_size,
  120. cursor->thr)) {
  121. /* The purge was buffered. */
  122. cursor->flag = BTR_CUR_DELETE_IBUF;
  123. } else {
  124. /* The purge could not be buffered. */
  125. buf_pool_watch_unset(page_id);
  126. break;
  127. }
  128. buf_pool_watch_unset(page_id);
  129. goto func_exit;
  130. default:
  131. ut_error;
  132. }
  133. // 操作change buffer没有成功,则更新fetch mode为从disk中获取该page,然后再次获取该page
  134. fetch = cursor->m_fetch_mode;
  135. goto retry_page_get;
  136. }
  137. // 如果search prev, 读取及latch 左节点
  138. if (retrying_for_search_prev && height != 0) {
  139. //获取左节点page_no
  140. left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
  141. if (left_page_no != FIL_NULL) {
  142. get_block = buf_page_get_gen(page_id_t(page_id.space(), left_page_no), page_size,
  143. rw_latch, nullptr, fetch, file, line, mtr);
  144. }
  145. // 重新获取该page,并给该page加latch,因为之前该page是no latch,所以需要重新获取并加latch
  146. block = buf_page_get_gen(page_id, page_size, rw_latch, nullptr, fetch, file, line, mtr);
  147. }
  148. page = buf_block_get_frame(block);
  149. // 当root page同时为Leaf page,latch不符合预期时,则重新循环获取page
  150. if (height == ULINT_UNDEFINED && page_is_leaf(page) &&
  151. rw_latch != RW_NO_LATCH && rw_latch != root_leaf_rw_latch) {
  152. goto search_loop;
  153. }
  154. // root page时,更新信息
  155. if (UNIV_UNLIKELY(height == ULINT_UNDEFINED)) {
  156. /* We are in the root node */
  157. height = btr_page_get_level(page, mtr);
  158. root_height = height;
  159. cursor->tree_height = root_height + 1;
  160. info->root_guess = block;
  161. }
  162. // 当达到leaf层时
  163. if (height == 0) {
  164. // 给叶子节点加latch
  165. if (rw_latch == RW_NO_LATCH) {
  166. latch_leaves = btr_cur_latch_leaves(block, page_id, page_size, latch_mode,
  167. cursor, mtr);
  168. }
  169. // 释放mtr中index latch和和上层blocks
  170. switch (latch_mode) {
  171. case BTR_MODIFY_TREE:
  172. case BTR_CONT_MODIFY_TREE:
  173. case BTR_CONT_SEARCH_TREE:
  174. break;
  175. default:
  176. // 释放index latch
  177. if (!s_latch_by_caller && !srv_read_only_mode && !modify_external) {
  178. mtr_release_s_latch_at_savepoint(mtr, savepoint,
  179. dict_index_get_lock(index));
  180. }
  181. // 释放各层blocks
  182. for (; n_releases < n_blocks; n_releases++) {
  183. if (n_releases == 0 && modify_external) {
  184. /* keep latch of root page */
  185. continue;
  186. }
  187. mtr_release_block_at_savepoint(mtr, tree_savepoints[n_releases],
  188. tree_blocks[n_releases]);
  189. }
  190. }
  191. page_mode = mode;
  192. }
  193. // index是spatial index,进行一些page search mode调整
  194. if (dict_index_is_spatial(index)) {
  195. if (page_mode == PAGE_CUR_RTREE_LOCATE && level == height) {
  196. if (level == 0) {
  197. page_mode = PAGE_CUR_LE;
  198. } else {
  199. page_mode = PAGE_CUR_RTREE_GET_FATHER;
  200. }
  201. }
  202. if (page_mode == PAGE_CUR_RTREE_INSERT) {
  203. page_mode = (level == height) ? PAGE_CUR_LE : PAGE_CUR_RTREE_INSERT;
  204. }
  205. }
  206. // 在page中找到rec并保存到page_cursor中
  207. page_cur_search_with_match_bytes(block, index, tuple, page_mode, &up_match,
  208. &up_bytes, &low_match, &low_bytes,
  209. page_cursor);
  210. /* If this is the desired level, leave the loop */
  211. // 没有达到目的level
  212. if (level != height) {
  213. // 往下层推进
  214. height--;
  215. /* If the rec is the first or last in the page for
  216. pessimistic delete intention, it might cause node_ptr insert
  217. for the upper level. We should change the intention and retry.
  218. */
  219. // 可能导致SMO,重置index root page为search page,重新开始循环
  220. if (latch_mode == BTR_MODIFY_TREE &&
  221. btr_cur_need_opposite_intention(page, lock_intention, node_ptr)) {
  222. page_id.reset(space, dict_index_get_page(index));
  223. goto search_loop;
  224. }
  225. // spatial index
  226. if (dict_index_is_spatial(index)) {
  227. ...
  228. }
  229. /* If the first or the last record of the page
  230. or the same key value to the first record or last record,
  231. the another page might be choosen when BTR_CONT_MODIFY_TREE.
  232. So, the parent page should not released to avoiding deadlock
  233. with blocking the another search with the same key value. */
  234. // 判断是否是第一个或者最后一个rec,或者和他们match,设置变量detected_same_key_root
  235. if (!detected_same_key_root && lock_intention == BTR_INTENTION_BOTH &&
  236. !dict_index_is_unique(index) && latch_mode == BTR_MODIFY_TREE &&
  237. (up_match >= rec_offs_n_fields(offsets) - 1 ||
  238. low_match >= rec_offs_n_fields(offsets) - 1)) {
  239. // 为第一个或者最后一个
  240. if (node_ptr == first_rec || page_rec_is_last(node_ptr, page)) {
  241. detected_same_key_root = true;
  242. } else {
  243. // 比较是否和第一个或者最后一个rec match
  244. detected_same_key_root = true;
  245. }
  246. }
  247. /* If the page might cause modify_tree,
  248. we should not release the parent page's lock. */
  249. if (!detected_same_key_root && latch_mode == BTR_MODIFY_TREE &&
  250. !btr_cur_will_modify_tree(index, page, lock_intention, node_ptr,
  251. node_ptr_max_size, page_size, mtr) &&
  252. !rtree_parent_modified) {
  253. /* we can release upper blocks */
  254. for (; n_releases < n_blocks; n_releases++) {
  255. if (n_releases == 0) {
  256. /* we should not release root page
  257. to pin to same block. */
  258. continue;
  259. }
  260. /* release unused blocks to unpin */
  261. mtr_release_block_at_savepoint(mtr, tree_savepoints[n_releases],
  262. tree_blocks[n_releases]);
  263. }
  264. }
  265. // 为BTR_MODIFY_TREE时给root page加 sx latch,给他其他page加x latch
  266. if (height == level && latch_mode == BTR_MODIFY_TREE) {
  267. ut_ad(upper_rw_latch == RW_X_LATCH);
  268. /* we should sx-latch root page, if released already.
  269. It contains seg_header. */
  270. if (n_releases > 0) {
  271. mtr_block_sx_latch_at_savepoint(mtr, tree_savepoints[0],
  272. tree_blocks[0]);
  273. }
  274. /* x-latch the branch blocks not released yet. */
  275. for (ulint i = n_releases; i <= n_blocks; i++) {
  276. mtr_block_x_latch_at_savepoint(mtr, tree_savepoints[i], tree_blocks[i]);
  277. }
  278. }
  279. /* We should consider prev_page of parent page, if the node_ptr
  280. is the leftmost of the page. because BTR_SEARCH_PREV and
  281. BTR_MODIFY_PREV latches prev_page of the leaf page. */
  282. if ((latch_mode == BTR_SEARCH_PREV || latch_mode == BTR_MODIFY_PREV) &&
  283. !retrying_for_search_prev) {
  284. if (btr_page_get_prev(page, mtr) != FIL_NULL &&
  285. page_rec_is_first(node_ptr, page)) {
  286. if (leftmost_from_level == 0) {
  287. leftmost_from_level = height + 1;
  288. }
  289. } else {
  290. leftmost_from_level = 0;
  291. }
  292. if (height == 0 && leftmost_from_level > 0) {
  293. /* should retry to get also prev_page
  294. from level==leftmost_from_level. */
  295. retrying_for_search_prev = true;
  296. page_id.reset(space, tree_blocks[idx]->page.id.page_no());
  297. for (ulint i = n_blocks - (leftmost_from_level - 1); i <= n_blocks;
  298. i++) {
  299. mtr_release_block_at_savepoint(mtr, tree_savepoints[i],
  300. tree_blocks[i]);
  301. }
  302. n_blocks -= (leftmost_from_level - 1);
  303. height = leftmost_from_level;
  304. /* replay up_match, low_match */
  305. for (ulint i = 0; i < n_blocks; i++) {
  306. page_cur_search_with_match(tree_blocks[i], index, tuple, page_mode,
  307. &up_match, &low_match, page_cursor,
  308. rtr_info);
  309. }
  310. goto search_loop;
  311. }
  312. }
  313. //重置page_id为下层子节点page,然后进入下层查找
  314. page_id.reset(space, btr_node_ptr_get_child_page_no(node_ptr, offsets));
  315. n_blocks++;
  316. if (UNIV_UNLIKELY(height == 0 && dict_index_is_ibuf(index))) {
  317. /* We're doing a search on an ibuf tree and we're one
  318. level above the leaf page. */
  319. ut_ad(level == 0);
  320. fetch = cursor->m_fetch_mode;
  321. rw_latch = RW_NO_LATCH;
  322. goto retry_page_get;
  323. }
  324. // 循环到下层
  325. goto search_loop;
  326. }
  327. // 找到对应rec位置,更新cursor的low_match up_match字段
  328. if (level != 0) {
  329. // assert page not corrupt
  330. btr_assert_not_corrupted(block, index);
  331. if (page_mode <= PAGE_CUR_LE) {
  332. cursor->low_match = low_match;
  333. cursor->up_match = up_match;
  334. }
  335. } else {
  336. cursor->low_match = low_match;
  337. cursor->low_bytes = low_bytes;
  338. cursor->up_match = up_match;
  339. cursor->up_bytes = up_bytes;
  340. // 根据查询的结果,更新该index的ahi信息
  341. if (btr_search_enabled && !index->disable_ahi) {
  342. btr_search_info_update(index, cursor);
  343. }
  344. }
  345. // 函数退出
  346. // 释放相关堆内存等
  347. func_exit:
  348. if (UNIV_LIKELY_NULL(heap)) {
  349. mem_heap_free(heap);
  350. }
  351. if (retrying_for_search_prev) {
  352. ut_free(prev_tree_blocks);
  353. ut_free(prev_tree_savepoints);
  354. }
  355. ...
  356. }