SQLite—-Page Cache之事务处理(3)

写在前面:由于内容较多,所以断续没有写完的内容。

11、删除日志文件(Deleting The Rollback Journal)

一旦更改写入设备,日志文件将会被删除,这是事务真正提交的时刻。如果在这之前系统发生崩溃,就会进行恢复处理,使得数据库和没发生改变一样;如果在这之后系统发生崩溃,表明所有的更改都已经写入磁盘。SQLite就是根据日志存在情况决定是否对数据库进行恢复处理。

删除文件本质上不是一个原子操作,但是从用户进程的角度来看是一个原子操作,所以一个事务看起来是一个原子操作。 在许多系统中,删除文件也是一个高代价的操作。作为优化,SQLite可以配置成把日志文件的长度截为0或者把日志文件头清零。 document/2015-09-15/55f7c5015a12a

12、释放锁(Releasing The Lock)

作为原子提交的最后一步,释放排斥锁使得其它进程可以开始访问数据库了。 下图中,我们指明了当锁被释放的时候用户空间所拥有的信息已经被清空了.对于老版本的SQLite你可这么认为。但最新的SQLite会保存些用户空间的缓存不会被清空—万一下一个事务开始的时候,这些数据刚好可以用上呢。重新利用这些内存要比再次从操作系统磁盘缓存或者硬盘中读取要来得轻松与快捷得多,何乐而不为呢?在再次使用这些数据之前,我们必须先取得一个共享锁,同时我们还不得不去检查一下,保证还没有其他进程在我们拥有共享锁之前对数据库文件进行了修改。数据库文件的第一页中有一个计数器,数据库文件每做一次修改,这个计数器就会增长一下。我们可以通过检查这个计数器就可得知是否有其他进程修改过数据库文件。如果数据库文件已经被修改过了,那么用户内存空间的缓存就不得不清空,并重新读入。大多数情况下,这种情况不大会发生,因此用户空间的内存缓存将是有效的,这对于性能提高来说作用是显著的。 document/2015-09-15/55f7c5110eb5c

以上两步是在sqlite3BtreeCommit()—-btree.c函数中实现的。

代码如下:

  1. //提交事务,至此一个事务完成.主要做两件事:
  2. //删除日志文件,释放数据库文件的写锁
  3. int sqlite3BtreeCommit(Btree *p){
  4. BtShared *pBt = p->pBt;
  5. btreeIntegrity(p);
  6. /* If the handle has a write-transaction open, commit the shared-btrees
  7. ** transaction and set the shared state to TRANS_READ.
  8. */
  9. if( p->inTrans==TRANS_WRITE ){
  10. int rc;
  11. assert( pBt->inTransaction==TRANS_WRITE );
  12. assert( pBt->nTransaction>0 );
  13. //调用pager,提交事务
  14. rc = sqlite3pager_commit(pBt->pPager);
  15. if( rc!=SQLITE_OK ){
  16. return rc;
  17. }
  18. pBt->inTransaction = TRANS_READ;
  19. pBt->inStmt = 0;
  20. }
  21. unlockAllTables(p);
  22. /* If the handle has any kind of transaction open, decrement the transaction
  23. ** count of the shared btree. If the transaction count reaches 0, set
  24. ** the shared state to TRANS_NONE. The unlockBtreeIfUnused() call below
  25. ** will unlock the pager.
  26. */
  27. if( p->inTrans!=TRANS_NONE ){
  28. pBt->nTransaction--;
  29. if( 0==pBt->nTransaction ){
  30. pBt->inTransaction = TRANS_NONE;
  31. }
  32. }
  33. }
  34. //提交事务,主要调用pager_unwritelock()函数
  35. int sqlite3pager_commit(Pager *pPager){
  36. int rc;
  37. PgHdr *pPg;
  38. if( pPager->errCode ){
  39. return pPager->errCode;
  40. }
  41. if( pPager->state<PAGER_RESERVED ){
  42. return SQLITE_ERROR;
  43. }
  44. TRACE2("COMMIT %d\n", PAGERID(pPager));
  45. if( MEMDB ){
  46. pPg = pager_get_all_dirty_pages(pPager);
  47. while( pPg ){
  48. clearHistory(PGHDR_TO_HIST(pPg, pPager));
  49. pPg->dirty = 0;
  50. pPg->inJournal = 0;
  51. pPg->inStmt = 0;
  52. pPg->needSync = 0;
  53. pPg->pPrevStmt = pPg->pNextStmt = 0;
  54. pPg = pPg->pDirty;
  55. }
  56. pPager->pDirty = 0;
  57. #ifndef NDEBUG
  58. for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
  59. PgHistory *pHist = PGHDR_TO_HIST(pPg, pPager);
  60. assert( !pPg->alwaysRollback );
  61. assert( !pHist->pOrig );
  62. assert( !pHist->pStmt );
  63. }
  64. #endif
  65. pPager->pStmt = 0;
  66. pPager->state = PAGER_SHARED;
  67. return SQLITE_OK;
  68. }
  69. if( pPager->dirtyCache==0 ){
  70. /* Exit early (without doing the time-consuming sqlite3OsSync() calls)
  71. ** if there have been no changes to the database file. */
  72. assert( pPager->needSync==0 );
  73. rc = pager_unwritelock(pPager);
  74. pPager->dbSize = -1;
  75. return rc;
  76. }
  77. assert( pPager->journalOpen );
  78. rc = sqlite3pager_sync(pPager, 0, 0);
  79. //删除文件,释放写锁
  80. if( rc==SQLITE_OK ){
  81. rc = pager_unwritelock(pPager);
  82. pPager->dbSize = -1;
  83. }
  84. return rc;
  85. }
  86. //对数据库加read lock,删除日志文件
  87. static int pager_unwritelock(Pager *pPager){
  88. PgHdr *pPg;
  89. int rc;
  90. assert( !MEMDB );
  91. if( pPager->state<PAGER_RESERVED ){
  92. return SQLITE_OK;
  93. }
  94. sqlite3pager_stmt_commit(pPager);
  95. if( pPager->stmtOpen ){
  96. sqlite3OsClose(&pPager->stfd);
  97. pPager->stmtOpen = 0;
  98. }
  99. if( pPager->journalOpen ){
  100. //关闭日志文件
  101. sqlite3OsClose(&pPager->jfd);
  102. pPager->journalOpen = 0;
  103. //删除日志文件
  104. sqlite3OsDelete(pPager->zJournal);
  105. sqliteFree( pPager->aInJournal );
  106. pPager->aInJournal = 0;
  107. for(pPg=pPager->pAll; pPg; pPg=pPg->pNextAll){
  108. pPg->inJournal = 0;
  109. pPg->dirty = 0;
  110. pPg->needSync = 0;
  111. #ifdef SQLITE_CHECK_PAGES
  112. pPg->pageHash = pager_pagehash(pPg);
  113. #endif
  114. }
  115. pPager->pDirty = 0;
  116. pPager->dirtyCache = 0;
  117. pPager->nRec = 0;
  118. }else{
  119. assert( pPager->aInJournal==0 );
  120. assert( pPager->dirtyCache==0 || pPager->useJournal==0 );
  121. }
  122. //释放写锁,加读锁
  123. rc = sqlite3OsUnlock(pPager->fd, SHARED_LOCK);
  124. pPager->state = PAGER_SHARED;
  125. pPager->origDbSize = 0;
  126. pPager->setMaster = 0;
  127. pPager->needSync = 0;
  128. pPager->pFirstSynced = pPager->pFirst;
  129. return rc;
  130. }

下图可进一步描述该过程: document/2015-09-15/55f7c538bb005 最后来看看sqlite3BtreeSync()和sqlite3BtreeCommit()是如何被调用的。

一般来说,事务提交方式为自动提交的话,在虚拟机中的OP_Halt指令实现提交事务,相关代码如下:

  1. //虚拟机停机指令
  2. case OP_Halt: { /* no-push */
  3. p->pTos = pTos;
  4. p->rc = pOp->p1;
  5. p->pc = pc;
  6. p->errorAction = pOp->p2;
  7. if( pOp->p3 ){
  8. sqlite3SetString(&p->zErrMsg, pOp->p3, (char*)0);
  9. }
  10. //设置虚拟机状态SQLITE_MAGIC_RUN 为 SQLITE_MAGIC_HALT,
  11. //并提交事务
  12. rc = sqlite3VdbeHalt(p);
  13. assert( rc==SQLITE_BUSY || rc==SQLITE_OK );
  14. if( rc==SQLITE_BUSY ){
  15. p->rc = SQLITE_BUSY;
  16. return SQLITE_BUSY;
  17. }
  18. return p->rc ? SQLITE_ERROR : SQLITE_DONE;
  19. }

//当虚拟机要停机时,调用该函数,如果VDBE改变了数据库且为自动 //提交模式,则提交这些改变

  1. int sqlite3VdbeHalt(Vdbe *p){
  2. sqlite3 *db = p->db;
  3. int i;
  4. int (*xFunc)(Btree *pBt) = 0; /* Function to call on each btree backend */
  5. int isSpecialError; /* Set to true if SQLITE_NOMEM or IOERR */
  6. /* This function contains the logic that determines if a statement or
  7. ** transaction will be committed or rolled back as a result of the
  8. ** execution of this virtual machine.
  9. **
  10. ** Special errors:
  11. **
  12. ** If an SQLITE_NOMEM error has occured in a statement that writes to
  13. ** the database, then either a statement or transaction must be rolled
  14. ** back to ensure the tree-structures are in a consistent state. A
  15. ** statement transaction is rolled back if one is open, otherwise the
  16. ** entire transaction must be rolled back.
  17. **
  18. ** If an SQLITE_IOERR error has occured in a statement that writes to
  19. ** the database, then the entire transaction must be rolled back. The
  20. ** I/O error may have caused garbage to be written to the journal
  21. ** file. Were the transaction to continue and eventually be rolled
  22. ** back that garbage might end up in the database file.
  23. **
  24. ** In both of the above cases, the Vdbe.errorAction variable is
  25. ** ignored. If the sqlite3.autoCommit flag is false and a transaction
  26. ** is rolled back, it will be set to true.
  27. **
  28. ** Other errors:
  29. **
  30. ** No error:
  31. **
  32. */
  33. if( sqlite3MallocFailed() ){
  34. p->rc = SQLITE_NOMEM;
  35. }
  36. if( p->magic!=VDBE_MAGIC_RUN ){
  37. /* Already halted. Nothing to do. */
  38. assert( p->magic==VDBE_MAGIC_HALT );
  39. return SQLITE_OK;
  40. }
  41. //释放虚拟机中所有的游标
  42. closeAllCursors(p);
  43. checkActiveVdbeCnt(db);
  44. /* No commit or rollback needed if the program never started */
  45. if( p->pc>=0 ){
  46. /* Check for one of the special errors - SQLITE_NOMEM or SQLITE_IOERR */
  47. isSpecialError = ((p->rc==SQLITE_NOMEM || p->rc==SQLITE_IOERR)?1:0);
  48. if( isSpecialError ){
  49. /* This loop does static analysis of the query to see which of the
  50. ** following three categories it falls into:
  51. **
  52. ** Read-only
  53. ** Query with statement journal
  54. ** Query without statement journal
  55. **
  56. ** We could do something more elegant than this static analysis (i.e.
  57. ** store the type of query as part of the compliation phase), but
  58. ** handling malloc() or IO failure is a fairly obscure edge case so
  59. ** this is probably easier. Todo: Might be an opportunity to reduce
  60. ** code size a very small amount though
  61. */
  62. int isReadOnly = 1;
  63. int isStatement = 0;
  64. assert(p->aOp || p->nOp==0);
  65. for(i=0; i<p->nOp; i++){
  66. switch( p->aOp[i].opcode ){
  67. case OP_Transaction:
  68. isReadOnly = 0;
  69. break;
  70. case OP_Statement:
  71. isStatement = 1;
  72. break;
  73. }
  74. }
  75. /* If the query was read-only, we need do no rollback at all. Otherwise,
  76. ** proceed with the special handling.
  77. */
  78. if( !isReadOnly ){
  79. if( p->rc==SQLITE_NOMEM && isStatement ){
  80. xFunc = sqlite3BtreeRollbackStmt;
  81. }else{
  82. /* We are forced to roll back the active transaction. Before doing
  83. ** so, abort any other statements this handle currently has active.
  84. */
  85. sqlite3AbortOtherActiveVdbes(db, p);
  86. sqlite3RollbackAll(db);
  87. db->autoCommit = 1;
  88. }
  89. }
  90. }
  91. /* If the auto-commit flag is set and this is the only active vdbe, then
  92. ** we do either a commit or rollback of the current transaction.
  93. **
  94. ** Note: This block also runs if one of the special errors handled
  95. ** above has occured.
  96. */
  97. //如果自动提交事务,则提交事务
  98. if( db->autoCommit && db->activeVdbeCnt==1 ){
  99. if( p->rc==SQLITE_OK || (p->errorAction==OE_Fail && !isSpecialError) ){
  100. /* The auto-commit flag is true, and the vdbe program was
  101. ** successful or hit an 'OR FAIL' constraint. This means a commit
  102. ** is required.
  103. */
  104. //提交事务
  105. int rc = vdbeCommit(db);
  106. if( rc==SQLITE_BUSY ){
  107. return SQLITE_BUSY;
  108. }else if( rc!=SQLITE_OK ){
  109. p->rc = rc;
  110. sqlite3RollbackAll(db);
  111. }else{
  112. sqlite3CommitInternalChanges(db);
  113. }
  114. }else{
  115. sqlite3RollbackAll(db);
  116. }
  117. }else if( !xFunc ){
  118. if( p->rc==SQLITE_OK || p->errorAction==OE_Fail ){
  119. xFunc = sqlite3BtreeCommitStmt;
  120. }else if( p->errorAction==OE_Abort ){
  121. xFunc = sqlite3BtreeRollbackStmt;
  122. }else{
  123. sqlite3AbortOtherActiveVdbes(db, p);
  124. sqlite3RollbackAll(db);
  125. db->autoCommit = 1;
  126. }
  127. }
  128. /* If xFunc is not NULL, then it is one of sqlite3BtreeRollbackStmt or
  129. ** sqlite3BtreeCommitStmt. Call it once on each backend. If an error occurs
  130. ** and the return code is still SQLITE_OK, set the return code to the new
  131. ** error value.
  132. */
  133. assert(!xFunc ||
  134. xFunc==sqlite3BtreeCommitStmt ||
  135. xFunc==sqlite3BtreeRollbackStmt
  136. );
  137. for(i=0; xFunc && i<db->nDb; i++){
  138. int rc;
  139. Btree *pBt = db->aDb[i].pBt;
  140. if( pBt ){
  141. rc = xFunc(pBt);
  142. if( rc && (p->rc==SQLITE_OK || p->rc==SQLITE_CONSTRAINT) ){
  143. p->rc = rc;
  144. sqlite3SetString(&p->zErrMsg, 0);
  145. }
  146. }
  147. }
  148. /* If this was an INSERT, UPDATE or DELETE and the statement was committed,
  149. ** set the change counter.
  150. */
  151. if( p->changeCntOn && p->pc>=0 ){
  152. if( !xFunc || xFunc==sqlite3BtreeCommitStmt ){
  153. sqlite3VdbeSetChanges(db, p->nChange);
  154. }else{
  155. sqlite3VdbeSetChanges(db, 0);
  156. }
  157. p->nChange = 0;
  158. }
  159. /* Rollback or commit any schema changes that occurred. */
  160. if( p->rc!=SQLITE_OK && db->flags&SQLITE_InternChanges ){
  161. sqlite3ResetInternalSchema(db, 0);
  162. db->flags = (db->flags | SQLITE_InternChanges);
  163. }
  164. }
  165. /* We have successfully halted and closed the VM. Record this fact. */
  166. if( p->pc>=0 ){
  167. db->activeVdbeCnt--;
  168. }
  169. p->magic = VDBE_MAGIC_HALT;
  170. checkActiveVdbeCnt(db);
  171. return SQLITE_OK;
  172. }
  1. //提交事务,主要调用:
  2. //sqlite3BtreeSync()---同步btree, sqlite3BtreeCommit()---提交事务
  3. static int vdbeCommit(sqlite3 *db){
  4. int i;
  5. int nTrans = 0; /* Number of databases with an active write-transaction */
  6. int rc = SQLITE_OK;
  7. int needXcommit = 0;
  8. for(i=0; i<db->nDb; i++){
  9. Btree *pBt = db->aDb[i].pBt;
  10. if( pBt && sqlite3BtreeIsInTrans(pBt) ){
  11. needXcommit = 1;
  12. if( i!=1 ) nTrans++;
  13. }
  14. }
  15. /* If there are any write-transactions at all, invoke the commit hook */
  16. if( needXcommit && db->xCommitCallback ){
  17. sqlite3SafetyOff(db);
  18. rc = db->xCommitCallback(db->pCommitArg);
  19. sqlite3SafetyOn(db);
  20. if( rc ){
  21. return SQLITE_CONSTRAINT;
  22. }
  23. }
  24. /* The simple case - no more than one database file (not counting the
  25. ** TEMP database) has a transaction active. There is no need for the
  26. ** master-journal.
  27. **
  28. ** If the return value of sqlite3BtreeGetFilename() is a zero length
  29. ** string, it means the main database is :memory:. In that case we do
  30. ** not support atomic multi-file commits, so use the simple case then
  31. ** too.
  32. */
  33. //简单的情况,只有一个数据库文件,不需要master-journal
  34. if( 0==strlen(sqlite3BtreeGetFilename(db->aDb[0].pBt)) || nTrans<=1 ){
  35. for(i=0; rc==SQLITE_OK && i<db->nDb; i++){
  36. Btree *pBt = db->aDb[i].pBt;
  37. if( pBt ){
  38. //同步btree
  39. rc = sqlite3BtreeSync(pBt, 0);
  40. }
  41. }
  42. /* Do the commit only if all databases successfully synced */
  43. //commite事务
  44. if( rc==SQLITE_OK ){
  45. for(i=0; i<db->nDb; i++){
  46. Btree *pBt = db->aDb[i].pBt;
  47. if( pBt ){
  48. sqlite3BtreeCommit(pBt);
  49. }
  50. }
  51. }
  52. }
  53. /* The complex case - There is a multi-file write-transaction active.
  54. ** This requires a master journal file to ensure the transaction is
  55. ** committed atomicly.
  56. */
  57. #ifndef SQLITE_OMIT_DISKIO
  58. else{
  59. int needSync = 0;
  60. char *zMaster = 0; /* File-name for the master journal */
  61. char const *zMainFile = sqlite3BtreeGetFilename(db->aDb[0].pBt);
  62. OsFile *master = 0;
  63. /* Select a master journal file name */
  64. do {
  65. u32 random;
  66. sqliteFree(zMaster);
  67. sqlite3Randomness(sizeof(random), &random);
  68. zMaster = sqlite3MPrintf("%s-mj%08X", zMainFile, random&0x7fffffff);
  69. if( !zMaster ){
  70. return SQLITE_NOMEM;
  71. }
  72. }while( sqlite3OsFileExists(zMaster) );
  73. /* Open the master journal. */
  74. rc = sqlite3OsOpenExclusive(zMaster, &master, 0);
  75. if( rc!=SQLITE_OK ){
  76. sqliteFree(zMaster);
  77. return rc;
  78. }
  79. /* Write the name of each database file in the transaction into the new
  80. ** master journal file. If an error occurs at this point close
  81. ** and delete the master journal file. All the individual journal files
  82. ** still have 'null' as the master journal pointer, so they will roll
  83. ** back independently if a failure occurs.
  84. */
  85. for(i=0; i<db->nDb; i++){
  86. Btree *pBt = db->aDb[i].pBt;
  87. if( i==1 ) continue; /* Ignore the TEMP database */
  88. if( pBt && sqlite3BtreeIsInTrans(pBt) ){
  89. char const *zFile = sqlite3BtreeGetJournalname(pBt);
  90. if( zFile[0]==0 ) continue; /* Ignore :memory: databases */
  91. if( !needSync && !sqlite3BtreeSyncDisabled(pBt) ){
  92. needSync = 1;
  93. }
  94. rc = sqlite3OsWrite(master, zFile, strlen(zFile)+1);
  95. if( rc!=SQLITE_OK ){
  96. sqlite3OsClose(&master);
  97. sqlite3OsDelete(zMaster);
  98. sqliteFree(zMaster);
  99. return rc;
  100. }
  101. }
  102. }
  103. /* Sync the master journal file. Before doing this, open the directory
  104. ** the master journal file is store in so that it gets synced too.
  105. */
  106. zMainFile = sqlite3BtreeGetDirname(db->aDb[0].pBt);
  107. rc = sqlite3OsOpenDirectory(master, zMainFile);
  108. if( rc!=SQLITE_OK ||
  109. (needSync && (rc=sqlite3OsSync(master,0))!=SQLITE_OK) ){
  110. sqlite3OsClose(&master);
  111. sqlite3OsDelete(zMaster);
  112. sqliteFree(zMaster);
  113. return rc;
  114. }
  115. /* Sync all the db files involved in the transaction. The same call
  116. ** sets the master journal pointer in each individual journal. If
  117. ** an error occurs here, do not delete the master journal file.
  118. **
  119. ** If the error occurs during the first call to sqlite3BtreeSync(),
  120. ** then there is a chance that the master journal file will be
  121. ** orphaned. But we cannot delete it, in case the master journal
  122. ** file name was written into the journal file before the failure
  123. ** occured.
  124. */
  125. for(i=0; i<db->nDb; i++){
  126. Btree *pBt = db->aDb[i].pBt;
  127. if( pBt && sqlite3BtreeIsInTrans(pBt) ){
  128. rc = sqlite3BtreeSync(pBt, zMaster);
  129. if( rc!=SQLITE_OK ){
  130. sqlite3OsClose(&master);
  131. sqliteFree(zMaster);
  132. return rc;
  133. }
  134. }
  135. }
  136. sqlite3OsClose(&master);
  137. /* Delete the master journal file. This commits the transaction. After
  138. ** doing this the directory is synced again before any individual
  139. ** transaction files are deleted.
  140. */
  141. rc = sqlite3OsDelete(zMaster);
  142. assert( rc==SQLITE_OK );
  143. sqliteFree(zMaster);
  144. zMaster = 0;
  145. rc = sqlite3OsSyncDirectory(zMainFile);
  146. if( rc!=SQLITE_OK ){
  147. /* This is not good. The master journal file has been deleted, but
  148. ** the directory sync failed. There is no completely safe course of
  149. ** action from here. The individual journals contain the name of the
  150. ** master journal file, but there is no way of knowing if that
  151. ** master journal exists now or if it will exist after the operating
  152. ** system crash that may follow the fsync() failure.
  153. */
  154. return rc;
  155. }
  156. /* All files and directories have already been synced, so the following
  157. ** calls to sqlite3BtreeCommit() are only closing files and deleting
  158. ** journals. If something goes wrong while this is happening we don't
  159. ** really care. The integrity of the transaction is already guaranteed,
  160. ** but some stray 'cold' journals may be lying around. Returning an
  161. ** error code won't help matters.
  162. */
  163. for(i=0; i<db->nDb; i++){
  164. Btree *pBt = db->aDb[i].pBt;
  165. if( pBt ){
  166. sqlite3BtreeCommit(pBt);
  167. }
  168. }
  169. }
  170. #endif
  171. return rc;
  172. }