summaryrefslogtreecommitdiffstats
path: root/common/unqlite/pager.c
diff options
context:
space:
mode:
Diffstat (limited to 'common/unqlite/pager.c')
-rw-r--r--common/unqlite/pager.c2808
1 files changed, 2808 insertions, 0 deletions
diff --git a/common/unqlite/pager.c b/common/unqlite/pager.c
new file mode 100644
index 0000000..474a1ca
--- /dev/null
+++ b/common/unqlite/pager.c
@@ -0,0 +1,2808 @@
1/*
2 * Symisc unQLite: An Embeddable NoSQL (Post Modern) Database Engine.
3 * Copyright (C) 2012-2013, Symisc Systems http://unqlite.org/
4 * Copyright (C) 2014, Yuras Shumovich <shumovichy@gmail.com>
5 * Version 1.1.6
6 * For information on licensing, redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES
7 * please contact Symisc Systems via:
8 * legal@symisc.net
9 * licensing@symisc.net
10 * contact@symisc.net
11 * or visit:
12 * http://unqlite.org/licensing.html
13 */
14 /* $SymiscID: pager.c v1.1 Win7 2012-11-29 03:46 stable <chm@symisc.net> $ */
15#ifndef UNQLITE_AMALGAMATION
16#include "unqliteInt.h"
17#endif
18/*
19** This file implements the pager and the transaction manager for UnQLite (Mostly inspired from the SQLite3 Source tree).
20**
21** The Pager.eState variable stores the current 'state' of a pager. A
** pager may be in any one of the six states shown in the following
23** state diagram.
24**
25** OPEN <------+------+
26** | | |
27** V | |
28** +---------> READER-------+ |
29** | | |
30** | V |
31** |<-------WRITER_LOCKED--------->|
32** | | |
33** | V |
34** |<------WRITER_CACHEMOD-------->|
35** | | |
36** | V |
37** |<-------WRITER_DBMOD---------->|
38** | | |
39** | V |
40** +<------WRITER_FINISHED-------->+
41**
42** OPEN:
43**
44** The pager starts up in this state. Nothing is guaranteed in this
45** state - the file may or may not be locked and the database size is
46** unknown. The database may not be read or written.
47**
48** * No read or write transaction is active.
49** * Any lock, or no lock at all, may be held on the database file.
50** * The dbSize and dbOrigSize variables may not be trusted.
51**
52** READER:
53**
54** In this state all the requirements for reading the database in
55** rollback mode are met. Unless the pager is (or recently
56** was) in exclusive-locking mode, a user-level read transaction is
57** open. The database size is known in this state.
58**
59** * A read transaction may be active (but a write-transaction cannot).
60** * A SHARED or greater lock is held on the database file.
61** * The dbSize variable may be trusted (even if a user-level read
62** transaction is not active). The dbOrigSize variables
63** may not be trusted at this point.
64** * Even if a read-transaction is not open, it is guaranteed that
65** there is no hot-journal in the file-system.
66**
67** WRITER_LOCKED:
68**
69** The pager moves to this state from READER when a write-transaction
70** is first opened on the database. In WRITER_LOCKED state, all locks
71** required to start a write-transaction are held, but no actual
72** modifications to the cache or database have taken place.
73**
74** In rollback mode, a RESERVED or (if the transaction was opened with
75** EXCLUSIVE flag) EXCLUSIVE lock is obtained on the database file when
76** moving to this state, but the journal file is not written to or opened
77** to in this state. If the transaction is committed or rolled back while
78** in WRITER_LOCKED state, all that is required is to unlock the database
79** file.
80**
81** * A write transaction is active.
82** * If the connection is open in rollback-mode, a RESERVED or greater
83** lock is held on the database file.
84** * The dbSize and dbOrigSize variables are all valid.
85** * The contents of the pager cache have not been modified.
86** * The journal file may or may not be open.
87** * Nothing (not even the first header) has been written to the journal.
88**
89** WRITER_CACHEMOD:
90**
91** A pager moves from WRITER_LOCKED state to this state when a page is
92** first modified by the upper layer. In rollback mode the journal file
93** is opened (if it is not already open) and a header written to the
94** start of it. The database file on disk has not been modified.
95**
96** * A write transaction is active.
97** * A RESERVED or greater lock is held on the database file.
98** * The journal file is open and the first header has been written
99** to it, but the header has not been synced to disk.
100** * The contents of the page cache have been modified.
101**
102** WRITER_DBMOD:
103**
104** The pager transitions from WRITER_CACHEMOD into WRITER_DBMOD state
105** when it modifies the contents of the database file.
106**
107** * A write transaction is active.
108** * An EXCLUSIVE or greater lock is held on the database file.
109** * The journal file is open and the first header has been written
110** and synced to disk.
111** * The contents of the page cache have been modified (and possibly
112** written to disk).
113**
114** WRITER_FINISHED:
115**
116** A rollback-mode pager changes to WRITER_FINISHED state from WRITER_DBMOD
117** state after the entire transaction has been successfully written into the
118** database file. In this state the transaction may be committed simply
119** by finalizing the journal file. Once in WRITER_FINISHED state, it is
120** not possible to modify the database further. At this point, the upper
121** layer must either commit or rollback the transaction.
122**
123** * A write transaction is active.
124** * An EXCLUSIVE or greater lock is held on the database file.
125** * All writing and syncing of journal and database data has finished.
** If no error occurred, all that remains is to finalize the journal to
127** commit the transaction. If an error did occur, the caller will need
128** to rollback the transaction.
129**
130**
131*/
/* Pager states (stored in Pager.iState); see the state diagram above. */
#define PAGER_OPEN             0 /* Initial state: database may not be read or written */
#define PAGER_READER           1 /* SHARED (or greater) lock held, database size known */
#define PAGER_WRITER_LOCKED    2 /* Write transaction open, nothing modified yet */
#define PAGER_WRITER_CACHEMOD  3 /* Page cache modified, database file untouched */
#define PAGER_WRITER_DBMOD     4 /* Database file contents are being modified */
#define PAGER_WRITER_FINISHED  5 /* Transaction fully written, awaiting commit/rollback */
/*
** Magic string found at the very beginning of every journal file.
** The bytes were obtained from /dev/random and serve only as a
** sanity check when a journal is opened.
**
** NOTE: This sequence must differ from the one used by SQLite3
** so that the journal files of the two libraries never collide.
*/
static const unsigned char aJournalMagic[] = {
	0xa6, 0xe8, 0xcd, 0x2b,
	0x1c, 0x92, 0xdb, 0x9f
};
/*
** The journal header size for the given pager, in bytes. This is the
** sector size recorded in the pager (Pager.iSectorSize), which is usually
** the size of a single disk sector. See also GetSectorSize().
**
** The macro argument is parenthesized so that any pointer expression
** (for instance &sPager or apPager + 1) expands correctly.
*/
#define JOURNAL_HDR_SZ(pPager) ((pPager)->iSectorSize)
154/*
155 * Database page handle.
156 * Each raw disk page is represented in memory by an instance
157 * of the following structure.
158 */
159typedef struct Page Page;
160struct Page {
161 /* Must correspond to unqlite_page */
162 unsigned char *zData; /* Content of this page */
163 void *pUserData; /* Extra content */
164 pgno pgno; /* Page number for this page */
165 /**********************************************************************
166 ** Elements above are public. All that follows is private to pcache.c
167 ** and should not be accessed by other modules.
168 */
169 Pager *pPager; /* The pager this page is part of */
170 int flags; /* Page flags defined below */
171 int nRef; /* Number of users of this page */
172 Page *pNext, *pPrev; /* A list of all pages */
173 Page *pDirtyNext; /* Next element in list of dirty pages */
174 Page *pDirtyPrev; /* Previous element in list of dirty pages */
175 Page *pNextCollide,*pPrevCollide; /* Collission chain */
176 Page *pNextHot,*pPrevHot; /* Hot dirty pages chain */
177};
/* Bit values for Page.flags */
#define PAGE_DIRTY          0x002 /* Page content has changed */
#define PAGE_NEED_SYNC      0x004 /* The rollback journal must be fsync'ed before
                                   * this page is written to the database */
#define PAGE_DONT_WRITE     0x008 /* Do not write the page content to disk */
#define PAGE_NEED_READ      0x010 /* Page content has not been read yet */
#define PAGE_IN_JOURNAL     0x020 /* Page has been written to the journal */
#define PAGE_HOT_DIRTY      0x040 /* Page is linked to the hot dirty list */
#define PAGE_DONT_MAKE_HOT  0x080 /* Never link this page to the hot dirty list */
189/*
190 * Each active database pager is represented by an instance of
191 * the following structure.
192 */
193struct Pager
194{
195 SyMemBackend *pAllocator; /* Memory backend */
196 unqlite *pDb; /* DB handle that own this instance */
197 unqlite_kv_engine *pEngine; /* Underlying KV storage engine */
198 char *zFilename; /* Name of the database file */
199 char *zJournal; /* Name of the journal file */
200 unqlite_vfs *pVfs; /* Underlying virtual file system */
201 unqlite_file *pfd,*pjfd; /* File descriptors for database and journal */
202 pgno dbSize; /* Number of pages in the file */
203 pgno dbOrigSize; /* dbSize before the current change */
204 sxi64 dbByteSize; /* Database size in bytes */
205 void *pMmap; /* Read-only Memory view (mmap) of the whole file if requested (UNQLITE_OPEN_MMAP). */
206 sxu32 nRec; /* Number of pages written to the journal */
207 SyPRNGCtx sPrng; /* PRNG Context */
208 sxu32 cksumInit; /* Quasi-random value added to every checksum */
209 sxu32 iOpenFlags; /* Flag passed to unqlite_open() after processing */
210 sxi64 iJournalOfft; /* Journal offset we are reading from */
211 int (*xBusyHandler)(void *); /* Busy handler */
212 void *pBusyHandlerArg; /* First arg to xBusyHandler() */
213 void (*xPageUnpin)(void *); /* Page Unpin callback */
214 void (*xPageReload)(void *); /* Page Reload callback */
215 Bitvec *pVec; /* Bitmap */
216 Page *pHeader; /* Page one of the database (Unqlite header) */
217 Sytm tmCreate; /* Database creation time */
218 SyString sKv; /* Underlying Key/Value storage engine name */
219 int iState; /* Pager state */
220 int iLock; /* Lock state */
221 sxi32 iFlags; /* Control flags (see below) */
222 int is_mem; /* True for an in-memory database */
223 int is_rdonly; /* True for a read-only database */
224 int no_jrnl; /* TRUE to omit journaling */
225 int iPageSize; /* Page size in bytes (default 4K) */
226 int iSectorSize; /* Size of a single sector on disk */
227 unsigned char *zTmpPage; /* Temporary page */
228 Page *pFirstDirty; /* First dirty pages */
229 Page *pDirty; /* Transient list of dirty pages */
230 Page *pAll; /* List of all pages */
231 Page *pHotDirty; /* List of hot dirty pages */
232 Page *pFirstHot; /* First hot dirty page */
233 sxu32 nHot; /* Total number of hot dirty pages */
234 Page **apHash; /* Page table */
235 sxu32 nSize; /* apHash[] size: Must be a power of two */
236 sxu32 nPage; /* Total number of page loaded in memory */
237 sxu32 nCacheMax; /* Maximum page to cache*/
238};
/* Control flags (bit values for Pager.iFlags) */
#define PAGER_CTRL_COMMIT_ERR    0x001 /* A commit error occurred */
#define PAGER_CTRL_DIRTY_COMMIT  0x002 /* A dirty commit has been applied */
242/*
243** Read a 32-bit integer from the given file descriptor.
244** All values are stored on disk as big-endian.
245*/
246static int ReadInt32(unqlite_file *pFd,sxu32 *pOut,sxi64 iOfft)
247{
248 unsigned char zBuf[4];
249 int rc;
250 rc = unqliteOsRead(pFd,zBuf,sizeof(zBuf),iOfft);
251 if( rc != UNQLITE_OK ){
252 return rc;
253 }
254 SyBigEndianUnpack32(zBuf,pOut);
255 return UNQLITE_OK;
256}
257/*
258** Read a 64-bit integer from the given file descriptor.
259** All values are stored on disk as big-endian.
260*/
261static int ReadInt64(unqlite_file *pFd,sxu64 *pOut,sxi64 iOfft)
262{
263 unsigned char zBuf[8];
264 int rc;
265 rc = unqliteOsRead(pFd,zBuf,sizeof(zBuf),iOfft);
266 if( rc != UNQLITE_OK ){
267 return rc;
268 }
269 SyBigEndianUnpack64(zBuf,pOut);
270 return UNQLITE_OK;
271}
272/*
273** Write a 32-bit integer into the given file descriptor.
274*/
275static int WriteInt32(unqlite_file *pFd,sxu32 iNum,sxi64 iOfft)
276{
277 unsigned char zBuf[4];
278 int rc;
279 SyBigEndianPack32(zBuf,iNum);
280 rc = unqliteOsWrite(pFd,zBuf,sizeof(zBuf),iOfft);
281 return rc;
282}
283/*
284** Write a 64-bit integer into the given file descriptor.
285*/
286static int WriteInt64(unqlite_file *pFd,sxu64 iNum,sxi64 iOfft)
287{
288 unsigned char zBuf[8];
289 int rc;
290 SyBigEndianPack64(zBuf,iNum);
291 rc = unqliteOsWrite(pFd,zBuf,sizeof(zBuf),iOfft);
292 return rc;
293}
/*
** Upper bound on the sector size: 64KiB. When the xSectorSize() method
** reports a larger value, MAX_SECTOR_SIZE is used in its place.
** This could conceivably cause corruption following a power failure on
** such a system. This is currently an undocumented limit.
*/
#define MAX_SECTOR_SIZE 65536 /* 0x10000 */
301/*
302** Get the size of a single sector on disk.
303** The sector size will be used used to determine the size
304** and alignment of journal header and within created journal files.
305**
306** The default sector size is set to 512.
307*/
308static int GetSectorSize(unqlite_file *pFd)
309{
310 int iSectorSize = UNQLITE_DEFAULT_SECTOR_SIZE;
311 if( pFd ){
312 iSectorSize = unqliteOsSectorSize(pFd);
313 if( iSectorSize < 32 ){
314 iSectorSize = 512;
315 }
316 if( iSectorSize > MAX_SECTOR_SIZE ){
317 iSectorSize = MAX_SECTOR_SIZE;
318 }
319 }
320 return iSectorSize;
321}
/*
 * Hash function for page numbers: the identity mapping.
 * Callers mask the result with (table size - 1) to select a bucket.
 */
#define PAGE_HASH(PAGE_NO) (PAGE_NO)
324/*
325 * Fetch a page from the cache.
326 */
327static Page * pager_fetch_page(Pager *pPager,pgno page_num)
328{
329 Page *pEntry;
330 if( pPager->nPage < 1 ){
331 /* Don't bother hashing */
332 return 0;
333 }
334 /* Perform the lookup */
335 pEntry = pPager->apHash[PAGE_HASH(page_num) & (pPager->nSize - 1)];
336 for(;;){
337 if( pEntry == 0 ){
338 break;
339 }
340 if( pEntry->pgno == page_num ){
341 return pEntry;
342 }
343 /* Point to the next entry in the colission chain */
344 pEntry = pEntry->pNextCollide;
345 }
346 /* No such page */
347 return 0;
348}
349/*
350 * Allocate and initialize a new page.
351 */
352static Page * pager_alloc_page(Pager *pPager,pgno num_page)
353{
354 Page *pNew;
355
356 pNew = (Page *)SyMemBackendPoolAlloc(pPager->pAllocator,sizeof(Page)+pPager->iPageSize);
357 if( pNew == 0 ){
358 return 0;
359 }
360 /* Zero the structure */
361 SyZero(pNew,sizeof(Page)+pPager->iPageSize);
362 /* Page data */
363 pNew->zData = (unsigned char *)&pNew[1];
364 /* Fill in the structure */
365 pNew->pPager = pPager;
366 pNew->nRef = 1;
367 pNew->pgno = num_page;
368 return pNew;
369}
370/*
371 * Increment the reference count of a given page.
372 */
373static void page_ref(Page *pPage)
374{
375 pPage->nRef++;
376}
377/*
378 * Release an in-memory page after its reference count reach zero.
379 */
380static int pager_release_page(Pager *pPager,Page *pPage)
381{
382 int rc = UNQLITE_OK;
383 if( !(pPage->flags & PAGE_DIRTY)){
384 /* Invoke the unpin callback if available */
385 if( pPager->xPageUnpin && pPage->pUserData ){
386 pPager->xPageUnpin(pPage->pUserData);
387 }
388 pPage->pUserData = 0;
389 SyMemBackendPoolFree(pPager->pAllocator,pPage);
390 }else{
391 /* Dirty page, it will be released later when a dirty commit
392 * or the final commit have been applied.
393 */
394 rc = UNQLITE_LOCKED;
395 }
396 return rc;
397}
398/* Forward declaration */
399static int pager_unlink_page(Pager *pPager,Page *pPage);
400/*
401 * Decrement the reference count of a given page.
402 */
403static void page_unref(Page *pPage)
404{
405 pPage->nRef--;
406 if( pPage->nRef < 1 ){
407 Pager *pPager = pPage->pPager;
408 if( !(pPage->flags & PAGE_DIRTY) ){
409 pager_unlink_page(pPager,pPage);
410 /* Release the page */
411 pager_release_page(pPager,pPage);
412 }else{
413 if( pPage->flags & PAGE_DONT_MAKE_HOT ){
414 /* Do not add this page to the hot dirty list */
415 return;
416 }
417 if( !(pPage->flags & PAGE_HOT_DIRTY) ){
418 /* Add to the hot dirty list */
419 pPage->pPrevHot = 0;
420 if( pPager->pFirstHot == 0 ){
421 pPager->pFirstHot = pPager->pHotDirty = pPage;
422 }else{
423 pPage->pNextHot = pPager->pHotDirty;
424 if( pPager->pHotDirty ){
425 pPager->pHotDirty->pPrevHot = pPage;
426 }
427 pPager->pHotDirty = pPage;
428 }
429 pPager->nHot++;
430 pPage->flags |= PAGE_HOT_DIRTY;
431 }
432 }
433 }
434}
435/*
436 * Link a freshly created page to the list of active page.
437 */
438static int pager_link_page(Pager *pPager,Page *pPage)
439{
440 sxu32 nBucket;
441 /* Install in the corresponding bucket */
442 nBucket = PAGE_HASH(pPage->pgno) & (pPager->nSize - 1);
443 pPage->pNextCollide = pPager->apHash[nBucket];
444 if( pPager->apHash[nBucket] ){
445 pPager->apHash[nBucket]->pPrevCollide = pPage;
446 }
447 pPager->apHash[nBucket] = pPage;
448 /* Link to the list of active pages */
449 MACRO_LD_PUSH(pPager->pAll,pPage);
450 pPager->nPage++;
451 if( (pPager->nPage >= pPager->nSize * 4) && pPager->nPage < 100000 ){
452 /* Grow the hashtable */
453 sxu32 nNewSize = pPager->nSize << 1;
454 Page *pEntry,**apNew;
455 sxu32 n;
456 apNew = (Page **)SyMemBackendAlloc(pPager->pAllocator, nNewSize * sizeof(Page *));
457 if( apNew ){
458 sxu32 iBucket;
459 /* Zero the new table */
460 SyZero((void *)apNew, nNewSize * sizeof(Page *));
461 /* Rehash all entries */
462 n = 0;
463 pEntry = pPager->pAll;
464 for(;;){
465 /* Loop one */
466 if( n >= pPager->nPage ){
467 break;
468 }
469 pEntry->pNextCollide = pEntry->pPrevCollide = 0;
470 /* Install in the new bucket */
471 iBucket = PAGE_HASH(pEntry->pgno) & (nNewSize - 1);
472 pEntry->pNextCollide = apNew[iBucket];
473 if( apNew[iBucket] ){
474 apNew[iBucket]->pPrevCollide = pEntry;
475 }
476 apNew[iBucket] = pEntry;
477 /* Point to the next entry */
478 pEntry = pEntry->pNext;
479 n++;
480 }
481 /* Release the old table and reflect the change */
482 SyMemBackendFree(pPager->pAllocator,(void *)pPager->apHash);
483 pPager->apHash = apNew;
484 pPager->nSize = nNewSize;
485 }
486 }
487 return UNQLITE_OK;
488}
489/*
490 * Unlink a page from the list of active pages.
491 */
492static int pager_unlink_page(Pager *pPager,Page *pPage)
493{
494 if( pPage->pNextCollide ){
495 pPage->pNextCollide->pPrevCollide = pPage->pPrevCollide;
496 }
497 if( pPage->pPrevCollide ){
498 pPage->pPrevCollide->pNextCollide = pPage->pNextCollide;
499 }else{
500 sxu32 nBucket = PAGE_HASH(pPage->pgno) & (pPager->nSize - 1);
501 pPager->apHash[nBucket] = pPage->pNextCollide;
502 }
503 MACRO_LD_REMOVE(pPager->pAll,pPage);
504 pPager->nPage--;
505 return UNQLITE_OK;
506}
507/*
508 * Update the content of a cached page.
509 */
510static int pager_fill_page(Pager *pPager,pgno iNum,void *pContents)
511{
512 Page *pPage;
513 /* Fetch the page from the catch */
514 pPage = pager_fetch_page(pPager,iNum);
515 if( pPage == 0 ){
516 return SXERR_NOTFOUND;
517 }
518 /* Reflect the change */
519 SyMemcpy(pContents,pPage->zData,pPager->iPageSize);
520
521 return UNQLITE_OK;
522}
523/*
524 * Read the content of a page from disk.
525 */
526static int pager_get_page_contents(Pager *pPager,Page *pPage,int noContent)
527{
528 int rc = UNQLITE_OK;
529 if( pPager->is_mem || noContent || pPage->pgno >= pPager->dbSize ){
530 /* Do not bother reading, zero the page contents only */
531 SyZero(pPage->zData,pPager->iPageSize);
532 return UNQLITE_OK;
533 }
534 if( (pPager->iOpenFlags & UNQLITE_OPEN_MMAP) && (pPager->pMmap /* Paranoid edition */) ){
535 unsigned char *zMap = (unsigned char *)pPager->pMmap;
536 pPage->zData = &zMap[pPage->pgno * pPager->iPageSize];
537 }else{
538 /* Read content */
539 rc = unqliteOsRead(pPager->pfd,pPage->zData,pPager->iPageSize,pPage->pgno * pPager->iPageSize);
540 }
541 return rc;
542}
543/*
544 * Add a page to the dirty list.
545 */
546static void pager_page_to_dirty_list(Pager *pPager,Page *pPage)
547{
548 if( pPage->flags & PAGE_DIRTY ){
549 /* Already set */
550 return;
551 }
552 /* Mark the page as dirty */
553 pPage->flags |= PAGE_DIRTY|PAGE_NEED_SYNC|PAGE_IN_JOURNAL;
554 /* Link to the list */
555 pPage->pDirtyPrev = 0;
556 pPage->pDirtyNext = pPager->pDirty;
557 if( pPager->pDirty ){
558 pPager->pDirty->pDirtyPrev = pPage;
559 }
560 pPager->pDirty = pPage;
561 if( pPager->pFirstDirty == 0 ){
562 pPager->pFirstDirty = pPage;
563 }
564}
565/*
566 * Merge sort.
567 * The merge sort implementation is based on the one used by
568 * the PH7 Embeddable PHP Engine (http://ph7.symisc.net/).
569 */
570/*
571** Inputs:
572** a: A sorted, null-terminated linked list. (May be null).
573** b: A sorted, null-terminated linked list. (May be null).
574** cmp: A pointer to the comparison function.
575**
576** Return Value:
577** A pointer to the head of a sorted list containing the elements
578** of both a and b.
579**
580** Side effects:
581** The "next", "prev" pointers for elements in the lists a and b are
582** changed.
583*/
584static Page * page_merge_dirty(Page *pA, Page *pB)
585{
586 Page result, *pTail;
587 /* Prevent compiler warning */
588 result.pDirtyNext = result.pDirtyPrev = 0;
589 pTail = &result;
590 while( pA && pB ){
591 if( pA->pgno < pB->pgno ){
592 pTail->pDirtyPrev = pA;
593 pA->pDirtyNext = pTail;
594 pTail = pA;
595 pA = pA->pDirtyPrev;
596 }else{
597 pTail->pDirtyPrev = pB;
598 pB->pDirtyNext = pTail;
599 pTail = pB;
600 pB = pB->pDirtyPrev;
601 }
602 }
603 if( pA ){
604 pTail->pDirtyPrev = pA;
605 pA->pDirtyNext = pTail;
606 }else if( pB ){
607 pTail->pDirtyPrev = pB;
608 pB->pDirtyNext = pTail;
609 }else{
610 pTail->pDirtyPrev = pTail->pDirtyNext = 0;
611 }
612 return result.pDirtyPrev;
613}
614/*
615** Inputs:
616** Map: Input hashmap
617** cmp: A comparison function.
618**
619** Return Value:
620** Sorted hashmap.
621**
622** Side effects:
623** The "next" pointers for elements in list are changed.
624*/
625#define N_SORT_BUCKET 32
626static Page * pager_get_dirty_pages(Pager *pPager)
627{
628 Page *a[N_SORT_BUCKET], *p, *pIn;
629 sxu32 i;
630 if( pPager->pFirstDirty == 0 ){
631 /* Don't bother sorting, the list is already empty */
632 return 0;
633 }
634 SyZero(a, sizeof(a));
635 /* Point to the first inserted entry */
636 pIn = pPager->pFirstDirty;
637 while( pIn ){
638 p = pIn;
639 pIn = p->pDirtyPrev;
640 p->pDirtyPrev = 0;
641 for(i=0; i<N_SORT_BUCKET-1; i++){
642 if( a[i]==0 ){
643 a[i] = p;
644 break;
645 }else{
646 p = page_merge_dirty(a[i], p);
647 a[i] = 0;
648 }
649 }
650 if( i==N_SORT_BUCKET-1 ){
651 /* To get here, there need to be 2^(N_SORT_BUCKET) elements in he input list.
652 * But that is impossible.
653 */
654 a[i] = page_merge_dirty(a[i], p);
655 }
656 }
657 p = a[0];
658 for(i=1; i<N_SORT_BUCKET; i++){
659 p = page_merge_dirty(p,a[i]);
660 }
661 p->pDirtyNext = 0;
662 return p;
663}
664/*
665 * See block comment above.
666 */
667static Page * page_merge_hot(Page *pA, Page *pB)
668{
669 Page result, *pTail;
670 /* Prevent compiler warning */
671 result.pNextHot = result.pPrevHot = 0;
672 pTail = &result;
673 while( pA && pB ){
674 if( pA->pgno < pB->pgno ){
675 pTail->pPrevHot = pA;
676 pA->pNextHot = pTail;
677 pTail = pA;
678 pA = pA->pPrevHot;
679 }else{
680 pTail->pPrevHot = pB;
681 pB->pNextHot = pTail;
682 pTail = pB;
683 pB = pB->pPrevHot;
684 }
685 }
686 if( pA ){
687 pTail->pPrevHot = pA;
688 pA->pNextHot = pTail;
689 }else if( pB ){
690 pTail->pPrevHot = pB;
691 pB->pNextHot = pTail;
692 }else{
693 pTail->pPrevHot = pTail->pNextHot = 0;
694 }
695 return result.pPrevHot;
696}
697/*
698** Inputs:
699** Map: Input hashmap
700** cmp: A comparison function.
701**
702** Return Value:
703** Sorted hashmap.
704**
705** Side effects:
706** The "next" pointers for elements in list are changed.
707*/
708#define N_SORT_BUCKET 32
709static Page * pager_get_hot_pages(Pager *pPager)
710{
711 Page *a[N_SORT_BUCKET], *p, *pIn;
712 sxu32 i;
713 if( pPager->pFirstHot == 0 ){
714 /* Don't bother sorting, the list is already empty */
715 return 0;
716 }
717 SyZero(a, sizeof(a));
718 /* Point to the first inserted entry */
719 pIn = pPager->pFirstHot;
720 while( pIn ){
721 p = pIn;
722 pIn = p->pPrevHot;
723 p->pPrevHot = 0;
724 for(i=0; i<N_SORT_BUCKET-1; i++){
725 if( a[i]==0 ){
726 a[i] = p;
727 break;
728 }else{
729 p = page_merge_hot(a[i], p);
730 a[i] = 0;
731 }
732 }
733 if( i==N_SORT_BUCKET-1 ){
734 /* To get here, there need to be 2^(N_SORT_BUCKET) elements in he input list.
735 * But that is impossible.
736 */
737 a[i] = page_merge_hot(a[i], p);
738 }
739 }
740 p = a[0];
741 for(i=1; i<N_SORT_BUCKET; i++){
742 p = page_merge_hot(p,a[i]);
743 }
744 p->pNextHot = 0;
745 return p;
746}
747/*
748** The format for the journal header is as follows:
749** - 8 bytes: Magic identifying journal format.
750** - 4 bytes: Number of records in journal.
751** - 4 bytes: Random number used for page hash.
752** - 8 bytes: Initial database page count.
753** - 4 bytes: Sector size used by the process that wrote this journal.
754** - 4 bytes: Database page size.
755**
** Followed by (JOURNAL_HDR_SZ - 32) bytes of unused space.
757*/
758/*
759** Open the journal file and extract its header information.
760**
761** If the header is read successfully, *pNRec is set to the number of
762** page records following this header and *pDbSize is set to the size of the
763** database before the transaction began, in pages. Also, pPager->cksumInit
764** is set to the value read from the journal header. UNQLITE_OK is returned
765** in this case.
766**
767** If the journal header file appears to be corrupted, UNQLITE_DONE is
768** returned and *pNRec and *PDbSize are undefined. If JOURNAL_HDR_SZ bytes
769** cannot be read from the journal file an error code is returned.
770*/
771static int pager_read_journal_header(
772 Pager *pPager, /* Pager object */
773 sxu32 *pNRec, /* OUT: Value read from the nRec field */
774 pgno *pDbSize /* OUT: Value of original database size field */
775)
776{
777 sxu32 iPageSize,iSectorSize;
778 unsigned char zMagic[8];
779 sxi64 iHdrOfft;
780 sxi64 iSize;
781 int rc;
782 /* Offset to start reading from */
783 iHdrOfft = 0;
784 /* Get the size of the journal */
785 rc = unqliteOsFileSize(pPager->pjfd,&iSize);
786 if( rc != UNQLITE_OK ){
787 return UNQLITE_DONE;
788 }
789 /* If the journal file is too small, return UNQLITE_DONE. */
790 if( 32 /* Minimum sector size */> iSize ){
791 return UNQLITE_DONE;
792 }
793 /* Make sure we are dealing with a valid journal */
794 rc = unqliteOsRead(pPager->pjfd,zMagic,sizeof(zMagic),iHdrOfft);
795 if( rc != UNQLITE_OK ){
796 return rc;
797 }
798 if( SyMemcmp(zMagic,aJournalMagic,sizeof(zMagic)) != 0 ){
799 return UNQLITE_DONE;
800 }
801 iHdrOfft += sizeof(zMagic);
802 /* Read the first three 32-bit fields of the journal header: The nRec
803 ** field, the checksum-initializer and the database size at the start
804 ** of the transaction. Return an error code if anything goes wrong.
805 */
806 rc = ReadInt32(pPager->pjfd,pNRec,iHdrOfft);
807 if( rc != UNQLITE_OK ){
808 return rc;
809 }
810 iHdrOfft += 4;
811 rc = ReadInt32(pPager->pjfd,&pPager->cksumInit,iHdrOfft);
812 if( rc != UNQLITE_OK ){
813 return rc;
814 }
815 iHdrOfft += 4;
816 rc = ReadInt64(pPager->pjfd,pDbSize,iHdrOfft);
817 if( rc != UNQLITE_OK ){
818 return rc;
819 }
820 iHdrOfft += 8;
821 /* Read the page-size and sector-size journal header fields. */
822 rc = ReadInt32(pPager->pjfd,&iSectorSize,iHdrOfft);
823 if( rc != UNQLITE_OK ){
824 return rc;
825 }
826 iHdrOfft += 4;
827 rc = ReadInt32(pPager->pjfd,&iPageSize,iHdrOfft);
828 if( rc != UNQLITE_OK ){
829 return rc;
830 }
831 /* Check that the values read from the page-size and sector-size fields
832 ** are within range. To be 'in range', both values need to be a power
833 ** of two greater than or equal to 512 or 32, and not greater than their
834 ** respective compile time maximum limits.
835 */
836 if( iPageSize < UNQLITE_MIN_PAGE_SIZE || iSectorSize<32
837 || iPageSize > UNQLITE_MAX_PAGE_SIZE || iSectorSize>MAX_SECTOR_SIZE
838 || ((iPageSize-1)&iPageSize)!=0 || ((iSectorSize-1)&iSectorSize)!=0
839 ){
840 /* If the either the page-size or sector-size in the journal-header is
841 ** invalid, then the process that wrote the journal-header must have
842 ** crashed before the header was synced. In this case stop reading
843 ** the journal file here.
844 */
845 return UNQLITE_DONE;
846 }
847 /* Update the assumed sector-size to match the value used by
848 ** the process that created this journal. If this journal was
849 ** created by a process other than this one, then this routine
850 ** is being called from within pager_playback(). The local value
851 ** of Pager.sectorSize is restored at the end of that routine.
852 */
853 pPager->iSectorSize = iSectorSize;
854 pPager->iPageSize = iPageSize;
855 /* Ready to rollback */
856 pPager->iJournalOfft = JOURNAL_HDR_SZ(pPager);
857 /* All done */
858 return UNQLITE_OK;
859}
860/*
861 * Write the journal header in the given memory buffer.
862 * The given buffer is big enough to hold the whole header.
863 */
864static int pager_write_journal_header(Pager *pPager,unsigned char *zBuf)
865{
866 unsigned char *zPtr = zBuf;
867 /* 8 bytes magic number */
868 SyMemcpy(aJournalMagic,zPtr,sizeof(aJournalMagic));
869 zPtr += sizeof(aJournalMagic);
870 /* 4 bytes: Number of records in journal. */
871 SyBigEndianPack32(zPtr,0);
872 zPtr += 4;
873 /* 4 bytes: Random number used to compute page checksum. */
874 SyBigEndianPack32(zPtr,pPager->cksumInit);
875 zPtr += 4;
876 /* 8 bytes: Initial database page count. */
877 SyBigEndianPack64(zPtr,pPager->dbOrigSize);
878 zPtr += 8;
879 /* 4 bytes: Sector size used by the process that wrote this journal. */
880 SyBigEndianPack32(zPtr,(sxu32)pPager->iSectorSize);
881 zPtr += 4;
882 /* 4 bytes: Database page size. */
883 SyBigEndianPack32(zPtr,(sxu32)pPager->iPageSize);
884 return UNQLITE_OK;
885}
886/*
887** Parameter aData must point to a buffer of pPager->pageSize bytes
888** of data. Compute and return a checksum based ont the contents of the
889** page of data and the current value of pPager->cksumInit.
890**
891** This is not a real checksum. It is really just the sum of the
892** random initial value (pPager->cksumInit) and every 200th byte
893** of the page data, starting with byte offset (pPager->pageSize%200).
894** Each byte is interpreted as an 8-bit unsigned integer.
895**
896** Changing the formula used to compute this checksum results in an
897** incompatible journal file format.
898**
899** If journal corruption occurs due to a power failure, the most likely
900** scenario is that one end or the other of the record will be changed.
901** It is much less likely that the two ends of the journal record will be
902** correct and the middle be corrupt. Thus, this "checksum" scheme,
903** though fast and simple, catches the mostly likely kind of corruption.
904*/
905static sxu32 pager_cksum(Pager *pPager,const unsigned char *zData)
906{
907 sxu32 cksum = pPager->cksumInit; /* Checksum value to return */
908 int i = pPager->iPageSize-200; /* Loop counter */
909 while( i>0 ){
910 cksum += zData[i];
911 i -= 200;
912 }
913 return cksum;
914}
915/*
916** Read a single page from the journal file opened on file descriptor
917** jfd. Playback this one page. Update the offset to read from.
918*/
919static int pager_play_back_one_page(Pager *pPager,sxi64 *pOfft,unsigned char *zTmp)
920{
921 unsigned char *zData = zTmp;
922 sxi64 iOfft; /* Offset to read from */
923 pgno iNum; /* Pager number */
924 sxu32 ckSum; /* Sanity check */
925 int rc;
926 /* Offset to start reading from */
927 iOfft = *pOfft;
928 /* Database page number */
929 rc = ReadInt64(pPager->pjfd,&iNum,iOfft);
930 if( rc != UNQLITE_OK ){ return rc; }
931 iOfft += 8;
932 /* Page data */
933 rc = unqliteOsRead(pPager->pjfd,zData,pPager->iPageSize,iOfft);
934 if( rc != UNQLITE_OK ){ return rc; }
935 iOfft += pPager->iPageSize;
936 /* Page cksum */
937 rc = ReadInt32(pPager->pjfd,&ckSum,iOfft);
938 if( rc != UNQLITE_OK ){ return rc; }
939 iOfft += 4;
940 /* Synchronize pointers */
941 *pOfft = iOfft;
942 /* Make sure we are dealing with a valid page */
943 if( ckSum != pager_cksum(pPager,zData) ){
944 /* Ignore that page */
945 return SXERR_IGNORE;
946 }
947 if( iNum >= pPager->dbSize ){
948 /* Ignore that page */
949 return UNQLITE_OK;
950 }
951 /* playback */
952 rc = unqliteOsWrite(pPager->pfd,zData,pPager->iPageSize,iNum * pPager->iPageSize);
953 if( rc == UNQLITE_OK ){
954 /* Flush the cache */
955 pager_fill_page(pPager,iNum,zData);
956 }
957 return rc;
958}
959/*
960** Playback the journal and thus restore the database file to
961** the state it was in before we started making changes.
962**
963** The journal file format is as follows:
964**
965** (1) 8 byte prefix. A copy of aJournalMagic[].
966** (2) 4 byte big-endian integer which is the number of valid page records
967** in the journal.
968** (3) 4 byte big-endian integer which is the initial value for the
969** sanity checksum.
970** (4) 8 byte integer which is the number of pages to truncate the
971** database to during a rollback.
972** (5) 4 byte big-endian integer which is the sector size. The header
973** is this many bytes in size.
974** (6) 4 byte big-endian integer which is the page size.
975** (7) zero padding out to the next sector size.
976** (8) Zero or more pages instances, each as follows:
977** + 4 byte page number.
978** + pPager->pageSize bytes of data.
979** + 4 byte checksum
980**
981** When we speak of the journal header, we mean the first 7 items above.
982** Each entry in the journal is an instance of the 8th item.
983**
984** Call the value from the second bullet "nRec". nRec is the number of
985** valid page entries in the journal. In most cases, you can compute the
986** value of nRec from the size of the journal file. But if a power
987** failure occurred while the journal was being written, it could be the
988** case that the size of the journal file had already been increased but
989** the extra entries had not yet made it safely to disk. In such a case,
990** the value of nRec computed from the file size would be too large. For
991** that reason, we always use the nRec value in the header.
992**
993** If the file opened as the journal file is not a well-formed
994** journal file then all pages up to the first corrupted page are rolled
995** back (or no pages if the journal header is corrupted). The journal file
996** is then deleted and SQLITE_OK returned, just as if no corruption had
997** been encountered.
998**
999** If an I/O or malloc() error occurs, the journal-file is not deleted
1000** and an error code is returned.
1001**
1002*/
1003static int pager_playback(Pager *pPager)
1004{
1005 unsigned char *zTmp = 0; /* cc warning */
1006 sxu32 n,nRec;
1007 sxi64 iOfft;
1008 int rc;
1009 /* Read the journal header*/
1010 rc = pager_read_journal_header(pPager,&nRec,&pPager->dbSize);
1011 if( rc != UNQLITE_OK ){
1012 if( rc == UNQLITE_DONE ){
1013 goto end_playback;
1014 }
1015 unqliteGenErrorFormat(pPager->pDb,"IO error while reading journal file '%s' header",pPager->zJournal);
1016 return rc;
1017 }
1018 /* Truncate the database back to its original size */
1019 rc = unqliteOsTruncate(pPager->pfd,pPager->iPageSize * pPager->dbSize);
1020 if( rc != UNQLITE_OK ){
1021 unqliteGenError(pPager->pDb,"IO error while truncating database file");
1022 return rc;
1023 }
1024 /* Allocate a temporary page */
1025 zTmp = (unsigned char *)SyMemBackendAlloc(pPager->pAllocator,(sxu32)pPager->iPageSize);
1026 if( zTmp == 0 ){
1027 unqliteGenOutofMem(pPager->pDb);
1028 return UNQLITE_NOMEM;
1029 }
1030 SyZero((void *)zTmp,(sxu32)pPager->iPageSize);
1031 /* Copy original pages out of the journal and back into the
1032 ** database file and/or page cache.
1033 */
1034 iOfft = pPager->iJournalOfft;
1035 for( n = 0 ; n < nRec ; ++n ){
1036 rc = pager_play_back_one_page(pPager,&iOfft,zTmp);
1037 if( rc != UNQLITE_OK ){
1038 if( rc != SXERR_IGNORE ){
1039 unqliteGenError(pPager->pDb,"Page playback error");
1040 goto end_playback;
1041 }
1042 }
1043 }
1044end_playback:
1045 /* Release the temp page */
1046 SyMemBackendFree(pPager->pAllocator,(void *)zTmp);
1047 if( rc == UNQLITE_OK ){
1048 /* Sync the database file */
1049 unqliteOsSync(pPager->pfd,UNQLITE_SYNC_FULL);
1050 }
1051 if( rc == UNQLITE_DONE ){
1052 rc = UNQLITE_OK;
1053 }
1054 /* Return to the caller */
1055 return rc;
1056}
1057/*
1058** Unlock the database file to level eLock, which must be either NO_LOCK
1059** or SHARED_LOCK. Regardless of whether or not the call to xUnlock()
1060** succeeds, set the Pager.iLock variable to match the (attempted) new lock.
1061**
1062** Except, if Pager.iLock is set to NO_LOCK when this function is
1063** called, do not modify it. See the comment above the #define of
1064** NO_LOCK for an explanation of this.
1065*/
1066static int pager_unlock_db(Pager *pPager, int eLock)
1067{
1068 int rc = UNQLITE_OK;
1069 if( pPager->iLock != NO_LOCK ){
1070 rc = unqliteOsUnlock(pPager->pfd,eLock);
1071 pPager->iLock = eLock;
1072 }
1073 return rc;
1074}
1075/*
1076** Lock the database file to level eLock, which must be either SHARED_LOCK,
1077** RESERVED_LOCK or EXCLUSIVE_LOCK. If the caller is successful, set the
1078** Pager.eLock variable to the new locking state.
1079**
1080** Except, if Pager.eLock is set to NO_LOCK when this function is
1081** called, do not modify it unless the new locking state is EXCLUSIVE_LOCK.
1082** See the comment above the #define of NO_LOCK for an explanation
1083** of this.
1084*/
1085static int pager_lock_db(Pager *pPager, int eLock){
1086 int rc = UNQLITE_OK;
1087 if( pPager->iLock < eLock || pPager->iLock == NO_LOCK ){
1088 rc = unqliteOsLock(pPager->pfd, eLock);
1089 if( rc==UNQLITE_OK ){
1090 pPager->iLock = eLock;
1091 }else{
1092 unqliteGenError(pPager->pDb,
1093 rc == UNQLITE_BUSY ? "Another process or thread hold the requested lock" : "Error while requesting database lock"
1094 );
1095 }
1096 }
1097 return rc;
1098}
1099/*
1100** Try to obtain a lock of type locktype on the database file. If
1101** a similar or greater lock is already held, this function is a no-op
1102** (returning UNQLITE_OK immediately).
1103**
1104** Otherwise, attempt to obtain the lock using unqliteOsLock(). Invoke
1105** the busy callback if the lock is currently not available. Repeat
1106** until the busy callback returns false or until the attempt to
1107** obtain the lock succeeds.
1108**
1109** Return UNQLITE_OK on success and an error code if we cannot obtain
1110** the lock. If the lock is obtained successfully, set the Pager.state
1111** variable to locktype before returning.
1112*/
1113static int pager_wait_on_lock(Pager *pPager, int locktype){
1114 int rc; /* Return code */
1115 do {
1116 rc = pager_lock_db(pPager,locktype);
1117 }while( rc==UNQLITE_BUSY && pPager->xBusyHandler && pPager->xBusyHandler(pPager->pBusyHandlerArg) );
1118 return rc;
1119}
1120/*
1121** This function is called after transitioning from PAGER_OPEN to
1122** PAGER_SHARED state. It tests if there is a hot journal present in
1123** the file-system for the given pager. A hot journal is one that
1124** needs to be played back. According to this function, a hot-journal
1125** file exists if the following criteria are met:
1126**
1127** * The journal file exists in the file system, and
1128** * No process holds a RESERVED or greater lock on the database file, and
1129** * The database file itself is greater than 0 bytes in size, and
1130** * The first byte of the journal file exists and is not 0x00.
1131**
1132** If the current size of the database file is 0 but a journal file
1133** exists, that is probably an old journal left over from a prior
1134** database with the same name. In this case the journal file is
1135** just deleted using OsDelete, *pExists is set to 0 and UNQLITE_OK
1136** is returned.
1137**
1138** If a hot-journal file is found to exist, *pExists is set to 1 and
1139** UNQLITE_OK returned. If no hot-journal file is present, *pExists is
1140** set to 0 and UNQLITE_OK returned. If an IO error occurs while trying
1141** to determine whether or not a hot-journal file exists, the IO error
1142** code is returned and the value of *pExists is undefined.
1143*/
1144static int pager_has_hot_journal(Pager *pPager, int *pExists)
1145{
1146 unqlite_vfs *pVfs = pPager->pVfs;
1147 int rc = UNQLITE_OK; /* Return code */
1148 int exists = 1; /* True if a journal file is present */
1149
1150 *pExists = 0;
1151 rc = unqliteOsAccess(pVfs, pPager->zJournal, UNQLITE_ACCESS_EXISTS, &exists);
1152 if( rc==UNQLITE_OK && exists ){
1153 int locked = 0; /* True if some process holds a RESERVED lock */
1154
1155 /* Race condition here: Another process might have been holding the
1156 ** the RESERVED lock and have a journal open at the unqliteOsAccess()
1157 ** call above, but then delete the journal and drop the lock before
1158 ** we get to the following unqliteOsCheckReservedLock() call. If that
1159 ** is the case, this routine might think there is a hot journal when
1160 ** in fact there is none. This results in a false-positive which will
1161 ** be dealt with by the playback routine.
1162 */
1163 rc = unqliteOsCheckReservedLock(pPager->pfd, &locked);
1164 if( rc==UNQLITE_OK && !locked ){
1165 sxi64 n = 0; /* Size of db file in bytes */
1166
1167 /* Check the size of the database file. If it consists of 0 pages,
1168 ** then delete the journal file. See the header comment above for
1169 ** the reasoning here. Delete the obsolete journal file under
1170 ** a RESERVED lock to avoid race conditions.
1171 */
1172 rc = unqliteOsFileSize(pPager->pfd,&n);
1173 if( rc==UNQLITE_OK ){
1174 if( n < 1 ){
1175 if( pager_lock_db(pPager, RESERVED_LOCK)==UNQLITE_OK ){
1176 unqliteOsDelete(pVfs, pPager->zJournal, 0);
1177 pager_unlock_db(pPager, SHARED_LOCK);
1178 }
1179 }else{
1180 /* The journal file exists and no other connection has a reserved
1181 ** or greater lock on the database file. */
1182 *pExists = 1;
1183 }
1184 }
1185 }
1186 }
1187 return rc;
1188}
1189/*
1190 * Rollback a journal file. (See block-comment above).
1191 */
1192static int pager_journal_rollback(Pager *pPager,int check_hot)
1193{
1194 int rc;
1195 if( check_hot ){
1196 int iExists = 0; /* cc warning */
1197 /* Check if the journal file exists */
1198 rc = pager_has_hot_journal(pPager,&iExists);
1199 if( rc != UNQLITE_OK ){
1200 /* IO error */
1201 return rc;
1202 }
1203 if( !iExists ){
1204 /* Journal file does not exists */
1205 return UNQLITE_OK;
1206 }
1207 }
1208 if( pPager->is_rdonly ){
1209 unqliteGenErrorFormat(pPager->pDb,
1210 "Cannot rollback journal file '%s' due to a read-only database handle",pPager->zJournal);
1211 return UNQLITE_READ_ONLY;
1212 }
1213 /* Get an EXCLUSIVE lock on the database file. At this point it is
1214 ** important that a RESERVED lock is not obtained on the way to the
1215 ** EXCLUSIVE lock. If it were, another process might open the
1216 ** database file, detect the RESERVED lock, and conclude that the
1217 ** database is safe to read while this process is still rolling the
1218 ** hot-journal back.
1219 **
1220 ** Because the intermediate RESERVED lock is not requested, any
1221 ** other process attempting to access the database file will get to
1222 ** this point in the code and fail to obtain its own EXCLUSIVE lock
1223 ** on the database file.
1224 **
1225 ** Unless the pager is in locking_mode=exclusive mode, the lock is
1226 ** downgraded to SHARED_LOCK before this function returns.
1227 */
1228 /* Open the journal file */
1229 rc = unqliteOsOpen(pPager->pVfs,pPager->pAllocator,pPager->zJournal,&pPager->pjfd,UNQLITE_OPEN_READWRITE);
1230 if( rc != UNQLITE_OK ){
1231 unqliteGenErrorFormat(pPager->pDb,"IO error while opening journal file: '%s'",pPager->zJournal);
1232 goto fail;
1233 }
1234 rc = pager_lock_db(pPager,EXCLUSIVE_LOCK);
1235 if( rc != UNQLITE_OK ){
1236 unqliteGenError(pPager->pDb,"Cannot acquire an exclusive lock on the database while journal rollback");
1237 goto fail;
1238 }
1239 /* Sync the journal file */
1240 unqliteOsSync(pPager->pjfd,UNQLITE_SYNC_NORMAL);
1241 /* Finally rollback the database */
1242 rc = pager_playback(pPager);
1243 /* Switch back to shared lock */
1244 pager_unlock_db(pPager,SHARED_LOCK);
1245fail:
1246 /* Close the journal handle */
1247 unqliteOsCloseFree(pPager->pAllocator,pPager->pjfd);
1248 pPager->pjfd = 0;
1249 if( rc == UNQLITE_OK ){
1250 /* Delete the journal file */
1251 unqliteOsDelete(pPager->pVfs,pPager->zJournal,TRUE);
1252 }
1253 return rc;
1254}
1255/*
1256 * Write the unqlite header (First page). (Big-Endian)
1257 */
1258static int pager_write_db_header(Pager *pPager)
1259{
1260 unsigned char *zRaw = pPager->pHeader->zData;
1261 unqlite_kv_engine *pEngine = pPager->pEngine;
1262 sxu32 nDos;
1263 sxu16 nLen;
1264 /* Database signature */
1265 SyMemcpy(UNQLITE_DB_SIG,zRaw,sizeof(UNQLITE_DB_SIG)-1);
1266 zRaw += sizeof(UNQLITE_DB_SIG)-1;
1267 /* Database magic number */
1268 SyBigEndianPack32(zRaw,UNQLITE_DB_MAGIC);
1269 zRaw += 4; /* 4 byte magic number */
1270 /* Database creation time */
1271 SyZero(&pPager->tmCreate,sizeof(Sytm));
1272 if( pPager->pVfs->xCurrentTime ){
1273 pPager->pVfs->xCurrentTime(pPager->pVfs,&pPager->tmCreate);
1274 }
1275 /* DOS time format (4 bytes) */
1276 SyTimeFormatToDos(&pPager->tmCreate,&nDos);
1277 SyBigEndianPack32(zRaw,nDos);
1278 zRaw += 4; /* 4 byte DOS time */
1279 /* Sector size */
1280 SyBigEndianPack32(zRaw,(sxu32)pPager->iSectorSize);
1281 zRaw += 4; /* 4 byte sector size */
1282 /* Page size */
1283 SyBigEndianPack32(zRaw,(sxu32)pPager->iPageSize);
1284 zRaw += 4; /* 4 byte page size */
1285 /* Key value storage engine */
1286 nLen = (sxu16)SyStrlen(pEngine->pIo->pMethods->zName);
1287 SyBigEndianPack16(zRaw,nLen); /* 2 byte storage engine name */
1288 zRaw += 2;
1289 SyMemcpy((const void *)pEngine->pIo->pMethods->zName,(void *)zRaw,nLen);
1290 zRaw += nLen;
1291 /* All rest are meta-data available to the host application */
1292 return UNQLITE_OK;
1293}
1294/*
1295 * Read the unqlite header (first page). (Big-Endian)
1296 */
1297static int pager_extract_header(Pager *pPager,const unsigned char *zRaw,sxu32 nByte)
1298{
1299 const unsigned char *zEnd = &zRaw[nByte];
1300 sxu32 nDos,iMagic;
1301 sxu16 nLen;
1302 char *zKv;
1303 /* Database signature */
1304 if( SyMemcmp(UNQLITE_DB_SIG,zRaw,sizeof(UNQLITE_DB_SIG)-1) != 0 ){
1305 /* Corrupt database */
1306 return UNQLITE_CORRUPT;
1307 }
1308 zRaw += sizeof(UNQLITE_DB_SIG)-1;
1309 /* Database magic number */
1310 SyBigEndianUnpack32(zRaw,&iMagic);
1311 zRaw += 4; /* 4 byte magic number */
1312 if( iMagic != UNQLITE_DB_MAGIC ){
1313 /* Corrupt database */
1314 return UNQLITE_CORRUPT;
1315 }
1316 /* Database creation time */
1317 SyBigEndianUnpack32(zRaw,&nDos);
1318 zRaw += 4; /* 4 byte DOS time format */
1319 SyDosTimeFormat(nDos,&pPager->tmCreate);
1320 /* Sector size */
1321 SyBigEndianUnpack32(zRaw,(sxu32 *)&pPager->iSectorSize);
1322 zRaw += 4; /* 4 byte sector size */
1323 /* Page size */
1324 SyBigEndianUnpack32(zRaw,(sxu32 *)&pPager->iPageSize);
1325 zRaw += 4; /* 4 byte page size */
1326 /* Check that the values read from the page-size and sector-size fields
1327 ** are within range. To be 'in range', both values need to be a power
1328 ** of two greater than or equal to 512 or 32, and not greater than their
1329 ** respective compile time maximum limits.
1330 */
1331 if( pPager->iPageSize<UNQLITE_MIN_PAGE_SIZE || pPager->iSectorSize<32
1332 || pPager->iPageSize>UNQLITE_MAX_PAGE_SIZE || pPager->iSectorSize>MAX_SECTOR_SIZE
1333 || ((pPager->iPageSize<-1)&pPager->iPageSize)!=0 || ((pPager->iSectorSize-1)&pPager->iSectorSize)!=0
1334 ){
1335 return UNQLITE_CORRUPT;
1336 }
1337 /* Key value storage engine */
1338 SyBigEndianUnpack16(zRaw,&nLen); /* 2 byte storage engine length */
1339 zRaw += 2;
1340 if( nLen > (sxu16)(zEnd - zRaw) ){
1341 nLen = (sxu16)(zEnd - zRaw);
1342 }
1343 zKv = (char *)SyMemBackendDup(pPager->pAllocator,(const char *)zRaw,nLen);
1344 if( zKv == 0 ){
1345 return UNQLITE_NOMEM;
1346 }
1347 SyStringInitFromBuf(&pPager->sKv,zKv,nLen);
1348 return UNQLITE_OK;
1349}
1350/*
1351 * Read the database header.
1352 */
1353static int pager_read_db_header(Pager *pPager)
1354{
1355 unsigned char zRaw[UNQLITE_MIN_PAGE_SIZE]; /* Minimum page size */
1356 sxi64 n = 0; /* Size of db file in bytes */
1357 int rc;
1358 /* Get the file size first */
1359 rc = unqliteOsFileSize(pPager->pfd,&n);
1360 if( rc != UNQLITE_OK ){
1361 return rc;
1362 }
1363 pPager->dbByteSize = n;
1364 if( n > 0 ){
1365 unqlite_kv_methods *pMethods;
1366 SyString *pKv;
1367 pgno nPage;
1368 if( n < UNQLITE_MIN_PAGE_SIZE ){
1369 /* A valid unqlite database must be at least 512 bytes long */
1370 unqliteGenError(pPager->pDb,"Malformed database image");
1371 return UNQLITE_CORRUPT;
1372 }
1373 /* Read the database header */
1374 rc = unqliteOsRead(pPager->pfd,zRaw,sizeof(zRaw),0);
1375 if( rc != UNQLITE_OK ){
1376 unqliteGenError(pPager->pDb,"IO error while reading database header");
1377 return rc;
1378 }
1379 /* Extract the header */
1380 rc = pager_extract_header(pPager,zRaw,sizeof(zRaw));
1381 if( rc != UNQLITE_OK ){
1382 unqliteGenError(pPager->pDb,rc == UNQLITE_NOMEM ? "Unqlite is running out of memory" : "Malformed database image");
1383 return rc;
1384 }
1385 /* Update pager state */
1386 nPage = (pgno)(n / pPager->iPageSize);
1387 if( nPage==0 && n>0 ){
1388 nPage = 1;
1389 }
1390 pPager->dbSize = nPage;
1391 /* Laod the target Key/Value storage engine */
1392 pKv = &pPager->sKv;
1393 pMethods = unqliteFindKVStore(pKv->zString,pKv->nByte);
1394 if( pMethods == 0 ){
1395 unqliteGenErrorFormat(pPager->pDb,"No such Key/Value storage engine '%z'",pKv);
1396 return UNQLITE_NOTIMPLEMENTED;
1397 }
1398 /* Install the new KV storage engine */
1399 rc = unqlitePagerRegisterKvEngine(pPager,pMethods);
1400 if( rc != UNQLITE_OK ){
1401 return rc;
1402 }
1403 }else{
1404 /* Set a default page and sector size */
1405 pPager->iSectorSize = GetSectorSize(pPager->pfd);
1406 pPager->iPageSize = unqliteGetPageSize();
1407 SyStringInitFromBuf(&pPager->sKv,pPager->pEngine->pIo->pMethods->zName,SyStrlen(pPager->pEngine->pIo->pMethods->zName));
1408 pPager->dbSize = 0;
1409 }
1410 /* Allocate a temporary page size */
1411 pPager->zTmpPage = (unsigned char *)SyMemBackendAlloc(pPager->pAllocator,(sxu32)pPager->iPageSize);
1412 if( pPager->zTmpPage == 0 ){
1413 unqliteGenOutofMem(pPager->pDb);
1414 return UNQLITE_NOMEM;
1415 }
1416 SyZero(pPager->zTmpPage,(sxu32)pPager->iPageSize);
1417 return UNQLITE_OK;
1418}
1419/*
1420 * Write the database header.
1421 */
1422static int pager_create_header(Pager *pPager)
1423{
1424 Page *pHeader;
1425 int rc;
1426 /* Allocate a new page */
1427 pHeader = pager_alloc_page(pPager,0);
1428 if( pHeader == 0 ){
1429 return UNQLITE_NOMEM;
1430 }
1431 pPager->pHeader = pHeader;
1432 /* Link the page */
1433 pager_link_page(pPager,pHeader);
1434 /* Add to the dirty list */
1435 pager_page_to_dirty_list(pPager,pHeader);
1436 /* Write the database header */
1437 rc = pager_write_db_header(pPager);
1438 return rc;
1439}
1440/*
1441** This function is called to obtain a shared lock on the database file.
1442** It is illegal to call unqlitePagerAcquire() until after this function
1443** has been successfully called. If a shared-lock is already held when
1444** this function is called, it is a no-op.
1445**
1446** The following operations are also performed by this function.
1447**
1448** 1) If the pager is currently in PAGER_OPEN state (no lock held
1449** on the database file), then an attempt is made to obtain a
1450** SHARED lock on the database file. Immediately after obtaining
1451** the SHARED lock, the file-system is checked for a hot-journal,
1452** which is played back if present.
1453**
1454** If everything is successful, UNQLITE_OK is returned. If an IO error
1455** occurs while locking the database, checking for a hot-journal file or
1456** rolling back a journal file, the IO error code is returned.
1457*/
1458static int pager_shared_lock(Pager *pPager)
1459{
1460 int rc = UNQLITE_OK;
1461 if( pPager->iState == PAGER_OPEN ){
1462 unqlite_kv_methods *pMethods;
1463 /* Open the target database */
1464 rc = unqliteOsOpen(pPager->pVfs,pPager->pAllocator,pPager->zFilename,&pPager->pfd,pPager->iOpenFlags);
1465 if( rc != UNQLITE_OK ){
1466 unqliteGenErrorFormat(pPager->pDb,
1467 "IO error while opening the target database file: %s",pPager->zFilename
1468 );
1469 return rc;
1470 }
1471 /* Try to obtain a shared lock */
1472 rc = pager_wait_on_lock(pPager,SHARED_LOCK);
1473 if( rc == UNQLITE_OK ){
1474 if( pPager->iLock <= SHARED_LOCK ){
1475 /* Rollback any hot journal */
1476 rc = pager_journal_rollback(pPager,1);
1477 if( rc != UNQLITE_OK ){
1478 return rc;
1479 }
1480 }
1481 /* Read the database header */
1482 rc = pager_read_db_header(pPager);
1483 if( rc != UNQLITE_OK ){
1484 return rc;
1485 }
1486 if(pPager->dbSize > 0 ){
1487 if( pPager->iOpenFlags & UNQLITE_OPEN_MMAP ){
1488 const jx9_vfs *pVfs = jx9ExportBuiltinVfs();
1489 /* Obtain a read-only memory view of the whole file */
1490 if( pVfs && pVfs->xMmap ){
1491 int vr;
1492 vr = pVfs->xMmap(pPager->zFilename,&pPager->pMmap,&pPager->dbByteSize);
1493 if( vr != JX9_OK ){
1494 /* Generate a warning */
1495 unqliteGenError(pPager->pDb,"Cannot obtain a read-only memory view of the target database");
1496 pPager->iOpenFlags &= ~UNQLITE_OPEN_MMAP;
1497 }
1498 }else{
1499 /* Generate a warning */
1500 unqliteGenError(pPager->pDb,"Cannot obtain a read-only memory view of the target database");
1501 pPager->iOpenFlags &= ~UNQLITE_OPEN_MMAP;
1502 }
1503 }
1504 }
1505 /* Update the pager state */
1506 pPager->iState = PAGER_READER;
1507 /* Invoke the xOpen methods if available */
1508 pMethods = pPager->pEngine->pIo->pMethods;
1509 if( pMethods->xOpen ){
1510 rc = pMethods->xOpen(pPager->pEngine,pPager->dbSize);
1511 if( rc != UNQLITE_OK ){
1512 unqliteGenErrorFormat(pPager->pDb,
1513 "xOpen() method of the underlying KV engine '%z' failed",
1514 &pPager->sKv
1515 );
1516 pager_unlock_db(pPager,NO_LOCK);
1517 pPager->iState = PAGER_OPEN;
1518 return rc;
1519 }
1520 }
1521 }else if( rc == UNQLITE_BUSY ){
1522 unqliteGenError(pPager->pDb,"Another process or thread have a reserved or exclusive lock on this database");
1523 }
1524 }
1525 return rc;
1526}
1527/*
1528** Begin a write-transaction on the specified pager object. If a
1529** write-transaction has already been opened, this function is a no-op.
1530*/
1531UNQLITE_PRIVATE int unqlitePagerBegin(Pager *pPager)
1532{
1533 int rc;
1534 /* Obtain a shared lock on the database first */
1535 rc = pager_shared_lock(pPager);
1536 if( rc != UNQLITE_OK ){
1537 return rc;
1538 }
1539 if( pPager->iState >= PAGER_WRITER_LOCKED ){
1540 return UNQLITE_OK;
1541 }
1542 if( pPager->is_rdonly ){
1543 unqliteGenError(pPager->pDb,"Read-only database");
1544 /* Read only database */
1545 return UNQLITE_READ_ONLY;
1546 }
1547 /* Obtain a reserved lock on the database */
1548 rc = pager_wait_on_lock(pPager,RESERVED_LOCK);
1549 if( rc == UNQLITE_OK ){
1550 /* Create the bitvec */
1551 pPager->pVec = unqliteBitvecCreate(pPager->pAllocator,pPager->dbSize);
1552 if( pPager->pVec == 0 ){
1553 unqliteGenOutofMem(pPager->pDb);
1554 rc = UNQLITE_NOMEM;
1555 goto fail;
1556 }
1557 /* Change to the WRITER_LOCK state */
1558 pPager->iState = PAGER_WRITER_LOCKED;
1559 pPager->dbOrigSize = pPager->dbSize;
1560 pPager->iJournalOfft = 0;
1561 pPager->nRec = 0;
1562 if( pPager->dbSize < 1 ){
1563 /* Write the database header */
1564 rc = pager_create_header(pPager);
1565 if( rc != UNQLITE_OK ){
1566 goto fail;
1567 }
1568 pPager->dbSize = 1;
1569 }
1570 }else if( rc == UNQLITE_BUSY ){
1571 unqliteGenError(pPager->pDb,"Another process or thread have a reserved lock on this database");
1572 }
1573 return rc;
1574fail:
1575 /* Downgrade to shared lock */
1576 pager_unlock_db(pPager,SHARED_LOCK);
1577 return rc;
1578}
1579/*
1580** This function is called at the start of every write transaction.
1581** There must already be a RESERVED or EXCLUSIVE lock on the database
1582** file when this routine is called.
1583**
1584*/
1585static int unqliteOpenJournal(Pager *pPager)
1586{
1587 unsigned char *zHeader;
1588 int rc = UNQLITE_OK;
1589 if( pPager->is_mem || pPager->no_jrnl ){
1590 /* Journaling is omitted for this database */
1591 goto finish;
1592 }
1593 if( pPager->iState >= PAGER_WRITER_CACHEMOD ){
1594 /* Already opened */
1595 return UNQLITE_OK;
1596 }
1597 /* Delete any previously journal with the same name */
1598 unqliteOsDelete(pPager->pVfs,pPager->zJournal,1);
1599 /* Open the journal file */
1600 rc = unqliteOsOpen(pPager->pVfs,pPager->pAllocator,pPager->zJournal,
1601 &pPager->pjfd,UNQLITE_OPEN_CREATE|UNQLITE_OPEN_READWRITE);
1602 if( rc != UNQLITE_OK ){
1603 unqliteGenErrorFormat(pPager->pDb,"IO error while opening journal file: %s",pPager->zJournal);
1604 return rc;
1605 }
1606 /* Write the journal header */
1607 zHeader = (unsigned char *)SyMemBackendAlloc(pPager->pAllocator,(sxu32)pPager->iSectorSize);
1608 if( zHeader == 0 ){
1609 rc = UNQLITE_NOMEM;
1610 goto fail;
1611 }
1612 pager_write_journal_header(pPager,zHeader);
1613 /* Perform the disk write */
1614 rc = unqliteOsWrite(pPager->pjfd,zHeader,pPager->iSectorSize,0);
1615 /* Offset to start writing from */
1616 pPager->iJournalOfft = pPager->iSectorSize;
1617 /* All done, journal will be synced later */
1618 SyMemBackendFree(pPager->pAllocator,zHeader);
1619finish:
1620 if( rc == UNQLITE_OK ){
1621 pPager->iState = PAGER_WRITER_CACHEMOD;
1622 return UNQLITE_OK;
1623 }
1624fail:
1625 /* Unlink the journal file if something goes wrong */
1626 unqliteOsCloseFree(pPager->pAllocator,pPager->pjfd);
1627 unqliteOsDelete(pPager->pVfs,pPager->zJournal,0);
1628 pPager->pjfd = 0;
1629 return rc;
1630}
1631/*
1632** Sync the journal. In other words, make sure all the pages that have
1633** been written to the journal have actually reached the surface of the
1634** disk and can be restored in the event of a hot-journal rollback.
1635*
1636* This routine try also to obtain an exlusive lock on the database.
1637*/
1638static int unqliteFinalizeJournal(Pager *pPager,int *pRetry,int close_jrnl)
1639{
1640 int rc;
1641 *pRetry = 0;
1642 /* Grab the exclusive lock first */
1643 rc = pager_lock_db(pPager,EXCLUSIVE_LOCK);
1644 if( rc != UNQLITE_OK ){
1645 /* Retry the excusive lock process */
1646 *pRetry = 1;
1647 rc = UNQLITE_OK;
1648 }
1649 if( pPager->no_jrnl ){
1650 /* Journaling is omitted, return immediately */
1651 return UNQLITE_OK;
1652 }
1653 /* Write the total number of database records */
1654 rc = WriteInt32(pPager->pjfd,pPager->nRec,8 /* sizeof(aJournalRec) */);
1655 if( rc != UNQLITE_OK ){
1656 if( pPager->nRec > 0 ){
1657 return rc;
1658 }else{
1659 /* Not so fatal */
1660 rc = UNQLITE_OK;
1661 }
1662 }
1663 /* Sync the journal and close it */
1664 rc = unqliteOsSync(pPager->pjfd,UNQLITE_SYNC_NORMAL);
1665 if( close_jrnl ){
1666 /* close the journal file */
1667 if( UNQLITE_OK != unqliteOsCloseFree(pPager->pAllocator,pPager->pjfd) ){
1668 if( rc != UNQLITE_OK /* unqliteOsSync */ ){
1669 return rc;
1670 }
1671 }
1672 pPager->pjfd = 0;
1673 }
1674 if( (*pRetry) == 1 ){
1675 if( pager_lock_db(pPager,EXCLUSIVE_LOCK) == UNQLITE_OK ){
1676 /* Got exclusive lock */
1677 *pRetry = 0;
1678 }
1679 }
1680 return UNQLITE_OK;
1681}
1682/*
1683 * Mark a single data page as writeable. The page is written into the
1684 * main journal as required.
1685 */
1686static int page_write(Pager *pPager,Page *pPage)
1687{
1688 int rc;
1689 if( !pPager->is_mem && !pPager->no_jrnl ){
1690 /* Write the page to the transaction journal */
1691 if( pPage->pgno < pPager->dbOrigSize && !unqliteBitvecTest(pPager->pVec,pPage->pgno) ){
1692 sxu32 cksum;
1693 if( pPager->nRec == SXU32_HIGH ){
1694 /* Journal Limit reached */
1695 unqliteGenError(pPager->pDb,"Journal record limit reached, commit your changes");
1696 return UNQLITE_LIMIT;
1697 }
1698 /* Write the page number */
1699 rc = WriteInt64(pPager->pjfd,pPage->pgno,pPager->iJournalOfft);
1700 if( rc != UNQLITE_OK ){ return rc; }
1701 /* Write the raw page */
1702 /** CODEC */
1703 rc = unqliteOsWrite(pPager->pjfd,pPage->zData,pPager->iPageSize,pPager->iJournalOfft + 8);
1704 if( rc != UNQLITE_OK ){ return rc; }
1705 /* Compute the checksum */
1706 cksum = pager_cksum(pPager,pPage->zData);
1707 rc = WriteInt32(pPager->pjfd,cksum,pPager->iJournalOfft + 8 + pPager->iPageSize);
1708 if( rc != UNQLITE_OK ){ return rc; }
1709 /* Update the journal offset */
1710 pPager->iJournalOfft += 8 /* page num */ + pPager->iPageSize + 4 /* cksum */;
1711 pPager->nRec++;
1712 /* Mark as journalled */
1713 unqliteBitvecSet(pPager->pVec,pPage->pgno);
1714 }
1715 }
1716 /* Add the page to the dirty list */
1717 pager_page_to_dirty_list(pPager,pPage);
1718 /* Update the database size and return. */
1719 if( (1 + pPage->pgno) > pPager->dbSize ){
1720 pPager->dbSize = 1 + pPage->pgno;
1721 if( pPager->dbSize == SXU64_HIGH ){
1722 unqliteGenError(pPager->pDb,"Database maximum page limit (64-bit) reached");
1723 return UNQLITE_LIMIT;
1724 }
1725 }
1726 return UNQLITE_OK;
1727}
1728/*
1729** The argument is the first in a linked list of dirty pages connected
1730** by the PgHdr.pDirty pointer. This function writes each one of the
1731** in-memory pages in the list to the database file. The argument may
1732** be NULL, representing an empty list. In this case this function is
1733** a no-op.
1734**
1735** The pager must hold at least a RESERVED lock when this function
1736** is called. Before writing anything to the database file, this lock
1737** is upgraded to an EXCLUSIVE lock. If the lock cannot be obtained,
1738** UNQLITE_BUSY is returned and no data is written to the database file.
1739*/
1740static int pager_write_dirty_pages(Pager *pPager,Page *pDirty)
1741{
1742 int rc = UNQLITE_OK;
1743 Page *pNext;
1744 for(;;){
1745 if( pDirty == 0 ){
1746 break;
1747 }
1748 /* Point to the next dirty page */
1749 pNext = pDirty->pDirtyPrev; /* Not a bug: Reverse link */
1750 if( (pDirty->flags & PAGE_DONT_WRITE) == 0 ){
1751 rc = unqliteOsWrite(pPager->pfd,pDirty->zData,pPager->iPageSize,pDirty->pgno * pPager->iPageSize);
1752 if( rc != UNQLITE_OK ){
1753 /* A rollback should be done */
1754 break;
1755 }
1756 }
1757 /* Remove stale flags */
1758 pDirty->flags &= ~(PAGE_DIRTY|PAGE_DONT_WRITE|PAGE_NEED_SYNC|PAGE_IN_JOURNAL|PAGE_HOT_DIRTY);
1759 if( pDirty->nRef < 1 ){
1760 /* Unlink the page now it is unused */
1761 pager_unlink_page(pPager,pDirty);
1762 /* Release the page */
1763 pager_release_page(pPager,pDirty);
1764 }
1765 /* Point to the next page */
1766 pDirty = pNext;
1767 }
1768 pPager->pDirty = pPager->pFirstDirty = 0;
1769 pPager->pHotDirty = pPager->pFirstHot = 0;
1770 pPager->nHot = 0;
1771 return rc;
1772}
1773/*
1774** The argument is the first in a linked list of hot dirty pages connected
1775** by the PgHdr.pHotDirty pointer. This function writes each one of the
1776** in-memory pages in the list to the database file. The argument may
1777** be NULL, representing an empty list. In this case this function is
1778** a no-op.
1779**
1780** The pager must hold at least a RESERVED lock when this function
1781** is called. Before writing anything to the database file, this lock
1782** is upgraded to an EXCLUSIVE lock. If the lock cannot be obtained,
1783** UNQLITE_BUSY is returned and no data is written to the database file.
1784*/
1785static int pager_write_hot_dirty_pages(Pager *pPager,Page *pDirty)
1786{
1787 int rc = UNQLITE_OK;
1788 Page *pNext;
1789 for(;;){
1790 if( pDirty == 0 ){
1791 break;
1792 }
1793 /* Point to the next page */
1794 pNext = pDirty->pPrevHot; /* Not a bug: Reverse link */
1795 if( (pDirty->flags & PAGE_DONT_WRITE) == 0 ){
1796 rc = unqliteOsWrite(pPager->pfd,pDirty->zData,pPager->iPageSize,pDirty->pgno * pPager->iPageSize);
1797 if( rc != UNQLITE_OK ){
1798 break;
1799 }
1800 }
1801 /* Remove stale flags */
1802 pDirty->flags &= ~(PAGE_DIRTY|PAGE_DONT_WRITE|PAGE_NEED_SYNC|PAGE_IN_JOURNAL|PAGE_HOT_DIRTY);
1803 /* Unlink from the list of dirty pages */
1804 if( pDirty->pDirtyPrev ){
1805 pDirty->pDirtyPrev->pDirtyNext = pDirty->pDirtyNext;
1806 }else{
1807 pPager->pDirty = pDirty->pDirtyNext;
1808 }
1809 if( pDirty->pDirtyNext ){
1810 pDirty->pDirtyNext->pDirtyPrev = pDirty->pDirtyPrev;
1811 }else{
1812 pPager->pFirstDirty = pDirty->pDirtyPrev;
1813 }
1814 /* Discard */
1815 pager_unlink_page(pPager,pDirty);
1816 /* Release the page */
1817 pager_release_page(pPager,pDirty);
1818 /* Next hot page */
1819 pDirty = pNext;
1820 }
1821 return rc;
1822}
1823/*
1824 * Commit a transaction: Phase one.
1825 */
1826static int pager_commit_phase1(Pager *pPager)
1827{
1828 int get_excl = 0;
1829 Page *pDirty;
1830 int rc;
1831 /* If no database changes have been made, return early. */
1832 if( pPager->iState < PAGER_WRITER_CACHEMOD ){
1833 return UNQLITE_OK;
1834 }
1835 if( pPager->is_mem ){
1836 /* An in-memory database */
1837 return UNQLITE_OK;
1838 }
1839 if( pPager->is_rdonly ){
1840 /* Read-Only DB */
1841 unqliteGenError(pPager->pDb,"Read-Only database");
1842 return UNQLITE_READ_ONLY;
1843 }
1844 /* Finalize the journal file */
1845 rc = unqliteFinalizeJournal(pPager,&get_excl,1);
1846 if( rc != UNQLITE_OK ){
1847 return rc;
1848 }
1849 /* Get the dirty pages */
1850 pDirty = pager_get_dirty_pages(pPager);
1851 if( get_excl ){
1852 /* Wait one last time for the exclusive lock */
1853 rc = pager_wait_on_lock(pPager,EXCLUSIVE_LOCK);
1854 if( rc != UNQLITE_OK ){
1855 unqliteGenError(pPager->pDb,"Cannot obtain an Exclusive lock on the target database");
1856 return rc;
1857 }
1858 }
1859 if( pPager->iFlags & PAGER_CTRL_DIRTY_COMMIT ){
1860 /* Synce the database first if a dirty commit have been applied */
1861 unqliteOsSync(pPager->pfd,UNQLITE_SYNC_NORMAL);
1862 }
1863 /* Write the dirty pages */
1864 rc = pager_write_dirty_pages(pPager,pDirty);
1865 if( rc != UNQLITE_OK ){
1866 /* Rollback your DB */
1867 pPager->iFlags |= PAGER_CTRL_COMMIT_ERR;
1868 pPager->pFirstDirty = pDirty;
1869 unqliteGenError(pPager->pDb,"IO error while writing dirty pages, rollback your database");
1870 return rc;
1871 }
1872 /* If the file on disk is not the same size as the database image,
1873 * then use unqliteOsTruncate to grow or shrink the file here.
1874 */
1875 if( pPager->dbSize != pPager->dbOrigSize ){
1876 unqliteOsTruncate(pPager->pfd,pPager->iPageSize * pPager->dbSize);
1877 }
1878 /* Sync the database file */
1879 unqliteOsSync(pPager->pfd,UNQLITE_SYNC_FULL);
1880 /* Remove stale flags */
1881 pPager->iJournalOfft = 0;
1882 pPager->nRec = 0;
1883 return UNQLITE_OK;
1884}
1885/*
1886 * Commit a transaction: Phase two.
1887 */
1888static int pager_commit_phase2(Pager *pPager)
1889{
1890 if( !pPager->is_mem ){
1891 if( pPager->iState == PAGER_OPEN ){
1892 return UNQLITE_OK;
1893 }
1894 if( pPager->iState != PAGER_READER ){
1895 if( !pPager->no_jrnl ){
1896 /* Finally, unlink the journal file */
1897 unqliteOsDelete(pPager->pVfs,pPager->zJournal,1);
1898 }
1899 /* Downgrade to shraed lock */
1900 pager_unlock_db(pPager,SHARED_LOCK);
1901 pPager->iState = PAGER_READER;
1902 if( pPager->pVec ){
1903 unqliteBitvecDestroy(pPager->pVec);
1904 pPager->pVec = 0;
1905 }
1906 }
1907 }
1908 return UNQLITE_OK;
1909}
1910/*
1911 * Perform a dirty commit.
1912 */
1913static int pager_dirty_commit(Pager *pPager)
1914{
1915 int get_excl = 0;
1916 Page *pHot;
1917 int rc;
1918 /* Finalize the journal file without closing it */
1919 rc = unqliteFinalizeJournal(pPager,&get_excl,0);
1920 if( rc != UNQLITE_OK ){
1921 /* It's not a fatal error if something goes wrong here since
1922 * its not the final commit.
1923 */
1924 return UNQLITE_OK;
1925 }
1926 /* Point to the list of hot pages */
1927 pHot = pager_get_hot_pages(pPager);
1928 if( pHot == 0 ){
1929 return UNQLITE_OK;
1930 }
1931 if( get_excl ){
1932 /* Wait one last time for the exclusive lock */
1933 rc = pager_wait_on_lock(pPager,EXCLUSIVE_LOCK);
1934 if( rc != UNQLITE_OK ){
1935 /* Not so fatal, will try another time */
1936 return UNQLITE_OK;
1937 }
1938 }
1939 /* Tell that a dirty commit happen */
1940 pPager->iFlags |= PAGER_CTRL_DIRTY_COMMIT;
1941 /* Write the hot pages now */
1942 rc = pager_write_hot_dirty_pages(pPager,pHot);
1943 if( rc != UNQLITE_OK ){
1944 pPager->iFlags |= PAGER_CTRL_COMMIT_ERR;
1945 unqliteGenError(pPager->pDb,"IO error while writing hot dirty pages, rollback your database");
1946 return rc;
1947 }
1948 pPager->pFirstHot = pPager->pHotDirty = 0;
1949 pPager->nHot = 0;
1950 /* No need to sync the database file here, since the journal is already
1951 * open here and this is not the final commit.
1952 */
1953 return UNQLITE_OK;
1954}
1955/*
1956** Commit a transaction and sync the database file for the pager pPager.
1957**
1958** This routine ensures that:
1959**
1960** * the journal is synced,
1961** * all dirty pages are written to the database file,
1962** * the database file is truncated (if required), and
1963** * the database file synced.
1964** * the journal file is deleted.
1965*/
1966UNQLITE_PRIVATE int unqlitePagerCommit(Pager *pPager)
1967{
1968 int rc;
1969 /* Commit: Phase One */
1970 rc = pager_commit_phase1(pPager);
1971 if( rc != UNQLITE_OK ){
1972 goto fail;
1973 }
1974 /* Commit: Phase Two */
1975 rc = pager_commit_phase2(pPager);
1976 if( rc != UNQLITE_OK ){
1977 goto fail;
1978 }
1979 /* Remove stale flags */
1980 pPager->iFlags &= ~PAGER_CTRL_COMMIT_ERR;
1981 /* All done */
1982 return UNQLITE_OK;
1983fail:
1984 /* Disable the auto-commit flag */
1985 pPager->pDb->iFlags |= UNQLITE_FL_DISABLE_AUTO_COMMIT;
1986 return rc;
1987}
1988/*
1989 * Reset the pager to its initial state. This is caused by
1990 * a rollback operation.
1991 */
1992static int pager_reset_state(Pager *pPager,int bResetKvEngine)
1993{
1994 unqlite_kv_engine *pEngine = pPager->pEngine;
1995 Page *pNext,*pPtr = pPager->pAll;
1996 const unqlite_kv_io *pIo;
1997 int rc;
1998 /* Remove stale flags */
1999 pPager->iFlags &= ~(PAGER_CTRL_COMMIT_ERR|PAGER_CTRL_DIRTY_COMMIT);
2000 pPager->iJournalOfft = 0;
2001 pPager->nRec = 0;
2002 /* Database original size */
2003 pPager->dbSize = pPager->dbOrigSize;
2004 /* Discard all in-memory pages */
2005 for(;;){
2006 if( pPtr == 0 ){
2007 break;
2008 }
2009 pNext = pPtr->pNext; /* Reverse link */
2010 /* Remove stale flags */
2011 pPtr->flags &= ~(PAGE_DIRTY|PAGE_DONT_WRITE|PAGE_NEED_SYNC|PAGE_IN_JOURNAL|PAGE_HOT_DIRTY);
2012 /* Release the page */
2013 pager_release_page(pPager,pPtr);
2014 /* Point to the next page */
2015 pPtr = pNext;
2016 }
2017 pPager->pAll = 0;
2018 pPager->nPage = 0;
2019 pPager->pDirty = pPager->pFirstDirty = 0;
2020 pPager->pHotDirty = pPager->pFirstHot = 0;
2021 pPager->nHot = 0;
2022 if( pPager->apHash ){
2023 /* Zero the table */
2024 SyZero((void *)pPager->apHash,sizeof(Page *) * pPager->nSize);
2025 }
2026 if( pPager->pVec ){
2027 unqliteBitvecDestroy(pPager->pVec);
2028 pPager->pVec = 0;
2029 }
2030 /* Switch back to shared lock */
2031 pager_unlock_db(pPager,SHARED_LOCK);
2032 pPager->iState = PAGER_READER;
2033 if( bResetKvEngine ){
2034 /* Reset the underlying KV engine */
2035 pIo = pEngine->pIo;
2036 if( pIo->pMethods->xRelease ){
2037 /* Call the release callback */
2038 pIo->pMethods->xRelease(pEngine);
2039 }
2040 /* Zero the structure */
2041 SyZero(pEngine,(sxu32)pIo->pMethods->szKv);
2042 /* Fill in */
2043 pEngine->pIo = pIo;
2044 if( pIo->pMethods->xInit ){
2045 /* Call the init method */
2046 rc = pIo->pMethods->xInit(pEngine,pPager->iPageSize);
2047 if( rc != UNQLITE_OK ){
2048 return rc;
2049 }
2050 }
2051 if( pIo->pMethods->xOpen ){
2052 /* Call the xOpen method */
2053 rc = pIo->pMethods->xOpen(pEngine,pPager->dbSize);
2054 if( rc != UNQLITE_OK ){
2055 return rc;
2056 }
2057 }
2058 }
2059 /* All done */
2060 return UNQLITE_OK;
2061}
2062/*
2063** If a write transaction is open, then all changes made within the
2064** transaction are reverted and the current write-transaction is closed.
2065** The pager falls back to PAGER_READER state if successful.
2066**
2067** Otherwise, in rollback mode, this function performs two functions:
2068**
2069** 1) It rolls back the journal file, restoring all database file and
2070** in-memory cache pages to the state they were in when the transaction
2071** was opened, and
2072**
2073** 2) It finalizes the journal file, so that it is not used for hot
2074** rollback at any point in the future (i.e. deletion).
2075**
2076** Finalization of the journal file (task 2) is only performed if the
2077** rollback is successful.
2078**
2079*/
2080UNQLITE_PRIVATE int unqlitePagerRollback(Pager *pPager,int bResetKvEngine)
2081{
2082 int rc = UNQLITE_OK;
2083 if( pPager->iState < PAGER_WRITER_LOCKED ){
2084 /* A write transaction must be opened */
2085 return UNQLITE_OK;
2086 }
2087 if( pPager->is_mem ){
2088 /* As of this release 1.1.6: Transactions are not supported for in-memory databases */
2089 return UNQLITE_OK;
2090 }
2091 if( pPager->is_rdonly ){
2092 /* Read-Only DB */
2093 unqliteGenError(pPager->pDb,"Read-Only database");
2094 return UNQLITE_READ_ONLY;
2095 }
2096 if( pPager->iState >= PAGER_WRITER_CACHEMOD ){
2097 if( !pPager->no_jrnl ){
2098 /* Close any outstanding joural file */
2099 if( pPager->pjfd ){
2100 /* Sync the journal file */
2101 unqliteOsSync(pPager->pjfd,UNQLITE_SYNC_NORMAL);
2102 }
2103 unqliteOsCloseFree(pPager->pAllocator,pPager->pjfd);
2104 pPager->pjfd = 0;
2105 if( pPager->iFlags & (PAGER_CTRL_COMMIT_ERR|PAGER_CTRL_DIRTY_COMMIT) ){
2106 /* Perform the rollback */
2107 rc = pager_journal_rollback(pPager,0);
2108 if( rc != UNQLITE_OK ){
2109 /* Set the auto-commit flag */
2110 pPager->pDb->iFlags |= UNQLITE_FL_DISABLE_AUTO_COMMIT;
2111 return rc;
2112 }
2113 }
2114 }
2115 /* Unlink the journal file */
2116 unqliteOsDelete(pPager->pVfs,pPager->zJournal,1);
2117 /* Reset the pager state */
2118 rc = pager_reset_state(pPager,bResetKvEngine);
2119 if( rc != UNQLITE_OK ){
2120 /* Mostly an unlikely scenario */
2121 pPager->pDb->iFlags |= UNQLITE_FL_DISABLE_AUTO_COMMIT; /* Set the auto-commit flag */
2122 unqliteGenError(pPager->pDb,"Error while reseting pager to its initial state");
2123 return rc;
2124 }
2125 }else{
2126 /* Downgrade to shared lock */
2127 pager_unlock_db(pPager,SHARED_LOCK);
2128 pPager->iState = PAGER_READER;
2129 }
2130 return UNQLITE_OK;
2131}
2132/*
2133 * Mark a data page as non writeable.
2134 */
2135static int unqlitePagerDontWrite(unqlite_page *pMyPage)
2136{
2137 Page *pPage = (Page *)pMyPage;
2138 if( pPage->pgno > 0 /* Page 0 is always writeable */ ){
2139 pPage->flags |= PAGE_DONT_WRITE;
2140 }
2141 return UNQLITE_OK;
2142}
2143/*
2144** Mark a data page as writeable. This routine must be called before
2145** making changes to a page. The caller must check the return value
2146** of this function and be careful not to change any page data unless
2147** this routine returns UNQLITE_OK.
2148*/
2149static int unqlitePageWrite(unqlite_page *pMyPage)
2150{
2151 Page *pPage = (Page *)pMyPage;
2152 Pager *pPager = pPage->pPager;
2153 int rc;
2154 /* Begin the write transaction */
2155 rc = unqlitePagerBegin(pPager);
2156 if( rc != UNQLITE_OK ){
2157 return rc;
2158 }
2159 if( pPager->iState == PAGER_WRITER_LOCKED ){
2160 /* The journal file needs to be opened. Higher level routines have already
2161 ** obtained the necessary locks to begin the write-transaction, but the
2162 ** rollback journal might not yet be open. Open it now if this is the case.
2163 */
2164 rc = unqliteOpenJournal(pPager);
2165 if( rc != UNQLITE_OK ){
2166 return rc;
2167 }
2168 }
2169 if( pPager->nHot > 127 ){
2170 /* Write hot dirty pages */
2171 rc = pager_dirty_commit(pPager);
2172 if( rc != UNQLITE_OK ){
2173 /* A rollback must be done */
2174 unqliteGenError(pPager->pDb,"Please perform a rollback");
2175 return rc;
2176 }
2177 }
2178 /* Write the page to the journal file */
2179 rc = page_write(pPager,pPage);
2180 return rc;
2181}
2182/*
2183** Acquire a reference to page number pgno in pager pPager (a page
2184** reference has type unqlite_page*). If the requested reference is
2185** successfully obtained, it is copied to *ppPage and UNQLITE_OK returned.
2186**
2187** If the requested page is already in the cache, it is returned.
2188** Otherwise, a new page object is allocated and populated with data
2189** read from the database file.
2190*/
2191static int unqlitePagerAcquire(
2192 Pager *pPager, /* The pager open on the database file */
2193 pgno pgno, /* Page number to fetch */
2194 unqlite_page **ppPage, /* OUT: Acquired page */
2195 int fetchOnly, /* Cache lookup only */
2196 int noContent /* Do not bother reading content from disk if true */
2197)
2198{
2199 Page *pPage;
2200 int rc;
2201 /* Acquire a shared lock (if not yet done) on the database and rollback any hot-journal if present */
2202 rc = pager_shared_lock(pPager);
2203 if( rc != UNQLITE_OK ){
2204 return rc;
2205 }
2206 /* Fetch the page from the cache */
2207 pPage = pager_fetch_page(pPager,pgno);
2208 if( fetchOnly ){
2209 if( ppPage ){
2210 *ppPage = (unqlite_page *)pPage;
2211 }
2212 return pPage ? UNQLITE_OK : UNQLITE_NOTFOUND;
2213 }
2214 if( pPage == 0 ){
2215 /* Allocate a new page */
2216 pPage = pager_alloc_page(pPager,pgno);
2217 if( pPage == 0 ){
2218 unqliteGenOutofMem(pPager->pDb);
2219 return UNQLITE_NOMEM;
2220 }
2221 /* Read page contents */
2222 rc = pager_get_page_contents(pPager,pPage,noContent);
2223 if( rc != UNQLITE_OK ){
2224 SyMemBackendPoolFree(pPager->pAllocator,pPage);
2225 return rc;
2226 }
2227 /* Link the page */
2228 pager_link_page(pPager,pPage);
2229 }else{
2230 if( ppPage ){
2231 page_ref(pPage);
2232 }
2233 }
2234 /* All done, page is loaded in memeory */
2235 if( ppPage ){
2236 *ppPage = (unqlite_page *)pPage;
2237 }
2238 return UNQLITE_OK;
2239}
2240/*
2241 * Return true if we are dealing with an in-memory database.
2242 */
2243static int unqliteInMemory(const char *zFilename)
2244{
2245 sxu32 n;
2246 if( SX_EMPTY_STR(zFilename) ){
2247 /* NULL or the empty string means an in-memory database */
2248 return TRUE;
2249 }
2250 n = SyStrlen(zFilename);
2251 if( n == sizeof(":mem:") - 1 &&
2252 SyStrnicmp(zFilename,":mem:",sizeof(":mem:") - 1) == 0 ){
2253 return TRUE;
2254 }
2255 if( n == sizeof(":memory:") - 1 &&
2256 SyStrnicmp(zFilename,":memory:",sizeof(":memory:") - 1) == 0 ){
2257 return TRUE;
2258 }
2259 return FALSE;
2260}
2261/*
2262 * Allocate a new KV cursor.
2263 */
2264UNQLITE_PRIVATE int unqliteInitCursor(unqlite *pDb,unqlite_kv_cursor **ppOut)
2265{
2266 unqlite_kv_methods *pMethods;
2267 unqlite_kv_cursor *pCur;
2268 sxu32 nByte;
2269 /* Storage engine methods */
2270 pMethods = pDb->sDB.pPager->pEngine->pIo->pMethods;
2271 if( pMethods->szCursor < 1 ){
2272 /* Implementation does not supprt cursors */
2273 unqliteGenErrorFormat(pDb,"Storage engine '%s' does not support cursors",pMethods->zName);
2274 return UNQLITE_NOTIMPLEMENTED;
2275 }
2276 nByte = pMethods->szCursor;
2277 if( nByte < sizeof(unqlite_kv_cursor) ){
2278 nByte += sizeof(unqlite_kv_cursor);
2279 }
2280 pCur = (unqlite_kv_cursor *)SyMemBackendPoolAlloc(&pDb->sMem,nByte);
2281 if( pCur == 0 ){
2282 unqliteGenOutofMem(pDb);
2283 return UNQLITE_NOMEM;
2284 }
2285 /* Zero the structure */
2286 SyZero(pCur,nByte);
2287 /* Save the cursor */
2288 pCur->pStore = pDb->sDB.pPager->pEngine;
2289 /* Invoke the initialization callback if any */
2290 if( pMethods->xCursorInit ){
2291 pMethods->xCursorInit(pCur);
2292 }
2293 /* All done */
2294 *ppOut = pCur;
2295 return UNQLITE_OK;
2296}
2297/*
2298 * Release a cursor.
2299 */
2300UNQLITE_PRIVATE int unqliteReleaseCursor(unqlite *pDb,unqlite_kv_cursor *pCur)
2301{
2302 unqlite_kv_methods *pMethods;
2303 /* Storage engine methods */
2304 pMethods = pDb->sDB.pPager->pEngine->pIo->pMethods;
2305 /* Invoke the release callback if available */
2306 if( pMethods->xCursorRelease ){
2307 pMethods->xCursorRelease(pCur);
2308 }
2309 /* Finally, free the whole instance */
2310 SyMemBackendPoolFree(&pDb->sMem,pCur);
2311 return UNQLITE_OK;
2312}
2313/*
2314 * Release the underlying KV storage engine and invoke
2315 * its associated callbacks if available.
2316 */
2317static void pager_release_kv_engine(Pager *pPager)
2318{
2319 unqlite_kv_engine *pEngine = pPager->pEngine;
2320 unqlite_db *pStorage = &pPager->pDb->sDB;
2321 if( pStorage->pCursor ){
2322 /* Release the associated cursor */
2323 unqliteReleaseCursor(pPager->pDb,pStorage->pCursor);
2324 pStorage->pCursor = 0;
2325 }
2326 if( pEngine->pIo->pMethods->xRelease ){
2327 pEngine->pIo->pMethods->xRelease(pEngine);
2328 }
2329 /* Release the whole instance */
2330 SyMemBackendFree(&pPager->pDb->sMem,(void *)pEngine->pIo);
2331 SyMemBackendFree(&pPager->pDb->sMem,(void *)pEngine);
2332 pPager->pEngine = 0;
2333}
/* Forward declaration: pager_kv_io_init() is defined further down in this file
 * but is already needed by unqlitePagerRegisterKvEngine() below.
 */
static int pager_kv_io_init(Pager *pPager,unqlite_kv_methods *pMethods,unqlite_kv_io *pIo);
2336/*
2337 * Allocate, initialize and register a new KV storage engine
2338 * within this database instance.
2339 */
2340UNQLITE_PRIVATE int unqlitePagerRegisterKvEngine(Pager *pPager,unqlite_kv_methods *pMethods)
2341{
2342 unqlite_db *pStorage = &pPager->pDb->sDB;
2343 unqlite *pDb = pPager->pDb;
2344 unqlite_kv_engine *pEngine;
2345 unqlite_kv_io *pIo;
2346 sxu32 nByte;
2347 int rc;
2348 if( pPager->pEngine ){
2349 if( pMethods == pPager->pEngine->pIo->pMethods ){
2350 /* Ticket 1432: Same implementation */
2351 return UNQLITE_OK;
2352 }
2353 /* Release the old KV engine */
2354 pager_release_kv_engine(pPager);
2355 }
2356 /* Allocate a new KV engine instance */
2357 nByte = (sxu32)pMethods->szKv;
2358 pEngine = (unqlite_kv_engine *)SyMemBackendAlloc(&pDb->sMem,nByte);
2359 if( pEngine == 0 ){
2360 unqliteGenOutofMem(pDb);
2361 return UNQLITE_NOMEM;
2362 }
2363 pIo = (unqlite_kv_io *)SyMemBackendAlloc(&pDb->sMem,sizeof(unqlite_kv_io));
2364 if( pIo == 0 ){
2365 SyMemBackendFree(&pDb->sMem,pEngine);
2366 unqliteGenOutofMem(pDb);
2367 return UNQLITE_NOMEM;
2368 }
2369 /* Zero the structure */
2370 SyZero(pIo,sizeof(unqlite_io_methods));
2371 SyZero(pEngine,nByte);
2372 /* Populate the IO structure */
2373 pager_kv_io_init(pPager,pMethods,pIo);
2374 pEngine->pIo = pIo;
2375 /* Invoke the init callback if avaialble */
2376 if( pMethods->xInit ){
2377 rc = pMethods->xInit(pEngine,unqliteGetPageSize());
2378 if( rc != UNQLITE_OK ){
2379 unqliteGenErrorFormat(pDb,
2380 "xInit() method of the underlying KV engine '%z' failed",&pPager->sKv);
2381 goto fail;
2382 }
2383 pEngine->pIo = pIo;
2384 }
2385 pPager->pEngine = pEngine;
2386 /* Allocate a new cursor */
2387 rc = unqliteInitCursor(pDb,&pStorage->pCursor);
2388 if( rc != UNQLITE_OK ){
2389 goto fail;
2390 }
2391 return UNQLITE_OK;
2392fail:
2393 SyMemBackendFree(&pDb->sMem,pEngine);
2394 SyMemBackendFree(&pDb->sMem,pIo);
2395 return rc;
2396}
2397/*
2398 * Return the underlying KV storage engine instance.
2399 */
2400UNQLITE_PRIVATE unqlite_kv_engine * unqlitePagerGetKvEngine(unqlite *pDb)
2401{
2402 return pDb->sDB.pPager->pEngine;
2403}
2404/*
2405* Allocate and initialize a new Pager object. The pager should
2406* eventually be freed by passing it to unqlitePagerClose().
2407*
2408* The zFilename argument is the path to the database file to open.
2409* If zFilename is NULL or ":memory:" then all information is held
2410* in cache. It is never written to disk. This can be used to implement
2411* an in-memory database.
2412*/
2413UNQLITE_PRIVATE int unqlitePagerOpen(
2414 unqlite_vfs *pVfs, /* The virtual file system to use */
2415 unqlite *pDb, /* Database handle */
2416 const char *zFilename, /* Name of the database file to open */
2417 unsigned int iFlags /* flags controlling this file */
2418 )
2419{
2420 unqlite_kv_methods *pMethods = 0;
2421 int is_mem,rd_only,no_jrnl;
2422 Pager *pPager;
2423 sxu32 nByte;
2424 sxu32 nLen;
2425 int rc;
2426
2427 /* Select the appropriate KV storage subsytem */
2428 if( (iFlags & UNQLITE_OPEN_IN_MEMORY) || unqliteInMemory(zFilename) ){
2429 /* An in-memory database, record that */
2430 pMethods = unqliteFindKVStore("mem",sizeof("mem") - 1); /* Always available */
2431 iFlags |= UNQLITE_OPEN_IN_MEMORY;
2432 }else{
2433 /* Install the default key value storage subsystem [i.e. Linear Hash] */
2434 pMethods = unqliteFindKVStore("hash",sizeof("hash")-1);
2435 if( pMethods == 0 ){
2436 /* Use the b+tree storage backend if the linear hash storage is not available */
2437 pMethods = unqliteFindKVStore("btree",sizeof("btree")-1);
2438 }
2439 }
2440 if( pMethods == 0 ){
2441 /* Can't happen */
2442 unqliteGenError(pDb,"Cannot install a default Key/Value storage engine");
2443 return UNQLITE_NOTIMPLEMENTED;
2444 }
2445 is_mem = (iFlags & UNQLITE_OPEN_IN_MEMORY) != 0;
2446 rd_only = (iFlags & UNQLITE_OPEN_READONLY) != 0;
2447 no_jrnl = (iFlags & UNQLITE_OPEN_OMIT_JOURNALING) != 0;
2448 rc = UNQLITE_OK;
2449 if( is_mem ){
2450 /* Omit journaling for in-memory database */
2451 no_jrnl = 1;
2452 }
2453 /* Total number of bytes to allocate */
2454 nByte = sizeof(Pager);
2455 nLen = 0;
2456 if( !is_mem ){
2457 nLen = SyStrlen(zFilename);
2458 nByte += pVfs->mxPathname + nLen + sizeof(char) /* null termniator */;
2459 }
2460 /* Allocate */
2461 pPager = (Pager *)SyMemBackendAlloc(&pDb->sMem,nByte);
2462 if( pPager == 0 ){
2463 return UNQLITE_NOMEM;
2464 }
2465 /* Zero the structure */
2466 SyZero(pPager,nByte);
2467 /* Fill-in the structure */
2468 pPager->pAllocator = &pDb->sMem;
2469 pPager->pDb = pDb;
2470 pDb->sDB.pPager = pPager;
2471 /* Allocate page table */
2472 pPager->nSize = 128; /* Must be a power of two */
2473 nByte = pPager->nSize * sizeof(Page *);
2474 pPager->apHash = (Page **)SyMemBackendAlloc(pPager->pAllocator,nByte);
2475 if( pPager->apHash == 0 ){
2476 rc = UNQLITE_NOMEM;
2477 goto fail;
2478 }
2479 SyZero(pPager->apHash,nByte);
2480 pPager->is_mem = is_mem;
2481 pPager->no_jrnl = no_jrnl;
2482 pPager->is_rdonly = rd_only;
2483 pPager->iOpenFlags = iFlags;
2484 pPager->pVfs = pVfs;
2485 SyRandomnessInit(&pPager->sPrng,0,0);
2486 SyRandomness(&pPager->sPrng,(void *)&pPager->cksumInit,sizeof(sxu32));
2487 /* Unlimited cache size */
2488 pPager->nCacheMax = SXU32_HIGH;
2489 /* Copy filename and journal name */
2490 if( !is_mem ){
2491 pPager->zFilename = (char *)&pPager[1];
2492 rc = UNQLITE_OK;
2493 if( pVfs->xFullPathname ){
2494 rc = pVfs->xFullPathname(pVfs,zFilename,pVfs->mxPathname + nLen,pPager->zFilename);
2495 }
2496 if( rc != UNQLITE_OK ){
2497 /* Simple filename copy */
2498 SyMemcpy(zFilename,pPager->zFilename,nLen);
2499 pPager->zFilename[nLen] = 0;
2500 rc = UNQLITE_OK;
2501 }else{
2502 nLen = SyStrlen(pPager->zFilename);
2503 }
2504 pPager->zJournal = (char *) SyMemBackendAlloc(pPager->pAllocator,nLen + sizeof(UNQLITE_JOURNAL_FILE_SUFFIX) + sizeof(char));
2505 if( pPager->zJournal == 0 ){
2506 rc = UNQLITE_NOMEM;
2507 goto fail;
2508 }
2509 /* Copy filename */
2510 SyMemcpy(pPager->zFilename,pPager->zJournal,nLen);
2511 /* Copy journal suffix */
2512 SyMemcpy(UNQLITE_JOURNAL_FILE_SUFFIX,&pPager->zJournal[nLen],sizeof(UNQLITE_JOURNAL_FILE_SUFFIX)-1);
2513 /* Append the nul terminator to the journal path */
2514 pPager->zJournal[nLen + ( sizeof(UNQLITE_JOURNAL_FILE_SUFFIX) - 1)] = 0;
2515 }
2516 /* Finally, register the selected KV engine */
2517 rc = unqlitePagerRegisterKvEngine(pPager,pMethods);
2518 if( rc != UNQLITE_OK ){
2519 goto fail;
2520 }
2521 /* Set the pager state */
2522 if( pPager->is_mem ){
2523 pPager->iState = PAGER_WRITER_FINISHED;
2524 pPager->iLock = EXCLUSIVE_LOCK;
2525 }else{
2526 pPager->iState = PAGER_OPEN;
2527 pPager->iLock = NO_LOCK;
2528 }
2529 /* All done, ready for processing */
2530 return UNQLITE_OK;
2531fail:
2532 SyMemBackendFree(&pDb->sMem,pPager);
2533 return rc;
2534}
2535/*
2536 * Set a cache limit. Note that, this is a simple hint, the pager is not
2537 * forced to honor this limit.
2538 */
2539UNQLITE_PRIVATE int unqlitePagerSetCachesize(Pager *pPager,int mxPage)
2540{
2541 if( mxPage < 256 ){
2542 return UNQLITE_INVALID;
2543 }
2544 pPager->nCacheMax = mxPage;
2545 return UNQLITE_OK;
2546}
2547/*
2548 * Shutdown the page cache. Free all memory and close the database file.
2549 */
2550UNQLITE_PRIVATE int unqlitePagerClose(Pager *pPager)
2551{
2552 /* Release the KV engine */
2553 pager_release_kv_engine(pPager);
2554 if( pPager->iOpenFlags & UNQLITE_OPEN_MMAP ){
2555 const jx9_vfs *pVfs = jx9ExportBuiltinVfs();
2556 if( pVfs && pVfs->xUnmap && pPager->pMmap ){
2557 pVfs->xUnmap(pPager->pMmap,pPager->dbByteSize);
2558 }
2559 }
2560 if( !pPager->is_mem && pPager->iState > PAGER_OPEN ){
2561 /* Release all lock on this database handle */
2562 pager_unlock_db(pPager,NO_LOCK);
2563 /* Close the file */
2564 unqliteOsCloseFree(pPager->pAllocator,pPager->pfd);
2565 }
2566 if( pPager->pVec ){
2567 unqliteBitvecDestroy(pPager->pVec);
2568 pPager->pVec = 0;
2569 }
2570 return UNQLITE_OK;
2571}
2572/*
2573 * Generate a random string.
2574 */
2575UNQLITE_PRIVATE void unqlitePagerRandomString(Pager *pPager,char *zBuf,sxu32 nLen)
2576{
2577 static const char zBase[] = {"abcdefghijklmnopqrstuvwxyz"}; /* English Alphabet */
2578 sxu32 i;
2579 /* Generate a binary string first */
2580 SyRandomness(&pPager->sPrng,zBuf,nLen);
2581 /* Turn the binary string into english based alphabet */
2582 for( i = 0 ; i < nLen ; ++i ){
2583 zBuf[i] = zBase[zBuf[i] % (sizeof(zBase)-1)];
2584 }
2585}
2586/*
2587 * Generate a random number.
2588 */
2589UNQLITE_PRIVATE sxu32 unqlitePagerRandomNum(Pager *pPager)
2590{
2591 sxu32 iNum;
2592 SyRandomness(&pPager->sPrng,(void *)&iNum,sizeof(iNum));
2593 return iNum;
2594}
2595/* Exported KV IO Methods */
2596/*
2597 * Refer to [unqlitePagerAcquire()]
2598 */
2599static int unqliteKvIoPageGet(unqlite_kv_handle pHandle,pgno iNum,unqlite_page **ppPage)
2600{
2601 int rc;
2602 rc = unqlitePagerAcquire((Pager *)pHandle,iNum,ppPage,0,0);
2603 return rc;
2604}
2605/*
2606 * Refer to [unqlitePagerAcquire()]
2607 */
2608static int unqliteKvIoPageLookup(unqlite_kv_handle pHandle,pgno iNum,unqlite_page **ppPage)
2609{
2610 int rc;
2611 rc = unqlitePagerAcquire((Pager *)pHandle,iNum,ppPage,1,0);
2612 return rc;
2613}
2614/*
2615 * Refer to [unqlitePagerAcquire()]
2616 */
2617static int unqliteKvIoNewPage(unqlite_kv_handle pHandle,unqlite_page **ppPage)
2618{
2619 Pager *pPager = (Pager *)pHandle;
2620 int rc;
2621 /*
2622 * Acquire a reader-lock first so that pPager->dbSize get initialized.
2623 */
2624 rc = pager_shared_lock(pPager);
2625 if( rc == UNQLITE_OK ){
2626 rc = unqlitePagerAcquire(pPager,pPager->dbSize == 0 ? /* Page 0 is reserved */ 1 : pPager->dbSize ,ppPage,0,0);
2627 }
2628 return rc;
2629}
2630/*
2631 * Refer to [unqlitePageWrite()]
2632 */
2633static int unqliteKvIopageWrite(unqlite_page *pPage)
2634{
2635 int rc;
2636 if( pPage == 0 ){
2637 /* TICKET 1433-0348 */
2638 return UNQLITE_OK;
2639 }
2640 rc = unqlitePageWrite(pPage);
2641 return rc;
2642}
2643/*
2644 * Refer to [unqlitePagerDontWrite()]
2645 */
2646static int unqliteKvIoPageDontWrite(unqlite_page *pPage)
2647{
2648 int rc;
2649 if( pPage == 0 ){
2650 /* TICKET 1433-0348 */
2651 return UNQLITE_OK;
2652 }
2653 rc = unqlitePagerDontWrite(pPage);
2654 return rc;
2655}
2656/*
2657 * Refer to [unqliteBitvecSet()]
2658 */
2659static int unqliteKvIoPageDontJournal(unqlite_page *pRaw)
2660{
2661 Page *pPage = (Page *)pRaw;
2662 Pager *pPager;
2663 if( pPage == 0 ){
2664 /* TICKET 1433-0348 */
2665 return UNQLITE_OK;
2666 }
2667 pPager = pPage->pPager;
2668 if( pPager->iState >= PAGER_WRITER_LOCKED ){
2669 if( !pPager->no_jrnl && pPager->pVec && !unqliteBitvecTest(pPager->pVec,pPage->pgno) ){
2670 unqliteBitvecSet(pPager->pVec,pPage->pgno);
2671 }
2672 }
2673 return UNQLITE_OK;
2674}
2675/*
2676 * Do not add a page to the hot dirty list.
2677 */
2678static int unqliteKvIoPageDontMakeHot(unqlite_page *pRaw)
2679{
2680 Page *pPage = (Page *)pRaw;
2681
2682 if( pPage == 0 ){
2683 /* TICKET 1433-0348 */
2684 return UNQLITE_OK;
2685 }
2686 pPage->flags |= PAGE_DONT_MAKE_HOT;
2687
2688 /* Remove from hot dirty list if it is already there */
2689 if( pPage->flags & PAGE_HOT_DIRTY ){
2690 Pager *pPager = pPage->pPager;
2691 if( pPage->pNextHot ){
2692 pPage->pNextHot->pPrevHot = pPage->pPrevHot;
2693 }
2694 if( pPage->pPrevHot ){
2695 pPage->pPrevHot->pNextHot = pPage->pNextHot;
2696 }
2697 if( pPager->pFirstHot == pPage ){
2698 pPager->pFirstHot = pPage->pPrevHot;
2699 }
2700 if( pPager->pHotDirty == pPage ){
2701 pPager->pHotDirty = pPage->pNextHot;
2702 }
2703 pPager->nHot--;
2704 pPage->flags &= ~PAGE_HOT_DIRTY;
2705 }
2706
2707 return UNQLITE_OK;
2708}
2709/*
2710 * Refer to [page_ref()]
2711 */
2712static int unqliteKvIopage_ref(unqlite_page *pPage)
2713{
2714 if( pPage ){
2715 page_ref((Page *)pPage);
2716 }
2717 return UNQLITE_OK;
2718}
2719/*
2720 * Refer to [page_unref()]
2721 */
2722static int unqliteKvIoPageUnRef(unqlite_page *pPage)
2723{
2724 if( pPage ){
2725 page_unref((Page *)pPage);
2726 }
2727 return UNQLITE_OK;
2728}
2729/*
2730 * Refer to the declaration of the [Pager] structure
2731 */
2732static int unqliteKvIoReadOnly(unqlite_kv_handle pHandle)
2733{
2734 return ((Pager *)pHandle)->is_rdonly;
2735}
2736/*
2737 * Refer to the declaration of the [Pager] structure
2738 */
2739static int unqliteKvIoPageSize(unqlite_kv_handle pHandle)
2740{
2741 return ((Pager *)pHandle)->iPageSize;
2742}
2743/*
2744 * Refer to the declaration of the [Pager] structure
2745 */
2746static unsigned char * unqliteKvIoTempPage(unqlite_kv_handle pHandle)
2747{
2748 return ((Pager *)pHandle)->zTmpPage;
2749}
2750/*
2751 * Set a page unpin callback.
2752 * Refer to the declaration of the [Pager] structure
2753 */
2754static void unqliteKvIoPageUnpin(unqlite_kv_handle pHandle,void (*xPageUnpin)(void *))
2755{
2756 Pager *pPager = (Pager *)pHandle;
2757 pPager->xPageUnpin = xPageUnpin;
2758}
2759/*
2760 * Set a page reload callback.
2761 * Refer to the declaration of the [Pager] structure
2762 */
2763static void unqliteKvIoPageReload(unqlite_kv_handle pHandle,void (*xPageReload)(void *))
2764{
2765 Pager *pPager = (Pager *)pHandle;
2766 pPager->xPageReload = xPageReload;
2767}
2768/*
2769 * Log an error.
2770 * Refer to the declaration of the [Pager] structure
2771 */
2772static void unqliteKvIoErr(unqlite_kv_handle pHandle,const char *zErr)
2773{
2774 Pager *pPager = (Pager *)pHandle;
2775 unqliteGenError(pPager->pDb,zErr);
2776}
2777/*
2778 * Init an instance of the [unqlite_kv_io] structure.
2779 */
2780static int pager_kv_io_init(Pager *pPager,unqlite_kv_methods *pMethods,unqlite_kv_io *pIo)
2781{
2782 pIo->pHandle = pPager;
2783 pIo->pMethods = pMethods;
2784
2785 pIo->xGet = unqliteKvIoPageGet;
2786 pIo->xLookup = unqliteKvIoPageLookup;
2787 pIo->xNew = unqliteKvIoNewPage;
2788
2789 pIo->xWrite = unqliteKvIopageWrite;
2790 pIo->xDontWrite = unqliteKvIoPageDontWrite;
2791 pIo->xDontJournal = unqliteKvIoPageDontJournal;
2792 pIo->xDontMkHot = unqliteKvIoPageDontMakeHot;
2793
2794 pIo->xPageRef = unqliteKvIopage_ref;
2795 pIo->xPageUnref = unqliteKvIoPageUnRef;
2796
2797 pIo->xPageSize = unqliteKvIoPageSize;
2798 pIo->xReadOnly = unqliteKvIoReadOnly;
2799
2800 pIo->xTmpPage = unqliteKvIoTempPage;
2801
2802 pIo->xSetUnpin = unqliteKvIoPageUnpin;
2803 pIo->xSetReload = unqliteKvIoPageReload;
2804
2805 pIo->xErr = unqliteKvIoErr;
2806
2807 return UNQLITE_OK;
2808}