summaryrefslogtreecommitdiffstats
path: root/common/unqlite/os_unix.c
diff options
context:
space:
mode:
authorAaron Seigo <aseigo@kde.org>2014-12-07 10:08:07 +0100
committerAaron Seigo <aseigo@kde.org>2014-12-11 01:07:08 +0100
commit9ee8378d393778ac67314be7ea8d5bcbaeee9ee0 (patch)
treecf93471a69f9f4bbb4940de55ae134106fcd8380 /common/unqlite/os_unix.c
parentee6f068dff6b15441e553ffbfb2bf8aa97b26f57 (diff)
downloadsink-9ee8378d393778ac67314be7ea8d5bcbaeee9ee0.tar.gz
sink-9ee8378d393778ac67314be7ea8d5bcbaeee9ee0.zip
try out unqlite
Diffstat (limited to 'common/unqlite/os_unix.c')
-rw-r--r--common/unqlite/os_unix.c1769
1 files changed, 1769 insertions, 0 deletions
diff --git a/common/unqlite/os_unix.c b/common/unqlite/os_unix.c
new file mode 100644
index 0000000..f578d07
--- /dev/null
+++ b/common/unqlite/os_unix.c
@@ -0,0 +1,1769 @@
1/*
2 * Symisc unQLite: An Embeddable NoSQL (Post Modern) Database Engine.
3 * Copyright (C) 2012-2013, Symisc Systems http://unqlite.org/
4 * Version 1.1.6
5 * For information on licensing, redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES
6 * please contact Symisc Systems via:
7 * legal@symisc.net
8 * licensing@symisc.net
9 * contact@symisc.net
10 * or visit:
11 * http://unqlite.org/licensing.html
12 */
13 /* $SymiscID: os_unix.c v1.3 FreeBSD 2013-04-05 01:10 devel <chm@symisc.net> $ */
14#ifndef UNQLITE_AMALGAMATION
15#include "unqliteInt.h"
16#endif
17/*
18 * Omit the whole layer from the build if compiling for platforms other than Unix (Linux, BSD, Solaris, OS X, etc.).
19 * Note: Mostly SQLite3 source tree.
20 */
21#if defined(__UNIXES__)
22/** This file contains the VFS implementation for unix-like operating systems
23** include Linux, MacOSX, *BSD, QNX, VxWorks, AIX, HPUX, and others.
24**
25** There are actually several different VFS implementations in this file.
26** The differences are in the way that file locking is done. The default
27** implementation uses Posix Advisory Locks. Alternative implementations
28** use flock(), dot-files, various proprietary locking schemas, or simply
29** skip locking all together.
30**
31** This source file is organized into divisions where the logic for various
32** subfunctions is contained within the appropriate division. PLEASE
33** KEEP THE STRUCTURE OF THIS FILE INTACT. New code should be placed
34** in the correct division and should be clearly labeled.
35**
36*/
37/*
38** standard include files.
39*/
40#include <sys/types.h>
41#include <sys/stat.h>
42#include <sys/uio.h>
43#include <sys/file.h>
44#include <fcntl.h>
45#include <unistd.h>
46#include <time.h>
47#include <sys/time.h>
48#include <errno.h>
49#if defined(__APPLE__)
50# include <sys/mount.h>
51#endif
52/*
53** Allowed values of unixFile.fsFlags
54*/
55#define UNQLITE_FSFLAGS_IS_MSDOS 0x1
56
57/*
58** Default permissions when creating a new file
59*/
60#ifndef UNQLITE_DEFAULT_FILE_PERMISSIONS
61# define UNQLITE_DEFAULT_FILE_PERMISSIONS 0644
62#endif
63/*
64 ** Default permissions when creating auto proxy dir
65 */
66#ifndef UNQLITE_DEFAULT_PROXYDIR_PERMISSIONS
67# define UNQLITE_DEFAULT_PROXYDIR_PERMISSIONS 0755
68#endif
69/*
70** Maximum supported path-length.
71*/
72#define MAX_PATHNAME 512
/*
** IS_LOCK_ERROR(x) is true when the result code x is a genuine failure,
** i.e. neither UNQLITE_OK nor UNQLITE_BUSY. The locking routines use it
** to decide whether the errno behind x should be stored in
** unixFile.lastErrno: only real errors are recorded, the two normal
** outcomes are not.
**
** The argument is parenthesized at each use so that an expression
** argument (for example `a & b`) is compared as a whole.
*/
#define IS_LOCK_ERROR(x) (((x) != UNQLITE_OK) && ((x) != UNQLITE_BUSY))
78/* Forward references */
79typedef struct unixInodeInfo unixInodeInfo; /* An i-node */
80typedef struct UnixUnusedFd UnixUnusedFd; /* An unused file descriptor */
81/*
82** Sometimes, after a file handle is closed by SQLite, the file descriptor
83** cannot be closed immediately. In these cases, instances of the following
84** structure are used to store the file descriptor while waiting for an
85** opportunity to either close or reuse it.
86*/
87struct UnixUnusedFd {
88 int fd; /* File descriptor to close */
89 int flags; /* Flags this file descriptor was opened with */
90 UnixUnusedFd *pNext; /* Next unused file descriptor on same file */
91};
92/*
93** The unixFile structure is subclass of unqlite3_file specific to the unix
94** VFS implementations.
95*/
96typedef struct unixFile unixFile;
97struct unixFile {
98 const unqlite_io_methods *pMethod; /* Always the first entry */
99 unixInodeInfo *pInode; /* Info about locks on this inode */
100 int h; /* The file descriptor */
101 int dirfd; /* File descriptor for the directory */
102 unsigned char eFileLock; /* The type of lock held on this fd */
103 int lastErrno; /* The unix errno from last I/O error */
104 void *lockingContext; /* Locking style specific state */
105 UnixUnusedFd *pUnused; /* Pre-allocated UnixUnusedFd */
106 int fileFlags; /* Miscellanous flags */
107 const char *zPath; /* Name of the file */
108 unsigned fsFlags; /* cached details from statfs() */
109};
110/*
111** The following macros define bits in unixFile.fileFlags
112*/
113#define UNQLITE_WHOLE_FILE_LOCKING 0x0001 /* Use whole-file locking */
114/*
115** Define various macros that are missing from some systems.
116*/
117#ifndef O_LARGEFILE
118# define O_LARGEFILE 0
119#endif
120#ifndef O_NOFOLLOW
121# define O_NOFOLLOW 0
122#endif
123#ifndef O_BINARY
124# define O_BINARY 0
125#endif
/*
** unixEnterMutex() and unixLeaveMutex() obtain and relinquish the global
** mutex that guards the unixInodeInfo objects used by this file, which
** may be shared by several threads. Typical use:
**
**   unixEnterMutex();
**   ... read or modify pFile->pInode ...
**   unixLeaveMutex();
**
** Both routines compile to nothing unless UNQLITE_ENABLE_THREADS is
** defined, and do nothing at run time when no mutex methods are exported.
*/
static void unixEnterMutex(void){
#ifdef UNQLITE_ENABLE_THREADS
  const SyMutexMethods *pMethods = SyMutexExportMethods();
  SyMutex *pGlobal;
  if( !pMethods ){
    return;
  }
  /* Static mutexes are pre-allocated, so xNew() never fails here */
  pGlobal = pMethods->xNew(SXMUTEX_TYPE_STATIC_2);
  SyMutexEnter(pMethods,pGlobal);
#endif /* UNQLITE_ENABLE_THREADS */
}
/*
** Release the global mutex taken by unixEnterMutex(). A no-op when
** UNQLITE_ENABLE_THREADS is not defined or no mutex methods are exported.
*/
static void unixLeaveMutex(void){
#ifdef UNQLITE_ENABLE_THREADS
  const SyMutexMethods *pMethods = SyMutexExportMethods();
  SyMutex *pGlobal;
  if( !pMethods ){
    return;
  }
  /* Same pre-allocated static mutex that unixEnterMutex() entered */
  pGlobal = pMethods->xNew(SXMUTEX_TYPE_STATIC_2);
  SyMutexLeave(pMethods,pGlobal);
#endif /* UNQLITE_ENABLE_THREADS */
}
158/*
159** This routine translates a standard POSIX errno code into something
160** useful to the clients of the unqlite3 functions. Specifically, it is
161** intended to translate a variety of "try again" errors into UNQLITE_BUSY
162** and a variety of "please close the file descriptor NOW" errors into
163** UNQLITE_IOERR
164**
165** Errors during initialization of locks, or file system support for locks,
166** should handle ENOLCK, ENOTSUP, EOPNOTSUPP separately.
167*/
168static int unqliteErrorFromPosixError(int posixError, int unqliteIOErr) {
169 switch (posixError) {
170 case 0:
171 return UNQLITE_OK;
172
173 case EAGAIN:
174 case ETIMEDOUT:
175 case EBUSY:
176 case EINTR:
177 case ENOLCK:
178 /* random NFS retry error, unless during file system support
179 * introspection, in which it actually means what it says */
180 return UNQLITE_BUSY;
181
182 case EACCES:
183 /* EACCES is like EAGAIN during locking operations, but not any other time*/
184 return UNQLITE_BUSY;
185
186 case EPERM:
187 return UNQLITE_PERM;
188
189 case EDEADLK:
190 return UNQLITE_IOERR;
191
192#if EOPNOTSUPP!=ENOTSUP
193 case EOPNOTSUPP:
194 /* something went terribly awry, unless during file system support
195 * introspection, in which it actually means what it says */
196#endif
197#ifdef ENOTSUP
198 case ENOTSUP:
199 /* invalid fd, unless during file system support introspection, in which
200 * it actually means what it says */
201#endif
202 case EIO:
203 case EBADF:
204 case EINVAL:
205 case ENOTCONN:
206 case ENODEV:
207 case ENXIO:
208 case ENOENT:
209 case ESTALE:
210 case ENOSYS:
211 /* these should force the client to close the file and reconnect */
212
213 default:
214 return unqliteIOErr;
215 }
216}
217/******************************************************************************
218*************************** Posix Advisory Locking ****************************
219**
220** POSIX advisory locks are broken by design. ANSI STD 1003.1 (1996)
221** section 6.5.2.2 lines 483 through 490 specify that when a process
222** sets or clears a lock, that operation overrides any prior locks set
223** by the same process. It does not explicitly say so, but this implies
224** that it overrides locks set by the same process using a different
225** file descriptor. Consider this test case:
226**
227** int fd1 = open("./file1", O_RDWR|O_CREAT, 0644);
228** int fd2 = open("./file2", O_RDWR|O_CREAT, 0644);
229**
230** Suppose ./file1 and ./file2 are really the same file (because
231** one is a hard or symbolic link to the other) then if you set
232** an exclusive lock on fd1, then try to get an exclusive lock
233** on fd2, it works. I would have expected the second lock to
234** fail since there was already a lock on the file due to fd1.
235** But not so. Since both locks came from the same process, the
236** second overrides the first, even though they were on different
237** file descriptors opened on different file names.
238**
239** This means that we cannot use POSIX locks to synchronize file access
240** among competing threads of the same process. POSIX locks will work fine
241** to synchronize access for threads in separate processes, but not
242** threads within the same process.
243**
244** To work around the problem, SQLite has to manage file locks internally
245** on its own. Whenever a new database is opened, we have to find the
246** specific inode of the database file (the inode is determined by the
247** st_dev and st_ino fields of the stat structure that fstat() fills in)
248** and check for locks already existing on that inode. When locks are
249** created or removed, we have to look at our own internal record of the
250** locks to see if another thread has previously set a lock on that same
251** inode.
252**
253** (Aside: The use of inode numbers as unique IDs does not work on VxWorks.
254** For VxWorks, we have to use the alternative unique ID system based on
255** canonical filename and implemented in the previous division.)
256**
257** There is one locking structure
258** per inode, so if the same inode is opened twice, both unixFile structures
259** point to the same locking structure. The locking structure keeps
260** a reference count (so we will know when to delete it) and a "cnt"
261** field that tells us its internal lock status. cnt==0 means the
262** file is unlocked. cnt==-1 means the file has an exclusive lock.
263** cnt>0 means there are cnt shared locks on the file.
264**
265** Any attempt to lock or unlock a file first checks the locking
266** structure. The fcntl() system call is only invoked to set a
267** POSIX lock if the internal lock structure transitions between
268** a locked and an unlocked state.
269**
270** But wait: there are yet more problems with POSIX advisory locks.
271**
272** If you close a file descriptor that points to a file that has locks,
273** all locks on that file that are owned by the current process are
274** released. To work around this problem, each unixInodeInfo object
275** maintains a count of the number of pending locks on that inode.
276** When an attempt is made to close an unixFile, if there are
277** other unixFile open on the same inode that are holding locks, the call
278** to close() the file descriptor is deferred until all of the locks clear.
279** The unixInodeInfo structure keeps a list of file descriptors that need to
280** be closed and that list is walked (and cleared) when the last lock
281** clears.
282**
283** Yet another problem: LinuxThreads do not play well with posix locks.
284**
285** Many older versions of linux use the LinuxThreads library which is
286** not posix compliant. Under LinuxThreads, a lock created by thread
287** A cannot be modified or overridden by a different thread B.
288** Only thread A can modify the lock. Locking behavior is correct
289** if the application uses the newer Native Posix Thread Library (NPTL)
290** on linux - with NPTL a lock created by thread A can override locks
291** in thread B. But there is no way to know at compile-time which
292** threading library is being used. So there is no way to know at
293** compile-time whether or not thread A can override locks on thread B.
294** One has to do a run-time check to discover the behavior of the
295** current process.
296**
297*/
298
/*
** Lookup key identifying one unixInodeInfo object. findInodeInfo() fills
** it from the st_dev and st_ino fields reported by fstat() and compares
** keys bytewise, so a key must be zeroed before its fields are set.
*/
struct unixFileId {
  dev_t dev;    /* Device number (st_dev) */
  ino_t ino;    /* Inode number (st_ino) */
};
307/*
308** An instance of the following structure is allocated for each open
309** inode. Or, on LinuxThreads, there is one of these structures for
310** each inode opened by each thread.
311**
312** A single inode can have multiple file descriptors, so each unixFile
313** structure contains a pointer to an instance of this object and this
314** object keeps a count of the number of unixFile pointing to it.
315*/
316struct unixInodeInfo {
317 struct unixFileId fileId; /* The lookup key */
318 int nShared; /* Number of SHARED locks held */
319 int eFileLock; /* One of SHARED_LOCK, RESERVED_LOCK etc. */
320 int nRef; /* Number of pointers to this structure */
321 int nLock; /* Number of outstanding file locks */
322 UnixUnusedFd *pUnused; /* Unused file descriptors to close */
323 unixInodeInfo *pNext; /* List of all unixInodeInfo objects */
324 unixInodeInfo *pPrev; /* .... doubly linked */
325};
326
327static unixInodeInfo *inodeList = 0;
328/*
329 * Local memory allocation stuff.
330 */
331static void * unqlite_malloc(sxu32 nByte)
332{
333 SyMemBackend *pAlloc;
334 void *p;
335 pAlloc = (SyMemBackend *)unqliteExportMemBackend();
336 p = SyMemBackendAlloc(pAlloc,nByte);
337 return p;
338}
339static void unqlite_free(void *p)
340{
341 SyMemBackend *pAlloc;
342 pAlloc = (SyMemBackend *)unqliteExportMemBackend();
343 SyMemBackendFree(pAlloc,p);
344}
345/*
346** Close all file descriptors accumuated in the unixInodeInfo->pUnused list.
347** If all such file descriptors are closed without error, the list is
348** cleared and UNQLITE_OK returned.
349**
350** Otherwise, if an error occurs, then successfully closed file descriptor
351** entries are removed from the list, and UNQLITE_IOERR_CLOSE returned.
352** not deleted and UNQLITE_IOERR_CLOSE returned.
353*/
354static int closePendingFds(unixFile *pFile){
355 int rc = UNQLITE_OK;
356 unixInodeInfo *pInode = pFile->pInode;
357 UnixUnusedFd *pError = 0;
358 UnixUnusedFd *p;
359 UnixUnusedFd *pNext;
360 for(p=pInode->pUnused; p; p=pNext){
361 pNext = p->pNext;
362 if( close(p->fd) ){
363 pFile->lastErrno = errno;
364 rc = UNQLITE_IOERR;
365 p->pNext = pError;
366 pError = p;
367 }else{
368 unqlite_free(p);
369 }
370 }
371 pInode->pUnused = pError;
372 return rc;
373}
374/*
375** Release a unixInodeInfo structure previously allocated by findInodeInfo().
376**
377** The mutex entered using the unixEnterMutex() function must be held
378** when this function is called.
379*/
380static void releaseInodeInfo(unixFile *pFile){
381 unixInodeInfo *pInode = pFile->pInode;
382 if( pInode ){
383 pInode->nRef--;
384 if( pInode->nRef==0 ){
385 closePendingFds(pFile);
386 if( pInode->pPrev ){
387 pInode->pPrev->pNext = pInode->pNext;
388 }else{
389 inodeList = pInode->pNext;
390 }
391 if( pInode->pNext ){
392 pInode->pNext->pPrev = pInode->pPrev;
393 }
394 unqlite_free(pInode);
395 }
396 }
397}
398/*
399** Given a file descriptor, locate the unixInodeInfo object that
400** describes that file descriptor. Create a new one if necessary. The
401** return value might be uninitialized if an error occurs.
402**
403** The mutex entered using the unixEnterMutex() function must be held
404** when this function is called.
405**
406** Return an appropriate error code.
407*/
408static int findInodeInfo(
409 unixFile *pFile, /* Unix file with file desc used in the key */
410 unixInodeInfo **ppInode /* Return the unixInodeInfo object here */
411){
412 int rc; /* System call return code */
413 int fd; /* The file descriptor for pFile */
414 struct unixFileId fileId; /* Lookup key for the unixInodeInfo */
415 struct stat statbuf; /* Low-level file information */
416 unixInodeInfo *pInode = 0; /* Candidate unixInodeInfo object */
417
418 /* Get low-level information about the file that we can used to
419 ** create a unique name for the file.
420 */
421 fd = pFile->h;
422 rc = fstat(fd, &statbuf);
423 if( rc!=0 ){
424 pFile->lastErrno = errno;
425#ifdef EOVERFLOW
426 if( pFile->lastErrno==EOVERFLOW ) return UNQLITE_NOTIMPLEMENTED;
427#endif
428 return UNQLITE_IOERR;
429 }
430
431#ifdef __APPLE__
432 /* On OS X on an msdos filesystem, the inode number is reported
433 ** incorrectly for zero-size files. See ticket #3260. To work
434 ** around this problem (we consider it a bug in OS X, not SQLite)
435 ** we always increase the file size to 1 by writing a single byte
436 ** prior to accessing the inode number. The one byte written is
437 ** an ASCII 'S' character which also happens to be the first byte
438 ** in the header of every SQLite database. In this way, if there
439 ** is a race condition such that another thread has already populated
440 ** the first page of the database, no damage is done.
441 */
442 if( statbuf.st_size==0 && (pFile->fsFlags & UNQLITE_FSFLAGS_IS_MSDOS)!=0 ){
443 rc = write(fd, "S", 1);
444 if( rc!=1 ){
445 pFile->lastErrno = errno;
446 return UNQLITE_IOERR;
447 }
448 rc = fstat(fd, &statbuf);
449 if( rc!=0 ){
450 pFile->lastErrno = errno;
451 return UNQLITE_IOERR;
452 }
453 }
454#endif
455 SyZero(&fileId,sizeof(fileId));
456 fileId.dev = statbuf.st_dev;
457 fileId.ino = statbuf.st_ino;
458 pInode = inodeList;
459 while( pInode && SyMemcmp((const void *)&fileId,(const void *)&pInode->fileId, sizeof(fileId)) ){
460 pInode = pInode->pNext;
461 }
462 if( pInode==0 ){
463 pInode = (unixInodeInfo *)unqlite_malloc( sizeof(*pInode) );
464 if( pInode==0 ){
465 return UNQLITE_NOMEM;
466 }
467 SyZero(pInode,sizeof(*pInode));
468 SyMemcpy((const void *)&fileId,(void *)&pInode->fileId,sizeof(fileId));
469 pInode->nRef = 1;
470 pInode->pNext = inodeList;
471 pInode->pPrev = 0;
472 if( inodeList ) inodeList->pPrev = pInode;
473 inodeList = pInode;
474 }else{
475 pInode->nRef++;
476 }
477 *ppInode = pInode;
478 return UNQLITE_OK;
479}
480/*
481** This routine checks if there is a RESERVED lock held on the specified
482** file by this or any other process. If such a lock is held, set *pResOut
483** to a non-zero value otherwise *pResOut is set to zero. The return value
484** is set to UNQLITE_OK unless an I/O error occurs during lock checking.
485*/
486static int unixCheckReservedLock(unqlite_file *id, int *pResOut){
487 int rc = UNQLITE_OK;
488 int reserved = 0;
489 unixFile *pFile = (unixFile*)id;
490
491
492 unixEnterMutex(); /* Because pFile->pInode is shared across threads */
493
494 /* Check if a thread in this process holds such a lock */
495 if( pFile->pInode->eFileLock>SHARED_LOCK ){
496 reserved = 1;
497 }
498
499 /* Otherwise see if some other process holds it.
500 */
501 if( !reserved ){
502 struct flock lock;
503 lock.l_whence = SEEK_SET;
504 lock.l_start = RESERVED_BYTE;
505 lock.l_len = 1;
506 lock.l_type = F_WRLCK;
507 if (-1 == fcntl(pFile->h, F_GETLK, &lock)) {
508 int tErrno = errno;
509 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
510 pFile->lastErrno = tErrno;
511 } else if( lock.l_type!=F_UNLCK ){
512 reserved = 1;
513 }
514 }
515
516 unixLeaveMutex();
517
518 *pResOut = reserved;
519 return rc;
520}
521/*
522** Lock the file with the lock specified by parameter eFileLock - one
523** of the following:
524**
525** (1) SHARED_LOCK
526** (2) RESERVED_LOCK
527** (3) PENDING_LOCK
528** (4) EXCLUSIVE_LOCK
529**
530** Sometimes when requesting one lock state, additional lock states
531** are inserted in between. The locking might fail on one of the later
532** transitions leaving the lock state different from what it started but
533** still short of its goal. The following chart shows the allowed
534** transitions and the inserted intermediate states:
535**
536** UNLOCKED -> SHARED
537** SHARED -> RESERVED
538** SHARED -> (PENDING) -> EXCLUSIVE
539** RESERVED -> (PENDING) -> EXCLUSIVE
540** PENDING -> EXCLUSIVE
541**
542** This routine will only increase a lock. Use the unqliteOsUnlock()
543** routine to lower a locking level.
544*/
545static int unixLock(unqlite_file *id, int eFileLock){
546 /* The following describes the implementation of the various locks and
547 ** lock transitions in terms of the POSIX advisory shared and exclusive
548 ** lock primitives (called read-locks and write-locks below, to avoid
549 ** confusion with SQLite lock names). The algorithms are complicated
550 ** slightly in order to be compatible with unixdows systems simultaneously
551 ** accessing the same database file, in case that is ever required.
552 **
553 ** Symbols defined in os.h indentify the 'pending byte' and the 'reserved
554 ** byte', each single bytes at well known offsets, and the 'shared byte
555 ** range', a range of 510 bytes at a well known offset.
556 **
557 ** To obtain a SHARED lock, a read-lock is obtained on the 'pending
558 ** byte'. If this is successful, a random byte from the 'shared byte
559 ** range' is read-locked and the lock on the 'pending byte' released.
560 **
561 ** A process may only obtain a RESERVED lock after it has a SHARED lock.
562 ** A RESERVED lock is implemented by grabbing a write-lock on the
563 ** 'reserved byte'.
564 **
565 ** A process may only obtain a PENDING lock after it has obtained a
566 ** SHARED lock. A PENDING lock is implemented by obtaining a write-lock
567 ** on the 'pending byte'. This ensures that no new SHARED locks can be
568 ** obtained, but existing SHARED locks are allowed to persist. A process
569 ** does not have to obtain a RESERVED lock on the way to a PENDING lock.
570 ** This property is used by the algorithm for rolling back a journal file
571 ** after a crash.
572 **
573 ** An EXCLUSIVE lock, obtained after a PENDING lock is held, is
574 ** implemented by obtaining a write-lock on the entire 'shared byte
575 ** range'. Since all other locks require a read-lock on one of the bytes
576 ** within this range, this ensures that no other locks are held on the
577 ** database.
578 **
579 ** The reason a single byte cannot be used instead of the 'shared byte
580 ** range' is that some versions of unixdows do not support read-locks. By
581 ** locking a random byte from a range, concurrent SHARED locks may exist
582 ** even if the locking primitive used is always a write-lock.
583 */
584 int rc = UNQLITE_OK;
585 unixFile *pFile = (unixFile*)id;
586 unixInodeInfo *pInode = pFile->pInode;
587 struct flock lock;
588 int s = 0;
589 int tErrno = 0;
590
591 /* If there is already a lock of this type or more restrictive on the
592 ** unixFile, do nothing. Don't use the end_lock: exit path, as
593 ** unixEnterMutex() hasn't been called yet.
594 */
595 if( pFile->eFileLock>=eFileLock ){
596 return UNQLITE_OK;
597 }
598 /* This mutex is needed because pFile->pInode is shared across threads
599 */
600 unixEnterMutex();
601 pInode = pFile->pInode;
602
603 /* If some thread using this PID has a lock via a different unixFile*
604 ** handle that precludes the requested lock, return BUSY.
605 */
606 if( (pFile->eFileLock!=pInode->eFileLock &&
607 (pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
608 ){
609 rc = UNQLITE_BUSY;
610 goto end_lock;
611 }
612
613 /* If a SHARED lock is requested, and some thread using this PID already
614 ** has a SHARED or RESERVED lock, then increment reference counts and
615 ** return UNQLITE_OK.
616 */
617 if( eFileLock==SHARED_LOCK &&
618 (pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
619 pFile->eFileLock = SHARED_LOCK;
620 pInode->nShared++;
621 pInode->nLock++;
622 goto end_lock;
623 }
624 /* A PENDING lock is needed before acquiring a SHARED lock and before
625 ** acquiring an EXCLUSIVE lock. For the SHARED lock, the PENDING will
626 ** be released.
627 */
628 lock.l_len = 1L;
629 lock.l_whence = SEEK_SET;
630 if( eFileLock==SHARED_LOCK
631 || (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
632 ){
633 lock.l_type = (eFileLock==SHARED_LOCK?F_RDLCK:F_WRLCK);
634 lock.l_start = PENDING_BYTE;
635 s = fcntl(pFile->h, F_SETLK, &lock);
636 if( s==(-1) ){
637 tErrno = errno;
638 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
639 if( IS_LOCK_ERROR(rc) ){
640 pFile->lastErrno = tErrno;
641 }
642 goto end_lock;
643 }
644 }
645 /* If control gets to this point, then actually go ahead and make
646 ** operating system calls for the specified lock.
647 */
648 if( eFileLock==SHARED_LOCK ){
649 /* Now get the read-lock */
650 lock.l_start = SHARED_FIRST;
651 lock.l_len = SHARED_SIZE;
652 if( (s = fcntl(pFile->h, F_SETLK, &lock))==(-1) ){
653 tErrno = errno;
654 }
655 /* Drop the temporary PENDING lock */
656 lock.l_start = PENDING_BYTE;
657 lock.l_len = 1L;
658 lock.l_type = F_UNLCK;
659 if( fcntl(pFile->h, F_SETLK, &lock)!=0 ){
660 if( s != -1 ){
661 /* This could happen with a network mount */
662 tErrno = errno;
663 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
664 if( IS_LOCK_ERROR(rc) ){
665 pFile->lastErrno = tErrno;
666 }
667 goto end_lock;
668 }
669 }
670 if( s==(-1) ){
671 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
672 if( IS_LOCK_ERROR(rc) ){
673 pFile->lastErrno = tErrno;
674 }
675 }else{
676 pFile->eFileLock = SHARED_LOCK;
677 pInode->nLock++;
678 pInode->nShared = 1;
679 }
680 }else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
681 /* We are trying for an exclusive lock but another thread in this
682 ** same process is still holding a shared lock. */
683 rc = UNQLITE_BUSY;
684 }else{
685 /* The request was for a RESERVED or EXCLUSIVE lock. It is
686 ** assumed that there is a SHARED or greater lock on the file
687 ** already.
688 */
689 lock.l_type = F_WRLCK;
690 switch( eFileLock ){
691 case RESERVED_LOCK:
692 lock.l_start = RESERVED_BYTE;
693 break;
694 case EXCLUSIVE_LOCK:
695 lock.l_start = SHARED_FIRST;
696 lock.l_len = SHARED_SIZE;
697 break;
698 default:
699 /* Can't happen */
700 break;
701 }
702 s = fcntl(pFile->h, F_SETLK, &lock);
703 if( s==(-1) ){
704 tErrno = errno;
705 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
706 if( IS_LOCK_ERROR(rc) ){
707 pFile->lastErrno = tErrno;
708 }
709 }
710 }
711 if( rc==UNQLITE_OK ){
712 pFile->eFileLock = eFileLock;
713 pInode->eFileLock = eFileLock;
714 }else if( eFileLock==EXCLUSIVE_LOCK ){
715 pFile->eFileLock = PENDING_LOCK;
716 pInode->eFileLock = PENDING_LOCK;
717 }
718end_lock:
719 unixLeaveMutex();
720 return rc;
721}
722/*
723** Add the file descriptor used by file handle pFile to the corresponding
724** pUnused list.
725*/
726static void setPendingFd(unixFile *pFile){
727 unixInodeInfo *pInode = pFile->pInode;
728 UnixUnusedFd *p = pFile->pUnused;
729 p->pNext = pInode->pUnused;
730 pInode->pUnused = p;
731 pFile->h = -1;
732 pFile->pUnused = 0;
733}
734/*
735** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
736** must be either NO_LOCK or SHARED_LOCK.
737**
738** If the locking level of the file descriptor is already at or below
739** the requested locking level, this routine is a no-op.
740**
741** If handleNFSUnlock is true, then on downgrading an EXCLUSIVE_LOCK to SHARED
742** the byte range is divided into 2 parts and the first part is unlocked then
743** set to a read lock, then the other part is simply unlocked. This works
744** around a bug in BSD NFS lockd (also seen on MacOSX 10.3+) that fails to
745** remove the write lock on a region when a read lock is set.
746*/
747static int _posixUnlock(unqlite_file *id, int eFileLock, int handleNFSUnlock){
748 unixFile *pFile = (unixFile*)id;
749 unixInodeInfo *pInode;
750 struct flock lock;
751 int rc = UNQLITE_OK;
752 int h;
753 int tErrno; /* Error code from system call errors */
754
755 if( pFile->eFileLock<=eFileLock ){
756 return UNQLITE_OK;
757 }
758 unixEnterMutex();
759
760 h = pFile->h;
761 pInode = pFile->pInode;
762
763 if( pFile->eFileLock>SHARED_LOCK ){
764 /* downgrading to a shared lock on NFS involves clearing the write lock
765 ** before establishing the readlock - to avoid a race condition we downgrade
766 ** the lock in 2 blocks, so that part of the range will be covered by a
767 ** write lock until the rest is covered by a read lock:
768 ** 1: [WWWWW]
769 ** 2: [....W]
770 ** 3: [RRRRW]
771 ** 4: [RRRR.]
772 */
773 if( eFileLock==SHARED_LOCK ){
774 if( handleNFSUnlock ){
775 off_t divSize = SHARED_SIZE - 1;
776
777 lock.l_type = F_UNLCK;
778 lock.l_whence = SEEK_SET;
779 lock.l_start = SHARED_FIRST;
780 lock.l_len = divSize;
781 if( fcntl(h, F_SETLK, &lock)==(-1) ){
782 tErrno = errno;
783 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
784 if( IS_LOCK_ERROR(rc) ){
785 pFile->lastErrno = tErrno;
786 }
787 goto end_unlock;
788 }
789 lock.l_type = F_RDLCK;
790 lock.l_whence = SEEK_SET;
791 lock.l_start = SHARED_FIRST;
792 lock.l_len = divSize;
793 if( fcntl(h, F_SETLK, &lock)==(-1) ){
794 tErrno = errno;
795 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
796 if( IS_LOCK_ERROR(rc) ){
797 pFile->lastErrno = tErrno;
798 }
799 goto end_unlock;
800 }
801 lock.l_type = F_UNLCK;
802 lock.l_whence = SEEK_SET;
803 lock.l_start = SHARED_FIRST+divSize;
804 lock.l_len = SHARED_SIZE-divSize;
805 if( fcntl(h, F_SETLK, &lock)==(-1) ){
806 tErrno = errno;
807 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
808 if( IS_LOCK_ERROR(rc) ){
809 pFile->lastErrno = tErrno;
810 }
811 goto end_unlock;
812 }
813 }else{
814 lock.l_type = F_RDLCK;
815 lock.l_whence = SEEK_SET;
816 lock.l_start = SHARED_FIRST;
817 lock.l_len = SHARED_SIZE;
818 if( fcntl(h, F_SETLK, &lock)==(-1) ){
819 tErrno = errno;
820 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
821 if( IS_LOCK_ERROR(rc) ){
822 pFile->lastErrno = tErrno;
823 }
824 goto end_unlock;
825 }
826 }
827 }
828 lock.l_type = F_UNLCK;
829 lock.l_whence = SEEK_SET;
830 lock.l_start = PENDING_BYTE;
831 lock.l_len = 2L;
832 if( fcntl(h, F_SETLK, &lock)!=(-1) ){
833 pInode->eFileLock = SHARED_LOCK;
834 }else{
835 tErrno = errno;
836 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
837 if( IS_LOCK_ERROR(rc) ){
838 pFile->lastErrno = tErrno;
839 }
840 goto end_unlock;
841 }
842 }
843 if( eFileLock==NO_LOCK ){
844 /* Decrement the shared lock counter. Release the lock using an
845 ** OS call only when all threads in this same process have released
846 ** the lock.
847 */
848 pInode->nShared--;
849 if( pInode->nShared==0 ){
850 lock.l_type = F_UNLCK;
851 lock.l_whence = SEEK_SET;
852 lock.l_start = lock.l_len = 0L;
853
854 if( fcntl(h, F_SETLK, &lock)!=(-1) ){
855 pInode->eFileLock = NO_LOCK;
856 }else{
857 tErrno = errno;
858 rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
859 if( IS_LOCK_ERROR(rc) ){
860 pFile->lastErrno = tErrno;
861 }
862 pInode->eFileLock = NO_LOCK;
863 pFile->eFileLock = NO_LOCK;
864 }
865 }
866
867 /* Decrement the count of locks against this same file. When the
868 ** count reaches zero, close any other file descriptors whose close
869 ** was deferred because of outstanding locks.
870 */
871 pInode->nLock--;
872
873 if( pInode->nLock==0 ){
874 int rc2 = closePendingFds(pFile);
875 if( rc==UNQLITE_OK ){
876 rc = rc2;
877 }
878 }
879 }
880
881end_unlock:
882
883 unixLeaveMutex();
884
885 if( rc==UNQLITE_OK ) pFile->eFileLock = eFileLock;
886 return rc;
887}
888/*
889** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
890** must be either NO_LOCK or SHARED_LOCK.
891**
892** If the locking level of the file descriptor is already at or below
893** the requested locking level, this routine is a no-op.
894*/
895static int unixUnlock(unqlite_file *id, int eFileLock){
896 return _posixUnlock(id, eFileLock, 0);
897}
898/*
899** This function performs the parts of the "close file" operation
900** common to all locking schemes. It closes the directory and file
901** handles, if they are valid, and sets all fields of the unixFile
902** structure to 0.
903**
904*/
905static int closeUnixFile(unqlite_file *id){
906 unixFile *pFile = (unixFile*)id;
907 if( pFile ){
908 if( pFile->dirfd>=0 ){
909 int err = close(pFile->dirfd);
910 if( err ){
911 pFile->lastErrno = errno;
912 return UNQLITE_IOERR;
913 }else{
914 pFile->dirfd=-1;
915 }
916 }
917 if( pFile->h>=0 ){
918 int err = close(pFile->h);
919 if( err ){
920 pFile->lastErrno = errno;
921 return UNQLITE_IOERR;
922 }
923 }
924 unqlite_free(pFile->pUnused);
925 SyZero(pFile,sizeof(unixFile));
926 }
927 return UNQLITE_OK;
928}
929/*
930** Close a file.
931*/
932static int unixClose(unqlite_file *id){
933 int rc = UNQLITE_OK;
934 if( id ){
935 unixFile *pFile = (unixFile *)id;
936 unixUnlock(id, NO_LOCK);
937 unixEnterMutex();
938 if( pFile->pInode && pFile->pInode->nLock ){
939 /* If there are outstanding locks, do not actually close the file just
940 ** yet because that would clear those locks. Instead, add the file
941 ** descriptor to pInode->pUnused list. It will be automatically closed
942 ** when the last lock is cleared.
943 */
944 setPendingFd(pFile);
945 }
946 releaseInodeInfo(pFile);
947 rc = closeUnixFile(id);
948 unixLeaveMutex();
949 }
950 return rc;
951}
952/************** End of the posix advisory lock implementation *****************
953******************************************************************************/
954/*
955**
956** The next division contains implementations for all methods of the
957** unqlite_file object other than the locking methods. The locking
958** methods were defined in divisions above (one locking method per
959** division). Those methods that are common to all locking modes
960** are gathered together into this division.
961*/
962/*
963** Seek to the offset passed as the second argument, then read cnt
964** bytes into pBuf. Return the number of bytes actually read.
965**
966** NB: If you define USE_PREAD or USE_PREAD64, then it might also
967** be necessary to define _XOPEN_SOURCE to be 500. This varies from
968** one system to another. Since SQLite does not define USE_PREAD
969** any form by default, we will not attempt to define _XOPEN_SOURCE.
970** See tickets #2741 and #2681.
971**
972** To avoid stomping the errno value on a failed read the lastErrno value
973** is set before returning.
974*/
975static int seekAndRead(unixFile *id, unqlite_int64 offset, void *pBuf, int cnt){
976 int got;
977#if (!defined(USE_PREAD) && !defined(USE_PREAD64))
978 unqlite_int64 newOffset;
979#endif
980
981#if defined(USE_PREAD)
982 got = pread(id->h, pBuf, cnt, offset);
983#elif defined(USE_PREAD64)
984 got = pread64(id->h, pBuf, cnt, offset);
985#else
986 newOffset = lseek(id->h, offset, SEEK_SET);
987
988 if( newOffset!=offset ){
989 if( newOffset == -1 ){
990 ((unixFile*)id)->lastErrno = errno;
991 }else{
992 ((unixFile*)id)->lastErrno = 0;
993 }
994 return -1;
995 }
996 got = read(id->h, pBuf, cnt);
997#endif
998 if( got<0 ){
999 ((unixFile*)id)->lastErrno = errno;
1000 }
1001 return got;
1002}
1003/*
1004** Read data from a file into a buffer. Return UNQLITE_OK if all
1005** bytes were read successfully and UNQLITE_IOERR if anything goes
1006** wrong.
1007*/
1008static int unixRead(
1009 unqlite_file *id,
1010 void *pBuf,
1011 unqlite_int64 amt,
1012 unqlite_int64 offset
1013){
1014 unixFile *pFile = (unixFile *)id;
1015 int got;
1016
1017 got = seekAndRead(pFile, offset, pBuf, (int)amt);
1018 if( got==(int)amt ){
1019 return UNQLITE_OK;
1020 }else if( got<0 ){
1021 /* lastErrno set by seekAndRead */
1022 return UNQLITE_IOERR;
1023 }else{
1024 pFile->lastErrno = 0; /* not a system error */
1025 /* Unread parts of the buffer must be zero-filled */
1026 SyZero(&((char*)pBuf)[got],(sxu32)amt-got);
1027 return UNQLITE_IOERR;
1028 }
1029}
1030/*
1031** Seek to the offset passed as the second argument, then write cnt
1032** bytes from pBuf. Return the number of bytes actually written.
1033**
1034** To avoid stomping the errno value on a failed write the lastErrno value
1035** is set before returning.
1036*/
1037static int seekAndWrite(unixFile *id, unqlite_int64 offset, const void *pBuf, unqlite_int64 cnt){
1038 int got;
1039#if (!defined(USE_PREAD) && !defined(USE_PREAD64))
1040 unqlite_int64 newOffset;
1041#endif
1042
1043#if defined(USE_PREAD)
1044 got = pwrite(id->h, pBuf, cnt, offset);
1045#elif defined(USE_PREAD64)
1046 got = pwrite64(id->h, pBuf, cnt, offset);
1047#else
1048 newOffset = lseek(id->h, offset, SEEK_SET);
1049 if( newOffset!=offset ){
1050 if( newOffset == -1 ){
1051 ((unixFile*)id)->lastErrno = errno;
1052 }else{
1053 ((unixFile*)id)->lastErrno = 0;
1054 }
1055 return -1;
1056 }
1057 got = write(id->h, pBuf, cnt);
1058#endif
1059 if( got<0 ){
1060 ((unixFile*)id)->lastErrno = errno;
1061 }
1062 return got;
1063}
1064/*
1065** Write data from a buffer into a file. Return UNQLITE_OK on success
1066** or some other error code on failure.
1067*/
1068static int unixWrite(
1069 unqlite_file *id,
1070 const void *pBuf,
1071 unqlite_int64 amt,
1072 unqlite_int64 offset
1073){
1074 unixFile *pFile = (unixFile*)id;
1075 int wrote = 0;
1076
1077 while( amt>0 && (wrote = seekAndWrite(pFile, offset, pBuf, amt))>0 ){
1078 amt -= wrote;
1079 offset += wrote;
1080 pBuf = &((char*)pBuf)[wrote];
1081 }
1082
1083 if( amt>0 ){
1084 if( wrote<0 ){
1085 /* lastErrno set by seekAndWrite */
1086 return UNQLITE_IOERR;
1087 }else{
1088 pFile->lastErrno = 0; /* not a system error */
1089 return UNQLITE_FULL;
1090 }
1091 }
1092 return UNQLITE_OK;
1093}
1094/*
1095** We do not trust systems to provide a working fdatasync(). Some do.
1096** Others do not. To be safe, we will stick with the (slower) fsync().
1097** If you know that your system does support fdatasync() correctly,
1098** then simply compile with -Dfdatasync=fdatasync
1099*/
/*
** Map fdatasync() onto fsync() unless a fdatasync macro is already
** defined or the target is Linux.
*/
#if !defined(__linux__)
# ifndef fdatasync
#  define fdatasync fsync
# endif
#endif

/*
** HAVE_FULLFSYNC is always defined: 1 when the F_FULLFSYNC macro exists,
** 0 when it does not. F_FULLFSYNC is currently only available on
** Mac OS X, but that could change.
*/
#ifndef F_FULLFSYNC
# define HAVE_FULLFSYNC 0
#else
# define HAVE_FULLFSYNC 1
#endif
1114/*
1115** The fsync() system call does not work as advertised on many
1116** unix systems. The following procedure is an attempt to make
1117** it work better.
1118**
1119**
1120** SQLite sets the dataOnly flag if the size of the file is unchanged.
1121** The idea behind dataOnly is that it should only write the file content
1122** to disk, not the inode. We only set dataOnly if the file size is
1123** unchanged since the file size is part of the inode. However,
1124** Ted Ts'o tells us that fdatasync() will also write the inode if the
1125** file size has changed. The only real difference between fdatasync()
1126** and fsync(), Ted tells us, is that fdatasync() will not flush the
1127** inode if the mtime or owner or other inode attributes have changed.
1128** We only care about the file size, not the other file attributes, so
1129** as far as SQLite is concerned, an fdatasync() is always adequate.
1130** So, we always use fdatasync() if it is available, regardless of
1131** the value of the dataOnly flag.
1132*/
/*
** Flush descriptor fd to stable storage using the strongest primitive
** selected at compile time:
**
**   HAVE_FULLFSYNC : fcntl(F_FULLFSYNC) when fullSync is set, falling
**                    back to fsync() if that fails or fullSync is clear.
**   __APPLE__      : fsync().
**   otherwise      : fdatasync() (possibly aliased to fsync()).
**
** dataOnly is accepted for interface compatibility and never consulted.
** Returns -1 if the final system call returned -1, and 0 otherwise.
*/
static int full_fsync(int fd, int fullSync, int dataOnly){
  int res;

  SXUNUSED(dataOnly);

#if HAVE_FULLFSYNC
  /* A non-zero result (failure, or full sync not requested) routes the
  ** request to plain fsync(). A failing F_FULLFSYNC is taken to mean the
  ** file system does not support it.
  */
  res = fullSync ? fcntl(fd, F_FULLFSYNC, 0) : 1;
  if( res ){
    res = fsync(fd);
  }
#elif defined(__APPLE__)
  /* fdatasync() is not relied upon here to flush a changed file size. */
  SXUNUSED(fullSync);
  res = fsync(fd);
#else
  SXUNUSED(fullSync);
  res = fdatasync(fd);
#endif

  return (res == -1) ? -1 : 0;
}
1174/*
1175** Make sure all writes to a particular file are committed to disk.
1176**
1177** If dataOnly==0 then both the file itself and its metadata (file
1178** size, access time, etc) are synced. If dataOnly!=0 then only the
1179** file data is synced.
1180**
1181** Under Unix, also make sure that the directory entry for the file
1182** has been created by fsync-ing the directory that contains the file.
1183** If we do not do this and we encounter a power failure, the directory
1184** entry for the journal might not exist after we reboot. The next
1185** SQLite to access the file will not know that the journal exists (because
1186** the directory entry for the journal was never created) and the transaction
1187** will not roll back - possibly leading to database corruption.
1188*/
1189static int unixSync(unqlite_file *id, int flags){
1190 int rc;
1191 unixFile *pFile = (unixFile*)id;
1192
1193 int isDataOnly = (flags&UNQLITE_SYNC_DATAONLY);
1194 int isFullsync = (flags&0x0F)==UNQLITE_SYNC_FULL;
1195
1196 rc = full_fsync(pFile->h, isFullsync, isDataOnly);
1197
1198 if( rc ){
1199 pFile->lastErrno = errno;
1200 return UNQLITE_IOERR;
1201 }
1202 if( pFile->dirfd>=0 ){
1203 int err;
1204#ifndef UNQLITE_DISABLE_DIRSYNC
1205 /* The directory sync is only attempted if full_fsync is
1206 ** turned off or unavailable. If a full_fsync occurred above,
1207 ** then the directory sync is superfluous.
1208 */
1209 if( (!HAVE_FULLFSYNC || !isFullsync) && full_fsync(pFile->dirfd,0,0) ){
1210 /*
1211 ** We have received multiple reports of fsync() returning
1212 ** errors when applied to directories on certain file systems.
1213 ** A failed directory sync is not a big deal. So it seems
1214 ** better to ignore the error. Ticket #1657
1215 */
1216 /* pFile->lastErrno = errno; */
1217 /* return UNQLITE_IOERR; */
1218 }
1219#endif
1220 err = close(pFile->dirfd); /* Only need to sync once, so close the */
1221 if( err==0 ){ /* directory when we are done */
1222 pFile->dirfd = -1;
1223 }else{
1224 pFile->lastErrno = errno;
1225 rc = UNQLITE_IOERR;
1226 }
1227 }
1228 return rc;
1229}
1230/*
1231** Truncate an open file to a specified size
1232*/
1233static int unixTruncate(unqlite_file *id, sxi64 nByte){
1234 unixFile *pFile = (unixFile *)id;
1235 int rc;
1236
1237 rc = ftruncate(pFile->h, (off_t)nByte);
1238 if( rc ){
1239 pFile->lastErrno = errno;
1240 return UNQLITE_IOERR;
1241 }else{
1242 return UNQLITE_OK;
1243 }
1244}
1245/*
1246** Determine the current size of a file in bytes
1247*/
1248static int unixFileSize(unqlite_file *id,sxi64 *pSize){
1249 int rc;
1250 struct stat buf;
1251
1252 rc = fstat(((unixFile*)id)->h, &buf);
1253
1254 if( rc!=0 ){
1255 ((unixFile*)id)->lastErrno = errno;
1256 return UNQLITE_IOERR;
1257 }
1258 *pSize = buf.st_size;
1259
1260 /* When opening a zero-size database, the findInodeInfo() procedure
1261 ** writes a single byte into that file in order to work around a bug
1262 ** in the OS-X msdos filesystem. In order to avoid problems with upper
1263 ** layers, we need to report this file size as zero even though it is
1264 ** really 1. Ticket #3260.
1265 */
1266 if( *pSize==1 ) *pSize = 0;
1267
1268 return UNQLITE_OK;
1269}
1270/*
1271** Return the sector size in bytes of the underlying block device for
1272** the specified file. This is almost always 512 bytes, but may be
1273** larger for some devices.
1274**
1275** SQLite code assumes this function cannot fail. It also assumes that
1276** if two files are created in the same file-system directory (i.e.
1277** a database and its journal file) that the sector size will be the
1278** same for both.
1279*/
1280static int unixSectorSize(unqlite_file *NotUsed){
1281 SXUNUSED(NotUsed);
1282 return UNQLITE_DEFAULT_SECTOR_SIZE;
1283}
1284/*
1285** This vector defines all the methods that can operate on an
1286** unqlite_file for Unix systems.
1287*/
1288static const unqlite_io_methods unixIoMethod = {
1289 1, /* iVersion */
1290 unixClose, /* xClose */
1291 unixRead, /* xRead */
1292 unixWrite, /* xWrite */
1293 unixTruncate, /* xTruncate */
1294 unixSync, /* xSync */
1295 unixFileSize, /* xFileSize */
1296 unixLock, /* xLock */
1297 unixUnlock, /* xUnlock */
1298 unixCheckReservedLock, /* xCheckReservedLock */
1299 unixSectorSize, /* xSectorSize */
1300};
1301/****************************************************************************
1302**************************** unqlite_vfs methods ****************************
1303**
1304** This division contains the implementation of methods on the
1305** unqlite_vfs object.
1306*/
1307/*
1308** Initialize the contents of the unixFile structure pointed to by pId.
1309*/
1310static int fillInUnixFile(
1311 unqlite_vfs *pVfs, /* Pointer to vfs object */
1312 int h, /* Open file descriptor of file being opened */
1313 int dirfd, /* Directory file descriptor */
1314 unqlite_file *pId, /* Write to the unixFile structure here */
1315 const char *zFilename, /* Name of the file being opened */
1316 int noLock, /* Omit locking if true */
1317 int isDelete /* Delete on close if true */
1318){
1319 const unqlite_io_methods *pLockingStyle = &unixIoMethod;
1320 unixFile *pNew = (unixFile *)pId;
1321 int rc = UNQLITE_OK;
1322
1323 /* Parameter isDelete is only used on vxworks. Express this explicitly
1324 ** here to prevent compiler warnings about unused parameters.
1325 */
1326 SXUNUSED(isDelete);
1327 SXUNUSED(noLock);
1328 SXUNUSED(pVfs);
1329
1330 pNew->h = h;
1331 pNew->dirfd = dirfd;
1332 pNew->fileFlags = 0;
1333 pNew->zPath = zFilename;
1334
1335 unixEnterMutex();
1336 rc = findInodeInfo(pNew, &pNew->pInode);
1337 if( rc!=UNQLITE_OK ){
1338 /* If an error occured in findInodeInfo(), close the file descriptor
1339 ** immediately, before releasing the mutex. findInodeInfo() may fail
1340 ** in two scenarios:
1341 **
1342 ** (a) A call to fstat() failed.
1343 ** (b) A malloc failed.
1344 **
1345 ** Scenario (b) may only occur if the process is holding no other
1346 ** file descriptors open on the same file. If there were other file
1347 ** descriptors on this file, then no malloc would be required by
1348 ** findInodeInfo(). If this is the case, it is quite safe to close
1349 ** handle h - as it is guaranteed that no posix locks will be released
1350 ** by doing so.
1351 **
1352 ** If scenario (a) caused the error then things are not so safe. The
1353 ** implicit assumption here is that if fstat() fails, things are in
1354 ** such bad shape that dropping a lock or two doesn't matter much.
1355 */
1356 close(h);
1357 h = -1;
1358 }
1359 unixLeaveMutex();
1360
1361 pNew->lastErrno = 0;
1362 if( rc!=UNQLITE_OK ){
1363 if( dirfd>=0 ) close(dirfd); /* silent leak if fail, already in error */
1364 if( h>=0 ) close(h);
1365 }else{
1366 pNew->pMethod = pLockingStyle;
1367 }
1368 return rc;
1369}
1370/*
1371** Open a file descriptor to the directory containing file zFilename.
1372** If successful, *pFd is set to the opened file descriptor and
1373** UNQLITE_OK is returned. If an error occurs, UNQLITE_IOERR is returned
1374** and *pFd is set to -1.
1376**
1377** If UNQLITE_OK is returned, the caller is responsible for closing
1378** the file descriptor *pFd using close().
1379*/
1380static int openDirectory(const char *zFilename, int *pFd){
1381 sxu32 ii;
1382 int fd = -1;
1383 char zDirname[MAX_PATHNAME+1];
1384 sxu32 n;
1385 n = Systrcpy(zDirname,sizeof(zDirname),zFilename,0);
1386 for(ii=n; ii>1 && zDirname[ii]!='/'; ii--);
1387 if( ii>0 ){
1388 zDirname[ii] = '\0';
1389 fd = open(zDirname, O_RDONLY|O_BINARY, 0);
1390 if( fd>=0 ){
1391#ifdef FD_CLOEXEC
1392 fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
1393#endif
1394 }
1395 }
1396 *pFd = fd;
1397 return (fd>=0?UNQLITE_OK: UNQLITE_IOERR );
1398}
1399/*
1400** Search for an unused file descriptor that was opened on the database
1401** file (not a journal or master-journal file) identified by pathname
1402** zPath with UNQLITE_OPEN_XXX flags matching those passed as the second
1403** argument to this function.
1404**
1405** Such a file descriptor may exist if a database connection was closed
1406** but the associated file descriptor could not be closed because some
1407** other file descriptor open on the same file is holding a file-lock.
1408** Refer to comments in the unixClose() function and the lengthy comment
1409** describing "Posix Advisory Locking" at the start of this file for
1410** further details. Also, ticket #4018.
1411**
1412** If a suitable UnixUnusedFd entry is found, it is removed from the
1413** inode's unused list and returned. If none is located, a null pointer
1413** is returned.
1414*/
1415static UnixUnusedFd *findReusableFd(const char *zPath, int flags){
1416 UnixUnusedFd *pUnused = 0;
1417 struct stat sStat; /* Results of stat() call */
1418 /* A stat() call may fail for various reasons. If this happens, it is
1419 ** almost certain that an open() call on the same path will also fail.
1420 ** For this reason, if an error occurs in the stat() call here, it is
1421 ** ignored and -1 is returned. The caller will try to open a new file
1422 ** descriptor on the same path, fail, and return an error to SQLite.
1423 **
1424 ** Even if a subsequent open() call does succeed, the consequences of
1425 ** not searching for a resusable file descriptor are not dire. */
1426 if( 0==stat(zPath, &sStat) ){
1427 unixInodeInfo *pInode;
1428
1429 unixEnterMutex();
1430 pInode = inodeList;
1431 while( pInode && (pInode->fileId.dev!=sStat.st_dev
1432 || pInode->fileId.ino!=sStat.st_ino) ){
1433 pInode = pInode->pNext;
1434 }
1435 if( pInode ){
1436 UnixUnusedFd **pp;
1437 for(pp=&pInode->pUnused; *pp && (*pp)->flags!=flags; pp=&((*pp)->pNext));
1438 pUnused = *pp;
1439 if( pUnused ){
1440 *pp = pUnused->pNext;
1441 }
1442 }
1443 unixLeaveMutex();
1444 }
1445 return pUnused;
1446}
1447/*
1448** This function is called by unixOpen() to determine the unix permissions
1449** to create new files with. If no error occurs, then UNQLITE_OK is returned
1450** and a value suitable for passing as the third argument to open(2) is
1451** written to *pMode. If an IO error occurs, an SQLite error code is
1452** returned and the value of *pMode is not modified.
1453**
1454** If the file being opened is a temporary file, it is always created with
1455** the octal permissions 0600 (read/writable by owner only). If the file
1456** is a database or master journal file, it is created with the permissions
1457** mask UNQLITE_DEFAULT_FILE_PERMISSIONS.
1458**
1459** Finally, if the file being opened is a WAL or regular journal file, then
1460** this function queries the file-system for the permissions on the
1461** corresponding database file and sets *pMode to this value. Whenever
1462** possible, WAL and journal files are created using the same permissions
1463** as the associated database file.
1464*/
1465static int findCreateFileMode(
1466 const char *zPath, /* Path of file (possibly) being created */
1467 int flags, /* Flags passed as 4th argument to xOpen() */
1468 mode_t *pMode /* OUT: Permissions to open file with */
1469){
1470 int rc = UNQLITE_OK; /* Return Code */
1471 if( flags & UNQLITE_OPEN_TEMP_DB ){
1472 *pMode = 0600;
1473 SXUNUSED(zPath);
1474 }else{
1475 *pMode = UNQLITE_DEFAULT_FILE_PERMISSIONS;
1476 }
1477 return rc;
1478}
1479/*
1480** Open the file zPath.
1481**
1482** Previously, the SQLite OS layer used three functions in place of this
1483** one:
1484**
1485** unqliteOsOpenReadWrite();
1486** unqliteOsOpenReadOnly();
1487** unqliteOsOpenExclusive();
1488**
1489** These calls correspond to the following combinations of flags:
1490**
1491** ReadWrite() -> (READWRITE | CREATE)
1492** ReadOnly() -> (READONLY)
1493** OpenExclusive() -> (READWRITE | CREATE | EXCLUSIVE)
1494**
1495** The old OpenExclusive() accepted a boolean argument - "delFlag". If
1496** true, the file was configured to be automatically deleted when the
1497** file handle closed. To achieve the same effect using this new
1498** interface, add the DELETEONCLOSE flag to those specified above for
1499** OpenExclusive().
1500*/
1501static int unixOpen(
1502 unqlite_vfs *pVfs, /* The VFS for which this is the xOpen method */
1503 const char *zPath, /* Pathname of file to be opened */
1504 unqlite_file *pFile, /* The file descriptor to be filled in */
1505 unsigned int flags /* Input flags to control the opening */
1506){
1507 unixFile *p = (unixFile *)pFile;
1508 int fd = -1; /* File descriptor returned by open() */
1509 int dirfd = -1; /* Directory file descriptor */
1510 int openFlags = 0; /* Flags to pass to open() */
1511 int noLock; /* True to omit locking primitives */
1512 int rc = UNQLITE_OK; /* Function Return Code */
1513 UnixUnusedFd *pUnused;
1514 int isExclusive = (flags & UNQLITE_OPEN_EXCLUSIVE);
1515 int isDelete = (flags & UNQLITE_OPEN_TEMP_DB);
1516 int isCreate = (flags & UNQLITE_OPEN_CREATE);
1517 int isReadonly = (flags & UNQLITE_OPEN_READONLY);
1518 int isReadWrite = (flags & UNQLITE_OPEN_READWRITE);
1519 /* If creating a master or main-file journal, this function will open
1520 ** a file-descriptor on the directory too. The first time unixSync()
1521 ** is called the directory file descriptor will be fsync()ed and close()d.
1522 */
1523 int isOpenDirectory = isCreate ;
1524 const char *zName = zPath;
1525
1526 SyZero(p,sizeof(unixFile));
1527
1528 pUnused = findReusableFd(zName, flags);
1529 if( pUnused ){
1530 fd = pUnused->fd;
1531 }else{
1532 pUnused = unqlite_malloc(sizeof(*pUnused));
1533 if( !pUnused ){
1534 return UNQLITE_NOMEM;
1535 }
1536 }
1537 p->pUnused = pUnused;
1538
1539 /* Determine the value of the flags parameter passed to POSIX function
1540 ** open(). These must be calculated even if open() is not called, as
1541 ** they may be stored as part of the file handle and used by the
1542 ** 'conch file' locking functions later on. */
1543 if( isReadonly ) openFlags |= O_RDONLY;
1544 if( isReadWrite ) openFlags |= O_RDWR;
1545 if( isCreate ) openFlags |= O_CREAT;
1546 if( isExclusive ) openFlags |= (O_EXCL|O_NOFOLLOW);
1547 openFlags |= (O_LARGEFILE|O_BINARY);
1548
1549 if( fd<0 ){
1550 mode_t openMode; /* Permissions to create file with */
1551 rc = findCreateFileMode(zName, flags, &openMode);
1552 if( rc!=UNQLITE_OK ){
1553 return rc;
1554 }
1555 fd = open(zName, openFlags, openMode);
1556 if( fd<0 ){
1557 rc = UNQLITE_IOERR;
1558 goto open_finished;
1559 }
1560 }
1561
1562 if( p->pUnused ){
1563 p->pUnused->fd = fd;
1564 p->pUnused->flags = flags;
1565 }
1566
1567 if( isDelete ){
1568 unlink(zName);
1569 }
1570
1571 if( isOpenDirectory ){
1572 rc = openDirectory(zPath, &dirfd);
1573 if( rc!=UNQLITE_OK ){
1574 /* It is safe to close fd at this point, because it is guaranteed not
1575 ** to be open on a database file. If it were open on a database file,
1576 ** it would not be safe to close as this would release any locks held
1577 ** on the file by this process. */
1578 close(fd); /* silently leak if fail, already in error */
1579 goto open_finished;
1580 }
1581 }
1582
1583#ifdef FD_CLOEXEC
1584 fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
1585#endif
1586
1587 noLock = 0;
1588
1589#if defined(__APPLE__)
1590 struct statfs fsInfo;
1591 if( fstatfs(fd, &fsInfo) == -1 ){
1592 ((unixFile*)pFile)->lastErrno = errno;
1593 if( dirfd>=0 ) close(dirfd); /* silently leak if fail, in error */
1594 close(fd); /* silently leak if fail, in error */
1595 return UNQLITE_IOERR;
1596 }
1597 if (0 == SyStrncmp("msdos", fsInfo.f_fstypename, 5)) {
1598 ((unixFile*)pFile)->fsFlags |= UNQLITE_FSFLAGS_IS_MSDOS;
1599 }
1600#endif
1601
1602 rc = fillInUnixFile(pVfs, fd, dirfd, pFile, zPath, noLock, isDelete);
1603open_finished:
1604 if( rc!=UNQLITE_OK ){
1605 unqlite_free(p->pUnused);
1606 }
1607 return rc;
1608}
1609/*
1610** Delete the file at zPath. If the dirSync argument is true, fsync()
1611** the directory after deleting the file.
1612*/
1613static int unixDelete(
1614 unqlite_vfs *NotUsed, /* VFS containing this as the xDelete method */
1615 const char *zPath, /* Name of file to be deleted */
1616 int dirSync /* If true, fsync() directory after deleting file */
1617){
1618 int rc = UNQLITE_OK;
1619 SXUNUSED(NotUsed);
1620
1621 if( unlink(zPath)==(-1) && errno!=ENOENT ){
1622 return UNQLITE_IOERR;
1623 }
1624#ifndef UNQLITE_DISABLE_DIRSYNC
1625 if( dirSync ){
1626 int fd;
1627 rc = openDirectory(zPath, &fd);
1628 if( rc==UNQLITE_OK ){
1629 if( fsync(fd) )
1630 {
1631 rc = UNQLITE_IOERR;
1632 }
1633 if( close(fd) && !rc ){
1634 rc = UNQLITE_IOERR;
1635 }
1636 }
1637 }
1638#endif
1639 return rc;
1640}
1641/*
1642** Sleep for a little while. Return the amount of time slept.
1643** The argument is the number of microseconds we want to sleep.
1644** The return value is the number of microseconds of sleep actually
1645** requested from the underlying operating system, a number which
1646** might be greater than or equal to the argument, but not less
1647** than the argument.
1648*/
1649static int unixSleep(unqlite_vfs *NotUsed, int microseconds)
1650{
1651#if defined(HAVE_USLEEP) && HAVE_USLEEP
1652 usleep(microseconds);
1653 SXUNUSED(NotUsed);
1654 return microseconds;
1655#else
1656 int seconds = (microseconds+999999)/1000000;
1657 SXUNUSED(NotUsed);
1658 sleep(seconds);
1659 return seconds*1000000;
1660#endif
1661}
1662/*
1663 * Export the current system time.
1664 */
1665static int unixCurrentTime(unqlite_vfs *pVfs,Sytm *pOut)
1666{
1667 struct tm *pTm;
1668 time_t tt;
1669 SXUNUSED(pVfs);
1670 time(&tt);
1671 pTm = gmtime(&tt);
1672 if( pTm ){ /* Yes, it can fail */
1673 STRUCT_TM_TO_SYTM(pTm,pOut);
1674 }
1675 return UNQLITE_OK;
1676}
1677/*
1678** Test the existence of or access permissions of file zPath. The
1679** test performed depends on the value of flags:
1680**
1681** UNQLITE_ACCESS_EXISTS: Set *pResOut to 1 if the file exists and is not empty.
1682** UNQLITE_ACCESS_READWRITE: Set *pResOut to 1 if the file is read and writable.
1683** UNQLITE_ACCESS_READ: Set *pResOut to 1 if the file is readable.
1684**
1685** Otherwise set *pResOut to 0. The function itself always returns UNQLITE_OK.
1686*/
1687static int unixAccess(
1688 unqlite_vfs *NotUsed, /* The VFS containing this xAccess method */
1689 const char *zPath, /* Path of the file to examine */
1690 int flags, /* What do we want to learn about the zPath file? */
1691 int *pResOut /* Write result boolean here */
1692){
1693 int amode = 0;
1694 SXUNUSED(NotUsed);
1695 switch( flags ){
1696 case UNQLITE_ACCESS_EXISTS:
1697 amode = F_OK;
1698 break;
1699 case UNQLITE_ACCESS_READWRITE:
1700 amode = W_OK|R_OK;
1701 break;
1702 case UNQLITE_ACCESS_READ:
1703 amode = R_OK;
1704 break;
1705 default:
1706 /* Can't happen */
1707 break;
1708 }
1709 *pResOut = (access(zPath, amode)==0);
1710 if( flags==UNQLITE_ACCESS_EXISTS && *pResOut ){
1711 struct stat buf;
1712 if( 0==stat(zPath, &buf) && buf.st_size==0 ){
1713 *pResOut = 0;
1714 }
1715 }
1716 return UNQLITE_OK;
1717}
1718/*
1719** Turn a relative pathname into a full pathname. The relative path
1720** is stored as a nul-terminated string in the buffer pointed to by
1721** zPath.
1722**
1723** zOut points to a buffer of at least unqlite_vfs.mxPathname bytes
1724** (in this case, MAX_PATHNAME bytes). The full-path is written to
1725** this buffer before returning.
1726*/
1727static int unixFullPathname(
1728 unqlite_vfs *pVfs, /* Pointer to vfs object */
1729 const char *zPath, /* Possibly relative input path */
1730 int nOut, /* Size of output buffer in bytes */
1731 char *zOut /* Output buffer */
1732){
1733 if( zPath[0]=='/' ){
1734 Systrcpy(zOut,(sxu32)nOut,zPath,0);
1735 SXUNUSED(pVfs);
1736 }else{
1737 sxu32 nCwd;
1738 zOut[nOut-1] = '\0';
1739 if( getcwd(zOut, nOut-1)==0 ){
1740 return UNQLITE_IOERR;
1741 }
1742 nCwd = SyStrlen(zOut);
1743 SyBufferFormat(&zOut[nCwd],(sxu32)nOut-nCwd,"/%s",zPath);
1744 }
1745 return UNQLITE_OK;
1746}
1747/*
1748 * Export the Unix Vfs.
1749 */
1750UNQLITE_PRIVATE const unqlite_vfs * unqliteExportBuiltinVfs(void)
1751{
1752 static const unqlite_vfs sUnixvfs = {
1753 "Unix", /* Vfs name */
1754 1, /* Vfs structure version */
1755 sizeof(unixFile), /* szOsFile */
1756 MAX_PATHNAME, /* mxPathName */
1757 unixOpen, /* xOpen */
1758 unixDelete, /* xDelete */
1759 unixAccess, /* xAccess */
1760 unixFullPathname, /* xFullPathname */
1761 0, /* xTmp */
1762 unixSleep, /* xSleep */
1763 unixCurrentTime, /* xCurrentTime */
1764 0, /* xGetLastError */
1765 };
1766 return &sUnixvfs;
1767}
1768
1769#endif /* __UNIXES__ */