From 9ee8378d393778ac67314be7ea8d5bcbaeee9ee0 Mon Sep 17 00:00:00 2001 From: Aaron Seigo Date: Sun, 7 Dec 2014 10:08:07 +0100 Subject: try out unqlite --- common/unqlite/os_unix.c | 1769 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1769 insertions(+) create mode 100644 common/unqlite/os_unix.c (limited to 'common/unqlite/os_unix.c') diff --git a/common/unqlite/os_unix.c b/common/unqlite/os_unix.c new file mode 100644 index 0000000..f578d07 --- /dev/null +++ b/common/unqlite/os_unix.c @@ -0,0 +1,1769 @@ +/* + * Symisc unQLite: An Embeddable NoSQL (Post Modern) Database Engine. + * Copyright (C) 2012-2013, Symisc Systems http://unqlite.org/ + * Version 1.1.6 + * For information on licensing, redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES + * please contact Symisc Systems via: + * legal@symisc.net + * licensing@symisc.net + * contact@symisc.net + * or visit: + * http://unqlite.org/licensing.html + */ + /* $SymiscID: os_unix.c v1.3 FreeBSD 2013-04-05 01:10 devel $ */ +#ifndef UNQLITE_AMALGAMATION +#include "unqliteInt.h" +#endif +/* + * Omit the whole layer from the build if compiling for platforms other than Unix (Linux, BSD, Solaris, OS X, etc.). + * Note: Mostly SQLite3 source tree. + */ +#if defined(__UNIXES__) +/** This file contains the VFS implementation for unix-like operating systems +** include Linux, MacOSX, *BSD, QNX, VxWorks, AIX, HPUX, and others. +** +** There are actually several different VFS implementations in this file. +** The differences are in the way that file locking is done. The default +** implementation uses Posix Advisory Locks. Alternative implementations +** use flock(), dot-files, various proprietary locking schemas, or simply +** skip locking all together. +** +** This source file is organized into divisions where the logic for various +** subfunctions is contained within the appropriate division. PLEASE +** KEEP THE STRUCTURE OF THIS FILE INTACT. 
New code should be placed +** in the correct division and should be clearly labeled. +** +*/ +/* +** standard include files. +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__APPLE__) +# include +#endif +/* +** Allowed values of unixFile.fsFlags +*/ +#define UNQLITE_FSFLAGS_IS_MSDOS 0x1 + +/* +** Default permissions when creating a new file +*/ +#ifndef UNQLITE_DEFAULT_FILE_PERMISSIONS +# define UNQLITE_DEFAULT_FILE_PERMISSIONS 0644 +#endif +/* + ** Default permissions when creating auto proxy dir + */ +#ifndef UNQLITE_DEFAULT_PROXYDIR_PERMISSIONS +# define UNQLITE_DEFAULT_PROXYDIR_PERMISSIONS 0755 +#endif +/* +** Maximum supported path-length. +*/ +#define MAX_PATHNAME 512 +/* +** Only set the lastErrno if the error code is a real error and not +** a normal expected return code of UNQLITE_BUSY or UNQLITE_OK +*/ +#define IS_LOCK_ERROR(x) ((x != UNQLITE_OK) && (x != UNQLITE_BUSY)) +/* Forward references */ +typedef struct unixInodeInfo unixInodeInfo; /* An i-node */ +typedef struct UnixUnusedFd UnixUnusedFd; /* An unused file descriptor */ +/* +** Sometimes, after a file handle is closed by SQLite, the file descriptor +** cannot be closed immediately. In these cases, instances of the following +** structure are used to store the file descriptor while waiting for an +** opportunity to either close or reuse it. +*/ +struct UnixUnusedFd { + int fd; /* File descriptor to close */ + int flags; /* Flags this file descriptor was opened with */ + UnixUnusedFd *pNext; /* Next unused file descriptor on same file */ +}; +/* +** The unixFile structure is subclass of unqlite3_file specific to the unix +** VFS implementations. 
+*/ +typedef struct unixFile unixFile; +struct unixFile { + const unqlite_io_methods *pMethod; /* Always the first entry */ + unixInodeInfo *pInode; /* Info about locks on this inode */ + int h; /* The file descriptor */ + int dirfd; /* File descriptor for the directory */ + unsigned char eFileLock; /* The type of lock held on this fd */ + int lastErrno; /* The unix errno from last I/O error */ + void *lockingContext; /* Locking style specific state */ + UnixUnusedFd *pUnused; /* Pre-allocated UnixUnusedFd */ + int fileFlags; /* Miscellanous flags */ + const char *zPath; /* Name of the file */ + unsigned fsFlags; /* cached details from statfs() */ +}; +/* +** The following macros define bits in unixFile.fileFlags +*/ +#define UNQLITE_WHOLE_FILE_LOCKING 0x0001 /* Use whole-file locking */ +/* +** Define various macros that are missing from some systems. +*/ +#ifndef O_LARGEFILE +# define O_LARGEFILE 0 +#endif +#ifndef O_NOFOLLOW +# define O_NOFOLLOW 0 +#endif +#ifndef O_BINARY +# define O_BINARY 0 +#endif +/* +** Helper functions to obtain and relinquish the global mutex. The +** global mutex is used to protect the unixInodeInfo and +** vxworksFileId objects used by this file, all of which may be +** shared by multiple threads. +** +** Function unixMutexHeld() is used to assert() that the global mutex +** is held when required. This function is only used as part of assert() +** statements. e.g. 
+** +** unixEnterMutex() +** assert( unixMutexHeld() ); +** unixEnterLeave() +*/ +static void unixEnterMutex(void){ +#ifdef UNQLITE_ENABLE_THREADS + const SyMutexMethods *pMutexMethods = SyMutexExportMethods(); + if( pMutexMethods ){ + SyMutex *pMutex = pMutexMethods->xNew(SXMUTEX_TYPE_STATIC_2); /* pre-allocated, never fail */ + SyMutexEnter(pMutexMethods,pMutex); + } +#endif /* UNQLITE_ENABLE_THREADS */ +} +static void unixLeaveMutex(void){ +#ifdef UNQLITE_ENABLE_THREADS + const SyMutexMethods *pMutexMethods = SyMutexExportMethods(); + if( pMutexMethods ){ + SyMutex *pMutex = pMutexMethods->xNew(SXMUTEX_TYPE_STATIC_2); /* pre-allocated, never fail */ + SyMutexLeave(pMutexMethods,pMutex); + } +#endif /* UNQLITE_ENABLE_THREADS */ +} +/* +** This routine translates a standard POSIX errno code into something +** useful to the clients of the unqlite3 functions. Specifically, it is +** intended to translate a variety of "try again" errors into UNQLITE_BUSY +** and a variety of "please close the file descriptor NOW" errors into +** UNQLITE_IOERR +** +** Errors during initialization of locks, or file system support for locks, +** should handle ENOLCK, ENOTSUP, EOPNOTSUPP separately. 
+*/ +static int unqliteErrorFromPosixError(int posixError, int unqliteIOErr) { + switch (posixError) { + case 0: + return UNQLITE_OK; + + case EAGAIN: + case ETIMEDOUT: + case EBUSY: + case EINTR: + case ENOLCK: + /* random NFS retry error, unless during file system support + * introspection, in which it actually means what it says */ + return UNQLITE_BUSY; + + case EACCES: + /* EACCES is like EAGAIN during locking operations, but not any other time*/ + return UNQLITE_BUSY; + + case EPERM: + return UNQLITE_PERM; + + case EDEADLK: + return UNQLITE_IOERR; + +#if EOPNOTSUPP!=ENOTSUP + case EOPNOTSUPP: + /* something went terribly awry, unless during file system support + * introspection, in which it actually means what it says */ +#endif +#ifdef ENOTSUP + case ENOTSUP: + /* invalid fd, unless during file system support introspection, in which + * it actually means what it says */ +#endif + case EIO: + case EBADF: + case EINVAL: + case ENOTCONN: + case ENODEV: + case ENXIO: + case ENOENT: + case ESTALE: + case ENOSYS: + /* these should force the client to close the file and reconnect */ + + default: + return unqliteIOErr; + } +} +/****************************************************************************** +*************************** Posix Advisory Locking **************************** +** +** POSIX advisory locks are broken by design. ANSI STD 1003.1 (1996) +** section 6.5.2.2 lines 483 through 490 specify that when a process +** sets or clears a lock, that operation overrides any prior locks set +** by the same process. It does not explicitly say so, but this implies +** that it overrides locks set by the same process using a different +** file descriptor. 
Consider this test case: +** +** int fd1 = open("./file1", O_RDWR|O_CREAT, 0644); +** int fd2 = open("./file2", O_RDWR|O_CREAT, 0644); +** +** Suppose ./file1 and ./file2 are really the same file (because +** one is a hard or symbolic link to the other) then if you set +** an exclusive lock on fd1, then try to get an exclusive lock +** on fd2, it works. I would have expected the second lock to +** fail since there was already a lock on the file due to fd1. +** But not so. Since both locks came from the same process, the +** second overrides the first, even though they were on different +** file descriptors opened on different file names. +** +** This means that we cannot use POSIX locks to synchronize file access +** among competing threads of the same process. POSIX locks will work fine +** to synchronize access for threads in separate processes, but not +** threads within the same process. +** +** To work around the problem, SQLite has to manage file locks internally +** on its own. Whenever a new database is opened, we have to find the +** specific inode of the database file (the inode is determined by the +** st_dev and st_ino fields of the stat structure that fstat() fills in) +** and check for locks already existing on that inode. When locks are +** created or removed, we have to look at our own internal record of the +** locks to see if another thread has previously set a lock on that same +** inode. +** +** (Aside: The use of inode numbers as unique IDs does not work on VxWorks. +** For VxWorks, we have to use the alternative unique ID system based on +** canonical filename and implemented in the previous division.) +** +** There is one locking structure +** per inode, so if the same inode is opened twice, both unixFile structures +** point to the same locking structure. The locking structure keeps +** a reference count (so we will know when to delete it) and a "cnt" +** field that tells us its internal lock status. cnt==0 means the +** file is unlocked. 
cnt==-1 means the file has an exclusive lock. +** cnt>0 means there are cnt shared locks on the file. +** +** Any attempt to lock or unlock a file first checks the locking +** structure. The fcntl() system call is only invoked to set a +** POSIX lock if the internal lock structure transitions between +** a locked and an unlocked state. +** +** But wait: there are yet more problems with POSIX advisory locks. +** +** If you close a file descriptor that points to a file that has locks, +** all locks on that file that are owned by the current process are +** released. To work around this problem, each unixInodeInfo object +** maintains a count of the number of pending locks on that inode. +** When an attempt is made to close an unixFile, if there are +** other unixFile open on the same inode that are holding locks, the call +** to close() the file descriptor is deferred until all of the locks clear. +** The unixInodeInfo structure keeps a list of file descriptors that need to +** be closed and that list is walked (and cleared) when the last lock +** clears. +** +** Yet another problem: LinuxThreads do not play well with posix locks. +** +** Many older versions of linux use the LinuxThreads library which is +** not posix compliant. Under LinuxThreads, a lock created by thread +** A cannot be modified or overridden by a different thread B. +** Only thread A can modify the lock. Locking behavior is correct +** if the application uses the newer Native Posix Thread Library (NPTL) +** on linux - with NPTL a lock created by thread A can override locks +** in thread B. But there is no way to know at compile-time which +** threading library is being used. So there is no way to know at +** compile-time whether or not thread A can override locks on thread B. +** One has to do a run-time check to discover the behavior of the +** current process. +** +*/ + +/* +** An instance of the following structure serves as the key used +** to locate a particular unixInodeInfo object. 
+*/ +struct unixFileId { + dev_t dev; /* Device number */ + ino_t ino; /* Inode number */ +}; +/* +** An instance of the following structure is allocated for each open +** inode. Or, on LinuxThreads, there is one of these structures for +** each inode opened by each thread. +** +** A single inode can have multiple file descriptors, so each unixFile +** structure contains a pointer to an instance of this object and this +** object keeps a count of the number of unixFile pointing to it. +*/ +struct unixInodeInfo { + struct unixFileId fileId; /* The lookup key */ + int nShared; /* Number of SHARED locks held */ + int eFileLock; /* One of SHARED_LOCK, RESERVED_LOCK etc. */ + int nRef; /* Number of pointers to this structure */ + int nLock; /* Number of outstanding file locks */ + UnixUnusedFd *pUnused; /* Unused file descriptors to close */ + unixInodeInfo *pNext; /* List of all unixInodeInfo objects */ + unixInodeInfo *pPrev; /* .... doubly linked */ +}; + +static unixInodeInfo *inodeList = 0; +/* + * Local memory allocation stuff. + */ +static void * unqlite_malloc(sxu32 nByte) +{ + SyMemBackend *pAlloc; + void *p; + pAlloc = (SyMemBackend *)unqliteExportMemBackend(); + p = SyMemBackendAlloc(pAlloc,nByte); + return p; +} +static void unqlite_free(void *p) +{ + SyMemBackend *pAlloc; + pAlloc = (SyMemBackend *)unqliteExportMemBackend(); + SyMemBackendFree(pAlloc,p); +} +/* +** Close all file descriptors accumuated in the unixInodeInfo->pUnused list. +** If all such file descriptors are closed without error, the list is +** cleared and UNQLITE_OK returned. +** +** Otherwise, if an error occurs, then successfully closed file descriptor +** entries are removed from the list, and UNQLITE_IOERR_CLOSE returned. +** not deleted and UNQLITE_IOERR_CLOSE returned. 
+*/ +static int closePendingFds(unixFile *pFile){ + int rc = UNQLITE_OK; + unixInodeInfo *pInode = pFile->pInode; + UnixUnusedFd *pError = 0; + UnixUnusedFd *p; + UnixUnusedFd *pNext; + for(p=pInode->pUnused; p; p=pNext){ + pNext = p->pNext; + if( close(p->fd) ){ + pFile->lastErrno = errno; + rc = UNQLITE_IOERR; + p->pNext = pError; + pError = p; + }else{ + unqlite_free(p); + } + } + pInode->pUnused = pError; + return rc; +} +/* +** Release a unixInodeInfo structure previously allocated by findInodeInfo(). +** +** The mutex entered using the unixEnterMutex() function must be held +** when this function is called. +*/ +static void releaseInodeInfo(unixFile *pFile){ + unixInodeInfo *pInode = pFile->pInode; + if( pInode ){ + pInode->nRef--; + if( pInode->nRef==0 ){ + closePendingFds(pFile); + if( pInode->pPrev ){ + pInode->pPrev->pNext = pInode->pNext; + }else{ + inodeList = pInode->pNext; + } + if( pInode->pNext ){ + pInode->pNext->pPrev = pInode->pPrev; + } + unqlite_free(pInode); + } + } +} +/* +** Given a file descriptor, locate the unixInodeInfo object that +** describes that file descriptor. Create a new one if necessary. The +** return value might be uninitialized if an error occurs. +** +** The mutex entered using the unixEnterMutex() function must be held +** when this function is called. +** +** Return an appropriate error code. +*/ +static int findInodeInfo( + unixFile *pFile, /* Unix file with file desc used in the key */ + unixInodeInfo **ppInode /* Return the unixInodeInfo object here */ +){ + int rc; /* System call return code */ + int fd; /* The file descriptor for pFile */ + struct unixFileId fileId; /* Lookup key for the unixInodeInfo */ + struct stat statbuf; /* Low-level file information */ + unixInodeInfo *pInode = 0; /* Candidate unixInodeInfo object */ + + /* Get low-level information about the file that we can used to + ** create a unique name for the file. 
+ */ + fd = pFile->h; + rc = fstat(fd, &statbuf); + if( rc!=0 ){ + pFile->lastErrno = errno; +#ifdef EOVERFLOW + if( pFile->lastErrno==EOVERFLOW ) return UNQLITE_NOTIMPLEMENTED; +#endif + return UNQLITE_IOERR; + } + +#ifdef __APPLE__ + /* On OS X on an msdos filesystem, the inode number is reported + ** incorrectly for zero-size files. See ticket #3260. To work + ** around this problem (we consider it a bug in OS X, not SQLite) + ** we always increase the file size to 1 by writing a single byte + ** prior to accessing the inode number. The one byte written is + ** an ASCII 'S' character which also happens to be the first byte + ** in the header of every SQLite database. In this way, if there + ** is a race condition such that another thread has already populated + ** the first page of the database, no damage is done. + */ + if( statbuf.st_size==0 && (pFile->fsFlags & UNQLITE_FSFLAGS_IS_MSDOS)!=0 ){ + rc = write(fd, "S", 1); + if( rc!=1 ){ + pFile->lastErrno = errno; + return UNQLITE_IOERR; + } + rc = fstat(fd, &statbuf); + if( rc!=0 ){ + pFile->lastErrno = errno; + return UNQLITE_IOERR; + } + } +#endif + SyZero(&fileId,sizeof(fileId)); + fileId.dev = statbuf.st_dev; + fileId.ino = statbuf.st_ino; + pInode = inodeList; + while( pInode && SyMemcmp((const void *)&fileId,(const void *)&pInode->fileId, sizeof(fileId)) ){ + pInode = pInode->pNext; + } + if( pInode==0 ){ + pInode = (unixInodeInfo *)unqlite_malloc( sizeof(*pInode) ); + if( pInode==0 ){ + return UNQLITE_NOMEM; + } + SyZero(pInode,sizeof(*pInode)); + SyMemcpy((const void *)&fileId,(void *)&pInode->fileId,sizeof(fileId)); + pInode->nRef = 1; + pInode->pNext = inodeList; + pInode->pPrev = 0; + if( inodeList ) inodeList->pPrev = pInode; + inodeList = pInode; + }else{ + pInode->nRef++; + } + *ppInode = pInode; + return UNQLITE_OK; +} +/* +** This routine checks if there is a RESERVED lock held on the specified +** file by this or any other process. 
If such a lock is held, set *pResOut
** to a non-zero value otherwise *pResOut is set to zero.  The return value
** is set to UNQLITE_OK unless an I/O error occurs during lock checking.
*/
static int unixCheckReservedLock(unqlite_file *id, int *pResOut){
  unixFile *pFile = (unixFile*)id;
  int rc = UNQLITE_OK;
  int isReserved = 0;

  unixEnterMutex(); /* Because pFile->pInode is shared across threads */

  if( pFile->pInode->eFileLock>SHARED_LOCK ){
    /* A thread in this process already holds RESERVED or higher. */
    isReserved = 1;
  }else{
    /* Ask the OS whether a write-lock on the reserved byte would conflict
    ** with a lock held by some other process. */
    struct flock probe;
    probe.l_type = F_WRLCK;
    probe.l_whence = SEEK_SET;
    probe.l_start = RESERVED_BYTE;
    probe.l_len = 1;
    if( fcntl(pFile->h, F_GETLK, &probe)==(-1) ){
      int savedErrno = errno;
      rc = unqliteErrorFromPosixError(savedErrno, UNQLITE_LOCKERR);
      pFile->lastErrno = savedErrno;
    }else if( probe.l_type!=F_UNLCK ){
      isReserved = 1;
    }
  }

  unixLeaveMutex();

  *pResOut = isReserved;
  return rc;
}
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
**     (1) SHARED_LOCK
**     (2) RESERVED_LOCK
**     (3) PENDING_LOCK
**     (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between.  The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal.  The following chart shows the allowed
** transitions and the inserted intermediate states:
**
**    UNLOCKED -> SHARED
**    SHARED -> RESERVED
**    SHARED -> (PENDING) -> EXCLUSIVE
**    RESERVED -> (PENDING) -> EXCLUSIVE
**    PENDING -> EXCLUSIVE
**
** This routine will only increase a lock.  Use the unqliteOsUnlock()
** routine to lower a locking level.
+*/ +static int unixLock(unqlite_file *id, int eFileLock){ + /* The following describes the implementation of the various locks and + ** lock transitions in terms of the POSIX advisory shared and exclusive + ** lock primitives (called read-locks and write-locks below, to avoid + ** confusion with SQLite lock names). The algorithms are complicated + ** slightly in order to be compatible with unixdows systems simultaneously + ** accessing the same database file, in case that is ever required. + ** + ** Symbols defined in os.h indentify the 'pending byte' and the 'reserved + ** byte', each single bytes at well known offsets, and the 'shared byte + ** range', a range of 510 bytes at a well known offset. + ** + ** To obtain a SHARED lock, a read-lock is obtained on the 'pending + ** byte'. If this is successful, a random byte from the 'shared byte + ** range' is read-locked and the lock on the 'pending byte' released. + ** + ** A process may only obtain a RESERVED lock after it has a SHARED lock. + ** A RESERVED lock is implemented by grabbing a write-lock on the + ** 'reserved byte'. + ** + ** A process may only obtain a PENDING lock after it has obtained a + ** SHARED lock. A PENDING lock is implemented by obtaining a write-lock + ** on the 'pending byte'. This ensures that no new SHARED locks can be + ** obtained, but existing SHARED locks are allowed to persist. A process + ** does not have to obtain a RESERVED lock on the way to a PENDING lock. + ** This property is used by the algorithm for rolling back a journal file + ** after a crash. + ** + ** An EXCLUSIVE lock, obtained after a PENDING lock is held, is + ** implemented by obtaining a write-lock on the entire 'shared byte + ** range'. Since all other locks require a read-lock on one of the bytes + ** within this range, this ensures that no other locks are held on the + ** database. 
+ ** + ** The reason a single byte cannot be used instead of the 'shared byte + ** range' is that some versions of unixdows do not support read-locks. By + ** locking a random byte from a range, concurrent SHARED locks may exist + ** even if the locking primitive used is always a write-lock. + */ + int rc = UNQLITE_OK; + unixFile *pFile = (unixFile*)id; + unixInodeInfo *pInode = pFile->pInode; + struct flock lock; + int s = 0; + int tErrno = 0; + + /* If there is already a lock of this type or more restrictive on the + ** unixFile, do nothing. Don't use the end_lock: exit path, as + ** unixEnterMutex() hasn't been called yet. + */ + if( pFile->eFileLock>=eFileLock ){ + return UNQLITE_OK; + } + /* This mutex is needed because pFile->pInode is shared across threads + */ + unixEnterMutex(); + pInode = pFile->pInode; + + /* If some thread using this PID has a lock via a different unixFile* + ** handle that precludes the requested lock, return BUSY. + */ + if( (pFile->eFileLock!=pInode->eFileLock && + (pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK)) + ){ + rc = UNQLITE_BUSY; + goto end_lock; + } + + /* If a SHARED lock is requested, and some thread using this PID already + ** has a SHARED or RESERVED lock, then increment reference counts and + ** return UNQLITE_OK. + */ + if( eFileLock==SHARED_LOCK && + (pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){ + pFile->eFileLock = SHARED_LOCK; + pInode->nShared++; + pInode->nLock++; + goto end_lock; + } + /* A PENDING lock is needed before acquiring a SHARED lock and before + ** acquiring an EXCLUSIVE lock. For the SHARED lock, the PENDING will + ** be released. 
+ */ + lock.l_len = 1L; + lock.l_whence = SEEK_SET; + if( eFileLock==SHARED_LOCK + || (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLockh, F_SETLK, &lock); + if( s==(-1) ){ + tErrno = errno; + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + goto end_lock; + } + } + /* If control gets to this point, then actually go ahead and make + ** operating system calls for the specified lock. + */ + if( eFileLock==SHARED_LOCK ){ + /* Now get the read-lock */ + lock.l_start = SHARED_FIRST; + lock.l_len = SHARED_SIZE; + if( (s = fcntl(pFile->h, F_SETLK, &lock))==(-1) ){ + tErrno = errno; + } + /* Drop the temporary PENDING lock */ + lock.l_start = PENDING_BYTE; + lock.l_len = 1L; + lock.l_type = F_UNLCK; + if( fcntl(pFile->h, F_SETLK, &lock)!=0 ){ + if( s != -1 ){ + /* This could happen with a network mount */ + tErrno = errno; + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + goto end_lock; + } + } + if( s==(-1) ){ + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + }else{ + pFile->eFileLock = SHARED_LOCK; + pInode->nLock++; + pInode->nShared = 1; + } + }else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){ + /* We are trying for an exclusive lock but another thread in this + ** same process is still holding a shared lock. */ + rc = UNQLITE_BUSY; + }else{ + /* The request was for a RESERVED or EXCLUSIVE lock. It is + ** assumed that there is a SHARED or greater lock on the file + ** already. 
+ */ + lock.l_type = F_WRLCK; + switch( eFileLock ){ + case RESERVED_LOCK: + lock.l_start = RESERVED_BYTE; + break; + case EXCLUSIVE_LOCK: + lock.l_start = SHARED_FIRST; + lock.l_len = SHARED_SIZE; + break; + default: + /* Can't happen */ + break; + } + s = fcntl(pFile->h, F_SETLK, &lock); + if( s==(-1) ){ + tErrno = errno; + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + } + } + if( rc==UNQLITE_OK ){ + pFile->eFileLock = eFileLock; + pInode->eFileLock = eFileLock; + }else if( eFileLock==EXCLUSIVE_LOCK ){ + pFile->eFileLock = PENDING_LOCK; + pInode->eFileLock = PENDING_LOCK; + } +end_lock: + unixLeaveMutex(); + return rc; +} +/* +** Add the file descriptor used by file handle pFile to the corresponding +** pUnused list. +*/ +static void setPendingFd(unixFile *pFile){ + unixInodeInfo *pInode = pFile->pInode; + UnixUnusedFd *p = pFile->pUnused; + p->pNext = pInode->pUnused; + pInode->pUnused = p; + pFile->h = -1; + pFile->pUnused = 0; +} +/* +** Lower the locking level on file descriptor pFile to eFileLock. eFileLock +** must be either NO_LOCK or SHARED_LOCK. +** +** If the locking level of the file descriptor is already at or below +** the requested locking level, this routine is a no-op. +** +** If handleNFSUnlock is true, then on downgrading an EXCLUSIVE_LOCK to SHARED +** the byte range is divided into 2 parts and the first part is unlocked then +** set to a read lock, then the other part is simply unlocked. This works +** around a bug in BSD NFS lockd (also seen on MacOSX 10.3+) that fails to +** remove the write lock on a region when a read lock is set. 
+*/ +static int _posixUnlock(unqlite_file *id, int eFileLock, int handleNFSUnlock){ + unixFile *pFile = (unixFile*)id; + unixInodeInfo *pInode; + struct flock lock; + int rc = UNQLITE_OK; + int h; + int tErrno; /* Error code from system call errors */ + + if( pFile->eFileLock<=eFileLock ){ + return UNQLITE_OK; + } + unixEnterMutex(); + + h = pFile->h; + pInode = pFile->pInode; + + if( pFile->eFileLock>SHARED_LOCK ){ + /* downgrading to a shared lock on NFS involves clearing the write lock + ** before establishing the readlock - to avoid a race condition we downgrade + ** the lock in 2 blocks, so that part of the range will be covered by a + ** write lock until the rest is covered by a read lock: + ** 1: [WWWWW] + ** 2: [....W] + ** 3: [RRRRW] + ** 4: [RRRR.] + */ + if( eFileLock==SHARED_LOCK ){ + if( handleNFSUnlock ){ + off_t divSize = SHARED_SIZE - 1; + + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = SHARED_FIRST; + lock.l_len = divSize; + if( fcntl(h, F_SETLK, &lock)==(-1) ){ + tErrno = errno; + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + goto end_unlock; + } + lock.l_type = F_RDLCK; + lock.l_whence = SEEK_SET; + lock.l_start = SHARED_FIRST; + lock.l_len = divSize; + if( fcntl(h, F_SETLK, &lock)==(-1) ){ + tErrno = errno; + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + goto end_unlock; + } + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = SHARED_FIRST+divSize; + lock.l_len = SHARED_SIZE-divSize; + if( fcntl(h, F_SETLK, &lock)==(-1) ){ + tErrno = errno; + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + goto end_unlock; + } + }else{ + lock.l_type = F_RDLCK; + lock.l_whence = SEEK_SET; + lock.l_start = SHARED_FIRST; + lock.l_len = SHARED_SIZE; + if( fcntl(h, F_SETLK, &lock)==(-1) ){ + tErrno = errno; + rc 
= unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + goto end_unlock; + } + } + } + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = PENDING_BYTE; + lock.l_len = 2L; + if( fcntl(h, F_SETLK, &lock)!=(-1) ){ + pInode->eFileLock = SHARED_LOCK; + }else{ + tErrno = errno; + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + goto end_unlock; + } + } + if( eFileLock==NO_LOCK ){ + /* Decrement the shared lock counter. Release the lock using an + ** OS call only when all threads in this same process have released + ** the lock. + */ + pInode->nShared--; + if( pInode->nShared==0 ){ + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = lock.l_len = 0L; + + if( fcntl(h, F_SETLK, &lock)!=(-1) ){ + pInode->eFileLock = NO_LOCK; + }else{ + tErrno = errno; + rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); + if( IS_LOCK_ERROR(rc) ){ + pFile->lastErrno = tErrno; + } + pInode->eFileLock = NO_LOCK; + pFile->eFileLock = NO_LOCK; + } + } + + /* Decrement the count of locks against this same file. When the + ** count reaches zero, close any other file descriptors whose close + ** was deferred because of outstanding locks. + */ + pInode->nLock--; + + if( pInode->nLock==0 ){ + int rc2 = closePendingFds(pFile); + if( rc==UNQLITE_OK ){ + rc = rc2; + } + } + } + +end_unlock: + + unixLeaveMutex(); + + if( rc==UNQLITE_OK ) pFile->eFileLock = eFileLock; + return rc; +} +/* +** Lower the locking level on file descriptor pFile to eFileLock. eFileLock +** must be either NO_LOCK or SHARED_LOCK. +** +** If the locking level of the file descriptor is already at or below +** the requested locking level, this routine is a no-op. 
+*/ +static int unixUnlock(unqlite_file *id, int eFileLock){ + return _posixUnlock(id, eFileLock, 0); +} +/* +** This function performs the parts of the "close file" operation +** common to all locking schemes. It closes the directory and file +** handles, if they are valid, and sets all fields of the unixFile +** structure to 0. +** +*/ +static int closeUnixFile(unqlite_file *id){ + unixFile *pFile = (unixFile*)id; + if( pFile ){ + if( pFile->dirfd>=0 ){ + int err = close(pFile->dirfd); + if( err ){ + pFile->lastErrno = errno; + return UNQLITE_IOERR; + }else{ + pFile->dirfd=-1; + } + } + if( pFile->h>=0 ){ + int err = close(pFile->h); + if( err ){ + pFile->lastErrno = errno; + return UNQLITE_IOERR; + } + } + unqlite_free(pFile->pUnused); + SyZero(pFile,sizeof(unixFile)); + } + return UNQLITE_OK; +} +/* +** Close a file. +*/ +static int unixClose(unqlite_file *id){ + int rc = UNQLITE_OK; + if( id ){ + unixFile *pFile = (unixFile *)id; + unixUnlock(id, NO_LOCK); + unixEnterMutex(); + if( pFile->pInode && pFile->pInode->nLock ){ + /* If there are outstanding locks, do not actually close the file just + ** yet because that would clear those locks. Instead, add the file + ** descriptor to pInode->pUnused list. It will be automatically closed + ** when the last lock is cleared. + */ + setPendingFd(pFile); + } + releaseInodeInfo(pFile); + rc = closeUnixFile(id); + unixLeaveMutex(); + } + return rc; +} +/************** End of the posix advisory lock implementation ***************** +******************************************************************************/ +/* +** +** The next division contains implementations for all methods of the +** unqlite_file object other than the locking methods. The locking +** methods were defined in divisions above (one locking method per +** division). Those methods that are common to all locking modes +** are gather together into this division. 
+*/ +/* +** Seek to the offset passed as the second argument, then read cnt +** bytes into pBuf. Return the number of bytes actually read. +** +** NB: If you define USE_PREAD or USE_PREAD64, then it might also +** be necessary to define _XOPEN_SOURCE to be 500. This varies from +** one system to another. Since SQLite does not define USE_PREAD +** any form by default, we will not attempt to define _XOPEN_SOURCE. +** See tickets #2741 and #2681. +** +** To avoid stomping the errno value on a failed read the lastErrno value +** is set before returning. +*/ +static int seekAndRead(unixFile *id, unqlite_int64 offset, void *pBuf, int cnt){ + int got; +#if (!defined(USE_PREAD) && !defined(USE_PREAD64)) + unqlite_int64 newOffset; +#endif + +#if defined(USE_PREAD) + got = pread(id->h, pBuf, cnt, offset); +#elif defined(USE_PREAD64) + got = pread64(id->h, pBuf, cnt, offset); +#else + newOffset = lseek(id->h, offset, SEEK_SET); + + if( newOffset!=offset ){ + if( newOffset == -1 ){ + ((unixFile*)id)->lastErrno = errno; + }else{ + ((unixFile*)id)->lastErrno = 0; + } + return -1; + } + got = read(id->h, pBuf, cnt); +#endif + if( got<0 ){ + ((unixFile*)id)->lastErrno = errno; + } + return got; +} +/* +** Read data from a file into a buffer. Return UNQLITE_OK if all +** bytes were read successfully and UNQLITE_IOERR if anything goes +** wrong. +*/ +static int unixRead( + unqlite_file *id, + void *pBuf, + unqlite_int64 amt, + unqlite_int64 offset +){ + unixFile *pFile = (unixFile *)id; + int got; + + got = seekAndRead(pFile, offset, pBuf, (int)amt); + if( got==(int)amt ){ + return UNQLITE_OK; + }else if( got<0 ){ + /* lastErrno set by seekAndRead */ + return UNQLITE_IOERR; + }else{ + pFile->lastErrno = 0; /* not a system error */ + /* Unread parts of the buffer must be zero-filled */ + SyZero(&((char*)pBuf)[got],(sxu32)amt-got); + return UNQLITE_IOERR; + } +} +/* +** Seek to the offset in id->offset then read cnt bytes into pBuf. +** Return the number of bytes actually read. 
Update the offset. +** +** To avoid stomping the errno value on a failed write the lastErrno value +** is set before returning. +*/ +static int seekAndWrite(unixFile *id, unqlite_int64 offset, const void *pBuf, unqlite_int64 cnt){ + int got; +#if (!defined(USE_PREAD) && !defined(USE_PREAD64)) + unqlite_int64 newOffset; +#endif + +#if defined(USE_PREAD) + got = pwrite(id->h, pBuf, cnt, offset); +#elif defined(USE_PREAD64) + got = pwrite64(id->h, pBuf, cnt, offset); +#else + newOffset = lseek(id->h, offset, SEEK_SET); + if( newOffset!=offset ){ + if( newOffset == -1 ){ + ((unixFile*)id)->lastErrno = errno; + }else{ + ((unixFile*)id)->lastErrno = 0; + } + return -1; + } + got = write(id->h, pBuf, cnt); +#endif + if( got<0 ){ + ((unixFile*)id)->lastErrno = errno; + } + return got; +} +/* +** Write data from a buffer into a file. Return UNQLITE_OK on success +** or some other error code on failure. +*/ +static int unixWrite( + unqlite_file *id, + const void *pBuf, + unqlite_int64 amt, + unqlite_int64 offset +){ + unixFile *pFile = (unixFile*)id; + int wrote = 0; + + while( amt>0 && (wrote = seekAndWrite(pFile, offset, pBuf, amt))>0 ){ + amt -= wrote; + offset += wrote; + pBuf = &((char*)pBuf)[wrote]; + } + + if( amt>0 ){ + if( wrote<0 ){ + /* lastErrno set by seekAndWrite */ + return UNQLITE_IOERR; + }else{ + pFile->lastErrno = 0; /* not a system error */ + return UNQLITE_FULL; + } + } + return UNQLITE_OK; +} +/* +** We do not trust systems to provide a working fdatasync(). Some do. +** Others do no. To be safe, we will stick with the (slower) fsync(). +** If you know that your system does support fdatasync() correctly, +** then simply compile with -Dfdatasync=fdatasync +*/ +#if !defined(fdatasync) && !defined(__linux__) +# define fdatasync fsync +#endif + +/* +** Define HAVE_FULLFSYNC to 0 or 1 depending on whether or not +** the F_FULLFSYNC macro is defined. F_FULLFSYNC is currently +** only available on Mac OS X. But that could change. 
+*/ +#ifdef F_FULLFSYNC +# define HAVE_FULLFSYNC 1 +#else +# define HAVE_FULLFSYNC 0 +#endif +/* +** The fsync() system call does not work as advertised on many +** unix systems. The following procedure is an attempt to make +** it work better. +** +** +** SQLite sets the dataOnly flag if the size of the file is unchanged. +** The idea behind dataOnly is that it should only write the file content +** to disk, not the inode. We only set dataOnly if the file size is +** unchanged since the file size is part of the inode. However, +** Ted Ts'o tells us that fdatasync() will also write the inode if the +** file size has changed. The only real difference between fdatasync() +** and fsync(), Ted tells us, is that fdatasync() will not flush the +** inode if the mtime or owner or other inode attributes have changed. +** We only care about the file size, not the other file attributes, so +** as far as SQLite is concerned, an fdatasync() is always adequate. +** So, we always use fdatasync() if it is available, regardless of +** the value of the dataOnly flag. +*/ +static int full_fsync(int fd, int fullSync, int dataOnly){ + int rc; +#if HAVE_FULLFSYNC + SXUNUSED(dataOnly); +#else + SXUNUSED(fullSync); + SXUNUSED(dataOnly); +#endif + + /* If we compiled with the UNQLITE_NO_SYNC flag, then syncing is a + ** no-op + */ +#if HAVE_FULLFSYNC + if( fullSync ){ + rc = fcntl(fd, F_FULLFSYNC, 0); + }else{ + rc = 1; + } + /* If the FULLFSYNC failed, fall back to attempting an fsync(). + ** It shouldn't be possible for fullfsync to fail on the local + ** file system (on OSX), so failure indicates that FULLFSYNC + ** isn't supported for this file system. So, attempt an fsync + ** and (for now) ignore the overhead of a superfluous fcntl call. + ** It'd be better to detect fullfsync support once and avoid + ** the fcntl call every time sync is called. 
+ */ + if( rc ) rc = fsync(fd); + +#elif defined(__APPLE__) + /* fdatasync() on HFS+ doesn't yet flush the file size if it changed correctly + ** so currently we default to the macro that redefines fdatasync to fsync + */ + rc = fsync(fd); +#else + rc = fdatasync(fd); +#endif /* ifdef UNQLITE_NO_SYNC elif HAVE_FULLFSYNC */ + if( rc!= -1 ){ + rc = 0; + } + return rc; +} +/* +** Make sure all writes to a particular file are committed to disk. +** +** If dataOnly==0 then both the file itself and its metadata (file +** size, access time, etc) are synced. If dataOnly!=0 then only the +** file data is synced. +** +** Under Unix, also make sure that the directory entry for the file +** has been created by fsync-ing the directory that contains the file. +** If we do not do this and we encounter a power failure, the directory +** entry for the journal might not exist after we reboot. The next +** SQLite to access the file will not know that the journal exists (because +** the directory entry for the journal was never created) and the transaction +** will not roll back - possibly leading to database corruption. +*/ +static int unixSync(unqlite_file *id, int flags){ + int rc; + unixFile *pFile = (unixFile*)id; + + int isDataOnly = (flags&UNQLITE_SYNC_DATAONLY); + int isFullsync = (flags&0x0F)==UNQLITE_SYNC_FULL; + + rc = full_fsync(pFile->h, isFullsync, isDataOnly); + + if( rc ){ + pFile->lastErrno = errno; + return UNQLITE_IOERR; + } + if( pFile->dirfd>=0 ){ + int err; +#ifndef UNQLITE_DISABLE_DIRSYNC + /* The directory sync is only attempted if full_fsync is + ** turned off or unavailable. If a full_fsync occurred above, + ** then the directory sync is superfluous. + */ + if( (!HAVE_FULLFSYNC || !isFullsync) && full_fsync(pFile->dirfd,0,0) ){ + /* + ** We have received multiple reports of fsync() returning + ** errors when applied to directories on certain file systems. + ** A failed directory sync is not a big deal. So it seems + ** better to ignore the error. 
Ticket #1657 + */ + /* pFile->lastErrno = errno; */ + /* return UNQLITE_IOERR; */ + } +#endif + err = close(pFile->dirfd); /* Only need to sync once, so close the */ + if( err==0 ){ /* directory when we are done */ + pFile->dirfd = -1; + }else{ + pFile->lastErrno = errno; + rc = UNQLITE_IOERR; + } + } + return rc; +} +/* +** Truncate an open file to a specified size +*/ +static int unixTruncate(unqlite_file *id, sxi64 nByte){ + unixFile *pFile = (unixFile *)id; + int rc; + + rc = ftruncate(pFile->h, (off_t)nByte); + if( rc ){ + pFile->lastErrno = errno; + return UNQLITE_IOERR; + }else{ + return UNQLITE_OK; + } +} +/* +** Determine the current size of a file in bytes +*/ +static int unixFileSize(unqlite_file *id,sxi64 *pSize){ + int rc; + struct stat buf; + + rc = fstat(((unixFile*)id)->h, &buf); + + if( rc!=0 ){ + ((unixFile*)id)->lastErrno = errno; + return UNQLITE_IOERR; + } + *pSize = buf.st_size; + + /* When opening a zero-size database, the findInodeInfo() procedure + ** writes a single byte into that file in order to work around a bug + ** in the OS-X msdos filesystem. In order to avoid problems with upper + ** layers, we need to report this file size as zero even though it is + ** really 1. Ticket #3260. + */ + if( *pSize==1 ) *pSize = 0; + + return UNQLITE_OK; +} +/* +** Return the sector size in bytes of the underlying block device for +** the specified file. This is almost always 512 bytes, but may be +** larger for some devices. +** +** SQLite code assumes this function cannot fail. It also assumes that +** if two files are created in the same file-system directory (i.e. +** a database and its journal file) that the sector size will be the +** same for both. +*/ +static int unixSectorSize(unqlite_file *NotUsed){ + SXUNUSED(NotUsed); + return UNQLITE_DEFAULT_SECTOR_SIZE; +} +/* +** This vector defines all the methods that can operate on an +** unqlite_file for Windows systems. 
+*/ +static const unqlite_io_methods unixIoMethod = { + 1, /* iVersion */ + unixClose, /* xClose */ + unixRead, /* xRead */ + unixWrite, /* xWrite */ + unixTruncate, /* xTruncate */ + unixSync, /* xSync */ + unixFileSize, /* xFileSize */ + unixLock, /* xLock */ + unixUnlock, /* xUnlock */ + unixCheckReservedLock, /* xCheckReservedLock */ + unixSectorSize, /* xSectorSize */ +}; +/**************************************************************************** +**************************** unqlite_vfs methods **************************** +** +** This division contains the implementation of methods on the +** unqlite_vfs object. +*/ +/* +** Initialize the contents of the unixFile structure pointed to by pId. +*/ +static int fillInUnixFile( + unqlite_vfs *pVfs, /* Pointer to vfs object */ + int h, /* Open file descriptor of file being opened */ + int dirfd, /* Directory file descriptor */ + unqlite_file *pId, /* Write to the unixFile structure here */ + const char *zFilename, /* Name of the file being opened */ + int noLock, /* Omit locking if true */ + int isDelete /* Delete on close if true */ +){ + const unqlite_io_methods *pLockingStyle = &unixIoMethod; + unixFile *pNew = (unixFile *)pId; + int rc = UNQLITE_OK; + + /* Parameter isDelete is only used on vxworks. Express this explicitly + ** here to prevent compiler warnings about unused parameters. + */ + SXUNUSED(isDelete); + SXUNUSED(noLock); + SXUNUSED(pVfs); + + pNew->h = h; + pNew->dirfd = dirfd; + pNew->fileFlags = 0; + pNew->zPath = zFilename; + + unixEnterMutex(); + rc = findInodeInfo(pNew, &pNew->pInode); + if( rc!=UNQLITE_OK ){ + /* If an error occured in findInodeInfo(), close the file descriptor + ** immediately, before releasing the mutex. findInodeInfo() may fail + ** in two scenarios: + ** + ** (a) A call to fstat() failed. + ** (b) A malloc failed. + ** + ** Scenario (b) may only occur if the process is holding no other + ** file descriptors open on the same file. 
If there were other file + ** descriptors on this file, then no malloc would be required by + ** findInodeInfo(). If this is the case, it is quite safe to close + ** handle h - as it is guaranteed that no posix locks will be released + ** by doing so. + ** + ** If scenario (a) caused the error then things are not so safe. The + ** implicit assumption here is that if fstat() fails, things are in + ** such bad shape that dropping a lock or two doesn't matter much. + */ + close(h); + h = -1; + } + unixLeaveMutex(); + + pNew->lastErrno = 0; + if( rc!=UNQLITE_OK ){ + if( dirfd>=0 ) close(dirfd); /* silent leak if fail, already in error */ + if( h>=0 ) close(h); + }else{ + pNew->pMethod = pLockingStyle; + } + return rc; +} +/* +** Open a file descriptor to the directory containing file zFilename. +** If successful, *pFd is set to the opened file descriptor and +** UNQLITE_OK is returned. If an error occurs, either UNQLITE_NOMEM +** or UNQLITE_CANTOPEN is returned and *pFd is set to an undefined +** value. +** +** If UNQLITE_OK is returned, the caller is responsible for closing +** the file descriptor *pFd using close(). +*/ +static int openDirectory(const char *zFilename, int *pFd){ + sxu32 ii; + int fd = -1; + char zDirname[MAX_PATHNAME+1]; + sxu32 n; + n = Systrcpy(zDirname,sizeof(zDirname),zFilename,0); + for(ii=n; ii>1 && zDirname[ii]!='/'; ii--); + if( ii>0 ){ + zDirname[ii] = '\0'; + fd = open(zDirname, O_RDONLY|O_BINARY, 0); + if( fd>=0 ){ +#ifdef FD_CLOEXEC + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC); +#endif + } + } + *pFd = fd; + return (fd>=0?UNQLITE_OK: UNQLITE_IOERR ); +} +/* +** Search for an unused file descriptor that was opened on the database +** file (not a journal or master-journal file) identified by pathname +** zPath with UNQLITE_OPEN_XXX flags matching those passed as the second +** argument to this function. 
+** +** Such a file descriptor may exist if a database connection was closed +** but the associated file descriptor could not be closed because some +** other file descriptor open on the same file is holding a file-lock. +** Refer to comments in the unixClose() function and the lengthy comment +** describing "Posix Advisory Locking" at the start of this file for +** further details. Also, ticket #4018. +** +** If a suitable file descriptor is found, then it is returned. If no +** such file descriptor is located, -1 is returned. +*/ +static UnixUnusedFd *findReusableFd(const char *zPath, int flags){ + UnixUnusedFd *pUnused = 0; + struct stat sStat; /* Results of stat() call */ + /* A stat() call may fail for various reasons. If this happens, it is + ** almost certain that an open() call on the same path will also fail. + ** For this reason, if an error occurs in the stat() call here, it is + ** ignored and -1 is returned. The caller will try to open a new file + ** descriptor on the same path, fail, and return an error to SQLite. + ** + ** Even if a subsequent open() call does succeed, the consequences of + ** not searching for a resusable file descriptor are not dire. */ + if( 0==stat(zPath, &sStat) ){ + unixInodeInfo *pInode; + + unixEnterMutex(); + pInode = inodeList; + while( pInode && (pInode->fileId.dev!=sStat.st_dev + || pInode->fileId.ino!=sStat.st_ino) ){ + pInode = pInode->pNext; + } + if( pInode ){ + UnixUnusedFd **pp; + for(pp=&pInode->pUnused; *pp && (*pp)->flags!=flags; pp=&((*pp)->pNext)); + pUnused = *pp; + if( pUnused ){ + *pp = pUnused->pNext; + } + } + unixLeaveMutex(); + } + return pUnused; +} +/* +** This function is called by unixOpen() to determine the unix permissions +** to create new files with. If no error occurs, then UNQLITE_OK is returned +** and a value suitable for passing as the third argument to open(2) is +** written to *pMode. If an IO error occurs, an SQLite error code is +** returned and the value of *pMode is not modified. 
+** +** If the file being opened is a temporary file, it is always created with +** the octal permissions 0600 (read/writable by owner only). If the file +** is a database or master journal file, it is created with the permissions +** mask UNQLITE_DEFAULT_FILE_PERMISSIONS. +** +** Finally, if the file being opened is a WAL or regular journal file, then +** this function queries the file-system for the permissions on the +** corresponding database file and sets *pMode to this value. Whenever +** possible, WAL and journal files are created using the same permissions +** as the associated database file. +*/ +static int findCreateFileMode( + const char *zPath, /* Path of file (possibly) being created */ + int flags, /* Flags passed as 4th argument to xOpen() */ + mode_t *pMode /* OUT: Permissions to open file with */ +){ + int rc = UNQLITE_OK; /* Return Code */ + if( flags & UNQLITE_OPEN_TEMP_DB ){ + *pMode = 0600; + SXUNUSED(zPath); + }else{ + *pMode = UNQLITE_DEFAULT_FILE_PERMISSIONS; + } + return rc; +} +/* +** Open the file zPath. +** +** Previously, the SQLite OS layer used three functions in place of this +** one: +** +** unqliteOsOpenReadWrite(); +** unqliteOsOpenReadOnly(); +** unqliteOsOpenExclusive(); +** +** These calls correspond to the following combinations of flags: +** +** ReadWrite() -> (READWRITE | CREATE) +** ReadOnly() -> (READONLY) +** OpenExclusive() -> (READWRITE | CREATE | EXCLUSIVE) +** +** The old OpenExclusive() accepted a boolean argument - "delFlag". If +** true, the file was configured to be automatically deleted when the +** file handle closed. To achieve the same effect using this new +** interface, add the DELETEONCLOSE flag to those specified above for +** OpenExclusive(). 
+*/ +static int unixOpen( + unqlite_vfs *pVfs, /* The VFS for which this is the xOpen method */ + const char *zPath, /* Pathname of file to be opened */ + unqlite_file *pFile, /* The file descriptor to be filled in */ + unsigned int flags /* Input flags to control the opening */ +){ + unixFile *p = (unixFile *)pFile; + int fd = -1; /* File descriptor returned by open() */ + int dirfd = -1; /* Directory file descriptor */ + int openFlags = 0; /* Flags to pass to open() */ + int noLock; /* True to omit locking primitives */ + int rc = UNQLITE_OK; /* Function Return Code */ + UnixUnusedFd *pUnused; + int isExclusive = (flags & UNQLITE_OPEN_EXCLUSIVE); + int isDelete = (flags & UNQLITE_OPEN_TEMP_DB); + int isCreate = (flags & UNQLITE_OPEN_CREATE); + int isReadonly = (flags & UNQLITE_OPEN_READONLY); + int isReadWrite = (flags & UNQLITE_OPEN_READWRITE); + /* If creating a master or main-file journal, this function will open + ** a file-descriptor on the directory too. The first time unixSync() + ** is called the directory file descriptor will be fsync()ed and close()d. + */ + int isOpenDirectory = isCreate ; + const char *zName = zPath; + + SyZero(p,sizeof(unixFile)); + + pUnused = findReusableFd(zName, flags); + if( pUnused ){ + fd = pUnused->fd; + }else{ + pUnused = unqlite_malloc(sizeof(*pUnused)); + if( !pUnused ){ + return UNQLITE_NOMEM; + } + } + p->pUnused = pUnused; + + /* Determine the value of the flags parameter passed to POSIX function + ** open(). These must be calculated even if open() is not called, as + ** they may be stored as part of the file handle and used by the + ** 'conch file' locking functions later on. 
*/ + if( isReadonly ) openFlags |= O_RDONLY; + if( isReadWrite ) openFlags |= O_RDWR; + if( isCreate ) openFlags |= O_CREAT; + if( isExclusive ) openFlags |= (O_EXCL|O_NOFOLLOW); + openFlags |= (O_LARGEFILE|O_BINARY); + + if( fd<0 ){ + mode_t openMode; /* Permissions to create file with */ + rc = findCreateFileMode(zName, flags, &openMode); + if( rc!=UNQLITE_OK ){ + return rc; + } + fd = open(zName, openFlags, openMode); + if( fd<0 ){ + rc = UNQLITE_IOERR; + goto open_finished; + } + } + + if( p->pUnused ){ + p->pUnused->fd = fd; + p->pUnused->flags = flags; + } + + if( isDelete ){ + unlink(zName); + } + + if( isOpenDirectory ){ + rc = openDirectory(zPath, &dirfd); + if( rc!=UNQLITE_OK ){ + /* It is safe to close fd at this point, because it is guaranteed not + ** to be open on a database file. If it were open on a database file, + ** it would not be safe to close as this would release any locks held + ** on the file by this process. */ + close(fd); /* silently leak if fail, already in error */ + goto open_finished; + } + } + +#ifdef FD_CLOEXEC + fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC); +#endif + + noLock = 0; + +#if defined(__APPLE__) + struct statfs fsInfo; + if( fstatfs(fd, &fsInfo) == -1 ){ + ((unixFile*)pFile)->lastErrno = errno; + if( dirfd>=0 ) close(dirfd); /* silently leak if fail, in error */ + close(fd); /* silently leak if fail, in error */ + return UNQLITE_IOERR; + } + if (0 == SyStrncmp("msdos", fsInfo.f_fstypename, 5)) { + ((unixFile*)pFile)->fsFlags |= UNQLITE_FSFLAGS_IS_MSDOS; + } +#endif + + rc = fillInUnixFile(pVfs, fd, dirfd, pFile, zPath, noLock, isDelete); +open_finished: + if( rc!=UNQLITE_OK ){ + unqlite_free(p->pUnused); + } + return rc; +} +/* +** Delete the file at zPath. If the dirSync argument is true, fsync() +** the directory after deleting the file. 
+*/ +static int unixDelete( + unqlite_vfs *NotUsed, /* VFS containing this as the xDelete method */ + const char *zPath, /* Name of file to be deleted */ + int dirSync /* If true, fsync() directory after deleting file */ +){ + int rc = UNQLITE_OK; + SXUNUSED(NotUsed); + + if( unlink(zPath)==(-1) && errno!=ENOENT ){ + return UNQLITE_IOERR; + } +#ifndef UNQLITE_DISABLE_DIRSYNC + if( dirSync ){ + int fd; + rc = openDirectory(zPath, &fd); + if( rc==UNQLITE_OK ){ + if( fsync(fd) ) + { + rc = UNQLITE_IOERR; + } + if( close(fd) && !rc ){ + rc = UNQLITE_IOERR; + } + } + } +#endif + return rc; +} +/* +** Sleep for a little while. Return the amount of time slept. +** The argument is the number of microseconds we want to sleep. +** The return value is the number of microseconds of sleep actually +** requested from the underlying operating system, a number which +** might be greater than or equal to the argument, but not less +** than the argument. +*/ +static int unixSleep(unqlite_vfs *NotUsed, int microseconds) +{ +#if defined(HAVE_USLEEP) && HAVE_USLEEP + usleep(microseconds); + SXUNUSED(NotUsed); + return microseconds; +#else + int seconds = (microseconds+999999)/1000000; + SXUNUSED(NotUsed); + sleep(seconds); + return seconds*1000000; +#endif +} +/* + * Export the current system time. + */ +static int unixCurrentTime(unqlite_vfs *pVfs,Sytm *pOut) +{ + struct tm *pTm; + time_t tt; + SXUNUSED(pVfs); + time(&tt); + pTm = gmtime(&tt); + if( pTm ){ /* Yes, it can fail */ + STRUCT_TM_TO_SYTM(pTm,pOut); + } + return UNQLITE_OK; +} +/* +** Test the existance of or access permissions of file zPath. The +** test performed depends on the value of flags: +** +** UNQLITE_ACCESS_EXISTS: Return 1 if the file exists +** UNQLITE_ACCESS_READWRITE: Return 1 if the file is read and writable. +** UNQLITE_ACCESS_READONLY: Return 1 if the file is readable. +** +** Otherwise return 0. 
+*/ +static int unixAccess( + unqlite_vfs *NotUsed, /* The VFS containing this xAccess method */ + const char *zPath, /* Path of the file to examine */ + int flags, /* What do we want to learn about the zPath file? */ + int *pResOut /* Write result boolean here */ +){ + int amode = 0; + SXUNUSED(NotUsed); + switch( flags ){ + case UNQLITE_ACCESS_EXISTS: + amode = F_OK; + break; + case UNQLITE_ACCESS_READWRITE: + amode = W_OK|R_OK; + break; + case UNQLITE_ACCESS_READ: + amode = R_OK; + break; + default: + /* Can't happen */ + break; + } + *pResOut = (access(zPath, amode)==0); + if( flags==UNQLITE_ACCESS_EXISTS && *pResOut ){ + struct stat buf; + if( 0==stat(zPath, &buf) && buf.st_size==0 ){ + *pResOut = 0; + } + } + return UNQLITE_OK; +} +/* +** Turn a relative pathname into a full pathname. The relative path +** is stored as a nul-terminated string in the buffer pointed to by +** zPath. +** +** zOut points to a buffer of at least unqlite_vfs.mxPathname bytes +** (in this case, MAX_PATHNAME bytes). The full-path is written to +** this buffer before returning. +*/ +static int unixFullPathname( + unqlite_vfs *pVfs, /* Pointer to vfs object */ + const char *zPath, /* Possibly relative input path */ + int nOut, /* Size of output buffer in bytes */ + char *zOut /* Output buffer */ +){ + if( zPath[0]=='/' ){ + Systrcpy(zOut,(sxu32)nOut,zPath,0); + SXUNUSED(pVfs); + }else{ + sxu32 nCwd; + zOut[nOut-1] = '\0'; + if( getcwd(zOut, nOut-1)==0 ){ + return UNQLITE_IOERR; + } + nCwd = SyStrlen(zOut); + SyBufferFormat(&zOut[nCwd],(sxu32)nOut-nCwd,"/%s",zPath); + } + return UNQLITE_OK; +} +/* + * Export the Unix Vfs. 
+ */ +UNQLITE_PRIVATE const unqlite_vfs * unqliteExportBuiltinVfs(void) +{ + static const unqlite_vfs sUnixvfs = { + "Unix", /* Vfs name */ + 1, /* Vfs structure version */ + sizeof(unixFile), /* szOsFile */ + MAX_PATHNAME, /* mxPathName */ + unixOpen, /* xOpen */ + unixDelete, /* xDelete */ + unixAccess, /* xAccess */ + unixFullPathname, /* xFullPathname */ + 0, /* xTmp */ + unixSleep, /* xSleep */ + unixCurrentTime, /* xCurrentTime */ + 0, /* xGetLastError */ + }; + return &sUnixvfs; +} + +#endif /* __UNIXES__ */ -- cgit v1.2.3