Merge branch 'unqlite'

author: Aaron Seigo <aseigo@kde.org> 2014-12-14 12:00:05 +0100
committer: Aaron Seigo <aseigo@kde.org> 2014-12-14 12:00:05 +0100
commit: 7cc25005b8c46d1fa783d33def2c6923e8ef8469 (patch)
tree: 64fa59d17af29838396cf37b912b3babd885e5dd /common/unqlite/os_unix.c
parent: bfc32f265e8ad72823db960fed371d72596003b7 (diff)
parent: a6ed70495f9f3ecb21c26860dda16aadcdc91c3a (diff)
download: sink-7cc25005b8c46d1fa783d33def2c6923e8ef8469.tar.gz
sink-7cc25005b8c46d1fa783d33def2c6923e8ef8469.zip
1 files changed, 1769 insertions, 0 deletions
diff --git a/common/unqlite/os_unix.c b/common/unqlite/os_unix.c
new file mode 100644
index 0000000..f578d07
--- /dev/null
+++ b/common/unqlite/os_unix.c
@@ -0,0 +1,1769 @@
+/*
+ * Symisc unQLite: An Embeddable NoSQL (Post Modern) Database Engine.
+ * Copyright (C) 2012-2013, Symisc Systems http://unqlite.org/
+ * Version 1.1.6
+ * For information on licensing, redistribution of this file, and for a DISCLAIMER OF ALL WARRANTIES
+ * please contact Symisc Systems via:
+ *       legal@symisc.net
+ *       licensing@symisc.net
+ *       contact@symisc.net
+ * or visit:
+ *      http://unqlite.org/licensing.html
+ */
+ /* $SymiscID: os_unix.c v1.3 FreeBSD 2013-04-05 01:10 devel <chm@symisc.net> $ */
+#ifndef UNQLITE_AMALGAMATION
+#include "unqliteInt.h"
+#endif
+/* 
+ * Omit the whole layer from the build if compiling for platforms other than Unix (Linux, BSD, Solaris, OS X, etc.).
+ * Note: Mostly SQLite3 source tree.
+ */
+#if defined(__UNIXES__)
+/** This file contains the VFS implementation for unix-like operating systems,
+** including Linux, MacOSX, *BSD, QNX, VxWorks, AIX, HPUX, and others.
+**
+** There are actually several different VFS implementations in this file.
+** The differences are in the way that file locking is done.  The default
+** implementation uses Posix Advisory Locks.  Alternative implementations
+** use flock(), dot-files, various proprietary locking schemes, or simply
+** skip locking altogether.
+**
+** This source file is organized into divisions where the logic for various
+** subfunctions is contained within the appropriate division.  PLEASE
+** KEEP THE STRUCTURE OF THIS FILE INTACT.  New code should be placed
+** in the correct division and should be clearly labeled.
+**
+*/
+/*
+** standard include files.
+*/
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/file.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <time.h>
+#include <sys/time.h>
+#include <errno.h>
+#if defined(__APPLE__) 
+# include <sys/mount.h>
+#endif
+/*
+** Allowed values of unixFile.fsFlags
+*/
+#define UNQLITE_FSFLAGS_IS_MSDOS     0x1
+/*
+** Default permissions when creating a new file
+*/
+#ifndef UNQLITE_DEFAULT_FILE_PERMISSIONS
+# define UNQLITE_DEFAULT_FILE_PERMISSIONS 0644
+#endif
+/*
+ ** Default permissions when creating auto proxy dir
+ */
+#ifndef UNQLITE_DEFAULT_PROXYDIR_PERMISSIONS
+# define UNQLITE_DEFAULT_PROXYDIR_PERMISSIONS 0755
+#endif
+/*
+** Maximum supported path-length.
+*/
+#define MAX_PATHNAME 512
+/*
+** Only set the lastErrno if the error code is a real error and not 
+** a normal expected return code of UNQLITE_BUSY or UNQLITE_OK
+*/
+/* True when x is a genuine failure code, i.e. neither UNQLITE_OK nor
+** UNQLITE_BUSY.  The argument is parenthesized so that compound expressions
+** (ternaries, bitwise operators, ...) bind correctly; note that x is still
+** evaluated twice, so it must be free of side effects. */
+#define IS_LOCK_ERROR(x)  (((x) != UNQLITE_OK) && ((x) != UNQLITE_BUSY))
+/* Forward references */
+typedef struct unixInodeInfo unixInodeInfo;   /* An i-node */
+typedef struct UnixUnusedFd UnixUnusedFd;     /* An unused file descriptor */
+/*
+** Sometimes, after a file handle is closed by SQLite, the file descriptor
+** cannot be closed immediately. In these cases, instances of the following
+** structure are used to store the file descriptor while waiting for an
+** opportunity to either close or reuse it.
+*/
+/* One node of a singly linked list of file descriptors whose close() has
+** been postponed.  Nodes are pushed onto unixInodeInfo.pUnused by
+** setPendingFd() and closed/freed by closePendingFds(). */
+struct UnixUnusedFd {
+  int fd;                   /* File descriptor to close */
+  int flags;                /* Flags this file descriptor was opened with */
+  UnixUnusedFd *pNext;      /* Next unused file descriptor on same file */
+};
+/*
+** The unixFile structure is a subclass of unqlite_file specific to the unix
+** VFS implementations.
+*/
+/* Per-open-file state for the unix VFS.  The I/O method table must stay the
+** first member: the routines in this file receive an unqlite_file pointer
+** and cast it straight to unixFile*. */
+typedef struct unixFile unixFile;
+struct unixFile {
+  const unqlite_io_methods *pMethod;  /* Always the first entry */
+  unixInodeInfo *pInode;              /* Info about locks on this inode */
+  int h;                              /* The file descriptor; set to -1 by setPendingFd() */
+  int dirfd;                          /* File descriptor for the directory; closed by closeUnixFile() when >=0 */
+  unsigned char eFileLock;            /* The type of lock held on this fd */
+  int lastErrno;                      /* The unix errno from last I/O error */
+  void *lockingContext;               /* Locking style specific state */
+  UnixUnusedFd *pUnused;              /* Pre-allocated UnixUnusedFd */
+  int fileFlags;                      /* Miscellaneous flags */
+  const char *zPath;                  /* Name of the file */
+  unsigned fsFlags;                   /* cached details from statfs() */
+};
+/*
+** The following macros define bits in unixFile.fileFlags
+*/
+#define UNQLITE_WHOLE_FILE_LOCKING  0x0001   /* Use whole-file locking */
+/*
+** Define various macros that are missing from some systems.
+*/
+#ifndef O_LARGEFILE
+# define O_LARGEFILE 0
+#endif
+#ifndef O_NOFOLLOW
+# define O_NOFOLLOW 0
+#endif
+#ifndef O_BINARY
+# define O_BINARY 0
+#endif
+/*
+** Helper functions to obtain and relinquish the global mutex. The
+** global mutex is used to protect the unixInodeInfo and
+** vxworksFileId objects used by this file, all of which may be 
+** shared by multiple threads.
+**
+** Function unixMutexHeld() is used to assert() that the global mutex 
+** is held when required. This function is only used as part of assert() 
+** statements. e.g.
+**
+**   unixEnterMutex()
+**     assert( unixMutexHeld() );
+**   unixLeaveMutex()
+*/
+static void unixEnterMutex(void){
+#ifdef UNQLITE_ENABLE_THREADS
+  /* Look up the exported mutex subsystem; when none is available this
+  ** routine is a no-op. */
+  const SyMutexMethods *pApi = SyMutexExportMethods();
+  SyMutex *pGlobal;
+  if( pApi==0 ){
+    return;
+  }
+  /* STATIC_2 is a pre-allocated mutex, so obtaining it never fails. */
+  pGlobal = pApi->xNew(SXMUTEX_TYPE_STATIC_2);
+  SyMutexEnter(pApi,pGlobal);
+#endif /* UNQLITE_ENABLE_THREADS */
+}
+static void unixLeaveMutex(void){
+#ifdef UNQLITE_ENABLE_THREADS
+  /* Counterpart of unixEnterMutex(): release the same static mutex.  Does
+  ** nothing when no mutex subsystem is exported. */
+  const SyMutexMethods *pApi = SyMutexExportMethods();
+  SyMutex *pGlobal;
+  if( pApi==0 ){
+    return;
+  }
+  /* STATIC_2 is a pre-allocated mutex, so obtaining it never fails. */
+  pGlobal = pApi->xNew(SXMUTEX_TYPE_STATIC_2);
+  SyMutexLeave(pApi,pGlobal);
+#endif /* UNQLITE_ENABLE_THREADS */
+}
+/*
+** This routine translates a standard POSIX errno code into something
+** useful to the clients of the unqlite3 functions.  Specifically, it is
+** intended to translate a variety of "try again" errors into UNQLITE_BUSY
+** and a variety of "please close the file descriptor NOW" errors into 
+** UNQLITE_IOERR
+** 
+** Errors during initialization of locks, or file system support for locks,
+** should handle ENOLCK, ENOTSUP, EOPNOTSUPP separately.
+*/
+static int unqliteErrorFromPosixError(int posixError, int unqliteIOErr) {
+  /* Map a saved errno value onto an UNQLITE_* result code.
+  **
+  **   posixError    errno captured right after the failing system call
+  **   unqliteIOErr  code handed back for every errno not singled out below
+  **
+  ** Every errno without its own label (EIO, EBADF, EINVAL, ENOTCONN, ENODEV,
+  ** ENXIO, ENOENT, ESTALE, ENOSYS, ENOTSUP, EOPNOTSUPP and anything else)
+  ** yields unqliteIOErr: these should force the client to close the file
+  ** and reconnect.
+  */
+  int rc;
+  switch( posixError ){
+    case 0:
+      rc = UNQLITE_OK;
+      break;
+    /* "Try again" conditions.  ENOLCK is a random NFS retry error, and EACCES
+    ** behaves like EAGAIN during locking operations (but not at other times). */
+    case EAGAIN: case ETIMEDOUT: case EBUSY: case EINTR: case ENOLCK: case EACCES:
+      rc = UNQLITE_BUSY;
+      break;
+    case EPERM:
+      rc = UNQLITE_PERM;
+      break;
+    /* A deadlock always reports the generic I/O error, not unqliteIOErr. */
+    case EDEADLK:
+      rc = UNQLITE_IOERR;
+      break;
+    default:
+      rc = unqliteIOErr;
+      break;
+  }
+  return rc;
+}
+/******************************************************************************
+*************************** Posix Advisory Locking ****************************
+**
+** POSIX advisory locks are broken by design.  ANSI STD 1003.1 (1996)
+** section 6.5.2.2 lines 483 through 490 specify that when a process
+** sets or clears a lock, that operation overrides any prior locks set
+** by the same process.  It does not explicitly say so, but this implies
+** that it overrides locks set by the same process using a different
+** file descriptor.  Consider this test case:
+**
+**       int fd1 = open("./file1", O_RDWR|O_CREAT, 0644);
+**       int fd2 = open("./file2", O_RDWR|O_CREAT, 0644);
+**
+** Suppose ./file1 and ./file2 are really the same file (because
+** one is a hard or symbolic link to the other) then if you set
+** an exclusive lock on fd1, then try to get an exclusive lock
+** on fd2, it works.  I would have expected the second lock to
+** fail since there was already a lock on the file due to fd1.
+** But not so.  Since both locks came from the same process, the
+** second overrides the first, even though they were on different
+** file descriptors opened on different file names.
+**
+** This means that we cannot use POSIX locks to synchronize file access
+** among competing threads of the same process.  POSIX locks will work fine
+** to synchronize access for threads in separate processes, but not
+** threads within the same process.
+**
+** To work around the problem, SQLite has to manage file locks internally
+** on its own.  Whenever a new database is opened, we have to find the
+** specific inode of the database file (the inode is determined by the
+** st_dev and st_ino fields of the stat structure that fstat() fills in)
+** and check for locks already existing on that inode.  When locks are
+** created or removed, we have to look at our own internal record of the
+** locks to see if another thread has previously set a lock on that same
+** inode.
+**
+** (Aside: The use of inode numbers as unique IDs does not work on VxWorks.
+** For VxWorks, we have to use the alternative unique ID system based on
+** canonical filename and implemented in the previous division.)
+**
+** There is one locking structure
+** per inode, so if the same inode is opened twice, both unixFile structures
+** point to the same locking structure.  The locking structure keeps
+** a reference count (so we will know when to delete it) and a "cnt"
+** field that tells us its internal lock status.  cnt==0 means the
+** file is unlocked.  cnt==-1 means the file has an exclusive lock.
+** cnt>0 means there are cnt shared locks on the file.
+**
+** Any attempt to lock or unlock a file first checks the locking
+** structure.  The fcntl() system call is only invoked to set a 
+** POSIX lock if the internal lock structure transitions between
+** a locked and an unlocked state.
+**
+** But wait:  there are yet more problems with POSIX advisory locks.
+**
+** If you close a file descriptor that points to a file that has locks,
+** all locks on that file that are owned by the current process are
+** released.  To work around this problem, each unixInodeInfo object
+** maintains a count of the number of pending locks on that inode.
+** When an attempt is made to close an unixFile, if there are
+** other unixFile open on the same inode that are holding locks, the call
+** to close() the file descriptor is deferred until all of the locks clear.
+** The unixInodeInfo structure keeps a list of file descriptors that need to
+** be closed and that list is walked (and cleared) when the last lock
+** clears.
+**
+** Yet another problem:  LinuxThreads do not play well with posix locks.
+**
+** Many older versions of linux use the LinuxThreads library which is
+** not posix compliant.  Under LinuxThreads, a lock created by thread
+** A cannot be modified or overridden by a different thread B.
+** Only thread A can modify the lock.  Locking behavior is correct
+** if the application uses the newer Native Posix Thread Library (NPTL)
+** on linux - with NPTL a lock created by thread A can override locks
+** in thread B.  But there is no way to know at compile-time which
+** threading library is being used.  So there is no way to know at
+** compile-time whether or not thread A can override locks on thread B.
+** One has to do a run-time check to discover the behavior of the
+** current process.
+**
+*/
+/*
+** An instance of the following structure serves as the key used
+** to locate a particular unixInodeInfo object.
+*/
+/* Lookup key for unixInodeInfo objects.  findInodeInfo() zeroes the whole
+** structure before filling it from fstat() and then compares keys bytewise,
+** so any padding bytes are deterministic. */
+struct unixFileId {
+  dev_t dev;                  /* Device number */
+  ino_t ino;                  /* Inode number */
+};
+/*
+** An instance of the following structure is allocated for each open
+** inode.  Or, on LinuxThreads, there is one of these structures for
+** each inode opened by each thread.
+**
+** A single inode can have multiple file descriptors, so each unixFile
+** structure contains a pointer to an instance of this object and this
+** object keeps a count of the number of unixFile pointing to it.
+*/
+/* Shared per-inode lock bookkeeping.  Instances live on the doubly linked
+** list headed by inodeList; they are created/looked up by findInodeInfo()
+** and unlinked and freed by releaseInodeInfo() once nRef drops to zero. */
+struct unixInodeInfo {
+  struct unixFileId fileId;       /* The lookup key */
+  int nShared;                    /* Number of SHARED locks held */
+  int eFileLock;                  /* One of SHARED_LOCK, RESERVED_LOCK etc. */
+  int nRef;                       /* Number of pointers to this structure */
+  int nLock;                      /* Number of outstanding file locks; deferred fds are closed when it reaches 0 */
+  UnixUnusedFd *pUnused;          /* Unused file descriptors to close */
+  unixInodeInfo *pNext;           /* List of all unixInodeInfo objects */
+  unixInodeInfo *pPrev;           /*    .... doubly linked */
+};
+/* Head of the list of all live unixInodeInfo objects (new nodes are pushed
+** at the front). */
+static unixInodeInfo *inodeList = 0;
+/*
+ * Local memory allocation stuff.
+ */
+static void * unqlite_malloc(sxu32 nByte)
+{
+        /* Allocate nByte bytes from the memory backend exported by the
+        ** library and return whatever that allocator returns. */
+        SyMemBackend *pBackend = (SyMemBackend *)unqliteExportMemBackend();
+        return SyMemBackendAlloc(pBackend,nByte);
+}
+static void unqlite_free(void *p)
+{
+        /* Hand p back to the memory backend exported by the library. */
+        SyMemBackend *pBackend = (SyMemBackend *)unqliteExportMemBackend();
+        SyMemBackendFree(pBackend,p);
+}
+/*
+** Close all file descriptors accumulated in the unixInodeInfo->pUnused list.
+** If all such file descriptors are closed without error, the list is
+** cleared and UNQLITE_OK returned.
+**
+** Otherwise, if an error occurs, the successfully closed file descriptor
+** entries are removed from the list, the entries that failed to close are
+** kept on it, and UNQLITE_IOERR is returned.
+*/
+static int closePendingFds(unixFile *pFile){
+  /* Walk the inode's deferred-close list.  Entries that close cleanly are
+  ** freed; entries whose close() fails are re-linked onto a "kept" list
+  ** that becomes the inode's new pUnused list.  Returns UNQLITE_OK when
+  ** every close() succeeded, UNQLITE_IOERR otherwise (with the errno of
+  ** the most recent failure stored in pFile->lastErrno). */
+  unixInodeInfo *pInode = pFile->pInode;
+  UnixUnusedFd *pCur = pInode->pUnused;
+  UnixUnusedFd *pKept = 0;
+  int rc = UNQLITE_OK;
+  while( pCur ){
+    /* Remember the successor first: pCur is either freed or re-linked. */
+    UnixUnusedFd *pFollow = pCur->pNext;
+    if( close(pCur->fd)==0 ){
+      unqlite_free(pCur);
+    }else{
+      pFile->lastErrno = errno;
+      rc = UNQLITE_IOERR;
+      pCur->pNext = pKept;
+      pKept = pCur;
+    }
+    pCur = pFollow;
+  }
+  pInode->pUnused = pKept;
+  return rc;
+}
+/*
+** Release a unixInodeInfo structure previously allocated by findInodeInfo().
+**
+** The mutex entered using the unixEnterMutex() function must be held
+** when this function is called.
+*/
+static void releaseInodeInfo(unixFile *pFile){
+  /* Drop one reference on pFile->pInode.  When the last reference goes
+  ** away, close any deferred descriptors, unlink the node from inodeList
+  ** and free it.  A file without an inode object is ignored. */
+  unixInodeInfo *pInode = pFile->pInode;
+  if( pInode==0 ){
+    return;
+  }
+  pInode->nRef--;
+  if( pInode->nRef!=0 ){
+    return;
+  }
+  closePendingFds(pFile);
+  /* Detach from the doubly linked list; a node without a predecessor is
+  ** the list head. */
+  if( pInode->pNext ){
+    pInode->pNext->pPrev = pInode->pPrev;
+  }
+  if( pInode->pPrev ){
+    pInode->pPrev->pNext = pInode->pNext;
+  }else{
+    inodeList = pInode->pNext;
+  }
+  unqlite_free(pInode);
+}
+/*
+** Given a file descriptor, locate the unixInodeInfo object that
+** describes that file descriptor.  Create a new one if necessary.  The
+** return value might be uninitialized if an error occurs.
+**
+** The mutex entered using the unixEnterMutex() function must be held
+** when this function is called.
+**
+** Return an appropriate error code.
+*/
+static int findInodeInfo(
+  unixFile *pFile,               /* Unix file with file desc used in the key */
+  unixInodeInfo **ppInode        /* Return the unixInodeInfo object here */
+){
+  struct unixFileId key;         /* (device, inode) pair identifying the file */
+  struct stat sb;                /* Result of fstat() on the descriptor */
+  unixInodeInfo *pHit;           /* Matching or newly created list entry */
+  int fd = pFile->h;             /* The file descriptor for pFile */
+
+  /* Ask the OS for the identity of the open file.  *ppInode is left
+  ** untouched on every error path.
+  */
+  if( fstat(fd, &sb)!=0 ){
+    pFile->lastErrno = errno;
+#ifdef EOVERFLOW
+    if( pFile->lastErrno==EOVERFLOW ){
+      return UNQLITE_NOTIMPLEMENTED;
+    }
+#endif
+    return UNQLITE_IOERR;
+  }
+#ifdef __APPLE__
+  /* On OS X, an msdos filesystem reports a wrong inode number for
+  ** zero-size files.  Grow the file to one byte (an ASCII 'S') before
+  ** trusting the inode number, then stat it again.
+  */
+  if( sb.st_size==0 && (pFile->fsFlags & UNQLITE_FSFLAGS_IS_MSDOS)!=0 ){
+    if( write(fd, "S", 1)!=1 ){
+      pFile->lastErrno = errno;
+      return UNQLITE_IOERR;
+    }
+    if( fstat(fd, &sb)!=0 ){
+      pFile->lastErrno = errno;
+      return UNQLITE_IOERR;
+    }
+  }
+#endif
+  /* Zero the key first so the bytewise comparison below is well defined. */
+  SyZero(&key,sizeof(key));
+  key.dev = sb.st_dev;
+  key.ino = sb.st_ino;
+  /* Linear search of the global list for an entry with the same key. */
+  for(pHit=inodeList; pHit && SyMemcmp((const void *)&key,(const void *)&pHit->fileId, sizeof(key)); pHit=pHit->pNext){
+    /* keep scanning */
+  }
+  if( pHit ){
+    /* Already known: just take another reference. */
+    pHit->nRef++;
+  }else{
+    /* First open of this inode: build a zeroed entry and push it at the
+    ** head of the list.
+    */
+    pHit = (unixInodeInfo *)unqlite_malloc( sizeof(*pHit) );
+    if( pHit==0 ){
+      return UNQLITE_NOMEM;
+    }
+    SyZero(pHit,sizeof(*pHit));
+    SyMemcpy((const void *)&key,(void *)&pHit->fileId,sizeof(key));
+    pHit->nRef = 1;
+    pHit->pNext = inodeList;
+    pHit->pPrev = 0;
+    if( inodeList ) inodeList->pPrev = pHit;
+    inodeList = pHit;
+  }
+  *ppInode = pHit;
+  return UNQLITE_OK;
+}
+/*
+** This routine checks if there is a RESERVED lock held on the specified
+** file by this or any other process. If such a lock is held, set *pResOut
+** to a non-zero value otherwise *pResOut is set to zero.  The return value
+** is set to UNQLITE_OK unless an I/O error occurs during lock checking.
+*/
+static int unixCheckReservedLock(unqlite_file *id, int *pResOut){
+  unixFile *pFile = (unixFile*)id;
+  int rc = UNQLITE_OK;
+  int isReserved;
+
+  unixEnterMutex(); /* Because pFile->pInode is shared across threads */
+  /* First consult the in-process record: anything above SHARED counts. */
+  isReserved = ( pFile->pInode->eFileLock>SHARED_LOCK ) ? 1 : 0;
+  if( isReserved==0 ){
+    /* Nobody in this process holds it; ask the OS whether a write-lock on
+    ** the reserved byte would conflict with a lock held elsewhere.
+    */
+    struct flock probe;
+    probe.l_type = F_WRLCK;
+    probe.l_whence = SEEK_SET;
+    probe.l_start = RESERVED_BYTE;
+    probe.l_len = 1;
+    if( fcntl(pFile->h, F_GETLK, &probe)==(-1) ){
+      int savedErrno = errno;
+      rc = unqliteErrorFromPosixError(savedErrno, UNQLITE_LOCKERR);
+      pFile->lastErrno = savedErrno;
+    }else if( probe.l_type!=F_UNLCK ){
+      isReserved = 1;
+    }
+  }
+  unixLeaveMutex();
+
+  /* The answer is stored even when rc reports an error. */
+  *pResOut = isReserved;
+  return rc;
+}
+/*
+** Lock the file with the lock specified by parameter eFileLock - one
+** of the following:
+**
+**     (1) SHARED_LOCK
+**     (2) RESERVED_LOCK
+**     (3) PENDING_LOCK
+**     (4) EXCLUSIVE_LOCK
+**
+** Sometimes when requesting one lock state, additional lock states
+** are inserted in between.  The locking might fail on one of the later
+** transitions leaving the lock state different from what it started but
+** still short of its goal.  The following chart shows the allowed
+** transitions and the inserted intermediate states:
+**
+**    UNLOCKED -> SHARED
+**    SHARED -> RESERVED
+**    SHARED -> (PENDING) -> EXCLUSIVE
+**    RESERVED -> (PENDING) -> EXCLUSIVE
+**    PENDING -> EXCLUSIVE
+**
+** This routine will only increase a lock.  Use the unqliteOsUnlock()
+** routine to lower a locking level.
+*/
+static int unixLock(unqlite_file *id, int eFileLock){
+  /* The following describes the implementation of the various locks and
+  ** lock transitions in terms of the POSIX advisory shared and exclusive
+  ** lock primitives (called read-locks and write-locks below, to avoid
+  ** confusion with SQLite lock names). The algorithms are complicated
+  ** slightly in order to be compatible with Windows systems simultaneously
+  ** accessing the same database file, in case that is ever required.
+  **
+  ** Symbols defined in os.h identify the 'pending byte' and the 'reserved
+  ** byte', each single bytes at well known offsets, and the 'shared byte
+  ** range', a range of 510 bytes at a well known offset.
+  **
+  ** To obtain a SHARED lock, a read-lock is obtained on the 'pending
+  ** byte'.  If this is successful, a random byte from the 'shared byte
+  ** range' is read-locked and the lock on the 'pending byte' released.
+  **
+  ** A process may only obtain a RESERVED lock after it has a SHARED lock.
+  ** A RESERVED lock is implemented by grabbing a write-lock on the
+  ** 'reserved byte'. 
+  **
+  ** A process may only obtain a PENDING lock after it has obtained a
+  ** SHARED lock. A PENDING lock is implemented by obtaining a write-lock
+  ** on the 'pending byte'. This ensures that no new SHARED locks can be
+  ** obtained, but existing SHARED locks are allowed to persist. A process
+  ** does not have to obtain a RESERVED lock on the way to a PENDING lock.
+  ** This property is used by the algorithm for rolling back a journal file
+  ** after a crash.
+  **
+  ** An EXCLUSIVE lock, obtained after a PENDING lock is held, is
+  ** implemented by obtaining a write-lock on the entire 'shared byte
+  ** range'. Since all other locks require a read-lock on one of the bytes
+  ** within this range, this ensures that no other locks are held on the
+  ** database. 
+  **
+  ** The reason a single byte cannot be used instead of the 'shared byte
+  ** range' is that some versions of Windows do not support read-locks. By
+  ** locking a random byte from a range, concurrent SHARED locks may exist
+  ** even if the locking primitive used is always a write-lock.
+  **
+  ** Returns UNQLITE_OK on success, UNQLITE_BUSY when the lock is held
+  ** elsewhere, or the code produced by unqliteErrorFromPosixError() for a
+  ** failed fcntl() call (in which case pFile->lastErrno may be updated).
+  */
+  int rc = UNQLITE_OK;
+  unixFile *pFile = (unixFile*)id;
+  unixInodeInfo *pInode = pFile->pInode;  /* re-read below once the mutex is held */
+  struct flock lock;
+  int s = 0;                              /* Result of the most recent fcntl() lock call */
+  int tErrno = 0;                         /* errno saved right after a failed fcntl() */
+  /* If there is already a lock of this type or more restrictive on the
+  ** unixFile, do nothing. Don't use the end_lock: exit path, as
+  ** unixEnterMutex() hasn't been called yet.
+  */
+  if( pFile->eFileLock>=eFileLock ){
+    return UNQLITE_OK;
+  }
+  /* This mutex is needed because pFile->pInode is shared across threads
+  */
+  unixEnterMutex();
+  pInode = pFile->pInode;
+  /* If some thread using this PID has a lock via a different unixFile*
+  ** handle that precludes the requested lock, return BUSY.
+  */
+  if( (pFile->eFileLock!=pInode->eFileLock && 
+          (pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
+  ){
+    rc = UNQLITE_BUSY;
+    goto end_lock;
+  }
+  /* If a SHARED lock is requested, and some thread using this PID already
+  ** has a SHARED or RESERVED lock, then increment reference counts and
+  ** return UNQLITE_OK.
+  */
+  if( eFileLock==SHARED_LOCK && 
+      (pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
+    pFile->eFileLock = SHARED_LOCK;
+    pInode->nShared++;
+    pInode->nLock++;
+    goto end_lock;
+  }
+  /* A PENDING lock is needed before acquiring a SHARED lock and before
+  ** acquiring an EXCLUSIVE lock.  For the SHARED lock, the PENDING will
+  ** be released.
+  */
+  lock.l_len = 1L;
+  lock.l_whence = SEEK_SET;
+  if( eFileLock==SHARED_LOCK 
+      || (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
+  ){
+    lock.l_type = (eFileLock==SHARED_LOCK?F_RDLCK:F_WRLCK);
+    lock.l_start = PENDING_BYTE;
+    s = fcntl(pFile->h, F_SETLK, &lock);
+    if( s==(-1) ){
+      tErrno = errno;
+      rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+      if( IS_LOCK_ERROR(rc) ){
+        pFile->lastErrno = tErrno;
+      }
+      goto end_lock;
+    }
+  }
+  /* If control gets to this point, then actually go ahead and make
+  ** operating system calls for the specified lock.
+  */
+  if( eFileLock==SHARED_LOCK ){
+    /* Now get the read-lock (lock.l_type is still F_RDLCK from above) */
+    lock.l_start = SHARED_FIRST;
+    lock.l_len = SHARED_SIZE;
+    if( (s = fcntl(pFile->h, F_SETLK, &lock))==(-1) ){
+      tErrno = errno;
+    }
+    /* Drop the temporary PENDING lock */
+    lock.l_start = PENDING_BYTE;
+    lock.l_len = 1L;
+    lock.l_type = F_UNLCK;
+    if( fcntl(pFile->h, F_SETLK, &lock)!=0 ){
+      /* Only report the unlock failure when the read-lock itself succeeded;
+      ** otherwise the read-lock error saved in tErrno is reported below. */
+      if( s != -1 ){
+        /* This could happen with a network mount */
+        tErrno = errno; 
+        rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR); 
+        if( IS_LOCK_ERROR(rc) ){
+          pFile->lastErrno = tErrno;
+        }
+        goto end_lock;
+      }
+    }
+    if( s==(-1) ){
+                rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+      if( IS_LOCK_ERROR(rc) ){
+        pFile->lastErrno = tErrno;
+      }
+    }else{
+      /* First SHARED lock on this inode within the process */
+      pFile->eFileLock = SHARED_LOCK;
+      pInode->nLock++;
+      pInode->nShared = 1;
+    }
+  }else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
+    /* We are trying for an exclusive lock but another thread in this
+    ** same process is still holding a shared lock. */
+    rc = UNQLITE_BUSY;
+  }else{
+    /* The request was for a RESERVED or EXCLUSIVE lock.  It is
+    ** assumed that there is a SHARED or greater lock on the file
+    ** already.
+    */
+    lock.l_type = F_WRLCK;
+    switch( eFileLock ){
+      case RESERVED_LOCK:
+        lock.l_start = RESERVED_BYTE;
+        break;
+      case EXCLUSIVE_LOCK:
+        lock.l_start = SHARED_FIRST;
+        lock.l_len = SHARED_SIZE;
+        break;
+      default:
+                  /* Can't happen */
+        /* NOTE(review): a direct PENDING_LOCK request would arrive here with
+        ** lock.l_start never assigned - presumably callers never ask for
+        ** PENDING_LOCK directly; confirm. */
+        break;
+    }
+    s = fcntl(pFile->h, F_SETLK, &lock);
+    if( s==(-1) ){
+      tErrno = errno;
+      rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+      if( IS_LOCK_ERROR(rc) ){
+        pFile->lastErrno = tErrno;
+      }
+    }
+  }
+  /* Record the outcome.  A failed EXCLUSIVE request that got this far is
+  ** recorded as PENDING on both the file and the inode. */
+  if( rc==UNQLITE_OK ){
+    pFile->eFileLock = eFileLock;
+    pInode->eFileLock = eFileLock;
+  }else if( eFileLock==EXCLUSIVE_LOCK ){
+    pFile->eFileLock = PENDING_LOCK;
+    pInode->eFileLock = PENDING_LOCK;
+  }
+end_lock:
+  unixLeaveMutex();
+  return rc;
+}
+/*
+** Add the file descriptor used by file handle pFile to the corresponding
+** pUnused list.
+*/
+static void setPendingFd(unixFile *pFile){
+  /* Move the file's pre-allocated UnixUnusedFd node to the front of the
+  ** inode's pUnused list, then mark the file as no longer owning a
+  ** descriptor (h = -1) or a spare node.  pFile->pUnused is dereferenced
+  ** unconditionally, so it must be non-NULL on entry. */
+  UnixUnusedFd *pSlot = pFile->pUnused;
+  unixInodeInfo *pInode = pFile->pInode;
+  pFile->pUnused = 0;
+  pFile->h = -1;
+  pSlot->pNext = pInode->pUnused;
+  pInode->pUnused = pSlot;
+}
+/*
+** Lower the locking level on file descriptor pFile to eFileLock.  eFileLock
+** must be either NO_LOCK or SHARED_LOCK.
+**
+** If the locking level of the file descriptor is already at or below
+** the requested locking level, this routine is a no-op.
+** 
+** If handleNFSUnlock is true, then on downgrading an EXCLUSIVE_LOCK to SHARED
+** the byte range is divided into 2 parts and the first part is unlocked then
+** set to a read lock, then the other part is simply unlocked.  This works 
+** around a bug in BSD NFS lockd (also seen on MacOSX 10.3+) that fails to 
+** remove the write lock on a region when a read lock is set.
+*/
+static int _posixUnlock(unqlite_file *id, int eFileLock, int handleNFSUnlock){
+  /* Lower the lock on pFile to eFileLock.
+  **
+  **   id               the file, cast to unixFile*
+  **   eFileLock        target level; the code below handles SHARED_LOCK
+  **                    and NO_LOCK
+  **   handleNFSUnlock  when non-zero, an EXCLUSIVE->SHARED downgrade is done
+  **                    in two halves (see the diagram below)
+  **
+  ** Returns UNQLITE_OK, or the code produced for a failed fcntl()/close().
+  ** pFile->eFileLock is only lowered when the result is UNQLITE_OK, except
+  ** for the failed whole-file unlock case noted further down.
+  */
+  unixFile *pFile = (unixFile*)id;
+  unixInodeInfo *pInode;
+  struct flock lock;
+  int rc = UNQLITE_OK;
+  int h;
+  int tErrno;                      /* Error code from system call errors */
+   if( pFile->eFileLock<=eFileLock ){
+    /* Already at or below the requested level: nothing to do (the mutex
+    ** has not been taken yet, so return directly). */
+    return UNQLITE_OK;
+  }
+  unixEnterMutex();
+  
+  h = pFile->h;
+  pInode = pFile->pInode;
+  
+  if( pFile->eFileLock>SHARED_LOCK ){
+    /* downgrading to a shared lock on NFS involves clearing the write lock
+    ** before establishing the readlock - to avoid a race condition we downgrade
+    ** the lock in 2 blocks, so that part of the range will be covered by a 
+    ** write lock until the rest is covered by a read lock:
+    **  1:   [WWWWW]
+    **  2:   [....W]
+    **  3:   [RRRRW]
+    **  4:   [RRRR.]
+    */
+    if( eFileLock==SHARED_LOCK ){
+      if( handleNFSUnlock ){
+        off_t divSize = SHARED_SIZE - 1;
+        
+        /* Step 2: unlock all but the last byte of the shared range */
+        lock.l_type = F_UNLCK;
+        lock.l_whence = SEEK_SET;
+        lock.l_start = SHARED_FIRST;
+        lock.l_len = divSize;
+        if( fcntl(h, F_SETLK, &lock)==(-1) ){
+          tErrno = errno;
+                  rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+          if( IS_LOCK_ERROR(rc) ){
+            pFile->lastErrno = tErrno;
+          }
+          goto end_unlock;
+        }
+        /* Step 3: read-lock that same leading part */
+        lock.l_type = F_RDLCK;
+        lock.l_whence = SEEK_SET;
+        lock.l_start = SHARED_FIRST;
+        lock.l_len = divSize;
+        if( fcntl(h, F_SETLK, &lock)==(-1) ){
+          tErrno = errno;
+                  rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+          if( IS_LOCK_ERROR(rc) ){
+            pFile->lastErrno = tErrno;
+          }
+          goto end_unlock;
+        }
+        /* Step 4: release the remaining write-locked tail byte */
+        lock.l_type = F_UNLCK;
+        lock.l_whence = SEEK_SET;
+        lock.l_start = SHARED_FIRST+divSize;
+        lock.l_len = SHARED_SIZE-divSize;
+        if( fcntl(h, F_SETLK, &lock)==(-1) ){
+          tErrno = errno;
+                  rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+          if( IS_LOCK_ERROR(rc) ){
+            pFile->lastErrno = tErrno;
+          }
+          goto end_unlock;
+        }
+      }else{
+        /* Plain downgrade: replace the write-lock on the whole shared range
+        ** with a read-lock in a single call */
+        lock.l_type = F_RDLCK;
+        lock.l_whence = SEEK_SET;
+        lock.l_start = SHARED_FIRST;
+        lock.l_len = SHARED_SIZE;
+        if( fcntl(h, F_SETLK, &lock)==(-1) ){
+          tErrno = errno;
+                  rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+          if( IS_LOCK_ERROR(rc) ){
+            pFile->lastErrno = tErrno;
+          }
+          goto end_unlock;
+        }
+      }
+    }
+    /* Release the two bytes starting at PENDING_BYTE (presumably the pending
+    ** and reserved bytes are adjacent - confirm against their definitions) */
+    lock.l_type = F_UNLCK;
+    lock.l_whence = SEEK_SET;
+    lock.l_start = PENDING_BYTE;
+    lock.l_len = 2L;
+    if( fcntl(h, F_SETLK, &lock)!=(-1) ){
+      pInode->eFileLock = SHARED_LOCK;
+    }else{
+      tErrno = errno;
+          rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+      if( IS_LOCK_ERROR(rc) ){
+        pFile->lastErrno = tErrno;
+      }
+      goto end_unlock;
+    }
+  }
+  if( eFileLock==NO_LOCK ){
+    /* Decrement the shared lock counter.  Release the lock using an
+    ** OS call only when all threads in this same process have released
+    ** the lock.
+    */
+    pInode->nShared--;
+    if( pInode->nShared==0 ){
+      /* l_start==0 with l_len==0 addresses the whole file (per POSIX, a
+      ** zero l_len extends to end of file), so every byte lock is dropped */
+      lock.l_type = F_UNLCK;
+      lock.l_whence = SEEK_SET;
+      lock.l_start = lock.l_len = 0L;
+      
+      if( fcntl(h, F_SETLK, &lock)!=(-1) ){
+        pInode->eFileLock = NO_LOCK;
+      }else{
+        tErrno = errno;
+                rc = unqliteErrorFromPosixError(tErrno, UNQLITE_LOCKERR);
+        if( IS_LOCK_ERROR(rc) ){
+          pFile->lastErrno = tErrno;
+        }
+        /* Even though the unlock failed, both records are forced to NO_LOCK */
+        pInode->eFileLock = NO_LOCK;
+        pFile->eFileLock = NO_LOCK;
+      }
+    }
+    /* Decrement the count of locks against this same file.  When the
+    ** count reaches zero, close any other file descriptors whose close
+    ** was deferred because of outstanding locks.
+    */
+    pInode->nLock--;
+ 
+    if( pInode->nLock==0 ){
+      int rc2 = closePendingFds(pFile);
+      /* A close failure is only reported when nothing failed earlier */
+      if( rc==UNQLITE_OK ){
+        rc = rc2;
+      }
+    }
+  }
+        
+end_unlock:
+  unixLeaveMutex();
+  
+  /* Note: the per-file lock level is updated after the mutex is released */
+  if( rc==UNQLITE_OK ) pFile->eFileLock = eFileLock;
+  return rc;
+}
+/*
+** Lower the locking level on the file handle 'id' to eFileLock.  eFileLock
+** must be either NO_LOCK or SHARED_LOCK.
+**
+** If the locking level of the file descriptor is already at or below
+** the requested locking level, this routine is a no-op.
+**
+** This is the xUnlock entry of the unixIoMethod table.  It only forwards
+** to _posixUnlock() with a third argument of 0 and hands back whatever
+** that routine returns.
+*/
+static int unixUnlock(unqlite_file *id, int eFileLock){
+  return _posixUnlock(id, eFileLock, 0);
+}
+/*
+** This function performs the parts of the "close file" operation
+** common to all locking schemes. It closes the directory and file
+** handles, if they are valid, releases pFile->pUnused and sets all
+** fields of the unixFile structure to 0.
+**
+** Every resource is released even when one of the close() calls fails.
+** Returning at the first failure would leave the other descriptor open
+** and the pUnused allocation unreleased, while unixClose() has already
+** dropped the inode information by the time this routine runs.
+**
+** Returns UNQLITE_OK when both descriptors were closed cleanly (or when
+** id is NULL), and UNQLITE_IOERR otherwise. On failure the errno of the
+** first failing close() is stored in lastErrno after the structure has
+** been zeroed.
+*/
+static int closeUnixFile(unqlite_file *id){
+  unixFile *pFile = (unixFile*)id;
+  int rc = UNQLITE_OK;
+  int savedErrno = 0;
+  if( pFile==0 ){
+    return UNQLITE_OK;
+  }
+  if( pFile->dirfd>=0 ){
+    if( close(pFile->dirfd) ){
+      savedErrno = errno;
+      rc = UNQLITE_IOERR;
+    }
+    pFile->dirfd = -1;
+  }
+  if( pFile->h>=0 ){
+    if( close(pFile->h) ){
+      /* Keep the errno of the first failure only */
+      if( rc==UNQLITE_OK ){
+        savedErrno = errno;
+      }
+      rc = UNQLITE_IOERR;
+    }
+    pFile->h = -1;
+  }
+  unqlite_free(pFile->pUnused);
+  SyZero(pFile,sizeof(unixFile));
+  /* Zeroing wiped lastErrno; restore the saved value (0 on success) */
+  pFile->lastErrno = savedErrno;
+  return rc;
+}
+/*
+** Close a file.
+**
+** The handle is first unlocked down to NO_LOCK. Then, under the unix
+** mutex, the descriptor is parked with setPendingFd() if the shared inode
+** still reports outstanding locks, the inode information is released and
+** the handle is torn down by closeUnixFile(), whose result is returned.
+** A NULL id yields UNQLITE_OK.
+*/
+static int unixClose(unqlite_file *id){
+  unixFile *pFile = (unixFile *)id;
+  int rc;
+  if( id==0 ){
+    return UNQLITE_OK;
+  }
+  /* The result of the unlock is not checked */
+  unixUnlock(id, NO_LOCK);
+  unixEnterMutex();
+  if( pFile->pInode!=0 && pFile->pInode->nLock!=0 ){
+    /* Closing the descriptor now would clear the outstanding locks, so
+    ** add it to the pInode->pUnused list instead. It will be closed
+    ** when the last lock is cleared.
+    */
+    setPendingFd(pFile);
+  }
+  releaseInodeInfo(pFile);
+  rc = closeUnixFile(id);
+  unixLeaveMutex();
+  return rc;
+}
+/************** End of the posix advisory lock implementation *****************
+******************************************************************************/
+/*
+**
+** The next division contains implementations for all methods of the 
+** unqlite_file object other than the locking methods.  The locking
+** methods were defined in divisions above (one locking method per
+** division).  Those methods that are common to all locking modes
+** are gathered together into this division.
+*/
+/*
+** Position the descriptor at 'offset' and read up to cnt bytes into pBuf.
+** Returns the byte count reported by the read call, which may be smaller
+** than cnt, or a negative value on failure.
+**
+** Three build variants exist: pread() when USE_PREAD is defined,
+** pread64() when USE_PREAD64 is defined, and lseek() followed by read()
+** otherwise.
+**
+** NB:  If you define USE_PREAD or USE_PREAD64, then it might also
+** be necessary to define _XOPEN_SOURCE to be 500.  This varies from
+** one system to another, and nothing here defines it.
+**
+** So that a failed read does not lose its errno, the value is copied
+** into lastErrno before returning. If lseek() lands on an unexpected
+** offset without reporting an error, lastErrno is cleared and -1 is
+** returned.
+*/
+static int seekAndRead(unixFile *id, unqlite_int64 offset, void *pBuf, int cnt){
+  int nRead;
+#if (!defined(USE_PREAD) && !defined(USE_PREAD64))
+  unqlite_int64 iPos;
+#endif
+
+#if defined(USE_PREAD)
+  nRead = pread(id->h, pBuf, cnt, offset);
+#elif defined(USE_PREAD64)
+  nRead = pread64(id->h, pBuf, cnt, offset);
+#else
+  iPos = lseek(id->h, offset, SEEK_SET);
+  if( iPos!=offset ){
+    /* errno is only recorded when lseek() itself reported failure */
+    id->lastErrno = (iPos == -1) ? errno : 0;
+    return -1;
+  }
+  nRead = read(id->h, pBuf, cnt);
+#endif
+  if( nRead<0 ){
+    id->lastErrno = errno;
+  }
+  return nRead;
+}
+/*
+** Read amt bytes starting at 'offset' from a file into pBuf.
+**
+** Returns UNQLITE_OK only when seekAndRead() delivered exactly the
+** requested amount (amt narrowed to int). Any other outcome returns
+** UNQLITE_IOERR:
+**   - a negative result is a system error and lastErrno was already set
+**     by seekAndRead();
+**   - a short read is not a system error: lastErrno is cleared and the
+**     unread tail of the buffer is zero-filled.
+*/
+static int unixRead(
+  unqlite_file *id, 
+  void *pBuf, 
+  unqlite_int64 amt,
+  unqlite_int64 offset
+){
+  unixFile *pFile = (unixFile *)id;
+  int nWant = (int)amt;
+  int nGot;
+
+  nGot = seekAndRead(pFile, offset, pBuf, nWant);
+  if( nGot==nWant ){
+    return UNQLITE_OK;
+  }
+  if( nGot>=0 ){
+    /* Short read: not a system error */
+    pFile->lastErrno = 0;
+    /* Unread parts of the buffer must be zero-filled */
+    SyZero(&((char*)pBuf)[nGot],(sxu32)amt-nGot);
+  }
+  return UNQLITE_IOERR;
+}
+/*
+** Position the descriptor at 'offset' and write up to cnt bytes from pBuf.
+** Returns the number of bytes actually written, which may be smaller
+** than cnt, or a negative value on failure.
+**
+** The request is capped at 0x7fffffff bytes per call. The result is
+** carried in an int and cnt is a 64-bit quantity, so an uncapped request
+** could come back truncated, and on targets where size_t is 32 bits wide
+** a large cnt could be narrowed on the way into write(). unixWrite()
+** loops until everything has been written, so a capped call is simply
+** continued by the next iteration.
+**
+** Three build variants exist: pwrite() when USE_PREAD is defined,
+** pwrite64() when USE_PREAD64 is defined, and lseek() followed by write()
+** otherwise.
+**
+** So that a failed write does not lose its errno, the value is copied
+** into lastErrno before returning. If lseek() lands on an unexpected
+** offset without reporting an error, lastErrno is cleared and -1 is
+** returned.
+*/
+static int seekAndWrite(unixFile *id, unqlite_int64 offset, const void *pBuf, unqlite_int64 cnt){
+  int got;
+#if (!defined(USE_PREAD) && !defined(USE_PREAD64))
+  unqlite_int64 newOffset;
+#endif
+
+  /* Keep a single request within what the int result can represent */
+  if( cnt>0x7fffffff ){
+    cnt = 0x7fffffff;
+  }
+#if defined(USE_PREAD)
+  got = pwrite(id->h, pBuf, cnt, offset);
+#elif defined(USE_PREAD64)
+  got = pwrite64(id->h, pBuf, cnt, offset);
+#else
+  newOffset = lseek(id->h, offset, SEEK_SET);
+  if( newOffset!=offset ){
+    if( newOffset == -1 ){
+      id->lastErrno = errno;
+    }else{
+      id->lastErrno = 0;
+    }
+    return -1;
+  }
+  got = write(id->h, pBuf, cnt);
+#endif
+  if( got<0 ){
+    id->lastErrno = errno;
+  }
+  return got;
+}
+/*
+** Write amt bytes from pBuf into a file, starting at 'offset'.
+**
+** seekAndWrite() is called repeatedly until everything has been written
+** or a call reports zero or fewer bytes. Returns:
+**   UNQLITE_OK     all bytes were written (or amt was not positive);
+**   UNQLITE_IOERR  the last call failed; lastErrno was set by
+**                  seekAndWrite();
+**   UNQLITE_FULL   the last call wrote nothing without failing; this is
+**                  not a system error, so lastErrno is cleared.
+*/
+static int unixWrite(
+  unqlite_file *id, 
+  const void *pBuf, 
+  unqlite_int64 amt,
+  unqlite_int64 offset 
+){
+  unixFile *pFile = (unixFile*)id;
+  const char *zData = (const char*)pBuf;
+  int nWritten = 0;
+
+  while( amt>0 ){
+    nWritten = seekAndWrite(pFile, offset, zData, amt);
+    if( nWritten<=0 ){
+      break;
+    }
+    amt -= nWritten;
+    offset += nWritten;
+    zData += nWritten;
+  }
+  if( amt<=0 ){
+    return UNQLITE_OK;
+  }
+  if( nWritten<0 ){
+    /* lastErrno set by seekAndWrite */
+    return UNQLITE_IOERR;
+  }
+  pFile->lastErrno = 0; /* not a system error */
+  return UNQLITE_FULL;
+}
+/*
+** We do not trust systems to provide a working fdatasync().  Some do.
+** Others do not.  To be safe, we will stick with the (slower) fsync().
+** If you know that your system does support fdatasync() correctly,
+** then simply compile with -Dfdatasync=fdatasync
+**
+** The substitution below is skipped when __linux__ is defined, so on
+** Linux builds the name fdatasync is left untouched.
+*/
+#if !defined(fdatasync) && !defined(__linux__)
+# define fdatasync fsync
+#endif
+/*
+** Define HAVE_FULLFSYNC to 0 or 1 depending on whether or not
+** the F_FULLFSYNC macro is defined.  F_FULLFSYNC is currently
+** only available on Mac OS X.  But that could change.
+**
+** full_fsync() and unixSync() below test this macro.
+*/
+#ifdef F_FULLFSYNC
+# define HAVE_FULLFSYNC 1
+#else
+# define HAVE_FULLFSYNC 0
+#endif
+/*
+** The fsync() system call does not work as advertised on many
+** unix systems.  This helper is an attempt to make it work better.
+**
+** Which call is issued depends on the build:
+**   - HAVE_FULLFSYNC: fcntl(F_FULLFSYNC) when fullSync is set, with a
+**     fall back to fsync() when that fails or when fullSync is clear.
+**     A failing F_FULLFSYNC is taken to mean the file system does not
+**     support it; the cost of the superfluous fcntl() is accepted.
+**   - __APPLE__ without F_FULLFSYNC: plain fsync(), because fdatasync()
+**     on HFS+ does not yet flush a changed file size correctly.
+**   - everything else: fdatasync(), which may itself be mapped to
+**     fsync() by the macro defined earlier in this file.
+**
+** The dataOnly argument is never consulted. Per the original author's
+** note (attributed to Ted Ts'o), fdatasync() also writes the inode when
+** the file size has changed and only skips it for attributes such as
+** mtime or owner, so it is adequate regardless of dataOnly.
+**
+** Returns -1 if the final sync call returned -1, and 0 otherwise.
+*/
+static int full_fsync(int fd, int fullSync, int dataOnly){
+  int rc;
+  SXUNUSED(dataOnly);
+#if HAVE_FULLFSYNC
+  rc = fullSync ? fcntl(fd, F_FULLFSYNC, 0) : 1;
+  if( rc ){
+    /* Full sync not requested, or it failed: use a plain fsync() */
+    rc = fsync(fd);
+  }
+#elif defined(__APPLE__)
+  SXUNUSED(fullSync);
+  rc = fsync(fd);
+#else
+  SXUNUSED(fullSync);
+  rc = fdatasync(fd);
+#endif
+  return (rc == -1) ? -1 : 0;
+}
+/*
+** Make sure all writes to a particular file are committed to disk.
+**
+** The UNQLITE_SYNC_DATAONLY bit and the UNQLITE_SYNC_FULL value (low
+** four bits of flags) are decoded and handed to full_fsync().
+**
+** Under Unix, also make sure that the directory entry for the file
+** has been created by fsync-ing the directory that contains the file.
+** If we do not do this and we encounter a power failure, the directory
+** entry for the journal might not exist after we reboot, the journal
+** would go unnoticed and the transaction would not roll back - possibly
+** leading to database corruption. The directory descriptor is synced at
+** most once and then closed.
+**
+** Returns UNQLITE_IOERR, with lastErrno set, when syncing the file or
+** closing the directory descriptor fails. A failed directory sync is
+** ignored.
+*/
+static int unixSync(unqlite_file *id, int flags){
+  unixFile *pFile = (unixFile*)id;
+  int isDataOnly = (flags&UNQLITE_SYNC_DATAONLY);
+  int isFullsync = (flags&0x0F)==UNQLITE_SYNC_FULL;
+  int rc;
+
+  rc = full_fsync(pFile->h, isFullsync, isDataOnly);
+  if( rc!=0 ){
+    pFile->lastErrno = errno;
+    return UNQLITE_IOERR;
+  }
+  if( pFile->dirfd<0 ){
+    /* No directory descriptor pending */
+    return rc;
+  }
+#ifndef UNQLITE_DISABLE_DIRSYNC
+  /* The directory sync is only attempted if full_fsync is turned off
+  ** or unavailable; after a full sync it would be superfluous.
+  **
+  ** fsync() has been reported to fail on directories on certain file
+  ** systems. A failed directory sync is not a big deal, so the result
+  ** is deliberately ignored.  Ticket #1657
+  */
+  if( !HAVE_FULLFSYNC || !isFullsync ){
+    (void)full_fsync(pFile->dirfd,0,0);
+  }
+#endif
+  /* Only need to sync once, so close the directory when we are done */
+  if( close(pFile->dirfd)==0 ){
+    pFile->dirfd = -1;
+  }else{
+    pFile->lastErrno = errno;
+    rc = UNQLITE_IOERR;
+  }
+  return rc;
+}
+/*
+** Truncate an open file to nByte bytes using ftruncate().
+** Returns UNQLITE_OK on success. On failure errno is saved in lastErrno
+** and UNQLITE_IOERR is returned.
+*/
+static int unixTruncate(unqlite_file *id, sxi64 nByte){
+  unixFile *pFile = (unixFile *)id;
+  if( ftruncate(pFile->h, (off_t)nByte)!=0 ){
+    pFile->lastErrno = errno;
+    return UNQLITE_IOERR;
+  }
+  return UNQLITE_OK;
+}
+/*
+** Determine the current size of a file in bytes and store it in *pSize.
+**
+** Returns UNQLITE_OK on success. If fstat() fails, errno is saved in
+** lastErrno, UNQLITE_IOERR is returned and *pSize is left untouched.
+**
+** A size of exactly one byte is reported as zero. Per the original
+** note, findInodeInfo() writes a single byte into a zero-size database
+** to work around a bug in the OS-X msdos filesystem, and upper layers
+** must still see such a file as empty.  Ticket #3260.
+*/
+static int unixFileSize(unqlite_file *id,sxi64 *pSize){
+  unixFile *pFile = (unixFile*)id;
+  struct stat sInfo;
+
+  if( fstat(pFile->h, &sInfo)!=0 ){
+    pFile->lastErrno = errno;
+    return UNQLITE_IOERR;
+  }
+  *pSize = (sInfo.st_size==1) ? 0 : sInfo.st_size;
+  return UNQLITE_OK;
+}
+/*
+** Return the sector size in bytes to assume for the specified file.
+**
+** No query is made against the underlying block device: the file
+** argument is ignored and UNQLITE_DEFAULT_SECTOR_SIZE is always
+** returned.  As a consequence this function cannot fail, and two files
+** created in the same file-system directory (i.e. a database and its
+** journal file) always report the same sector size.
+*/
+static int unixSectorSize(unqlite_file *NotUsed){
+  SXUNUSED(NotUsed);
+  return UNQLITE_DEFAULT_SECTOR_SIZE;
+}
+/*
+** This vector defines all the methods that can operate on an
+** unqlite_file for Unix systems.  fillInUnixFile() installs it as the
+** pMethod of every file handle it initializes successfully.
+*/
+static const unqlite_io_methods unixIoMethod = {
+  1,                              /* iVersion */
+  unixClose,                       /* xClose */
+  unixRead,                        /* xRead */
+  unixWrite,                       /* xWrite */
+  unixTruncate,                    /* xTruncate */
+  unixSync,                        /* xSync */
+  unixFileSize,                    /* xFileSize */
+  unixLock,                        /* xLock */
+  unixUnlock,                      /* xUnlock */
+  unixCheckReservedLock,           /* xCheckReservedLock */
+  unixSectorSize,                  /* xSectorSize */
+};
+/****************************************************************************
+**************************** unqlite_vfs methods ****************************
+**
+** This division contains the implementation of methods on the
+** unqlite_vfs object.
+*/
+/*
+** Initialize the contents of the unixFile structure pointed to by pId.
+**
+** The descriptors and the file name are stored in the structure, then
+** the shared inode information is looked up with findInodeInfo() while
+** holding the unix mutex. On success the unixIoMethod table is installed
+** and UNQLITE_OK is returned. On failure both descriptors are closed
+** and the findInodeInfo() error code is returned. lastErrno is reset to
+** 0 in either case.
+**
+** The pVfs, noLock and isDelete arguments are not used here.
+*/
+static int fillInUnixFile(
+  unqlite_vfs *pVfs,      /* Pointer to vfs object */
+  int h,                  /* Open file descriptor of file being opened */
+  int dirfd,              /* Directory file descriptor */
+  unqlite_file *pId,      /* Write to the unixFile structure here */
+  const char *zFilename,  /* Name of the file being opened */
+  int noLock,             /* Omit locking if true */
+  int isDelete            /* Delete on close if true */
+){
+  unixFile *pNew = (unixFile *)pId;
+  int rc;
+  /* Express the unused parameters explicitly to prevent compiler
+  ** warnings.
+  */
+  SXUNUSED(isDelete);
+  SXUNUSED(noLock);
+  SXUNUSED(pVfs);
+  pNew->h = h;
+  pNew->dirfd = dirfd;
+  pNew->fileFlags = 0;
+  pNew->zPath = zFilename;
+
+  unixEnterMutex();
+  rc = findInodeInfo(pNew, &pNew->pInode);
+  if( rc!=UNQLITE_OK ){
+    /* Close the file descriptor immediately, before releasing the mutex.
+    ** Per the original note findInodeInfo() may fail in two scenarios:
+    **
+    **   (a) A call to fstat() failed.
+    **   (b) A malloc failed.
+    **
+    ** Scenario (b) may only occur if the process holds no other file
+    ** descriptors open on the same file, since otherwise no malloc would
+    ** be required. Closing h is then quite safe: no posix locks can be
+    ** released by doing so.
+    **
+    ** Scenario (a) is not so safe. The implicit assumption is that if
+    ** fstat() fails, things are in such bad shape that dropping a lock
+    ** or two doesn't matter much.
+    */
+    close(h);
+  }
+  unixLeaveMutex();
+
+  pNew->lastErrno = 0;
+  if( rc==UNQLITE_OK ){
+    pNew->pMethod = &unixIoMethod;
+    return UNQLITE_OK;
+  }
+  if( dirfd>=0 ){
+    close(dirfd); /* silent leak if fail, already in error */
+  }
+  return rc;
+}
+/*
+** Open a file descriptor to the directory containing file zFilename.
+** If successful, *pFd is set to the opened file descriptor and
+** UNQLITE_OK is returned. If an error occurs, UNQLITE_IOERR is returned
+** and *pFd is set to -1.
+**
+** The directory name is obtained by cutting zFilename at its last '/'.
+** Two cases need special care:
+**   - a file directly under the root ("/name") resolves to "/";
+**   - a name with no '/' at all resolves to ".", the current directory.
+**     Previously such a name was cut down to its first character, so
+**     the open either failed or hit an unrelated entry.
+** An empty zFilename is rejected with UNQLITE_IOERR.
+**
+** NOTE(review): assumes Systrcpy() returns the number of bytes copied,
+** excluding the terminator, as the indexing below requires - confirm.
+**
+** If UNQLITE_OK is returned, the caller is responsible for closing
+** the file descriptor *pFd using close().
+*/
+static int openDirectory(const char *zFilename, int *pFd){
+  sxu32 ii;
+  int fd = -1;
+  char zDirname[MAX_PATHNAME+1];
+  sxu32 n;
+  n = Systrcpy(zDirname,sizeof(zDirname),zFilename,0);
+  /* Walk backwards to the last '/'; ii ends at 0 when the only slash is
+  ** the leading one or when there is none.
+  */
+  for(ii=n; ii>0 && zDirname[ii]!='/'; ii--);
+  if( n>0 ){
+    if( ii>0 ){
+      zDirname[ii] = '\0';
+    }else{
+      /* Either "/name" (keep the root) or a bare name (use ".") */
+      if( zDirname[0]!='/' ){
+        zDirname[0] = '.';
+      }
+      zDirname[1] = '\0';
+    }
+    fd = open(zDirname, O_RDONLY|O_BINARY, 0);
+    if( fd>=0 ){
+#ifdef FD_CLOEXEC
+      fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
+#endif
+    }
+  }
+  *pFd = fd;
+  return (fd>=0?UNQLITE_OK: UNQLITE_IOERR );
+}
+/*
+** Search for an unused file descriptor that was opened on the file
+** identified by pathname zPath with UNQLITE_OPEN_XXX flags matching
+** those passed as the second argument to this function.
+**
+** Such a file descriptor may exist if a database connection was closed
+** but the associated file descriptor could not be closed because some
+** other file descriptor open on the same file is holding a file-lock.
+** Refer to comments in the unixClose() function for further details.
+** Also, ticket #4018.
+**
+** If a suitable entry is found, it is unlinked from the inode's pUnused
+** list and returned. If no such entry is located, 0 (NULL) is returned.
+**
+** A stat() call may fail for various reasons. If this happens, it is
+** almost certain that an open() call on the same path will also fail,
+** so the error is ignored here and 0 is returned; the caller will try
+** to open a new descriptor and report the failure itself. Even if that
+** open() does succeed, the consequences of not searching for a reusable
+** file descriptor are not dire.
+*/
+static UnixUnusedFd *findReusableFd(const char *zPath, int flags){
+  UnixUnusedFd *pFound = 0;
+  UnixUnusedFd **ppLink;
+  unixInodeInfo *pInode;
+  struct stat sStat;                   /* Results of stat() call */
+
+  if( stat(zPath, &sStat)!=0 ){
+    return 0;
+  }
+  unixEnterMutex();
+  /* Locate the inode record matching the device and inode numbers */
+  for(pInode=inodeList; pInode; pInode=pInode->pNext){
+    if( pInode->fileId.dev==sStat.st_dev
+     && pInode->fileId.ino==sStat.st_ino ){
+      break;
+    }
+  }
+  if( pInode ){
+    /* Find the first parked descriptor opened with the same flags */
+    ppLink = &pInode->pUnused;
+    while( *ppLink && (*ppLink)->flags!=flags ){
+      ppLink = &((*ppLink)->pNext);
+    }
+    pFound = *ppLink;
+    if( pFound ){
+      *ppLink = pFound->pNext;
+    }
+  }
+  unixLeaveMutex();
+  return pFound;
+}
+/*
+** This function is called by unixOpen() to determine the unix permissions
+** to create new files with. A value suitable for passing as the third
+** argument to open(2) is written to *pMode and UNQLITE_OK is returned;
+** this implementation has no failure path.
+**
+** If the file being opened is a temporary file (UNQLITE_OPEN_TEMP_DB is
+** set in flags), it is created with the octal permissions 0600
+** (read/writable by owner only). Every other file is created with the
+** permissions mask UNQLITE_DEFAULT_FILE_PERMISSIONS.
+**
+** zPath is not consulted.
+*/
+static int findCreateFileMode(
+  const char *zPath,              /* Path of file (possibly) being created */
+  int flags,                      /* Flags passed as 4th argument to xOpen() */
+  mode_t *pMode                   /* OUT: Permissions to open file with */
+){
+  mode_t mode = UNQLITE_DEFAULT_FILE_PERMISSIONS;
+  if( flags & UNQLITE_OPEN_TEMP_DB ){
+    mode = 0600;
+  }
+  *pMode = mode;
+  SXUNUSED(zPath);
+  return UNQLITE_OK;
+}
+/*
+** Open the file zPath and fill in the unixFile structure pFile.
+**
+** The flags argument selects the open mode:
+**
+**     UNQLITE_OPEN_READONLY   -> O_RDONLY
+**     UNQLITE_OPEN_READWRITE  -> O_RDWR
+**     UNQLITE_OPEN_CREATE     -> O_CREAT, and the containing directory is
+**                                opened too, so that the first unixSync()
+**                                can fsync() and close() it
+**     UNQLITE_OPEN_EXCLUSIVE  -> O_EXCL|O_NOFOLLOW
+**     UNQLITE_OPEN_TEMP_DB    -> the file is unlinked right after it has
+**                                been opened
+**
+** A descriptor parked by an earlier close of the same file with the
+** same flags is reused when findReusableFd() finds one; otherwise a new
+** UnixUnusedFd record is allocated and the file is opened.
+**
+** Returns UNQLITE_OK on success, UNQLITE_NOMEM if the record cannot be
+** allocated, UNQLITE_IOERR if an open()/fstatfs() call fails, or the
+** error reported by openDirectory()/fillInUnixFile().
+**
+** Every failure after the UnixUnusedFd record has been attached leaves
+** through the open_finished label, so the record is released exactly
+** once and p->pUnused is cleared rather than left pointing at freed
+** memory.
+*/
+static int unixOpen(
+  unqlite_vfs *pVfs,           /* The VFS for which this is the xOpen method */
+  const char *zPath,           /* Pathname of file to be opened */
+  unqlite_file *pFile,         /* The file descriptor to be filled in */
+  unsigned int flags           /* Input flags to control the opening */
+){
+  unixFile *p = (unixFile *)pFile;
+  int fd = -1;                   /* File descriptor returned by open() */
+  int dirfd = -1;                /* Directory file descriptor */
+  int openFlags = 0;             /* Flags to pass to open() */
+  int noLock;                    /* True to omit locking primitives */
+  int rc = UNQLITE_OK;            /* Function Return Code */
+  UnixUnusedFd *pUnused;
+  int isExclusive  = (flags & UNQLITE_OPEN_EXCLUSIVE);
+  int isDelete     = (flags & UNQLITE_OPEN_TEMP_DB);
+  int isCreate     = (flags & UNQLITE_OPEN_CREATE);
+  int isReadonly   = (flags & UNQLITE_OPEN_READONLY);
+  int isReadWrite  = (flags & UNQLITE_OPEN_READWRITE);
+  /* When creating a file, this function will open a file-descriptor on
+  ** the directory too. The first time unixSync() is called the directory
+  ** file descriptor will be fsync()ed and close()d.
+  */
+  int isOpenDirectory = isCreate ;
+  const char *zName = zPath;
+  SyZero(p,sizeof(unixFile));
+
+  pUnused = findReusableFd(zName, flags);
+  if( pUnused ){
+    fd = pUnused->fd;
+  }else{
+    pUnused = unqlite_malloc(sizeof(*pUnused));
+    if( !pUnused ){
+      return UNQLITE_NOMEM;
+    }
+  }
+  p->pUnused = pUnused;
+
+  /* Determine the value of the flags parameter passed to POSIX function
+  ** open(). These are calculated even if open() is not called.
+  */
+  if( isReadonly )  openFlags |= O_RDONLY;
+  if( isReadWrite ) openFlags |= O_RDWR;
+  if( isCreate )    openFlags |= O_CREAT;
+  if( isExclusive ) openFlags |= (O_EXCL|O_NOFOLLOW);
+  openFlags |= (O_LARGEFILE|O_BINARY);
+  if( fd<0 ){
+    mode_t openMode;              /* Permissions to create file with */
+    rc = findCreateFileMode(zName, flags, &openMode);
+    if( rc!=UNQLITE_OK ){
+      /* Leave through the cleanup label so pUnused is released */
+      goto open_finished;
+    }
+    fd = open(zName, openFlags, openMode);
+    if( fd<0 ){
+      rc = UNQLITE_IOERR;
+      goto open_finished;
+    }
+  }
+
+  if( p->pUnused ){
+    p->pUnused->fd = fd;
+    p->pUnused->flags = flags;
+  }
+  if( isDelete ){
+    unlink(zName);
+  }
+  if( isOpenDirectory ){
+    rc = openDirectory(zPath, &dirfd);
+    if( rc!=UNQLITE_OK ){
+      close(fd);             /* silently leak if fail, already in error */
+      goto open_finished;
+    }
+  }
+#ifdef FD_CLOEXEC
+  fcntl(fd, F_SETFD, fcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
+#endif
+  noLock = 0;
+#if defined(__APPLE__) 
+  {
+    struct statfs fsInfo;
+    if( fstatfs(fd, &fsInfo) == -1 ){
+      p->lastErrno = errno;
+      if( dirfd>=0 ) close(dirfd); /* silently leak if fail, in error */
+      close(fd); /* silently leak if fail, in error */
+      /* Leave through the cleanup label so pUnused is released */
+      rc = UNQLITE_IOERR;
+      goto open_finished;
+    }
+    if (0 == SyStrncmp("msdos", fsInfo.f_fstypename, 5)) {
+      p->fsFlags |= UNQLITE_FSFLAGS_IS_MSDOS;
+    }
+  }
+#endif
+
+  rc = fillInUnixFile(pVfs, fd, dirfd, pFile, zPath, noLock, isDelete);
+open_finished:
+  if( rc!=UNQLITE_OK ){
+    unqlite_free(p->pUnused);
+    p->pUnused = 0;
+  }
+  return rc;
+}
+/*
+** Delete the file at zPath. If the dirSync argument is true, fsync()
+** the directory after deleting the file (unless the build defines
+** UNQLITE_DISABLE_DIRSYNC).
+**
+** A file that does not exist (ENOENT) is not treated as an error.
+** Returns UNQLITE_IOERR when unlink(), fsync() or close() fails, or the
+** error reported by openDirectory(); UNQLITE_OK otherwise.
+*/
+static int unixDelete(
+  unqlite_vfs *NotUsed,     /* VFS containing this as the xDelete method */
+  const char *zPath,        /* Name of file to be deleted */
+  int dirSync               /* If true, fsync() directory after deleting file */
+){
+  int rc = UNQLITE_OK;
+  SXUNUSED(NotUsed);
+
+  if( unlink(zPath)==(-1) && errno!=ENOENT ){
+    return UNQLITE_IOERR;
+  }
+#ifndef UNQLITE_DISABLE_DIRSYNC
+  if( dirSync ){
+    int dfd;
+    rc = openDirectory(zPath, &dfd);
+    if( rc!=UNQLITE_OK ){
+      return rc;
+    }
+    if( fsync(dfd) ){
+      rc = UNQLITE_IOERR;
+    }
+    /* The directory is always closed; a close failure is only reported
+    ** when nothing has failed before it.
+    */
+    if( close(dfd) && !rc ){
+      rc = UNQLITE_IOERR;
+    }
+  }
+#endif
+  return rc;
+}
+/*
+** Sleep for a little while.  The argument is the number of microseconds
+** we want to sleep.
+**
+** The return value is the number of microseconds of sleep actually
+** requested from the underlying operating system. With usleep()
+** (HAVE_USLEEP) that is the argument itself; otherwise sleep() is used
+** and the request is rounded up to whole seconds, so the returned value
+** may be greater than the argument but never less.
+*/
+static int unixSleep(unqlite_vfs *NotUsed, int microseconds)
+{
+  int nRequested;
+  SXUNUSED(NotUsed);
+#if defined(HAVE_USLEEP) && HAVE_USLEEP
+  usleep(microseconds);
+  nRequested = microseconds;
+#else
+  /* Whole seconds, rounded up */
+  nRequested = (microseconds+999999)/1000000;
+  sleep(nRequested);
+  nRequested = nRequested*1000000;
+#endif
+  return nRequested;
+}
+/*
+ * Export the current system time.
+ * The time is read with time(), broken down with gmtime() and copied
+ * into pOut through STRUCT_TM_TO_SYTM. UNQLITE_OK is always returned;
+ * if gmtime() fails, pOut is left untouched.
+ */
+static int unixCurrentTime(unqlite_vfs *pVfs,Sytm *pOut)
+{
+        time_t now;
+        struct tm *pUtc;
+        SXUNUSED(pVfs);
+        time(&now);
+        pUtc = gmtime(&now);
+        if( pUtc!=0 ){ /* gmtime() can fail */
+                STRUCT_TM_TO_SYTM(pUtc,pOut);
+        }
+        return UNQLITE_OK;
+}
+/*
+** Test the existence of or access permissions of file zPath. The
+** test performed depends on the value of flags:
+**
+**     UNQLITE_ACCESS_EXISTS: Write 1 if the file exists and is not empty.
+**     UNQLITE_ACCESS_READWRITE: Write 1 if the file is read and writable.
+**     UNQLITE_ACCESS_READ: Write 1 if the file is readable.
+**
+** Otherwise 0 is written. The answer goes to *pResOut; the function
+** itself always returns UNQLITE_OK. Any other flags value is tested
+** with an access mode of 0.
+*/
+static int unixAccess(
+  unqlite_vfs *NotUsed,   /* The VFS containing this xAccess method */
+  const char *zPath,      /* Path of the file to examine */
+  int flags,              /* What do we want to learn about the zPath file? */
+  int *pResOut            /* Write result boolean here */
+){
+  int amode = 0;
+  int isOk;
+  SXUNUSED(NotUsed);
+  if( flags==UNQLITE_ACCESS_EXISTS ){
+    amode = F_OK;
+  }else if( flags==UNQLITE_ACCESS_READWRITE ){
+    amode = W_OK|R_OK;
+  }else if( flags==UNQLITE_ACCESS_READ ){
+    amode = R_OK;
+  }
+  isOk = (access(zPath, amode)==0);
+  if( isOk && flags==UNQLITE_ACCESS_EXISTS ){
+    /* An existing but zero-length file is reported as missing */
+    struct stat sInfo;
+    if( 0==stat(zPath, &sInfo) && sInfo.st_size==0 ){
+      isOk = 0;
+    }
+  }
+  *pResOut = isOk;
+  return UNQLITE_OK;
+}
+/*
+** Turn a relative pathname into a full pathname. The relative path
+** is stored as a nul-terminated string in the buffer pointed to by
+** zPath.
+**
+** zOut points to a buffer of nOut bytes into which the full path is
+** written. A path that already starts with '/' is copied as is.
+** Otherwise the current working directory, a '/' and zPath are joined.
+**
+** Returns UNQLITE_OK, or UNQLITE_IOERR if getcwd() fails.
+*/
+static int unixFullPathname(
+  unqlite_vfs *pVfs,            /* Pointer to vfs object */
+  const char *zPath,            /* Possibly relative input path */
+  int nOut,                     /* Size of output buffer in bytes */
+  char *zOut                    /* Output buffer */
+){
+  sxu32 nCwd;
+  if( zPath[0]=='/' ){
+    /* Already absolute */
+    Systrcpy(zOut,(sxu32)nOut,zPath,0);
+    SXUNUSED(pVfs);
+    return UNQLITE_OK;
+  }
+  zOut[nOut-1] = '\0';
+  if( getcwd(zOut, nOut-1)==0 ){
+    return UNQLITE_IOERR;
+  }
+  /* Append "/<zPath>" right after the working directory */
+  nCwd = SyStrlen(zOut);
+  SyBufferFormat(&zOut[nCwd],(sxu32)nOut-nCwd,"/%s",zPath);
+  return UNQLITE_OK;
+}
+/*
+ * Export the Unix Vfs.
+ *
+ * Returns a pointer to a function-local, static, read-only unqlite_vfs
+ * named "Unix" whose callbacks are the unix* routines defined in this
+ * file. The xTmp and xGetLastError entries are left as 0.
+ */
+UNQLITE_PRIVATE const unqlite_vfs * unqliteExportBuiltinVfs(void)
+{
+        static const unqlite_vfs sUnixvfs = {
+                "Unix",              /* Vfs name */
+                1,                   /* Vfs structure version */
+                sizeof(unixFile),    /* szOsFile */
+                MAX_PATHNAME,        /* mxPathName */
+                unixOpen,            /* xOpen */
+                unixDelete,          /* xDelete */
+                unixAccess,          /* xAccess */
+                unixFullPathname,    /* xFullPathname */
+                0,                   /* xTmp */
+                unixSleep,           /* xSleep */
+                unixCurrentTime,     /* xCurrentTime */
+                0,                   /* xGetLastError */
+        };
+        return &sUnixvfs;
+}
+#endif /* __UNIXES__ */
author	Aaron Seigo <aseigo@kde.org>	2014-12-14 12:00:05 +0100
committer	Aaron Seigo <aseigo@kde.org>	2014-12-14 12:00:05 +0100
commit	7cc25005b8c46d1fa783d33def2c6923e8ef8469 (patch)
tree	64fa59d17af29838396cf37b912b3babd885e5dd /common/unqlite/os_unix.c
parent	bfc32f265e8ad72823db960fed371d72596003b7 (diff)
parent	a6ed70495f9f3ecb21c26860dda16aadcdc91c3a (diff)
download	sink-7cc25005b8c46d1fa783d33def2c6923e8ef8469.tar.gz sink-7cc25005b8c46d1fa783d33def2c6923e8ef8469.zip