blob: 804c588e45e1b914a753a60ff2f76959d42213bc [file] [log] [blame]
/*
** 2004 May 22
**
** The author disclaims copyright to this source code. In place of
** a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
******************************************************************************
**
** This file contains the VFS implementation for unix-like operating systems
** include Linux, MacOSX, *BSD, QNX, VxWorks, AIX, HPUX, and others.
**
** There are actually several different VFS implementations in this file.
** The differences are in the way that file locking is done. The default
** implementation uses Posix Advisory Locks. Alternative implementations
** use flock(), dot-files, various proprietary locking schemas, or simply
** skip locking all together.
**
** This source file is organized into divisions where the logic for various
** subfunctions is contained within the appropriate division. PLEASE
** KEEP THE STRUCTURE OF THIS FILE INTACT. New code should be placed
** in the correct division and should be clearly labeled.
**
** The layout of divisions is as follows:
**
** * General-purpose declarations and utility functions.
** * Unique file ID logic used by VxWorks.
** * Various locking primitive implementations (all except proxy locking):
** + for Posix Advisory Locks
** + for no-op locks
** + for dot-file locks
** + for flock() locking
** + for named semaphore locks (VxWorks only)
** + for AFP filesystem locks (MacOSX only)
** * sqlite3_file methods not associated with locking.
** * Definitions of sqlite3_io_methods objects for all locking
** methods plus "finder" functions for each locking method.
** * sqlite3_vfs method implementations.
** * Locking primitives for the proxy uber-locking-method. (MacOSX only)
** * Definitions of sqlite3_vfs objects for all locking methods
** plus implementations of sqlite3_os_init() and sqlite3_os_end().
*/
#include "sqliteInt.h"
#if SQLITE_OS_UNIX /* This file is used on unix only */
/*
** There are various methods for file locking used for concurrency
** control:
**
** 1. POSIX locking (the default),
** 2. No locking,
** 3. Dot-file locking,
** 4. flock() locking,
** 5. AFP locking (OSX only),
** 6. Named POSIX semaphores (VXWorks only),
** 7. proxy locking. (OSX only)
**
** Styles 4, 5, and 7 are only available of SQLITE_ENABLE_LOCKING_STYLE
** is defined to 1. The SQLITE_ENABLE_LOCKING_STYLE also enables automatic
** selection of the appropriate locking style based on the filesystem
** where the database is located.
*/
#if !defined(SQLITE_ENABLE_LOCKING_STYLE)
# if defined(__APPLE__)
# define SQLITE_ENABLE_LOCKING_STYLE 1
# else
# define SQLITE_ENABLE_LOCKING_STYLE 0
# endif
#endif
/*
** Define the OS_VXWORKS pre-processor macro to 1 if building on
** vxworks, or 0 otherwise.
*/
#ifndef OS_VXWORKS
# if defined(__RTP__) || defined(_WRS_KERNEL)
# define OS_VXWORKS 1
# else
# define OS_VXWORKS 0
# endif
#endif
/*
** These #defines should enable >2GB file support on Posix if the
** underlying operating system supports it. If the OS lacks
** large file support, these should be no-ops.
**
** Large file support can be disabled using the -DSQLITE_DISABLE_LFS switch
** on the compiler command line. This is necessary if you are compiling
** on a recent machine (ex: RedHat 7.2) but you want your code to work
** on an older machine (ex: RedHat 6.0). If you compile on RedHat 7.2
** without this option, LFS is enable. But LFS does not exist in the kernel
** in RedHat 6.0, so the code won't work. Hence, for maximum binary
** portability you should omit LFS.
**
** The previous paragraph was written in 2005. (This paragraph is written
** on 2008-11-28.) These days, all Linux kernels support large files, so
** you should probably leave LFS enabled. But some embedded platforms might
** lack LFS in which case the SQLITE_DISABLE_LFS macro might still be useful.
*/
#ifndef SQLITE_DISABLE_LFS
# define _LARGE_FILE 1
# ifndef _FILE_OFFSET_BITS
# define _FILE_OFFSET_BITS 64
# endif
# define _LARGEFILE_SOURCE 1
#endif
/*
** standard include files.
*/
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
#include <errno.h>
#ifndef SQLITE_OMIT_WAL
#include <sys/mman.h>
#endif
#if SQLITE_ENABLE_LOCKING_STYLE
# include <sys/ioctl.h>
# if OS_VXWORKS
# include <semaphore.h>
# include <limits.h>
# else
# include <sys/file.h>
# include <sys/param.h>
# endif
#endif /* SQLITE_ENABLE_LOCKING_STYLE */
#if defined(__APPLE__) || (SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORKS)
# include <sys/mount.h>
#endif
/*
** Allowed values of unixFile.fsFlags
*/
#define SQLITE_FSFLAGS_IS_MSDOS 0x1
/*
** If we are to be thread-safe, include the pthreads header and define
** the SQLITE_UNIX_THREADS macro.
*/
#if SQLITE_THREADSAFE
# include <pthread.h>
# define SQLITE_UNIX_THREADS 1
#endif
/*
** Default permissions when creating a new file
*/
#ifndef SQLITE_DEFAULT_FILE_PERMISSIONS
# define SQLITE_DEFAULT_FILE_PERMISSIONS 0644
#endif
/*
** Default permissions when creating auto proxy dir
*/
#ifndef SQLITE_DEFAULT_PROXYDIR_PERMISSIONS
# define SQLITE_DEFAULT_PROXYDIR_PERMISSIONS 0755
#endif
/*
** Maximum supported path-length.
*/
#define MAX_PATHNAME 512
/*
** Only set the lastErrno if the error code is a real error and not
** a normal expected return code of SQLITE_BUSY or SQLITE_OK
*/
#define IS_LOCK_ERROR(x) ((x != SQLITE_OK) && (x != SQLITE_BUSY))
/* Forward references */
typedef struct unixShm unixShm; /* Connection shared memory */
typedef struct unixShmNode unixShmNode; /* Shared memory instance */
typedef struct unixInodeInfo unixInodeInfo; /* An i-node */
typedef struct UnixUnusedFd UnixUnusedFd; /* An unused file descriptor */
/*
** Sometimes, after a file handle is closed by SQLite, the file descriptor
** cannot be closed immediately. In these cases, instances of the following
** structure are used to store the file descriptor while waiting for an
** opportunity to either close or reuse it.
*/
struct UnixUnusedFd {
int fd; /* File descriptor to close */
int flags; /* Flags this file descriptor was opened with */
UnixUnusedFd *pNext; /* Next unused file descriptor on same file */
};
/*
** The unixFile structure is subclass of sqlite3_file specific to the unix
** VFS implementations.
*/
typedef struct unixFile unixFile;
struct unixFile {
sqlite3_io_methods const *pMethod; /* Always the first entry */
unixInodeInfo *pInode; /* Info about locks on this inode */
int h; /* The file descriptor */
unsigned char eFileLock; /* The type of lock held on this fd */
unsigned char ctrlFlags; /* Behavioral bits. UNIXFILE_* flags */
int lastErrno; /* The unix errno from last I/O error */
void *lockingContext; /* Locking style specific state */
UnixUnusedFd *pUnused; /* Pre-allocated UnixUnusedFd */
const char *zPath; /* Name of the file */
unixShm *pShm; /* Shared memory segment information */
int szChunk; /* Configured by FCNTL_CHUNK_SIZE */
#if SQLITE_ENABLE_LOCKING_STYLE
int openFlags; /* The flags specified at open() */
#endif
#if SQLITE_ENABLE_LOCKING_STYLE || defined(__APPLE__)
unsigned fsFlags; /* cached details from statfs() */
#endif
#if OS_VXWORKS
int isDelete; /* Delete on close if true */
struct vxworksFileId *pId; /* Unique file ID */
#endif
#ifndef NDEBUG
/* The next group of variables are used to track whether or not the
** transaction counter in bytes 24-27 of database files are updated
** whenever any part of the database changes. An assertion fault will
** occur if a file is updated without also updating the transaction
** counter. This test is made to avoid new problems similar to the
** one described by ticket #3584.
*/
unsigned char transCntrChng; /* True if the transaction counter changed */
unsigned char dbUpdate; /* True if any part of database file changed */
unsigned char inNormalWrite; /* True if in a normal write operation */
#endif
#ifdef SQLITE_TEST
/* In test mode, increase the size of this structure a bit so that
** it is larger than the struct CrashFile defined in test6.c.
*/
char aPadding[32];
#endif
};
/*
** Allowed values for the unixFile.ctrlFlags bitmask:
*/
#define UNIXFILE_EXCL 0x01 /* Connections from one process only */
#define UNIXFILE_RDONLY 0x02 /* Connection is read only */
#define UNIXFILE_DIRSYNC 0x04 /* Directory sync needed */
/*
** Include code that is common to all os_*.c files
*/
#include "os_common.h"
/*
** Define various macros that are missing from some systems.
*/
#ifndef O_LARGEFILE
# define O_LARGEFILE 0
#endif
#ifdef SQLITE_DISABLE_LFS
# undef O_LARGEFILE
# define O_LARGEFILE 0
#endif
#ifndef O_NOFOLLOW
# define O_NOFOLLOW 0
#endif
#ifndef O_BINARY
# define O_BINARY 0
#endif
/*
** The threadid macro resolves to the thread-id or to 0. Used for
** testing and debugging only.
*/
#if SQLITE_THREADSAFE
#define threadid pthread_self()
#else
#define threadid 0
#endif
/* Forward reference */
static int openDirectory(const char*, int*);
/*
** Many system calls are accessed through pointer-to-functions so that
** they may be overridden at runtime to facilitate fault injection during
** testing and sandboxing. The following array holds the names and pointers
** to all overrideable system calls.
*/
static struct unix_syscall {
const char *zName; /* Name of the sytem call */
sqlite3_syscall_ptr pCurrent; /* Current value of the system call */
sqlite3_syscall_ptr pDefault; /* Default value */
} aSyscall[] = {
{ "open", (sqlite3_syscall_ptr)open, 0 },
#define osOpen ((int(*)(const char*,int,...))aSyscall[0].pCurrent)
{ "close", (sqlite3_syscall_ptr)close, 0 },
#define osClose ((int(*)(int))aSyscall[1].pCurrent)
{ "access", (sqlite3_syscall_ptr)access, 0 },
#define osAccess ((int(*)(const char*,int))aSyscall[2].pCurrent)
{ "getcwd", (sqlite3_syscall_ptr)getcwd, 0 },
#define osGetcwd ((char*(*)(char*,size_t))aSyscall[3].pCurrent)
{ "stat", (sqlite3_syscall_ptr)stat, 0 },
#define osStat ((int(*)(const char*,struct stat*))aSyscall[4].pCurrent)
/*
** The DJGPP compiler environment looks mostly like Unix, but it
** lacks the fcntl() system call. So redefine fcntl() to be something
** that always succeeds. This means that locking does not occur under
** DJGPP. But it is DOS - what did you expect?
*/
#ifdef __DJGPP__
{ "fstat", 0, 0 },
#define osFstat(a,b,c) 0
#else
{ "fstat", (sqlite3_syscall_ptr)fstat, 0 },
#define osFstat ((int(*)(int,struct stat*))aSyscall[5].pCurrent)
#endif
{ "ftruncate", (sqlite3_syscall_ptr)ftruncate, 0 },
#define osFtruncate ((int(*)(int,off_t))aSyscall[6].pCurrent)
{ "fcntl", (sqlite3_syscall_ptr)fcntl, 0 },
#define osFcntl ((int(*)(int,int,...))aSyscall[7].pCurrent)
{ "read", (sqlite3_syscall_ptr)read, 0 },
#define osRead ((ssize_t(*)(int,void*,size_t))aSyscall[8].pCurrent)
#if defined(USE_PREAD) || defined(SQLITE_ENABLE_LOCKING_STYLE)
{ "pread", (sqlite3_syscall_ptr)pread, 0 },
#else
{ "pread", (sqlite3_syscall_ptr)0, 0 },
#endif
#define osPread ((ssize_t(*)(int,void*,size_t,off_t))aSyscall[9].pCurrent)
#if defined(USE_PREAD64)
{ "pread64", (sqlite3_syscall_ptr)pread64, 0 },
#else
{ "pread64", (sqlite3_syscall_ptr)0, 0 },
#endif
#define osPread64 ((ssize_t(*)(int,void*,size_t,off_t))aSyscall[10].pCurrent)
{ "write", (sqlite3_syscall_ptr)write, 0 },
#define osWrite ((ssize_t(*)(int,const void*,size_t))aSyscall[11].pCurrent)
#if defined(USE_PREAD) || defined(SQLITE_ENABLE_LOCKING_STYLE)
{ "pwrite", (sqlite3_syscall_ptr)pwrite, 0 },
#else
{ "pwrite", (sqlite3_syscall_ptr)0, 0 },
#endif
#define osPwrite ((ssize_t(*)(int,const void*,size_t,off_t))\
aSyscall[12].pCurrent)
#if defined(USE_PREAD64)
{ "pwrite64", (sqlite3_syscall_ptr)pwrite64, 0 },
#else
{ "pwrite64", (sqlite3_syscall_ptr)0, 0 },
#endif
#define osPwrite64 ((ssize_t(*)(int,const void*,size_t,off_t))\
aSyscall[13].pCurrent)
#if SQLITE_ENABLE_LOCKING_STYLE
{ "fchmod", (sqlite3_syscall_ptr)fchmod, 0 },
#else
{ "fchmod", (sqlite3_syscall_ptr)0, 0 },
#endif
#define osFchmod ((int(*)(int,mode_t))aSyscall[14].pCurrent)
#if defined(HAVE_POSIX_FALLOCATE) && HAVE_POSIX_FALLOCATE
{ "fallocate", (sqlite3_syscall_ptr)posix_fallocate, 0 },
#else
{ "fallocate", (sqlite3_syscall_ptr)0, 0 },
#endif
#define osFallocate ((int(*)(int,off_t,off_t))aSyscall[15].pCurrent)
{ "unlink", (sqlite3_syscall_ptr)unlink, 0 },
#define osUnlink ((int(*)(const char*))aSyscall[16].pCurrent)
{ "openDirectory", (sqlite3_syscall_ptr)openDirectory, 0 },
#define osOpenDirectory ((int(*)(const char*,int*))aSyscall[17].pCurrent)
}; /* End of the overrideable system calls */
/*
** This is the xSetSystemCall() method of sqlite3_vfs for all of the
** "unix" VFSes. Return SQLITE_OK opon successfully updating the
** system call pointer, or SQLITE_NOTFOUND if there is no configurable
** system call named zName.
*/
static int unixSetSystemCall(
sqlite3_vfs *pNotUsed, /* The VFS pointer. Not used */
const char *zName, /* Name of system call to override */
sqlite3_syscall_ptr pNewFunc /* Pointer to new system call value */
){
unsigned int i;
int rc = SQLITE_NOTFOUND;
UNUSED_PARAMETER(pNotUsed);
if( zName==0 ){
/* If no zName is given, restore all system calls to their default
** settings and return NULL
*/
rc = SQLITE_OK;
for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
if( aSyscall[i].pDefault ){
aSyscall[i].pCurrent = aSyscall[i].pDefault;
}
}
}else{
/* If zName is specified, operate on only the one system call
** specified.
*/
for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
if( strcmp(zName, aSyscall[i].zName)==0 ){
if( aSyscall[i].pDefault==0 ){
aSyscall[i].pDefault = aSyscall[i].pCurrent;
}
rc = SQLITE_OK;
if( pNewFunc==0 ) pNewFunc = aSyscall[i].pDefault;
aSyscall[i].pCurrent = pNewFunc;
break;
}
}
}
return rc;
}
/*
** Return the value of a system call. Return NULL if zName is not a
** recognized system call name. NULL is also returned if the system call
** is currently undefined.
*/
static sqlite3_syscall_ptr unixGetSystemCall(
sqlite3_vfs *pNotUsed,
const char *zName
){
unsigned int i;
UNUSED_PARAMETER(pNotUsed);
for(i=0; i<sizeof(aSyscall)/sizeof(aSyscall[0]); i++){
if( strcmp(zName, aSyscall[i].zName)==0 ) return aSyscall[i].pCurrent;
}
return 0;
}
/*
** Return the name of the first system call after zName. If zName==NULL
** then return the name of the first system call. Return NULL if zName
** is the last system call or if zName is not the name of a valid
** system call.
*/
static const char *unixNextSystemCall(sqlite3_vfs *p, const char *zName){
int i = -1;
UNUSED_PARAMETER(p);
if( zName ){
for(i=0; i<ArraySize(aSyscall)-1; i++){
if( strcmp(zName, aSyscall[i].zName)==0 ) break;
}
}
for(i++; i<ArraySize(aSyscall); i++){
if( aSyscall[i].pCurrent!=0 ) return aSyscall[i].zName;
}
return 0;
}
/*
** Retry open() calls that fail due to EINTR
*/
static int robust_open(const char *z, int f, int m){
int rc;
do{ rc = osOpen(z,f,m); }while( rc<0 && errno==EINTR );
return rc;
}
/*
** Helper functions to obtain and relinquish the global mutex. The
** global mutex is used to protect the unixInodeInfo and
** vxworksFileId objects used by this file, all of which may be
** shared by multiple threads.
**
** Function unixMutexHeld() is used to assert() that the global mutex
** is held when required. This function is only used as part of assert()
** statements. e.g.
**
** unixEnterMutex()
** assert( unixMutexHeld() );
** unixEnterLeave()
*/
static void unixEnterMutex(void){
sqlite3_mutex_enter(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER));
}
static void unixLeaveMutex(void){
sqlite3_mutex_leave(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER));
}
#ifdef SQLITE_DEBUG
static int unixMutexHeld(void) {
return sqlite3_mutex_held(sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MASTER));
}
#endif
#ifdef SQLITE_DEBUG
/*
** Helper function for printing out trace information from debugging
** binaries. This returns the string represetation of the supplied
** integer lock-type.
*/
static const char *azFileLock(int eFileLock){
switch( eFileLock ){
case NO_LOCK: return "NONE";
case SHARED_LOCK: return "SHARED";
case RESERVED_LOCK: return "RESERVED";
case PENDING_LOCK: return "PENDING";
case EXCLUSIVE_LOCK: return "EXCLUSIVE";
}
return "ERROR";
}
#endif
#ifdef SQLITE_LOCK_TRACE
/*
** Print out information about all locking operations.
**
** This routine is used for troubleshooting locks on multithreaded
** platforms. Enable by compiling with the -DSQLITE_LOCK_TRACE
** command-line option on the compiler. This code is normally
** turned off.
*/
static int lockTrace(int fd, int op, struct flock *p){
char *zOpName, *zType;
int s;
int savedErrno;
if( op==F_GETLK ){
zOpName = "GETLK";
}else if( op==F_SETLK ){
zOpName = "SETLK";
}else{
s = osFcntl(fd, op, p);
sqlite3DebugPrintf("fcntl unknown %d %d %d\n", fd, op, s);
return s;
}
if( p->l_type==F_RDLCK ){
zType = "RDLCK";
}else if( p->l_type==F_WRLCK ){
zType = "WRLCK";
}else if( p->l_type==F_UNLCK ){
zType = "UNLCK";
}else{
assert( 0 );
}
assert( p->l_whence==SEEK_SET );
s = osFcntl(fd, op, p);
savedErrno = errno;
sqlite3DebugPrintf("fcntl %d %d %s %s %d %d %d %d\n",
threadid, fd, zOpName, zType, (int)p->l_start, (int)p->l_len,
(int)p->l_pid, s);
if( s==(-1) && op==F_SETLK && (p->l_type==F_RDLCK || p->l_type==F_WRLCK) ){
struct flock l2;
l2 = *p;
osFcntl(fd, F_GETLK, &l2);
if( l2.l_type==F_RDLCK ){
zType = "RDLCK";
}else if( l2.l_type==F_WRLCK ){
zType = "WRLCK";
}else if( l2.l_type==F_UNLCK ){
zType = "UNLCK";
}else{
assert( 0 );
}
sqlite3DebugPrintf("fcntl-failure-reason: %s %d %d %d\n",
zType, (int)l2.l_start, (int)l2.l_len, (int)l2.l_pid);
}
errno = savedErrno;
return s;
}
#undef osFcntl
#define osFcntl lockTrace
#endif /* SQLITE_LOCK_TRACE */
/*
** Retry ftruncate() calls that fail due to EINTR
*/
static int robust_ftruncate(int h, sqlite3_int64 sz){
int rc;
do{ rc = osFtruncate(h,sz); }while( rc<0 && errno==EINTR );
return rc;
}
/*
** This routine translates a standard POSIX errno code into something
** useful to the clients of the sqlite3 functions. Specifically, it is
** intended to translate a variety of "try again" errors into SQLITE_BUSY
** and a variety of "please close the file descriptor NOW" errors into
** SQLITE_IOERR
**
** Errors during initialization of locks, or file system support for locks,
** should handle ENOLCK, ENOTSUP, EOPNOTSUPP separately.
*/
static int sqliteErrorFromPosixError(int posixError, int sqliteIOErr) {
switch (posixError) {
#if 0
/* At one point this code was not commented out. In theory, this branch
** should never be hit, as this function should only be called after
** a locking-related function (i.e. fcntl()) has returned non-zero with
** the value of errno as the first argument. Since a system call has failed,
** errno should be non-zero.
**
** Despite this, if errno really is zero, we still don't want to return
** SQLITE_OK. The system call failed, and *some* SQLite error should be
** propagated back to the caller. Commenting this branch out means errno==0
** will be handled by the "default:" case below.
*/
case 0:
return SQLITE_OK;
#endif
case EAGAIN:
case ETIMEDOUT:
case EBUSY:
case EINTR:
case ENOLCK:
/* random NFS retry error, unless during file system support
* introspection, in which it actually means what it says */
return SQLITE_BUSY;
case EACCES:
/* EACCES is like EAGAIN during locking operations, but not any other time*/
if( (sqliteIOErr == SQLITE_IOERR_LOCK) ||
(sqliteIOErr == SQLITE_IOERR_UNLOCK) ||
(sqliteIOErr == SQLITE_IOERR_RDLOCK) ||
(sqliteIOErr == SQLITE_IOERR_CHECKRESERVEDLOCK) ){
return SQLITE_BUSY;
}
/* else fall through */
case EPERM:
return SQLITE_PERM;
/* EDEADLK is only possible if a call to fcntl(F_SETLKW) is made. And
** this module never makes such a call. And the code in SQLite itself
** asserts that SQLITE_IOERR_BLOCKED is never returned. For these reasons
** this case is also commented out. If the system does set errno to EDEADLK,
** the default SQLITE_IOERR_XXX code will be returned. */
#if 0
case EDEADLK:
return SQLITE_IOERR_BLOCKED;
#endif
#if EOPNOTSUPP!=ENOTSUP
case EOPNOTSUPP:
/* something went terribly awry, unless during file system support
* introspection, in which it actually means what it says */
#endif
#ifdef ENOTSUP
case ENOTSUP:
/* invalid fd, unless during file system support introspection, in which
* it actually means what it says */
#endif
case EIO:
case EBADF:
case EINVAL:
case ENOTCONN:
case ENODEV:
case ENXIO:
case ENOENT:
case ESTALE:
case ENOSYS:
/* these should force the client to close the file and reconnect */
default:
return sqliteIOErr;
}
}
/******************************************************************************
****************** Begin Unique File ID Utility Used By VxWorks ***************
**
** On most versions of unix, we can get a unique ID for a file by concatenating
** the device number and the inode number. But this does not work on VxWorks.
** On VxWorks, a unique file id must be based on the canonical filename.
**
** A pointer to an instance of the following structure can be used as a
** unique file ID in VxWorks. Each instance of this structure contains
** a copy of the canonical filename. There is also a reference count.
** The structure is reclaimed when the number of pointers to it drops to
** zero.
**
** There are never very many files open at one time and lookups are not
** a performance-critical path, so it is sufficient to put these
** structures on a linked list.
*/
struct vxworksFileId {
struct vxworksFileId *pNext; /* Next in a list of them all */
int nRef; /* Number of references to this one */
int nName; /* Length of the zCanonicalName[] string */
char *zCanonicalName; /* Canonical filename */
};
#if OS_VXWORKS
/*
** All unique filenames are held on a linked list headed by this
** variable:
*/
static struct vxworksFileId *vxworksFileList = 0;
/*
** Simplify a filename into its canonical form
** by making the following changes:
**
** * removing any trailing and duplicate /
** * convert /./ into just /
** * convert /A/../ where A is any simple name into just /
**
** Changes are made in-place. Return the new name length.
**
** The original filename is in z[0..n-1]. Return the number of
** characters in the simplified name.
*/
static int vxworksSimplifyName(char *z, int n){
int i, j;
while( n>1 && z[n-1]=='/' ){ n--; }
for(i=j=0; i<n; i++){
if( z[i]=='/' ){
if( z[i+1]=='/' ) continue;
if( z[i+1]=='.' && i+2<n && z[i+2]=='/' ){
i += 1;
continue;
}
if( z[i+1]=='.' && i+3<n && z[i+2]=='.' && z[i+3]=='/' ){
while( j>0 && z[j-1]!='/' ){ j--; }
if( j>0 ){ j--; }
i += 2;
continue;
}
}
z[j++] = z[i];
}
z[j] = 0;
return j;
}
/*
** Find a unique file ID for the given absolute pathname. Return
** a pointer to the vxworksFileId object. This pointer is the unique
** file ID.
**
** The nRef field of the vxworksFileId object is incremented before
** the object is returned. A new vxworksFileId object is created
** and added to the global list if necessary.
**
** If a memory allocation error occurs, return NULL.
*/
static struct vxworksFileId *vxworksFindFileId(const char *zAbsoluteName){
struct vxworksFileId *pNew; /* search key and new file ID */
struct vxworksFileId *pCandidate; /* For looping over existing file IDs */
int n; /* Length of zAbsoluteName string */
assert( zAbsoluteName[0]=='/' );
n = (int)strlen(zAbsoluteName);
pNew = sqlite3_malloc( sizeof(*pNew) + (n+1) );
if( pNew==0 ) return 0;
pNew->zCanonicalName = (char*)&pNew[1];
memcpy(pNew->zCanonicalName, zAbsoluteName, n+1);
n = vxworksSimplifyName(pNew->zCanonicalName, n);
/* Search for an existing entry that matching the canonical name.
** If found, increment the reference count and return a pointer to
** the existing file ID.
*/
unixEnterMutex();
for(pCandidate=vxworksFileList; pCandidate; pCandidate=pCandidate->pNext){
if( pCandidate->nName==n
&& memcmp(pCandidate->zCanonicalName, pNew->zCanonicalName, n)==0
){
sqlite3_free(pNew);
pCandidate->nRef++;
unixLeaveMutex();
return pCandidate;
}
}
/* No match was found. We will make a new file ID */
pNew->nRef = 1;
pNew->nName = n;
pNew->pNext = vxworksFileList;
vxworksFileList = pNew;
unixLeaveMutex();
return pNew;
}
/*
** Decrement the reference count on a vxworksFileId object. Free
** the object when the reference count reaches zero.
*/
static void vxworksReleaseFileId(struct vxworksFileId *pId){
unixEnterMutex();
assert( pId->nRef>0 );
pId->nRef--;
if( pId->nRef==0 ){
struct vxworksFileId **pp;
for(pp=&vxworksFileList; *pp && *pp!=pId; pp = &((*pp)->pNext)){}
assert( *pp==pId );
*pp = pId->pNext;
sqlite3_free(pId);
}
unixLeaveMutex();
}
#endif /* OS_VXWORKS */
/*************** End of Unique File ID Utility Used By VxWorks ****************
******************************************************************************/
/******************************************************************************
*************************** Posix Advisory Locking ****************************
**
** POSIX advisory locks are broken by design. ANSI STD 1003.1 (1996)
** section 6.5.2.2 lines 483 through 490 specify that when a process
** sets or clears a lock, that operation overrides any prior locks set
** by the same process. It does not explicitly say so, but this implies
** that it overrides locks set by the same process using a different
** file descriptor. Consider this test case:
**
** int fd1 = open("./file1", O_RDWR|O_CREAT, 0644);
** int fd2 = open("./file2", O_RDWR|O_CREAT, 0644);
**
** Suppose ./file1 and ./file2 are really the same file (because
** one is a hard or symbolic link to the other) then if you set
** an exclusive lock on fd1, then try to get an exclusive lock
** on fd2, it works. I would have expected the second lock to
** fail since there was already a lock on the file due to fd1.
** But not so. Since both locks came from the same process, the
** second overrides the first, even though they were on different
** file descriptors opened on different file names.
**
** This means that we cannot use POSIX locks to synchronize file access
** among competing threads of the same process. POSIX locks will work fine
** to synchronize access for threads in separate processes, but not
** threads within the same process.
**
** To work around the problem, SQLite has to manage file locks internally
** on its own. Whenever a new database is opened, we have to find the
** specific inode of the database file (the inode is determined by the
** st_dev and st_ino fields of the stat structure that fstat() fills in)
** and check for locks already existing on that inode. When locks are
** created or removed, we have to look at our own internal record of the
** locks to see if another thread has previously set a lock on that same
** inode.
**
** (Aside: The use of inode numbers as unique IDs does not work on VxWorks.
** For VxWorks, we have to use the alternative unique ID system based on
** canonical filename and implemented in the previous division.)
**
** The sqlite3_file structure for POSIX is no longer just an integer file
** descriptor. It is now a structure that holds the integer file
** descriptor and a pointer to a structure that describes the internal
** locks on the corresponding inode. There is one locking structure
** per inode, so if the same inode is opened twice, both unixFile structures
** point to the same locking structure. The locking structure keeps
** a reference count (so we will know when to delete it) and a "cnt"
** field that tells us its internal lock status. cnt==0 means the
** file is unlocked. cnt==-1 means the file has an exclusive lock.
** cnt>0 means there are cnt shared locks on the file.
**
** Any attempt to lock or unlock a file first checks the locking
** structure. The fcntl() system call is only invoked to set a
** POSIX lock if the internal lock structure transitions between
** a locked and an unlocked state.
**
** But wait: there are yet more problems with POSIX advisory locks.
**
** If you close a file descriptor that points to a file that has locks,
** all locks on that file that are owned by the current process are
** released. To work around this problem, each unixInodeInfo object
** maintains a count of the number of pending locks on tha inode.
** When an attempt is made to close an unixFile, if there are
** other unixFile open on the same inode that are holding locks, the call
** to close() the file descriptor is deferred until all of the locks clear.
** The unixInodeInfo structure keeps a list of file descriptors that need to
** be closed and that list is walked (and cleared) when the last lock
** clears.
**
** Yet another problem: LinuxThreads do not play well with posix locks.
**
** Many older versions of linux use the LinuxThreads library which is
** not posix compliant. Under LinuxThreads, a lock created by thread
** A cannot be modified or overridden by a different thread B.
** Only thread A can modify the lock. Locking behavior is correct
** if the appliation uses the newer Native Posix Thread Library (NPTL)
** on linux - with NPTL a lock created by thread A can override locks
** in thread B. But there is no way to know at compile-time which
** threading library is being used. So there is no way to know at
** compile-time whether or not thread A can override locks on thread B.
** One has to do a run-time check to discover the behavior of the
** current process.
**
** SQLite used to support LinuxThreads. But support for LinuxThreads
** was dropped beginning with version 3.7.0. SQLite will still work with
** LinuxThreads provided that (1) there is no more than one connection
** per database file in the same process and (2) database connections
** do not move across threads.
*/
/*
** An instance of the following structure serves as the key used
** to locate a particular unixInodeInfo object.
*/
struct unixFileId {
dev_t dev; /* Device number */
#if OS_VXWORKS
struct vxworksFileId *pId; /* Unique file ID for vxworks. */
#else
ino_t ino; /* Inode number */
#endif
};
/*
** An instance of the following structure is allocated for each open
** inode. Or, on LinuxThreads, there is one of these structures for
** each inode opened by each thread.
**
** A single inode can have multiple file descriptors, so each unixFile
** structure contains a pointer to an instance of this object and this
** object keeps a count of the number of unixFile pointing to it.
*/
struct unixInodeInfo {
struct unixFileId fileId; /* The lookup key */
int nShared; /* Number of SHARED locks held */
unsigned char eFileLock; /* One of SHARED_LOCK, RESERVED_LOCK etc. */
unsigned char bProcessLock; /* An exclusive process lock is held */
int nRef; /* Number of pointers to this structure */
unixShmNode *pShmNode; /* Shared memory associated with this inode */
int nLock; /* Number of outstanding file locks */
UnixUnusedFd *pUnused; /* Unused file descriptors to close */
unixInodeInfo *pNext; /* List of all unixInodeInfo objects */
unixInodeInfo *pPrev; /* .... doubly linked */
#if defined(SQLITE_ENABLE_LOCKING_STYLE)
unsigned long long sharedByte; /* for AFP simulated shared lock */
#endif
#if OS_VXWORKS
sem_t *pSem; /* Named POSIX semaphore */
char aSemName[MAX_PATHNAME+2]; /* Name of that semaphore */
#endif
};
/*
** A lists of all unixInodeInfo objects.
*/
static unixInodeInfo *inodeList = 0;
/*
**
** This function - unixLogError_x(), is only ever called via the macro
** unixLogError().
**
** It is invoked after an error occurs in an OS function and errno has been
** set. It logs a message using sqlite3_log() containing the current value of
** errno and, if possible, the human-readable equivalent from strerror() or
** strerror_r().
**
** The first argument passed to the macro should be the error code that
** will be returned to SQLite (e.g. SQLITE_IOERR_DELETE, SQLITE_CANTOPEN).
** The two subsequent arguments should be the name of the OS function that
** failed (e.g. "unlink", "open") and the the associated file-system path,
** if any.
*/
#define unixLogError(a,b,c) unixLogErrorAtLine(a,b,c,__LINE__)
static int unixLogErrorAtLine(
int errcode, /* SQLite error code */
const char *zFunc, /* Name of OS function that failed */
const char *zPath, /* File path associated with error */
int iLine /* Source line number where error occurred */
){
char *zErr; /* Message from strerror() or equivalent */
int iErrno = errno; /* Saved syscall error number */
/* If this is not a threadsafe build (SQLITE_THREADSAFE==0), then use
** the strerror() function to obtain the human-readable error message
** equivalent to errno. Otherwise, use strerror_r().
*/
#if SQLITE_THREADSAFE && defined(HAVE_STRERROR_R)
char aErr[80];
memset(aErr, 0, sizeof(aErr));
zErr = aErr;
/* If STRERROR_R_CHAR_P (set by autoconf scripts) or __USE_GNU is defined,
** assume that the system provides the the GNU version of strerror_r() that
** returns a pointer to a buffer containing the error message. That pointer
** may point to aErr[], or it may point to some static storage somewhere.
** Otherwise, assume that the system provides the POSIX version of
** strerror_r(), which always writes an error message into aErr[].
**
** If the code incorrectly assumes that it is the POSIX version that is
** available, the error message will often be an empty string. Not a
** huge problem. Incorrectly concluding that the GNU version is available
** could lead to a segfault though.
*/
#if defined(STRERROR_R_CHAR_P) || defined(__USE_GNU)
zErr =
# endif
strerror_r(iErrno, aErr, sizeof(aErr)-1);
#elif SQLITE_THREADSAFE
/* This is a threadsafe build, but strerror_r() is not available. */
zErr = "";
#else
/* Non-threadsafe build, use strerror(). */
zErr = strerror(iErrno);
#endif
assert( errcode!=SQLITE_OK );
if( zPath==0 ) zPath = "";
sqlite3_log(errcode,
"os_unix.c:%d: (%d) %s(%s) - %s",
iLine, iErrno, zFunc, zPath, zErr
);
return errcode;
}
/*
** Close a file descriptor.
**
** We assume that close() almost always works, since it is only in a
** very sick application or on a very sick platform that it might fail.
** If it does fail, simply leak the file descriptor, but do log the
** error.
**
** Note that it is not safe to retry close() after EINTR since the
** file descriptor might have already been reused by another thread.
** So we don't even try to recover from an EINTR. Just log the error
** and move on.
*/
static void robust_close(unixFile *pFile, int h, int lineno){
if( osClose(h) ){
unixLogErrorAtLine(SQLITE_IOERR_CLOSE, "close",
pFile ? pFile->zPath : 0, lineno);
}
}
/*
** Close all file descriptors accumuated in the unixInodeInfo->pUnused list.
*/
static void closePendingFds(unixFile *pFile){
unixInodeInfo *pInode = pFile->pInode;
UnixUnusedFd *p;
UnixUnusedFd *pNext;
for(p=pInode->pUnused; p; p=pNext){
pNext = p->pNext;
robust_close(pFile, p->fd, __LINE__);
sqlite3_free(p);
}
pInode->pUnused = 0;
}
/*
** Release a unixInodeInfo structure previously allocated by findInodeInfo().
**
** The mutex entered using the unixEnterMutex() function must be held
** when this function is called.
*/
static void releaseInodeInfo(unixFile *pFile){
unixInodeInfo *pInode = pFile->pInode;
assert( unixMutexHeld() );
if( ALWAYS(pInode) ){
pInode->nRef--;
if( pInode->nRef==0 ){
assert( pInode->pShmNode==0 );
closePendingFds(pFile);
if( pInode->pPrev ){
assert( pInode->pPrev->pNext==pInode );
pInode->pPrev->pNext = pInode->pNext;
}else{
assert( inodeList==pInode );
inodeList = pInode->pNext;
}
if( pInode->pNext ){
assert( pInode->pNext->pPrev==pInode );
pInode->pNext->pPrev = pInode->pPrev;
}
sqlite3_free(pInode);
}
}
}
/*
** Given a file descriptor, locate the unixInodeInfo object that
** describes that file descriptor. Create a new one if necessary. The
** return value might be uninitialized if an error occurs.
**
** The mutex entered using the unixEnterMutex() function must be held
** when this function is called.
**
** Return an appropriate error code.
*/
static int findInodeInfo(
unixFile *pFile, /* Unix file with file desc used in the key */
unixInodeInfo **ppInode /* Return the unixInodeInfo object here */
){
int rc; /* System call return code */
int fd; /* The file descriptor for pFile */
struct unixFileId fileId; /* Lookup key for the unixInodeInfo */
struct stat statbuf; /* Low-level file information */
unixInodeInfo *pInode = 0; /* Candidate unixInodeInfo object */
assert( unixMutexHeld() );
/* Get low-level information about the file that we can used to
** create a unique name for the file.
*/
fd = pFile->h;
rc = osFstat(fd, &statbuf);
if( rc!=0 ){
pFile->lastErrno = errno;
#ifdef EOVERFLOW
if( pFile->lastErrno==EOVERFLOW ) return SQLITE_NOLFS;
#endif
return SQLITE_IOERR;
}
#ifdef __APPLE__
/* On OS X on an msdos filesystem, the inode number is reported
** incorrectly for zero-size files. See ticket #3260. To work
** around this problem (we consider it a bug in OS X, not SQLite)
** we always increase the file size to 1 by writing a single byte
** prior to accessing the inode number. The one byte written is
** an ASCII 'S' character which also happens to be the first byte
** in the header of every SQLite database. In this way, if there
** is a race condition such that another thread has already populated
** the first page of the database, no damage is done.
*/
if( statbuf.st_size==0 && (pFile->fsFlags & SQLITE_FSFLAGS_IS_MSDOS)!=0 ){
do{ rc = osWrite(fd, "S", 1); }while( rc<0 && errno==EINTR );
if( rc!=1 ){
pFile->lastErrno = errno;
return SQLITE_IOERR;
}
rc = osFstat(fd, &statbuf);
if( rc!=0 ){
pFile->lastErrno = errno;
return SQLITE_IOERR;
}
}
#endif
memset(&fileId, 0, sizeof(fileId));
fileId.dev = statbuf.st_dev;
#if OS_VXWORKS
fileId.pId = pFile->pId;
#else
fileId.ino = statbuf.st_ino;
#endif
pInode = inodeList;
while( pInode && memcmp(&fileId, &pInode->fileId, sizeof(fileId)) ){
pInode = pInode->pNext;
}
if( pInode==0 ){
pInode = sqlite3_malloc( sizeof(*pInode) );
if( pInode==0 ){
return SQLITE_NOMEM;
}
memset(pInode, 0, sizeof(*pInode));
memcpy(&pInode->fileId, &fileId, sizeof(fileId));
pInode->nRef = 1;
pInode->pNext = inodeList;
pInode->pPrev = 0;
if( inodeList ) inodeList->pPrev = pInode;
inodeList = pInode;
}else{
pInode->nRef++;
}
*ppInode = pInode;
return SQLITE_OK;
}
/*
** This routine checks if there is a RESERVED lock held on the specified
** file by this or any other process. If such a lock is held, set *pResOut
** to a non-zero value otherwise *pResOut is set to zero. The return value
** is set to SQLITE_OK unless an I/O error occurs during lock checking.
*/
static int unixCheckReservedLock(sqlite3_file *id, int *pResOut){
int rc = SQLITE_OK;
int reserved = 0;
unixFile *pFile = (unixFile*)id;
SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
assert( pFile );
unixEnterMutex(); /* Because pFile->pInode is shared across threads */
/* Check if a thread in this process holds such a lock */
if( pFile->pInode->eFileLock>SHARED_LOCK ){
reserved = 1;
}
/* Otherwise see if some other process holds it.
*/
#ifndef __DJGPP__
if( !reserved && !pFile->pInode->bProcessLock ){
struct flock lock;
lock.l_whence = SEEK_SET;
lock.l_start = RESERVED_BYTE;
lock.l_len = 1;
lock.l_type = F_WRLCK;
if( osFcntl(pFile->h, F_GETLK, &lock) ){
rc = SQLITE_IOERR_CHECKRESERVEDLOCK;
pFile->lastErrno = errno;
} else if( lock.l_type!=F_UNLCK ){
reserved = 1;
}
}
#endif
unixLeaveMutex();
OSTRACE(("TEST WR-LOCK %d %d %d (unix)\n", pFile->h, rc, reserved));
*pResOut = reserved;
return rc;
}
/*
** Attempt to set a system-lock on the file pFile. The lock is
** described by pLock.
**
** If the pFile was opened read/write from unix-excl, then the only lock
** ever obtained is an exclusive lock, and it is obtained exactly once
** the first time any lock is attempted. All subsequent system locking
** operations become no-ops. Locking operations still happen internally,
** in order to coordinate access between separate database connections
** within this process, but all of that is handled in memory and the
** operating system does not participate.
**
** This function is a pass-through to fcntl(F_SETLK) if pFile is using
** any VFS other than "unix-excl" or if pFile is opened on "unix-excl"
** and is read-only.
**
** Zero is returned if the call completes successfully, or -1 if a call
** to fcntl() fails. In this case, errno is set appropriately (by fcntl()).
*/
static int unixFileLock(unixFile *pFile, struct flock *pLock){
int rc;
unixInodeInfo *pInode = pFile->pInode;
assert( unixMutexHeld() );
assert( pInode!=0 );
if( ((pFile->ctrlFlags & UNIXFILE_EXCL)!=0 || pInode->bProcessLock)
&& ((pFile->ctrlFlags & UNIXFILE_RDONLY)==0)
){
if( pInode->bProcessLock==0 ){
struct flock lock;
assert( pInode->nLock==0 );
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST;
lock.l_len = SHARED_SIZE;
lock.l_type = F_WRLCK;
rc = osFcntl(pFile->h, F_SETLK, &lock);
if( rc<0 ) return rc;
pInode->bProcessLock = 1;
pInode->nLock++;
}else{
rc = 0;
}
}else{
rc = osFcntl(pFile->h, F_SETLK, pLock);
}
return rc;
}
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
** (1) SHARED_LOCK
** (2) RESERVED_LOCK
** (3) PENDING_LOCK
** (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between. The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal. The following chart shows the allowed
** transitions and the inserted intermediate states:
**
** UNLOCKED -> SHARED
** SHARED -> RESERVED
** SHARED -> (PENDING) -> EXCLUSIVE
** RESERVED -> (PENDING) -> EXCLUSIVE
** PENDING -> EXCLUSIVE
**
** This routine will only increase a lock. Use the sqlite3OsUnlock()
** routine to lower a locking level.
*/
static int unixLock(sqlite3_file *id, int eFileLock){
/* The following describes the implementation of the various locks and
** lock transitions in terms of the POSIX advisory shared and exclusive
** lock primitives (called read-locks and write-locks below, to avoid
** confusion with SQLite lock names). The algorithms are complicated
** slightly in order to be compatible with windows systems simultaneously
** accessing the same database file, in case that is ever required.
**
** Symbols defined in os.h indentify the 'pending byte' and the 'reserved
** byte', each single bytes at well known offsets, and the 'shared byte
** range', a range of 510 bytes at a well known offset.
**
** To obtain a SHARED lock, a read-lock is obtained on the 'pending
** byte'. If this is successful, a random byte from the 'shared byte
** range' is read-locked and the lock on the 'pending byte' released.
**
** A process may only obtain a RESERVED lock after it has a SHARED lock.
** A RESERVED lock is implemented by grabbing a write-lock on the
** 'reserved byte'.
**
** A process may only obtain a PENDING lock after it has obtained a
** SHARED lock. A PENDING lock is implemented by obtaining a write-lock
** on the 'pending byte'. This ensures that no new SHARED locks can be
** obtained, but existing SHARED locks are allowed to persist. A process
** does not have to obtain a RESERVED lock on the way to a PENDING lock.
** This property is used by the algorithm for rolling back a journal file
** after a crash.
**
** An EXCLUSIVE lock, obtained after a PENDING lock is held, is
** implemented by obtaining a write-lock on the entire 'shared byte
** range'. Since all other locks require a read-lock on one of the bytes
** within this range, this ensures that no other locks are held on the
** database.
**
** The reason a single byte cannot be used instead of the 'shared byte
** range' is that some versions of windows do not support read-locks. By
** locking a random byte from a range, concurrent SHARED locks may exist
** even if the locking primitive used is always a write-lock.
*/
int rc = SQLITE_OK;
unixFile *pFile = (unixFile*)id;
unixInodeInfo *pInode = pFile->pInode;
struct flock lock;
int tErrno = 0;
assert( pFile );
OSTRACE(("LOCK %d %s was %s(%s,%d) pid=%d (unix)\n", pFile->h,
azFileLock(eFileLock), azFileLock(pFile->eFileLock),
azFileLock(pInode->eFileLock), pInode->nShared , getpid()));
/* If there is already a lock of this type or more restrictive on the
** unixFile, do nothing. Don't use the end_lock: exit path, as
** unixEnterMutex() hasn't been called yet.
*/
if( pFile->eFileLock>=eFileLock ){
OSTRACE(("LOCK %d %s ok (already held) (unix)\n", pFile->h,
azFileLock(eFileLock)));
return SQLITE_OK;
}
/* Make sure the locking sequence is correct.
** (1) We never move from unlocked to anything higher than shared lock.
** (2) SQLite never explicitly requests a pendig lock.
** (3) A shared lock is always held when a reserve lock is requested.
*/
assert( pFile->eFileLock!=NO_LOCK || eFileLock==SHARED_LOCK );
assert( eFileLock!=PENDING_LOCK );
assert( eFileLock!=RESERVED_LOCK || pFile->eFileLock==SHARED_LOCK );
/* This mutex is needed because pFile->pInode is shared across threads
*/
unixEnterMutex();
pInode = pFile->pInode;
/* If some thread using this PID has a lock via a different unixFile*
** handle that precludes the requested lock, return BUSY.
*/
if( (pFile->eFileLock!=pInode->eFileLock &&
(pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
){
rc = SQLITE_BUSY;
goto end_lock;
}
/* If a SHARED lock is requested, and some thread using this PID already
** has a SHARED or RESERVED lock, then increment reference counts and
** return SQLITE_OK.
*/
if( eFileLock==SHARED_LOCK &&
(pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
assert( eFileLock==SHARED_LOCK );
assert( pFile->eFileLock==0 );
assert( pInode->nShared>0 );
pFile->eFileLock = SHARED_LOCK;
pInode->nShared++;
pInode->nLock++;
goto end_lock;
}
/* A PENDING lock is needed before acquiring a SHARED lock and before
** acquiring an EXCLUSIVE lock. For the SHARED lock, the PENDING will
** be released.
*/
lock.l_len = 1L;
lock.l_whence = SEEK_SET;
if( eFileLock==SHARED_LOCK
|| (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
){
lock.l_type = (eFileLock==SHARED_LOCK?F_RDLCK:F_WRLCK);
lock.l_start = PENDING_BYTE;
if( unixFileLock(pFile, &lock) ){
tErrno = errno;
rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
if( rc!=SQLITE_BUSY ){
pFile->lastErrno = tErrno;
}
goto end_lock;
}
}
/* If control gets to this point, then actually go ahead and make
** operating system calls for the specified lock.
*/
if( eFileLock==SHARED_LOCK ){
assert( pInode->nShared==0 );
assert( pInode->eFileLock==0 );
assert( rc==SQLITE_OK );
/* Now get the read-lock */
lock.l_start = SHARED_FIRST;
lock.l_len = SHARED_SIZE;
if( unixFileLock(pFile, &lock) ){
tErrno = errno;
rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
}
/* Drop the temporary PENDING lock */
lock.l_start = PENDING_BYTE;
lock.l_len = 1L;
lock.l_type = F_UNLCK;
if( unixFileLock(pFile, &lock) && rc==SQLITE_OK ){
/* This could happen with a network mount */
tErrno = errno;
rc = SQLITE_IOERR_UNLOCK;
}
if( rc ){
if( rc!=SQLITE_BUSY ){
pFile->lastErrno = tErrno;
}
goto end_lock;
}else{
pFile->eFileLock = SHARED_LOCK;
pInode->nLock++;
pInode->nShared = 1;
}
}else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
/* We are trying for an exclusive lock but another thread in this
** same process is still holding a shared lock. */
rc = SQLITE_BUSY;
}else{
/* The request was for a RESERVED or EXCLUSIVE lock. It is
** assumed that there is a SHARED or greater lock on the file
** already.
*/
assert( 0!=pFile->eFileLock );
lock.l_type = F_WRLCK;
assert( eFileLock==RESERVED_LOCK || eFileLock==EXCLUSIVE_LOCK );
if( eFileLock==RESERVED_LOCK ){
lock.l_start = RESERVED_BYTE;
lock.l_len = 1L;
}else{
lock.l_start = SHARED_FIRST;
lock.l_len = SHARED_SIZE;
}
if( unixFileLock(pFile, &lock) ){
tErrno = errno;
rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
if( rc!=SQLITE_BUSY ){
pFile->lastErrno = tErrno;
}
}
}
#ifndef NDEBUG
/* Set up the transaction-counter change checking flags when
** transitioning from a SHARED to a RESERVED lock. The change
** from SHARED to RESERVED marks the beginning of a normal
** write operation (not a hot journal rollback).
*/
if( rc==SQLITE_OK
&& pFile->eFileLock<=SHARED_LOCK
&& eFileLock==RESERVED_LOCK
){
pFile->transCntrChng = 0;
pFile->dbUpdate = 0;
pFile->inNormalWrite = 1;
}
#endif
if( rc==SQLITE_OK ){
pFile->eFileLock = eFileLock;
pInode->eFileLock = eFileLock;
}else if( eFileLock==EXCLUSIVE_LOCK ){
pFile->eFileLock = PENDING_LOCK;
pInode->eFileLock = PENDING_LOCK;
}
end_lock:
unixLeaveMutex();
OSTRACE(("LOCK %d %s %s (unix)\n", pFile->h, azFileLock(eFileLock),
rc==SQLITE_OK ? "ok" : "failed"));
return rc;
}
/*
** Add the file descriptor used by file handle pFile to the corresponding
** pUnused list.
*/
static void setPendingFd(unixFile *pFile){
unixInodeInfo *pInode = pFile->pInode;
UnixUnusedFd *p = pFile->pUnused;
p->pNext = pInode->pUnused;
pInode->pUnused = p;
pFile->h = -1;
pFile->pUnused = 0;
}
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
**
** If handleNFSUnlock is true, then on downgrading an EXCLUSIVE_LOCK to SHARED
** the byte range is divided into 2 parts and the first part is unlocked then
** set to a read lock, then the other part is simply unlocked. This works
** around a bug in BSD NFS lockd (also seen on MacOSX 10.3+) that fails to
** remove the write lock on a region when a read lock is set.
*/
static int posixUnlock(sqlite3_file *id, int eFileLock, int handleNFSUnlock){
unixFile *pFile = (unixFile*)id;
unixInodeInfo *pInode;
struct flock lock;
int rc = SQLITE_OK;
int h;
assert( pFile );
OSTRACE(("UNLOCK %d %d was %d(%d,%d) pid=%d (unix)\n", pFile->h, eFileLock,
pFile->eFileLock, pFile->pInode->eFileLock, pFile->pInode->nShared,
getpid()));
assert( eFileLock<=SHARED_LOCK );
if( pFile->eFileLock<=eFileLock ){
return SQLITE_OK;
}
unixEnterMutex();
h = pFile->h;
pInode = pFile->pInode;
assert( pInode->nShared!=0 );
if( pFile->eFileLock>SHARED_LOCK ){
assert( pInode->eFileLock==pFile->eFileLock );
SimulateIOErrorBenign(1);
SimulateIOError( h=(-1) )
SimulateIOErrorBenign(0);
#ifndef NDEBUG
/* When reducing a lock such that other processes can start
** reading the database file again, make sure that the
** transaction counter was updated if any part of the database
** file changed. If the transaction counter is not updated,
** other connections to the same file might not realize that
** the file has changed and hence might not know to flush their
** cache. The use of a stale cache can lead to database corruption.
*/
#if 0
assert( pFile->inNormalWrite==0
|| pFile->dbUpdate==0
|| pFile->transCntrChng==1 );
#endif
pFile->inNormalWrite = 0;
#endif
/* downgrading to a shared lock on NFS involves clearing the write lock
** before establishing the readlock - to avoid a race condition we downgrade
** the lock in 2 blocks, so that part of the range will be covered by a
** write lock until the rest is covered by a read lock:
** 1: [WWWWW]
** 2: [....W]
** 3: [RRRRW]
** 4: [RRRR.]
*/
if( eFileLock==SHARED_LOCK ){
#if !defined(__APPLE__) || !SQLITE_ENABLE_LOCKING_STYLE
(void)handleNFSUnlock;
assert( handleNFSUnlock==0 );
#endif
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
if( handleNFSUnlock ){
int tErrno; /* Error code from system call errors */
off_t divSize = SHARED_SIZE - 1;
lock.l_type = F_UNLCK;
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST;
lock.l_len = divSize;
if( unixFileLock(pFile, &lock)==(-1) ){
tErrno = errno;
rc = SQLITE_IOERR_UNLOCK;
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_unlock;
}
lock.l_type = F_RDLCK;
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST;
lock.l_len = divSize;
if( unixFileLock(pFile, &lock)==(-1) ){
tErrno = errno;
rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_RDLOCK);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_unlock;
}
lock.l_type = F_UNLCK;
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST+divSize;
lock.l_len = SHARED_SIZE-divSize;
if( unixFileLock(pFile, &lock)==(-1) ){
tErrno = errno;
rc = SQLITE_IOERR_UNLOCK;
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
goto end_unlock;
}
}else
#endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
{
lock.l_type = F_RDLCK;
lock.l_whence = SEEK_SET;
lock.l_start = SHARED_FIRST;
lock.l_len = SHARED_SIZE;
if( unixFileLock(pFile, &lock) ){
/* In theory, the call to unixFileLock() cannot fail because another
** process is holding an incompatible lock. If it does, this
** indicates that the other process is not following the locking
** protocol. If this happens, return SQLITE_IOERR_RDLOCK. Returning
** SQLITE_BUSY would confuse the upper layer (in practice it causes
** an assert to fail). */
rc = SQLITE_IOERR_RDLOCK;
pFile->lastErrno = errno;
goto end_unlock;
}
}
}
lock.l_type = F_UNLCK;
lock.l_whence = SEEK_SET;
lock.l_start = PENDING_BYTE;
lock.l_len = 2L; assert( PENDING_BYTE+1==RESERVED_BYTE );
if( unixFileLock(pFile, &lock)==0 ){
pInode->eFileLock = SHARED_LOCK;
}else{
rc = SQLITE_IOERR_UNLOCK;
pFile->lastErrno = errno;
goto end_unlock;
}
}
if( eFileLock==NO_LOCK ){
/* Decrement the shared lock counter. Release the lock using an
** OS call only when all threads in this same process have released
** the lock.
*/
pInode->nShared--;
if( pInode->nShared==0 ){
lock.l_type = F_UNLCK;
lock.l_whence = SEEK_SET;
lock.l_start = lock.l_len = 0L;
SimulateIOErrorBenign(1);
SimulateIOError( h=(-1) )
SimulateIOErrorBenign(0);
if( unixFileLock(pFile, &lock)==0 ){
pInode->eFileLock = NO_LOCK;
}else{
rc = SQLITE_IOERR_UNLOCK;
pFile->lastErrno = errno;
pInode->eFileLock = NO_LOCK;
pFile->eFileLock = NO_LOCK;
}
}
/* Decrement the count of locks against this same file. When the
** count reaches zero, close any other file descriptors whose close
** was deferred because of outstanding locks.
*/
pInode->nLock--;
assert( pInode->nLock>=0 );
if( pInode->nLock==0 ){
closePendingFds(pFile);
}
}
end_unlock:
unixLeaveMutex();
if( rc==SQLITE_OK ) pFile->eFileLock = eFileLock;
return rc;
}
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
*/
static int unixUnlock(sqlite3_file *id, int eFileLock){
return posixUnlock(id, eFileLock, 0);
}
/*
** This function performs the parts of the "close file" operation
** common to all locking schemes. It closes the directory and file
** handles, if they are valid, and sets all fields of the unixFile
** structure to 0.
**
** It is *not* necessary to hold the mutex when this routine is called,
** even on VxWorks. A mutex will be acquired on VxWorks by the
** vxworksReleaseFileId() routine.
*/
static int closeUnixFile(sqlite3_file *id){
unixFile *pFile = (unixFile*)id;
if( pFile->h>=0 ){
robust_close(pFile, pFile->h, __LINE__);
pFile->h = -1;
}
#if OS_VXWORKS
if( pFile->pId ){
if( pFile->isDelete ){
osUnlink(pFile->pId->zCanonicalName);
}
vxworksReleaseFileId(pFile->pId);
pFile->pId = 0;
}
#endif
OSTRACE(("CLOSE %-3d\n", pFile->h));
OpenCounter(-1);
sqlite3_free(pFile->pUnused);
memset(pFile, 0, sizeof(unixFile));
return SQLITE_OK;
}
/*
** Close a file.
*/
static int unixClose(sqlite3_file *id){
int rc = SQLITE_OK;
unixFile *pFile = (unixFile *)id;
unixUnlock(id, NO_LOCK);
unixEnterMutex();
/* unixFile.pInode is always valid here. Otherwise, a different close
** routine (e.g. nolockClose()) would be called instead.
*/
assert( pFile->pInode->nLock>0 || pFile->pInode->bProcessLock==0 );
if( ALWAYS(pFile->pInode) && pFile->pInode->nLock ){
/* If there are outstanding locks, do not actually close the file just
** yet because that would clear those locks. Instead, add the file
** descriptor to pInode->pUnused list. It will be automatically closed
** when the last lock is cleared.
*/
setPendingFd(pFile);
}
releaseInodeInfo(pFile);
rc = closeUnixFile(id);
unixLeaveMutex();
return rc;
}
/************** End of the posix advisory lock implementation *****************
******************************************************************************/
/******************************************************************************
****************************** No-op Locking **********************************
**
** Of the various locking implementations available, this is by far the
** simplest: locking is ignored. No attempt is made to lock the database
** file for reading or writing.
**
** This locking mode is appropriate for use on read-only databases
** (ex: databases that are burned into CD-ROM, for example.) It can
** also be used if the application employs some external mechanism to
** prevent simultaneous access of the same database by two or more
** database connections. But there is a serious risk of database
** corruption if this locking mode is used in situations where multiple
** database connections are accessing the same database file at the same
** time and one or more of those connections are writing.
*/
static int nolockCheckReservedLock(sqlite3_file *NotUsed, int *pResOut){
UNUSED_PARAMETER(NotUsed);
*pResOut = 0;
return SQLITE_OK;
}
static int nolockLock(sqlite3_file *NotUsed, int NotUsed2){
UNUSED_PARAMETER2(NotUsed, NotUsed2);
return SQLITE_OK;
}
static int nolockUnlock(sqlite3_file *NotUsed, int NotUsed2){
UNUSED_PARAMETER2(NotUsed, NotUsed2);
return SQLITE_OK;
}
/*
** Close the file.
*/
static int nolockClose(sqlite3_file *id) {
return closeUnixFile(id);
}
/******************* End of the no-op lock implementation *********************
******************************************************************************/
/******************************************************************************
************************* Begin dot-file Locking ******************************
**
** The dotfile locking implementation uses the existance of separate lock
** files in order to control access to the database. This works on just
** about every filesystem imaginable. But there are serious downsides:
**
** (1) There is zero concurrency. A single reader blocks all other
** connections from reading or writing the database.
**
** (2) An application crash or power loss can leave stale lock files
** sitting around that need to be cleared manually.
**
** Nevertheless, a dotlock is an appropriate locking mode for use if no
** other locking strategy is available.
**
** Dotfile locking works by creating a file in the same directory as the
** database and with the same name but with a ".lock" extension added.
** The existance of a lock file implies an EXCLUSIVE lock. All other lock
** types (SHARED, RESERVED, PENDING) are mapped into EXCLUSIVE.
*/
/*
** The file suffix added to the data base filename in order to create the
** lock file.
*/
#define DOTLOCK_SUFFIX ".lock"
/*
** This routine checks if there is a RESERVED lock held on the specified
** file by this or any other process. If such a lock is held, set *pResOut
** to a non-zero value otherwise *pResOut is set to zero. The return value
** is set to SQLITE_OK unless an I/O error occurs during lock checking.
**
** In dotfile locking, either a lock exists or it does not. So in this
** variation of CheckReservedLock(), *pResOut is set to true if any lock
** is held on the file and false if the file is unlocked.
*/
static int dotlockCheckReservedLock(sqlite3_file *id, int *pResOut) {
int rc = SQLITE_OK;
int reserved = 0;
unixFile *pFile = (unixFile*)id;
SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
assert( pFile );
/* Check if a thread in this process holds such a lock */
if( pFile->eFileLock>SHARED_LOCK ){
/* Either this connection or some other connection in the same process
** holds a lock on the file. No need to check further. */
reserved = 1;
}else{
/* The lock is held if and only if the lockfile exists */
const char *zLockFile = (const char*)pFile->lockingContext;
reserved = osAccess(zLockFile, 0)==0;
}
OSTRACE(("TEST WR-LOCK %d %d %d (dotlock)\n", pFile->h, rc, reserved));
*pResOut = reserved;
return rc;
}
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
** (1) SHARED_LOCK
** (2) RESERVED_LOCK
** (3) PENDING_LOCK
** (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between. The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal. The following chart shows the allowed
** transitions and the inserted intermediate states:
**
** UNLOCKED -> SHARED
** SHARED -> RESERVED
** SHARED -> (PENDING) -> EXCLUSIVE
** RESERVED -> (PENDING) -> EXCLUSIVE
** PENDING -> EXCLUSIVE
**
** This routine will only increase a lock. Use the sqlite3OsUnlock()
** routine to lower a locking level.
**
** With dotfile locking, we really only support state (4): EXCLUSIVE.
** But we track the other locking levels internally.
*/
static int dotlockLock(sqlite3_file *id, int eFileLock) {
unixFile *pFile = (unixFile*)id;
int fd;
char *zLockFile = (char *)pFile->lockingContext;
int rc = SQLITE_OK;
/* If we have any lock, then the lock file already exists. All we have
** to do is adjust our internal record of the lock level.
*/
if( pFile->eFileLock > NO_LOCK ){
pFile->eFileLock = eFileLock;
#if !OS_VXWORKS
/* Always update the timestamp on the old file */
utimes(zLockFile, NULL);
#endif
return SQLITE_OK;
}
/* grab an exclusive lock */
fd = robust_open(zLockFile,O_RDONLY|O_CREAT|O_EXCL,0600);
if( fd<0 ){
/* failed to open/create the file, someone else may have stolen the lock */
int tErrno = errno;
if( EEXIST == tErrno ){
rc = SQLITE_BUSY;
} else {
rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
}
return rc;
}
robust_close(pFile, fd, __LINE__);
/* got it, set the type and return ok */
pFile->eFileLock = eFileLock;
return rc;
}
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
**
** When the locking level reaches NO_LOCK, delete the lock file.
*/
static int dotlockUnlock(sqlite3_file *id, int eFileLock) {
unixFile *pFile = (unixFile*)id;
char *zLockFile = (char *)pFile->lockingContext;
assert( pFile );
OSTRACE(("UNLOCK %d %d was %d pid=%d (dotlock)\n", pFile->h, eFileLock,
pFile->eFileLock, getpid()));
assert( eFileLock<=SHARED_LOCK );
/* no-op if possible */
if( pFile->eFileLock==eFileLock ){
return SQLITE_OK;
}
/* To downgrade to shared, simply update our internal notion of the
** lock state. No need to mess with the file on disk.
*/
if( eFileLock==SHARED_LOCK ){
pFile->eFileLock = SHARED_LOCK;
return SQLITE_OK;
}
/* To fully unlock the database, delete the lock file */
assert( eFileLock==NO_LOCK );
if( osUnlink(zLockFile) ){
int rc = 0;
int tErrno = errno;
if( ENOENT != tErrno ){
rc = SQLITE_IOERR_UNLOCK;
}
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
return rc;
}
pFile->eFileLock = NO_LOCK;
return SQLITE_OK;
}
/*
** Close a file. Make sure the lock has been released before closing.
*/
static int dotlockClose(sqlite3_file *id) {
int rc;
if( id ){
unixFile *pFile = (unixFile*)id;
dotlockUnlock(id, NO_LOCK);
sqlite3_free(pFile->lockingContext);
}
rc = closeUnixFile(id);
return rc;
}
/****************** End of the dot-file lock implementation *******************
******************************************************************************/
/******************************************************************************
************************** Begin flock Locking ********************************
**
** Use the flock() system call to do file locking.
**
** flock() locking is like dot-file locking in that the various
** fine-grain locking levels supported by SQLite are collapsed into
** a single exclusive lock. In other words, SHARED, RESERVED, and
** PENDING locks are the same thing as an EXCLUSIVE lock. SQLite
** still works when you do this, but concurrency is reduced since
** only a single process can be reading the database at a time.
**
** Omit this section if SQLITE_ENABLE_LOCKING_STYLE is turned off or if
** compiling for VXWORKS.
*/
#if SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORKS
/*
** Retry flock() calls that fail with EINTR
*/
#ifdef EINTR
static int robust_flock(int fd, int op){
int rc;
do{ rc = flock(fd,op); }while( rc<0 && errno==EINTR );
return rc;
}
#else
# define robust_flock(a,b) flock(a,b)
#endif
/*
** This routine checks if there is a RESERVED lock held on the specified
** file by this or any other process. If such a lock is held, set *pResOut
** to a non-zero value otherwise *pResOut is set to zero. The return value
** is set to SQLITE_OK unless an I/O error occurs during lock checking.
*/
static int flockCheckReservedLock(sqlite3_file *id, int *pResOut){
int rc = SQLITE_OK;
int reserved = 0;
unixFile *pFile = (unixFile*)id;
SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
assert( pFile );
/* Check if a thread in this process holds such a lock */
if( pFile->eFileLock>SHARED_LOCK ){
reserved = 1;
}
/* Otherwise see if some other process holds it. */
if( !reserved ){
/* attempt to get the lock */
int lrc = robust_flock(pFile->h, LOCK_EX | LOCK_NB);
if( !lrc ){
/* got the lock, unlock it */
lrc = robust_flock(pFile->h, LOCK_UN);
if ( lrc ) {
int tErrno = errno;
/* unlock failed with an error */
lrc = SQLITE_IOERR_UNLOCK;
if( IS_LOCK_ERROR(lrc) ){
pFile->lastErrno = tErrno;
rc = lrc;
}
}
} else {
int tErrno = errno;
reserved = 1;
/* someone else might have it reserved */
lrc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
if( IS_LOCK_ERROR(lrc) ){
pFile->lastErrno = tErrno;
rc = lrc;
}
}
}
OSTRACE(("TEST WR-LOCK %d %d %d (flock)\n", pFile->h, rc, reserved));
#ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
if( (rc & SQLITE_IOERR) == SQLITE_IOERR ){
rc = SQLITE_OK;
reserved=1;
}
#endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
*pResOut = reserved;
return rc;
}
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
** (1) SHARED_LOCK
** (2) RESERVED_LOCK
** (3) PENDING_LOCK
** (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between. The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal. The following chart shows the allowed
** transitions and the inserted intermediate states:
**
** UNLOCKED -> SHARED
** SHARED -> RESERVED
** SHARED -> (PENDING) -> EXCLUSIVE
** RESERVED -> (PENDING) -> EXCLUSIVE
** PENDING -> EXCLUSIVE
**
** flock() only really support EXCLUSIVE locks. We track intermediate
** lock states in the sqlite3_file structure, but all locks SHARED or
** above are really EXCLUSIVE locks and exclude all other processes from
** access the file.
**
** This routine will only increase a lock. Use the sqlite3OsUnlock()
** routine to lower a locking level.
*/
static int flockLock(sqlite3_file *id, int eFileLock) {
int rc = SQLITE_OK;
unixFile *pFile = (unixFile*)id;
assert( pFile );
/* if we already have a lock, it is exclusive.
** Just adjust level and punt on outta here. */
if (pFile->eFileLock > NO_LOCK) {
pFile->eFileLock = eFileLock;
return SQLITE_OK;
}
/* grab an exclusive lock */
if (robust_flock(pFile->h, LOCK_EX | LOCK_NB)) {
int tErrno = errno;
/* didn't get, must be busy */
rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_LOCK);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
} else {
/* got it, set the type and return ok */
pFile->eFileLock = eFileLock;
}
OSTRACE(("LOCK %d %s %s (flock)\n", pFile->h, azFileLock(eFileLock),
rc==SQLITE_OK ? "ok" : "failed"));
#ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
if( (rc & SQLITE_IOERR) == SQLITE_IOERR ){
rc = SQLITE_BUSY;
}
#endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
return rc;
}
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
*/
static int flockUnlock(sqlite3_file *id, int eFileLock) {
unixFile *pFile = (unixFile*)id;
assert( pFile );
OSTRACE(("UNLOCK %d %d was %d pid=%d (flock)\n", pFile->h, eFileLock,
pFile->eFileLock, getpid()));
assert( eFileLock<=SHARED_LOCK );
/* no-op if possible */
if( pFile->eFileLock==eFileLock ){
return SQLITE_OK;
}
/* shared can just be set because we always have an exclusive */
if (eFileLock==SHARED_LOCK) {
pFile->eFileLock = eFileLock;
return SQLITE_OK;
}
/* no, really, unlock. */
if( robust_flock(pFile->h, LOCK_UN) ){
#ifdef SQLITE_IGNORE_FLOCK_LOCK_ERRORS
return SQLITE_OK;
#endif /* SQLITE_IGNORE_FLOCK_LOCK_ERRORS */
return SQLITE_IOERR_UNLOCK;
}else{
pFile->eFileLock = NO_LOCK;
return SQLITE_OK;
}
}
/*
** Close a file.
*/
static int flockClose(sqlite3_file *id) {
if( id ){
flockUnlock(id, NO_LOCK);
}
return closeUnixFile(id);
}
#endif /* SQLITE_ENABLE_LOCKING_STYLE && !OS_VXWORK */
/******************* End of the flock lock implementation *********************
******************************************************************************/
/******************************************************************************
************************ Begin Named Semaphore Locking ************************
**
** Named semaphore locking is only supported on VxWorks.
**
** Semaphore locking is like dot-lock and flock in that it really only
** supports EXCLUSIVE locking. Only a single process can read or write
** the database file at a time. This reduces potential concurrency, but
** makes the lock implementation much easier.
*/
#if OS_VXWORKS
/*
** This routine checks if there is a RESERVED lock held on the specified
** file by this or any other process. If such a lock is held, set *pResOut
** to a non-zero value otherwise *pResOut is set to zero. The return value
** is set to SQLITE_OK unless an I/O error occurs during lock checking.
*/
static int semCheckReservedLock(sqlite3_file *id, int *pResOut) {
int rc = SQLITE_OK;
int reserved = 0;
unixFile *pFile = (unixFile*)id;
SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
assert( pFile );
/* Check if a thread in this process holds such a lock */
if( pFile->eFileLock>SHARED_LOCK ){
reserved = 1;
}
/* Otherwise see if some other process holds it. */
if( !reserved ){
sem_t *pSem = pFile->pInode->pSem;
struct stat statBuf;
if( sem_trywait(pSem)==-1 ){
int tErrno = errno;
if( EAGAIN != tErrno ){
rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_CHECKRESERVEDLOCK);
pFile->lastErrno = tErrno;
} else {
/* someone else has the lock when we are in NO_LOCK */
reserved = (pFile->eFileLock < SHARED_LOCK);
}
}else{
/* we could have it if we want it */
sem_post(pSem);
}
}
OSTRACE(("TEST WR-LOCK %d %d %d (sem)\n", pFile->h, rc, reserved));
*pResOut = reserved;
return rc;
}
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
** (1) SHARED_LOCK
** (2) RESERVED_LOCK
** (3) PENDING_LOCK
** (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between. The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal. The following chart shows the allowed
** transitions and the inserted intermediate states:
**
** UNLOCKED -> SHARED
** SHARED -> RESERVED
** SHARED -> (PENDING) -> EXCLUSIVE
** RESERVED -> (PENDING) -> EXCLUSIVE
** PENDING -> EXCLUSIVE
**
** Semaphore locks only really support EXCLUSIVE locks. We track intermediate
** lock states in the sqlite3_file structure, but all locks SHARED or
** above are really EXCLUSIVE locks and exclude all other processes from
** access the file.
**
** This routine will only increase a lock. Use the sqlite3OsUnlock()
** routine to lower a locking level.
*/
static int semLock(sqlite3_file *id, int eFileLock) {
unixFile *pFile = (unixFile*)id;
int fd;
sem_t *pSem = pFile->pInode->pSem;
int rc = SQLITE_OK;
/* if we already have a lock, it is exclusive.
** Just adjust level and punt on outta here. */
if (pFile->eFileLock > NO_LOCK) {
pFile->eFileLock = eFileLock;
rc = SQLITE_OK;
goto sem_end_lock;
}
/* lock semaphore now but bail out when already locked. */
if( sem_trywait(pSem)==-1 ){
rc = SQLITE_BUSY;
goto sem_end_lock;
}
/* got it, set the type and return ok */
pFile->eFileLock = eFileLock;
sem_end_lock:
return rc;
}
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
*/
static int semUnlock(sqlite3_file *id, int eFileLock) {
unixFile *pFile = (unixFile*)id;
sem_t *pSem = pFile->pInode->pSem;
assert( pFile );
assert( pSem );
OSTRACE(("UNLOCK %d %d was %d pid=%d (sem)\n", pFile->h, eFileLock,
pFile->eFileLock, getpid()));
assert( eFileLock<=SHARED_LOCK );
/* no-op if possible */
if( pFile->eFileLock==eFileLock ){
return SQLITE_OK;
}
/* shared can just be set because we always have an exclusive */
if (eFileLock==SHARED_LOCK) {
pFile->eFileLock = eFileLock;
return SQLITE_OK;
}
/* no, really unlock. */
if ( sem_post(pSem)==-1 ) {
int rc, tErrno = errno;
rc = sqliteErrorFromPosixError(tErrno, SQLITE_IOERR_UNLOCK);
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
return rc;
}
pFile->eFileLock = NO_LOCK;
return SQLITE_OK;
}
/*
** Close a file.
*/
static int semClose(sqlite3_file *id) {
if( id ){
unixFile *pFile = (unixFile*)id;
semUnlock(id, NO_LOCK);
assert( pFile );
unixEnterMutex();
releaseInodeInfo(pFile);
unixLeaveMutex();
closeUnixFile(id);
}
return SQLITE_OK;
}
#endif /* OS_VXWORKS */
/*
** Named semaphore locking is only available on VxWorks.
**
*************** End of the named semaphore lock implementation ****************
******************************************************************************/
/******************************************************************************
*************************** Begin AFP Locking *********************************
**
** AFP is the Apple Filing Protocol. AFP is a network filesystem found
** on Apple Macintosh computers - both OS9 and OSX.
**
** Third-party implementations of AFP are available. But this code here
** only works on OSX.
*/
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/*
** The afpLockingContext structure contains all afp lock specific state
*/
typedef struct afpLockingContext afpLockingContext;
struct afpLockingContext {
int reserved;
const char *dbPath; /* Name of the open file */
};
struct ByteRangeLockPB2
{
unsigned long long offset; /* offset to first byte to lock */
unsigned long long length; /* nbr of bytes to lock */
unsigned long long retRangeStart; /* nbr of 1st byte locked if successful */
unsigned char unLockFlag; /* 1 = unlock, 0 = lock */
unsigned char startEndFlag; /* 1=rel to end of fork, 0=rel to start */
int fd; /* file desc to assoc this lock with */
};
#define afpfsByteRangeLock2FSCTL _IOWR('z', 23, struct ByteRangeLockPB2)
/*
** This is a utility for setting or clearing a bit-range lock on an
** AFP filesystem.
**
** Return SQLITE_OK on success, SQLITE_BUSY on failure.
*/
static int afpSetLock(
const char *path, /* Name of the file to be locked or unlocked */
unixFile *pFile, /* Open file descriptor on path */
unsigned long long offset, /* First byte to be locked */
unsigned long long length, /* Number of bytes to lock */
int setLockFlag /* True to set lock. False to clear lock */
){
struct ByteRangeLockPB2 pb;
int err;
pb.unLockFlag = setLockFlag ? 0 : 1;
pb.startEndFlag = 0;
pb.offset = offset;
pb.length = length;
pb.fd = pFile->h;
OSTRACE(("AFPSETLOCK [%s] for %d%s in range %llx:%llx\n",
(setLockFlag?"ON":"OFF"), pFile->h, (pb.fd==-1?"[testval-1]":""),
offset, length));
err = fsctl(path, afpfsByteRangeLock2FSCTL, &pb, 0);
if ( err==-1 ) {
int rc;
int tErrno = errno;
OSTRACE(("AFPSETLOCK failed to fsctl() '%s' %d %s\n",
path, tErrno, strerror(tErrno)));
#ifdef SQLITE_IGNORE_AFP_LOCK_ERRORS
rc = SQLITE_BUSY;
#else
rc = sqliteErrorFromPosixError(tErrno,
setLockFlag ? SQLITE_IOERR_LOCK : SQLITE_IOERR_UNLOCK);
#endif /* SQLITE_IGNORE_AFP_LOCK_ERRORS */
if( IS_LOCK_ERROR(rc) ){
pFile->lastErrno = tErrno;
}
return rc;
} else {
return SQLITE_OK;
}
}
/*
** This routine checks if there is a RESERVED lock held on the specified
** file by this or any other process. If such a lock is held, set *pResOut
** to a non-zero value otherwise *pResOut is set to zero. The return value
** is set to SQLITE_OK unless an I/O error occurs during lock checking.
*/
static int afpCheckReservedLock(sqlite3_file *id, int *pResOut){
int rc = SQLITE_OK;
int reserved = 0;
unixFile *pFile = (unixFile*)id;
SimulateIOError( return SQLITE_IOERR_CHECKRESERVEDLOCK; );
assert( pFile );
afpLockingContext *context = (afpLockingContext *) pFile->lockingContext;
if( context->reserved ){
*pResOut = 1;
return SQLITE_OK;
}
unixEnterMutex(); /* Because pFile->pInode is shared across threads */
/* Check if a thread in this process holds such a lock */
if( pFile->pInode->eFileLock>SHARED_LOCK ){
reserved = 1;
}
/* Otherwise see if some other process holds it.
*/
if( !reserved ){
/* lock the RESERVED byte */
int lrc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1,1);
if( SQLITE_OK==lrc ){
/* if we succeeded in taking the reserved lock, unlock it to restore
** the original state */
lrc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1, 0);
} else {
/* if we failed to get the lock then someone else must have it */
reserved = 1;
}
if( IS_LOCK_ERROR(lrc) ){
rc=lrc;
}
}
unixLeaveMutex();
OSTRACE(("TEST WR-LOCK %d %d %d (afp)\n", pFile->h, rc, reserved));
*pResOut = reserved;
return rc;
}
/*
** Lock the file with the lock specified by parameter eFileLock - one
** of the following:
**
** (1) SHARED_LOCK
** (2) RESERVED_LOCK
** (3) PENDING_LOCK
** (4) EXCLUSIVE_LOCK
**
** Sometimes when requesting one lock state, additional lock states
** are inserted in between. The locking might fail on one of the later
** transitions leaving the lock state different from what it started but
** still short of its goal. The following chart shows the allowed
** transitions and the inserted intermediate states:
**
** UNLOCKED -> SHARED
** SHARED -> RESERVED
** SHARED -> (PENDING) -> EXCLUSIVE
** RESERVED -> (PENDING) -> EXCLUSIVE
** PENDING -> EXCLUSIVE
**
** This routine will only increase a lock. Use the sqlite3OsUnlock()
** routine to lower a locking level.
*/
static int afpLock(sqlite3_file *id, int eFileLock){
int rc = SQLITE_OK;
unixFile *pFile = (unixFile*)id;
unixInodeInfo *pInode = pFile->pInode;
afpLockingContext *context = (afpLockingContext *) pFile->lockingContext;
assert( pFile );
OSTRACE(("LOCK %d %s was %s(%s,%d) pid=%d (afp)\n", pFile->h,
azFileLock(eFileLock), azFileLock(pFile->eFileLock),
azFileLock(pInode->eFileLock), pInode->nShared , getpid()));
/* If there is already a lock of this type or more restrictive on the
** unixFile, do nothing. Don't use the afp_end_lock: exit path, as
** unixEnterMutex() hasn't been called yet.
*/
if( pFile->eFileLock>=eFileLock ){
OSTRACE(("LOCK %d %s ok (already held) (afp)\n", pFile->h,
azFileLock(eFileLock)));
return SQLITE_OK;
}
/* Make sure the locking sequence is correct
** (1) We never move from unlocked to anything higher than shared lock.
** (2) SQLite never explicitly requests a pendig lock.
** (3) A shared lock is always held when a reserve lock is requested.
*/
assert( pFile->eFileLock!=NO_LOCK || eFileLock==SHARED_LOCK );
assert( eFileLock!=PENDING_LOCK );
assert( eFileLock!=RESERVED_LOCK || pFile->eFileLock==SHARED_LOCK );
/* This mutex is needed because pFile->pInode is shared across threads
*/
unixEnterMutex();
pInode = pFile->pInode;
/* If some thread using this PID has a lock via a different unixFile*
** handle that precludes the requested lock, return BUSY.
*/
if( (pFile->eFileLock!=pInode->eFileLock &&
(pInode->eFileLock>=PENDING_LOCK || eFileLock>SHARED_LOCK))
){
rc = SQLITE_BUSY;
goto afp_end_lock;
}
/* If a SHARED lock is requested, and some thread using this PID already
** has a SHARED or RESERVED lock, then increment reference counts and
** return SQLITE_OK.
*/
if( eFileLock==SHARED_LOCK &&
(pInode->eFileLock==SHARED_LOCK || pInode->eFileLock==RESERVED_LOCK) ){
assert( eFileLock==SHARED_LOCK );
assert( pFile->eFileLock==0 );
assert( pInode->nShared>0 );
pFile->eFileLock = SHARED_LOCK;
pInode->nShared++;
pInode->nLock++;
goto afp_end_lock;
}
/* A PENDING lock is needed before acquiring a SHARED lock and before
** acquiring an EXCLUSIVE lock. For the SHARED lock, the PENDING will
** be released.
*/
if( eFileLock==SHARED_LOCK
|| (eFileLock==EXCLUSIVE_LOCK && pFile->eFileLock<PENDING_LOCK)
){
int failed;
failed = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 1);
if (failed) {
rc = failed;
goto afp_end_lock;
}
}
/* If control gets to this point, then actually go ahead and make
** operating system calls for the specified lock.
*/
if( eFileLock==SHARED_LOCK ){
int lrc1, lrc2, lrc1Errno;
long lk, mask;
assert( pInode->nShared==0 );
assert( pInode->eFileLock==0 );
mask = (sizeof(long)==8) ? LARGEST_INT64 : 0x7fffffff;
/* Now get the read-lock SHARED_LOCK */
/* note that the quality of the randomness doesn't matter that much */
lk = random();
pInode->sharedByte = (lk & mask)%(SHARED_SIZE - 1);
lrc1 = afpSetLock(context->dbPath, pFile,
SHARED_FIRST+pInode->sharedByte, 1, 1);
if( IS_LOCK_ERROR(lrc1) ){
lrc1Errno = pFile->lastErrno;
}
/* Drop the temporary PENDING lock */
lrc2 = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 0);
if( IS_LOCK_ERROR(lrc1) ) {
pFile->lastErrno = lrc1Errno;
rc = lrc1;
goto afp_end_lock;
} else if( IS_LOCK_ERROR(lrc2) ){
rc = lrc2;
goto afp_end_lock;
} else if( lrc1 != SQLITE_OK ) {
rc = lrc1;
} else {
pFile->eFileLock = SHARED_LOCK;
pInode->nLock++;
pInode->nShared = 1;
}
}else if( eFileLock==EXCLUSIVE_LOCK && pInode->nShared>1 ){
/* We are trying for an exclusive lock but another thread in this
** same process is still holding a shared lock. */
rc = SQLITE_BUSY;
}else{
/* The request was for a RESERVED or EXCLUSIVE lock. It is
** assumed that there is a SHARED or greater lock on the file
** already.
*/
int failed = 0;
assert( 0!=pFile->eFileLock );
if (eFileLock >= RESERVED_LOCK && pFile->eFileLock < RESERVED_LOCK) {
/* Acquire a RESERVED lock */
failed = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1,1);
if( !failed ){
context->reserved = 1;
}
}
if (!failed && eFileLock == EXCLUSIVE_LOCK) {
/* Acquire an EXCLUSIVE lock */
/* Remove the shared lock before trying the range. we'll need to
** reestablish the shared lock if we can't get the afpUnlock
*/
if( !(failed = afpSetLock(context->dbPath, pFile, SHARED_FIRST +
pInode->sharedByte, 1, 0)) ){
int failed2 = SQLITE_OK;
/* now attemmpt to get the exclusive lock range */
failed = afpSetLock(context->dbPath, pFile, SHARED_FIRST,
SHARED_SIZE, 1);
if( failed && (failed2 = afpSetLock(context->dbPath, pFile,
SHARED_FIRST + pInode->sharedByte, 1, 1)) ){
/* Can't reestablish the shared lock. Sqlite can't deal, this is
** a critical I/O error
*/
rc = ((failed & SQLITE_IOERR) == SQLITE_IOERR) ? failed2 :
SQLITE_IOERR_LOCK;
goto afp_end_lock;
}
}else{
rc = failed;
}
}
if( failed ){
rc = failed;
}
}
if( rc==SQLITE_OK ){
pFile->eFileLock = eFileLock;
pInode->eFileLock = eFileLock;
}else if( eFileLock==EXCLUSIVE_LOCK ){
pFile->eFileLock = PENDING_LOCK;
pInode->eFileLock = PENDING_LOCK;
}
afp_end_lock:
unixLeaveMutex();
OSTRACE(("LOCK %d %s %s (afp)\n", pFile->h, azFileLock(eFileLock),
rc==SQLITE_OK ? "ok" : "failed"));
return rc;
}
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
*/
static int afpUnlock(sqlite3_file *id, int eFileLock) {
int rc = SQLITE_OK;
unixFile *pFile = (unixFile*)id;
unixInodeInfo *pInode;
afpLockingContext *context = (afpLockingContext *) pFile->lockingContext;
int skipShared = 0;
#ifdef SQLITE_TEST
int h = pFile->h;
#endif
assert( pFile );
OSTRACE(("UNLOCK %d %d was %d(%d,%d) pid=%d (afp)\n", pFile->h, eFileLock,
pFile->eFileLock, pFile->pInode->eFileLock, pFile->pInode->nShared,
getpid()));
assert( eFileLock<=SHARED_LOCK );
if( pFile->eFileLock<=eFileLock ){
return SQLITE_OK;
}
unixEnterMutex();
pInode = pFile->pInode;
assert( pInode->nShared!=0 );
if( pFile->eFileLock>SHARED_LOCK ){
assert( pInode->eFileLock==pFile->eFileLock );
SimulateIOErrorBenign(1);
SimulateIOError( h=(-1) )
SimulateIOErrorBenign(0);
#ifndef NDEBUG
/* When reducing a lock such that other processes can start
** reading the database file again, make sure that the
** transaction counter was updated if any part of the database
** file changed. If the transaction counter is not updated,
** other connections to the same file might not realize that
** the file has changed and hence might not know to flush their
** cache. The use of a stale cache can lead to database corruption.
*/
assert( pFile->inNormalWrite==0
|| pFile->dbUpdate==0
|| pFile->transCntrChng==1 );
pFile->inNormalWrite = 0;
#endif
if( pFile->eFileLock==EXCLUSIVE_LOCK ){
rc = afpSetLock(context->dbPath, pFile, SHARED_FIRST, SHARED_SIZE, 0);
if( rc==SQLITE_OK && (eFileLock==SHARED_LOCK || pInode->nShared>1) ){
/* only re-establish the shared lock if necessary */
int sharedLockByte = SHARED_FIRST+pInode->sharedByte;
rc = afpSetLock(context->dbPath, pFile, sharedLockByte, 1, 1);
} else {
skipShared = 1;
}
}
if( rc==SQLITE_OK && pFile->eFileLock>=PENDING_LOCK ){
rc = afpSetLock(context->dbPath, pFile, PENDING_BYTE, 1, 0);
}
if( rc==SQLITE_OK && pFile->eFileLock>=RESERVED_LOCK && context->reserved ){
rc = afpSetLock(context->dbPath, pFile, RESERVED_BYTE, 1, 0);
if( !rc ){
context->reserved = 0;
}
}
if( rc==SQLITE_OK && (eFileLock==SHARED_LOCK || pInode->nShared>1)){
pInode->eFileLock = SHARED_LOCK;
}
}
if( rc==SQLITE_OK && eFileLock==NO_LOCK ){
/* Decrement the shared lock counter. Release the lock using an
** OS call only when all threads in this same process have released
** the lock.
*/
unsigned long long sharedLockByte = SHARED_FIRST+pInode->sharedByte;
pInode->nShared--;
if( pInode->nShared==0 ){
SimulateIOErrorBenign(1);
SimulateIOError( h=(-1) )
SimulateIOErrorBenign(0);
if( !skipShared ){
rc = afpSetLock(context->dbPath, pFile, sharedLockByte, 1, 0);
}
if( !rc ){
pInode->eFileLock = NO_LOCK;
pFile->eFileLock = NO_LOCK;
}
}
if( rc==SQLITE_OK ){
pInode->nLock--;
assert( pInode->nLock>=0 );
if( pInode->nLock==0 ){
closePendingFds(pFile);
}
}
}
unixLeaveMutex();
if( rc==SQLITE_OK ) pFile->eFileLock = eFileLock;
return rc;
}
/*
** Close a file & cleanup AFP specific locking context
*/
static int afpClose(sqlite3_file *id) {
int rc = SQLITE_OK;
if( id ){
unixFile *pFile = (unixFile*)id;
afpUnlock(id, NO_LOCK);
unixEnterMutex();
if( pFile->pInode && pFile->pInode->nLock ){
/* If there are outstanding locks, do not actually close the file just
** yet because that would clear those locks. Instead, add the file
** descriptor to pInode->aPending. It will be automatically closed when
** the last lock is cleared.
*/
setPendingFd(pFile);
}
releaseInodeInfo(pFile);
sqlite3_free(pFile->lockingContext);
rc = closeUnixFile(id);
unixLeaveMutex();
}
return rc;
}
#endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
/*
** The code above is the AFP lock implementation. The code is specific
** to MacOSX and does not work on other unix platforms. No alternative
** is available. If you don't compile for a mac, then the "unix-afp"
** VFS is not available.
**
********************* End of the AFP lock implementation **********************
******************************************************************************/
/******************************************************************************
*************************** Begin NFS Locking ********************************/
#if defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE
/*
** Lower the locking level on file descriptor pFile to eFileLock. eFileLock
** must be either NO_LOCK or SHARED_LOCK.
**
** If the locking level of the file descriptor is already at or below
** the requested locking level, this routine is a no-op.
*/
static int nfsUnlock(sqlite3_file *id, int eFileLock){
return posixUnlock(id, eFileLock, 1);
}
#endif /* defined(__APPLE__) && SQLITE_ENABLE_LOCKING_STYLE */
/*
** The code above is the NFS lock implementation. The code is specific
** to MacOSX and does not work on other unix platforms. No alternative
** is available.
**
********************* End of the NFS lock implementation **********************
******************************************************************************/
/******************************************************************************
**************** Non-locking sqlite3_file methods *****************************
**
** The next division contains implementations for all methods of the
** sqlite3_file object other than the locking methods. The locking
** methods were defined in divisions above (one locking method per
** division). Those methods that are common to all locking modes
** are gather together into this division.
*/
/*
** Seek to the offset passed as the second argument, then read cnt
** bytes into pBuf. Return the number of bytes actually read.
**
** NB: If you define USE_PREAD or USE_PREAD64, then it might also
** be necessary to define _XOPEN_SOURCE to be 500. This varies from
** one system to another. Since SQLite does not define USE_PREAD
** any any form by default, we will not attempt to define _XOPEN_SOURCE.
** See tickets #2741 and #2681.
**
** To avoid stomping the errno value on a failed read the lastErrno value
** is set before returning.
*/
static int seekAndRead(unixFile *id, sqlite3_int64 offset, void *pBuf, int cnt){
int got;
#if (!defined(USE_PREAD) && !defined(USE_PREAD64))
i64 newOffset;
#endif
TIMER_START;
#if defined(USE_PREAD)
do{ got = osPread(id->h, pBuf, cnt, offset); }while( got<0 && errno==EINTR );
SimulateIOError( got = -1 );
#elif defined(USE_PREAD64)
do{ got = osPread64(id->h, pBuf, cnt, offset); }while( got<0 && errno==EINTR);
SimulateIOError( got = -1 );
#else
newOffset = lseek(id->h, offset, SEEK_SET);
SimulateIOError( newOffset-- );
if( newOffset!=offset ){
if( newOffset == -1 ){
((unixFile*)id)->lastErrno = errno;
}else{
((unixFile*)id)->lastErrno = 0;
}
return -1;
}
do{ got = osRead(id->h, pBuf, cnt); }while( got<0 && errno==EINTR );
#endif
TIMER_END;
if( got<0 ){
((unixFile*)id)->lastErrno = errno;
}
OSTRACE(("READ %-3d %5d %7lld %llu\n", id->h, got, offset, TIMER_ELAPSED));
return got;
}
/*
** Read data from a file into a buffer. Return SQLITE_OK if all
** bytes were read successfully and SQLITE_IOERR if anything goes
** wrong.
*/
static int unixRead(
sqlite3_file *id,
void *pBuf,
int amt,
sqlite3_int64 offset
){
unixFile *pFile = (unixFile *)id;
int got;
assert( id );
/* If this is a database file (not a journal, master-journal or temp
** file), the bytes in the locking range should never be read or written. */
#if 0
assert( pFile->pUnused==0
|| offset>=PENDING_BYTE+512
|| offset+amt<=PENDING_BYTE
);
#endif
got = seekAndRead(pFile, offset, pBuf, amt);
if( got==amt ){
return SQLITE_OK;
}else if( got<0 ){
/* lastErrno set by seekAndRead */
return SQLITE_IOERR_READ;
}else{
pFile->lastErrno = 0; /* not a system error */
/* Unread parts of the buffer must be zero-filled */
memset(&((char*)pBuf)[got], 0, amt-got);
return SQLITE_IOERR_SHORT_READ;
}
}
/*
** Seek to the offset in id->offset then read cnt bytes into pBuf.
** Return the number of bytes actually read. Update the offset.
**
** To avoid stomping the errno value on a failed write the lastErrno value
** is set before returning.
*/
static int seekAndWrite(unixFile *id, i64 offset, const void *pBuf, int cnt){
int got;
#if (!defined(USE_PREAD) && !defined(USE_PREAD64))
i64 newOffset;
#endif
TIMER_START;
#if defined(USE_PREAD)
do{ got = osPwrite(id->h, pBuf, cnt, offset); }while( got<0 && errno==EINTR );
#elif defined(USE_PREAD64)
do{ got = osPwrite64(id->h, pBuf, cnt, offset);}while( got<0 && errno==EINTR);
#else
newOffset = lseek(id->h, offset, SEEK_SET);
SimulateIOError( newOffset-- );
if( newOffset!=offset ){
if( newOffset == -1 ){
((unixFile*)id)->lastErrno = errno;
}else{
((unixFile*)id)->lastErrno = 0;
}
return -1;
}
do{ got = osWrite(id->h, pBuf, cnt); }while( got<0 && errno==EINTR );
#endif
TIMER_END;
if( got<0 ){
((unixFile*)id)->lastErrno = errno;
}
OSTRACE(("WRITE %-3d %5d %7lld %llu\n", id->h, got, offset, TIMER_ELAPSED));
return got;
}
/*
** Write data from a buffer into a file. Return SQLITE_OK on success
** or some other error code on failure.
*/
static int unixWrite(
sqlite3_file *id,
const void *pBuf,
int amt,
sqlite3_int64 offset
){
unixFile *pFile = (unixFile*)id;
int wrote = 0;
assert( id );
assert( amt>0 );
/* If this is a database file (not a journal, master-journal or temp
** file), the bytes in the locking range should never be read or written. */
#if 0
assert( pFile->pUnused==0
|| offset>=PENDING_BYTE+512
|| offset+amt<=PENDING_BYTE
);
#endif
#ifndef NDEBUG
/* If we are doing a normal write to a database file (as opposed to
** doing a hot-journal rollback or a write to some file other than a
** normal database file) then record the fact that the database
** has changed. If the transaction counter is modified, record that
** fact too.
*/
if( pFile->inNormalWrite ){
pFile->dbUpdate = 1; /* The database has been modified */
if( offset<=24 && offset+amt>=27 ){
int rc;
char oldCntr[4];
SimulateIOErrorBenign(1);
rc = seekAndRead(pFile, 24, oldCntr, 4);
SimulateIOErrorBenign(0);
if( rc!=4 || memcmp(oldCntr, &((char*)pBuf)[24-offset], 4)!=0 ){
pFile->transCntrChng = 1; /* The transaction counter has changed */
}
}
}
#endif
while( amt>0 && (wrote = seekAndWrite(pFile, offset, pBuf, amt))>0 ){
amt -= wrote;
offset += wrote;
pBuf = &((char*)pBuf)[wrote];
}
SimulateIOError(( wrote=(-1), amt=1 ));
SimulateDiskfullError(( wrote=0, amt=1 ));
if( amt>0 ){
if( wrote<0 ){
/* lastErrno set by seekAndWrite */
return SQLITE_IOERR_WRITE;
}else{
pFile->lastErrno = 0; /* not a system error */
return SQLITE_FULL;
}
}
return SQLITE_OK;
}
#ifdef SQLITE_TEST
/*
** Count the number of fullsyncs and normal syncs. This is used to test
** that syncs and fullsyncs are occurring at the right times.
*/
int sqlite3_sync_count = 0;
int sqlite3_fullsync_count = 0;
#endif
/*
** We do not trust systems to provide a working fdatasync(). Some do.
** Others do no. To be safe, we will stick with the (slower) fsync().
** If you know that your system does support fdatasync() correctly,
** then simply compile with -Dfdatasync=fdatasync
*/
#if !defined(fdatasync) && !defined(__linux__)
# define fdatasync fsync
#endif
/*
** Define HAVE_FULLFSYNC to 0 or 1 depending on whether or not
** the F_FULLFSYNC macro is defined. F_FULLFSYNC is currently
** only available on Mac OS X. But that could change.
*/
#ifdef F_FULLFSYNC
# define HAVE_FULLFSYNC 1
#else
# define HAVE_FULLFSYNC 0
#endif
/*
** The fsync() system call does not work as advertised on many
** unix systems. The following procedure is an attempt to make
** it work better.
**
** The SQLITE_NO_SYNC macro disables all fsync()s. This is useful
** for testing when we want to run through the test suite quickly.
** You are strongly advised *not* to deploy with SQLITE_NO_SYNC
** enabled, however, since with SQLITE_NO_SYNC enabled, an OS crash
** or power failure will likely corrupt the database file.
**
** SQLite sets the dataOnly flag if the size of the file is unchanged.
** The idea behind dataOnly is that it should only write the file content
** to disk, not the inode. We only set dataOnly if the file size is
** unchanged since the file size is part of the inode. However,
** Ted Ts'o tells us that fdatasync() will also write the inode if the
** file size has changed. The only real difference between fdatasync()
** and fsync(), Ted tells us, is that fdatasync() will not flush the
** inode if the mtime or owner or other inode attributes have changed.
** We only care about the file size, not the other file attributes, so
** as far as SQLite is concerned, an fdatasync() is always adequate.
** So, we always use fdatasync() if it is available, regardless of
** the value of the dataOnly flag.
*/
static int full_fsync(int fd, int fullSync, int dataOnly){
int rc;
/* The following "ifdef/elif/else/" block has the same structure as
** the one below. It is replicated here solely to avoid cluttering
** up the real code with the UNUSED_PARAMETER() macros.
*/
#ifdef SQLITE_NO_SYNC
UNUSED_PARAMETER(fd);
UNUSED_PARAMETER(fullSync);
UNUSED_PARAMETER(dataOnly);
#elif HAVE_FULLFSYNC
UNUSED_PARAMETER(dataOnly);
#else
UNUSED_PARAMETER(fullSync);
UNUSED_PARAMETER(dataOnly);
#endif
/* Record the number of times that we do a normal fsync() and
** FULLSYNC. This is used during testing to verify that this procedure
** gets called with the correct arguments.
*/
#ifdef SQLITE_TEST
if( fullSync ) sqlite3_fullsync_count++;
sqlite3_sync_count++;
#endif
/* If we compiled with the SQLITE_NO_SYNC flag, then syncing is a
** no-op
*/
#ifdef SQLITE_NO_SYNC
rc = SQLITE_OK;
#elif HAVE_FULLFSYNC
if( fullSync ){
rc = osFcntl(fd, F_FULLFSYNC, 0);
}else{
rc = 1;
}
/* If the FULLFSYNC failed, fall back to attempting an fsync().
** It shouldn't be possible for fullfsync to fail on the local
** file system (on OSX), so failure indicates that FULLFSYNC
** isn't supported for this file system. So, attempt an fsync
** and (for now) ignore the overhead of a superfluous fcntl call.
** It'd be better to detect fullfsync support once and avoid
** the fcntl call every time sync is called.
*/
if( rc ) rc = fsync(fd);
#elif defined(__APPLE__)
/* fdatasync() on HFS+ doesn't yet flush the file size if it changed correctly
** so currently we default to the macro that redefines fdatasync to fsync
*/
rc = fsync(fd);
#else
rc = fdatasync(fd);
#if OS_VXWORKS
if( rc==-1 && errno==ENOTSUP ){
rc = fsync(fd);
}
#endif /* OS_VXWORKS */
#endif /* ifdef SQLITE_NO_SYNC elif HAVE_FULLFSYNC */
if( OS_VXWORKS && rc!= -1 ){
rc = 0;
}
return rc;
}
/*
** Open a file descriptor to the directory containing file zFilename.
** If successful, *pFd is set to the opened file descriptor and
** SQLITE_OK is returned. If an error occurs, either SQLITE_NOMEM
** or SQLITE_CANTOPEN is returned and *pFd is set to an undefined
** value.
**
** The directory file descriptor is used for only one thing - to
** fsync() a directory to make sure file creation and deletion events
** are flushed to disk. Such fsyncs are not needed on newer
** journaling filesystems, but are required on older filesystems.
**
** This routine can be overridden using the xSetSysCall interface.
** The ability to override this routine was added in support of the
** chromium sandbox. Opening a directory is a security risk (we are
** told) so making it overrideable allows the chromium sandbox to
** replace this routine with a harmless no-op. To make this routine
** a no-op, replace it with a stub that returns SQLITE_OK but leaves
** *pFd set to a negative number.
**
** If SQLITE_OK is returned, the caller is responsible for closing
** the file descriptor *pFd using close().
*/
static int openDirectory(const char *zFilename, int *pFd){
int ii;
int fd = -1;
char zDirname[MAX_PATHNAME+1];
sqlite3_snprintf(MAX_PATHNAME, zDirname, "%s", zFilename);
for(ii=(int)strlen(zDirname); ii>1 && zDirname[ii]!='/'; ii--);
if( ii>0 ){
zDirname[ii] = '\0';
fd = robust_open(zDirname, O_RDONLY|O_BINARY, 0);
if( fd>=0 ){
#ifdef FD_CLOEXEC
osFcntl(fd, F_SETFD, osFcntl(fd, F_GETFD, 0) | FD_CLOEXEC);
#endif
OSTRACE(("OPENDIR %-3d %s\n", fd, zDirname));
}
}
*pFd = fd;
return (fd>=0?SQLITE_OK:unixLogError(SQLITE_CANTOPEN_BKPT, "open", zDirname));
}
/*
** Make sure all writes to a particular file are committed to disk.
**
** If dataOnly==0 then both the file itself and its metadata (file
** size, access time, etc) are synced. If dataOnly!=0 then only the
** file data is synced.
**
** Under Unix, also make sure that the directory entry for the file
** has been created by fsync-ing the directory that contains the file.
** If we do not do this and we encounter a power failure, the directory
** entry for the journal might not exist after we reboot. The next
** SQLite to access the file will not know that the journal exists (because
** the directory entry for the jo