Back to home page

Enduro/X

 
 

    


0001 /** @file edb.c
0002  *  @brief Lightning memory-mapped database library
0003  *
0004  *  A Btree-based database management library modeled loosely on the
0005  *  BerkeleyDB API, but much simplified.
0006  */
0007 /*
0008  * Copyright 2011-2020 Howard Chu, Symas Corp.
0009  * All rights reserved.
0010  *
0011  * Redistribution and use in source and binary forms, with or without
0012  * modification, are permitted only as authorized by the OpenLDAP
0013  * Public License.
0014  *
0015  * A copy of this license is available in the file LICENSE in the
0016  * top-level directory of the distribution or, alternatively, at
0017  * <http://www.OpenLDAP.org/license.html>.
0018  *
0019  * This code is derived from btree.c written by Martin Hedenfalk.
0020  *
0021  * Copyright (c) 2009, 2010 Martin Hedenfalk <martin@bzero.se>
0022  *
0023  * Permission to use, copy, modify, and distribute this software for any
0024  * purpose with or without fee is hereby granted, provided that the above
0025  * copyright notice and this permission notice appear in all copies.
0026  *
0027  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
0028  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
0029  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
0030  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
0031  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
0032  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
0033  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
0034  */
0035 #ifndef _GNU_SOURCE
0036 #define _GNU_SOURCE 1
0037 #endif
0038 #if defined(EDB_VL32) || defined(__WIN64__)
0039 #define _FILE_OFFSET_BITS   64
0040 #endif
0041 #ifdef _WIN32
0042 #include <malloc.h>
0043 #include <windows.h>
0044 #include <wchar.h>              /* get wcscpy() */
0045 
0046 /* We use native NT APIs to setup the memory map, so that we can
0047  * let the DB file grow incrementally instead of always preallocating
0048  * the full size. These APIs are defined in <wdm.h> and <ntifs.h>
0049  * but those headers are meant for driver-level development and
0050  * conflict with the regular user-level headers, so we explicitly
0051  * declare them here. We get pointers to these functions from
0052  * NTDLL.DLL at runtime, to avoid buildtime dependencies on any
0053  * NTDLL import libraries.
0054  */
0055 typedef NTSTATUS (WINAPI NtCreateSectionFunc)
0056   (OUT PHANDLE sh, IN ACCESS_MASK acc,
0057   IN void * oa OPTIONAL,
0058   IN PLARGE_INTEGER ms OPTIONAL,
0059   IN ULONG pp, IN ULONG aa, IN HANDLE fh OPTIONAL);
0060 
0061 static NtCreateSectionFunc *NtCreateSection;
0062 
0063 typedef enum _SECTION_INHERIT {
0064     ViewShare = 1,
0065     ViewUnmap = 2
0066 } SECTION_INHERIT;
0067 
0068 typedef NTSTATUS (WINAPI NtMapViewOfSectionFunc)
0069   (IN PHANDLE sh, IN HANDLE ph,
0070   IN OUT PVOID *addr, IN ULONG_PTR zbits,
0071   IN SIZE_T cs, IN OUT PLARGE_INTEGER off OPTIONAL,
0072   IN OUT PSIZE_T vs, IN SECTION_INHERIT ih,
0073   IN ULONG at, IN ULONG pp);
0074 
0075 static NtMapViewOfSectionFunc *NtMapViewOfSection;
0076 
0077 typedef NTSTATUS (WINAPI NtCloseFunc)(HANDLE h);
0078 
0079 static NtCloseFunc *NtClose;
0080 
0081 /** getpid() returns int; MinGW defines pid_t but MinGW64 typedefs it
0082  *  as int64 which is wrong. MSVC doesn't define it at all, so just
0083  *  don't use it.
0084  */
0085 #define EDB_PID_T   int
0086 #define EDB_THR_T   DWORD
0087 #include <sys/types.h>
0088 #include <sys/stat.h>
0089 #ifdef __GNUC__
0090 # include <sys/param.h>
0091 #else
0092 # define LITTLE_ENDIAN  1234
0093 # define BIG_ENDIAN 4321
0094 # define BYTE_ORDER LITTLE_ENDIAN
0095 # ifndef SSIZE_MAX
0096 #  define SSIZE_MAX INT_MAX
0097 # endif
0098 #endif
0099 #else
0100 #include <sys/types.h>
0101 #include <sys/stat.h>
0102 #define EDB_PID_T   pid_t
0103 #define EDB_THR_T   pthread_t
0104 #include <sys/param.h>
0105 #include <sys/uio.h>
0106 #include <sys/mman.h>
0107 #ifdef HAVE_SYS_FILE_H
0108 #include <sys/file.h>
0109 #endif
0110 #include <fcntl.h>
0111 #endif
0112 
0113 #include <ndrx_config.h>
0114 #include <ndebug.h>
0115 #include <ndrxdiag.h>
0116 
0117 #if defined(__mips) && defined(__linux)
0118 /* MIPS has cache coherency issues, requires explicit cache control */
0119 #include <asm/cachectl.h>
0120 extern int cacheflush(char *addr, int nbytes, int cache);
0121 #define CACHEFLUSH(addr, bytes, cache)  cacheflush(addr, bytes, cache)
0122 #else
0123 #define CACHEFLUSH(addr, bytes, cache)
0124 #endif
0125 
0126 #if defined(__linux) && !defined(EDB_FDATASYNC_WORKS)
0127 /** fdatasync is broken on ext3/ext4fs on older kernels, see
0128  *  description in #edb_env_open2 comments. You can safely
0129  *  define EDB_FDATASYNC_WORKS if this code will only be run
0130  *  on kernels 3.6 and newer.
0131  */
0132 #define BROKEN_FDATASYNC
0133 #endif
0134 
0135 #include <errno.h>
0136 #include <limits.h>
0137 #include <stddef.h>
0138 #include <inttypes.h>
0139 #include <stdio.h>
0140 #include <stdlib.h>
0141 #include <string.h>
0142 #include <time.h>
0143 #include <ndrx_config.h>
0144 
0145 #ifdef _MSC_VER
0146 #include <io.h>
0147 typedef SSIZE_T ssize_t;
0148 #else
0149 #include <unistd.h>
0150 #endif
0151 
0152 #if defined(__sun) || defined(__ANDROID__)
0153 /* Most platforms have posix_memalign, older may only have memalign */
0154 #define HAVE_MEMALIGN   1
0155 #include <malloc.h>
0156 /* On Solaris, we need the POSIX sigwait function */
0157 #if defined (__sun) && !defined(_POSIX_PTHREAD_SEMANTICS)
0158 # define _POSIX_PTHREAD_SEMANTICS   1
0159 #endif
0160 #endif
0161 
0162 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
0163 #include <netinet/in.h>
0164 #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
0165 #endif
0166 
0167 #if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) || defined (EX_OS_AIX)
0168 # if !(defined(EDB_USE_POSIX_MUTEX) || defined(EDB_USE_POSIX_SEM))
0169 # define EDB_USE_SYSV_SEM   1
0170 # endif
0171 # define EDB_FDATASYNC      fsync
0172 #elif defined(__ANDROID__)
0173 # define EDB_FDATASYNC      fsync
0174 #endif
0175 
0176 #ifndef _WIN32
0177 #include <pthread.h>
0178 #include <signal.h>
0179 #ifdef EDB_USE_POSIX_SEM
0180 # define EDB_USE_HASH       1
0181 #include <semaphore.h>
0182 #elif defined(EDB_USE_SYSV_SEM)
0183 #include <sys/ipc.h>
0184 #include <sys/sem.h>
0185 #ifndef EX_HAVE_SEMUN
0186 union semun {
0187     int val;
0188     struct semid_ds *buf;
0189     unsigned short *array;
0190 };
0191 #endif /* _SEM_SEMUN_UNDEFINED */
0192 #else
0193 #define EDB_USE_POSIX_MUTEX 1
0194 #endif /* EDB_USE_POSIX_SEM */
0195 #endif /* !_WIN32 */
0196 
0197 #if defined(_WIN32) + defined(EDB_USE_POSIX_SEM) + defined(EDB_USE_SYSV_SEM) \
0198     + defined(EDB_USE_POSIX_MUTEX) != 1
0199 # error "Ambiguous shared-lock implementation"
0200 #endif
0201 
0202 #ifdef USE_VALGRIND
0203 #include <valgrind/memcheck.h>
0204 #define VGMEMP_CREATE(h,r,z)    VALGRIND_CREATE_MEMPOOL(h,r,z)
0205 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
0206 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
0207 #define VGMEMP_DESTROY(h)   VALGRIND_DESTROY_MEMPOOL(h)
0208 #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
0209 #else
0210 #define VGMEMP_CREATE(h,r,z)
0211 #define VGMEMP_ALLOC(h,a,s)
0212 #define VGMEMP_FREE(h,a)
0213 #define VGMEMP_DESTROY(h)
0214 #define VGMEMP_DEFINED(a,s)
0215 #endif
0216 
0217 #ifndef BYTE_ORDER
0218 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
0219 /* Solaris just defines one or the other */
0220 #  define LITTLE_ENDIAN 1234
0221 #  define BIG_ENDIAN    4321
0222 #  ifdef _LITTLE_ENDIAN
0223 #   define BYTE_ORDER  LITTLE_ENDIAN
0224 #  else
0225 #   define BYTE_ORDER  BIG_ENDIAN
0226 #  endif
0227 # else
0228 #  define BYTE_ORDER   __BYTE_ORDER
0229 # endif
0230 #endif
0231 
0232 #ifndef LITTLE_ENDIAN
0233 #define LITTLE_ENDIAN   __LITTLE_ENDIAN
0234 #endif
0235 #ifndef BIG_ENDIAN
0236 #define BIG_ENDIAN  __BIG_ENDIAN
0237 #endif
0238 
0239 #if defined(__i386) || defined(__x86_64) || defined(_M_IX86)
0240 #define MISALIGNED_OK   1
0241 #endif
0242 
0243 #include "exdb.h"
0244 #include "eidl.h"
0245 
0246 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
0247 # error "Unknown or unsupported endianness (BYTE_ORDER)"
0248 #elif (-6 & 5) || CHAR_BIT!=8 || UINT_MAX!=0xffffffff || EDB_SIZE_MAX%UINT_MAX
0249 # error "Two's complement, reasonably sized integer types, please"
0250 #endif
0251 
0252 #ifdef __GNUC__
0253 /** Put infrequently used env functions in separate section */
0254 # ifdef __APPLE__
0255 #  define   ESECT   __attribute__ ((section("__TEXT,text_env")))
0256 # else
0257 #  define   ESECT   __attribute__ ((section("text_env")))
0258 # endif
0259 #else
0260 #define ESECT
0261 #endif
0262 
0263 #ifdef _WIN32
0264 #define CALL_CONV WINAPI
0265 #else
0266 #define CALL_CONV
0267 #endif
0268 
0269 /** @defgroup internal  EXDB Internals
0270  *  @{
0271  */
0272 /** @defgroup compat    Compatibility Macros
0273  *  A bunch of macros to minimize the amount of platform-specific ifdefs
0274  *  needed throughout the rest of the code. When the features this library
0275  *  needs are similar enough to POSIX to be hidden in a one-or-two line
0276  *  replacement, this macro approach is used.
0277  *  @{
0278  */
0279 
0280     /** Features under development */
0281 #ifndef EDB_DEVEL
0282 #define EDB_DEVEL 0
0283 #endif
0284 
0285     /** Wrapper around __func__, which is a C99 feature */
0286 #if __STDC_VERSION__ >= 199901L
0287 # define edb_func_  __func__
0288 #elif __GNUC__ >= 2 || _MSC_VER >= 1300
0289 # define edb_func_  __FUNCTION__
0290 #else
0291 /* If a debug message says <edb_unknown>(), update the #if statements above */
0292 # define edb_func_  "<edb_unknown>"
0293 #endif
0294 
0295 /* Internal error codes, not exposed outside libexdb */
0296 #define EDB_NO_ROOT     (EDB_LAST_ERRCODE + 10)
0297 #ifdef _WIN32
0298 #define EDB_OWNERDEAD   ((int) WAIT_ABANDONED)
0299 #elif defined EDB_USE_SYSV_SEM
0300 #define EDB_OWNERDEAD   (EDB_LAST_ERRCODE + 11)
0301 #elif defined(EDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD)
0302 #define EDB_OWNERDEAD   EOWNERDEAD  /**< #LOCK_MUTEX0() result if dead owner */
0303 #endif
0304 
0305 #ifdef __GLIBC__
0306 #define GLIBC_VER   ((__GLIBC__ << 16 )| __GLIBC_MINOR__)
0307 #endif
0308 /** Some platforms define the EOWNERDEAD error code
0309  * even though they don't support Robust Mutexes.
0310  * Compile with -DEDB_USE_ROBUST=0, or use some other
0311  * mechanism like -DEDB_USE_SYSV_SEM instead of
0312  * -DEDB_USE_POSIX_MUTEX. (SysV semaphores are
0313  * also Robust, but some systems don't support them
0314  * either.)
0315  */
0316 #ifndef EDB_USE_ROBUST
0317 /* Android currently lacks Robust Mutex support. So does glibc < 2.4. */
0318 # if defined(EDB_USE_POSIX_MUTEX) && (defined(__ANDROID__) || \
0319     (defined(__GLIBC__) && GLIBC_VER < 0x020004))
0320 #  define EDB_USE_ROBUST    0
0321 # else
0322 #  define EDB_USE_ROBUST    1
0323 # endif
0324 #endif /* !EDB_USE_ROBUST */
0325 
0326 #if defined(EDB_USE_POSIX_MUTEX) && (EDB_USE_ROBUST)
0327 /* glibc < 2.12 only provided _np API */
0328 #  if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \
0329     (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST))
0330 #   define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP
0331 #   define pthread_mutexattr_setrobust(attr, flag)  pthread_mutexattr_setrobust_np(attr, flag)
0332 #   define pthread_mutex_consistent(mutex)  pthread_mutex_consistent_np(mutex)
0333 #  endif
0334 #endif /* EDB_USE_POSIX_MUTEX && EDB_USE_ROBUST */
0335 
0336 #if defined(EDB_OWNERDEAD) && (EDB_USE_ROBUST)
0337 #define EDB_ROBUST_SUPPORTED    1
0338 #endif
0339 
0340 #ifdef _WIN32
0341 #define EDB_USE_HASH    1
0342 #define EDB_PIDLOCK 0
0343 #define THREAD_RET  DWORD
0344 #define pthread_t   HANDLE
0345 #define pthread_mutex_t HANDLE
0346 #define pthread_cond_t  HANDLE
0347 typedef HANDLE edb_mutex_t, edb_mutexref_t;
0348 #define pthread_key_t   DWORD
0349 #define pthread_self()  GetCurrentThreadId()
0350 #define pthread_key_create(x,y) \
0351     ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0)
0352 #define pthread_key_delete(x)   TlsFree(x)
0353 #define pthread_getspecific(x)  TlsGetValue(x)
0354 #define pthread_setspecific(x,y)    (TlsSetValue(x,y) ? 0 : ErrCode())
0355 #define pthread_mutex_unlock(x) ReleaseMutex(*x)
0356 #define pthread_mutex_lock(x)   WaitForSingleObject(*x, INFINITE)
0357 #define pthread_cond_signal(x)  SetEvent(*x)
0358 #define pthread_cond_wait(cond,mutex)   do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0)
0359 #define THREAD_CREATE(thr,start,arg) \
0360     (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode())
0361 #define THREAD_FINISH(thr) \
0362     (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0)
0363 #define LOCK_MUTEX0(mutex)      WaitForSingleObject(mutex, INFINITE)
0364 #define UNLOCK_MUTEX(mutex)     ReleaseMutex(mutex)
0365 #define edb_mutex_consistent(mutex) 0
0366 #define getpid()    GetCurrentProcessId()
0367 #define EDB_FDATASYNC(fd)   (!FlushFileBuffers(fd))
0368 #define EDB_MSYNC(addr,len,flags)   (!FlushViewOfFile(addr,len))
0369 #define ErrCode()   GetLastError()
0370 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;}
0371 #define close(fd)   (CloseHandle(fd) ? 0 : -1)
0372 #define munmap(ptr,len) UnmapViewOfFile(ptr)
0373 #ifdef PROCESS_QUERY_LIMITED_INFORMATION
0374 #define EDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
0375 #else
0376 #define EDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000
0377 #endif
0378 #else
0379 #define THREAD_RET  void *
0380 #define THREAD_CREATE(thr,start,arg)    pthread_create(&thr,NULL,start,arg)
0381 #define THREAD_FINISH(thr)  pthread_join(thr,NULL)
0382 
0383     /** For EDB_LOCK_FORMAT: True if readers take a pid lock in the lockfile */
0384 #define EDB_PIDLOCK         1
0385 
0386 #ifdef EDB_USE_POSIX_SEM
0387 
0388 typedef sem_t *edb_mutex_t, *edb_mutexref_t;
0389 #define LOCK_MUTEX0(mutex)      edb_sem_wait(mutex)
0390 #define UNLOCK_MUTEX(mutex)     sem_post(mutex)
0391 
0392 static int
0393 edb_sem_wait(sem_t *sem)
0394 {
0395    int rc;
0396    while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ;
0397    return rc;
0398 }
0399 
0400 #elif defined EDB_USE_SYSV_SEM
0401 
0402 typedef struct edb_mutex {
0403     int semid;
0404     int semnum;
0405     int *locked;
0406 } edb_mutex_t[1], *edb_mutexref_t;
0407 
0408 #define LOCK_MUTEX0(mutex)      edb_sem_wait(mutex)
0409 #define UNLOCK_MUTEX(mutex)     do { \
0410     struct sembuf sb = { 0, 1, SEM_UNDO }; \
0411     sb.sem_num = (mutex)->semnum; \
0412     *(mutex)->locked = 0; \
0413     semop((mutex)->semid, &sb, 1); \
0414 } while(0)
0415 
0416 static int
0417 edb_sem_wait(edb_mutexref_t sem)
0418 {
0419     int rc, *locked = sem->locked;
0420     struct sembuf sb = { 0, -1, SEM_UNDO };
0421     sb.sem_num = sem->semnum;
0422     do {
0423         if (!semop(sem->semid, &sb, 1)) {
0424             rc = *locked ? EDB_OWNERDEAD : EDB_SUCCESS;
0425             *locked = 1;
0426             break;
0427         }
0428     } while ((rc = errno) == EINTR);
0429     return rc;
0430 }
0431 
0432 #define edb_mutex_consistent(mutex) 0
0433 
0434 #else   /* EDB_USE_POSIX_MUTEX: */
0435     /** Shared mutex/semaphore as the original is stored.
0436      *
0437      *  Not for copies.  Instead it can be assigned to an #edb_mutexref_t.
0438      *  When edb_mutexref_t is a pointer and edb_mutex_t is not, then it
0439      *  is array[size 1] so it can be assigned to the pointer.
0440      */
0441 typedef pthread_mutex_t edb_mutex_t[1];
0442     /** Reference to an #edb_mutex_t */
0443 typedef pthread_mutex_t *edb_mutexref_t;
0444     /** Lock the reader or writer mutex.
0445      *  Returns 0 or a code to give #edb_mutex_failed(), as in #LOCK_MUTEX().
0446      */
0447 #define LOCK_MUTEX0(mutex)  pthread_mutex_lock(mutex)
0448     /** Unlock the reader or writer mutex.
0449      */
0450 #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex)
0451     /** Mark mutex-protected data as repaired, after death of previous owner.
0452      */
0453 #define edb_mutex_consistent(mutex) pthread_mutex_consistent(mutex)
0454 #endif  /* EDB_USE_POSIX_SEM || EDB_USE_SYSV_SEM */
0455 
0456     /** Get the error code for the last failed system function.
0457      */
0458 #define ErrCode()   errno
0459 
0460     /** An abstraction for a file handle.
0461      *  On POSIX systems file handles are small integers. On Windows
0462      *  they're opaque pointers.
0463      */
0464 #define HANDLE  int
0465 
0466     /** A value for an invalid file handle.
0467      *  Mainly used to initialize file variables and signify that they are
0468      *  unused.
0469      */
0470 #define INVALID_HANDLE_VALUE    (-1)
0471 
0472     /** Get the size of a memory page for the system.
0473      *  This is the basic size that the platform's memory manager uses, and is
0474      *  fundamental to the use of memory-mapped files.
0475      */
0476 #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
0477 #endif
0478 
0479 #define Z   EDB_FMT_Z   /**< printf/scanf format modifier for size_t */
0480 #define Yu  EDB_PRIy(u) /**< printf format for #edb_size_t */
0481 #define Yd  EDB_PRIy(d) /**< printf format for 'signed #edb_size_t' */
0482 
0483 #ifdef EDB_USE_SYSV_SEM
0484 #define MNAME_LEN   (sizeof(int))
0485 #else
0486 #define MNAME_LEN   (sizeof(pthread_mutex_t))
0487 #endif
0488 
0489 /** Initial part of #EDB_env.me_mutexname[].
0490  *  Changes to this code must be reflected in #EDB_LOCK_FORMAT.
0491  */
0492 #ifdef _WIN32
0493 #define MUTEXNAME_PREFIX        "Global\\EDB"
0494 #elif defined EDB_USE_POSIX_SEM
0495 #define MUTEXNAME_PREFIX        "/EDB"
0496 #endif
0497 
0498 /** @} */
0499 
0500 #ifdef EDB_ROBUST_SUPPORTED
0501     /** Lock mutex, handle any error, set rc = result.
0502      *  Return 0 on success, nonzero (not rc) on error.
0503      */
0504 #define LOCK_MUTEX(rc, env, mutex) \
0505     (((rc) = LOCK_MUTEX0(mutex)) && \
0506      ((rc) = edb_mutex_failed(env, mutex, rc)))
0507 static int edb_mutex_failed(EDB_env *env, edb_mutexref_t mutex, int rc);
0508 #else
0509 #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex))
0510 #define edb_mutex_failed(env, mutex, rc) (rc)
0511 #endif
0512 
0513 #ifndef _WIN32
0514 /** A flag for opening a file and requesting synchronous data writes.
0515  *  This is only used when writing a meta page. It's not strictly needed;
0516  *  we could just do a normal write and then immediately perform a flush.
0517  *  But if this flag is available it saves us an extra system call.
0518  *
0519  *  @note If O_DSYNC is undefined but exists in /usr/include,
0520  * preferably set some compiler flag to get the definition.
0521  */
0522 #ifndef EDB_DSYNC
0523 # ifdef O_DSYNC
0524 # define EDB_DSYNC  O_DSYNC
0525 # else
0526 # define EDB_DSYNC  O_SYNC
0527 # endif
0528 #endif
0529 #endif
0530 
0531 /** Function for flushing the data of a file. Define this to fsync
0532  *  if fdatasync() is not supported.
0533  */
0534 #ifndef EDB_FDATASYNC
0535 # define EDB_FDATASYNC  fdatasync
0536 #endif
0537 
0538 #ifndef EDB_MSYNC
0539 # define EDB_MSYNC(addr,len,flags)  msync(addr,len,flags)
0540 #endif
0541 
0542 #ifndef MS_SYNC
0543 #define MS_SYNC 1
0544 #endif
0545 
0546 #ifndef MS_ASYNC
0547 #define MS_ASYNC    0
0548 #endif
0549 
0550     /** A page number in the database.
0551      *  Note that 64 bit page numbers are overkill, since pages themselves
0552      *  already represent 12-13 bits of addressable memory, and the OS will
0553      *  always limit applications to a maximum of 63 bits of address space.
0554      *
0555      *  @note In the #EDB_node structure, we only store 48 bits of this value,
0556      *  which thus limits us to only 60 bits of addressable data.
0557      */
0558 typedef EDB_ID  pgno_t;
0559 
0560     /** A transaction ID.
0561      *  See struct EDB_txn.mt_txnid for details.
0562      */
0563 typedef EDB_ID  txnid_t;
0564 
0565 /** @defgroup debug Debug Macros
0566  *  @{
0567  */
0568 #ifndef EDB_DEBUG
0569     /** Enable debug output.  Needs variable argument macros (a C99 feature).
0570      *  Set this to 1 for copious tracing. Set to 2 to add dumps of all IDLs
0571      *  read from and written to the database (used for free space management).
0572      */
0573 #define EDB_DEBUG 0
0574 #endif
0575 
0576 #if EDB_DEBUG
0577 static int edb_debug;
0578 static txnid_t edb_debug_start;
0579 
0580     /** Print a debug message with printf formatting.
0581      *  Requires double parenthesis around 2 or more args.
0582      */
0583 # define DPRINTF(args) ((void) ((edb_debug) && DPRINTF0 args))
0584 # define DPRINTF0(fmt, ...) \
0585     fprintf(stderr, "%s:%d " fmt "\n", edb_func_, __LINE__, __VA_ARGS__)
0586 #else
0587 # define DPRINTF(args)  ((void) 0)
0588 #endif
0589     /** Print a debug string.
0590      *  The string is printed literally, with no format processing.
0591      */
0592 #define DPUTS(arg)  DPRINTF(("%s", arg))
0593     /** Debuging output value of a cursor DBI: Negative in a sub-cursor. */
0594 #define DDBI(mc) \
0595     (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
0596 /** @} */
0597 
0598     /** @brief The maximum size of a database page.
0599      *
0600      *  It is 32k or 64k, since value-PAGEBASE must fit in
0601      *  #EDB_page.%mp_upper.
0602      *
0603      *  EXDB will use database pages < OS pages if needed.
0604      *  That causes more I/O in write transactions: The OS must
0605      *  know (read) the whole page before writing a partial page.
0606      *
0607      *  Note that we don't currently support Huge pages. On Linux,
0608      *  regular data files cannot use Huge pages, and in general
0609      *  Huge pages aren't actually pageable. We rely on the OS
0610      *  demand-pager to read our data and page it out when memory
0611      *  pressure from other processes is high. So until OSs have
0612      *  actual paging support for Huge pages, they're not viable.
0613      */
0614 #define MAX_PAGESIZE     (PAGEBASE ? 0x10000 : 0x8000)
0615 
0616     /** The minimum number of keys required in a database page.
0617      *  Setting this to a larger value will place a smaller bound on the
0618      *  maximum size of a data item. Data items larger than this size will
0619      *  be pushed into overflow pages instead of being stored directly in
0620      *  the B-tree node. This value used to default to 4. With a page size
0621      *  of 4096 bytes that meant that any item larger than 1024 bytes would
0622      *  go into an overflow page. That also meant that on average 2-3KB of
0623      *  each overflow page was wasted space. The value cannot be lower than
0624      *  2 because then there would no longer be a tree structure. With this
0625      *  value, items larger than 2KB will go into overflow pages, and on
0626      *  average only 1KB will be wasted.
0627      */
0628 #define EDB_MINKEYS  2
0629 
0630     /** A stamp that identifies a file as an EXDB file.
0631      *  There's nothing special about this value other than that it is easily
0632      *  recognizable, and it will reflect any byte order mismatches.
0633      */
0634 #define EDB_MAGIC    0xBEEFC0DE
0635 
0636     /** The version number for a database's datafile format. */
0637 #define EDB_DATA_VERSION     ((EDB_DEVEL) ? 999 : 1)
0638     /** The version number for a database's lockfile format. */
0639 #define EDB_LOCK_VERSION     ((EDB_DEVEL) ? 999 : 2)
0640     /** Number of bits representing #EDB_LOCK_VERSION in #EDB_LOCK_FORMAT.
0641      *  The remaining bits must leave room for #EDB_lock_desc.
0642      */
0643 #define EDB_LOCK_VERSION_BITS 12
0644 
0645     /** @brief The max size of a key we can write, or 0 for computed max.
0646      *
0647      *  This macro should normally be left alone or set to 0.
0648      *  Note that a database with big keys or dupsort data cannot be
0649      *  reliably modified by a libexdb which uses a smaller max.
0650      *  The default is 511 for backwards compat, or 0 when #EDB_DEVEL.
0651      *
0652      *  Other values are allowed, for backwards compat.  However:
0653      *  A value bigger than the computed max can break if you do not
0654      *  know what you are doing, and libexdb <= 0.9.10 can break when
0655      *  modifying a DB with keys/dupsort data bigger than its max.
0656      *
0657      *  Data items in an #EDB_DUPSORT database are also limited to
0658      *  this size, since they're actually keys of a sub-DB.  Keys and
0659      *  #EDB_DUPSORT data items must fit on a node in a regular page.
0660      */
0661 #ifndef EDB_MAXKEYSIZE
0662 #define EDB_MAXKEYSIZE   ((EDB_DEVEL) ? 0 : 511)
0663 #endif
0664 
0665     /** The maximum size of a key we can write to the environment. */
0666 #if EDB_MAXKEYSIZE
0667 #define ENV_MAXKEY(env) (EDB_MAXKEYSIZE)
0668 #else
0669 #define ENV_MAXKEY(env) ((env)->me_maxkey)
0670 #endif
0671 
0672     /** @brief The maximum size of a data item.
0673      *
0674      *  We only store a 32 bit value for node sizes.
0675      */
0676 #define MAXDATASIZE 0xffffffffUL
0677 
0678 #if EDB_DEBUG
0679     /** Key size which fits in a #DKBUF.
0680      *  @ingroup debug
0681      */
0682 #define DKBUF_MAXKEYSIZE ((EDB_MAXKEYSIZE) > 0 ? (EDB_MAXKEYSIZE) : 511)
0683     /** A key buffer.
0684      *  @ingroup debug
0685      *  This is used for printing a hex dump of a key's contents.
0686      */
0687 #define DKBUF   char kbuf[DKBUF_MAXKEYSIZE*2+1]
0688     /** Display a key in hex.
0689      *  @ingroup debug
0690      *  Invoke a function to display a key in hex.
0691      */
0692 #define DKEY(x) edb_dkey(x, kbuf)
0693 #else
0694 #define DKBUF
0695 #define DKEY(x) 0
0696 #endif
0697 
0698     /** An invalid page number.
0699      *  Mainly used to denote an empty tree.
0700      */
0701 #define P_INVALID    (~(pgno_t)0)
0702 
0703     /** Test if the flags \b f are set in a flag word \b w. */
0704 #define F_ISSET(w, f)    (((w) & (f)) == (f))
0705 
0706     /** Round \b n up to an even number. */
0707 #define EVEN(n)     (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
0708 
0709     /** Least significant 1-bit of \b n.  n must be of an unsigned type. */
0710 #define LOW_BIT(n)      ((n) & (-(n)))
0711 
0712     /** (log2(\b p2) % \b n), for p2 = power of 2 and 0 < n < 8. */
0713 #define LOG2_MOD(p2, n) (7 - 86 / ((p2) % ((1U<<(n))-1) + 11))
0714     /* Explanation: Let p2 = 2**(n*y + x), x<n and M = (1U<<n)-1. Now p2 =
0715      * (M+1)**y * 2**x = 2**x (mod M). Finally "/" "happens" to return 7-x.
0716      */
0717 
0718     /** Should be alignment of \b type. Ensure it is a power of 2. */
0719 #define ALIGNOF2(type) \
0720     LOW_BIT(offsetof(struct { char ch_; type align_; }, align_))
0721 
0722     /** Used for offsets within a single page.
0723      *  Since memory pages are typically 4 or 8KB in size, 12-13 bits,
0724      *  this is plenty.
0725      */
0726 typedef uint16_t     indx_t;
0727 
0728 typedef unsigned long long  edb_hash_t;
0729 
0730     /** Default size of memory map.
0731      *  This is certainly too small for any actual applications. Apps should always set
0732      *  the size explicitly using #edb_env_set_mapsize().
0733      */
0734 #define DEFAULT_MAPSIZE 1048576
0735 
0736 /** @defgroup readers   Reader Lock Table
0737  *  Readers don't acquire any locks for their data access. Instead, they
0738  *  simply record their transaction ID in the reader table. The reader
0739  *  mutex is needed just to find an empty slot in the reader table. The
0740  *  slot's address is saved in thread-specific data so that subsequent read
0741  *  transactions started by the same thread need no further locking to proceed.
0742  *
0743  *  If #EDB_NOTLS is set, the slot address is not saved in thread-specific data.
0744  *
0745  *  No reader table is used if the database is on a read-only filesystem, or
0746  *  if #EDB_NOLOCK is set.
0747  *
0748  *  Since the database uses multi-version concurrency control, readers don't
0749  *  actually need any locking. This table is used to keep track of which
0750  *  readers are using data from which old transactions, so that we'll know
0751  *  when a particular old transaction is no longer in use. Old transactions
0752  *  that have discarded any data pages can then have those pages reclaimed
0753  *  for use by a later write transaction.
0754  *
0755  *  The lock table is constructed such that reader slots are aligned with the
0756  *  processor's cache line size. Any slot is only ever used by one thread.
0757  *  This alignment guarantees that there will be no contention or cache
0758  *  thrashing as threads update their own slot info, and also eliminates
0759  *  any need for locking when accessing a slot.
0760  *
0761  *  A writer thread will scan every slot in the table to determine the oldest
0762  *  outstanding reader transaction. Any freed pages older than this will be
0763  *  reclaimed by the writer. The writer doesn't use any locks when scanning
0764  *  this table. This means that there's no guarantee that the writer will
0765  *  see the most up-to-date reader info, but that's not required for correct
0766  *  operation - all we need is to know the upper bound on the oldest reader,
0767  *  we don't care at all about the newest reader. So the only consequence of
0768  *  reading stale information here is that old pages might hang around a
0769  *  while longer before being reclaimed. That's actually good anyway, because
0770  *  the longer we delay reclaiming old pages, the more likely it is that a
0771  *  string of contiguous pages can be found after coalescing old pages from
0772  *  many old transactions together.
0773  *  @{
0774  */
0775     /** Number of slots in the reader table.
0776      *  This value was chosen somewhat arbitrarily. 126 readers plus a
0777      *  couple mutexes fit exactly into 8KB on my development machine.
0778      *  Applications should set the table size using #edb_env_set_maxreaders().
0779      */
0780 #define DEFAULT_READERS 126
0781 
0782     /** The size of a CPU cache line in bytes. We want our lock structures
0783      *  aligned to this size to avoid false cache line sharing in the
0784      *  lock table.
0785      *  This value works for most CPUs. For Itanium this should be 128.
0786      */
0787 #ifndef CACHELINE
0788 #define CACHELINE   64
0789 #endif
0790 
0791     /** The information we store in a single slot of the reader table.
0792      *  In addition to a transaction ID, we also record the process and
0793      *  thread ID that owns a slot, so that we can detect stale information,
0794      *  e.g. threads or processes that went away without cleaning up.
0795      *  @note We currently don't check for stale records. We simply re-init
0796      *  the table when we know that we're the only process opening the
0797      *  lock file.
0798      */
0799 typedef struct EDB_rxbody {
0800     /** Current Transaction ID when this transaction began, or (txnid_t)-1.
0801      *  Multiple readers that start at the same time will probably have the
0802      *  same ID here. Again, it's not important to exclude them from
0803      *  anything; all we need to know is which version of the DB they
0804      *  started from so we can avoid overwriting any data used in that
0805      *  particular version.
0806      */
0807     volatile txnid_t        mrb_txnid;
0808     /** The process ID of the process owning this reader txn. */
0809     volatile EDB_PID_T  mrb_pid;
0810     /** The thread ID of the thread owning this txn. */
0811     volatile EDB_THR_T  mrb_tid;
0812 } EDB_rxbody;
0813 
0814     /** The actual reader record, with cacheline padding. */
0815 typedef struct EDB_reader {
0816     union {
0817         EDB_rxbody mrx;
0818         /** shorthand for mrb_txnid */
0819 #define mr_txnid    mru.mrx.mrb_txnid
0820 #define mr_pid  mru.mrx.mrb_pid
0821 #define mr_tid  mru.mrx.mrb_tid
0822         /** cache line alignment */
0823         char pad[(sizeof(EDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
0824     } mru;
0825 } EDB_reader;
0826 
0827     /** The header for the reader table.
0828      *  The table resides in a memory-mapped file. (This is a different file
0829      *  than is used for the main database.)
0830      *
0831      *  For POSIX the actual mutexes reside in the shared memory of this
0832      *  mapped file. On Windows, mutexes are named objects allocated by the
0833      *  kernel; we store the mutex names in this mapped file so that other
0834      *  processes can grab them. This same approach is also used on
0835      *  MacOSX/Darwin (using named semaphores) since MacOSX doesn't support
0836      *  process-shared POSIX mutexes. For these cases where a named object
0837      *  is used, the object name is derived from a 64 bit FNV hash of the
0838      *  environment pathname. As such, naming collisions are extremely
0839      *  unlikely. If a collision occurs, the results are unpredictable.
0840      */
0841 typedef struct EDB_txbody {
0842         /** Stamp identifying this as an EXDB file. It must be set
0843          *  to #EDB_MAGIC. */
0844     uint32_t    mtb_magic;
0845         /** Format of this lock file. Must be set to #EDB_LOCK_FORMAT. */
0846     uint32_t    mtb_format;
0847         /** The ID of the last transaction committed to the database.
0848          *  This is recorded here only for convenience; the value can always
0849          *  be determined by reading the main database meta pages.
0850          */
0851     volatile txnid_t        mtb_txnid;
0852         /** The number of slots that have been used in the reader table.
0853          *  This always records the maximum count, it is not decremented
0854          *  when readers release their slots.
0855          */
0856     volatile unsigned   mtb_numreaders;
0857 #if defined(_WIN32) || defined(EDB_USE_POSIX_SEM)
0858         /** Binary form of names of the reader/writer locks */
0859     edb_hash_t          mtb_mutexid;
0860 #elif defined(EDB_USE_SYSV_SEM)
0861     int     mtb_semid;
0862     int     mtb_rlocked;
0863 #else
0864         /** Mutex protecting access to this table.
0865          *  This is the reader table lock used with LOCK_MUTEX().
0866          */
0867     edb_mutex_t mtb_rmutex;
0868 #endif
0869 } EDB_txbody;
0870 
0871     /** The actual reader table definition. */
0872 typedef struct EDB_txninfo {
0873     union {
0874         EDB_txbody mtb;
0875 #define mti_magic   mt1.mtb.mtb_magic
0876 #define mti_format  mt1.mtb.mtb_format
0877 #define mti_rmutex  mt1.mtb.mtb_rmutex
0878 #define mti_txnid   mt1.mtb.mtb_txnid
0879 #define mti_numreaders  mt1.mtb.mtb_numreaders
0880 #define mti_mutexid mt1.mtb.mtb_mutexid
0881 #ifdef EDB_USE_SYSV_SEM
0882 #define mti_semid   mt1.mtb.mtb_semid
0883 #define mti_rlocked mt1.mtb.mtb_rlocked
0884 #endif
0885         char pad[(sizeof(EDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)];
0886     } mt1;
0887 #if !(defined(_WIN32) || defined(EDB_USE_POSIX_SEM))
0888     union {
0889 #ifdef EDB_USE_SYSV_SEM
0890         int mt2_wlocked;
0891 #define mti_wlocked mt2.mt2_wlocked
0892 #else
0893         edb_mutex_t mt2_wmutex;
0894 #define mti_wmutex  mt2.mt2_wmutex
0895 #endif
0896         char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)];
0897     } mt2;
0898 #endif
0899     EDB_reader  mti_readers[1];
0900 } EDB_txninfo;
0901 
0902     /** Lockfile format signature: version, features and field layout */
0903 #define EDB_LOCK_FORMAT \
0904     ((uint32_t)         \
0905      (((EDB_LOCK_VERSION) % (1U << EDB_LOCK_VERSION_BITS)) \
0906       + EDB_lock_desc     * (1U << EDB_LOCK_VERSION_BITS)))
0907 
0908     /** Lock type and layout. Values 0-119. _WIN32 implies #EDB_PIDLOCK.
0909      *  Some low values are reserved for future tweaks.
0910      */
0911 #ifdef _WIN32
0912 # define EDB_LOCK_TYPE  (0 + ALIGNOF2(edb_hash_t)/8 % 2)
0913 #elif defined EDB_USE_POSIX_SEM
0914 # define EDB_LOCK_TYPE  (4 + ALIGNOF2(edb_hash_t)/8 % 2)
0915 #elif defined EDB_USE_SYSV_SEM
0916 # define EDB_LOCK_TYPE  (8)
0917 #elif defined EDB_USE_POSIX_MUTEX
0918 /* We do not know the inside of a POSIX mutex and how to check if mutexes
0919  * used by two executables are compatible. Just check alignment and size.
0920  */
0921 # define EDB_LOCK_TYPE  (10 + \
0922         LOG2_MOD(ALIGNOF2(pthread_mutex_t), 5) + \
0923         sizeof(pthread_mutex_t) / 4U % 22 * 5)
0924 #endif
0925 
0926 enum {
0927     /** Magic number for lockfile layout and features.
0928      *
0929      *  This *attempts* to stop libexdb variants compiled with conflicting
0930      *  options from using the lockfile at the same time and thus breaking
0931      *  it.  It describes locking types, and sizes and sometimes alignment
0932      *  of the various lockfile items.
0933      *
0934      *  The detected ranges are mostly guesswork, or based simply on how
0935      *  big they could be without using more bits.  So we can tweak them
0936      *  in good conscience when updating #EDB_LOCK_VERSION.
0937      */
0938     EDB_lock_desc =
0939     /* Default CACHELINE=64 vs. other values (have seen mention of 32-256) */
0940     (CACHELINE==64 ? 0 : 1 + LOG2_MOD(CACHELINE >> (CACHELINE>64), 5))
0941     + 6  * (sizeof(EDB_PID_T)/4 % 3)    /* legacy(2) to word(4/8)? */
0942     + 18 * (sizeof(pthread_t)/4 % 5)    /* can be struct{id, active data} */
0943     + 90 * (sizeof(EDB_txbody) / CACHELINE % 3)
0944     + 270 * (EDB_LOCK_TYPE % 120)
0945     /* The above is < 270*120 < 2**15 */
0946     + ((sizeof(txnid_t) == 8) << 15)    /* 32bit/64bit */
0947     + ((sizeof(EDB_reader) > CACHELINE) << 16)
0948     /* Not really needed - implied by EDB_LOCK_TYPE != (_WIN32 locking) */
0949     + (((EDB_PIDLOCK) != 0)   << 17)
0950     /* 18 bits total: Must be <= (32 - EDB_LOCK_VERSION_BITS). */
0951 };
0952 /** @} */
0953 
0954 /** Common header for all page types. The page type depends on #mp_flags.
0955  *
0956  * #P_BRANCH and #P_LEAF pages have unsorted '#EDB_node's at the end, with
0957  * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages
0958  * omit mp_ptrs and pack sorted #EDB_DUPFIXED values after the page header.
0959  *
0960  * #P_OVERFLOW records occupy one or more contiguous pages where only the
0961  * first has a page header. They hold the real data of #F_BIGDATA nodes.
0962  *
0963  * #P_SUBP sub-pages are small leaf "pages" with duplicate data.
0964  * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page.
0965  * (Duplicate data can also go in sub-databases, which use normal pages.)
0966  *
0967  * #P_META pages contain #EDB_meta, the start point of an EXDB snapshot.
0968  *
0969  * Each non-metapage up to #EDB_meta.%mm_last_pg is reachable exactly once
0970  * in the snapshot: Either used by a database or listed in a freeDB record.
0971  */
0972 typedef struct EDB_page {
0973 #define mp_pgno mp_p.p_pgno
0974 #define mp_next mp_p.p_next
0975     union {
0976         pgno_t      p_pgno; /**< page number */
0977         struct EDB_page *p_next; /**< for in-memory list of freed pages */
0978     } mp_p;
0979     uint16_t    mp_pad;         /**< key size if this is a LEAF2 page */
0980 /** @defgroup edb_page  Page Flags
0981  *  @ingroup internal
0982  *  Flags for the page headers.
0983  *  @{
0984  */
0985 #define P_BRANCH     0x01       /**< branch page */
0986 #define P_LEAF       0x02       /**< leaf page */
0987 #define P_OVERFLOW   0x04       /**< overflow page */
0988 #define P_META       0x08       /**< meta page */
0989 #define P_DIRTY      0x10       /**< dirty page, also set for #P_SUBP pages */
0990 #define P_LEAF2      0x20       /**< for #EDB_DUPFIXED records */
0991 #define P_SUBP       0x40       /**< for #EDB_DUPSORT sub-pages */
0992 #define P_LOOSE      0x4000     /**< page was dirtied then freed, can be reused */
0993 #define P_KEEP       0x8000     /**< leave this page alone during spill */
0994 /** @} */
0995     uint16_t    mp_flags;       /**< @ref edb_page */
0996 #define mp_lower    mp_pb.pb.pb_lower
0997 #define mp_upper    mp_pb.pb.pb_upper
0998 #define mp_pages    mp_pb.pb_pages
0999     union {
1000         struct {
1001             indx_t      pb_lower;       /**< lower bound of free space */
1002             indx_t      pb_upper;       /**< upper bound of free space */
1003         } pb;
1004         uint32_t    pb_pages;   /**< number of overflow pages */
1005     } mp_pb;
1006     indx_t      mp_ptrs[1];     /**< dynamic size */
1007 } EDB_page;
1008 
1009     /** Size of the page header, excluding dynamic data at the end */
1010 #define PAGEHDRSZ    ((unsigned) offsetof(EDB_page, mp_ptrs))
1011 
1012     /** Address of first usable data byte in a page, after the header */
1013 #define METADATA(p)  ((void *)((char *)(p) + PAGEHDRSZ))
1014 
1015     /** ITS#7713, change PAGEBASE to handle 65536 byte pages */
1016 #define PAGEBASE    ((EDB_DEVEL) ? PAGEHDRSZ : 0)
1017 
1018     /** Number of nodes on a page */
1019 #define NUMKEYS(p)   (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)
1020 
1021     /** The amount of space remaining in the page */
1022 #define SIZELEFT(p)  (indx_t)((p)->mp_upper - (p)->mp_lower)
1023 
1024     /** The percentage of space used in the page, in tenths of a percent. */
1025 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
1026                 ((env)->me_psize - PAGEHDRSZ))
1027     /** The minimum page fill factor, in tenths of a percent.
1028      *  Pages emptier than this are candidates for merging.
1029      */
1030 #define FILL_THRESHOLD   250
1031 
1032     /** Test if a page is a leaf page */
1033 #define IS_LEAF(p)   F_ISSET((p)->mp_flags, P_LEAF)
1034     /** Test if a page is a LEAF2 page */
1035 #define IS_LEAF2(p)  F_ISSET((p)->mp_flags, P_LEAF2)
1036     /** Test if a page is a branch page */
1037 #define IS_BRANCH(p)     F_ISSET((p)->mp_flags, P_BRANCH)
1038     /** Test if a page is an overflow page */
1039 #define IS_OVERFLOW(p)   F_ISSET((p)->mp_flags, P_OVERFLOW)
1040     /** Test if a page is a sub page */
1041 #define IS_SUBP(p)   F_ISSET((p)->mp_flags, P_SUBP)
1042 
1043     /** The number of overflow pages needed to store the given size. */
1044 #define OVPAGES(size, psize)    ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
1045 
1046     /** Link in #EDB_txn.%mt_loose_pgs list.
1047      *  Kept outside the page header, which is needed when reusing the page.
1048      */
1049 #define NEXT_LOOSE_PAGE(p)      (*(EDB_page **)((p) + 2))
1050 
1051     /** Header for a single key/data pair within a page.
1052      * Used in pages of type #P_BRANCH and #P_LEAF without #P_LEAF2.
1053      * We guarantee 2-byte alignment for 'EDB_node's.
1054      *
1055      * #mn_lo and #mn_hi are used for data size on leaf nodes, and for child
1056      * pgno on branch nodes.  On 64 bit platforms, #mn_flags is also used
1057      * for pgno.  (Branch nodes have no flags).  Lo and hi are in host byte
1058      * order in case some accesses can be optimized to 32-bit word access.
1059      *
1060      * Leaf node flags describe node contents.  #F_BIGDATA says the node's
1061      * data part is the page number of an overflow page with actual data.
1062      * #F_DUPDATA and #F_SUBDATA can be combined giving duplicate data in
1063      * a sub-page/sub-database, and named databases (just #F_SUBDATA).
1064      */
1065 typedef struct EDB_node {
1066     /** part of data size or pgno
1067      *  @{ */
1068 #if BYTE_ORDER == LITTLE_ENDIAN
1069     unsigned short  mn_lo, mn_hi;
1070 #else
1071     unsigned short  mn_hi, mn_lo;
1072 #endif
1073     /** @} */
1074 /** @defgroup edb_node Node Flags
1075  *  @ingroup internal
1076  *  Flags for node headers.
1077  *  @{
1078  */
1079 #define F_BIGDATA    0x01           /**< data put on overflow page */
1080 #define F_SUBDATA    0x02           /**< data is a sub-database */
1081 #define F_DUPDATA    0x04           /**< data has duplicates */
1082 
1083 /** valid flags for #edb_node_add() */
1084 #define NODE_ADD_FLAGS  (F_DUPDATA|F_SUBDATA|EDB_RESERVE|EDB_APPEND)
1085 
1086 /** @} */
1087     unsigned short  mn_flags;       /**< @ref edb_node */
1088     unsigned short  mn_ksize;       /**< key size */
1089     char        mn_data[1];         /**< key and data are appended here */
1090 } EDB_node;
1091 
1092     /** Size of the node header, excluding dynamic data at the end */
1093 #define NODESIZE     offsetof(EDB_node, mn_data)
1094 
1095     /** Bit position of top word in page number, for shifting mn_flags */
1096 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
1097 
1098     /** Size of a node in a branch page with a given key.
1099      *  This is just the node header plus the key, there is no data.
1100      */
1101 #define INDXSIZE(k)  (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
1102 
1103     /** Size of a node in a leaf page with a given key and data.
1104      *  This is node header plus key plus data size.
1105      */
1106 #define LEAFSIZE(k, d)   (NODESIZE + (k)->mv_size + (d)->mv_size)
1107 
1108     /** Address of node \b i in page \b p */
1109 #define NODEPTR(p, i)    ((EDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))
1110 
1111     /** Address of the key for the node */
1112 #define NODEKEY(node)    (void *)((node)->mn_data)
1113 
1114     /** Address of the data for a node */
1115 #define NODEDATA(node)   (void *)((char *)(node)->mn_data + (node)->mn_ksize)
1116 
1117     /** Get the page number pointed to by a branch node */
1118 #define NODEPGNO(node) \
1119     ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
1120      (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
1121     /** Set the page number in a branch node */
1122 #define SETPGNO(node,pgno)  do { \
1123     (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
1124     if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
1125 
1126     /** Get the size of the data in a leaf node */
1127 #define NODEDSZ(node)    ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
1128     /** Set the size of the data for a leaf node */
1129 #define SETDSZ(node,size)   do { \
1130     (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
1131     /** The size of a key in a node */
1132 #define NODEKSZ(node)    ((node)->mn_ksize)
1133 
1134     /** Copy a page number from src to dst */
1135 #ifdef MISALIGNED_OK
1136 #define COPY_PGNO(dst,src)  dst = src
1137 #else
1138 #if EDB_SIZE_MAX > 0xffffffffU
1139 #define COPY_PGNO(dst,src)  do { \
1140     unsigned short *s, *d;  \
1141     s = (unsigned short *)&(src);   \
1142     d = (unsigned short *)&(dst);   \
1143     *d++ = *s++;    \
1144     *d++ = *s++;    \
1145     *d++ = *s++;    \
1146     *d = *s;    \
1147 } while (0)
1148 #else
1149 #define COPY_PGNO(dst,src)  do { \
1150     unsigned short *s, *d;  \
1151     s = (unsigned short *)&(src);   \
1152     d = (unsigned short *)&(dst);   \
1153     *d++ = *s++;    \
1154     *d = *s;    \
1155 } while (0)
1156 #endif
1157 #endif
1158     /** The address of a key in a LEAF2 page.
1159      *  LEAF2 pages are used for #EDB_DUPFIXED sorted-duplicate sub-DBs.
1160      *  There are no node headers, keys are stored contiguously.
1161      */
1162 #define LEAF2KEY(p, i, ks)  ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
1163 
1164     /** Set the \b node's key into \b keyptr, if requested. */
1165 #define EDB_GET_KEY(node, keyptr)   { if ((keyptr) != NULL) { \
1166     (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }
1167 
1168     /** Set the \b node's key into \b key. */
1169 #define EDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }
1170 
1171     /** Information about a single database in the environment. */
1172 typedef struct EDB_db {
1173     uint32_t    md_pad;     /**< also ksize for LEAF2 pages */
1174     uint16_t    md_flags;   /**< @ref edb_dbi_open */
1175     uint16_t    md_depth;   /**< depth of this tree */
1176     pgno_t      md_branch_pages;    /**< number of internal pages */
1177     pgno_t      md_leaf_pages;      /**< number of leaf pages */
1178     pgno_t      md_overflow_pages;  /**< number of overflow pages */
1179     edb_size_t  md_entries;     /**< number of data items */
1180     pgno_t      md_root;        /**< the root page of this tree */
1181 } EDB_db;
1182 
1183 #define EDB_VALID   0x8000      /**< DB handle is valid, for me_dbflags */
1184 #define PERSISTENT_FLAGS    (0xffff & ~(EDB_VALID))
1185     /** #edb_dbi_open() flags */
1186 #define VALID_FLAGS (EDB_REVERSEKEY|EDB_DUPSORT|EDB_INTEGERKEY|EDB_DUPFIXED|\
1187     EDB_INTEGERDUP|EDB_REVERSEDUP|EDB_CREATE)
1188 
1189     /** Handle for the DB used to track free pages. */
1190 #define FREE_DBI    0
1191     /** Handle for the default DB. */
1192 #define MAIN_DBI    1
1193     /** Number of DBs in metapage (free and main) - also hardcoded elsewhere */
1194 #define CORE_DBS    2
1195 
1196     /** Number of meta pages - also hardcoded elsewhere */
1197 #define NUM_METAS   2
1198 
1199     /** Meta page content.
1200      *  A meta page is the start point for accessing a database snapshot.
1201      *  Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2).
1202      */
1203 typedef struct EDB_meta {
1204         /** Stamp identifying this as an EXDB file. It must be set
1205          *  to #EDB_MAGIC. */
1206     uint32_t    mm_magic;
1207         /** Version number of this file. Must be set to #EDB_DATA_VERSION. */
1208     uint32_t    mm_version;
1209 #ifdef EDB_VL32
1210     union {     /* always zero since we don't support fixed mapping in EDB_VL32 */
1211         EDB_ID  mmun_ull;
1212         void *mmun_address;
1213     } mm_un;
1214 #define mm_address mm_un.mmun_address
1215 #else
1216     void        *mm_address;        /**< address for fixed mapping */
1217 #endif
1218     edb_size_t  mm_mapsize;         /**< size of mmap region */
1219     EDB_db      mm_dbs[CORE_DBS];   /**< first is free space, 2nd is main db */
1220     /** The size of pages used in this DB */
1221 #define mm_psize    mm_dbs[FREE_DBI].md_pad
1222     /** Any persistent environment flags. @ref edb_env */
1223 #define mm_flags    mm_dbs[FREE_DBI].md_flags
1224     /** Last used page in the datafile.
1225      *  Actually the file may be shorter if the freeDB lists the final pages.
1226      */
1227     pgno_t      mm_last_pg;
1228     volatile txnid_t    mm_txnid;   /**< txnid that committed this page */
1229 } EDB_meta;
1230 
1231     /** Buffer for a stack-allocated meta page.
1232      *  The members define size and alignment, and silence type
1233      *  aliasing warnings.  They are not used directly; that could
1234      *  mean incorrectly using several union members in parallel.
1235      */
1236 typedef union EDB_metabuf {
1237     EDB_page    mb_page;
1238     struct {
1239         char        mm_pad[PAGEHDRSZ];
1240         EDB_meta    mm_meta;
1241     } mb_metabuf;
1242 } EDB_metabuf;
1243 
1244     /** Auxiliary DB info.
1245      *  The information here is mostly static/read-only. There is
1246      *  only a single copy of this record in the environment.
1247      */
1248 typedef struct EDB_dbx {
1249     EDB_val     md_name;        /**< name of the database */
1250     EDB_cmp_func    *md_cmp;    /**< function for comparing keys */
1251     EDB_cmp_func    *md_dcmp;   /**< function for comparing data items */
1252     EDB_rel_func    *md_rel;    /**< user relocate function */
1253     void        *md_relctx;     /**< user-provided context for md_rel */
1254 } EDB_dbx;
1255 
1256     /** A database transaction.
1257      *  Every operation requires a transaction handle.
1258      */
1259 struct EDB_txn {
1260     EDB_txn     *mt_parent;     /**< parent of a nested txn */
1261     /** Nested txn under this txn, set together with flag #EDB_TXN_HAS_CHILD */
1262     EDB_txn     *mt_child;
1263     pgno_t      mt_next_pgno;   /**< next unallocated page */
1264 #ifdef EDB_VL32
1265     pgno_t      mt_last_pgno;   /**< last written page */
1266 #endif
1267     /** The ID of this transaction. IDs are integers incrementing from 1.
1268      *  Only committed write transactions increment the ID. If a transaction
1269      *  aborts, the ID may be re-used by the next writer.
1270      */
1271     txnid_t     mt_txnid;
1272     EDB_env     *mt_env;        /**< the DB environment */
1273     /** The list of pages that became unused during this transaction.
1274      */
1275     EDB_IDL     mt_free_pgs;
1276     /** The list of loose pages that became unused and may be reused
1277      *  in this transaction, linked through #NEXT_LOOSE_PAGE(page).
1278      */
1279     EDB_page    *mt_loose_pgs;
1280     /** Number of loose pages (#mt_loose_pgs) */
1281     int         mt_loose_count;
1282     /** The sorted list of dirty pages we temporarily wrote to disk
1283      *  because the dirty list was full. page numbers in here are
1284      *  shifted left by 1, deleted slots have the LSB set.
1285      */
1286     EDB_IDL     mt_spill_pgs;
1287     union {
1288         /** For write txns: Modified pages. Sorted when not EDB_WRITEMAP. */
1289         EDB_ID2L    dirty_list;
1290         /** For read txns: This thread/txn's reader table slot, or NULL. */
1291         EDB_reader  *reader;
1292     } mt_u;
1293     /** Array of records for each DB known in the environment. */
1294     EDB_dbx     *mt_dbxs;
1295     /** Array of EDB_db records for each known DB */
1296     EDB_db      *mt_dbs;
1297     /** Array of sequence numbers for each DB handle */
1298     unsigned int    *mt_dbiseqs;
1299 /** @defgroup mt_dbflag Transaction DB Flags
1300  *  @ingroup internal
1301  * @{
1302  */
1303 #define DB_DIRTY    0x01        /**< DB was written in this txn */
1304 #define DB_STALE    0x02        /**< Named-DB record is older than txnID */
1305 #define DB_NEW      0x04        /**< Named-DB handle opened in this txn */
1306 #define DB_VALID    0x08        /**< DB handle is valid, see also #EDB_VALID */
1307 #define DB_USRVALID 0x10        /**< As #DB_VALID, but not set for #FREE_DBI */
1308 #define DB_DUPDATA  0x20        /**< DB is #EDB_DUPSORT data */
1309 /** @} */
1310     /** In write txns, array of cursors for each DB */
1311     EDB_cursor  **mt_cursors;
1312     /** Array of flags for each DB */
1313     unsigned char   *mt_dbflags;
1314 #ifdef EDB_VL32
1315     /** List of read-only pages (actually chunks) */
1316     EDB_ID3L    mt_rpages;
1317     /** We map chunks of 16 pages. Even though Windows uses 4KB pages, all
1318      * mappings must begin on 64KB boundaries. So we round off all pgnos to
1319      * a chunk boundary. We do the same on Linux for symmetry, and also to
1320      * reduce the frequency of mmap/munmap calls.
1321      */
1322 #define EDB_RPAGE_CHUNK 16
1323 #define EDB_TRPAGE_SIZE 4096    /**< size of #mt_rpages array of chunks */
1324 #define EDB_TRPAGE_MAX  (EDB_TRPAGE_SIZE-1) /**< maximum chunk index */
1325     unsigned int mt_rpcheck;    /**< threshold for reclaiming unref'd chunks */
1326 #endif
1327     /** Number of DB records in use, or 0 when the txn is finished.
1328      *  This number only ever increments until the txn finishes; we
1329      *  don't decrement it when individual DB handles are closed.
1330      */
1331     EDB_dbi     mt_nuedbs;
1332 
1333 /** @defgroup edb_txn   Transaction Flags
1334  *  @ingroup internal
1335  *  @{
1336  */
1337     /** #edb_txn_begin() flags */
1338 #define EDB_TXN_BEGIN_FLAGS (EDB_NOMETASYNC|EDB_NOSYNC|EDB_RDONLY)
1339 #define EDB_TXN_NOMETASYNC  EDB_NOMETASYNC  /**< don't sync meta for this txn on commit */
1340 #define EDB_TXN_NOSYNC      EDB_NOSYNC  /**< don't sync this txn on commit */
1341 #define EDB_TXN_RDONLY      EDB_RDONLY  /**< read-only transaction */
1342     /* internal txn flags */
1343 #define EDB_TXN_WRITEMAP    EDB_WRITEMAP    /**< copy of #EDB_env flag in writers */
1344 #define EDB_TXN_FINISHED    0x01        /**< txn is finished or never began */
1345 #define EDB_TXN_ERROR       0x02        /**< txn is unusable after an error */
1346 #define EDB_TXN_DIRTY       0x04        /**< must write, even if dirty list is empty */
1347 #define EDB_TXN_SPILLS      0x08        /**< txn or a parent has spilled pages */
1348 #define EDB_TXN_HAS_CHILD   0x10        /**< txn has an #EDB_txn.%mt_child */
1349     /** most operations on the txn are currently illegal */
1350 #define EDB_TXN_BLOCKED     (EDB_TXN_FINISHED|EDB_TXN_ERROR|EDB_TXN_HAS_CHILD)
1351 /** @} */
1352     unsigned int    mt_flags;       /**< @ref edb_txn */
1353     /** #dirty_list room: Array size - \#dirty pages visible to this txn.
1354      *  Includes ancestor txns' dirty pages not hidden by other txns'
1355      *  dirty/spilled pages. Thus commit(nested txn) has room to merge
1356      *  dirty_list into mt_parent after freeing hidden mt_parent pages.
1357      */
1358     unsigned int    mt_dirty_room;
1359 };
1360 
1361 /** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
1362  * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
1363  * raise this on a 64 bit machine.
1364  */
1365 #define CURSOR_STACK         32
1366 
1367 struct EDB_xcursor;
1368 
1369     /** Cursors are used for all DB operations.
1370      *  A cursor holds a path of (page pointer, key index) from the DB
1371      *  root to a position in the DB, plus other state. #EDB_DUPSORT
1372      *  cursors include an xcursor to the current data item. Write txns
1373      *  track their cursors and keep them up to date when data moves.
1374      *  Exception: An xcursor's pointer to a #P_SUBP page can be stale.
1375      *  (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage).
1376      */
1377 struct EDB_cursor {
1378     /** Next cursor on this DB in this txn */
1379     EDB_cursor  *mc_next;
1380     /** Backup of the original cursor if this cursor is a shadow */
1381     EDB_cursor  *mc_backup;
1382     /** Context used for databases with #EDB_DUPSORT, otherwise NULL */
1383     struct EDB_xcursor  *mc_xcursor;
1384     /** The transaction that owns this cursor */
1385     EDB_txn     *mc_txn;
1386     /** The database handle this cursor operates on */
1387     EDB_dbi     mc_dbi;
1388     /** The database record for this cursor */
1389     EDB_db      *mc_db;
1390     /** The database auxiliary record for this cursor */
1391     EDB_dbx     *mc_dbx;
1392     /** The @ref mt_dbflag for this database */
1393     unsigned char   *mc_dbflag;
1394     unsigned short  mc_snum;    /**< number of pushed pages */
1395     unsigned short  mc_top;     /**< index of top page, normally mc_snum-1 */
1396 /** @defgroup edb_cursor    Cursor Flags
1397  *  @ingroup internal
1398  *  Cursor state flags.
1399  *  @{
1400  */
1401 #define C_INITIALIZED   0x01    /**< cursor has been initialized and is valid */
1402 #define C_EOF   0x02            /**< No more data */
1403 #define C_SUB   0x04            /**< Cursor is a sub-cursor */
1404 #define C_DEL   0x08            /**< last op was a cursor_del */
1405 #define C_UNTRACK   0x40        /**< Un-track cursor when closing */
1406 #define C_WRITEMAP  EDB_TXN_WRITEMAP /**< Copy of txn flag */
1407 /** Read-only cursor into the txn's original snapshot in the map.
1408  *  Set for read-only txns, and in #edb_page_alloc() for #FREE_DBI when
1409  *  #EDB_DEVEL & 2. Only implements code which is necessary for this.
1410  */
1411 #define C_ORIG_RDONLY   EDB_TXN_RDONLY
1412 /** @} */
1413     unsigned int    mc_flags;   /**< @ref edb_cursor */
1414     EDB_page    *mc_pg[CURSOR_STACK];   /**< stack of pushed pages */
1415     indx_t      mc_ki[CURSOR_STACK];    /**< stack of page indices */
1416 #ifdef EDB_VL32
1417     EDB_page    *mc_ovpg;       /**< a referenced overflow page */
1418 #   define MC_OVPG(mc)          ((mc)->mc_ovpg)
1419 #   define MC_SET_OVPG(mc, pg)  ((mc)->mc_ovpg = (pg))
1420 #else
1421 #   define MC_OVPG(mc)          ((EDB_page *)0)
1422 #   define MC_SET_OVPG(mc, pg)  ((void)0)
1423 #endif
1424 };
1425 
1426     /** Context for sorted-dup records.
1427      *  We could have gone to a fully recursive design, with arbitrarily
1428      *  deep nesting of sub-databases. But for now we only handle these
1429      *  levels - main DB, optional sub-DB, sorted-duplicate DB.
1430      */
1431 typedef struct EDB_xcursor {
1432     /** A sub-cursor for traversing the Dup DB */
1433     EDB_cursor mx_cursor;
1434     /** The database record for this Dup DB */
1435     EDB_db  mx_db;
1436     /** The auxiliary DB record for this Dup DB */
1437     EDB_dbx mx_dbx;
1438     /** The @ref mt_dbflag for this Dup DB */
1439     unsigned char mx_dbflag;
1440 } EDB_xcursor;
1441 
1442     /** Check if there is an inited xcursor */
1443 #define XCURSOR_INITED(mc) \
1444     ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
1445 
1446     /** Update the xcursor's sub-page pointer, if any, in \b mc.  Needed
1447      *  when the node which contains the sub-page may have moved.  Called
1448      *  with leaf page \b mp = mc->mc_pg[\b top].
1449      */
1450 #define XCURSOR_REFRESH(mc, top, mp) do { \
1451     EDB_page *xr_pg = (mp); \
1452     EDB_node *xr_node; \
1453     if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \
1454     xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \
1455     if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \
1456         (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
1457 } while (0)
1458 
1459     /** State of FreeDB old pages, stored in the EDB_env */
1460 typedef struct EDB_pgstate {
1461     pgno_t      *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */
1462     txnid_t     mf_pglast;  /**< ID of last used record, or 0 if !mf_pghead */
1463 } EDB_pgstate;
1464 
1465     /** The database environment. */
1466 struct EDB_env {
1467     HANDLE      me_fd;      /**< The main data file */
1468     HANDLE      me_lfd;     /**< The lock file */
1469     HANDLE      me_mfd;     /**< For writing and syncing the meta pages */
1470 #if defined(EDB_VL32) && defined(_WIN32)
1471     HANDLE      me_fmh;     /**< File Mapping handle */
1472 #endif
1473     /** Failed to update the meta page. Probably an I/O error. */
1474 #define EDB_FATAL_ERROR 0x80000000U
1475     /** Some fields are initialized. */
1476 #define EDB_ENV_ACTIVE  0x20000000U
1477     /** me_txkey is set */
1478 #define EDB_ENV_TXKEY   0x10000000U
1479     /** fdatasync is unreliable */
1480 #define EDB_FSYNCONLY   0x08000000U
1481     uint32_t    me_flags;       /**< @ref edb_env */
1482     unsigned int    me_psize;   /**< DB page size, inited from me_os_psize */
1483     unsigned int    me_os_psize;    /**< OS page size, from #GET_PAGESIZE */
1484     unsigned int    me_maxreaders;  /**< size of the reader table */
1485     /** Max #EDB_txninfo.%mti_numreaders of interest to #edb_env_close() */
1486     volatile int    me_close_readers;
1487     EDB_dbi     me_nuedbs;      /**< number of DBs opened */
1488     EDB_dbi     me_maxdbs;      /**< size of the DB table */
1489     EDB_PID_T   me_pid;     /**< process ID of this env */
1490     char        *me_path;       /**< path to the DB files */
1491     char        *me_map;        /**< the memory map of the data file */
1492     EDB_txninfo *me_txns;       /**< the memory map of the lock file or NULL */
1493     EDB_meta    *me_metas[NUM_METAS];   /**< pointers to the two meta pages */
1494     void        *me_pbuf;       /**< scratch area for DUPSORT put() */
1495     EDB_txn     *me_txn;        /**< current write transaction */
1496     EDB_txn     *me_txn0;       /**< prealloc'd write transaction */
1497     edb_size_t  me_mapsize;     /**< size of the data memory map */
1498     off_t       me_size;        /**< current file size */
1499     pgno_t      me_maxpg;       /**< me_mapsize / me_psize */
1500     EDB_dbx     *me_dbxs;       /**< array of static DB info */
1501     uint16_t    *me_dbflags;    /**< array of flags from EDB_db.md_flags */
1502     unsigned int    *me_dbiseqs;    /**< array of dbi sequence numbers */
1503     pthread_key_t   me_txkey;   /**< thread-key for readers */
1504     txnid_t     me_pgoldest;    /**< ID of oldest reader last time we looked */
1505     EDB_pgstate me_pgstate;     /**< state of old pages from freeDB */
1506 #   define      me_pglast   me_pgstate.mf_pglast
1507 #   define      me_pghead   me_pgstate.mf_pghead
1508     EDB_page    *me_dpages;     /**< list of malloc'd blocks for re-use */
1509     /** IDL of pages that became unused in a write txn */
1510     EDB_IDL     me_free_pgs;
1511     /** ID2L of pages written during a write txn. Length EDB_IDL_UM_SIZE. */
1512     EDB_ID2L    me_dirty_list;
1513     /** Max number of freelist items that can fit in a single overflow page */
1514     int         me_maxfree_1pg;
1515     /** Max size of a node on a page */
1516     unsigned int    me_nodemax;
1517 #if !(EDB_MAXKEYSIZE)
1518     unsigned int    me_maxkey;  /**< max size of a key */
1519 #endif
1520     int     me_live_reader;     /**< have liveness lock in reader table */
1521 #ifdef _WIN32
1522     int     me_pidquery;        /**< Used in OpenProcess */
1523 #endif
1524 #ifdef EDB_USE_POSIX_MUTEX  /* Posix mutexes reside in shared mem */
1525 #   define      me_rmutex   me_txns->mti_rmutex /**< Shared reader lock */
1526 #   define      me_wmutex   me_txns->mti_wmutex /**< Shared writer lock */
1527 #else
1528     edb_mutex_t me_rmutex;
1529     edb_mutex_t me_wmutex;
1530 # if defined(_WIN32) || defined(EDB_USE_POSIX_SEM)
1531     /** Half-initialized name of mutexes, to be completed by #MUTEXNAME() */
1532     char        me_mutexname[sizeof(MUTEXNAME_PREFIX) + 11];
1533 # endif
1534 #endif
1535 #ifdef EDB_VL32
1536     EDB_ID3L    me_rpages;  /**< like #mt_rpages, but global to env */
1537     pthread_mutex_t me_rpmutex; /**< control access to #me_rpages */
1538 #define EDB_ERPAGE_SIZE 16384
1539 #define EDB_ERPAGE_MAX  (EDB_ERPAGE_SIZE-1)
1540     unsigned int me_rpcheck;
1541 #endif
1542     void        *me_userctx;     /**< User-settable context */
1543     EDB_assert_func *me_assert_func; /**< Callback for assertion failures */
1544 };
1545 
1546     /** Nested transaction */
1547 typedef struct EDB_ntxn {
1548     EDB_txn     mnt_txn;        /**< the transaction */
1549     EDB_pgstate mnt_pgstate;    /**< parent transaction's saved freestate */
1550 } EDB_ntxn;
1551 
1552     /** max number of pages to commit in one writev() call */
1553 #define EDB_COMMIT_PAGES     64
1554 #if defined(IOV_MAX) && IOV_MAX < EDB_COMMIT_PAGES
1555 #undef EDB_COMMIT_PAGES
1556 #define EDB_COMMIT_PAGES    IOV_MAX
1557 #endif
1558 
1559     /** max bytes to write in one call */
1560 #define MAX_WRITE       (0x40000000U >> (sizeof(ssize_t) == 4))
1561 
1562     /** Check \b txn and \b dbi arguments to a function */
1563 #define TXN_DBI_EXIST(txn, dbi, validity) \
1564     ((txn) && (dbi)<(txn)->mt_nuedbs && ((txn)->mt_dbflags[dbi] & (validity)))
1565 
1566     /** Check for misused \b dbi handles */
1567 #define TXN_DBI_CHANGED(txn, dbi) \
1568     ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1569 
1570 static int  edb_page_alloc(EDB_cursor *mc, int num, EDB_page **mp);
1571 static int  edb_page_new(EDB_cursor *mc, uint32_t flags, int num, EDB_page **mp);
1572 static int  edb_page_touch(EDB_cursor *mc);
1573 
1574 #define EDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \
1575     "reset-tmp", "fail-begin", "fail-beginchild"}
1576 enum {
1577     /* edb_txn_end operation number, for logging */
1578     EDB_END_COMMITTED, EDB_END_EMPTY_COMMIT, EDB_END_ABORT, EDB_END_RESET,
1579     EDB_END_RESET_TMP, EDB_END_FAIL_BEGIN, EDB_END_FAIL_BEGINCHILD
1580 };
1581 #define EDB_END_OPMASK  0x0F    /**< mask for #edb_txn_end() operation number */
1582 #define EDB_END_UPDATE  0x10    /**< update env state (DBIs) */
1583 #define EDB_END_FREE    0x20    /**< free txn unless it is #EDB_env.%me_txn0 */
1584 #define EDB_END_SLOT EDB_NOTLS  /**< release any reader slot if #EDB_NOTLS */
1585 static void edb_txn_end(EDB_txn *txn, unsigned mode);
1586 
1587 static int  edb_page_get(EDB_cursor *mc, pgno_t pgno, EDB_page **mp, int *lvl);
1588 static int  edb_page_search_root(EDB_cursor *mc,
1589                 EDB_val *key, int modify);
1590 #define EDB_PS_MODIFY   1
1591 #define EDB_PS_ROOTONLY 2
1592 #define EDB_PS_FIRST    4
1593 #define EDB_PS_LAST     8
1594 static int  edb_page_search(EDB_cursor *mc,
1595                 EDB_val *key, int flags);
1596 static int  edb_page_merge(EDB_cursor *csrc, EDB_cursor *cdst);
1597 
1598 #define EDB_SPLIT_REPLACE   EDB_APPENDDUP   /**< newkey is not new */
1599 static int  edb_page_split(EDB_cursor *mc, EDB_val *newkey, EDB_val *newdata,
1600                 pgno_t newpgno, unsigned int nflags);
1601 
1602 static int  edb_env_read_header(EDB_env *env, int prev, EDB_meta *meta);
1603 static EDB_meta *edb_env_pick_meta(const EDB_env *env);
1604 static int  edb_env_write_meta(EDB_txn *txn);
1605 #ifdef EDB_USE_POSIX_MUTEX /* Drop unused excl arg */
1606 # define edb_env_close0(env, excl) edb_env_close1(env)
1607 #endif
1608 static void edb_env_close0(EDB_env *env, int excl);
1609 
1610 static EDB_node *edb_node_search(EDB_cursor *mc, EDB_val *key, int *exactp);
1611 static int  edb_node_add(EDB_cursor *mc, indx_t indx,
1612                 EDB_val *key, EDB_val *data, pgno_t pgno, unsigned int flags);
1613 static void edb_node_del(EDB_cursor *mc, int ksize);
1614 static void edb_node_shrink(EDB_page *mp, indx_t indx);
1615 static int  edb_node_move(EDB_cursor *csrc, EDB_cursor *cdst, int fromleft);
1616 static int  edb_node_read(EDB_cursor *mc, EDB_node *leaf, EDB_val *data);
1617 static size_t   edb_leaf_size(EDB_env *env, EDB_val *key, EDB_val *data);
1618 static size_t   edb_branch_size(EDB_env *env, EDB_val *key);
1619 
1620 static int  edb_rebalance(EDB_cursor *mc);
1621 static int  edb_update_key(EDB_cursor *mc, EDB_val *key);
1622 
1623 static void edb_cursor_pop(EDB_cursor *mc);
1624 static int  edb_cursor_push(EDB_cursor *mc, EDB_page *mp);
1625 
1626 static int  edb_cursor_del0(EDB_cursor *mc);
1627 static int  edb_del0(EDB_txn *txn, EDB_dbi dbi, EDB_val *key, EDB_val *data, unsigned flags);
1628 static int  edb_cursor_sibling(EDB_cursor *mc, int move_right);
1629 static int  edb_cursor_next(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op);
1630 static int  edb_cursor_prev(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op);
1631 static int  edb_cursor_set(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op,
1632                 int *exactp);
1633 static int  edb_cursor_first(EDB_cursor *mc, EDB_val *key, EDB_val *data);
1634 static int  edb_cursor_last(EDB_cursor *mc, EDB_val *key, EDB_val *data);
1635 
1636 static void edb_cursor_init(EDB_cursor *mc, EDB_txn *txn, EDB_dbi dbi, EDB_xcursor *mx);
1637 static void edb_xcursor_init0(EDB_cursor *mc);
1638 static void edb_xcursor_init1(EDB_cursor *mc, EDB_node *node);
1639 static void edb_xcursor_init2(EDB_cursor *mc, EDB_xcursor *src_mx, int force);
1640 
1641 static int  edb_drop0(EDB_cursor *mc, int subs);
1642 static void edb_default_cmp(EDB_txn *txn, EDB_dbi dbi);
1643 static int edb_reader_check0(EDB_env *env, int rlocked, int *dead);
1644 
1645 /** @cond */
1646 static EDB_cmp_func edb_cmp_memn, edb_cmp_memnr, edb_cmp_int, edb_cmp_cint, edb_cmp_long;
1647 /** @endcond */
1648 
1649 /** Compare two items pointing at '#edb_size_t's of unknown alignment. */
1650 #ifdef MISALIGNED_OK
1651 # define edb_cmp_clong edb_cmp_long
1652 #else
1653 # define edb_cmp_clong edb_cmp_cint
1654 #endif
1655 
1656 /** True if we need #edb_cmp_clong() instead of \b cmp for #EDB_INTEGERDUP */
1657 #define NEED_CMP_CLONG(cmp, ksize) \
1658     (UINT_MAX < EDB_SIZE_MAX && \
1659      (cmp) == edb_cmp_int && (ksize) == sizeof(edb_size_t))
1660 
1661 #ifdef _WIN32
1662 static SECURITY_DESCRIPTOR edb_null_sd;
1663 static SECURITY_ATTRIBUTES edb_all_sa;
1664 static int edb_sec_inited;
1665 
1666 struct EDB_name;
1667 static int utf8_to_utf16(const char *src, struct EDB_name *dst, int xtra);
1668 #endif
1669 
1670 /** Return the library version info. */
1671 char * ESECT
1672 edb_version(int *major, int *minor, int *patch)
1673 {
1674     if (major) *major = EDB_VERSION_MAJOR;
1675     if (minor) *minor = EDB_VERSION_MINOR;
1676     if (patch) *patch = EDB_VERSION_PATCH;
1677     return EDB_VERSION_STRING;
1678 }
1679 
1680 /** Table of descriptions for EXDB @ref errors */
1681 static char *const edb_errstr[] = {
1682     "EDB_KEYEXIST: Key/data pair already exists",
1683     "EDB_NOTFOUND: No matching key/data pair found",
1684     "EDB_PAGE_NOTFOUND: Requested page not found",
1685     "EDB_CORRUPTED: Located page was wrong type",
1686     "EDB_PANIC: Update of meta page failed or environment had fatal error",
1687     "EDB_VERSION_MISMATCH: Database environment version mismatch",
1688     "EDB_INVALID: File is not an EXDB file",
1689     "EDB_MAP_FULL: Environment mapsize limit reached",
1690     "EDB_DBS_FULL: Environment maxdbs limit reached",
1691     "EDB_READERS_FULL: Environment maxreaders limit reached",
1692     "EDB_TLS_FULL: Thread-local storage keys full - too many environments open",
1693     "EDB_TXN_FULL: Transaction has too many dirty pages - transaction too big",
1694     "EDB_CURSOR_FULL: Internal error - cursor stack limit reached",
1695     "EDB_PAGE_FULL: Internal error - page has no more space",
1696     "EDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
1697     "EDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed",
1698     "EDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
1699     "EDB_BAD_TXN: Transaction must abort, has a child, or is invalid",
1700     "EDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
1701     "EDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
1702     "EDB_PROBLEM: Unexpected problem - txn should abort",
1703 };
1704 
1705 char *
1706 edb_strerror(int err)
1707 {
1708 #ifdef _WIN32
1709     /** HACK: pad 4KB on stack over the buf. Return system msgs in buf.
1710      *  This works as long as no function between the call to edb_strerror
1711      *  and the actual use of the message uses more than 4K of stack.
1712      */
1713 #define MSGSIZE 1024
1714 #define PADSIZE 4096
1715     char buf[MSGSIZE+PADSIZE], *ptr = buf;
1716 #endif
1717     int i;
1718     if (!err)
1719         return ("Successful return: 0");
1720 
1721     if (err >= EDB_KEYEXIST && err <= EDB_LAST_ERRCODE) {
1722         i = err - EDB_KEYEXIST;
1723         return edb_errstr[i];
1724     }
1725 
1726 #ifdef _WIN32
1727     /* These are the C-runtime error codes we use. The comment indicates
1728      * their numeric value, and the Win32 error they would correspond to
1729      * if the error actually came from a Win32 API. A major mess, we should
1730      * have used EXDB-specific error codes for everything.
1731      */
1732     switch(err) {
1733     case ENOENT:    /* 2, FILE_NOT_FOUND */
1734     case EIO:       /* 5, ACCESS_DENIED */
1735     case ENOMEM:    /* 12, INVALID_ACCESS */
1736     case EACCES:    /* 13, INVALID_DATA */
1737     case EBUSY:     /* 16, CURRENT_DIRECTORY */
1738     case EINVAL:    /* 22, BAD_COMMAND */
1739     case ENOSPC:    /* 28, OUT_OF_PAPER */
1740         return strerror(err);
1741     default:
1742         ;
1743     }
1744     buf[0] = 0;
1745     FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM |
1746         FORMAT_MESSAGE_IGNORE_INSERTS,
1747         NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE);
1748     return ptr;
1749 #else
1750     return strerror(err);
1751 #endif
1752 }
1753 
1754 /** assert(3) variant in cursor context */
1755 #define edb_cassert(mc, expr)   edb_assert0((mc)->mc_txn->mt_env, expr, #expr)
1756 /** assert(3) variant in transaction context */
1757 #define edb_tassert(txn, expr)  edb_assert0((txn)->mt_env, expr, #expr)
1758 /** assert(3) variant in environment context */
1759 #define edb_eassert(env, expr)  edb_assert0(env, expr, #expr)
1760 
1761 #ifndef NDEBUG
1762 # define edb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
1763         edb_assert_fail(env, expr_txt, edb_func_, __FILE__, __LINE__))
1764 
1765 static void ESECT
1766 edb_assert_fail(EDB_env *env, const char *expr_txt,
1767     const char *func, const char *file, int line)
1768 {
1769     char buf[400];
1770     sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
1771         file, line, expr_txt, func);
1772     if (env->me_assert_func)
1773         env->me_assert_func(env, buf);
1774     fprintf(stderr, "%s\n", buf);
1775     abort();
1776 }
1777 #else
1778 # define edb_assert0(env, expr, expr_txt) ((void) 0)
1779 #endif /* NDEBUG */
1780 
1781 #if EDB_DEBUG
1782 /** Return the page number of \b mp which may be sub-page, for debug output */
1783 static pgno_t
1784 edb_dbg_pgno(EDB_page *mp)
1785 {
1786     pgno_t ret;
1787     COPY_PGNO(ret, mp->mp_pgno);
1788     return ret;
1789 }
1790 
1791 /** Display a key in hexadecimal and return the address of the result.
1792  * @param[in] key the key to display
1793  * @param[in] buf the buffer to write into. Should always be #DKBUF.
1794  * @return The key in hexadecimal form.
1795  */
1796 char *
1797 edb_dkey(EDB_val *key, char *buf)
1798 {
1799     char *ptr = buf;
1800     unsigned char *c = key->mv_data;
1801     unsigned int i;
1802 
1803     if (!key)
1804         return "";
1805 
1806     if (key->mv_size > DKBUF_MAXKEYSIZE)
1807         return "EDB_MAXKEYSIZE";
1808     /* may want to make this a dynamic check: if the key is mostly
1809      * printable characters, print it as-is instead of converting to hex.
1810      */
1811 #if 1
1812     buf[0] = '\0';
1813     for (i=0; i<key->mv_size; i++)
1814         ptr += sprintf(ptr, "%02x", *c++);
1815 #else
1816     sprintf(buf, "%.*s", key->mv_size, key->mv_data);
1817 #endif
1818     return buf;
1819 }
1820 
1821 static const char *
1822 edb_leafnode_type(EDB_node *n)
1823 {
1824     static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
1825     return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
1826         tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
1827 }
1828 
1829 /** Display all the keys in the page. */
1830 void
1831 edb_page_list(EDB_page *mp)
1832 {
1833     pgno_t pgno = edb_dbg_pgno(mp);
1834     const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "";
1835     EDB_node *node;
1836     unsigned int i, nkeys, nsize, total = 0;
1837     EDB_val key;
1838     DKBUF;
1839 
1840     switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) {
1841     case P_BRANCH:              type = "Branch page";       break;
1842     case P_LEAF:                type = "Leaf page";         break;
1843     case P_LEAF|P_SUBP:         type = "Sub-page";          break;
1844     case P_LEAF|P_LEAF2:        type = "LEAF2 page";        break;
1845     case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page";    break;
1846     case P_OVERFLOW:
1847         fprintf(stderr, "Overflow page %"Yu" pages %u%s\n",
1848             pgno, mp->mp_pages, state);
1849         return;
1850     case P_META:
1851         fprintf(stderr, "Meta-page %"Yu" txnid %"Yu"\n",
1852             pgno, ((EDB_meta *)METADATA(mp))->mm_txnid);
1853         return;
1854     default:
1855         fprintf(stderr, "Bad page %"Yu" flags 0x%X\n", pgno, mp->mp_flags);
1856         return;
1857     }
1858 
1859     nkeys = NUMKEYS(mp);
1860     fprintf(stderr, "%s %"Yu" numkeys %d%s\n", type, pgno, nkeys, state);
1861 
1862     for (i=0; i<nkeys; i++) {
1863         if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
1864             key.mv_size = nsize = mp->mp_pad;
1865             key.mv_data = LEAF2KEY(mp, i, nsize);
1866             total += nsize;
1867             fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
1868             continue;
1869         }
1870         node = NODEPTR(mp, i);
1871         key.mv_size = node->mn_ksize;
1872         key.mv_data = node->mn_data;
1873         nsize = NODESIZE + key.mv_size;
1874         if (IS_BRANCH(mp)) {
1875             fprintf(stderr, "key %d: page %"Yu", %s\n", i, NODEPGNO(node),
1876                 DKEY(&key));
1877             total += nsize;
1878         } else {
1879             if (F_ISSET(node->mn_flags, F_BIGDATA))
1880                 nsize += sizeof(pgno_t);
1881             else
1882                 nsize += NODEDSZ(node);
1883             total += nsize;
1884             nsize += sizeof(indx_t);
1885             fprintf(stderr, "key %d: nsize %d, %s%s\n",
1886                 i, nsize, DKEY(&key), edb_leafnode_type(node));
1887         }
1888         total = EVEN(total);
1889     }
1890     fprintf(stderr, "Total: header %d + contents %d + unused %d\n",
1891         IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp));
1892 }
1893 
1894 void
1895 edb_cursor_chk(EDB_cursor *mc)
1896 {
1897     unsigned int i;
1898     EDB_node *node;
1899     EDB_page *mp;
1900 
1901     if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return;
1902     for (i=0; i<mc->mc_top; i++) {
1903         mp = mc->mc_pg[i];
1904         node = NODEPTR(mp, mc->mc_ki[i]);
1905         if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
1906             printf("oops!\n");
1907     }
1908     if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
1909         printf("ack!\n");
1910     if (XCURSOR_INITED(mc)) {
1911         node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
1912         if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) &&
1913             mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) {
1914             printf("blah!\n");
1915         }
1916     }
1917 }
1918 #endif
1919 
1920 #if (EDB_DEBUG) > 2
1921 /** Count all the pages in each DB and in the freelist
1922  *  and make sure it matches the actual number of pages
1923  *  being used.
1924  *  All named DBs must be open for a correct count.
1925  */
1926 static void edb_audit(EDB_txn *txn)
1927 {
1928     EDB_cursor mc;
1929     EDB_val key, data;
1930     EDB_ID freecount, count;
1931     EDB_dbi i;
1932     int rc;
1933 
1934     freecount = 0;
1935     edb_cursor_init(&mc, txn, FREE_DBI, NULL);
1936     while ((rc = edb_cursor_get(&mc, &key, &data, EDB_NEXT)) == 0)
1937         freecount += *(EDB_ID *)data.mv_data;
1938     edb_tassert(txn, rc == EDB_NOTFOUND);
1939 
1940     count = 0;
1941     for (i = 0; i<txn->mt_nuedbs; i++) {
1942         EDB_xcursor mx;
1943         if (!(txn->mt_dbflags[i] & DB_VALID))
1944             continue;
1945         edb_cursor_init(&mc, txn, i, &mx);
1946         if (txn->mt_dbs[i].md_root == P_INVALID)
1947             continue;
1948         count += txn->mt_dbs[i].md_branch_pages +
1949             txn->mt_dbs[i].md_leaf_pages +
1950             txn->mt_dbs[i].md_overflow_pages;
1951         if (txn->mt_dbs[i].md_flags & EDB_DUPSORT) {
1952             rc = edb_page_search(&mc, NULL, EDB_PS_FIRST);
1953             for (; rc == EDB_SUCCESS; rc = edb_cursor_sibling(&mc, 1)) {
1954                 unsigned j;
1955                 EDB_page *mp;
1956                 mp = mc.mc_pg[mc.mc_top];
1957                 for (j=0; j<NUMKEYS(mp); j++) {
1958                     EDB_node *leaf = NODEPTR(mp, j);
1959                     if (leaf->mn_flags & F_SUBDATA) {
1960                         EDB_db db;
1961                         memcpy(&db, NODEDATA(leaf), sizeof(db));
1962                         count += db.md_branch_pages + db.md_leaf_pages +
1963                             db.md_overflow_pages;
1964                     }
1965                 }
1966             }
1967             edb_tassert(txn, rc == EDB_NOTFOUND);
1968         }
1969     }
1970     if (freecount + count + NUM_METAS != txn->mt_next_pgno) {
1971         fprintf(stderr, "audit: %"Yu" freecount: %"Yu" count: %"Yu" total: %"Yu" next_pgno: %"Yu"\n",
1972             txn->mt_txnid, freecount, count+NUM_METAS,
1973             freecount+count+NUM_METAS, txn->mt_next_pgno);
1974     }
1975 }
1976 #endif
1977 
1978 int
1979 edb_cmp(EDB_txn *txn, EDB_dbi dbi, const EDB_val *a, const EDB_val *b)
1980 {
1981     return txn->mt_dbxs[dbi].md_cmp(a, b);
1982 }
1983 
1984 int
1985 edb_dcmp(EDB_txn *txn, EDB_dbi dbi, const EDB_val *a, const EDB_val *b)
1986 {
1987     EDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp;
1988     if (NEED_CMP_CLONG(dcmp, a->mv_size))
1989         dcmp = edb_cmp_clong;
1990     return dcmp(a, b);
1991 }
1992 
1993 /** Allocate memory for a page.
1994  * Re-use old malloc'd pages first for singletons, otherwise just malloc.
1995  * Set #EDB_TXN_ERROR on failure.
1996  */
1997 static EDB_page *
1998 edb_page_malloc(EDB_txn *txn, unsigned num)
1999 {
2000     EDB_env *env = txn->mt_env;
2001     EDB_page *ret = env->me_dpages;
2002     size_t psize = env->me_psize, sz = psize, off;
2003     /* For ! #EDB_NOMEMINIT, psize counts how much to init.
2004      * For a single page alloc, we init everything after the page header.
2005      * For multi-page, we init the final page; if the caller needed that
2006      * many pages they will be filling in at least up to the last page.
2007      */
2008     if (num == 1) {
2009         if (ret) {
2010             VGMEMP_ALLOC(env, ret, sz);
2011             VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
2012             env->me_dpages = ret->mp_next;
2013             return ret;
2014         }
2015         psize -= off = PAGEHDRSZ;
2016     } else {
2017         sz *= num;
2018         off = sz - psize;
2019     }
2020     if ((ret = malloc(sz)) != NULL) {
2021         VGMEMP_ALLOC(env, ret, sz);
2022         if (!(env->me_flags & EDB_NOMEMINIT)) {
2023             memset((char *)ret + off, 0, psize);
2024             ret->mp_pad = 0;
2025         }
2026     } else {
2027         txn->mt_flags |= EDB_TXN_ERROR;
2028     }
2029     return ret;
2030 }
2031 /** Free a single page.
2032  * Saves single pages to a list, for future reuse.
2033  * (This is not used for multi-page overflow pages.)
2034  */
2035 static void
2036 edb_page_free(EDB_env *env, EDB_page *mp)
2037 {
2038     mp->mp_next = env->me_dpages;
2039     VGMEMP_FREE(env, mp);
2040     env->me_dpages = mp;
2041 }
2042 
2043 /** Free a dirty page */
2044 static void
2045 edb_dpage_free(EDB_env *env, EDB_page *dp)
2046 {
2047     if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
2048         edb_page_free(env, dp);
2049     } else {
2050         /* large pages just get freed directly */
2051         VGMEMP_FREE(env, dp);
2052         free(dp);
2053     }
2054 }
2055 
2056 /** Return all dirty pages to dpage list */
2057 static void
2058 edb_dlist_free(EDB_txn *txn)
2059 {
2060     EDB_env *env = txn->mt_env;
2061     EDB_ID2L dl = txn->mt_u.dirty_list;
2062     unsigned i, n = dl[0].mid;
2063 
2064     for (i = 1; i <= n; i++) {
2065         edb_dpage_free(env, dl[i].mptr);
2066     }
2067     dl[0].mid = 0;
2068 }
2069 
2070 #ifdef EDB_VL32
2071 static void
2072 edb_page_unref(EDB_txn *txn, EDB_page *mp)
2073 {
2074     pgno_t pgno;
2075     EDB_ID3L tl = txn->mt_rpages;
2076     unsigned x, rem;
2077     if (mp->mp_flags & (P_SUBP|P_DIRTY))
2078         return;
2079     rem = mp->mp_pgno & (EDB_RPAGE_CHUNK-1);
2080     pgno = mp->mp_pgno ^ rem;
2081     x = edb_mid3l_search(tl, pgno);
2082     if (x != tl[0].mid && tl[x+1].mid == mp->mp_pgno)
2083         x++;
2084     if (tl[x].mref)
2085         tl[x].mref--;
2086 }
2087 #define EDB_PAGE_UNREF(txn, mp) edb_page_unref(txn, mp)
2088 
2089 static void
2090 edb_cursor_unref(EDB_cursor *mc)
2091 {
2092     int i;
2093     if (mc->mc_txn->mt_rpages[0].mid) {
2094         if (!mc->mc_snum || !mc->mc_pg[0] || IS_SUBP(mc->mc_pg[0]))
2095             return;
2096         for (i=0; i<mc->mc_snum; i++)
2097             edb_page_unref(mc->mc_txn, mc->mc_pg[i]);
2098         if (mc->mc_ovpg) {
2099             edb_page_unref(mc->mc_txn, mc->mc_ovpg);
2100             mc->mc_ovpg = 0;
2101         }
2102     }
2103     mc->mc_snum = mc->mc_top = 0;
2104     mc->mc_pg[0] = NULL;
2105     mc->mc_flags &= ~C_INITIALIZED;
2106 }
2107 #define EDB_CURSOR_UNREF(mc, force) \
2108     (((force) || ((mc)->mc_flags & C_INITIALIZED)) \
2109      ? edb_cursor_unref(mc) \
2110      : (void)0)
2111 
2112 #else
2113 #define EDB_PAGE_UNREF(txn, mp)
2114 #define EDB_CURSOR_UNREF(mc, force) ((void)0)
2115 #endif /* EDB_VL32 */
2116 
2117 /** Loosen or free a single page.
2118  * Saves single pages to a list for future reuse
2119  * in this same txn. It has been pulled from the freeDB
2120  * and already resides on the dirty list, but has been
2121  * deleted. Use these pages first before pulling again
2122  * from the freeDB.
2123  *
2124  * If the page wasn't dirtied in this txn, just add it
2125  * to this txn's free list.
2126  */
2127 static int
2128 edb_page_loose(EDB_cursor *mc, EDB_page *mp)
2129 {
2130     int loose = 0;
2131     pgno_t pgno = mp->mp_pgno;
2132     EDB_txn *txn = mc->mc_txn;
2133 
2134     if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
2135         if (txn->mt_parent) {
2136             EDB_ID2 *dl = txn->mt_u.dirty_list;
2137             /* If txn has a parent, make sure the page is in our
2138              * dirty list.
2139              */
2140             if (dl[0].mid) {
2141                 unsigned x = edb_mid2l_search(dl, pgno);
2142                 if (x <= dl[0].mid && dl[x].mid == pgno) {
2143                     if (mp != dl[x].mptr) { /* bad cursor? */
2144                         mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
2145                         txn->mt_flags |= EDB_TXN_ERROR;
2146                         return EDB_PROBLEM;
2147                     }
2148                     /* ok, it's ours */
2149                     loose = 1;
2150                 }
2151             }
2152         } else {
2153             /* no parent txn, so it's just ours */
2154             loose = 1;
2155         }
2156     }
2157     if (loose) {
2158         DPRINTF(("loosen db %d page %"Yu, DDBI(mc), mp->mp_pgno));
2159         NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
2160         txn->mt_loose_pgs = mp;
2161         txn->mt_loose_count++;
2162         mp->mp_flags |= P_LOOSE;
2163     } else {
2164         int rc = edb_eidl_append(&txn->mt_free_pgs, pgno);
2165         if (rc)
2166             return rc;
2167     }
2168 
2169     return EDB_SUCCESS;
2170 }
2171 
2172 /** Set or clear P_KEEP in dirty, non-overflow, non-sub pages watched by txn.
2173  * @param[in] mc A cursor handle for the current operation.
2174  * @param[in] pflags Flags of the pages to update:
2175  * P_DIRTY to set P_KEEP, P_DIRTY|P_KEEP to clear it.
2176  * @param[in] all No shortcuts. Needed except after a full #edb_page_flush().
2177  * @return 0 on success, non-zero on failure.
2178  */
2179 static int
2180 edb_pages_xkeep(EDB_cursor *mc, unsigned pflags, int all)
2181 {
2182     enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
2183     EDB_txn *txn = mc->mc_txn;
2184     EDB_cursor *m3, *m0 = mc;
2185     EDB_xcursor *mx;
2186     EDB_page *dp, *mp;
2187     EDB_node *leaf;
2188     unsigned i, j;
2189     int rc = EDB_SUCCESS, level;
2190 
2191     /* Mark pages seen by cursors: First m0, then tracked cursors */
2192     for (i = txn->mt_nuedbs;; ) {
2193         if (mc->mc_flags & C_INITIALIZED) {
2194             for (m3 = mc;; m3 = &mx->mx_cursor) {
2195                 mp = NULL;
2196                 for (j=0; j<m3->mc_snum; j++) {
2197                     mp = m3->mc_pg[j];
2198                     if ((mp->mp_flags & Mask) == pflags)
2199                         mp->mp_flags ^= P_KEEP;
2200                 }
2201                 mx = m3->mc_xcursor;
2202                 /* Proceed to mx if it is at a sub-database */
2203                 if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
2204                     break;
2205                 if (! (mp && (mp->mp_flags & P_LEAF)))
2206                     break;
2207                 leaf = NODEPTR(mp, m3->mc_ki[j-1]);
2208                 if (!(leaf->mn_flags & F_SUBDATA))
2209                     break;
2210             }
2211         }
2212         mc = mc->mc_next;
2213         for (; !mc || mc == m0; mc = txn->mt_cursors[--i])
2214             if (i == 0)
2215                 goto mark_done;
2216     }
2217 
2218 mark_done:
2219     if (all) {
2220         /* Mark dirty root pages */
2221         for (i=0; i<txn->mt_nuedbs; i++) {
2222             if (txn->mt_dbflags[i] & DB_DIRTY) {
2223                 pgno_t pgno = txn->mt_dbs[i].md_root;
2224                 if (pgno == P_INVALID)
2225                     continue;
2226                 if ((rc = edb_page_get(m0, pgno, &dp, &level)) != EDB_SUCCESS)
2227                     break;
2228                 if ((dp->mp_flags & Mask) == pflags && level <= 1)
2229                     dp->mp_flags ^= P_KEEP;
2230             }
2231         }
2232     }
2233 
2234     return rc;
2235 }
2236 
2237 static int edb_page_flush(EDB_txn *txn, int keep);
2238 
2239 /** Spill pages from the dirty list back to disk.
2240  * This is intended to prevent running into #EDB_TXN_FULL situations,
2241  * but note that they may still occur in a few cases:
2242  *  1) our estimate of the txn size could be too small. Currently this
2243  *   seems unlikely, except with a large number of #EDB_MULTIPLE items.
2244  *  2) child txns may run out of space if their parents dirtied a
2245  *   lot of pages and never spilled them. TODO: we probably should do
2246  *   a preemptive spill during #edb_txn_begin() of a child txn, if
2247  *   the parent's dirty_room is below a given threshold.
2248  *
2249  * Otherwise, if not using nested txns, it is expected that apps will
2250  * not run into #EDB_TXN_FULL any more. The pages are flushed to disk
2251  * the same way as for a txn commit, e.g. their P_DIRTY flag is cleared.
2252  * If the txn never references them again, they can be left alone.
2253  * If the txn only reads them, they can be used without any fuss.
2254  * If the txn writes them again, they can be dirtied immediately without
2255  * going thru all of the work of #edb_page_touch(). Such references are
2256  * handled by #edb_page_unspill().
2257  *
2258  * Also note, we never spill DB root pages, nor pages of active cursors,
2259  * because we'll need these back again soon anyway. And in nested txns,
2260  * we can't spill a page in a child txn if it was already spilled in a
2261  * parent txn. That would alter the parent txns' data even though
2262  * the child hasn't committed yet, and we'd have no way to undo it if
2263  * the child aborted.
2264  *
2265  * @param[in] m0 cursor A cursor handle identifying the transaction and
2266  *  database for which we are checking space.
2267  * @param[in] key For a put operation, the key being stored.
2268  * @param[in] data For a put operation, the data being stored.
2269  * @return 0 on success, non-zero on failure.
2270  */
2271 static int
2272 edb_page_spill(EDB_cursor *m0, EDB_val *key, EDB_val *data)
2273 {
2274     EDB_txn *txn = m0->mc_txn;
2275     EDB_page *dp;
2276     EDB_ID2L dl = txn->mt_u.dirty_list;
2277     unsigned int i, j, need;
2278     int rc;
2279 
2280     if (m0->mc_flags & C_SUB)
2281         return EDB_SUCCESS;
2282 
2283     /* Estimate how much space this op will take */
2284     i = m0->mc_db->md_depth;
2285     /* Named DBs also dirty the main DB */
2286     if (m0->mc_dbi >= CORE_DBS)
2287         i += txn->mt_dbs[MAIN_DBI].md_depth;
2288     /* For puts, roughly factor in the key+data size */
2289     if (key)
2290         i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
2291     i += i; /* double it for good measure */
2292     need = i;
2293 
2294     if (txn->mt_dirty_room > i)
2295         return EDB_SUCCESS;
2296 
2297     if (!txn->mt_spill_pgs) {
2298         txn->mt_spill_pgs = edb_eidl_alloc(EDB_IDL_UM_MAX);
2299         if (!txn->mt_spill_pgs)
2300             return ENOMEM;
2301     } else {
2302         /* purge deleted slots */
2303         EDB_IDL sl = txn->mt_spill_pgs;
2304         unsigned int num = sl[0];
2305         j=0;
2306         for (i=1; i<=num; i++) {
2307             if (!(sl[i] & 1))
2308                 sl[++j] = sl[i];
2309         }
2310         sl[0] = j;
2311     }
2312 
2313     /* Preserve pages which may soon be dirtied again */
2314     if ((rc = edb_pages_xkeep(m0, P_DIRTY, 1)) != EDB_SUCCESS)
2315         goto done;
2316 
2317     /* Less aggressive spill - we originally spilled the entire dirty list,
2318      * with a few exceptions for cursor pages and DB root pages. But this
2319      * turns out to be a lot of wasted effort because in a large txn many
2320      * of those pages will need to be used again. So now we spill only 1/8th
2321      * of the dirty pages. Testing revealed this to be a good tradeoff,
2322      * better than 1/2, 1/4, or 1/10.
2323      */
2324     if (need < EDB_IDL_UM_MAX / 8)
2325         need = EDB_IDL_UM_MAX / 8;
2326 
2327     /* Save the page IDs of all the pages we're flushing */
2328     /* flush from the tail forward, this saves a lot of shifting later on. */
2329     for (i=dl[0].mid; i && need; i--) {
2330         EDB_ID pn = dl[i].mid << 1;
2331         dp = dl[i].mptr;
2332         if (dp->mp_flags & (P_LOOSE|P_KEEP))
2333             continue;
2334         /* Can't spill twice, make sure it's not already in a parent's
2335          * spill list.
2336          */
2337         if (txn->mt_parent) {
2338             EDB_txn *tx2;
2339             for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
2340                 if (tx2->mt_spill_pgs) {
2341                     j = edb_eidl_search(tx2->mt_spill_pgs, pn);
2342                     if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
2343                         dp->mp_flags |= P_KEEP;
2344                         break;
2345                     }
2346                 }
2347             }
2348             if (tx2)
2349                 continue;
2350         }
2351         if ((rc = edb_eidl_append(&txn->mt_spill_pgs, pn)))
2352             goto done;
2353         need--;
2354     }
2355     edb_eidl_sort(txn->mt_spill_pgs);
2356 
2357     /* Flush the spilled part of dirty list */
2358     if ((rc = edb_page_flush(txn, i)) != EDB_SUCCESS)
2359         goto done;
2360 
2361     /* Reset any dirty pages we kept that page_flush didn't see */
2362     rc = edb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
2363 
2364 done:
2365     txn->mt_flags |= rc ? EDB_TXN_ERROR : EDB_TXN_SPILLS;
2366     return rc;
2367 }
2368 
2369 /** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */
2370 static txnid_t
2371 edb_find_oldest(EDB_txn *txn)
2372 {
2373     int i;
2374     txnid_t mr, oldest = txn->mt_txnid - 1;
2375     if (txn->mt_env->me_txns) {
2376         EDB_reader *r = txn->mt_env->me_txns->mti_readers;
2377         for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
2378             if (r[i].mr_pid) {
2379                 mr = r[i].mr_txnid;
2380                 if (oldest > mr)
2381                     oldest = mr;
2382             }
2383         }
2384     }
2385     return oldest;
2386 }
2387 
2388 /** Add a page to the txn's dirty list */
2389 static void
2390 edb_page_dirty(EDB_txn *txn, EDB_page *mp)
2391 {
2392     EDB_ID2 mid;
2393     int rc, (*insert)(EDB_ID2L, EDB_ID2 *);
2394 
2395     if (txn->mt_flags & EDB_TXN_WRITEMAP) {
2396         insert = edb_mid2l_append;
2397     } else {
2398         insert = edb_mid2l_insert;
2399     }
2400     mid.mid = mp->mp_pgno;
2401     mid.mptr = mp;
2402     rc = insert(txn->mt_u.dirty_list, &mid);
2403     edb_tassert(txn, rc == 0);
2404     txn->mt_dirty_room--;
2405 }
2406 
2407 /** Allocate page numbers and memory for writing.  Maintain me_pglast,
2408  * me_pghead and mt_next_pgno.  Set #EDB_TXN_ERROR on failure.
2409  *
2410  * If there are free pages available from older transactions, they
2411  * are re-used first. Otherwise allocate a new page at mt_next_pgno.
2412  * Do not modify the freedB, just merge freeDB records into me_pghead[]
2413  * and move me_pglast to say which records were consumed.  Only this
2414  * function can create me_pghead and move me_pglast/mt_next_pgno.
2415  * When #EDB_DEVEL & 2, it is not affected by #edb_freelist_save(): it
2416  * then uses the transaction's original snapshot of the freeDB.
2417  * @param[in] mc cursor A cursor handle identifying the transaction and
2418  *  database for which we are allocating.
2419  * @param[in] num the number of pages to allocate.
2420  * @param[out] mp Address of the allocated page(s). Requests for multiple pages
2421  *  will always be satisfied by a single contiguous chunk of memory.
2422  * @return 0 on success, non-zero on failure.
2423  */
2424 static int
2425 edb_page_alloc(EDB_cursor *mc, int num, EDB_page **mp)
2426 {
2427 #ifdef EDB_PARANOID /* Seems like we can ignore this now */
2428     /* Get at most <Max_retries> more freeDB records once me_pghead
2429      * has enough pages.  If not enough, use new pages from the map.
2430      * If <Paranoid> and mc is updating the freeDB, only get new
2431      * records if me_pghead is empty. Then the freelist cannot play
2432      * catch-up with itself by growing while trying to save it.
2433      */
2434     enum { Paranoid = 1, Max_retries = 500 };
2435 #else
2436     enum { Paranoid = 0, Max_retries = INT_MAX /*infinite*/ };
2437 #endif
2438     int rc, retry = num * 60;
2439     EDB_txn *txn = mc->mc_txn;
2440     EDB_env *env = txn->mt_env;
2441     pgno_t pgno, *mop = env->me_pghead;
2442     unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
2443     EDB_page *np;
2444     txnid_t oldest = 0, last;
2445     EDB_cursor_op op;
2446     EDB_cursor m2;
2447     int found_old = 0;
2448 
2449     /* If there are any loose pages, just use them */
2450     if (num == 1 && txn->mt_loose_pgs) {
2451         np = txn->mt_loose_pgs;
2452         txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
2453         txn->mt_loose_count--;
2454         DPRINTF(("db %d use loose page %"Yu, DDBI(mc), np->mp_pgno));
2455         *mp = np;
2456         return EDB_SUCCESS;
2457     }
2458 
2459     *mp = NULL;
2460 
2461     /* If our dirty list is already full, we can't do anything */
2462     if (txn->mt_dirty_room == 0) {
2463         rc = EDB_TXN_FULL;
2464         goto fail;
2465     }
2466 
2467     for (op = EDB_FIRST;; op = EDB_NEXT) {
2468         EDB_val key, data;
2469         EDB_node *leaf;
2470         pgno_t *idl;
2471 
2472         /* Seek a big enough contiguous page range. Prefer
2473          * pages at the tail, just truncating the list.
2474          */
2475         if (mop_len > n2) {
2476             i = mop_len;
2477             do {
2478                 pgno = mop[i];
2479                 if (mop[i-n2] == pgno+n2)
2480                     goto search_done;
2481             } while (--i > n2);
2482             if (--retry < 0)
2483                 break;
2484         }
2485 
2486         if (op == EDB_FIRST) {  /* 1st iteration */
2487             /* Prepare to fetch more and coalesce */
2488             last = env->me_pglast;
2489             oldest = env->me_pgoldest;
2490             edb_cursor_init(&m2, txn, FREE_DBI, NULL);
2491 #if (EDB_DEVEL) & 2 /* "& 2" so EDB_DEVEL=1 won't hide bugs breaking freeDB */
2492             /* Use original snapshot. TODO: Should need less care in code
2493              * which modifies the database. Maybe we can delete some code?
2494              */
2495             m2.mc_flags |= C_ORIG_RDONLY;
2496             m2.mc_db = &env->me_metas[(txn->mt_txnid-1) & 1]->mm_dbs[FREE_DBI];
2497             m2.mc_dbflag = (unsigned char *)""; /* probably unnecessary */
2498 #endif
2499             if (last) {
2500                 op = EDB_SET_RANGE;
2501                 key.mv_data = &last; /* will look up last+1 */
2502                 key.mv_size = sizeof(last);
2503             }
2504             if (Paranoid && mc->mc_dbi == FREE_DBI)
2505                 retry = -1;
2506         }
2507         if (Paranoid && retry < 0 && mop_len)
2508             break;
2509 
2510         last++;
2511         /* Do not fetch more if the record will be too recent */
2512         if (oldest <= last) {
2513             if (!found_old) {
2514                 oldest = edb_find_oldest(txn);
2515                 env->me_pgoldest = oldest;
2516                 found_old = 1;
2517             }
2518             if (oldest <= last)
2519                 break;
2520         }
2521         rc = edb_cursor_get(&m2, &key, NULL, op);
2522         if (rc) {
2523             if (rc == EDB_NOTFOUND)
2524                 break;
2525             goto fail;
2526         }
2527         last = *(txnid_t*)key.mv_data;
2528         if (oldest <= last) {
2529             if (!found_old) {
2530                 oldest = edb_find_oldest(txn);
2531                 env->me_pgoldest = oldest;
2532                 found_old = 1;
2533             }
2534             if (oldest <= last)
2535                 break;
2536         }
2537         np = m2.mc_pg[m2.mc_top];
2538         leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
2539         if ((rc = edb_node_read(&m2, leaf, &data)) != EDB_SUCCESS)
2540             goto fail;
2541 
2542         idl = (EDB_ID *) data.mv_data;
2543         i = idl[0];
2544         if (!mop) {
2545             if (!(env->me_pghead = mop = edb_eidl_alloc(i))) {
2546                 rc = ENOMEM;
2547                 goto fail;
2548             }
2549         } else {
2550             if ((rc = edb_eidl_need(&env->me_pghead, i)) != 0)
2551                 goto fail;
2552             mop = env->me_pghead;
2553         }
2554         env->me_pglast = last;
2555 #if (EDB_DEBUG) > 1
2556         DPRINTF(("IDL read txn %"Yu" root %"Yu" num %u",
2557             last, txn->mt_dbs[FREE_DBI].md_root, i));
2558         for (j = i; j; j--)
2559             DPRINTF(("IDL %"Yu, idl[j]));
2560 #endif
2561         /* Merge in descending sorted order */
2562         edb_eidl_xmerge(mop, idl);
2563         mop_len = mop[0];
2564     }
2565 
2566     /* Use new pages from the map when nothing suitable in the freeDB */
2567     i = 0;
2568     pgno = txn->mt_next_pgno;
2569     if (pgno + num >= env->me_maxpg) {
2570             DPUTS("DB size maxed out");
2571             rc = EDB_MAP_FULL;
2572             goto fail;
2573     }
2574 #if defined(_WIN32) && !defined(EDB_VL32)
2575     if (!(env->me_flags & EDB_RDONLY)) {
2576         void *p;
2577         p = (EDB_page *)(env->me_map + env->me_psize * pgno);
2578         p = VirtualAlloc(p, env->me_psize * num, MEM_COMMIT,
2579             (env->me_flags & EDB_WRITEMAP) ? PAGE_READWRITE:
2580             PAGE_READONLY);
2581         if (!p) {
2582             DPUTS("VirtualAlloc failed");
2583             rc = ErrCode();
2584             goto fail;
2585         }
2586     }
2587 #endif
2588 
2589 search_done:
2590     if (env->me_flags & EDB_WRITEMAP) {
2591         np = (EDB_page *)(env->me_map + env->me_psize * pgno);
2592     } else {
2593         if (!(np = edb_page_malloc(txn, num))) {
2594             rc = ENOMEM;
2595             goto fail;
2596         }
2597     }
2598     if (i) {
2599         mop[0] = mop_len -= num;
2600         /* Move any stragglers down */
2601         for (j = i-num; j < mop_len; )
2602             mop[++j] = mop[++i];
2603     } else {
2604         txn->mt_next_pgno = pgno + num;
2605     }
2606     np->mp_pgno = pgno;
2607     edb_page_dirty(txn, np);
2608     *mp = np;
2609 
2610     return EDB_SUCCESS;
2611 
2612 fail:
2613     txn->mt_flags |= EDB_TXN_ERROR;
2614     return rc;
2615 }
2616 
2617 /** Copy the used portions of a non-overflow page.
2618  * @param[in] dst page to copy into
2619  * @param[in] src page to copy from
2620  * @param[in] psize size of a page
2621  */
2622 static void
2623 edb_page_copy(EDB_page *dst, EDB_page *src, unsigned int psize)
2624 {
2625     enum { Align = sizeof(pgno_t) };
2626     indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower;
2627 
2628     /* If page isn't full, just copy the used portion. Adjust
2629      * alignment so memcpy may copy words instead of bytes.
2630      */
2631     if ((unused &= -Align) && !IS_LEAF2(src)) {
2632         upper = (upper + PAGEBASE) & -Align;
2633         memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
2634         memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
2635             psize - upper);
2636     } else {
2637         memcpy(dst, src, psize - unused);
2638     }
2639 }
2640 
2641 /** Pull a page off the txn's spill list, if present.
2642  * If a page being referenced was spilled to disk in this txn, bring
2643  * it back and make it dirty/writable again.
2644  * @param[in] txn the transaction handle.
2645  * @param[in] mp the page being referenced. It must not be dirty.
2646  * @param[out] ret the writable page, if any. ret is unchanged if
2647  * mp wasn't spilled.
2648  */
2649 static int
2650 edb_page_unspill(EDB_txn *txn, EDB_page *mp, EDB_page **ret)
2651 {
2652     EDB_env *env = txn->mt_env;
2653     const EDB_txn *tx2;
2654     unsigned x;
2655     pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
2656 
2657     for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
2658         if (!tx2->mt_spill_pgs)
2659             continue;
2660         x = edb_eidl_search(tx2->mt_spill_pgs, pn);
2661         if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
2662             EDB_page *np;
2663             int num;
2664             if (txn->mt_dirty_room == 0)
2665                 return EDB_TXN_FULL;
2666             if (IS_OVERFLOW(mp))
2667                 num = mp->mp_pages;
2668             else
2669                 num = 1;
2670             if (env->me_flags & EDB_WRITEMAP) {
2671                 np = mp;
2672             } else {
2673                 np = edb_page_malloc(txn, num);
2674                 if (!np)
2675                     return ENOMEM;
2676                 if (num > 1)
2677                     memcpy(np, mp, num * env->me_psize);
2678                 else
2679                     edb_page_copy(np, mp, env->me_psize);
2680             }
2681             if (tx2 == txn) {
2682                 /* If in current txn, this page is no longer spilled.
2683                  * If it happens to be the last page, truncate the spill list.
2684                  * Otherwise mark it as deleted by setting the LSB.
2685                  */
2686                 if (x == txn->mt_spill_pgs[0])
2687                     txn->mt_spill_pgs[0]--;
2688                 else
2689                     txn->mt_spill_pgs[x] |= 1;
2690             }   /* otherwise, if belonging to a parent txn, the
2691                  * page remains spilled until child commits
2692                  */
2693 
2694             edb_page_dirty(txn, np);
2695             np->mp_flags |= P_DIRTY;
2696             *ret = np;
2697             break;
2698         }
2699     }
2700     return EDB_SUCCESS;
2701 }
2702 
2703 /** Touch a page: make it dirty and re-insert into tree with updated pgno.
2704  * Set #EDB_TXN_ERROR on failure.
2705  * @param[in] mc cursor pointing to the page to be touched
2706  * @return 0 on success, non-zero on failure.
2707  */
2708 static int
2709 edb_page_touch(EDB_cursor *mc)
2710 {
2711     EDB_page *mp = mc->mc_pg[mc->mc_top], *np;
2712     EDB_txn *txn = mc->mc_txn;
2713     EDB_cursor *m2, *m3;
2714     pgno_t  pgno;
2715     int rc;
2716 
2717     if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
2718         if (txn->mt_flags & EDB_TXN_SPILLS) {
2719             np = NULL;
2720             rc = edb_page_unspill(txn, mp, &np);
2721             if (rc)
2722                 goto fail;
2723             if (np)
2724                 goto done;
2725         }
2726         if ((rc = edb_eidl_need(&txn->mt_free_pgs, 1)) ||
2727             (rc = edb_page_alloc(mc, 1, &np)))
2728             goto fail;
2729         pgno = np->mp_pgno;
2730         DPRINTF(("touched db %d page %"Yu" -> %"Yu, DDBI(mc),
2731             mp->mp_pgno, pgno));
2732         edb_cassert(mc, mp->mp_pgno != pgno);
2733         edb_eidl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2734         /* Update the parent page, if any, to point to the new page */
2735         if (mc->mc_top) {
2736             EDB_page *parent = mc->mc_pg[mc->mc_top-1];
2737             EDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
2738             SETPGNO(node, pgno);
2739         } else {
2740             mc->mc_db->md_root = pgno;
2741         }
2742     } else if (txn->mt_parent && !IS_SUBP(mp)) {
2743         EDB_ID2 mid, *dl = txn->mt_u.dirty_list;
2744         pgno = mp->mp_pgno;
2745         /* If txn has a parent, make sure the page is in our
2746          * dirty list.
2747          */
2748         if (dl[0].mid) {
2749             unsigned x = edb_mid2l_search(dl, pgno);
2750             if (x <= dl[0].mid && dl[x].mid == pgno) {
2751                 if (mp != dl[x].mptr) { /* bad cursor? */
2752                     mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
2753                     txn->mt_flags |= EDB_TXN_ERROR;
2754                     return EDB_PROBLEM;
2755                 }
2756                 return 0;
2757             }
2758         }
2759         edb_cassert(mc, dl[0].mid < EDB_IDL_UM_MAX);
2760         /* No - copy it */
2761         np = edb_page_malloc(txn, 1);
2762         if (!np)
2763             return ENOMEM;
2764         mid.mid = pgno;
2765         mid.mptr = np;
2766         rc = edb_mid2l_insert(dl, &mid);
2767         edb_cassert(mc, rc == 0);
2768     } else {
2769         return 0;
2770     }
2771 
2772     edb_page_copy(np, mp, txn->mt_env->me_psize);
2773     np->mp_pgno = pgno;
2774     np->mp_flags |= P_DIRTY;
2775 
2776 done:
2777     /* Adjust cursors pointing to mp */
2778     mc->mc_pg[mc->mc_top] = np;
2779     m2 = txn->mt_cursors[mc->mc_dbi];
2780     if (mc->mc_flags & C_SUB) {
2781         for (; m2; m2=m2->mc_next) {
2782             m3 = &m2->mc_xcursor->mx_cursor;
2783             if (m3->mc_snum < mc->mc_snum) continue;
2784             if (m3->mc_pg[mc->mc_top] == mp)
2785                 m3->mc_pg[mc->mc_top] = np;
2786         }
2787     } else {
2788         for (; m2; m2=m2->mc_next) {
2789             if (m2->mc_snum < mc->mc_snum) continue;
2790             if (m2 == mc) continue;
2791             if (m2->mc_pg[mc->mc_top] == mp) {
2792                 m2->mc_pg[mc->mc_top] = np;
2793                 if (IS_LEAF(np))
2794                     XCURSOR_REFRESH(m2, mc->mc_top, np);
2795             }
2796         }
2797     }
2798     EDB_PAGE_UNREF(mc->mc_txn, mp);
2799     return 0;
2800 
2801 fail:
2802     txn->mt_flags |= EDB_TXN_ERROR;
2803     return rc;
2804 }
2805 
2806 int
2807 edb_env_sync0(EDB_env *env, int force, pgno_t numpgs)
2808 {
2809     int rc = 0;
2810     if (env->me_flags & EDB_RDONLY)
2811         return EACCES;
2812     if (force || !F_ISSET(env->me_flags, EDB_NOSYNC)) {
2813         if (env->me_flags & EDB_WRITEMAP) {
2814             int flags = ((env->me_flags & EDB_MAPASYNC) && !force)
2815                 ? MS_ASYNC : MS_SYNC;
2816             if (EDB_MSYNC(env->me_map, env->me_psize * numpgs, flags))
2817                 rc = ErrCode();
2818 #ifdef _WIN32
2819             else if (flags == MS_SYNC && EDB_FDATASYNC(env->me_fd))
2820                 rc = ErrCode();
2821 #endif
2822         } else {
2823 #ifdef BROKEN_FDATASYNC
2824             if (env->me_flags & EDB_FSYNCONLY) {
2825                 if (fsync(env->me_fd))
2826                     rc = ErrCode();
2827             } else
2828 #endif
2829             if (EDB_FDATASYNC(env->me_fd))
2830                 rc = ErrCode();
2831         }
2832     }
2833     return rc;
2834 }
2835 
2836 int
2837 edb_env_sync(EDB_env *env, int force)
2838 {
2839     EDB_meta *m = edb_env_pick_meta(env);
2840     return edb_env_sync0(env, force, m->mm_last_pg+1);
2841 }
2842 
2843 /** Back up parent txn's cursors, then grab the originals for tracking */
2844 static int
2845 edb_cursor_shadow(EDB_txn *src, EDB_txn *dst)
2846 {
2847     EDB_cursor *mc, *bk;
2848     EDB_xcursor *mx;
2849     size_t size;
2850     int i;
2851 
2852     for (i = src->mt_nuedbs; --i >= 0; ) {
2853         if ((mc = src->mt_cursors[i]) != NULL) {
2854             size = sizeof(EDB_cursor);
2855             if (mc->mc_xcursor)
2856                 size += sizeof(EDB_xcursor);
2857             for (; mc; mc = bk->mc_next) {
2858                 bk = malloc(size);
2859                 if (!bk)
2860                     return ENOMEM;
2861                 *bk = *mc;
2862                 mc->mc_backup = bk;
2863                 mc->mc_db = &dst->mt_dbs[i];
2864                 /* Kill pointers into src to reduce abuse: The
2865                  * user may not use mc until dst ends. But we need a valid
2866                  * txn pointer here for cursor fixups to keep working.
2867                  */
2868                 mc->mc_txn    = dst;
2869                 mc->mc_dbflag = &dst->mt_dbflags[i];
2870                 if ((mx = mc->mc_xcursor) != NULL) {
2871                     *(EDB_xcursor *)(bk+1) = *mx;
2872                     mx->mx_cursor.mc_txn = dst;
2873                 }
2874                 mc->mc_next = dst->mt_cursors[i];
2875                 dst->mt_cursors[i] = mc;
2876             }
2877         }
2878     }
2879     return EDB_SUCCESS;
2880 }
2881 
2882 /** Close this write txn's cursors, give parent txn's cursors back to parent.
2883  * @param[in] txn the transaction handle.
2884  * @param[in] merge true to keep changes to parent cursors, false to revert.
2885  * @return 0 on success, non-zero on failure.
2886  */
2887 static void
2888 edb_cursors_close(EDB_txn *txn, unsigned merge)
2889 {
2890     EDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk;
2891     EDB_xcursor *mx;
2892     int i;
2893 
2894     for (i = txn->mt_nuedbs; --i >= 0; ) {
2895         for (mc = cursors[i]; mc; mc = next) {
2896             next = mc->mc_next;
2897             if ((bk = mc->mc_backup) != NULL) {
2898                 if (merge) {
2899                     /* Commit changes to parent txn */
2900                     mc->mc_next = bk->mc_next;
2901                     mc->mc_backup = bk->mc_backup;
2902                     mc->mc_txn = bk->mc_txn;
2903                     mc->mc_db = bk->mc_db;
2904                     mc->mc_dbflag = bk->mc_dbflag;
2905                     if ((mx = mc->mc_xcursor) != NULL)
2906                         mx->mx_cursor.mc_txn = bk->mc_txn;
2907                 } else {
2908                     /* Abort nested txn */
2909                     *mc = *bk;
2910                     if ((mx = mc->mc_xcursor) != NULL)
2911                         *mx = *(EDB_xcursor *)(bk+1);
2912                 }
2913                 mc = bk;
2914             }
2915             /* Only malloced cursors are permanently tracked. */
2916             free(mc);
2917         }
2918         cursors[i] = NULL;
2919     }
2920 }
2921 
2922 #if !(EDB_PIDLOCK)      /* Currently the same as defined(_WIN32) */
2923 enum Pidlock_op {
2924     Pidset, Pidcheck
2925 };
2926 #else
2927 enum Pidlock_op {
2928     Pidset = F_SETLK, Pidcheck = F_GETLK
2929 };
2930 #endif
2931 
2932 /** Set or check a pid lock. Set returns 0 on success.
2933  * Check returns 0 if the process is certainly dead, nonzero if it may
2934  * be alive (the lock exists or an error happened so we do not know).
2935  *
2936  * On Windows Pidset is a no-op, we merely check for the existence
2937  * of the process with the given pid. On POSIX we use a single byte
2938  * lock on the lockfile, set at an offset equal to the pid.
2939  */
2940 static int
2941 edb_reader_pid(EDB_env *env, enum Pidlock_op op, EDB_PID_T pid)
2942 {
2943 #if !(EDB_PIDLOCK)      /* Currently the same as defined(_WIN32) */
2944     int ret = 0;
2945     HANDLE h;
2946     if (op == Pidcheck) {
2947         h = OpenProcess(env->me_pidquery, FALSE, pid);
2948         /* No documented "no such process" code, but other program use this: */
2949         if (!h)
2950             return ErrCode() != ERROR_INVALID_PARAMETER;
2951         /* A process exists until all handles to it close. Has it exited? */
2952         ret = WaitForSingleObject(h, 0) != 0;
2953         CloseHandle(h);
2954     }
2955     return ret;
2956 #else
2957     for (;;) {
2958         int rc;
2959         struct flock lock_info;
2960         memset(&lock_info, 0, sizeof(lock_info));
2961         lock_info.l_type = F_WRLCK;
2962         lock_info.l_whence = SEEK_SET;
2963         lock_info.l_start = pid;
2964         lock_info.l_len = 1;
2965         if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
2966             if (op == F_GETLK && lock_info.l_type != F_UNLCK)
2967                 rc = -1;
2968         } else if ((rc = ErrCode()) == EINTR) {
2969             continue;
2970         }
2971         return rc;
2972     }
2973 #endif
2974 }
2975 
2976 /** Common code for #edb_txn_begin() and #edb_txn_renew().
2977  * @param[in] txn the transaction handle to initialize
2978  * @return 0 on success, non-zero on failure.
2979  */
2980 static int
2981 edb_txn_renew0(EDB_txn *txn)
2982 {
2983     EDB_env *env = txn->mt_env;
2984     EDB_txninfo *ti = env->me_txns;
2985     EDB_meta *meta;
2986     unsigned int i, nr, flags = txn->mt_flags;
2987     uint16_t x;
2988     int rc, new_notls = 0;
2989 
2990     if ((flags &= EDB_TXN_RDONLY) != 0) {
2991         if (!ti) {
2992             meta = edb_env_pick_meta(env);
2993             txn->mt_txnid = meta->mm_txnid;
2994             txn->mt_u.reader = NULL;
2995         } else {
2996             EDB_reader *r = (env->me_flags & EDB_NOTLS) ? txn->mt_u.reader :
2997                 pthread_getspecific(env->me_txkey);
2998             if (r) {
2999                 if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
3000                     return EDB_BAD_RSLOT;
3001             } else {
3002                 EDB_PID_T pid = env->me_pid;
3003                 EDB_THR_T tid = pthread_self();
3004                 edb_mutexref_t rmutex = env->me_rmutex;
3005 
3006                 if (!env->me_live_reader) {
3007                     rc = edb_reader_pid(env, Pidset, pid);
3008                     if (rc)
3009                         return rc;
3010                     env->me_live_reader = 1;
3011                 }
3012 
3013                 if (LOCK_MUTEX(rc, env, rmutex))
3014                     return rc;
3015                 nr = ti->mti_numreaders;
3016                 for (i=0; i<nr; i++)
3017                     if (ti->mti_readers[i].mr_pid == 0)
3018                         break;
3019                 if (i == env->me_maxreaders) {
3020                     UNLOCK_MUTEX(rmutex);
3021                     return EDB_READERS_FULL;
3022                 }
3023                 r = &ti->mti_readers[i];
3024                 /* Claim the reader slot, carefully since other code
3025                  * uses the reader table un-mutexed: First reset the
3026                  * slot, next publish it in mti_numreaders.  After
3027                  * that, it is safe for edb_env_close() to touch it.
3028                  * When it will be closed, we can finally claim it.
3029                  */
3030                 r->mr_pid = 0;
3031                 r->mr_txnid = (txnid_t)-1;
3032                 r->mr_tid = tid;
3033                 if (i == nr)
3034                     ti->mti_numreaders = ++nr;
3035                 env->me_close_readers = nr;
3036                 r->mr_pid = pid;
3037                 UNLOCK_MUTEX(rmutex);
3038 
3039                 new_notls = (env->me_flags & EDB_NOTLS);
3040                 if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
3041                     r->mr_pid = 0;
3042                     return rc;
3043                 }
3044             }
3045             do /* LY: Retry on a race, ITS#7970. */
3046                 r->mr_txnid = ti->mti_txnid;
3047             while(r->mr_txnid != ti->mti_txnid);
3048             txn->mt_txnid = r->mr_txnid;
3049             txn->mt_u.reader = r;
3050             meta = env->me_metas[txn->mt_txnid & 1];
3051         }
3052 
3053     } else {
3054         /* Not yet touching txn == env->me_txn0, it may be active */
3055         if (ti) {
3056             if (LOCK_MUTEX(rc, env, env->me_wmutex))
3057                 return rc;
3058             txn->mt_txnid = ti->mti_txnid;
3059             meta = env->me_metas[txn->mt_txnid & 1];
3060         } else {
3061             meta = edb_env_pick_meta(env);
3062             txn->mt_txnid = meta->mm_txnid;
3063         }
3064         txn->mt_txnid++;
3065 #if EDB_DEBUG
3066         if (txn->mt_txnid == edb_debug_start)
3067             edb_debug = 1;
3068 #endif
3069         txn->mt_child = NULL;
3070         txn->mt_loose_pgs = NULL;
3071         txn->mt_loose_count = 0;
3072         txn->mt_dirty_room = EDB_IDL_UM_MAX;
3073         txn->mt_u.dirty_list = env->me_dirty_list;
3074         txn->mt_u.dirty_list[0].mid = 0;
3075         txn->mt_free_pgs = env->me_free_pgs;
3076         txn->mt_free_pgs[0] = 0;
3077         txn->mt_spill_pgs = NULL;
3078         env->me_txn = txn;
3079         memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
3080     }
3081 
3082     /* Copy the DB info and flags */
3083     memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(EDB_db));
3084 
3085     /* Moved to here to avoid a data race in read TXNs */
3086     txn->mt_next_pgno = meta->mm_last_pg+1;
3087 #ifdef EDB_VL32
3088     txn->mt_last_pgno = txn->mt_next_pgno - 1;
3089 #endif
3090 
3091     txn->mt_flags = flags;
3092 
3093     /* Setup db info */
3094     txn->mt_nuedbs = env->me_nuedbs;
3095     for (i=CORE_DBS; i<txn->mt_nuedbs; i++) {
3096         x = env->me_dbflags[i];
3097         txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
3098         txn->mt_dbflags[i] = (x & EDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0;
3099     }
3100     txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID;
3101     txn->mt_dbflags[FREE_DBI] = DB_VALID;
3102 
3103     if (env->me_flags & EDB_FATAL_ERROR) {
3104         DPUTS("environment had fatal error, must shutdown!");
3105         rc = EDB_PANIC;
3106     } else if (env->me_maxpg < txn->mt_next_pgno) {
3107         rc = EDB_MAP_RESIZED;
3108     } else {
3109         return EDB_SUCCESS;
3110     }
3111     edb_txn_end(txn, new_notls /*0 or EDB_END_SLOT*/ | EDB_END_FAIL_BEGIN);
3112     return rc;
3113 }
3114 
3115 int
3116 edb_txn_renew(EDB_txn *txn)
3117 {
3118     int rc;
3119 
3120     if (!txn || !F_ISSET(txn->mt_flags, EDB_TXN_RDONLY|EDB_TXN_FINISHED))
3121         return EINVAL;
3122 
3123     rc = edb_txn_renew0(txn);
3124     if (rc == EDB_SUCCESS) {
3125         DPRINTF(("renew txn %"Yu"%c %p on edbenv %p, root page %"Yu,
3126             txn->mt_txnid, (txn->mt_flags & EDB_TXN_RDONLY) ? 'r' : 'w',
3127             (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
3128     }
3129     return rc;
3130 }
3131 
3132 int
3133 edb_txn_begin(EDB_env *env, EDB_txn *parent, unsigned int flags, EDB_txn **ret)
3134 {
3135     EDB_txn *txn;
3136     EDB_ntxn *ntxn;
3137     int rc, size, tsize;
3138 
3139     flags &= EDB_TXN_BEGIN_FLAGS;
3140     flags |= env->me_flags & EDB_WRITEMAP;
3141 
3142     if (env->me_flags & EDB_RDONLY & ~flags) /* write txn in RDONLY env */
3143         return EACCES;
3144 
3145     if (parent) {
3146         /* Nested transactions: Max 1 child, write txns only, no writemap */
3147         flags |= parent->mt_flags;
3148         if (flags & (EDB_RDONLY|EDB_WRITEMAP|EDB_TXN_BLOCKED)) {
3149             return (parent->mt_flags & EDB_TXN_RDONLY) ? EINVAL : EDB_BAD_TXN;
3150         }
3151         /* Child txns save EDB_pgstate and use own copy of cursors */
3152         size = env->me_maxdbs * (sizeof(EDB_db)+sizeof(EDB_cursor *)+1);
3153         size += tsize = sizeof(EDB_ntxn);
3154     } else if (flags & EDB_RDONLY) {
3155         size = env->me_maxdbs * (sizeof(EDB_db)+1);
3156         size += tsize = sizeof(EDB_txn);
3157     } else {
3158         /* Reuse preallocated write txn. However, do not touch it until
3159          * edb_txn_renew0() succeeds, since it currently may be active.
3160          */
3161         txn = env->me_txn0;
3162         goto renew;
3163     }
3164     if ((txn = calloc(1, size)) == NULL) {
3165         DPRINTF(("calloc: %s", strerror(errno)));
3166         return ENOMEM;
3167     }
3168 #ifdef EDB_VL32
3169     if (!parent) {
3170         txn->mt_rpages = malloc(EDB_TRPAGE_SIZE * sizeof(EDB_ID3));
3171         if (!txn->mt_rpages) {
3172             free(txn);
3173             return ENOMEM;
3174         }
3175         txn->mt_rpages[0].mid = 0;
3176         txn->mt_rpcheck = EDB_TRPAGE_SIZE/2;
3177     }
3178 #endif
3179     txn->mt_dbxs = env->me_dbxs;    /* static */
3180     txn->mt_dbs = (EDB_db *) ((char *)txn + tsize);
3181     txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs;
3182     txn->mt_flags = flags;
3183     txn->mt_env = env;
3184 
3185     if (parent) {
3186         unsigned int i;
3187         txn->mt_cursors = (EDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
3188         txn->mt_dbiseqs = parent->mt_dbiseqs;
3189         txn->mt_u.dirty_list = malloc(sizeof(EDB_ID2)*EDB_IDL_UM_SIZE);
3190         if (!txn->mt_u.dirty_list ||
3191             !(txn->mt_free_pgs = edb_eidl_alloc(EDB_IDL_UM_MAX)))
3192         {
3193             free(txn->mt_u.dirty_list);
3194             free(txn);
3195             return ENOMEM;
3196         }
3197         txn->mt_txnid = parent->mt_txnid;
3198         txn->mt_dirty_room = parent->mt_dirty_room;
3199         txn->mt_u.dirty_list[0].mid = 0;
3200         txn->mt_spill_pgs = NULL;
3201         txn->mt_next_pgno = parent->mt_next_pgno;
3202         parent->mt_flags |= EDB_TXN_HAS_CHILD;
3203         parent->mt_child = txn;
3204         txn->mt_parent = parent;
3205         txn->mt_nuedbs = parent->mt_nuedbs;
3206 #ifdef EDB_VL32
3207         txn->mt_rpages = parent->mt_rpages;
3208 #endif
3209         memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_nuedbs * sizeof(EDB_db));
3210         /* Copy parent's mt_dbflags, but clear DB_NEW */
3211         for (i=0; i<txn->mt_nuedbs; i++)
3212             txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
3213         rc = 0;
3214         ntxn = (EDB_ntxn *)txn;
3215         ntxn->mnt_pgstate = env->me_pgstate; /* save parent me_pghead & co */
3216         if (env->me_pghead) {
3217             size = EDB_IDL_SIZEOF(env->me_pghead);
3218             env->me_pghead = edb_eidl_alloc(env->me_pghead[0]);
3219             if (env->me_pghead)
3220                 memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
3221             else
3222                 rc = ENOMEM;
3223         }
3224         if (!rc)
3225             rc = edb_cursor_shadow(parent, txn);
3226         if (rc)
3227             edb_txn_end(txn, EDB_END_FAIL_BEGINCHILD);
3228     } else { /* EDB_RDONLY */
3229         txn->mt_dbiseqs = env->me_dbiseqs;
3230 renew:
3231         rc = edb_txn_renew0(txn);
3232     }
3233     if (rc) {
3234         if (txn != env->me_txn0) {
3235 #ifdef EDB_VL32
3236             free(txn->mt_rpages);
3237 #endif
3238             free(txn);
3239         }
3240     } else {
3241         txn->mt_flags |= flags; /* could not change txn=me_txn0 earlier */
3242         *ret = txn;
3243         DPRINTF(("begin txn %"Yu"%c %p on edbenv %p, root page %"Yu,
3244             txn->mt_txnid, (flags & EDB_RDONLY) ? 'r' : 'w',
3245             (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
3246     }
3247 
3248     return rc;
3249 }
3250 
3251 EDB_env *
3252 edb_txn_env(EDB_txn *txn)
3253 {
3254     if(!txn) return NULL;
3255     return txn->mt_env;
3256 }
3257 
3258 edb_size_t
3259 edb_txn_id(EDB_txn *txn)
3260 {
3261     if(!txn) return 0;
3262     return txn->mt_txnid;
3263 }
3264 
3265 /** Export or close DBI handles opened in this txn. */
3266 static void
3267 edb_dbis_update(EDB_txn *txn, int keep)
3268 {
3269     int i;
3270     EDB_dbi n = txn->mt_nuedbs;
3271     EDB_env *env = txn->mt_env;
3272     unsigned char *tdbflags = txn->mt_dbflags;
3273 
3274     for (i = n; --i >= CORE_DBS;) {
3275         if (tdbflags[i] & DB_NEW) {
3276             if (keep) {
3277                 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | EDB_VALID;
3278             } else {
3279                 char *ptr = env->me_dbxs[i].md_name.mv_data;
3280                 if (ptr) {
3281                     env->me_dbxs[i].md_name.mv_data = NULL;
3282                     env->me_dbxs[i].md_name.mv_size = 0;
3283                     env->me_dbflags[i] = 0;
3284                     env->me_dbiseqs[i]++;
3285                     free(ptr);
3286                 }
3287             }
3288         }
3289     }
3290     if (keep && env->me_nuedbs < n)
3291         env->me_nuedbs = n;
3292 }
3293 
3294 /** End a transaction, except successful commit of a nested transaction.
3295  * May be called twice for readonly txns: First reset it, then abort.
3296  * @param[in] txn the transaction handle to end
3297  * @param[in] mode why and how to end the transaction
3298  */
3299 static void
3300 edb_txn_end(EDB_txn *txn, unsigned mode)
3301 {
3302     EDB_env *env = txn->mt_env;
3303 #if EDB_DEBUG
3304     static const char *const names[] = EDB_END_NAMES;
3305 #endif
3306 
3307     /* Export or close DBI handles opened in this txn */
3308     edb_dbis_update(txn, mode & EDB_END_UPDATE);
3309 
3310     DPRINTF(("%s txn %"Yu"%c %p on edbenv %p, root page %"Yu,
3311         names[mode & EDB_END_OPMASK],
3312         txn->mt_txnid, (txn->mt_flags & EDB_TXN_RDONLY) ? 'r' : 'w',
3313         (void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));
3314 
3315     if (F_ISSET(txn->mt_flags, EDB_TXN_RDONLY)) {
3316         if (txn->mt_u.reader) {
3317             txn->mt_u.reader->mr_txnid = (txnid_t)-1;
3318             if (!(env->me_flags & EDB_NOTLS)) {
3319                 txn->mt_u.reader = NULL; /* txn does not own reader */
3320             } else if (mode & EDB_END_SLOT) {
3321                 txn->mt_u.reader->mr_pid = 0;
3322                 txn->mt_u.reader = NULL;
3323             } /* else txn owns the slot until it does EDB_END_SLOT */
3324         }
3325         txn->mt_nuedbs = 0;     /* prevent further DBI activity */
3326         txn->mt_flags |= EDB_TXN_FINISHED;
3327 
3328     } else if (!F_ISSET(txn->mt_flags, EDB_TXN_FINISHED)) {
3329         pgno_t *pghead = env->me_pghead;
3330 
3331         if (!(mode & EDB_END_UPDATE)) /* !(already closed cursors) */
3332             edb_cursors_close(txn, 0);
3333         if (!(env->me_flags & EDB_WRITEMAP)) {
3334             edb_dlist_free(txn);
3335         }
3336 
3337         txn->mt_nuedbs = 0;
3338         txn->mt_flags = EDB_TXN_FINISHED;
3339 
3340         if (!txn->mt_parent) {
3341             edb_eidl_shrink(&txn->mt_free_pgs);
3342             env->me_free_pgs = txn->mt_free_pgs;
3343             /* me_pgstate: */
3344             env->me_pghead = NULL;
3345             env->me_pglast = 0;
3346 
3347             env->me_txn = NULL;
3348             mode = 0;   /* txn == env->me_txn0, do not free() it */
3349 
3350             /* The writer mutex was locked in edb_txn_begin. */
3351             if (env->me_txns)
3352                 UNLOCK_MUTEX(env->me_wmutex);
3353         } else {
3354             txn->mt_parent->mt_child = NULL;
3355             txn->mt_parent->mt_flags &= ~EDB_TXN_HAS_CHILD;
3356             env->me_pgstate = ((EDB_ntxn *)txn)->mnt_pgstate;
3357             edb_eidl_free(txn->mt_free_pgs);
3358             free(txn->mt_u.dirty_list);
3359         }
3360         edb_eidl_free(txn->mt_spill_pgs);
3361 
3362         edb_eidl_free(pghead);
3363     }
3364 #ifdef EDB_VL32
3365     if (!txn->mt_parent) {
3366         EDB_ID3L el = env->me_rpages, tl = txn->mt_rpages;
3367         unsigned i, x, n = tl[0].mid;
3368         pthread_mutex_lock(&env->me_rpmutex);
3369         for (i = 1; i <= n; i++) {
3370             if (tl[i].mid & (EDB_RPAGE_CHUNK-1)) {
3371                 /* tmp overflow pages that we didn't share in env */
3372                 munmap(tl[i].mptr, tl[i].mcnt * env->me_psize);
3373             } else {
3374                 x = edb_mid3l_search(el, tl[i].mid);
3375                 if (tl[i].mptr == el[x].mptr) {
3376                     el[x].mref--;
3377                 } else {
3378                     /* another tmp overflow page */
3379                     munmap(tl[i].mptr, tl[i].mcnt * env->me_psize);
3380                 }
3381             }
3382         }
3383         pthread_mutex_unlock(&env->me_rpmutex);
3384         tl[0].mid = 0;
3385         if (mode & EDB_END_FREE)
3386             free(tl);
3387     }
3388 #endif
3389     if (mode & EDB_END_FREE)
3390         free(txn);
3391 }
3392 
3393 void
3394 edb_txn_reset(EDB_txn *txn)
3395 {
3396     if (txn == NULL)
3397         return;
3398 
3399     /* This call is only valid for read-only txns */
3400     if (!(txn->mt_flags & EDB_TXN_RDONLY))
3401         return;
3402 
3403     edb_txn_end(txn, EDB_END_RESET);
3404 }
3405 
3406 void
3407 edb_txn_abort(EDB_txn *txn)
3408 {
3409     if (txn == NULL)
3410         return;
3411 
3412     if (txn->mt_child)
3413         edb_txn_abort(txn->mt_child);
3414 
3415     edb_txn_end(txn, EDB_END_ABORT|EDB_END_SLOT|EDB_END_FREE);
3416 }
3417 
3418 /** Save the freelist as of this transaction to the freeDB.
3419  * This changes the freelist. Keep trying until it stabilizes.
3420  *
3421  * When (EDB_DEVEL) & 2, the changes do not affect #edb_page_alloc(),
3422  * it then uses the transaction's original snapshot of the freeDB.
3423  */
3424 static int
3425 edb_freelist_save(EDB_txn *txn)
3426 {
3427     /* env->me_pghead[] can grow and shrink during this call.
3428      * env->me_pglast and txn->mt_free_pgs[] can only grow.
3429      * Page numbers cannot disappear from txn->mt_free_pgs[].
3430      */
3431     EDB_cursor mc;
3432     EDB_env *env = txn->mt_env;
3433     int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
3434     txnid_t pglast = 0, head_id = 0;
3435     pgno_t  freecnt = 0, *free_pgs, *mop;
3436     ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;
3437 
3438     edb_cursor_init(&mc, txn, FREE_DBI, NULL);
3439 
3440     if (env->me_pghead) {
3441         /* Make sure first page of freeDB is touched and on freelist */
3442         rc = edb_page_search(&mc, NULL, EDB_PS_FIRST|EDB_PS_MODIFY);
3443         if (rc && rc != EDB_NOTFOUND)
3444             return rc;
3445     }
3446 
3447     if (!env->me_pghead && txn->mt_loose_pgs) {
3448         /* Put loose page numbers in mt_free_pgs, since
3449          * we may be unable to return them to me_pghead.
3450          */
3451         EDB_page *mp = txn->mt_loose_pgs;
3452         EDB_ID2 *dl = txn->mt_u.dirty_list;
3453         unsigned x;
3454         if ((rc = edb_eidl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
3455             return rc;
3456         for (; mp; mp = NEXT_LOOSE_PAGE(mp)) {
3457             edb_eidl_xappend(txn->mt_free_pgs, mp->mp_pgno);
3458             /* must also remove from dirty list */
3459             if (txn->mt_flags & EDB_TXN_WRITEMAP) {
3460                 for (x=1; x<=dl[0].mid; x++)
3461                     if (dl[x].mid == mp->mp_pgno)
3462                         break;
3463                 edb_tassert(txn, x <= dl[0].mid);
3464             } else {
3465                 x = edb_mid2l_search(dl, mp->mp_pgno);
3466                 edb_tassert(txn, dl[x].mid == mp->mp_pgno);
3467                 edb_dpage_free(env, mp);
3468             }
3469             dl[x].mptr = NULL;
3470         }
3471         {
3472             /* squash freed slots out of the dirty list */
3473             unsigned y;
3474             for (y=1; dl[y].mptr && y <= dl[0].mid; y++);
3475             if (y <= dl[0].mid) {
3476                 for(x=y, y++;;) {
3477                     while (!dl[y].mptr && y <= dl[0].mid) y++;
3478                     if (y > dl[0].mid) break;
3479                     dl[x++] = dl[y++];
3480                 }
3481                 dl[0].mid = x-1;
3482             } else {
3483                 /* all slots freed */
3484                 dl[0].mid = 0;
3485             }
3486         }
3487         txn->mt_loose_pgs = NULL;
3488         txn->mt_loose_count = 0;
3489     }
3490 
3491     /* EDB_RESERVE cancels meminit in ovpage malloc (when no WRITEMAP) */
3492     clean_limit = (env->me_flags & (EDB_NOMEMINIT|EDB_WRITEMAP))
3493         ? SSIZE_MAX : maxfree_1pg;
3494 
3495     for (;;) {
3496         /* Come back here after each Put() in case freelist changed */
3497         EDB_val key, data;
3498         pgno_t *pgs;
3499         ssize_t j;
3500 
3501         /* If using records from freeDB which we have not yet
3502          * deleted, delete them and any we reserved for me_pghead.
3503          */
3504         while (pglast < env->me_pglast) {
3505             rc = edb_cursor_first(&mc, &key, NULL);
3506             if (rc)
3507                 return rc;
3508             pglast = head_id = *(txnid_t *)key.mv_data;
3509             total_room = head_room = 0;
3510             edb_tassert(txn, pglast <= env->me_pglast);
3511             rc = edb_cursor_del(&mc, 0);
3512             if (rc)
3513                 return rc;
3514         }
3515 
3516         /* Save the IDL of pages freed by this txn, to a single record */
3517         if (freecnt < txn->mt_free_pgs[0]) {
3518             if (!freecnt) {
3519                 /* Make sure last page of freeDB is touched and on freelist */
3520                 rc = edb_page_search(&mc, NULL, EDB_PS_LAST|EDB_PS_MODIFY);
3521                 if (rc && rc != EDB_NOTFOUND)
3522                     return rc;
3523             }
3524             free_pgs = txn->mt_free_pgs;
3525             /* Write to last page of freeDB */
3526             key.mv_size = sizeof(txn->mt_txnid);
3527             key.mv_data = &txn->mt_txnid;
3528             do {
3529                 freecnt = free_pgs[0];
3530                 data.mv_size = EDB_IDL_SIZEOF(free_pgs);
3531                 rc = edb_cursor_put(&mc, &key, &data, EDB_RESERVE);
3532                 if (rc)
3533                     return rc;
3534                 /* Retry if mt_free_pgs[] grew during the Put() */
3535                 free_pgs = txn->mt_free_pgs;
3536             } while (freecnt < free_pgs[0]);
3537             edb_eidl_sort(free_pgs);
3538             memcpy(data.mv_data, free_pgs, data.mv_size);
3539 #if (EDB_DEBUG) > 1
3540             {
3541                 unsigned int i = free_pgs[0];
3542                 DPRINTF(("IDL write txn %"Yu" root %"Yu" num %u",
3543                     txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
3544                 for (; i; i--)
3545                     DPRINTF(("IDL %"Yu, free_pgs[i]));
3546             }
3547 #endif
3548             continue;
3549         }
3550 
3551         mop = env->me_pghead;
3552         mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;
3553 
3554         /* Reserve records for me_pghead[]. Split it if multi-page,
3555          * to avoid searching freeDB for a page range. Use keys in
3556          * range [1,me_pglast]: Smaller than txnid of oldest reader.
3557          */
3558         if (total_room >= mop_len) {
3559             if (total_room == mop_len || --more < 0)
3560                 break;
3561         } else if (head_room >= maxfree_1pg && head_id > 1) {
3562             /* Keep current record (overflow page), add a new one */
3563             head_id--;
3564             head_room = 0;
3565         }
3566         /* (Re)write {key = head_id, IDL length = head_room} */
3567         total_room -= head_room;
3568         head_room = mop_len - total_room;
3569         if (head_room > maxfree_1pg && head_id > 1) {
3570             /* Overflow multi-page for part of me_pghead */
3571             head_room /= head_id; /* amortize page sizes */
3572             head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
3573         } else if (head_room < 0) {
3574             /* Rare case, not bothering to delete this record */
3575             head_room = 0;
3576         }
3577         key.mv_size = sizeof(head_id);
3578         key.mv_data = &head_id;
3579         data.mv_size = (head_room + 1) * sizeof(pgno_t);
3580         rc = edb_cursor_put(&mc, &key, &data, EDB_RESERVE);
3581         if (rc)
3582             return rc;
3583         /* IDL is initially empty, zero out at least the length */
3584         pgs = (pgno_t *)data.mv_data;
3585         j = head_room > clean_limit ? head_room : 0;
3586         do {
3587             pgs[j] = 0;
3588         } while (--j >= 0);
3589         total_room += head_room;
3590     }
3591 
3592     /* Return loose page numbers to me_pghead, though usually none are
3593      * left at this point.  The pages themselves remain in dirty_list.
3594      */
3595     if (txn->mt_loose_pgs) {
3596         EDB_page *mp = txn->mt_loose_pgs;
3597         unsigned count = txn->mt_loose_count;
3598         EDB_IDL loose;
3599         /* Room for loose pages + temp IDL with same */
3600         if ((rc = edb_eidl_need(&env->me_pghead, 2*count+1)) != 0)
3601             return rc;
3602         mop = env->me_pghead;
3603         loose = mop + EDB_IDL_ALLOCLEN(mop) - count;
3604         for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
3605             loose[ ++count ] = mp->mp_pgno;
3606         loose[0] = count;
3607         edb_eidl_sort(loose);
3608         edb_eidl_xmerge(mop, loose);
3609         txn->mt_loose_pgs = NULL;
3610         txn->mt_loose_count = 0;
3611         mop_len = mop[0];
3612     }
3613 
3614     /* Fill in the reserved me_pghead records */
3615     rc = EDB_SUCCESS;
3616     if (mop_len) {
3617         EDB_val key, data;
3618 
3619         mop += mop_len;
3620         rc = edb_cursor_first(&mc, &key, &data);
3621         for (; !rc; rc = edb_cursor_next(&mc, &key, &data, EDB_NEXT)) {
3622             txnid_t id = *(txnid_t *)key.mv_data;
3623             ssize_t len = (ssize_t)(data.mv_size / sizeof(EDB_ID)) - 1;
3624             EDB_ID save;
3625 
3626             edb_tassert(txn, len >= 0 && id <= env->me_pglast);
3627             key.mv_data = &id;
3628             if (len > mop_len) {
3629                 len = mop_len;
3630                 data.mv_size = (len + 1) * sizeof(EDB_ID);
3631             }
3632             data.mv_data = mop -= len;
3633             save = mop[0];
3634             mop[0] = len;
3635             rc = edb_cursor_put(&mc, &key, &data, EDB_CURRENT);
3636             mop[0] = save;
3637             if (rc || !(mop_len -= len))
3638                 break;
3639         }
3640     }
3641     return rc;
3642 }
3643 
3644 /** Flush (some) dirty pages to the map, after clearing their dirty flag.
3645  * @param[in] txn the transaction that's being committed
3646  * @param[in] keep number of initial pages in dirty_list to keep dirty.
3647  * @return 0 on success, non-zero on failure.
3648  */
3649 static int
3650 edb_page_flush(EDB_txn *txn, int keep)
3651 {
3652     EDB_env     *env = txn->mt_env;
3653     EDB_ID2L    dl = txn->mt_u.dirty_list;
3654     unsigned    psize = env->me_psize, j;
3655     int         i, pagecount = dl[0].mid, rc;
3656     size_t      size = 0;
3657     off_t       pos = 0;
3658     pgno_t      pgno = 0;
3659     EDB_page    *dp = NULL;
3660 #ifdef _WIN32
3661     OVERLAPPED  ov;
3662 #else
3663     struct iovec iov[EDB_COMMIT_PAGES];
3664     ssize_t     wsize = 0, wres;
3665     off_t       wpos = 0, next_pos = 1; /* impossible pos, so pos != next_pos */
3666     int         n = 0;
3667 #endif
3668 
3669     j = i = keep;
3670 
3671     if (env->me_flags & EDB_WRITEMAP) {
3672         /* Clear dirty flags */
3673         while (++i <= pagecount) {
3674             dp = dl[i].mptr;
3675             /* Don't flush this page yet */
3676             if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3677                 dp->mp_flags &= ~P_KEEP;
3678                 dl[++j] = dl[i];
3679                 continue;
3680             }
3681             dp->mp_flags &= ~P_DIRTY;
3682         }
3683         goto done;
3684     }
3685 
3686     /* Write the pages */
3687     for (;;) {
3688         if (++i <= pagecount) {
3689             dp = dl[i].mptr;
3690             /* Don't flush this page yet */
3691             if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
3692                 dp->mp_flags &= ~P_KEEP;
3693                 dl[i].mid = 0;
3694                 continue;
3695             }
3696             pgno = dl[i].mid;
3697             /* clear dirty flag */
3698             dp->mp_flags &= ~P_DIRTY;
3699             pos = pgno * psize;
3700             size = psize;
3701             if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
3702         }
3703 #ifdef _WIN32
3704         else break;
3705 
3706         /* Windows actually supports scatter/gather I/O, but only on
3707          * unbuffered file handles. Since we're relying on the OS page
3708          * cache for all our data, that's self-defeating. So we just
3709          * write pages one at a time. We use the ov structure to set
3710          * the write offset, to at least save the overhead of a Seek
3711          * system call.
3712          */
3713         DPRINTF(("committing page %"Yu, pgno));
3714         memset(&ov, 0, sizeof(ov));
3715         ov.Offset = pos & 0xffffffff;
3716         ov.OffsetHigh = pos >> 16 >> 16;
3717         if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
3718             rc = ErrCode();
3719             DPRINTF(("WriteFile: %d", rc));
3720             return rc;
3721         }
3722 #else
3723         /* Write up to EDB_COMMIT_PAGES dirty pages at a time. */
3724         if (pos!=next_pos || n==EDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
3725             if (n) {
3726 retry_write:
3727                 /* Write previous page(s) */
3728 #ifdef EDB_USE_PWRITEV
3729                 wres = pwritev(env->me_fd, iov, n, wpos);
3730 #else
3731                 if (n == 1) {
3732                     wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
3733                 } else {
3734 retry_seek:
3735                     if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
3736                         rc = ErrCode();
3737                         if (rc == EINTR)
3738                             goto retry_seek;
3739                         DPRINTF(("lseek: %s", strerror(rc)));
3740                         return rc;
3741                     }
3742                     wres = writev(env->me_fd, iov, n);
3743                 }
3744 #endif
3745                 if (wres != wsize) {
3746                     if (wres < 0) {
3747                         rc = ErrCode();
3748                         if (rc == EINTR)
3749                             goto retry_write;
3750                         DPRINTF(("Write error: %s", strerror(rc)));
3751                     } else {
3752                         rc = EIO; /* TODO: Use which error code? */
3753                         DPUTS("short write, filesystem full?");
3754                     }
3755                     return rc;
3756                 }
3757                 n = 0;
3758             }
3759             if (i > pagecount)
3760                 break;
3761             wpos = pos;
3762             wsize = 0;
3763         }
3764         DPRINTF(("committing page %"Yu, pgno));
3765         next_pos = pos + size;
3766         iov[n].iov_len = size;
3767         iov[n].iov_base = (char *)dp;
3768         wsize += size;
3769         n++;
3770 #endif  /* _WIN32 */
3771     }
3772 #ifdef EDB_VL32
3773     if (pgno > txn->mt_last_pgno)
3774         txn->mt_last_pgno = pgno;
3775 #endif
3776 
3777     /* MIPS has cache coherency issues, this is a no-op everywhere else
3778      * Note: for any size >= on-chip cache size, entire on-chip cache is
3779      * flushed.
3780      */
3781     CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);
3782 
3783     for (i = keep; ++i <= pagecount; ) {
3784         dp = dl[i].mptr;
3785         /* This is a page we skipped above */
3786         if (!dl[i].mid) {
3787             dl[++j] = dl[i];
3788             dl[j].mid = dp->mp_pgno;
3789             continue;
3790         }
3791         edb_dpage_free(env, dp);
3792     }
3793 
3794 done:
3795     i--;
3796     txn->mt_dirty_room += i - j;
3797     dl[0].mid = j;
3798     return EDB_SUCCESS;
3799 }
3800 
3801 static int ESECT edb_env_share_locks(EDB_env *env, int *excl);
3802 
3803 int
3804 edb_txn_commit(EDB_txn *txn)
3805 {
3806     int     rc;
3807     unsigned int i, end_mode;
3808     EDB_env *env;
3809 
3810     if (txn == NULL)
3811         return EINVAL;
3812 
3813     /* edb_txn_end() mode for a commit which writes nothing */
3814     end_mode = EDB_END_EMPTY_COMMIT|EDB_END_UPDATE|EDB_END_SLOT|EDB_END_FREE;
3815 
3816     if (txn->mt_child) {
3817         rc = edb_txn_commit(txn->mt_child);
3818         if (rc)
3819             goto fail;
3820     }
3821 
3822     env = txn->mt_env;
3823 
3824     if (F_ISSET(txn->mt_flags, EDB_TXN_RDONLY)) {
3825         goto done;
3826     }
3827 
3828     if (txn->mt_flags & (EDB_TXN_FINISHED|EDB_TXN_ERROR)) {
3829         DPUTS("txn has failed/finished, can't commit");
3830         if (txn->mt_parent)
3831             txn->mt_parent->mt_flags |= EDB_TXN_ERROR;
3832         rc = EDB_BAD_TXN;
3833         goto fail;
3834     }
3835 
3836     if (txn->mt_parent) {
3837         EDB_txn *parent = txn->mt_parent;
3838         EDB_page **lp;
3839         EDB_ID2L dst, src;
3840         EDB_IDL pspill;
3841         unsigned x, y, len, ps_len;
3842 
3843         /* Append our free list to parent's */
3844         rc = edb_eidl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
3845         if (rc)
3846             goto fail;
3847         edb_eidl_free(txn->mt_free_pgs);
3848         /* Failures after this must either undo the changes
3849          * to the parent or set EDB_TXN_ERROR in the parent.
3850          */
3851 
3852         parent->mt_next_pgno = txn->mt_next_pgno;
3853         parent->mt_flags = txn->mt_flags;
3854 
3855         /* Merge our cursors into parent's and close them */
3856         edb_cursors_close(txn, 1);
3857 
3858         /* Update parent's DB table. */
3859         memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_nuedbs * sizeof(EDB_db));
3860         parent->mt_nuedbs = txn->mt_nuedbs;
3861         parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI];
3862         parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI];
3863         for (i=CORE_DBS; i<txn->mt_nuedbs; i++) {
3864             /* preserve parent's DB_NEW status */
3865             x = parent->mt_dbflags[i] & DB_NEW;
3866             parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
3867         }
3868 
3869         dst = parent->mt_u.dirty_list;
3870         src = txn->mt_u.dirty_list;
3871         /* Remove anything in our dirty list from parent's spill list */
3872         if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
3873             x = y = ps_len;
3874             pspill[0] = (pgno_t)-1;
3875             /* Mark our dirty pages as deleted in parent spill list */
3876             for (i=0, len=src[0].mid; ++i <= len; ) {
3877                 EDB_ID pn = src[i].mid << 1;
3878                 while (pn > pspill[x])
3879                     x--;
3880                 if (pn == pspill[x]) {
3881                     pspill[x] = 1;
3882                     y = --x;
3883                 }
3884             }
3885             /* Squash deleted pagenums if we deleted any */
3886             for (x=y; ++x <= ps_len; )
3887                 if (!(pspill[x] & 1))
3888                     pspill[++y] = pspill[x];
3889             pspill[0] = y;
3890         }
3891 
3892         /* Remove anything in our spill list from parent's dirty list */
3893         if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) {
3894             for (i=1; i<=txn->mt_spill_pgs[0]; i++) {
3895                 EDB_ID pn = txn->mt_spill_pgs[i];
3896                 if (pn & 1)
3897                     continue;   /* deleted spillpg */
3898                 pn >>= 1;
3899                 y = edb_mid2l_search(dst, pn);
3900                 if (y <= dst[0].mid && dst[y].mid == pn) {
3901                     free(dst[y].mptr);
3902                     while (y < dst[0].mid) {
3903                         dst[y] = dst[y+1];
3904                         y++;
3905                     }
3906                     dst[0].mid--;
3907                 }
3908             }
3909         }
3910 
3911         /* Find len = length of merging our dirty list with parent's */
3912         x = dst[0].mid;
3913         dst[0].mid = 0;     /* simplify loops */
3914         if (parent->mt_parent) {
3915             len = x + src[0].mid;
3916             y = edb_mid2l_search(src, dst[x].mid + 1) - 1;
3917             for (i = x; y && i; y--) {
3918                 pgno_t yp = src[y].mid;
3919                 while (yp < dst[i].mid)
3920                     i--;
3921                 if (yp == dst[i].mid) {
3922                     i--;
3923                     len--;
3924                 }
3925             }
3926         } else { /* Simplify the above for single-ancestor case */
3927             len = EDB_IDL_UM_MAX - txn->mt_dirty_room;
3928         }
3929         /* Merge our dirty list with parent's */
3930         y = src[0].mid;
3931         for (i = len; y; dst[i--] = src[y--]) {
3932             pgno_t yp = src[y].mid;
3933             while (yp < dst[x].mid)
3934                 dst[i--] = dst[x--];
3935             if (yp == dst[x].mid)
3936                 free(dst[x--].mptr);
3937         }
3938         edb_tassert(txn, i == x);
3939         dst[0].mid = len;
3940         free(txn->mt_u.dirty_list);
3941         parent->mt_dirty_room = txn->mt_dirty_room;
3942         if (txn->mt_spill_pgs) {
3943             if (parent->mt_spill_pgs) {
3944                 /* TODO: Prevent failure here, so parent does not fail */
3945                 rc = edb_eidl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
3946                 if (rc)
3947                     parent->mt_flags |= EDB_TXN_ERROR;
3948                 edb_eidl_free(txn->mt_spill_pgs);
3949                 edb_eidl_sort(parent->mt_spill_pgs);
3950             } else {
3951                 parent->mt_spill_pgs = txn->mt_spill_pgs;
3952             }
3953         }
3954 
3955         /* Append our loose page list to parent's */
3956         for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp))
3957             ;
3958         *lp = txn->mt_loose_pgs;
3959         parent->mt_loose_count += txn->mt_loose_count;
3960 
3961         parent->mt_child = NULL;
3962         edb_eidl_free(((EDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
3963         free(txn);
3964         return rc;
3965     }
3966 
3967     if (txn != env->me_txn) {
3968         DPUTS("attempt to commit unknown transaction");
3969         rc = EINVAL;
3970         goto fail;
3971     }
3972 
3973     edb_cursors_close(txn, 0);
3974 
3975     if (!txn->mt_u.dirty_list[0].mid &&
3976         !(txn->mt_flags & (EDB_TXN_DIRTY|EDB_TXN_SPILLS)))
3977         goto done;
3978 
3979     DPRINTF(("committing txn %"Yu" %p on edbenv %p, root page %"Yu,
3980         txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));
3981 
3982     /* Update DB root pointers */
3983     if (txn->mt_nuedbs > CORE_DBS) {
3984         EDB_cursor mc;
3985         EDB_dbi i;
3986         EDB_val data;
3987         data.mv_size = sizeof(EDB_db);
3988 
3989         edb_cursor_init(&mc, txn, MAIN_DBI, NULL);
3990         for (i = CORE_DBS; i < txn->mt_nuedbs; i++) {
3991             if (txn->mt_dbflags[i] & DB_DIRTY) {
3992                 if (TXN_DBI_CHANGED(txn, i)) {
3993                     rc = EDB_BAD_DBI;
3994                     goto fail;
3995                 }
3996                 data.mv_data = &txn->mt_dbs[i];
3997                 rc = edb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data,
3998                     F_SUBDATA);
3999                 if (rc)
4000                     goto fail;
4001             }
4002         }
4003     }
4004 
4005     rc = edb_freelist_save(txn);
4006     if (rc)
4007         goto fail;
4008 
4009     edb_eidl_free(env->me_pghead);
4010     env->me_pghead = NULL;
4011     edb_eidl_shrink(&txn->mt_free_pgs);
4012 
4013 #if (EDB_DEBUG) > 2
4014     edb_audit(txn);
4015 #endif
4016 
4017     if ((rc = edb_page_flush(txn, 0)))
4018         goto fail;
4019     if (!F_ISSET(txn->mt_flags, EDB_TXN_NOSYNC) &&
4020         (rc = edb_env_sync0(env, 0, txn->mt_next_pgno)))
4021         goto fail;
4022     if ((rc = edb_env_write_meta(txn)))
4023         goto fail;
4024     end_mode = EDB_END_COMMITTED|EDB_END_UPDATE;
4025     if (env->me_flags & EDB_PREVSNAPSHOT) {
4026         if (!(env->me_flags & EDB_NOLOCK)) {
4027             int excl;
4028             rc = edb_env_share_locks(env, &excl);
4029             if (rc)
4030                 goto fail;
4031         }
4032         env->me_flags ^= EDB_PREVSNAPSHOT;
4033     }
4034 
4035 done:
4036     edb_txn_end(txn, end_mode);
4037     return EDB_SUCCESS;
4038 
4039 fail:
4040     edb_txn_abort(txn);
4041     return rc;
4042 }
4043 
4044 /** Read the environment parameters of a DB environment before
4045  * mapping it into memory.
4046  * @param[in] env the environment handle
4047  * @param[in] prev whether to read the backup meta page
4048  * @param[out] meta address of where to store the meta information
4049  * @return 0 on success, non-zero on failure.
4050  */
4051 static int ESECT
4052 edb_env_read_header(EDB_env *env, int prev, EDB_meta *meta)
4053 {
4054     EDB_metabuf pbuf;
4055     EDB_page    *p;
4056     EDB_meta    *m;
4057     int         i, rc, off;
4058     enum { Size = sizeof(pbuf) };
4059 
4060     /* We don't know the page size yet, so use a minimum value.
4061      * Read both meta pages so we can use the latest one.
4062      */
4063 
4064     for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) {
4065 #ifdef _WIN32
4066         DWORD len;
4067         OVERLAPPED ov;
4068         memset(&ov, 0, sizeof(ov));
4069         ov.Offset = off;
4070         rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
4071         if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
4072             rc = 0;
4073 #else
4074         rc = pread(env->me_fd, &pbuf, Size, off);
4075 #endif
4076         if (rc != Size) {
4077             if (rc == 0 && off == 0)
4078                 return ENOENT;
4079             rc = rc < 0 ? (int) ErrCode() : EDB_INVALID;
4080             DPRINTF(("read: %s", edb_strerror(rc)));
4081             return rc;
4082         }
4083 
4084         p = (EDB_page *)&pbuf;
4085 
4086         if (!F_ISSET(p->mp_flags, P_META)) {
4087             DPRINTF(("page %"Yu" not a meta page", p->mp_pgno));
4088             return EDB_INVALID;
4089         }
4090 
4091         m = METADATA(p);
4092         if (m->mm_magic != EDB_MAGIC) {
4093             DPUTS("meta has invalid magic");
4094             return EDB_INVALID;
4095         }
4096 
4097         if (m->mm_version != EDB_DATA_VERSION) {
4098             DPRINTF(("database is version %u, expected version %u",
4099                 m->mm_version, EDB_DATA_VERSION));
4100             return EDB_VERSION_MISMATCH;
4101         }
4102 
4103         if (off == 0 || (prev ? m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid))
4104             *meta = *m;
4105     }
4106     return 0;
4107 }
4108 
4109 /** Fill in most of the zeroed #EDB_meta for an empty database environment */
4110 static void ESECT
4111 edb_env_init_meta0(EDB_env *env, EDB_meta *meta)
4112 {
4113     meta->mm_magic = EDB_MAGIC;
4114     meta->mm_version = EDB_DATA_VERSION;
4115     meta->mm_mapsize = env->me_mapsize;
4116     meta->mm_psize = env->me_psize;
4117     meta->mm_last_pg = NUM_METAS-1;
4118     meta->mm_flags = env->me_flags & 0xffff;
4119     meta->mm_flags |= EDB_INTEGERKEY; /* this is mm_dbs[FREE_DBI].md_flags */
4120     meta->mm_dbs[FREE_DBI].md_root = P_INVALID;
4121     meta->mm_dbs[MAIN_DBI].md_root = P_INVALID;
4122 }
4123 
4124 /** Write the environment parameters of a freshly created DB environment.
4125  * @param[in] env the environment handle
4126  * @param[in] meta the #EDB_meta to write
4127  * @return 0 on success, non-zero on failure.
4128  */
4129 static int ESECT
4130 edb_env_init_meta(EDB_env *env, EDB_meta *meta)
4131 {
4132     EDB_page *p, *q;
4133     int rc;
4134     unsigned int     psize;
4135 #ifdef _WIN32
4136     DWORD len;
4137     OVERLAPPED ov;
4138     memset(&ov, 0, sizeof(ov));
4139 #define DO_PWRITE(rc, fd, ptr, size, len, pos)  do { \
4140     ov.Offset = pos;    \
4141     rc = WriteFile(fd, ptr, size, &len, &ov);   } while(0)
4142 #else
4143     int len;
4144 #define DO_PWRITE(rc, fd, ptr, size, len, pos)  do { \
4145     len = pwrite(fd, ptr, size, pos);   \
4146     if (len == -1 && ErrCode() == EINTR) continue; \
4147     rc = (len >= 0); break; } while(1)
4148 #endif
4149 
4150     DPUTS("writing new meta page");
4151 
4152     psize = env->me_psize;
4153 
4154     p = calloc(NUM_METAS, psize);
4155     if (!p)
4156         return ENOMEM;
4157     p->mp_pgno = 0;
4158     p->mp_flags = P_META;
4159     *(EDB_meta *)METADATA(p) = *meta;
4160 
4161     q = (EDB_page *)((char *)p + psize);
4162     q->mp_pgno = 1;
4163     q->mp_flags = P_META;
4164     *(EDB_meta *)METADATA(q) = *meta;
4165 
4166     DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0);
4167     if (!rc)
4168         rc = ErrCode();
4169     else if ((unsigned) len == psize * NUM_METAS)
4170         rc = EDB_SUCCESS;
4171     else
4172         rc = ENOSPC;
4173     free(p);
4174     return rc;
4175 }
4176 
4177 /** Update the environment info to commit a transaction.
4178  * @param[in] txn the transaction that's being committed
4179  * @return 0 on success, non-zero on failure.
4180  */
4181 static int
4182 edb_env_write_meta(EDB_txn *txn)
4183 {
4184     EDB_env *env;
4185     EDB_meta    meta, metab, *mp;
4186     unsigned flags;
4187     edb_size_t mapsize;
4188     off_t off;
4189     int rc, len, toggle;
4190     char *ptr;
4191     HANDLE mfd;
4192 #ifdef _WIN32
4193     OVERLAPPED ov;
4194 #else
4195     int r2;
4196 #endif
4197 
4198     toggle = txn->mt_txnid & 1;
4199     DPRINTF(("writing meta page %d for root page %"Yu,
4200         toggle, txn->mt_dbs[MAIN_DBI].md_root));
4201 
4202     env = txn->mt_env;
4203     flags = txn->mt_flags | env->me_flags;
4204     mp = env->me_metas[toggle];
4205     mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
4206     /* Persist any increases of mapsize config */
4207     if (mapsize < env->me_mapsize)
4208         mapsize = env->me_mapsize;
4209 
4210     if (flags & EDB_WRITEMAP) {
4211         mp->mm_mapsize = mapsize;
4212         mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
4213         mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
4214         mp->mm_last_pg = txn->mt_next_pgno - 1;
4215 #if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && /* TODO: portability */ \
4216     !(defined(__i386__) || defined(__x86_64__))
4217         /* LY: issue a memory barrier, if not x86. ITS#7969 */
4218         __sync_synchronize();
4219 #endif
4220         mp->mm_txnid = txn->mt_txnid;
4221         if (!(flags & (EDB_NOMETASYNC|EDB_NOSYNC))) {
4222             unsigned meta_size = env->me_psize;
4223             rc = (env->me_flags & EDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
4224             ptr = (char *)mp - PAGEHDRSZ;
4225 #ifndef _WIN32  /* POSIX msync() requires ptr = start of OS page */
4226             r2 = (ptr - env->me_map) & (env->me_os_psize - 1);
4227             ptr -= r2;
4228             meta_size += r2;
4229 #endif
4230             if (EDB_MSYNC(ptr, meta_size, rc)) {
4231                 rc = ErrCode();
4232                 goto fail;
4233             }
4234         }
4235         goto done;
4236     }
4237     metab.mm_txnid = mp->mm_txnid;
4238     metab.mm_last_pg = mp->mm_last_pg;
4239 
4240     meta.mm_mapsize = mapsize;
4241     meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
4242     meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
4243     meta.mm_last_pg = txn->mt_next_pgno - 1;
4244     meta.mm_txnid = txn->mt_txnid;
4245 
4246     off = offsetof(EDB_meta, mm_mapsize);
4247     ptr = (char *)&meta + off;
4248     len = sizeof(EDB_meta) - off;
4249     off += (char *)mp - env->me_map;
4250 
4251     /* Write to the SYNC fd unless EDB_NOSYNC/EDB_NOMETASYNC.
4252      * (me_mfd goes to the same file as me_fd, but writing to it
4253      * also syncs to disk.  Avoids a separate fdatasync() call.)
4254      */
4255     mfd = (flags & (EDB_NOSYNC|EDB_NOMETASYNC)) ? env->me_fd : env->me_mfd;
4256 #ifdef _WIN32
4257     {
4258         memset(&ov, 0, sizeof(ov));
4259         ov.Offset = off;
4260         if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
4261             rc = -1;
4262     }
4263 #else
4264 retry_write:
4265     rc = pwrite(mfd, ptr, len, off);
4266 #endif
4267     if (rc != len) {
4268         rc = rc < 0 ? ErrCode() : EIO;
4269 #ifndef _WIN32
4270         if (rc == EINTR)
4271             goto retry_write;
4272 #endif
4273         DPUTS("write failed, disk error?");
4274         /* On a failure, the pagecache still contains the new data.
4275          * Write some old data back, to prevent it from being used.
4276          * Use the non-SYNC fd; we know it will fail anyway.
4277          */
4278         meta.mm_last_pg = metab.mm_last_pg;
4279         meta.mm_txnid = metab.mm_txnid;
4280 #ifdef _WIN32
4281         memset(&ov, 0, sizeof(ov));
4282         ov.Offset = off;
4283         WriteFile(env->me_fd, ptr, len, NULL, &ov);
4284 #else
4285         r2 = pwrite(env->me_fd, ptr, len, off);
4286         (void)r2;   /* Silence warnings. We don't care about pwrite's return value */
4287 #endif
4288 fail:
4289         env->me_flags |= EDB_FATAL_ERROR;
4290         return rc;
4291     }
4292     /* MIPS has cache coherency issues, this is a no-op everywhere else */
4293     CACHEFLUSH(env->me_map + off, len, DCACHE);
4294 done:
4295     /* Memory ordering issues are irrelevant; since the entire writer
4296      * is wrapped by wmutex, all of these changes will become visible
4297      * after the wmutex is unlocked. Since the DB is multi-version,
4298      * readers will get consistent data regardless of how fresh or
4299      * how stale their view of these values is.
4300      */
4301     if (env->me_txns)
4302         env->me_txns->mti_txnid = txn->mt_txnid;
4303 
4304     return EDB_SUCCESS;
4305 }
4306 
4307 /** Check both meta pages to see which one is newer.
4308  * @param[in] env the environment handle
4309  * @return newest #EDB_meta.
4310  */
4311 static EDB_meta *
4312 edb_env_pick_meta(const EDB_env *env)
4313 {
4314     EDB_meta *const *metas = env->me_metas;
4315     return metas[ (metas[0]->mm_txnid < metas[1]->mm_txnid) ^
4316         ((env->me_flags & EDB_PREVSNAPSHOT) != 0) ];
4317 }
4318 
4319 int ESECT
4320 edb_env_create(EDB_env **env)
4321 {
4322     EDB_env *e;
4323 
4324     e = calloc(1, sizeof(EDB_env));
4325     if (!e)
4326         return ENOMEM;
4327 
4328     e->me_maxreaders = DEFAULT_READERS;
4329     e->me_maxdbs = e->me_nuedbs = CORE_DBS;
4330     e->me_fd = INVALID_HANDLE_VALUE;
4331     e->me_lfd = INVALID_HANDLE_VALUE;
4332     e->me_mfd = INVALID_HANDLE_VALUE;
4333 #ifdef EDB_USE_POSIX_SEM
4334     e->me_rmutex = SEM_FAILED;
4335     e->me_wmutex = SEM_FAILED;
4336 #elif defined EDB_USE_SYSV_SEM
4337     e->me_rmutex->semid = -1;
4338     e->me_wmutex->semid = -1;
4339 #endif
4340     e->me_pid = getpid();
4341     GET_PAGESIZE(e->me_os_psize);
4342     VGMEMP_CREATE(e,0,0);
4343     *env = e;
4344     return EDB_SUCCESS;
4345 }
4346 
4347 #ifdef _WIN32
4348 /** @brief Map a result from an NTAPI call to WIN32. */
4349 static DWORD
4350 edb_nt2win32(NTSTATUS st)
4351 {
4352     OVERLAPPED o = {0};
4353     DWORD br;
4354     o.Internal = st;
4355     GetOverlappedResult(NULL, &o, &br, FALSE);
4356     return GetLastError();
4357 }
4358 #endif
4359 
4360 static int ESECT
4361 edb_env_map(EDB_env *env, void *addr)
4362 {
4363     EDB_page *p;
4364     unsigned int flags = env->me_flags;
4365 #ifdef _WIN32
4366     int rc;
4367     int access = SECTION_MAP_READ;
4368     HANDLE mh;
4369     void *map;
4370     SIZE_T msize;
4371     ULONG pageprot = PAGE_READONLY, secprot, alloctype;
4372 
4373     if (flags & EDB_WRITEMAP) {
4374         access |= SECTION_MAP_WRITE;
4375         pageprot = PAGE_READWRITE;
4376     }
4377     if (flags & EDB_RDONLY) {
4378         secprot = PAGE_READONLY;
4379         msize = 0;
4380         alloctype = 0;
4381     } else {
4382         secprot = PAGE_READWRITE;
4383         msize = env->me_mapsize;
4384         alloctype = MEM_RESERVE;
4385     }
4386 
4387     rc = NtCreateSection(&mh, access, NULL, NULL, secprot, SEC_RESERVE, env->me_fd);
4388     if (rc)
4389         return edb_nt2win32(rc);
4390     map = addr;
4391 #ifdef EDB_VL32
4392     msize = NUM_METAS * env->me_psize;
4393 #endif
4394     rc = NtMapViewOfSection(mh, GetCurrentProcess(), &map, 0, 0, NULL, &msize, ViewUnmap, alloctype, pageprot);
4395 #ifdef EDB_VL32
4396     env->me_fmh = mh;
4397 #else
4398     NtClose(mh);
4399 #endif
4400     if (rc)
4401         return edb_nt2win32(rc);
4402     env->me_map = map;
4403 #else
4404     int mmap_flags = MAP_SHARED;
4405     int prot = PROT_READ;
4406 #ifdef MAP_NOSYNC   /* Used on FreeBSD */
4407     if (flags & EDB_NOSYNC)
4408         mmap_flags |= MAP_NOSYNC;
4409 #endif
4410 #ifdef EDB_VL32
4411     (void) flags;
4412     env->me_map = mmap(addr, NUM_METAS * env->me_psize, prot, mmap_flags,
4413         env->me_fd, 0);
4414     if (env->me_map == MAP_FAILED) {
4415         env->me_map = NULL;
4416         return ErrCode();
4417     }
4418 #else
4419     if (flags & EDB_WRITEMAP) {
4420         prot |= PROT_WRITE;
4421         if (ftruncate(env->me_fd, env->me_mapsize) < 0)
4422             return ErrCode();
4423     }
4424     env->me_map = mmap(addr, env->me_mapsize, prot, mmap_flags,
4425         env->me_fd, 0);
4426     if (env->me_map == MAP_FAILED) {
4427         env->me_map = NULL;
4428         return ErrCode();
4429     }
4430 
4431     if (flags & EDB_NORDAHEAD) {
4432         /* Turn off readahead. It's harmful when the DB is larger than RAM. */
4433 #ifdef MADV_RANDOM
4434         madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
4435 #else
4436 #ifdef POSIX_MADV_RANDOM
4437         posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
4438 #endif /* POSIX_MADV_RANDOM */
4439 #endif /* MADV_RANDOM */
4440     }
4441 #endif /* _WIN32 */
4442 
4443     /* Can happen because the address argument to mmap() is just a
4444      * hint.  mmap() can pick another, e.g. if the range is in use.
4445      * The MAP_FIXED flag would prevent that, but then mmap could
4446      * instead unmap existing pages to make room for the new map.
4447      */
4448     if (addr && env->me_map != addr)
4449         return EBUSY;   /* TODO: Make a new EDB_* error code? */
4450 #endif
4451 
4452     p = (EDB_page *)env->me_map;
4453     env->me_metas[0] = METADATA(p);
4454     env->me_metas[1] = (EDB_meta *)((char *)env->me_metas[0] + env->me_psize);
4455 
4456     return EDB_SUCCESS;
4457 }
4458 
4459 int ESECT
4460 edb_env_set_mapsize(EDB_env *env, edb_size_t size)
4461 {
4462     /* If env is already open, caller is responsible for making
4463      * sure there are no active txns.
4464      */
4465     if (env->me_map) {
4466         EDB_meta *meta;
4467 #ifndef EDB_VL32
4468         void *old;
4469         int rc;
4470 #endif
4471         if (env->me_txn)
4472             return EINVAL;
4473         meta = edb_env_pick_meta(env);
4474         if (!size)
4475             size = meta->mm_mapsize;
4476         {
4477             /* Silently round up to minimum if the size is too small */
4478             edb_size_t minsize = (meta->mm_last_pg + 1) * env->me_psize;
4479             if (size < minsize)
4480                 size = minsize;
4481         }
4482 #ifndef EDB_VL32
4483         /* For EDB_VL32 this bit is a noop since we dynamically remap
4484          * chunks of the DB anyway.
4485          */
4486         munmap(env->me_map, env->me_mapsize);
4487         env->me_mapsize = size;
4488         old = (env->me_flags & EDB_FIXEDMAP) ? env->me_map : NULL;
4489         rc = edb_env_map(env, old);
4490         if (rc)
4491             return rc;
4492 #endif /* !EDB_VL32 */
4493     }
4494     env->me_mapsize = size;
4495     if (env->me_psize)
4496         env->me_maxpg = env->me_mapsize / env->me_psize;
4497     return EDB_SUCCESS;
4498 }
4499 
4500 int ESECT
4501 edb_env_set_maxdbs(EDB_env *env, EDB_dbi dbs)
4502 {
4503     if (env->me_map)
4504         return EINVAL;
4505     env->me_maxdbs = dbs + CORE_DBS;
4506     return EDB_SUCCESS;
4507 }
4508 
4509 int ESECT
4510 edb_env_set_maxreaders(EDB_env *env, unsigned int readers)
4511 {
4512     if (env->me_map || readers < 1)
4513         return EINVAL;
4514     env->me_maxreaders = readers;
4515     return EDB_SUCCESS;
4516 }
4517 
4518 int ESECT
4519 edb_env_get_maxreaders(EDB_env *env, unsigned int *readers)
4520 {
4521     if (!env || !readers)
4522         return EINVAL;
4523     *readers = env->me_maxreaders;
4524     return EDB_SUCCESS;
4525 }
4526 
4527 static int ESECT
4528 edb_fsize(HANDLE fd, edb_size_t *size)
4529 {
4530 #ifdef _WIN32
4531     LARGE_INTEGER fsize;
4532 
4533     if (!GetFileSizeEx(fd, &fsize))
4534         return ErrCode();
4535 
4536     *size = fsize.QuadPart;
4537 #else
4538     struct stat st;
4539 
4540     if (fstat(fd, &st))
4541         return ErrCode();
4542 
4543     *size = st.st_size;
4544 #endif
4545     return EDB_SUCCESS;
4546 }
4547 
4548 
4549 #ifdef _WIN32
4550 typedef wchar_t edb_nchar_t;
4551 # define EDB_NAME(str)  L##str
4552 # define edb_name_cpy   wcscpy
4553 #else
4554 /** Character type for file names: char on Unix, wchar_t on Windows */
4555 typedef char    edb_nchar_t;
4556 # define EDB_NAME(str)  str     /**< #edb_nchar_t[] string literal */
4557 # define edb_name_cpy   strcpy  /**< Copy name (#edb_nchar_t string) */
4558 #endif
4559 
4560 /** Filename - string of #edb_nchar_t[] */
4561 typedef struct EDB_name {
4562     int mn_len;                 /**< Length  */
4563     int mn_alloced;             /**< True if #mn_val was malloced */
4564     edb_nchar_t *mn_val;        /**< Contents */
4565 } EDB_name;
4566 
4567 /** Filename suffixes [datafile,lockfile][without,with EDB_NOSUBDIR] */
4568 static const edb_nchar_t *const edb_suffixes[2][2] = {
4569     { EDB_NAME("/data.edb"), EDB_NAME("")      },
4570     { EDB_NAME("/lock.edb"), EDB_NAME("-lock") }
4571 };
4572 
4573 #define EDB_SUFFLEN 9   /**< Max string length in #edb_suffixes[] */
4574 
4575 /** Set up filename + scratch area for filename suffix, for opening files.
4576  * It should be freed with #edb_fname_destroy().
4577  * On Windows, paths are converted from char *UTF-8 to wchar_t *UTF-16.
4578  *
4579  * @param[in] path Pathname for #edb_env_open().
4580  * @param[in] envflags Whether a subdir and/or lockfile will be used.
4581  * @param[out] fname Resulting filename, with room for a suffix if necessary.
4582  */
4583 static int ESECT
4584 edb_fname_init(const char *path, unsigned envflags, EDB_name *fname)
4585 {
4586     int no_suffix = F_ISSET(envflags, EDB_NOSUBDIR|EDB_NOLOCK);
4587     fname->mn_alloced = 0;
4588 #ifdef _WIN32
4589     return utf8_to_utf16(path, fname, no_suffix ? 0 : EDB_SUFFLEN);
4590 #else
4591     fname->mn_len = strlen(path);
4592     if (no_suffix)
4593         fname->mn_val = (char *) path;
4594     else if ((fname->mn_val = malloc(fname->mn_len + EDB_SUFFLEN+1)) != NULL) {
4595         fname->mn_alloced = 1;
4596         strcpy(fname->mn_val, path);
4597     }
4598     else
4599     {
4600         NDRX_LOG(log_error, "%s: malloc fail: %s",
4601             __func__, strerror(errno));
4602         return ENOMEM;
4603     }
4604     return EDB_SUCCESS;
4605 #endif
4606 }
4607 
4608 /** Destroy \b fname from #edb_fname_init() */
4609 #define edb_fname_destroy(fname) \
4610     do { if ((fname).mn_alloced) free((fname).mn_val); } while (0)
4611 
4612 #ifdef O_CLOEXEC /* POSIX.1-2008: Set FD_CLOEXEC atomically at open() */
4613 # define EDB_CLOEXEC        O_CLOEXEC
4614 #else
4615 # define EDB_CLOEXEC        0
4616 #endif
4617 
4618 /** File type, access mode etc. for #edb_fopen() */
4619 enum edb_fopen_type {
4620 #ifdef _WIN32
4621     EDB_O_RDONLY, EDB_O_RDWR, EDB_O_META, EDB_O_COPY, EDB_O_LOCKS
4622 #else
4623     /* A comment in edb_fopen() explains some O_* flag choices. */
4624     EDB_O_RDONLY= O_RDONLY,                            /**< for RDONLY me_fd */
4625     EDB_O_RDWR  = O_RDWR  |O_CREAT,                    /**< for me_fd */
4626     EDB_O_META  = O_WRONLY|EDB_DSYNC     |EDB_CLOEXEC, /**< for me_mfd */
4627     EDB_O_COPY  = O_WRONLY|O_CREAT|O_EXCL|EDB_CLOEXEC, /**< for #edb_env_copy() */
4628     /** Bitmask for open() flags in enum #edb_fopen_type.  The other bits
4629      * distinguish otherwise-equal EDB_O_* constants from each other.
4630      */
4631     EDB_O_MASK  = EDB_O_RDWR|EDB_CLOEXEC | EDB_O_RDONLY|EDB_O_META|EDB_O_COPY,
4632     EDB_O_LOCKS = EDB_O_RDWR|EDB_CLOEXEC | ((EDB_O_MASK+1) & ~EDB_O_MASK) /**< for me_lfd */
4633 #endif
4634 };
4635 
4636 /** Open an EXDB file.
4637  * @param[in] env   The EXDB environment.
4638  * @param[in,out] fname Path from from #edb_fname_init().  A suffix is
4639  * appended if necessary to create the filename, without changing mn_len.
4640  * @param[in] which Determines file type, access mode, etc.
4641  * @param[in] mode  The Unix permissions for the file, if we create it.
4642  * @param[out] res  Resulting file handle.
4643  * @return 0 on success, non-zero on failure.
4644  */
4645 static int ESECT
4646 edb_fopen(const EDB_env *env, EDB_name *fname,
4647     enum edb_fopen_type which, edb_mode_t mode,
4648     HANDLE *res)
4649 {
4650     int rc = EDB_SUCCESS;
4651     HANDLE fd;
4652 #ifdef _WIN32
4653     DWORD acc, share, disp, attrs;
4654 #else
4655     int flags;
4656 #endif
4657 
4658     if (fname->mn_alloced)      /* modifiable copy */
4659         edb_name_cpy(fname->mn_val + fname->mn_len,
4660             edb_suffixes[which==EDB_O_LOCKS][F_ISSET(env->me_flags, EDB_NOSUBDIR)]);
4661 
4662     /* The directory must already exist.  Usually the file need not.
4663      * EDB_O_META requires the file because we already created it using
4664      * EDB_O_RDWR.  EDB_O_COPY must not overwrite an existing file.
4665      *
4666      * With EDB_O_COPY we do not want the OS to cache the writes, since
4667      * the source data is already in the OS cache.
4668      *
4669      * The lockfile needs FD_CLOEXEC (close file descriptor on exec*())
4670      * to avoid the flock() issues noted under Caveats in exdb.h.
4671      * Also set it for other filehandles which the user cannot get at
4672      * and close himself, which he may need after fork().  I.e. all but
4673      * me_fd, which programs do use via edb_env_get_fd().
4674      */
4675 
4676 #ifdef _WIN32
4677     acc = GENERIC_READ|GENERIC_WRITE;
4678     share = FILE_SHARE_READ|FILE_SHARE_WRITE;
4679     disp = OPEN_ALWAYS;
4680     attrs = FILE_ATTRIBUTE_NORMAL;
4681     switch (which) {
4682     case EDB_O_RDONLY:          /* read-only datafile */
4683         acc = GENERIC_READ;
4684         disp = OPEN_EXISTING;
4685         break;
4686     case EDB_O_META:            /* for writing metapages */
4687         acc = GENERIC_WRITE;
4688         disp = OPEN_EXISTING;
4689         attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH;
4690         break;
4691     case EDB_O_COPY:            /* edb_env_copy() & co */
4692         acc = GENERIC_WRITE;
4693         share = 0;
4694         disp = CREATE_NEW;
4695         attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH;
4696         break;
4697     default: break; /* silence gcc -Wswitch (not all enum values handled) */
4698     }
4699     fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL);
4700 #else
4701     fd = open(fname->mn_val, which & EDB_O_MASK, mode);
4702 #endif
4703 
4704     if (fd == INVALID_HANDLE_VALUE)
4705         rc = ErrCode();
4706 #ifndef _WIN32
4707     else {
4708         if (which != EDB_O_RDONLY && which != EDB_O_RDWR) {
4709             /* Set CLOEXEC if we could not pass it to open() */
4710             if (!EDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1)
4711                 (void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
4712         }
4713         if (which == EDB_O_COPY && env->me_psize >= env->me_os_psize) {
4714             /* This may require buffer alignment.  There is no portable
4715              * way to ask how much, so we require OS pagesize alignment.
4716              */
4717 # ifdef F_NOCACHE   /* __APPLE__ */
4718             (void) fcntl(fd, F_NOCACHE, 1);
4719 # elif defined O_DIRECT
4720             /* open(...O_DIRECT...) would break on filesystems without
4721              * O_DIRECT support (ITS#7682). Try to set it here instead.
4722              */
4723             if ((flags = fcntl(fd, F_GETFL)) != -1)
4724                 (void) fcntl(fd, F_SETFL, flags | O_DIRECT);
4725 # endif
4726         }
4727     }
4728 #endif  /* !_WIN32 */
4729 
4730     *res = fd;
4731     return rc;
4732 }
4733 
4734 
4735 #ifdef BROKEN_FDATASYNC
4736 #include <sys/utsname.h>
4737 #include <sys/vfs.h>
4738 #endif
4739 
4740 /** Further setup required for opening an EXDB environment
4741  */
4742 static int ESECT
4743 edb_env_open2(EDB_env *env, int prev)
4744 {
4745     unsigned int flags = env->me_flags;
4746     int i, newenv = 0, rc;
4747     EDB_meta meta;
4748 
4749 #ifdef _WIN32
4750     /* See if we should use QueryLimited */
4751     rc = GetVersion();
4752     if ((rc & 0xff) > 5)
4753         env->me_pidquery = EDB_PROCESS_QUERY_LIMITED_INFORMATION;
4754     else
4755         env->me_pidquery = PROCESS_QUERY_INFORMATION;
4756     /* Grab functions we need from NTDLL */
4757     if (!NtCreateSection) {
4758         HMODULE h = GetModuleHandleW(L"NTDLL.DLL");
4759         if (!h)
4760             return EDB_PROBLEM;
4761         NtClose = (NtCloseFunc *)GetProcAddress(h, "NtClose");
4762         if (!NtClose)
4763             return EDB_PROBLEM;
4764         NtMapViewOfSection = (NtMapViewOfSectionFunc *)GetProcAddress(h, "NtMapViewOfSection");
4765         if (!NtMapViewOfSection)
4766             return EDB_PROBLEM;
4767         NtCreateSection = (NtCreateSectionFunc *)GetProcAddress(h, "NtCreateSection");
4768         if (!NtCreateSection)
4769             return EDB_PROBLEM;
4770     }
4771 #endif /* _WIN32 */
4772 
4773 #ifdef BROKEN_FDATASYNC
4774     /* ext3/ext4 fdatasync is broken on some older Linux kernels.
4775      * https://lkml.org/lkml/2012/9/3/83
4776      * Kernels after 3.6-rc6 are known good.
4777      * https://lkml.org/lkml/2012/9/10/556
4778      * See if the DB is on ext3/ext4, then check for new enough kernel
4779      * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known
4780      * to be patched.
4781      */
4782     {
4783         struct statfs st;
4784         fstatfs(env->me_fd, &st);
4785         while (st.f_type == 0xEF53) {
4786             struct utsname uts;
4787             int i;
4788             uname(&uts);
4789             if (uts.release[0] < '3') {
4790                 if (!strncmp(uts.release, "2.6.32.", 7)) {
4791                     i = atoi(uts.release+7);
4792                     if (i >= 60)
4793                         break;  /* 2.6.32.60 and newer is OK */
4794                 } else if (!strncmp(uts.release, "2.6.34.", 7)) {
4795                     i = atoi(uts.release+7);
4796                     if (i >= 15)
4797                         break;  /* 2.6.34.15 and newer is OK */
4798                 }
4799             } else if (uts.release[0] == '3') {
4800                 i = atoi(uts.release+2);
4801                 if (i > 5)
4802                     break;  /* 3.6 and newer is OK */
4803                 if (i == 5) {
4804                     i = atoi(uts.release+4);
4805                     if (i >= 4)
4806                         break;  /* 3.5.4 and newer is OK */
4807                 } else if (i == 2) {
4808                     i = atoi(uts.release+4);
4809                     if (i >= 30)
4810                         break;  /* 3.2.30 and newer is OK */
4811                 }
4812             } else {    /* 4.x and newer is OK */
4813                 break;
4814             }
4815             env->me_flags |= EDB_FSYNCONLY;
4816             break;
4817         }
4818     }
4819 #endif
4820 
4821     if ((i = edb_env_read_header(env, prev, &meta)) != 0) {
4822         if (i != ENOENT)
4823             return i;
4824         DPUTS("new edbenv");
4825         newenv = 1;
4826         env->me_psize = env->me_os_psize;
4827         if (env->me_psize > MAX_PAGESIZE)
4828             env->me_psize = MAX_PAGESIZE;
4829         memset(&meta, 0, sizeof(meta));
4830         edb_env_init_meta0(env, &meta);
4831         meta.mm_mapsize = DEFAULT_MAPSIZE;
4832     } else {
4833         env->me_psize = meta.mm_psize;
4834     }
4835 
4836     /* Was a mapsize configured? */
4837     if (!env->me_mapsize) {
4838         env->me_mapsize = meta.mm_mapsize;
4839     }
4840     {
4841         /* Make sure mapsize >= committed data size.  Even when using
4842          * mm_mapsize, which could be broken in old files (ITS#7789).
4843          */
4844         edb_size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize;
4845         if (env->me_mapsize < minsize)
4846             env->me_mapsize = minsize;
4847     }
4848     meta.mm_mapsize = env->me_mapsize;
4849 
4850     if (newenv && !(flags & EDB_FIXEDMAP)) {
4851         /* edb_env_map() may grow the datafile.  Write the metapages
4852          * first, so the file will be valid if initialization fails.
4853          * Except with FIXEDMAP, since we do not yet know mm_address.
4854          * We could fill in mm_address later, but then a different
4855          * program might end up doing that - one with a memory layout
4856          * and map address which does not suit the main program.
4857          */
4858         rc = edb_env_init_meta(env, &meta);
4859         if (rc) {
4860             NDRX_LOG(log_error, "%s: edb_env_init_meta failed: %d",
4861                 __func__, rc);
4862             return rc;
4863         }
4864         newenv = 0;
4865     }
4866 #ifdef _WIN32
4867     /* For FIXEDMAP, make sure the file is non-empty before we attempt to map it */
4868     if (newenv) {
4869         char dummy = 0;
4870         DWORD len;
4871         rc = WriteFile(env->me_fd, &dummy, 1, &len, NULL);
4872         if (!rc) {
4873             rc = ErrCode();
4874             return rc;
4875         }
4876     }
4877 #endif
4878 
4879     rc = edb_env_map(env, (flags & EDB_FIXEDMAP) ? meta.mm_address : NULL);
4880     if (rc) {
4881         NDRX_LOG(log_error, "%s: edb_env_map failed: %d",
4882             __func__, rc);
4883         return rc;
4884     }
4885 
4886     if (newenv) {
4887         if (flags & EDB_FIXEDMAP)
4888             meta.mm_address = env->me_map;
4889         i = edb_env_init_meta(env, &meta);
4890         if (i != EDB_SUCCESS) {
4891             return i;
4892         }
4893     }
4894 
4895     env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
4896     env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / EDB_MINKEYS) & -2)
4897         - sizeof(indx_t);
4898 #if !(EDB_MAXKEYSIZE)
4899     env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(EDB_db));
4900 #endif
4901     env->me_maxpg = env->me_mapsize / env->me_psize;
4902 
4903     if (env->me_txns)
4904         env->me_txns->mti_txnid = meta.mm_txnid;
4905 
4906 #if EDB_DEBUG
4907     {
4908         EDB_meta *meta = edb_env_pick_meta(env);
4909         EDB_db *db = &meta->mm_dbs[MAIN_DBI];
4910 
4911         DPRINTF(("opened database version %u, pagesize %u",
4912             meta->mm_version, env->me_psize));
4913         DPRINTF(("using meta page %d",  (int) (meta->mm_txnid & 1)));
4914         DPRINTF(("depth: %u",           db->md_depth));
4915         DPRINTF(("entries: %"Yu,        db->md_entries));
4916         DPRINTF(("branch pages: %"Yu,   db->md_branch_pages));
4917         DPRINTF(("leaf pages: %"Yu,     db->md_leaf_pages));
4918         DPRINTF(("overflow pages: %"Yu, db->md_overflow_pages));
4919         DPRINTF(("root: %"Yu,           db->md_root));
4920     }
4921 #endif
4922 
4923     return EDB_SUCCESS;
4924 }
4925 
4926 
4927 /** Release a reader thread's slot in the reader lock table.
4928  *  This function is called automatically when a thread exits.
4929  * @param[in] ptr This points to the slot in the reader lock table.
4930  */
4931 static void
4932 edb_env_reader_dest(void *ptr)
4933 {
4934     EDB_reader *reader = ptr;
4935 
4936 #ifndef _WIN32
4937     if (reader->mr_pid == getpid()) /* catch pthread_exit() in child process */
4938 #endif
4939         /* We omit the mutex, so do this atomically (i.e. skip mr_txnid) */
4940         reader->mr_pid = 0;
4941 }
4942 
4943 #ifdef _WIN32
4944 /** Junk for arranging thread-specific callbacks on Windows. This is
4945  *  necessarily platform and compiler-specific. Windows supports up
4946  *  to 1088 keys. Let's assume nobody opens more than 64 environments
4947  *  in a single process, for now. They can override this if needed.
4948  */
4949 #ifndef MAX_TLS_KEYS
4950 #define MAX_TLS_KEYS    64
4951 #endif
4952 static pthread_key_t edb_tls_keys[MAX_TLS_KEYS];
4953 static int edb_tls_nkeys;
4954 
4955 static void NTAPI edb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
4956 {
4957     int i;
4958     switch(reason) {
4959     case DLL_PROCESS_ATTACH: break;
4960     case DLL_THREAD_ATTACH: break;
4961     case DLL_THREAD_DETACH:
4962         for (i=0; i<edb_tls_nkeys; i++) {
4963             EDB_reader *r = pthread_getspecific(edb_tls_keys[i]);
4964             if (r) {
4965                 edb_env_reader_dest(r);
4966             }
4967         }
4968         break;
4969     case DLL_PROCESS_DETACH: break;
4970     }
4971 }
4972 #ifdef __GNUC__
4973 #ifdef _WIN64
4974 const PIMAGE_TLS_CALLBACK edb_tls_cbp __attribute__((section (".CRT$XLB"))) = edb_tls_callback;
4975 #else
4976 PIMAGE_TLS_CALLBACK edb_tls_cbp __attribute__((section (".CRT$XLB"))) = edb_tls_callback;
4977 #endif
4978 #else
4979 #ifdef _WIN64
4980 /* Force some symbol references.
4981  *  _tls_used forces the linker to create the TLS directory if not already done
4982  *  edb_tls_cbp prevents whole-program-optimizer from dropping the symbol.
4983  */
4984 #pragma comment(linker, "/INCLUDE:_tls_used")
4985 #pragma comment(linker, "/INCLUDE:edb_tls_cbp")
4986 #pragma const_seg(".CRT$XLB")
4987 extern const PIMAGE_TLS_CALLBACK edb_tls_cbp;
4988 const PIMAGE_TLS_CALLBACK edb_tls_cbp = edb_tls_callback;
4989 #pragma const_seg()
4990 #else   /* _WIN32 */
4991 #pragma comment(linker, "/INCLUDE:__tls_used")
4992 #pragma comment(linker, "/INCLUDE:_edb_tls_cbp")
4993 #pragma data_seg(".CRT$XLB")
4994 PIMAGE_TLS_CALLBACK edb_tls_cbp = edb_tls_callback;
4995 #pragma data_seg()
4996 #endif  /* WIN 32/64 */
4997 #endif  /* !__GNUC__ */
4998 #endif
4999 
5000 /** Downgrade the exclusive lock on the region back to shared */
5001 static int ESECT
5002 edb_env_share_locks(EDB_env *env, int *excl)
5003 {
5004     int rc = 0;
5005 
5006 #ifdef _WIN32
5007     {
5008         OVERLAPPED ov;
5009         /* First acquire a shared lock. The Unlock will
5010          * then release the existing exclusive lock.
5011          */
5012         memset(&ov, 0, sizeof(ov));
5013         if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
5014             rc = ErrCode();
5015         } else {
5016             UnlockFile(env->me_lfd, 0, 0, 1, 0);
5017             *excl = 0;
5018         }
5019     }
5020 #else
5021     {
5022         struct flock lock_info;
5023         /* The shared lock replaces the existing lock */
5024         memset((void *)&lock_info, 0, sizeof(lock_info));
5025         lock_info.l_type = F_RDLCK;
5026         lock_info.l_whence = SEEK_SET;
5027         lock_info.l_start = 0;
5028         lock_info.l_len = 1;
5029         while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
5030                 (rc = ErrCode()) == EINTR) ;
5031         *excl = rc ? -1 : 0;    /* error may mean we lost the lock */
5032     }
5033 #endif
5034 
5035     return rc;
5036 }
5037 
5038 /** Try to get exclusive lock, otherwise shared.
5039  *  Maintain *excl = -1: no/unknown lock, 0: shared, 1: exclusive.
5040  */
5041 static int ESECT
5042 edb_env_excl_lock(EDB_env *env, int *excl)
5043 {
5044     int rc = 0;
5045 #ifdef _WIN32
5046     if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
5047         *excl = 1;
5048     } else {
5049         OVERLAPPED ov;
5050         memset(&ov, 0, sizeof(ov));
5051         if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
5052             *excl = 0;
5053         } else {
5054             rc = ErrCode();
5055         }
5056     }
5057 #else
5058     struct flock lock_info;
5059     memset((void *)&lock_info, 0, sizeof(lock_info));
5060     lock_info.l_type = F_WRLCK;
5061     lock_info.l_whence = SEEK_SET;
5062     lock_info.l_start = 0;
5063     lock_info.l_len = 1;
5064     while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
5065             (rc = ErrCode()) == EINTR) ;
5066     if (!rc) {
5067         *excl = 1;
5068     } else
5069 # ifndef EDB_USE_POSIX_MUTEX
5070     if (*excl < 0) /* always true when EDB_USE_POSIX_MUTEX */
5071 # endif
5072     {
5073         lock_info.l_type = F_RDLCK;
5074         while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
5075                 (rc = ErrCode()) == EINTR) ;
5076         if (rc == 0)
5077             *excl = 0;
5078     }
5079 #endif
5080     return rc;
5081 }
5082 
5083 #ifdef EDB_USE_HASH
5084 /*
5085  * hash_64 - 64 bit Fowler/Noll/Vo-0 FNV-1a hash code
5086  *
5087  * @(#) $Revision: 5.1 $
5088  * @(#) $Id: hash_64a.c,v 5.1 2009/06/30 09:01:38 chongo Exp $
5089  * @(#) $Source: /usr/local/src/cmd/fnv/RCS/hash_64a.c,v $
5090  *
5091  *    http://www.isthe.com/chongo/tech/comp/fnv/index.html
5092  *
5093  ***
5094  *
5095  * Please do not copyright this code.  This code is in the public domain.
5096  *
5097  * LANDON CURT NOLL DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
5098  * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO
5099  * EVENT SHALL LANDON CURT NOLL BE LIABLE FOR ANY SPECIAL, INDIRECT OR
5100  * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF
5101  * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
5102  * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
5103  * PERFORMANCE OF THIS SOFTWARE.
5104  *
5105  * By:
5106  *  chongo <Landon Curt Noll> /\oo/\
5107  *    http://www.isthe.com/chongo/
5108  *
5109  * Share and Enjoy! :-)
5110  */
5111 
5112 /** perform a 64 bit Fowler/Noll/Vo FNV-1a hash on a buffer
5113  * @param[in] val   value to hash
5114  * @param[in] len   length of value
5115  * @return 64 bit hash
5116  */
5117 static edb_hash_t
5118 edb_hash(const void *val, size_t len)
5119 {
5120     const unsigned char *s = (const unsigned char *) val, *end = s + len;
5121     edb_hash_t hval = 0xcbf29ce484222325ULL;
5122     /*
5123      * FNV-1a hash each octet of the buffer
5124      */
5125     while (s < end) {
5126         hval = (hval ^ *s++) * 0x100000001b3ULL;
5127     }
5128     /* return our new hash value */
5129     return hval;
5130 }
5131 
5132 /** Hash the string and output the encoded hash.
5133  * This uses modified RFC1924 Ascii85 encoding to accommodate systems with
5134  * very short name limits. We don't care about the encoding being reversible,
5135  * we just want to preserve as many bits of the input as possible in a
5136  * small printable string.
5137  * @param[in] str string to hash
5138  * @param[out] encbuf an array of 11 chars to hold the hash
5139  */
5140 static const char edb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";
5141 
5142 static void ESECT
5143 edb_pack85(unsigned long long l, char *out)
5144 {
5145     int i;
5146 
5147     for (i=0; i<10 && l; i++) {
5148         *out++ = edb_a85[l % 85];
5149         l /= 85;
5150     }
5151     *out = '\0';
5152 }
5153 
5154 /** Init #EDB_env.me_mutexname[] except the char which #MUTEXNAME() will set.
5155  *  Changes to this code must be reflected in #EDB_LOCK_FORMAT.
5156  */
5157 static void ESECT
5158 edb_env_mname_init(EDB_env *env)
5159 {
5160     char *nm = env->me_mutexname;
5161     strcpy(nm, MUTEXNAME_PREFIX);
5162     edb_pack85(env->me_txns->mti_mutexid, nm + sizeof(MUTEXNAME_PREFIX));
5163 }
5164 
5165 /** Return env->me_mutexname after filling in ch ('r'/'w') for convenience */
5166 #define MUTEXNAME(env, ch) ( \
5167         (void) ((env)->me_mutexname[sizeof(MUTEXNAME_PREFIX)-1] = (ch)), \
5168         (env)->me_mutexname)
5169 
5170 #endif
5171 
5172 /** Open and/or initialize the lock region for the environment.
5173  * @param[in] env The EXDB environment.
5174  * @param[in] fname Filename + scratch area, from #edb_fname_init().
5175  * @param[in] mode The Unix permissions for the file, if we create it.
5176  * @param[in,out] excl In -1, out lock type: -1 none, 0 shared, 1 exclusive
5177  * @return 0 on success, non-zero on failure.
5178  */
5179 static int ESECT
5180 edb_env_setup_locks(EDB_env *env, EDB_name *fname, int mode, int *excl)
5181 {
5182 #ifdef _WIN32
5183 #   define EDB_ERRCODE_ROFS ERROR_WRITE_PROTECT
5184 #else
5185 #   define EDB_ERRCODE_ROFS EROFS
5186 #endif
5187 #ifdef EDB_USE_SYSV_SEM
5188     int semid;
5189     union semun semu;
5190 #endif
5191     int rc;
5192     off_t size, rsize;
5193 
5194     rc = edb_fopen(env, fname, EDB_O_LOCKS, mode, &env->me_lfd);
5195     if (rc) {
5196         /* Omit lockfile if read-only env on read-only filesystem */
5197         if (rc == EDB_ERRCODE_ROFS && (env->me_flags & EDB_RDONLY)) {
5198             return EDB_SUCCESS;
5199         }
5200         goto fail;
5201     }
5202 
5203     if (!(env->me_flags & EDB_NOTLS)) {
5204         rc = pthread_key_create(&env->me_txkey, edb_env_reader_dest);
5205         if (rc) {
5206             NDRX_LOG(log_debug, "%s: pthread_key_create failed: %d",
5207                 __func__, rc);
5208             goto fail;
5209         }
5210         env->me_flags |= EDB_ENV_TXKEY;
5211 #ifdef _WIN32
5212         /* Windows TLS callbacks need help finding their TLS info. */
5213         if (edb_tls_nkeys >= MAX_TLS_KEYS) {
5214             rc = EDB_TLS_FULL;
5215             goto fail;
5216         }
5217         edb_tls_keys[edb_tls_nkeys++] = env->me_txkey;
5218 #endif
5219     }
5220 
5221     /* Try to get exclusive lock. If we succeed, then
5222      * nobody is using the lock region and we should initialize it.
5223      */
5224     if ((rc = edb_env_excl_lock(env, excl))) goto fail;
5225 
5226 #ifdef _WIN32
5227     size = GetFileSize(env->me_lfd, NULL);
5228 #else
5229     size = lseek(env->me_lfd, 0, SEEK_END);
5230     if (size == -1) {
5231         int err_ = errno;
5232         NDRX_LOG(log_error, "%s: lseek failed: %s", __func__, strerror(err_));
5233         errno = err_;
5234         goto fail_errno;
5235     }
5236 #endif
5237     rsize = (env->me_maxreaders-1) * sizeof(EDB_reader) + sizeof(EDB_txninfo);
5238     if (size < rsize && *excl > 0) {
5239 #ifdef _WIN32
5240         if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
5241             || !SetEndOfFile(env->me_lfd))
5242             goto fail_errno;
5243 #else
5244         if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
5245 #endif
5246     } else {
5247         rsize = size;
5248         size = rsize - sizeof(EDB_txninfo);
5249         env->me_maxreaders = size/sizeof(EDB_reader) + 1;
5250     }
5251     {
5252 #ifdef _WIN32
5253         HANDLE mh;
5254         mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
5255             0, 0, NULL);
5256         if (!mh) goto fail_errno;
5257         env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
5258         CloseHandle(mh);
5259         if (!env->me_txns) goto fail_errno;
5260 #else
5261         void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
5262             env->me_lfd, 0);
5263         if (m == MAP_FAILED) {
5264             int err_ = errno;
5265             NDRX_LOG(log_error, "%s: mmap failed: %s",
5266                 __func__, strerror(err_));
5267             errno = err_;
5268             goto fail_errno;
5269         }
5270         env->me_txns = m;
5271 #endif
5272     }
5273     if (*excl > 0) {
5274 #ifdef _WIN32
5275         BY_HANDLE_FILE_INFORMATION stbuf;
5276         struct {
5277             DWORD volume;
5278             DWORD nhigh;
5279             DWORD nlow;
5280         } idbuf;
5281 
5282         if (!edb_sec_inited) {
5283             InitializeSecurityDescriptor(&edb_null_sd,
5284                 SECURITY_DESCRIPTOR_REVISION);
5285             SetSecurityDescriptorDacl(&edb_null_sd, TRUE, 0, FALSE);
5286             edb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
5287             edb_all_sa.bInheritHandle = FALSE;
5288             edb_all_sa.lpSecurityDescriptor = &edb_null_sd;
5289             edb_sec_inited = 1;
5290         }
5291         if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
5292         idbuf.volume = stbuf.dwVolumeSerialNumber;
5293         idbuf.nhigh  = stbuf.nFileIndexHigh;
5294         idbuf.nlow   = stbuf.nFileIndexLow;
5295         env->me_txns->mti_mutexid = edb_hash(&idbuf, sizeof(idbuf));
5296         edb_env_mname_init(env);
5297         env->me_rmutex = CreateMutexA(&edb_all_sa, FALSE, MUTEXNAME(env, 'r'));
5298         if (!env->me_rmutex) goto fail_errno;
5299         env->me_wmutex = CreateMutexA(&edb_all_sa, FALSE, MUTEXNAME(env, 'w'));
5300         if (!env->me_wmutex) goto fail_errno;
5301 #elif defined(EDB_USE_POSIX_SEM)
5302         struct stat stbuf;
5303         struct {
5304             dev_t dev;
5305             ino_t ino;
5306         } idbuf;
5307 
5308 #if defined(__NetBSD__)
5309 #define EDB_SHORT_SEMNAMES  1   /* limited to 14 chars */
5310 #endif
5311         if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
5312         memset(&idbuf, 0, sizeof(idbuf));
5313         idbuf.dev = stbuf.st_dev;
5314         idbuf.ino = stbuf.st_ino;
5315         env->me_txns->mti_mutexid = edb_hash(&idbuf, sizeof(idbuf))
5316 #ifdef EDB_SHORT_SEMNAMES
5317             /* Max 9 base85-digits.  We truncate here instead of in
5318              * edb_env_mname_init() to keep the latter portable.
5319              */
5320             % ((edb_hash_t)85*85*85*85*85*85*85*85*85)
5321 #endif
5322             ;
5323         edb_env_mname_init(env);
5324         /* Clean up after a previous run, if needed:  Try to
5325          * remove both semaphores before doing anything else.
5326          */
5327         sem_unlink(MUTEXNAME(env, 'r'));
5328         sem_unlink(MUTEXNAME(env, 'w'));
5329         env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), O_CREAT|O_EXCL, mode, 1);
5330         if (env->me_rmutex == SEM_FAILED) goto fail_errno;
5331         env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), O_CREAT|O_EXCL, mode, 1);
5332         if (env->me_wmutex == SEM_FAILED) goto fail_errno;
5333 #elif defined(EDB_USE_SYSV_SEM)
5334         unsigned short vals[2] = {1, 1};
5335         key_t key = ftok(fname->mn_val, 'M'); /* fname is lockfile path now */
5336         if (key == -1) {
5337             int err_ = errno;
5338             NDRX_LOG(log_error, "%s: ftok failed: %s",
5339                 __func__, strerror(err_));
5340             errno = err_;
5341             goto fail_errno;
5342         }
5343         semid = semget(key, 2, (mode & 0777) | IPC_CREAT);
5344         if (semid < 0) {
5345             int err_ = errno;
5346             NDRX_LOG(log_error, "%s: semget failed: %s",
5347                 __func__, strerror(err_));
5348             errno = err_;
5349             goto fail_errno;
5350         }
5351         semu.array = vals;
5352         if (semctl(semid, 0, SETALL, semu) < 0) {
5353             int err_ = errno;
5354             NDRX_LOG(log_error, "%s: semctl failed: %s",
5355                 __func__, strerror(err_));
5356             errno = err_;
5357             goto fail_errno;
5358         }
5359         env->me_txns->mti_semid = semid;
5360         env->me_txns->mti_rlocked = 0;
5361         env->me_txns->mti_wlocked = 0;
5362 #else   /* EDB_USE_POSIX_MUTEX: */
5363         pthread_mutexattr_t mattr;
5364 
5365         /* Solaris needs this before initing a robust mutex.  Otherwise
5366          * it may skip the init and return EBUSY "seems someone already
5367          * inited" or EINVAL "it was inited differently".
5368          */
5369         memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex));
5370         memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex));
5371 
5372         if ((rc = pthread_mutexattr_init(&mattr)) != 0)
5373             goto fail;
5374         rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
5375 #ifdef EDB_ROBUST_SUPPORTED
5376         if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST);
5377 #endif
5378         if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr);
5379         if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr);
5380         pthread_mutexattr_destroy(&mattr);
5381         if (rc) {
5382             NDRX_LOG(log_error, "%s: pthread_mutexattr_destroy failed: %s",
5383                 __func__, strerror(errno));
5384             goto fail;
5385         }
5386 #endif  /* _WIN32 || ... */
5387 
5388         env->me_txns->mti_magic = EDB_MAGIC;
5389         env->me_txns->mti_format = EDB_LOCK_FORMAT;
5390         env->me_txns->mti_txnid = 0;
5391         env->me_txns->mti_numreaders = 0;
5392 
5393     } else {
5394 #ifdef EDB_USE_SYSV_SEM
5395         struct semid_ds buf;
5396 #endif
5397         if (env->me_txns->mti_magic != EDB_MAGIC) {
5398             NDRX_LOG(log_error, "lock region has invalid magic");
5399             rc = EDB_INVALID;
5400             goto fail;
5401         }
5402         if (env->me_txns->mti_format != EDB_LOCK_FORMAT) {
5403             NDRX_LOG(log_error, "lock region has format+version 0x%x, expected 0x%x",
5404                 env->me_txns->mti_format, EDB_LOCK_FORMAT);
5405             rc = EDB_VERSION_MISMATCH;
5406             goto fail;
5407         }
5408         rc = ErrCode();
5409         if (rc && rc != EACCES && rc != EAGAIN) {
5410             NDRX_LOG(log_error, "Invalid rc=%d", rc);
5411             goto fail;
5412         }
5413 #ifdef _WIN32
5414         edb_env_mname_init(env);
5415         env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'r'));
5416         if (!env->me_rmutex) goto fail_errno;
5417         env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'w'));
5418         if (!env->me_wmutex) goto fail_errno;
5419 #elif defined(EDB_USE_POSIX_SEM)
5420         edb_env_mname_init(env);
5421         env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), 0);
5422         if (env->me_rmutex == SEM_FAILED) goto fail_errno;
5423         env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), 0);
5424         if (env->me_wmutex == SEM_FAILED) goto fail_errno;
5425 #elif defined(EDB_USE_SYSV_SEM)
5426         semid = env->me_txns->mti_semid;
5427         semu.buf = &buf;
5428         /* check for read access */
5429         if (semctl(semid, 0, IPC_STAT, semu) < 0) {
5430             int err_ = errno;
5431             NDRX_LOG(log_error, "%s: semctl failed: %s",
5432                 __func__, strerror(err_));
5433             errno = err_;
5434             goto fail_errno;
5435         }
5436         /* check for write access */
5437         if (semctl(semid, 0, IPC_SET, semu) < 0) {
5438             int err_ = errno;
5439             NDRX_LOG(log_error, "%s: semctl failed: %s",
5440                 __func__, strerror(err_));
5441             errno = err_;
5442             goto fail_errno;
5443         }
5444 #endif
5445     }
5446 #ifdef EDB_USE_SYSV_SEM
5447     env->me_rmutex->semid = semid;
5448     env->me_wmutex->semid = semid;
5449     env->me_rmutex->semnum = 0;
5450     env->me_wmutex->semnum = 1;
5451     env->me_rmutex->locked = &env->me_txns->mti_rlocked;
5452     env->me_wmutex->locked = &env->me_txns->mti_wlocked;
5453 #endif
5454 
5455     return EDB_SUCCESS;
5456 
5457 fail_errno:
5458     rc = ErrCode();
5459 fail:
5460     return rc;
5461 }
5462 
5463     /** Only a subset of the @ref edb_env flags can be changed
5464      *  at runtime. Changing other flags requires closing the
5465      *  environment and re-opening it with the new flags.
5466      */
5467 #define CHANGEABLE  (EDB_NOSYNC|EDB_NOMETASYNC|EDB_MAPASYNC|EDB_NOMEMINIT)
5468 #define CHANGELESS  (EDB_FIXEDMAP|EDB_NOSUBDIR|EDB_RDONLY| \
5469     EDB_WRITEMAP|EDB_NOTLS|EDB_NOLOCK|EDB_NORDAHEAD|EDB_PREVSNAPSHOT)
5470 
5471 #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
5472 # error "Persistent DB flags & env flags overlap, but both go in mm_flags"
5473 #endif
5474 
5475 int ESECT
5476 edb_env_open(EDB_env *env, const char *path, unsigned int flags, edb_mode_t mode)
5477 {
5478     int rc, excl = -1;
5479     EDB_name fname;
5480 
5481     if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
5482         return EINVAL;
5483 
5484 #ifdef EDB_VL32
5485     if (flags & EDB_WRITEMAP) {
5486         /* silently ignore WRITEMAP in 32 bit mode */
5487         flags ^= EDB_WRITEMAP;
5488     }
5489     if (flags & EDB_FIXEDMAP) {
5490         /* cannot support FIXEDMAP */
5491         return EINVAL;
5492     }
5493 #endif
5494     flags |= env->me_flags;
5495 
5496     rc = edb_fname_init(path, flags, &fname);
5497     if (rc)
5498         return rc;
5499 
5500 #ifdef EDB_VL32
5501 #ifdef _WIN32
5502     env->me_rpmutex = CreateMutex(NULL, FALSE, NULL);
5503     if (!env->me_rpmutex) {
5504         rc = ErrCode();
5505         goto leave;
5506     }
5507 #else
5508     rc = pthread_mutex_init(&env->me_rpmutex, NULL);
5509     if (rc)
5510     {
5511         NDRX_LOG(log_error, "%s: pthread_mutex_init failed: %d",
5512             __func__, rc);
5513         goto leave;
5514     }
5515 #endif
5516 #endif
5517     flags |= EDB_ENV_ACTIVE;    /* tell edb_env_close0() to clean up */
5518 
5519     if (flags & EDB_RDONLY) {
5520         /* silently ignore WRITEMAP when we're only getting read access */
5521         flags &= ~EDB_WRITEMAP;
5522     } else {
5523         if (!((env->me_free_pgs = edb_eidl_alloc(EDB_IDL_UM_MAX)) &&
5524               (env->me_dirty_list = calloc(EDB_IDL_UM_SIZE, sizeof(EDB_ID2)))))
5525         {
5526             NDRX_LOG(log_error, "edb_eidl_alloc failed");
5527             rc = ENOMEM;
5528         }
5529     }
5530 
5531     env->me_flags = flags;
5532     if (rc)
5533         goto leave;
5534 
5535 #ifdef EDB_VL32
5536     {
5537         env->me_rpages = malloc(EDB_ERPAGE_SIZE * sizeof(EDB_ID3));
5538         if (!env->me_rpages) {
5539             NDRX_LOG(log_error, "malloc failed: %ld",
5540                     (long)(EDB_ERPAGE_SIZE * sizeof(EDB_ID3)));
5541             rc = ENOMEM;
5542             goto leave;
5543         }
5544         env->me_rpages[0].mid = 0;
5545         env->me_rpcheck = EDB_ERPAGE_SIZE/2;
5546     }
5547 #endif
5548 
5549     env->me_path = strdup(path);
5550     env->me_dbxs = calloc(env->me_maxdbs, sizeof(EDB_dbx));
5551     env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
5552     env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int));
5553     if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) {
5554         NDRX_LOG(log_error, "calloc failed: %p %p %p %p",
5555             env->me_path, env->me_dbxs, env->me_dbflags, env->me_dbiseqs);
5556         rc = ENOMEM;
5557         goto leave;
5558     }
5559     env->me_dbxs[FREE_DBI].md_cmp = edb_cmp_long; /* aligned EDB_INTEGERKEY */
5560 
5561     /* For RDONLY, get lockfile after we know datafile exists */
5562     if (!(flags & (EDB_RDONLY|EDB_NOLOCK))) {
5563         rc = edb_env_setup_locks(env, &fname, mode, &excl);
5564         if (rc) {
5565             NDRX_LOG(log_error, "%s: edb_env_setup_locks failed: %d",
5566                 __func__, rc);
5567             goto leave;
5568         }
5569 
5570         if ((flags & EDB_PREVSNAPSHOT) && !excl) {
5571             rc = EAGAIN;
5572             goto leave;
5573         }
5574     }
5575 
5576     rc = edb_fopen(env, &fname,
5577         (flags & EDB_RDONLY) ? EDB_O_RDONLY : EDB_O_RDWR,
5578         mode, &env->me_fd);
5579     if (rc)
5580         goto leave;
5581 
5582     if ((flags & (EDB_RDONLY|EDB_NOLOCK)) == EDB_RDONLY) {
5583         rc = edb_env_setup_locks(env, &fname, mode, &excl);
5584         if (rc) {
5585             NDRX_LOG(log_error, "%s: edb_env_setup_locks (2) failed: %d",
5586                 __func__, rc);
5587             goto leave;
5588         }
5589     }
5590 
5591     if ((rc = edb_env_open2(env, flags & EDB_PREVSNAPSHOT)) == EDB_SUCCESS) {
5592         if (!(flags & (EDB_RDONLY|EDB_WRITEMAP))) {
5593             /* Synchronous fd for meta writes. Needed even with
5594              * EDB_NOSYNC/EDB_NOMETASYNC, in case these get reset.
5595              */
5596             rc = edb_fopen(env, &fname, EDB_O_META, mode, &env->me_mfd);
5597             if (rc) {
5598                 NDRX_LOG(log_error, "%s: edb_fopen failed: %d",
5599                     __func__, rc);
5600                 goto leave;
5601             }
5602         }
5603         DPRINTF(("opened dbenv %p", (void *) env));
5604         if (excl > 0 && !(flags & EDB_PREVSNAPSHOT)) {
5605             rc = edb_env_share_locks(env, &excl);
5606             if (rc)
5607                 goto leave;
5608         }
5609         if (!(flags & EDB_RDONLY)) {
5610             EDB_txn *txn;
5611             int tsize = sizeof(EDB_txn), size = tsize + env->me_maxdbs *
5612                 (sizeof(EDB_db)+sizeof(EDB_cursor *)+sizeof(unsigned int)+1);
5613             if ((env->me_pbuf = calloc(1, env->me_psize)) &&
5614                 (txn = calloc(1, size)))
5615             {
5616                 txn->mt_dbs = (EDB_db *)((char *)txn + tsize);
5617                 txn->mt_cursors = (EDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
5618                 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
5619                 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
5620                 txn->mt_env = env;
5621 #ifdef EDB_VL32
5622                 txn->mt_rpages = malloc(EDB_TRPAGE_SIZE * sizeof(EDB_ID3));
5623                 if (!txn->mt_rpages) {
5624                     NDRX_LOG(log_error, "malloc failed: %s", strerror(errno));
5625                     free(txn);
5626                     rc = ENOMEM;
5627                     goto leave;
5628                 }
5629                 txn->mt_rpages[0].mid = 0;
5630                 txn->mt_rpcheck = EDB_TRPAGE_SIZE/2;
5631 #endif
5632                 txn->mt_dbxs = env->me_dbxs;
5633                 txn->mt_flags = EDB_TXN_FINISHED;
5634                 env->me_txn0 = txn;
5635             } else {
5636                 NDRX_LOG(log_error, "malloc failed: %s", strerror(errno));
5637                 rc = ENOMEM;
5638             }
5639         }
5640     }
5641 
5642 leave:
5643     if (rc) {
5644         edb_env_close0(env, excl);
5645     }
5646     edb_fname_destroy(fname);
5647     return rc;
5648 }
5649 
5650 /** Destroy resources from edb_env_open(), clear our readers & DBIs */
5651 static void ESECT
5652 edb_env_close0(EDB_env *env, int excl)
5653 {
5654     int i;
5655 
5656     if (!(env->me_flags & EDB_ENV_ACTIVE))
5657         return;
5658 
5659     /* Doing this here since me_dbxs may not exist during edb_env_close */
5660     if (env->me_dbxs) {
5661         for (i = env->me_maxdbs; --i >= CORE_DBS; )
5662             free(env->me_dbxs[i].md_name.mv_data);
5663         free(env->me_dbxs);
5664     }
5665 
5666     free(env->me_pbuf);
5667     free(env->me_dbiseqs);
5668     free(env->me_dbflags);
5669     free(env->me_path);
5670     free(env->me_dirty_list);
5671 #ifdef EDB_VL32
5672     if (env->me_txn0 && env->me_txn0->mt_rpages)
5673         free(env->me_txn0->mt_rpages);
5674     if (env->me_rpages) {
5675         EDB_ID3L el = env->me_rpages;
5676         unsigned int x;
5677         for (x=1; x<=el[0].mid; x++)
5678             munmap(el[x].mptr, el[x].mcnt * env->me_psize);
5679         free(el);
5680     }
5681 #endif
5682     free(env->me_txn0);
5683     edb_eidl_free(env->me_free_pgs);
5684 
5685     if (env->me_flags & EDB_ENV_TXKEY) {
5686         pthread_key_delete(env->me_txkey);
5687 #ifdef _WIN32
5688         /* Delete our key from the global list */
5689         for (i=0; i<edb_tls_nkeys; i++)
5690             if (edb_tls_keys[i] == env->me_txkey) {
5691                 edb_tls_keys[i] = edb_tls_keys[edb_tls_nkeys-1];
5692                 edb_tls_nkeys--;
5693                 break;
5694             }
5695 #endif
5696     }
5697 
5698     if (env->me_map) {
5699 #ifdef EDB_VL32
5700         munmap(env->me_map, NUM_METAS*env->me_psize);
5701 #else
5702         munmap(env->me_map, env->me_mapsize);
5703 #endif
5704     }
5705     if (env->me_mfd != INVALID_HANDLE_VALUE)
5706         (void) close(env->me_mfd);
5707     if (env->me_fd != INVALID_HANDLE_VALUE)
5708         (void) close(env->me_fd);
5709     if (env->me_txns) {
5710         EDB_PID_T pid = getpid();
5711         /* Clearing readers is done in this function because
5712          * me_txkey with its destructor must be disabled first.
5713          *
5714          * We skip the the reader mutex, so we touch only
5715          * data owned by this process (me_close_readers and
5716          * our readers), and clear each reader atomically.
5717          */
5718         for (i = env->me_close_readers; --i >= 0; )
5719             if (env->me_txns->mti_readers[i].mr_pid == pid)
5720                 env->me_txns->mti_readers[i].mr_pid = 0;
5721 #ifdef _WIN32
5722         if (env->me_rmutex) {
5723             CloseHandle(env->me_rmutex);
5724             if (env->me_wmutex) CloseHandle(env->me_wmutex);
5725         }
5726         /* Windows automatically destroys the mutexes when
5727          * the last handle closes.
5728          */
5729 #elif defined(EDB_USE_POSIX_SEM)
5730         if (env->me_rmutex != SEM_FAILED) {
5731             sem_close(env->me_rmutex);
5732             if (env->me_wmutex != SEM_FAILED)
5733                 sem_close(env->me_wmutex);
5734             /* If we have the filelock:  If we are the
5735              * only remaining user, clean up semaphores.
5736              */
5737             if (excl == 0)
5738                 edb_env_excl_lock(env, &excl);
5739             if (excl > 0) {
5740                 sem_unlink(MUTEXNAME(env, 'r'));
5741                 sem_unlink(MUTEXNAME(env, 'w'));
5742             }
5743         }
5744 #elif defined(EDB_USE_SYSV_SEM)
5745         if (env->me_rmutex->semid != -1) {
5746             /* If we have the filelock:  If we are the
5747              * only remaining user, clean up semaphores.
5748              */
5749             if (excl == 0)
5750                 edb_env_excl_lock(env, &excl);
5751             if (excl > 0)
5752                 semctl(env->me_rmutex->semid, 0, IPC_RMID);
5753         }
5754 #endif
5755         munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(EDB_reader)+sizeof(EDB_txninfo));
5756     }
5757     if (env->me_lfd != INVALID_HANDLE_VALUE) {
5758 #ifdef _WIN32
5759         if (excl >= 0) {
5760             /* Unlock the lockfile.  Windows would have unlocked it
5761              * after closing anyway, but not necessarily at once.
5762              */
5763             UnlockFile(env->me_lfd, 0, 0, 1, 0);
5764         }
5765 #endif
5766         (void) close(env->me_lfd);
5767     }
5768 #ifdef EDB_VL32
5769 #ifdef _WIN32
5770     if (env->me_fmh) CloseHandle(env->me_fmh);
5771     if (env->me_rpmutex) CloseHandle(env->me_rpmutex);
5772 #else
5773     pthread_mutex_destroy(&env->me_rpmutex);
5774 #endif
5775 #endif
5776 
5777     env->me_flags &= ~(EDB_ENV_ACTIVE|EDB_ENV_TXKEY);
5778 }
5779 
5780 void ESECT
5781 edb_env_close(EDB_env *env)
5782 {
5783     EDB_page *dp;
5784 
5785     if (env == NULL)
5786         return;
5787 
5788     VGMEMP_DESTROY(env);
5789     while ((dp = env->me_dpages) != NULL) {
5790         VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
5791         env->me_dpages = dp->mp_next;
5792         free(dp);
5793     }
5794 
5795     edb_env_close0(env, 0);
5796     free(env);
5797 }
5798 
5799 /** Compare two items pointing at aligned #edb_size_t's */
5800 static int
5801 edb_cmp_long(const EDB_val *a, const EDB_val *b)
5802 {
5803     return (*(edb_size_t *)a->mv_data < *(edb_size_t *)b->mv_data) ? -1 :
5804         *(edb_size_t *)a->mv_data > *(edb_size_t *)b->mv_data;
5805 }
5806 
5807 /** Compare two items pointing at aligned unsigned int's.
5808  *
5809  *  This is also set as #EDB_INTEGERDUP|#EDB_DUPFIXED's #EDB_dbx.%md_dcmp,
5810  *  but #edb_cmp_clong() is called instead if the data type is #edb_size_t.
5811  */
5812 static int
5813 edb_cmp_int(const EDB_val *a, const EDB_val *b)
5814 {
5815     return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
5816         *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
5817 }
5818 
5819 /** Compare two items pointing at unsigned ints of unknown alignment.
5820  *  Nodes and keys are guaranteed to be 2-byte aligned.
5821  */
5822 static int
5823 edb_cmp_cint(const EDB_val *a, const EDB_val *b)
5824 {
5825 #if BYTE_ORDER == LITTLE_ENDIAN
5826     unsigned short *u, *c;
5827     int x;
5828 
5829     u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
5830     c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
5831     do {
5832         x = *--u - *--c;
5833     } while(!x && u > (unsigned short *)a->mv_data);
5834     return x;
5835 #else
5836     unsigned short *u, *c, *end;
5837     int x;
5838 
5839     end = (unsigned short *) ((char *) a->mv_data + a->mv_size);
5840     u = (unsigned short *)a->mv_data;
5841     c = (unsigned short *)b->mv_data;
5842     do {
5843         x = *u++ - *c++;
5844     } while(!x && u < end);
5845     return x;
5846 #endif
5847 }
5848 
5849 /** Compare two items lexically */
5850 static int
5851 edb_cmp_memn(const EDB_val *a, const EDB_val *b)
5852 {
5853     int diff;
5854     ssize_t len_diff;
5855     unsigned int len;
5856 
5857     len = a->mv_size;
5858     len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
5859     if (len_diff > 0) {
5860         len = b->mv_size;
5861         len_diff = 1;
5862     }
5863 
5864     diff = memcmp(a->mv_data, b->mv_data, len);
5865     return diff ? diff : len_diff<0 ? -1 : len_diff;
5866 }
5867 
5868 /** Compare two items in reverse byte order */
5869 static int
5870 edb_cmp_memnr(const EDB_val *a, const EDB_val *b)
5871 {
5872     const unsigned char *p1, *p2, *p1_lim;
5873     ssize_t len_diff;
5874     int diff;
5875 
5876     p1_lim = (const unsigned char *)a->mv_data;
5877     p1 = (const unsigned char *)a->mv_data + a->mv_size;
5878     p2 = (const unsigned char *)b->mv_data + b->mv_size;
5879 
5880     len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
5881     if (len_diff > 0) {
5882         p1_lim += len_diff;
5883         len_diff = 1;
5884     }
5885 
5886     while (p1 > p1_lim) {
5887         diff = *--p1 - *--p2;
5888         if (diff)
5889             return diff;
5890     }
5891     return len_diff<0 ? -1 : len_diff;
5892 }
5893 
5894 /** Search for key within a page, using binary search.
5895  * Returns the smallest entry larger or equal to the key.
5896  * If exactp is non-null, stores whether the found entry was an exact match
5897  * in *exactp (1 or 0).
5898  * Updates the cursor index with the index of the found entry.
5899  * If no entry larger or equal to the key is found, returns NULL.
5900  */
5901 static EDB_node *
5902 edb_node_search(EDB_cursor *mc, EDB_val *key, int *exactp)
5903 {
5904     unsigned int     i = 0, nkeys;
5905     int      low, high;
5906     int      rc = 0;
5907     EDB_page *mp = mc->mc_pg[mc->mc_top];
5908     EDB_node    *node = NULL;
5909     EDB_val  nodekey;
5910     EDB_cmp_func *cmp;
5911     DKBUF;
5912 
5913     nkeys = NUMKEYS(mp);
5914 
5915     DPRINTF(("searching %u keys in %s %spage %"Yu,
5916         nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
5917         edb_dbg_pgno(mp)));
5918 
5919     low = IS_LEAF(mp) ? 0 : 1;
5920     high = nkeys - 1;
5921     cmp = mc->mc_dbx->md_cmp;
5922 
5923     /* Branch pages have no data, so if using integer keys,
5924      * alignment is guaranteed. Use faster edb_cmp_int.
5925      */
5926     if (cmp == edb_cmp_cint && IS_BRANCH(mp)) {
5927         if (NODEPTR(mp, 1)->mn_ksize == sizeof(edb_size_t))
5928             cmp = edb_cmp_long;
5929         else
5930             cmp = edb_cmp_int;
5931     }
5932 
5933     if (IS_LEAF2(mp)) {
5934         nodekey.mv_size = mc->mc_db->md_pad;
5935         node = NODEPTR(mp, 0);  /* fake */
5936         while (low <= high) {
5937             i = (low + high) >> 1;
5938             nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
5939             rc = cmp(key, &nodekey);
5940             DPRINTF(("found leaf index %u [%s], rc = %i",
5941                 i, DKEY(&nodekey), rc));
5942             if (rc == 0)
5943                 break;
5944             if (rc > 0)
5945                 low = i + 1;
5946             else
5947                 high = i - 1;
5948         }
5949     } else {
5950         while (low <= high) {
5951             i = (low + high) >> 1;
5952 
5953             node = NODEPTR(mp, i);
5954             nodekey.mv_size = NODEKSZ(node);
5955             nodekey.mv_data = NODEKEY(node);
5956 
5957             rc = cmp(key, &nodekey);
5958 #if EDB_DEBUG
5959             if (IS_LEAF(mp))
5960                 DPRINTF(("found leaf index %u [%s], rc = %i",
5961                     i, DKEY(&nodekey), rc));
5962             else
5963                 DPRINTF(("found branch index %u [%s -> %"Yu"], rc = %i",
5964                     i, DKEY(&nodekey), NODEPGNO(node), rc));
5965 #endif
5966             if (rc == 0)
5967                 break;
5968             if (rc > 0)
5969                 low = i + 1;
5970             else
5971                 high = i - 1;
5972         }
5973     }
5974 
5975     if (rc > 0) {   /* Found entry is less than the key. */
5976         i++;    /* Skip to get the smallest entry larger than key. */
5977         if (!IS_LEAF2(mp))
5978             node = NODEPTR(mp, i);
5979     }
5980     if (exactp)
5981         *exactp = (rc == 0 && nkeys > 0);
5982     /* store the key index */
5983     mc->mc_ki[mc->mc_top] = i;
5984     if (i >= nkeys)
5985         /* There is no entry larger or equal to the key. */
5986         return NULL;
5987 
5988     /* nodeptr is fake for LEAF2 */
5989     return node;
5990 }
5991 
5992 #if 0
5993 static void
5994 edb_cursor_adjust(EDB_cursor *mc, func)
5995 {
5996     EDB_cursor *m2;
5997 
5998     for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
5999         if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
6000             func(mc, m2);
6001         }
6002     }
6003 }
6004 #endif
6005 
6006 /** Pop a page off the top of the cursor's stack. */
6007 static void
6008 edb_cursor_pop(EDB_cursor *mc)
6009 {
6010     if (mc->mc_snum) {
6011         DPRINTF(("popping page %"Yu" off db %d cursor %p",
6012             mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc));
6013 
6014         mc->mc_snum--;
6015         if (mc->mc_snum) {
6016             mc->mc_top--;
6017         } else {
6018             mc->mc_flags &= ~C_INITIALIZED;
6019         }
6020     }
6021 }
6022 
6023 /** Push a page onto the top of the cursor's stack.
6024  * Set #EDB_TXN_ERROR on failure.
6025  */
6026 static int
6027 edb_cursor_push(EDB_cursor *mc, EDB_page *mp)
6028 {
6029     DPRINTF(("pushing page %"Yu" on db %d cursor %p", mp->mp_pgno,
6030         DDBI(mc), (void *) mc));
6031 
6032     if (mc->mc_snum >= CURSOR_STACK) {
6033         mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
6034         return EDB_CURSOR_FULL;
6035     }
6036 
6037     mc->mc_top = mc->mc_snum++;
6038     mc->mc_pg[mc->mc_top] = mp;
6039     mc->mc_ki[mc->mc_top] = 0;
6040 
6041     return EDB_SUCCESS;
6042 }
6043 
6044 #ifdef EDB_VL32
6045 /** Map a read-only page.
6046  * There are two levels of tracking in use, a per-txn list and a per-env list.
6047  * ref'ing and unref'ing the per-txn list is faster since it requires no
6048  * locking. Pages are cached in the per-env list for global reuse, and a lock
6049  * is required. Pages are not immediately unmapped when their refcnt goes to
6050  * zero; they hang around in case they will be reused again soon.
6051  *
6052  * When the per-txn list gets full, all pages with refcnt=0 are purged from the
6053  * list and their refcnts in the per-env list are decremented.
6054  *
6055  * When the per-env list gets full, all pages with refcnt=0 are purged from the
6056  * list and their pages are unmapped.
6057  *
6058  * @note "full" means the list has reached its respective rpcheck threshold.
6059  * This threshold slowly raises if no pages could be purged on a given check,
6060  * and returns to its original value when enough pages were purged.
6061  *
6062  * If purging doesn't free any slots, filling the per-txn list will return
6063  * EDB_TXN_FULL, and filling the per-env list returns EDB_MAP_FULL.
6064  *
6065  * Reference tracking in a txn is imperfect, pages can linger with non-zero
6066  * refcnt even without active references. It was deemed to be too invasive
6067  * to add unrefs in every required location. However, all pages are unref'd
6068  * at the end of the transaction. This guarantees that no stale references
6069  * linger in the per-env list.
6070  *
6071  * Usually we map chunks of 16 pages at a time, but if an overflow page begins
6072  * at the tail of the chunk we extend the chunk to include the entire overflow
6073  * page. Unfortunately, pages can be turned into overflow pages after their
6074  * chunk was already mapped. In that case we must remap the chunk if the
6075  * overflow page is referenced. If the chunk's refcnt is 0 we can just remap
6076  * it, otherwise we temporarily map a new chunk just for the overflow page.
6077  *
6078  * @note this chunk handling means we cannot guarantee that a data item
6079  * returned from the DB will stay alive for the duration of the transaction:
6080  *   We unref pages as soon as a cursor moves away from the page
6081  *   A subsequent op may cause a purge, which may unmap any unref'd chunks
6082  * The caller must copy the data if it must be used later in the same txn.
6083  *
6084  * Also - our reference counting revolves around cursors, but overflow pages
6085  * aren't pointed to by a cursor's page stack. We have to remember them
6086  * explicitly, in the added mc_ovpg field. A single cursor can only hold a
6087  * reference to one overflow page at a time.
6088  *
6089  * @param[in] txn the transaction for this access.
6090  * @param[in] pgno the page number for the page to retrieve.
6091  * @param[out] ret address of a pointer where the page's address will be stored.
6092  * @return 0 on success, non-zero on failure.
6093  */
6094 static int
6095 edb_rpage_get(EDB_txn *txn, pgno_t pg0, EDB_page **ret)
6096 {
6097     EDB_env *env = txn->mt_env;
6098     EDB_page *p;
6099     EDB_ID3L tl = txn->mt_rpages;
6100     EDB_ID3L el = env->me_rpages;
6101     EDB_ID3 id3;
6102     unsigned x, rem;
6103     pgno_t pgno;
6104     int rc, retries = 1;
6105 #ifdef _WIN32
6106     LARGE_INTEGER off;
6107     SIZE_T len;
6108 #define SET_OFF(off,val)    off.QuadPart = val
6109 #define MAP(rc,env,addr,len,off)    \
6110     addr = NULL; \
6111     rc = NtMapViewOfSection(env->me_fmh, GetCurrentProcess(), &addr, 0, \
6112         len, &off, &len, ViewUnmap, (env->me_flags & EDB_RDONLY) ? 0 : MEM_RESERVE, PAGE_READONLY); \
6113     if (rc) rc = edb_nt2win32(rc)
6114 #else
6115     off_t off;
6116     size_t len;
6117 #define SET_OFF(off,val)    off = val
6118 #define MAP(rc,env,addr,len,off)    \
6119     addr = mmap(NULL, len, PROT_READ, MAP_SHARED, env->me_fd, off); \
6120     rc = (addr == MAP_FAILED) ? errno : 0
6121 #endif
6122 
6123     /* remember the offset of the actual page number, so we can
6124      * return the correct pointer at the end.
6125      */
6126     rem = pg0 & (EDB_RPAGE_CHUNK-1);
6127     pgno = pg0 ^ rem;
6128 
6129     id3.mid = 0;
6130     x = edb_mid3l_search(tl, pgno);
6131     if (x <= tl[0].mid && tl[x].mid == pgno) {
6132         if (x != tl[0].mid && tl[x+1].mid == pg0)
6133             x++;
6134         /* check for overflow size */
6135         p = (EDB_page *)((char *)tl[x].mptr + rem * env->me_psize);
6136         if (IS_OVERFLOW(p) && p->mp_pages + rem > tl[x].mcnt) {
6137             id3.mcnt = p->mp_pages + rem;
6138             len = id3.mcnt * env->me_psize;
6139             SET_OFF(off, pgno * env->me_psize);
6140             MAP(rc, env, id3.mptr, len, off);
6141             if (rc)
6142                 return rc;
6143             /* check for local-only page */
6144             if (rem) {
6145                 edb_tassert(txn, tl[x].mid != pg0);
6146                 /* hope there's room to insert this locally.
6147                  * setting mid here tells later code to just insert
6148                  * this id3 instead of searching for a match.
6149                  */
6150                 id3.mid = pg0;
6151                 goto notlocal;
6152             } else {
6153                 /* ignore the mapping we got from env, use new one */
6154                 tl[x].mptr = id3.mptr;
6155                 tl[x].mcnt = id3.mcnt;
6156                 /* if no active ref, see if we can replace in env */
6157                 if (!tl[x].mref) {
6158                     unsigned i;
6159                     pthread_mutex_lock(&env->me_rpmutex);
6160                     i = edb_mid3l_search(el, tl[x].mid);
6161                     if (el[i].mref == 1) {
6162                         /* just us, replace it */
6163                         munmap(el[i].mptr, el[i].mcnt * env->me_psize);
6164                         el[i].mptr = tl[x].mptr;
6165                         el[i].mcnt = tl[x].mcnt;
6166                     } else {
6167                         /* there are others, remove ourself */
6168                         el[i].mref--;
6169                     }
6170                     pthread_mutex_unlock(&env->me_rpmutex);
6171                 }
6172             }
6173         }
6174         id3.mptr = tl[x].mptr;
6175         id3.mcnt = tl[x].mcnt;
6176         tl[x].mref++;
6177         goto ok;
6178     }
6179 
6180 notlocal:
6181     if (tl[0].mid >= EDB_TRPAGE_MAX - txn->mt_rpcheck) {
6182         unsigned i, y;
6183         /* purge unref'd pages from our list and unref in env */
6184         pthread_mutex_lock(&env->me_rpmutex);
6185 retry:
6186         y = 0;
6187         for (i=1; i<=tl[0].mid; i++) {
6188             if (!tl[i].mref) {
6189                 if (!y) y = i;
6190                 /* tmp overflow pages don't go to env */
6191                 if (tl[i].mid & (EDB_RPAGE_CHUNK-1)) {
6192                     munmap(tl[i].mptr, tl[i].mcnt * env->me_psize);
6193                     continue;
6194                 }
6195                 x = edb_mid3l_search(el, tl[i].mid);
6196                 el[x].mref--;
6197             }
6198         }
6199         pthread_mutex_unlock(&env->me_rpmutex);
6200         if (!y) {
6201             /* we didn't find any unref'd chunks.
6202              * if we're out of room, fail.
6203              */
6204             if (tl[0].mid >= EDB_TRPAGE_MAX)
6205                 return EDB_TXN_FULL;
6206             /* otherwise, raise threshold for next time around
6207              * and let this go.
6208              */
6209             txn->mt_rpcheck /= 2;
6210         } else {
6211             /* we found some unused; consolidate the list */
6212             for (i=y+1; i<= tl[0].mid; i++)
6213                 if (tl[i].mref)
6214                     tl[y++] = tl[i];
6215             tl[0].mid = y-1;
6216             /* decrease the check threshold toward its original value */
6217             if (!txn->mt_rpcheck)
6218                 txn->mt_rpcheck = 1;
6219             while (txn->mt_rpcheck < tl[0].mid && txn->mt_rpcheck < EDB_TRPAGE_SIZE/2)
6220                 txn->mt_rpcheck *= 2;
6221         }
6222     }
6223     if (tl[0].mid < EDB_TRPAGE_SIZE) {
6224         id3.mref = 1;
6225         if (id3.mid)
6226             goto found;
6227         /* don't map past last written page in read-only envs */
6228         if ((env->me_flags & EDB_RDONLY) && pgno + EDB_RPAGE_CHUNK-1 > txn->mt_last_pgno)
6229             id3.mcnt = txn->mt_last_pgno + 1 - pgno;
6230         else
6231             id3.mcnt = EDB_RPAGE_CHUNK;
6232         len = id3.mcnt * env->me_psize;
6233         id3.mid = pgno;
6234 
6235         /* search for page in env */
6236         pthread_mutex_lock(&env->me_rpmutex);
6237         x = edb_mid3l_search(el, pgno);
6238         if (x <= el[0].mid && el[x].mid == pgno) {
6239             id3.mptr = el[x].mptr;
6240             id3.mcnt = el[x].mcnt;
6241             /* check for overflow size */
6242             p = (EDB_page *)((char *)id3.mptr + rem * env->me_psize);
6243             if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) {
6244                 id3.mcnt = p->mp_pages + rem;
6245                 len = id3.mcnt * env->me_psize;
6246                 SET_OFF(off, pgno * env->me_psize);
6247                 MAP(rc, env, id3.mptr, len, off);
6248                 if (rc)
6249                     goto fail;
6250                 if (!el[x].mref) {
6251                     munmap(el[x].mptr, env->me_psize * el[x].mcnt);
6252                     el[x].mptr = id3.mptr;
6253                     el[x].mcnt = id3.mcnt;
6254                 } else {
6255                     id3.mid = pg0;
6256                     pthread_mutex_unlock(&env->me_rpmutex);
6257                     goto found;
6258                 }
6259             }
6260             el[x].mref++;
6261             pthread_mutex_unlock(&env->me_rpmutex);
6262             goto found;
6263         }
6264         if (el[0].mid >= EDB_ERPAGE_MAX - env->me_rpcheck) {
6265             /* purge unref'd pages */
6266             unsigned i, y = 0;
6267             for (i=1; i<=el[0].mid; i++) {
6268                 if (!el[i].mref) {
6269                     if (!y) y = i;
6270                     munmap(el[i].mptr, env->me_psize * el[i].mcnt);
6271                 }
6272             }
6273             if (!y) {
6274                 if (retries) {
6275                     /* see if we can unref some local pages */
6276                     retries--;
6277                     id3.mid = 0;
6278                     goto retry;
6279                 }
6280                 if (el[0].mid >= EDB_ERPAGE_MAX) {
6281                     pthread_mutex_unlock(&env->me_rpmutex);
6282                     return EDB_MAP_FULL;
6283                 }
6284                 env->me_rpcheck /= 2;
6285             } else {
6286                 for (i=y+1; i<= el[0].mid; i++)
6287                     if (el[i].mref)
6288                         el[y++] = el[i];
6289                 el[0].mid = y-1;
6290                 if (!env->me_rpcheck)
6291                     env->me_rpcheck = 1;
6292                 while (env->me_rpcheck < el[0].mid && env->me_rpcheck < EDB_ERPAGE_SIZE/2)
6293                     env->me_rpcheck *= 2;
6294             }
6295         }
6296         SET_OFF(off, pgno * env->me_psize);
6297         MAP(rc, env, id3.mptr, len, off);
6298         if (rc) {
6299 fail:
6300             pthread_mutex_unlock(&env->me_rpmutex);
6301             return rc;
6302         }
6303         /* check for overflow size */
6304         p = (EDB_page *)((char *)id3.mptr + rem * env->me_psize);
6305         if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) {
6306             id3.mcnt = p->mp_pages + rem;
6307             munmap(id3.mptr, len);
6308             len = id3.mcnt * env->me_psize;
6309             MAP(rc, env, id3.mptr, len, off);
6310             if (rc)
6311                 goto fail;
6312         }
6313         edb_mid3l_insert(el, &id3);
6314         pthread_mutex_unlock(&env->me_rpmutex);
6315 found:
6316         edb_mid3l_insert(tl, &id3);
6317     } else {
6318         return EDB_TXN_FULL;
6319     }
6320 ok:
6321     p = (EDB_page *)((char *)id3.mptr + rem * env->me_psize);
6322 #if EDB_DEBUG   /* we don't need this check any more */
6323     if (IS_OVERFLOW(p)) {
6324         edb_tassert(txn, p->mp_pages + rem <= id3.mcnt);
6325     }
6326 #endif
6327     *ret = p;
6328     return EDB_SUCCESS;
6329 }
6330 #endif
6331 
6332 /** Find the address of the page corresponding to a given page number.
6333  * Set #EDB_TXN_ERROR on failure.
6334  * @param[in] mc the cursor accessing the page.
6335  * @param[in] pgno the page number for the page to retrieve.
6336  * @param[out] ret address of a pointer where the page's address will be stored.
6337  * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page.
6338  * @return 0 on success, non-zero on failure.
6339  */
6340 static int
6341 edb_page_get(EDB_cursor *mc, pgno_t pgno, EDB_page **ret, int *lvl)
6342 {
6343     EDB_txn *txn = mc->mc_txn;
6344     EDB_page *p = NULL;
6345     int level;
6346 
6347     if (! (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP))) {
6348         EDB_txn *tx2 = txn;
6349         level = 1;
6350         do {
6351             EDB_ID2L dl = tx2->mt_u.dirty_list;
6352             unsigned x;
6353             /* Spilled pages were dirtied in this txn and flushed
6354              * because the dirty list got full. Bring this page
6355              * back in from the map (but don't unspill it here,
6356              * leave that unless page_touch happens again).
6357              */
6358             if (tx2->mt_spill_pgs) {
6359                 EDB_ID pn = pgno << 1;
6360                 x = edb_eidl_search(tx2->mt_spill_pgs, pn);
6361                 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
6362                     goto mapped;
6363                 }
6364             }
6365             if (dl[0].mid) {
6366                 unsigned x = edb_mid2l_search(dl, pgno);
6367                 if (x <= dl[0].mid && dl[x].mid == pgno) {
6368                     p = dl[x].mptr;
6369                     goto done;
6370                 }
6371             }
6372             level++;
6373         } while ((tx2 = tx2->mt_parent) != NULL);
6374     }
6375 
6376     if (pgno >= txn->mt_next_pgno) {
6377         DPRINTF(("page %"Yu" not found", pgno));
6378         txn->mt_flags |= EDB_TXN_ERROR;
6379         return EDB_PAGE_NOTFOUND;
6380     }
6381 
6382     level = 0;
6383 
6384 mapped:
6385     {
6386 #ifdef EDB_VL32
6387         int rc = edb_rpage_get(txn, pgno, &p);
6388         if (rc) {
6389             txn->mt_flags |= EDB_TXN_ERROR;
6390             return rc;
6391         }
6392 #else
6393         EDB_env *env = txn->mt_env;
6394         p = (EDB_page *)(env->me_map + env->me_psize * pgno);
6395 #endif
6396     }
6397 
6398 done:
6399     *ret = p;
6400     if (lvl)
6401         *lvl = level;
6402     return EDB_SUCCESS;
6403 }
6404 
6405 /** Finish #edb_page_search() / #edb_page_search_lowest().
6406  *  The cursor is at the root page, set up the rest of it.
6407  */
6408 static int
6409 edb_page_search_root(EDB_cursor *mc, EDB_val *key, int flags)
6410 {
6411     EDB_page    *mp = mc->mc_pg[mc->mc_top];
6412     int rc;
6413     DKBUF;
6414 
6415     while (IS_BRANCH(mp)) {
6416         EDB_node    *node;
6417         indx_t      i;
6418 
6419         DPRINTF(("branch page %"Yu" has %u keys", mp->mp_pgno, NUMKEYS(mp)));
6420         /* Don't assert on branch pages in the FreeDB. We can get here
6421          * while in the process of rebalancing a FreeDB branch page; we must
6422          * let that proceed. ITS#8336
6423          */
6424         edb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1);
6425         DPRINTF(("found index 0 to page %"Yu, NODEPGNO(NODEPTR(mp, 0))));
6426 
6427         if (flags & (EDB_PS_FIRST|EDB_PS_LAST)) {
6428             i = 0;
6429             if (flags & EDB_PS_LAST) {
6430                 i = NUMKEYS(mp) - 1;
6431                 /* if already init'd, see if we're already in right place */
6432                 if (mc->mc_flags & C_INITIALIZED) {
6433                     if (mc->mc_ki[mc->mc_top] == i) {
6434                         mc->mc_top = mc->mc_snum++;
6435                         mp = mc->mc_pg[mc->mc_top];
6436                         goto ready;
6437                     }
6438                 }
6439             }
6440         } else {
6441             int  exact;
6442             node = edb_node_search(mc, key, &exact);
6443             if (node == NULL)
6444                 i = NUMKEYS(mp) - 1;
6445             else {
6446                 i = mc->mc_ki[mc->mc_top];
6447                 if (!exact) {
6448                     edb_cassert(mc, i > 0);
6449                     i--;
6450                 }
6451             }
6452             DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
6453         }
6454 
6455         edb_cassert(mc, i < NUMKEYS(mp));
6456         node = NODEPTR(mp, i);
6457 
6458         if ((rc = edb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
6459             return rc;
6460 
6461         mc->mc_ki[mc->mc_top] = i;
6462         if ((rc = edb_cursor_push(mc, mp)))
6463             return rc;
6464 
6465 ready:
6466         if (flags & EDB_PS_MODIFY) {
6467             if ((rc = edb_page_touch(mc)) != 0)
6468                 return rc;
6469             mp = mc->mc_pg[mc->mc_top];
6470         }
6471     }
6472 
6473     if (!IS_LEAF(mp)) {
6474         DPRINTF(("internal error, index points to a %02X page!?",
6475             mp->mp_flags));
6476         mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
6477         return EDB_CORRUPTED;
6478     }
6479 
6480     DPRINTF(("found leaf page %"Yu" for key [%s]", mp->mp_pgno,
6481         key ? DKEY(key) : "null"));
6482     mc->mc_flags |= C_INITIALIZED;
6483     mc->mc_flags &= ~C_EOF;
6484 
6485     return EDB_SUCCESS;
6486 }
6487 
6488 /** Search for the lowest key under the current branch page.
6489  * This just bypasses a NUMKEYS check in the current page
6490  * before calling edb_page_search_root(), because the callers
6491  * are all in situations where the current page is known to
6492  * be underfilled.
6493  */
6494 static int
6495 edb_page_search_lowest(EDB_cursor *mc)
6496 {
6497     EDB_page    *mp = mc->mc_pg[mc->mc_top];
6498     EDB_node    *node = NODEPTR(mp, 0);
6499     int rc;
6500 
6501     if ((rc = edb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
6502         return rc;
6503 
6504     mc->mc_ki[mc->mc_top] = 0;
6505     if ((rc = edb_cursor_push(mc, mp)))
6506         return rc;
6507     return edb_page_search_root(mc, NULL, EDB_PS_FIRST);
6508 }
6509 
6510 /** Search for the page a given key should be in.
6511  * Push it and its parent pages on the cursor stack.
6512  * @param[in,out] mc the cursor for this operation.
6513  * @param[in] key the key to search for, or NULL for first/last page.
6514  * @param[in] flags If EDB_PS_MODIFY is set, visited pages in the DB
6515  *   are touched (updated with new page numbers).
6516  *   If EDB_PS_FIRST or EDB_PS_LAST is set, find first or last leaf.
6517  *   This is used by #edb_cursor_first() and #edb_cursor_last().
6518  *   If EDB_PS_ROOTONLY set, just fetch root node, no further lookups.
6519  * @return 0 on success, non-zero on failure.
6520  */
6521 static int
6522 edb_page_search(EDB_cursor *mc, EDB_val *key, int flags)
6523 {
6524     int      rc;
6525     pgno_t       root;
6526 
6527     /* Make sure the txn is still viable, then find the root from
6528      * the txn's db table and set it as the root of the cursor's stack.
6529      */
6530     if (mc->mc_txn->mt_flags & EDB_TXN_BLOCKED) {
6531         DPUTS("transaction may not be used now");
6532         return EDB_BAD_TXN;
6533     } else {
6534         /* Make sure we're using an up-to-date root */
6535         if (*mc->mc_dbflag & DB_STALE) {
6536                 EDB_cursor mc2;
6537                 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
6538                     return EDB_BAD_DBI;
6539                 edb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
6540                 rc = edb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
6541                 if (rc)
6542                     return rc;
6543                 {
6544                     EDB_val data;
6545                     int exact = 0;
6546                     uint16_t flags;
6547                     EDB_node *leaf = edb_node_search(&mc2,
6548                         &mc->mc_dbx->md_name, &exact);
6549                     if (!exact)
6550                         return EDB_NOTFOUND;
6551                     if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
6552                         return EDB_INCOMPATIBLE; /* not a named DB */
6553                     rc = edb_node_read(&mc2, leaf, &data);
6554                     if (rc)
6555                         return rc;
6556                     memcpy(&flags, ((char *) data.mv_data + offsetof(EDB_db, md_flags)),
6557                         sizeof(uint16_t));
6558                     /* The txn may not know this DBI, or another process may
6559                      * have dropped and recreated the DB with other flags.
6560                      */
6561                     if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
6562                         return EDB_INCOMPATIBLE;
6563                     memcpy(mc->mc_db, data.mv_data, sizeof(EDB_db));
6564                 }
6565                 *mc->mc_dbflag &= ~DB_STALE;
6566         }
6567         root = mc->mc_db->md_root;
6568 
6569         if (root == P_INVALID) {        /* Tree is empty. */
6570             DPUTS("tree is empty");
6571             return EDB_NOTFOUND;
6572         }
6573     }
6574 
6575     edb_cassert(mc, root > 1);
6576     if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) {
6577 #ifdef EDB_VL32
6578         if (mc->mc_pg[0])
6579             EDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[0]);
6580 #endif
6581         if ((rc = edb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)
6582             return rc;
6583     }
6584 
6585 #ifdef EDB_VL32
6586     {
6587         int i;
6588         for (i=1; i<mc->mc_snum; i++)
6589             EDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[i]);
6590     }
6591 #endif
6592     mc->mc_snum = 1;
6593     mc->mc_top = 0;
6594 
6595     DPRINTF(("db %d root page %"Yu" has flags 0x%X",
6596         DDBI(mc), root, mc->mc_pg[0]->mp_flags));
6597 
6598     if (flags & EDB_PS_MODIFY) {
6599         if ((rc = edb_page_touch(mc)))
6600             return rc;
6601     }
6602 
6603     if (flags & EDB_PS_ROOTONLY)
6604         return EDB_SUCCESS;
6605 
6606     return edb_page_search_root(mc, key, flags);
6607 }
6608 
6609 static int
6610 edb_ovpage_free(EDB_cursor *mc, EDB_page *mp)
6611 {
6612     EDB_txn *txn = mc->mc_txn;
6613     pgno_t pg = mp->mp_pgno;
6614     unsigned x = 0, ovpages = mp->mp_pages;
6615     EDB_env *env = txn->mt_env;
6616     EDB_IDL sl = txn->mt_spill_pgs;
6617     EDB_ID pn = pg << 1;
6618     int rc;
6619 
6620     DPRINTF(("free ov page %"Yu" (%d)", pg, ovpages));
6621     /* If the page is dirty or on the spill list we just acquired it,
6622      * so we should give it back to our current free list, if any.
6623      * Otherwise put it onto the list of pages we freed in this txn.
6624      *
6625      * Won't create me_pghead: me_pglast must be inited along with it.
6626      * Unsupported in nested txns: They would need to hide the page
6627      * range in ancestor txns' dirty and spilled lists.
6628      */
6629     if (env->me_pghead &&
6630         !txn->mt_parent &&
6631         ((mp->mp_flags & P_DIRTY) ||
6632          (sl && (x = edb_eidl_search(sl, pn)) <= sl[0] && sl[x] == pn)))
6633     {
6634         unsigned i, j;
6635         pgno_t *mop;
6636         EDB_ID2 *dl, ix, iy;
6637         rc = edb_eidl_need(&env->me_pghead, ovpages);
6638         if (rc)
6639             return rc;
6640         if (!(mp->mp_flags & P_DIRTY)) {
6641             /* This page is no longer spilled */
6642             if (x == sl[0])
6643                 sl[0]--;
6644             else
6645                 sl[x] |= 1;
6646             goto release;
6647         }
6648         /* Remove from dirty list */
6649         dl = txn->mt_u.dirty_list;
6650         x = dl[0].mid--;
6651         for (ix = dl[x]; ix.mptr != mp; ix = iy) {
6652             if (x > 1) {
6653                 x--;
6654                 iy = dl[x];
6655                 dl[x] = ix;
6656             } else {
6657                 edb_cassert(mc, x > 1);
6658                 j = ++(dl[0].mid);
6659                 dl[j] = ix;     /* Unsorted. OK when EDB_TXN_ERROR. */
6660                 txn->mt_flags |= EDB_TXN_ERROR;
6661                 return EDB_PROBLEM;
6662             }
6663         }
6664         txn->mt_dirty_room++;
6665         if (!(env->me_flags & EDB_WRITEMAP))
6666             edb_dpage_free(env, mp);
6667 release:
6668         /* Insert in me_pghead */
6669         mop = env->me_pghead;
6670         j = mop[0] + ovpages;
6671         for (i = mop[0]; i && mop[i] < pg; i--)
6672             mop[j--] = mop[i];
6673         while (j>i)
6674             mop[j--] = pg++;
6675         mop[0] += ovpages;
6676     } else {
6677         rc = edb_eidl_append_range(&txn->mt_free_pgs, pg, ovpages);
6678         if (rc)
6679             return rc;
6680     }
6681 #ifdef EDB_VL32
6682     if (mc->mc_ovpg == mp)
6683         mc->mc_ovpg = NULL;
6684 #endif
6685     mc->mc_db->md_overflow_pages -= ovpages;
6686     return 0;
6687 }
6688 
6689 /** Return the data associated with a given node.
6690  * @param[in] mc The cursor for this operation.
6691  * @param[in] leaf The node being read.
6692  * @param[out] data Updated to point to the node's data.
6693  * @return 0 on success, non-zero on failure.
6694  */
6695 static int
6696 edb_node_read(EDB_cursor *mc, EDB_node *leaf, EDB_val *data)
6697 {
6698     EDB_page    *omp;       /* overflow page */
6699     pgno_t       pgno;
6700     int rc;
6701 
6702     if (MC_OVPG(mc)) {
6703         EDB_PAGE_UNREF(mc->mc_txn, MC_OVPG(mc));
6704         MC_SET_OVPG(mc, NULL);
6705     }
6706     if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
6707         data->mv_size = NODEDSZ(leaf);
6708         data->mv_data = NODEDATA(leaf);
6709         return EDB_SUCCESS;
6710     }
6711 
6712     /* Read overflow data.
6713      */
6714     data->mv_size = NODEDSZ(leaf);
6715     memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
6716     if ((rc = edb_page_get(mc, pgno, &omp, NULL)) != 0) {
6717         DPRINTF(("read overflow page %"Yu" failed", pgno));
6718         return rc;
6719     }
6720     data->mv_data = METADATA(omp);
6721     MC_SET_OVPG(mc, omp);
6722 
6723     return EDB_SUCCESS;
6724 }
6725 
6726 int
6727 edb_get(EDB_txn *txn, EDB_dbi dbi,
6728     EDB_val *key, EDB_val *data)
6729 {
6730     EDB_cursor  mc;
6731     EDB_xcursor mx;
6732     int exact = 0, rc;
6733     DKBUF;
6734 
6735     DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key)));
6736 
6737     if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
6738         return EINVAL;
6739 
6740     if (txn->mt_flags & EDB_TXN_BLOCKED)
6741         return EDB_BAD_TXN;
6742 
6743     edb_cursor_init(&mc, txn, dbi, &mx);
6744     rc = edb_cursor_set(&mc, key, data, EDB_SET, &exact);
6745     /* unref all the pages when EDB_VL32 - caller must copy the data
6746      * before doing anything else
6747      */
6748     EDB_CURSOR_UNREF(&mc, 1);
6749     return rc;
6750 }
6751 
6752 /** Find a sibling for a page.
6753  * Replaces the page at the top of the cursor's stack with the
6754  * specified sibling, if one exists.
6755  * @param[in] mc The cursor for this operation.
6756  * @param[in] move_right Non-zero if the right sibling is requested,
6757  * otherwise the left sibling.
6758  * @return 0 on success, non-zero on failure.
6759  */
6760 static int
6761 edb_cursor_sibling(EDB_cursor *mc, int move_right)
6762 {
6763     int      rc;
6764     EDB_node    *indx;
6765     EDB_page    *mp;
6766 #ifdef EDB_VL32
6767     EDB_page    *op;
6768 #endif
6769 
6770     if (mc->mc_snum < 2) {
6771         return EDB_NOTFOUND;        /* root has no siblings */
6772     }
6773 
6774 #ifdef EDB_VL32
6775     op = mc->mc_pg[mc->mc_top];
6776 #endif
6777     edb_cursor_pop(mc);
6778     DPRINTF(("parent page is page %"Yu", index %u",
6779         mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]));
6780 
6781     if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
6782                : (mc->mc_ki[mc->mc_top] == 0)) {
6783         DPRINTF(("no more keys left, moving to %s sibling",
6784             move_right ? "right" : "left"));
6785         if ((rc = edb_cursor_sibling(mc, move_right)) != EDB_SUCCESS) {
6786             /* undo cursor_pop before returning */
6787             mc->mc_top++;
6788             mc->mc_snum++;
6789             return rc;
6790         }
6791     } else {
6792         if (move_right)
6793             mc->mc_ki[mc->mc_top]++;
6794         else
6795             mc->mc_ki[mc->mc_top]--;
6796         DPRINTF(("just moving to %s index key %u",
6797             move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
6798     }
6799     edb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));
6800 
6801     EDB_PAGE_UNREF(mc->mc_txn, op);
6802 
6803     indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6804     if ((rc = edb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) {
6805         /* mc will be inconsistent if caller does mc_snum++ as above */
6806         mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
6807         return rc;
6808     }
6809 
6810     edb_cursor_push(mc, mp);
6811     if (!move_right)
6812         mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;
6813 
6814     return EDB_SUCCESS;
6815 }
6816 
6817 /** Move the cursor to the next data item. */
6818 static int
6819 edb_cursor_next(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op)
6820 {
6821     EDB_page    *mp;
6822     EDB_node    *leaf;
6823     int rc;
6824 
6825     if ((mc->mc_flags & C_DEL && op == EDB_NEXT_DUP))
6826         return EDB_NOTFOUND;
6827 
6828     if (!(mc->mc_flags & C_INITIALIZED))
6829         return edb_cursor_first(mc, key, data);
6830 
6831     mp = mc->mc_pg[mc->mc_top];
6832 
6833     if (mc->mc_flags & C_EOF) {
6834         if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1)
6835             return EDB_NOTFOUND;
6836         mc->mc_flags ^= C_EOF;
6837     }
6838 
6839     if (mc->mc_db->md_flags & EDB_DUPSORT) {
6840         leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6841         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6842             if (op == EDB_NEXT || op == EDB_NEXT_DUP) {
6843                 rc = edb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, EDB_NEXT);
6844                 if (op != EDB_NEXT || rc != EDB_NOTFOUND) {
6845                     if (rc == EDB_SUCCESS)
6846                         EDB_GET_KEY(leaf, key);
6847                     return rc;
6848                 }
6849             }
6850             else {
6851                 EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
6852             }
6853         } else {
6854             mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
6855             if (op == EDB_NEXT_DUP)
6856                 return EDB_NOTFOUND;
6857         }
6858     }
6859 
6860     DPRINTF(("cursor_next: top page is %"Yu" in cursor %p",
6861         edb_dbg_pgno(mp), (void *) mc));
6862     if (mc->mc_flags & C_DEL) {
6863         mc->mc_flags ^= C_DEL;
6864         goto skip;
6865     }
6866 
6867     if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
6868         DPUTS("=====> move to next sibling page");
6869         if ((rc = edb_cursor_sibling(mc, 1)) != EDB_SUCCESS) {
6870             mc->mc_flags |= C_EOF;
6871             return rc;
6872         }
6873         mp = mc->mc_pg[mc->mc_top];
6874         DPRINTF(("next page is %"Yu", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
6875     } else
6876         mc->mc_ki[mc->mc_top]++;
6877 
6878 skip:
6879     DPRINTF(("==> cursor points to page %"Yu" with %u keys, key index %u",
6880         edb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
6881 
6882     if (IS_LEAF2(mp)) {
6883         key->mv_size = mc->mc_db->md_pad;
6884         key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
6885         return EDB_SUCCESS;
6886     }
6887 
6888     edb_cassert(mc, IS_LEAF(mp));
6889     leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6890 
6891     if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6892         edb_xcursor_init1(mc, leaf);
6893     }
6894     if (data) {
6895         if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
6896             return rc;
6897 
6898         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6899             rc = edb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
6900             if (rc != EDB_SUCCESS)
6901                 return rc;
6902         }
6903     }
6904 
6905     EDB_GET_KEY(leaf, key);
6906     return EDB_SUCCESS;
6907 }
6908 
6909 /** Move the cursor to the previous data item. */
6910 static int
6911 edb_cursor_prev(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op)
6912 {
6913     EDB_page    *mp;
6914     EDB_node    *leaf;
6915     int rc;
6916 
6917     if (!(mc->mc_flags & C_INITIALIZED)) {
6918         rc = edb_cursor_last(mc, key, data);
6919         if (rc)
6920             return rc;
6921         mc->mc_ki[mc->mc_top]++;
6922     }
6923 
6924     mp = mc->mc_pg[mc->mc_top];
6925 
6926     if (mc->mc_db->md_flags & EDB_DUPSORT) {
6927         leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6928         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6929             if (op == EDB_PREV || op == EDB_PREV_DUP) {
6930                 rc = edb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, EDB_PREV);
6931                 if (op != EDB_PREV || rc != EDB_NOTFOUND) {
6932                     if (rc == EDB_SUCCESS) {
6933                         EDB_GET_KEY(leaf, key);
6934                         mc->mc_flags &= ~C_EOF;
6935                     }
6936                     return rc;
6937                 }
6938             }
6939             else {
6940                 EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
6941             }
6942         } else {
6943             mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
6944             if (op == EDB_PREV_DUP)
6945                 return EDB_NOTFOUND;
6946         }
6947     }
6948 
6949     DPRINTF(("cursor_prev: top page is %"Yu" in cursor %p",
6950         edb_dbg_pgno(mp), (void *) mc));
6951 
6952     mc->mc_flags &= ~(C_EOF|C_DEL);
6953 
6954     if (mc->mc_ki[mc->mc_top] == 0)  {
6955         DPUTS("=====> move to prev sibling page");
6956         if ((rc = edb_cursor_sibling(mc, 0)) != EDB_SUCCESS) {
6957             return rc;
6958         }
6959         mp = mc->mc_pg[mc->mc_top];
6960         mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
6961         DPRINTF(("prev page is %"Yu", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
6962     } else
6963         mc->mc_ki[mc->mc_top]--;
6964 
6965     DPRINTF(("==> cursor points to page %"Yu" with %u keys, key index %u",
6966         edb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
6967 
6968     if (IS_LEAF2(mp)) {
6969         key->mv_size = mc->mc_db->md_pad;
6970         key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
6971         return EDB_SUCCESS;
6972     }
6973 
6974     edb_cassert(mc, IS_LEAF(mp));
6975     leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6976 
6977     if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6978         edb_xcursor_init1(mc, leaf);
6979     }
6980     if (data) {
6981         if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
6982             return rc;
6983 
6984         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6985             rc = edb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
6986             if (rc != EDB_SUCCESS)
6987                 return rc;
6988         }
6989     }
6990 
6991     EDB_GET_KEY(leaf, key);
6992     return EDB_SUCCESS;
6993 }
6994 
6995 /** Set the cursor on a specific data item. */
6996 static int
6997 edb_cursor_set(EDB_cursor *mc, EDB_val *key, EDB_val *data,
6998     EDB_cursor_op op, int *exactp)
6999 {
7000     int      rc;
7001     EDB_page    *mp;
7002     EDB_node    *leaf = NULL;
7003     DKBUF;
7004 
7005     if (key->mv_size == 0)
7006         return EDB_BAD_VALSIZE;
7007 
7008     if (mc->mc_xcursor) {
7009         EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
7010         mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
7011     }
7012 
7013     /* See if we're already on the right page */
7014     if (mc->mc_flags & C_INITIALIZED) {
7015         EDB_val nodekey;
7016 
7017         mp = mc->mc_pg[mc->mc_top];
7018         if (!NUMKEYS(mp)) {
7019             mc->mc_ki[mc->mc_top] = 0;
7020             return EDB_NOTFOUND;
7021         }
7022         if (mp->mp_flags & P_LEAF2) {
7023             nodekey.mv_size = mc->mc_db->md_pad;
7024             nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
7025         } else {
7026             leaf = NODEPTR(mp, 0);
7027             EDB_GET_KEY2(leaf, nodekey);
7028         }
7029         rc = mc->mc_dbx->md_cmp(key, &nodekey);
7030         if (rc == 0) {
7031             /* Probably happens rarely, but first node on the page
7032              * was the one we wanted.
7033              */
7034             mc->mc_ki[mc->mc_top] = 0;
7035             if (exactp)
7036                 *exactp = 1;
7037             goto set1;
7038         }
7039         if (rc > 0) {
7040             unsigned int i;
7041             unsigned int nkeys = NUMKEYS(mp);
7042             if (nkeys > 1) {
7043                 if (mp->mp_flags & P_LEAF2) {
7044                     nodekey.mv_data = LEAF2KEY(mp,
7045                          nkeys-1, nodekey.mv_size);
7046                 } else {
7047                     leaf = NODEPTR(mp, nkeys-1);
7048                     EDB_GET_KEY2(leaf, nodekey);
7049                 }
7050                 rc = mc->mc_dbx->md_cmp(key, &nodekey);
7051                 if (rc == 0) {
7052                     /* last node was the one we wanted */
7053                     mc->mc_ki[mc->mc_top] = nkeys-1;
7054                     if (exactp)
7055                         *exactp = 1;
7056                     goto set1;
7057                 }
7058                 if (rc < 0) {
7059                     if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
7060                         /* This is definitely the right page, skip search_page */
7061                         if (mp->mp_flags & P_LEAF2) {
7062                             nodekey.mv_data = LEAF2KEY(mp,
7063                                  mc->mc_ki[mc->mc_top], nodekey.mv_size);
7064                         } else {
7065                             leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
7066                             EDB_GET_KEY2(leaf, nodekey);
7067                         }
7068                         rc = mc->mc_dbx->md_cmp(key, &nodekey);
7069                         if (rc == 0) {
7070                             /* current node was the one we wanted */
7071                             if (exactp)
7072                                 *exactp = 1;
7073                             goto set1;
7074                         }
7075                     }
7076                     rc = 0;
7077                     mc->mc_flags &= ~C_EOF;
7078                     goto set2;
7079                 }
7080             }
7081             /* If any parents have right-sibs, search.
7082              * Otherwise, there's nothing further.
7083              */
7084             for (i=0; i<mc->mc_top; i++)
7085                 if (mc->mc_ki[i] <
7086                     NUMKEYS(mc->mc_pg[i])-1)
7087                     break;
7088             if (i == mc->mc_top) {
7089                 /* There are no other pages */
7090                 mc->mc_ki[mc->mc_top] = nkeys;
7091                 return EDB_NOTFOUND;
7092             }
7093         }
7094         if (!mc->mc_top) {
7095             /* There are no other pages */
7096             mc->mc_ki[mc->mc_top] = 0;
7097             if (op == EDB_SET_RANGE && !exactp) {
7098                 rc = 0;
7099                 goto set1;
7100             } else
7101                 return EDB_NOTFOUND;
7102         }
7103     } else {
7104         mc->mc_pg[0] = 0;
7105     }
7106 
7107     rc = edb_page_search(mc, key, 0);
7108     if (rc != EDB_SUCCESS)
7109         return rc;
7110 
7111     mp = mc->mc_pg[mc->mc_top];
7112     edb_cassert(mc, IS_LEAF(mp));
7113 
7114 set2:
7115     leaf = edb_node_search(mc, key, exactp);
7116     if (exactp != NULL && !*exactp) {
7117         /* EDB_SET specified and not an exact match. */
7118         return EDB_NOTFOUND;
7119     }
7120 
7121     if (leaf == NULL) {
7122         DPUTS("===> inexact leaf not found, goto sibling");
7123         if ((rc = edb_cursor_sibling(mc, 1)) != EDB_SUCCESS) {
7124             mc->mc_flags |= C_EOF;
7125             return rc;      /* no entries matched */
7126         }
7127         mp = mc->mc_pg[mc->mc_top];
7128         edb_cassert(mc, IS_LEAF(mp));
7129         leaf = NODEPTR(mp, 0);
7130     }
7131 
7132 set1:
7133     mc->mc_flags |= C_INITIALIZED;
7134     mc->mc_flags &= ~C_EOF;
7135 
7136     if (IS_LEAF2(mp)) {
7137         if (op == EDB_SET_RANGE || op == EDB_SET_KEY) {
7138             key->mv_size = mc->mc_db->md_pad;
7139             key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
7140         }
7141         return EDB_SUCCESS;
7142     }
7143 
7144     if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7145         edb_xcursor_init1(mc, leaf);
7146     }
7147     if (data) {
7148         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7149             if (op == EDB_SET || op == EDB_SET_KEY || op == EDB_SET_RANGE) {
7150                 rc = edb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
7151             } else {
7152                 int ex2, *ex2p;
7153                 if (op == EDB_GET_BOTH) {
7154                     ex2p = &ex2;
7155                     ex2 = 0;
7156                 } else {
7157                     ex2p = NULL;
7158                 }
7159                 rc = edb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, EDB_SET_RANGE, ex2p);
7160                 if (rc != EDB_SUCCESS)
7161                     return rc;
7162             }
7163         } else if (op == EDB_GET_BOTH || op == EDB_GET_BOTH_RANGE) {
7164             EDB_val olddata;
7165             EDB_cmp_func *dcmp;
7166             if ((rc = edb_node_read(mc, leaf, &olddata)) != EDB_SUCCESS)
7167                 return rc;
7168             dcmp = mc->mc_dbx->md_dcmp;
7169             if (NEED_CMP_CLONG(dcmp, olddata.mv_size))
7170                 dcmp = edb_cmp_clong;
7171             rc = dcmp(data, &olddata);
7172             if (rc) {
7173                 if (op == EDB_GET_BOTH || rc > 0)
7174                     return EDB_NOTFOUND;
7175                 rc = 0;
7176             }
7177             *data = olddata;
7178 
7179         } else {
7180             if (mc->mc_xcursor)
7181                 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
7182             if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
7183                 return rc;
7184         }
7185     }
7186 
7187     /* The key already matches in all other cases */
7188     if (op == EDB_SET_RANGE || op == EDB_SET_KEY)
7189         EDB_GET_KEY(leaf, key);
7190     DPRINTF(("==> cursor placed on key [%s]", DKEY(key)));
7191 
7192     return rc;
7193 }
7194 
7195 /** Move the cursor to the first item in the database. */
7196 static int
7197 edb_cursor_first(EDB_cursor *mc, EDB_val *key, EDB_val *data)
7198 {
7199     int      rc;
7200     EDB_node    *leaf;
7201 
7202     if (mc->mc_xcursor) {
7203         EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
7204         mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
7205     }
7206 
7207     if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
7208         rc = edb_page_search(mc, NULL, EDB_PS_FIRST);
7209         if (rc != EDB_SUCCESS)
7210             return rc;
7211     }
7212     edb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
7213 
7214     leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
7215     mc->mc_flags |= C_INITIALIZED;
7216     mc->mc_flags &= ~C_EOF;
7217 
7218     mc->mc_ki[mc->mc_top] = 0;
7219 
7220     if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
7221         key->mv_size = mc->mc_db->md_pad;
7222         key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
7223         return EDB_SUCCESS;
7224     }
7225 
7226     if (data) {
7227         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7228             edb_xcursor_init1(mc, leaf);
7229             rc = edb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
7230             if (rc)
7231                 return rc;
7232         } else {
7233             if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
7234                 return rc;
7235         }
7236     }
7237     EDB_GET_KEY(leaf, key);
7238     return EDB_SUCCESS;
7239 }
7240 
7241 /** Move the cursor to the last item in the database. */
7242 static int
7243 edb_cursor_last(EDB_cursor *mc, EDB_val *key, EDB_val *data)
7244 {
7245     int      rc;
7246     EDB_node    *leaf;
7247 
7248     if (mc->mc_xcursor) {
7249         EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
7250         mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
7251     }
7252 
7253     if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
7254         rc = edb_page_search(mc, NULL, EDB_PS_LAST);
7255         if (rc != EDB_SUCCESS)
7256             return rc;
7257     }
7258     edb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
7259 
7260     mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
7261     mc->mc_flags |= C_INITIALIZED|C_EOF;
7262     leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
7263 
7264     if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
7265         key->mv_size = mc->mc_db->md_pad;
7266         key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
7267         return EDB_SUCCESS;
7268     }
7269 
7270     if (data) {
7271         if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7272             edb_xcursor_init1(mc, leaf);
7273             rc = edb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
7274             if (rc)
7275                 return rc;
7276         } else {
7277             if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
7278                 return rc;
7279         }
7280     }
7281 
7282     EDB_GET_KEY(leaf, key);
7283     return EDB_SUCCESS;
7284 }
7285 
7286 int
7287 edb_cursor_get(EDB_cursor *mc, EDB_val *key, EDB_val *data,
7288     EDB_cursor_op op)
7289 {
7290     int      rc;
7291     int      exact = 0;
7292     int      (*mfunc)(EDB_cursor *mc, EDB_val *key, EDB_val *data);
7293 
7294     if (mc == NULL)
7295         return EINVAL;
7296 
7297     if (mc->mc_txn->mt_flags & EDB_TXN_BLOCKED)
7298         return EDB_BAD_TXN;
7299 
7300     switch (op) {
7301     case EDB_GET_CURRENT:
7302         if (!(mc->mc_flags & C_INITIALIZED)) {
7303             rc = EINVAL;
7304         } else {
7305             EDB_page *mp = mc->mc_pg[mc->mc_top];
7306             int nkeys = NUMKEYS(mp);
7307             if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
7308                 mc->mc_ki[mc->mc_top] = nkeys;
7309                 rc = EDB_NOTFOUND;
7310                 break;
7311             }
7312             rc = EDB_SUCCESS;
7313             if (IS_LEAF2(mp)) {
7314                 key->mv_size = mc->mc_db->md_pad;
7315                 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
7316             } else {
7317                 EDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
7318                 EDB_GET_KEY(leaf, key);
7319                 if (data) {
7320                     if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7321                         rc = edb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, EDB_GET_CURRENT);
7322                     } else {
7323                         rc = edb_node_read(mc, leaf, data);
7324                     }
7325                 }
7326             }
7327         }
7328         break;
7329     case EDB_GET_BOTH:
7330     case EDB_GET_BOTH_RANGE:
7331         if (data == NULL) {
7332             rc = EINVAL;
7333             break;
7334         }
7335         if (mc->mc_xcursor == NULL) {
7336             rc = EDB_INCOMPATIBLE;
7337             break;
7338         }
7339         /* FALLTHRU */
7340     case EDB_SET:
7341     case EDB_SET_KEY:
7342     case EDB_SET_RANGE:
7343         if (key == NULL) {
7344             rc = EINVAL;
7345         } else {
7346             rc = edb_cursor_set(mc, key, data, op,
7347                 op == EDB_SET_RANGE ? NULL : &exact);
7348         }
7349         break;
7350     case EDB_GET_MULTIPLE:
7351         if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
7352             rc = EINVAL;
7353             break;
7354         }
7355         if (!(mc->mc_db->md_flags & EDB_DUPFIXED)) {
7356             rc = EDB_INCOMPATIBLE;
7357             break;
7358         }
7359         rc = EDB_SUCCESS;
7360         if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
7361             (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
7362             break;
7363         goto fetchm;
7364     case EDB_NEXT_MULTIPLE:
7365         if (data == NULL) {
7366             rc = EINVAL;
7367             break;
7368         }
7369         if (!(mc->mc_db->md_flags & EDB_DUPFIXED)) {
7370             rc = EDB_INCOMPATIBLE;
7371             break;
7372         }
7373         rc = edb_cursor_next(mc, key, data, EDB_NEXT_DUP);
7374         if (rc == EDB_SUCCESS) {
7375             if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
7376                 EDB_cursor *mx;
7377 fetchm:
7378                 mx = &mc->mc_xcursor->mx_cursor;
7379                 data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
7380                     mx->mc_db->md_pad;
7381                 data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
7382                 mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
7383             } else {
7384                 rc = EDB_NOTFOUND;
7385             }
7386         }
7387         break;
7388     case EDB_PREV_MULTIPLE:
7389         if (data == NULL) {
7390             rc = EINVAL;
7391             break;
7392         }
7393         if (!(mc->mc_db->md_flags & EDB_DUPFIXED)) {
7394             rc = EDB_INCOMPATIBLE;
7395             break;
7396         }
7397         if (!(mc->mc_flags & C_INITIALIZED))
7398             rc = edb_cursor_last(mc, key, data);
7399         else
7400             rc = EDB_SUCCESS;
7401         if (rc == EDB_SUCCESS) {
7402             EDB_cursor *mx = &mc->mc_xcursor->mx_cursor;
7403             if (mx->mc_flags & C_INITIALIZED) {
7404                 rc = edb_cursor_sibling(mx, 0);
7405                 if (rc == EDB_SUCCESS)
7406                     goto fetchm;
7407             } else {
7408                 rc = EDB_NOTFOUND;
7409             }
7410         }
7411         break;
7412     case EDB_NEXT:
7413     case EDB_NEXT_DUP:
7414     case EDB_NEXT_NODUP:
7415         rc = edb_cursor_next(mc, key, data, op);
7416         break;
7417     case EDB_PREV:
7418     case EDB_PREV_DUP:
7419     case EDB_PREV_NODUP:
7420         rc = edb_cursor_prev(mc, key, data, op);
7421         break;
7422     case EDB_FIRST:
7423         rc = edb_cursor_first(mc, key, data);
7424         break;
7425     case EDB_FIRST_DUP:
7426         mfunc = edb_cursor_first;
7427     mmove:
7428         if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
7429             rc = EINVAL;
7430             break;
7431         }
7432         if (mc->mc_xcursor == NULL) {
7433             rc = EDB_INCOMPATIBLE;
7434             break;
7435         }
7436         if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) {
7437             mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
7438             rc = EDB_NOTFOUND;
7439             break;
7440         }
7441         {
7442             EDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
7443             if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7444                 EDB_GET_KEY(leaf, key);
7445                 rc = edb_node_read(mc, leaf, data);
7446                 break;
7447             }
7448         }
7449         if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
7450             rc = EINVAL;
7451             break;
7452         }
7453         rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
7454         break;
7455     case EDB_LAST:
7456         rc = edb_cursor_last(mc, key, data);
7457         break;
7458     case EDB_LAST_DUP:
7459         mfunc = edb_cursor_last;
7460         goto mmove;
7461     default:
7462         DPRINTF(("unhandled/unimplemented cursor operation %u", op));
7463         rc = EINVAL;
7464         break;
7465     }
7466 
7467     if (mc->mc_flags & C_DEL)
7468         mc->mc_flags ^= C_DEL;
7469 
7470     return rc;
7471 }
7472 
7473 /** Touch all the pages in the cursor stack. Set mc_top.
7474  *  Makes sure all the pages are writable, before attempting a write operation.
7475  * @param[in] mc The cursor to operate on.
7476  */
7477 static int
7478 edb_cursor_touch(EDB_cursor *mc)
7479 {
7480     int rc = EDB_SUCCESS;
7481 
7482     if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) {
7483         /* Touch DB record of named DB */
7484         EDB_cursor mc2;
7485         EDB_xcursor mcx;
7486         if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
7487             return EDB_BAD_DBI;
7488         edb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
7489         rc = edb_page_search(&mc2, &mc->mc_dbx->md_name, EDB_PS_MODIFY);
7490         if (rc)
7491              return rc;
7492         *mc->mc_dbflag |= DB_DIRTY;
7493     }
7494     mc->mc_top = 0;
7495     if (mc->mc_snum) {
7496         do {
7497             rc = edb_page_touch(mc);
7498         } while (!rc && ++(mc->mc_top) < mc->mc_snum);
7499         mc->mc_top = mc->mc_snum-1;
7500     }
7501     return rc;
7502 }
7503 
7504 /** Do not spill pages to disk if txn is getting full, may fail instead */
7505 #define EDB_NOSPILL 0x8000
7506 
7507 int
7508 edb_cursor_put(EDB_cursor *mc, EDB_val *key, EDB_val *data,
7509     unsigned int flags)
7510 {
7511     EDB_env     *env;
7512     EDB_node    *leaf = NULL;
7513     EDB_page    *fp, *mp, *sub_root = NULL;
7514     uint16_t    fp_flags;
7515     EDB_val     xdata, *rdata, dkey, olddata;
7516     EDB_db dummy;
7517     int do_sub = 0, insert_key, insert_data;
7518     unsigned int mcount = 0, dcount = 0, nospill;
7519     size_t nsize;
7520     int rc, rc2;
7521     unsigned int nflags;
7522     DKBUF;
7523 
7524     if (mc == NULL || key == NULL)
7525         return EINVAL;
7526 
7527     env = mc->mc_txn->mt_env;
7528 
7529     /* Check this first so counter will always be zero on any
7530      * early failures.
7531      */
7532     if (flags & EDB_MULTIPLE) {
7533         dcount = data[1].mv_size;
7534         data[1].mv_size = 0;
7535         if (!F_ISSET(mc->mc_db->md_flags, EDB_DUPFIXED))
7536             return EDB_INCOMPATIBLE;
7537     }
7538 
7539     nospill = flags & EDB_NOSPILL;
7540     flags &= ~EDB_NOSPILL;
7541 
7542     if (mc->mc_txn->mt_flags & (EDB_TXN_RDONLY|EDB_TXN_BLOCKED))
7543         return (mc->mc_txn->mt_flags & EDB_TXN_RDONLY) ? EACCES : EDB_BAD_TXN;
7544 
7545     if (key->mv_size-1 >= ENV_MAXKEY(env))
7546         return EDB_BAD_VALSIZE;
7547 
7548 #if SIZE_MAX > MAXDATASIZE
7549     if (data->mv_size > ((mc->mc_db->md_flags & EDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))
7550         return EDB_BAD_VALSIZE;
7551 #else
7552     if ((mc->mc_db->md_flags & EDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env))
7553         return EDB_BAD_VALSIZE;
7554 #endif
7555 
7556     DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
7557         DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));
7558 
7559     dkey.mv_size = 0;
7560 
7561     if (flags & EDB_CURRENT) {
7562         if (!(mc->mc_flags & C_INITIALIZED))
7563             return EINVAL;
7564         rc = EDB_SUCCESS;
7565     } else if (mc->mc_db->md_root == P_INVALID) {
7566         /* new database, cursor has nothing to point to */
7567         mc->mc_snum = 0;
7568         mc->mc_top = 0;
7569         mc->mc_flags &= ~C_INITIALIZED;
7570         rc = EDB_NO_ROOT;
7571     } else {
7572         int exact = 0;
7573         EDB_val d2;
7574         if (flags & EDB_APPEND) {
7575             EDB_val k2;
7576             rc = edb_cursor_last(mc, &k2, &d2);
7577             if (rc == 0) {
7578                 rc = mc->mc_dbx->md_cmp(key, &k2);
7579                 if (rc > 0) {
7580                     rc = EDB_NOTFOUND;
7581                     mc->mc_ki[mc->mc_top]++;
7582                 } else {
7583                     /* new key is <= last key */
7584                     rc = EDB_KEYEXIST;
7585                 }
7586             }
7587         } else {
7588             rc = edb_cursor_set(mc, key, &d2, EDB_SET, &exact);
7589         }
7590         if ((flags & EDB_NOOVERWRITE) && rc == 0) {
7591             DPRINTF(("duplicate key [%s]", DKEY(key)));
7592             *data = d2;
7593             return EDB_KEYEXIST;
7594         }
7595         if (rc && rc != EDB_NOTFOUND)
7596             return rc;
7597     }
7598 
7599     if (mc->mc_flags & C_DEL)
7600         mc->mc_flags ^= C_DEL;
7601 
7602     /* Cursor is positioned, check for room in the dirty list */
7603     if (!nospill) {
7604         if (flags & EDB_MULTIPLE) {
7605             rdata = &xdata;
7606             xdata.mv_size = data->mv_size * dcount;
7607         } else {
7608             rdata = data;
7609         }
7610         if ((rc2 = edb_page_spill(mc, key, rdata)))
7611             return rc2;
7612     }
7613 
7614     if (rc == EDB_NO_ROOT) {
7615         EDB_page *np;
7616         /* new database, write a root leaf page */
7617         DPUTS("allocating new root leaf page");
7618         if ((rc2 = edb_page_new(mc, P_LEAF, 1, &np))) {
7619             return rc2;
7620         }
7621         edb_cursor_push(mc, np);
7622         mc->mc_db->md_root = np->mp_pgno;
7623         mc->mc_db->md_depth++;
7624         *mc->mc_dbflag |= DB_DIRTY;
7625         if ((mc->mc_db->md_flags & (EDB_DUPSORT|EDB_DUPFIXED))
7626             == EDB_DUPFIXED)
7627             np->mp_flags |= P_LEAF2;
7628         mc->mc_flags |= C_INITIALIZED;
7629     } else {
7630         /* make sure all cursor pages are writable */
7631         rc2 = edb_cursor_touch(mc);
7632         if (rc2)
7633             return rc2;
7634     }
7635 
7636     insert_key = insert_data = rc;
7637     if (insert_key) {
7638         /* The key does not exist */
7639         DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top]));
7640         if ((mc->mc_db->md_flags & EDB_DUPSORT) &&
7641             LEAFSIZE(key, data) > env->me_nodemax)
7642         {
7643             /* Too big for a node, insert in sub-DB.  Set up an empty
7644              * "old sub-page" for prep_subDB to expand to a full page.
7645              */
7646             fp_flags = P_LEAF|P_DIRTY;
7647             fp = env->me_pbuf;
7648             fp->mp_pad = data->mv_size; /* used if EDB_DUPFIXED */
7649             fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
7650             olddata.mv_size = PAGEHDRSZ;
7651             goto prep_subDB;
7652         }
7653     } else {
7654         /* there's only a key anyway, so this is a no-op */
7655         if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
7656             char *ptr;
7657             unsigned int ksize = mc->mc_db->md_pad;
7658             if (key->mv_size != ksize)
7659                 return EDB_BAD_VALSIZE;
7660             ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
7661             memcpy(ptr, key->mv_data, ksize);
7662 fix_parent:
7663             /* if overwriting slot 0 of leaf, need to
7664              * update branch key if there is a parent page
7665              */
7666             if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
7667                 unsigned short dtop = 1;
7668                 mc->mc_top--;
7669                 /* slot 0 is always an empty key, find real slot */
7670                 while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
7671                     mc->mc_top--;
7672                     dtop++;
7673                 }
7674                 if (mc->mc_ki[mc->mc_top])
7675                     rc2 = edb_update_key(mc, key);
7676                 else
7677                     rc2 = EDB_SUCCESS;
7678                 mc->mc_top += dtop;
7679                 if (rc2)
7680                     return rc2;
7681             }
7682             return EDB_SUCCESS;
7683         }
7684 
7685 more:
7686         leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
7687         olddata.mv_size = NODEDSZ(leaf);
7688         olddata.mv_data = NODEDATA(leaf);
7689 
7690         /* DB has dups? */
7691         if (F_ISSET(mc->mc_db->md_flags, EDB_DUPSORT)) {
7692             /* Prepare (sub-)page/sub-DB to accept the new item,
7693              * if needed.  fp: old sub-page or a header faking
7694              * it.  mp: new (sub-)page.  offset: growth in page
7695              * size.  xdata: node data with new page or DB.
7696              */
7697             unsigned    i, offset = 0;
7698             mp = fp = xdata.mv_data = env->me_pbuf;
7699             mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;
7700 
7701             /* Was a single item before, must convert now */
7702             if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7703                 EDB_cmp_func *dcmp;
7704                 /* Just overwrite the current item */
7705                 if (flags == EDB_CURRENT)
7706                     goto current;
7707                 dcmp = mc->mc_dbx->md_dcmp;
7708                 if (NEED_CMP_CLONG(dcmp, olddata.mv_size))
7709                     dcmp = edb_cmp_clong;
7710                 /* does data match? */
7711                 if (!dcmp(data, &olddata)) {
7712                     if (flags & (EDB_NODUPDATA|EDB_APPENDDUP))
7713                         return EDB_KEYEXIST;
7714                     /* overwrite it */
7715                     goto current;
7716                 }
7717 
7718                 /* Back up original data item */
7719                 dkey.mv_size = olddata.mv_size;
7720                 dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);
7721 
7722                 /* Make sub-page header for the dup items, with dummy body */
7723                 fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
7724                 fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
7725                 xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
7726                 if (mc->mc_db->md_flags & EDB_DUPFIXED) {
7727                     fp->mp_flags |= P_LEAF2;
7728                     fp->mp_pad = data->mv_size;
7729                     xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
7730                 } else {
7731                     xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
7732                         (dkey.mv_size & 1) + (data->mv_size & 1);
7733                 }
7734                 fp->mp_upper = xdata.mv_size - PAGEBASE;
7735                 olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
7736             } else if (leaf->mn_flags & F_SUBDATA) {
7737                 /* Data is on sub-DB, just store it */
7738                 flags |= F_DUPDATA|F_SUBDATA;
7739                 goto put_sub;
7740             } else {
7741                 /* Data is on sub-page */
7742                 fp = olddata.mv_data;
7743                 switch (flags) {
7744                 default:
7745                     if (!(mc->mc_db->md_flags & EDB_DUPFIXED)) {
7746                         offset = EVEN(NODESIZE + sizeof(indx_t) +
7747                             data->mv_size);
7748                         break;
7749                     }
7750                     offset = fp->mp_pad;
7751                     if (SIZELEFT(fp) < offset) {
7752                         offset *= 4; /* space for 4 more */
7753                         break;
7754                     }
7755                     /* FALLTHRU */ /* Big enough EDB_DUPFIXED sub-page */
7756                 case EDB_CURRENT:
7757                     fp->mp_flags |= P_DIRTY;
7758                     COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
7759                     mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
7760                     flags |= F_DUPDATA;
7761                     goto put_sub;
7762                 }
7763                 xdata.mv_size = olddata.mv_size + offset;
7764             }
7765 
7766             fp_flags = fp->mp_flags;
7767             if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
7768                     /* Too big for a sub-page, convert to sub-DB */
7769                     fp_flags &= ~P_SUBP;
7770 prep_subDB:
7771                     if (mc->mc_db->md_flags & EDB_DUPFIXED) {
7772                         fp_flags |= P_LEAF2;
7773                         dummy.md_pad = fp->mp_pad;
7774                         dummy.md_flags = EDB_DUPFIXED;
7775                         if (mc->mc_db->md_flags & EDB_INTEGERDUP)
7776                             dummy.md_flags |= EDB_INTEGERKEY;
7777                     } else {
7778                         dummy.md_pad = 0;
7779                         dummy.md_flags = 0;
7780                     }
7781                     dummy.md_depth = 1;
7782                     dummy.md_branch_pages = 0;
7783                     dummy.md_leaf_pages = 1;
7784                     dummy.md_overflow_pages = 0;
7785                     dummy.md_entries = NUMKEYS(fp);
7786                     xdata.mv_size = sizeof(EDB_db);
7787                     xdata.mv_data = &dummy;
7788                     if ((rc = edb_page_alloc(mc, 1, &mp)))
7789                         return rc;
7790                     offset = env->me_psize - olddata.mv_size;
7791                     flags |= F_DUPDATA|F_SUBDATA;
7792                     dummy.md_root = mp->mp_pgno;
7793                     sub_root = mp;
7794             }
7795             if (mp != fp) {
7796                 mp->mp_flags = fp_flags | P_DIRTY;
7797                 mp->mp_pad   = fp->mp_pad;
7798                 mp->mp_lower = fp->mp_lower;
7799                 mp->mp_upper = fp->mp_upper + offset;
7800                 if (fp_flags & P_LEAF2) {
7801                     memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
7802                 } else {
7803                     memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
7804                         olddata.mv_size - fp->mp_upper - PAGEBASE);
7805                     memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0]));
7806                     for (i=0; i<NUMKEYS(fp); i++)
7807                         mp->mp_ptrs[i] += offset;
7808                 }
7809             }
7810 
7811             rdata = &xdata;
7812             flags |= F_DUPDATA;
7813             do_sub = 1;
7814             if (!insert_key)
7815                 edb_node_del(mc, 0);
7816             goto new_sub;
7817         }
7818 current:
7819         /* EXDB passes F_SUBDATA in 'flags' to write a DB record */
7820         if ((leaf->mn_flags ^ flags) & F_SUBDATA)
7821             return EDB_INCOMPATIBLE;
7822         /* overflow page overwrites need special handling */
7823         if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
7824             EDB_page *omp;
7825             pgno_t pg;
7826             int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);
7827 
7828             memcpy(&pg, olddata.mv_data, sizeof(pg));
7829             if ((rc2 = edb_page_get(mc, pg, &omp, &level)) != 0)
7830                 return rc2;
7831             ovpages = omp->mp_pages;
7832 
7833             /* Is the ov page large enough? */
7834             if (ovpages >= dpages) {
7835               if (!(omp->mp_flags & P_DIRTY) &&
7836                   (level || (env->me_flags & EDB_WRITEMAP)))
7837               {
7838                 rc = edb_page_unspill(mc->mc_txn, omp, &omp);
7839                 if (rc)
7840                     return rc;
7841                 level = 0;      /* dirty in this txn or clean */
7842               }
7843               /* Is it dirty? */
7844               if (omp->mp_flags & P_DIRTY) {
7845                 /* yes, overwrite it. Note in this case we don't
7846                  * bother to try shrinking the page if the new data
7847                  * is smaller than the overflow threshold.
7848                  */
7849                 if (level > 1) {
7850                     /* It is writable only in a parent txn */
7851                     size_t sz = (size_t) env->me_psize * ovpages, off;
7852                     EDB_page *np = edb_page_malloc(mc->mc_txn, ovpages);
7853                     EDB_ID2 id2;
7854                     if (!np)
7855                         return ENOMEM;
7856                     id2.mid = pg;
7857                     id2.mptr = np;
7858                     /* Note - this page is already counted in parent's dirty_room */
7859                     rc2 = edb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
7860                     edb_cassert(mc, rc2 == 0);
7861                     /* Currently we make the page look as with put() in the
7862                      * parent txn, in case the user peeks at EDB_RESERVEd
7863                      * or unused parts. Some users treat ovpages specially.
7864                      */
7865                     if (!(flags & EDB_RESERVE)) {
7866                         /* Skip the part where EXDB will put *data.
7867                          * Copy end of page, adjusting alignment so
7868                          * compiler may copy words instead of bytes.
7869                          */
7870                         off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
7871                         memcpy((size_t *)((char *)np + off),
7872                             (size_t *)((char *)omp + off), sz - off);
7873                         sz = PAGEHDRSZ;
7874                     }
7875                     memcpy(np, omp, sz); /* Copy beginning of page */
7876                     omp = np;
7877                 }
7878                 SETDSZ(leaf, data->mv_size);
7879                 if (F_ISSET(flags, EDB_RESERVE))
7880                     data->mv_data = METADATA(omp);
7881                 else
7882                     memcpy(METADATA(omp), data->mv_data, data->mv_size);
7883                 return EDB_SUCCESS;
7884               }
7885             }
7886             if ((rc2 = edb_ovpage_free(mc, omp)) != EDB_SUCCESS)
7887                 return rc2;
7888         } else if (data->mv_size == olddata.mv_size) {
7889             /* same size, just replace it. Note that we could
7890              * also reuse this node if the new data is smaller,
7891              * but instead we opt to shrink the node in that case.
7892              */
7893             if (F_ISSET(flags, EDB_RESERVE))
7894                 data->mv_data = olddata.mv_data;
7895             else if (!(mc->mc_flags & C_SUB))
7896                 memcpy(olddata.mv_data, data->mv_data, data->mv_size);
7897             else {
7898                 memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
7899                 goto fix_parent;
7900             }
7901             return EDB_SUCCESS;
7902         }
7903         edb_node_del(mc, 0);
7904     }
7905 
7906     rdata = data;
7907 
7908 new_sub:
7909     nflags = flags & NODE_ADD_FLAGS;
7910     nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : edb_leaf_size(env, key, rdata);
7911     if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
7912         if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
7913             nflags &= ~EDB_APPEND; /* sub-page may need room to grow */
7914         if (!insert_key)
7915             nflags |= EDB_SPLIT_REPLACE;
7916         rc = edb_page_split(mc, key, rdata, P_INVALID, nflags);
7917     } else {
7918         /* There is room already in this leaf page. */
7919         rc = edb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
7920         if (rc == 0) {
7921             /* Adjust other cursors pointing to mp */
7922             EDB_cursor *m2, *m3;
7923             EDB_dbi dbi = mc->mc_dbi;
7924             unsigned i = mc->mc_top;
7925             EDB_page *mp = mc->mc_pg[i];
7926 
7927             for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
7928                 if (mc->mc_flags & C_SUB)
7929                     m3 = &m2->mc_xcursor->mx_cursor;
7930                 else
7931                     m3 = m2;
7932                 if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue;
7933                 if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) {
7934                     m3->mc_ki[i]++;
7935                 }
7936                 XCURSOR_REFRESH(m3, i, mp);
7937             }
7938         }
7939     }
7940 
7941     if (rc == EDB_SUCCESS) {
7942         /* Now store the actual data in the child DB. Note that we're
7943          * storing the user data in the keys field, so there are strict
7944          * size limits on dupdata. The actual data fields of the child
7945          * DB are all zero size.
7946          */
7947         if (do_sub) {
7948             int xflags, new_dupdata;
7949             edb_size_t ecount;
7950 put_sub:
7951             xdata.mv_size = 0;
7952             xdata.mv_data = "";
7953             leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
7954             if (flags == EDB_CURRENT) {
7955                 xflags = EDB_CURRENT|EDB_NOSPILL;
7956             } else {
7957                 edb_xcursor_init1(mc, leaf);
7958                 xflags = (flags & EDB_NODUPDATA) ?
7959                     EDB_NOOVERWRITE|EDB_NOSPILL : EDB_NOSPILL;
7960             }
7961             if (sub_root)
7962                 mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root;
7963             new_dupdata = (int)dkey.mv_size;
7964             /* converted, write the original data first */
7965             if (dkey.mv_size) {
7966                 rc = edb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
7967                 if (rc)
7968                     goto bad_sub;
7969                 /* we've done our job */
7970                 dkey.mv_size = 0;
7971             }
7972             if (!(leaf->mn_flags & F_SUBDATA) || sub_root) {
7973                 /* Adjust other cursors pointing to mp */
7974                 EDB_cursor *m2;
7975                 EDB_xcursor *mx = mc->mc_xcursor;
7976                 unsigned i = mc->mc_top;
7977                 EDB_page *mp = mc->mc_pg[i];
7978 
7979                 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
7980                     if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
7981                     if (!(m2->mc_flags & C_INITIALIZED)) continue;
7982                     if (m2->mc_pg[i] == mp) {
7983                         if (m2->mc_ki[i] == mc->mc_ki[i]) {
7984                             edb_xcursor_init2(m2, mx, new_dupdata);
7985                         } else if (!insert_key) {
7986                             XCURSOR_REFRESH(m2, i, mp);
7987                         }
7988                     }
7989                 }
7990             }
7991             ecount = mc->mc_xcursor->mx_db.md_entries;
7992             if (flags & EDB_APPENDDUP)
7993                 xflags |= EDB_APPEND;
7994             rc = edb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
7995             if (flags & F_SUBDATA) {
7996                 void *db = NODEDATA(leaf);
7997                 memcpy(db, &mc->mc_xcursor->mx_db, sizeof(EDB_db));
7998             }
7999             insert_data = mc->mc_xcursor->mx_db.md_entries - ecount;
8000         }
8001         /* Increment count unless we just replaced an existing item. */
8002         if (insert_data)
8003             mc->mc_db->md_entries++;
8004         if (insert_key) {
8005             /* Invalidate txn if we created an empty sub-DB */
8006             if (rc)
8007                 goto bad_sub;
8008             /* If we succeeded and the key didn't exist before,
8009              * make sure the cursor is marked valid.
8010              */
8011             mc->mc_flags |= C_INITIALIZED;
8012         }
8013         if (flags & EDB_MULTIPLE) {
8014             if (!rc) {
8015                 mcount++;
8016                 /* let caller know how many succeeded, if any */
8017                 data[1].mv_size = mcount;
8018                 if (mcount < dcount) {
8019                     data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
8020                     insert_key = insert_data = 0;
8021                     goto more;
8022                 }
8023             }
8024         }
8025         return rc;
8026 bad_sub:
8027         if (rc == EDB_KEYEXIST) /* should not happen, we deleted that item */
8028             rc = EDB_PROBLEM;
8029     }
8030     mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
8031     return rc;
8032 }
8033 
8034 int
8035 edb_cursor_del(EDB_cursor *mc, unsigned int flags)
8036 {
8037     EDB_node    *leaf;
8038     EDB_page    *mp;
8039     int rc;
8040 
8041     if (mc->mc_txn->mt_flags & (EDB_TXN_RDONLY|EDB_TXN_BLOCKED))
8042         return (mc->mc_txn->mt_flags & EDB_TXN_RDONLY) ? EACCES : EDB_BAD_TXN;
8043 
8044     if (!(mc->mc_flags & C_INITIALIZED))
8045         return EINVAL;
8046 
8047     if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
8048         return EDB_NOTFOUND;
8049 
8050     if (!(flags & EDB_NOSPILL) && (rc = edb_page_spill(mc, NULL, NULL)))
8051         return rc;
8052 
8053     rc = edb_cursor_touch(mc);
8054     if (rc)
8055         return rc;
8056 
8057     mp = mc->mc_pg[mc->mc_top];
8058     if (IS_LEAF2(mp))
8059         goto del_key;
8060     leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
8061 
8062     if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
8063         if (flags & EDB_NODUPDATA) {
8064             /* edb_cursor_del0() will subtract the final entry */
8065             mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
8066             mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
8067         } else {
8068             if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
8069                 mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
8070             }
8071             rc = edb_cursor_del(&mc->mc_xcursor->mx_cursor, EDB_NOSPILL);
8072             if (rc)
8073                 return rc;
8074             /* If sub-DB still has entries, we're done */
8075             if (mc->mc_xcursor->mx_db.md_entries) {
8076                 if (leaf->mn_flags & F_SUBDATA) {
8077                     /* update subDB info */
8078                     void *db = NODEDATA(leaf);
8079                     memcpy(db, &mc->mc_xcursor->mx_db, sizeof(EDB_db));
8080                 } else {
8081                     EDB_cursor *m2;
8082                     /* shrink fake page */
8083                     edb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
8084                     leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
8085                     mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
8086                     /* fix other sub-DB cursors pointed at fake pages on this page */
8087                     for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
8088                         if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
8089                         if (!(m2->mc_flags & C_INITIALIZED)) continue;
8090                         if (m2->mc_pg[mc->mc_top] == mp) {
8091                             XCURSOR_REFRESH(m2, mc->mc_top, mp);
8092                         }
8093                     }
8094                 }
8095                 mc->mc_db->md_entries--;
8096                 return rc;
8097             } else {
8098                 mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
8099             }
8100             /* otherwise fall thru and delete the sub-DB */
8101         }
8102 
8103         if (leaf->mn_flags & F_SUBDATA) {
8104             /* add all the child DB's pages to the free list */
8105             rc = edb_drop0(&mc->mc_xcursor->mx_cursor, 0);
8106             if (rc)
8107                 goto fail;
8108         }
8109     }
8110     /* EXDB passes F_SUBDATA in 'flags' to delete a DB record */
8111     else if ((leaf->mn_flags ^ flags) & F_SUBDATA) {
8112         rc = EDB_INCOMPATIBLE;
8113         goto fail;
8114     }
8115 
8116     /* add overflow pages to free list */
8117     if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
8118         EDB_page *omp;
8119         pgno_t pg;
8120 
8121         memcpy(&pg, NODEDATA(leaf), sizeof(pg));
8122         if ((rc = edb_page_get(mc, pg, &omp, NULL)) ||
8123             (rc = edb_ovpage_free(mc, omp)))
8124             goto fail;
8125     }
8126 
8127 del_key:
8128     return edb_cursor_del0(mc);
8129 
8130 fail:
8131     mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
8132     return rc;
8133 }
8134 
8135 /** Allocate and initialize new pages for a database.
8136  * Set #EDB_TXN_ERROR on failure.
8137  * @param[in] mc a cursor on the database being added to.
8138  * @param[in] flags flags defining what type of page is being allocated.
8139  * @param[in] num the number of pages to allocate. This is usually 1,
8140  * unless allocating overflow pages for a large record.
8141  * @param[out] mp Address of a page, or NULL on failure.
8142  * @return 0 on success, non-zero on failure.
8143  */
8144 static int
8145 edb_page_new(EDB_cursor *mc, uint32_t flags, int num, EDB_page **mp)
8146 {
8147     EDB_page    *np;
8148     int rc;
8149 
8150     if ((rc = edb_page_alloc(mc, num, &np)))
8151         return rc;
8152     DPRINTF(("allocated new mpage %"Yu", page size %u",
8153         np->mp_pgno, mc->mc_txn->mt_env->me_psize));
8154     np->mp_flags = flags | P_DIRTY;
8155     np->mp_lower = (PAGEHDRSZ-PAGEBASE);
8156     np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
8157 
8158     if (IS_BRANCH(np))
8159         mc->mc_db->md_branch_pages++;
8160     else if (IS_LEAF(np))
8161         mc->mc_db->md_leaf_pages++;
8162     else if (IS_OVERFLOW(np)) {
8163         mc->mc_db->md_overflow_pages += num;
8164         np->mp_pages = num;
8165     }
8166     *mp = np;
8167 
8168     return 0;
8169 }
8170 
8171 /** Calculate the size of a leaf node.
8172  * The size depends on the environment's page size; if a data item
8173  * is too large it will be put onto an overflow page and the node
8174  * size will only include the key and not the data. Sizes are always
8175  * rounded up to an even number of bytes, to guarantee 2-byte alignment
8176  * of the #EDB_node headers.
8177  * @param[in] env The environment handle.
8178  * @param[in] key The key for the node.
8179  * @param[in] data The data for the node.
8180  * @return The number of bytes needed to store the node.
8181  */
8182 static size_t
8183 edb_leaf_size(EDB_env *env, EDB_val *key, EDB_val *data)
8184 {
8185     size_t       sz;
8186 
8187     sz = LEAFSIZE(key, data);
8188     if (sz > env->me_nodemax) {
8189         /* put on overflow page */
8190         sz -= data->mv_size - sizeof(pgno_t);
8191     }
8192 
8193     return EVEN(sz + sizeof(indx_t));
8194 }
8195 
8196 /** Calculate the size of a branch node.
8197  * The size should depend on the environment's page size but since
8198  * we currently don't support spilling large keys onto overflow
8199  * pages, it's simply the size of the #EDB_node header plus the
8200  * size of the key. Sizes are always rounded up to an even number
8201  * of bytes, to guarantee 2-byte alignment of the #EDB_node headers.
8202  * @param[in] env The environment handle.
8203  * @param[in] key The key for the node.
8204  * @return The number of bytes needed to store the node.
8205  */
8206 static size_t
8207 edb_branch_size(EDB_env *env, EDB_val *key)
8208 {
8209     size_t       sz;
8210 
8211     sz = INDXSIZE(key);
8212     if (sz > env->me_nodemax) {
8213         /* put on overflow page */
8214         /* not implemented */
8215         /* sz -= key->size - sizeof(pgno_t); */
8216     }
8217 
8218     return sz + sizeof(indx_t);
8219 }
8220 
8221 /** Add a node to the page pointed to by the cursor.
8222  * Set #EDB_TXN_ERROR on failure.
8223  * @param[in] mc The cursor for this operation.
8224  * @param[in] indx The index on the page where the new node should be added.
8225  * @param[in] key The key for the new node.
8226  * @param[in] data The data for the new node, if any.
8227  * @param[in] pgno The page number, if adding a branch node.
8228  * @param[in] flags Flags for the node.
8229  * @return 0 on success, non-zero on failure. Possible errors are:
8230  * <ul>
8231  *  <li>ENOMEM - failed to allocate overflow pages for the node.
8232  *  <li>EDB_PAGE_FULL - there is insufficient room in the page. This error
8233  *  should never happen since all callers already calculate the
8234  *  page's free space before calling this function.
8235  * </ul>
8236  */
8237 static int
8238 edb_node_add(EDB_cursor *mc, indx_t indx,
8239     EDB_val *key, EDB_val *data, pgno_t pgno, unsigned int flags)
8240 {
8241     unsigned int     i;
8242     size_t       node_size = NODESIZE;
8243     ssize_t      room;
8244     indx_t       ofs;
8245     EDB_node    *node;
8246     EDB_page    *mp = mc->mc_pg[mc->mc_top];
8247     EDB_page    *ofp = NULL;        /* overflow page */
8248     void        *ndata;
8249     DKBUF;
8250 
8251     edb_cassert(mc, mp->mp_upper >= mp->mp_lower);
8252 
8253     DPRINTF(("add to %s %spage %"Yu" index %i, data size %"Z"u key size %"Z"u [%s]",
8254         IS_LEAF(mp) ? "leaf" : "branch",
8255         IS_SUBP(mp) ? "sub-" : "",
8256         edb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
8257         key ? key->mv_size : 0, key ? DKEY(key) : "null"));
8258 
8259     if (IS_LEAF2(mp)) {
8260         /* Move higher keys up one slot. */
8261         int ksize = mc->mc_db->md_pad, dif;
8262         char *ptr = LEAF2KEY(mp, indx, ksize);
8263         dif = NUMKEYS(mp) - indx;
8264         if (dif > 0)
8265             memmove(ptr+ksize, ptr, dif*ksize);
8266         /* insert new key */
8267         memcpy(ptr, key->mv_data, ksize);
8268 
8269         /* Just using these for counting */
8270         mp->mp_lower += sizeof(indx_t);
8271         mp->mp_upper -= ksize - sizeof(indx_t);
8272         return EDB_SUCCESS;
8273     }
8274 
8275     room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
8276     if (key != NULL)
8277         node_size += key->mv_size;
8278     if (IS_LEAF(mp)) {
8279         edb_cassert(mc, key && data);
8280         if (F_ISSET(flags, F_BIGDATA)) {
8281             /* Data already on overflow page. */
8282             node_size += sizeof(pgno_t);
8283         } else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
8284             int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
8285             int rc;
8286             /* Put data on overflow page. */
8287             DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
8288                 data->mv_size, node_size+data->mv_size));
8289             node_size = EVEN(node_size + sizeof(pgno_t));
8290             if ((ssize_t)node_size > room)
8291                 goto full;
8292             if ((rc = edb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
8293                 return rc;
8294             DPRINTF(("allocated overflow page %"Yu, ofp->mp_pgno));
8295             flags |= F_BIGDATA;
8296             goto update;
8297         } else {
8298             node_size += data->mv_size;
8299         }
8300     }
8301     node_size = EVEN(node_size);
8302     if ((ssize_t)node_size > room)
8303         goto full;
8304 
8305 update:
8306     /* Move higher pointers up one slot. */
8307     for (i = NUMKEYS(mp); i > indx; i--)
8308         mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];
8309 
8310     /* Adjust free space offsets. */
8311     ofs = mp->mp_upper - node_size;
8312     edb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
8313     mp->mp_ptrs[indx] = ofs;
8314     mp->mp_upper = ofs;
8315     mp->mp_lower += sizeof(indx_t);
8316 
8317     /* Write the node data. */
8318     node = NODEPTR(mp, indx);
8319     node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
8320     node->mn_flags = flags;
8321     if (IS_LEAF(mp))
8322         SETDSZ(node,data->mv_size);
8323     else
8324         SETPGNO(node,pgno);
8325 
8326     if (key)
8327         memcpy(NODEKEY(node), key->mv_data, key->mv_size);
8328 
8329     if (IS_LEAF(mp)) {
8330         ndata = NODEDATA(node);
8331         if (ofp == NULL) {
8332             if (F_ISSET(flags, F_BIGDATA))
8333                 memcpy(ndata, data->mv_data, sizeof(pgno_t));
8334             else if (F_ISSET(flags, EDB_RESERVE))
8335                 data->mv_data = ndata;
8336             else
8337                 memcpy(ndata, data->mv_data, data->mv_size);
8338         } else {
8339             memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t));
8340             ndata = METADATA(ofp);
8341             if (F_ISSET(flags, EDB_RESERVE))
8342                 data->mv_data = ndata;
8343             else
8344                 memcpy(ndata, data->mv_data, data->mv_size);
8345         }
8346     }
8347 
8348     return EDB_SUCCESS;
8349 
8350 full:
8351     DPRINTF(("not enough room in page %"Yu", got %u ptrs",
8352         edb_dbg_pgno(mp), NUMKEYS(mp)));
8353     DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
8354     DPRINTF(("node size = %"Z"u", node_size));
8355     mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
8356     return EDB_PAGE_FULL;
8357 }
8358 
8359 /** Delete the specified node from a page.
8360  * @param[in] mc Cursor pointing to the node to delete.
8361  * @param[in] ksize The size of a node. Only used if the page is
8362  * part of a #EDB_DUPFIXED database.
8363  */
8364 static void
8365 edb_node_del(EDB_cursor *mc, int ksize)
8366 {
8367     EDB_page *mp = mc->mc_pg[mc->mc_top];
8368     indx_t  indx = mc->mc_ki[mc->mc_top];
8369     unsigned int     sz;
8370     indx_t       i, j, numkeys, ptr;
8371     EDB_node    *node;
8372     char        *base;
8373 
8374     DPRINTF(("delete node %u on %s page %"Yu, indx,
8375         IS_LEAF(mp) ? "leaf" : "branch", edb_dbg_pgno(mp)));
8376     numkeys = NUMKEYS(mp);
8377     edb_cassert(mc, indx < numkeys);
8378 
8379     if (IS_LEAF2(mp)) {
8380         int x = numkeys - 1 - indx;
8381         base = LEAF2KEY(mp, indx, ksize);
8382         if (x)
8383             memmove(base, base + ksize, x * ksize);
8384         mp->mp_lower -= sizeof(indx_t);
8385         mp->mp_upper += ksize - sizeof(indx_t);
8386         return;
8387     }
8388 
8389     node = NODEPTR(mp, indx);
8390     sz = NODESIZE + node->mn_ksize;
8391     if (IS_LEAF(mp)) {
8392         if (F_ISSET(node->mn_flags, F_BIGDATA))
8393             sz += sizeof(pgno_t);
8394         else
8395             sz += NODEDSZ(node);
8396     }
8397     sz = EVEN(sz);
8398 
8399     ptr = mp->mp_ptrs[indx];
8400     for (i = j = 0; i < numkeys; i++) {
8401         if (i != indx) {
8402             mp->mp_ptrs[j] = mp->mp_ptrs[i];
8403             if (mp->mp_ptrs[i] < ptr)
8404                 mp->mp_ptrs[j] += sz;
8405             j++;
8406         }
8407     }
8408 
8409     base = (char *)mp + mp->mp_upper + PAGEBASE;
8410     memmove(base + sz, base, ptr - mp->mp_upper);
8411 
8412     mp->mp_lower -= sizeof(indx_t);
8413     mp->mp_upper += sz;
8414 }
8415 
8416 /** Compact the main page after deleting a node on a subpage.
8417  * @param[in] mp The main page to operate on.
8418  * @param[in] indx The index of the subpage on the main page.
8419  */
8420 static void
8421 edb_node_shrink(EDB_page *mp, indx_t indx)
8422 {
8423     EDB_node *node;
8424     EDB_page *sp, *xp;
8425     char *base;
8426     indx_t delta, nsize, len, ptr;
8427     int i;
8428 
8429     node = NODEPTR(mp, indx);
8430     sp = (EDB_page *)NODEDATA(node);
8431     delta = SIZELEFT(sp);
8432     nsize = NODEDSZ(node) - delta;
8433 
8434     /* Prepare to shift upward, set len = length(subpage part to shift) */
8435     if (IS_LEAF2(sp)) {
8436         len = nsize;
8437         if (nsize & 1)
8438             return;     /* do not make the node uneven-sized */
8439     } else {
8440         xp = (EDB_page *)((char *)sp + delta); /* destination subpage */
8441         for (i = NUMKEYS(sp); --i >= 0; )
8442             xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
8443         len = PAGEHDRSZ;
8444     }
8445     sp->mp_upper = sp->mp_lower;
8446     COPY_PGNO(sp->mp_pgno, mp->mp_pgno);
8447     SETDSZ(node, nsize);
8448 
8449     /* Shift <lower nodes...initial part of subpage> upward */
8450     base = (char *)mp + mp->mp_upper + PAGEBASE;
8451     memmove(base + delta, base, (char *)sp + len - base);
8452 
8453     ptr = mp->mp_ptrs[indx];
8454     for (i = NUMKEYS(mp); --i >= 0; ) {
8455         if (mp->mp_ptrs[i] <= ptr)
8456             mp->mp_ptrs[i] += delta;
8457     }
8458     mp->mp_upper += delta;
8459 }
8460 
8461 /** Initial setup of a sorted-dups cursor.
8462  * Sorted duplicates are implemented as a sub-database for the given key.
8463  * The duplicate data items are actually keys of the sub-database.
8464  * Operations on the duplicate data items are performed using a sub-cursor
8465  * initialized when the sub-database is first accessed. This function does
8466  * the preliminary setup of the sub-cursor, filling in the fields that
8467  * depend only on the parent DB.
8468  * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
8469  */
8470 static void
8471 edb_xcursor_init0(EDB_cursor *mc)
8472 {
8473     EDB_xcursor *mx = mc->mc_xcursor;
8474 
8475     mx->mx_cursor.mc_xcursor = NULL;
8476     mx->mx_cursor.mc_txn = mc->mc_txn;
8477     mx->mx_cursor.mc_db = &mx->mx_db;
8478     mx->mx_cursor.mc_dbx = &mx->mx_dbx;
8479     mx->mx_cursor.mc_dbi = mc->mc_dbi;
8480     mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
8481     mx->mx_cursor.mc_snum = 0;
8482     mx->mx_cursor.mc_top = 0;
8483     MC_SET_OVPG(&mx->mx_cursor, NULL);
8484     mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP));
8485     mx->mx_dbx.md_name.mv_size = 0;
8486     mx->mx_dbx.md_name.mv_data = NULL;
8487     mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
8488     mx->mx_dbx.md_dcmp = NULL;
8489     mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
8490 }
8491 
8492 /** Final setup of a sorted-dups cursor.
8493  *  Sets up the fields that depend on the data from the main cursor.
8494  * @param[in] mc The main cursor whose sorted-dups cursor is to be initialized.
8495  * @param[in] node The data containing the #EDB_db record for the
8496  * sorted-dup database.
8497  */
8498 static void
8499 edb_xcursor_init1(EDB_cursor *mc, EDB_node *node)
8500 {
8501     EDB_xcursor *mx = mc->mc_xcursor;
8502 
8503     mx->mx_cursor.mc_flags &= C_SUB|C_ORIG_RDONLY|C_WRITEMAP;
8504     if (node->mn_flags & F_SUBDATA) {
8505         memcpy(&mx->mx_db, NODEDATA(node), sizeof(EDB_db));
8506         mx->mx_cursor.mc_pg[0] = 0;
8507         mx->mx_cursor.mc_snum = 0;
8508         mx->mx_cursor.mc_top = 0;
8509     } else {
8510         EDB_page *fp = NODEDATA(node);
8511         mx->mx_db.md_pad = 0;
8512         mx->mx_db.md_flags = 0;
8513         mx->mx_db.md_depth = 1;
8514         mx->mx_db.md_branch_pages = 0;
8515         mx->mx_db.md_leaf_pages = 1;
8516         mx->mx_db.md_overflow_pages = 0;
8517         mx->mx_db.md_entries = NUMKEYS(fp);
8518         COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
8519         mx->mx_cursor.mc_snum = 1;
8520         mx->mx_cursor.mc_top = 0;
8521         mx->mx_cursor.mc_flags |= C_INITIALIZED;
8522         mx->mx_cursor.mc_pg[0] = fp;
8523         mx->mx_cursor.mc_ki[0] = 0;
8524         if (mc->mc_db->md_flags & EDB_DUPFIXED) {
8525             mx->mx_db.md_flags = EDB_DUPFIXED;
8526             mx->mx_db.md_pad = fp->mp_pad;
8527             if (mc->mc_db->md_flags & EDB_INTEGERDUP)
8528                 mx->mx_db.md_flags |= EDB_INTEGERKEY;
8529         }
8530     }
8531     DPRINTF(("Sub-db -%u root page %"Yu, mx->mx_cursor.mc_dbi,
8532         mx->mx_db.md_root));
8533     mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA;
8534     if (NEED_CMP_CLONG(mx->mx_dbx.md_cmp, mx->mx_db.md_pad))
8535         mx->mx_dbx.md_cmp = edb_cmp_clong;
8536 }
8537 
8538 
8539 /** Fixup a sorted-dups cursor due to underlying update.
8540  *  Sets up some fields that depend on the data from the main cursor.
8541  *  Almost the same as init1, but skips initialization steps if the
8542  *  xcursor had already been used.
8543  * @param[in] mc The main cursor whose sorted-dups cursor is to be fixed up.
8544  * @param[in] src_mx The xcursor of an up-to-date cursor.
8545  * @param[in] new_dupdata True if converting from a non-#F_DUPDATA item.
8546  */
8547 static void
8548 edb_xcursor_init2(EDB_cursor *mc, EDB_xcursor *src_mx, int new_dupdata)
8549 {
8550     EDB_xcursor *mx = mc->mc_xcursor;
8551 
8552     if (new_dupdata) {
8553         mx->mx_cursor.mc_snum = 1;
8554         mx->mx_cursor.mc_top = 0;
8555         mx->mx_cursor.mc_flags |= C_INITIALIZED;
8556         mx->mx_cursor.mc_ki[0] = 0;
8557         mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA;
8558 #if UINT_MAX < EDB_SIZE_MAX /* matches edb_xcursor_init1:NEED_CMP_CLONG() */
8559         mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp;
8560 #endif
8561     } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) {
8562         return;
8563     }
8564     mx->mx_db = src_mx->mx_db;
8565     mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0];
8566     DPRINTF(("Sub-db -%u root page %"Yu, mx->mx_cursor.mc_dbi,
8567         mx->mx_db.md_root));
8568 }
8569 
8570 /** Initialize a cursor for a given transaction and database. */
8571 static void
8572 edb_cursor_init(EDB_cursor *mc, EDB_txn *txn, EDB_dbi dbi, EDB_xcursor *mx)
8573 {
8574     mc->mc_next = NULL;
8575     mc->mc_backup = NULL;
8576     mc->mc_dbi = dbi;
8577     mc->mc_txn = txn;
8578     mc->mc_db = &txn->mt_dbs[dbi];
8579     mc->mc_dbx = &txn->mt_dbxs[dbi];
8580     mc->mc_dbflag = &txn->mt_dbflags[dbi];
8581     mc->mc_snum = 0;
8582     mc->mc_top = 0;
8583     mc->mc_pg[0] = 0;
8584     mc->mc_ki[0] = 0;
8585     MC_SET_OVPG(mc, NULL);
8586     mc->mc_flags = txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP);
8587     if (txn->mt_dbs[dbi].md_flags & EDB_DUPSORT) {
8588         edb_tassert(txn, mx != NULL);
8589         mc->mc_xcursor = mx;
8590         edb_xcursor_init0(mc);
8591     } else {
8592         mc->mc_xcursor = NULL;
8593     }
8594     if (*mc->mc_dbflag & DB_STALE) {
8595         edb_page_search(mc, NULL, EDB_PS_ROOTONLY);
8596     }
8597 }
8598 
8599 int
8600 edb_cursor_open(EDB_txn *txn, EDB_dbi dbi, EDB_cursor **ret)
8601 {
8602     EDB_cursor  *mc;
8603     size_t size = sizeof(EDB_cursor);
8604 
8605     if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID))
8606         return EINVAL;
8607 
8608     if (txn->mt_flags & EDB_TXN_BLOCKED)
8609         return EDB_BAD_TXN;
8610 
8611     if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, EDB_TXN_RDONLY))
8612         return EINVAL;
8613 
8614     if (txn->mt_dbs[dbi].md_flags & EDB_DUPSORT)
8615         size += sizeof(EDB_xcursor);
8616 
8617     if ((mc = malloc(size)) != NULL) {
8618         edb_cursor_init(mc, txn, dbi, (EDB_xcursor *)(mc + 1));
8619         if (txn->mt_cursors) {
8620             mc->mc_next = txn->mt_cursors[dbi];
8621             txn->mt_cursors[dbi] = mc;
8622             mc->mc_flags |= C_UNTRACK;
8623         }
8624     } else {
8625         return ENOMEM;
8626     }
8627 
8628     *ret = mc;
8629 
8630     return EDB_SUCCESS;
8631 }
8632 
8633 int
8634 edb_cursor_renew(EDB_txn *txn, EDB_cursor *mc)
8635 {
8636     if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))
8637         return EINVAL;
8638 
8639     if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)
8640         return EINVAL;
8641 
8642     if (txn->mt_flags & EDB_TXN_BLOCKED)
8643         return EDB_BAD_TXN;
8644 
8645     edb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor);
8646     return EDB_SUCCESS;
8647 }
8648 
8649 /* Return the count of duplicate data items for the current key */
8650 int
8651 edb_cursor_count(EDB_cursor *mc, edb_size_t *countp)
8652 {
8653     EDB_node    *leaf;
8654 
8655     if (mc == NULL || countp == NULL)
8656         return EINVAL;
8657 
8658     if (mc->mc_xcursor == NULL)
8659         return EDB_INCOMPATIBLE;
8660 
8661     if (mc->mc_txn->mt_flags & EDB_TXN_BLOCKED)
8662         return EDB_BAD_TXN;
8663 
8664     if (!(mc->mc_flags & C_INITIALIZED))
8665         return EINVAL;
8666 
8667     if (!mc->mc_snum)
8668         return EDB_NOTFOUND;
8669 
8670     if (mc->mc_flags & C_EOF) {
8671         if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
8672             return EDB_NOTFOUND;
8673         mc->mc_flags ^= C_EOF;
8674     }
8675 
8676     leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
8677     if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
8678         *countp = 1;
8679     } else {
8680         if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
8681             return EINVAL;
8682 
8683         *countp = mc->mc_xcursor->mx_db.md_entries;
8684     }
8685     return EDB_SUCCESS;
8686 }
8687 
8688 void
8689 edb_cursor_close(EDB_cursor *mc)
8690 {
8691     if (mc) {
8692         EDB_CURSOR_UNREF(mc, 0);
8693     }
8694     if (mc && !mc->mc_backup) {
8695         /* Remove from txn, if tracked.
8696          * A read-only txn (!C_UNTRACK) may have been freed already,
8697          * so do not peek inside it.  Only write txns track cursors.
8698          */
8699         if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) {
8700             EDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
8701             while (*prev && *prev != mc) prev = &(*prev)->mc_next;
8702             if (*prev == mc)
8703                 *prev = mc->mc_next;
8704         }
8705         free(mc);
8706     }
8707 }
8708 
8709 EDB_txn *
8710 edb_cursor_txn(EDB_cursor *mc)
8711 {
8712     if (!mc) return NULL;
8713     return mc->mc_txn;
8714 }
8715 
8716 EDB_dbi
8717 edb_cursor_dbi(EDB_cursor *mc)
8718 {
8719     return mc->mc_dbi;
8720 }
8721 
8722 /** Replace the key for a branch node with a new key.
8723  * Set #EDB_TXN_ERROR on failure.
8724  * @param[in] mc Cursor pointing to the node to operate on.
8725  * @param[in] key The new key to use.
8726  * @return 0 on success, non-zero on failure.
8727  */
8728 static int
8729 edb_update_key(EDB_cursor *mc, EDB_val *key)
8730 {
8731     EDB_page        *mp;
8732     EDB_node        *node;
8733     char            *base;
8734     size_t           len;
8735     int              delta, ksize, oksize;
8736     indx_t           ptr, i, numkeys, indx;
8737     DKBUF;
8738 
8739     indx = mc->mc_ki[mc->mc_top];
8740     mp = mc->mc_pg[mc->mc_top];
8741     node = NODEPTR(mp, indx);
8742     ptr = mp->mp_ptrs[indx];
8743 #if EDB_DEBUG
8744     {
8745         EDB_val k2;
8746         char kbuf2[DKBUF_MAXKEYSIZE*2+1];
8747         k2.mv_data = NODEKEY(node);
8748         k2.mv_size = node->mn_ksize;
8749         DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Yu,
8750             indx, ptr,
8751             edb_dkey(&k2, kbuf2),
8752             DKEY(key),
8753             mp->mp_pgno));
8754     }
8755 #endif
8756 
8757     /* Sizes must be 2-byte aligned. */
8758     ksize = EVEN(key->mv_size);
8759     oksize = EVEN(node->mn_ksize);
8760     delta = ksize - oksize;
8761 
8762     /* Shift node contents if EVEN(key length) changed. */
8763     if (delta) {
8764         if (delta > 0 && SIZELEFT(mp) < delta) {
8765             pgno_t pgno;
8766             /* not enough space left, do a delete and split */
8767             DPRINTF(("Not enough room, delta = %d, splitting...", delta));
8768             pgno = NODEPGNO(node);
8769             edb_node_del(mc, 0);
8770             return edb_page_split(mc, key, NULL, pgno, EDB_SPLIT_REPLACE);
8771         }
8772 
8773         numkeys = NUMKEYS(mp);
8774         for (i = 0; i < numkeys; i++) {
8775             if (mp->mp_ptrs[i] <= ptr)
8776                 mp->mp_ptrs[i] -= delta;
8777         }
8778 
8779         base = (char *)mp + mp->mp_upper + PAGEBASE;
8780         len = ptr - mp->mp_upper + NODESIZE;
8781         memmove(base - delta, base, len);
8782         mp->mp_upper -= delta;
8783 
8784         node = NODEPTR(mp, indx);
8785     }
8786 
8787     /* But even if no shift was needed, update ksize */
8788     if (node->mn_ksize != key->mv_size)
8789         node->mn_ksize = key->mv_size;
8790 
8791     if (key->mv_size)
8792         memcpy(NODEKEY(node), key->mv_data, key->mv_size);
8793 
8794     return EDB_SUCCESS;
8795 }
8796 
8797 static void
8798 edb_cursor_copy(const EDB_cursor *csrc, EDB_cursor *cdst);
8799 
8800 /** Perform \b act while tracking temporary cursor \b mn */
8801 #define WITH_CURSOR_TRACKING(mn, act) do { \
8802     EDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \
8803     if ((mn).mc_flags & C_SUB) { \
8804         dummy.mc_flags =  C_INITIALIZED; \
8805         dummy.mc_xcursor = (EDB_xcursor *)&(mn);    \
8806         tracked = &dummy; \
8807     } else { \
8808         tracked = &(mn); \
8809     } \
8810     tracked->mc_next = *tp; \
8811     *tp = tracked; \
8812     { act; } \
8813     *tp = tracked->mc_next; \
8814 } while (0)
8815 
8816 /** Move a node from csrc to cdst.
8817  */
8818 static int
8819 edb_node_move(EDB_cursor *csrc, EDB_cursor *cdst, int fromleft)
8820 {
8821     EDB_node        *srcnode;
8822     EDB_val      key, data;
8823     pgno_t  srcpg;
8824     EDB_cursor mn;
8825     int          rc;
8826     unsigned short flags;
8827 
8828     DKBUF;
8829 
8830     /* Mark src and dst as dirty. */
8831     if ((rc = edb_page_touch(csrc)) ||
8832         (rc = edb_page_touch(cdst)))
8833         return rc;
8834 
8835     if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
8836         key.mv_size = csrc->mc_db->md_pad;
8837         key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
8838         data.mv_size = 0;
8839         data.mv_data = NULL;
8840         srcpg = 0;
8841         flags = 0;
8842     } else {
8843         srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
8844         edb_cassert(csrc, !((size_t)srcnode & 1));
8845         srcpg = NODEPGNO(srcnode);
8846         flags = srcnode->mn_flags;
8847         if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
8848             unsigned int snum = csrc->mc_snum;
8849             EDB_node *s2;
8850             /* must find the lowest key below src */
8851             rc = edb_page_search_lowest(csrc);
8852             if (rc)
8853                 return rc;
8854             if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
8855                 key.mv_size = csrc->mc_db->md_pad;
8856                 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
8857             } else {
8858                 s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
8859                 key.mv_size = NODEKSZ(s2);
8860                 key.mv_data = NODEKEY(s2);
8861             }
8862             csrc->mc_snum = snum--;
8863             csrc->mc_top = snum;
8864         } else {
8865             key.mv_size = NODEKSZ(srcnode);
8866             key.mv_data = NODEKEY(srcnode);
8867         }
8868         data.mv_size = NODEDSZ(srcnode);
8869         data.mv_data = NODEDATA(srcnode);
8870     }
8871     mn.mc_xcursor = NULL;
8872     if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
8873         unsigned int snum = cdst->mc_snum;
8874         EDB_node *s2;
8875         EDB_val bkey;
8876         /* must find the lowest key below dst */
8877         edb_cursor_copy(cdst, &mn);
8878         rc = edb_page_search_lowest(&mn);
8879         if (rc)
8880             return rc;
8881         if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
8882             bkey.mv_size = mn.mc_db->md_pad;
8883             bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
8884         } else {
8885             s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
8886             bkey.mv_size = NODEKSZ(s2);
8887             bkey.mv_data = NODEKEY(s2);
8888         }
8889         mn.mc_snum = snum--;
8890         mn.mc_top = snum;
8891         mn.mc_ki[snum] = 0;
8892         rc = edb_update_key(&mn, &bkey);
8893         if (rc)
8894             return rc;
8895     }
8896 
8897     DPRINTF(("moving %s node %u [%s] on page %"Yu" to node %u on page %"Yu,
8898         IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
8899         csrc->mc_ki[csrc->mc_top],
8900         DKEY(&key),
8901         csrc->mc_pg[csrc->mc_top]->mp_pgno,
8902         cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));
8903 
8904     /* Add the node to the destination page.
8905      */
8906     rc = edb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
8907     if (rc != EDB_SUCCESS)
8908         return rc;
8909 
8910     /* Delete the node from the source page.
8911      */
8912     edb_node_del(csrc, key.mv_size);
8913 
8914     {
8915         /* Adjust other cursors pointing to mp */
8916         EDB_cursor *m2, *m3;
8917         EDB_dbi dbi = csrc->mc_dbi;
8918         EDB_page *mpd, *mps;
8919 
8920         mps = csrc->mc_pg[csrc->mc_top];
8921         /* If we're adding on the left, bump others up */
8922         if (fromleft) {
8923             mpd = cdst->mc_pg[csrc->mc_top];
8924             for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8925                 if (csrc->mc_flags & C_SUB)
8926                     m3 = &m2->mc_xcursor->mx_cursor;
8927                 else
8928                     m3 = m2;
8929                 if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
8930                     continue;
8931                 if (m3 != cdst &&
8932                     m3->mc_pg[csrc->mc_top] == mpd &&
8933                     m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
8934                     m3->mc_ki[csrc->mc_top]++;
8935                 }
8936                 if (m3 !=csrc &&
8937                     m3->mc_pg[csrc->mc_top] == mps &&
8938                     m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) {
8939                     m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
8940                     m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
8941                     m3->mc_ki[csrc->mc_top-1]++;
8942                 }
8943                 if (IS_LEAF(mps))
8944                     XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
8945             }
8946         } else
8947         /* Adding on the right, bump others down */
8948         {
8949             for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
8950                 if (csrc->mc_flags & C_SUB)
8951                     m3 = &m2->mc_xcursor->mx_cursor;
8952                 else
8953                     m3 = m2;
8954                 if (m3 == csrc) continue;
8955                 if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
8956                     continue;
8957                 if (m3->mc_pg[csrc->mc_top] == mps) {
8958                     if (!m3->mc_ki[csrc->mc_top]) {
8959                         m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
8960                         m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
8961                         m3->mc_ki[csrc->mc_top-1]--;
8962                     } else {
8963                         m3->mc_ki[csrc->mc_top]--;
8964                     }
8965                     if (IS_LEAF(mps))
8966                         XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
8967                 }
8968             }
8969         }
8970     }
8971 
8972     /* Update the parent separators.
8973      */
8974     if (csrc->mc_ki[csrc->mc_top] == 0) {
8975         if (csrc->mc_ki[csrc->mc_top-1] != 0) {
8976             if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
8977                 key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
8978             } else {
8979                 srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
8980                 key.mv_size = NODEKSZ(srcnode);
8981                 key.mv_data = NODEKEY(srcnode);
8982             }
8983             DPRINTF(("update separator for source page %"Yu" to [%s]",
8984                 csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
8985             edb_cursor_copy(csrc, &mn);
8986             mn.mc_snum--;
8987             mn.mc_top--;
8988             /* We want edb_rebalance to find mn when doing fixups */
8989             WITH_CURSOR_TRACKING(mn,
8990                 rc = edb_update_key(&mn, &key));
8991             if (rc)
8992                 return rc;
8993         }
8994         if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
8995             EDB_val  nullkey;
8996             indx_t  ix = csrc->mc_ki[csrc->mc_top];
8997             nullkey.mv_size = 0;
8998             csrc->mc_ki[csrc->mc_top] = 0;
8999             rc = edb_update_key(csrc, &nullkey);
9000             csrc->mc_ki[csrc->mc_top] = ix;
9001             edb_cassert(csrc, rc == EDB_SUCCESS);
9002         }
9003     }
9004 
9005     if (cdst->mc_ki[cdst->mc_top] == 0) {
9006         if (cdst->mc_ki[cdst->mc_top-1] != 0) {
9007             if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
9008                 key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
9009             } else {
9010                 srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
9011                 key.mv_size = NODEKSZ(srcnode);
9012                 key.mv_data = NODEKEY(srcnode);
9013             }
9014             DPRINTF(("update separator for destination page %"Yu" to [%s]",
9015                 cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
9016             edb_cursor_copy(cdst, &mn);
9017             mn.mc_snum--;
9018             mn.mc_top--;
9019             /* We want edb_rebalance to find mn when doing fixups */
9020             WITH_CURSOR_TRACKING(mn,
9021                 rc = edb_update_key(&mn, &key));
9022             if (rc)
9023                 return rc;
9024         }
9025         if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
9026             EDB_val  nullkey;
9027             indx_t  ix = cdst->mc_ki[cdst->mc_top];
9028             nullkey.mv_size = 0;
9029             cdst->mc_ki[cdst->mc_top] = 0;
9030             rc = edb_update_key(cdst, &nullkey);
9031             cdst->mc_ki[cdst->mc_top] = ix;
9032             edb_cassert(cdst, rc == EDB_SUCCESS);
9033         }
9034     }
9035 
9036     return EDB_SUCCESS;
9037 }
9038 
9039 /** Merge one page into another.
9040  *  The nodes from the page pointed to by \b csrc will
9041  *  be copied to the page pointed to by \b cdst and then
9042  *  the \b csrc page will be freed.
9043  * @param[in] csrc Cursor pointing to the source page.
9044  * @param[in] cdst Cursor pointing to the destination page.
9045  * @return 0 on success, non-zero on failure.
9046  */
9047 static int
9048 edb_page_merge(EDB_cursor *csrc, EDB_cursor *cdst)
9049 {
9050     EDB_page    *psrc, *pdst;
9051     EDB_node    *srcnode;
9052     EDB_val      key, data;
9053     unsigned     nkeys;
9054     int          rc;
9055     indx_t       i, j;
9056 
9057     psrc = csrc->mc_pg[csrc->mc_top];
9058     pdst = cdst->mc_pg[cdst->mc_top];
9059 
9060     DPRINTF(("merging page %"Yu" into %"Yu, psrc->mp_pgno, pdst->mp_pgno));
9061 
9062     edb_cassert(csrc, csrc->mc_snum > 1);   /* can't merge root page */
9063     edb_cassert(csrc, cdst->mc_snum > 1);
9064 
9065     /* Mark dst as dirty. */
9066     if ((rc = edb_page_touch(cdst)))
9067         return rc;
9068 
9069     /* get dst page again now that we've touched it. */
9070     pdst = cdst->mc_pg[cdst->mc_top];
9071 
9072     /* Move all nodes from src to dst.
9073      */
9074     j = nkeys = NUMKEYS(pdst);
9075     if (IS_LEAF2(psrc)) {
9076         key.mv_size = csrc->mc_db->md_pad;
9077         key.mv_data = METADATA(psrc);
9078         for (i = 0; i < NUMKEYS(psrc); i++, j++) {
9079             rc = edb_node_add(cdst, j, &key, NULL, 0, 0);
9080             if (rc != EDB_SUCCESS)
9081                 return rc;
9082             key.mv_data = (char *)key.mv_data + key.mv_size;
9083         }
9084     } else {
9085         for (i = 0; i < NUMKEYS(psrc); i++, j++) {
9086             srcnode = NODEPTR(psrc, i);
9087             if (i == 0 && IS_BRANCH(psrc)) {
9088                 EDB_cursor mn;
9089                 EDB_node *s2;
9090                 edb_cursor_copy(csrc, &mn);
9091                 mn.mc_xcursor = NULL;
9092                 /* must find the lowest key below src */
9093                 rc = edb_page_search_lowest(&mn);
9094                 if (rc)
9095                     return rc;
9096                 if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
9097                     key.mv_size = mn.mc_db->md_pad;
9098                     key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size);
9099                 } else {
9100                     s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
9101                     key.mv_size = NODEKSZ(s2);
9102                     key.mv_data = NODEKEY(s2);
9103                 }
9104             } else {
9105                 key.mv_size = srcnode->mn_ksize;
9106                 key.mv_data = NODEKEY(srcnode);
9107             }
9108 
9109             data.mv_size = NODEDSZ(srcnode);
9110             data.mv_data = NODEDATA(srcnode);
9111             rc = edb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
9112             if (rc != EDB_SUCCESS)
9113                 return rc;
9114         }
9115     }
9116 
9117     DPRINTF(("dst page %"Yu" now has %u keys (%.1f%% filled)",
9118         pdst->mp_pgno, NUMKEYS(pdst),
9119         (float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10));
9120 
9121     /* Unlink the src page from parent and add to free list.
9122      */
9123     csrc->mc_top--;
9124     edb_node_del(csrc, 0);
9125     if (csrc->mc_ki[csrc->mc_top] == 0) {
9126         key.mv_size = 0;
9127         rc = edb_update_key(csrc, &key);
9128         if (rc) {
9129             csrc->mc_top++;
9130             return rc;
9131         }
9132     }
9133     csrc->mc_top++;
9134 
9135     psrc = csrc->mc_pg[csrc->mc_top];
9136     /* If not operating on FreeDB, allow this page to be reused
9137      * in this txn. Otherwise just add to free list.
9138      */
9139     rc = edb_page_loose(csrc, psrc);
9140     if (rc)
9141         return rc;
9142     if (IS_LEAF(psrc))
9143         csrc->mc_db->md_leaf_pages--;
9144     else
9145         csrc->mc_db->md_branch_pages--;
9146     {
9147         /* Adjust other cursors pointing to mp */
9148         EDB_cursor *m2, *m3;
9149         EDB_dbi dbi = csrc->mc_dbi;
9150         unsigned int top = csrc->mc_top;
9151 
9152         for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
9153             if (csrc->mc_flags & C_SUB)
9154                 m3 = &m2->mc_xcursor->mx_cursor;
9155             else
9156                 m3 = m2;
9157             if (m3 == csrc) continue;
9158             if (m3->mc_snum < csrc->mc_snum) continue;
9159             if (m3->mc_pg[top] == psrc) {
9160                 m3->mc_pg[top] = pdst;
9161                 m3->mc_ki[top] += nkeys;
9162                 m3->mc_ki[top-1] = cdst->mc_ki[top-1];
9163             } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] &&
9164                 m3->mc_ki[top-1] > csrc->mc_ki[top-1]) {
9165                 m3->mc_ki[top-1]--;
9166             }
9167             if (IS_LEAF(psrc))
9168                 XCURSOR_REFRESH(m3, top, m3->mc_pg[top]);
9169         }
9170     }
9171     {
9172         unsigned int snum = cdst->mc_snum;
9173         uint16_t depth = cdst->mc_db->md_depth;
9174         edb_cursor_pop(cdst);
9175         rc = edb_rebalance(cdst);
9176         /* Did the tree height change? */
9177         if (depth != cdst->mc_db->md_depth)
9178             snum += cdst->mc_db->md_depth - depth;
9179         cdst->mc_snum = snum;
9180         cdst->mc_top = snum-1;
9181     }
9182     return rc;
9183 }
9184 
9185 /** Copy the contents of a cursor.
9186  * @param[in] csrc The cursor to copy from.
9187  * @param[out] cdst The cursor to copy to.
9188  */
9189 static void
9190 edb_cursor_copy(const EDB_cursor *csrc, EDB_cursor *cdst)
9191 {
9192     unsigned int i;
9193 
9194     cdst->mc_txn = csrc->mc_txn;
9195     cdst->mc_dbi = csrc->mc_dbi;
9196     cdst->mc_db  = csrc->mc_db;
9197     cdst->mc_dbx = csrc->mc_dbx;
9198     cdst->mc_snum = csrc->mc_snum;
9199     cdst->mc_top = csrc->mc_top;
9200     cdst->mc_flags = csrc->mc_flags;
9201     MC_SET_OVPG(cdst, MC_OVPG(csrc));
9202 
9203     for (i=0; i<csrc->mc_snum; i++) {
9204         cdst->mc_pg[i] = csrc->mc_pg[i];
9205         cdst->mc_ki[i] = csrc->mc_ki[i];
9206     }
9207 }
9208 
9209 /** Rebalance the tree after a delete operation.
9210  * @param[in] mc Cursor pointing to the page where rebalancing
9211  * should begin.
9212  * @return 0 on success, non-zero on failure.
9213  */
9214 static int
9215 edb_rebalance(EDB_cursor *mc)
9216 {
9217     EDB_node    *node;
9218     int rc, fromleft;
9219     unsigned int ptop, minkeys, thresh;
9220     EDB_cursor  mn;
9221     indx_t oldki;
9222 
9223     if (IS_BRANCH(mc->mc_pg[mc->mc_top])) {
9224         minkeys = 2;
9225         thresh = 1;
9226     } else {
9227         minkeys = 1;
9228         thresh = FILL_THRESHOLD;
9229     }
9230     DPRINTF(("rebalancing %s page %"Yu" (has %u keys, %.1f%% full)",
9231         IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
9232         edb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]),
9233         (float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10));
9234 
9235     if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh &&
9236         NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
9237         DPRINTF(("no need to rebalance page %"Yu", above fill threshold",
9238             edb_dbg_pgno(mc->mc_pg[mc->mc_top])));
9239         return EDB_SUCCESS;
9240     }
9241 
9242     if (mc->mc_snum < 2) {
9243         EDB_page *mp = mc->mc_pg[0];
9244         if (IS_SUBP(mp)) {
9245             DPUTS("Can't rebalance a subpage, ignoring");
9246             return EDB_SUCCESS;
9247         }
9248         if (NUMKEYS(mp) == 0) {
9249             DPUTS("tree is completely empty");
9250             mc->mc_db->md_root = P_INVALID;
9251             mc->mc_db->md_depth = 0;
9252             mc->mc_db->md_leaf_pages = 0;
9253             rc = edb_eidl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
9254             if (rc)
9255                 return rc;
9256             /* Adjust cursors pointing to mp */
9257             mc->mc_snum = 0;
9258             mc->mc_top = 0;
9259             mc->mc_flags &= ~C_INITIALIZED;
9260             {
9261                 EDB_cursor *m2, *m3;
9262                 EDB_dbi dbi = mc->mc_dbi;
9263 
9264                 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
9265                     if (mc->mc_flags & C_SUB)
9266                         m3 = &m2->mc_xcursor->mx_cursor;
9267                     else
9268                         m3 = m2;
9269                     if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum))
9270                         continue;
9271                     if (m3->mc_pg[0] == mp) {
9272                         m3->mc_snum = 0;
9273                         m3->mc_top = 0;
9274                         m3->mc_flags &= ~C_INITIALIZED;
9275                     }
9276                 }
9277             }
9278         } else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
9279             int i;
9280             DPUTS("collapsing root page!");
9281             rc = edb_eidl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
9282             if (rc)
9283                 return rc;
9284             mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
9285             rc = edb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL);
9286             if (rc)
9287                 return rc;
9288             mc->mc_db->md_depth--;
9289             mc->mc_db->md_branch_pages--;
9290             mc->mc_ki[0] = mc->mc_ki[1];
9291             for (i = 1; i<mc->mc_db->md_depth; i++) {
9292                 mc->mc_pg[i] = mc->mc_pg[i+1];
9293                 mc->mc_ki[i] = mc->mc_ki[i+1];
9294             }
9295             {
9296                 /* Adjust other cursors pointing to mp */
9297                 EDB_cursor *m2, *m3;
9298                 EDB_dbi dbi = mc->mc_dbi;
9299 
9300                 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
9301                     if (mc->mc_flags & C_SUB)
9302                         m3 = &m2->mc_xcursor->mx_cursor;
9303                     else
9304                         m3 = m2;
9305                     if (m3 == mc) continue;
9306                     if (!(m3->mc_flags & C_INITIALIZED))
9307                         continue;
9308                     if (m3->mc_pg[0] == mp) {
9309                         for (i=0; i<mc->mc_db->md_depth; i++) {
9310                             m3->mc_pg[i] = m3->mc_pg[i+1];
9311                             m3->mc_ki[i] = m3->mc_ki[i+1];
9312                         }
9313                         m3->mc_snum--;
9314                         m3->mc_top--;
9315                     }
9316                 }
9317             }
9318         } else
9319             DPUTS("root page doesn't need rebalancing");
9320         return EDB_SUCCESS;
9321     }
9322 
9323     /* The parent (branch page) must have at least 2 pointers,
9324      * otherwise the tree is invalid.
9325      */
9326     ptop = mc->mc_top-1;
9327     edb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1);
9328 
9329     /* Leaf page fill factor is below the threshold.
9330      * Try to move keys from left or right neighbor, or
9331      * merge with a neighbor page.
9332      */
9333 
9334     /* Find neighbors.
9335      */
9336     edb_cursor_copy(mc, &mn);
9337     mn.mc_xcursor = NULL;
9338 
9339     oldki = mc->mc_ki[mc->mc_top];
9340     if (mc->mc_ki[ptop] == 0) {
9341         /* We're the leftmost leaf in our parent.
9342          */
9343         DPUTS("reading right neighbor");
9344         mn.mc_ki[ptop]++;
9345         node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
9346         rc = edb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
9347         if (rc)
9348             return rc;
9349         mn.mc_ki[mn.mc_top] = 0;
9350         mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
9351         fromleft = 0;
9352     } else {
9353         /* There is at least one neighbor to the left.
9354          */
9355         DPUTS("reading left neighbor");
9356         mn.mc_ki[ptop]--;
9357         node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
9358         rc = edb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
9359         if (rc)
9360             return rc;
9361         mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
9362         mc->mc_ki[mc->mc_top] = 0;
9363         fromleft = 1;
9364     }
9365 
9366     DPRINTF(("found neighbor page %"Yu" (%u keys, %.1f%% full)",
9367         mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
9368         (float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10));
9369 
9370     /* If the neighbor page is above threshold and has enough keys,
9371      * move one key from it. Otherwise we should try to merge them.
9372      * (A branch page must never have less than 2 keys.)
9373      */
9374     if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
9375         rc = edb_node_move(&mn, mc, fromleft);
9376         if (fromleft) {
9377             /* if we inserted on left, bump position up */
9378             oldki++;
9379         }
9380     } else {
9381         if (!fromleft) {
9382             rc = edb_page_merge(&mn, mc);
9383         } else {
9384             oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
9385             mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
9386             /* We want edb_rebalance to find mn when doing fixups */
9387             WITH_CURSOR_TRACKING(mn,
9388                 rc = edb_page_merge(mc, &mn));
9389             edb_cursor_copy(&mn, mc);
9390         }
9391         mc->mc_flags &= ~C_EOF;
9392     }
9393     mc->mc_ki[mc->mc_top] = oldki;
9394     return rc;
9395 }
9396 
9397 /** Complete a delete operation started by #edb_cursor_del(). */
9398 static int
9399 edb_cursor_del0(EDB_cursor *mc)
9400 {
9401     int rc;
9402     EDB_page *mp;
9403     indx_t ki;
9404     unsigned int nkeys;
9405     EDB_cursor *m2, *m3;
9406     EDB_dbi dbi = mc->mc_dbi;
9407 
9408     ki = mc->mc_ki[mc->mc_top];
9409     mp = mc->mc_pg[mc->mc_top];
9410     edb_node_del(mc, mc->mc_db->md_pad);
9411     mc->mc_db->md_entries--;
9412     {
9413         /* Adjust other cursors pointing to mp */
9414         for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
9415             m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
9416             if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
9417                 continue;
9418             if (m3 == mc || m3->mc_snum < mc->mc_snum)
9419                 continue;
9420             if (m3->mc_pg[mc->mc_top] == mp) {
9421                 if (m3->mc_ki[mc->mc_top] == ki) {
9422                     m3->mc_flags |= C_DEL;
9423                     if (mc->mc_db->md_flags & EDB_DUPSORT) {
9424                         /* Sub-cursor referred into dataset which is gone */
9425                         m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
9426                     }
9427                     continue;
9428                 } else if (m3->mc_ki[mc->mc_top] > ki) {
9429                     m3->mc_ki[mc->mc_top]--;
9430                 }
9431                 XCURSOR_REFRESH(m3, mc->mc_top, mp);
9432             }
9433         }
9434     }
9435     rc = edb_rebalance(mc);
9436 
9437     if (rc == EDB_SUCCESS) {
9438         /* DB is totally empty now, just bail out.
9439          * Other cursors adjustments were already done
9440          * by edb_rebalance and aren't needed here.
9441          */
9442         if (!mc->mc_snum)
9443             return rc;
9444 
9445         mp = mc->mc_pg[mc->mc_top];
9446         nkeys = NUMKEYS(mp);
9447 
9448         /* Adjust other cursors pointing to mp */
9449         for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) {
9450             m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
9451             if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
9452                 continue;
9453             if (m3->mc_snum < mc->mc_snum)
9454                 continue;
9455             if (m3->mc_pg[mc->mc_top] == mp) {
9456                 /* if m3 points past last node in page, find next sibling */
9457                 if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) {
9458                     if (m3->mc_ki[mc->mc_top] >= nkeys) {
9459                         rc = edb_cursor_sibling(m3, 1);
9460                         if (rc == EDB_NOTFOUND) {
9461                             m3->mc_flags |= C_EOF;
9462                             rc = EDB_SUCCESS;
9463                             continue;
9464                         }
9465                     }
9466                     if (mc->mc_db->md_flags & EDB_DUPSORT) {
9467                         EDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
9468                         /* If this node has dupdata, it may need to be reinited
9469                          * because its data has moved.
9470                          * If the xcursor was not initd it must be reinited.
9471                          * Else if node points to a subDB, nothing is needed.
9472                          * Else (xcursor was initd, not a subDB) needs mc_pg[0] reset.
9473                          */
9474                         if (node->mn_flags & F_DUPDATA) {
9475                             if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
9476                                 if (!(node->mn_flags & F_SUBDATA))
9477                                     m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
9478                             } else {
9479                                 edb_xcursor_init1(m3, node);
9480                                 m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL;
9481                             }
9482                         }
9483                     }
9484                 }
9485             }
9486         }
9487         mc->mc_flags |= C_DEL;
9488     }
9489 
9490     if (rc)
9491         mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
9492     return rc;
9493 }
9494 
9495 int
9496 edb_del(EDB_txn *txn, EDB_dbi dbi,
9497     EDB_val *key, EDB_val *data)
9498 {
9499     if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
9500         return EINVAL;
9501 
9502     if (txn->mt_flags & (EDB_TXN_RDONLY|EDB_TXN_BLOCKED))
9503         return (txn->mt_flags & EDB_TXN_RDONLY) ? EACCES : EDB_BAD_TXN;
9504 
9505     if (!F_ISSET(txn->mt_dbs[dbi].md_flags, EDB_DUPSORT)) {
9506         /* must ignore any data */
9507         data = NULL;
9508     }
9509 
9510     return edb_del0(txn, dbi, key, data, 0);
9511 }
9512 
9513 static int
9514 edb_del0(EDB_txn *txn, EDB_dbi dbi,
9515     EDB_val *key, EDB_val *data, unsigned flags)
9516 {
9517     EDB_cursor mc;
9518     EDB_xcursor mx;
9519     EDB_cursor_op op;
9520     EDB_val rdata, *xdata;
9521     int      rc, exact = 0;
9522     DKBUF;
9523 
9524     DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key)));
9525 
9526     edb_cursor_init(&mc, txn, dbi, &mx);
9527 
9528     if (data) {
9529         op = EDB_GET_BOTH;
9530         rdata = *data;
9531         xdata = &rdata;
9532     } else {
9533         op = EDB_SET;
9534         xdata = NULL;
9535         flags |= EDB_NODUPDATA;
9536     }
9537     rc = edb_cursor_set(&mc, key, xdata, op, &exact);
9538     if (rc == 0) {
9539         /* let edb_page_split know about this cursor if needed:
9540          * delete will trigger a rebalance; if it needs to move
9541          * a node from one page to another, it will have to
9542          * update the parent's separator key(s). If the new sepkey
9543          * is larger than the current one, the parent page may
9544          * run out of space, triggering a split. We need this
9545          * cursor to be consistent until the end of the rebalance.
9546          */
9547         mc.mc_next = txn->mt_cursors[dbi];
9548         txn->mt_cursors[dbi] = &mc;
9549         rc = edb_cursor_del(&mc, flags);
9550         txn->mt_cursors[dbi] = mc.mc_next;
9551     }
9552     return rc;
9553 }
9554 
9555 /** Split a page and insert a new node.
9556  * Set #EDB_TXN_ERROR on failure.
9557  * @param[in,out] mc Cursor pointing to the page and desired insertion index.
9558  * The cursor will be updated to point to the actual page and index where
9559  * the node got inserted after the split.
9560  * @param[in] newkey The key for the newly inserted node.
9561  * @param[in] newdata The data for the newly inserted node.
9562  * @param[in] newpgno The page number, if the new node is a branch node.
9563  * @param[in] nflags The #NODE_ADD_FLAGS for the new node.
9564  * @return 0 on success, non-zero on failure.
9565  */
9566 static int
9567 edb_page_split(EDB_cursor *mc, EDB_val *newkey, EDB_val *newdata, pgno_t newpgno,
9568     unsigned int nflags)
9569 {
9570     unsigned int flags;
9571     int      rc = EDB_SUCCESS, new_root = 0, did_split = 0;
9572     indx_t       newindx;
9573     pgno_t       pgno = 0;
9574     int  i, j, split_indx, nkeys, pmax;
9575     EDB_env     *env = mc->mc_txn->mt_env;
9576     EDB_node    *node;
9577     EDB_val  sepkey, rkey, xdata, *rdata = &xdata;
9578     EDB_page    *copy = NULL;
9579     EDB_page    *mp, *rp, *pp;
9580     int ptop;
9581     EDB_cursor  mn;
9582     DKBUF;
9583 
9584     mp = mc->mc_pg[mc->mc_top];
9585     newindx = mc->mc_ki[mc->mc_top];
9586     nkeys = NUMKEYS(mp);
9587 
9588     DPRINTF(("-----> splitting %s page %"Yu" and adding [%s] at index %i/%i",
9589         IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
9590         DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
9591 
9592     /* Create a right sibling. */
9593     if ((rc = edb_page_new(mc, mp->mp_flags, 1, &rp)))
9594         return rc;
9595     rp->mp_pad = mp->mp_pad;
9596     DPRINTF(("new right sibling: page %"Yu, rp->mp_pgno));
9597 
9598     /* Usually when splitting the root page, the cursor
9599      * height is 1. But when called from edb_update_key,
9600      * the cursor height may be greater because it walks
9601      * up the stack while finding the branch slot to update.
9602      */
9603     if (mc->mc_top < 1) {
9604         if ((rc = edb_page_new(mc, P_BRANCH, 1, &pp)))
9605             goto done;
9606         /* shift current top to make room for new parent */
9607         for (i=mc->mc_snum; i>0; i--) {
9608             mc->mc_pg[i] = mc->mc_pg[i-1];
9609             mc->mc_ki[i] = mc->mc_ki[i-1];
9610         }
9611         mc->mc_pg[0] = pp;
9612         mc->mc_ki[0] = 0;
9613         mc->mc_db->md_root = pp->mp_pgno;
9614         DPRINTF(("root split! new root = %"Yu, pp->mp_pgno));
9615         new_root = mc->mc_db->md_depth++;
9616 
9617         /* Add left (implicit) pointer. */
9618         if ((rc = edb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != EDB_SUCCESS) {
9619             /* undo the pre-push */
9620             mc->mc_pg[0] = mc->mc_pg[1];
9621             mc->mc_ki[0] = mc->mc_ki[1];
9622             mc->mc_db->md_root = mp->mp_pgno;
9623             mc->mc_db->md_depth--;
9624             goto done;
9625         }
9626         mc->mc_snum++;
9627         mc->mc_top++;
9628         ptop = 0;
9629     } else {
9630         ptop = mc->mc_top-1;
9631         DPRINTF(("parent branch page is %"Yu, mc->mc_pg[ptop]->mp_pgno));
9632     }
9633 
9634     edb_cursor_copy(mc, &mn);
9635     mn.mc_xcursor = NULL;
9636     mn.mc_pg[mn.mc_top] = rp;
9637     mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
9638 
9639     if (nflags & EDB_APPEND) {
9640         mn.mc_ki[mn.mc_top] = 0;
9641         sepkey = *newkey;
9642         split_indx = newindx;
9643         nkeys = 0;
9644     } else {
9645 
9646         split_indx = (nkeys+1) / 2;
9647 
9648         if (IS_LEAF2(rp)) {
9649             char *split, *ins;
9650             int x;
9651             unsigned int lsize, rsize, ksize;
9652             /* Move half of the keys to the right sibling */
9653             x = mc->mc_ki[mc->mc_top] - split_indx;
9654             ksize = mc->mc_db->md_pad;
9655             split = LEAF2KEY(mp, split_indx, ksize);
9656             rsize = (nkeys - split_indx) * ksize;
9657             lsize = (nkeys - split_indx) * sizeof(indx_t);
9658             mp->mp_lower -= lsize;
9659             rp->mp_lower += lsize;
9660             mp->mp_upper += rsize - lsize;
9661             rp->mp_upper -= rsize - lsize;
9662             sepkey.mv_size = ksize;
9663             if (newindx == split_indx) {
9664                 sepkey.mv_data = newkey->mv_data;
9665             } else {
9666                 sepkey.mv_data = split;
9667             }
9668             if (x<0) {
9669                 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
9670                 memcpy(rp->mp_ptrs, split, rsize);
9671                 sepkey.mv_data = rp->mp_ptrs;
9672                 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
9673                 memcpy(ins, newkey->mv_data, ksize);
9674                 mp->mp_lower += sizeof(indx_t);
9675                 mp->mp_upper -= ksize - sizeof(indx_t);
9676             } else {
9677                 if (x)
9678                     memcpy(rp->mp_ptrs, split, x * ksize);
9679                 ins = LEAF2KEY(rp, x, ksize);
9680                 memcpy(ins, newkey->mv_data, ksize);
9681                 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
9682                 rp->mp_lower += sizeof(indx_t);
9683                 rp->mp_upper -= ksize - sizeof(indx_t);
9684                 mc->mc_ki[mc->mc_top] = x;
9685             }
9686         } else {
9687             int psize, nsize, k;
9688             /* Maximum free space in an empty page */
9689             pmax = env->me_psize - PAGEHDRSZ;
9690             if (IS_LEAF(mp))
9691                 nsize = edb_leaf_size(env, newkey, newdata);
9692             else
9693                 nsize = edb_branch_size(env, newkey);
9694             nsize = EVEN(nsize);
9695 
9696             /* grab a page to hold a temporary copy */
9697             copy = edb_page_malloc(mc->mc_txn, 1);
9698             if (copy == NULL) {
9699                 rc = ENOMEM;
9700                 goto done;
9701             }
9702             copy->mp_pgno  = mp->mp_pgno;
9703             copy->mp_flags = mp->mp_flags;
9704             copy->mp_lower = (PAGEHDRSZ-PAGEBASE);
9705             copy->mp_upper = env->me_psize - PAGEBASE;
9706 
9707             /* prepare to insert */
9708             for (i=0, j=0; i<nkeys; i++) {
9709                 if (i == newindx) {
9710                     copy->mp_ptrs[j++] = 0;
9711                 }
9712                 copy->mp_ptrs[j++] = mp->mp_ptrs[i];
9713             }
9714 
9715             /* When items are relatively large the split point needs
9716              * to be checked, because being off-by-one will make the
9717              * difference between success or failure in edb_node_add.
9718              *
9719              * It's also relevant if a page happens to be laid out
9720              * such that one half of its nodes are all "small" and
9721              * the other half of its nodes are "large." If the new
9722              * item is also "large" and falls on the half with
9723              * "large" nodes, it also may not fit.
9724              *
9725              * As a final tweak, if the new item goes on the last
9726              * spot on the page (and thus, onto the new page), bias
9727              * the split so the new page is emptier than the old page.
9728              * This yields better packing during sequential inserts.
9729              */
9730             if (nkeys < 32 || nsize > pmax/16 || newindx >= nkeys) {
9731                 /* Find split point */
9732                 psize = 0;
9733                 if (newindx <= split_indx || newindx >= nkeys) {
9734                     i = 0; j = 1;
9735                     k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp);
9736                 } else {
9737                     i = nkeys; j = -1;
9738                     k = split_indx-1;
9739                 }
9740                 for (; i!=k; i+=j) {
9741                     if (i == newindx) {
9742                         psize += nsize;
9743                         node = NULL;
9744                     } else {
9745                         node = (EDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
9746                         psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
9747                         if (IS_LEAF(mp)) {
9748                             if (F_ISSET(node->mn_flags, F_BIGDATA))
9749                                 psize += sizeof(pgno_t);
9750                             else
9751                                 psize += NODEDSZ(node);
9752                         }
9753                         psize = EVEN(psize);
9754                     }
9755                     if (psize > pmax || i == k-j) {
9756                         split_indx = i + (j<0);
9757                         break;
9758                     }
9759                 }
9760             }
9761             if (split_indx == newindx) {
9762                 sepkey.mv_size = newkey->mv_size;
9763                 sepkey.mv_data = newkey->mv_data;
9764             } else {
9765                 node = (EDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
9766                 sepkey.mv_size = node->mn_ksize;
9767                 sepkey.mv_data = NODEKEY(node);
9768             }
9769         }
9770     }
9771 
9772     DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
9773 
9774     /* Copy separator key to the parent.
9775      */
9776     if (SIZELEFT(mn.mc_pg[ptop]) < edb_branch_size(env, &sepkey)) {
9777         int snum = mc->mc_snum;
9778         mn.mc_snum--;
9779         mn.mc_top--;
9780         did_split = 1;
9781         /* We want other splits to find mn when doing fixups */
9782         WITH_CURSOR_TRACKING(mn,
9783             rc = edb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0));
9784         if (rc)
9785             goto done;
9786 
9787         /* root split? */
9788         if (mc->mc_snum > snum) {
9789             ptop++;
9790         }
9791         /* Right page might now have changed parent.
9792          * Check if left page also changed parent.
9793          */
9794         if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
9795             mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
9796             for (i=0; i<ptop; i++) {
9797                 mc->mc_pg[i] = mn.mc_pg[i];
9798                 mc->mc_ki[i] = mn.mc_ki[i];
9799             }
9800             mc->mc_pg[ptop] = mn.mc_pg[ptop];
9801             if (mn.mc_ki[ptop]) {
9802                 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
9803             } else {
9804                 /* find right page's left sibling */
9805                 mc->mc_ki[ptop] = mn.mc_ki[ptop];
9806                 rc = edb_cursor_sibling(mc, 0);
9807             }
9808         }
9809     } else {
9810         mn.mc_top--;
9811         rc = edb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
9812         mn.mc_top++;
9813     }
9814     if (rc != EDB_SUCCESS) {
9815         if (rc == EDB_NOTFOUND) /* improper edb_cursor_sibling() result */
9816             rc = EDB_PROBLEM;
9817         goto done;
9818     }
9819     if (nflags & EDB_APPEND) {
9820         mc->mc_pg[mc->mc_top] = rp;
9821         mc->mc_ki[mc->mc_top] = 0;
9822         rc = edb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
9823         if (rc)
9824             goto done;
9825         for (i=0; i<mc->mc_top; i++)
9826             mc->mc_ki[i] = mn.mc_ki[i];
9827     } else if (!IS_LEAF2(mp)) {
9828         /* Move nodes */
9829         mc->mc_pg[mc->mc_top] = rp;
9830         i = split_indx;
9831         j = 0;
9832         do {
9833             if (i == newindx) {
9834                 rkey.mv_data = newkey->mv_data;
9835                 rkey.mv_size = newkey->mv_size;
9836                 if (IS_LEAF(mp)) {
9837                     rdata = newdata;
9838                 } else
9839                     pgno = newpgno;
9840                 flags = nflags;
9841                 /* Update index for the new key. */
9842                 mc->mc_ki[mc->mc_top] = j;
9843             } else {
9844                 node = (EDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
9845                 rkey.mv_data = NODEKEY(node);
9846                 rkey.mv_size = node->mn_ksize;
9847                 if (IS_LEAF(mp)) {
9848                     xdata.mv_data = NODEDATA(node);
9849                     xdata.mv_size = NODEDSZ(node);
9850                     rdata = &xdata;
9851                 } else
9852                     pgno = NODEPGNO(node);
9853                 flags = node->mn_flags;
9854             }
9855 
9856             if (!IS_LEAF(mp) && j == 0) {
9857                 /* First branch index doesn't need key data. */
9858                 rkey.mv_size = 0;
9859             }
9860 
9861             rc = edb_node_add(mc, j, &rkey, rdata, pgno, flags);
9862             if (rc)
9863                 goto done;
9864             if (i == nkeys) {
9865                 i = 0;
9866                 j = 0;
9867                 mc->mc_pg[mc->mc_top] = copy;
9868             } else {
9869                 i++;
9870                 j++;
9871             }
9872         } while (i != split_indx);
9873 
9874         nkeys = NUMKEYS(copy);
9875         for (i=0; i<nkeys; i++)
9876             mp->mp_ptrs[i] = copy->mp_ptrs[i];
9877         mp->mp_lower = copy->mp_lower;
9878         mp->mp_upper = copy->mp_upper;
9879         memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
9880             env->me_psize - copy->mp_upper - PAGEBASE);
9881 
9882         /* reset back to original page */
9883         if (newindx < split_indx) {
9884             mc->mc_pg[mc->mc_top] = mp;
9885         } else {
9886             mc->mc_pg[mc->mc_top] = rp;
9887             mc->mc_ki[ptop]++;
9888             /* Make sure mc_ki is still valid.
9889              */
9890             if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
9891                 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
9892                 for (i=0; i<=ptop; i++) {
9893                     mc->mc_pg[i] = mn.mc_pg[i];
9894                     mc->mc_ki[i] = mn.mc_ki[i];
9895                 }
9896             }
9897         }
9898         if (nflags & EDB_RESERVE) {
9899             node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
9900             if (!(node->mn_flags & F_BIGDATA))
9901                 newdata->mv_data = NODEDATA(node);
9902         }
9903     } else {
9904         if (newindx >= split_indx) {
9905             mc->mc_pg[mc->mc_top] = rp;
9906             mc->mc_ki[ptop]++;
9907             /* Make sure mc_ki is still valid.
9908              */
9909             if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
9910                 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
9911                 for (i=0; i<=ptop; i++) {
9912                     mc->mc_pg[i] = mn.mc_pg[i];
9913                     mc->mc_ki[i] = mn.mc_ki[i];
9914                 }
9915             }
9916         }
9917     }
9918 
9919     {
9920         /* Adjust other cursors pointing to mp */
9921         EDB_cursor *m2, *m3;
9922         EDB_dbi dbi = mc->mc_dbi;
9923         nkeys = NUMKEYS(mp);
9924 
9925         for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
9926             if (mc->mc_flags & C_SUB)
9927                 m3 = &m2->mc_xcursor->mx_cursor;
9928             else
9929                 m3 = m2;
9930             if (m3 == mc)
9931                 continue;
9932             if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
9933                 continue;
9934             if (new_root) {
9935                 int k;
9936                 /* sub cursors may be on different DB */
9937                 if (m3->mc_pg[0] != mp)
9938                     continue;
9939                 /* root split */
9940                 for (k=new_root; k>=0; k--) {
9941                     m3->mc_ki[k+1] = m3->mc_ki[k];
9942                     m3->mc_pg[k+1] = m3->mc_pg[k];
9943                 }
9944                 if (m3->mc_ki[0] >= nkeys) {
9945                     m3->mc_ki[0] = 1;
9946                 } else {
9947                     m3->mc_ki[0] = 0;
9948                 }
9949                 m3->mc_pg[0] = mc->mc_pg[0];
9950                 m3->mc_snum++;
9951                 m3->mc_top++;
9952             }
9953             if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
9954                 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & EDB_SPLIT_REPLACE))
9955                     m3->mc_ki[mc->mc_top]++;
9956                 if (m3->mc_ki[mc->mc_top] >= nkeys) {
9957                     m3->mc_pg[mc->mc_top] = rp;
9958                     m3->mc_ki[mc->mc_top] -= nkeys;
9959                     for (i=0; i<mc->mc_top; i++) {
9960                         m3->mc_ki[i] = mn.mc_ki[i];
9961                         m3->mc_pg[i] = mn.mc_pg[i];
9962                     }
9963                 }
9964             } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
9965                 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
9966                 m3->mc_ki[ptop]++;
9967             }
9968             if (IS_LEAF(mp))
9969                 XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]);
9970         }
9971     }
9972     DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
9973 
9974 done:
9975     if (copy)                   /* tmp page */
9976         edb_page_free(env, copy);
9977     if (rc)
9978         mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
9979     return rc;
9980 }
9981 
9982 int
9983 edb_put(EDB_txn *txn, EDB_dbi dbi,
9984     EDB_val *key, EDB_val *data, unsigned int flags)
9985 {
9986     EDB_cursor mc;
9987     EDB_xcursor mx;
9988     int rc;
9989 
9990     if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
9991         return EINVAL;
9992 
9993     if (flags & ~(EDB_NOOVERWRITE|EDB_NODUPDATA|EDB_RESERVE|EDB_APPEND|EDB_APPENDDUP))
9994         return EINVAL;
9995 
9996     if (txn->mt_flags & (EDB_TXN_RDONLY|EDB_TXN_BLOCKED))
9997         return (txn->mt_flags & EDB_TXN_RDONLY) ? EACCES : EDB_BAD_TXN;
9998 
9999     edb_cursor_init(&mc, txn, dbi, &mx);
10000     mc.mc_next = txn->mt_cursors[dbi];
10001     txn->mt_cursors[dbi] = &mc;
10002     rc = edb_cursor_put(&mc, key, data, flags);
10003     txn->mt_cursors[dbi] = mc.mc_next;
10004     return rc;
10005 }
10006 
10007 #ifndef EDB_WBUF
10008 #define EDB_WBUF    (1024*1024)
10009 #endif
10010 #define EDB_EOF     0x10    /**< #edb_env_copyfd1() is done reading */
10011 
10012     /** State needed for a double-buffering compacting copy. */
10013 typedef struct edb_copy {
10014     EDB_env *mc_env;
10015     EDB_txn *mc_txn;
10016     pthread_mutex_t mc_mutex;
10017     pthread_cond_t mc_cond; /**< Condition variable for #mc_new */
10018     char *mc_wbuf[2];
10019     char *mc_over[2];
10020     int mc_wlen[2];
10021     int mc_olen[2];
10022     pgno_t mc_next_pgno;
10023     HANDLE mc_fd;
10024     int mc_toggle;          /**< Buffer number in provider */
10025     int mc_new;             /**< (0-2 buffers to write) | (#EDB_EOF at end) */
10026     /** Error code.  Never cleared if set.  Both threads can set nonzero
10027      *  to fail the copy.  Not mutex-protected, EXDB expects atomic int.
10028      */
10029     volatile int mc_error;
10030 } edb_copy;
10031 
10032     /** Dedicated writer thread for compacting copy. */
10033 static THREAD_RET ESECT CALL_CONV
10034 edb_env_copythr(void *arg)
10035 {
10036     edb_copy *my = arg;
10037     char *ptr;
10038     int toggle = 0, wsize, rc;
10039 #ifdef _WIN32
10040     DWORD len;
10041 #define DO_WRITE(rc, fd, ptr, w2, len)  rc = WriteFile(fd, ptr, w2, &len, NULL)
10042 #else
10043     int len;
10044 #define DO_WRITE(rc, fd, ptr, w2, len)  len = write(fd, ptr, w2); rc = (len >= 0)
10045 #ifdef SIGPIPE
10046     sigset_t set;
10047     sigemptyset(&set);
10048     sigaddset(&set, SIGPIPE);
10049     if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0)
10050         my->mc_error = rc;
10051 #endif
10052 #endif
10053 
10054     pthread_mutex_lock(&my->mc_mutex);
10055     for(;;) {
10056         while (!my->mc_new)
10057             pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
10058         if (my->mc_new == 0 + EDB_EOF) /* 0 buffers, just EOF */
10059             break;
10060         wsize = my->mc_wlen[toggle];
10061         ptr = my->mc_wbuf[toggle];
10062 again:
10063         rc = EDB_SUCCESS;
10064         while (wsize > 0 && !my->mc_error) {
10065             DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
10066             if (!rc) {
10067                 rc = ErrCode();
10068 #if defined(SIGPIPE) && !defined(_WIN32)
10069                 if (rc == EPIPE) {
10070                     /* Collect the pending SIGPIPE, otherwise at least OS X
10071                      * gives it to the process on thread-exit (ITS#8504).
10072                      */
10073                     int tmp;
10074                     sigwait(&set, &tmp);
10075                 }
10076 #endif
10077                 break;
10078             } else if (len > 0) {
10079                 rc = EDB_SUCCESS;
10080                 ptr += len;
10081                 wsize -= len;
10082                 continue;
10083             } else {
10084                 rc = EIO;
10085                 break;
10086             }
10087         }
10088         if (rc) {
10089             my->mc_error = rc;
10090         }
10091         /* If there's an overflow page tail, write it too */
10092         if (my->mc_olen[toggle]) {
10093             wsize = my->mc_olen[toggle];
10094             ptr = my->mc_over[toggle];
10095             my->mc_olen[toggle] = 0;
10096             goto again;
10097         }
10098         my->mc_wlen[toggle] = 0;
10099         toggle ^= 1;
10100         /* Return the empty buffer to provider */
10101         my->mc_new--;
10102         pthread_cond_signal(&my->mc_cond);
10103     }
10104     pthread_mutex_unlock(&my->mc_mutex);
10105     return (THREAD_RET)0;
10106 #undef DO_WRITE
10107 }
10108 
10109     /** Give buffer and/or #EDB_EOF to writer thread, await unused buffer.
10110      *
10111      * @param[in] my control structure.
10112      * @param[in] adjust (1 to hand off 1 buffer) | (EDB_EOF when ending).
10113      */
10114 static int ESECT
10115 edb_env_cthr_toggle(edb_copy *my, int adjust)
10116 {
10117     pthread_mutex_lock(&my->mc_mutex);
10118     my->mc_new += adjust;
10119     pthread_cond_signal(&my->mc_cond);
10120     while (my->mc_new & 2)      /* both buffers in use */
10121         pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
10122     pthread_mutex_unlock(&my->mc_mutex);
10123 
10124     my->mc_toggle ^= (adjust & 1);
10125     /* Both threads reset mc_wlen, to be safe from threading errors */
10126     my->mc_wlen[my->mc_toggle] = 0;
10127     return my->mc_error;
10128 }
10129 
10130     /** Depth-first tree traversal for compacting copy.
10131      * @param[in] my control structure.
10132      * @param[in,out] pg database root.
10133      * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB.
10134      */
10135 static int ESECT
10136 edb_env_cwalk(edb_copy *my, pgno_t *pg, int flags)
10137 {
10138     EDB_cursor mc = {0};
10139     EDB_node *ni;
10140     EDB_page *mo, *mp, *leaf;
10141     char *buf, *ptr;
10142     int rc, toggle;
10143     unsigned int i;
10144 
10145     /* Empty DB, nothing to do */
10146     if (*pg == P_INVALID)
10147         return EDB_SUCCESS;
10148 
10149     mc.mc_snum = 1;
10150     mc.mc_txn = my->mc_txn;
10151     mc.mc_flags = my->mc_txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP);
10152 
10153     rc = edb_page_get(&mc, *pg, &mc.mc_pg[0], NULL);
10154     if (rc)
10155         return rc;
10156     rc = edb_page_search_root(&mc, NULL, EDB_PS_FIRST);
10157     if (rc)
10158         return rc;
10159 
10160     /* Make cursor pages writable */
10161     buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
10162     if (buf == NULL)
10163         return ENOMEM;
10164 
10165     for (i=0; i<mc.mc_top; i++) {
10166         edb_page_copy((EDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
10167         mc.mc_pg[i] = (EDB_page *)ptr;
10168         ptr += my->mc_env->me_psize;
10169     }
10170 
10171     /* This is writable space for a leaf page. Usually not needed. */
10172     leaf = (EDB_page *)ptr;
10173 
10174     toggle = my->mc_toggle;
10175     while (mc.mc_snum > 0) {
10176         unsigned n;
10177         mp = mc.mc_pg[mc.mc_top];
10178         n = NUMKEYS(mp);
10179 
10180         if (IS_LEAF(mp)) {
10181             if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
10182                 for (i=0; i<n; i++) {
10183                     ni = NODEPTR(mp, i);
10184                     if (ni->mn_flags & F_BIGDATA) {
10185                         EDB_page *omp;
10186                         pgno_t pg;
10187 
10188                         /* Need writable leaf */
10189                         if (mp != leaf) {
10190                             mc.mc_pg[mc.mc_top] = leaf;
10191                             edb_page_copy(leaf, mp, my->mc_env->me_psize);
10192                             mp = leaf;
10193                             ni = NODEPTR(mp, i);
10194                         }
10195 
10196                         memcpy(&pg, NODEDATA(ni), sizeof(pg));
10197                         memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));
10198                         rc = edb_page_get(&mc, pg, &omp, NULL);
10199                         if (rc)
10200                             goto done;
10201                         if (my->mc_wlen[toggle] >= EDB_WBUF) {
10202                             rc = edb_env_cthr_toggle(my, 1);
10203                             if (rc)
10204                                 goto done;
10205                             toggle = my->mc_toggle;
10206                         }
10207                         mo = (EDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
10208                         memcpy(mo, omp, my->mc_env->me_psize);
10209                         mo->mp_pgno = my->mc_next_pgno;
10210                         my->mc_next_pgno += omp->mp_pages;
10211                         my->mc_wlen[toggle] += my->mc_env->me_psize;
10212                         if (omp->mp_pages > 1) {
10213                             my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
10214                             my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
10215                             rc = edb_env_cthr_toggle(my, 1);
10216                             if (rc)
10217                                 goto done;
10218                             toggle = my->mc_toggle;
10219                         }
10220                     } else if (ni->mn_flags & F_SUBDATA) {
10221                         EDB_db db;
10222 
10223                         /* Need writable leaf */
10224                         if (mp != leaf) {
10225                             mc.mc_pg[mc.mc_top] = leaf;
10226                             edb_page_copy(leaf, mp, my->mc_env->me_psize);
10227                             mp = leaf;
10228                             ni = NODEPTR(mp, i);
10229                         }
10230 
10231                         memcpy(&db, NODEDATA(ni), sizeof(db));
10232                         my->mc_toggle = toggle;
10233                         rc = edb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
10234                         if (rc)
10235                             goto done;
10236                         toggle = my->mc_toggle;
10237                         memcpy(NODEDATA(ni), &db, sizeof(db));
10238                     }
10239                 }
10240             }
10241         } else {
10242             mc.mc_ki[mc.mc_top]++;
10243             if (mc.mc_ki[mc.mc_top] < n) {
10244                 pgno_t pg;
10245 again:
10246                 ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
10247                 pg = NODEPGNO(ni);
10248                 rc = edb_page_get(&mc, pg, &mp, NULL);
10249                 if (rc)
10250                     goto done;
10251                 mc.mc_top++;
10252                 mc.mc_snum++;
10253                 mc.mc_ki[mc.mc_top] = 0;
10254                 if (IS_BRANCH(mp)) {
10255                     /* Whenever we advance to a sibling branch page,
10256                      * we must proceed all the way down to its first leaf.
10257                      */
10258                     edb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
10259                     goto again;
10260                 } else
10261                     mc.mc_pg[mc.mc_top] = mp;
10262                 continue;
10263             }
10264         }
10265         if (my->mc_wlen[toggle] >= EDB_WBUF) {
10266             rc = edb_env_cthr_toggle(my, 1);
10267             if (rc)
10268                 goto done;
10269             toggle = my->mc_toggle;
10270         }
10271         mo = (EDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
10272         edb_page_copy(mo, mp, my->mc_env->me_psize);
10273         mo->mp_pgno = my->mc_next_pgno++;
10274         my->mc_wlen[toggle] += my->mc_env->me_psize;
10275         if (mc.mc_top) {
10276             /* Update parent if there is one */
10277             ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
10278             SETPGNO(ni, mo->mp_pgno);
10279             edb_cursor_pop(&mc);
10280         } else {
10281             /* Otherwise we're done */
10282             *pg = mo->mp_pgno;
10283             break;
10284         }
10285     }
10286 done:
10287     free(buf);
10288     return rc;
10289 }
10290 
10291     /** Copy environment with compaction. */
10292 static int ESECT
10293 edb_env_copyfd1(EDB_env *env, HANDLE fd)
10294 {
10295     EDB_meta *mm;
10296     EDB_page *mp;
10297     edb_copy my = {0};
10298     EDB_txn *txn = NULL;
10299     pthread_t thr;
10300     pgno_t root, new_root;
10301     int rc = EDB_SUCCESS;
10302 
10303 #ifdef _WIN32
10304     if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) ||
10305         !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) {
10306         rc = ErrCode();
10307         goto done;
10308     }
10309     my.mc_wbuf[0] = _aligned_malloc(EDB_WBUF*2, env->me_os_psize);
10310     if (my.mc_wbuf[0] == NULL) {
10311         /* _aligned_malloc() sets errno, but we use Windows error codes */
10312         rc = ERROR_NOT_ENOUGH_MEMORY;
10313         goto done;
10314     }
10315 #else
10316     if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
10317         return rc;
10318     if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
10319         goto done2;
10320 #ifdef HAVE_MEMALIGN
10321     my.mc_wbuf[0] = memalign(env->me_os_psize, EDB_WBUF*2);
10322     if (my.mc_wbuf[0] == NULL) {
10323         rc = errno;
10324         goto done;
10325     }
10326 #else
10327     {
10328         void *p;
10329         if ((rc = posix_memalign(&p, env->me_os_psize, EDB_WBUF*2)) != 0)
10330             goto done;
10331         my.mc_wbuf[0] = p;
10332     }
10333 #endif
10334 #endif
10335     memset(my.mc_wbuf[0], 0, EDB_WBUF*2);
10336     my.mc_wbuf[1] = my.mc_wbuf[0] + EDB_WBUF;
10337     my.mc_next_pgno = NUM_METAS;
10338     my.mc_env = env;
10339     my.mc_fd = fd;
10340     rc = THREAD_CREATE(thr, edb_env_copythr, &my);
10341     if (rc)
10342         {
10343                 NDRX_PLATF_DIAG(NDRX_DIAG_PTHREAD_CREATE, errno, "edb_env_copyfd1");
10344         goto done;
10345         }
10346 
10347     rc = edb_txn_begin(env, NULL, EDB_RDONLY, &txn);
10348     if (rc)
10349         goto finish;
10350 
10351     mp = (EDB_page *)my.mc_wbuf[0];
10352     memset(mp, 0, NUM_METAS * env->me_psize);
10353     mp->mp_pgno = 0;
10354     mp->mp_flags = P_META;
10355     mm = (EDB_meta *)METADATA(mp);
10356     edb_env_init_meta0(env, mm);
10357     mm->mm_address = env->me_metas[0]->mm_address;
10358 
10359     mp = (EDB_page *)(my.mc_wbuf[0] + env->me_psize);
10360     mp->mp_pgno = 1;
10361     mp->mp_flags = P_META;
10362     *(EDB_meta *)METADATA(mp) = *mm;
10363     mm = (EDB_meta *)METADATA(mp);
10364 
10365     /* Set metapage 1 with current main DB */
10366     root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
10367     if (root != P_INVALID) {
10368         /* Count free pages + freeDB pages.  Subtract from last_pg
10369          * to find the new last_pg, which also becomes the new root.
10370          */
10371         EDB_ID freecount = 0;
10372         EDB_cursor mc;
10373         EDB_val key, data;
10374         edb_cursor_init(&mc, txn, FREE_DBI, NULL);
10375         while ((rc = edb_cursor_get(&mc, &key, &data, EDB_NEXT)) == 0)
10376             freecount += *(EDB_ID *)data.mv_data;
10377         if (rc != EDB_NOTFOUND)
10378             goto finish;
10379         freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
10380             txn->mt_dbs[FREE_DBI].md_leaf_pages +
10381             txn->mt_dbs[FREE_DBI].md_overflow_pages;
10382 
10383         new_root = txn->mt_next_pgno - 1 - freecount;
10384         mm->mm_last_pg = new_root;
10385         mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
10386         mm->mm_dbs[MAIN_DBI].md_root = new_root;
10387     } else {
10388         /* When the DB is empty, handle it specially to
10389          * fix any breakage like page leaks from ITS#8174.
10390          */
10391         mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
10392     }
10393     if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
10394         mm->mm_txnid = 1;       /* use metapage 1 */
10395     }
10396 
10397     my.mc_wlen[0] = env->me_psize * NUM_METAS;
10398     my.mc_txn = txn;
10399     rc = edb_env_cwalk(&my, &root, 0);
10400     if (rc == EDB_SUCCESS && root != new_root) {
10401         rc = EDB_INCOMPATIBLE;  /* page leak or corrupt DB */
10402     }
10403 
10404 finish:
10405     if (rc)
10406         my.mc_error = rc;
10407     edb_env_cthr_toggle(&my, 1 | EDB_EOF);
10408     rc = THREAD_FINISH(thr);
10409     edb_txn_abort(txn);
10410 
10411 done:
10412 #ifdef _WIN32
10413     if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]);
10414     if (my.mc_cond)  CloseHandle(my.mc_cond);
10415     if (my.mc_mutex) CloseHandle(my.mc_mutex);
10416 #else
10417     free(my.mc_wbuf[0]);
10418     pthread_cond_destroy(&my.mc_cond);
10419 done2:
10420     pthread_mutex_destroy(&my.mc_mutex);
10421 #endif
10422     return rc ? rc : my.mc_error;
10423 }
10424 
10425     /** Copy environment as-is. */
10426 static int ESECT
10427 edb_env_copyfd0(EDB_env *env, HANDLE fd)
10428 {
10429     EDB_txn *txn = NULL;
10430     edb_mutexref_t wmutex = NULL;
10431     int rc;
10432     edb_size_t wsize, w3;
10433     char *ptr;
10434 #ifdef _WIN32
10435     DWORD len, w2;
10436 #define DO_WRITE(rc, fd, ptr, w2, len)  rc = WriteFile(fd, ptr, w2, &len, NULL)
10437 #else
10438     ssize_t len;
10439     size_t w2;
10440 #define DO_WRITE(rc, fd, ptr, w2, len)  len = write(fd, ptr, w2); rc = (len >= 0)
10441 #endif
10442 
10443     /* Do the lock/unlock of the reader mutex before starting the
10444      * write txn.  Otherwise other read txns could block writers.
10445      */
10446     rc = edb_txn_begin(env, NULL, EDB_RDONLY, &txn);
10447     if (rc)
10448         return rc;
10449 
10450     if (env->me_txns) {
10451         /* We must start the actual read txn after blocking writers */
10452         edb_txn_end(txn, EDB_END_RESET_TMP);
10453 
10454         /* Temporarily block writers until we snapshot the meta pages */
10455         wmutex = env->me_wmutex;
10456         if (LOCK_MUTEX(rc, env, wmutex))
10457             goto leave;
10458 
10459         rc = edb_txn_renew0(txn);
10460         if (rc) {
10461             UNLOCK_MUTEX(wmutex);
10462             goto leave;
10463         }
10464     }
10465 
10466     wsize = env->me_psize * NUM_METAS;
10467     ptr = env->me_map;
10468     w2 = wsize;
10469     while (w2 > 0) {
10470         DO_WRITE(rc, fd, ptr, w2, len);
10471         if (!rc) {
10472             rc = ErrCode();
10473             break;
10474         } else if (len > 0) {
10475             rc = EDB_SUCCESS;
10476             ptr += len;
10477             w2 -= len;
10478             continue;
10479         } else {
10480             /* Non-blocking or async handles are not supported */
10481             rc = EIO;
10482             break;
10483         }
10484     }
10485     if (wmutex)
10486         UNLOCK_MUTEX(wmutex);
10487 
10488     if (rc)
10489         goto leave;
10490 
10491     w3 = txn->mt_next_pgno * env->me_psize;
10492     {
10493         edb_size_t fsize = 0;
10494         if ((rc = edb_fsize(env->me_fd, &fsize)))
10495             goto leave;
10496         if (w3 > fsize)
10497             w3 = fsize;
10498     }
10499     wsize = w3 - wsize;
10500     while (wsize > 0) {
10501         if (wsize > MAX_WRITE)
10502             w2 = MAX_WRITE;
10503         else
10504             w2 = wsize;
10505         DO_WRITE(rc, fd, ptr, w2, len);
10506         if (!rc) {
10507             rc = ErrCode();
10508             break;
10509         } else if (len > 0) {
10510             rc = EDB_SUCCESS;
10511             ptr += len;
10512             wsize -= len;
10513             continue;
10514         } else {
10515             rc = EIO;
10516             break;
10517         }
10518     }
10519 
10520 leave:
10521     edb_txn_abort(txn);
10522     return rc;
10523 }
10524 
10525 int ESECT
10526 edb_env_copyfd2(EDB_env *env, HANDLE fd, unsigned int flags)
10527 {
10528     if (flags & EDB_CP_COMPACT)
10529         return edb_env_copyfd1(env, fd);
10530     else
10531         return edb_env_copyfd0(env, fd);
10532 }
10533 
10534 int ESECT
10535 edb_env_copyfd(EDB_env *env, HANDLE fd)
10536 {
10537     return edb_env_copyfd2(env, fd, 0);
10538 }
10539 
10540 int ESECT
10541 edb_env_copy2(EDB_env *env, const char *path, unsigned int flags)
10542 {
10543     int rc;
10544     EDB_name fname;
10545     HANDLE newfd = INVALID_HANDLE_VALUE;
10546 
10547     rc = edb_fname_init(path, env->me_flags | EDB_NOLOCK, &fname);
10548     if (rc == EDB_SUCCESS) {
10549         rc = edb_fopen(env, &fname, EDB_O_COPY, 0666, &newfd);
10550         edb_fname_destroy(fname);
10551     }
10552     if (rc == EDB_SUCCESS) {
10553         rc = edb_env_copyfd2(env, newfd, flags);
10554         if (close(newfd) < 0 && rc == EDB_SUCCESS)
10555             rc = ErrCode();
10556     }
10557     return rc;
10558 }
10559 
10560 int ESECT
10561 edb_env_copy(EDB_env *env, const char *path)
10562 {
10563     return edb_env_copy2(env, path, 0);
10564 }
10565 
10566 int ESECT
10567 edb_env_set_flags(EDB_env *env, unsigned int flag, int onoff)
10568 {
10569     if (flag & ~CHANGEABLE)
10570         return EINVAL;
10571     if (onoff)
10572         env->me_flags |= flag;
10573     else
10574         env->me_flags &= ~flag;
10575     return EDB_SUCCESS;
10576 }
10577 
10578 int ESECT
10579 edb_env_get_flags(EDB_env *env, unsigned int *arg)
10580 {
10581     if (!env || !arg)
10582         return EINVAL;
10583 
10584     *arg = env->me_flags & (CHANGEABLE|CHANGELESS);
10585     return EDB_SUCCESS;
10586 }
10587 
10588 int ESECT
10589 edb_env_set_userctx(EDB_env *env, void *ctx)
10590 {
10591     if (!env)
10592         return EINVAL;
10593     env->me_userctx = ctx;
10594     return EDB_SUCCESS;
10595 }
10596 
10597 void * ESECT
10598 edb_env_get_userctx(EDB_env *env)
10599 {
10600     return env ? env->me_userctx : NULL;
10601 }
10602 
10603 int ESECT
10604 edb_env_set_assert(EDB_env *env, EDB_assert_func *func)
10605 {
10606     if (!env)
10607         return EINVAL;
10608 #ifndef NDEBUG
10609     env->me_assert_func = func;
10610 #endif
10611     return EDB_SUCCESS;
10612 }
10613 
10614 int ESECT
10615 edb_env_get_path(EDB_env *env, const char **arg)
10616 {
10617     if (!env || !arg)
10618         return EINVAL;
10619 
10620     *arg = env->me_path;
10621     return EDB_SUCCESS;
10622 }
10623 
10624 int ESECT
10625 edb_env_get_fd(EDB_env *env, edb_filehandle_t *arg)
10626 {
10627     if (!env || !arg)
10628         return EINVAL;
10629 
10630     *arg = env->me_fd;
10631     return EDB_SUCCESS;
10632 }
10633 
10634 /** Common code for #edb_stat() and #edb_env_stat().
10635  * @param[in] env the environment to operate in.
10636  * @param[in] db the #EDB_db record containing the stats to return.
10637  * @param[out] arg the address of an #EDB_stat structure to receive the stats.
10638  * @return 0, this function always succeeds.
10639  */
10640 static int ESECT
10641 edb_stat0(EDB_env *env, EDB_db *db, EDB_stat *arg)
10642 {
10643     arg->ms_psize = env->me_psize;
10644     arg->ms_depth = db->md_depth;
10645     arg->ms_branch_pages = db->md_branch_pages;
10646     arg->ms_leaf_pages = db->md_leaf_pages;
10647     arg->ms_overflow_pages = db->md_overflow_pages;
10648     arg->ms_entries = db->md_entries;
10649 
10650     return EDB_SUCCESS;
10651 }
10652 
10653 int ESECT
10654 edb_env_stat(EDB_env *env, EDB_stat *arg)
10655 {
10656     EDB_meta *meta;
10657 
10658     if (env == NULL || arg == NULL)
10659         return EINVAL;
10660 
10661     meta = edb_env_pick_meta(env);
10662 
10663     return edb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg);
10664 }
10665 
10666 int ESECT
10667 edb_env_info(EDB_env *env, EDB_envinfo *arg)
10668 {
10669     EDB_meta *meta;
10670 
10671     if (env == NULL || arg == NULL)
10672         return EINVAL;
10673 
10674     meta = edb_env_pick_meta(env);
10675     arg->me_mapaddr = meta->mm_address;
10676     arg->me_last_pgno = meta->mm_last_pg;
10677     arg->me_last_txnid = meta->mm_txnid;
10678 
10679     arg->me_mapsize = env->me_mapsize;
10680     arg->me_maxreaders = env->me_maxreaders;
10681     arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0;
10682     return EDB_SUCCESS;
10683 }
10684 
10685 /** Set the default comparison functions for a database.
10686  * Called immediately after a database is opened to set the defaults.
10687  * The user can then override them with #edb_set_compare() or
10688  * #edb_set_dupsort().
10689  * @param[in] txn A transaction handle returned by #edb_txn_begin()
10690  * @param[in] dbi A database handle returned by #edb_dbi_open()
10691  */
10692 static void
10693 edb_default_cmp(EDB_txn *txn, EDB_dbi dbi)
10694 {
10695     uint16_t f = txn->mt_dbs[dbi].md_flags;
10696 
10697     txn->mt_dbxs[dbi].md_cmp =
10698         (f & EDB_REVERSEKEY) ? edb_cmp_memnr :
10699         (f & EDB_INTEGERKEY) ? edb_cmp_cint  : edb_cmp_memn;
10700 
10701     txn->mt_dbxs[dbi].md_dcmp =
10702         !(f & EDB_DUPSORT) ? 0 :
10703         ((f & EDB_INTEGERDUP)
10704          ? ((f & EDB_DUPFIXED)   ? edb_cmp_int   : edb_cmp_cint)
10705          : ((f & EDB_REVERSEDUP) ? edb_cmp_memnr : edb_cmp_memn));
10706 }
10707 
10708 int edb_dbi_open(EDB_txn *txn, const char *name, unsigned int flags, EDB_dbi *dbi)
10709 {
10710     EDB_val key, data;
10711     EDB_dbi i;
10712     EDB_cursor mc;
10713     EDB_db dummy;
10714     int rc, dbflag, exact;
10715     unsigned int unused = 0, seq;
10716     char *namedup;
10717     size_t len;
10718 
10719     if (flags & ~VALID_FLAGS)
10720         return EINVAL;
10721     if (txn->mt_flags & EDB_TXN_BLOCKED)
10722         return EDB_BAD_TXN;
10723 
10724     /* main DB? */
10725     if (!name) {
10726         *dbi = MAIN_DBI;
10727         if (flags & PERSISTENT_FLAGS) {
10728             uint16_t f2 = flags & PERSISTENT_FLAGS;
10729             /* make sure flag changes get committed */
10730             if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
10731                 txn->mt_dbs[MAIN_DBI].md_flags |= f2;
10732                 txn->mt_flags |= EDB_TXN_DIRTY;
10733             }
10734         }
10735         edb_default_cmp(txn, MAIN_DBI);
10736         return EDB_SUCCESS;
10737     }
10738 
10739     if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
10740         edb_default_cmp(txn, MAIN_DBI);
10741     }
10742 
10743     /* Is the DB already open? */
10744     len = strlen(name);
10745     for (i=CORE_DBS; i<txn->mt_nuedbs; i++) {
10746         if (!txn->mt_dbxs[i].md_name.mv_size) {
10747             /* Remember this free slot */
10748             if (!unused) unused = i;
10749             continue;
10750         }
10751         if (len == txn->mt_dbxs[i].md_name.mv_size &&
10752             !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
10753             *dbi = i;
10754             return EDB_SUCCESS;
10755         }
10756     }
10757 
10758     /* If no free slot and max hit, fail */
10759     if (!unused && txn->mt_nuedbs >= txn->mt_env->me_maxdbs)
10760         return EDB_DBS_FULL;
10761 
10762     /* Cannot mix named databases with some mainDB flags */
10763     if (txn->mt_dbs[MAIN_DBI].md_flags & (EDB_DUPSORT|EDB_INTEGERKEY))
10764         return (flags & EDB_CREATE) ? EDB_INCOMPATIBLE : EDB_NOTFOUND;
10765 
10766     /* Find the DB info */
10767     dbflag = DB_NEW|DB_VALID|DB_USRVALID;
10768     exact = 0;
10769     key.mv_size = len;
10770     key.mv_data = (void *)name;
10771     edb_cursor_init(&mc, txn, MAIN_DBI, NULL);
10772     rc = edb_cursor_set(&mc, &key, &data, EDB_SET, &exact);
10773     if (rc == EDB_SUCCESS) {
10774         /* make sure this is actually a DB */
10775         EDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
10776         if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
10777             return EDB_INCOMPATIBLE;
10778     } else {
10779         if (rc != EDB_NOTFOUND || !(flags & EDB_CREATE))
10780             return rc;
10781         if (F_ISSET(txn->mt_flags, EDB_TXN_RDONLY))
10782             return EACCES;
10783     }
10784 
10785     /* Done here so we cannot fail after creating a new DB */
10786     if ((namedup = strdup(name)) == NULL)
10787         return ENOMEM;
10788 
10789     if (rc) {
10790         /* EDB_NOTFOUND and EDB_CREATE: Create new DB */
10791         data.mv_size = sizeof(EDB_db);
10792         data.mv_data = &dummy;
10793         memset(&dummy, 0, sizeof(dummy));
10794         dummy.md_root = P_INVALID;
10795         dummy.md_flags = flags & PERSISTENT_FLAGS;
10796         WITH_CURSOR_TRACKING(mc,
10797             rc = edb_cursor_put(&mc, &key, &data, F_SUBDATA));
10798         dbflag |= DB_DIRTY;
10799     }
10800 
10801     if (rc) {
10802         free(namedup);
10803     } else {
10804         /* Got info, register DBI in this txn */
10805         unsigned int slot = unused ? unused : txn->mt_nuedbs;
10806         txn->mt_dbxs[slot].md_name.mv_data = namedup;
10807         txn->mt_dbxs[slot].md_name.mv_size = len;
10808         txn->mt_dbxs[slot].md_rel = NULL;
10809         txn->mt_dbflags[slot] = dbflag;
10810         /* txn-> and env-> are the same in read txns, use
10811          * tmp variable to avoid undefined assignment
10812          */
10813         seq = ++txn->mt_env->me_dbiseqs[slot];
10814         txn->mt_dbiseqs[slot] = seq;
10815 
10816         memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(EDB_db));
10817         *dbi = slot;
10818         edb_default_cmp(txn, slot);
10819         if (!unused) {
10820             txn->mt_nuedbs++;
10821         }
10822     }
10823 
10824     return rc;
10825 }
10826 
10827 int ESECT
10828 edb_stat(EDB_txn *txn, EDB_dbi dbi, EDB_stat *arg)
10829 {
10830     if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID))
10831         return EINVAL;
10832 
10833     if (txn->mt_flags & EDB_TXN_BLOCKED)
10834         return EDB_BAD_TXN;
10835 
10836     if (txn->mt_dbflags[dbi] & DB_STALE) {
10837         EDB_cursor mc;
10838         EDB_xcursor mx;
10839         /* Stale, must read the DB's root. cursor_init does it for us. */
10840         edb_cursor_init(&mc, txn, dbi, &mx);
10841     }
10842     return edb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
10843 }
10844 
10845 void edb_dbi_close(EDB_env *env, EDB_dbi dbi)
10846 {
10847     char *ptr;
10848     if (dbi < CORE_DBS || dbi >= env->me_maxdbs)
10849         return;
10850     ptr = env->me_dbxs[dbi].md_name.mv_data;
10851     /* If there was no name, this was already closed */
10852     if (ptr) {
10853         env->me_dbxs[dbi].md_name.mv_data = NULL;
10854         env->me_dbxs[dbi].md_name.mv_size = 0;
10855         env->me_dbflags[dbi] = 0;
10856         env->me_dbiseqs[dbi]++;
10857         free(ptr);
10858     }
10859 }
10860 
10861 int edb_dbi_flags(EDB_txn *txn, EDB_dbi dbi, unsigned int *flags)
10862 {
10863     /* We could return the flags for the FREE_DBI too but what's the point? */
10864     if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
10865         return EINVAL;
10866     *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
10867     return EDB_SUCCESS;
10868 }
10869 
10870 /** Add all the DB's pages to the free list.
10871  * @param[in] mc Cursor on the DB to free.
10872  * @param[in] subs non-Zero to check for sub-DBs in this DB.
10873  * @return 0 on success, non-zero on failure.
10874  */
10875 static int
10876 edb_drop0(EDB_cursor *mc, int subs)
10877 {
10878     int rc;
10879 
10880     rc = edb_page_search(mc, NULL, EDB_PS_FIRST);
10881     if (rc == EDB_SUCCESS) {
10882         EDB_txn *txn = mc->mc_txn;
10883         EDB_node *ni;
10884         EDB_cursor mx;
10885         unsigned int i;
10886 
10887         /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves.
10888          * This also avoids any P_LEAF2 pages, which have no nodes.
10889          * Also if the DB doesn't have sub-DBs and has no overflow
10890          * pages, omit scanning leaves.
10891          */
10892         if ((mc->mc_flags & C_SUB) ||
10893             (!subs && !mc->mc_db->md_overflow_pages))
10894             edb_cursor_pop(mc);
10895 
10896         edb_cursor_copy(mc, &mx);
10897 #ifdef EDB_VL32
10898         /* bump refcount for mx's pages */
10899         for (i=0; i<mc->mc_snum; i++)
10900             edb_page_get(&mx, mc->mc_pg[i]->mp_pgno, &mx.mc_pg[i], NULL);
10901 #endif
10902         while (mc->mc_snum > 0) {
10903             EDB_page *mp = mc->mc_pg[mc->mc_top];
10904             unsigned n = NUMKEYS(mp);
10905             if (IS_LEAF(mp)) {
10906                 for (i=0; i<n; i++) {
10907                     ni = NODEPTR(mp, i);
10908                     if (ni->mn_flags & F_BIGDATA) {
10909                         EDB_page *omp;
10910                         pgno_t pg;
10911                         memcpy(&pg, NODEDATA(ni), sizeof(pg));
10912                         rc = edb_page_get(mc, pg, &omp, NULL);
10913                         if (rc != 0)
10914                             goto done;
10915                         edb_cassert(mc, IS_OVERFLOW(omp));
10916                         rc = edb_eidl_append_range(&txn->mt_free_pgs,
10917                             pg, omp->mp_pages);
10918                         if (rc)
10919                             goto done;
10920                         mc->mc_db->md_overflow_pages -= omp->mp_pages;
10921                         if (!mc->mc_db->md_overflow_pages && !subs)
10922                             break;
10923                     } else if (subs && (ni->mn_flags & F_SUBDATA)) {
10924                         edb_xcursor_init1(mc, ni);
10925                         rc = edb_drop0(&mc->mc_xcursor->mx_cursor, 0);
10926                         if (rc)
10927                             goto done;
10928                     }
10929                 }
10930                 if (!subs && !mc->mc_db->md_overflow_pages)
10931                     goto pop;
10932             } else {
10933                 if ((rc = edb_eidl_need(&txn->mt_free_pgs, n)) != 0)
10934                     goto done;
10935                 for (i=0; i<n; i++) {
10936                     pgno_t pg;
10937                     ni = NODEPTR(mp, i);
10938                     pg = NODEPGNO(ni);
10939                     /* free it */
10940                     edb_eidl_xappend(txn->mt_free_pgs, pg);
10941                 }
10942             }
10943             if (!mc->mc_top)
10944                 break;
10945             mc->mc_ki[mc->mc_top] = i;
10946             rc = edb_cursor_sibling(mc, 1);
10947             if (rc) {
10948                 if (rc != EDB_NOTFOUND)
10949                     goto done;
10950                 /* no more siblings, go back to beginning
10951                  * of previous level.
10952                  */
10953 pop:
10954                 edb_cursor_pop(mc);
10955                 mc->mc_ki[0] = 0;
10956                 for (i=1; i<mc->mc_snum; i++) {
10957                     mc->mc_ki[i] = 0;
10958                     mc->mc_pg[i] = mx.mc_pg[i];
10959                 }
10960             }
10961         }
10962         /* free it */
10963         rc = edb_eidl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
10964 done:
10965         if (rc)
10966             txn->mt_flags |= EDB_TXN_ERROR;
10967         /* drop refcount for mx's pages */
10968         EDB_CURSOR_UNREF(&mx, 0);
10969     } else if (rc == EDB_NOTFOUND) {
10970         rc = EDB_SUCCESS;
10971     }
10972     mc->mc_flags &= ~C_INITIALIZED;
10973     return rc;
10974 }
10975 
10976 int edb_drop(EDB_txn *txn, EDB_dbi dbi, int del)
10977 {
10978     EDB_cursor *mc, *m2;
10979     int rc;
10980 
10981     if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
10982         return EINVAL;
10983 
10984     if (F_ISSET(txn->mt_flags, EDB_TXN_RDONLY))
10985         return EACCES;
10986 
10987     if (TXN_DBI_CHANGED(txn, dbi))
10988         return EDB_BAD_DBI;
10989 
10990     rc = edb_cursor_open(txn, dbi, &mc);
10991     if (rc)
10992         return rc;
10993 
10994     rc = edb_drop0(mc, mc->mc_db->md_flags & EDB_DUPSORT);
10995     /* Invalidate the dropped DB's cursors */
10996     for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
10997         m2->mc_flags &= ~(C_INITIALIZED|C_EOF);
10998     if (rc)
10999         goto leave;
11000 
11001     /* Can't delete the main DB */
11002     if (del && dbi >= CORE_DBS) {
11003         rc = edb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA);
11004         if (!rc) {
11005             txn->mt_dbflags[dbi] = DB_STALE;
11006             edb_dbi_close(txn->mt_env, dbi);
11007         } else {
11008             txn->mt_flags |= EDB_TXN_ERROR;
11009         }
11010     } else {
11011         /* reset the DB record, mark it dirty */
11012         txn->mt_dbflags[dbi] |= DB_DIRTY;
11013         txn->mt_dbs[dbi].md_depth = 0;
11014         txn->mt_dbs[dbi].md_branch_pages = 0;
11015         txn->mt_dbs[dbi].md_leaf_pages = 0;
11016         txn->mt_dbs[dbi].md_overflow_pages = 0;
11017         txn->mt_dbs[dbi].md_entries = 0;
11018         txn->mt_dbs[dbi].md_root = P_INVALID;
11019 
11020         txn->mt_flags |= EDB_TXN_DIRTY;
11021     }
11022 leave:
11023     edb_cursor_close(mc);
11024     return rc;
11025 }
11026 
11027 int edb_set_compare(EDB_txn *txn, EDB_dbi dbi, EDB_cmp_func *cmp)
11028 {
11029     if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
11030         return EINVAL;
11031 
11032     txn->mt_dbxs[dbi].md_cmp = cmp;
11033     return EDB_SUCCESS;
11034 }
11035 
11036 int edb_set_dupsort(EDB_txn *txn, EDB_dbi dbi, EDB_cmp_func *cmp)
11037 {
11038     if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
11039         return EINVAL;
11040 
11041     txn->mt_dbxs[dbi].md_dcmp = cmp;
11042     return EDB_SUCCESS;
11043 }
11044 
11045 int edb_set_relfunc(EDB_txn *txn, EDB_dbi dbi, EDB_rel_func *rel)
11046 {
11047     if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
11048         return EINVAL;
11049 
11050     txn->mt_dbxs[dbi].md_rel = rel;
11051     return EDB_SUCCESS;
11052 }
11053 
11054 int edb_set_relctx(EDB_txn *txn, EDB_dbi dbi, void *ctx)
11055 {
11056     if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
11057         return EINVAL;
11058 
11059     txn->mt_dbxs[dbi].md_relctx = ctx;
11060     return EDB_SUCCESS;
11061 }
11062 
11063 int ESECT
11064 edb_env_get_maxkeysize(EDB_env *env)
11065 {
11066     return ENV_MAXKEY(env);
11067 }
11068 
11069 int ESECT
11070 edb_reader_list(EDB_env *env, EDB_msg_func *func, void *ctx)
11071 {
11072     unsigned int i, rdrs;
11073     EDB_reader *mr;
11074     char buf[64];
11075     int rc = 0, first = 1;
11076 
11077     if (!env || !func)
11078         return -1;
11079     if (!env->me_txns) {
11080         return func("(no reader locks)\n", ctx);
11081     }
11082     rdrs = env->me_txns->mti_numreaders;
11083     mr = env->me_txns->mti_readers;
11084     for (i=0; i<rdrs; i++) {
11085         if (mr[i].mr_pid) {
11086             txnid_t txnid = mr[i].mr_txnid;
11087             sprintf(buf, txnid == (txnid_t)-1 ?
11088                 "%10d %"Z"x -\n" : "%10d %"Z"x %"Yu"\n",
11089                 (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid);
11090             if (first) {
11091                 first = 0;
11092                 rc = func("    pid     thread     txnid\n", ctx);
11093                 if (rc < 0)
11094                     break;
11095             }
11096             rc = func(buf, ctx);
11097             if (rc < 0)
11098                 break;
11099         }
11100     }
11101     if (first) {
11102         rc = func("(no active readers)\n", ctx);
11103     }
11104     return rc;
11105 }
11106 
11107 /** Insert pid into list if not already present.
11108  * return -1 if already present.
11109  */
11110 static int ESECT
11111 edb_pid_insert(EDB_PID_T *ids, EDB_PID_T pid)
11112 {
11113     /* binary search of pid in list */
11114     unsigned base = 0;
11115     unsigned cursor = 1;
11116     int val = 0;
11117     unsigned n = ids[0];
11118 
11119     while( 0 < n ) {
11120         unsigned pivot = n >> 1;
11121         cursor = base + pivot + 1;
11122         val = pid - ids[cursor];
11123 
11124         if( val < 0 ) {
11125             n = pivot;
11126 
11127         } else if ( val > 0 ) {
11128             base = cursor;
11129             n -= pivot + 1;
11130 
11131         } else {
11132             /* found, so it's a duplicate */
11133             return -1;
11134         }
11135     }
11136 
11137     if( val > 0 ) {
11138         ++cursor;
11139     }
11140     ids[0]++;
11141     for (n = ids[0]; n > cursor; n--)
11142         ids[n] = ids[n-1];
11143     ids[n] = pid;
11144     return 0;
11145 }
11146 
11147 int ESECT
11148 edb_reader_check(EDB_env *env, int *dead)
11149 {
11150     if (!env)
11151         return EINVAL;
11152     if (dead)
11153         *dead = 0;
11154     return env->me_txns ? edb_reader_check0(env, 0, dead) : EDB_SUCCESS;
11155 }
11156 
11157 /** As #edb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */
11158 static int ESECT
11159 edb_reader_check0(EDB_env *env, int rlocked, int *dead)
11160 {
11161     edb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex;
11162     unsigned int i, j, rdrs;
11163     EDB_reader *mr;
11164     EDB_PID_T *pids, pid;
11165     int rc = EDB_SUCCESS, count = 0;
11166 
11167     rdrs = env->me_txns->mti_numreaders;
11168     pids = malloc((rdrs+1) * sizeof(EDB_PID_T));
11169     if (!pids)
11170         return ENOMEM;
11171     pids[0] = 0;
11172     mr = env->me_txns->mti_readers;
11173     for (i=0; i<rdrs; i++) {
11174         pid = mr[i].mr_pid;
11175         if (pid && pid != env->me_pid) {
11176             if (edb_pid_insert(pids, pid) == 0) {
11177                 if (!edb_reader_pid(env, Pidcheck, pid)) {
11178                     /* Stale reader found */
11179                     j = i;
11180                     if (rmutex) {
11181                         if ((rc = LOCK_MUTEX0(rmutex)) != 0) {
11182                             if ((rc = edb_mutex_failed(env, rmutex, rc)))
11183                                 break;
11184                             rdrs = 0; /* the above checked all readers */
11185                         } else {
11186                             /* Recheck, a new process may have reused pid */
11187                             if (edb_reader_pid(env, Pidcheck, pid))
11188                                 j = rdrs;
11189                         }
11190                     }
11191                     for (; j<rdrs; j++)
11192                             if (mr[j].mr_pid == pid) {
11193                                 DPRINTF(("clear stale reader pid %u txn %"Yd,
11194                                     (unsigned) pid, mr[j].mr_txnid));
11195                                 mr[j].mr_pid = 0;
11196                                 count++;
11197                             }
11198                     if (rmutex)
11199                         UNLOCK_MUTEX(rmutex);
11200                 }
11201             }
11202         }
11203     }
11204     free(pids);
11205     if (dead)
11206         *dead = count;
11207     return rc;
11208 }
11209 
11210 #ifdef EDB_ROBUST_SUPPORTED
11211 /** Handle #LOCK_MUTEX0() failure.
11212  * Try to repair the lock file if the mutex owner died.
11213  * @param[in] env   the environment handle
11214  * @param[in] mutex LOCK_MUTEX0() mutex
11215  * @param[in] rc    LOCK_MUTEX0() error (nonzero)
11216  * @return 0 on success with the mutex locked, or an error code on failure.
11217  */
11218 static int ESECT
11219 edb_mutex_failed(EDB_env *env, edb_mutexref_t mutex, int rc)
11220 {
11221     int rlocked, rc2;
11222     EDB_meta *meta;
11223 
11224     if (rc == EDB_OWNERDEAD) {
11225         /* We own the mutex. Clean up after dead previous owner. */
11226         rc = EDB_SUCCESS;
11227         rlocked = (mutex == env->me_rmutex);
11228         if (!rlocked) {
11229             /* Keep mti_txnid updated, otherwise next writer can
11230              * overwrite data which latest meta page refers to.
11231              */
11232             meta = edb_env_pick_meta(env);
11233             env->me_txns->mti_txnid = meta->mm_txnid;
11234             /* env is hosed if the dead thread was ours */
11235             if (env->me_txn) {
11236                 env->me_flags |= EDB_FATAL_ERROR;
11237                 env->me_txn = NULL;
11238                 rc = EDB_PANIC;
11239             }
11240         }
11241         DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
11242             (rc ? "this process' env is hosed" : "recovering")));
11243         rc2 = edb_reader_check0(env, rlocked, NULL);
11244         if (rc2 == 0)
11245             rc2 = edb_mutex_consistent(mutex);
11246         if (rc || (rc = rc2)) {
11247             DPRINTF(("LOCK_MUTEX recovery failed, %s", edb_strerror(rc)));
11248             UNLOCK_MUTEX(mutex);
11249         }
11250     } else {
11251 #ifdef _WIN32
11252         rc = ErrCode();
11253 #endif
11254         DPRINTF(("LOCK_MUTEX failed, %s", edb_strerror(rc)));
11255     }
11256 
11257     return rc;
11258 }
11259 #endif  /* EDB_ROBUST_SUPPORTED */
11260 
11261 #if defined(_WIN32)
11262 /** Convert \b src to new wchar_t[] string with room for \b xtra extra chars */
11263 static int ESECT
11264 utf8_to_utf16(const char *src, EDB_name *dst, int xtra)
11265 {
11266     int rc, need = 0;
11267     wchar_t *result = NULL;
11268     for (;;) {                  /* malloc result, then fill it in */
11269         need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need);
11270         if (!need) {
11271             rc = ErrCode();
11272             free(result);
11273             return rc;
11274         }
11275         if (!result) {
11276             result = malloc(sizeof(wchar_t) * (need + xtra));
11277             if (!result)
11278                 return ENOMEM;
11279             continue;
11280         }
11281         dst->mn_alloced = 1;
11282         dst->mn_len = need - 1;
11283         dst->mn_val = result;
11284         return EDB_SUCCESS;
11285     }
11286 }
11287 #endif /* defined(_WIN32) */
11288 /** @} */
11289