0001
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014
0015
0016
0017
0018
0019
0020
0021
0022
0023
0024
0025
0026
0027
0028
0029
0030
0031
0032
0033
0034
0035 #ifndef _GNU_SOURCE
0036 #define _GNU_SOURCE 1
0037 #endif
0038 #if defined(EDB_VL32) || defined(__WIN64__)
0039 #define _FILE_OFFSET_BITS 64
0040 #endif
0041 #ifdef _WIN32
0042 #include <malloc.h>
0043 #include <windows.h>
0044 #include <wchar.h> /* get wcscpy() */
0045 /* Function types and file-scope pointers for three native NT calls. */
0046 /* Only the types and the uninitialized static pointers are declared here. */
0047 /* NOTE(review): the code that assigns the pointers is outside this chunk; */
0048 /* presumably they are looked up at runtime before first use - confirm. */
0049
0050
0051
0052
0053
0054 /* Signature of NtCreateSection: sh is an OUT pointer that receives a handle. */
0055 typedef NTSTATUS (WINAPI NtCreateSectionFunc)
0056 (OUT PHANDLE sh, IN ACCESS_MASK acc,
0057 IN void * oa OPTIONAL,
0058 IN PLARGE_INTEGER ms OPTIONAL,
0059 IN ULONG pp, IN ULONG aa, IN HANDLE fh OPTIONAL);
0060
0061 static NtCreateSectionFunc *NtCreateSection;
0062 /* Values accepted by the ih argument of NtMapViewOfSectionFunc. */
0063 typedef enum _SECTION_INHERIT {
0064 ViewShare = 1,
0065 ViewUnmap = 2
0066 } SECTION_INHERIT;
0067 /* Signature of NtMapViewOfSection; addr, off and vs are IN OUT pointers. */
0068 typedef NTSTATUS (WINAPI NtMapViewOfSectionFunc)
0069 (IN PHANDLE sh, IN HANDLE ph,
0070 IN OUT PVOID *addr, IN ULONG_PTR zbits,
0071 IN SIZE_T cs, IN OUT PLARGE_INTEGER off OPTIONAL,
0072 IN OUT PSIZE_T vs, IN SECTION_INHERIT ih,
0073 IN ULONG at, IN ULONG pp);
0074 /* NOTE(review): sh is typed PHANDLE although it is an IN argument - confirm against the NT API. */
0075 static NtMapViewOfSectionFunc *NtMapViewOfSection;
0076 /* Signature of NtClose: takes a single HANDLE. */
0077 typedef NTSTATUS (WINAPI NtCloseFunc)(HANDLE h);
0078
0079 static NtCloseFunc *NtClose;
0080
0081
0082
0083
0084
0085 #define EDB_PID_T int
0086 #define EDB_THR_T DWORD
0087 #include <sys/types.h>
0088 #include <sys/stat.h>
0089 #ifdef __GNUC__
0090 # include <sys/param.h>
0091 #else
0092 # define LITTLE_ENDIAN 1234
0093 # define BIG_ENDIAN 4321
0094 # define BYTE_ORDER LITTLE_ENDIAN
0095 # ifndef SSIZE_MAX
0096 # define SSIZE_MAX INT_MAX
0097 # endif
0098 #endif
0099 #else
0100 #include <sys/types.h>
0101 #include <sys/stat.h>
0102 #define EDB_PID_T pid_t
0103 #define EDB_THR_T pthread_t
0104 #include <sys/param.h>
0105 #include <sys/uio.h>
0106 #include <sys/mman.h>
0107 #ifdef HAVE_SYS_FILE_H
0108 #include <sys/file.h>
0109 #endif
0110 #include <fcntl.h>
0111 #endif
0112
0113 #include <ndrx_config.h>
0114 #include <ndebug.h>
0115 #include <ndrxdiag.h>
0116
0117 #if defined(__mips) && defined(__linux)
0118 /* On MIPS Linux CACHEFLUSH forwards to cacheflush(); elsewhere it expands to nothing. */
0119 #include <asm/cachectl.h>
0120 extern int cacheflush(char *addr, int nbytes, int cache);
0121 #define CACHEFLUSH(addr, bytes, cache) cacheflush(addr, bytes, cache)
0122 #else
0123 #define CACHEFLUSH(addr, bytes, cache)
0124 #endif
0125
0126 #if defined(__linux) && !defined(EDB_FDATASYNC_WORKS)
0127 /* Linux builds define BROKEN_FDATASYNC unless EDB_FDATASYNC_WORKS is predefined. */
0128 /* NOTE(review): the code that reacts to BROKEN_FDATASYNC is outside this chunk. */
0129
0130
0131
0132 #define BROKEN_FDATASYNC
0133 #endif
0134
0135 #include <errno.h>
0136 #include <limits.h>
0137 #include <stddef.h>
0138 #include <inttypes.h>
0139 #include <stdio.h>
0140 #include <stdlib.h>
0141 #include <string.h>
0142 #include <time.h>
0143 #include <ndrx_config.h>
0144
0145 #ifdef _MSC_VER
0146 #include <io.h>
0147 typedef SSIZE_T ssize_t;
0148 #else
0149 #include <unistd.h>
0150 #endif
0151
0152 #if defined(__sun) || defined(__ANDROID__)
0153
0154 #define HAVE_MEMALIGN 1
0155 #include <malloc.h>
0156
0157 #if defined (__sun) && !defined(_POSIX_PTHREAD_SEMANTICS)
0158 # define _POSIX_PTHREAD_SEMANTICS 1
0159 #endif
0160 #endif
0161
0162 #if !(defined(BYTE_ORDER) || defined(__BYTE_ORDER))
0163 #include <netinet/in.h>
0164 #include <resolv.h> /* defines BYTE_ORDER on HPUX and Solaris */
0165 #endif
0166
0167 #if defined(__APPLE__) || defined (BSD) || defined(__FreeBSD_kernel__) || defined (EX_OS_AIX)
0168 # if !(defined(EDB_USE_POSIX_MUTEX) || defined(EDB_USE_POSIX_SEM))
0169 # define EDB_USE_SYSV_SEM 1 /* chosen on these platforms unless a POSIX lock type was requested */
0170 # endif
0171 # define EDB_FDATASYNC fsync /* replaces the fdatasync fallback that is defined further down */
0172 #elif defined(__ANDROID__)
0173 # define EDB_FDATASYNC fsync /* same substitution on Android */
0174 #endif
0175
0176 #ifndef _WIN32
0177 #include <pthread.h>
0178 #include <signal.h>
0179 #ifdef EDB_USE_POSIX_SEM
0180 # define EDB_USE_HASH 1
0181 #include <semaphore.h>
0182 #elif defined(EDB_USE_SYSV_SEM)
0183 #include <sys/ipc.h>
0184 #include <sys/sem.h>
0185 #ifndef EX_HAVE_SEMUN
0186 union semun { /* declared locally only when EX_HAVE_SEMUN is unset; NOTE(review): presumably the semctl() argument union - its users are outside this chunk */
0187 int val; /* integer member */
0188 struct semid_ds *buf; /* pointer to a semid_ds record */
0189 unsigned short *array; /* pointer to an array of unsigned short values */
0190 };
0191 #endif
0192 #else
0193 #define EDB_USE_POSIX_MUTEX 1
0194 #endif
0195 #endif
0196
0197 #if defined(_WIN32) + defined(EDB_USE_POSIX_SEM) + defined(EDB_USE_SYSV_SEM) \
0198 + defined(EDB_USE_POSIX_MUTEX) != 1
0199 # error "Ambiguous shared-lock implementation"
0200 #endif
0201
0202 #ifdef USE_VALGRIND
0203 #include <valgrind/memcheck.h>
0204 #define VGMEMP_CREATE(h,r,z) VALGRIND_CREATE_MEMPOOL(h,r,z) /* with USE_VALGRIND each VGMEMP_* forwards to one memcheck macro */
0205 #define VGMEMP_ALLOC(h,a,s) VALGRIND_MEMPOOL_ALLOC(h,a,s)
0206 #define VGMEMP_FREE(h,a) VALGRIND_MEMPOOL_FREE(h,a)
0207 #define VGMEMP_DESTROY(h) VALGRIND_DESTROY_MEMPOOL(h)
0208 #define VGMEMP_DEFINED(a,s) VALGRIND_MAKE_MEM_DEFINED(a,s)
0209 #else
0210 #define VGMEMP_CREATE(h,r,z) /* without USE_VALGRIND every VGMEMP_* expands to nothing */
0211 #define VGMEMP_ALLOC(h,a,s)
0212 #define VGMEMP_FREE(h,a)
0213 #define VGMEMP_DESTROY(h)
0214 #define VGMEMP_DEFINED(a,s)
0215 #endif
0216
0217 #ifndef BYTE_ORDER
0218 # if (defined(_LITTLE_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(_LITTLE_ENDIAN) && defined(_BIG_ENDIAN))
0219 /* Exactly one of _LITTLE_ENDIAN and _BIG_ENDIAN is defined: derive BYTE_ORDER from it. */
0220 # define LITTLE_ENDIAN 1234
0221 # define BIG_ENDIAN 4321
0222 # ifdef _LITTLE_ENDIAN
0223 # define BYTE_ORDER LITTLE_ENDIAN
0224 # else
0225 # define BYTE_ORDER BIG_ENDIAN
0226 # endif
0227 # else
0228 # define BYTE_ORDER __BYTE_ORDER /* otherwise fall back to the double-underscore spelling */
0229 # endif
0230 #endif
0231 /* Make sure both endianness constants exist under their plain names. */
0232 #ifndef LITTLE_ENDIAN
0233 #define LITTLE_ENDIAN __LITTLE_ENDIAN
0234 #endif
0235 #ifndef BIG_ENDIAN
0236 #define BIG_ENDIAN __BIG_ENDIAN
0237 #endif
0238 /* MISALIGNED_OK is set only for the x86 family; it selects the plain-assignment COPY_PGNO. */
0239 #if defined(__i386) || defined(__x86_64) || defined(_M_IX86)
0240 #define MISALIGNED_OK 1
0241 #endif
0242
0243 #include "exdb.h"
0244 #include "eidl.h"
0245
0246 #if (BYTE_ORDER == LITTLE_ENDIAN) == (BYTE_ORDER == BIG_ENDIAN)
0247 # error "Unknown or unsupported endianness (BYTE_ORDER)"
0248 #elif (-6 & 5) || CHAR_BIT!=8 || UINT_MAX!=0xffffffff || EDB_SIZE_MAX%UINT_MAX
0249 # error "Two's complement, reasonably sized integer types, please"
0250 #endif
0251
0252 #ifdef __GNUC__
0253 /* ESECT places a function into a dedicated text_env section on GNU-compatible compilers. */
0254 # ifdef __APPLE__
0255 # define ESECT __attribute__ ((section("__TEXT,text_env")))
0256 # else
0257 # define ESECT __attribute__ ((section("text_env")))
0258 # endif
0259 #else
0260 #define ESECT
0261 #endif
0262 /* CALL_CONV is WINAPI on Windows and empty elsewhere. */
0263 #ifdef _WIN32
0264 #define CALL_CONV WINAPI
0265 #else
0266 #define CALL_CONV
0267 #endif
0268
0269
0270
0271
0272
0273
0274
0275
0276
0277
0278
0279
0280 /* EDB_DEVEL defaults to 0; nonzero switches versions, PAGEBASE and EDB_MAXKEYSIZE to their development values. */
0281 #ifndef EDB_DEVEL
0282 #define EDB_DEVEL 0
0283 #endif
0284
0285 /* edb_func_: name of the current function for debug output, by compiler capability. */
0286 #if __STDC_VERSION__ >= 199901L
0287 # define edb_func_ __func__
0288 #elif __GNUC__ >= 2 || _MSC_VER >= 1300
0289 # define edb_func_ __FUNCTION__
0290 #else
0291 /* No compiler support: use a fixed placeholder string. */
0292 # define edb_func_ "<edb_unknown>"
0293 #endif
0294
0295 /* Internal codes placed above EDB_LAST_ERRCODE, which is not defined in this chunk. */
0296 #define EDB_NO_ROOT (EDB_LAST_ERRCODE + 10)
0297 #ifdef _WIN32
0298 #define EDB_OWNERDEAD ((int) WAIT_ABANDONED)
0299 #elif defined EDB_USE_SYSV_SEM
0300 #define EDB_OWNERDEAD (EDB_LAST_ERRCODE + 11)
0301 #elif defined(EDB_USE_POSIX_MUTEX) && defined(EOWNERDEAD)
0302 #define EDB_OWNERDEAD EOWNERDEAD
0303 #endif
0304 /* EDB_OWNERDEAD stays undefined for POSIX semaphores and for POSIX mutexes without EOWNERDEAD. */
0305 #ifdef __GLIBC__
0306 #define GLIBC_VER ((__GLIBC__ << 16 )| __GLIBC_MINOR__) /* major in the high 16 bits, minor in the low bits */
0307 #endif
0308
0309
0310
0311
0312
0313
0314
0315 /* EDB_USE_ROBUST may be preset by the build; otherwise it is chosen here. */
0316 #ifndef EDB_USE_ROBUST
0317 /* Default is 1, except for POSIX-mutex builds on Android or on glibc older than 2.4 (0x020004). */
0318 # if defined(EDB_USE_POSIX_MUTEX) && (defined(__ANDROID__) || \
0319 (defined(__GLIBC__) && GLIBC_VER < 0x020004))
0320 # define EDB_USE_ROBUST 0
0321 # else
0322 # define EDB_USE_ROBUST 1
0323 # endif
0324 #endif
0325
0326 #if defined(EDB_USE_POSIX_MUTEX) && (EDB_USE_ROBUST)
0327 /* glibc before 2.12 (0x02000c), or headers offering only the _NP constant, get the standard names mapped to the _np variants. */
0328 # if (defined(__GLIBC__) && GLIBC_VER < 0x02000c) || \
0329 (defined(PTHREAD_MUTEX_ROBUST_NP) && !defined(PTHREAD_MUTEX_ROBUST))
0330 # define PTHREAD_MUTEX_ROBUST PTHREAD_MUTEX_ROBUST_NP
0331 # define pthread_mutexattr_setrobust(attr, flag) pthread_mutexattr_setrobust_np(attr, flag)
0332 # define pthread_mutex_consistent(mutex) pthread_mutex_consistent_np(mutex)
0333 # endif
0334 #endif
0335 /* EDB_ROBUST_SUPPORTED needs both an owner-died code and EDB_USE_ROBUST; it selects the recovering LOCK_MUTEX. */
0336 #if defined(EDB_OWNERDEAD) && (EDB_USE_ROBUST)
0337 #define EDB_ROBUST_SUPPORTED 1
0338 #endif
0339
0340 #ifdef _WIN32
0341 #define EDB_USE_HASH 1
0342 #define EDB_PIDLOCK 0 /* feeds bit 17 of EDB_lock_desc */
0343 #define THREAD_RET DWORD
0344 #define pthread_t HANDLE /* the pthread_* names below are mapped onto Win32 handles and calls */
0345 #define pthread_mutex_t HANDLE
0346 #define pthread_cond_t HANDLE
0347 typedef HANDLE edb_mutex_t, edb_mutexref_t;
0348 #define pthread_key_t DWORD
0349 #define pthread_self() GetCurrentThreadId()
0350 #define pthread_key_create(x,y) \
0351 ((*(x) = TlsAlloc()) == TLS_OUT_OF_INDEXES ? ErrCode() : 0) /* y is not used by the expansion */
0352 #define pthread_key_delete(x) TlsFree(x)
0353 #define pthread_getspecific(x) TlsGetValue(x)
0354 #define pthread_setspecific(x,y) (TlsSetValue(x,y) ? 0 : ErrCode())
0355 #define pthread_mutex_unlock(x) ReleaseMutex(*x) /* NOTE(review): x is dereferenced without parentheses here and in the next three macros */
0356 #define pthread_mutex_lock(x) WaitForSingleObject(*x, INFINITE)
0357 #define pthread_cond_signal(x) SetEvent(*x)
0358 #define pthread_cond_wait(cond,mutex) do{SignalObjectAndWait(*mutex, *cond, INFINITE, FALSE); WaitForSingleObject(*mutex, INFINITE);}while(0) /* signals the mutex and waits on cond, then waits on the mutex again */
0359 #define THREAD_CREATE(thr,start,arg) \
0360 (((thr) = CreateThread(NULL, 0, start, arg, 0, NULL)) ? 0 : ErrCode()) /* 0 when a handle was returned, else the last error */
0361 #define THREAD_FINISH(thr) \
0362 (WaitForSingleObject(thr, INFINITE) ? ErrCode() : 0) /* any nonzero wait result is reported via ErrCode() */
0363 #define LOCK_MUTEX0(mutex) WaitForSingleObject(mutex, INFINITE)
0364 #define UNLOCK_MUTEX(mutex) ReleaseMutex(mutex)
0365 #define edb_mutex_consistent(mutex) 0 /* constant 0 on this platform */
0366 #define getpid() GetCurrentProcessId()
0367 #define EDB_FDATASYNC(fd) (!FlushFileBuffers(fd)) /* negated so that a nonzero call result yields 0 */
0368 #define EDB_MSYNC(addr,len,flags) (!FlushViewOfFile(addr,len)) /* flags is not used by the expansion */
0369 #define ErrCode() GetLastError()
0370 #define GET_PAGESIZE(x) {SYSTEM_INFO si; GetSystemInfo(&si); (x) = si.dwPageSize;} /* a braced block, not an expression, unlike the POSIX variant */
0371 #define close(fd) (CloseHandle(fd) ? 0 : -1)
0372 #define munmap(ptr,len) UnmapViewOfFile(ptr) /* len is not used by the expansion */
0373 #ifdef PROCESS_QUERY_LIMITED_INFORMATION
0374 #define EDB_PROCESS_QUERY_LIMITED_INFORMATION PROCESS_QUERY_LIMITED_INFORMATION
0375 #else
0376 #define EDB_PROCESS_QUERY_LIMITED_INFORMATION 0x1000 /* literal fallback when the SDK header lacks the constant */
0377 #endif
0378 #else
0379 #define THREAD_RET void * /* non-Windows branch: thread helpers map straight onto pthreads */
0380 #define THREAD_CREATE(thr,start,arg) pthread_create(&thr,NULL,start,arg)
0381 #define THREAD_FINISH(thr) pthread_join(thr,NULL)
0382
0383 /* EDB_PIDLOCK is 1 on every non-Windows build; it feeds bit 17 of EDB_lock_desc. */
0384 #define EDB_PIDLOCK 1
0385
0386 #ifdef EDB_USE_POSIX_SEM
0387 /* POSIX-semaphore build: a lock is a sem_t pointer, taken via edb_sem_wait and released via sem_post. */
0388 typedef sem_t *edb_mutex_t, *edb_mutexref_t;
0389 #define LOCK_MUTEX0(mutex) edb_sem_wait(mutex)
0390 #define UNLOCK_MUTEX(mutex) sem_post(mutex)
0391 /* Wait on sem, retrying while sem_wait() fails with EINTR. Returns 0 on success, otherwise the errno of the failed call. */
0392 static int
0393 edb_sem_wait(sem_t *sem)
0394 {
0395 int rc;
0396 while ((rc = sem_wait(sem)) && (rc = errno) == EINTR) ; /* empty body: on failure rc is replaced by errno and the loop repeats only for EINTR */
0397 return rc;
0398 }
0399
0400 #elif defined EDB_USE_SYSV_SEM
0401 /* SysV-semaphore build: a lock is one semaphore of a set plus a pointer to a locked flag. */
0402 typedef struct edb_mutex {
0403 int semid; /* semaphore set id, passed to semop() */
0404 int semnum; /* index of the semaphore within the set, copied into sembuf.sem_num */
0405 int *locked; /* flag set to 1 by edb_sem_wait and cleared by UNLOCK_MUTEX */
0406 } edb_mutex_t[1], *edb_mutexref_t;
0407 /* UNLOCK_MUTEX clears the locked flag first and then adds 1 to the semaphore with SEM_UNDO. */
0408 #define LOCK_MUTEX0(mutex) edb_sem_wait(mutex)
0409 #define UNLOCK_MUTEX(mutex) do { \
0410 struct sembuf sb = { 0, 1, SEM_UNDO }; \
0411 sb.sem_num = (mutex)->semnum; \
0412 *(mutex)->locked = 0; \
0413 semop((mutex)->semid, &sb, 1); \
0414 } while(0) /* NOTE(review): the semop() result is ignored here */
0415 /* Take the semaphore (semop -1 with SEM_UNDO), retrying on EINTR. Returns EDB_SUCCESS, EDB_OWNERDEAD if the locked flag was already set, or the errno of a failed semop(). */
0416 static int
0417 edb_sem_wait(edb_mutexref_t sem)
0418 {
0419 int rc, *locked = sem->locked;
0420 struct sembuf sb = { 0, -1, SEM_UNDO };
0421 sb.sem_num = sem->semnum;
0422 do {
0423 if (!semop(sem->semid, &sb, 1)) {
0424 rc = *locked ? EDB_OWNERDEAD : EDB_SUCCESS; /* flag still set after a successful decrement: the last holder never ran UNLOCK_MUTEX */
0425 *locked = 1;
0426 break;
0427 }
0428 } while ((rc = errno) == EINTR); /* any other errno ends the loop and is returned */
0429 return rc;
0430 }
0431 /* Constant 0 for SysV semaphores. */
0432 #define edb_mutex_consistent(mutex) 0
0433
0434 #else
0435 /* POSIX-mutex build, the remaining non-Windows case. */
0436
0437
0438
0439
0440 /* One-element array type, so a declared edb_mutex_t holds the mutex itself and decays to a pointer. */
0441 typedef pthread_mutex_t edb_mutex_t[1];
0442 /* Reference to a mutex: a plain pthread_mutex_t pointer. */
0443 typedef pthread_mutex_t *edb_mutexref_t;
0444
0445
0446 /* Lock without owner-died handling; LOCK_MUTEX wraps this. */
0447 #define LOCK_MUTEX0(mutex) pthread_mutex_lock(mutex)
0448
0449 /* Release the mutex. */
0450 #define UNLOCK_MUTEX(mutex) pthread_mutex_unlock(mutex)
0451
0452 /* Forwards to pthread_mutex_consistent, which may itself be mapped to the _np variant above. */
0453 #define edb_mutex_consistent(mutex) pthread_mutex_consistent(mutex)
0454 #endif
0455 /* Non-Windows counterparts of names that the Windows branch maps onto Win32. */
0456
0457 /* Last error code: plain errno here, GetLastError() on Windows. */
0458 #define ErrCode() errno
0459
0460
0461
0462
0463 /* HANDLE is an int on non-Windows builds; EDB_env uses it for me_fd, me_lfd and me_mfd. */
0464 #define HANDLE int
0465
0466
0467
0468
0469 /* Invalid HANDLE value for non-Windows builds. */
0470 #define INVALID_HANDLE_VALUE (-1)
0471
0472
0473
0474
0475 /* Store the sysconf(_SC_PAGE_SIZE) result into x; an expression here, a braced block on Windows. */
0476 #define GET_PAGESIZE(x) ((x) = sysconf(_SC_PAGE_SIZE))
0477 #endif
0478
0479 #define Z EDB_FMT_Z /* short aliases; EDB_FMT_Z and EDB_PRIy are not defined in this chunk */
0480 #define Yu EDB_PRIy(u)
0481 #define Yd EDB_PRIy(d)
0482
0483 #ifdef EDB_USE_SYSV_SEM
0484 #define MNAME_LEN (sizeof(int)) /* sizes the mt2 padding in EDB_txninfo */
0485 #else
0486 #define MNAME_LEN (sizeof(pthread_mutex_t))
0487 #endif
0488
0489 /* Name prefix for named locks; defined only for Windows and POSIX semaphores. */
0490 /* EDB_env.me_mutexname is sized from this prefix plus 11 bytes. */
0491
0492 #ifdef _WIN32
0493 #define MUTEXNAME_PREFIX "Global\\EDB"
0494 #elif defined EDB_USE_POSIX_SEM
0495 #define MUTEXNAME_PREFIX "/EDB"
0496 #endif
0497
0498
0499
0500 #ifdef EDB_ROBUST_SUPPORTED
0501 /* LOCK_MUTEX stores the lock result in rc; a nonzero result is passed to edb_mutex_failed, */
0502 /* whose return value replaces rc. The whole expression is nonzero only if rc ends up nonzero. */
0503
0504 #define LOCK_MUTEX(rc, env, mutex) \
0505 (((rc) = LOCK_MUTEX0(mutex)) && \
0506 ((rc) = edb_mutex_failed(env, mutex, rc)))
0507 static int edb_mutex_failed(EDB_env *env, edb_mutexref_t mutex, int rc);
0508 #else
0509 #define LOCK_MUTEX(rc, env, mutex) ((rc) = LOCK_MUTEX0(mutex)) /* env is unused without robust support */
0510 #define edb_mutex_failed(env, mutex, rc) (rc) /* passes rc through unchanged */
0511 #endif
0512
0513 #ifndef _WIN32
0514 /* EDB_DSYNC: open flag for synchronous writes; O_DSYNC when available, else O_SYNC. */
0515 /* It can be preset by the build. */
0516
0517
0518
0519
0520
0521
0522 #ifndef EDB_DSYNC
0523 # ifdef O_DSYNC
0524 # define EDB_DSYNC O_DSYNC
0525 # else
0526 # define EDB_DSYNC O_SYNC
0527 # endif
0528 #endif
0529 #endif
0530
0531 /* Fallbacks used when no platform branch above defined these. */
0532
0533
0534 #ifndef EDB_FDATASYNC
0535 # define EDB_FDATASYNC fdatasync
0536 #endif
0537
0538 #ifndef EDB_MSYNC
0539 # define EDB_MSYNC(addr,len,flags) msync(addr,len,flags)
0540 #endif
0541 /* Placeholder values for platforms whose headers lack the msync flags. */
0542 #ifndef MS_SYNC
0543 #define MS_SYNC 1
0544 #endif
0545
0546 #ifndef MS_ASYNC
0547 #define MS_ASYNC 0
0548 #endif
0549
0550
0551
0552
0553
0554
0555
0556 /* Page number type, an alias of EDB_ID (declared outside this chunk). */
0557 /* P_INVALID is its all-ones value. */
0558 typedef EDB_ID pgno_t;
0559
0560
0561
0562 /* Transaction ID type, also an alias of EDB_ID. */
0563 typedef EDB_ID txnid_t;
0564
0565
0566
0567 /* EDB_DEBUG defaults to 0, which compiles DPRINTF down to a no-op. */
0568 #ifndef EDB_DEBUG
0569
0570
0571
0572
0573 #define EDB_DEBUG 0
0574 #endif
0575
0576 #if EDB_DEBUG
0577 static int edb_debug;
0578 static txnid_t edb_debug_start;
0579
0580 /* DPRINTF takes ONE parenthesized argument list, which is pasted after DPRINTF0. */
0581 /* Output goes to stderr, prefixed with function name and line, only when edb_debug is nonzero. */
0582
0583 # define DPRINTF(args) ((void) ((edb_debug) && DPRINTF0 args))
0584 # define DPRINTF0(fmt, ...) \
0585 fprintf(stderr, "%s:%d " fmt "\n", edb_func_, __LINE__, __VA_ARGS__)
0586 #else
0587 # define DPRINTF(args) ((void) 0)
0588 #endif
0589
0590
0591 /* Print a plain string through DPRINTF. */
0592 #define DPUTS(arg) DPRINTF(("%s", arg))
0593 /* Cursor DBI for debug output, negated when the cursor has C_SUB set. */
0594 #define DDBI(mc) \
0595 (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
0596
0597
0598
0599
0600
0601
0602
0603
0604
0605
0606
0607
0608
0609
0610
0611
0612
0613 /* Page size limit: 0x10000 when PAGEBASE is nonzero (development builds), else 0x8000. */
0614 #define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
0615
0616
0617
0618
0619
0620
0621
0622
0623
0624
0625
0626
0627 /* NOTE(review): presumably the minimum number of keys per page - its users are outside this chunk. */
0628 #define EDB_MINKEYS 2
0629
0630
0631
0632
0633 /* Magic number; EDB_meta.mm_magic and EDB_txbody.mtb_magic are the fields typed to hold it. */
0634 #define EDB_MAGIC 0xBEEFC0DE
0635
0636 /* Data and lock format versions; development builds (EDB_DEVEL) use 999 for both. */
0637 #define EDB_DATA_VERSION ((EDB_DEVEL) ? 999 : 1)
0638
0639 #define EDB_LOCK_VERSION ((EDB_DEVEL) ? 999 : 2)
0640
0641
0642 /* Number of low bits of EDB_LOCK_FORMAT that carry EDB_LOCK_VERSION. */
0643 #define EDB_LOCK_VERSION_BITS 12
0644
0645
0646
0647
0648
0649
0650
0651
0652
0653
0654
0655
0656
0657
0658
0659
0660 /* Compile-time key size limit: 511, or 0 in development builds, where the limit is read from the environment at runtime. */
0661 #ifndef EDB_MAXKEYSIZE
0662 #define EDB_MAXKEYSIZE ((EDB_DEVEL) ? 0 : 511)
0663 #endif
0664
0665 /* ENV_MAXKEY ignores env when the limit is a compile-time constant. */
0666 #if EDB_MAXKEYSIZE
0667 #define ENV_MAXKEY(env) (EDB_MAXKEYSIZE)
0668 #else
0669 #define ENV_MAXKEY(env) ((env)->me_maxkey)
0670 #endif
0671
0672
0673
0674
0675 /* Largest data size: the maximum 32-bit value, matching the 32-bit NODEDSZ encoding. */
0676 #define MAXDATASIZE 0xffffffffUL
0677
0678 #if EDB_DEBUG
0679
0680
0681 /* Key size used to size the debug buffer; 511 when EDB_MAXKEYSIZE is 0. */
0682 #define DKBUF_MAXKEYSIZE ((EDB_MAXKEYSIZE) > 0 ? (EDB_MAXKEYSIZE) : 511)
0683
0684
0685
0686 /* Declares the local buffer kbuf: two bytes per key byte plus one. */
0687 #define DKBUF char kbuf[DKBUF_MAXKEYSIZE*2+1]
0688
0689
0690
0691 /* Formats key x into kbuf via edb_dkey; requires DKBUF in the same scope. */
0692 #define DKEY(x) edb_dkey(x, kbuf)
0693 #else
0694 #define DKBUF /* both expand to nothing / 0 in non-debug builds */
0695 #define DKEY(x) 0
0696 #endif
0697
0698
0699
0700 /* All-ones pgno_t value used as the invalid page number. */
0701 #define P_INVALID (~(pgno_t)0)
0702
0703 /* True when every bit of f is set in w. */
0704 #define F_ISSET(w, f) (((w) & (f)) == (f))
0705
0706 /* Round n up to the next even number. */
0707 #define EVEN(n) (((n) + 1U) & -2)
0708
0709 /* Isolate the lowest set bit of n. */
0710 #define LOW_BIT(n) ((n) & (-(n)))
0711
0712 /* For a power-of-two p2 this yields log2(p2) % n; checked for n = 5, the only n used in this chunk. */
0713 #define LOG2_MOD(p2, n) (7 - 86 / ((p2) % ((1U<<(n))-1) + 11))
0714
0715
0716
0717
0718 /* Lowest set bit of the offset of a member of the given type placed after a single char. */
0719 #define ALIGNOF2(type) \
0720 LOW_BIT(offsetof(struct { char ch_; type align_; }, align_))
0721
0722
0723
0724
0725 /* 16-bit index type: page offsets (mp_lower, mp_upper, mp_ptrs) and cursor key indexes (mc_ki). */
0726 typedef uint16_t indx_t;
0727 /* Type of the mutex id hash stored in EDB_txbody.mtb_mutexid. */
0728 typedef unsigned long long edb_hash_t;
0729
0730
0731
0732
0733 /* Default map size in bytes (1 MiB). */
0734 #define DEFAULT_MAPSIZE 1048576
0735
0736
0737
0738
0739
0740
0741
0742
0743
0744
0745
0746
0747
0748
0749
0750
0751
0752
0753
0754
0755
0756
0757
0758
0759
0760
0761
0762
0763
0764
0765
0766
0767
0768
0769
0770
0771
0772
0773
0774
0775
0776
0777
0778
0779 /* Default reader count; NOTE(review): presumably the initial me_maxreaders - the assignment is outside this chunk. */
0780 #define DEFAULT_READERS 126
0781
0782
0783
0784
0785
0786 /* Padding granularity in bytes for the lock-region structs; may be preset by the build and is encoded in EDB_lock_desc. */
0787 #ifndef CACHELINE
0788 #define CACHELINE 64
0789 #endif
0790
0791 /* Body of one reader record: a transaction ID plus process and thread identifiers. */
0792 /* All three fields are volatile; EDB_reader pads this struct to a CACHELINE multiple. */
0793
0794
0795
0796
0797
0798
0799 typedef struct EDB_rxbody {
0800 /* NOTE(review): presumably the transaction ID this reader is currently using - confirm in the reader code. */
0801
0802
0803
0804
0805
0806
0807 volatile txnid_t mrb_txnid;
0808 /* Process identifier; EDB_PID_T is int on Windows and pid_t elsewhere. */
0809 volatile EDB_PID_T mrb_pid;
0810 /* Thread identifier; EDB_THR_T is DWORD on Windows and pthread_t elsewhere. */
0811 volatile EDB_THR_T mrb_tid;
0812 } EDB_rxbody;
0813
0814 /* One reader slot: EDB_rxbody overlaid with padding so the slot size is a multiple of CACHELINE. */
0815 typedef struct EDB_reader {
0816 union {
0817 EDB_rxbody mrx;
0818 /* Shorthand accessors for the body fields. */
0819 #define mr_txnid mru.mrx.mrb_txnid
0820 #define mr_pid mru.mrx.mrb_pid
0821 #define mr_tid mru.mrx.mrb_tid
0822 /* sizeof(EDB_rxbody) rounded up to the next CACHELINE multiple. */
0823 char pad[(sizeof(EDB_rxbody)+CACHELINE-1) & ~(CACHELINE-1)];
0824 } mru;
0825 } EDB_reader;
0826
0827
0828
0829
0830
0831
0832
0833
0834
0835
0836
0837
0838
0839 /* Header body of the lock region; its size (in CACHELINE units) is encoded in EDB_lock_desc. */
0840 /* The last member depends on the lock implementation selected at build time. */
0841 typedef struct EDB_txbody {
0842
0843 /* NOTE(review): presumably holds EDB_MAGIC - the code that stores it is outside this chunk. */
0844 uint32_t mtb_magic;
0845 /* NOTE(review): presumably holds EDB_LOCK_FORMAT - confirm where the lock file is set up. */
0846 uint32_t mtb_format;
0847
0848
0849
0850 /* Transaction ID shared through this header (volatile). */
0851 volatile txnid_t mtb_txnid;
0852
0853
0854
0855 /* Reader count shared through this header (volatile). */
0856 volatile unsigned mtb_numreaders;
0857 #if defined(_WIN32) || defined(EDB_USE_POSIX_SEM)
0858 /* Named-lock builds store a hash id instead of a mutex object. */
0859 edb_hash_t mtb_mutexid;
0860 #elif defined(EDB_USE_SYSV_SEM)
0861 int mtb_semid;
0862 int mtb_rlocked;
0863 #else
0864 /* POSIX-mutex builds embed the reader mutex itself; EDB_env.me_rmutex aliases it. */
0865
0866
0867 edb_mutex_t mtb_rmutex;
0868 #endif
0869 } EDB_txbody;
0870
0871 /* Lock-region layout: padded header, optional padded writer lock, then the reader slots. */
0872 typedef struct EDB_txninfo {
0873 union {
0874 EDB_txbody mtb;
0875 #define mti_magic mt1.mtb.mtb_magic
0876 #define mti_format mt1.mtb.mtb_format
0877 #define mti_rmutex mt1.mtb.mtb_rmutex
0878 #define mti_txnid mt1.mtb.mtb_txnid
0879 #define mti_numreaders mt1.mtb.mtb_numreaders
0880 #define mti_mutexid mt1.mtb.mtb_mutexid
0881 #ifdef EDB_USE_SYSV_SEM
0882 #define mti_semid mt1.mtb.mtb_semid
0883 #define mti_rlocked mt1.mtb.mtb_rlocked
0884 #endif
0885 char pad[(sizeof(EDB_txbody)+CACHELINE-1) & ~(CACHELINE-1)]; /* header rounded up to a CACHELINE multiple */
0886 } mt1;
0887 #if !(defined(_WIN32) || defined(EDB_USE_POSIX_SEM))
0888 union { /* second block exists only for SysV-semaphore and POSIX-mutex builds */
0889 #ifdef EDB_USE_SYSV_SEM
0890 int mt2_wlocked;
0891 #define mti_wlocked mt2.mt2_wlocked
0892 #else
0893 edb_mutex_t mt2_wmutex;
0894 #define mti_wmutex mt2.mt2_wmutex
0895 #endif
0896 char pad[(MNAME_LEN+CACHELINE-1) & ~(CACHELINE-1)]; /* MNAME_LEN rounded up to a CACHELINE multiple */
0897 } mt2;
0898 #endif
0899 EDB_reader mti_readers[1]; /* declared with one element; NOTE(review): presumably indexed past it as a variable-length tail - confirm at the allocation site */
0900 } EDB_txninfo;
0901
0902 /* Lock format word: EDB_LOCK_VERSION in the low EDB_LOCK_VERSION_BITS bits, EDB_lock_desc above them. */
0903 #define EDB_LOCK_FORMAT \
0904 ((uint32_t) \
0905 (((EDB_LOCK_VERSION) % (1U << EDB_LOCK_VERSION_BITS)) \
0906 + EDB_lock_desc * (1U << EDB_LOCK_VERSION_BITS)))
0907
0908 /* EDB_LOCK_TYPE: small number identifying the lock implementation, mixed with alignment and size */
0909 /* properties of the lock object. EDB_lock_desc uses it modulo 120. */
0910
0911 #ifdef _WIN32
0912 # define EDB_LOCK_TYPE (0 + ALIGNOF2(edb_hash_t)/8 % 2)
0913 #elif defined EDB_USE_POSIX_SEM
0914 # define EDB_LOCK_TYPE (4 + ALIGNOF2(edb_hash_t)/8 % 2)
0915 #elif defined EDB_USE_SYSV_SEM
0916 # define EDB_LOCK_TYPE (8)
0917 #elif defined EDB_USE_POSIX_MUTEX
0918 /* Base 10, plus log2 of the mutex alignment modulo 5, plus 5 times the mutex size in 4-byte words modulo 22. */
0919
0920
0921 # define EDB_LOCK_TYPE (10 + \
0922 LOG2_MOD(ALIGNOF2(pthread_mutex_t), 5) + \
0923 sizeof(pthread_mutex_t) / 4U % 22 * 5)
0924 #endif
0925
0926 enum {
0927 /* EDB_lock_desc packs layout properties of the lock region into one constant, */
0928 /* which EDB_LOCK_FORMAT places above the version bits. */
0929 /* Each term has a multiplier just above the largest possible sum of the terms before it, */
0930 /* so the five mixed-radix terms cannot collide. */
0931
0932
0933
0934
0935
0936
0937
0938 EDB_lock_desc =
0939 /* 0 for the default CACHELINE of 64, otherwise 1 plus a log2-based code of the line size. */
0940 (CACHELINE==64 ? 0 : 1 + LOG2_MOD(CACHELINE >> (CACHELINE>64), 5))
0941 + 6 * (sizeof(EDB_PID_T)/4 % 3)
0942 + 18 * (sizeof(pthread_t)/4 % 5)
0943 + 90 * (sizeof(EDB_txbody) / CACHELINE % 3)
0944 + 270 * (EDB_LOCK_TYPE % 120)
0945 /* The sum above is below 270*120, which is below 1<<15, so bits 15-17 are free for flags. */
0946 + ((sizeof(txnid_t) == 8) << 15)
0947 + ((sizeof(EDB_reader) > CACHELINE) << 16)
0948 /* Bit 17 records whether EDB_PIDLOCK is nonzero. */
0949 + (((EDB_PIDLOCK) != 0) << 17)
0950 /* 18 bits in total; together with EDB_LOCK_VERSION_BITS (12) this fits the 32-bit format word. */
0951 };
0952
0953
0954
0955
0956
0957
0958
0959
0960
0961
0962
0963
0964
0965
0966
0967
0968
0969
0970 /* Common page header followed by the first element of the index array. */
0971 /* PAGEHDRSZ is defined as the offset of mp_ptrs, so the header is everything before it. */
0972 typedef struct EDB_page {
0973 #define mp_pgno mp_p.p_pgno
0974 #define mp_next mp_p.p_next
0975 union {
0976 pgno_t p_pgno; /* page number view of the first field */
0977 struct EDB_page *p_next; /* pointer view of the same storage, linking pages in memory */
0978 } mp_p;
0979 uint16_t mp_pad;
0980
0981 /* Bit values for mp_flags. The IS_* macros below test five of them. */
0982 /* NOTE(review): the meaning of P_DIRTY, P_LOOSE and P_KEEP is established outside this chunk. */
0983
0984
0985 #define P_BRANCH 0x01
0986 #define P_LEAF 0x02
0987 #define P_OVERFLOW 0x04
0988 #define P_META 0x08
0989 #define P_DIRTY 0x10
0990 #define P_LEAF2 0x20
0991 #define P_SUBP 0x40
0992 #define P_LOOSE 0x4000
0993 #define P_KEEP 0x8000
0994
0995 uint16_t mp_flags;
0996 #define mp_lower mp_pb.pb.pb_lower
0997 #define mp_upper mp_pb.pb.pb_upper
0998 #define mp_pages mp_pb.pb_pages
0999 union { /* either two 16-bit bounds or one 32-bit page count share these 4 bytes */
1000 struct {
1001 indx_t pb_lower; /* NUMKEYS and SIZELEFT are computed from these two bounds */
1002 indx_t pb_upper;
1003 } pb;
1004 uint32_t pb_pages;
1005 } mp_pb;
1006 indx_t mp_ptrs[1]; /* NODEPTR adds mp_ptrs[i] (plus PAGEBASE) to the page address to reach node i */
1007 } EDB_page;
1008
1009 /* Size of the page header: everything in EDB_page before mp_ptrs. */
1010 #define PAGEHDRSZ ((unsigned) offsetof(EDB_page, mp_ptrs))
1011
1012 /* Address of the first byte after the page header. */
1013 #define METADATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
1014
1015 /* Base added to stored offsets: PAGEHDRSZ in development builds, otherwise 0. */
1016 #define PAGEBASE ((EDB_DEVEL) ? PAGEHDRSZ : 0)
1017
1018 /* Number of 2-byte index entries between the header and mp_lower. */
1019 #define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ-PAGEBASE)) >> 1)
1020
1021 /* Bytes between mp_lower and mp_upper, truncated to indx_t. */
1022 #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
1023
1024 /* Used share of the page body in tenths of a percent (0 to 1000). */
1025 #define PAGEFILL(env, p) (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
1026 ((env)->me_psize - PAGEHDRSZ))
1027
1028 /* NOTE(review): presumably compared against PAGEFILL, i.e. 25 percent - its users are outside this chunk. */
1029
1030 #define FILL_THRESHOLD 250
1031
1032 /* Flag tests on mp_flags. */
1033 #define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
1034
1035 #define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2)
1036
1037 #define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
1038
1039 #define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
1040
1041 #define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)
1042
1043 /* Pages needed for size bytes plus one page header, i.e. ceil((PAGEHDRSZ + size) / psize). */
1044 #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
1045
1046
1047 /* Lvalue for a page pointer stored at (p) + 2, counted in units of the type p points to. */
1048 /* NOTE(review): presumably links the pages of mt_loose_pgs - confirm with the code that uses it. */
1049 #define NEXT_LOOSE_PAGE(p) (*(EDB_page **)((p) + 2))
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064 /* Node header followed by the first byte of its payload; NODESIZE is the offset of mn_data. */
1065 typedef struct EDB_node {
1066 /* mn_lo and mn_hi are the low and high 16 bits of one 32-bit value, ordered to match the byte order. */
1067 /* NODEDSZ reads them as a data size, NODEPGNO as the low 32 bits of a page number. */
1068 #if BYTE_ORDER == LITTLE_ENDIAN
1069 unsigned short mn_lo, mn_hi;
1070 #else
1071 unsigned short mn_hi, mn_lo;
1072 #endif
1073
1074 /* Bit values for mn_flags. */
1075 /* NOTE(review): the exact meaning of each flag is established by code outside this chunk. */
1076
1077
1078
1079 #define F_BIGDATA 0x01
1080 #define F_SUBDATA 0x02
1081 #define F_DUPDATA 0x04
1082
1083 /* Mask combining two node flags with the EDB_RESERVE and EDB_APPEND flags defined outside this chunk. */
1084 #define NODE_ADD_FLAGS (F_DUPDATA|F_SUBDATA|EDB_RESERVE|EDB_APPEND)
1085
1086 /* With a pgno_t wider than 32 bits, NODEPGNO and SETPGNO reuse mn_flags for the page number bits above 32. */
1087 unsigned short mn_flags;
1088 unsigned short mn_ksize; /* key size in bytes; NODEDATA skips this many bytes of mn_data */
1089 char mn_data[1]; /* key bytes start here, data follows the key */
1090 } EDB_node;
1091
1092 /* Size of the node header, everything before mn_data. */
1093 #define NODESIZE offsetof(EDB_node, mn_data)
1094
1095 /* 32 when pgno_t is wider than 32 bits, else 0; used as shift count for the top page number bits. */
1096 #define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
1097
1098
1099
1100 /* Node header plus key size; a NULL key contributes 0. */
1101 #define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->mv_size))
1102
1103
1104
1105 /* Node header plus key size plus data size; both pointers must be non-NULL. */
1106 #define LEAFSIZE(k, d) (NODESIZE + (k)->mv_size + (d)->mv_size)
1107
1108 /* Address of node i: page address plus stored offset plus PAGEBASE. */
1109 #define NODEPTR(p, i) ((EDB_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE))
1110
1111 /* Address of the key bytes of a node. */
1112 #define NODEKEY(node) (void *)((node)->mn_data)
1113
1114 /* Address of the data bytes, right after the key. */
1115 #define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)
1116
1117 /* Page number assembled from mn_lo, mn_hi and, for wide pgno_t, mn_flags. */
1118 #define NODEPGNO(node) \
1119 ((node)->mn_lo | ((pgno_t) (node)->mn_hi << 16) | \
1120 (PGNO_TOPWORD ? ((pgno_t) (node)->mn_flags << PGNO_TOPWORD) : 0))
1121 /* Inverse of NODEPGNO; overwrites mn_flags when pgno_t is wide. */
1122 #define SETPGNO(node,pgno) do { \
1123 (node)->mn_lo = (pgno) & 0xffff; (node)->mn_hi = (pgno) >> 16; \
1124 if (PGNO_TOPWORD) (node)->mn_flags = (pgno) >> PGNO_TOPWORD; } while(0)
1125
1126 /* 32-bit data size assembled from mn_lo and mn_hi. */
1127 #define NODEDSZ(node) ((node)->mn_lo | ((unsigned)(node)->mn_hi << 16))
1128 /* Inverse of NODEDSZ. */
1129 #define SETDSZ(node,size) do { \
1130 (node)->mn_lo = (size) & 0xffff; (node)->mn_hi = (size) >> 16;} while(0)
1131 /* Key size of a node. */
1132 #define NODEKSZ(node) ((node)->mn_ksize)
1133
1134 /* COPY_PGNO: plain assignment where misaligned access is allowed, else a copy in 16-bit units. */
1135 #ifdef MISALIGNED_OK
1136 #define COPY_PGNO(dst,src) dst = src /* NOTE(review): expansion is not parenthesized and not a do-while like the other variants */
1137 #else
1138 #if EDB_SIZE_MAX > 0xffffffffU
1139 #define COPY_PGNO(dst,src) do { \
1140 unsigned short *s, *d; \
1141 s = (unsigned short *)&(src); \
1142 d = (unsigned short *)&(dst); \
1143 *d++ = *s++; \
1144 *d++ = *s++; \
1145 *d++ = *s++; \
1146 *d = *s; \
1147 } while (0)
1148 #else
1149 #define COPY_PGNO(dst,src) do { \
1150 unsigned short *s, *d; \
1151 s = (unsigned short *)&(src); \
1152 d = (unsigned short *)&(dst); \
1153 *d++ = *s++; \
1154 *d = *s; \
1155 } while (0)
1156 #endif
1157 #endif
1158
1159
1160
1161 /* Address of fixed-size key i on a page whose keys of size ks are packed right after the header. */
1162 #define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i)*(ks)))
1163
1164 /* Point *keyptr at the key of node; does nothing when keyptr is NULL. */
1165 #define EDB_GET_KEY(node, keyptr) { if ((keyptr) != NULL) { \
1166 (keyptr)->mv_size = NODEKSZ(node); (keyptr)->mv_data = NODEKEY(node); } }
1167
1168 /* Same for a key given as an lvalue rather than a pointer; no NULL check. */
1169 #define EDB_GET_KEY2(node, key) { key.mv_size = NODEKSZ(node); key.mv_data = NODEKEY(node); }
1170
1171 /* Per-database record; EDB_meta embeds CORE_DBS of these, so the field layout is part of the stored meta format. */
1172 typedef struct EDB_db {
1173 uint32_t md_pad; /* aliased as mm_psize for the FREE_DBI entry of EDB_meta */
1174 uint16_t md_flags; /* aliased as mm_flags for the FREE_DBI entry of EDB_meta */
1175 uint16_t md_depth;
1176 pgno_t md_branch_pages; /* page counters, one per page kind */
1177 pgno_t md_leaf_pages;
1178 pgno_t md_overflow_pages;
1179 edb_size_t md_entries;
1180 pgno_t md_root; /* page number of the root page */
1181 } EDB_db;
1182
1183 #define EDB_VALID 0x8000 /* top bit of a 16-bit flag word; excluded from PERSISTENT_FLAGS */
1184 #define PERSISTENT_FLAGS (0xffff & ~(EDB_VALID))
1185 /* Mask of the public database flags combined here; the individual flags are defined outside this chunk. */
1186 #define VALID_FLAGS (EDB_REVERSEKEY|EDB_DUPSORT|EDB_INTEGERKEY|EDB_DUPFIXED|\
1187 EDB_INTEGERDUP|EDB_REVERSEDUP|EDB_CREATE)
1188
1189 /* Fixed database slots: index 0 and index 1 of mm_dbs. */
1190 #define FREE_DBI 0
1191
1192 #define MAIN_DBI 1
1193 /* Number of fixed slots; sizes EDB_meta.mm_dbs. */
1194 #define CORE_DBS 2
1195
1196 /* Number of meta page slots; sizes EDB_env.me_metas. */
1197 #define NUM_METAS 2
1198
1199
1200
1201 /* Meta record; EDB_metabuf places it directly after a page header. */
1202 /* The field order is part of the stored format and must not change. */
1203 typedef struct EDB_meta {
1204
1205 /* NOTE(review): presumably EDB_MAGIC and EDB_DATA_VERSION - the code that stores them is outside this chunk. */
1206 uint32_t mm_magic;
1207
1208 uint32_t mm_version;
1209 #ifdef EDB_VL32
1210 union { /* EDB_VL32 builds overlay the address with an EDB_ID so the field has the same width as EDB_ID */
1211 EDB_ID mmun_ull;
1212 void *mmun_address;
1213 } mm_un;
1214 #define mm_address mm_un.mmun_address
1215 #else
1216 void *mm_address;
1217 #endif
1218 edb_size_t mm_mapsize;
1219 EDB_db mm_dbs[CORE_DBS]; /* indexed by FREE_DBI and MAIN_DBI */
1220 /* The page size reuses the md_pad field of the FREE_DBI record. */
1221 #define mm_psize mm_dbs[FREE_DBI].md_pad
1222 /* The flags reuse the md_flags field of the FREE_DBI record. */
1223 #define mm_flags mm_dbs[FREE_DBI].md_flags
1224
1225
1226
1227 pgno_t mm_last_pg;
1228 volatile txnid_t mm_txnid;
1229 } EDB_meta;
1230
1231
1232
1233
1234 /* Buffer that views the same bytes either as a page or as PAGEHDRSZ bytes of padding followed by an EDB_meta. */
1235
1236 typedef union EDB_metabuf {
1237 EDB_page mb_page;
1238 struct {
1239 char mm_pad[PAGEHDRSZ]; /* occupies the page header so mm_meta starts where METADATA(page) points */
1240 EDB_meta mm_meta;
1241 } mb_metabuf;
1242 } EDB_metabuf;
1243
1244
1245
1246 /* Auxiliary per-database record: a name plus callback pointers. */
1247 /* NOTE(review): EDB_val, EDB_cmp_func and EDB_rel_func are declared outside this chunk. */
1248 typedef struct EDB_dbx {
1249 EDB_val md_name;
1250 EDB_cmp_func *md_cmp; /* comparison callback; NOTE(review): presumably for keys - confirm */
1251 EDB_cmp_func *md_dcmp; /* second comparison callback; NOTE(review): presumably for duplicate data - confirm */
1252 EDB_rel_func *md_rel;
1253 void *md_relctx; /* opaque pointer kept next to md_rel */
1254 } EDB_dbx;
1255
1256
1257 /* Transaction object. Notes marked (presumed) are inferred from names and types only, */
1258 /* because the code that uses these fields is outside this chunk. */
1259 struct EDB_txn {
1260 EDB_txn *mt_parent; /* (presumed) enclosing transaction of a nested one */
1261
1262 EDB_txn *mt_child; /* (presumed) nested transaction, see EDB_TXN_HAS_CHILD */
1263 pgno_t mt_next_pgno;
1264 #ifdef EDB_VL32
1265 pgno_t mt_last_pgno;
1266 #endif
1267
1268
1269
1270
1271 txnid_t mt_txnid;
1272 EDB_env *mt_env; /* environment; TXN_DBI_CHANGED reads me_dbiseqs through it */
1273
1274
1275 EDB_IDL mt_free_pgs;
1276
1277
1278
1279 EDB_page *mt_loose_pgs; /* head of a page list; (presumed) linked through NEXT_LOOSE_PAGE */
1280
1281 int mt_loose_count;
1282
1283
1284
1285
1286 EDB_IDL mt_spill_pgs;
1287 union { /* one of two views of the same storage */
1288
1289 EDB_ID2L dirty_list;
1290
1291 EDB_reader *reader;
1292 } mt_u;
1293
1294 EDB_dbx *mt_dbxs;
1295
1296 EDB_db *mt_dbs;
1297
1298 unsigned int *mt_dbiseqs; /* compared per DBI with me_dbiseqs by TXN_DBI_CHANGED */
1299
1300
1301 /* Bit values for the bytes of mt_dbflags (TXN_DBI_EXIST masks them with a validity argument). */
1302
1303 #define DB_DIRTY 0x01
1304 #define DB_STALE 0x02
1305 #define DB_NEW 0x04
1306 #define DB_VALID 0x08
1307 #define DB_USRVALID 0x10
1308 #define DB_DUPDATA 0x20
1309
1310
1311 EDB_cursor **mt_cursors;
1312
1313 unsigned char *mt_dbflags; /* one flag byte per DBI */
1314 #ifdef EDB_VL32
1315
1316 EDB_ID3L mt_rpages;
1317
1318
1319
1320
1321
1322 #define EDB_RPAGE_CHUNK 16
1323 #define EDB_TRPAGE_SIZE 4096
1324 #define EDB_TRPAGE_MAX (EDB_TRPAGE_SIZE-1)
1325 unsigned int mt_rpcheck;
1326 #endif
1327
1328
1329
1330 /* Upper bound for valid DBI values in this transaction, as checked by TXN_DBI_EXIST. */
1331 EDB_dbi mt_nuedbs;
1332
1333
1334
1335
1336 /* Values for mt_flags: the first group reuses public environment flag values, */
1337 /* the numeric ones are private state bits. */
1338 #define EDB_TXN_BEGIN_FLAGS (EDB_NOMETASYNC|EDB_NOSYNC|EDB_RDONLY)
1339 #define EDB_TXN_NOMETASYNC EDB_NOMETASYNC
1340 #define EDB_TXN_NOSYNC EDB_NOSYNC
1341 #define EDB_TXN_RDONLY EDB_RDONLY
1342
1343 #define EDB_TXN_WRITEMAP EDB_WRITEMAP
1344 #define EDB_TXN_FINISHED 0x01
1345 #define EDB_TXN_ERROR 0x02
1346 #define EDB_TXN_DIRTY 0x04
1347 #define EDB_TXN_SPILLS 0x08
1348 #define EDB_TXN_HAS_CHILD 0x10
1349 /* Union of the three states grouped as blocking. */
1350 #define EDB_TXN_BLOCKED (EDB_TXN_FINISHED|EDB_TXN_ERROR|EDB_TXN_HAS_CHILD)
1351
1352 unsigned int mt_flags;
1353
1354
1355
1356
1357
1358 unsigned int mt_dirty_room;
1359 };
1360
1361
1362
1363 /* Depth of the per-cursor page stack: sizes mc_pg and mc_ki in EDB_cursor. */
1364
1365 #define CURSOR_STACK 32
1366 /* Forward declaration so EDB_cursor can point at its sub-cursor type. */
1367 struct EDB_xcursor;
1368
1369
1370
1371
1372
1373
1374
1375 /* Cursor object: a stack of pages (mc_pg) with the key index used on each (mc_ki). */
1376 /* Notes marked (presumed) are inferred from names only; the cursor code is outside this chunk. */
1377 struct EDB_cursor {
1378
1379 EDB_cursor *mc_next; /* link to another cursor */
1380
1381 EDB_cursor *mc_backup;
1382 /* Sub-cursor; XCURSOR_INITED checks this pointer before reading its flags. */
1383 struct EDB_xcursor *mc_xcursor;
1384
1385 EDB_txn *mc_txn;
1386
1387 EDB_dbi mc_dbi; /* printed by DDBI, negated when C_SUB is set */
1388
1389 EDB_db *mc_db;
1390
1391 EDB_dbx *mc_dbx;
1392
1393 unsigned char *mc_dbflag; /* pointer to a single DB flag byte */
1394 unsigned short mc_snum; /* (presumed) number of pages on the stack */
1395 unsigned short mc_top; /* (presumed) index of the top stack entry */
1396
1397 /* Bit values for mc_flags. */
1398
1399
1400
1401 #define C_INITIALIZED 0x01
1402 #define C_EOF 0x02
1403 #define C_SUB 0x04
1404 #define C_DEL 0x08
1405 #define C_UNTRACK 0x40
1406 #define C_WRITEMAP EDB_TXN_WRITEMAP
1407
1408
1409
1410 /* Same value as the read-only transaction flag. */
1411 #define C_ORIG_RDONLY EDB_TXN_RDONLY
1412
1413 unsigned int mc_flags;
1414 EDB_page *mc_pg[CURSOR_STACK];
1415 indx_t mc_ki[CURSOR_STACK];
1416 #ifdef EDB_VL32
1417 EDB_page *mc_ovpg;
1418 # define MC_OVPG(mc) ((mc)->mc_ovpg)
1419 # define MC_SET_OVPG(mc, pg) ((mc)->mc_ovpg = (pg))
1420 #else
1421 # define MC_OVPG(mc) ((EDB_page *)0) /* without EDB_VL32 the getter is a null pointer and the setter a no-op */
1422 # define MC_SET_OVPG(mc, pg) ((void)0)
1423 #endif
1424 };
1425
1426
1427
1428
1429 /* Sub-cursor bundle: a cursor together with its own EDB_db, EDB_dbx and flag byte. */
1430 /* NOTE(review): presumably mx_cursor.mc_db, mc_dbx and mc_dbflag point at the members below - confirm in the init code. */
1431 typedef struct EDB_xcursor {
1432
1433 EDB_cursor mx_cursor;
1434
1435 EDB_db mx_db;
1436
1437 EDB_dbx mx_dbx;
1438
1439 unsigned char mx_dbflag;
1440 } EDB_xcursor;
1441
1442 /* True when mc has a sub-cursor and that sub-cursor has C_INITIALIZED set. */
1443 #define XCURSOR_INITED(mc) \
1444 ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
1445
1446 /* Re-point the sub-cursor page 0 at the data of the node under mc_ki[top] on page mp. */
1447 /* Does nothing unless the sub-cursor is initialized, the index is within NUMKEYS, */
1448 /* and the node has F_DUPDATA set without F_SUBDATA. */
1449
1450 #define XCURSOR_REFRESH(mc, top, mp) do { \
1451 EDB_page *xr_pg = (mp); \
1452 EDB_node *xr_node; \
1453 if (!XCURSOR_INITED(mc) || (mc)->mc_ki[top] >= NUMKEYS(xr_pg)) break; \
1454 xr_node = NODEPTR(xr_pg, (mc)->mc_ki[top]); \
1455 if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \
1456 (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
1457 } while (0)
1458
1459 /* Page-allocation state kept in EDB_env (me_pgstate) and saved per nested transaction in EDB_ntxn. */
1460 typedef struct EDB_pgstate {
1461 pgno_t *mf_pghead; /* pointer to a list of page numbers; accessed as me_pghead */
1462 txnid_t mf_pglast; /* a transaction ID; accessed as me_pglast */
1463 } EDB_pgstate;
1464
1465 /* Environment object. Notes marked (presumed) are inferred from names; the code using the fields is outside this chunk. */
1466 struct EDB_env {
1467 HANDLE me_fd; /* three handles: int descriptors on POSIX, Win32 HANDLEs on Windows */
1468 HANDLE me_lfd;
1469 HANDLE me_mfd;
1470 #if defined(EDB_VL32) && defined(_WIN32)
1471 HANDLE me_fmh;
1472 #endif
1473 /* Private high bits of me_flags. */
1474 #define EDB_FATAL_ERROR 0x80000000U
1475
1476 #define EDB_ENV_ACTIVE 0x20000000U
1477
1478 #define EDB_ENV_TXKEY 0x10000000U
1479
1480 #define EDB_FSYNCONLY 0x08000000U
1481 uint32_t me_flags;
1482 unsigned int me_psize; /* page size used by PAGEFILL */
1483 unsigned int me_os_psize;
1484 unsigned int me_maxreaders;
1485
1486 volatile int me_close_readers;
1487 EDB_dbi me_nuedbs;
1488 EDB_dbi me_maxdbs;
1489 EDB_PID_T me_pid;
1490 char *me_path;
1491 char *me_map;
1492 EDB_txninfo *me_txns; /* lock region; the POSIX-mutex me_rmutex and me_wmutex macros reach through it */
1493 EDB_meta *me_metas[NUM_METAS];
1494 void *me_pbuf;
1495 EDB_txn *me_txn;
1496 EDB_txn *me_txn0;
1497 edb_size_t me_mapsize;
1498 off_t me_size;
1499 pgno_t me_maxpg;
1500 EDB_dbx *me_dbxs;
1501 uint16_t *me_dbflags;
1502 unsigned int *me_dbiseqs; /* compared per DBI with mt_dbiseqs by TXN_DBI_CHANGED */
1503 pthread_key_t me_txkey;
1504 txnid_t me_pgoldest;
1505 EDB_pgstate me_pgstate;
1506 # define me_pglast me_pgstate.mf_pglast
1507 # define me_pghead me_pgstate.mf_pghead
1508 EDB_page *me_dpages;
1509
1510 EDB_IDL me_free_pgs;
1511
1512 EDB_ID2L me_dirty_list;
1513
1514 int me_maxfree_1pg;
1515
1516 unsigned int me_nodemax;
1517 #if !(EDB_MAXKEYSIZE)
1518 unsigned int me_maxkey; /* exists only when the key limit is not a compile-time constant; read by ENV_MAXKEY */
1519 #endif
1520 int me_live_reader;
1521 #ifdef _WIN32
1522 int me_pidquery;
1523 #endif
1524 #ifdef EDB_USE_POSIX_MUTEX
1525 # define me_rmutex me_txns->mti_rmutex
1526 # define me_wmutex me_txns->mti_wmutex
1527 #else
1528 edb_mutex_t me_rmutex; /* other lock types keep per-process lock objects in the environment itself */
1529 edb_mutex_t me_wmutex;
1530 # if defined(_WIN32) || defined(EDB_USE_POSIX_SEM)
1531 /* Buffer for a lock name: the prefix including its terminator plus 11 more bytes. */
1532 char me_mutexname[sizeof(MUTEXNAME_PREFIX) + 11];
1533 # endif
1534 #endif
1535 #ifdef EDB_VL32
1536 EDB_ID3L me_rpages;
1537 pthread_mutex_t me_rpmutex;
1538 #define EDB_ERPAGE_SIZE 16384
1539 #define EDB_ERPAGE_MAX (EDB_ERPAGE_SIZE-1)
1540 unsigned int me_rpcheck;
1541 #endif
1542 void *me_userctx; /* opaque user pointer */
1543 EDB_assert_func *me_assert_func;
1544 };
1545
1546 /* A transaction with an EDB_pgstate appended; mnt_txn is the first member, so the two share an address. NOTE(review): presumably used for nested transactions - confirm. */
1547 typedef struct EDB_ntxn {
1548 EDB_txn mnt_txn;
1549 EDB_pgstate mnt_pgstate;
1550 } EDB_ntxn;
1551
1552 /* Batch size of 64, lowered to IOV_MAX when the platform limit is smaller. */
1553 #define EDB_COMMIT_PAGES 64
1554 #if defined(IOV_MAX) && IOV_MAX < EDB_COMMIT_PAGES
1555 #undef EDB_COMMIT_PAGES
1556 #define EDB_COMMIT_PAGES IOV_MAX
1557 #endif
1558
1559 /* Byte limit per write: 0x40000000, halved when ssize_t is 4 bytes wide. */
1560 #define MAX_WRITE (0x40000000U >> (sizeof(ssize_t) == 4))
1561
1562 /* True when txn is non-NULL, dbi is below mt_nuedbs and its flag byte has any of the validity bits. */
1563 #define TXN_DBI_EXIST(txn, dbi, validity) \
1564 ((txn) && (dbi)<(txn)->mt_nuedbs && ((txn)->mt_dbflags[dbi] & (validity)))
1565
1566 /* True when the sequence recorded in the transaction differs from the one in the environment. */
1567 #define TXN_DBI_CHANGED(txn, dbi) \
1568 ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
1569 /* Forward declarations of file-local functions; all definitions are outside this chunk. */
1570 static int edb_page_alloc(EDB_cursor *mc, int num, EDB_page **mp);
1571 static int edb_page_new(EDB_cursor *mc, uint32_t flags, int num, EDB_page **mp);
1572 static int edb_page_touch(EDB_cursor *mc);
1573 /* Names and codes for the mode argument of edb_txn_end; the strings are listed in the same order as the enum. */
1574 #define EDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \
1575 "reset-tmp", "fail-begin", "fail-beginchild"}
1576 enum {
1577 /* Operation codes; they fit into EDB_END_OPMASK. */
1578 EDB_END_COMMITTED, EDB_END_EMPTY_COMMIT, EDB_END_ABORT, EDB_END_RESET,
1579 EDB_END_RESET_TMP, EDB_END_FAIL_BEGIN, EDB_END_FAIL_BEGINCHILD
1580 };
1581 #define EDB_END_OPMASK 0x0F /* low four bits select the operation, the bits below are modifiers */
1582 #define EDB_END_UPDATE 0x10
1583 #define EDB_END_FREE 0x20
1584 #define EDB_END_SLOT EDB_NOTLS /* reuses the value of the public EDB_NOTLS flag */
1585 static void edb_txn_end(EDB_txn *txn, unsigned mode);
1586
1587 static int edb_page_get(EDB_cursor *mc, pgno_t pgno, EDB_page **mp, int *lvl);
1588 static int edb_page_search_root(EDB_cursor *mc,
1589 EDB_val *key, int modify);
1590 #define EDB_PS_MODIFY 1 /* bit flags for the flags argument of edb_page_search */
1591 #define EDB_PS_ROOTONLY 2
1592 #define EDB_PS_FIRST 4
1593 #define EDB_PS_LAST 8
1594 static int edb_page_search(EDB_cursor *mc,
1595 EDB_val *key, int flags);
1596 static int edb_page_merge(EDB_cursor *csrc, EDB_cursor *cdst);
1597
1598 #define EDB_SPLIT_REPLACE EDB_APPENDDUP /* reuses the value of the public EDB_APPENDDUP flag */
1599 static int edb_page_split(EDB_cursor *mc, EDB_val *newkey, EDB_val *newdata,
1600 pgno_t newpgno, unsigned int nflags);
1601
1602 static int edb_env_read_header(EDB_env *env, int prev, EDB_meta *meta);
1603 static EDB_meta *edb_env_pick_meta(const EDB_env *env);
1604 static int edb_env_write_meta(EDB_txn *txn);
1605 #ifdef EDB_USE_POSIX_MUTEX
1606 # define edb_env_close0(env, excl) edb_env_close1(env) /* drops excl; the macro also rewrites the prototype on the next code line into edb_env_close1 */
1607 #endif
1608 static void edb_env_close0(EDB_env *env, int excl);
1609
1610 static EDB_node *edb_node_search(EDB_cursor *mc, EDB_val *key, int *exactp);
1611 static int edb_node_add(EDB_cursor *mc, indx_t indx,
1612 EDB_val *key, EDB_val *data, pgno_t pgno, unsigned int flags);
1613 static void edb_node_del(EDB_cursor *mc, int ksize);
1614 static void edb_node_shrink(EDB_page *mp, indx_t indx);
1615 static int edb_node_move(EDB_cursor *csrc, EDB_cursor *cdst, int fromleft);
1616 static int edb_node_read(EDB_cursor *mc, EDB_node *leaf, EDB_val *data);
1617 static size_t edb_leaf_size(EDB_env *env, EDB_val *key, EDB_val *data);
1618 static size_t edb_branch_size(EDB_env *env, EDB_val *key);
1619
1620 static int edb_rebalance(EDB_cursor *mc);
1621 static int edb_update_key(EDB_cursor *mc, EDB_val *key);
1622
1623 static void edb_cursor_pop(EDB_cursor *mc);
1624 static int edb_cursor_push(EDB_cursor *mc, EDB_page *mp);
1625
1626 static int edb_cursor_del0(EDB_cursor *mc);
1627 static int edb_del0(EDB_txn *txn, EDB_dbi dbi, EDB_val *key, EDB_val *data, unsigned flags);
1628 static int edb_cursor_sibling(EDB_cursor *mc, int move_right);
1629 static int edb_cursor_next(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op);
1630 static int edb_cursor_prev(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op);
1631 static int edb_cursor_set(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op,
1632 int *exactp);
1633 static int edb_cursor_first(EDB_cursor *mc, EDB_val *key, EDB_val *data);
1634 static int edb_cursor_last(EDB_cursor *mc, EDB_val *key, EDB_val *data);
1635
1636 static void edb_cursor_init(EDB_cursor *mc, EDB_txn *txn, EDB_dbi dbi, EDB_xcursor *mx);
1637 static void edb_xcursor_init0(EDB_cursor *mc);
1638 static void edb_xcursor_init1(EDB_cursor *mc, EDB_node *node);
1639 static void edb_xcursor_init2(EDB_cursor *mc, EDB_xcursor *src_mx, int force);
1640
1641 static int edb_drop0(EDB_cursor *mc, int subs);
1642 static void edb_default_cmp(EDB_txn *txn, EDB_dbi dbi);
1643 static int edb_reader_check0(EDB_env *env, int rlocked, int *dead);
1644
1645
1646 static EDB_cmp_func edb_cmp_memn, edb_cmp_memnr, edb_cmp_int, edb_cmp_cint, edb_cmp_long;
1647
1648
1649
1650 #ifdef MISALIGNED_OK
1651 # define edb_cmp_clong edb_cmp_long
1652 #else
1653 # define edb_cmp_clong edb_cmp_cint
1654 #endif
1655
1656
1657 #define NEED_CMP_CLONG(cmp, ksize) \
1658 (UINT_MAX < EDB_SIZE_MAX && \
1659 (cmp) == edb_cmp_int && (ksize) == sizeof(edb_size_t))
1660
1661 #ifdef _WIN32
1662 static SECURITY_DESCRIPTOR edb_null_sd;
1663 static SECURITY_ATTRIBUTES edb_all_sa;
1664 static int edb_sec_inited;
1665
1666 struct EDB_name;
1667 static int utf8_to_utf16(const char *src, struct EDB_name *dst, int xtra);
1668 #endif
1669
1670
1671 char * ESECT
1672 edb_version(int *major, int *minor, int *patch)
1673 {
1674 if (major) *major = EDB_VERSION_MAJOR;
1675 if (minor) *minor = EDB_VERSION_MINOR;
1676 if (patch) *patch = EDB_VERSION_PATCH;
1677 return EDB_VERSION_STRING;
1678 }
1679
1680
1681 static char *const edb_errstr[] = {
1682 "EDB_KEYEXIST: Key/data pair already exists",
1683 "EDB_NOTFOUND: No matching key/data pair found",
1684 "EDB_PAGE_NOTFOUND: Requested page not found",
1685 "EDB_CORRUPTED: Located page was wrong type",
1686 "EDB_PANIC: Update of meta page failed or environment had fatal error",
1687 "EDB_VERSION_MISMATCH: Database environment version mismatch",
1688 "EDB_INVALID: File is not an EXDB file",
1689 "EDB_MAP_FULL: Environment mapsize limit reached",
1690 "EDB_DBS_FULL: Environment maxdbs limit reached",
1691 "EDB_READERS_FULL: Environment maxreaders limit reached",
1692 "EDB_TLS_FULL: Thread-local storage keys full - too many environments open",
1693 "EDB_TXN_FULL: Transaction has too many dirty pages - transaction too big",
1694 "EDB_CURSOR_FULL: Internal error - cursor stack limit reached",
1695 "EDB_PAGE_FULL: Internal error - page has no more space",
1696 "EDB_MAP_RESIZED: Database contents grew beyond environment mapsize",
1697 "EDB_INCOMPATIBLE: Operation and DB incompatible, or DB flags changed",
1698 "EDB_BAD_RSLOT: Invalid reuse of reader locktable slot",
1699 "EDB_BAD_TXN: Transaction must abort, has a child, or is invalid",
1700 "EDB_BAD_VALSIZE: Unsupported size of key/DB name/data, or wrong DUPFIXED size",
1701 "EDB_BAD_DBI: The specified DBI handle was closed/changed unexpectedly",
1702 "EDB_PROBLEM: Unexpected problem - txn should abort",
1703 };
1704
1705 char *
1706 edb_strerror(int err)
1707 {
1708 #ifdef _WIN32
1709
1710
1711
1712
1713 #define MSGSIZE 1024
1714 #define PADSIZE 4096
1715 char buf[MSGSIZE+PADSIZE], *ptr = buf;
1716 #endif
1717 int i;
1718 if (!err)
1719 return ("Successful return: 0");
1720
1721 if (err >= EDB_KEYEXIST && err <= EDB_LAST_ERRCODE) {
1722 i = err - EDB_KEYEXIST;
1723 return edb_errstr[i];
1724 }
1725
1726 #ifdef _WIN32
1727
1728
1729
1730
1731
1732 switch(err) {
1733 case ENOENT:
1734 case EIO:
1735 case ENOMEM:
1736 case EACCES:
1737 case EBUSY:
1738 case EINVAL:
1739 case ENOSPC:
1740 return strerror(err);
1741 default:
1742 ;
1743 }
1744 buf[0] = 0;
1745 FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM |
1746 FORMAT_MESSAGE_IGNORE_INSERTS,
1747 NULL, err, 0, ptr, MSGSIZE, (va_list *)buf+MSGSIZE);
1748 return ptr;
1749 #else
1750 return strerror(err);
1751 #endif
1752 }
1753
1754
1755 #define edb_cassert(mc, expr) edb_assert0((mc)->mc_txn->mt_env, expr, #expr)
1756
1757 #define edb_tassert(txn, expr) edb_assert0((txn)->mt_env, expr, #expr)
1758
1759 #define edb_eassert(env, expr) edb_assert0(env, expr, #expr)
1760
1761 #ifndef NDEBUG
1762 # define edb_assert0(env, expr, expr_txt) ((expr) ? (void)0 : \
1763 edb_assert_fail(env, expr_txt, edb_func_, __FILE__, __LINE__))
1764
1765 static void ESECT
1766 edb_assert_fail(EDB_env *env, const char *expr_txt,
1767 const char *func, const char *file, int line)
1768 {
1769 char buf[400];
1770 sprintf(buf, "%.100s:%d: Assertion '%.200s' failed in %.40s()",
1771 file, line, expr_txt, func);
1772 if (env->me_assert_func)
1773 env->me_assert_func(env, buf);
1774 fprintf(stderr, "%s\n", buf);
1775 abort();
1776 }
1777 #else
1778 # define edb_assert0(env, expr, expr_txt) ((void) 0)
1779 #endif
1780
1781 #if EDB_DEBUG
1782
1783 static pgno_t
1784 edb_dbg_pgno(EDB_page *mp)
1785 {
1786 pgno_t ret;
1787 COPY_PGNO(ret, mp->mp_pgno);
1788 return ret;
1789 }
1790
1791
1792
1793
1794
1795
1796 char *
1797 edb_dkey(EDB_val *key, char *buf)
1798 {
1799 char *ptr = buf;
1800 unsigned char *c = key->mv_data;
1801 unsigned int i;
1802
1803 if (!key)
1804 return "";
1805
1806 if (key->mv_size > DKBUF_MAXKEYSIZE)
1807 return "EDB_MAXKEYSIZE";
1808
1809
1810
1811 #if 1
1812 buf[0] = '\0';
1813 for (i=0; i<key->mv_size; i++)
1814 ptr += sprintf(ptr, "%02x", *c++);
1815 #else
1816 sprintf(buf, "%.*s", key->mv_size, key->mv_data);
1817 #endif
1818 return buf;
1819 }
1820
1821 static const char *
1822 edb_leafnode_type(EDB_node *n)
1823 {
1824 static char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}};
1825 return F_ISSET(n->mn_flags, F_BIGDATA) ? ": overflow page" :
1826 tp[F_ISSET(n->mn_flags, F_DUPDATA)][F_ISSET(n->mn_flags, F_SUBDATA)];
1827 }
1828
1829
1830 void
1831 edb_page_list(EDB_page *mp)
1832 {
1833 pgno_t pgno = edb_dbg_pgno(mp);
1834 const char *type, *state = (mp->mp_flags & P_DIRTY) ? ", dirty" : "";
1835 EDB_node *node;
1836 unsigned int i, nkeys, nsize, total = 0;
1837 EDB_val key;
1838 DKBUF;
1839
1840 switch (mp->mp_flags & (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP)) {
1841 case P_BRANCH: type = "Branch page"; break;
1842 case P_LEAF: type = "Leaf page"; break;
1843 case P_LEAF|P_SUBP: type = "Sub-page"; break;
1844 case P_LEAF|P_LEAF2: type = "LEAF2 page"; break;
1845 case P_LEAF|P_LEAF2|P_SUBP: type = "LEAF2 sub-page"; break;
1846 case P_OVERFLOW:
1847 fprintf(stderr, "Overflow page %"Yu" pages %u%s\n",
1848 pgno, mp->mp_pages, state);
1849 return;
1850 case P_META:
1851 fprintf(stderr, "Meta-page %"Yu" txnid %"Yu"\n",
1852 pgno, ((EDB_meta *)METADATA(mp))->mm_txnid);
1853 return;
1854 default:
1855 fprintf(stderr, "Bad page %"Yu" flags 0x%X\n", pgno, mp->mp_flags);
1856 return;
1857 }
1858
1859 nkeys = NUMKEYS(mp);
1860 fprintf(stderr, "%s %"Yu" numkeys %d%s\n", type, pgno, nkeys, state);
1861
1862 for (i=0; i<nkeys; i++) {
1863 if (IS_LEAF2(mp)) {
1864 key.mv_size = nsize = mp->mp_pad;
1865 key.mv_data = LEAF2KEY(mp, i, nsize);
1866 total += nsize;
1867 fprintf(stderr, "key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
1868 continue;
1869 }
1870 node = NODEPTR(mp, i);
1871 key.mv_size = node->mn_ksize;
1872 key.mv_data = node->mn_data;
1873 nsize = NODESIZE + key.mv_size;
1874 if (IS_BRANCH(mp)) {
1875 fprintf(stderr, "key %d: page %"Yu", %s\n", i, NODEPGNO(node),
1876 DKEY(&key));
1877 total += nsize;
1878 } else {
1879 if (F_ISSET(node->mn_flags, F_BIGDATA))
1880 nsize += sizeof(pgno_t);
1881 else
1882 nsize += NODEDSZ(node);
1883 total += nsize;
1884 nsize += sizeof(indx_t);
1885 fprintf(stderr, "key %d: nsize %d, %s%s\n",
1886 i, nsize, DKEY(&key), edb_leafnode_type(node));
1887 }
1888 total = EVEN(total);
1889 }
1890 fprintf(stderr, "Total: header %d + contents %d + unused %d\n",
1891 IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, SIZELEFT(mp));
1892 }
1893
1894 void
1895 edb_cursor_chk(EDB_cursor *mc)
1896 {
1897 unsigned int i;
1898 EDB_node *node;
1899 EDB_page *mp;
1900
1901 if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return;
1902 for (i=0; i<mc->mc_top; i++) {
1903 mp = mc->mc_pg[i];
1904 node = NODEPTR(mp, mc->mc_ki[i]);
1905 if (NODEPGNO(node) != mc->mc_pg[i+1]->mp_pgno)
1906 printf("oops!\n");
1907 }
1908 if (mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))
1909 printf("ack!\n");
1910 if (XCURSOR_INITED(mc)) {
1911 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
1912 if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) &&
1913 mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) {
1914 printf("blah!\n");
1915 }
1916 }
1917 }
1918 #endif
1919
1920 #if (EDB_DEBUG) > 2
1921
1922
1923
1924
1925
1926 static void edb_audit(EDB_txn *txn)
1927 {
1928 EDB_cursor mc;
1929 EDB_val key, data;
1930 EDB_ID freecount, count;
1931 EDB_dbi i;
1932 int rc;
1933
1934 freecount = 0;
1935 edb_cursor_init(&mc, txn, FREE_DBI, NULL);
1936 while ((rc = edb_cursor_get(&mc, &key, &data, EDB_NEXT)) == 0)
1937 freecount += *(EDB_ID *)data.mv_data;
1938 edb_tassert(txn, rc == EDB_NOTFOUND);
1939
1940 count = 0;
1941 for (i = 0; i<txn->mt_nuedbs; i++) {
1942 EDB_xcursor mx;
1943 if (!(txn->mt_dbflags[i] & DB_VALID))
1944 continue;
1945 edb_cursor_init(&mc, txn, i, &mx);
1946 if (txn->mt_dbs[i].md_root == P_INVALID)
1947 continue;
1948 count += txn->mt_dbs[i].md_branch_pages +
1949 txn->mt_dbs[i].md_leaf_pages +
1950 txn->mt_dbs[i].md_overflow_pages;
1951 if (txn->mt_dbs[i].md_flags & EDB_DUPSORT) {
1952 rc = edb_page_search(&mc, NULL, EDB_PS_FIRST);
1953 for (; rc == EDB_SUCCESS; rc = edb_cursor_sibling(&mc, 1)) {
1954 unsigned j;
1955 EDB_page *mp;
1956 mp = mc.mc_pg[mc.mc_top];
1957 for (j=0; j<NUMKEYS(mp); j++) {
1958 EDB_node *leaf = NODEPTR(mp, j);
1959 if (leaf->mn_flags & F_SUBDATA) {
1960 EDB_db db;
1961 memcpy(&db, NODEDATA(leaf), sizeof(db));
1962 count += db.md_branch_pages + db.md_leaf_pages +
1963 db.md_overflow_pages;
1964 }
1965 }
1966 }
1967 edb_tassert(txn, rc == EDB_NOTFOUND);
1968 }
1969 }
1970 if (freecount + count + NUM_METAS != txn->mt_next_pgno) {
1971 fprintf(stderr, "audit: %"Yu" freecount: %"Yu" count: %"Yu" total: %"Yu" next_pgno: %"Yu"\n",
1972 txn->mt_txnid, freecount, count+NUM_METAS,
1973 freecount+count+NUM_METAS, txn->mt_next_pgno);
1974 }
1975 }
1976 #endif
1977
1978 int
1979 edb_cmp(EDB_txn *txn, EDB_dbi dbi, const EDB_val *a, const EDB_val *b)
1980 {
1981 return txn->mt_dbxs[dbi].md_cmp(a, b);
1982 }
1983
1984 int
1985 edb_dcmp(EDB_txn *txn, EDB_dbi dbi, const EDB_val *a, const EDB_val *b)
1986 {
1987 EDB_cmp_func *dcmp = txn->mt_dbxs[dbi].md_dcmp;
1988 if (NEED_CMP_CLONG(dcmp, a->mv_size))
1989 dcmp = edb_cmp_clong;
1990 return dcmp(a, b);
1991 }
1992
1993
1994
1995
1996
1997 static EDB_page *
1998 edb_page_malloc(EDB_txn *txn, unsigned num)
1999 {
2000 EDB_env *env = txn->mt_env;
2001 EDB_page *ret = env->me_dpages;
2002 size_t psize = env->me_psize, sz = psize, off;
2003
2004
2005
2006
2007
2008 if (num == 1) {
2009 if (ret) {
2010 VGMEMP_ALLOC(env, ret, sz);
2011 VGMEMP_DEFINED(ret, sizeof(ret->mp_next));
2012 env->me_dpages = ret->mp_next;
2013 return ret;
2014 }
2015 psize -= off = PAGEHDRSZ;
2016 } else {
2017 sz *= num;
2018 off = sz - psize;
2019 }
2020 if ((ret = malloc(sz)) != NULL) {
2021 VGMEMP_ALLOC(env, ret, sz);
2022 if (!(env->me_flags & EDB_NOMEMINIT)) {
2023 memset((char *)ret + off, 0, psize);
2024 ret->mp_pad = 0;
2025 }
2026 } else {
2027 txn->mt_flags |= EDB_TXN_ERROR;
2028 }
2029 return ret;
2030 }
2031
2032
2033
2034
2035 static void
2036 edb_page_free(EDB_env *env, EDB_page *mp)
2037 {
2038 mp->mp_next = env->me_dpages;
2039 VGMEMP_FREE(env, mp);
2040 env->me_dpages = mp;
2041 }
2042
2043
2044 static void
2045 edb_dpage_free(EDB_env *env, EDB_page *dp)
2046 {
2047 if (!IS_OVERFLOW(dp) || dp->mp_pages == 1) {
2048 edb_page_free(env, dp);
2049 } else {
2050
2051 VGMEMP_FREE(env, dp);
2052 free(dp);
2053 }
2054 }
2055
2056
2057 static void
2058 edb_dlist_free(EDB_txn *txn)
2059 {
2060 EDB_env *env = txn->mt_env;
2061 EDB_ID2L dl = txn->mt_u.dirty_list;
2062 unsigned i, n = dl[0].mid;
2063
2064 for (i = 1; i <= n; i++) {
2065 edb_dpage_free(env, dl[i].mptr);
2066 }
2067 dl[0].mid = 0;
2068 }
2069
2070 #ifdef EDB_VL32
2071 static void
2072 edb_page_unref(EDB_txn *txn, EDB_page *mp)
2073 {
2074 pgno_t pgno;
2075 EDB_ID3L tl = txn->mt_rpages;
2076 unsigned x, rem;
2077 if (mp->mp_flags & (P_SUBP|P_DIRTY))
2078 return;
2079 rem = mp->mp_pgno & (EDB_RPAGE_CHUNK-1);
2080 pgno = mp->mp_pgno ^ rem;
2081 x = edb_mid3l_search(tl, pgno);
2082 if (x != tl[0].mid && tl[x+1].mid == mp->mp_pgno)
2083 x++;
2084 if (tl[x].mref)
2085 tl[x].mref--;
2086 }
2087 #define EDB_PAGE_UNREF(txn, mp) edb_page_unref(txn, mp)
2088
2089 static void
2090 edb_cursor_unref(EDB_cursor *mc)
2091 {
2092 int i;
2093 if (mc->mc_txn->mt_rpages[0].mid) {
2094 if (!mc->mc_snum || !mc->mc_pg[0] || IS_SUBP(mc->mc_pg[0]))
2095 return;
2096 for (i=0; i<mc->mc_snum; i++)
2097 edb_page_unref(mc->mc_txn, mc->mc_pg[i]);
2098 if (mc->mc_ovpg) {
2099 edb_page_unref(mc->mc_txn, mc->mc_ovpg);
2100 mc->mc_ovpg = 0;
2101 }
2102 }
2103 mc->mc_snum = mc->mc_top = 0;
2104 mc->mc_pg[0] = NULL;
2105 mc->mc_flags &= ~C_INITIALIZED;
2106 }
2107 #define EDB_CURSOR_UNREF(mc, force) \
2108 (((force) || ((mc)->mc_flags & C_INITIALIZED)) \
2109 ? edb_cursor_unref(mc) \
2110 : (void)0)
2111
2112 #else
2113 #define EDB_PAGE_UNREF(txn, mp)
2114 #define EDB_CURSOR_UNREF(mc, force) ((void)0)
2115 #endif
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127 static int
2128 edb_page_loose(EDB_cursor *mc, EDB_page *mp)
2129 {
2130 int loose = 0;
2131 pgno_t pgno = mp->mp_pgno;
2132 EDB_txn *txn = mc->mc_txn;
2133
2134 if ((mp->mp_flags & P_DIRTY) && mc->mc_dbi != FREE_DBI) {
2135 if (txn->mt_parent) {
2136 EDB_ID2 *dl = txn->mt_u.dirty_list;
2137
2138
2139
2140 if (dl[0].mid) {
2141 unsigned x = edb_mid2l_search(dl, pgno);
2142 if (x <= dl[0].mid && dl[x].mid == pgno) {
2143 if (mp != dl[x].mptr) {
2144 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
2145 txn->mt_flags |= EDB_TXN_ERROR;
2146 return EDB_PROBLEM;
2147 }
2148
2149 loose = 1;
2150 }
2151 }
2152 } else {
2153
2154 loose = 1;
2155 }
2156 }
2157 if (loose) {
2158 DPRINTF(("loosen db %d page %"Yu, DDBI(mc), mp->mp_pgno));
2159 NEXT_LOOSE_PAGE(mp) = txn->mt_loose_pgs;
2160 txn->mt_loose_pgs = mp;
2161 txn->mt_loose_count++;
2162 mp->mp_flags |= P_LOOSE;
2163 } else {
2164 int rc = edb_eidl_append(&txn->mt_free_pgs, pgno);
2165 if (rc)
2166 return rc;
2167 }
2168
2169 return EDB_SUCCESS;
2170 }
2171
2172
2173
2174
2175
2176
2177
2178
2179 static int
2180 edb_pages_xkeep(EDB_cursor *mc, unsigned pflags, int all)
2181 {
2182 enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP };
2183 EDB_txn *txn = mc->mc_txn;
2184 EDB_cursor *m3, *m0 = mc;
2185 EDB_xcursor *mx;
2186 EDB_page *dp, *mp;
2187 EDB_node *leaf;
2188 unsigned i, j;
2189 int rc = EDB_SUCCESS, level;
2190
2191
2192 for (i = txn->mt_nuedbs;; ) {
2193 if (mc->mc_flags & C_INITIALIZED) {
2194 for (m3 = mc;; m3 = &mx->mx_cursor) {
2195 mp = NULL;
2196 for (j=0; j<m3->mc_snum; j++) {
2197 mp = m3->mc_pg[j];
2198 if ((mp->mp_flags & Mask) == pflags)
2199 mp->mp_flags ^= P_KEEP;
2200 }
2201 mx = m3->mc_xcursor;
2202
2203 if (! (mx && (mx->mx_cursor.mc_flags & C_INITIALIZED)))
2204 break;
2205 if (! (mp && (mp->mp_flags & P_LEAF)))
2206 break;
2207 leaf = NODEPTR(mp, m3->mc_ki[j-1]);
2208 if (!(leaf->mn_flags & F_SUBDATA))
2209 break;
2210 }
2211 }
2212 mc = mc->mc_next;
2213 for (; !mc || mc == m0; mc = txn->mt_cursors[--i])
2214 if (i == 0)
2215 goto mark_done;
2216 }
2217
2218 mark_done:
2219 if (all) {
2220
2221 for (i=0; i<txn->mt_nuedbs; i++) {
2222 if (txn->mt_dbflags[i] & DB_DIRTY) {
2223 pgno_t pgno = txn->mt_dbs[i].md_root;
2224 if (pgno == P_INVALID)
2225 continue;
2226 if ((rc = edb_page_get(m0, pgno, &dp, &level)) != EDB_SUCCESS)
2227 break;
2228 if ((dp->mp_flags & Mask) == pflags && level <= 1)
2229 dp->mp_flags ^= P_KEEP;
2230 }
2231 }
2232 }
2233
2234 return rc;
2235 }
2236
2237 static int edb_page_flush(EDB_txn *txn, int keep);
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271 static int
2272 edb_page_spill(EDB_cursor *m0, EDB_val *key, EDB_val *data)
2273 {
2274 EDB_txn *txn = m0->mc_txn;
2275 EDB_page *dp;
2276 EDB_ID2L dl = txn->mt_u.dirty_list;
2277 unsigned int i, j, need;
2278 int rc;
2279
2280 if (m0->mc_flags & C_SUB)
2281 return EDB_SUCCESS;
2282
2283
2284 i = m0->mc_db->md_depth;
2285
2286 if (m0->mc_dbi >= CORE_DBS)
2287 i += txn->mt_dbs[MAIN_DBI].md_depth;
2288
2289 if (key)
2290 i += (LEAFSIZE(key, data) + txn->mt_env->me_psize) / txn->mt_env->me_psize;
2291 i += i;
2292 need = i;
2293
2294 if (txn->mt_dirty_room > i)
2295 return EDB_SUCCESS;
2296
2297 if (!txn->mt_spill_pgs) {
2298 txn->mt_spill_pgs = edb_eidl_alloc(EDB_IDL_UM_MAX);
2299 if (!txn->mt_spill_pgs)
2300 return ENOMEM;
2301 } else {
2302
2303 EDB_IDL sl = txn->mt_spill_pgs;
2304 unsigned int num = sl[0];
2305 j=0;
2306 for (i=1; i<=num; i++) {
2307 if (!(sl[i] & 1))
2308 sl[++j] = sl[i];
2309 }
2310 sl[0] = j;
2311 }
2312
2313
2314 if ((rc = edb_pages_xkeep(m0, P_DIRTY, 1)) != EDB_SUCCESS)
2315 goto done;
2316
2317
2318
2319
2320
2321
2322
2323
2324 if (need < EDB_IDL_UM_MAX / 8)
2325 need = EDB_IDL_UM_MAX / 8;
2326
2327
2328
2329 for (i=dl[0].mid; i && need; i--) {
2330 EDB_ID pn = dl[i].mid << 1;
2331 dp = dl[i].mptr;
2332 if (dp->mp_flags & (P_LOOSE|P_KEEP))
2333 continue;
2334
2335
2336
2337 if (txn->mt_parent) {
2338 EDB_txn *tx2;
2339 for (tx2 = txn->mt_parent; tx2; tx2 = tx2->mt_parent) {
2340 if (tx2->mt_spill_pgs) {
2341 j = edb_eidl_search(tx2->mt_spill_pgs, pn);
2342 if (j <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[j] == pn) {
2343 dp->mp_flags |= P_KEEP;
2344 break;
2345 }
2346 }
2347 }
2348 if (tx2)
2349 continue;
2350 }
2351 if ((rc = edb_eidl_append(&txn->mt_spill_pgs, pn)))
2352 goto done;
2353 need--;
2354 }
2355 edb_eidl_sort(txn->mt_spill_pgs);
2356
2357
2358 if ((rc = edb_page_flush(txn, i)) != EDB_SUCCESS)
2359 goto done;
2360
2361
2362 rc = edb_pages_xkeep(m0, P_DIRTY|P_KEEP, i);
2363
2364 done:
2365 txn->mt_flags |= rc ? EDB_TXN_ERROR : EDB_TXN_SPILLS;
2366 return rc;
2367 }
2368
2369
2370 static txnid_t
2371 edb_find_oldest(EDB_txn *txn)
2372 {
2373 int i;
2374 txnid_t mr, oldest = txn->mt_txnid - 1;
2375 if (txn->mt_env->me_txns) {
2376 EDB_reader *r = txn->mt_env->me_txns->mti_readers;
2377 for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) {
2378 if (r[i].mr_pid) {
2379 mr = r[i].mr_txnid;
2380 if (oldest > mr)
2381 oldest = mr;
2382 }
2383 }
2384 }
2385 return oldest;
2386 }
2387
2388
2389 static void
2390 edb_page_dirty(EDB_txn *txn, EDB_page *mp)
2391 {
2392 EDB_ID2 mid;
2393 int rc, (*insert)(EDB_ID2L, EDB_ID2 *);
2394
2395 if (txn->mt_flags & EDB_TXN_WRITEMAP) {
2396 insert = edb_mid2l_append;
2397 } else {
2398 insert = edb_mid2l_insert;
2399 }
2400 mid.mid = mp->mp_pgno;
2401 mid.mptr = mp;
2402 rc = insert(txn->mt_u.dirty_list, &mid);
2403 edb_tassert(txn, rc == 0);
2404 txn->mt_dirty_room--;
2405 }
2406
2407
2408
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424 static int
2425 edb_page_alloc(EDB_cursor *mc, int num, EDB_page **mp)
2426 {
2427 #ifdef EDB_PARANOID
2428
2429
2430
2431
2432
2433
2434 enum { Paranoid = 1, Max_retries = 500 };
2435 #else
2436 enum { Paranoid = 0, Max_retries = INT_MAX };
2437 #endif
2438 int rc, retry = num * 60;
2439 EDB_txn *txn = mc->mc_txn;
2440 EDB_env *env = txn->mt_env;
2441 pgno_t pgno, *mop = env->me_pghead;
2442 unsigned i, j, mop_len = mop ? mop[0] : 0, n2 = num-1;
2443 EDB_page *np;
2444 txnid_t oldest = 0, last;
2445 EDB_cursor_op op;
2446 EDB_cursor m2;
2447 int found_old = 0;
2448
2449
2450 if (num == 1 && txn->mt_loose_pgs) {
2451 np = txn->mt_loose_pgs;
2452 txn->mt_loose_pgs = NEXT_LOOSE_PAGE(np);
2453 txn->mt_loose_count--;
2454 DPRINTF(("db %d use loose page %"Yu, DDBI(mc), np->mp_pgno));
2455 *mp = np;
2456 return EDB_SUCCESS;
2457 }
2458
2459 *mp = NULL;
2460
2461
2462 if (txn->mt_dirty_room == 0) {
2463 rc = EDB_TXN_FULL;
2464 goto fail;
2465 }
2466
2467 for (op = EDB_FIRST;; op = EDB_NEXT) {
2468 EDB_val key, data;
2469 EDB_node *leaf;
2470 pgno_t *idl;
2471
2472
2473
2474
2475 if (mop_len > n2) {
2476 i = mop_len;
2477 do {
2478 pgno = mop[i];
2479 if (mop[i-n2] == pgno+n2)
2480 goto search_done;
2481 } while (--i > n2);
2482 if (--retry < 0)
2483 break;
2484 }
2485
2486 if (op == EDB_FIRST) {
2487
2488 last = env->me_pglast;
2489 oldest = env->me_pgoldest;
2490 edb_cursor_init(&m2, txn, FREE_DBI, NULL);
2491 #if (EDB_DEVEL) & 2
2492
2493
2494
2495 m2.mc_flags |= C_ORIG_RDONLY;
2496 m2.mc_db = &env->me_metas[(txn->mt_txnid-1) & 1]->mm_dbs[FREE_DBI];
2497 m2.mc_dbflag = (unsigned char *)"";
2498 #endif
2499 if (last) {
2500 op = EDB_SET_RANGE;
2501 key.mv_data = &last;
2502 key.mv_size = sizeof(last);
2503 }
2504 if (Paranoid && mc->mc_dbi == FREE_DBI)
2505 retry = -1;
2506 }
2507 if (Paranoid && retry < 0 && mop_len)
2508 break;
2509
2510 last++;
2511
2512 if (oldest <= last) {
2513 if (!found_old) {
2514 oldest = edb_find_oldest(txn);
2515 env->me_pgoldest = oldest;
2516 found_old = 1;
2517 }
2518 if (oldest <= last)
2519 break;
2520 }
2521 rc = edb_cursor_get(&m2, &key, NULL, op);
2522 if (rc) {
2523 if (rc == EDB_NOTFOUND)
2524 break;
2525 goto fail;
2526 }
2527 last = *(txnid_t*)key.mv_data;
2528 if (oldest <= last) {
2529 if (!found_old) {
2530 oldest = edb_find_oldest(txn);
2531 env->me_pgoldest = oldest;
2532 found_old = 1;
2533 }
2534 if (oldest <= last)
2535 break;
2536 }
2537 np = m2.mc_pg[m2.mc_top];
2538 leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]);
2539 if ((rc = edb_node_read(&m2, leaf, &data)) != EDB_SUCCESS)
2540 goto fail;
2541
2542 idl = (EDB_ID *) data.mv_data;
2543 i = idl[0];
2544 if (!mop) {
2545 if (!(env->me_pghead = mop = edb_eidl_alloc(i))) {
2546 rc = ENOMEM;
2547 goto fail;
2548 }
2549 } else {
2550 if ((rc = edb_eidl_need(&env->me_pghead, i)) != 0)
2551 goto fail;
2552 mop = env->me_pghead;
2553 }
2554 env->me_pglast = last;
2555 #if (EDB_DEBUG) > 1
2556 DPRINTF(("IDL read txn %"Yu" root %"Yu" num %u",
2557 last, txn->mt_dbs[FREE_DBI].md_root, i));
2558 for (j = i; j; j--)
2559 DPRINTF(("IDL %"Yu, idl[j]));
2560 #endif
2561
2562 edb_eidl_xmerge(mop, idl);
2563 mop_len = mop[0];
2564 }
2565
2566
2567 i = 0;
2568 pgno = txn->mt_next_pgno;
2569 if (pgno + num >= env->me_maxpg) {
2570 DPUTS("DB size maxed out");
2571 rc = EDB_MAP_FULL;
2572 goto fail;
2573 }
2574 #if defined(_WIN32) && !defined(EDB_VL32)
2575 if (!(env->me_flags & EDB_RDONLY)) {
2576 void *p;
2577 p = (EDB_page *)(env->me_map + env->me_psize * pgno);
2578 p = VirtualAlloc(p, env->me_psize * num, MEM_COMMIT,
2579 (env->me_flags & EDB_WRITEMAP) ? PAGE_READWRITE:
2580 PAGE_READONLY);
2581 if (!p) {
2582 DPUTS("VirtualAlloc failed");
2583 rc = ErrCode();
2584 goto fail;
2585 }
2586 }
2587 #endif
2588
2589 search_done:
2590 if (env->me_flags & EDB_WRITEMAP) {
2591 np = (EDB_page *)(env->me_map + env->me_psize * pgno);
2592 } else {
2593 if (!(np = edb_page_malloc(txn, num))) {
2594 rc = ENOMEM;
2595 goto fail;
2596 }
2597 }
2598 if (i) {
2599 mop[0] = mop_len -= num;
2600
2601 for (j = i-num; j < mop_len; )
2602 mop[++j] = mop[++i];
2603 } else {
2604 txn->mt_next_pgno = pgno + num;
2605 }
2606 np->mp_pgno = pgno;
2607 edb_page_dirty(txn, np);
2608 *mp = np;
2609
2610 return EDB_SUCCESS;
2611
2612 fail:
2613 txn->mt_flags |= EDB_TXN_ERROR;
2614 return rc;
2615 }
2616
2617
2618
2619
2620
2621
2622 static void
2623 edb_page_copy(EDB_page *dst, EDB_page *src, unsigned int psize)
2624 {
2625 enum { Align = sizeof(pgno_t) };
2626 indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper-lower;
2627
2628
2629
2630
2631 if ((unused &= -Align) && !IS_LEAF2(src)) {
2632 upper = (upper + PAGEBASE) & -Align;
2633 memcpy(dst, src, (lower + PAGEBASE + (Align-1)) & -Align);
2634 memcpy((pgno_t *)((char *)dst+upper), (pgno_t *)((char *)src+upper),
2635 psize - upper);
2636 } else {
2637 memcpy(dst, src, psize - unused);
2638 }
2639 }
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649 static int
2650 edb_page_unspill(EDB_txn *txn, EDB_page *mp, EDB_page **ret)
2651 {
2652 EDB_env *env = txn->mt_env;
2653 const EDB_txn *tx2;
2654 unsigned x;
2655 pgno_t pgno = mp->mp_pgno, pn = pgno << 1;
2656
2657 for (tx2 = txn; tx2; tx2=tx2->mt_parent) {
2658 if (!tx2->mt_spill_pgs)
2659 continue;
2660 x = edb_eidl_search(tx2->mt_spill_pgs, pn);
2661 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
2662 EDB_page *np;
2663 int num;
2664 if (txn->mt_dirty_room == 0)
2665 return EDB_TXN_FULL;
2666 if (IS_OVERFLOW(mp))
2667 num = mp->mp_pages;
2668 else
2669 num = 1;
2670 if (env->me_flags & EDB_WRITEMAP) {
2671 np = mp;
2672 } else {
2673 np = edb_page_malloc(txn, num);
2674 if (!np)
2675 return ENOMEM;
2676 if (num > 1)
2677 memcpy(np, mp, num * env->me_psize);
2678 else
2679 edb_page_copy(np, mp, env->me_psize);
2680 }
2681 if (tx2 == txn) {
2682
2683
2684
2685
2686 if (x == txn->mt_spill_pgs[0])
2687 txn->mt_spill_pgs[0]--;
2688 else
2689 txn->mt_spill_pgs[x] |= 1;
2690 }
2691
2692
2693
2694 edb_page_dirty(txn, np);
2695 np->mp_flags |= P_DIRTY;
2696 *ret = np;
2697 break;
2698 }
2699 }
2700 return EDB_SUCCESS;
2701 }
2702
2703
2704
2705
2706
2707
2708 static int
2709 edb_page_touch(EDB_cursor *mc)
2710 {
2711 EDB_page *mp = mc->mc_pg[mc->mc_top], *np;
2712 EDB_txn *txn = mc->mc_txn;
2713 EDB_cursor *m2, *m3;
2714 pgno_t pgno;
2715 int rc;
2716
2717 if (!F_ISSET(mp->mp_flags, P_DIRTY)) {
2718 if (txn->mt_flags & EDB_TXN_SPILLS) {
2719 np = NULL;
2720 rc = edb_page_unspill(txn, mp, &np);
2721 if (rc)
2722 goto fail;
2723 if (np)
2724 goto done;
2725 }
2726 if ((rc = edb_eidl_need(&txn->mt_free_pgs, 1)) ||
2727 (rc = edb_page_alloc(mc, 1, &np)))
2728 goto fail;
2729 pgno = np->mp_pgno;
2730 DPRINTF(("touched db %d page %"Yu" -> %"Yu, DDBI(mc),
2731 mp->mp_pgno, pgno));
2732 edb_cassert(mc, mp->mp_pgno != pgno);
2733 edb_eidl_xappend(txn->mt_free_pgs, mp->mp_pgno);
2734
2735 if (mc->mc_top) {
2736 EDB_page *parent = mc->mc_pg[mc->mc_top-1];
2737 EDB_node *node = NODEPTR(parent, mc->mc_ki[mc->mc_top-1]);
2738 SETPGNO(node, pgno);
2739 } else {
2740 mc->mc_db->md_root = pgno;
2741 }
2742 } else if (txn->mt_parent && !IS_SUBP(mp)) {
2743 EDB_ID2 mid, *dl = txn->mt_u.dirty_list;
2744 pgno = mp->mp_pgno;
2745
2746
2747
2748 if (dl[0].mid) {
2749 unsigned x = edb_mid2l_search(dl, pgno);
2750 if (x <= dl[0].mid && dl[x].mid == pgno) {
2751 if (mp != dl[x].mptr) {
2752 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
2753 txn->mt_flags |= EDB_TXN_ERROR;
2754 return EDB_PROBLEM;
2755 }
2756 return 0;
2757 }
2758 }
2759 edb_cassert(mc, dl[0].mid < EDB_IDL_UM_MAX);
2760
2761 np = edb_page_malloc(txn, 1);
2762 if (!np)
2763 return ENOMEM;
2764 mid.mid = pgno;
2765 mid.mptr = np;
2766 rc = edb_mid2l_insert(dl, &mid);
2767 edb_cassert(mc, rc == 0);
2768 } else {
2769 return 0;
2770 }
2771
2772 edb_page_copy(np, mp, txn->mt_env->me_psize);
2773 np->mp_pgno = pgno;
2774 np->mp_flags |= P_DIRTY;
2775
2776 done:
2777
2778 mc->mc_pg[mc->mc_top] = np;
2779 m2 = txn->mt_cursors[mc->mc_dbi];
2780 if (mc->mc_flags & C_SUB) {
2781 for (; m2; m2=m2->mc_next) {
2782 m3 = &m2->mc_xcursor->mx_cursor;
2783 if (m3->mc_snum < mc->mc_snum) continue;
2784 if (m3->mc_pg[mc->mc_top] == mp)
2785 m3->mc_pg[mc->mc_top] = np;
2786 }
2787 } else {
2788 for (; m2; m2=m2->mc_next) {
2789 if (m2->mc_snum < mc->mc_snum) continue;
2790 if (m2 == mc) continue;
2791 if (m2->mc_pg[mc->mc_top] == mp) {
2792 m2->mc_pg[mc->mc_top] = np;
2793 if (IS_LEAF(np))
2794 XCURSOR_REFRESH(m2, mc->mc_top, np);
2795 }
2796 }
2797 }
2798 EDB_PAGE_UNREF(mc->mc_txn, mp);
2799 return 0;
2800
2801 fail:
2802 txn->mt_flags |= EDB_TXN_ERROR;
2803 return rc;
2804 }
2805
2806 int
2807 edb_env_sync0(EDB_env *env, int force, pgno_t numpgs)
2808 {
2809 int rc = 0;
2810 if (env->me_flags & EDB_RDONLY)
2811 return EACCES;
2812 if (force || !F_ISSET(env->me_flags, EDB_NOSYNC)) {
2813 if (env->me_flags & EDB_WRITEMAP) {
2814 int flags = ((env->me_flags & EDB_MAPASYNC) && !force)
2815 ? MS_ASYNC : MS_SYNC;
2816 if (EDB_MSYNC(env->me_map, env->me_psize * numpgs, flags))
2817 rc = ErrCode();
2818 #ifdef _WIN32
2819 else if (flags == MS_SYNC && EDB_FDATASYNC(env->me_fd))
2820 rc = ErrCode();
2821 #endif
2822 } else {
2823 #ifdef BROKEN_FDATASYNC
2824 if (env->me_flags & EDB_FSYNCONLY) {
2825 if (fsync(env->me_fd))
2826 rc = ErrCode();
2827 } else
2828 #endif
2829 if (EDB_FDATASYNC(env->me_fd))
2830 rc = ErrCode();
2831 }
2832 }
2833 return rc;
2834 }
2835
2836 int
2837 edb_env_sync(EDB_env *env, int force)
2838 {
2839 EDB_meta *m = edb_env_pick_meta(env);
2840 return edb_env_sync0(env, force, m->mm_last_pg+1);
2841 }
2842
2843
2844 static int
2845 edb_cursor_shadow(EDB_txn *src, EDB_txn *dst)
2846 {
2847 EDB_cursor *mc, *bk;
2848 EDB_xcursor *mx;
2849 size_t size;
2850 int i;
2851
2852 for (i = src->mt_nuedbs; --i >= 0; ) {
2853 if ((mc = src->mt_cursors[i]) != NULL) {
2854 size = sizeof(EDB_cursor);
2855 if (mc->mc_xcursor)
2856 size += sizeof(EDB_xcursor);
2857 for (; mc; mc = bk->mc_next) {
2858 bk = malloc(size);
2859 if (!bk)
2860 return ENOMEM;
2861 *bk = *mc;
2862 mc->mc_backup = bk;
2863 mc->mc_db = &dst->mt_dbs[i];
2864
2865
2866
2867
2868 mc->mc_txn = dst;
2869 mc->mc_dbflag = &dst->mt_dbflags[i];
2870 if ((mx = mc->mc_xcursor) != NULL) {
2871 *(EDB_xcursor *)(bk+1) = *mx;
2872 mx->mx_cursor.mc_txn = dst;
2873 }
2874 mc->mc_next = dst->mt_cursors[i];
2875 dst->mt_cursors[i] = mc;
2876 }
2877 }
2878 }
2879 return EDB_SUCCESS;
2880 }
2881
2882
2883
2884
2885
2886
2887 static void
2888 edb_cursors_close(EDB_txn *txn, unsigned merge)
2889 {
2890 EDB_cursor **cursors = txn->mt_cursors, *mc, *next, *bk;
2891 EDB_xcursor *mx;
2892 int i;
2893
2894 for (i = txn->mt_nuedbs; --i >= 0; ) {
2895 for (mc = cursors[i]; mc; mc = next) {
2896 next = mc->mc_next;
2897 if ((bk = mc->mc_backup) != NULL) {
2898 if (merge) {
2899
2900 mc->mc_next = bk->mc_next;
2901 mc->mc_backup = bk->mc_backup;
2902 mc->mc_txn = bk->mc_txn;
2903 mc->mc_db = bk->mc_db;
2904 mc->mc_dbflag = bk->mc_dbflag;
2905 if ((mx = mc->mc_xcursor) != NULL)
2906 mx->mx_cursor.mc_txn = bk->mc_txn;
2907 } else {
2908
2909 *mc = *bk;
2910 if ((mx = mc->mc_xcursor) != NULL)
2911 *mx = *(EDB_xcursor *)(bk+1);
2912 }
2913 mc = bk;
2914 }
2915
2916 free(mc);
2917 }
2918 cursors[i] = NULL;
2919 }
2920 }
2921
/* Operations accepted by edb_reader_pid(). When EDB_PIDLOCK is enabled the
 * values are the fcntl() commands themselves, so they can be passed to
 * fcntl() directly; otherwise they are plain tags. */
#if EDB_PIDLOCK
enum Pidlock_op {
	Pidset = F_SETLK, Pidcheck = F_GETLK
};
#else
enum Pidlock_op {
	Pidset, Pidcheck
};
#endif
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940 static int
2941 edb_reader_pid(EDB_env *env, enum Pidlock_op op, EDB_PID_T pid)
2942 {
2943 #if !(EDB_PIDLOCK)
2944 int ret = 0;
2945 HANDLE h;
2946 if (op == Pidcheck) {
2947 h = OpenProcess(env->me_pidquery, FALSE, pid);
2948
2949 if (!h)
2950 return ErrCode() != ERROR_INVALID_PARAMETER;
2951
2952 ret = WaitForSingleObject(h, 0) != 0;
2953 CloseHandle(h);
2954 }
2955 return ret;
2956 #else
2957 for (;;) {
2958 int rc;
2959 struct flock lock_info;
2960 memset(&lock_info, 0, sizeof(lock_info));
2961 lock_info.l_type = F_WRLCK;
2962 lock_info.l_whence = SEEK_SET;
2963 lock_info.l_start = pid;
2964 lock_info.l_len = 1;
2965 if ((rc = fcntl(env->me_lfd, op, &lock_info)) == 0) {
2966 if (op == F_GETLK && lock_info.l_type != F_UNLCK)
2967 rc = -1;
2968 } else if ((rc = ErrCode()) == EINTR) {
2969 continue;
2970 }
2971 return rc;
2972 }
2973 #endif
2974 }
2975
2976
2977
2978
2979
/** Common worker that (re)initialises a transaction handle.
 *
 * Read-only handles obtain a reader slot (unless env->me_txns is NULL) and
 * record the snapshot txnid. Write handles take the writer mutex (when
 * env->me_txns is set), bump the txnid and reset the per-txn page lists.
 * Both then copy the core DB records from the chosen meta page and reset
 * the per-DBI flags.
 *
 * @param txn  handle to set up; txn->mt_env and the EDB_TXN_RDONLY bit of
 *             txn->mt_flags must already be valid.
 * @return EDB_SUCCESS, EDB_BAD_RSLOT, EDB_READERS_FULL, EDB_PANIC,
 *         EDB_MAP_RESIZED, or an error from the lock/TLS helpers.
 */
static int
edb_txn_renew0(EDB_txn *txn)
{
	EDB_env *env = txn->mt_env;
	EDB_txninfo *ti = env->me_txns;
	EDB_meta *meta;
	unsigned int i, nr, flags = txn->mt_flags;
	uint16_t x;
	int rc, new_notls = 0;

	/* From here on `flags` keeps only the read-only bit of the old flags. */
	if ((flags &= EDB_TXN_RDONLY) != 0) {
		if (!ti) {
			/* No shared txn info: use the meta page directly, no reader slot. */
			meta = edb_env_pick_meta(env);
			txn->mt_txnid = meta->mm_txnid;
			txn->mt_u.reader = NULL;
		} else {
			/* Under EDB_NOTLS the slot is kept in the txn, else in thread-specific data. */
			EDB_reader *r = (env->me_flags & EDB_NOTLS) ? txn->mt_u.reader :
				pthread_getspecific(env->me_txkey);
			if (r) {
				/* A reused slot must be ours and must not hold a snapshot
				 * (mr_txnid is set to (txnid_t)-1 when a slot is idle). */
				if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
					return EDB_BAD_RSLOT;
			} else {
				EDB_PID_T pid = env->me_pid;
				EDB_THR_T tid = pthread_self();
				edb_mutexref_t rmutex = env->me_rmutex;

				/* Register this process once per environment handle. */
				if (!env->me_live_reader) {
					rc = edb_reader_pid(env, Pidset, pid);
					if (rc)
						return rc;
					env->me_live_reader = 1;
				}

				if (LOCK_MUTEX(rc, env, rmutex))
					return rc;
				/* Find the first slot with mr_pid == 0, or the end of the used range. */
				nr = ti->mti_numreaders;
				for (i=0; i<nr; i++)
					if (ti->mti_readers[i].mr_pid == 0)
						break;
				if (i == env->me_maxreaders) {
					UNLOCK_MUTEX(rmutex);
					return EDB_READERS_FULL;
				}
				r = &ti->mti_readers[i];
				/* Claim the slot: mr_pid is zeroed first and written last,
				 * after mr_txnid and mr_tid have been initialised.
				 * NOTE(review): presumably so other processes never see a
				 * slot marked owned with stale contents - confirm. */
				r->mr_pid = 0;
				r->mr_txnid = (txnid_t)-1;
				r->mr_tid = tid;
				if (i == nr)
					ti->mti_numreaders = ++nr;
				env->me_close_readers = nr;
				r->mr_pid = pid;
				UNLOCK_MUTEX(rmutex);

				new_notls = (env->me_flags & EDB_NOTLS);
				if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
					/* Could not remember the slot: release it again. */
					r->mr_pid = 0;
					return rc;
				}
			}
			/* Publish the snapshot id; repeat until it matches a re-read of mti_txnid. */
			do
				r->mr_txnid = ti->mti_txnid;
			while(r->mr_txnid != ti->mti_txnid);
			txn->mt_txnid = r->mr_txnid;
			txn->mt_u.reader = r;
			meta = env->me_metas[txn->mt_txnid & 1];
		}

	} else {
		/* Write transaction. */
		if (ti) {
			if (LOCK_MUTEX(rc, env, env->me_wmutex))
				return rc;
			txn->mt_txnid = ti->mti_txnid;
			meta = env->me_metas[txn->mt_txnid & 1];
		} else {
			meta = edb_env_pick_meta(env);
			txn->mt_txnid = meta->mm_txnid;
		}
		/* The new write txn gets the next id after the one just read. */
		txn->mt_txnid++;
#if EDB_DEBUG
		if (txn->mt_txnid == edb_debug_start)
			edb_debug = 1;
#endif
		txn->mt_child = NULL;
		txn->mt_loose_pgs = NULL;
		txn->mt_loose_count = 0;
		txn->mt_dirty_room = EDB_IDL_UM_MAX;
		/* The dirty list and free-page list are borrowed from the env and emptied. */
		txn->mt_u.dirty_list = env->me_dirty_list;
		txn->mt_u.dirty_list[0].mid = 0;
		txn->mt_free_pgs = env->me_free_pgs;
		txn->mt_free_pgs[0] = 0;
		txn->mt_spill_pgs = NULL;
		env->me_txn = txn;
		memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned int));
	}

	/* Copy the core DB records from the meta page. */
	memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(EDB_db));

	/* Next page to allocate is the one after the meta page's last page. */
	txn->mt_next_pgno = meta->mm_last_pg+1;
#ifdef EDB_VL32
	txn->mt_last_pgno = txn->mt_next_pgno - 1;
#endif

	txn->mt_flags = flags;

	/* Reset per-DBI state: named DBs are marked stale if the env knows them. */
	txn->mt_nuedbs = env->me_nuedbs;
	for (i=CORE_DBS; i<txn->mt_nuedbs; i++) {
		x = env->me_dbflags[i];
		txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
		txn->mt_dbflags[i] = (x & EDB_VALID) ? DB_VALID|DB_USRVALID|DB_STALE : 0;
	}
	txn->mt_dbflags[MAIN_DBI] = DB_VALID|DB_USRVALID;
	txn->mt_dbflags[FREE_DBI] = DB_VALID;

	if (env->me_flags & EDB_FATAL_ERROR) {
		DPUTS("environment had fatal error, must shutdown!");
		rc = EDB_PANIC;
	} else if (env->me_maxpg < txn->mt_next_pgno) {
		/* The data file holds more pages than the current map covers. */
		rc = EDB_MAP_RESIZED;
	} else {
		return EDB_SUCCESS;
	}
	/* Failure after setup: undo it. new_notls is either 0 or the EDB_NOTLS
	 * bit, which is OR-ed into the end mode as-is. */
	edb_txn_end(txn, new_notls | EDB_END_FAIL_BEGIN);
	return rc;
}
3114
3115 int
3116 edb_txn_renew(EDB_txn *txn)
3117 {
3118 int rc;
3119
3120 if (!txn || !F_ISSET(txn->mt_flags, EDB_TXN_RDONLY|EDB_TXN_FINISHED))
3121 return EINVAL;
3122
3123 rc = edb_txn_renew0(txn);
3124 if (rc == EDB_SUCCESS) {
3125 DPRINTF(("renew txn %"Yu"%c %p on edbenv %p, root page %"Yu,
3126 txn->mt_txnid, (txn->mt_flags & EDB_TXN_RDONLY) ? 'r' : 'w',
3127 (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root));
3128 }
3129 return rc;
3130 }
3131
3132 int
3133 edb_txn_begin(EDB_env *env, EDB_txn *parent, unsigned int flags, EDB_txn **ret)
3134 {
3135 EDB_txn *txn;
3136 EDB_ntxn *ntxn;
3137 int rc, size, tsize;
3138
3139 flags &= EDB_TXN_BEGIN_FLAGS;
3140 flags |= env->me_flags & EDB_WRITEMAP;
3141
3142 if (env->me_flags & EDB_RDONLY & ~flags)
3143 return EACCES;
3144
3145 if (parent) {
3146
3147 flags |= parent->mt_flags;
3148 if (flags & (EDB_RDONLY|EDB_WRITEMAP|EDB_TXN_BLOCKED)) {
3149 return (parent->mt_flags & EDB_TXN_RDONLY) ? EINVAL : EDB_BAD_TXN;
3150 }
3151
3152 size = env->me_maxdbs * (sizeof(EDB_db)+sizeof(EDB_cursor *)+1);
3153 size += tsize = sizeof(EDB_ntxn);
3154 } else if (flags & EDB_RDONLY) {
3155 size = env->me_maxdbs * (sizeof(EDB_db)+1);
3156 size += tsize = sizeof(EDB_txn);
3157 } else {
3158
3159
3160
3161 txn = env->me_txn0;
3162 goto renew;
3163 }
3164 if ((txn = calloc(1, size)) == NULL) {
3165 DPRINTF(("calloc: %s", strerror(errno)));
3166 return ENOMEM;
3167 }
3168 #ifdef EDB_VL32
3169 if (!parent) {
3170 txn->mt_rpages = malloc(EDB_TRPAGE_SIZE * sizeof(EDB_ID3));
3171 if (!txn->mt_rpages) {
3172 free(txn);
3173 return ENOMEM;
3174 }
3175 txn->mt_rpages[0].mid = 0;
3176 txn->mt_rpcheck = EDB_TRPAGE_SIZE/2;
3177 }
3178 #endif
3179 txn->mt_dbxs = env->me_dbxs;
3180 txn->mt_dbs = (EDB_db *) ((char *)txn + tsize);
3181 txn->mt_dbflags = (unsigned char *)txn + size - env->me_maxdbs;
3182 txn->mt_flags = flags;
3183 txn->mt_env = env;
3184
3185 if (parent) {
3186 unsigned int i;
3187 txn->mt_cursors = (EDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
3188 txn->mt_dbiseqs = parent->mt_dbiseqs;
3189 txn->mt_u.dirty_list = malloc(sizeof(EDB_ID2)*EDB_IDL_UM_SIZE);
3190 if (!txn->mt_u.dirty_list ||
3191 !(txn->mt_free_pgs = edb_eidl_alloc(EDB_IDL_UM_MAX)))
3192 {
3193 free(txn->mt_u.dirty_list);
3194 free(txn);
3195 return ENOMEM;
3196 }
3197 txn->mt_txnid = parent->mt_txnid;
3198 txn->mt_dirty_room = parent->mt_dirty_room;
3199 txn->mt_u.dirty_list[0].mid = 0;
3200 txn->mt_spill_pgs = NULL;
3201 txn->mt_next_pgno = parent->mt_next_pgno;
3202 parent->mt_flags |= EDB_TXN_HAS_CHILD;
3203 parent->mt_child = txn;
3204 txn->mt_parent = parent;
3205 txn->mt_nuedbs = parent->mt_nuedbs;
3206 #ifdef EDB_VL32
3207 txn->mt_rpages = parent->mt_rpages;
3208 #endif
3209 memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_nuedbs * sizeof(EDB_db));
3210
3211 for (i=0; i<txn->mt_nuedbs; i++)
3212 txn->mt_dbflags[i] = parent->mt_dbflags[i] & ~DB_NEW;
3213 rc = 0;
3214 ntxn = (EDB_ntxn *)txn;
3215 ntxn->mnt_pgstate = env->me_pgstate;
3216 if (env->me_pghead) {
3217 size = EDB_IDL_SIZEOF(env->me_pghead);
3218 env->me_pghead = edb_eidl_alloc(env->me_pghead[0]);
3219 if (env->me_pghead)
3220 memcpy(env->me_pghead, ntxn->mnt_pgstate.mf_pghead, size);
3221 else
3222 rc = ENOMEM;
3223 }
3224 if (!rc)
3225 rc = edb_cursor_shadow(parent, txn);
3226 if (rc)
3227 edb_txn_end(txn, EDB_END_FAIL_BEGINCHILD);
3228 } else {
3229 txn->mt_dbiseqs = env->me_dbiseqs;
3230 renew:
3231 rc = edb_txn_renew0(txn);
3232 }
3233 if (rc) {
3234 if (txn != env->me_txn0) {
3235 #ifdef EDB_VL32
3236 free(txn->mt_rpages);
3237 #endif
3238 free(txn);
3239 }
3240 } else {
3241 txn->mt_flags |= flags;
3242 *ret = txn;
3243 DPRINTF(("begin txn %"Yu"%c %p on edbenv %p, root page %"Yu,
3244 txn->mt_txnid, (flags & EDB_RDONLY) ? 'r' : 'w',
3245 (void *) txn, (void *) env, txn->mt_dbs[MAIN_DBI].md_root));
3246 }
3247
3248 return rc;
3249 }
3250
3251 EDB_env *
3252 edb_txn_env(EDB_txn *txn)
3253 {
3254 if(!txn) return NULL;
3255 return txn->mt_env;
3256 }
3257
3258 edb_size_t
3259 edb_txn_id(EDB_txn *txn)
3260 {
3261 if(!txn) return 0;
3262 return txn->mt_txnid;
3263 }
3264
3265
3266 static void
3267 edb_dbis_update(EDB_txn *txn, int keep)
3268 {
3269 int i;
3270 EDB_dbi n = txn->mt_nuedbs;
3271 EDB_env *env = txn->mt_env;
3272 unsigned char *tdbflags = txn->mt_dbflags;
3273
3274 for (i = n; --i >= CORE_DBS;) {
3275 if (tdbflags[i] & DB_NEW) {
3276 if (keep) {
3277 env->me_dbflags[i] = txn->mt_dbs[i].md_flags | EDB_VALID;
3278 } else {
3279 char *ptr = env->me_dbxs[i].md_name.mv_data;
3280 if (ptr) {
3281 env->me_dbxs[i].md_name.mv_data = NULL;
3282 env->me_dbxs[i].md_name.mv_size = 0;
3283 env->me_dbflags[i] = 0;
3284 env->me_dbiseqs[i]++;
3285 free(ptr);
3286 }
3287 }
3288 }
3289 }
3290 if (keep && env->me_nuedbs < n)
3291 env->me_nuedbs = n;
3292 }
3293
3294
3295
3296
3297
3298
/** End a transaction, except for successful commit of a nested txn.
 *
 * @param txn   transaction to end.
 * @param mode  an operation code (low bits, EDB_END_OPMASK) OR-ed with the
 *              option bits EDB_END_UPDATE, EDB_END_SLOT and EDB_END_FREE.
 *              EDB_END_FREE releases the handle itself; it is dropped for a
 *              parentless write txn (mode is zeroed below).
 */
static void
edb_txn_end(EDB_txn *txn, unsigned mode)
{
	EDB_env *env = txn->mt_env;
#if EDB_DEBUG
	static const char *const names[] = EDB_END_NAMES;
#endif

	/* Publish (EDB_END_UPDATE set) or discard the DBIs this txn opened. */
	edb_dbis_update(txn, mode & EDB_END_UPDATE);

	DPRINTF(("%s txn %"Yu"%c %p on edbenv %p, root page %"Yu,
		names[mode & EDB_END_OPMASK],
		txn->mt_txnid, (txn->mt_flags & EDB_TXN_RDONLY) ? 'r' : 'w',
		(void *) txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root));

	if (F_ISSET(txn->mt_flags, EDB_TXN_RDONLY)) {
		if (txn->mt_u.reader) {
			/* Mark the reader slot as holding no snapshot. */
			txn->mt_u.reader->mr_txnid = (txnid_t)-1;
			if (!(env->me_flags & EDB_NOTLS)) {
				/* Slot stays owned; only the txn's reference is dropped. */
				txn->mt_u.reader = NULL;
			} else if (mode & EDB_END_SLOT) {
				/* EDB_NOTLS: the txn owns the slot, so release it on request. */
				txn->mt_u.reader->mr_pid = 0;
				txn->mt_u.reader = NULL;
			}
		}
		txn->mt_nuedbs = 0;
		txn->mt_flags |= EDB_TXN_FINISHED;

	} else if (!F_ISSET(txn->mt_flags, EDB_TXN_FINISHED)) {
		/* Remember the current page head; it is freed at the end of this branch. */
		pgno_t *pghead = env->me_pghead;

		if (!(mode & EDB_END_UPDATE))
			edb_cursors_close(txn, 0);
		if (!(env->me_flags & EDB_WRITEMAP)) {
			edb_dlist_free(txn);
		}

		txn->mt_nuedbs = 0;
		txn->mt_flags = EDB_TXN_FINISHED;

		if (!txn->mt_parent) {
			/* Top-level writer: hand the free-page list back to the env
			 * and reset the env's page state. */
			edb_eidl_shrink(&txn->mt_free_pgs);
			env->me_free_pgs = txn->mt_free_pgs;

			env->me_pghead = NULL;
			env->me_pglast = 0;

			env->me_txn = NULL;
			/* Clear all mode bits so EDB_END_FREE below does not free this handle. */
			mode = 0;

			/* Release the writer mutex taken when the txn began. */
			if (env->me_txns)
				UNLOCK_MUTEX(env->me_wmutex);
		} else {
			/* Nested txn: detach from the parent and restore the env page
			 * state that was saved in the child. */
			txn->mt_parent->mt_child = NULL;
			txn->mt_parent->mt_flags &= ~EDB_TXN_HAS_CHILD;
			env->me_pgstate = ((EDB_ntxn *)txn)->mnt_pgstate;
			edb_eidl_free(txn->mt_free_pgs);
			free(txn->mt_u.dirty_list);
		}
		edb_eidl_free(txn->mt_spill_pgs);

		edb_eidl_free(pghead);
	}
#ifdef EDB_VL32
	if (!txn->mt_parent) {
		EDB_ID3L el = env->me_rpages, tl = txn->mt_rpages;
		unsigned i, x, n = tl[0].mid;
		pthread_mutex_lock(&env->me_rpmutex);
		for (i = 1; i <= n; i++) {
			if (tl[i].mid & (EDB_RPAGE_CHUNK-1)) {
				/* Entry is not aligned to a chunk boundary: unmap it directly. */
				munmap(tl[i].mptr, tl[i].mcnt * env->me_psize);
			} else {
				x = edb_mid3l_search(el, tl[i].mid);
				if (tl[i].mptr == el[x].mptr) {
					/* Same mapping as the env's entry: just drop our reference. */
					el[x].mref--;
				} else {
					/* The env's entry points elsewhere: unmap our own mapping. */
					munmap(tl[i].mptr, tl[i].mcnt * env->me_psize);
				}
			}
		}
		pthread_mutex_unlock(&env->me_rpmutex);
		tl[0].mid = 0;
		if (mode & EDB_END_FREE)
			free(tl);
	}
#endif
	if (mode & EDB_END_FREE)
		free(txn);
}
3392
3393 void
3394 edb_txn_reset(EDB_txn *txn)
3395 {
3396 if (txn == NULL)
3397 return;
3398
3399
3400 if (!(txn->mt_flags & EDB_TXN_RDONLY))
3401 return;
3402
3403 edb_txn_end(txn, EDB_END_RESET);
3404 }
3405
3406 void
3407 edb_txn_abort(EDB_txn *txn)
3408 {
3409 if (txn == NULL)
3410 return;
3411
3412 if (txn->mt_child)
3413 edb_txn_abort(txn->mt_child);
3414
3415 edb_txn_end(txn, EDB_END_ABORT|EDB_END_SLOT|EDB_END_FREE);
3416 }
3417
3418
3419
3420
3421
3422
3423
/** Save the freelist as of this transaction to the free-page DB (FREE_DBI).
 *
 * Writes txn->mt_free_pgs under key txn->mt_txnid, deletes the FREE_DBI
 * records already consumed (ids up to env->me_pglast), reserves room for
 * the remaining env->me_pghead entries plus loose pages, and finally fills
 * the reserved records.
 *
 * @param txn  the write transaction being committed.
 * @return EDB_SUCCESS or an error from the cursor/IDL helpers.
 */
static int
edb_freelist_save(EDB_txn *txn)
{
	/* The loops below re-read env->me_pghead, env->me_pglast and
	 * txn->mt_free_pgs[0] on every pass.
	 * NOTE(review): presumably because the cursor operations performed
	 * here can allocate and free pages themselves - confirm against the
	 * helpers, which are not visible in this chunk.
	 */
	EDB_cursor mc;
	EDB_env *env = txn->mt_env;
	int rc, maxfree_1pg = env->me_maxfree_1pg, more = 1;
	txnid_t pglast = 0, head_id = 0;
	pgno_t freecnt = 0, *free_pgs, *mop;
	ssize_t head_room = 0, total_room = 0, mop_len, clean_limit;

	edb_cursor_init(&mc, txn, FREE_DBI, NULL);

	if (env->me_pghead) {
		/* Position on (and make writable) the first FREE_DBI page. */
		rc = edb_page_search(&mc, NULL, EDB_PS_FIRST|EDB_PS_MODIFY);
		if (rc && rc != EDB_NOTFOUND)
			return rc;
	}

	if (!env->me_pghead && txn->mt_loose_pgs) {
		/* No reclaimed-page list to merge loose pages into: move them
		 * to the txn's free list and drop them from the dirty list.
		 */
		EDB_page *mp = txn->mt_loose_pgs;
		EDB_ID2 *dl = txn->mt_u.dirty_list;
		unsigned x;
		if ((rc = edb_eidl_need(&txn->mt_free_pgs, txn->mt_loose_count)) != 0)
			return rc;
		for (; mp; mp = NEXT_LOOSE_PAGE(mp)) {
			edb_eidl_xappend(txn->mt_free_pgs, mp->mp_pgno);
			/* Find the page's dirty-list entry: linear scan under
			 * WRITEMAP, search helper otherwise. */
			if (txn->mt_flags & EDB_TXN_WRITEMAP) {
				for (x=1; x<=dl[0].mid; x++)
					if (dl[x].mid == mp->mp_pgno)
						break;
				edb_tassert(txn, x <= dl[0].mid);
			} else {
				x = edb_mid2l_search(dl, mp->mp_pgno);
				edb_tassert(txn, dl[x].mid == mp->mp_pgno);
				edb_dpage_free(env, mp);
			}
			/* Mark the entry for removal by the squash pass below. */
			dl[x].mptr = NULL;
		}
		{
			/* Squash the NULL-pointer entries out of the dirty list. */
			unsigned y;
			for (y=1; dl[y].mptr && y <= dl[0].mid; y++);
			if (y <= dl[0].mid) {
				for(x=y, y++;;) {
					while (!dl[y].mptr && y <= dl[0].mid) y++;
					if (y > dl[0].mid) break;
					dl[x++] = dl[y++];
				}
				dl[0].mid = x-1;
			} else {
				/* No hole found by the scan above: reset the list to empty. */
				dl[0].mid = 0;
			}
		}
		txn->mt_loose_pgs = NULL;
		txn->mt_loose_count = 0;
	}

	/* Reserved records larger than clean_limit are zero-filled completely
	 * below; with EDB_NOMEMINIT or EDB_WRITEMAP only the first slot is. */
	clean_limit = (env->me_flags & (EDB_NOMEMINIT|EDB_WRITEMAP))
		? SSIZE_MAX : maxfree_1pg;

	for (;;) {
		/* Loop until nothing changes between passes. */
		EDB_val key, data;
		pgno_t *pgs;
		ssize_t j;

		/* Delete the FREE_DBI records whose ids are not beyond
		 * env->me_pglast, always taking the first record.
		 */
		while (pglast < env->me_pglast) {
			rc = edb_cursor_first(&mc, &key, NULL);
			if (rc)
				return rc;
			pglast = head_id = *(txnid_t *)key.mv_data;
			total_room = head_room = 0;
			edb_tassert(txn, pglast <= env->me_pglast);
			rc = edb_cursor_del(&mc, 0);
			if (rc)
				return rc;
		}

		/* Save this txn's free-page list if it grew since the last pass. */
		if (freecnt < txn->mt_free_pgs[0]) {
			if (!freecnt) {
				/* First time: position on the last FREE_DBI page. */
				rc = edb_page_search(&mc, NULL, EDB_PS_LAST|EDB_PS_MODIFY);
				if (rc && rc != EDB_NOTFOUND)
					return rc;
			}
			free_pgs = txn->mt_free_pgs;
			/* Record is keyed by this txn's id. */
			key.mv_size = sizeof(txn->mt_txnid);
			key.mv_data = &txn->mt_txnid;
			do {
				freecnt = free_pgs[0];
				data.mv_size = EDB_IDL_SIZEOF(free_pgs);
				rc = edb_cursor_put(&mc, &key, &data, EDB_RESERVE);
				if (rc)
					return rc;
				/* Re-read: the list may have been reallocated or grown by the put. */
				free_pgs = txn->mt_free_pgs;
			} while (freecnt < free_pgs[0]);
			edb_eidl_sort(free_pgs);
			memcpy(data.mv_data, free_pgs, data.mv_size);
#if (EDB_DEBUG) > 1
			{
				unsigned int i = free_pgs[0];
				DPRINTF(("IDL write txn %"Yu" root %"Yu" num %u",
					txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i));
				for (; i; i--)
					DPRINTF(("IDL %"Yu, free_pgs[i]));
			}
#endif
			continue;
		}

		mop = env->me_pghead;
		mop_len = (mop ? mop[0] : 0) + txn->mt_loose_count;

		/* Reserve records for me_pghead[] plus loose pages. Stop once
		 * the reserved room matches exactly, or after one extra pass
		 * (`more`) when there is more room than needed. When the head
		 * record is full, continue with the next lower id.
		 */
		if (total_room >= mop_len) {
			if (total_room == mop_len || --more < 0)
				break;
		} else if (head_room >= maxfree_1pg && head_id > 1) {
			/* Head record is full: start a new one under the previous id. */
			head_id--;
			head_room = 0;
		}
		/* (Re)size the head record to hold whatever is still unreserved. */
		total_room -= head_room;
		head_room = mop_len - total_room;
		if (head_room > maxfree_1pg && head_id > 1) {
			/* Too much for one record: spread it over the ids still available. */
			head_room /= head_id;
			head_room += maxfree_1pg - head_room % (maxfree_1pg + 1);
		} else if (head_room < 0) {
			/* More room reserved than needed: shrink the head to empty. */
			head_room = 0;
		}
		key.mv_size = sizeof(head_id);
		key.mv_data = &head_id;
		data.mv_size = (head_room + 1) * sizeof(pgno_t);
		rc = edb_cursor_put(&mc, &key, &data, EDB_RESERVE);
		if (rc)
			return rc;
		/* Zero the reserved record: all slots if above clean_limit, else slot 0 only. */
		pgs = (pgno_t *)data.mv_data;
		j = head_room > clean_limit ? head_room : 0;
		do {
			pgs[j] = 0;
		} while (--j >= 0);
		total_room += head_room;
	}

	/* Merge any remaining loose pages into me_pghead. The temporary
	 * `loose` list is built in the unused tail of the me_pghead buffer.
	 */
	if (txn->mt_loose_pgs) {
		EDB_page *mp = txn->mt_loose_pgs;
		unsigned count = txn->mt_loose_count;
		EDB_IDL loose;
		/* Room for the merged result plus the temporary list. */
		if ((rc = edb_eidl_need(&env->me_pghead, 2*count+1)) != 0)
			return rc;
		mop = env->me_pghead;
		loose = mop + EDB_IDL_ALLOCLEN(mop) - count;
		for (count = 0; mp; mp = NEXT_LOOSE_PAGE(mp))
			loose[ ++count ] = mp->mp_pgno;
		loose[0] = count;
		edb_eidl_sort(loose);
		edb_eidl_xmerge(mop, loose);
		txn->mt_loose_pgs = NULL;
		txn->mt_loose_count = 0;
		mop_len = mop[0];
	}

	/* Fill in the reserved me_pghead records. */
	rc = EDB_SUCCESS;
	if (mop_len) {
		EDB_val key, data;

		/* Walk the records from the first key, consuming `mop` from its end. */
		mop += mop_len;
		rc = edb_cursor_first(&mc, &key, &data);
		for (; !rc; rc = edb_cursor_next(&mc, &key, &data, EDB_NEXT)) {
			txnid_t id = *(txnid_t *)key.mv_data;
			ssize_t len = (ssize_t)(data.mv_size / sizeof(EDB_ID)) - 1;
			EDB_ID save;

			edb_tassert(txn, len >= 0 && id <= env->me_pglast);
			key.mv_data = &id;
			if (len > mop_len) {
				/* Record is larger than what is left: shrink it. */
				len = mop_len;
				data.mv_size = (len + 1) * sizeof(EDB_ID);
			}
			data.mv_data = mop -= len;
			/* Temporarily overwrite the slot before the chunk with its
			 * length so the chunk can be written as a list; restore it
			 * after the put. */
			save = mop[0];
			mop[0] = len;
			rc = edb_cursor_put(&mc, &key, &data, EDB_CURRENT);
			mop[0] = save;
			if (rc || !(mop_len -= len))
				break;
		}
	}
	return rc;
}
3643
3644
3645
3646
3647
3648
/** Flush (some) dirty pages to the data file and shrink the dirty list.
 *
 * @param txn   the transaction whose dirty list (mt_u.dirty_list) is flushed.
 * @param keep  number of leading dirty-list entries to leave untouched.
 * @return EDB_SUCCESS, or an OS error code / EIO on a failed or short write.
 */
static int
edb_page_flush(EDB_txn *txn, int keep)
{
	EDB_env *env = txn->mt_env;
	EDB_ID2L dl = txn->mt_u.dirty_list;
	unsigned psize = env->me_psize, j;
	int i, pagecount = dl[0].mid, rc;
	size_t size = 0;
	off_t pos = 0;
	pgno_t pgno = 0;
	EDB_page *dp = NULL;
#ifdef _WIN32
	OVERLAPPED ov;
#else
	struct iovec iov[EDB_COMMIT_PAGES];
	ssize_t wsize = 0, wres;
	off_t wpos = 0, next_pos = 1;
	int n = 0;
#endif

	/* i scans the list; j is the index of the last entry retained so far. */
	j = i = keep;

	if (env->me_flags & EDB_WRITEMAP) {
		/* WRITEMAP: nothing is written here; only flags are cleared and
		 * retained entries compacted. */
		while (++i <= pagecount) {
			dp = dl[i].mptr;
			/* Pages flagged P_LOOSE or P_KEEP stay in the dirty list. */
			if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
				dp->mp_flags &= ~P_KEEP;
				dl[++j] = dl[i];
				continue;
			}
			dp->mp_flags &= ~P_DIRTY;
		}
		goto done;
	}

	/* Write the pages. The loop body runs one extra time after the last
	 * page (i > pagecount) so the non-Windows path can write out its
	 * pending batch. */
	for (;;) {
		if (++i <= pagecount) {
			dp = dl[i].mptr;
			/* Skip pages that must stay dirty; mid == 0 marks them for
			 * the compaction pass after the loop. */
			if (dp->mp_flags & (P_LOOSE|P_KEEP)) {
				dp->mp_flags &= ~P_KEEP;
				dl[i].mid = 0;
				continue;
			}
			pgno = dl[i].mid;
			/* Clear the dirty flag before the page is written. */
			dp->mp_flags &= ~P_DIRTY;
			pos = pgno * psize;
			size = psize;
			if (IS_OVERFLOW(dp)) size *= dp->mp_pages;
		}
#ifdef _WIN32
		else break;

		/* Windows path: each page is written individually with
		 * WriteFile() at an explicit offset supplied through the
		 * OVERLAPPED structure (low 32 bits in Offset, the rest in
		 * OffsetHigh).
		 */
		DPRINTF(("committing page %"Yu, pgno));
		memset(&ov, 0, sizeof(ov));
		ov.Offset = pos & 0xffffffff;
		ov.OffsetHigh = pos >> 16 >> 16;
		if (!WriteFile(env->me_fd, dp, size, NULL, &ov)) {
			rc = ErrCode();
			DPRINTF(("WriteFile: %d", rc));
			return rc;
		}
#else
		/* Batch contiguous pages in iov[]; write the batch out when the
		 * next page is not adjacent, the iov is full, or the batch would
		 * exceed MAX_WRITE. */
		if (pos!=next_pos || n==EDB_COMMIT_PAGES || wsize+size>MAX_WRITE) {
			if (n) {
retry_write:
				/* Write the batched page(s) starting at wpos. */
#ifdef EDB_USE_PWRITEV
				wres = pwritev(env->me_fd, iov, n, wpos);
#else
				if (n == 1) {
					wres = pwrite(env->me_fd, iov[0].iov_base, wsize, wpos);
				} else {
retry_seek:
					if (lseek(env->me_fd, wpos, SEEK_SET) == -1) {
						rc = ErrCode();
						if (rc == EINTR)
							goto retry_seek;
						DPRINTF(("lseek: %s", strerror(rc)));
						return rc;
					}
					wres = writev(env->me_fd, iov, n);
				}
#endif
				if (wres != wsize) {
					if (wres < 0) {
						rc = ErrCode();
						if (rc == EINTR)
							goto retry_write;
						DPRINTF(("Write error: %s", strerror(rc)));
					} else {
						/* Short writes are reported as EIO. */
						rc = EIO;
						DPUTS("short write, filesystem full?");
					}
					return rc;
				}
				n = 0;
			}
			if (i > pagecount)
				break;
			/* Start a new batch at the current page's offset. */
			wpos = pos;
			wsize = 0;
		}
		DPRINTF(("committing page %"Yu, pgno));
		next_pos = pos + size;
		iov[n].iov_len = size;
		iov[n].iov_base = (char *)dp;
		wsize += size;
		n++;
#endif
	}
#ifdef EDB_VL32
	if (pgno > txn->mt_last_pgno)
		txn->mt_last_pgno = pgno;
#endif

	/* CACHEFLUSH over the mapped range up to mt_next_pgno.
	 * NOTE(review): presumably needed on platforms whose data cache is
	 * not coherent between write() and the memory map - confirm against
	 * the macro's definition, which is not visible here.
	 */
	CACHEFLUSH(env->me_map, txn->mt_next_pgno * env->me_psize, DCACHE);

	/* Compact the list: keep the entries marked with mid == 0 above
	 * (restoring mid from the page header) and free the written pages. */
	for (i = keep; ++i <= pagecount; ) {
		dp = dl[i].mptr;
		/* This page was skipped above: it remains dirty. */
		if (!dl[i].mid) {
			dl[++j] = dl[i];
			dl[j].mid = dp->mp_pgno;
			continue;
		}
		edb_dpage_free(env, dp);
	}

done:
	/* Both scan loops leave i at pagecount+1; after the decrement, i - j
	 * is the number of entries removed from the dirty list. */
	i--;
	txn->mt_dirty_room += i - j;
	dl[0].mid = j;
	return EDB_SUCCESS;
}
3800
/* Forward declaration: edb_txn_commit() below calls this before its
 * definition is visible. */
static int ESECT edb_env_share_locks(EDB_env *env, int *excl);
3802
/** Commit a transaction.
 *
 * Any child transaction is committed first. A read-only txn is simply
 * ended. A nested txn is merged into its parent (free pages, DB records,
 * dirty list, spill list, loose pages). A top-level write txn updates the
 * named-DB records, saves the freelist, flushes dirty pages, syncs and
 * writes the meta page.
 *
 * The handle is released in all cases except that a top-level write
 * handle is the env's reusable one (see edb_txn_end()).
 *
 * @param txn  transaction to commit.
 * @return EDB_SUCCESS, EINVAL, EDB_BAD_TXN, EDB_BAD_DBI, or an error from
 *         the helpers; on failure the txn is aborted, except for a
 *         parent-spill-list append failure in the nested path, which is
 *         returned after the merge and flags the parent EDB_TXN_ERROR.
 */
int
edb_txn_commit(EDB_txn *txn)
{
	int rc;
	unsigned int i, end_mode;
	EDB_env *env;

	if (txn == NULL)
		return EINVAL;

	/* Default end mode, used by the `done` path until a real commit
	 * replaces it further down. */
	end_mode = EDB_END_EMPTY_COMMIT|EDB_END_UPDATE|EDB_END_SLOT|EDB_END_FREE;

	if (txn->mt_child) {
		rc = edb_txn_commit(txn->mt_child);
		if (rc)
			goto fail;
	}

	env = txn->mt_env;

	if (F_ISSET(txn->mt_flags, EDB_TXN_RDONLY)) {
		goto done;
	}

	if (txn->mt_flags & (EDB_TXN_FINISHED|EDB_TXN_ERROR)) {
		DPUTS("txn has failed/finished, can't commit");
		/* A failed child poisons its parent as well. */
		if (txn->mt_parent)
			txn->mt_parent->mt_flags |= EDB_TXN_ERROR;
		rc = EDB_BAD_TXN;
		goto fail;
	}

	if (txn->mt_parent) {
		EDB_txn *parent = txn->mt_parent;
		EDB_page **lp;
		EDB_ID2L dst, src;
		EDB_IDL pspill;
		unsigned x, y, len, ps_len;

		/* Append the child's free pages to the parent's list. */
		rc = edb_eidl_append_list(&parent->mt_free_pgs, txn->mt_free_pgs);
		if (rc)
			goto fail;
		edb_eidl_free(txn->mt_free_pgs);
		/* From here on nothing jumps to `fail`: the child is merged
		 * unconditionally.
		 */

		parent->mt_next_pgno = txn->mt_next_pgno;
		parent->mt_flags = txn->mt_flags;

		/* Merge the child's cursors into the parent (keep their state). */
		edb_cursors_close(txn, 1);

		/* Update the parent's DB table. */
		memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_nuedbs * sizeof(EDB_db));
		parent->mt_nuedbs = txn->mt_nuedbs;
		parent->mt_dbflags[FREE_DBI] = txn->mt_dbflags[FREE_DBI];
		parent->mt_dbflags[MAIN_DBI] = txn->mt_dbflags[MAIN_DBI];
		for (i=CORE_DBS; i<txn->mt_nuedbs; i++) {
			/* Preserve the parent's own DB_NEW bit. */
			x = parent->mt_dbflags[i] & DB_NEW;
			parent->mt_dbflags[i] = txn->mt_dbflags[i] | x;
		}

		dst = parent->mt_u.dirty_list;
		src = txn->mt_u.dirty_list;
		/* Remove from the parent's spill list the pages the child dirtied.
		 * Spill entries are compared against (pgno << 1); bit 0 marks an
		 * entry as deleted. */
		if ((pspill = parent->mt_spill_pgs) && (ps_len = pspill[0])) {
			x = y = ps_len;
			/* Temporary sentinel so the scan below stops at index 0. */
			pspill[0] = (pgno_t)-1;
			/* Mark the child's dirty pages as deleted in the parent's spill list. */
			for (i=0, len=src[0].mid; ++i <= len; ) {
				EDB_ID pn = src[i].mid << 1;
				while (pn > pspill[x])
					x--;
				if (pn == pspill[x]) {
					pspill[x] = 1;
					y = --x;
				}
			}
			/* Squash out the entries marked deleted (bit 0 set). */
			for (x=y; ++x <= ps_len; )
				if (!(pspill[x] & 1))
					pspill[++y] = pspill[x];
			pspill[0] = y;
		}

		/* Remove from the parent's dirty list the pages the child spilled. */
		if (txn->mt_spill_pgs && txn->mt_spill_pgs[0]) {
			for (i=1; i<=txn->mt_spill_pgs[0]; i++) {
				EDB_ID pn = txn->mt_spill_pgs[i];
				if (pn & 1)
					continue;	/* deleted spill entry */
				pn >>= 1;
				y = edb_mid2l_search(dst, pn);
				if (y <= dst[0].mid && dst[y].mid == pn) {
					free(dst[y].mptr);
					while (y < dst[0].mid) {
						dst[y] = dst[y+1];
						y++;
					}
					dst[0].mid--;
				}
			}
		}

		/* Compute the length of the merged dirty list. dst[0].mid is set
		 * to 0 for the duration of the merge so that the backward scans
		 * stop at index 0. */
		x = dst[0].mid;
		dst[0].mid = 0;
		if (parent->mt_parent) {
			/* Count both lists, minus the pages present in both. */
			len = x + src[0].mid;
			y = edb_mid2l_search(src, dst[x].mid + 1) - 1;
			for (i = x; y && i; y--) {
				pgno_t yp = src[y].mid;
				while (yp < dst[i].mid)
					i--;
				if (yp == dst[i].mid) {
					i--;
					len--;
				}
			}
		} else {
			/* Parent is top-level: derive the length from the child's dirty room. */
			len = EDB_IDL_UM_MAX - txn->mt_dirty_room;
		}
		/* Merge src into dst from the back; a parent page that the child
		 * also dirtied is freed and replaced by the child's version. */
		y = src[0].mid;
		for (i = len; y; dst[i--] = src[y--]) {
			pgno_t yp = src[y].mid;
			while (yp < dst[x].mid)
				dst[i--] = dst[x--];
			if (yp == dst[x].mid)
				free(dst[x--].mptr);
		}
		edb_tassert(txn, i == x);
		dst[0].mid = len;
		free(txn->mt_u.dirty_list);
		parent->mt_dirty_room = txn->mt_dirty_room;
		if (txn->mt_spill_pgs) {
			if (parent->mt_spill_pgs) {
				/* Append the child's spill list; on failure the parent is
				 * flagged as errored and rc is returned below. */
				rc = edb_eidl_append_list(&parent->mt_spill_pgs, txn->mt_spill_pgs);
				if (rc)
					parent->mt_flags |= EDB_TXN_ERROR;
				edb_eidl_free(txn->mt_spill_pgs);
				edb_eidl_sort(parent->mt_spill_pgs);
			} else {
				/* Parent had none: it takes over the child's list. */
				parent->mt_spill_pgs = txn->mt_spill_pgs;
			}
		}

		/* Append the child's loose-page chain to the end of the parent's. */
		for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp))
			;
		*lp = txn->mt_loose_pgs;
		parent->mt_loose_count += txn->mt_loose_count;

		parent->mt_child = NULL;
		/* Free the page head list that was saved in the child. */
		edb_eidl_free(((EDB_ntxn *)txn)->mnt_pgstate.mf_pghead);
		free(txn);
		return rc;
	}

	if (txn != env->me_txn) {
		DPUTS("attempt to commit unknown transaction");
		rc = EINVAL;
		goto fail;
	}

	edb_cursors_close(txn, 0);

	/* Nothing dirty and no DIRTY/SPILLS flag: end as an empty commit. */
	if (!txn->mt_u.dirty_list[0].mid &&
		!(txn->mt_flags & (EDB_TXN_DIRTY|EDB_TXN_SPILLS)))
		goto done;

	DPRINTF(("committing txn %"Yu" %p on edbenv %p, root page %"Yu,
	    txn->mt_txnid, (void*)txn, (void*)env, txn->mt_dbs[MAIN_DBI].md_root));

	/* Write the records of dirty named DBs into the main DB. */
	if (txn->mt_nuedbs > CORE_DBS) {
		EDB_cursor mc;
		EDB_dbi i;
		EDB_val data;
		data.mv_size = sizeof(EDB_db);

		edb_cursor_init(&mc, txn, MAIN_DBI, NULL);
		for (i = CORE_DBS; i < txn->mt_nuedbs; i++) {
			if (txn->mt_dbflags[i] & DB_DIRTY) {
				if (TXN_DBI_CHANGED(txn, i)) {
					rc = EDB_BAD_DBI;
					goto fail;
				}
				data.mv_data = &txn->mt_dbs[i];
				rc = edb_cursor_put(&mc, &txn->mt_dbxs[i].md_name, &data,
					F_SUBDATA);
				if (rc)
					goto fail;
			}
		}
	}

	rc = edb_freelist_save(txn);
	if (rc)
		goto fail;

	/* The reclaimed-page list has been written out; drop the in-memory copy. */
	edb_eidl_free(env->me_pghead);
	env->me_pghead = NULL;
	edb_eidl_shrink(&txn->mt_free_pgs);

#if (EDB_DEBUG) > 2
	edb_audit(txn);
#endif

	/* Data pages first, then sync, then the meta page. */
	if ((rc = edb_page_flush(txn, 0)))
		goto fail;
	if (!F_ISSET(txn->mt_flags, EDB_TXN_NOSYNC) &&
		(rc = edb_env_sync0(env, 0, txn->mt_next_pgno)))
		goto fail;
	if ((rc = edb_env_write_meta(txn)))
		goto fail;
	end_mode = EDB_END_COMMITTED|EDB_END_UPDATE;
	/* After the first commit in EDB_PREVSNAPSHOT mode, call
	 * edb_env_share_locks() (unless EDB_NOLOCK) and clear the flag. */
	if (env->me_flags & EDB_PREVSNAPSHOT) {
		if (!(env->me_flags & EDB_NOLOCK)) {
			int excl;
			rc = edb_env_share_locks(env, &excl);
			if (rc)
				goto fail;
		}
		env->me_flags ^= EDB_PREVSNAPSHOT;
	}

done:
	edb_txn_end(txn, end_mode);
	return EDB_SUCCESS;

fail:
	edb_txn_abort(txn);
	return rc;
}
4043
4044
4045
4046
4047
4048
4049
4050
4051 static int ESECT
4052 edb_env_read_header(EDB_env *env, int prev, EDB_meta *meta)
4053 {
4054 EDB_metabuf pbuf;
4055 EDB_page *p;
4056 EDB_meta *m;
4057 int i, rc, off;
4058 enum { Size = sizeof(pbuf) };
4059
4060
4061
4062
4063
4064 for (i=off=0; i<NUM_METAS; i++, off += meta->mm_psize) {
4065 #ifdef _WIN32
4066 DWORD len;
4067 OVERLAPPED ov;
4068 memset(&ov, 0, sizeof(ov));
4069 ov.Offset = off;
4070 rc = ReadFile(env->me_fd, &pbuf, Size, &len, &ov) ? (int)len : -1;
4071 if (rc == -1 && ErrCode() == ERROR_HANDLE_EOF)
4072 rc = 0;
4073 #else
4074 rc = pread(env->me_fd, &pbuf, Size, off);
4075 #endif
4076 if (rc != Size) {
4077 if (rc == 0 && off == 0)
4078 return ENOENT;
4079 rc = rc < 0 ? (int) ErrCode() : EDB_INVALID;
4080 DPRINTF(("read: %s", edb_strerror(rc)));
4081 return rc;
4082 }
4083
4084 p = (EDB_page *)&pbuf;
4085
4086 if (!F_ISSET(p->mp_flags, P_META)) {
4087 DPRINTF(("page %"Yu" not a meta page", p->mp_pgno));
4088 return EDB_INVALID;
4089 }
4090
4091 m = METADATA(p);
4092 if (m->mm_magic != EDB_MAGIC) {
4093 DPUTS("meta has invalid magic");
4094 return EDB_INVALID;
4095 }
4096
4097 if (m->mm_version != EDB_DATA_VERSION) {
4098 DPRINTF(("database is version %u, expected version %u",
4099 m->mm_version, EDB_DATA_VERSION));
4100 return EDB_VERSION_MISMATCH;
4101 }
4102
4103 if (off == 0 || (prev ? m->mm_txnid < meta->mm_txnid : m->mm_txnid > meta->mm_txnid))
4104 *meta = *m;
4105 }
4106 return 0;
4107 }
4108
4109
4110 static void ESECT
4111 edb_env_init_meta0(EDB_env *env, EDB_meta *meta)
4112 {
4113 meta->mm_magic = EDB_MAGIC;
4114 meta->mm_version = EDB_DATA_VERSION;
4115 meta->mm_mapsize = env->me_mapsize;
4116 meta->mm_psize = env->me_psize;
4117 meta->mm_last_pg = NUM_METAS-1;
4118 meta->mm_flags = env->me_flags & 0xffff;
4119 meta->mm_flags |= EDB_INTEGERKEY;
4120 meta->mm_dbs[FREE_DBI].md_root = P_INVALID;
4121 meta->mm_dbs[MAIN_DBI].md_root = P_INVALID;
4122 }
4123
4124
4125
4126
4127
4128
4129 static int ESECT
4130 edb_env_init_meta(EDB_env *env, EDB_meta *meta)
4131 {
4132 EDB_page *p, *q;
4133 int rc;
4134 unsigned int psize;
4135 #ifdef _WIN32
4136 DWORD len;
4137 OVERLAPPED ov;
4138 memset(&ov, 0, sizeof(ov));
4139 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
4140 ov.Offset = pos; \
4141 rc = WriteFile(fd, ptr, size, &len, &ov); } while(0)
4142 #else
4143 int len;
4144 #define DO_PWRITE(rc, fd, ptr, size, len, pos) do { \
4145 len = pwrite(fd, ptr, size, pos); \
4146 if (len == -1 && ErrCode() == EINTR) continue; \
4147 rc = (len >= 0); break; } while(1)
4148 #endif
4149
4150 DPUTS("writing new meta page");
4151
4152 psize = env->me_psize;
4153
4154 p = calloc(NUM_METAS, psize);
4155 if (!p)
4156 return ENOMEM;
4157 p->mp_pgno = 0;
4158 p->mp_flags = P_META;
4159 *(EDB_meta *)METADATA(p) = *meta;
4160
4161 q = (EDB_page *)((char *)p + psize);
4162 q->mp_pgno = 1;
4163 q->mp_flags = P_META;
4164 *(EDB_meta *)METADATA(q) = *meta;
4165
4166 DO_PWRITE(rc, env->me_fd, p, psize * NUM_METAS, len, 0);
4167 if (!rc)
4168 rc = ErrCode();
4169 else if ((unsigned) len == psize * NUM_METAS)
4170 rc = EDB_SUCCESS;
4171 else
4172 rc = ENOSPC;
4173 free(p);
4174 return rc;
4175 }
4176
4177
4178
4179
4180
/** Write the meta page describing a committing transaction.
 *
 * The meta slot is chosen by the low bit of the transaction ID. With
 * #EDB_WRITEMAP the slot is updated directly through the memory map and
 * optionally msync'ed; otherwise the tail of an #EDB_meta (from mm_mapsize
 * onward) is written to the data file at the slot's offset.
 *
 * On a failed write the previous txnid/last_pg are written back on a
 * best-effort basis and the environment is flagged #EDB_FATAL_ERROR.
 *
 * @param[in] txn the transaction being committed.
 * @return #EDB_SUCCESS, or an error code from the failed msync/write
 *	(EIO for a short write).
 */
static int
edb_env_write_meta(EDB_txn *txn)
{
	EDB_env *env;
	EDB_meta meta, metab, *mp;
	unsigned flags;
	edb_size_t mapsize;
	off_t off;
	int rc, len, toggle;
	char *ptr;
	HANDLE mfd;
#ifdef _WIN32
	OVERLAPPED ov;
#else
	int r2;
#endif

	/* Odd/even txn IDs alternate between the two meta slots. */
	toggle = txn->mt_txnid & 1;
	DPRINTF(("writing meta page %d for root page %"Yu,
		toggle, txn->mt_dbs[MAIN_DBI].md_root));

	env = txn->mt_env;
	flags = txn->mt_flags | env->me_flags;
	mp = env->me_metas[toggle];
	mapsize = env->me_metas[toggle ^ 1]->mm_mapsize;
	/* Record the larger of the other slot's mapsize and the env's current one. */
	if (mapsize < env->me_mapsize)
		mapsize = env->me_mapsize;

	if (flags & EDB_WRITEMAP) {
		/* Update the meta slot in place through the map. */
		mp->mm_mapsize = mapsize;
		mp->mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
		mp->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
		mp->mm_last_pg = txn->mt_next_pgno - 1;
#if (__GNUC__ * 100 + __GNUC_MINOR__ >= 404) && \
	!(defined(__i386__) || defined(__x86_64__))
		/* Full barrier on non-x86 GCC targets so the fields above are
		 * stored before mm_txnid is.
		 */
		__sync_synchronize();
#endif
		mp->mm_txnid = txn->mt_txnid;
		if (!(flags & (EDB_NOMETASYNC|EDB_NOSYNC))) {
			unsigned meta_size = env->me_psize;
			/* rc temporarily carries the msync mode. */
			rc = (env->me_flags & EDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
			ptr = (char *)mp - PAGEHDRSZ;
#ifndef _WIN32
			/* Move ptr back by its offset within an OS page and grow the
			 * length to match (mask assumes me_os_psize is a power of two).
			 */
			r2 = (ptr - env->me_map) & (env->me_os_psize - 1);
			ptr -= r2;
			meta_size += r2;
#endif
			if (EDB_MSYNC(ptr, meta_size, rc)) {
				rc = ErrCode();
				goto fail;
			}
		}
		goto done;
	}
	/* Remember the slot's current values for the rollback attempt below. */
	metab.mm_txnid = mp->mm_txnid;
	metab.mm_last_pg = mp->mm_last_pg;

	meta.mm_mapsize = mapsize;
	meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI];
	meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
	meta.mm_last_pg = txn->mt_next_pgno - 1;
	meta.mm_txnid = txn->mt_txnid;

	/* Only the part of EDB_meta starting at mm_mapsize is written;
	 * off becomes the file offset of that part within the chosen slot.
	 */
	off = offsetof(EDB_meta, mm_mapsize);
	ptr = (char *)&meta + off;
	len = sizeof(EDB_meta) - off;
	off += (char *)mp - env->me_map;

	/* Use the plain data fd when meta sync is disabled, else the
	 * dedicated meta fd (me_mfd).
	 */
	mfd = (flags & (EDB_NOSYNC|EDB_NOMETASYNC)) ? env->me_fd : env->me_mfd;
#ifdef _WIN32
	{
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		/* On success rc receives the byte count written. */
		if (!WriteFile(mfd, ptr, len, (DWORD *)&rc, &ov))
			rc = -1;
	}
#else
retry_write:
	rc = pwrite(mfd, ptr, len, off);
#endif
	if (rc != len) {
		/* Failed call -> system error code; short write -> EIO. */
		rc = rc < 0 ? ErrCode() : EIO;
#ifndef _WIN32
		if (rc == EINTR)
			goto retry_write;
#endif
		DPUTS("write failed, disk error?");
		/* Best effort: put the previous txnid/last_pg back via the
		 * plain data fd; the outcome of this write is ignored.
		 */
		meta.mm_last_pg = metab.mm_last_pg;
		meta.mm_txnid = metab.mm_txnid;
#ifdef _WIN32
		memset(&ov, 0, sizeof(ov));
		ov.Offset = off;
		WriteFile(env->me_fd, ptr, len, NULL, &ov);
#else
		r2 = pwrite(env->me_fd, ptr, len, off);
		(void)r2;
#endif
fail:
		env->me_flags |= EDB_FATAL_ERROR;
		return rc;
	}

	/* NOTE(review): CACHEFLUSH is defined elsewhere; presumably a data-cache
	 * flush of the mapped bytes just written through the fd -- confirm.
	 */
	CACHEFLUSH(env->me_map + off, len, DCACHE);
done:
	/* Publish the new txnid in the shared lock region, if one is mapped. */
	if (env->me_txns)
		env->me_txns->mti_txnid = txn->mt_txnid;

	return EDB_SUCCESS;
}
4306
4307
4308
4309
4310
4311 static EDB_meta *
4312 edb_env_pick_meta(const EDB_env *env)
4313 {
4314 EDB_meta *const *metas = env->me_metas;
4315 return metas[ (metas[0]->mm_txnid < metas[1]->mm_txnid) ^
4316 ((env->me_flags & EDB_PREVSNAPSHOT) != 0) ];
4317 }
4318
4319 int ESECT
4320 edb_env_create(EDB_env **env)
4321 {
4322 EDB_env *e;
4323
4324 e = calloc(1, sizeof(EDB_env));
4325 if (!e)
4326 return ENOMEM;
4327
4328 e->me_maxreaders = DEFAULT_READERS;
4329 e->me_maxdbs = e->me_nuedbs = CORE_DBS;
4330 e->me_fd = INVALID_HANDLE_VALUE;
4331 e->me_lfd = INVALID_HANDLE_VALUE;
4332 e->me_mfd = INVALID_HANDLE_VALUE;
4333 #ifdef EDB_USE_POSIX_SEM
4334 e->me_rmutex = SEM_FAILED;
4335 e->me_wmutex = SEM_FAILED;
4336 #elif defined EDB_USE_SYSV_SEM
4337 e->me_rmutex->semid = -1;
4338 e->me_wmutex->semid = -1;
4339 #endif
4340 e->me_pid = getpid();
4341 GET_PAGESIZE(e->me_os_psize);
4342 VGMEMP_CREATE(e,0,0);
4343 *env = e;
4344 return EDB_SUCCESS;
4345 }
4346
#ifdef _WIN32

/** Translate an NTSTATUS value into a Win32 error code.
 *
 * The status is stored in the Internal field of a zeroed OVERLAPPED and
 * passed to GetOverlappedResult(); the resulting GetLastError() value is
 * returned.
 *
 * @param[in] st the NTSTATUS to translate
 * @return the value of GetLastError() after the call
 */
static DWORD
edb_nt2win32(NTSTATUS st)
{
	OVERLAPPED carrier;
	DWORD transferred;

	memset(&carrier, 0, sizeof(carrier));
	carrier.Internal = st;
	GetOverlappedResult(NULL, &carrier, &transferred, FALSE);

	return GetLastError();
}
#endif
4359
/** Map the data file into memory and locate the two meta pages in the map.
 *
 * On Windows the mapping is made with NtCreateSection/NtMapViewOfSection;
 * elsewhere with mmap(). Under #EDB_VL32 only the NUM_METAS meta pages are
 * mapped; otherwise the whole me_mapsize is mapped.
 *
 * @param[in,out] env  the environment; me_map and me_metas[] are set
 * @param[in]     addr requested map address, or NULL for no preference
 * @return #EDB_SUCCESS, a system error code from the failing call, or
 *	EBUSY (non-Windows) when @p addr was given but the map landed elsewhere.
 */
static int ESECT
edb_env_map(EDB_env *env, void *addr)
{
	EDB_page *p;
	unsigned int flags = env->me_flags;
#ifdef _WIN32
	int rc;
	int access = SECTION_MAP_READ;
	HANDLE mh;
	void *map;
	SIZE_T msize;
	ULONG pageprot = PAGE_READONLY, secprot, alloctype;

	if (flags & EDB_WRITEMAP) {
		access |= SECTION_MAP_WRITE;
		pageprot = PAGE_READWRITE;
	}
	if (flags & EDB_RDONLY) {
		/* Read-only: view size 0 and no reservation flags. */
		secprot = PAGE_READONLY;
		msize = 0;
		alloctype = 0;
	} else {
		/* Read-write: request a view of the full map size, reserved. */
		secprot = PAGE_READWRITE;
		msize = env->me_mapsize;
		alloctype = MEM_RESERVE;
	}

	rc = NtCreateSection(&mh, access, NULL, NULL, secprot, SEC_RESERVE, env->me_fd);
	if (rc)
		return edb_nt2win32(rc);
	map = addr;
#ifdef EDB_VL32
	/* Only the meta pages are mapped here. */
	msize = NUM_METAS * env->me_psize;
#endif
	rc = NtMapViewOfSection(mh, GetCurrentProcess(), &map, 0, 0, NULL, &msize, ViewUnmap, alloctype, pageprot);
#ifdef EDB_VL32
	/* Keep the section handle in the env (stored even if mapping failed). */
	env->me_fmh = mh;
#else
	NtClose(mh);
#endif
	if (rc)
		return edb_nt2win32(rc);
	env->me_map = map;
#else
	int mmap_flags = MAP_SHARED;
	int prot = PROT_READ;
#ifdef MAP_NOSYNC
	if (flags & EDB_NOSYNC)
		mmap_flags |= MAP_NOSYNC;
#endif
#ifdef EDB_VL32
	(void) flags;
	/* Only the meta pages are mapped here, always read-only. */
	env->me_map = mmap(addr, NUM_METAS * env->me_psize, prot, mmap_flags,
		env->me_fd, 0);
	if (env->me_map == MAP_FAILED) {
		env->me_map = NULL;
		return ErrCode();
	}
#else
	if (flags & EDB_WRITEMAP) {
		prot |= PROT_WRITE;
		/* Size the file to the full map size before mapping it writable. */
		if (ftruncate(env->me_fd, env->me_mapsize) < 0)
			return ErrCode();
	}
	env->me_map = mmap(addr, env->me_mapsize, prot, mmap_flags,
		env->me_fd, 0);
	if (env->me_map == MAP_FAILED) {
		env->me_map = NULL;
		return ErrCode();
	}

	if (flags & EDB_NORDAHEAD) {
		/* Advise random access; the result of the advice call is ignored. */
#ifdef MADV_RANDOM
		madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
#else
#ifdef POSIX_MADV_RANDOM
		posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
#endif
#endif
	}
#endif

	/* The caller asked for a specific address but the map ended up
	 * somewhere else. The mapping is left in env->me_map.
	 */
	if (addr && env->me_map != addr)
		return EBUSY;
#endif

	/* Meta slot 0 is the metadata of the first page; slot 1 lies exactly
	 * one page size after it.
	 */
	p = (EDB_page *)env->me_map;
	env->me_metas[0] = METADATA(p);
	env->me_metas[1] = (EDB_meta *)((char *)env->me_metas[0] + env->me_psize);

	return EDB_SUCCESS;
}
4458
4459 int ESECT
4460 edb_env_set_mapsize(EDB_env *env, edb_size_t size)
4461 {
4462
4463
4464
4465 if (env->me_map) {
4466 EDB_meta *meta;
4467 #ifndef EDB_VL32
4468 void *old;
4469 int rc;
4470 #endif
4471 if (env->me_txn)
4472 return EINVAL;
4473 meta = edb_env_pick_meta(env);
4474 if (!size)
4475 size = meta->mm_mapsize;
4476 {
4477
4478 edb_size_t minsize = (meta->mm_last_pg + 1) * env->me_psize;
4479 if (size < minsize)
4480 size = minsize;
4481 }
4482 #ifndef EDB_VL32
4483
4484
4485
4486 munmap(env->me_map, env->me_mapsize);
4487 env->me_mapsize = size;
4488 old = (env->me_flags & EDB_FIXEDMAP) ? env->me_map : NULL;
4489 rc = edb_env_map(env, old);
4490 if (rc)
4491 return rc;
4492 #endif
4493 }
4494 env->me_mapsize = size;
4495 if (env->me_psize)
4496 env->me_maxpg = env->me_mapsize / env->me_psize;
4497 return EDB_SUCCESS;
4498 }
4499
4500 int ESECT
4501 edb_env_set_maxdbs(EDB_env *env, EDB_dbi dbs)
4502 {
4503 if (env->me_map)
4504 return EINVAL;
4505 env->me_maxdbs = dbs + CORE_DBS;
4506 return EDB_SUCCESS;
4507 }
4508
4509 int ESECT
4510 edb_env_set_maxreaders(EDB_env *env, unsigned int readers)
4511 {
4512 if (env->me_map || readers < 1)
4513 return EINVAL;
4514 env->me_maxreaders = readers;
4515 return EDB_SUCCESS;
4516 }
4517
4518 int ESECT
4519 edb_env_get_maxreaders(EDB_env *env, unsigned int *readers)
4520 {
4521 if (!env || !readers)
4522 return EINVAL;
4523 *readers = env->me_maxreaders;
4524 return EDB_SUCCESS;
4525 }
4526
4527 static int ESECT
4528 edb_fsize(HANDLE fd, edb_size_t *size)
4529 {
4530 #ifdef _WIN32
4531 LARGE_INTEGER fsize;
4532
4533 if (!GetFileSizeEx(fd, &fsize))
4534 return ErrCode();
4535
4536 *size = fsize.QuadPart;
4537 #else
4538 struct stat st;
4539
4540 if (fstat(fd, &st))
4541 return ErrCode();
4542
4543 *size = st.st_size;
4544 #endif
4545 return EDB_SUCCESS;
4546 }
4547
4548
/* Native filename character type and helpers: wide characters on
 * Windows, plain char elsewhere.
 */
#ifdef _WIN32
typedef wchar_t edb_nchar_t;
# define EDB_NAME(str) L##str	/* make a wide string literal */
# define edb_name_cpy wcscpy
#else

typedef char edb_nchar_t;
# define EDB_NAME(str) str
# define edb_name_cpy strcpy
#endif

/** Filename in native character format, with room for a suffix. */
typedef struct EDB_name {
	int mn_len;		/* length of the base path, excluding any suffix */
	int mn_alloced;		/* nonzero when mn_val was allocated and must be freed */
	edb_nchar_t *mn_val;	/* the path itself */
} EDB_name;

/** Filename suffixes, indexed as [is lockfile][EDB_NOSUBDIR set]. */
static const edb_nchar_t *const edb_suffixes[2][2] = {
	{ EDB_NAME("/data.edb"), EDB_NAME("") },
	{ EDB_NAME("/lock.edb"), EDB_NAME("-lock") }
};

/** Space reserved for a suffix: the length of the longest entries above. */
#define EDB_SUFFLEN 9
4574
4575
4576
4577
4578
4579
4580
4581
4582
4583 static int ESECT
4584 edb_fname_init(const char *path, unsigned envflags, EDB_name *fname)
4585 {
4586 int no_suffix = F_ISSET(envflags, EDB_NOSUBDIR|EDB_NOLOCK);
4587 fname->mn_alloced = 0;
4588 #ifdef _WIN32
4589 return utf8_to_utf16(path, fname, no_suffix ? 0 : EDB_SUFFLEN);
4590 #else
4591 fname->mn_len = strlen(path);
4592 if (no_suffix)
4593 fname->mn_val = (char *) path;
4594 else if ((fname->mn_val = malloc(fname->mn_len + EDB_SUFFLEN+1)) != NULL) {
4595 fname->mn_alloced = 1;
4596 strcpy(fname->mn_val, path);
4597 }
4598 else
4599 {
4600 NDRX_LOG(log_error, "%s: malloc fail: %s",
4601 __func__, strerror(errno));
4602 return ENOMEM;
4603 }
4604 return EDB_SUCCESS;
4605 #endif
4606 }
4607
4608
/** Release an #EDB_name: frees mn_val only when it was allocated. */
#define edb_fname_destroy(fname) \
	do { if ((fname).mn_alloced) free((fname).mn_val); } while (0)

/* O_CLOEXEC where the platform defines it, otherwise 0 (no open() flag). */
#ifdef O_CLOEXEC
# define EDB_CLOEXEC O_CLOEXEC
#else
# define EDB_CLOEXEC 0
#endif
4617
4618
/** File type / open mode selector for edb_fopen(). */
enum edb_fopen_type {
#ifdef _WIN32
	/* Plain distinct tags; edb_fopen() switches on them. */
	EDB_O_RDONLY, EDB_O_RDWR, EDB_O_META, EDB_O_COPY, EDB_O_LOCKS
#else
	/* On non-Windows each value doubles as the open() flag set. */
	EDB_O_RDONLY= O_RDONLY,
	EDB_O_RDWR = O_RDWR |O_CREAT,
	EDB_O_META = O_WRONLY|EDB_DSYNC |EDB_CLOEXEC,
	EDB_O_COPY = O_WRONLY|O_CREAT|O_EXCL|EDB_CLOEXEC,
	/* Union of every open() flag used above; edb_fopen() passes
	 * (which & EDB_O_MASK) to open().
	 */
	EDB_O_MASK = EDB_O_RDWR|EDB_CLOEXEC | EDB_O_RDONLY|EDB_O_META|EDB_O_COPY,
	/* Same open() flags as RDWR|CLOEXEC, plus one bit outside the mask
	 * that gives the lockfile type its own enum value.
	 */
	EDB_O_LOCKS = EDB_O_RDWR|EDB_CLOEXEC | ((EDB_O_MASK+1) & ~EDB_O_MASK)
#endif
};
4635
4636
4637
4638
4639
4640
4641
4642
4643
4644
/** Open a file of the environment.
 *
 * If @p fname owns its buffer, the suffix for the requested file kind is
 * written after the base path first (data vs. lock file, subdir vs.
 * #EDB_NOSUBDIR naming).
 *
 * @param[in]     env   the environment handle (flags and page sizes are read)
 * @param[in,out] fname path from edb_fname_init(); its suffix may be rewritten
 * @param[in]     which which file / open mode is wanted
 * @param[in]     mode  permission bits passed to open() (unused on Windows)
 * @param[out]    res   receives the handle, or INVALID_HANDLE_VALUE on failure
 * @return #EDB_SUCCESS, or the system error code from the open call
 */
static int ESECT
edb_fopen(const EDB_env *env, EDB_name *fname,
	enum edb_fopen_type which, edb_mode_t mode,
	HANDLE *res)
{
	int rc = EDB_SUCCESS;
	HANDLE fd;
#ifdef _WIN32
	DWORD acc, share, disp, attrs;
#else
	int flags;
#endif

	/* Overwrite whatever suffix a previous call appended. */
	if (fname->mn_alloced)
		edb_name_cpy(fname->mn_val + fname->mn_len,
			edb_suffixes[which==EDB_O_LOCKS][F_ISSET(env->me_flags, EDB_NOSUBDIR)]);

#ifdef _WIN32
	/* Defaults (used as-is for RDWR and LOCKS): read/write, shared,
	 * created if missing.
	 */
	acc = GENERIC_READ|GENERIC_WRITE;
	share = FILE_SHARE_READ|FILE_SHARE_WRITE;
	disp = OPEN_ALWAYS;
	attrs = FILE_ATTRIBUTE_NORMAL;
	switch (which) {
	case EDB_O_RDONLY:
		acc = GENERIC_READ;
		disp = OPEN_EXISTING;
		break;
	case EDB_O_META:
		/* Write-through handle for meta page writes. */
		acc = GENERIC_WRITE;
		disp = OPEN_EXISTING;
		attrs = FILE_ATTRIBUTE_NORMAL|FILE_FLAG_WRITE_THROUGH;
		break;
	case EDB_O_COPY:
		/* New file only, unshared, unbuffered write-through. */
		acc = GENERIC_WRITE;
		share = 0;
		disp = CREATE_NEW;
		attrs = FILE_FLAG_NO_BUFFERING|FILE_FLAG_WRITE_THROUGH;
		break;
	default: break;
	}
	fd = CreateFileW(fname->mn_val, acc, share, NULL, disp, attrs, NULL);
#else
	/* Strip the tag bit that only distinguishes EDB_O_LOCKS. */
	fd = open(fname->mn_val, which & EDB_O_MASK, mode);
#endif

	if (fd == INVALID_HANDLE_VALUE)
		rc = ErrCode();
#ifndef _WIN32
	else {
		if (which != EDB_O_RDONLY && which != EDB_O_RDWR) {
			/* Where O_CLOEXEC is unavailable (EDB_CLOEXEC == 0), set
			 * close-on-exec after the fact; failures are ignored.
			 */
			if (!EDB_CLOEXEC && (flags = fcntl(fd, F_GETFD)) != -1)
				(void) fcntl(fd, F_SETFD, flags | FD_CLOEXEC);
		}
		if (which == EDB_O_COPY && env->me_psize >= env->me_os_psize) {
			/* Ask for uncached / direct I/O on the copy target, but only
			 * when DB pages are at least as large as OS pages. Best
			 * effort: fcntl() results are ignored.
			 */
# ifdef F_NOCACHE
			(void) fcntl(fd, F_NOCACHE, 1);
# elif defined O_DIRECT
			if ((flags = fcntl(fd, F_GETFL)) != -1)
				(void) fcntl(fd, F_SETFL, flags | O_DIRECT);
# endif
		}
	}
#endif

	*res = fd;
	return rc;
}
4733
4734
4735 #ifdef BROKEN_FDATASYNC
4736 #include <sys/utsname.h>
4737 #include <sys/vfs.h>
4738 #endif
4739
4740
4741
4742 static int ESECT
4743 edb_env_open2(EDB_env *env, int prev)
4744 {
4745 unsigned int flags = env->me_flags;
4746 int i, newenv = 0, rc;
4747 EDB_meta meta;
4748
4749 #ifdef _WIN32
4750
4751 rc = GetVersion();
4752 if ((rc & 0xff) > 5)
4753 env->me_pidquery = EDB_PROCESS_QUERY_LIMITED_INFORMATION;
4754 else
4755 env->me_pidquery = PROCESS_QUERY_INFORMATION;
4756
4757 if (!NtCreateSection) {
4758 HMODULE h = GetModuleHandleW(L"NTDLL.DLL");
4759 if (!h)
4760 return EDB_PROBLEM;
4761 NtClose = (NtCloseFunc *)GetProcAddress(h, "NtClose");
4762 if (!NtClose)
4763 return EDB_PROBLEM;
4764 NtMapViewOfSection = (NtMapViewOfSectionFunc *)GetProcAddress(h, "NtMapViewOfSection");
4765 if (!NtMapViewOfSection)
4766 return EDB_PROBLEM;
4767 NtCreateSection = (NtCreateSectionFunc *)GetProcAddress(h, "NtCreateSection");
4768 if (!NtCreateSection)
4769 return EDB_PROBLEM;
4770 }
4771 #endif
4772
4773 #ifdef BROKEN_FDATASYNC
4774
4775
4776
4777
4778
4779
4780
4781
4782 {
4783 struct statfs st;
4784 fstatfs(env->me_fd, &st);
4785 while (st.f_type == 0xEF53) {
4786 struct utsname uts;
4787 int i;
4788 uname(&uts);
4789 if (uts.release[0] < '3') {
4790 if (!strncmp(uts.release, "2.6.32.", 7)) {
4791 i = atoi(uts.release+7);
4792 if (i >= 60)
4793 break;
4794 } else if (!strncmp(uts.release, "2.6.34.", 7)) {
4795 i = atoi(uts.release+7);
4796 if (i >= 15)
4797 break;
4798 }
4799 } else if (uts.release[0] == '3') {
4800 i = atoi(uts.release+2);
4801 if (i > 5)
4802 break;
4803 if (i == 5) {
4804 i = atoi(uts.release+4);
4805 if (i >= 4)
4806 break;
4807 } else if (i == 2) {
4808 i = atoi(uts.release+4);
4809 if (i >= 30)
4810 break;
4811 }
4812 } else {
4813 break;
4814 }
4815 env->me_flags |= EDB_FSYNCONLY;
4816 break;
4817 }
4818 }
4819 #endif
4820
4821 if ((i = edb_env_read_header(env, prev, &meta)) != 0) {
4822 if (i != ENOENT)
4823 return i;
4824 DPUTS("new edbenv");
4825 newenv = 1;
4826 env->me_psize = env->me_os_psize;
4827 if (env->me_psize > MAX_PAGESIZE)
4828 env->me_psize = MAX_PAGESIZE;
4829 memset(&meta, 0, sizeof(meta));
4830 edb_env_init_meta0(env, &meta);
4831 meta.mm_mapsize = DEFAULT_MAPSIZE;
4832 } else {
4833 env->me_psize = meta.mm_psize;
4834 }
4835
4836
4837 if (!env->me_mapsize) {
4838 env->me_mapsize = meta.mm_mapsize;
4839 }
4840 {
4841
4842
4843
4844 edb_size_t minsize = (meta.mm_last_pg + 1) * meta.mm_psize;
4845 if (env->me_mapsize < minsize)
4846 env->me_mapsize = minsize;
4847 }
4848 meta.mm_mapsize = env->me_mapsize;
4849
4850 if (newenv && !(flags & EDB_FIXEDMAP)) {
4851
4852
4853
4854
4855
4856
4857
4858 rc = edb_env_init_meta(env, &meta);
4859 if (rc) {
4860 NDRX_LOG(log_error, "%s: edb_env_init_meta failed: %d",
4861 __func__, rc);
4862 return rc;
4863 }
4864 newenv = 0;
4865 }
4866 #ifdef _WIN32
4867
4868 if (newenv) {
4869 char dummy = 0;
4870 DWORD len;
4871 rc = WriteFile(env->me_fd, &dummy, 1, &len, NULL);
4872 if (!rc) {
4873 rc = ErrCode();
4874 return rc;
4875 }
4876 }
4877 #endif
4878
4879 rc = edb_env_map(env, (flags & EDB_FIXEDMAP) ? meta.mm_address : NULL);
4880 if (rc) {
4881 NDRX_LOG(log_error, "%s: edb_env_map failed: %d",
4882 __func__, rc);
4883 return rc;
4884 }
4885
4886 if (newenv) {
4887 if (flags & EDB_FIXEDMAP)
4888 meta.mm_address = env->me_map;
4889 i = edb_env_init_meta(env, &meta);
4890 if (i != EDB_SUCCESS) {
4891 return i;
4892 }
4893 }
4894
4895 env->me_maxfree_1pg = (env->me_psize - PAGEHDRSZ) / sizeof(pgno_t) - 1;
4896 env->me_nodemax = (((env->me_psize - PAGEHDRSZ) / EDB_MINKEYS) & -2)
4897 - sizeof(indx_t);
4898 #if !(EDB_MAXKEYSIZE)
4899 env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(EDB_db));
4900 #endif
4901 env->me_maxpg = env->me_mapsize / env->me_psize;
4902
4903 if (env->me_txns)
4904 env->me_txns->mti_txnid = meta.mm_txnid;
4905
4906 #if EDB_DEBUG
4907 {
4908 EDB_meta *meta = edb_env_pick_meta(env);
4909 EDB_db *db = &meta->mm_dbs[MAIN_DBI];
4910
4911 DPRINTF(("opened database version %u, pagesize %u",
4912 meta->mm_version, env->me_psize));
4913 DPRINTF(("using meta page %d", (int) (meta->mm_txnid & 1)));
4914 DPRINTF(("depth: %u", db->md_depth));
4915 DPRINTF(("entries: %"Yu, db->md_entries));
4916 DPRINTF(("branch pages: %"Yu, db->md_branch_pages));
4917 DPRINTF(("leaf pages: %"Yu, db->md_leaf_pages));
4918 DPRINTF(("overflow pages: %"Yu, db->md_overflow_pages));
4919 DPRINTF(("root: %"Yu, db->md_root));
4920 }
4921 #endif
4922
4923 return EDB_SUCCESS;
4924 }
4925
4926
4927
4928
4929
4930
4931 static void
4932 edb_env_reader_dest(void *ptr)
4933 {
4934 EDB_reader *reader = ptr;
4935
4936 #ifndef _WIN32
4937 if (reader->mr_pid == getpid())
4938 #endif
4939
4940 reader->mr_pid = 0;
4941 }
4942
4943 #ifdef _WIN32
4944
4945
4946
4947
4948
4949 #ifndef MAX_TLS_KEYS
4950 #define MAX_TLS_KEYS 64
4951 #endif
4952 static pthread_key_t edb_tls_keys[MAX_TLS_KEYS];
4953 static int edb_tls_nkeys;
4954
4955 static void NTAPI edb_tls_callback(PVOID module, DWORD reason, PVOID ptr)
4956 {
4957 int i;
4958 switch(reason) {
4959 case DLL_PROCESS_ATTACH: break;
4960 case DLL_THREAD_ATTACH: break;
4961 case DLL_THREAD_DETACH:
4962 for (i=0; i<edb_tls_nkeys; i++) {
4963 EDB_reader *r = pthread_getspecific(edb_tls_keys[i]);
4964 if (r) {
4965 edb_env_reader_dest(r);
4966 }
4967 }
4968 break;
4969 case DLL_PROCESS_DETACH: break;
4970 }
4971 }
/* Place a pointer to edb_tls_callback in the ".CRT$XLB" section.
 * NOTE(review): this is presumably the CRT's TLS callback table, which is
 * what makes the loader call edb_tls_callback on thread events -- confirm.
 */
#ifdef __GNUC__
/* GCC: section placement through a variable attribute. */
#ifdef _WIN64
const PIMAGE_TLS_CALLBACK edb_tls_cbp __attribute__((section (".CRT$XLB"))) = edb_tls_callback;
#else
PIMAGE_TLS_CALLBACK edb_tls_cbp __attribute__((section (".CRT$XLB"))) = edb_tls_callback;
#endif
#else
/* Other compilers: section placement through pragmas, plus linker
 * /INCLUDE directives naming the TLS directory symbol and the pointer.
 */
#ifdef _WIN64
/* 64-bit: the pointer is const, so it goes through const_seg. */
#pragma comment(linker, "/INCLUDE:_tls_used")
#pragma comment(linker, "/INCLUDE:edb_tls_cbp")
#pragma const_seg(".CRT$XLB")
extern const PIMAGE_TLS_CALLBACK edb_tls_cbp;
const PIMAGE_TLS_CALLBACK edb_tls_cbp = edb_tls_callback;
#pragma const_seg()
#else
/* 32-bit: symbol names carry an extra leading underscore; data_seg is used. */
#pragma comment(linker, "/INCLUDE:__tls_used")
#pragma comment(linker, "/INCLUDE:_edb_tls_cbp")
#pragma data_seg(".CRT$XLB")
PIMAGE_TLS_CALLBACK edb_tls_cbp = edb_tls_callback;
#pragma data_seg()
#endif
#endif
4998 #endif
4999
5000
5001 static int ESECT
5002 edb_env_share_locks(EDB_env *env, int *excl)
5003 {
5004 int rc = 0;
5005
5006 #ifdef _WIN32
5007 {
5008 OVERLAPPED ov;
5009
5010
5011
5012 memset(&ov, 0, sizeof(ov));
5013 if (!LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
5014 rc = ErrCode();
5015 } else {
5016 UnlockFile(env->me_lfd, 0, 0, 1, 0);
5017 *excl = 0;
5018 }
5019 }
5020 #else
5021 {
5022 struct flock lock_info;
5023
5024 memset((void *)&lock_info, 0, sizeof(lock_info));
5025 lock_info.l_type = F_RDLCK;
5026 lock_info.l_whence = SEEK_SET;
5027 lock_info.l_start = 0;
5028 lock_info.l_len = 1;
5029 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
5030 (rc = ErrCode()) == EINTR) ;
5031 *excl = rc ? -1 : 0;
5032 }
5033 #endif
5034
5035 return rc;
5036 }
5037
5038
5039
5040
5041 static int ESECT
5042 edb_env_excl_lock(EDB_env *env, int *excl)
5043 {
5044 int rc = 0;
5045 #ifdef _WIN32
5046 if (LockFile(env->me_lfd, 0, 0, 1, 0)) {
5047 *excl = 1;
5048 } else {
5049 OVERLAPPED ov;
5050 memset(&ov, 0, sizeof(ov));
5051 if (LockFileEx(env->me_lfd, 0, 0, 1, 0, &ov)) {
5052 *excl = 0;
5053 } else {
5054 rc = ErrCode();
5055 }
5056 }
5057 #else
5058 struct flock lock_info;
5059 memset((void *)&lock_info, 0, sizeof(lock_info));
5060 lock_info.l_type = F_WRLCK;
5061 lock_info.l_whence = SEEK_SET;
5062 lock_info.l_start = 0;
5063 lock_info.l_len = 1;
5064 while ((rc = fcntl(env->me_lfd, F_SETLK, &lock_info)) &&
5065 (rc = ErrCode()) == EINTR) ;
5066 if (!rc) {
5067 *excl = 1;
5068 } else
5069 # ifndef EDB_USE_POSIX_MUTEX
5070 if (*excl < 0)
5071 # endif
5072 {
5073 lock_info.l_type = F_RDLCK;
5074 while ((rc = fcntl(env->me_lfd, F_SETLKW, &lock_info)) &&
5075 (rc = ErrCode()) == EINTR) ;
5076 if (rc == 0)
5077 *excl = 0;
5078 }
5079 #endif
5080 return rc;
5081 }
5082
5083 #ifdef EDB_USE_HASH
5084
5085
5086
5087
5088
5089
5090
5091
5092
5093
5094
5095
5096
5097
5098
5099
5100
5101
5102
5103
5104
5105
5106
5107
5108
5109
5110
5111
5112
5113
5114
5115
5116
/** Hash a byte buffer.
 *
 * Starting from 0xcbf29ce484222325, each input byte is XORed into the
 * accumulator, which is then multiplied by 0x100000001b3 (the 64-bit
 * FNV-1a offset basis and prime).
 *
 * @param[in] val the bytes to hash
 * @param[in] len number of bytes
 * @return the hash value
 */
static edb_hash_t
edb_hash(const void *val, size_t len)
{
	const unsigned char *bytes = (const unsigned char *) val;
	edb_hash_t acc = 0xcbf29ce484222325ULL;
	size_t i;

	for (i = 0; i < len; i++) {
		acc ^= bytes[i];
		acc *= 0x100000001b3ULL;
	}

	return acc;
}
5131
5132
5133
5134
5135
5136
5137
5138
5139
/* The 85 output characters, indexed by digit value. */
static const char edb_a85[]= "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!#$%&()*+-;<=>?@^_`{|}~";

/** Encode a number in base 85, least significant digit first.
 *
 * At most 10 digits are produced and no padding is added, so 0 encodes as
 * the empty string. The result is NUL-terminated.
 *
 * @param[in]  l   the value to encode
 * @param[out] out buffer of at least 11 bytes
 */
static void ESECT
edb_pack85(unsigned long long l, char *out)
{
	int ndigits = 0;

	while (ndigits < 10 && l != 0) {
		out[ndigits++] = edb_a85[l % 85];
		l /= 85;
	}
	out[ndigits] = '\0';
}
5153
5154
5155
5156
5157 static void ESECT
5158 edb_env_mname_init(EDB_env *env)
5159 {
5160 char *nm = env->me_mutexname;
5161 strcpy(nm, MUTEXNAME_PREFIX);
5162 edb_pack85(env->me_txns->mti_mutexid, nm + sizeof(MUTEXNAME_PREFIX));
5163 }
5164
5165
/** Yield env's mutex name after storing the type character @p ch in the
 * byte just after the prefix text (index sizeof(MUTEXNAME_PREFIX)-1).
 * The buffer is modified in place and shared between calls.
 */
#define MUTEXNAME(env, ch) ( \
	(void) ((env)->me_mutexname[sizeof(MUTEXNAME_PREFIX)-1] = (ch)), \
	(env)->me_mutexname)
5169
5170 #endif
5171
5172
5173
5174
5175
5176
5177
5178
5179 static int ESECT
5180 edb_env_setup_locks(EDB_env *env, EDB_name *fname, int mode, int *excl)
5181 {
5182 #ifdef _WIN32
5183 # define EDB_ERRCODE_ROFS ERROR_WRITE_PROTECT
5184 #else
5185 # define EDB_ERRCODE_ROFS EROFS
5186 #endif
5187 #ifdef EDB_USE_SYSV_SEM
5188 int semid;
5189 union semun semu;
5190 #endif
5191 int rc;
5192 off_t size, rsize;
5193
5194 rc = edb_fopen(env, fname, EDB_O_LOCKS, mode, &env->me_lfd);
5195 if (rc) {
5196
5197 if (rc == EDB_ERRCODE_ROFS && (env->me_flags & EDB_RDONLY)) {
5198 return EDB_SUCCESS;
5199 }
5200 goto fail;
5201 }
5202
5203 if (!(env->me_flags & EDB_NOTLS)) {
5204 rc = pthread_key_create(&env->me_txkey, edb_env_reader_dest);
5205 if (rc) {
5206 NDRX_LOG(log_debug, "%s: pthread_key_create failed: %d",
5207 __func__, rc);
5208 goto fail;
5209 }
5210 env->me_flags |= EDB_ENV_TXKEY;
5211 #ifdef _WIN32
5212
5213 if (edb_tls_nkeys >= MAX_TLS_KEYS) {
5214 rc = EDB_TLS_FULL;
5215 goto fail;
5216 }
5217 edb_tls_keys[edb_tls_nkeys++] = env->me_txkey;
5218 #endif
5219 }
5220
5221
5222
5223
5224 if ((rc = edb_env_excl_lock(env, excl))) goto fail;
5225
5226 #ifdef _WIN32
5227 size = GetFileSize(env->me_lfd, NULL);
5228 #else
5229 size = lseek(env->me_lfd, 0, SEEK_END);
5230 if (size == -1) {
5231 int err_ = errno;
5232 NDRX_LOG(log_error, "%s: lseek failed: %s", __func__, strerror(err_));
5233 errno = err_;
5234 goto fail_errno;
5235 }
5236 #endif
5237 rsize = (env->me_maxreaders-1) * sizeof(EDB_reader) + sizeof(EDB_txninfo);
5238 if (size < rsize && *excl > 0) {
5239 #ifdef _WIN32
5240 if (SetFilePointer(env->me_lfd, rsize, NULL, FILE_BEGIN) != (DWORD)rsize
5241 || !SetEndOfFile(env->me_lfd))
5242 goto fail_errno;
5243 #else
5244 if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
5245 #endif
5246 } else {
5247 rsize = size;
5248 size = rsize - sizeof(EDB_txninfo);
5249 env->me_maxreaders = size/sizeof(EDB_reader) + 1;
5250 }
5251 {
5252 #ifdef _WIN32
5253 HANDLE mh;
5254 mh = CreateFileMapping(env->me_lfd, NULL, PAGE_READWRITE,
5255 0, 0, NULL);
5256 if (!mh) goto fail_errno;
5257 env->me_txns = MapViewOfFileEx(mh, FILE_MAP_WRITE, 0, 0, rsize, NULL);
5258 CloseHandle(mh);
5259 if (!env->me_txns) goto fail_errno;
5260 #else
5261 void *m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED,
5262 env->me_lfd, 0);
5263 if (m == MAP_FAILED) {
5264 int err_ = errno;
5265 NDRX_LOG(log_error, "%s: mmap failed: %s",
5266 __func__, strerror(err_));
5267 errno = err_;
5268 goto fail_errno;
5269 }
5270 env->me_txns = m;
5271 #endif
5272 }
5273 if (*excl > 0) {
5274 #ifdef _WIN32
5275 BY_HANDLE_FILE_INFORMATION stbuf;
5276 struct {
5277 DWORD volume;
5278 DWORD nhigh;
5279 DWORD nlow;
5280 } idbuf;
5281
5282 if (!edb_sec_inited) {
5283 InitializeSecurityDescriptor(&edb_null_sd,
5284 SECURITY_DESCRIPTOR_REVISION);
5285 SetSecurityDescriptorDacl(&edb_null_sd, TRUE, 0, FALSE);
5286 edb_all_sa.nLength = sizeof(SECURITY_ATTRIBUTES);
5287 edb_all_sa.bInheritHandle = FALSE;
5288 edb_all_sa.lpSecurityDescriptor = &edb_null_sd;
5289 edb_sec_inited = 1;
5290 }
5291 if (!GetFileInformationByHandle(env->me_lfd, &stbuf)) goto fail_errno;
5292 idbuf.volume = stbuf.dwVolumeSerialNumber;
5293 idbuf.nhigh = stbuf.nFileIndexHigh;
5294 idbuf.nlow = stbuf.nFileIndexLow;
5295 env->me_txns->mti_mutexid = edb_hash(&idbuf, sizeof(idbuf));
5296 edb_env_mname_init(env);
5297 env->me_rmutex = CreateMutexA(&edb_all_sa, FALSE, MUTEXNAME(env, 'r'));
5298 if (!env->me_rmutex) goto fail_errno;
5299 env->me_wmutex = CreateMutexA(&edb_all_sa, FALSE, MUTEXNAME(env, 'w'));
5300 if (!env->me_wmutex) goto fail_errno;
5301 #elif defined(EDB_USE_POSIX_SEM)
5302 struct stat stbuf;
5303 struct {
5304 dev_t dev;
5305 ino_t ino;
5306 } idbuf;
5307
5308 #if defined(__NetBSD__)
5309 #define EDB_SHORT_SEMNAMES 1
5310 #endif
5311 if (fstat(env->me_lfd, &stbuf)) goto fail_errno;
5312 memset(&idbuf, 0, sizeof(idbuf));
5313 idbuf.dev = stbuf.st_dev;
5314 idbuf.ino = stbuf.st_ino;
5315 env->me_txns->mti_mutexid = edb_hash(&idbuf, sizeof(idbuf))
5316 #ifdef EDB_SHORT_SEMNAMES
5317
5318
5319
5320 % ((edb_hash_t)85*85*85*85*85*85*85*85*85)
5321 #endif
5322 ;
5323 edb_env_mname_init(env);
5324
5325
5326
5327 sem_unlink(MUTEXNAME(env, 'r'));
5328 sem_unlink(MUTEXNAME(env, 'w'));
5329 env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), O_CREAT|O_EXCL, mode, 1);
5330 if (env->me_rmutex == SEM_FAILED) goto fail_errno;
5331 env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), O_CREAT|O_EXCL, mode, 1);
5332 if (env->me_wmutex == SEM_FAILED) goto fail_errno;
5333 #elif defined(EDB_USE_SYSV_SEM)
5334 unsigned short vals[2] = {1, 1};
5335 key_t key = ftok(fname->mn_val, 'M');
5336 if (key == -1) {
5337 int err_ = errno;
5338 NDRX_LOG(log_error, "%s: ftok failed: %s",
5339 __func__, strerror(err_));
5340 errno = err_;
5341 goto fail_errno;
5342 }
5343 semid = semget(key, 2, (mode & 0777) | IPC_CREAT);
5344 if (semid < 0) {
5345 int err_ = errno;
5346 NDRX_LOG(log_error, "%s: semget failed: %s",
5347 __func__, strerror(err_));
5348 errno = err_;
5349 goto fail_errno;
5350 }
5351 semu.array = vals;
5352 if (semctl(semid, 0, SETALL, semu) < 0) {
5353 int err_ = errno;
5354 NDRX_LOG(log_error, "%s: semctl failed: %s",
5355 __func__, strerror(err_));
5356 errno = err_;
5357 goto fail_errno;
5358 }
5359 env->me_txns->mti_semid = semid;
5360 env->me_txns->mti_rlocked = 0;
5361 env->me_txns->mti_wlocked = 0;
5362 #else
5363 pthread_mutexattr_t mattr;
5364
5365
5366
5367
5368
5369 memset(env->me_txns->mti_rmutex, 0, sizeof(*env->me_txns->mti_rmutex));
5370 memset(env->me_txns->mti_wmutex, 0, sizeof(*env->me_txns->mti_wmutex));
5371
5372 if ((rc = pthread_mutexattr_init(&mattr)) != 0)
5373 goto fail;
5374 rc = pthread_mutexattr_setpshared(&mattr, PTHREAD_PROCESS_SHARED);
5375 #ifdef EDB_ROBUST_SUPPORTED
5376 if (!rc) rc = pthread_mutexattr_setrobust(&mattr, PTHREAD_MUTEX_ROBUST);
5377 #endif
5378 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_rmutex, &mattr);
5379 if (!rc) rc = pthread_mutex_init(env->me_txns->mti_wmutex, &mattr);
5380 pthread_mutexattr_destroy(&mattr);
5381 if (rc) {
5382 NDRX_LOG(log_error, "%s: pthread_mutexattr_destroy failed: %s",
5383 __func__, strerror(errno));
5384 goto fail;
5385 }
5386 #endif
5387
5388 env->me_txns->mti_magic = EDB_MAGIC;
5389 env->me_txns->mti_format = EDB_LOCK_FORMAT;
5390 env->me_txns->mti_txnid = 0;
5391 env->me_txns->mti_numreaders = 0;
5392
5393 } else {
5394 #ifdef EDB_USE_SYSV_SEM
5395 struct semid_ds buf;
5396 #endif
5397 if (env->me_txns->mti_magic != EDB_MAGIC) {
5398 NDRX_LOG(log_error, "lock region has invalid magic");
5399 rc = EDB_INVALID;
5400 goto fail;
5401 }
5402 if (env->me_txns->mti_format != EDB_LOCK_FORMAT) {
5403 NDRX_LOG(log_error, "lock region has format+version 0x%x, expected 0x%x",
5404 env->me_txns->mti_format, EDB_LOCK_FORMAT);
5405 rc = EDB_VERSION_MISMATCH;
5406 goto fail;
5407 }
5408 rc = ErrCode();
5409 if (rc && rc != EACCES && rc != EAGAIN) {
5410 NDRX_LOG(log_error, "Invalid rc=%d", rc);
5411 goto fail;
5412 }
5413 #ifdef _WIN32
5414 edb_env_mname_init(env);
5415 env->me_rmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'r'));
5416 if (!env->me_rmutex) goto fail_errno;
5417 env->me_wmutex = OpenMutexA(SYNCHRONIZE, FALSE, MUTEXNAME(env, 'w'));
5418 if (!env->me_wmutex) goto fail_errno;
5419 #elif defined(EDB_USE_POSIX_SEM)
5420 edb_env_mname_init(env);
5421 env->me_rmutex = sem_open(MUTEXNAME(env, 'r'), 0);
5422 if (env->me_rmutex == SEM_FAILED) goto fail_errno;
5423 env->me_wmutex = sem_open(MUTEXNAME(env, 'w'), 0);
5424 if (env->me_wmutex == SEM_FAILED) goto fail_errno;
5425 #elif defined(EDB_USE_SYSV_SEM)
5426 semid = env->me_txns->mti_semid;
5427 semu.buf = &buf;
5428
5429 if (semctl(semid, 0, IPC_STAT, semu) < 0) {
5430 int err_ = errno;
5431 NDRX_LOG(log_error, "%s: semctl failed: %s",
5432 __func__, strerror(err_));
5433 errno = err_;
5434 goto fail_errno;
5435 }
5436
5437 if (semctl(semid, 0, IPC_SET, semu) < 0) {
5438 int err_ = errno;
5439 NDRX_LOG(log_error, "%s: semctl failed: %s",
5440 __func__, strerror(err_));
5441 errno = err_;
5442 goto fail_errno;
5443 }
5444 #endif
5445 }
5446 #ifdef EDB_USE_SYSV_SEM
5447 env->me_rmutex->semid = semid;
5448 env->me_wmutex->semid = semid;
5449 env->me_rmutex->semnum = 0;
5450 env->me_wmutex->semnum = 1;
5451 env->me_rmutex->locked = &env->me_txns->mti_rlocked;
5452 env->me_wmutex->locked = &env->me_txns->mti_wlocked;
5453 #endif
5454
5455 return EDB_SUCCESS;
5456
5457 fail_errno:
5458 rc = ErrCode();
5459 fail:
5460 return rc;
5461 }
5462
5463
5464
5465
5466
/* Environment flag groups. edb_env_open() rejects any flag that is in
 * neither group.
 * NOTE(review): by their names, CHANGEABLE flags may be altered after the
 * environment is open and CHANGELESS ones may not -- confirm against the
 * flag-setting code.
 */
#define CHANGEABLE (EDB_NOSYNC|EDB_NOMETASYNC|EDB_MAPASYNC|EDB_NOMEMINIT)
#define CHANGELESS (EDB_FIXEDMAP|EDB_NOSUBDIR|EDB_RDONLY| \
	EDB_WRITEMAP|EDB_NOTLS|EDB_NOLOCK|EDB_NORDAHEAD|EDB_PREVSNAPSHOT)

/* Compile-time guard: persistent DB flags must not share bits with the
 * environment flags above.
 */
#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
# error "Persistent DB flags & env flags overlap, but both go in mm_flags"
#endif
5474
5475 int ESECT
5476 edb_env_open(EDB_env *env, const char *path, unsigned int flags, edb_mode_t mode)
5477 {
5478 int rc, excl = -1;
5479 EDB_name fname;
5480
5481 if (env->me_fd!=INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
5482 return EINVAL;
5483
5484 #ifdef EDB_VL32
5485 if (flags & EDB_WRITEMAP) {
5486
5487 flags ^= EDB_WRITEMAP;
5488 }
5489 if (flags & EDB_FIXEDMAP) {
5490
5491 return EINVAL;
5492 }
5493 #endif
5494 flags |= env->me_flags;
5495
5496 rc = edb_fname_init(path, flags, &fname);
5497 if (rc)
5498 return rc;
5499
5500 #ifdef EDB_VL32
5501 #ifdef _WIN32
5502 env->me_rpmutex = CreateMutex(NULL, FALSE, NULL);
5503 if (!env->me_rpmutex) {
5504 rc = ErrCode();
5505 goto leave;
5506 }
5507 #else
5508 rc = pthread_mutex_init(&env->me_rpmutex, NULL);
5509 if (rc)
5510 {
5511 NDRX_LOG(log_error, "%s: pthread_mutex_init failed: %d",
5512 __func__, rc);
5513 goto leave;
5514 }
5515 #endif
5516 #endif
5517 flags |= EDB_ENV_ACTIVE;
5518
5519 if (flags & EDB_RDONLY) {
5520
5521 flags &= ~EDB_WRITEMAP;
5522 } else {
5523 if (!((env->me_free_pgs = edb_eidl_alloc(EDB_IDL_UM_MAX)) &&
5524 (env->me_dirty_list = calloc(EDB_IDL_UM_SIZE, sizeof(EDB_ID2)))))
5525 {
5526 NDRX_LOG(log_error, "edb_eidl_alloc failed");
5527 rc = ENOMEM;
5528 }
5529 }
5530
5531 env->me_flags = flags;
5532 if (rc)
5533 goto leave;
5534
5535 #ifdef EDB_VL32
5536 {
5537 env->me_rpages = malloc(EDB_ERPAGE_SIZE * sizeof(EDB_ID3));
5538 if (!env->me_rpages) {
5539 NDRX_LOG(log_error, "malloc failed: %ld",
5540 (long)(EDB_ERPAGE_SIZE * sizeof(EDB_ID3)));
5541 rc = ENOMEM;
5542 goto leave;
5543 }
5544 env->me_rpages[0].mid = 0;
5545 env->me_rpcheck = EDB_ERPAGE_SIZE/2;
5546 }
5547 #endif
5548
5549 env->me_path = strdup(path);
5550 env->me_dbxs = calloc(env->me_maxdbs, sizeof(EDB_dbx));
5551 env->me_dbflags = calloc(env->me_maxdbs, sizeof(uint16_t));
5552 env->me_dbiseqs = calloc(env->me_maxdbs, sizeof(unsigned int));
5553 if (!(env->me_dbxs && env->me_path && env->me_dbflags && env->me_dbiseqs)) {
5554 NDRX_LOG(log_error, "calloc failed: %p %p %p %p",
5555 env->me_path, env->me_dbxs, env->me_dbflags, env->me_dbiseqs);
5556 rc = ENOMEM;
5557 goto leave;
5558 }
5559 env->me_dbxs[FREE_DBI].md_cmp = edb_cmp_long;
5560
5561
5562 if (!(flags & (EDB_RDONLY|EDB_NOLOCK))) {
5563 rc = edb_env_setup_locks(env, &fname, mode, &excl);
5564 if (rc) {
5565 NDRX_LOG(log_error, "%s: edb_env_setup_locks failed: %d",
5566 __func__, rc);
5567 goto leave;
5568 }
5569
5570 if ((flags & EDB_PREVSNAPSHOT) && !excl) {
5571 rc = EAGAIN;
5572 goto leave;
5573 }
5574 }
5575
5576 rc = edb_fopen(env, &fname,
5577 (flags & EDB_RDONLY) ? EDB_O_RDONLY : EDB_O_RDWR,
5578 mode, &env->me_fd);
5579 if (rc)
5580 goto leave;
5581
5582 if ((flags & (EDB_RDONLY|EDB_NOLOCK)) == EDB_RDONLY) {
5583 rc = edb_env_setup_locks(env, &fname, mode, &excl);
5584 if (rc) {
5585 NDRX_LOG(log_error, "%s: edb_env_setup_locks (2) failed: %d",
5586 __func__, rc);
5587 goto leave;
5588 }
5589 }
5590
5591 if ((rc = edb_env_open2(env, flags & EDB_PREVSNAPSHOT)) == EDB_SUCCESS) {
5592 if (!(flags & (EDB_RDONLY|EDB_WRITEMAP))) {
5593
5594
5595
5596 rc = edb_fopen(env, &fname, EDB_O_META, mode, &env->me_mfd);
5597 if (rc) {
5598 NDRX_LOG(log_error, "%s: edb_fopen failed: %d",
5599 __func__, rc);
5600 goto leave;
5601 }
5602 }
5603 DPRINTF(("opened dbenv %p", (void *) env));
5604 if (excl > 0 && !(flags & EDB_PREVSNAPSHOT)) {
5605 rc = edb_env_share_locks(env, &excl);
5606 if (rc)
5607 goto leave;
5608 }
5609 if (!(flags & EDB_RDONLY)) {
5610 EDB_txn *txn;
5611 int tsize = sizeof(EDB_txn), size = tsize + env->me_maxdbs *
5612 (sizeof(EDB_db)+sizeof(EDB_cursor *)+sizeof(unsigned int)+1);
5613 if ((env->me_pbuf = calloc(1, env->me_psize)) &&
5614 (txn = calloc(1, size)))
5615 {
5616 txn->mt_dbs = (EDB_db *)((char *)txn + tsize);
5617 txn->mt_cursors = (EDB_cursor **)(txn->mt_dbs + env->me_maxdbs);
5618 txn->mt_dbiseqs = (unsigned int *)(txn->mt_cursors + env->me_maxdbs);
5619 txn->mt_dbflags = (unsigned char *)(txn->mt_dbiseqs + env->me_maxdbs);
5620 txn->mt_env = env;
5621 #ifdef EDB_VL32
5622 txn->mt_rpages = malloc(EDB_TRPAGE_SIZE * sizeof(EDB_ID3));
5623 if (!txn->mt_rpages) {
5624 NDRX_LOG(log_error, "malloc failed: %s", strerror(errno));
5625 free(txn);
5626 rc = ENOMEM;
5627 goto leave;
5628 }
5629 txn->mt_rpages[0].mid = 0;
5630 txn->mt_rpcheck = EDB_TRPAGE_SIZE/2;
5631 #endif
5632 txn->mt_dbxs = env->me_dbxs;
5633 txn->mt_flags = EDB_TXN_FINISHED;
5634 env->me_txn0 = txn;
5635 } else {
5636 NDRX_LOG(log_error, "malloc failed: %s", strerror(errno));
5637 rc = ENOMEM;
5638 }
5639 }
5640 }
5641
5642 leave:
5643 if (rc) {
5644 edb_env_close0(env, excl);
5645 }
5646 edb_fname_destroy(fname);
5647 return rc;
5648 }
5649
5650
/** Release the resources held by an environment that is marked active.
 *
 * Returns immediately when #EDB_ENV_ACTIVE is not set in env->me_flags.
 * Otherwise frees the per-environment tables and buffers, unmaps the data
 * and lock-region mappings, closes the file descriptors and finally clears
 * #EDB_ENV_ACTIVE and #EDB_ENV_TXKEY. The EDB_env structure itself is not
 * freed here.
 *
 * @param[in] env  the environment to tear down.
 * @param[in] excl lock state supplied by the caller. As used below: with
 *	POSIX or SysV semaphores a value of 0 triggers a call to
 *	edb_env_excl_lock() and a value > 0 afterwards removes the
 *	semaphores; on Windows a value >= 0 unlocks the first byte of the
 *	lock file before it is closed.
 */
5651 static void ESECT
5652 edb_env_close0(EDB_env *env, int excl)
5653 {
5654 int i;
5655
/* Nothing to release unless the environment was marked active. */
5656 if (!(env->me_flags & EDB_ENV_ACTIVE))
5657 return;
5658
5659
/* Free the name buffers of slots CORE_DBS..me_maxdbs-1, then the table. */
5660 if (env->me_dbxs) {
5661 for (i = env->me_maxdbs; --i >= CORE_DBS; )
5662 free(env->me_dbxs[i].md_name.mv_data);
5663 free(env->me_dbxs);
5664 }
5665
5666 free(env->me_pbuf);
5667 free(env->me_dbiseqs);
5668 free(env->me_dbflags);
5669 free(env->me_path);
5670 free(env->me_dirty_list);
5671 #ifdef EDB_VL32
/* The page list of me_txn0 must go before me_txn0 itself is freed below. */
5672 if (env->me_txn0 && env->me_txn0->mt_rpages)
5673 free(env->me_txn0->mt_rpages);
/* Unmap every chunk recorded in the environment page list (entries
 * 1..el[0].mid), then free the list. */
5674 if (env->me_rpages) {
5675 EDB_ID3L el = env->me_rpages;
5676 unsigned int x;
5677 for (x=1; x<=el[0].mid; x++)
5678 munmap(el[x].mptr, el[x].mcnt * env->me_psize);
5679 free(el);
5680 }
5681 #endif
5682 free(env->me_txn0);
5683 edb_eidl_free(env->me_free_pgs);
5684
/* Drop the thread-specific key if one was created for this environment. */
5685 if (env->me_flags & EDB_ENV_TXKEY) {
5686 pthread_key_delete(env->me_txkey);
5687 #ifdef _WIN32
5688
/* Remove the key from edb_tls_keys by overwriting it with the last entry. */
5689 for (i=0; i<edb_tls_nkeys; i++)
5690 if (edb_tls_keys[i] == env->me_txkey) {
5691 edb_tls_keys[i] = edb_tls_keys[edb_tls_nkeys-1];
5692 edb_tls_nkeys--;
5693 break;
5694 }
5695 #endif
5696 }
5697
/* Unmap the data file: only the meta pages are mapped at me_map in
 * EDB_VL32 builds, the whole me_mapsize otherwise. */
5698 if (env->me_map) {
5699 #ifdef EDB_VL32
5700 munmap(env->me_map, NUM_METAS*env->me_psize);
5701 #else
5702 munmap(env->me_map, env->me_mapsize);
5703 #endif
5704 }
5705 if (env->me_mfd != INVALID_HANDLE_VALUE)
5706 (void) close(env->me_mfd);
5707 if (env->me_fd != INVALID_HANDLE_VALUE)
5708 (void) close(env->me_fd);
/* Lock region: release this process's reader slots and the mutexes or
 * semaphores, then unmap the region. */
5709 if (env->me_txns) {
5710 EDB_PID_T pid = getpid();
5711
5712
5713
5714
5715
5716
5717
/* Zero mr_pid in each of the first me_close_readers slots that carries
 * this process's pid. */
5718 for (i = env->me_close_readers; --i >= 0; )
5719 if (env->me_txns->mti_readers[i].mr_pid == pid)
5720 env->me_txns->mti_readers[i].mr_pid = 0;
5721 #ifdef _WIN32
5722 if (env->me_rmutex) {
5723 CloseHandle(env->me_rmutex);
5724 if (env->me_wmutex) CloseHandle(env->me_wmutex);
5725 }
5726
5727
5728
5729 #elif defined(EDB_USE_POSIX_SEM)
5730 if (env->me_rmutex != SEM_FAILED) {
5731 sem_close(env->me_rmutex);
5732 if (env->me_wmutex != SEM_FAILED)
5733 sem_close(env->me_wmutex);
5734
5735
5736
/* Unlink the named semaphores only when excl ends up > 0; with excl == 0
 * edb_env_excl_lock() is called first and may update excl.
 * NOTE(review): presumably excl > 0 means this process holds the exclusive
 * lock, i.e. is the last user - confirm against edb_env_excl_lock(). */
5737 if (excl == 0)
5738 edb_env_excl_lock(env, &excl);
5739 if (excl > 0) {
5740 sem_unlink(MUTEXNAME(env, 'r'));
5741 sem_unlink(MUTEXNAME(env, 'w'));
5742 }
5743 }
5744 #elif defined(EDB_USE_SYSV_SEM)
5745 if (env->me_rmutex->semid != -1) {
5746
5747
5748
/* Same excl handling as the POSIX case; here the semaphore set is removed. */
5749 if (excl == 0)
5750 edb_env_excl_lock(env, &excl);
5751 if (excl > 0)
5752 semctl(env->me_rmutex->semid, 0, IPC_RMID);
5753 }
5754 #endif
/* Region length: one EDB_txninfo plus (me_maxreaders-1) further EDB_reader
 * slots. */
5755 munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(EDB_reader)+sizeof(EDB_txninfo));
5756 }
5757 if (env->me_lfd != INVALID_HANDLE_VALUE) {
5758 #ifdef _WIN32
5759 if (excl >= 0) {
5760
5761
5762
/* Unlock the first byte of the lock file before closing it. */
5763 UnlockFile(env->me_lfd, 0, 0, 1, 0);
5764 }
5765 #endif
5766 (void) close(env->me_lfd);
5767 }
5768 #ifdef EDB_VL32
5769 #ifdef _WIN32
5770 if (env->me_fmh) CloseHandle(env->me_fmh);
5771 if (env->me_rpmutex) CloseHandle(env->me_rpmutex);
5772 #else
5773 pthread_mutex_destroy(&env->me_rpmutex);
5774 #endif
5775 #endif
5776
/* Clearing EDB_ENV_ACTIVE makes a later call to this function return at
 * the check at the top. */
5777 env->me_flags &= ~(EDB_ENV_ACTIVE|EDB_ENV_TXKEY);
5778 }
5779
5780 void ESECT
5781 edb_env_close(EDB_env *env)
5782 {
5783 EDB_page *dp;
5784
5785 if (env == NULL)
5786 return;
5787
5788 VGMEMP_DESTROY(env);
5789 while ((dp = env->me_dpages) != NULL) {
5790 VGMEMP_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
5791 env->me_dpages = dp->mp_next;
5792 free(dp);
5793 }
5794
5795 edb_env_close0(env, 0);
5796 free(env);
5797 }
5798
5799
5800 static int
5801 edb_cmp_long(const EDB_val *a, const EDB_val *b)
5802 {
5803 return (*(edb_size_t *)a->mv_data < *(edb_size_t *)b->mv_data) ? -1 :
5804 *(edb_size_t *)a->mv_data > *(edb_size_t *)b->mv_data;
5805 }
5806
5807
5808
5809
5810
5811
5812 static int
5813 edb_cmp_int(const EDB_val *a, const EDB_val *b)
5814 {
5815 return (*(unsigned int *)a->mv_data < *(unsigned int *)b->mv_data) ? -1 :
5816 *(unsigned int *)a->mv_data > *(unsigned int *)b->mv_data;
5817 }
5818
5819
5820
5821
5822 static int
5823 edb_cmp_cint(const EDB_val *a, const EDB_val *b)
5824 {
5825 #if BYTE_ORDER == LITTLE_ENDIAN
5826 unsigned short *u, *c;
5827 int x;
5828
5829 u = (unsigned short *) ((char *) a->mv_data + a->mv_size);
5830 c = (unsigned short *) ((char *) b->mv_data + a->mv_size);
5831 do {
5832 x = *--u - *--c;
5833 } while(!x && u > (unsigned short *)a->mv_data);
5834 return x;
5835 #else
5836 unsigned short *u, *c, *end;
5837 int x;
5838
5839 end = (unsigned short *) ((char *) a->mv_data + a->mv_size);
5840 u = (unsigned short *)a->mv_data;
5841 c = (unsigned short *)b->mv_data;
5842 do {
5843 x = *u++ - *c++;
5844 } while(!x && u < end);
5845 return x;
5846 #endif
5847 }
5848
5849
5850 static int
5851 edb_cmp_memn(const EDB_val *a, const EDB_val *b)
5852 {
5853 int diff;
5854 ssize_t len_diff;
5855 unsigned int len;
5856
5857 len = a->mv_size;
5858 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
5859 if (len_diff > 0) {
5860 len = b->mv_size;
5861 len_diff = 1;
5862 }
5863
5864 diff = memcmp(a->mv_data, b->mv_data, len);
5865 return diff ? diff : len_diff<0 ? -1 : len_diff;
5866 }
5867
5868
5869 static int
5870 edb_cmp_memnr(const EDB_val *a, const EDB_val *b)
5871 {
5872 const unsigned char *p1, *p2, *p1_lim;
5873 ssize_t len_diff;
5874 int diff;
5875
5876 p1_lim = (const unsigned char *)a->mv_data;
5877 p1 = (const unsigned char *)a->mv_data + a->mv_size;
5878 p2 = (const unsigned char *)b->mv_data + b->mv_size;
5879
5880 len_diff = (ssize_t) a->mv_size - (ssize_t) b->mv_size;
5881 if (len_diff > 0) {
5882 p1_lim += len_diff;
5883 len_diff = 1;
5884 }
5885
5886 while (p1 > p1_lim) {
5887 diff = *--p1 - *--p2;
5888 if (diff)
5889 return diff;
5890 }
5891 return len_diff<0 ? -1 : len_diff;
5892 }
5893
5894
5895
5896
5897
5898
5899
5900
5901 static EDB_node *
5902 edb_node_search(EDB_cursor *mc, EDB_val *key, int *exactp)
5903 {
5904 unsigned int i = 0, nkeys;
5905 int low, high;
5906 int rc = 0;
5907 EDB_page *mp = mc->mc_pg[mc->mc_top];
5908 EDB_node *node = NULL;
5909 EDB_val nodekey;
5910 EDB_cmp_func *cmp;
5911 DKBUF;
5912
5913 nkeys = NUMKEYS(mp);
5914
5915 DPRINTF(("searching %u keys in %s %spage %"Yu,
5916 nkeys, IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "",
5917 edb_dbg_pgno(mp)));
5918
5919 low = IS_LEAF(mp) ? 0 : 1;
5920 high = nkeys - 1;
5921 cmp = mc->mc_dbx->md_cmp;
5922
5923
5924
5925
5926 if (cmp == edb_cmp_cint && IS_BRANCH(mp)) {
5927 if (NODEPTR(mp, 1)->mn_ksize == sizeof(edb_size_t))
5928 cmp = edb_cmp_long;
5929 else
5930 cmp = edb_cmp_int;
5931 }
5932
5933 if (IS_LEAF2(mp)) {
5934 nodekey.mv_size = mc->mc_db->md_pad;
5935 node = NODEPTR(mp, 0);
5936 while (low <= high) {
5937 i = (low + high) >> 1;
5938 nodekey.mv_data = LEAF2KEY(mp, i, nodekey.mv_size);
5939 rc = cmp(key, &nodekey);
5940 DPRINTF(("found leaf index %u [%s], rc = %i",
5941 i, DKEY(&nodekey), rc));
5942 if (rc == 0)
5943 break;
5944 if (rc > 0)
5945 low = i + 1;
5946 else
5947 high = i - 1;
5948 }
5949 } else {
5950 while (low <= high) {
5951 i = (low + high) >> 1;
5952
5953 node = NODEPTR(mp, i);
5954 nodekey.mv_size = NODEKSZ(node);
5955 nodekey.mv_data = NODEKEY(node);
5956
5957 rc = cmp(key, &nodekey);
5958 #if EDB_DEBUG
5959 if (IS_LEAF(mp))
5960 DPRINTF(("found leaf index %u [%s], rc = %i",
5961 i, DKEY(&nodekey), rc));
5962 else
5963 DPRINTF(("found branch index %u [%s -> %"Yu"], rc = %i",
5964 i, DKEY(&nodekey), NODEPGNO(node), rc));
5965 #endif
5966 if (rc == 0)
5967 break;
5968 if (rc > 0)
5969 low = i + 1;
5970 else
5971 high = i - 1;
5972 }
5973 }
5974
5975 if (rc > 0) {
5976 i++;
5977 if (!IS_LEAF2(mp))
5978 node = NODEPTR(mp, i);
5979 }
5980 if (exactp)
5981 *exactp = (rc == 0 && nkeys > 0);
5982
5983 mc->mc_ki[mc->mc_top] = i;
5984 if (i >= nkeys)
5985
5986 return NULL;
5987
5988
5989 return node;
5990 }
5991
/* Compiled out by the "#if 0" below. As written, it would call
 * func(mc, m2) for every cursor m2 on the transaction's cursor list for
 * mc->mc_dbi whose top-of-stack page is the same page as mc's.
 * NOTE(review): the parameter "func" has no declared type, so this would
 * need a proper prototype before it could be enabled. */
5992 #if 0
5993 static void
5994 edb_cursor_adjust(EDB_cursor *mc, func)
5995 {
5996 EDB_cursor *m2;
5997
5998 for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
5999 if (m2->mc_pg[m2->mc_top] == mc->mc_pg[mc->mc_top]) {
6000 func(mc, m2);
6001 }
6002 }
6003 }
6004 #endif
6005
6006
6007 static void
6008 edb_cursor_pop(EDB_cursor *mc)
6009 {
6010 if (mc->mc_snum) {
6011 DPRINTF(("popping page %"Yu" off db %d cursor %p",
6012 mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *) mc));
6013
6014 mc->mc_snum--;
6015 if (mc->mc_snum) {
6016 mc->mc_top--;
6017 } else {
6018 mc->mc_flags &= ~C_INITIALIZED;
6019 }
6020 }
6021 }
6022
6023
6024
6025
6026 static int
6027 edb_cursor_push(EDB_cursor *mc, EDB_page *mp)
6028 {
6029 DPRINTF(("pushing page %"Yu" on db %d cursor %p", mp->mp_pgno,
6030 DDBI(mc), (void *) mc));
6031
6032 if (mc->mc_snum >= CURSOR_STACK) {
6033 mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
6034 return EDB_CURSOR_FULL;
6035 }
6036
6037 mc->mc_top = mc->mc_snum++;
6038 mc->mc_pg[mc->mc_top] = mp;
6039 mc->mc_ki[mc->mc_top] = 0;
6040
6041 return EDB_SUCCESS;
6042 }
6043
6044 #ifdef EDB_VL32
6045
6046
6047
6048
6049
6050
6051
6052
6053
6054
6055
6056
6057
6058
6059
6060
6061
6062
6063
6064
6065
6066
6067
6068
6069
6070
6071
6072
6073
6074
6075
6076
6077
6078
6079
6080
6081
6082
6083
6084
6085
6086
6087
6088
6089
6090
6091
6092
6093
/** Return a pointer to page @b pg0, mapping the file region that holds it
 * when needed (EDB_VL32 builds only).
 *
 * Two lists of mapped regions are consulted, both searched by page number:
 * the transaction's list (txn->mt_rpages, "tl") and the environment's list
 * (env->me_rpages, "el"). The latter is only touched while holding
 * env->me_rpmutex. Entries carry a start page (mid), a mapping address
 * (mptr), a length in pages (mcnt) and a reference count (mref).
 *
 * Regions are normally keyed by pg0 rounded down to a multiple of
 * EDB_RPAGE_CHUNK. When pg0 is an overflow page whose extent does not fit
 * the existing mapping, a larger mapping is created; depending on the case
 * it either replaces the existing entry or is added to the transaction's
 * list under the key pg0 itself.
 *
 * @param[in]  txn the transaction on whose behalf the page is fetched.
 * @param[in]  pg0 the page number wanted.
 * @param[out] ret receives the address of the page on success.
 * @return #EDB_SUCCESS; #EDB_TXN_FULL when the transaction's list has no
 *	room; #EDB_MAP_FULL when the environment's list has no room; or the
 *	error produced by the mapping call.
 */
6094 static int
6095 edb_rpage_get(EDB_txn *txn, pgno_t pg0, EDB_page **ret)
6096 {
6097 EDB_env *env = txn->mt_env;
6098 EDB_page *p;
6099 EDB_ID3L tl = txn->mt_rpages;
6100 EDB_ID3L el = env->me_rpages;
6101 EDB_ID3 id3;
6102 unsigned x, rem;
6103 pgno_t pgno;
6104 int rc, retries = 1;
/* SET_OFF stores a file offset; MAP creates a read-only mapping of len
 * bytes at that offset and leaves 0 or an error code in rc. Both are
 * defined per platform. */
6105 #ifdef _WIN32
6106 LARGE_INTEGER off;
6107 SIZE_T len;
6108 #define SET_OFF(off,val) off.QuadPart = val
6109 #define MAP(rc,env,addr,len,off) \
6110 addr = NULL; \
6111 rc = NtMapViewOfSection(env->me_fmh, GetCurrentProcess(), &addr, 0, \
6112 len, &off, &len, ViewUnmap, (env->me_flags & EDB_RDONLY) ? 0 : MEM_RESERVE, PAGE_READONLY); \
6113 if (rc) rc = edb_nt2win32(rc)
6114 #else
6115 off_t off;
6116 size_t len;
6117 #define SET_OFF(off,val) off = val
6118 #define MAP(rc,env,addr,len,off) \
6119 addr = mmap(NULL, len, PROT_READ, MAP_SHARED, env->me_fd, off); \
6120 rc = (addr == MAP_FAILED) ? errno : 0
6121 #endif
6122
6123
6124
6125
/* rem is pg0's offset inside its chunk and pgno the first page of that
 * chunk (assumes EDB_RPAGE_CHUNK is a power of two - TODO confirm). */
6126 rem = pg0 & (EDB_RPAGE_CHUNK-1);
6127 pgno = pg0 ^ rem;
6128
/* id3.mid == 0 means "no mapping prepared yet"; the code after the
 * notlocal label tests it. */
6129 id3.mid = 0;
/* First look for the chunk in the transaction's own list. */
6130 x = edb_mid3l_search(tl, pgno);
6131 if (x <= tl[0].mid && tl[x].mid == pgno) {
/* Prefer a following entry keyed by pg0 itself, if there is one. */
6132 if (x != tl[0].mid && tl[x+1].mid == pg0)
6133 x++;
6134
6135 p = (EDB_page *)((char *)tl[x].mptr + rem * env->me_psize);
/* An overflow page reaching past the end of this mapping needs a larger
 * mapping, created here from the chunk start. */
6136 if (IS_OVERFLOW(p) && p->mp_pages + rem > tl[x].mcnt) {
6137 id3.mcnt = p->mp_pages + rem;
6138 len = id3.mcnt * env->me_psize;
6139 SET_OFF(off, pgno * env->me_psize);
6140 MAP(rc, env, id3.mptr, len, off);
6141 if (rc)
6142 return rc;
6143
6144 if (rem) {
6145 edb_tassert(txn, tl[x].mid != pg0);
6146
6147
6148
6149
/* pg0 is not the first page of its chunk: keep the existing entry and add
 * the new mapping as a separate entry keyed by pg0 (via notlocal/found). */
6150 id3.mid = pg0;
6151 goto notlocal;
6152 } else {
6153
/* pg0 starts the chunk: the new mapping takes over this entry. */
6154 tl[x].mptr = id3.mptr;
6155 tl[x].mcnt = id3.mcnt;
6156
/* When this transaction holds no reference on the entry, also update the
 * environment's entry: replace its mapping if its count is exactly 1,
 * otherwise just drop one reference from it. */
6157 if (!tl[x].mref) {
6158 unsigned i;
6159 pthread_mutex_lock(&env->me_rpmutex);
6160 i = edb_mid3l_search(el, tl[x].mid);
6161 if (el[i].mref == 1) {
6162
6163 munmap(el[i].mptr, el[i].mcnt * env->me_psize);
6164 el[i].mptr = tl[x].mptr;
6165 el[i].mcnt = tl[x].mcnt;
6166 } else {
6167
6168 el[i].mref--;
6169 }
6170 pthread_mutex_unlock(&env->me_rpmutex);
6171 }
6172 }
6173 }
/* Use the (possibly updated) local entry and take a reference on it. */
6174 id3.mptr = tl[x].mptr;
6175 id3.mcnt = tl[x].mcnt;
6176 tl[x].mref++;
6177 goto ok;
6178 }
6179
6180 notlocal:
/* The transaction's list is within mt_rpcheck entries of EDB_TRPAGE_MAX:
 * try to drop entries this transaction no longer references. */
6181 if (tl[0].mid >= EDB_TRPAGE_MAX - txn->mt_rpcheck) {
6182 unsigned i, y;
6183
6184 pthread_mutex_lock(&env->me_rpmutex);
/* Also entered by "goto retry" from the environment-list purge further
 * down; on that path the mutex is already held. */
6185 retry:
6186 y = 0;
6187 for (i=1; i<=tl[0].mid; i++) {
6188 if (!tl[i].mref) {
/* y remembers the first unreferenced slot. */
6189 if (!y) y = i;
6190
/* Entries whose key is not chunk-aligned are unmapped right here; for the
 * others the matching environment entry loses one reference. */
6191 if (tl[i].mid & (EDB_RPAGE_CHUNK-1)) {
6192 munmap(tl[i].mptr, tl[i].mcnt * env->me_psize);
6193 continue;
6194 }
6195 x = edb_mid3l_search(el, tl[i].mid);
6196 el[x].mref--;
6197 }
6198 }
6199 pthread_mutex_unlock(&env->me_rpmutex);
6200 if (!y) {
6201
6202
6203
/* Nothing could be dropped: fail when the list is completely full,
 * otherwise halve mt_rpcheck. */
6204 if (tl[0].mid >= EDB_TRPAGE_MAX)
6205 return EDB_TXN_FULL;
6206
6207
6208
6209 txn->mt_rpcheck /= 2;
6210 } else {
6211
/* Compact the list over the dropped slots, keeping referenced entries. */
6212 for (i=y+1; i<= tl[0].mid; i++)
6213 if (tl[i].mref)
6214 tl[y++] = tl[i];
6215 tl[0].mid = y-1;
6216
/* Bring mt_rpcheck back to at least 1 and double it while it is below both
 * the list length and EDB_TRPAGE_SIZE/2. */
6217 if (!txn->mt_rpcheck)
6218 txn->mt_rpcheck = 1;
6219 while (txn->mt_rpcheck < tl[0].mid && txn->mt_rpcheck < EDB_TRPAGE_SIZE/2)
6220 txn->mt_rpcheck *= 2;
6221 }
6222 }
6223 if (tl[0].mid < EDB_TRPAGE_SIZE) {
6224 id3.mref = 1;
/* A mapping was already prepared in the overflow case above: just record
 * it in the transaction's list. */
6225 if (id3.mid)
6226 goto found;
6227
/* In a read-only environment the mapping is cut short at the
 * transaction's last page; otherwise a whole chunk is mapped. */
6228 if ((env->me_flags & EDB_RDONLY) && pgno + EDB_RPAGE_CHUNK-1 > txn->mt_last_pgno)
6229 id3.mcnt = txn->mt_last_pgno + 1 - pgno;
6230 else
6231 id3.mcnt = EDB_RPAGE_CHUNK;
6232 len = id3.mcnt * env->me_psize;
6233 id3.mid = pgno;
6234
6235
/* Look for the chunk in the environment's list, under the mutex. */
6236 pthread_mutex_lock(&env->me_rpmutex);
6237 x = edb_mid3l_search(el, pgno);
6238 if (x <= el[0].mid && el[x].mid == pgno) {
6239 id3.mptr = el[x].mptr;
6240 id3.mcnt = el[x].mcnt;
6241
6242 p = (EDB_page *)((char *)id3.mptr + rem * env->me_psize);
/* Same overflow check as for the local list: map a larger region when the
 * page's extent does not fit. */
6243 if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) {
6244 id3.mcnt = p->mp_pages + rem;
6245 len = id3.mcnt * env->me_psize;
6246 SET_OFF(off, pgno * env->me_psize);
6247 MAP(rc, env, id3.mptr, len, off);
6248 if (rc)
6249 goto fail;
/* An unreferenced environment entry is replaced by the larger mapping; a
 * referenced one is left alone and the new mapping goes only into the
 * transaction's list, keyed by pg0. */
6250 if (!el[x].mref) {
6251 munmap(el[x].mptr, env->me_psize * el[x].mcnt);
6252 el[x].mptr = id3.mptr;
6253 el[x].mcnt = id3.mcnt;
6254 } else {
6255 id3.mid = pg0;
6256 pthread_mutex_unlock(&env->me_rpmutex);
6257 goto found;
6258 }
6259 }
6260 el[x].mref++;
6261 pthread_mutex_unlock(&env->me_rpmutex);
6262 goto found;
6263 }
/* Not in the environment's list. When that list is within me_rpcheck
 * entries of EDB_ERPAGE_MAX, unmap its unreferenced entries first. */
6264 if (el[0].mid >= EDB_ERPAGE_MAX - env->me_rpcheck) {
6265
6266 unsigned i, y = 0;
6267 for (i=1; i<=el[0].mid; i++) {
6268 if (!el[i].mref) {
6269 if (!y) y = i;
6270 munmap(el[i].mptr, env->me_psize * el[i].mcnt);
6271 }
6272 }
6273 if (!y) {
/* Nothing was unreferenced: once per call, go back and purge the
 * transaction's list (mutex still held), which may drop references on
 * environment entries. */
6274 if (retries) {
6275
6276 retries--;
6277 id3.mid = 0;
6278 goto retry;
6279 }
6280 if (el[0].mid >= EDB_ERPAGE_MAX) {
6281 pthread_mutex_unlock(&env->me_rpmutex);
6282 return EDB_MAP_FULL;
6283 }
6284 env->me_rpcheck /= 2;
6285 } else {
/* Compact the environment's list and re-tune me_rpcheck, mirroring the
 * handling of the transaction's list above. */
6286 for (i=y+1; i<= el[0].mid; i++)
6287 if (el[i].mref)
6288 el[y++] = el[i];
6289 el[0].mid = y-1;
6290 if (!env->me_rpcheck)
6291 env->me_rpcheck = 1;
6292 while (env->me_rpcheck < el[0].mid && env->me_rpcheck < EDB_ERPAGE_SIZE/2)
6293 env->me_rpcheck *= 2;
6294 }
6295 }
/* Map the chunk. */
6296 SET_OFF(off, pgno * env->me_psize);
6297 MAP(rc, env, id3.mptr, len, off);
6298 if (rc) {
/* Common error exit for failures that happen while the mutex is held. */
6299 fail:
6300 pthread_mutex_unlock(&env->me_rpmutex);
6301 return rc;
6302 }
6303
/* If the wanted page is an overflow page that does not fit the fresh
 * mapping, drop that mapping and map the required number of pages. */
6304 p = (EDB_page *)((char *)id3.mptr + rem * env->me_psize);
6305 if (IS_OVERFLOW(p) && p->mp_pages + rem > id3.mcnt) {
6306 id3.mcnt = p->mp_pages + rem;
6307 munmap(id3.mptr, len);
6308 len = id3.mcnt * env->me_psize;
6309 MAP(rc, env, id3.mptr, len, off);
6310 if (rc)
6311 goto fail;
6312 }
6313 edb_mid3l_insert(el, &id3);
6314 pthread_mutex_unlock(&env->me_rpmutex);
6315 found:
/* Record the mapping in the transaction's list. */
6316 edb_mid3l_insert(tl, &id3);
6317 } else {
6318 return EDB_TXN_FULL;
6319 }
6320 ok:
/* The wanted page lies rem pages into the selected mapping. */
6321 p = (EDB_page *)((char *)id3.mptr + rem * env->me_psize);
6322 #if EDB_DEBUG
6323 if (IS_OVERFLOW(p)) {
6324 edb_tassert(txn, p->mp_pages + rem <= id3.mcnt);
6325 }
6326 #endif
6327 *ret = p;
6328 return EDB_SUCCESS;
6329 }
6330 #endif
6331
6332
6333
6334
6335
6336
6337
6338
6339
6340 static int
6341 edb_page_get(EDB_cursor *mc, pgno_t pgno, EDB_page **ret, int *lvl)
6342 {
6343 EDB_txn *txn = mc->mc_txn;
6344 EDB_page *p = NULL;
6345 int level;
6346
6347 if (! (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP))) {
6348 EDB_txn *tx2 = txn;
6349 level = 1;
6350 do {
6351 EDB_ID2L dl = tx2->mt_u.dirty_list;
6352 unsigned x;
6353
6354
6355
6356
6357
6358 if (tx2->mt_spill_pgs) {
6359 EDB_ID pn = pgno << 1;
6360 x = edb_eidl_search(tx2->mt_spill_pgs, pn);
6361 if (x <= tx2->mt_spill_pgs[0] && tx2->mt_spill_pgs[x] == pn) {
6362 goto mapped;
6363 }
6364 }
6365 if (dl[0].mid) {
6366 unsigned x = edb_mid2l_search(dl, pgno);
6367 if (x <= dl[0].mid && dl[x].mid == pgno) {
6368 p = dl[x].mptr;
6369 goto done;
6370 }
6371 }
6372 level++;
6373 } while ((tx2 = tx2->mt_parent) != NULL);
6374 }
6375
6376 if (pgno >= txn->mt_next_pgno) {
6377 DPRINTF(("page %"Yu" not found", pgno));
6378 txn->mt_flags |= EDB_TXN_ERROR;
6379 return EDB_PAGE_NOTFOUND;
6380 }
6381
6382 level = 0;
6383
6384 mapped:
6385 {
6386 #ifdef EDB_VL32
6387 int rc = edb_rpage_get(txn, pgno, &p);
6388 if (rc) {
6389 txn->mt_flags |= EDB_TXN_ERROR;
6390 return rc;
6391 }
6392 #else
6393 EDB_env *env = txn->mt_env;
6394 p = (EDB_page *)(env->me_map + env->me_psize * pgno);
6395 #endif
6396 }
6397
6398 done:
6399 *ret = p;
6400 if (lvl)
6401 *lvl = level;
6402 return EDB_SUCCESS;
6403 }
6404
6405
6406
6407
/** Descend from the page on top of the cursor stack down to a leaf page.
 *
 * At every branch page one child is chosen and pushed onto the cursor
 * stack: the first child with #EDB_PS_FIRST, the last child with
 * #EDB_PS_LAST, otherwise the child selected by searching for @b key.
 *
 * @param[in,out] mc    cursor whose top page is the starting point.
 * @param[in]     key   key to search for; not used for the descent when
 *	EDB_PS_FIRST or EDB_PS_LAST is given.
 * @param[in]     flags EDB_PS_FIRST, EDB_PS_LAST and/or EDB_PS_MODIFY; with
 *	EDB_PS_MODIFY each page reached is passed through edb_page_touch().
 * @return #EDB_SUCCESS with the cursor marked C_INITIALIZED and C_EOF
 *	cleared; #EDB_CORRUPTED (transaction flagged with #EDB_TXN_ERROR) when
 *	the descent ends on a page that is not a leaf; or an error from
 *	edb_page_get(), edb_cursor_push() or edb_page_touch().
 */
6408 static int
6409 edb_page_search_root(EDB_cursor *mc, EDB_val *key, int flags)
6410 {
6411 EDB_page *mp = mc->mc_pg[mc->mc_top];
6412 int rc;
6413 DKBUF;
6414
6415 while (IS_BRANCH(mp)) {
6416 EDB_node *node;
6417 indx_t i;
6418
6419 DPRINTF(("branch page %"Yu" has %u keys", mp->mp_pgno, NUMKEYS(mp)));
6420
6421
6422
6423
/* A branch page must have more than one key, except when mc_dbi is 0. */
6424 edb_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1);
6425 DPRINTF(("found index 0 to page %"Yu, NODEPGNO(NODEPTR(mp, 0))));
6426
6427 if (flags & (EDB_PS_FIRST|EDB_PS_LAST)) {
6428 i = 0;
6429 if (flags & EDB_PS_LAST) {
6430 i = NUMKEYS(mp) - 1;
6431
/* An initialized cursor already positioned on the last child of this page
 * reuses the page stored in the next stack slot instead of fetching it.
 * NOTE(review): relies on that slot still holding the child page from an
 * earlier descent - confirm with callers. */
6432 if (mc->mc_flags & C_INITIALIZED) {
6433 if (mc->mc_ki[mc->mc_top] == i) {
6434 mc->mc_top = mc->mc_snum++;
6435 mp = mc->mc_pg[mc->mc_top];
6436 goto ready;
6437 }
6438 }
6439 }
6440 } else {
6441 int exact;
6442 node = edb_node_search(mc, key, &exact);
/* NULL means the search ended past the last key: follow the last child.
 * On an inexact match, follow the child just before the returned slot. */
6443 if (node == NULL)
6444 i = NUMKEYS(mp) - 1;
6445 else {
6446 i = mc->mc_ki[mc->mc_top];
6447 if (!exact) {
6448 edb_cassert(mc, i > 0);
6449 i--;
6450 }
6451 }
6452 DPRINTF(("following index %u for key [%s]", i, DKEY(key)));
6453 }
6454
6455 edb_cassert(mc, i < NUMKEYS(mp));
6456 node = NODEPTR(mp, i);
6457
6458 if ((rc = edb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
6459 return rc;
6460
/* Record the chosen child index on the current level before pushing the
 * child page as the new top. */
6461 mc->mc_ki[mc->mc_top] = i;
6462 if ((rc = edb_cursor_push(mc, mp)))
6463 return rc;
6464
6465 ready:
/* After edb_page_touch() the top page is re-read from the cursor stack. */
6466 if (flags & EDB_PS_MODIFY) {
6467 if ((rc = edb_page_touch(mc)) != 0)
6468 return rc;
6469 mp = mc->mc_pg[mc->mc_top];
6470 }
6471 }
6472
/* The loop ends on the first non-branch page; it has to be a leaf. */
6473 if (!IS_LEAF(mp)) {
6474 DPRINTF(("internal error, index points to a %02X page!?",
6475 mp->mp_flags));
6476 mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
6477 return EDB_CORRUPTED;
6478 }
6479
6480 DPRINTF(("found leaf page %"Yu" for key [%s]", mp->mp_pgno,
6481 key ? DKEY(key) : "null"));
6482 mc->mc_flags |= C_INITIALIZED;
6483 mc->mc_flags &= ~C_EOF;
6484
6485 return EDB_SUCCESS;
6486 }
6487
6488
6489
6490
6491
6492
6493
6494 static int
6495 edb_page_search_lowest(EDB_cursor *mc)
6496 {
6497 EDB_page *mp = mc->mc_pg[mc->mc_top];
6498 EDB_node *node = NODEPTR(mp, 0);
6499 int rc;
6500
6501 if ((rc = edb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)
6502 return rc;
6503
6504 mc->mc_ki[mc->mc_top] = 0;
6505 if ((rc = edb_cursor_push(mc, mp)))
6506 return rc;
6507 return edb_page_search_root(mc, NULL, EDB_PS_FIRST);
6508 }
6509
6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521 static int
6522 edb_page_search(EDB_cursor *mc, EDB_val *key, int flags)
6523 {
6524 int rc;
6525 pgno_t root;
6526
6527
6528
6529
6530 if (mc->mc_txn->mt_flags & EDB_TXN_BLOCKED) {
6531 DPUTS("transaction may not be used now");
6532 return EDB_BAD_TXN;
6533 } else {
6534
6535 if (*mc->mc_dbflag & DB_STALE) {
6536 EDB_cursor mc2;
6537 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
6538 return EDB_BAD_DBI;
6539 edb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, NULL);
6540 rc = edb_page_search(&mc2, &mc->mc_dbx->md_name, 0);
6541 if (rc)
6542 return rc;
6543 {
6544 EDB_val data;
6545 int exact = 0;
6546 uint16_t flags;
6547 EDB_node *leaf = edb_node_search(&mc2,
6548 &mc->mc_dbx->md_name, &exact);
6549 if (!exact)
6550 return EDB_NOTFOUND;
6551 if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
6552 return EDB_INCOMPATIBLE;
6553 rc = edb_node_read(&mc2, leaf, &data);
6554 if (rc)
6555 return rc;
6556 memcpy(&flags, ((char *) data.mv_data + offsetof(EDB_db, md_flags)),
6557 sizeof(uint16_t));
6558
6559
6560
6561 if ((mc->mc_db->md_flags & PERSISTENT_FLAGS) != flags)
6562 return EDB_INCOMPATIBLE;
6563 memcpy(mc->mc_db, data.mv_data, sizeof(EDB_db));
6564 }
6565 *mc->mc_dbflag &= ~DB_STALE;
6566 }
6567 root = mc->mc_db->md_root;
6568
6569 if (root == P_INVALID) {
6570 DPUTS("tree is empty");
6571 return EDB_NOTFOUND;
6572 }
6573 }
6574
6575 edb_cassert(mc, root > 1);
6576 if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) {
6577 #ifdef EDB_VL32
6578 if (mc->mc_pg[0])
6579 EDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[0]);
6580 #endif
6581 if ((rc = edb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)
6582 return rc;
6583 }
6584
6585 #ifdef EDB_VL32
6586 {
6587 int i;
6588 for (i=1; i<mc->mc_snum; i++)
6589 EDB_PAGE_UNREF(mc->mc_txn, mc->mc_pg[i]);
6590 }
6591 #endif
6592 mc->mc_snum = 1;
6593 mc->mc_top = 0;
6594
6595 DPRINTF(("db %d root page %"Yu" has flags 0x%X",
6596 DDBI(mc), root, mc->mc_pg[0]->mp_flags));
6597
6598 if (flags & EDB_PS_MODIFY) {
6599 if ((rc = edb_page_touch(mc)))
6600 return rc;
6601 }
6602
6603 if (flags & EDB_PS_ROOTONLY)
6604 return EDB_SUCCESS;
6605
6606 return edb_page_search_root(mc, key, flags);
6607 }
6608
/** Free an overflow page run (mp->mp_pages consecutive pages starting at
 * mp->mp_pgno).
 *
 * When the environment has a me_pghead list, the transaction has no
 * parent, and the page is either dirty or listed in the transaction's
 * spill list, the page numbers are merged straight into env->me_pghead.
 * In every other case the range is appended to txn->mt_free_pgs.
 *
 * @param[in,out] mc cursor identifying the transaction and database.
 * @param[in]     mp first page of the overflow run.
 * @return 0 on success; an error from edb_eidl_need() or
 *	edb_eidl_append_range(); or #EDB_PROBLEM (transaction flagged with
 *	#EDB_TXN_ERROR) when a dirty page is missing from the dirty list.
 */
6609 static int
6610 edb_ovpage_free(EDB_cursor *mc, EDB_page *mp)
6611 {
6612 EDB_txn *txn = mc->mc_txn;
6613 pgno_t pg = mp->mp_pgno;
6614 unsigned x = 0, ovpages = mp->mp_pages;
6615 EDB_env *env = txn->mt_env;
6616 EDB_IDL sl = txn->mt_spill_pgs;
/* Spill-list entries are page numbers shifted left by one bit. */
6617 EDB_ID pn = pg << 1;
6618 int rc;
6619
6620 DPRINTF(("free ov page %"Yu" (%d)", pg, ovpages));
6621
6622
6623
6624
6625
6626
6627
6628
/* The spill-list search leaves the position of the entry in x when the
 * page is found there. */
6629 if (env->me_pghead &&
6630 !txn->mt_parent &&
6631 ((mp->mp_flags & P_DIRTY) ||
6632 (sl && (x = edb_eidl_search(sl, pn)) <= sl[0] && sl[x] == pn)))
6633 {
6634 unsigned i, j;
6635 pgno_t *mop;
6636 EDB_ID2 *dl, ix, iy;
/* Make room for ovpages more entries before anything is modified. */
6637 rc = edb_eidl_need(&env->me_pghead, ovpages);
6638 if (rc)
6639 return rc;
6640 if (!(mp->mp_flags & P_DIRTY)) {
6641
/* Spilled page: drop its entry when it is the last one in the list,
 * otherwise mark it by setting the low bit. */
6642 if (x == sl[0])
6643 sl[0]--;
6644 else
6645 sl[x] |= 1;
6646 goto release;
6647 }
6648
/* Dirty page: remove it from the dirty list. The count is decremented up
 * front, then entries are shifted down one slot from the end until the
 * entry for mp has been carried out of the list. */
6649 dl = txn->mt_u.dirty_list;
6650 x = dl[0].mid--;
6651 for (ix = dl[x]; ix.mptr != mp; ix = iy) {
6652 if (x > 1) {
6653 x--;
6654 iy = dl[x];
6655 dl[x] = ix;
6656 } else {
/* Reached the front without finding mp. The assertion cannot hold on this
 * branch; if execution continues past it, the count is restored, the
 * carried entry is stored at the new last slot and the transaction is
 * marked failed. */
6657 edb_cassert(mc, x > 1);
6658 j = ++(dl[0].mid);
6659 dl[j] = ix;
6660 txn->mt_flags |= EDB_TXN_ERROR;
6661 return EDB_PROBLEM;
6662 }
6663 }
6664 txn->mt_dirty_room++;
6665 if (!(env->me_flags & EDB_WRITEMAP))
6666 edb_dpage_free(env, mp);
6667 release:
6668
/* Merge the run into me_pghead: trailing entries smaller than pg move up
 * by ovpages slots, then the gap is filled with pg, pg+1, ... written
 * from the highest free slot downward. */
6669 mop = env->me_pghead;
6670 j = mop[0] + ovpages;
6671 for (i = mop[0]; i && mop[i] < pg; i--)
6672 mop[j--] = mop[i];
6673 while (j>i)
6674 mop[j--] = pg++;
6675 mop[0] += ovpages;
6676 } else {
6677 rc = edb_eidl_append_range(&txn->mt_free_pgs, pg, ovpages);
6678 if (rc)
6679 return rc;
6680 }
6681 #ifdef EDB_VL32
/* Forget the cursor's cached overflow page if it is the one just freed. */
6682 if (mc->mc_ovpg == mp)
6683 mc->mc_ovpg = NULL;
6684 #endif
6685 mc->mc_db->md_overflow_pages -= ovpages;
6686 return 0;
6687 }
6688
6689
6690
6691
6692
6693
6694
6695 static int
6696 edb_node_read(EDB_cursor *mc, EDB_node *leaf, EDB_val *data)
6697 {
6698 EDB_page *omp;
6699 pgno_t pgno;
6700 int rc;
6701
6702 if (MC_OVPG(mc)) {
6703 EDB_PAGE_UNREF(mc->mc_txn, MC_OVPG(mc));
6704 MC_SET_OVPG(mc, NULL);
6705 }
6706 if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
6707 data->mv_size = NODEDSZ(leaf);
6708 data->mv_data = NODEDATA(leaf);
6709 return EDB_SUCCESS;
6710 }
6711
6712
6713
6714 data->mv_size = NODEDSZ(leaf);
6715 memcpy(&pgno, NODEDATA(leaf), sizeof(pgno));
6716 if ((rc = edb_page_get(mc, pgno, &omp, NULL)) != 0) {
6717 DPRINTF(("read overflow page %"Yu" failed", pgno));
6718 return rc;
6719 }
6720 data->mv_data = METADATA(omp);
6721 MC_SET_OVPG(mc, omp);
6722
6723 return EDB_SUCCESS;
6724 }
6725
6726 int
6727 edb_get(EDB_txn *txn, EDB_dbi dbi,
6728 EDB_val *key, EDB_val *data)
6729 {
6730 EDB_cursor mc;
6731 EDB_xcursor mx;
6732 int exact = 0, rc;
6733 DKBUF;
6734
6735 DPRINTF(("===> get db %u key [%s]", dbi, DKEY(key)));
6736
6737 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
6738 return EINVAL;
6739
6740 if (txn->mt_flags & EDB_TXN_BLOCKED)
6741 return EDB_BAD_TXN;
6742
6743 edb_cursor_init(&mc, txn, dbi, &mx);
6744 rc = edb_cursor_set(&mc, key, data, EDB_SET, &exact);
6745
6746
6747
6748 EDB_CURSOR_UNREF(&mc, 1);
6749 return rc;
6750 }
6751
6752
6753
6754
6755
6756
6757
6758
6759
6760 static int
6761 edb_cursor_sibling(EDB_cursor *mc, int move_right)
6762 {
6763 int rc;
6764 EDB_node *indx;
6765 EDB_page *mp;
6766 #ifdef EDB_VL32
6767 EDB_page *op;
6768 #endif
6769
6770 if (mc->mc_snum < 2) {
6771 return EDB_NOTFOUND;
6772 }
6773
6774 #ifdef EDB_VL32
6775 op = mc->mc_pg[mc->mc_top];
6776 #endif
6777 edb_cursor_pop(mc);
6778 DPRINTF(("parent page is page %"Yu", index %u",
6779 mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]));
6780
6781 if (move_right ? (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mc->mc_pg[mc->mc_top]))
6782 : (mc->mc_ki[mc->mc_top] == 0)) {
6783 DPRINTF(("no more keys left, moving to %s sibling",
6784 move_right ? "right" : "left"));
6785 if ((rc = edb_cursor_sibling(mc, move_right)) != EDB_SUCCESS) {
6786
6787 mc->mc_top++;
6788 mc->mc_snum++;
6789 return rc;
6790 }
6791 } else {
6792 if (move_right)
6793 mc->mc_ki[mc->mc_top]++;
6794 else
6795 mc->mc_ki[mc->mc_top]--;
6796 DPRINTF(("just moving to %s index key %u",
6797 move_right ? "right" : "left", mc->mc_ki[mc->mc_top]));
6798 }
6799 edb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top]));
6800
6801 EDB_PAGE_UNREF(mc->mc_txn, op);
6802
6803 indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
6804 if ((rc = edb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0) {
6805
6806 mc->mc_flags &= ~(C_INITIALIZED|C_EOF);
6807 return rc;
6808 }
6809
6810 edb_cursor_push(mc, mp);
6811 if (!move_right)
6812 mc->mc_ki[mc->mc_top] = NUMKEYS(mp)-1;
6813
6814 return EDB_SUCCESS;
6815 }
6816
6817
6818 static int
6819 edb_cursor_next(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op)
6820 {
6821 EDB_page *mp;
6822 EDB_node *leaf;
6823 int rc;
6824
6825 if ((mc->mc_flags & C_DEL && op == EDB_NEXT_DUP))
6826 return EDB_NOTFOUND;
6827
6828 if (!(mc->mc_flags & C_INITIALIZED))
6829 return edb_cursor_first(mc, key, data);
6830
6831 mp = mc->mc_pg[mc->mc_top];
6832
6833 if (mc->mc_flags & C_EOF) {
6834 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mp)-1)
6835 return EDB_NOTFOUND;
6836 mc->mc_flags ^= C_EOF;
6837 }
6838
6839 if (mc->mc_db->md_flags & EDB_DUPSORT) {
6840 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6841 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6842 if (op == EDB_NEXT || op == EDB_NEXT_DUP) {
6843 rc = edb_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, EDB_NEXT);
6844 if (op != EDB_NEXT || rc != EDB_NOTFOUND) {
6845 if (rc == EDB_SUCCESS)
6846 EDB_GET_KEY(leaf, key);
6847 return rc;
6848 }
6849 }
6850 else {
6851 EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
6852 }
6853 } else {
6854 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
6855 if (op == EDB_NEXT_DUP)
6856 return EDB_NOTFOUND;
6857 }
6858 }
6859
6860 DPRINTF(("cursor_next: top page is %"Yu" in cursor %p",
6861 edb_dbg_pgno(mp), (void *) mc));
6862 if (mc->mc_flags & C_DEL) {
6863 mc->mc_flags ^= C_DEL;
6864 goto skip;
6865 }
6866
6867 if (mc->mc_ki[mc->mc_top] + 1u >= NUMKEYS(mp)) {
6868 DPUTS("=====> move to next sibling page");
6869 if ((rc = edb_cursor_sibling(mc, 1)) != EDB_SUCCESS) {
6870 mc->mc_flags |= C_EOF;
6871 return rc;
6872 }
6873 mp = mc->mc_pg[mc->mc_top];
6874 DPRINTF(("next page is %"Yu", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
6875 } else
6876 mc->mc_ki[mc->mc_top]++;
6877
6878 skip:
6879 DPRINTF(("==> cursor points to page %"Yu" with %u keys, key index %u",
6880 edb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
6881
6882 if (IS_LEAF2(mp)) {
6883 key->mv_size = mc->mc_db->md_pad;
6884 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
6885 return EDB_SUCCESS;
6886 }
6887
6888 edb_cassert(mc, IS_LEAF(mp));
6889 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6890
6891 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6892 edb_xcursor_init1(mc, leaf);
6893 }
6894 if (data) {
6895 if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
6896 return rc;
6897
6898 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6899 rc = edb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
6900 if (rc != EDB_SUCCESS)
6901 return rc;
6902 }
6903 }
6904
6905 EDB_GET_KEY(leaf, key);
6906 return EDB_SUCCESS;
6907 }
6908
6909
6910 static int
6911 edb_cursor_prev(EDB_cursor *mc, EDB_val *key, EDB_val *data, EDB_cursor_op op)
6912 {
6913 EDB_page *mp;
6914 EDB_node *leaf;
6915 int rc;
6916
6917 if (!(mc->mc_flags & C_INITIALIZED)) {
6918 rc = edb_cursor_last(mc, key, data);
6919 if (rc)
6920 return rc;
6921 mc->mc_ki[mc->mc_top]++;
6922 }
6923
6924 mp = mc->mc_pg[mc->mc_top];
6925
6926 if (mc->mc_db->md_flags & EDB_DUPSORT) {
6927 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6928 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6929 if (op == EDB_PREV || op == EDB_PREV_DUP) {
6930 rc = edb_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, EDB_PREV);
6931 if (op != EDB_PREV || rc != EDB_NOTFOUND) {
6932 if (rc == EDB_SUCCESS) {
6933 EDB_GET_KEY(leaf, key);
6934 mc->mc_flags &= ~C_EOF;
6935 }
6936 return rc;
6937 }
6938 }
6939 else {
6940 EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
6941 }
6942 } else {
6943 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
6944 if (op == EDB_PREV_DUP)
6945 return EDB_NOTFOUND;
6946 }
6947 }
6948
6949 DPRINTF(("cursor_prev: top page is %"Yu" in cursor %p",
6950 edb_dbg_pgno(mp), (void *) mc));
6951
6952 mc->mc_flags &= ~(C_EOF|C_DEL);
6953
6954 if (mc->mc_ki[mc->mc_top] == 0) {
6955 DPUTS("=====> move to prev sibling page");
6956 if ((rc = edb_cursor_sibling(mc, 0)) != EDB_SUCCESS) {
6957 return rc;
6958 }
6959 mp = mc->mc_pg[mc->mc_top];
6960 mc->mc_ki[mc->mc_top] = NUMKEYS(mp) - 1;
6961 DPRINTF(("prev page is %"Yu", key index %u", mp->mp_pgno, mc->mc_ki[mc->mc_top]));
6962 } else
6963 mc->mc_ki[mc->mc_top]--;
6964
6965 DPRINTF(("==> cursor points to page %"Yu" with %u keys, key index %u",
6966 edb_dbg_pgno(mp), NUMKEYS(mp), mc->mc_ki[mc->mc_top]));
6967
6968 if (IS_LEAF2(mp)) {
6969 key->mv_size = mc->mc_db->md_pad;
6970 key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
6971 return EDB_SUCCESS;
6972 }
6973
6974 edb_cassert(mc, IS_LEAF(mp));
6975 leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
6976
6977 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6978 edb_xcursor_init1(mc, leaf);
6979 }
6980 if (data) {
6981 if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
6982 return rc;
6983
6984 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
6985 rc = edb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
6986 if (rc != EDB_SUCCESS)
6987 return rc;
6988 }
6989 }
6990
6991 EDB_GET_KEY(leaf, key);
6992 return EDB_SUCCESS;
6993 }
6994
6995
/*
 * Position the cursor on a key (and, for duplicate-aware ops, on a data item).
 *
 * mc      cursor to position.
 * key     key to look for; for EDB_SET_RANGE and EDB_SET_KEY it is
 *         overwritten with the key actually found.
 * data    receives (or, for EDB_GET_BOTH / EDB_GET_BOTH_RANGE, supplies and
 *         then receives) the data item; ignored when NULL.
 * op      EDB_SET, EDB_SET_KEY, EDB_SET_RANGE, EDB_GET_BOTH or
 *         EDB_GET_BOTH_RANGE.
 * exactp  when non-NULL, only an exact key match is accepted and *exactp is
 *         set to 1 on the fast-path matches below (edb_node_search() is
 *         handed the pointer for the general case).
 *
 * Returns EDB_BAD_VALSIZE for an empty key, EDB_NOTFOUND when nothing
 * suitable exists, otherwise the status of the helpers that were called.
 */
static int
edb_cursor_set(EDB_cursor *mc, EDB_val *key, EDB_val *data,
    EDB_cursor_op op, int *exactp)
{
	int		 rc;
	EDB_page	*mp;
	EDB_node	*leaf = NULL;
	DKBUF;

	if (key->mv_size == 0)
		return EDB_BAD_VALSIZE;

	/* Any previous duplicate position is void once the key changes. */
	if (mc->mc_xcursor) {
		EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
		mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
	}

	/* See if we're already on the right page */
	if (mc->mc_flags & C_INITIALIZED) {
		EDB_val nodekey;

		mp = mc->mc_pg[mc->mc_top];
		if (!NUMKEYS(mp)) {
			mc->mc_ki[mc->mc_top] = 0;
			return EDB_NOTFOUND;
		}
		/* Compare against the first key on the current page. */
		if (mp->mp_flags & P_LEAF2) {
			nodekey.mv_size = mc->mc_db->md_pad;
			nodekey.mv_data = LEAF2KEY(mp, 0, nodekey.mv_size);
		} else {
			leaf = NODEPTR(mp, 0);
			EDB_GET_KEY2(leaf, nodekey);
		}
		rc = mc->mc_dbx->md_cmp(key, &nodekey);
		if (rc == 0) {
			/* The first node on the page is the one we wanted. */
			mc->mc_ki[mc->mc_top] = 0;
			if (exactp)
				*exactp = 1;
			goto set1;
		}
		if (rc > 0) {
			/* key sorts after the first node: check the last node too. */
			unsigned int i;
			unsigned int nkeys = NUMKEYS(mp);
			if (nkeys > 1) {
				if (mp->mp_flags & P_LEAF2) {
					nodekey.mv_data = LEAF2KEY(mp,
						 nkeys-1, nodekey.mv_size);
				} else {
					leaf = NODEPTR(mp, nkeys-1);
					EDB_GET_KEY2(leaf, nodekey);
				}
				rc = mc->mc_dbx->md_cmp(key, &nodekey);
				if (rc == 0) {
					/* last node was the one we wanted */
					mc->mc_ki[mc->mc_top] = nkeys-1;
					if (exactp)
						*exactp = 1;
					goto set1;
				}
				if (rc < 0) {
					/* key lies strictly between first and last node. */
					if (mc->mc_ki[mc->mc_top] < NUMKEYS(mp)) {
						/* Try the node the cursor currently points at. */
						if (mp->mp_flags & P_LEAF2) {
							nodekey.mv_data = LEAF2KEY(mp,
								 mc->mc_ki[mc->mc_top], nodekey.mv_size);
						} else {
							leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
							EDB_GET_KEY2(leaf, nodekey);
						}
						rc = mc->mc_dbx->md_cmp(key, &nodekey);
						if (rc == 0) {
							/* current node was the one we wanted */
							if (exactp)
								*exactp = 1;
							goto set1;
						}
					}
					/* Right page for sure: skip edb_page_search(), search in-page. */
					rc = 0;
					mc->mc_flags &= ~C_EOF;
					goto set2;
				}
			}
			/*
			 * key sorts after everything on this page. If some ancestor
			 * has a right sibling, fall through to a full search;
			 * otherwise nothing further exists.
			 */
			for (i=0; i<mc->mc_top; i++)
				if (mc->mc_ki[i] <
					NUMKEYS(mc->mc_pg[i])-1)
					break;
			if (i == mc->mc_top) {
				/* There are no other pages */
				mc->mc_ki[mc->mc_top] = nkeys;
				return EDB_NOTFOUND;
			}
		}
		if (!mc->mc_top) {
			/* The current page is the root, so there are no other pages. */
			mc->mc_ki[mc->mc_top] = 0;
			if (op == EDB_SET_RANGE && !exactp) {
				rc = 0;
				goto set1;
			} else
				return EDB_NOTFOUND;
		}
	} else {
		mc->mc_pg[0] = 0;
	}

	/* General case: descend from the root to the leaf that may hold key. */
	rc = edb_page_search(mc, key, 0);
	if (rc != EDB_SUCCESS)
		return rc;

	mp = mc->mc_pg[mc->mc_top];
	edb_cassert(mc, IS_LEAF(mp));

set2:
	leaf = edb_node_search(mc, key, exactp);
	if (exactp != NULL && !*exactp) {
		/* An exact match was requested and this is not one. */
		return EDB_NOTFOUND;
	}

	if (leaf == NULL) {
		/* Nothing on this page qualifies; the first node of the right sibling does. */
		DPUTS("===> inexact leaf not found, goto sibling");
		if ((rc = edb_cursor_sibling(mc, 1)) != EDB_SUCCESS) {
			mc->mc_flags |= C_EOF;
			return rc;
		}
		mp = mc->mc_pg[mc->mc_top];
		edb_cassert(mc, IS_LEAF(mp));
		leaf = NODEPTR(mp, 0);
	}

set1:
	mc->mc_flags |= C_INITIALIZED;
	mc->mc_flags &= ~C_EOF;

	if (IS_LEAF2(mp)) {
		/* LEAF2 pages carry keys only; report the key when the op asks for it. */
		if (op == EDB_SET_RANGE || op == EDB_SET_KEY) {
			key->mv_size = mc->mc_db->md_pad;
			key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
		}
		return EDB_SUCCESS;
	}

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		edb_xcursor_init1(mc, leaf);
	}
	if (data) {
		if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
			if (op == EDB_SET || op == EDB_SET_KEY || op == EDB_SET_RANGE) {
				/* Key-only ops return the first duplicate; rc is returned at the end. */
				rc = edb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
			} else {
				/* GET_BOTH / GET_BOTH_RANGE: look the data item up in the duplicate set. */
				int ex2, *ex2p;
				if (op == EDB_GET_BOTH) {
					/* Exact data match required. */
					ex2p = &ex2;
					ex2 = 0;
				} else {
					ex2p = NULL;
				}
				rc = edb_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, EDB_SET_RANGE, ex2p);
				if (rc != EDB_SUCCESS)
					return rc;
			}
		} else if (op == EDB_GET_BOTH || op == EDB_GET_BOTH_RANGE) {
			/* Single-valued key: compare the caller's data with the stored item. */
			EDB_val olddata;
			EDB_cmp_func *dcmp;
			if ((rc = edb_node_read(mc, leaf, &olddata)) != EDB_SUCCESS)
				return rc;
			dcmp = mc->mc_dbx->md_dcmp;
			if (NEED_CMP_CLONG(dcmp, olddata.mv_size))
				dcmp = edb_cmp_clong;
			rc = dcmp(data, &olddata);
			if (rc) {
				/* GET_BOTH needs equality; GET_BOTH_RANGE accepts stored >= requested. */
				if (op == EDB_GET_BOTH || rc > 0)
					return EDB_NOTFOUND;
				rc = 0;
			}
			*data = olddata;

		} else {
			if (mc->mc_xcursor)
				mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
			if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
				return rc;
		}
	}

	/* The key may have been a range match; hand back the key actually found. */
	if (op == EDB_SET_RANGE || op == EDB_SET_KEY)
		EDB_GET_KEY(leaf, key);
	DPRINTF(("==> cursor placed on key [%s]", DKEY(key)));

	return rc;
}
7194
7195
7196 static int
7197 edb_cursor_first(EDB_cursor *mc, EDB_val *key, EDB_val *data)
7198 {
7199 int rc;
7200 EDB_node *leaf;
7201
7202 if (mc->mc_xcursor) {
7203 EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
7204 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
7205 }
7206
7207 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
7208 rc = edb_page_search(mc, NULL, EDB_PS_FIRST);
7209 if (rc != EDB_SUCCESS)
7210 return rc;
7211 }
7212 edb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
7213
7214 leaf = NODEPTR(mc->mc_pg[mc->mc_top], 0);
7215 mc->mc_flags |= C_INITIALIZED;
7216 mc->mc_flags &= ~C_EOF;
7217
7218 mc->mc_ki[mc->mc_top] = 0;
7219
7220 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
7221 key->mv_size = mc->mc_db->md_pad;
7222 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], 0, key->mv_size);
7223 return EDB_SUCCESS;
7224 }
7225
7226 if (data) {
7227 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7228 edb_xcursor_init1(mc, leaf);
7229 rc = edb_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL);
7230 if (rc)
7231 return rc;
7232 } else {
7233 if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
7234 return rc;
7235 }
7236 }
7237 EDB_GET_KEY(leaf, key);
7238 return EDB_SUCCESS;
7239 }
7240
7241
7242 static int
7243 edb_cursor_last(EDB_cursor *mc, EDB_val *key, EDB_val *data)
7244 {
7245 int rc;
7246 EDB_node *leaf;
7247
7248 if (mc->mc_xcursor) {
7249 EDB_CURSOR_UNREF(&mc->mc_xcursor->mx_cursor, 0);
7250 mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
7251 }
7252
7253 if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) {
7254 rc = edb_page_search(mc, NULL, EDB_PS_LAST);
7255 if (rc != EDB_SUCCESS)
7256 return rc;
7257 }
7258 edb_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]));
7259
7260 mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]) - 1;
7261 mc->mc_flags |= C_INITIALIZED|C_EOF;
7262 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
7263
7264 if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
7265 key->mv_size = mc->mc_db->md_pad;
7266 key->mv_data = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], key->mv_size);
7267 return EDB_SUCCESS;
7268 }
7269
7270 if (data) {
7271 if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
7272 edb_xcursor_init1(mc, leaf);
7273 rc = edb_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL);
7274 if (rc)
7275 return rc;
7276 } else {
7277 if ((rc = edb_node_read(mc, leaf, data)) != EDB_SUCCESS)
7278 return rc;
7279 }
7280 }
7281
7282 EDB_GET_KEY(leaf, key);
7283 return EDB_SUCCESS;
7284 }
7285
/*
 * Retrieve an item through a cursor; dispatches on the cursor operation.
 *
 * mc    cursor handle; NULL yields EINVAL.
 * key   key in/out parameter, meaning depends on op.
 * data  data in/out parameter, meaning depends on op.
 * op    the cursor operation to perform.
 *
 * Returns EDB_BAD_TXN when the transaction is blocked, EINVAL for missing
 * arguments, an uninitialized cursor where one is required or an unknown
 * op, EDB_INCOMPATIBLE when the op does not fit the database flags,
 * EDB_NOTFOUND when there is no matching item, otherwise the status of
 * the worker function. C_DEL is cleared on the cursor before returning
 * from the switch.
 */
int
edb_cursor_get(EDB_cursor *mc, EDB_val *key, EDB_val *data,
    EDB_cursor_op op)
{
	int		 rc;
	int		 exact = 0;
	/* Worker picked by EDB_FIRST_DUP / EDB_LAST_DUP for the shared 'mmove' code. */
	int		 (*mfunc)(EDB_cursor *mc, EDB_val *key, EDB_val *data);

	if (mc == NULL)
		return EINVAL;

	if (mc->mc_txn->mt_flags & EDB_TXN_BLOCKED)
		return EDB_BAD_TXN;

	switch (op) {
	case EDB_GET_CURRENT:
		if (!(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
		} else {
			EDB_page *mp = mc->mc_pg[mc->mc_top];
			int nkeys = NUMKEYS(mp);
			/* Empty page or index past the end: park at the end and report nothing. */
			if (!nkeys || mc->mc_ki[mc->mc_top] >= nkeys) {
				mc->mc_ki[mc->mc_top] = nkeys;
				rc = EDB_NOTFOUND;
				break;
			}
			rc = EDB_SUCCESS;
			if (IS_LEAF2(mp)) {
				key->mv_size = mc->mc_db->md_pad;
				key->mv_data = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], key->mv_size);
			} else {
				EDB_node *leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
				EDB_GET_KEY(leaf, key);
				if (data) {
					if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
						/* The current duplicate is tracked by the sub-cursor. */
						rc = edb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, EDB_GET_CURRENT);
					} else {
						rc = edb_node_read(mc, leaf, data);
					}
				}
			}
		}
		break;
	case EDB_GET_BOTH:
	case EDB_GET_BOTH_RANGE:
		if (data == NULL) {
			rc = EINVAL;
			break;
		}
		if (mc->mc_xcursor == NULL) {
			rc = EDB_INCOMPATIBLE;
			break;
		}
		/* FALLTHRU */
	case EDB_SET:
	case EDB_SET_KEY:
	case EDB_SET_RANGE:
		if (key == NULL) {
			rc = EINVAL;
		} else {
			/* Every op except EDB_SET_RANGE demands an exact key match. */
			rc = edb_cursor_set(mc, key, data, op,
				op == EDB_SET_RANGE ? NULL : &exact);
		}
		break;
	case EDB_GET_MULTIPLE:
		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		if (!(mc->mc_db->md_flags & EDB_DUPFIXED)) {
			rc = EDB_INCOMPATIBLE;
			break;
		}
		rc = EDB_SUCCESS;
		/* Nothing more to fetch unless the sub-cursor is positioned and not at its end. */
		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) ||
			(mc->mc_xcursor->mx_cursor.mc_flags & C_EOF))
			break;
		goto fetchm;
	case EDB_NEXT_MULTIPLE:
		if (data == NULL) {
			rc = EINVAL;
			break;
		}
		if (!(mc->mc_db->md_flags & EDB_DUPFIXED)) {
			rc = EDB_INCOMPATIBLE;
			break;
		}
		rc = edb_cursor_next(mc, key, data, EDB_NEXT_DUP);
		if (rc == EDB_SUCCESS) {
			if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
				EDB_cursor *mx;
fetchm:
				/*
				 * Shared by the *_MULTIPLE ops: return all items of the
				 * sub-cursor's current page as one contiguous block and
				 * leave the sub-cursor on that page's last item.
				 */
				mx = &mc->mc_xcursor->mx_cursor;
				data->mv_size = NUMKEYS(mx->mc_pg[mx->mc_top]) *
					mx->mc_db->md_pad;
				data->mv_data = METADATA(mx->mc_pg[mx->mc_top]);
				mx->mc_ki[mx->mc_top] = NUMKEYS(mx->mc_pg[mx->mc_top])-1;
			} else {
				rc = EDB_NOTFOUND;
			}
		}
		break;
	case EDB_PREV_MULTIPLE:
		if (data == NULL) {
			rc = EINVAL;
			break;
		}
		if (!(mc->mc_db->md_flags & EDB_DUPFIXED)) {
			rc = EDB_INCOMPATIBLE;
			break;
		}
		if (!(mc->mc_flags & C_INITIALIZED))
			rc = edb_cursor_last(mc, key, data);
		else
			rc = EDB_SUCCESS;
		if (rc == EDB_SUCCESS) {
			EDB_cursor *mx = &mc->mc_xcursor->mx_cursor;
			if (mx->mc_flags & C_INITIALIZED) {
				/* Step the sub-cursor to its previous page, then return that page. */
				rc = edb_cursor_sibling(mx, 0);
				if (rc == EDB_SUCCESS)
					goto fetchm;
			} else {
				rc = EDB_NOTFOUND;
			}
		}
		break;
	case EDB_NEXT:
	case EDB_NEXT_DUP:
	case EDB_NEXT_NODUP:
		rc = edb_cursor_next(mc, key, data, op);
		break;
	case EDB_PREV:
	case EDB_PREV_DUP:
	case EDB_PREV_NODUP:
		rc = edb_cursor_prev(mc, key, data, op);
		break;
	case EDB_FIRST:
		rc = edb_cursor_first(mc, key, data);
		break;
	case EDB_FIRST_DUP:
		mfunc = edb_cursor_first;
	mmove:
		/* Common tail of EDB_FIRST_DUP / EDB_LAST_DUP; mfunc was set by the case. */
		if (data == NULL || !(mc->mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		if (mc->mc_xcursor == NULL) {
			rc = EDB_INCOMPATIBLE;
			break;
		}
		if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top])) {
			mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
			rc = EDB_NOTFOUND;
			break;
		}
		{
			EDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
			/* A key with a single value: that value is both first and last. */
			if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
				EDB_GET_KEY(leaf, key);
				rc = edb_node_read(mc, leaf, data);
				break;
			}
		}
		if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) {
			rc = EINVAL;
			break;
		}
		rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL);
		break;
	case EDB_LAST:
		rc = edb_cursor_last(mc, key, data);
		break;
	case EDB_LAST_DUP:
		mfunc = edb_cursor_last;
		goto mmove;
	default:
		DPRINTF(("unhandled/unimplemented cursor operation %u", op));
		rc = EINVAL;
		break;
	}

	/* Clear C_DEL if it is set (the XOR only runs when the bit is on). */
	if (mc->mc_flags & C_DEL)
		mc->mc_flags ^= C_DEL;

	return rc;
}
7472
7473
7474
7475
7476
7477 static int
7478 edb_cursor_touch(EDB_cursor *mc)
7479 {
7480 int rc = EDB_SUCCESS;
7481
7482 if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) {
7483
7484 EDB_cursor mc2;
7485 EDB_xcursor mcx;
7486 if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))
7487 return EDB_BAD_DBI;
7488 edb_cursor_init(&mc2, mc->mc_txn, MAIN_DBI, &mcx);
7489 rc = edb_page_search(&mc2, &mc->mc_dbx->md_name, EDB_PS_MODIFY);
7490 if (rc)
7491 return rc;
7492 *mc->mc_dbflag |= DB_DIRTY;
7493 }
7494 mc->mc_top = 0;
7495 if (mc->mc_snum) {
7496 do {
7497 rc = edb_page_touch(mc);
7498 } while (!rc && ++(mc->mc_top) < mc->mc_snum);
7499 mc->mc_top = mc->mc_snum-1;
7500 }
7501 return rc;
7502 }
7503
7504
/* Flag for edb_cursor_put() / edb_cursor_del(): skip the edb_page_spill() call. */
#define EDB_NOSPILL	0x8000
7506
/*
 * Store a key/data pair through a cursor.
 *
 * mc     cursor handle; NULL yields EINVAL.
 * key    key to store; NULL yields EINVAL.
 * data   data to store. With EDB_MULTIPLE this is a two-element array:
 *        data[0] describes one item (size and start of the item array),
 *        data[1].mv_size carries the item count in and the number of items
 *        actually stored out. With EDB_RESERVE, data->mv_data is set to the
 *        space the caller should fill in.
 * flags  EDB_* put options, optionally combined with EDB_NOSPILL.
 *
 * Returns EDB_SUCCESS or an error: EINVAL, EACCES (read-only txn),
 * EDB_BAD_TXN, EDB_BAD_VALSIZE, EDB_INCOMPATIBLE, EDB_KEYEXIST, ENOMEM, or
 * a helper's status. Failures after the tree has been modified also set
 * EDB_TXN_ERROR on the transaction.
 */
int
edb_cursor_put(EDB_cursor *mc, EDB_val *key, EDB_val *data,
    unsigned int flags)
{
	EDB_env		*env;
	EDB_node	*leaf = NULL;
	EDB_page	*fp, *mp, *sub_root = NULL;
	uint16_t	fp_flags;
	EDB_val		xdata, *rdata, dkey, olddata;
	EDB_db dummy;
	int do_sub = 0, insert_key, insert_data;
	unsigned int mcount = 0, dcount = 0, nospill;
	size_t nsize;
	int rc, rc2;
	unsigned int nflags;
	DKBUF;

	if (mc == NULL || key == NULL)
		return EINVAL;

	env = mc->mc_txn->mt_env;

	/*
	 * Done first so that the caller-visible counter data[1].mv_size is
	 * already zero on any early failure below.
	 */
	if (flags & EDB_MULTIPLE) {
		dcount = data[1].mv_size;
		data[1].mv_size = 0;
		if (!F_ISSET(mc->mc_db->md_flags, EDB_DUPFIXED))
			return EDB_INCOMPATIBLE;
	}

	/* EDB_NOSPILL is internal; strip it so later exact comparisons on flags work. */
	nospill = flags & EDB_NOSPILL;
	flags &= ~EDB_NOSPILL;

	if (mc->mc_txn->mt_flags & (EDB_TXN_RDONLY|EDB_TXN_BLOCKED))
		return (mc->mc_txn->mt_flags & EDB_TXN_RDONLY) ? EACCES : EDB_BAD_TXN;

	/* NOTE(review): presumably mv_size is unsigned, so size 0 wraps and is rejected too - confirm. */
	if (key->mv_size-1 >= ENV_MAXKEY(env))
		return EDB_BAD_VALSIZE;

	/* In a DUPSORT database the data item is limited like a key. */
#if SIZE_MAX > MAXDATASIZE
	if (data->mv_size > ((mc->mc_db->md_flags & EDB_DUPSORT) ? ENV_MAXKEY(env) : MAXDATASIZE))
		return EDB_BAD_VALSIZE;
#else
	if ((mc->mc_db->md_flags & EDB_DUPSORT) && data->mv_size > ENV_MAXKEY(env))
		return EDB_BAD_VALSIZE;
#endif

	DPRINTF(("==> put db %d key [%s], size %"Z"u, data size %"Z"u",
		DDBI(mc), DKEY(key), key ? key->mv_size : 0, data->mv_size));

	/* dkey.mv_size != 0 later means "an old single item must be re-inserted as a dup". */
	dkey.mv_size = 0;

	/*
	 * Position the cursor. Afterwards rc is 0 when the key already exists,
	 * EDB_NOTFOUND when it must be inserted, EDB_NO_ROOT for an empty tree.
	 */
	if (flags & EDB_CURRENT) {
		if (!(mc->mc_flags & C_INITIALIZED))
			return EINVAL;
		rc = EDB_SUCCESS;
	} else if (mc->mc_db->md_root == P_INVALID) {
		/* new database, cursor has nothing to point to */
		mc->mc_snum = 0;
		mc->mc_top = 0;
		mc->mc_flags &= ~C_INITIALIZED;
		rc = EDB_NO_ROOT;
	} else {
		int exact = 0;
		EDB_val d2;
		if (flags & EDB_APPEND) {
			EDB_val k2;
			rc = edb_cursor_last(mc, &k2, &d2);
			if (rc == 0) {
				rc = mc->mc_dbx->md_cmp(key, &k2);
				if (rc > 0) {
					/* New key sorts after the last one: insert just past it. */
					rc = EDB_NOTFOUND;
					mc->mc_ki[mc->mc_top]++;
				} else {
					/* new key is <= last key */
					rc = EDB_KEYEXIST;
				}
			}
		} else {
			rc = edb_cursor_set(mc, key, &d2, EDB_SET, &exact);
		}
		if ((flags & EDB_NOOVERWRITE) && rc == 0) {
			/* Hand the existing data back to the caller. */
			DPRINTF(("duplicate key [%s]", DKEY(key)));
			*data = d2;
			return EDB_KEYEXIST;
		}
		if (rc && rc != EDB_NOTFOUND)
			return rc;
	}

	if (mc->mc_flags & C_DEL)
		mc->mc_flags ^= C_DEL;

	/* Cursor is positioned; let edb_page_spill() run unless told not to. */
	if (!nospill) {
		if (flags & EDB_MULTIPLE) {
			/* Size the request for all items at once. */
			rdata = &xdata;
			xdata.mv_size = data->mv_size * dcount;
		} else {
			rdata = data;
		}
		if ((rc2 = edb_page_spill(mc, key, rdata)))
			return rc2;
	}

	if (rc == EDB_NO_ROOT) {
		EDB_page *np;
		/* new database, write a root leaf page */
		DPUTS("allocating new root leaf page");
		if ((rc2 = edb_page_new(mc, P_LEAF, 1, &np))) {
			return rc2;
		}
		edb_cursor_push(mc, np);
		mc->mc_db->md_root = np->mp_pgno;
		mc->mc_db->md_depth++;
		*mc->mc_dbflag |= DB_DIRTY;
		/* DUPFIXED without DUPSORT: the root becomes a LEAF2 page. */
		if ((mc->mc_db->md_flags & (EDB_DUPSORT|EDB_DUPFIXED))
			== EDB_DUPFIXED)
			np->mp_flags |= P_LEAF2;
		mc->mc_flags |= C_INITIALIZED;
	} else {
		/* make sure all cursor pages are writable */
		rc2 = edb_cursor_touch(mc);
		if (rc2)
			return rc2;
	}

	/* Non-zero rc (NOTFOUND / NO_ROOT) means both key and data are new. */
	insert_key = insert_data = rc;
	if (insert_key) {
		/* The key does not exist */
		DPRINTF(("inserting key at index %i", mc->mc_ki[mc->mc_top]));
		if ((mc->mc_db->md_flags & EDB_DUPSORT) &&
			LEAFSIZE(key, data) > env->me_nodemax)
		{
			/*
			 * Too big for a node, so it goes into a sub-DB. Fake an
			 * empty old sub-page header in me_pbuf for prep_subDB to
			 * expand into a full page.
			 */
			fp_flags = P_LEAF|P_DIRTY;
			fp = env->me_pbuf;
			fp->mp_pad = data->mv_size; /* used if EDB_DUPFIXED */
			fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
			olddata.mv_size = PAGEHDRSZ;
			goto prep_subDB;
		}
	} else {
		/* The key exists: this is an overwrite or a new duplicate. */
		if (IS_LEAF2(mc->mc_pg[mc->mc_top])) {
			/* LEAF2 pages hold keys only: overwrite the key bytes in place. */
			char *ptr;
			unsigned int ksize = mc->mc_db->md_pad;
			if (key->mv_size != ksize)
				return EDB_BAD_VALSIZE;
			ptr = LEAF2KEY(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize);
			memcpy(ptr, key->mv_data, ksize);
fix_parent:
			/*
			 * If slot 0 of the leaf was overwritten and there is a
			 * parent page, the separator key in an ancestor may need
			 * updating.
			 */
			if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
				unsigned short dtop = 1;
				mc->mc_top--;
				/* Climb while we sit in slot 0; dtop counts levels to restore. */
				while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
					mc->mc_top--;
					dtop++;
				}
				if (mc->mc_ki[mc->mc_top])
					rc2 = edb_update_key(mc, key);
				else
					rc2 = EDB_SUCCESS;
				mc->mc_top += dtop;
				if (rc2)
					return rc2;
			}
			return EDB_SUCCESS;
		}

more:
		/* Re-entered via 'goto more' for each further EDB_MULTIPLE item. */
		leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
		olddata.mv_size = NODEDSZ(leaf);
		olddata.mv_data = NODEDATA(leaf);

		/* DB has dups? */
		if (F_ISSET(mc->mc_db->md_flags, EDB_DUPSORT)) {
			/*
			 * Prepare the (sub-)page / sub-DB to accept the new item.
			 * fp: old sub-page or a header faking one.
			 * mp: new (sub-)page.
			 * offset: growth in page size.
			 * xdata: node data holding the new page or DB record.
			 */
			unsigned	i, offset = 0;
			mp = fp = xdata.mv_data = env->me_pbuf;
			mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno;

			/* Was a single item before, must convert now */
			if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
				EDB_cmp_func *dcmp;
				/* Just overwrite the current item */
				if (flags == EDB_CURRENT)
					goto current;
				dcmp = mc->mc_dbx->md_dcmp;
				if (NEED_CMP_CLONG(dcmp, olddata.mv_size))
					dcmp = edb_cmp_clong;
				/* does data match? */
				if (!dcmp(data, &olddata)) {
					if (flags & (EDB_NODUPDATA|EDB_APPENDDUP))
						return EDB_KEYEXIST;
					/* overwrite it */
					goto current;
				}

				/* Back up the original data item; it becomes the first dup. */
				dkey.mv_size = olddata.mv_size;
				dkey.mv_data = memcpy(fp+1, olddata.mv_data, olddata.mv_size);

				/* Build a sub-page header for the dup items; the body is a placeholder. */
				fp->mp_flags = P_LEAF|P_DIRTY|P_SUBP;
				fp->mp_lower = (PAGEHDRSZ-PAGEBASE);
				xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
				if (mc->mc_db->md_flags & EDB_DUPFIXED) {
					fp->mp_flags |= P_LEAF2;
					fp->mp_pad = data->mv_size;
					xdata.mv_size += 2 * data->mv_size;	/* leave space for 2 more */
				} else {
					xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
						(dkey.mv_size & 1) + (data->mv_size & 1);
				}
				fp->mp_upper = xdata.mv_size - PAGEBASE;
				olddata.mv_size = xdata.mv_size; /* pretend olddata is fp */
			} else if (leaf->mn_flags & F_SUBDATA) {
				/* Data is on sub-DB, just store it */
				flags |= F_DUPDATA|F_SUBDATA;
				goto put_sub;
			} else {
				/* Data is on sub-page */
				fp = olddata.mv_data;
				switch (flags) {
				default:
					if (!(mc->mc_db->md_flags & EDB_DUPFIXED)) {
						/* Grow by one node plus its index slot. */
						offset = EVEN(NODESIZE + sizeof(indx_t) +
							data->mv_size);
						break;
					}
					offset = fp->mp_pad;
					if (SIZELEFT(fp) < offset) {
						offset *= 4; /* space for 4 more */
						break;
					}
					/* FALLTHRU: the DUPFIXED sub-page is already big enough */
				case EDB_CURRENT:
					fp->mp_flags |= P_DIRTY;
					COPY_PGNO(fp->mp_pgno, mp->mp_pgno);
					mc->mc_xcursor->mx_cursor.mc_pg[0] = fp;
					flags |= F_DUPDATA;
					goto put_sub;
				}
				xdata.mv_size = olddata.mv_size + offset;
			}

			fp_flags = fp->mp_flags;
			if (NODESIZE + NODEKSZ(leaf) + xdata.mv_size > env->me_nodemax) {
					/* Too big for a sub-page, convert to sub-DB */
					fp_flags &= ~P_SUBP;
prep_subDB:
					/* Also entered from the oversized-insert path above. */
					if (mc->mc_db->md_flags & EDB_DUPFIXED) {
						fp_flags |= P_LEAF2;
						dummy.md_pad = fp->mp_pad;
						dummy.md_flags = EDB_DUPFIXED;
						if (mc->mc_db->md_flags & EDB_INTEGERDUP)
							dummy.md_flags |= EDB_INTEGERKEY;
					} else {
						dummy.md_pad = 0;
						dummy.md_flags = 0;
					}
					/* The node's data becomes a DB record describing a one-leaf tree. */
					dummy.md_depth = 1;
					dummy.md_branch_pages = 0;
					dummy.md_leaf_pages = 1;
					dummy.md_overflow_pages = 0;
					dummy.md_entries = NUMKEYS(fp);
					xdata.mv_size = sizeof(EDB_db);
					xdata.mv_data = &dummy;
					if ((rc = edb_page_alloc(mc, 1, &mp)))
						return rc;
					offset = env->me_psize - olddata.mv_size;
					flags |= F_DUPDATA|F_SUBDATA;
					dummy.md_root = mp->mp_pgno;
					sub_root = mp;
			}
			if (mp != fp) {
				/* Copy the old sub-page into the new (sub-)page, shifted up by offset. */
				mp->mp_flags = fp_flags | P_DIRTY;
				mp->mp_pad   = fp->mp_pad;
				mp->mp_lower = fp->mp_lower;
				mp->mp_upper = fp->mp_upper + offset;
				if (fp_flags & P_LEAF2) {
					memcpy(METADATA(mp), METADATA(fp), NUMKEYS(fp) * fp->mp_pad);
				} else {
					memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
						olddata.mv_size - fp->mp_upper - PAGEBASE);
					memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), NUMKEYS(fp) * sizeof(mp->mp_ptrs[0]));
					for (i=0; i<NUMKEYS(fp); i++)
						mp->mp_ptrs[i] += offset;
				}
			}

			rdata = &xdata;
			flags |= F_DUPDATA;
			do_sub = 1;
			/* Replacing an existing node: drop the old one before re-adding. */
			if (!insert_key)
				edb_node_del(mc, 0);
			goto new_sub;
		}
current:
		/* The node's F_SUBDATA state must agree with the caller's flags. */
		if ((leaf->mn_flags ^ flags) & F_SUBDATA)
			return EDB_INCOMPATIBLE;
		/* overflow page overwrites need special handling */
		if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
			EDB_page *omp;
			pgno_t pg;
			int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize);

			memcpy(&pg, olddata.mv_data, sizeof(pg));
			if ((rc2 = edb_page_get(mc, pg, &omp, &level)) != 0)
				return rc2;
			ovpages = omp->mp_pages;

			/* Is the ov page large enough? */
			if (ovpages >= dpages) {
			  if (!(omp->mp_flags & P_DIRTY) &&
				  (level || (env->me_flags & EDB_WRITEMAP)))
			  {
				rc = edb_page_unspill(mc->mc_txn, omp, &omp);
				if (rc)
					return rc;
				level = 0;		/* dirty in this txn or clean */
			  }
			  /* Is it dirty? */
			  if (omp->mp_flags & P_DIRTY) {
				/*
				 * Yes, overwrite it in place. No attempt is made to
				 * shrink to a regular node when the new data is small.
				 */
				if (level > 1) {
					/*
					 * NOTE(review): level > 1 looks like "dirty in an
					 * ancestor txn" - confirm against edb_page_get().
					 * A private copy is made and added to this txn's
					 * dirty list.
					 */
					size_t sz = (size_t) env->me_psize * ovpages, off;
					EDB_page *np = edb_page_malloc(mc->mc_txn, ovpages);
					EDB_ID2 id2;
					if (!np)
						return ENOMEM;
					id2.mid = pg;
					id2.mptr = np;
					rc2 = edb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
					edb_cassert(mc, rc2 == 0);
					if (!(flags & EDB_RESERVE)) {
						/*
						 * Copy only the tail beyond where the new data
						 * will land (start rounded down to a size_t
						 * boundary), then just the header below.
						 */
						off = (PAGEHDRSZ + data->mv_size) & -sizeof(size_t);
						memcpy((size_t *)((char *)np + off),
							(size_t *)((char *)omp + off), sz - off);
						sz = PAGEHDRSZ;
					}
					memcpy(np, omp, sz); /* Copy beginning of page */
					omp = np;
				}
				SETDSZ(leaf, data->mv_size);
				if (F_ISSET(flags, EDB_RESERVE))
					data->mv_data = METADATA(omp);
				else
					memcpy(METADATA(omp), data->mv_data, data->mv_size);
				return EDB_SUCCESS;
			  }
			}
			/* Cannot reuse the overflow pages: free them; the node is re-added below. */
			if ((rc2 = edb_ovpage_free(mc, omp)) != EDB_SUCCESS)
				return rc2;
		} else if (data->mv_size == olddata.mv_size) {
			/*
			 * Same size: replace the bytes in place. Any other size goes
			 * through delete and re-add below.
			 */
			if (F_ISSET(flags, EDB_RESERVE))
				data->mv_data = olddata.mv_data;
			else if (!(mc->mc_flags & C_SUB))
				memcpy(olddata.mv_data, data->mv_data, data->mv_size);
			else {
				/* Sub-cursor: the item lives in the key slot, so rewrite the key. */
				memcpy(NODEKEY(leaf), key->mv_data, key->mv_size);
				goto fix_parent;
			}
			return EDB_SUCCESS;
		}
		edb_node_del(mc, 0);
	}

	rdata = data;

new_sub:
	/* Add the node (key + rdata), splitting the page when it does not fit. */
	nflags = flags & NODE_ADD_FLAGS;
	nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? key->mv_size : edb_leaf_size(env, key, rdata);
	if (SIZELEFT(mc->mc_pg[mc->mc_top]) < nsize) {
		if (( flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA )
			nflags &= ~EDB_APPEND; /* sub-page may need room to grow */
		if (!insert_key)
			nflags |= EDB_SPLIT_REPLACE;
		rc = edb_page_split(mc, key, rdata, P_INVALID, nflags);
	} else {
		/* There is room already in this leaf page. */
		rc = edb_node_add(mc, mc->mc_ki[mc->mc_top], key, rdata, 0, nflags);
		if (rc == 0) {
			/* Adjust other cursors pointing to mp */
			EDB_cursor *m2, *m3;
			EDB_dbi dbi = mc->mc_dbi;
			unsigned i = mc->mc_top;
			EDB_page *mp = mc->mc_pg[i];

			for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				/* For a sub-cursor, compare against the other cursors' sub-cursors. */
				if (mc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (m3 == mc || m3->mc_snum < mc->mc_snum || m3->mc_pg[i] != mp) continue;
				if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) {
					m3->mc_ki[i]++;
				}
				XCURSOR_REFRESH(m3, i, mp);
			}
		}
	}

	if (rc == EDB_SUCCESS) {
		/*
		 * Now store the actual data in the child DB. Both key and data of
		 * the parent item go into the key slot of the child; its data is
		 * empty (xdata is reset below).
		 */
		if (do_sub) {
			int xflags, new_dupdata;
			edb_size_t ecount;
put_sub:
			xdata.mv_size = 0;
			xdata.mv_data = "";
			leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
			if (flags == EDB_CURRENT) {
				xflags = EDB_CURRENT|EDB_NOSPILL;
			} else {
				edb_xcursor_init1(mc, leaf);
				xflags = (flags & EDB_NODUPDATA) ?
					EDB_NOOVERWRITE|EDB_NOSPILL : EDB_NOSPILL;
			}
			if (sub_root)
				mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root;
			new_dupdata = (int)dkey.mv_size;
			/* converted, write the original data first */
			if (dkey.mv_size) {
				rc = edb_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags);
				if (rc)
					goto bad_sub;
				/* we've done our job */
				dkey.mv_size = 0;
			}
			if (!(leaf->mn_flags & F_SUBDATA) || sub_root) {
				/* Adjust other cursors pointing to mp */
				EDB_cursor *m2;
				EDB_xcursor *mx = mc->mc_xcursor;
				unsigned i = mc->mc_top;
				EDB_page *mp = mc->mc_pg[i];

				for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
					if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
					if (!(m2->mc_flags & C_INITIALIZED)) continue;
					if (m2->mc_pg[i] == mp) {
						if (m2->mc_ki[i] == mc->mc_ki[i]) {
							edb_xcursor_init2(m2, mx, new_dupdata);
						} else if (!insert_key) {
							XCURSOR_REFRESH(m2, i, mp);
						}
					}
				}
			}
			ecount = mc->mc_xcursor->mx_db.md_entries;
			if (flags & EDB_APPENDDUP)
				xflags |= EDB_APPEND;
			rc = edb_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags);
			if (flags & F_SUBDATA) {
				/* Write the updated sub-DB record back into the node. */
				void *db = NODEDATA(leaf);
				memcpy(db, &mc->mc_xcursor->mx_db, sizeof(EDB_db));
			}
			/* Non-zero only if the sub-DB's entry count actually changed. */
			insert_data = mc->mc_xcursor->mx_db.md_entries - ecount;
		}
		/* Increment count unless we just replaced an existing item. */
		if (insert_data)
			mc->mc_db->md_entries++;
		if (insert_key) {
			/* A new key whose sub-put failed: take the error path below. */
			if (rc)
				goto bad_sub;
			/*
			 * The key did not exist before and everything succeeded:
			 * make sure the cursor is marked valid.
			 */
			mc->mc_flags |= C_INITIALIZED;
		}
		if (flags & EDB_MULTIPLE) {
			if (!rc) {
				mcount++;
				/* let caller know how many succeeded, if any */
				data[1].mv_size = mcount;
				if (mcount < dcount) {
					/* Advance to the next item in the caller's array and repeat. */
					data[0].mv_data = (char *)data[0].mv_data + data[0].mv_size;
					insert_key = insert_data = 0;
					goto more;
				}
			}
		}
		return rc;
bad_sub:
		/* KEYEXIST from the sub-put at this point is reported as EDB_PROBLEM. */
		if (rc == EDB_KEYEXIST)	/* should not happen, we deleted that item */
			rc = EDB_PROBLEM;
	}
	/* Failure after the tree was modified: mark the transaction as errored. */
	mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
	return rc;
}
8033
/*
 * Delete the item the cursor currently points at.
 *
 * mc     positioned cursor.
 * flags  EDB_NODUPDATA deletes all duplicates of the current key;
 *        EDB_NOSPILL skips the edb_page_spill() call; F_SUBDATA must match
 *        the node's F_SUBDATA state when the node is not a duplicate set.
 *
 * Returns EACCES for a read-only txn, EDB_BAD_TXN for a blocked one,
 * EINVAL for an uninitialized cursor, EDB_NOTFOUND when the cursor index
 * is past the last key, EDB_INCOMPATIBLE on an F_SUBDATA mismatch, or the
 * status of the helpers. Paths that jump to 'fail' also set EDB_TXN_ERROR
 * on the transaction.
 */
int
edb_cursor_del(EDB_cursor *mc, unsigned int flags)
{
	EDB_node	*leaf;
	EDB_page	*mp;
	int rc;

	if (mc->mc_txn->mt_flags & (EDB_TXN_RDONLY|EDB_TXN_BLOCKED))
		return (mc->mc_txn->mt_flags & EDB_TXN_RDONLY) ? EACCES : EDB_BAD_TXN;

	if (!(mc->mc_flags & C_INITIALIZED))
		return EINVAL;

	if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
		return EDB_NOTFOUND;

	if (!(flags & EDB_NOSPILL) && (rc = edb_page_spill(mc, NULL, NULL)))
		return rc;

	/* Make every page on the cursor stack writable before modifying anything. */
	rc = edb_cursor_touch(mc);
	if (rc)
		return rc;

	mp = mc->mc_pg[mc->mc_top];
	/* LEAF2 pages hold bare keys: no node flags to inspect. */
	if (IS_LEAF2(mp))
		goto del_key;
	leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);

	if (F_ISSET(leaf->mn_flags, F_DUPDATA)) {
		if (flags & EDB_NODUPDATA) {
			/*
			 * Delete all dups for this key: drop all but one from the
			 * entry count (NOTE(review): the remaining one is presumably
			 * accounted for in edb_cursor_del0() - confirm).
			 */
			mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1;
			mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
		} else {
			/* Sub-page: point the sub-cursor at the page stored in this node. */
			if (!F_ISSET(leaf->mn_flags, F_SUBDATA)) {
				mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
			}
			/* Delete only the current duplicate. */
			rc = edb_cursor_del(&mc->mc_xcursor->mx_cursor, EDB_NOSPILL);
			if (rc)
				return rc;
			/* If the duplicate set still has entries, we're done */
			if (mc->mc_xcursor->mx_db.md_entries) {
				if (leaf->mn_flags & F_SUBDATA) {
					/* update subDB info */
					void *db = NODEDATA(leaf);
					memcpy(db, &mc->mc_xcursor->mx_db, sizeof(EDB_db));
				} else {
					EDB_cursor *m2;
					/* shrink fake page */
					edb_node_shrink(mp, mc->mc_ki[mc->mc_top]);
					leaf = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
					mc->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf);
					/* fix other sub-DB cursors pointed at fake pages on this page */
					for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
						if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
						if (!(m2->mc_flags & C_INITIALIZED)) continue;
						if (m2->mc_pg[mc->mc_top] == mp) {
							XCURSOR_REFRESH(m2, mc->mc_top, mp);
						}
					}
				}
				mc->mc_db->md_entries--;
				return rc;
			} else {
				mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
			}
			/* otherwise fall thru and delete the now-empty duplicate set */
		}

		if (leaf->mn_flags & F_SUBDATA) {
			/* The node holds a sub-DB record: release the sub-DB via edb_drop0(). */
			rc = edb_drop0(&mc->mc_xcursor->mx_cursor, 0);
			if (rc)
				goto fail;
		}
	}
	/* Not a duplicate set: caller's F_SUBDATA flag must agree with the node's. */
	else if ((leaf->mn_flags ^ flags) & F_SUBDATA) {
		rc = EDB_INCOMPATIBLE;
		goto fail;
	}

	/* Data stored on overflow pages: the node holds the page number; free those pages. */
	if (F_ISSET(leaf->mn_flags, F_BIGDATA)) {
		EDB_page *omp;
		pgno_t pg;

		memcpy(&pg, NODEDATA(leaf), sizeof(pg));
		if ((rc = edb_page_get(mc, pg, &omp, NULL)) ||
			(rc = edb_ovpage_free(mc, omp)))
			goto fail;
	}

del_key:
	return edb_cursor_del0(mc);

fail:
	mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
	return rc;
}
8134
8135
8136
8137
8138
8139
8140
8141
8142
8143
8144 static int
8145 edb_page_new(EDB_cursor *mc, uint32_t flags, int num, EDB_page **mp)
8146 {
8147 EDB_page *np;
8148 int rc;
8149
8150 if ((rc = edb_page_alloc(mc, num, &np)))
8151 return rc;
8152 DPRINTF(("allocated new mpage %"Yu", page size %u",
8153 np->mp_pgno, mc->mc_txn->mt_env->me_psize));
8154 np->mp_flags = flags | P_DIRTY;
8155 np->mp_lower = (PAGEHDRSZ-PAGEBASE);
8156 np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
8157
8158 if (IS_BRANCH(np))
8159 mc->mc_db->md_branch_pages++;
8160 else if (IS_LEAF(np))
8161 mc->mc_db->md_leaf_pages++;
8162 else if (IS_OVERFLOW(np)) {
8163 mc->mc_db->md_overflow_pages += num;
8164 np->mp_pages = num;
8165 }
8166 *mp = np;
8167
8168 return 0;
8169 }
8170
8171
8172
8173
8174
8175
8176
8177
8178
8179
8180
8181
8182 static size_t
8183 edb_leaf_size(EDB_env *env, EDB_val *key, EDB_val *data)
8184 {
8185 size_t sz;
8186
8187 sz = LEAFSIZE(key, data);
8188 if (sz > env->me_nodemax) {
8189
8190 sz -= data->mv_size - sizeof(pgno_t);
8191 }
8192
8193 return EVEN(sz + sizeof(indx_t));
8194 }
8195
8196
8197
8198
8199
8200
8201
8202
8203
8204
8205
8206 static size_t
8207 edb_branch_size(EDB_env *env, EDB_val *key)
8208 {
8209 size_t sz;
8210
8211 sz = INDXSIZE(key);
8212 if (sz > env->me_nodemax) {
8213
8214
8215
8216 }
8217
8218 return sz + sizeof(indx_t);
8219 }
8220
8221
8222
8223
8224
8225
8226
8227
8228
8229
8230
8231
8232
8233
8234
8235
8236
/** Add a node to the page the cursor currently points at.
 * @param[in] mc The cursor; the target page is mc->mc_pg[mc->mc_top].
 * @param[in] indx Slot at which the node is inserted; existing slots at
 *	and above it are moved up by one.
 * @param[in] key Key for the node. On non-LEAF2 pages it may be NULL, in
 *	which case a zero-length key is recorded. LEAF2 pages dereference it
 *	unconditionally.
 * @param[in,out] data Data for a leaf node (asserted non-NULL on leaf
 *	pages). With #EDB_RESERVE in @b flags nothing is copied; instead
 *	data->mv_data is set to the space reserved for the value.
 * @param[in] pgno Page number stored in the node on non-leaf pages.
 * @param[in] flags Node flags, stored in mn_flags. With #F_BIGDATA only
 *	sizeof(pgno_t) bytes are copied from data->mv_data (presumably the
 *	number of an overflow page that already holds the value).
 * @return #EDB_SUCCESS; #EDB_PAGE_FULL when the page lacks room (the txn
 *	is then also flagged #EDB_TXN_ERROR); or the error returned by
 *	edb_page_new() when an overflow page cannot be allocated.
 */
static int
edb_node_add(EDB_cursor *mc, indx_t indx,
	EDB_val *key, EDB_val *data, pgno_t pgno, unsigned int flags)
{
	unsigned int i;
	size_t node_size = NODESIZE;
	ssize_t room;
	indx_t ofs;
	EDB_node *node;
	EDB_page *mp = mc->mc_pg[mc->mc_top];
	EDB_page *ofp = NULL;	/* overflow page allocated by this call, if any */
	void *ndata;
	DKBUF;

	edb_cassert(mc, mp->mp_upper >= mp->mp_lower);

	DPRINTF(("add to %s %spage %"Yu" index %i, data size %"Z"u key size %"Z"u [%s]",
		IS_LEAF(mp) ? "leaf" : "branch",
		IS_SUBP(mp) ? "sub-" : "",
		edb_dbg_pgno(mp), indx, data ? data->mv_size : 0,
		key ? key->mv_size : 0, key ? DKEY(key) : "null"));

	if (IS_LEAF2(mp)) {
		/* LEAF2 page: fixed-size keys stored back to back.
		 * Move higher keys up one slot.
		 */
		int ksize = mc->mc_db->md_pad, dif;
		char *ptr = LEAF2KEY(mp, indx, ksize);
		dif = NUMKEYS(mp) - indx;
		if (dif > 0)
			memmove(ptr+ksize, ptr, dif*ksize);
		/* insert the new key */
		memcpy(ptr, key->mv_data, ksize);

		/* No pointer slot is written here; lower/upper are only
		 * adjusted so the page's space accounting stays consistent.
		 */
		mp->mp_lower += sizeof(indx_t);
		mp->mp_upper -= ksize - sizeof(indx_t);
		return EDB_SUCCESS;
	}

	/* Free space left after reserving the new pointer slot. */
	room = (ssize_t)SIZELEFT(mp) - (ssize_t)sizeof(indx_t);
	if (key != NULL)
		node_size += key->mv_size;
	if (IS_LEAF(mp)) {
		edb_cassert(mc, key && data);
		if (F_ISSET(flags, F_BIGDATA)) {
			/* Node only carries a page number for the data. */
			node_size += sizeof(pgno_t);
		} else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) {
			int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
			int rc;
			/* Node would exceed me_nodemax: put data on overflow page(s). */
			DPRINTF(("data size is %"Z"u, node would be %"Z"u, put data on overflow page",
				data->mv_size, node_size+data->mv_size));
			node_size = EVEN(node_size + sizeof(pgno_t));
			/* Check room before allocating so a full page costs nothing. */
			if ((ssize_t)node_size > room)
				goto full;
			if ((rc = edb_page_new(mc, P_OVERFLOW, ovpages, &ofp)))
				return rc;
			DPRINTF(("allocated overflow page %"Yu, ofp->mp_pgno));
			flags |= F_BIGDATA;
			goto update;
		} else {
			node_size += data->mv_size;
		}
	}
	node_size = EVEN(node_size);
	if ((ssize_t)node_size > room)
		goto full;

update:
	/* Move higher pointers up one slot. */
	for (i = NUMKEYS(mp); i > indx; i--)
		mp->mp_ptrs[i] = mp->mp_ptrs[i - 1];

	/* Carve the node out of the top of the free area and record its offset. */
	ofs = mp->mp_upper - node_size;
	edb_cassert(mc, ofs >= mp->mp_lower + sizeof(indx_t));
	mp->mp_ptrs[indx] = ofs;
	mp->mp_upper = ofs;
	mp->mp_lower += sizeof(indx_t);

	/* Write the node header. */
	node = NODEPTR(mp, indx);
	node->mn_ksize = (key == NULL) ? 0 : key->mv_size;
	node->mn_flags = flags;
	if (IS_LEAF(mp))
		SETDSZ(node,data->mv_size);
	else
		SETPGNO(node,pgno);

	if (key)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	if (IS_LEAF(mp)) {
		ndata = NODEDATA(node);
		if (ofp == NULL) {
			/* No overflow page was allocated by this call. */
			if (F_ISSET(flags, F_BIGDATA))
				memcpy(ndata, data->mv_data, sizeof(pgno_t));
			else if (F_ISSET(flags, EDB_RESERVE))
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		} else {
			/* Node stores the overflow page number; the value itself
			 * lives in (or is reserved in) the overflow page.
			 */
			memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t));
			ndata = METADATA(ofp);
			if (F_ISSET(flags, EDB_RESERVE))
				data->mv_data = ndata;
			else
				memcpy(ndata, data->mv_data, data->mv_size);
		}
	}

	return EDB_SUCCESS;

full:
	DPRINTF(("not enough room in page %"Yu", got %u ptrs",
		edb_dbg_pgno(mp), NUMKEYS(mp)));
	DPRINTF(("upper-lower = %u - %u = %"Z"d", mp->mp_upper,mp->mp_lower,room));
	DPRINTF(("node size = %"Z"u", node_size));
	mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
	return EDB_PAGE_FULL;
}
8358
8359
8360
8361
8362
8363
/** Delete the node at the cursor's current position from its page.
 * @param[in] mc Cursor; the page is mc->mc_pg[mc->mc_top] and the node
 *	index is mc->mc_ki[mc->mc_top].
 * @param[in] ksize Size of one key on a LEAF2 page. Only used when the
 *	page is LEAF2.
 */
static void
edb_node_del(EDB_cursor *mc, int ksize)
{
	EDB_page *mp = mc->mc_pg[mc->mc_top];
	indx_t indx = mc->mc_ki[mc->mc_top];
	unsigned int sz;
	indx_t i, j, numkeys, ptr;
	EDB_node *node;
	char *base;

	DPRINTF(("delete node %u on %s page %"Yu, indx,
		IS_LEAF(mp) ? "leaf" : "branch", edb_dbg_pgno(mp)));
	numkeys = NUMKEYS(mp);
	edb_cassert(mc, indx < numkeys);

	if (IS_LEAF2(mp)) {
		/* Fixed-size keys: close the gap by sliding the following keys down. */
		int x = numkeys - 1 - indx;
		base = LEAF2KEY(mp, indx, ksize);
		if (x)
			memmove(base, base + ksize, x * ksize);
		mp->mp_lower -= sizeof(indx_t);
		mp->mp_upper += ksize - sizeof(indx_t);
		return;
	}

	/* Size of the node being removed: header + key (+ data or page number). */
	node = NODEPTR(mp, indx);
	sz = NODESIZE + node->mn_ksize;
	if (IS_LEAF(mp)) {
		if (F_ISSET(node->mn_flags, F_BIGDATA))
			sz += sizeof(pgno_t);
		else
			sz += NODEDSZ(node);
	}
	sz = EVEN(sz);

	/* Compact the pointer table, dropping slot indx. Nodes stored at
	 * lower offsets than the deleted one are about to move up by sz.
	 */
	ptr = mp->mp_ptrs[indx];
	for (i = j = 0; i < numkeys; i++) {
		if (i != indx) {
			mp->mp_ptrs[j] = mp->mp_ptrs[i];
			if (mp->mp_ptrs[i] < ptr)
				mp->mp_ptrs[j] += sz;
			j++;
		}
	}

	/* Shift everything between mp_upper and the deleted node up by sz. */
	base = (char *)mp + mp->mp_upper + PAGEBASE;
	memmove(base + sz, base, ptr - mp->mp_upper);

	mp->mp_lower -= sizeof(indx_t);
	mp->mp_upper += sz;
}
8415
8416
8417
8418
8419
/** Compact the node at @b indx whose data is itself a page (a sub-page):
 * the free space inside that sub-page is removed and returned to @b mp.
 * @param[in] mp The page containing the node.
 * @param[in] indx Index of the node on @b mp.
 */
static void
edb_node_shrink(EDB_page *mp, indx_t indx)
{
	EDB_node *node;
	EDB_page *sp, *xp;
	char *base;
	indx_t delta, nsize, len, ptr;
	int i;

	node = NODEPTR(mp, indx);
	sp = (EDB_page *)NODEDATA(node);
	delta = SIZELEFT(sp);		/* unused bytes inside the sub-page */
	nsize = NODEDSZ(node) - delta;	/* data size once those bytes are gone */

	/* Prepare to shift upward; len = length of the sub-page part to shift. */
	if (IS_LEAF2(sp)) {
		len = nsize;
		if (nsize & 1)
			return;		/* leave the node alone rather than make its size odd */
	} else {
		/* Write the adjusted pointer table where it will sit after the shift. */
		xp = (EDB_page *)((char *)sp + delta);
		for (i = NUMKEYS(sp); --i >= 0; )
			xp->mp_ptrs[i] = sp->mp_ptrs[i] - delta;
		len = PAGEHDRSZ;
	}
	sp->mp_upper = sp->mp_lower;	/* sub-page now has no free space */
	COPY_PGNO(sp->mp_pgno, mp->mp_pgno);
	SETDSZ(node, nsize);

	/* Shift everything from mp_upper through the first len bytes of the
	 * sub-page upward by delta.
	 */
	base = (char *)mp + mp->mp_upper + PAGEBASE;
	memmove(base + delta, base, (char *)sp + len - base);

	/* Fix offsets of this node and of all nodes stored below it. */
	ptr = mp->mp_ptrs[indx];
	for (i = NUMKEYS(mp); --i >= 0; ) {
		if (mp->mp_ptrs[i] <= ptr)
			mp->mp_ptrs[i] += delta;
	}
	mp->mp_upper += delta;
}
8460
8461
8462
8463
8464
8465
8466
8467
8468
8469
8470 static void
8471 edb_xcursor_init0(EDB_cursor *mc)
8472 {
8473 EDB_xcursor *mx = mc->mc_xcursor;
8474
8475 mx->mx_cursor.mc_xcursor = NULL;
8476 mx->mx_cursor.mc_txn = mc->mc_txn;
8477 mx->mx_cursor.mc_db = &mx->mx_db;
8478 mx->mx_cursor.mc_dbx = &mx->mx_dbx;
8479 mx->mx_cursor.mc_dbi = mc->mc_dbi;
8480 mx->mx_cursor.mc_dbflag = &mx->mx_dbflag;
8481 mx->mx_cursor.mc_snum = 0;
8482 mx->mx_cursor.mc_top = 0;
8483 MC_SET_OVPG(&mx->mx_cursor, NULL);
8484 mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_ORIG_RDONLY|C_WRITEMAP));
8485 mx->mx_dbx.md_name.mv_size = 0;
8486 mx->mx_dbx.md_name.mv_data = NULL;
8487 mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp;
8488 mx->mx_dbx.md_dcmp = NULL;
8489 mx->mx_dbx.md_rel = mc->mc_dbx->md_rel;
8490 }
8491
8492
8493
8494
8495
8496
8497
8498 static void
8499 edb_xcursor_init1(EDB_cursor *mc, EDB_node *node)
8500 {
8501 EDB_xcursor *mx = mc->mc_xcursor;
8502
8503 mx->mx_cursor.mc_flags &= C_SUB|C_ORIG_RDONLY|C_WRITEMAP;
8504 if (node->mn_flags & F_SUBDATA) {
8505 memcpy(&mx->mx_db, NODEDATA(node), sizeof(EDB_db));
8506 mx->mx_cursor.mc_pg[0] = 0;
8507 mx->mx_cursor.mc_snum = 0;
8508 mx->mx_cursor.mc_top = 0;
8509 } else {
8510 EDB_page *fp = NODEDATA(node);
8511 mx->mx_db.md_pad = 0;
8512 mx->mx_db.md_flags = 0;
8513 mx->mx_db.md_depth = 1;
8514 mx->mx_db.md_branch_pages = 0;
8515 mx->mx_db.md_leaf_pages = 1;
8516 mx->mx_db.md_overflow_pages = 0;
8517 mx->mx_db.md_entries = NUMKEYS(fp);
8518 COPY_PGNO(mx->mx_db.md_root, fp->mp_pgno);
8519 mx->mx_cursor.mc_snum = 1;
8520 mx->mx_cursor.mc_top = 0;
8521 mx->mx_cursor.mc_flags |= C_INITIALIZED;
8522 mx->mx_cursor.mc_pg[0] = fp;
8523 mx->mx_cursor.mc_ki[0] = 0;
8524 if (mc->mc_db->md_flags & EDB_DUPFIXED) {
8525 mx->mx_db.md_flags = EDB_DUPFIXED;
8526 mx->mx_db.md_pad = fp->mp_pad;
8527 if (mc->mc_db->md_flags & EDB_INTEGERDUP)
8528 mx->mx_db.md_flags |= EDB_INTEGERKEY;
8529 }
8530 }
8531 DPRINTF(("Sub-db -%u root page %"Yu, mx->mx_cursor.mc_dbi,
8532 mx->mx_db.md_root));
8533 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA;
8534 if (NEED_CMP_CLONG(mx->mx_dbx.md_cmp, mx->mx_db.md_pad))
8535 mx->mx_dbx.md_cmp = edb_cmp_clong;
8536 }
8537
8538
8539
8540
8541
8542
8543
8544
8545
8546
8547 static void
8548 edb_xcursor_init2(EDB_cursor *mc, EDB_xcursor *src_mx, int new_dupdata)
8549 {
8550 EDB_xcursor *mx = mc->mc_xcursor;
8551
8552 if (new_dupdata) {
8553 mx->mx_cursor.mc_snum = 1;
8554 mx->mx_cursor.mc_top = 0;
8555 mx->mx_cursor.mc_flags |= C_INITIALIZED;
8556 mx->mx_cursor.mc_ki[0] = 0;
8557 mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA;
8558 #if UINT_MAX < EDB_SIZE_MAX
8559 mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp;
8560 #endif
8561 } else if (!(mx->mx_cursor.mc_flags & C_INITIALIZED)) {
8562 return;
8563 }
8564 mx->mx_db = src_mx->mx_db;
8565 mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0];
8566 DPRINTF(("Sub-db -%u root page %"Yu, mx->mx_cursor.mc_dbi,
8567 mx->mx_db.md_root));
8568 }
8569
8570
8571 static void
8572 edb_cursor_init(EDB_cursor *mc, EDB_txn *txn, EDB_dbi dbi, EDB_xcursor *mx)
8573 {
8574 mc->mc_next = NULL;
8575 mc->mc_backup = NULL;
8576 mc->mc_dbi = dbi;
8577 mc->mc_txn = txn;
8578 mc->mc_db = &txn->mt_dbs[dbi];
8579 mc->mc_dbx = &txn->mt_dbxs[dbi];
8580 mc->mc_dbflag = &txn->mt_dbflags[dbi];
8581 mc->mc_snum = 0;
8582 mc->mc_top = 0;
8583 mc->mc_pg[0] = 0;
8584 mc->mc_ki[0] = 0;
8585 MC_SET_OVPG(mc, NULL);
8586 mc->mc_flags = txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP);
8587 if (txn->mt_dbs[dbi].md_flags & EDB_DUPSORT) {
8588 edb_tassert(txn, mx != NULL);
8589 mc->mc_xcursor = mx;
8590 edb_xcursor_init0(mc);
8591 } else {
8592 mc->mc_xcursor = NULL;
8593 }
8594 if (*mc->mc_dbflag & DB_STALE) {
8595 edb_page_search(mc, NULL, EDB_PS_ROOTONLY);
8596 }
8597 }
8598
8599 int
8600 edb_cursor_open(EDB_txn *txn, EDB_dbi dbi, EDB_cursor **ret)
8601 {
8602 EDB_cursor *mc;
8603 size_t size = sizeof(EDB_cursor);
8604
8605 if (!ret || !TXN_DBI_EXIST(txn, dbi, DB_VALID))
8606 return EINVAL;
8607
8608 if (txn->mt_flags & EDB_TXN_BLOCKED)
8609 return EDB_BAD_TXN;
8610
8611 if (dbi == FREE_DBI && !F_ISSET(txn->mt_flags, EDB_TXN_RDONLY))
8612 return EINVAL;
8613
8614 if (txn->mt_dbs[dbi].md_flags & EDB_DUPSORT)
8615 size += sizeof(EDB_xcursor);
8616
8617 if ((mc = malloc(size)) != NULL) {
8618 edb_cursor_init(mc, txn, dbi, (EDB_xcursor *)(mc + 1));
8619 if (txn->mt_cursors) {
8620 mc->mc_next = txn->mt_cursors[dbi];
8621 txn->mt_cursors[dbi] = mc;
8622 mc->mc_flags |= C_UNTRACK;
8623 }
8624 } else {
8625 return ENOMEM;
8626 }
8627
8628 *ret = mc;
8629
8630 return EDB_SUCCESS;
8631 }
8632
8633 int
8634 edb_cursor_renew(EDB_txn *txn, EDB_cursor *mc)
8635 {
8636 if (!mc || !TXN_DBI_EXIST(txn, mc->mc_dbi, DB_VALID))
8637 return EINVAL;
8638
8639 if ((mc->mc_flags & C_UNTRACK) || txn->mt_cursors)
8640 return EINVAL;
8641
8642 if (txn->mt_flags & EDB_TXN_BLOCKED)
8643 return EDB_BAD_TXN;
8644
8645 edb_cursor_init(mc, txn, mc->mc_dbi, mc->mc_xcursor);
8646 return EDB_SUCCESS;
8647 }
8648
8649
8650 int
8651 edb_cursor_count(EDB_cursor *mc, edb_size_t *countp)
8652 {
8653 EDB_node *leaf;
8654
8655 if (mc == NULL || countp == NULL)
8656 return EINVAL;
8657
8658 if (mc->mc_xcursor == NULL)
8659 return EDB_INCOMPATIBLE;
8660
8661 if (mc->mc_txn->mt_flags & EDB_TXN_BLOCKED)
8662 return EDB_BAD_TXN;
8663
8664 if (!(mc->mc_flags & C_INITIALIZED))
8665 return EINVAL;
8666
8667 if (!mc->mc_snum)
8668 return EDB_NOTFOUND;
8669
8670 if (mc->mc_flags & C_EOF) {
8671 if (mc->mc_ki[mc->mc_top] >= NUMKEYS(mc->mc_pg[mc->mc_top]))
8672 return EDB_NOTFOUND;
8673 mc->mc_flags ^= C_EOF;
8674 }
8675
8676 leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
8677 if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) {
8678 *countp = 1;
8679 } else {
8680 if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
8681 return EINVAL;
8682
8683 *countp = mc->mc_xcursor->mx_db.md_entries;
8684 }
8685 return EDB_SUCCESS;
8686 }
8687
8688 void
8689 edb_cursor_close(EDB_cursor *mc)
8690 {
8691 if (mc) {
8692 EDB_CURSOR_UNREF(mc, 0);
8693 }
8694 if (mc && !mc->mc_backup) {
8695
8696
8697
8698
8699 if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) {
8700 EDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi];
8701 while (*prev && *prev != mc) prev = &(*prev)->mc_next;
8702 if (*prev == mc)
8703 *prev = mc->mc_next;
8704 }
8705 free(mc);
8706 }
8707 }
8708
8709 EDB_txn *
8710 edb_cursor_txn(EDB_cursor *mc)
8711 {
8712 if (!mc) return NULL;
8713 return mc->mc_txn;
8714 }
8715
8716 EDB_dbi
8717 edb_cursor_dbi(EDB_cursor *mc)
8718 {
8719 return mc->mc_dbi;
8720 }
8721
8722
8723
8724
8725
8726
8727
/** Replace the key of the node at the cursor's current position.
 * If the EVEN()-rounded key length changes, the node and everything
 * stored below it on the page are shifted to make or reclaim room.
 * When the page lacks room for a longer key, the node is deleted and
 * re-inserted through edb_page_split() with #EDB_SPLIT_REPLACE.
 * @param[in] mc Cursor pointing at the node to update.
 * @param[in] key The new key.
 * @return #EDB_SUCCESS, or the result of edb_page_split().
 */
static int
edb_update_key(EDB_cursor *mc, EDB_val *key)
{
	EDB_page *mp;
	EDB_node *node;
	char *base;
	size_t len;
	int delta, ksize, oksize;
	indx_t ptr, i, numkeys, indx;
	DKBUF;

	indx = mc->mc_ki[mc->mc_top];
	mp = mc->mc_pg[mc->mc_top];
	node = NODEPTR(mp, indx);
	ptr = mp->mp_ptrs[indx];
#if EDB_DEBUG
	{
		/* Debug builds only: log old and new key. */
		EDB_val k2;
		char kbuf2[DKBUF_MAXKEYSIZE*2+1];
		k2.mv_data = NODEKEY(node);
		k2.mv_size = node->mn_ksize;
		DPRINTF(("update key %u (ofs %u) [%s] to [%s] on page %"Yu,
			indx, ptr,
			edb_dkey(&k2, kbuf2),
			DKEY(key),
			mp->mp_pgno));
	}
#endif

	/* Compare EVEN()-rounded sizes; only a change there moves data. */
	ksize = EVEN(key->mv_size);
	oksize = EVEN(node->mn_ksize);
	delta = ksize - oksize;

	/* Shift node contents if the rounded key length changed. */
	if (delta) {
		if (delta > 0 && SIZELEFT(mp) < delta) {
			pgno_t pgno;
			/* Not enough space left: delete the node and split. */
			DPRINTF(("Not enough room, delta = %d, splitting...", delta));
			pgno = NODEPGNO(node);
			edb_node_del(mc, 0);
			return edb_page_split(mc, key, NULL, pgno, EDB_SPLIT_REPLACE);
		}

		/* This node and every node stored at a lower offset move by delta. */
		numkeys = NUMKEYS(mp);
		for (i = 0; i < numkeys; i++) {
			if (mp->mp_ptrs[i] <= ptr)
				mp->mp_ptrs[i] -= delta;
		}

		/* Move the lower nodes plus this node's header (NODESIZE bytes). */
		base = (char *)mp + mp->mp_upper + PAGEBASE;
		len = ptr - mp->mp_upper + NODESIZE;
		memmove(base - delta, base, len);
		mp->mp_upper -= delta;

		/* The node has moved; look it up again. */
		node = NODEPTR(mp, indx);
	}

	/* Even when no shift was needed the stored key size may differ. */
	if (node->mn_ksize != key->mv_size)
		node->mn_ksize = key->mv_size;

	if (key->mv_size)
		memcpy(NODEKEY(node), key->mv_data, key->mv_size);

	return EDB_SUCCESS;
}
8796
/* Forward declaration: edb_cursor_copy() is used by the functions below
 * before its definition appears later in this file.
 */
static void
edb_cursor_copy(const EDB_cursor *csrc, EDB_cursor *cdst);
8799
8800
/** Run @b act while the temporary cursor @b mn is tracked by its txn.
 * For the duration of @b act a cursor is pushed onto the head of the
 * txn's mt_cursors list for mn's database, then popped again. If @b mn
 * is a sub-cursor (#C_SUB), a dummy cursor whose mc_xcursor points at
 * @b mn is pushed instead, since code walking the list reaches
 * sub-cursors through mc_xcursor.
 * @b mn must be an lvalue of type EDB_cursor (not a pointer). Every use
 * of the parameter is parenthesized so any lvalue expression is safe.
 */
#define WITH_CURSOR_TRACKING(mn, act) do { \
	EDB_cursor dummy, *tracked, **tp = &(mn).mc_txn->mt_cursors[(mn).mc_dbi]; \
	if ((mn).mc_flags & C_SUB) { \
		dummy.mc_flags = C_INITIALIZED; \
		dummy.mc_xcursor = (EDB_xcursor *)&(mn); \
		tracked = &dummy; \
	} else { \
		tracked = &(mn); \
	} \
	tracked->mc_next = *tp; \
	*tp = tracked; \
	{ act; } \
	*tp = tracked->mc_next; \
} while (0)
8815
8816
8817
/** Move one node from the page at @b csrc to the page at @b cdst.
 * Both pages are touched first, the node is added to the destination and
 * deleted from the source, other cursors tracked by the txn are adjusted,
 * and the parent separator keys are updated where index 0 changed.
 * @param[in] csrc Cursor pointing at the node to move.
 * @param[in] cdst Cursor pointing at the insertion position.
 * @param[in] fromleft Non-zero selects the fix-up that bumps other
 *	cursors' indices up on the destination page; zero selects the one
 *	that moves indices down on the source page.
 * @return #EDB_SUCCESS, or the first error from a helper.
 */
static int
edb_node_move(EDB_cursor *csrc, EDB_cursor *cdst, int fromleft)
{
	EDB_node *srcnode;
	EDB_val key, data;
	pgno_t srcpg;
	EDB_cursor mn;
	int rc;
	unsigned short flags;

	DKBUF;

	/* Touch src and dst before modifying them. */
	if ((rc = edb_page_touch(csrc)) ||
		(rc = edb_page_touch(cdst)))
		return rc;

	if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
		/* LEAF2: fixed-size key only, no data, no node header. */
		key.mv_size = csrc->mc_db->md_pad;
		key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top], key.mv_size);
		data.mv_size = 0;
		data.mv_data = NULL;
		srcpg = 0;
		flags = 0;
	} else {
		srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], csrc->mc_ki[csrc->mc_top]);
		edb_cassert(csrc, !((size_t)srcnode & 1));
		srcpg = NODEPGNO(srcnode);
		flags = srcnode->mn_flags;
		if (csrc->mc_ki[csrc->mc_top] == 0 && IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			unsigned int snum = csrc->mc_snum;
			EDB_node *s2;
			/* Slot 0 of a branch page: take the key from the lowest
			 * entry below src instead of from the node itself.
			 */
			rc = edb_page_search_lowest(csrc);
			if (rc)
				return rc;
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_size = csrc->mc_db->md_pad;
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				s2 = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(s2);
				key.mv_data = NODEKEY(s2);
			}
			/* Restore the cursor depth saved before the search. */
			csrc->mc_snum = snum--;
			csrc->mc_top = snum;
		} else {
			key.mv_size = NODEKSZ(srcnode);
			key.mv_data = NODEKEY(srcnode);
		}
		data.mv_size = NODEDSZ(srcnode);
		data.mv_data = NODEDATA(srcnode);
	}
	mn.mc_xcursor = NULL;
	if (IS_BRANCH(cdst->mc_pg[cdst->mc_top]) && cdst->mc_ki[cdst->mc_top] == 0) {
		unsigned int snum = cdst->mc_snum;
		EDB_node *s2;
		EDB_val bkey;
		/* Inserting at slot 0 of a branch page: first write the lowest
		 * key below dst into the node currently at slot 0.
		 */
		edb_cursor_copy(cdst, &mn);
		rc = edb_page_search_lowest(&mn);
		if (rc)
			return rc;
		if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
			bkey.mv_size = mn.mc_db->md_pad;
			bkey.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, bkey.mv_size);
		} else {
			s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
			bkey.mv_size = NODEKSZ(s2);
			bkey.mv_data = NODEKEY(s2);
		}
		mn.mc_snum = snum--;
		mn.mc_top = snum;
		mn.mc_ki[snum] = 0;
		rc = edb_update_key(&mn, &bkey);
		if (rc)
			return rc;
	}

	DPRINTF(("moving %s node %u [%s] on page %"Yu" to node %u on page %"Yu,
		IS_LEAF(csrc->mc_pg[csrc->mc_top]) ? "leaf" : "branch",
		csrc->mc_ki[csrc->mc_top],
		DKEY(&key),
		csrc->mc_pg[csrc->mc_top]->mp_pgno,
		cdst->mc_ki[cdst->mc_top], cdst->mc_pg[cdst->mc_top]->mp_pgno));

	/* Add the node to the destination page. */
	rc = edb_node_add(cdst, cdst->mc_ki[cdst->mc_top], &key, &data, srcpg, flags);
	if (rc != EDB_SUCCESS)
		return rc;

	/* Delete the node from the source page. */
	edb_node_del(csrc, key.mv_size);

	{
		/* Adjust other tracked cursors pointing at the affected pages. */
		EDB_cursor *m2, *m3;
		EDB_dbi dbi = csrc->mc_dbi;
		EDB_page *mpd, *mps;

		mps = csrc->mc_pg[csrc->mc_top];
		/* fromleft: a node was inserted on dst, bump others up. */
		if (fromleft) {
			mpd = cdst->mc_pg[csrc->mc_top];
			for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				/* When working on a sub-cursor, compare sub-cursors. */
				if (csrc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
					continue;
				/* On dst at or after the insertion point: shift right. */
				if (m3 != cdst &&
					m3->mc_pg[csrc->mc_top] == mpd &&
					m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
					m3->mc_ki[csrc->mc_top]++;
				}
				/* Was on the moved node: follow it to dst. */
				if (m3 !=csrc &&
					m3->mc_pg[csrc->mc_top] == mps &&
					m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) {
					m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
					m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
					m3->mc_ki[csrc->mc_top-1]++;
				}
				if (IS_LEAF(mps))
					XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
			}
		} else
		/* Otherwise the first node left src: move others down. */
		{
			for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
				if (csrc->mc_flags & C_SUB)
					m3 = &m2->mc_xcursor->mx_cursor;
				else
					m3 = m2;
				if (m3 == csrc) continue;
				if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
					continue;
				if (m3->mc_pg[csrc->mc_top] == mps) {
					if (!m3->mc_ki[csrc->mc_top]) {
						/* Was on the moved node: follow it to dst. */
						m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
						m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top];
						m3->mc_ki[csrc->mc_top-1]--;
					} else {
						m3->mc_ki[csrc->mc_top]--;
					}
					if (IS_LEAF(mps))
						XCURSOR_REFRESH(m3, csrc->mc_top, m3->mc_pg[csrc->mc_top]);
				}
			}
		}
	}

	/* Update the parent separators. */
	if (csrc->mc_ki[csrc->mc_top] == 0) {
		if (csrc->mc_ki[csrc->mc_top-1] != 0) {
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(csrc->mc_pg[csrc->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for source page %"Yu" to [%s]",
				csrc->mc_pg[csrc->mc_top]->mp_pgno, DKEY(&key)));
			edb_cursor_copy(csrc, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			/* Track mn so cursor fix-ups triggered by the update see it. */
			WITH_CURSOR_TRACKING(mn,
				rc = edb_update_key(&mn, &key));
			if (rc)
				return rc;
		}
		if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
			/* Slot 0 of a branch page gets an empty key. */
			EDB_val nullkey;
			indx_t ix = csrc->mc_ki[csrc->mc_top];
			nullkey.mv_size = 0;
			csrc->mc_ki[csrc->mc_top] = 0;
			rc = edb_update_key(csrc, &nullkey);
			csrc->mc_ki[csrc->mc_top] = ix;
			edb_cassert(csrc, rc == EDB_SUCCESS);
		}
	}

	if (cdst->mc_ki[cdst->mc_top] == 0) {
		if (cdst->mc_ki[cdst->mc_top-1] != 0) {
			if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
				key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
			} else {
				srcnode = NODEPTR(cdst->mc_pg[cdst->mc_top], 0);
				key.mv_size = NODEKSZ(srcnode);
				key.mv_data = NODEKEY(srcnode);
			}
			DPRINTF(("update separator for destination page %"Yu" to [%s]",
				cdst->mc_pg[cdst->mc_top]->mp_pgno, DKEY(&key)));
			edb_cursor_copy(cdst, &mn);
			mn.mc_snum--;
			mn.mc_top--;
			/* Track mn so cursor fix-ups triggered by the update see it. */
			WITH_CURSOR_TRACKING(mn,
				rc = edb_update_key(&mn, &key));
			if (rc)
				return rc;
		}
		if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
			/* Slot 0 of a branch page gets an empty key. */
			EDB_val nullkey;
			indx_t ix = cdst->mc_ki[cdst->mc_top];
			nullkey.mv_size = 0;
			cdst->mc_ki[cdst->mc_top] = 0;
			rc = edb_update_key(cdst, &nullkey);
			cdst->mc_ki[cdst->mc_top] = ix;
			edb_cassert(cdst, rc == EDB_SUCCESS);
		}
	}

	return EDB_SUCCESS;
}
9038
9039
9040
9041
9042
9043
9044
9045
9046
/** Merge the page at @b csrc into the page at @b cdst.
 * All nodes of the source page are appended to the destination page, the
 * source page is unlinked from its parent and handed to edb_page_loose(),
 * tracked cursors are adjusted, and the parent of @b cdst is rebalanced.
 * Both cursors must be at least two levels deep (asserted).
 * @param[in] csrc Cursor pointing at the source page.
 * @param[in] cdst Cursor pointing at the destination page.
 * @return #EDB_SUCCESS, or the first error from a helper.
 */
static int
edb_page_merge(EDB_cursor *csrc, EDB_cursor *cdst)
{
	EDB_page *psrc, *pdst;
	EDB_node *srcnode;
	EDB_val key, data;
	unsigned nkeys;
	int rc;
	indx_t i, j;

	psrc = csrc->mc_pg[csrc->mc_top];
	pdst = cdst->mc_pg[cdst->mc_top];

	DPRINTF(("merging page %"Yu" into %"Yu, psrc->mp_pgno, pdst->mp_pgno));

	edb_cassert(csrc, csrc->mc_snum > 1);	/* a single-level cursor has no parent to unlink from */
	edb_cassert(csrc, cdst->mc_snum > 1);

	/* Touch dst before modifying it. */
	if ((rc = edb_page_touch(cdst)))
		return rc;

	/* Re-read dst from the cursor after the touch. */
	pdst = cdst->mc_pg[cdst->mc_top];

	/* Move all nodes from src to dst, appending after dst's existing keys. */
	j = nkeys = NUMKEYS(pdst);
	if (IS_LEAF2(psrc)) {
		key.mv_size = csrc->mc_db->md_pad;
		key.mv_data = METADATA(psrc);
		for (i = 0; i < NUMKEYS(psrc); i++, j++) {
			rc = edb_node_add(cdst, j, &key, NULL, 0, 0);
			if (rc != EDB_SUCCESS)
				return rc;
			/* Advance to the next fixed-size key. */
			key.mv_data = (char *)key.mv_data + key.mv_size;
		}
	} else {
		for (i = 0; i < NUMKEYS(psrc); i++, j++) {
			srcnode = NODEPTR(psrc, i);
			if (i == 0 && IS_BRANCH(psrc)) {
				EDB_cursor mn;
				EDB_node *s2;
				edb_cursor_copy(csrc, &mn);
				mn.mc_xcursor = NULL;
				/* Slot 0 of a branch page: use the lowest key below src. */
				rc = edb_page_search_lowest(&mn);
				if (rc)
					return rc;
				if (IS_LEAF2(mn.mc_pg[mn.mc_top])) {
					key.mv_size = mn.mc_db->md_pad;
					key.mv_data = LEAF2KEY(mn.mc_pg[mn.mc_top], 0, key.mv_size);
				} else {
					s2 = NODEPTR(mn.mc_pg[mn.mc_top], 0);
					key.mv_size = NODEKSZ(s2);
					key.mv_data = NODEKEY(s2);
				}
			} else {
				key.mv_size = srcnode->mn_ksize;
				key.mv_data = NODEKEY(srcnode);
			}

			data.mv_size = NODEDSZ(srcnode);
			data.mv_data = NODEDATA(srcnode);
			rc = edb_node_add(cdst, j, &key, &data, NODEPGNO(srcnode), srcnode->mn_flags);
			if (rc != EDB_SUCCESS)
				return rc;
		}
	}

	DPRINTF(("dst page %"Yu" now has %u keys (%.1f%% filled)",
		pdst->mp_pgno, NUMKEYS(pdst),
		(float)PAGEFILL(cdst->mc_txn->mt_env, pdst) / 10));

	/* Unlink the src page from its parent: step up one level, delete the
	 * parent's node, and blank the key if that left a new slot 0.
	 */
	csrc->mc_top--;
	edb_node_del(csrc, 0);
	if (csrc->mc_ki[csrc->mc_top] == 0) {
		key.mv_size = 0;
		rc = edb_update_key(csrc, &key);
		if (rc) {
			csrc->mc_top++;
			return rc;
		}
	}
	csrc->mc_top++;

	psrc = csrc->mc_pg[csrc->mc_top];
	/* Hand the now-unreferenced page to edb_page_loose() and fix the
	 * database page counters.
	 */
	rc = edb_page_loose(csrc, psrc);
	if (rc)
		return rc;
	if (IS_LEAF(psrc))
		csrc->mc_db->md_leaf_pages--;
	else
		csrc->mc_db->md_branch_pages--;
	{
		/* Adjust other tracked cursors pointing at src or its parent. */
		EDB_cursor *m2, *m3;
		EDB_dbi dbi = csrc->mc_dbi;
		unsigned int top = csrc->mc_top;

		for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
			if (csrc->mc_flags & C_SUB)
				m3 = &m2->mc_xcursor->mx_cursor;
			else
				m3 = m2;
			if (m3 == csrc) continue;
			if (m3->mc_snum < csrc->mc_snum) continue;
			if (m3->mc_pg[top] == psrc) {
				/* Was on src: same entry now lives nkeys slots further in dst. */
				m3->mc_pg[top] = pdst;
				m3->mc_ki[top] += nkeys;
				m3->mc_ki[top-1] = cdst->mc_ki[top-1];
			} else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] &&
				m3->mc_ki[top-1] > csrc->mc_ki[top-1]) {
				/* Same parent, to the right of the removed slot. */
				m3->mc_ki[top-1]--;
			}
			if (IS_LEAF(psrc))
				XCURSOR_REFRESH(m3, top, m3->mc_pg[top]);
		}
	}
	{
		/* Rebalance the parent level, then restore cdst's depth,
		 * compensating if the rebalance changed the tree height.
		 */
		unsigned int snum = cdst->mc_snum;
		uint16_t depth = cdst->mc_db->md_depth;
		edb_cursor_pop(cdst);
		rc = edb_rebalance(cdst);
		/* Did the tree height change? */
		if (depth != cdst->mc_db->md_depth)
			snum += cdst->mc_db->md_depth - depth;
		cdst->mc_snum = snum;
		cdst->mc_top = snum-1;
	}
	return rc;
}
9184
9185
9186
9187
9188
9189 static void
9190 edb_cursor_copy(const EDB_cursor *csrc, EDB_cursor *cdst)
9191 {
9192 unsigned int i;
9193
9194 cdst->mc_txn = csrc->mc_txn;
9195 cdst->mc_dbi = csrc->mc_dbi;
9196 cdst->mc_db = csrc->mc_db;
9197 cdst->mc_dbx = csrc->mc_dbx;
9198 cdst->mc_snum = csrc->mc_snum;
9199 cdst->mc_top = csrc->mc_top;
9200 cdst->mc_flags = csrc->mc_flags;
9201 MC_SET_OVPG(cdst, MC_OVPG(csrc));
9202
9203 for (i=0; i<csrc->mc_snum; i++) {
9204 cdst->mc_pg[i] = csrc->mc_pg[i];
9205 cdst->mc_ki[i] = csrc->mc_ki[i];
9206 }
9207 }
9208
9209
9210
9211
9212
9213
/** Rebalance the tree after a delete, starting at the cursor's page.
 * A page that meets its fill threshold and minimum key count is left
 * alone. A root page is emptied, collapsed, or left as is. Any other
 * page either receives one node from a neighbor or is merged with it.
 * @param[in] mc Cursor pointing at the page to rebalance.
 * @return #EDB_SUCCESS, or the first error from a helper.
 */
static int
edb_rebalance(EDB_cursor *mc)
{
	EDB_node *node;
	int rc, fromleft;
	unsigned int ptop, minkeys, thresh;
	EDB_cursor mn;
	indx_t oldki;

	/* Branch pages need at least 2 keys and use a fill threshold of 1;
	 * other pages need 1 key and use FILL_THRESHOLD.
	 */
	if (IS_BRANCH(mc->mc_pg[mc->mc_top])) {
		minkeys = 2;
		thresh = 1;
	} else {
		minkeys = 1;
		thresh = FILL_THRESHOLD;
	}
	DPRINTF(("rebalancing %s page %"Yu" (has %u keys, %.1f%% full)",
		IS_LEAF(mc->mc_pg[mc->mc_top]) ? "leaf" : "branch",
		edb_dbg_pgno(mc->mc_pg[mc->mc_top]), NUMKEYS(mc->mc_pg[mc->mc_top]),
		(float)PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) / 10));

	if (PAGEFILL(mc->mc_txn->mt_env, mc->mc_pg[mc->mc_top]) >= thresh &&
		NUMKEYS(mc->mc_pg[mc->mc_top]) >= minkeys) {
		DPRINTF(("no need to rebalance page %"Yu", above fill threshold",
			edb_dbg_pgno(mc->mc_pg[mc->mc_top])));
		return EDB_SUCCESS;
	}

	if (mc->mc_snum < 2) {
		/* The cursor's stack holds a single page: the root. */
		EDB_page *mp = mc->mc_pg[0];
		if (IS_SUBP(mp)) {
			DPUTS("Can't rebalance a subpage, ignoring");
			return EDB_SUCCESS;
		}
		if (NUMKEYS(mp) == 0) {
			DPUTS("tree is completely empty");
			mc->mc_db->md_root = P_INVALID;
			mc->mc_db->md_depth = 0;
			mc->mc_db->md_leaf_pages = 0;
			rc = edb_eidl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
			if (rc)
				return rc;
			/* Reset this cursor and every tracked cursor on the old root. */
			mc->mc_snum = 0;
			mc->mc_top = 0;
			mc->mc_flags &= ~C_INITIALIZED;
			{
				EDB_cursor *m2, *m3;
				EDB_dbi dbi = mc->mc_dbi;

				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
					if (mc->mc_flags & C_SUB)
						m3 = &m2->mc_xcursor->mx_cursor;
					else
						m3 = m2;
					if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum))
						continue;
					if (m3->mc_pg[0] == mp) {
						m3->mc_snum = 0;
						m3->mc_top = 0;
						m3->mc_flags &= ~C_INITIALIZED;
					}
				}
			}
		} else if (IS_BRANCH(mp) && NUMKEYS(mp) == 1) {
			int i;
			/* A branch root with one child: the child becomes the root. */
			DPUTS("collapsing root page!");
			rc = edb_eidl_append(&mc->mc_txn->mt_free_pgs, mp->mp_pgno);
			if (rc)
				return rc;
			mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0));
			rc = edb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL);
			if (rc)
				return rc;
			mc->mc_db->md_depth--;
			mc->mc_db->md_branch_pages--;
			/* Shift this cursor's stack down by one level. */
			mc->mc_ki[0] = mc->mc_ki[1];
			for (i = 1; i<mc->mc_db->md_depth; i++) {
				mc->mc_pg[i] = mc->mc_pg[i+1];
				mc->mc_ki[i] = mc->mc_ki[i+1];
			}
			{
				/* Shift the stacks of other tracked cursors on the old root. */
				EDB_cursor *m2, *m3;
				EDB_dbi dbi = mc->mc_dbi;

				for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
					if (mc->mc_flags & C_SUB)
						m3 = &m2->mc_xcursor->mx_cursor;
					else
						m3 = m2;
					if (m3 == mc) continue;
					if (!(m3->mc_flags & C_INITIALIZED))
						continue;
					if (m3->mc_pg[0] == mp) {
						for (i=0; i<mc->mc_db->md_depth; i++) {
							m3->mc_pg[i] = m3->mc_pg[i+1];
							m3->mc_ki[i] = m3->mc_ki[i+1];
						}
						m3->mc_snum--;
						m3->mc_top--;
					}
				}
			}
		} else
			DPUTS("root page doesn't need rebalancing");
		return EDB_SUCCESS;
	}

	/* The parent (branch page) must have at least 2 pointers,
	 * otherwise there is no neighbor to borrow from or merge with.
	 */
	ptop = mc->mc_top-1;
	edb_cassert(mc, NUMKEYS(mc->mc_pg[ptop]) > 1);

	/* The page is below its threshold. Try to move a key from the left
	 * or right neighbor, or merge with a neighbor page.
	 */

	/* Find the neighbor using a copy of the cursor. */
	edb_cursor_copy(mc, &mn);
	mn.mc_xcursor = NULL;

	oldki = mc->mc_ki[mc->mc_top];	/* position to restore on exit */
	if (mc->mc_ki[ptop] == 0) {
		/* First child of the parent: only a right neighbor exists. */
		DPUTS("reading right neighbor");
		mn.mc_ki[ptop]++;
		node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
		rc = edb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
		if (rc)
			return rc;
		/* Take the neighbor's first node, append at the end of this page. */
		mn.mc_ki[mn.mc_top] = 0;
		mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
		fromleft = 0;
	} else {
		/* Otherwise use the left neighbor. */
		DPUTS("reading left neighbor");
		mn.mc_ki[ptop]--;
		node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]);
		rc = edb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL);
		if (rc)
			return rc;
		/* Take the neighbor's last node, insert at the front of this page. */
		mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
		mc->mc_ki[mc->mc_top] = 0;
		fromleft = 1;
	}

	DPRINTF(("found neighbor page %"Yu" (%u keys, %.1f%% full)",
		mn.mc_pg[mn.mc_top]->mp_pgno, NUMKEYS(mn.mc_pg[mn.mc_top]),
		(float)PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) / 10));

	/* If the neighbor is at or above the threshold and has more than
	 * minkeys keys, move one key from it. Otherwise merge the pages.
	 */
	if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
		rc = edb_node_move(&mn, mc, fromleft);
		if (fromleft) {
			/* A node was inserted in front: our old position moved up. */
			oldki++;
		}
	} else {
		if (!fromleft) {
			/* Merge the right neighbor into this page. */
			rc = edb_page_merge(&mn, mc);
		} else {
			/* Merge this page into the left neighbor; our old position
			 * then lies behind the neighbor's existing keys.
			 */
			oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
			mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
			/* Track mn so cursor fix-ups during the merge see it. */
			WITH_CURSOR_TRACKING(mn,
				rc = edb_page_merge(mc, &mn));
			edb_cursor_copy(&mn, mc);
		}
		mc->mc_flags &= ~C_EOF;
	}
	mc->mc_ki[mc->mc_top] = oldki;
	return rc;
}
9396
9397
9398 static int
9399 edb_cursor_del0(EDB_cursor *mc)
9400 {
9401 int rc;
9402 EDB_page *mp;
9403 indx_t ki;
9404 unsigned int nkeys;
9405 EDB_cursor *m2, *m3;
9406 EDB_dbi dbi = mc->mc_dbi;
9407
9408 ki = mc->mc_ki[mc->mc_top];
9409 mp = mc->mc_pg[mc->mc_top];
9410 edb_node_del(mc, mc->mc_db->md_pad);
9411 mc->mc_db->md_entries--;
9412 {
9413
9414 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
9415 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
9416 if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
9417 continue;
9418 if (m3 == mc || m3->mc_snum < mc->mc_snum)
9419 continue;
9420 if (m3->mc_pg[mc->mc_top] == mp) {
9421 if (m3->mc_ki[mc->mc_top] == ki) {
9422 m3->mc_flags |= C_DEL;
9423 if (mc->mc_db->md_flags & EDB_DUPSORT) {
9424
9425 m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF);
9426 }
9427 continue;
9428 } else if (m3->mc_ki[mc->mc_top] > ki) {
9429 m3->mc_ki[mc->mc_top]--;
9430 }
9431 XCURSOR_REFRESH(m3, mc->mc_top, mp);
9432 }
9433 }
9434 }
9435 rc = edb_rebalance(mc);
9436
9437 if (rc == EDB_SUCCESS) {
9438
9439
9440
9441
9442 if (!mc->mc_snum)
9443 return rc;
9444
9445 mp = mc->mc_pg[mc->mc_top];
9446 nkeys = NUMKEYS(mp);
9447
9448
9449 for (m2 = mc->mc_txn->mt_cursors[dbi]; !rc && m2; m2=m2->mc_next) {
9450 m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2;
9451 if (! (m2->mc_flags & m3->mc_flags & C_INITIALIZED))
9452 continue;
9453 if (m3->mc_snum < mc->mc_snum)
9454 continue;
9455 if (m3->mc_pg[mc->mc_top] == mp) {
9456
9457 if (m3->mc_ki[mc->mc_top] >= mc->mc_ki[mc->mc_top]) {
9458 if (m3->mc_ki[mc->mc_top] >= nkeys) {
9459 rc = edb_cursor_sibling(m3, 1);
9460 if (rc == EDB_NOTFOUND) {
9461 m3->mc_flags |= C_EOF;
9462 rc = EDB_SUCCESS;
9463 continue;
9464 }
9465 }
9466 if (mc->mc_db->md_flags & EDB_DUPSORT) {
9467 EDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]);
9468
9469
9470
9471
9472
9473
9474 if (node->mn_flags & F_DUPDATA) {
9475 if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) {
9476 if (!(node->mn_flags & F_SUBDATA))
9477 m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node);
9478 } else {
9479 edb_xcursor_init1(m3, node);
9480 m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL;
9481 }
9482 }
9483 }
9484 }
9485 }
9486 }
9487 mc->mc_flags |= C_DEL;
9488 }
9489
9490 if (rc)
9491 mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
9492 return rc;
9493 }
9494
9495 int
9496 edb_del(EDB_txn *txn, EDB_dbi dbi,
9497 EDB_val *key, EDB_val *data)
9498 {
9499 if (!key || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
9500 return EINVAL;
9501
9502 if (txn->mt_flags & (EDB_TXN_RDONLY|EDB_TXN_BLOCKED))
9503 return (txn->mt_flags & EDB_TXN_RDONLY) ? EACCES : EDB_BAD_TXN;
9504
9505 if (!F_ISSET(txn->mt_dbs[dbi].md_flags, EDB_DUPSORT)) {
9506
9507 data = NULL;
9508 }
9509
9510 return edb_del0(txn, dbi, key, data, 0);
9511 }
9512
9513 static int
9514 edb_del0(EDB_txn *txn, EDB_dbi dbi,
9515 EDB_val *key, EDB_val *data, unsigned flags)
9516 {
9517 EDB_cursor mc;
9518 EDB_xcursor mx;
9519 EDB_cursor_op op;
9520 EDB_val rdata, *xdata;
9521 int rc, exact = 0;
9522 DKBUF;
9523
9524 DPRINTF(("====> delete db %u key [%s]", dbi, DKEY(key)));
9525
9526 edb_cursor_init(&mc, txn, dbi, &mx);
9527
9528 if (data) {
9529 op = EDB_GET_BOTH;
9530 rdata = *data;
9531 xdata = &rdata;
9532 } else {
9533 op = EDB_SET;
9534 xdata = NULL;
9535 flags |= EDB_NODUPDATA;
9536 }
9537 rc = edb_cursor_set(&mc, key, xdata, op, &exact);
9538 if (rc == 0) {
9539
9540
9541
9542
9543
9544
9545
9546
9547 mc.mc_next = txn->mt_cursors[dbi];
9548 txn->mt_cursors[dbi] = &mc;
9549 rc = edb_cursor_del(&mc, flags);
9550 txn->mt_cursors[dbi] = mc.mc_next;
9551 }
9552 return rc;
9553 }
9554
9555
9556
9557
9558
9559
9560
9561
9562
9563
9564
9565
/** Split the page at the top of the cursor stack and insert a new node.
 *
 * A new right sibling page is allocated, the nodes of the page being split
 * (together with the node being inserted) are distributed between the two
 * pages, and a separator key pointing at the new page is added to the
 * parent.  When the parent has no room for the separator it is split by a
 * recursive call.  When the page has no parent in the cursor stack, a new
 * branch page is created first and becomes the root.  Finally every other
 * cursor on the transaction's list for this DBI that referenced the split
 * page is adjusted.
 *
 * @param[in,out] mc  Cursor whose top page (mc_pg[mc_top]) is split;
 *	mc_ki[mc_top] is the index at which the new node is inserted.  Its page
 *	and index stacks are rewritten to refer to the inserted node.
 * @param[in] newkey  Key of the node to insert.
 * @param[in,out] newdata  Data of the node to insert when the page is a leaf.
 *	With #EDB_RESERVE in \b nflags its mv_data is pointed at the stored node
 *	data, unless that node is flagged #F_BIGDATA.
 * @param[in] newpgno  Child page number of the node to insert when the page
 *	is a branch.
 * @param[in] nflags  Flags for the new node, forwarded to edb_node_add();
 *	#EDB_APPEND, #EDB_RESERVE and #EDB_SPLIT_REPLACE are also examined here.
 * @return 0 on success, non-zero on failure.  Every failure that reaches the
 *	\b done label also sets #EDB_TXN_ERROR on the transaction.
 */
9566 static int
9567 edb_page_split(EDB_cursor *mc, EDB_val *newkey, EDB_val *newdata, pgno_t newpgno,
9568 unsigned int nflags)
9569 {
9570 unsigned int flags;
9571 int rc = EDB_SUCCESS, new_root = 0, did_split = 0;
9572 indx_t newindx;
9573 pgno_t pgno = 0;
9574 int i, j, split_indx, nkeys, pmax;
9575 EDB_env *env = mc->mc_txn->mt_env;
9576 EDB_node *node;
9577 EDB_val sepkey, rkey, xdata, *rdata = &xdata;
9578 EDB_page *copy = NULL;
9579 EDB_page *mp, *rp, *pp;
9580 int ptop;
9581 EDB_cursor mn;
9582 DKBUF;
9583
9584 mp = mc->mc_pg[mc->mc_top];
9585 newindx = mc->mc_ki[mc->mc_top];
9586 nkeys = NUMKEYS(mp);
9587
9588 DPRINTF(("-----> splitting %s page %"Yu" and adding [%s] at index %i/%i",
9589 IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno,
9590 DKEY(newkey), mc->mc_ki[mc->mc_top], nkeys));
9591
9592 /* Create a right sibling with the same page flags as the page being split. */
9593 if ((rc = edb_page_new(mc, mp->mp_flags, 1, &rp)))
9594 return rc;
9595 rp->mp_pad = mp->mp_pad;
9596 DPRINTF(("new right sibling: page %"Yu, rp->mp_pgno));
9597
9598
9599
9600 /* mc_top < 1: the page being split has no parent in the cursor stack. */
9601 /* Allocate a new branch page to act as its parent and make it the DB root. */
9602
9603 if (mc->mc_top < 1) {
9604 if ((rc = edb_page_new(mc, P_BRANCH, 1, &pp)))
9605 goto done;
9606 /* shift the current stack up one level to make room for the new parent */
9607 for (i=mc->mc_snum; i>0; i--) {
9608 mc->mc_pg[i] = mc->mc_pg[i-1];
9609 mc->mc_ki[i] = mc->mc_ki[i-1];
9610 }
9611 mc->mc_pg[0] = pp;
9612 mc->mc_ki[0] = 0;
9613 mc->mc_db->md_root = pp->mp_pgno;
9614 DPRINTF(("root split! new root = %"Yu, pp->mp_pgno));
9615 new_root = mc->mc_db->md_depth++;
9616
9617 /* Node 0 of the new root points at the old page and is added without a key. */
9618 if ((rc = edb_node_add(mc, 0, NULL, NULL, mp->mp_pgno, 0)) != EDB_SUCCESS) {
9619 /* undo the stack push and the root/depth change made above */
9620 mc->mc_pg[0] = mc->mc_pg[1];
9621 mc->mc_ki[0] = mc->mc_ki[1];
9622 mc->mc_db->md_root = mp->mp_pgno;
9623 mc->mc_db->md_depth--;
9624 goto done;
9625 }
9626 mc->mc_snum++;
9627 mc->mc_top++;
9628 ptop = 0;
9629 } else {
9630 ptop = mc->mc_top-1;
9631 DPRINTF(("parent branch page is %"Yu, mc->mc_pg[ptop]->mp_pgno));
9632 }
9633 /* mn is a working copy of mc that refers to the new right page rp. */
9634 edb_cursor_copy(mc, &mn);
9635 mn.mc_xcursor = NULL;
9636 mn.mc_pg[mn.mc_top] = rp;
9637 mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
9638
9639 if (nflags & EDB_APPEND) {
9640 mn.mc_ki[mn.mc_top] = 0;
9641 sepkey = *newkey;
9642 split_indx = newindx;
9643 nkeys = 0;
9644 } else {
9645 /* Default split point: the middle of the page. */
9646 split_indx = (nkeys+1) / 2;
9647
9648 if (IS_LEAF2(rp)) {
9649 char *split, *ins;
9650 int x;
9651 unsigned int lsize, rsize, ksize;
9652 /* LEAF2 page (fixed-size keys of md_pad bytes): move the upper keys to rp. */
9653 x = mc->mc_ki[mc->mc_top] - split_indx;
9654 ksize = mc->mc_db->md_pad;
9655 split = LEAF2KEY(mp, split_indx, ksize);
9656 rsize = (nkeys - split_indx) * ksize;
9657 lsize = (nkeys - split_indx) * sizeof(indx_t);
9658 mp->mp_lower -= lsize;
9659 rp->mp_lower += lsize;
9660 mp->mp_upper += rsize - lsize;
9661 rp->mp_upper -= rsize - lsize;
9662 sepkey.mv_size = ksize;
9663 if (newindx == split_indx) {
9664 sepkey.mv_data = newkey->mv_data;
9665 } else {
9666 sepkey.mv_data = split;
9667 }
9668 if (x<0) { /* new key belongs in the left page: copy the tail to rp, insert in mp */
9669 ins = LEAF2KEY(mp, mc->mc_ki[mc->mc_top], ksize);
9670 memcpy(rp->mp_ptrs, split, rsize);
9671 sepkey.mv_data = rp->mp_ptrs;
9672 memmove(ins+ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize);
9673 memcpy(ins, newkey->mv_data, ksize);
9674 mp->mp_lower += sizeof(indx_t);
9675 mp->mp_upper -= ksize - sizeof(indx_t);
9676 } else { /* new key belongs in the right page at index x */
9677 if (x)
9678 memcpy(rp->mp_ptrs, split, x * ksize);
9679 ins = LEAF2KEY(rp, x, ksize);
9680 memcpy(ins, newkey->mv_data, ksize);
9681 memcpy(ins+ksize, split + x * ksize, rsize - x * ksize);
9682 rp->mp_lower += sizeof(indx_t);
9683 rp->mp_upper -= ksize - sizeof(indx_t);
9684 mc->mc_ki[mc->mc_top] = x;
9685 }
9686 } else {
9687 int psize, nsize, k;
9688 /* pmax: page size minus the page header */
9689 pmax = env->me_psize - PAGEHDRSZ;
9690 if (IS_LEAF(mp))
9691 nsize = edb_leaf_size(env, newkey, newdata);
9692 else
9693 nsize = edb_branch_size(env, newkey);
9694 nsize = EVEN(nsize);
9695
9696 /* grab a page to hold a temporary copy; it is released at the done label */
9697 copy = edb_page_malloc(mc->mc_txn, 1);
9698 if (copy == NULL) {
9699 rc = ENOMEM;
9700 goto done;
9701 }
9702 copy->mp_pgno = mp->mp_pgno;
9703 copy->mp_flags = mp->mp_flags;
9704 copy->mp_lower = (PAGEHDRSZ-PAGEBASE);
9705 copy->mp_upper = env->me_psize - PAGEBASE;
9706
9707 /* Build in copy->mp_ptrs the offset table as it looks after the insert; the new node's slot holds 0. */
9708 for (i=0, j=0; i<nkeys; i++) {
9709 if (i == newindx) {
9710 copy->mp_ptrs[j++] = 0;
9711 }
9712 copy->mp_ptrs[j++] = mp->mp_ptrs[i];
9713 }
9714
9715
9716
9717
9718
9719
9720
9721
9722
9723
9724
9725
9726 /* The middle split point is only refined when the page holds fewer than 32 keys, */
9727 /* the new node is larger than pmax/16, or the insert position is at/after the last key. */
9728 /* In those cases node sizes are summed from one end of the page until the running */
9729 /* total exceeds pmax or the scan limit k is reached, and the split point is set there. */
9730 if (nkeys < 32 || nsize > pmax/16 || newindx >= nkeys) {
9731 /* Find split point: scan upward from 0, or downward from nkeys, depending on newindx */
9732 psize = 0;
9733 if (newindx <= split_indx || newindx >= nkeys) {
9734 i = 0; j = 1;
9735 k = newindx >= nkeys ? nkeys : split_indx+1+IS_LEAF(mp);
9736 } else {
9737 i = nkeys; j = -1;
9738 k = split_indx-1;
9739 }
9740 for (; i!=k; i+=j) {
9741 if (i == newindx) {
9742 psize += nsize;
9743 node = NULL;
9744 } else {
9745 node = (EDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
9746 psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
9747 if (IS_LEAF(mp)) {
9748 if (F_ISSET(node->mn_flags, F_BIGDATA))
9749 psize += sizeof(pgno_t);
9750 else
9751 psize += NODEDSZ(node);
9752 }
9753 psize = EVEN(psize);
9754 }
9755 if (psize > pmax || i == k-j) {
9756 split_indx = i + (j<0);
9757 break;
9758 }
9759 }
9760 }
9761 if (split_indx == newindx) {
9762 sepkey.mv_size = newkey->mv_size;
9763 sepkey.mv_data = newkey->mv_data;
9764 } else {
9765 node = (EDB_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
9766 sepkey.mv_size = node->mn_ksize;
9767 sepkey.mv_data = NODEKEY(node);
9768 }
9769 }
9770 }
9771
9772 DPRINTF(("separator is %d [%s]", split_indx, DKEY(&sepkey)));
9773
9774
9775 /* Copy the separator key to the parent, splitting the parent first if it lacks room. */
9776 if (SIZELEFT(mn.mc_pg[ptop]) < edb_branch_size(env, &sepkey)) {
9777 int snum = mc->mc_snum;
9778 mn.mc_snum--;
9779 mn.mc_top--;
9780 did_split = 1;
9781 /* NOTE(review): WITH_CURSOR_TRACKING presumably links mn into the txn cursor list for this call - confirm against the macro */
9782 WITH_CURSOR_TRACKING(mn,
9783 rc = edb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0));
9784 if (rc)
9785 goto done;
9786
9787 /* mc's stack grew during the recursive call: the parent is now one level deeper */
9788 if (mc->mc_snum > snum) {
9789 ptop++;
9790 }
9791
9792 /* The right page may now have a different parent than before. */
9793 /* If mc's parent index is past the end of its parent page, move mc to mn's parent too. */
9794 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
9795 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
9796 for (i=0; i<ptop; i++) {
9797 mc->mc_pg[i] = mn.mc_pg[i];
9798 mc->mc_ki[i] = mn.mc_ki[i];
9799 }
9800 mc->mc_pg[ptop] = mn.mc_pg[ptop];
9801 if (mn.mc_ki[ptop]) {
9802 mc->mc_ki[ptop] = mn.mc_ki[ptop] - 1;
9803 } else {
9804 /* right page is first in its parent: step mc to the previous sibling */
9805 mc->mc_ki[ptop] = mn.mc_ki[ptop];
9806 rc = edb_cursor_sibling(mc, 0);
9807 }
9808 }
9809 } else {
9810 mn.mc_top--;
9811 rc = edb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
9812 mn.mc_top++;
9813 }
9814 if (rc != EDB_SUCCESS) {
9815 if (rc == EDB_NOTFOUND) /* can only come from edb_cursor_sibling() above; report as EDB_PROBLEM */
9816 rc = EDB_PROBLEM;
9817 goto done;
9818 }
9819 if (nflags & EDB_APPEND) {
9820 mc->mc_pg[mc->mc_top] = rp;
9821 mc->mc_ki[mc->mc_top] = 0;
9822 rc = edb_node_add(mc, 0, newkey, newdata, newpgno, nflags);
9823 if (rc)
9824 goto done;
9825 for (i=0; i<mc->mc_top; i++)
9826 mc->mc_ki[i] = mn.mc_ki[i];
9827 } else if (!IS_LEAF2(mp)) {
9828 /* Move nodes: slots from split_indx up to nkeys go to rp, then slots 0..split_indx-1 are rebuilt in copy */
9829 mc->mc_pg[mc->mc_top] = rp;
9830 i = split_indx;
9831 j = 0;
9832 do {
9833 if (i == newindx) {
9834 rkey.mv_data = newkey->mv_data;
9835 rkey.mv_size = newkey->mv_size;
9836 if (IS_LEAF(mp)) {
9837 rdata = newdata;
9838 } else
9839 pgno = newpgno;
9840 flags = nflags;
9841 /* Update index for the new key. */
9842 mc->mc_ki[mc->mc_top] = j;
9843 } else {
9844 node = (EDB_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
9845 rkey.mv_data = NODEKEY(node);
9846 rkey.mv_size = node->mn_ksize;
9847 if (IS_LEAF(mp)) {
9848 xdata.mv_data = NODEDATA(node);
9849 xdata.mv_size = NODEDSZ(node);
9850 rdata = &xdata;
9851 } else
9852 pgno = NODEPGNO(node);
9853 flags = node->mn_flags;
9854 }
9855
9856 if (!IS_LEAF(mp) && j == 0) {
9857 /* The first node of a branch page is stored with a zero-length key. */
9858 rkey.mv_size = 0;
9859 }
9860
9861 rc = edb_node_add(mc, j, &rkey, rdata, pgno, flags);
9862 if (rc)
9863 goto done;
9864 if (i == nkeys) { /* right side done: wrap around and fill the left side into copy */
9865 i = 0;
9866 j = 0;
9867 mc->mc_pg[mc->mc_top] = copy;
9868 } else {
9869 i++;
9870 j++;
9871 }
9872 } while (i != split_indx);
9873 /* Copy the rebuilt left half (offset table, bounds and node area) from copy back into mp. */
9874 nkeys = NUMKEYS(copy);
9875 for (i=0; i<nkeys; i++)
9876 mp->mp_ptrs[i] = copy->mp_ptrs[i];
9877 mp->mp_lower = copy->mp_lower;
9878 mp->mp_upper = copy->mp_upper;
9879 memcpy(NODEPTR(mp, nkeys-1), NODEPTR(copy, nkeys-1),
9880 env->me_psize - copy->mp_upper - PAGEBASE);
9881
9882 /* Point mc at whichever real page now holds the new node. */
9883 if (newindx < split_indx) {
9884 mc->mc_pg[mc->mc_top] = mp;
9885 } else {
9886 mc->mc_pg[mc->mc_top] = rp;
9887 mc->mc_ki[ptop]++;
9888
9889 /* Make sure mc_ki[ptop] is still valid; otherwise adopt mn's path. */
9890 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
9891 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
9892 for (i=0; i<=ptop; i++) {
9893 mc->mc_pg[i] = mn.mc_pg[i];
9894 mc->mc_ki[i] = mn.mc_ki[i];
9895 }
9896 }
9897 }
9898 if (nflags & EDB_RESERVE) {
9899 node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
9900 if (!(node->mn_flags & F_BIGDATA))
9901 newdata->mv_data = NODEDATA(node);
9902 }
9903 } else {
9904 if (newindx >= split_indx) {
9905 mc->mc_pg[mc->mc_top] = rp;
9906 mc->mc_ki[ptop]++;
9907
9908 /* Make sure mc_ki[ptop] is still valid; otherwise adopt mn's path. */
9909 if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
9910 mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
9911 for (i=0; i<=ptop; i++) {
9912 mc->mc_pg[i] = mn.mc_pg[i];
9913 mc->mc_ki[i] = mn.mc_ki[i];
9914 }
9915 }
9916 }
9917 }
9918
9919 {
9920 /* Adjust the other cursors on this DBI that point at mp */
9921 EDB_cursor *m2, *m3;
9922 EDB_dbi dbi = mc->mc_dbi;
9923 nkeys = NUMKEYS(mp);
9924
9925 for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
9926 if (mc->mc_flags & C_SUB)
9927 m3 = &m2->mc_xcursor->mx_cursor;
9928 else
9929 m3 = m2;
9930 if (m3 == mc)
9931 continue;
9932 if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
9933 continue;
9934 if (new_root) {
9935 int k;
9936 /* skip cursors whose root page is not the page that was split */
9937 if (m3->mc_pg[0] != mp)
9938 continue;
9939 /* root split: push the new root underneath this cursor's stack as well */
9940 for (k=new_root; k>=0; k--) {
9941 m3->mc_ki[k+1] = m3->mc_ki[k];
9942 m3->mc_pg[k+1] = m3->mc_pg[k];
9943 }
9944 if (m3->mc_ki[0] >= nkeys) {
9945 m3->mc_ki[0] = 1;
9946 } else {
9947 m3->mc_ki[0] = 0;
9948 }
9949 m3->mc_pg[0] = mc->mc_pg[0];
9950 m3->mc_snum++;
9951 m3->mc_top++;
9952 }
9953 if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
9954 if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & EDB_SPLIT_REPLACE))
9955 m3->mc_ki[mc->mc_top]++;
9956 if (m3->mc_ki[mc->mc_top] >= nkeys) {
9957 m3->mc_pg[mc->mc_top] = rp;
9958 m3->mc_ki[mc->mc_top] -= nkeys;
9959 for (i=0; i<mc->mc_top; i++) {
9960 m3->mc_ki[i] = mn.mc_ki[i];
9961 m3->mc_pg[i] = mn.mc_pg[i];
9962 }
9963 }
9964 } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
9965 m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
9966 m3->mc_ki[ptop]++;
9967 }
9968 if (IS_LEAF(mp))
9969 XCURSOR_REFRESH(m3, mc->mc_top, m3->mc_pg[mc->mc_top]);
9970 }
9971 }
9972 DPRINTF(("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)));
9973
9974 done:
9975 if (copy) /* release the temporary page */
9976 edb_page_free(env, copy);
9977 if (rc)
9978 mc->mc_txn->mt_flags |= EDB_TXN_ERROR;
9979 return rc;
9980 }
9981
9982 int
9983 edb_put(EDB_txn *txn, EDB_dbi dbi,
9984 EDB_val *key, EDB_val *data, unsigned int flags)
9985 {
9986 EDB_cursor mc;
9987 EDB_xcursor mx;
9988 int rc;
9989
9990 if (!key || !data || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
9991 return EINVAL;
9992
9993 if (flags & ~(EDB_NOOVERWRITE|EDB_NODUPDATA|EDB_RESERVE|EDB_APPEND|EDB_APPENDDUP))
9994 return EINVAL;
9995
9996 if (txn->mt_flags & (EDB_TXN_RDONLY|EDB_TXN_BLOCKED))
9997 return (txn->mt_flags & EDB_TXN_RDONLY) ? EACCES : EDB_BAD_TXN;
9998
9999 edb_cursor_init(&mc, txn, dbi, &mx);
10000 mc.mc_next = txn->mt_cursors[dbi];
10001 txn->mt_cursors[dbi] = &mc;
10002 rc = edb_cursor_put(&mc, key, data, flags);
10003 txn->mt_cursors[dbi] = mc.mc_next;
10004 return rc;
10005 }
10006
/** Size in bytes of each of the two write buffers used by the compacting
 * copy (edb_env_copyfd1() allocates EDB_WBUF*2).  May be predefined by the
 * build to override the default.
 */
10007 #ifndef EDB_WBUF
10008 #define EDB_WBUF (1024*1024)
10009 #endif
/** Bit added to #edb_copy.mc_new to tell the writer thread that no more
 * buffers will follow.
 */
10010 #define EDB_EOF 0x10
10011
10012
/** State shared between the page-walking thread and the writer thread of a
 * compacting copy (see edb_env_copyfd1(), edb_env_cwalk(), edb_env_copythr()).
 */
10013 typedef struct edb_copy {
10014 EDB_env *mc_env; /* environment being copied */
10015 EDB_txn *mc_txn; /* read txn the page walk runs in */
10016 pthread_mutex_t mc_mutex; /* held while mc_new is changed or waited on */
10017 pthread_cond_t mc_cond; /* signalled whenever mc_new changes */
10018 char *mc_wbuf[2]; /* the two write buffers */
10019 char *mc_over[2]; /* per buffer: extra data to write after the buffer (tail of an overflow page run) */
10020 int mc_wlen[2]; /* per buffer: bytes currently filled */
10021 int mc_olen[2]; /* per buffer: length of mc_over data, 0 if none */
10022 pgno_t mc_next_pgno; /* page number the next copied page receives */
10023 HANDLE mc_fd; /* destination file handle */
10024 int mc_toggle; /* index of the buffer the walker is currently filling */
10025 int mc_new; /* number of buffers handed to the writer, plus EDB_EOF at the end */
10026
10027 /* Error code of the copy.  Set by either thread; the code shown here never */
10028 /* resets it to zero, and it is accessed without taking mc_mutex. */
10029 volatile int mc_error;
10030 } edb_copy;
10031
10032
/** Writer thread of the compacting copy.
 *
 * Waits on my->mc_cond until the walker hands over a buffer (mc_new != 0),
 * writes that buffer and any attached overflow data to my->mc_fd, then
 * decrements mc_new and signals the condition so the buffer can be reused.
 * The loop ends when mc_new equals #EDB_EOF with no buffers pending.
 * A write failure is stored in my->mc_error; once mc_error is set, further
 * buffers are consumed without being written.
 *
 * @param[in] arg  The shared #edb_copy state.
 * @return Always (THREAD_RET)0; errors are reported through mc_error.
 */
10033 static THREAD_RET ESECT CALL_CONV
10034 edb_env_copythr(void *arg)
10035 {
10036 edb_copy *my = arg;
10037 char *ptr;
10038 int toggle = 0, wsize, rc;
10039 #ifdef _WIN32
10040 DWORD len;
10041 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
10042 #else
10043 int len;
10044 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
10045 #ifdef SIGPIPE
/* Block SIGPIPE in this thread so a closed pipe shows up as an EPIPE write error. */
10046 sigset_t set;
10047 sigemptyset(&set);
10048 sigaddset(&set, SIGPIPE);
10049 if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0)
10050 my->mc_error = rc;
10051 #endif
10052 #endif
10053
10054 pthread_mutex_lock(&my->mc_mutex);
10055 for(;;) {
10056 while (!my->mc_new)
10057 pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
10058 if (my->mc_new == 0 + EDB_EOF) /* 0 buffers pending, just the EOF mark */
10059 break;
10060 wsize = my->mc_wlen[toggle];
10061 ptr = my->mc_wbuf[toggle];
10062 again:
10063 rc = EDB_SUCCESS;
10064 while (wsize > 0 && !my->mc_error) {
10065 DO_WRITE(rc, my->mc_fd, ptr, wsize, len);
10066 if (!rc) {
10067 rc = ErrCode();
10068 #if defined(SIGPIPE) && !defined(_WIN32)
10069 if (rc == EPIPE) {
10070
10071 /* SIGPIPE is blocked in this thread, so collect the pending signal here */
10072 /* (presumably so it is not delivered to the process later - confirm). */
10073 int tmp;
10074 sigwait(&set, &tmp);
10075 }
10076 #endif
10077 break;
10078 } else if (len > 0) { /* partial or full write: advance and keep going */
10079 rc = EDB_SUCCESS;
10080 ptr += len;
10081 wsize -= len;
10082 continue;
10083 } else { /* a successful write of 0 bytes is treated as an I/O error */
10084 rc = EIO;
10085 break;
10086 }
10087 }
10088 if (rc) {
10089 my->mc_error = rc;
10090 }
10091 /* If extra overflow data is attached to this buffer, write it too. */
10092 if (my->mc_olen[toggle]) {
10093 wsize = my->mc_olen[toggle];
10094 ptr = my->mc_over[toggle];
10095 my->mc_olen[toggle] = 0;
10096 goto again;
10097 }
10098 my->mc_wlen[toggle] = 0;
10099 toggle ^= 1;
10100 /* Return the now-empty buffer to the walker. */
10101 my->mc_new--;
10102 pthread_cond_signal(&my->mc_cond);
10103 }
10104 pthread_mutex_unlock(&my->mc_mutex);
10105 return (THREAD_RET)0;
10106 #undef DO_WRITE
10107 }
10108
10109
10110
10111
10112
10113
10114 static int ESECT
10115 edb_env_cthr_toggle(edb_copy *my, int adjust)
10116 {
10117 pthread_mutex_lock(&my->mc_mutex);
10118 my->mc_new += adjust;
10119 pthread_cond_signal(&my->mc_cond);
10120 while (my->mc_new & 2)
10121 pthread_cond_wait(&my->mc_cond, &my->mc_mutex);
10122 pthread_mutex_unlock(&my->mc_mutex);
10123
10124 my->mc_toggle ^= (adjust & 1);
10125
10126 my->mc_wlen[my->mc_toggle] = 0;
10127 return my->mc_error;
10128 }
10129
10130
10131
10132
10133
10134
/** Depth-first tree walk for the compacting copy.
 *
 * Visits every page of the tree rooted at *pg, copies each page into the
 * current write buffer and gives it the next sequential page number
 * (my->mc_next_pgno).  Parent branch nodes are rewritten with the new child
 * page numbers.  On leaf pages, overflow pages (F_BIGDATA) are copied and
 * sub-databases (F_SUBDATA) are walked recursively.
 *
 * @param[in,out] my  Shared copy state; buffers, mc_next_pgno and mc_toggle
 *	are updated.
 * @param[in,out] pg  Root page number of the tree to walk.  Replaced with the
 *	root's new page number.  Nothing is done when it is P_INVALID.
 * @param[in] flags  Node flags of the record owning this tree (0 for the main
 *	DB).  When F_DUPDATA is set, leaf nodes are not examined for overflow
 *	pages or sub-databases.
 * @return 0 on success, non-zero on failure.
 */
10135 static int ESECT
10136 edb_env_cwalk(edb_copy *my, pgno_t *pg, int flags)
10137 {
10138 EDB_cursor mc = {0};
10139 EDB_node *ni;
10140 EDB_page *mo, *mp, *leaf;
10141 char *buf, *ptr;
10142 int rc, toggle;
10143 unsigned int i;
10144
10145 /* Empty DB, nothing to do */
10146 if (*pg == P_INVALID)
10147 return EDB_SUCCESS;
10148
10149 mc.mc_snum = 1;
10150 mc.mc_txn = my->mc_txn;
10151 mc.mc_flags = my->mc_txn->mt_flags & (C_ORIG_RDONLY|C_WRITEMAP);
10152
10153 rc = edb_page_get(&mc, *pg, &mc.mc_pg[0], NULL);
10154 if (rc)
10155 return rc;
10156 rc = edb_page_search_root(&mc, NULL, EDB_PS_FIRST);
10157 if (rc)
10158 return rc;
10159
10160 /* Allocate one private page per stack level so the cursor's pages can be modified. */
10161 buf = ptr = malloc(my->mc_env->me_psize * mc.mc_snum);
10162 if (buf == NULL)
10163 return ENOMEM;
10164
10165 for (i=0; i<mc.mc_top; i++) {
10166 edb_page_copy((EDB_page *)ptr, mc.mc_pg[i], my->mc_env->me_psize);
10167 mc.mc_pg[i] = (EDB_page *)ptr;
10168 ptr += my->mc_env->me_psize;
10169 }
10170
10171 /* The last slot of buf is writable space for a leaf page; used only when a leaf must be patched. */
10172 leaf = (EDB_page *)ptr;
10173
10174 toggle = my->mc_toggle;
10175 while (mc.mc_snum > 0) {
10176 unsigned n;
10177 mp = mc.mc_pg[mc.mc_top];
10178 n = NUMKEYS(mp);
10179
10180 if (IS_LEAF(mp)) {
10181 if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
10182 for (i=0; i<n; i++) {
10183 ni = NODEPTR(mp, i);
10184 if (ni->mn_flags & F_BIGDATA) {
10185 EDB_page *omp;
10186 pgno_t pg; /* note: shadows the function parameter pg inside this block */
10187
10188 /* Need a writable leaf: switch to the private copy before patching the node. */
10189 if (mp != leaf) {
10190 mc.mc_pg[mc.mc_top] = leaf;
10191 edb_page_copy(leaf, mp, my->mc_env->me_psize);
10192 mp = leaf;
10193 ni = NODEPTR(mp, i);
10194 }
10195 /* Read the old overflow page number and replace it with the new one. */
10196 memcpy(&pg, NODEDATA(ni), sizeof(pg));
10197 memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t));
10198 rc = edb_page_get(&mc, pg, &omp, NULL);
10199 if (rc)
10200 goto done;
10201 if (my->mc_wlen[toggle] >= EDB_WBUF) {
10202 rc = edb_env_cthr_toggle(my, 1);
10203 if (rc)
10204 goto done;
10205 toggle = my->mc_toggle;
10206 }
10207 mo = (EDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
10208 memcpy(mo, omp, my->mc_env->me_psize);
10209 mo->mp_pgno = my->mc_next_pgno;
10210 my->mc_next_pgno += omp->mp_pages;
10211 my->mc_wlen[toggle] += my->mc_env->me_psize;
10212 if (omp->mp_pages > 1) { /* remaining pages of the run are written straight from omp */
10213 my->mc_olen[toggle] = my->mc_env->me_psize * (omp->mp_pages - 1);
10214 my->mc_over[toggle] = (char *)omp + my->mc_env->me_psize;
10215 rc = edb_env_cthr_toggle(my, 1);
10216 if (rc)
10217 goto done;
10218 toggle = my->mc_toggle;
10219 }
10220 } else if (ni->mn_flags & F_SUBDATA) {
10221 EDB_db db;
10222
10223 /* Need a writable leaf: switch to the private copy before patching the node. */
10224 if (mp != leaf) {
10225 mc.mc_pg[mc.mc_top] = leaf;
10226 edb_page_copy(leaf, mp, my->mc_env->me_psize);
10227 mp = leaf;
10228 ni = NODEPTR(mp, i);
10229 }
10230 /* Walk the sub-database; the recursion updates db.md_root, which is then stored back. */
10231 memcpy(&db, NODEDATA(ni), sizeof(db));
10232 my->mc_toggle = toggle;
10233 rc = edb_env_cwalk(my, &db.md_root, ni->mn_flags & F_DUPDATA);
10234 if (rc)
10235 goto done;
10236 toggle = my->mc_toggle;
10237 memcpy(NODEDATA(ni), &db, sizeof(db));
10238 }
10239 }
10240 }
10241 } else {
10242 mc.mc_ki[mc.mc_top]++;
10243 if (mc.mc_ki[mc.mc_top] < n) {
10244 pgno_t pg; /* note: shadows the function parameter pg inside this block */
10245 again:
10246 ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
10247 pg = NODEPGNO(ni);
10248 rc = edb_page_get(&mc, pg, &mp, NULL);
10249 if (rc)
10250 goto done;
10251 mc.mc_top++;
10252 mc.mc_snum++;
10253 mc.mc_ki[mc.mc_top] = 0;
10254 if (IS_BRANCH(mp)) {
10255
10256 /* After advancing to a sibling branch page, keep descending */
10257 /* through first children until a leaf is reached. */
10258 edb_page_copy(mc.mc_pg[mc.mc_top], mp, my->mc_env->me_psize);
10259 goto again;
10260 } else
10261 mc.mc_pg[mc.mc_top] = mp;
10262 continue;
10263 }
10264 }
/* The current page is finished (leaf scanned, or branch exhausted): emit it. */
10265 if (my->mc_wlen[toggle] >= EDB_WBUF) {
10266 rc = edb_env_cthr_toggle(my, 1);
10267 if (rc)
10268 goto done;
10269 toggle = my->mc_toggle;
10270 }
10271 mo = (EDB_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
10272 edb_page_copy(mo, mp, my->mc_env->me_psize);
10273 mo->mp_pgno = my->mc_next_pgno++;
10274 my->mc_wlen[toggle] += my->mc_env->me_psize;
10275 if (mc.mc_top) {
10276 /* Record the page's new number in its parent node, then go up one level */
10277 ni = NODEPTR(mc.mc_pg[mc.mc_top-1], mc.mc_ki[mc.mc_top-1]);
10278 SETPGNO(ni, mo->mp_pgno);
10279 edb_cursor_pop(&mc);
10280 } else {
10281 /* The root was just written: report its new page number and stop */
10282 *pg = mo->mp_pgno;
10283 break;
10284 }
10285 }
10286 done:
10287 free(buf);
10288 return rc;
10289 }
10290
10291
/** Copy the environment to a file handle with compaction.
 *
 * Sets up two write buffers and a writer thread (edb_env_copythr()), builds
 * fresh meta pages in the first buffer, then walks the main DB with
 * edb_env_cwalk() inside a read-only transaction so that pages are written
 * with sequential page numbers.
 *
 * @param[in] env  Environment to copy.
 * @param[in] fd   Destination file handle.
 * @return 0 on success, non-zero on failure.  #EDB_INCOMPATIBLE is returned
 *	when the root page number produced by the walk differs from the one
 *	predicted from the free-page count.
 */
10292 static int ESECT
10293 edb_env_copyfd1(EDB_env *env, HANDLE fd)
10294 {
10295 EDB_meta *mm;
10296 EDB_page *mp;
10297 edb_copy my = {0};
10298 EDB_txn *txn = NULL;
10299 pthread_t thr;
10300 pgno_t root, new_root;
10301 int rc = EDB_SUCCESS;
10302
10303 #ifdef _WIN32
10304 if (!(my.mc_mutex = CreateMutex(NULL, FALSE, NULL)) ||
10305 !(my.mc_cond = CreateEvent(NULL, FALSE, FALSE, NULL))) {
10306 rc = ErrCode();
10307 goto done;
10308 }
10309 my.mc_wbuf[0] = _aligned_malloc(EDB_WBUF*2, env->me_os_psize);
10310 if (my.mc_wbuf[0] == NULL) {
10311 /* report the allocation failure as a Windows error code */
10312 rc = ERROR_NOT_ENOUGH_MEMORY;
10313 goto done;
10314 }
10315 #else
10316 if ((rc = pthread_mutex_init(&my.mc_mutex, NULL)) != 0)
10317 return rc;
10318 if ((rc = pthread_cond_init(&my.mc_cond, NULL)) != 0)
10319 goto done2;
10320 #ifdef HAVE_MEMALIGN
10321 my.mc_wbuf[0] = memalign(env->me_os_psize, EDB_WBUF*2);
10322 if (my.mc_wbuf[0] == NULL) {
10323 rc = errno;
10324 goto done;
10325 }
10326 #else
10327 {
10328 void *p;
10329 if ((rc = posix_memalign(&p, env->me_os_psize, EDB_WBUF*2)) != 0)
10330 goto done;
10331 my.mc_wbuf[0] = p;
10332 }
10333 #endif
10334 #endif
/* One allocation holds both buffers; the second starts EDB_WBUF bytes in. */
10335 memset(my.mc_wbuf[0], 0, EDB_WBUF*2);
10336 my.mc_wbuf[1] = my.mc_wbuf[0] + EDB_WBUF;
10337 my.mc_next_pgno = NUM_METAS;
10338 my.mc_env = env;
10339 my.mc_fd = fd;
10340 rc = THREAD_CREATE(thr, edb_env_copythr, &my);
10341 if (rc)
10342 {
/* NOTE(review): the failure code is in rc, yet errno is what gets passed to the diagnostic - confirm this is intended. */
10343 NDRX_PLATF_DIAG(NDRX_DIAG_PTHREAD_CREATE, errno, "edb_env_copyfd1");
10344 goto done;
10345 }
10346
10347 rc = edb_txn_begin(env, NULL, EDB_RDONLY, &txn);
10348 if (rc)
10349 goto finish;
10350 /* Build meta page 0 at the start of the first buffer. */
10351 mp = (EDB_page *)my.mc_wbuf[0];
10352 memset(mp, 0, NUM_METAS * env->me_psize);
10353 mp->mp_pgno = 0;
10354 mp->mp_flags = P_META;
10355 mm = (EDB_meta *)METADATA(mp);
10356 edb_env_init_meta0(env, mm);
10357 mm->mm_address = env->me_metas[0]->mm_address;
10358 /* Meta page 1 starts as a copy of meta page 0; mm points at page 1 from here on. */
10359 mp = (EDB_page *)(my.mc_wbuf[0] + env->me_psize);
10360 mp->mp_pgno = 1;
10361 mp->mp_flags = P_META;
10362 *(EDB_meta *)METADATA(mp) = *mm;
10363 mm = (EDB_meta *)METADATA(mp);
10364
10365 /* Fill meta page 1 with the current main DB. */
10366 root = new_root = txn->mt_dbs[MAIN_DBI].md_root;
10367 if (root != P_INVALID) {
10368
10369 /* Count the free pages recorded in the free DB plus the free DB's own pages, */
10370 /* and subtract them from the last used page number to get the new last page / root. */
10371 EDB_ID freecount = 0;
10372 EDB_cursor mc;
10373 EDB_val key, data;
10374 edb_cursor_init(&mc, txn, FREE_DBI, NULL);
10375 while ((rc = edb_cursor_get(&mc, &key, &data, EDB_NEXT)) == 0)
10376 freecount += *(EDB_ID *)data.mv_data;
10377 if (rc != EDB_NOTFOUND)
10378 goto finish;
10379 freecount += txn->mt_dbs[FREE_DBI].md_branch_pages +
10380 txn->mt_dbs[FREE_DBI].md_leaf_pages +
10381 txn->mt_dbs[FREE_DBI].md_overflow_pages;
10382
10383 new_root = txn->mt_next_pgno - 1 - freecount;
10384 mm->mm_last_pg = new_root;
10385 mm->mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI];
10386 mm->mm_dbs[MAIN_DBI].md_root = new_root;
10387 } else {
10388
10389
10390 /* Empty main DB: only its flags are carried over to the copy. */
10391 mm->mm_dbs[MAIN_DBI].md_flags = txn->mt_dbs[MAIN_DBI].md_flags;
10392 }
10393 if (root != P_INVALID || mm->mm_dbs[MAIN_DBI].md_flags) {
/* Meta page 1 carries data or flags: give it txnid 1 (presumably so it is preferred over page 0 - confirm). */
10394 mm->mm_txnid = 1;
10395 }
10396
10397 my.mc_wlen[0] = env->me_psize * NUM_METAS;
10398 my.mc_txn = txn;
10399 rc = edb_env_cwalk(&my, &root, 0);
10400 if (rc == EDB_SUCCESS && root != new_root) {
10401 rc = EDB_INCOMPATIBLE;
10402 }
10403
10404 finish:
10405 if (rc)
10406 my.mc_error = rc;
/* Hand over the last buffer together with the EOF mark, then wait for the writer thread. */
10407 edb_env_cthr_toggle(&my, 1 | EDB_EOF);
10408 rc = THREAD_FINISH(thr);
10409 edb_txn_abort(txn);
10410
10411 done:
10412 #ifdef _WIN32
10413 if (my.mc_wbuf[0]) _aligned_free(my.mc_wbuf[0]);
10414 if (my.mc_cond) CloseHandle(my.mc_cond);
10415 if (my.mc_mutex) CloseHandle(my.mc_mutex);
10416 #else
10417 free(my.mc_wbuf[0]);
10418 pthread_cond_destroy(&my.mc_cond);
10419 done2:
10420 pthread_mutex_destroy(&my.mc_mutex);
10421 #endif
/* An error from joining the thread takes precedence; otherwise report the copy's own error. */
10422 return rc ? rc : my.mc_error;
10423 }
10424
10425
/** Copy the environment to a file handle as-is (no compaction).
 *
 * Writes the NUM_METAS meta pages from the memory map while holding the
 * writer mutex (when the environment has a lock region), then writes the
 * remaining pages up to the transaction's mt_next_pgno, limited to the
 * current size of the data file.
 *
 * @param[in] env  Environment to copy.
 * @param[in] fd   Destination file handle.
 * @return 0 on success, non-zero on failure.
 */
10426 static int ESECT
10427 edb_env_copyfd0(EDB_env *env, HANDLE fd)
10428 {
10429 EDB_txn *txn = NULL;
10430 edb_mutexref_t wmutex = NULL;
10431 int rc;
10432 edb_size_t wsize, w3;
10433 char *ptr;
10434 #ifdef _WIN32
10435 DWORD len, w2;
10436 #define DO_WRITE(rc, fd, ptr, w2, len) rc = WriteFile(fd, ptr, w2, &len, NULL)
10437 #else
10438 ssize_t len;
10439 size_t w2;
10440 #define DO_WRITE(rc, fd, ptr, w2, len) len = write(fd, ptr, w2); rc = (len >= 0)
10441 #endif
10442
10443
10444
10445 /* Begin a read txn first; when a lock region exists it is reset and renewed below under the writer mutex. */
10446 rc = edb_txn_begin(env, NULL, EDB_RDONLY, &txn);
10447 if (rc)
10448 return rc;
10449
10450 if (env->me_txns) {
10451 /* The actual read snapshot must be taken after writers are blocked, so reset the txn for now */
10452 edb_txn_end(txn, EDB_END_RESET_TMP);
10453
10454 /* Temporarily block writers until the meta pages have been written out */
10455 wmutex = env->me_wmutex;
10456 if (LOCK_MUTEX(rc, env, wmutex))
10457 goto leave;
10458
10459 rc = edb_txn_renew0(txn);
10460 if (rc) {
10461 UNLOCK_MUTEX(wmutex);
10462 goto leave;
10463 }
10464 }
10465 /* First pass: the meta pages at the start of the map. */
10466 wsize = env->me_psize * NUM_METAS;
10467 ptr = env->me_map;
10468 w2 = wsize;
10469 while (w2 > 0) {
10470 DO_WRITE(rc, fd, ptr, w2, len);
10471 if (!rc) {
10472 rc = ErrCode();
10473 break;
10474 } else if (len > 0) {
10475 rc = EDB_SUCCESS;
10476 ptr += len;
10477 w2 -= len;
10478 continue;
10479 } else {
10480 /* a successful write of 0 bytes is treated as an I/O error */
10481 rc = EIO;
10482 break;
10483 }
10484 }
10485 if (wmutex)
10486 UNLOCK_MUTEX(wmutex);
10487
10488 if (rc)
10489 goto leave;
10490 /* Second pass: everything after the meta pages, up to the used size capped at the file size. */
10491 w3 = txn->mt_next_pgno * env->me_psize;
10492 {
10493 edb_size_t fsize = 0;
10494 if ((rc = edb_fsize(env->me_fd, &fsize)))
10495 goto leave;
10496 if (w3 > fsize)
10497 w3 = fsize;
10498 }
10499 wsize = w3 - wsize;
10500 while (wsize > 0) {
10501 if (wsize > MAX_WRITE) /* write in chunks of at most MAX_WRITE bytes */
10502 w2 = MAX_WRITE;
10503 else
10504 w2 = wsize;
10505 DO_WRITE(rc, fd, ptr, w2, len);
10506 if (!rc) {
10507 rc = ErrCode();
10508 break;
10509 } else if (len > 0) {
10510 rc = EDB_SUCCESS;
10511 ptr += len;
10512 wsize -= len;
10513 continue;
10514 } else {
10515 rc = EIO;
10516 break;
10517 }
10518 }
10519
10520 leave:
10521 edb_txn_abort(txn);
10522 return rc;
10523 }
10524
10525 int ESECT
10526 edb_env_copyfd2(EDB_env *env, HANDLE fd, unsigned int flags)
10527 {
10528 if (flags & EDB_CP_COMPACT)
10529 return edb_env_copyfd1(env, fd);
10530 else
10531 return edb_env_copyfd0(env, fd);
10532 }
10533
10534 int ESECT
10535 edb_env_copyfd(EDB_env *env, HANDLE fd)
10536 {
10537 return edb_env_copyfd2(env, fd, 0);
10538 }
10539
10540 int ESECT
10541 edb_env_copy2(EDB_env *env, const char *path, unsigned int flags)
10542 {
10543 int rc;
10544 EDB_name fname;
10545 HANDLE newfd = INVALID_HANDLE_VALUE;
10546
10547 rc = edb_fname_init(path, env->me_flags | EDB_NOLOCK, &fname);
10548 if (rc == EDB_SUCCESS) {
10549 rc = edb_fopen(env, &fname, EDB_O_COPY, 0666, &newfd);
10550 edb_fname_destroy(fname);
10551 }
10552 if (rc == EDB_SUCCESS) {
10553 rc = edb_env_copyfd2(env, newfd, flags);
10554 if (close(newfd) < 0 && rc == EDB_SUCCESS)
10555 rc = ErrCode();
10556 }
10557 return rc;
10558 }
10559
10560 int ESECT
10561 edb_env_copy(EDB_env *env, const char *path)
10562 {
10563 return edb_env_copy2(env, path, 0);
10564 }
10565
10566 int ESECT
10567 edb_env_set_flags(EDB_env *env, unsigned int flag, int onoff)
10568 {
10569 if (flag & ~CHANGEABLE)
10570 return EINVAL;
10571 if (onoff)
10572 env->me_flags |= flag;
10573 else
10574 env->me_flags &= ~flag;
10575 return EDB_SUCCESS;
10576 }
10577
10578 int ESECT
10579 edb_env_get_flags(EDB_env *env, unsigned int *arg)
10580 {
10581 if (!env || !arg)
10582 return EINVAL;
10583
10584 *arg = env->me_flags & (CHANGEABLE|CHANGELESS);
10585 return EDB_SUCCESS;
10586 }
10587
10588 int ESECT
10589 edb_env_set_userctx(EDB_env *env, void *ctx)
10590 {
10591 if (!env)
10592 return EINVAL;
10593 env->me_userctx = ctx;
10594 return EDB_SUCCESS;
10595 }
10596
10597 void * ESECT
10598 edb_env_get_userctx(EDB_env *env)
10599 {
10600 return env ? env->me_userctx : NULL;
10601 }
10602
10603 int ESECT
10604 edb_env_set_assert(EDB_env *env, EDB_assert_func *func)
10605 {
10606 if (!env)
10607 return EINVAL;
10608 #ifndef NDEBUG
10609 env->me_assert_func = func;
10610 #endif
10611 return EDB_SUCCESS;
10612 }
10613
10614 int ESECT
10615 edb_env_get_path(EDB_env *env, const char **arg)
10616 {
10617 if (!env || !arg)
10618 return EINVAL;
10619
10620 *arg = env->me_path;
10621 return EDB_SUCCESS;
10622 }
10623
10624 int ESECT
10625 edb_env_get_fd(EDB_env *env, edb_filehandle_t *arg)
10626 {
10627 if (!env || !arg)
10628 return EINVAL;
10629
10630 *arg = env->me_fd;
10631 return EDB_SUCCESS;
10632 }
10633
10634
10635
10636
10637
10638
10639
10640 static int ESECT
10641 edb_stat0(EDB_env *env, EDB_db *db, EDB_stat *arg)
10642 {
10643 arg->ms_psize = env->me_psize;
10644 arg->ms_depth = db->md_depth;
10645 arg->ms_branch_pages = db->md_branch_pages;
10646 arg->ms_leaf_pages = db->md_leaf_pages;
10647 arg->ms_overflow_pages = db->md_overflow_pages;
10648 arg->ms_entries = db->md_entries;
10649
10650 return EDB_SUCCESS;
10651 }
10652
10653 int ESECT
10654 edb_env_stat(EDB_env *env, EDB_stat *arg)
10655 {
10656 EDB_meta *meta;
10657
10658 if (env == NULL || arg == NULL)
10659 return EINVAL;
10660
10661 meta = edb_env_pick_meta(env);
10662
10663 return edb_stat0(env, &meta->mm_dbs[MAIN_DBI], arg);
10664 }
10665
10666 int ESECT
10667 edb_env_info(EDB_env *env, EDB_envinfo *arg)
10668 {
10669 EDB_meta *meta;
10670
10671 if (env == NULL || arg == NULL)
10672 return EINVAL;
10673
10674 meta = edb_env_pick_meta(env);
10675 arg->me_mapaddr = meta->mm_address;
10676 arg->me_last_pgno = meta->mm_last_pg;
10677 arg->me_last_txnid = meta->mm_txnid;
10678
10679 arg->me_mapsize = env->me_mapsize;
10680 arg->me_maxreaders = env->me_maxreaders;
10681 arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0;
10682 return EDB_SUCCESS;
10683 }
10684
10685
10686
10687
10688
10689
10690
10691
10692 static void
10693 edb_default_cmp(EDB_txn *txn, EDB_dbi dbi)
10694 {
10695 uint16_t f = txn->mt_dbs[dbi].md_flags;
10696
10697 txn->mt_dbxs[dbi].md_cmp =
10698 (f & EDB_REVERSEKEY) ? edb_cmp_memnr :
10699 (f & EDB_INTEGERKEY) ? edb_cmp_cint : edb_cmp_memn;
10700
10701 txn->mt_dbxs[dbi].md_dcmp =
10702 !(f & EDB_DUPSORT) ? 0 :
10703 ((f & EDB_INTEGERDUP)
10704 ? ((f & EDB_DUPFIXED) ? edb_cmp_int : edb_cmp_cint)
10705 : ((f & EDB_REVERSEDUP) ? edb_cmp_memnr : edb_cmp_memn));
10706 }
10707
/** Open a database in the environment and return its handle.
 *
 * A NULL \b name selects the main database.  For a named database the
 * transaction's handle table is searched first; if the name is not yet
 * open, its record is looked up in the main database and, with #EDB_CREATE,
 * created when missing.  The handle is then entered in the first unused
 * slot, or in a new slot at the end of the table.
 *
 * @param[in] txn    Transaction to open the database in.
 * @param[in] name   Database name, or NULL for the main database.
 * @param[in] flags  Open flags; bits outside #VALID_FLAGS are rejected.
 * @param[out] dbi   Receives the database handle on success.
 * @return 0 on success.  Errors returned directly here: EINVAL (bad flags),
 *	#EDB_BAD_TXN (blocked txn), #EDB_DBS_FULL (no free slot),
 *	#EDB_INCOMPATIBLE or #EDB_NOTFOUND (main DB flags forbid named
 *	databases, or the record is not a sub-database), EACCES (creation in a
 *	read-only txn), ENOMEM (name copy failed); lookup and put errors are
 *	passed through.
 */
10708 int edb_dbi_open(EDB_txn *txn, const char *name, unsigned int flags, EDB_dbi *dbi)
10709 {
10710 EDB_val key, data;
10711 EDB_dbi i;
10712 EDB_cursor mc;
10713 EDB_db dummy;
10714 int rc, dbflag, exact;
10715 unsigned int unused = 0, seq;
10716 char *namedup;
10717 size_t len;
10718
10719 if (flags & ~VALID_FLAGS)
10720 return EINVAL;
10721 if (txn->mt_flags & EDB_TXN_BLOCKED)
10722 return EDB_BAD_TXN;
10723
10724 /* main DB? */
10725 if (!name) {
10726 *dbi = MAIN_DBI;
10727 if (flags & PERSISTENT_FLAGS) {
10728 uint16_t f2 = flags & PERSISTENT_FLAGS;
10729 /* add any persistent flags that are not set yet and mark the txn dirty */
10730 if ((txn->mt_dbs[MAIN_DBI].md_flags | f2) != txn->mt_dbs[MAIN_DBI].md_flags) {
10731 txn->mt_dbs[MAIN_DBI].md_flags |= f2;
10732 txn->mt_flags |= EDB_TXN_DIRTY;
10733 }
10734 }
10735 edb_default_cmp(txn, MAIN_DBI);
10736 return EDB_SUCCESS;
10737 }
10738 /* The main DB is searched below, so make sure it has a key comparator. */
10739 if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) {
10740 edb_default_cmp(txn, MAIN_DBI);
10741 }
10742
10743 /* Is the DB already open? */
10744 len = strlen(name);
10745 for (i=CORE_DBS; i<txn->mt_nuedbs; i++) {
10746 if (!txn->mt_dbxs[i].md_name.mv_size) {
10747 /* Remember the first free slot */
10748 if (!unused) unused = i;
10749 continue;
10750 }
10751 if (len == txn->mt_dbxs[i].md_name.mv_size &&
10752 !strncmp(name, txn->mt_dbxs[i].md_name.mv_data, len)) {
10753 *dbi = i;
10754 return EDB_SUCCESS;
10755 }
10756 }
10757
10758 /* If there is no free slot and the maximum is reached, fail */
10759 if (!unused && txn->mt_nuedbs >= txn->mt_env->me_maxdbs)
10760 return EDB_DBS_FULL;
10761
10762 /* Named databases are refused when the main DB uses EDB_DUPSORT or EDB_INTEGERKEY */
10763 if (txn->mt_dbs[MAIN_DBI].md_flags & (EDB_DUPSORT|EDB_INTEGERKEY))
10764 return (flags & EDB_CREATE) ? EDB_INCOMPATIBLE : EDB_NOTFOUND;
10765
10766 /* Look up the DB record, keyed by name, in the main DB */
10767 dbflag = DB_NEW|DB_VALID|DB_USRVALID;
10768 exact = 0;
10769 key.mv_size = len;
10770 key.mv_data = (void *)name;
10771 edb_cursor_init(&mc, txn, MAIN_DBI, NULL);
10772 rc = edb_cursor_set(&mc, &key, &data, EDB_SET, &exact);
10773 if (rc == EDB_SUCCESS) {
10774 /* make sure the record is a sub-database and not a plain or duplicate data item */
10775 EDB_node *node = NODEPTR(mc.mc_pg[mc.mc_top], mc.mc_ki[mc.mc_top]);
10776 if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)
10777 return EDB_INCOMPATIBLE;
10778 } else {
10779 if (rc != EDB_NOTFOUND || !(flags & EDB_CREATE))
10780 return rc;
10781 if (F_ISSET(txn->mt_flags, EDB_TXN_RDONLY))
10782 return EACCES;
10783 }
10784
10785 /* Copy the name before creating anything, so this allocation cannot fail after a new DB was added */
10786 if ((namedup = strdup(name)) == NULL)
10787 return ENOMEM;
10788
10789 if (rc) {
10790 /* rc is EDB_NOTFOUND and EDB_CREATE was given: store an empty DB record */
10791 data.mv_size = sizeof(EDB_db);
10792 data.mv_data = &dummy;
10793 memset(&dummy, 0, sizeof(dummy));
10794 dummy.md_root = P_INVALID;
10795 dummy.md_flags = flags & PERSISTENT_FLAGS;
10796 WITH_CURSOR_TRACKING(mc,
10797 rc = edb_cursor_put(&mc, &key, &data, F_SUBDATA));
10798 dbflag |= DB_DIRTY;
10799 }
10800
10801 if (rc) {
10802 free(namedup);
10803 } else {
10804 /* Fill in the handle table entry; the entry takes ownership of namedup */
10805 unsigned int slot = unused ? unused : txn->mt_nuedbs;
10806 txn->mt_dbxs[slot].md_name.mv_data = namedup;
10807 txn->mt_dbxs[slot].md_name.mv_size = len;
10808 txn->mt_dbxs[slot].md_rel = NULL;
10809 txn->mt_dbflags[slot] = dbflag;
10810
10811
10812 /* Bump the slot's sequence number in the environment and record the new value in the txn */
10813 seq = ++txn->mt_env->me_dbiseqs[slot];
10814 txn->mt_dbiseqs[slot] = seq;
10815
10816 memcpy(&txn->mt_dbs[slot], data.mv_data, sizeof(EDB_db));
10817 *dbi = slot;
10818 edb_default_cmp(txn, slot);
10819 if (!unused) {
10820 txn->mt_nuedbs++;
10821 }
10822 }
10823
10824 return rc;
10825 }
10826
10827 int ESECT
10828 edb_stat(EDB_txn *txn, EDB_dbi dbi, EDB_stat *arg)
10829 {
10830 if (!arg || !TXN_DBI_EXIST(txn, dbi, DB_VALID))
10831 return EINVAL;
10832
10833 if (txn->mt_flags & EDB_TXN_BLOCKED)
10834 return EDB_BAD_TXN;
10835
10836 if (txn->mt_dbflags[dbi] & DB_STALE) {
10837 EDB_cursor mc;
10838 EDB_xcursor mx;
10839
10840 edb_cursor_init(&mc, txn, dbi, &mx);
10841 }
10842 return edb_stat0(txn->mt_env, &txn->mt_dbs[dbi], arg);
10843 }
10844
10845 void edb_dbi_close(EDB_env *env, EDB_dbi dbi)
10846 {
10847 char *ptr;
10848 if (dbi < CORE_DBS || dbi >= env->me_maxdbs)
10849 return;
10850 ptr = env->me_dbxs[dbi].md_name.mv_data;
10851
10852 if (ptr) {
10853 env->me_dbxs[dbi].md_name.mv_data = NULL;
10854 env->me_dbxs[dbi].md_name.mv_size = 0;
10855 env->me_dbflags[dbi] = 0;
10856 env->me_dbiseqs[dbi]++;
10857 free(ptr);
10858 }
10859 }
10860
10861 int edb_dbi_flags(EDB_txn *txn, EDB_dbi dbi, unsigned int *flags)
10862 {
10863
10864 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
10865 return EINVAL;
10866 *flags = txn->mt_dbs[dbi].md_flags & PERSISTENT_FLAGS;
10867 return EDB_SUCCESS;
10868 }
10869
10870
10871
10872
10873
10874
/** Walk a database's page tree and append its pages to the transaction's
 * free-page list (txn->mt_free_pgs).
 *
 * @param mc    cursor on the database to drop; its C_INITIALIZED flag is
 *              cleared before returning
 * @param subs  non-zero to recurse into leaf nodes flagged F_SUBDATA
 * @return 0 on success (an EDB_NOTFOUND from the initial page search is
 *         also reported as success), otherwise a non-zero error code.
 *         On errors after the initial search the transaction is flagged
 *         EDB_TXN_ERROR.
 */
static int
edb_drop0(EDB_cursor *mc, int subs)
{
	int rc;

	rc = edb_page_search(mc, NULL, EDB_PS_FIRST);
	if (rc == EDB_SUCCESS) {
		EDB_txn *txn = mc->mc_txn;
		EDB_node *ni;
		EDB_cursor mx;
		unsigned int i;

		/* For a sub-cursor (C_SUB), or when there are neither sub-DBs
		 * to recurse into nor overflow pages, pop one level off the
		 * cursor stack up front; presumably this is the leaf level,
		 * so leaves are not scanned node by node.
		 */
		if ((mc->mc_flags & C_SUB) ||
			(!subs && !mc->mc_db->md_overflow_pages))
			edb_cursor_pop(mc);

		/* mx keeps a copy of the starting page stack; it is used below
		 * to restore mc->mc_pg[] after popping a level.
		 */
		edb_cursor_copy(mc, &mx);
#ifdef EDB_VL32
		/* Fetch each page again through mx; paired with the
		 * EDB_CURSOR_UNREF(&mx, 0) at the end (presumably a
		 * reference-count bump - confirm).
		 */
		for (i=0; i<mc->mc_snum; i++)
			edb_page_get(&mx, mc->mc_pg[i]->mp_pgno, &mx.mc_pg[i], NULL);
#endif
		while (mc->mc_snum > 0) {
			EDB_page *mp = mc->mc_pg[mc->mc_top];
			unsigned n = NUMKEYS(mp);
			if (IS_LEAF(mp)) {
				for (i=0; i<n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						EDB_page *omp;
						pgno_t pg;
						/* The node data holds the page number of
						 * the overflow page run.
						 */
						memcpy(&pg, NODEDATA(ni), sizeof(pg));
						rc = edb_page_get(mc, pg, &omp, NULL);
						if (rc != 0)
							goto done;
						edb_cassert(mc, IS_OVERFLOW(omp));
						/* Free the whole run of omp->mp_pages pages */
						rc = edb_eidl_append_range(&txn->mt_free_pgs,
							pg, omp->mp_pages);
						if (rc)
							goto done;
						mc->mc_db->md_overflow_pages -= omp->mp_pages;
						/* Nothing left to find in leaves: stop scanning */
						if (!mc->mc_db->md_overflow_pages && !subs)
							break;
					} else if (subs && (ni->mn_flags & F_SUBDATA)) {
						/* Recurse into the nested DB via the xcursor */
						edb_xcursor_init1(mc, ni);
						rc = edb_drop0(&mc->mc_xcursor->mx_cursor, 0);
						if (rc)
							goto done;
					}
				}
				/* All overflow pages accounted for and no sub-DBs:
				 * skip the remaining leaves and go up a level.
				 */
				if (!subs && !mc->mc_db->md_overflow_pages)
					goto pop;
			} else {
				/* Branch page: reserve room, then free every child page */
				if ((rc = edb_eidl_need(&txn->mt_free_pgs, n)) != 0)
					goto done;
				for (i=0; i<n; i++) {
					pgno_t pg;
					ni = NODEPTR(mp, i);
					pg = NODEPGNO(ni);
					/* add the child page to the free list */
					edb_eidl_xappend(txn->mt_free_pgs, pg);
				}
			}
			/* Only the root level is left on the stack: done walking */
			if (!mc->mc_top)
				break;
			/* Record where the scan of this page stopped, then move to
			 * the next sibling page at this level.
			 */
			mc->mc_ki[mc->mc_top] = i;
			rc = edb_cursor_sibling(mc, 1);
			if (rc) {
				if (rc != EDB_NOTFOUND)
					goto done;
				/* No more siblings: drop this level and restart the
				 * remaining levels from index 0 using the page stack
				 * saved in mx.
				 */
pop:
				edb_cursor_pop(mc);
				mc->mc_ki[0] = 0;
				for (i=1; i<mc->mc_snum; i++) {
					mc->mc_ki[i] = 0;
					mc->mc_pg[i] = mx.mc_pg[i];
				}
			}
		}
		/* Finally free the root page itself */
		rc = edb_eidl_append(&txn->mt_free_pgs, mc->mc_db->md_root);
done:
		if (rc)
			txn->mt_flags |= EDB_TXN_ERROR;
		/* Release whatever mx holds (see the EDB_VL32 loop above) */
		EDB_CURSOR_UNREF(&mx, 0);
	} else if (rc == EDB_NOTFOUND) {
		/* Nothing found by the search: nothing to free */
		rc = EDB_SUCCESS;
	}
	mc->mc_flags &= ~C_INITIALIZED;
	return rc;
}
10975
10976 int edb_drop(EDB_txn *txn, EDB_dbi dbi, int del)
10977 {
10978 EDB_cursor *mc, *m2;
10979 int rc;
10980
10981 if ((unsigned)del > 1 || !TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
10982 return EINVAL;
10983
10984 if (F_ISSET(txn->mt_flags, EDB_TXN_RDONLY))
10985 return EACCES;
10986
10987 if (TXN_DBI_CHANGED(txn, dbi))
10988 return EDB_BAD_DBI;
10989
10990 rc = edb_cursor_open(txn, dbi, &mc);
10991 if (rc)
10992 return rc;
10993
10994 rc = edb_drop0(mc, mc->mc_db->md_flags & EDB_DUPSORT);
10995
10996 for (m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next)
10997 m2->mc_flags &= ~(C_INITIALIZED|C_EOF);
10998 if (rc)
10999 goto leave;
11000
11001
11002 if (del && dbi >= CORE_DBS) {
11003 rc = edb_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA);
11004 if (!rc) {
11005 txn->mt_dbflags[dbi] = DB_STALE;
11006 edb_dbi_close(txn->mt_env, dbi);
11007 } else {
11008 txn->mt_flags |= EDB_TXN_ERROR;
11009 }
11010 } else {
11011
11012 txn->mt_dbflags[dbi] |= DB_DIRTY;
11013 txn->mt_dbs[dbi].md_depth = 0;
11014 txn->mt_dbs[dbi].md_branch_pages = 0;
11015 txn->mt_dbs[dbi].md_leaf_pages = 0;
11016 txn->mt_dbs[dbi].md_overflow_pages = 0;
11017 txn->mt_dbs[dbi].md_entries = 0;
11018 txn->mt_dbs[dbi].md_root = P_INVALID;
11019
11020 txn->mt_flags |= EDB_TXN_DIRTY;
11021 }
11022 leave:
11023 edb_cursor_close(mc);
11024 return rc;
11025 }
11026
11027 int edb_set_compare(EDB_txn *txn, EDB_dbi dbi, EDB_cmp_func *cmp)
11028 {
11029 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
11030 return EINVAL;
11031
11032 txn->mt_dbxs[dbi].md_cmp = cmp;
11033 return EDB_SUCCESS;
11034 }
11035
11036 int edb_set_dupsort(EDB_txn *txn, EDB_dbi dbi, EDB_cmp_func *cmp)
11037 {
11038 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
11039 return EINVAL;
11040
11041 txn->mt_dbxs[dbi].md_dcmp = cmp;
11042 return EDB_SUCCESS;
11043 }
11044
11045 int edb_set_relfunc(EDB_txn *txn, EDB_dbi dbi, EDB_rel_func *rel)
11046 {
11047 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
11048 return EINVAL;
11049
11050 txn->mt_dbxs[dbi].md_rel = rel;
11051 return EDB_SUCCESS;
11052 }
11053
11054 int edb_set_relctx(EDB_txn *txn, EDB_dbi dbi, void *ctx)
11055 {
11056 if (!TXN_DBI_EXIST(txn, dbi, DB_USRVALID))
11057 return EINVAL;
11058
11059 txn->mt_dbxs[dbi].md_relctx = ctx;
11060 return EDB_SUCCESS;
11061 }
11062
11063 int ESECT
11064 edb_env_get_maxkeysize(EDB_env *env)
11065 {
11066 return ENV_MAXKEY(env);
11067 }
11068
11069 int ESECT
11070 edb_reader_list(EDB_env *env, EDB_msg_func *func, void *ctx)
11071 {
11072 unsigned int i, rdrs;
11073 EDB_reader *mr;
11074 char buf[64];
11075 int rc = 0, first = 1;
11076
11077 if (!env || !func)
11078 return -1;
11079 if (!env->me_txns) {
11080 return func("(no reader locks)\n", ctx);
11081 }
11082 rdrs = env->me_txns->mti_numreaders;
11083 mr = env->me_txns->mti_readers;
11084 for (i=0; i<rdrs; i++) {
11085 if (mr[i].mr_pid) {
11086 txnid_t txnid = mr[i].mr_txnid;
11087 sprintf(buf, txnid == (txnid_t)-1 ?
11088 "%10d %"Z"x -\n" : "%10d %"Z"x %"Yu"\n",
11089 (int)mr[i].mr_pid, (size_t)mr[i].mr_tid, txnid);
11090 if (first) {
11091 first = 0;
11092 rc = func(" pid thread txnid\n", ctx);
11093 if (rc < 0)
11094 break;
11095 }
11096 rc = func(buf, ctx);
11097 if (rc < 0)
11098 break;
11099 }
11100 }
11101 if (first) {
11102 rc = func("(no active readers)\n", ctx);
11103 }
11104 return rc;
11105 }
11106
11107
11108
11109
/** Insert a pid into a sorted pid list unless it is already present.
 *
 * The list layout is: ids[0] holds the number of entries, ids[1..ids[0]]
 * hold the pids in ascending order. The caller must provide room for one
 * more element.
 *
 * @param ids  the sorted list, updated in place
 * @param pid  the pid to insert
 * @return 0 if pid was inserted, -1 if it was already in the list.
 */
static int ESECT
edb_pid_insert(EDB_PID_T *ids, EDB_PID_T pid)
{
	/* binary search of pid in list */
	unsigned base = 0;
	unsigned cursor = 1;
	int val = 0;
	unsigned n = ids[0];

	while (0 < n) {
		unsigned pivot = n >> 1;
		cursor = base + pivot + 1;
		/* Three-way compare without subtraction: pid - ids[cursor]
		 * can overflow a signed type (undefined behavior) when the
		 * operands have opposite signs, yielding a wrong ordering.
		 */
		val = (pid > ids[cursor]) - (pid < ids[cursor]);

		if (val < 0) {
			n = pivot;

		} else if (val > 0) {
			base = cursor;
			n -= pivot + 1;

		} else {
			/* found, so it's a duplicate */
			return -1;
		}
	}

	/* cursor is the last slot probed; insert after it if pid is larger */
	if (val > 0) {
		++cursor;
	}
	ids[0]++;
	/* shift the tail up by one to open the slot at cursor */
	for (n = ids[0]; n > cursor; n--)
		ids[n] = ids[n-1];
	ids[n] = pid;
	return 0;
}
11146
11147 int ESECT
11148 edb_reader_check(EDB_env *env, int *dead)
11149 {
11150 if (!env)
11151 return EINVAL;
11152 if (dead)
11153 *dead = 0;
11154 return env->me_txns ? edb_reader_check0(env, 0, dead) : EDB_SUCCESS;
11155 }
11156
11157
/** Scan the reader table and clear slots that belong to processes which
 * edb_reader_pid(env, Pidcheck, pid) reports as gone.
 *
 * @param env      environment; env->me_txns must be non-NULL
 * @param rlocked  non-zero to skip taking env->me_rmutex here
 *                 (presumably because the caller already holds it)
 * @param dead     optional output: number of slots cleared
 * @return EDB_SUCCESS, ENOMEM if the scratch pid list cannot be
 *         allocated, or the error from edb_mutex_failed().
 */
static int ESECT
edb_reader_check0(EDB_env *env, int rlocked, int *dead)
{
	edb_mutexref_t rmutex = rlocked ? NULL : env->me_rmutex;
	unsigned int i, j, rdrs;
	EDB_reader *mr;
	EDB_PID_T *pids, pid;
	int rc = EDB_SUCCESS, count = 0;

	rdrs = env->me_txns->mti_numreaders;
	/* One slot per reader plus pids[0], which holds the list length */
	pids = malloc((rdrs+1) * sizeof(EDB_PID_T));
	if (!pids)
		return ENOMEM;
	pids[0] = 0;
	mr = env->me_txns->mti_readers;
	for (i=0; i<rdrs; i++) {
		pid = mr[i].mr_pid;
		/* Skip empty slots and slots owned by this process */
		if (pid && pid != env->me_pid) {
			/* edb_pid_insert() returns 0 only the first time a pid is
			 * seen, so each foreign pid is probed once.
			 */
			if (edb_pid_insert(pids, pid) == 0) {
				if (!edb_reader_pid(env, Pidcheck, pid)) {
					/* Stale reader found; clearing starts at slot i */
					j = i;
					if (rmutex) {
						if ((rc = LOCK_MUTEX0(rmutex)) != 0) {
							if ((rc = edb_mutex_failed(env, rmutex, rc)))
								break;
							/* Recovery succeeded; zeroing rdrs ends both
							 * the clearing loop and the outer scan
							 * (edb_mutex_failed() runs its own
							 * edb_reader_check0() pass).
							 */
							rdrs = 0;
						} else {
							/* Probe again now that the lock is held; if
							 * the pid checks out, skip the clearing loop.
							 */
							if (edb_reader_pid(env, Pidcheck, pid))
								j = rdrs;
						}
					}
					/* Clear every remaining slot owned by this pid */
					for (; j<rdrs; j++)
						if (mr[j].mr_pid == pid) {
							DPRINTF(("clear stale reader pid %u txn %"Yd,
								(unsigned) pid, mr[j].mr_txnid));
							mr[j].mr_pid = 0;
							count++;
						}
					if (rmutex)
						UNLOCK_MUTEX(rmutex);
				}
			}
		}
	}
	free(pids);
	if (dead)
		*dead = count;
	return rc;
}
11209
11210 #ifdef EDB_ROBUST_SUPPORTED
11211
11212
11213
11214
11215
11216
11217
/** Handle a failed mutex lock attempt.
 *
 * If the failure is EDB_OWNERDEAD, try to recover: resync the shared txn
 * id (for a mutex other than env->me_rmutex), purge stale readers, and
 * mark the mutex consistent. Any other failure is logged and passed
 * through.
 *
 * @param env    environment
 * @param mutex  the mutex whose lock attempt failed
 * @param rc     the error returned by the lock attempt
 * @return 0 if recovery succeeded (the mutex is not unlocked on this
 *         path), otherwise a non-zero error code; when recovery from
 *         EDB_OWNERDEAD fails the mutex is unlocked before returning.
 */
static int ESECT
edb_mutex_failed(EDB_env *env, edb_mutexref_t mutex, int rc)
{
	int rlocked, rc2;
	EDB_meta *meta;

	if (rc == EDB_OWNERDEAD) {
		/* The previous owner died while holding the mutex */
		rc = EDB_SUCCESS;
		rlocked = (mutex == env->me_rmutex);
		if (!rlocked) {
			/* Not the reader mutex: reset the shared txn id from the
			 * meta page chosen by edb_env_pick_meta().
			 */
			meta = edb_env_pick_meta(env);
			env->me_txns->mti_txnid = meta->mm_txnid;
			/* This environment has a txn recorded in me_txn: flag the
			 * env as fatally broken and report EDB_PANIC.
			 */
			if (env->me_txn) {
				env->me_flags |= EDB_FATAL_ERROR;
				env->me_txn = NULL;
				rc = EDB_PANIC;
			}
		}
		DPRINTF(("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
			(rc ? "this process' env is hosed" : "recovering")));
		/* Clear stale reader slots; the reader mutex is taken inside
		 * only when the failed mutex is not the reader mutex itself.
		 */
		rc2 = edb_reader_check0(env, rlocked, NULL);
		if (rc2 == 0)
			rc2 = edb_mutex_consistent(mutex);
		/* An earlier EDB_PANIC wins; otherwise adopt rc2 */
		if (rc || (rc = rc2)) {
			DPRINTF(("LOCK_MUTEX recovery failed, %s", edb_strerror(rc)));
			UNLOCK_MUTEX(mutex);
		}
	} else {
#ifdef _WIN32
		/* On Windows the error code is fetched via ErrCode() */
		rc = ErrCode();
#endif
		DPRINTF(("LOCK_MUTEX failed, %s", edb_strerror(rc)));
	}

	return rc;
}
11259 #endif
11260
11261 #if defined(_WIN32)
11262
11263 static int ESECT
11264 utf8_to_utf16(const char *src, EDB_name *dst, int xtra)
11265 {
11266 int rc, need = 0;
11267 wchar_t *result = NULL;
11268 for (;;) {
11269 need = MultiByteToWideChar(CP_UTF8, 0, src, -1, result, need);
11270 if (!need) {
11271 rc = ErrCode();
11272 free(result);
11273 return rc;
11274 }
11275 if (!result) {
11276 result = malloc(sizeof(wchar_t) * (need + xtra));
11277 if (!result)
11278 return ENOMEM;
11279 continue;
11280 }
11281 dst->mn_alloced = 1;
11282 dst->mn_len = need - 1;
11283 dst->mn_val = result;
11284 return EDB_SUCCESS;
11285 }
11286 }
11287 #endif
11288
11289