*** dbinc/mp.h.orig 2004-02-02 10:24:53.000000000 -0800 --- dbinc/mp.h 2004-02-02 10:26:27.000000000 -0800 *************** *** 149,154 **** --- 149,161 ---- * region lock). */ DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */ + + /* + * We track page puts so that we can decide when allocation is never + * going to succeed. We don't lock the field, all we care about is + * if it changes. + */ + u_int32_t put_counter; /* Count of page put calls. */ }; struct __db_mpool_hash { *** mp/mp_fput.c.orig 2002-08-13 06:26:41.000000000 -0700 --- mp/mp_fput.c 2004-02-02 10:22:35.000000000 -0800 *************** *** 19,24 **** --- 19,26 ---- #include "dbinc/db_shash.h" #include "dbinc/mp.h" + static void __memp_reset_lru __P((DB_ENV *, REGINFO *)); + /* * __memp_fput -- * Mpool file put function. *************** *** 198,202 **** --- 200,255 ---- MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + /* + * On every buffer put we update the buffer generation number and check + * for wraparound. + */ + if (++c_mp->lru_count == UINT32_T_MAX) + __memp_reset_lru(dbenv, dbmp->reginfo); + return (0); } + + /* + * __memp_reset_lru -- + * Reset the cache LRU counter. + */ + static void + __memp_reset_lru(dbenv, memreg) + DB_ENV *dbenv; + REGINFO *memreg; + { + BH *bhp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp; + int bucket; + + c_mp = memreg->primary; + + /* + * Update the counter so all future allocations will start at the + * bottom. + */ + c_mp->lru_count -= MPOOL_BASE_DECREMENT; + + /* Adjust the priority of every buffer in the system. */ + for (hp = R_ADDR(memreg, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. + */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) + if (bhp->priority != UINT32_T_MAX && + bhp->priority > MPOOL_BASE_DECREMENT) + bhp->priority -= MPOOL_BASE_DECREMENT; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + } + } *** mp/mp_alloc.c.orig 2002-08-17 07:23:25.000000000 -0700 --- mp/mp_alloc.c 2004-02-02 10:28:15.000000000 -0800 *************** *** 25,31 **** } HS; static void __memp_bad_buffer __P((DB_MPOOL_HASH *)); - static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *)); /* * __memp_alloc -- --- 25,30 ---- *************** *** 50,57 **** MPOOL *c_mp; MPOOLFILE *bh_mfp; size_t freed_space; ! u_int32_t buckets, buffers, high_priority, max_na, priority; ! int aggressive, ret; void *p; dbenv = dbmp->dbenv; --- 49,57 ---- MPOOL *c_mp; MPOOLFILE *bh_mfp; size_t freed_space; ! u_int32_t buckets, buffers, high_priority, priority, put_counter; ! u_int32_t total_buckets; ! int aggressive, giveup, ret; void *p; dbenv = dbmp->dbenv; *************** *** 59,76 **** dbht = R_ADDR(memreg, c_mp->htab); hp_end = &dbht[c_mp->htab_buckets]; ! buckets = buffers = 0; ! aggressive = 0; c_mp->stat.st_alloc++; /* - * Get aggressive if we've tried to flush the number of pages as are - * in the system without finding space. - */ - max_na = 5 * c_mp->htab_buckets; - - /* * If we're allocating a buffer, and the one we're discarding is the * same size, we don't want to waste the time to re-integrate it into * the shared memory free list. If the DB_MPOOLFILE argument isn't --- 59,71 ---- dbht = R_ADDR(memreg, c_mp->htab); hp_end = &dbht[c_mp->htab_buckets]; ! 
buckets = buffers = put_counter = total_buckets = 0; ! aggressive = giveup = 0; ! hp_tmp = NULL; c_mp->stat.st_alloc++; /* * If we're allocating a buffer, and the one we're discarding is the * same size, we don't want to waste the time to re-integrate it into * the shared memory free list. If the DB_MPOOLFILE argument isn't *************** *** 81,99 **** len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize; R_LOCK(dbenv, memreg); - - /* - * On every buffer allocation we update the buffer generation number - * and check for wraparound. - */ - if (++c_mp->lru_count == UINT32_T_MAX) - __memp_reset_lru(dbenv, memreg, c_mp); - /* * Anything newer than 1/10th of the buffer pool is ignored during * allocation (unless allocation starts failing). */ - DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10); high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10; /* --- 76,85 ---- *************** *** 120,129 **** * We're not holding the region locked here, these statistics * can't be trusted. */ ! if (buckets != 0) { ! if (buckets > c_mp->stat.st_alloc_max_buckets) ! c_mp->stat.st_alloc_max_buckets = buckets; ! c_mp->stat.st_alloc_buckets += buckets; } if (buffers != 0) { if (buffers > c_mp->stat.st_alloc_max_pages) --- 106,116 ---- * We're not holding the region locked here, these statistics * can't be trusted. */ ! total_buckets += buckets; ! if (total_buckets != 0) { ! if (total_buckets > c_mp->stat.st_alloc_max_buckets) ! c_mp->stat.st_alloc_max_buckets = total_buckets; ! c_mp->stat.st_alloc_buckets += total_buckets; } if (buffers != 0) { if (buffers > c_mp->stat.st_alloc_max_pages) *************** *** 131,136 **** --- 118,129 ---- c_mp->stat.st_alloc_pages += buffers; } return (0); + } else if (giveup || c_mp->stat.st_pages == 0) { + R_UNLOCK(dbenv, memreg); + + __db_err(dbenv, + "unable to allocate space from the buffer cache"); + return (ret); } /* *************** *** 138,163 **** * we need. Reset our free-space counter. */ freed_space = 0; /* * Walk the hash buckets and find the next two with potentially useful * buffers. Free the buffer with the lowest priority from the buckets' * chains. */ ! for (hp_tmp = NULL;;) { /* Check for wrap around. */ hp = &dbht[c_mp->last_checked++]; if (hp >= hp_end) { c_mp->last_checked = 0; ! ! /* ! * If we've gone through all of the hash buckets, try ! * an allocation. If the cache is small, the old page ! * size is small, and the new page size is large, we ! * might have freed enough memory (but not 3 times the ! * memory). ! */ ! goto alloc; } /* --- 131,154 ---- * we need. Reset our free-space counter. */ freed_space = 0; + total_buckets += buckets; + buckets = 0; /* * Walk the hash buckets and find the next two with potentially useful * buffers. Free the buffer with the lowest priority from the buckets' * chains. */ ! for (;;) { ! /* All pages have been freed, make one last try */ ! if (c_mp->stat.st_pages == 0) ! goto alloc; ! /* Check for wrap around. */ hp = &dbht[c_mp->last_checked++]; if (hp >= hp_end) { c_mp->last_checked = 0; ! hp = &dbht[c_mp->last_checked++]; } /* *************** *** 172,210 **** /* * The failure mode is when there are too many buffers we can't * write or there's not enough memory in the system. We don't ! * have a metric for deciding if allocation has no possible way ! * to succeed, so we don't ever fail, we assume memory will be ! * available if we wait long enough. * ! * Get aggressive if we've tried to flush 5 times the number of ! * hash buckets as are in the system -- it's possible we have ! 
* been repeatedly trying to flush the same buffers, although ! * it's unlikely. Aggressive means: * * a: set a flag to attempt to flush high priority buffers as * well as other buffers. * b: sync the mpool to force out queue extent pages. While we * might not have enough space for what we want and flushing * is expensive, why not? ! * c: sleep for a second -- hopefully someone else will run and ! * free up some memory. Try to allocate memory too, in case ! * the other thread returns its memory to the region. ! * d: look at a buffer in every hash bucket rather than choose * the more preferable of two. * * !!! * This test ignores pathological cases like no buffers in the * system -- that shouldn't be possible. */ ! if ((++buckets % max_na) == 0) { ! aggressive = 1; ! R_UNLOCK(dbenv, memreg); ! (void)__memp_sync_int( ! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); ! ! (void)__os_sleep(dbenv, 1, 0); R_LOCK(dbenv, memreg); goto alloc; --- 163,221 ---- /* * The failure mode is when there are too many buffers we can't * write or there's not enough memory in the system. We don't ! * have a way to know that allocation has no way to succeed. ! * We fail if there were no pages returned to the cache after ! * we've been trying for a relatively long time. * ! * Get aggressive if we've tried to flush the number of hash ! * buckets as are in the system and have not found any more ! * space. Aggressive means: * * a: set a flag to attempt to flush high priority buffers as * well as other buffers. * b: sync the mpool to force out queue extent pages. While we * might not have enough space for what we want and flushing * is expensive, why not? ! * c: look at a buffer in every hash bucket rather than choose * the more preferable of two. + * d: start to think about giving up. + * + * If we get here twice, sleep for a second, hopefully someone + * else will run and free up some memory. + * + * Always try to allocate memory too, in case some other thread + * returns its memory to the region. * * !!! * This test ignores pathological cases like no buffers in the * system -- that shouldn't be possible. */ ! if ((++buckets % c_mp->htab_buckets) == 0) { ! if (freed_space > 0) ! goto alloc; R_UNLOCK(dbenv, memreg); ! switch (++aggressive) { ! case 1: ! break; ! case 2: ! put_counter = c_mp->put_counter; ! /* FALLTHROUGH */ ! case 3: ! case 4: ! case 5: ! case 6: ! (void)__memp_sync_int( ! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); ! ! (void)__os_sleep(dbenv, 1, 0); ! break; ! default: ! aggressive = 1; ! if (put_counter == c_mp->put_counter) ! giveup = 1; ! break; ! } R_LOCK(dbenv, memreg); goto alloc; *************** *** 277,283 **** * thread may have acquired this buffer and incremented the ref * count after we wrote it, in which case we can't have it. * ! * If there's a write error, avoid selecting this buffer again * by making it the bucket's least-desirable buffer. */ if (ret != 0 || bhp->ref != 0) { --- 288,295 ---- * thread may have acquired this buffer and incremented the ref * count after we wrote it, in which case we can't have it. * ! * If there's a write error and we're having problems finding ! * something to allocate, avoid selecting this buffer again * by making it the bucket's least-desirable buffer. */ if (ret != 0 || bhp->ref != 0) { *************** *** 301,306 **** --- 313,320 ---- freed_space += __db_shsizeof(bhp); __memp_bhfree(dbmp, hp, bhp, 1); + if (aggressive > 1) + aggressive = 1; /* * Unlock this hash bucket and re-acquire the region lock. 
If *************** *** 362,415 **** hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; } - /* - * __memp_reset_lru -- - * Reset the cache LRU counter. - */ - static void - __memp_reset_lru(dbenv, memreg, c_mp) - DB_ENV *dbenv; - REGINFO *memreg; - MPOOL *c_mp; - { - BH *bhp; - DB_MPOOL_HASH *hp; - int bucket; - - /* - * Update the counter so all future allocations will start at the - * bottom. - */ - c_mp->lru_count -= MPOOL_BASE_DECREMENT; - - /* Release the region lock. */ - R_UNLOCK(dbenv, memreg); - - /* Adjust the priority of every buffer in the system. */ - for (hp = R_ADDR(memreg, c_mp->htab), - bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { - /* - * Skip empty buckets. - * - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. - */ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) - continue; - - MUTEX_LOCK(dbenv, &hp->hash_mutex); - for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) - if (bhp->priority != UINT32_T_MAX && - bhp->priority > MPOOL_BASE_DECREMENT) - bhp->priority -= MPOOL_BASE_DECREMENT; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - } - - /* Reacquire the region lock. */ - R_LOCK(dbenv, memreg); - } - #ifdef DIAGNOSTIC /* * __memp_check_order -- --- 376,381 ---- *** dbreg/dbreg_rec.c.orig 2002-08-17 07:22:52.000000000 -0700 --- dbreg/dbreg_rec.c 2003-11-08 10:59:19.000000000 -0800 *************** *** 174,192 **** * Typically, closes should match an open which means * that if this is a close, there should be a valid * entry in the dbentry table when we get here, ! * however there is an exception. If this is an * OPENFILES pass, then we may have started from * a log file other than the first, and the * corresponding open appears in an earlier file. ! * We can ignore that case, but all others are errors. */ dbe = &dblp->dbentry[argp->fileid]; if (dbe->dbp == NULL && !dbe->deleted) { /* No valid entry here. */ ! if ((argp->opcode != LOG_CLOSE && ! argp->opcode != LOG_RCLOSE) || ! (op != DB_TXN_OPENFILES && ! op !=DB_TXN_POPENFILES)) { __db_err(dbenv, "Improper file close at %lu/%lu", (u_long)lsnp->file, --- 174,193 ---- * Typically, closes should match an open which means * that if this is a close, there should be a valid * entry in the dbentry table when we get here, ! * however there are exceptions. 1. If this is an * OPENFILES pass, then we may have started from * a log file other than the first, and the * corresponding open appears in an earlier file. ! * 2. If we are undoing an open on an abort or ! * recovery, it's possible that we failed after ! * the log record, but before we actually entered ! * a handle here. */ dbe = &dblp->dbentry[argp->fileid]; if (dbe->dbp == NULL && !dbe->deleted) { /* No valid entry here. */ ! if (DB_REDO(op) || ! argp->opcode == LOG_CHECKPOINT) { __db_err(dbenv, "Improper file close at %lu/%lu", (u_long)lsnp->file, *** env/env_recover.c.orig.1 2002-08-22 14:52:51.000000000 -0700 --- env/env_recover.c 2003-11-15 08:20:59.000000000 -0800 *************** *** 232,243 **** * we'll still need to do a vtruncate based on information we haven't * yet collected. */ ! if (ret == DB_NOTFOUND) { ret = 0; ! if (max_lsn == NULL) ! goto done; ! } ! if (ret != 0) goto err; hi_txn = txnid; --- 232,240 ---- * we'll still need to do a vtruncate based on information we haven't * yet collected. */ ! if (ret == DB_NOTFOUND) ret = 0; ! 
else if (ret != 0) goto err; hi_txn = txnid; *************** *** 331,337 **** /* Find a low txnid. */ ret = 0; ! do { /* txnid is after rectype, which is a u_int32. */ memcpy(&txnid, (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid)); --- 328,334 ---- /* Find a low txnid. */ ret = 0; ! if (hi_txn != 0) do { /* txnid is after rectype, which is a u_int32. */ memcpy(&txnid, (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid)); *************** *** 344,354 **** * There are no transactions and we're not recovering to an LSN (see * above), so there is nothing to do. */ ! if (ret == DB_NOTFOUND) { ret = 0; - if (max_lsn == NULL) - goto done; - } /* Reset to the first lsn. */ if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0) --- 341,348 ---- * There are no transactions and we're not recovering to an LSN (see * above), so there is nothing to do. */ ! if (ret == DB_NOTFOUND) ret = 0; /* Reset to the first lsn. */ if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0) *************** *** 367,372 **** --- 361,370 ---- txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0) goto err; + /* If there were no transactions, then we can bail out early. */ + if (hi_txn == 0 && max_lsn == NULL) + goto done; + /* * Pass #2. * *************** *** 483,488 **** --- 481,487 ---- if ((ret = __dbreg_close_files(dbenv)) != 0) goto err; + done: if (max_lsn != NULL) { region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn; *************** *** 538,544 **** __db_err(dbenv, "Recovery complete at %.24s", ctime(&now)); __db_err(dbenv, "%s %lx %s [%lu][%lu]", "Maximum transaction ID", ! ((DB_TXNHEAD *)txninfo)->maxid, "Recovery checkpoint", (u_long)region->last_ckp.file, (u_long)region->last_ckp.offset); --- 537,544 ---- __db_err(dbenv, "Recovery complete at %.24s", ctime(&now)); __db_err(dbenv, "%s %lx %s [%lu][%lu]", "Maximum transaction ID", ! txninfo == NULL ? TXN_MINIMUM : ! ((DB_TXNHEAD *)txninfo)->maxid, "Recovery checkpoint", (u_long)region->last_ckp.file, (u_long)region->last_ckp.offset); *************** *** 550,556 **** (u_long)lsn.file, (u_long)lsn.offset, pass); } - done: err: if (lockid != DB_LOCK_INVALIDID) { if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0) ret = t_ret; --- 550,555 ----
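
The mp.h and mp_fput.c hunks above move the LRU wraparound handling out of the allocator and into the page-put path: every put bumps the cache generation counter, and when it is about to wrap, both the counter and every unpinned buffer priority are shifted down by a fixed decrement so relative ordering is preserved. Below is a minimal standalone sketch of that mechanism, not part of the patch; BASE_DECREMENT, PRIORITY_MAX, reset_lru and page_put are simplified stand-ins for MPOOL_BASE_DECREMENT, UINT32_T_MAX, __memp_reset_lru and __memp_fput, and a 32-bit unsigned int is assumed.

    /* Standalone sketch, not part of the patch: simplified stand-ins. */
    #include <limits.h>
    #include <stdio.h>

    #define BASE_DECREMENT  (UINT_MAX / 4)  /* stand-in for MPOOL_BASE_DECREMENT */
    #define PRIORITY_MAX    UINT_MAX        /* "never evict" marker */

    static unsigned int lru_count;          /* cache generation counter */

    static void
    reset_lru(unsigned int *priorities, int n)
    {
            int i;

            /* Shift the clock down so future puts start near the bottom. */
            lru_count -= BASE_DECREMENT;

            /* Shift every unpinned priority by the same amount. */
            for (i = 0; i < n; ++i)
                    if (priorities[i] != PRIORITY_MAX &&
                        priorities[i] > BASE_DECREMENT)
                            priorities[i] -= BASE_DECREMENT;
    }

    static void
    page_put(unsigned int *priorities, int n)
    {
            /* Every put advances the generation number; check for wrap. */
            if (++lru_count == UINT_MAX)
                    reset_lru(priorities, n);
    }

    int
    main(void)
    {
            unsigned int prio[3] = { 10, UINT_MAX, UINT_MAX - 5 };

            lru_count = UINT_MAX - 1;       /* force a wrap on the next put */
            page_put(prio, 3);
            printf("lru_count=%u prio[2]=%u\n", lru_count, prio[2]);
            return (0);
    }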
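
The mp_alloc.c hunks replace the old "never fail, keep retrying" policy with an escalation ladder: after each full pass over the hash buckets without freeing enough space the allocator gets more aggressive, and it gives up only once a full cycle of sync-and-sleep passes completes without any page having been returned to the cache, which it detects through the new put_counter field. A minimal sketch of that ladder follows; it is not part of the patch, the names are stand-ins rather than the Berkeley DB API, and the direct failure return simplifies the real code, which sets a giveup flag and makes one last allocation attempt.

    /* Standalone sketch, not part of the patch: simplified stand-ins. */
    #include <stdio.h>

    struct cache {
            unsigned int put_counter;       /* bumped by every page put */
    };

    /* Called after a pass over all buckets fails; nonzero means give up. */
    static int
    failed_pass(struct cache *c, int *aggressive, unsigned int *saved_puts)
    {
            switch (++*aggressive) {
            case 1:
                    /* First failed pass: retry, flushing ordinary buffers. */
                    break;
            case 2:
                    /* Remember how many puts had happened when we got serious. */
                    *saved_puts = c->put_counter;
                    /* FALLTHROUGH */
            case 3:
            case 4:
            case 5:
            case 6:
                    /* Flush the pool and let other threads run; the real code
                     * calls __memp_sync_int and __os_sleep here. */
                    break;
            default:
                    /* Several passes with no progress: if no page was put back
                     * since pass 2, allocation can never succeed. */
                    *aggressive = 1;
                    if (*saved_puts == c->put_counter)
                            return (1);
                    break;
            }
            return (0);
    }

    int
    main(void)
    {
            struct cache c = { 0 };
            unsigned int saved_puts = 0;
            int aggressive = 0, pass;

            /* Simulate repeated failed passes with no intervening page puts. */
            for (pass = 1; pass <= 10; ++pass)
                    if (failed_pass(&c, &aggressive, &saved_puts)) {
                            printf("giving up after pass %d\n", pass);
                            return (1);
                    }
            printf("still trying\n");
            return (0);
    }

The key point mirrored from the patch is that put_counter is read without locking; the allocator only cares whether it changed between the second pass and the final one.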