*** dbinc/mp.h.orig 2004-02-02 10:24:53.000000000 -0800 --- dbinc/mp.h 2004-02-02 10:26:27.000000000 -0800 *************** *** 149,154 **** --- 149,161 ---- * region lock). */ DB_MPOOL_STAT stat; /* Per-cache mpool statistics. */ + + /* + * We track page puts so that we can decide when allocation is never + * going to succeed. We don't lock the field, all we care about is + * if it changes. + */ + u_int32_t put_counter; /* Count of page put calls. */ }; struct __db_mpool_hash { *** mp/mp_fput.c.orig 2002-08-13 06:26:41.000000000 -0700 --- mp/mp_fput.c 2004-02-02 10:22:35.000000000 -0800 *************** *** 19,24 **** --- 19,26 ---- #include "dbinc/db_shash.h" #include "dbinc/mp.h" + static void __memp_reset_lru __P((DB_ENV *, REGINFO *)); + /* * __memp_fput -- * Mpool file put function. *************** *** 198,202 **** --- 200,255 ---- MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + /* + * On every buffer put we update the buffer generation number and check + * for wraparound. + */ + if (++c_mp->lru_count == UINT32_T_MAX) + __memp_reset_lru(dbenv, dbmp->reginfo); + return (0); } + + /* + * __memp_reset_lru -- + * Reset the cache LRU counter. + */ + static void + __memp_reset_lru(dbenv, memreg) + DB_ENV *dbenv; + REGINFO *memreg; + { + BH *bhp; + DB_MPOOL_HASH *hp; + MPOOL *c_mp; + int bucket; + + c_mp = memreg->primary; + + /* + * Update the counter so all future allocations will start at the + * bottom. + */ + c_mp->lru_count -= MPOOL_BASE_DECREMENT; + + /* Adjust the priority of every buffer in the system. */ + for (hp = R_ADDR(memreg, c_mp->htab), + bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { + /* + * Skip empty buckets. + * + * We can check for empty buckets before locking as we + * only care if the pointer is zero or non-zero. + */ + if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) + continue; + + MUTEX_LOCK(dbenv, &hp->hash_mutex); + for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); + bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) + if (bhp->priority != UINT32_T_MAX && + bhp->priority > MPOOL_BASE_DECREMENT) + bhp->priority -= MPOOL_BASE_DECREMENT; + MUTEX_UNLOCK(dbenv, &hp->hash_mutex); + } + } *** mp/mp_alloc.c.orig 2002-08-17 07:23:25.000000000 -0700 --- mp/mp_alloc.c 2004-02-02 10:28:15.000000000 -0800 *************** *** 25,31 **** } HS; static void __memp_bad_buffer __P((DB_MPOOL_HASH *)); - static void __memp_reset_lru __P((DB_ENV *, REGINFO *, MPOOL *)); /* * __memp_alloc -- --- 25,30 ---- *************** *** 50,57 **** MPOOL *c_mp; MPOOLFILE *bh_mfp; size_t freed_space; ! u_int32_t buckets, buffers, high_priority, max_na, priority; ! int aggressive, ret; void *p; dbenv = dbmp->dbenv; --- 49,57 ---- MPOOL *c_mp; MPOOLFILE *bh_mfp; size_t freed_space; ! u_int32_t buckets, buffers, high_priority, priority, put_counter; ! u_int32_t total_buckets; ! int aggressive, giveup, ret; void *p; dbenv = dbmp->dbenv; *************** *** 59,76 **** dbht = R_ADDR(memreg, c_mp->htab); hp_end = &dbht[c_mp->htab_buckets]; ! buckets = buffers = 0; ! aggressive = 0; c_mp->stat.st_alloc++; /* - * Get aggressive if we've tried to flush the number of pages as are - * in the system without finding space. - */ - max_na = 5 * c_mp->htab_buckets; - - /* * If we're allocating a buffer, and the one we're discarding is the * same size, we don't want to waste the time to re-integrate it into * the shared memory free list. If the DB_MPOOLFILE argument isn't --- 59,71 ---- dbht = R_ADDR(memreg, c_mp->htab); hp_end = &dbht[c_mp->htab_buckets]; ! 
buckets = buffers = put_counter = total_buckets = 0; ! aggressive = giveup = 0; ! hp_tmp = NULL; c_mp->stat.st_alloc++; /* * If we're allocating a buffer, and the one we're discarding is the * same size, we don't want to waste the time to re-integrate it into * the shared memory free list. If the DB_MPOOLFILE argument isn't *************** *** 81,99 **** len = (sizeof(BH) - sizeof(u_int8_t)) + mfp->stat.st_pagesize; R_LOCK(dbenv, memreg); - - /* - * On every buffer allocation we update the buffer generation number - * and check for wraparound. - */ - if (++c_mp->lru_count == UINT32_T_MAX) - __memp_reset_lru(dbenv, memreg, c_mp); - /* * Anything newer than 1/10th of the buffer pool is ignored during * allocation (unless allocation starts failing). */ - DB_ASSERT(c_mp->lru_count > c_mp->stat.st_pages / 10); high_priority = c_mp->lru_count - c_mp->stat.st_pages / 10; /* --- 76,85 ---- *************** *** 120,129 **** * We're not holding the region locked here, these statistics * can't be trusted. */ ! if (buckets != 0) { ! if (buckets > c_mp->stat.st_alloc_max_buckets) ! c_mp->stat.st_alloc_max_buckets = buckets; ! c_mp->stat.st_alloc_buckets += buckets; } if (buffers != 0) { if (buffers > c_mp->stat.st_alloc_max_pages) --- 106,116 ---- * We're not holding the region locked here, these statistics * can't be trusted. */ ! total_buckets += buckets; ! if (total_buckets != 0) { ! if (total_buckets > c_mp->stat.st_alloc_max_buckets) ! c_mp->stat.st_alloc_max_buckets = total_buckets; ! c_mp->stat.st_alloc_buckets += total_buckets; } if (buffers != 0) { if (buffers > c_mp->stat.st_alloc_max_pages) *************** *** 131,136 **** --- 118,129 ---- c_mp->stat.st_alloc_pages += buffers; } return (0); + } else if (giveup || c_mp->stat.st_pages == 0) { + R_UNLOCK(dbenv, memreg); + + __db_err(dbenv, + "unable to allocate space from the buffer cache"); + return (ret); } /* *************** *** 138,163 **** * we need. Reset our free-space counter. */ freed_space = 0; /* * Walk the hash buckets and find the next two with potentially useful * buffers. Free the buffer with the lowest priority from the buckets' * chains. */ ! for (hp_tmp = NULL;;) { /* Check for wrap around. */ hp = &dbht[c_mp->last_checked++]; if (hp >= hp_end) { c_mp->last_checked = 0; ! ! /* ! * If we've gone through all of the hash buckets, try ! * an allocation. If the cache is small, the old page ! * size is small, and the new page size is large, we ! * might have freed enough memory (but not 3 times the ! * memory). ! */ ! goto alloc; } /* --- 131,154 ---- * we need. Reset our free-space counter. */ freed_space = 0; + total_buckets += buckets; + buckets = 0; /* * Walk the hash buckets and find the next two with potentially useful * buffers. Free the buffer with the lowest priority from the buckets' * chains. */ ! for (;;) { ! /* All pages have been freed, make one last try */ ! if (c_mp->stat.st_pages == 0) ! goto alloc; ! /* Check for wrap around. */ hp = &dbht[c_mp->last_checked++]; if (hp >= hp_end) { c_mp->last_checked = 0; ! hp = &dbht[c_mp->last_checked++]; } /* *************** *** 172,210 **** /* * The failure mode is when there are too many buffers we can't * write or there's not enough memory in the system. We don't ! * have a metric for deciding if allocation has no possible way ! * to succeed, so we don't ever fail, we assume memory will be ! * available if we wait long enough. * ! * Get aggressive if we've tried to flush 5 times the number of ! * hash buckets as are in the system -- it's possible we have ! 
* been repeatedly trying to flush the same buffers, although ! * it's unlikely. Aggressive means: * * a: set a flag to attempt to flush high priority buffers as * well as other buffers. * b: sync the mpool to force out queue extent pages. While we * might not have enough space for what we want and flushing * is expensive, why not? ! * c: sleep for a second -- hopefully someone else will run and ! * free up some memory. Try to allocate memory too, in case ! * the other thread returns its memory to the region. ! * d: look at a buffer in every hash bucket rather than choose * the more preferable of two. * * !!! * This test ignores pathological cases like no buffers in the * system -- that shouldn't be possible. */ ! if ((++buckets % max_na) == 0) { ! aggressive = 1; ! R_UNLOCK(dbenv, memreg); ! (void)__memp_sync_int( ! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); ! ! (void)__os_sleep(dbenv, 1, 0); R_LOCK(dbenv, memreg); goto alloc; --- 163,221 ---- /* * The failure mode is when there are too many buffers we can't * write or there's not enough memory in the system. We don't ! * have a way to know that allocation has no way to succeed. ! * We fail if there were no pages returned to the cache after ! * we've been trying for a relatively long time. * ! * Get aggressive if we've tried to flush the number of hash ! * buckets as are in the system and have not found any more ! * space. Aggressive means: * * a: set a flag to attempt to flush high priority buffers as * well as other buffers. * b: sync the mpool to force out queue extent pages. While we * might not have enough space for what we want and flushing * is expensive, why not? ! * c: look at a buffer in every hash bucket rather than choose * the more preferable of two. + * d: start to think about giving up. + * + * If we get here twice, sleep for a second, hopefully someone + * else will run and free up some memory. + * + * Always try to allocate memory too, in case some other thread + * returns its memory to the region. * * !!! * This test ignores pathological cases like no buffers in the * system -- that shouldn't be possible. */ ! if ((++buckets % c_mp->htab_buckets) == 0) { ! if (freed_space > 0) ! goto alloc; R_UNLOCK(dbenv, memreg); ! switch (++aggressive) { ! case 1: ! break; ! case 2: ! put_counter = c_mp->put_counter; ! /* FALLTHROUGH */ ! case 3: ! case 4: ! case 5: ! case 6: ! (void)__memp_sync_int( ! dbenv, NULL, 0, DB_SYNC_ALLOC, NULL); ! ! (void)__os_sleep(dbenv, 1, 0); ! break; ! default: ! aggressive = 1; ! if (put_counter == c_mp->put_counter) ! giveup = 1; ! break; ! } R_LOCK(dbenv, memreg); goto alloc; *************** *** 277,283 **** * thread may have acquired this buffer and incremented the ref * count after we wrote it, in which case we can't have it. * ! * If there's a write error, avoid selecting this buffer again * by making it the bucket's least-desirable buffer. */ if (ret != 0 || bhp->ref != 0) { --- 288,295 ---- * thread may have acquired this buffer and incremented the ref * count after we wrote it, in which case we can't have it. * ! * If there's a write error and we're having problems finding ! * something to allocate, avoid selecting this buffer again * by making it the bucket's least-desirable buffer. */ if (ret != 0 || bhp->ref != 0) { *************** *** 301,306 **** --- 313,320 ---- freed_space += __db_shsizeof(bhp); __memp_bhfree(dbmp, hp, bhp, 1); + if (aggressive > 1) + aggressive = 1; /* * Unlock this hash bucket and re-acquire the region lock. 
If *************** *** 362,415 **** hp->hash_priority = SH_TAILQ_FIRST(&hp->hash_bucket, __bh)->priority; } - /* - * __memp_reset_lru -- - * Reset the cache LRU counter. - */ - static void - __memp_reset_lru(dbenv, memreg, c_mp) - DB_ENV *dbenv; - REGINFO *memreg; - MPOOL *c_mp; - { - BH *bhp; - DB_MPOOL_HASH *hp; - int bucket; - - /* - * Update the counter so all future allocations will start at the - * bottom. - */ - c_mp->lru_count -= MPOOL_BASE_DECREMENT; - - /* Release the region lock. */ - R_UNLOCK(dbenv, memreg); - - /* Adjust the priority of every buffer in the system. */ - for (hp = R_ADDR(memreg, c_mp->htab), - bucket = 0; bucket < c_mp->htab_buckets; ++hp, ++bucket) { - /* - * Skip empty buckets. - * - * We can check for empty buckets before locking as we - * only care if the pointer is zero or non-zero. - */ - if (SH_TAILQ_FIRST(&hp->hash_bucket, __bh) == NULL) - continue; - - MUTEX_LOCK(dbenv, &hp->hash_mutex); - for (bhp = SH_TAILQ_FIRST(&hp->hash_bucket, __bh); - bhp != NULL; bhp = SH_TAILQ_NEXT(bhp, hq, __bh)) - if (bhp->priority != UINT32_T_MAX && - bhp->priority > MPOOL_BASE_DECREMENT) - bhp->priority -= MPOOL_BASE_DECREMENT; - MUTEX_UNLOCK(dbenv, &hp->hash_mutex); - } - - /* Reacquire the region lock. */ - R_LOCK(dbenv, memreg); - } - #ifdef DIAGNOSTIC /* * __memp_check_order -- --- 376,381 ---- *** dbreg/dbreg_rec.c.orig 2002-08-17 07:22:52.000000000 -0700 --- dbreg/dbreg_rec.c 2003-11-08 10:59:19.000000000 -0800 *************** *** 174,192 **** * Typically, closes should match an open which means * that if this is a close, there should be a valid * entry in the dbentry table when we get here, ! * however there is an exception. If this is an * OPENFILES pass, then we may have started from * a log file other than the first, and the * corresponding open appears in an earlier file. ! * We can ignore that case, but all others are errors. */ dbe = &dblp->dbentry[argp->fileid]; if (dbe->dbp == NULL && !dbe->deleted) { /* No valid entry here. */ ! if ((argp->opcode != LOG_CLOSE && ! argp->opcode != LOG_RCLOSE) || ! (op != DB_TXN_OPENFILES && ! op !=DB_TXN_POPENFILES)) { __db_err(dbenv, "Improper file close at %lu/%lu", (u_long)lsnp->file, --- 174,193 ---- * Typically, closes should match an open which means * that if this is a close, there should be a valid * entry in the dbentry table when we get here, ! * however there are exceptions. 1. If this is an * OPENFILES pass, then we may have started from * a log file other than the first, and the * corresponding open appears in an earlier file. ! * 2. If we are undoing an open on an abort or ! * recovery, it's possible that we failed after ! * the log record, but before we actually entered ! * a handle here. */ dbe = &dblp->dbentry[argp->fileid]; if (dbe->dbp == NULL && !dbe->deleted) { /* No valid entry here. */ ! if (DB_REDO(op) || ! argp->opcode == LOG_CHECKPOINT) { __db_err(dbenv, "Improper file close at %lu/%lu", (u_long)lsnp->file, *** env/env_recover.c.orig.1 2002-08-22 14:52:51.000000000 -0700 --- env/env_recover.c 2003-11-15 08:20:59.000000000 -0800 *************** *** 232,243 **** * we'll still need to do a vtruncate based on information we haven't * yet collected. */ ! if (ret == DB_NOTFOUND) { ret = 0; ! if (max_lsn == NULL) ! goto done; ! } ! if (ret != 0) goto err; hi_txn = txnid; --- 232,240 ---- * we'll still need to do a vtruncate based on information we haven't * yet collected. */ ! if (ret == DB_NOTFOUND) ret = 0; ! 
else if (ret != 0) goto err; hi_txn = txnid; *************** *** 331,337 **** /* Find a low txnid. */ ret = 0; ! do { /* txnid is after rectype, which is a u_int32. */ memcpy(&txnid, (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid)); --- 328,334 ---- /* Find a low txnid. */ ret = 0; ! if (hi_txn != 0) do { /* txnid is after rectype, which is a u_int32. */ memcpy(&txnid, (u_int8_t *)data.data + sizeof(u_int32_t), sizeof(txnid)); *************** *** 344,354 **** * There are no transactions and we're not recovering to an LSN (see * above), so there is nothing to do. */ ! if (ret == DB_NOTFOUND) { ret = 0; - if (max_lsn == NULL) - goto done; - } /* Reset to the first lsn. */ if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0) --- 341,348 ---- * There are no transactions and we're not recovering to an LSN (see * above), so there is nothing to do. */ ! if (ret == DB_NOTFOUND) ret = 0; /* Reset to the first lsn. */ if (ret != 0 || (ret = logc->get(logc, &first_lsn, &data, DB_SET)) != 0) *************** *** 367,372 **** --- 361,370 ---- txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0) goto err; + /* If there were no transactions, then we can bail out early. */ + if (hi_txn == 0 && max_lsn == NULL) + goto done; + /* * Pass #2. * *************** *** 483,488 **** --- 481,487 ---- if ((ret = __dbreg_close_files(dbenv)) != 0) goto err; + done: if (max_lsn != NULL) { region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn; *************** *** 538,544 **** __db_err(dbenv, "Recovery complete at %.24s", ctime(&now)); __db_err(dbenv, "%s %lx %s [%lu][%lu]", "Maximum transaction ID", ! ((DB_TXNHEAD *)txninfo)->maxid, "Recovery checkpoint", (u_long)region->last_ckp.file, (u_long)region->last_ckp.offset); --- 537,544 ---- __db_err(dbenv, "Recovery complete at %.24s", ctime(&now)); __db_err(dbenv, "%s %lx %s [%lu][%lu]", "Maximum transaction ID", ! txninfo == NULL ? TXN_MINIMUM : ! ((DB_TXNHEAD *)txninfo)->maxid, "Recovery checkpoint", (u_long)region->last_ckp.file, (u_long)region->last_ckp.offset); *************** *** 550,556 **** (u_long)lsn.file, (u_long)lsn.offset, pass); } - done: err: if (lockid != DB_LOCK_INVALIDID) { if ((t_ret = __rep_unlockpages(dbenv, lockid)) != 0 && ret == 0) ret = t_ret; --- 550,555 ----
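
The mp.h and mp_fput.c hunks above move the LRU wraparound handling out of the allocator and into the page-put path: every put bumps the cache generation counter, and when it is about to wrap, both the counter and every unpinned buffer priority are shifted down by a fixed decrement so relative ordering is preserved. Below is a minimal standalone sketch of that mechanism, not part of the patch; BASE_DECREMENT, PRIORITY_MAX, reset_lru and page_put are simplified stand-ins for MPOOL_BASE_DECREMENT, UINT32_T_MAX, __memp_reset_lru and __memp_fput, and a 32-bit unsigned int is assumed.

    /* Standalone sketch, not part of the patch: simplified stand-ins. */
    #include <limits.h>
    #include <stdio.h>

    #define BASE_DECREMENT  (UINT_MAX / 4)  /* stand-in for MPOOL_BASE_DECREMENT */
    #define PRIORITY_MAX    UINT_MAX        /* "never evict" marker */

    static unsigned int lru_count;          /* cache generation counter */

    static void
    reset_lru(unsigned int *priorities, int n)
    {
            int i;

            /* Shift the clock down so future puts start near the bottom. */
            lru_count -= BASE_DECREMENT;

            /* Shift every unpinned priority by the same amount. */
            for (i = 0; i < n; ++i)
                    if (priorities[i] != PRIORITY_MAX &&
                        priorities[i] > BASE_DECREMENT)
                            priorities[i] -= BASE_DECREMENT;
    }

    static void
    page_put(unsigned int *priorities, int n)
    {
            /* Every put advances the generation number; check for wrap. */
            if (++lru_count == UINT_MAX)
                    reset_lru(priorities, n);
    }

    int
    main(void)
    {
            unsigned int prio[3] = { 10, UINT_MAX, UINT_MAX - 5 };

            lru_count = UINT_MAX - 1;       /* force a wrap on the next put */
            page_put(prio, 3);
            printf("lru_count=%u prio[2]=%u\n", lru_count, prio[2]);
            return (0);
    }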
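
The mp_alloc.c hunks replace the old "never fail, keep retrying" policy with an escalation ladder: after each full pass over the hash buckets without freeing enough space the allocator gets more aggressive, and it gives up only once a full cycle of sync-and-sleep passes completes without any page having been returned to the cache, which it detects through the new put_counter field. A minimal sketch of that ladder follows; it is not part of the patch, the names are stand-ins rather than the Berkeley DB API, and the direct failure return simplifies the real code, which sets a giveup flag and makes one last allocation attempt.

    /* Standalone sketch, not part of the patch: simplified stand-ins. */
    #include <stdio.h>

    struct cache {
            unsigned int put_counter;       /* bumped by every page put */
    };

    /* Called after a pass over all buckets fails; nonzero means give up. */
    static int
    failed_pass(struct cache *c, int *aggressive, unsigned int *saved_puts)
    {
            switch (++*aggressive) {
            case 1:
                    /* First failed pass: retry, flushing ordinary buffers. */
                    break;
            case 2:
                    /* Remember how many puts had happened when we got serious. */
                    *saved_puts = c->put_counter;
                    /* FALLTHROUGH */
            case 3:
            case 4:
            case 5:
            case 6:
                    /* Flush the pool and let other threads run; the real code
                     * calls __memp_sync_int and __os_sleep here. */
                    break;
            default:
                    /* Several passes with no progress: if no page was put back
                     * since pass 2, allocation can never succeed. */
                    *aggressive = 1;
                    if (*saved_puts == c->put_counter)
                            return (1);
                    break;
            }
            return (0);
    }

    int
    main(void)
    {
            struct cache c = { 0 };
            unsigned int saved_puts = 0;
            int aggressive = 0, pass;

            /* Simulate repeated failed passes with no intervening page puts. */
            for (pass = 1; pass <= 10; ++pass)
                    if (failed_pass(&c, &aggressive, &saved_puts)) {
                            printf("giving up after pass %d\n", pass);
                            return (1);
                    }
            printf("still trying\n");
            return (0);
    }

The key point mirrored from the patch is that put_counter is read without locking; the allocator only cares whether it changed between the second pass and the final one.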