From a2564c3d6058f281677ca5198383858376d155bf Mon Sep 17 00:00:00 2001
From: phk
Date: Mon, 24 Sep 2007 08:23:34 +0000
Subject: [PATCH] Our first cut at LRU processing contained a three-way race
 and could cause Really Weird Problems if you really provoked it.

+++ Redo from start +++

LRU processing is only relevant for objects which are in the cache and
on the EXP binary heap, so we merge the LRU functionality into the EXP
code in order to share locks and object reference counts.

Another thing we overlooked in the first implementation is that the VCL
callout for LRU should be able to examine the client information of the
session which is trying to push stuff out, so that QoS/DoS issues can
be considered:

	sub vcl_discard {
		if (client.bandwidth > 10 mb/s) {
			keep;
		}
	}

(which sort of indicates that "error" should be a legal action as well)

To enable this, we pass the session into the stevedore when we try to
allocate space, and temporarily substitute the object we want to nuke
for the session's own object while we call vcl_discard().

The outcome of an attempt to make space can take three values: did,
didn't and couldn't.  In the latter case there is no point in trying
again and again, in particular if the cause is incurable, such as the
requested size being larger than the storage itself.  We still need to
handle failure to allocate storage in that case, rather than dump core.

When looking for targets to nuke, we only consider objects on the
binheap which have a reference count of 1; objects with higher
reference counts would not free any space.

We take prospective targets off the binheap, but retain their refcount,
and tag them with a magic marker while we ponder their destiny.  It is
quite possible that another session picks the object up via the cache
while we do so, and therefore attempts to update the ttl or to shift
the object in the lru chain must ignore objects which have the magic
marker.  If the object is kept, the updated ttl will automatically take
effect when we reinsert it in the binheap.
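
In the stevedore this boils down to a retry loop of roughly the
following shape (a condensed rendering of the STV_alloc() change in
the stevedore.c hunk below):

	for (;;) {
		st = stv->alloc(stv, size);
		if (st != NULL)
			break;			/* did get space */
		if (EXP_NukeOne(sp) == -1)
			break;			/* couldn't; give up */
		/* did or didn't nuke one; retry the allocation */
	}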
git-svn-id: svn+ssh://projects.linpro.no/svn/varnish/trunk@2006 d4fa192b-c00b-0410-8231-f00ffab90ce4
---
 varnish-cache/bin/varnishd/Makefile.am       |   1 -
 varnish-cache/bin/varnishd/cache.h           |  12 +-
 varnish-cache/bin/varnishd/cache_center.c    |   2 +
 varnish-cache/bin/varnishd/cache_expire.c    | 112 ++++++++--
 varnish-cache/bin/varnishd/cache_fetch.c     |  12 +-
 varnish-cache/bin/varnishd/cache_hash.c      |   8 +-
 varnish-cache/bin/varnishd/cache_lru.c       | 219 -------------------
 varnish-cache/bin/varnishd/cache_synthetic.c |   2 +-
 varnish-cache/bin/varnishd/stevedore.c       |  18 +-
 varnish-cache/bin/varnishd/stevedore.h       |   2 +-
 10 files changed, 122 insertions(+), 266 deletions(-)
 delete mode 100644 varnish-cache/bin/varnishd/cache_lru.c

diff --git a/varnish-cache/bin/varnishd/Makefile.am b/varnish-cache/bin/varnishd/Makefile.am
index 9d8e4ed3..956a1ab2 100644
--- a/varnish-cache/bin/varnishd/Makefile.am
+++ b/varnish-cache/bin/varnishd/Makefile.am
@@ -22,7 +22,6 @@ varnishd_SOURCES = \
 	cache_fetch.c \
 	cache_hash.c \
 	cache_http.c \
-	cache_lru.c \
 	cache_main.c \
 	cache_pool.c \
 	cache_pipe.c \
diff --git a/varnish-cache/bin/varnishd/cache.h b/varnish-cache/bin/varnishd/cache.h
index be9a8dd0..3f0638bd 100644
--- a/varnish-cache/bin/varnishd/cache.h
+++ b/varnish-cache/bin/varnishd/cache.h
@@ -247,7 +247,6 @@ struct object {
 	TAILQ_HEAD(, sess)	waitinglist;

 	double			lru_stamp;
-	TAILQ_ENTRY(object)	lru;
 };

 struct objhead {
@@ -429,7 +428,8 @@ void CLI_Init(void);
 void EXP_Insert(struct object *o);
 void EXP_Init(void);
 void EXP_TTLchange(struct object *o);
-void EXP_Terminate(struct object *o);
+void EXP_Touch(struct object *o, double now);
+int EXP_NukeOne(struct sess *sp);

 /* cache_fetch.c */
 int Fetch(struct sess *sp);
@@ -538,14 +538,6 @@ void VCL_Refresh(struct VCL_conf **vcc);
 void VCL_Rel(struct VCL_conf **vcc);
 void VCL_Get(struct VCL_conf **vcc);

-/* cache_lru.c */
-// void LRU_Init(void);
-void LRU_Enter(struct object *o, double stamp);
-void LRU_Remove(struct object *o);
-int LRU_DiscardOne(void);
-int LRU_DiscardSpace(int64_t quota);
-int LRU_DiscardTime(double cutoff);
-
 #define VCL_RET_MAC(l,u,b,n)
 #define VCL_MET_MAC(l,u,b) void VCL_##l##_method(struct sess *);
 #include "vcl_returns.h"
diff --git a/varnish-cache/bin/varnishd/cache_center.c b/varnish-cache/bin/varnishd/cache_center.c
index dd9ea6af..bf292d8f 100644
--- a/varnish-cache/bin/varnishd/cache_center.c
+++ b/varnish-cache/bin/varnishd/cache_center.c
@@ -147,6 +147,8 @@ cnt_deliver(struct sess *sp)
 {

 	sp->t_resp = TIM_real();
+	if (sp->obj->objhead != NULL)
+		EXP_Touch(sp->obj, sp->t_resp);
 	RES_BuildHttp(sp);
 	VCL_deliver_method(sp);
 	switch (sp->handling) {
diff --git a/varnish-cache/bin/varnishd/cache_expire.c b/varnish-cache/bin/varnishd/cache_expire.c
index fc617d8f..f8c238a9 100644
--- a/varnish-cache/bin/varnishd/cache_expire.c
+++ b/varnish-cache/bin/varnishd/cache_expire.c
@@ -45,12 +45,21 @@
 #include "shmlog.h"
 #include "binary_heap.h"
 #include "cache.h"
+#include "heritage.h"

 static pthread_t exp_thread;
 static struct binheap *exp_heap;
 static MTX exp_mtx;
 static unsigned expearly = 30;
 static TAILQ_HEAD(,object) exp_deathrow = TAILQ_HEAD_INITIALIZER(exp_deathrow);
+static TAILQ_HEAD(,object) exp_lru = TAILQ_HEAD_INITIALIZER(exp_lru);
+
+/*
+ * This is a magic marker for the objects currently on the SIOP [look it up]
+ * so that other users of the object will not stumble trying to change the
+ * ttl or lru position.
+ */
+static const unsigned lru_target = (unsigned)(-3);

 /*--------------------------------------------------------------------*/

@@ -58,40 +67,42 @@
 void
 EXP_Insert(struct object *o)
 {

+	CHECK_OBJ_NOTNULL(o, OBJECT_MAGIC);
 	assert(o->heap_idx == 0);
 	LOCK(&exp_mtx);
 	binheap_insert(exp_heap, o);
+	TAILQ_INSERT_TAIL(&exp_lru, o, deathrow);
 	UNLOCK(&exp_mtx);
 }

 void
-EXP_TTLchange(struct object *o)
+EXP_Touch(struct object *o, double now)
 {
-	assert(o->heap_idx != 0);
-	LOCK(&exp_mtx);
-	binheap_delete(exp_heap, o->heap_idx);
-	binheap_insert(exp_heap, o);
-	UNLOCK(&exp_mtx);
+
+	CHECK_OBJ_NOTNULL(o, OBJECT_MAGIC);
+	if (o->lru_stamp + params->lru_timeout < now) {
+		LOCK(&exp_mtx);	/* XXX: should be ..._TRY */
+		if (o->heap_idx != lru_target) {
+			assert(o->heap_idx != 0);
+			TAILQ_REMOVE(&exp_lru, o, deathrow);
+			TAILQ_INSERT_TAIL(&exp_lru, o, deathrow);
+			o->lru_stamp = now;
+		}
+		UNLOCK(&exp_mtx);
+	}
 }

-/*
- * Immediately destroy an object.  Do not wait for it to expire or trickle
- * through death row; yank it
- */
 void
-EXP_Terminate(struct object *o)
+EXP_TTLchange(struct object *o)
 {
+
 	LOCK(&exp_mtx);
-	LRU_Remove(o);
-	if (o->heap_idx)
+	if (o->heap_idx != lru_target) {
+		assert(o->heap_idx != 0);
 		binheap_delete(exp_heap, o->heap_idx);
-	if (o->deathrow.tqe_next) {
-		TAILQ_REMOVE(&exp_deathrow, o, deathrow);
-		VSL_stats->n_deathrow--;
+		binheap_insert(exp_heap, o);
 	}
 	UNLOCK(&exp_mtx);
-	VSL(SLT_Terminate, 0, "%u", o->xid);
-	HSH_Deref(o);
 }

 /*--------------------------------------------------------------------
@@ -181,6 +192,7 @@ exp_prefetch(void *arg)
 			continue;
 		}
 		binheap_delete(exp_heap, o->heap_idx);
+		assert(o->heap_idx == 0);

 		/* Sanity check */
 		o2 = binheap_root(exp_heap);
@@ -195,6 +207,7 @@

 		if (sp->handling == VCL_RET_DISCARD) {
 			LOCK(&exp_mtx);
+			TAILQ_REMOVE(&exp_lru, o, deathrow);
 			TAILQ_INSERT_TAIL(&exp_deathrow, o, deathrow);
 			VSL_stats->n_deathrow++;
 			UNLOCK(&exp_mtx);
@@ -227,6 +240,69 @@ object_update(void *priv, void *p, unsigned u)
 	o->heap_idx = u;
 }

+/*--------------------------------------------------------------------
+ * Attempt to make space by nuking, with VCL's permission, the oldest
+ * object on the LRU list which isn't in use.
+ * Returns: 1: did, 0: didn't, -1: can't
+ */
+
+int
+EXP_NukeOne(struct sess *sp)
+{
+	struct object *o, *o2;
+
+	/* Find the first currently unused object on the LRU */
+	LOCK(&exp_mtx);
+	TAILQ_FOREACH(o, &exp_lru, deathrow)
+		if (o->refcnt == 1)
+			break;
+	if (o != NULL) {
+		/*
+		 * Take it off the binheap while we chew.  This effectively
+		 * means that we own the EXP refcnt on this object.
+		 */
+		TAILQ_REMOVE(&exp_lru, o, deathrow);
+		binheap_delete(exp_heap, o->heap_idx);
+		assert(o->heap_idx == 0);
+		o->heap_idx = lru_target;
+		VSL_stats->n_lru_nuked++;	/* May be premature */
+	}
+	UNLOCK(&exp_mtx);
+
+	if (o == NULL)
+		return (-1);
+
+	/*
+	 * Ask VCL in the context of the requestor's session, in order to
+	 * allow client QoS considerations to inform the decision.
+	 * Temporarily substitute the object we want to nuke for the
+	 * session's own object.
+	 */
+	o2 = sp->obj;
+	sp->obj = o;
+	VCL_discard_method(sp);
+	sp->obj = o2;
+
+	if (sp->handling == VCL_RET_DISCARD) {
+		VSL(SLT_ExpKill, 0, "%u LRU", o->xid);
+		HSH_Deref(o);
+		return (1);
+	}
+
+	assert(sp->handling == VCL_RET_KEEP);
+
+	/* Insert in binheap and lru again */
+	LOCK(&exp_mtx);
+	VSL_stats->n_lru_nuked--;	/* It was premature */
+	VSL_stats->n_lru_saved++;
+	o->heap_idx = 0;
+	o->lru_stamp = sp->wrk->used;
+	binheap_insert(exp_heap, o);
+	TAILQ_INSERT_TAIL(&exp_lru, o, deathrow);
+	UNLOCK(&exp_mtx);
+	return (0);
+}
+
 /*--------------------------------------------------------------------*/

 void
diff --git a/varnish-cache/bin/varnishd/cache_fetch.c b/varnish-cache/bin/varnishd/cache_fetch.c
index 5eaa0f8d..c0ef0e8e 100644
--- a/varnish-cache/bin/varnishd/cache_fetch.c
+++ b/varnish-cache/bin/varnishd/cache_fetch.c
@@ -49,7 +49,7 @@
 /*--------------------------------------------------------------------*/

 static int
-fetch_straight(const struct sess *sp, int fd, struct http *hp, char *b)
+fetch_straight(struct sess *sp, int fd, struct http *hp, char *b)
 {
 	int i;
 	unsigned char *p;
@@ -60,7 +60,7 @@ fetch_straight(const struct sess *sp, int fd, struct http *hp, char *b)

 	if (cl == 0)
 		return (0);
-	st = STV_alloc(cl);
+	st = STV_alloc(sp, cl);
 	TAILQ_INSERT_TAIL(&sp->obj->store, st, list);
 	st->len = cl;
 	sp->obj->len = cl;
@@ -84,7 +84,7 @@ fetch_straight(const struct sess *sp, int fd, struct http *hp, char *b)
 /* XXX: Cleanup.  It must be possible somehow :-( */

 static int
-fetch_chunked(const struct sess *sp, int fd, struct http *hp)
+fetch_chunked(struct sess *sp, int fd, struct http *hp)
 {
 	int i;
 	char *q;
@@ -149,7 +149,7 @@ fetch_chunked(const struct sess *sp, int fd, struct http *hp)
 			v = u;
 			if (u < params->fetch_chunksize * 1024)
 				v = params->fetch_chunksize * 1024;
-			st = STV_alloc(v);
+			st = STV_alloc(sp, v);
 			TAILQ_INSERT_TAIL(&sp->obj->store, st, list);
 		}
 		v = st->space - st->len;
@@ -208,7 +208,7 @@ fetch_chunked(const struct sess *sp, int fd, struct http *hp)
 #include

 static int
-fetch_eof(const struct sess *sp, int fd, struct http *hp)
+fetch_eof(struct sess *sp, int fd, struct http *hp)
 {
 	int i;
 	unsigned char *p;
@@ -224,7 +224,7 @@ fetch_eof(const struct sess *sp, int fd, struct http *hp)
 	st = NULL;
 	while (1) {
 		if (v == 0) {
-			st = STV_alloc(params->fetch_chunksize * 1024);
+			st = STV_alloc(sp, params->fetch_chunksize * 1024);
 			TAILQ_INSERT_TAIL(&sp->obj->store, st, list);
 			p = st->ptr + st->len;
 			v = st->space - st->len;
diff --git a/varnish-cache/bin/varnishd/cache_hash.c b/varnish-cache/bin/varnishd/cache_hash.c
index 55e90def..b4726af7 100644
--- a/varnish-cache/bin/varnishd/cache_hash.c
+++ b/varnish-cache/bin/varnishd/cache_hash.c
@@ -214,7 +214,6 @@ HSH_Lookup(struct sess *sp)
 	if (o != NULL) {
 		UNLOCK(&oh->mtx);
 		(void)hash->deref(oh);
-		LRU_Enter(o, sp->t_req);
 		return (o);
 	}

@@ -226,7 +225,11 @@
 	/* NB: do not deref objhead the new object inherits our reference */
 	UNLOCK(&oh->mtx);
 	BAN_NewObj(o);
-	LRU_Enter(o, sp->t_req);
+	/*
+	 * It's cheaper to copy the timestamp here than to get a new one
+	 * in EXP_Insert().
+	 */
+	o->lru_stamp = w->used;
 	return (o);
 }

@@ -307,7 +310,6 @@ HSH_Deref(struct object *o)
 	if (o->vary != NULL)
 		free(o->vary);

-	LRU_Remove(o);
 	HSH_Freestore(o);
 	FREE_OBJ(o);
 	VSL_stats->n_object--;
diff --git a/varnish-cache/bin/varnishd/cache_lru.c b/varnish-cache/bin/varnishd/cache_lru.c
deleted file mode 100644
index 236cb780..00000000
--- a/varnish-cache/bin/varnishd/cache_lru.c
+++ /dev/null
@@ -1,219 +0,0 @@
-/*-
- * Copyright (c) 2007 Linpro AS
- * All rights reserved.
- *
- * Author: Dag-Erling Smørgrav
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * $Id$
- */
-
-#include "shmlog.h"
-#include "cache.h"
-#include "queue.h"
-
-/*
- * For performance reasons, objects are only moved to the head of the LRU
- * list when they've been in their current position for at least LRU_DELAY
- * seconds, rather than on every access.  This should probably be a
- * run-time parameter.
- */
-#define LRU_DELAY 2
-
-TAILQ_HEAD(lru_head, object);
-
-static struct lru_head lru_list = TAILQ_HEAD_INITIALIZER(lru_list);
-static pthread_mutex_t lru_mtx = PTHREAD_MUTEX_INITIALIZER;
-static struct sess *lru_session;
-static struct worker lru_worker;
-
-/*
- * Initialize the LRU data structures.
- */
-static inline void
-LRU_Init(void)
-{
-	if (lru_session == NULL) {
-		lru_session = SES_New(NULL, 0);
-		XXXAN(lru_session);
-		lru_session->wrk = &lru_worker;
-		lru_worker.magic = WORKER_MAGIC;
-		lru_worker.wlp = lru_worker.wlog;
-		lru_worker.wle = lru_worker.wlog + sizeof lru_worker.wlog;
-		VCL_Get(&lru_session->vcl);
-	} else {
-		VCL_Refresh(&lru_session->vcl);
-	}
-}
-
-/*
- * Enter an object into the LRU list, or move it to the head of the list
- * if it's already in it and hasn't moved in a while.
- */
-void
-LRU_Enter(struct object *o, double stamp)
-{
-
-	CHECK_OBJ_NOTNULL(o, OBJECT_MAGIC);
-	assert(stamp > 0);
-	if (o->lru_stamp < stamp - LRU_DELAY && o != lru_list.tqh_first) {
-		// VSL(SLT_LRU_enter, 0, "%u %u %u", o->xid, o->lru_stamp, stamp);
-		LOCK(&lru_mtx);
-		if (o->lru_stamp != 0)
-			TAILQ_REMOVE(&lru_list, o, lru);
-		TAILQ_INSERT_HEAD(&lru_list, o, lru);
-		o->lru_stamp = stamp;
-		UNLOCK(&lru_mtx);
-	}
-}
-
-/*
- * Remove an object from the LRU list.
- */
-void
-LRU_Remove(struct object *o)
-{
-
-	CHECK_OBJ_NOTNULL(o, OBJECT_MAGIC);
-	if (o->lru_stamp != 0) {
-		// VSL(SLT_LRU_remove, 0, "%u", o->xid);
-		LOCK(&lru_mtx);
-		TAILQ_REMOVE(&lru_list, o, lru);
-		UNLOCK(&lru_mtx);
-	}
-}
-
-/*
- * With the LRU lock held, call VCL_discard().  Depending on the result,
- * either insert the object at the head of the list or dereference it.
- */
-static int
-LRU_DiscardLocked(struct object *o)
-{
-	struct object *so;
-
-	if (o->busy)
-		return (0);
-
-	/* XXX this is a really bad place to do this */
-	LRU_Init();
-
-	CHECK_OBJ_NOTNULL(o, OBJECT_MAGIC);
-	TAILQ_REMOVE(&lru_list, o, lru);
-
-	lru_session->obj = o;
-	VCL_discard_method(lru_session);
-
-	if (lru_session->handling == VCL_RET_DISCARD) {
-		/* discard: release object */
-		VSL(SLT_ExpKill, 0, "%u %d", o->xid, o->lru_stamp);
-		o->lru_stamp = 0;
-		EXP_Terminate(o);
-		return (1);
-	} else {
-		/* keep: move to front of list */
-		if ((so = TAILQ_FIRST(&lru_list)))
-			o->lru_stamp = so->lru_stamp;
-		TAILQ_INSERT_HEAD(&lru_list, o, lru);
-		return (0);
-	}
-}
-
-/*
- * Walk through the LRU list, starting at the back, and check each object
- * until we find one that can be retired.  Return the number of objects
- * that were discarded.
- */
-int
-LRU_DiscardOne(void)
-{
-	struct object *first = TAILQ_FIRST(&lru_list);
-	struct object *o;
-	int count = 0;
-
-	LOCK(&lru_mtx);
-	while (!count && (o = TAILQ_LAST(&lru_list, lru_head))) {
-		if (LRU_DiscardLocked(o))
-			++count;
-		if (o == first) {
-			/* full circle */
-			break;
-		}
-	}
-	UNLOCK(&lru_mtx);
-	return (count);
-}
-
-/*
- * Walk through the LRU list, starting at the back, and retire objects
- * until our quota is reached or we run out of objects to retire.  Return
- * the number of objects that were discarded.
- */
-int
-LRU_DiscardSpace(int64_t quota)
-{
-	struct object *first = TAILQ_FIRST(&lru_list);
-	struct object *o;
-	unsigned int len;
-	int count = 0;
-
-	LOCK(&lru_mtx);
-	while (quota > 0 && (o = TAILQ_LAST(&lru_list, lru_head))) {
-		len = o->len;
-		if (LRU_DiscardLocked(o)) {
-			quota -= len;
-			++count;
-		}
-		if (o == first) {
-			/* full circle */
-			break;
-		}
-	}
-	UNLOCK(&lru_mtx);
-	return (count);
-}
-
-/*
- * Walk through the LRU list, starting at the back, and retire objects
- * that haven't been accessed since the specified cutoff date.  Return the
- * number of objects that were discarded.
- */
-int
-LRU_DiscardTime(double cutoff)
-{
-	struct object *first = TAILQ_FIRST(&lru_list);
-	struct object *o;
-	int count = 0;
-
-	LOCK(&lru_mtx);
-	while ((o = TAILQ_LAST(&lru_list, lru_head)) && o->lru_stamp <= cutoff) {
-		if (LRU_DiscardLocked(o))
-			++count;
-		if (o == first) {
-			/* full circle */
-			break;
-		}
-	}
-	UNLOCK(&lru_mtx);
-	return (count);
-}
diff --git a/varnish-cache/bin/varnishd/cache_synthetic.c b/varnish-cache/bin/varnishd/cache_synthetic.c
index d4671396..caa17c6e 100644
--- a/varnish-cache/bin/varnishd/cache_synthetic.c
+++ b/varnish-cache/bin/varnishd/cache_synthetic.c
@@ -87,7 +87,7 @@ SYN_ErrorPage(struct sess *sp, int status, const char *reason, int ttl)

 	/* allocate space for body */
 	/* XXX what if the object already has a body? */
-	st = STV_alloc(1024);
+	st = STV_alloc(sp, 1024);
 	TAILQ_INSERT_TAIL(&sp->obj->store, st, list);

 	/* generate body */
diff --git a/varnish-cache/bin/varnishd/stevedore.c b/varnish-cache/bin/varnishd/stevedore.c
index 7a15faf6..88bbb295 100644
--- a/varnish-cache/bin/varnishd/stevedore.c
+++ b/varnish-cache/bin/varnishd/stevedore.c
@@ -47,24 +47,28 @@
 extern struct stevedore smf_stevedore;

 static struct stevedore * volatile stevedores;

 struct storage *
-STV_alloc(size_t size)
+STV_alloc(struct sess *sp, size_t size)
 {
 	struct storage *st;
 	struct stevedore *stv;

 	for (;;) {
+		/* pick a stevedore and bump the head along */
+		/* XXX: only safe as long as pointer writes are atomic */
 		stv = stevedores = stevedores->next;

 		/* try to allocate from it */
-		if ((st = stv->alloc(stv, size)) != NULL) {
-			CHECK_OBJ_NOTNULL(st, STORAGE_MAGIC);
-			return (st);
-		}
+		st = stv->alloc(stv, size);
+		if (st != NULL)
+			break;

-		/* no luck; free some space and keep trying */
-		AN(LRU_DiscardOne());
+		/* no luck; try to free some space and keep trying */
+		if (EXP_NukeOne(sp) == -1)
+			break;
 	}
+	CHECK_OBJ_NOTNULL(st, STORAGE_MAGIC);
+	return (st);
 }

 void
diff --git a/varnish-cache/bin/varnishd/stevedore.h b/varnish-cache/bin/varnishd/stevedore.h
index 364bcf43..1d1ba1b6 100644
--- a/varnish-cache/bin/varnishd/stevedore.h
+++ b/varnish-cache/bin/varnishd/stevedore.h
@@ -55,7 +55,7 @@ struct stevedore {
 	struct stevedore	*next, *prev;
 };

-struct storage *STV_alloc(size_t size);
+struct storage *STV_alloc(struct sess *sp, size_t size);
 void STV_trim(struct storage *st, size_t size);
 void STV_free(struct storage *st);
 void STV_add(const char *spec);
-- 
2.39.5