名词
Slab
用于表示存储的最大size数据,仅仅只是用于定义(通俗的讲就是表示可以存储数据大小的范围)。默认情况下,前后两个slab表示存储的size以1.25倍进行增长。例如slab1为96字节,slab2为120字节
Page
分配给Slab的内存空间,默认为1MB。分给Slab后将会根据slab的大小切割成chunk
Chunk
用于缓存记录的内存空间
Slab calss
特定大小的Chunk集合
增长因子
增长因子就是相邻两个chunk之间的增长倍数。 memcache默认是1.25。
2倍 chunk size增长。
struct item:
#define ITEM_LINKED 1 //该item插入到LRU队列了
#define ITEM_CAS 2 //该item使用CAS
#define ITEM_SLABBED 4 //该item还在slab的空闲队列里面,没有分配出去
#define ITEM_FETCHED 8 //该item插入到LRU队列后,被worker线程访问过
typedef struct _stritem {
struct _stritem *next;//next指针,用于LRU链表
struct _stritem *prev;//prev指针,用于LRU链表
struct _stritem *h_next;//h_next指针,用于哈希表的冲突链
rel_time_t time;//最后一次访问时间。绝对时间
rel_time_t exptime;//过期失效时间,绝对时间
int nbytes;//本item存放的数据的长度
unsigned short refcount;//本item的引用数
uint8_t nsuffix;//后缀长度 /* length of flags-and-length string */
uint8_t it_flags;//item的属性 /* ITEM_* above */
uint8_t slabs_clsid;/* which slab class we're in */
uint8_t nkey;//键值的长度 /* key length, w/terminating null and padding */
/* this odd type prevents type-punning issues when we do
* the little shuffle to save space when not using CAS. */
union {
uint64_t cas;
char end;
} data[];
/* if it_flags & ITEM_CAS we have 8 bytes CAS */
/* then null-terminated key */
/* then " flags length\r\n" (no terminating null) */
/* then data with terminating \r\n (no terminating null; it's binary!) */
} item;
struct slabclass:
typedef struct {
unsigned int size; /* sizes of items */
unsigned int perslab; /* how many items per slab */
void *slots; /* list of item ptrs */
unsigned int sl_curr; /* total free items in list */
void *end_page_ptr; /* pointer to next free item at end of page, or 0 */
unsigned int end_page_free; /* number of items remaining at end of last alloced page */
unsigned int slabs; /* how many slabs were allocated for this class */
void **slab_list; /* array of slab pointers */
unsigned int list_size; /* size of prev array */
unsigned int killing; /* index+1 of dying slab, or zero if none */
size_t requested; /* The number of requested bytes */
} slabclass_t;
item_link_q:
将item插入到LRU队列的头部
//将item插入到LRU队列的头部
static void item_link_q(item *it) { /* item is the new head */
item **head, **tail;
assert(it->slabs_clsid < LARGEST_ID); assert((it->it_flags & ITEM_SLABBED) == 0);
head = &heads[it->slabs_clsid];
tail = &tails[it->slabs_clsid];
assert(it != *head);
assert((*head && *tail) || (*head == 0 && *tail == 0));
//头插法插入该item
it->prev = 0;
it->next = *head;
if (it->next) it->next->prev = it;
*head = it;//该item作为对应链表的第一个节点
//如果该item是对应id上的第一个item,那么还会被认为是该id链上的最后一个item
//因为在head那里使用头插法,所以第一个插入的item,到了后面确实成了最后一个item
if (*tail == 0) *tail = it;
sizes[it->slabs_clsid]++;//个数加一
return;
}
item_unlink_q:
将item从对应的LRU队列中删除
static void item_unlink_q(item *it) {
item **head, **tail;
assert(it->slabs_clsid < LARGEST_ID); head = &heads[it->slabs_clsid];
tail = &tails[it->slabs_clsid];
if (*head == it) {//链表上的第一个节点
assert(it->prev == 0);
*head = it->next;
}
if (*tail == it) {//链表上的最后一个节点
assert(it->next == 0);
*tail = it->prev;
}
assert(it->next != it);
assert(it->prev != it);
//把item的前驱节点和后驱节点连接起来
if (it->next) it->next->prev = it->prev;
if (it->prev) it->prev->next = it->next;
sizes[it->slabs_clsid]--;//个数减一
return;
}
do_item_update:
更新item
#define ITEM_UPDATE_INTERVAL 60 //更新频率为60秒
void do_item_update(item *it) {
//下面的代码可以看到update操作是耗时的。如果这个item频繁被访问,
//那么会导致过多的update,过多的一系列费时操作。此时更新间隔就应运而生
//了。如果上一次的访问时间(也可以说是update时间)距离现在(current_time)
//还在更新间隔内的,就不更新。超出了才更新。
if (it->time < current_time - ITEM_UPDATE_INTERVAL) { mutex_lock(&cache_lock); if ((it->it_flags & ITEM_LINKED) != 0) {
item_unlink_q(it);//从LRU队列中删除
it->time = current_time;//更新访问时间
item_link_q(it);//插入到LRU队列的头部
}
mutex_unlock(&cache_lock);
}
}
do_slabs_newslab:
分配一个新的 slab
static int do_slabs_newslab(const unsigned int id) {
slabclass_t *p = &slabclass[id];
int len = settings.slab_reassign ? settings.item_size_max
: p->size * p->perslab;
char *ptr;
if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
(grow_slab_list(id) == 0) ||
((ptr = memory_allocate((size_t)len)) == 0)) {
MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
return 0;
}
memset(ptr, 0, (size_t)len);
p->end_page_ptr = ptr;
p->end_page_free = p->perslab;
p->slab_list[p->slabs++] = ptr;
mem_malloced += len;
MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);
return 1;
}
do_slabs_alloc:
从指定的 slabclass, 即 slabclass[id], 分配大小为 size 的内存块供申请者使用.
分配的原则是, 优先从 slots 指向的空闲链表中分配, 空闲链表没有, 才从 slab 中分配一个空闲的 chunk.
static void *do_slabs_alloc(const size_t size, unsigned int id) {
slabclass_t *p;
void *ret = NULL;
item *it = NULL;
if (id < POWER_SMALLEST || id > power_largest) {
MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
return NULL;
}
p = &slabclass[id];
assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);
/* 如果不使用 slab 机制, 则直接从操作系统分配 */
#ifdef USE_SYSTEM_MALLOC
if (mem_limit && mem_malloced + size > mem_limit) {
MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
return 0;
}
mem_malloced += size;
ret = malloc(size);
MEMCACHED_SLABS_ALLOCATE(size, id, 0, ret);
return ret;
#endif
/* fail unless we have space at the end of a recently allocated page,
we have something on our freelist, or we could allocate a new page */
/* 确保最后一个slab 有空闲的chunk */
if (! (p->end_page_ptr != 0 || p->sl_curr != 0 ||
do_slabs_newslab(id) != 0)) {
/* We don't have more memory available */
ret = NULL;
} else if (p->sl_curr != 0) {
/* 从空闲list分配一个item */
/* return off our freelist */
it = (item *)p->slots;
p->slots = it->next;
if (it->next) it->next->prev = 0;
p->sl_curr--;
ret = (void *)it;
} else {
/* 从最后一个slab中分配一个空闲chunk */
/* if we recently allocated a whole page, return from that */
assert(p->end_page_ptr != NULL);
ret = p->end_page_ptr;
if (--p->end_page_free != 0) {
p->end_page_ptr = ((caddr_t)p->end_page_ptr) + p->size;
} else {
p->end_page_ptr = 0;
}
}
if (ret) {
p->requested += size;
MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
} else {
MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
}
return ret;
}
do_item_alloc:
从 slab 系统分配一个空闲 item
item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes) {
uint8_t nsuffix;
item *it = NULL;
char suffix[40];
size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
if (settings.use_cas) {
ntotal += sizeof(uint64_t);
}
unsigned int id = slabs_clsid(ntotal);
if (id == 0)
return 0;
mutex_lock(&cache_lock);
/* do a quick check if we have any expired items in the tail.. */
item *search;
rel_time_t oldest_live = settings.oldest_live;
search = tails[id];
if (search != NULL && (refcount_incr(&search->refcount) == 2)) {
/* 先检查 LRU 队列最后一个 item 是否超时, 超时的话就把这个 item 分配给用户 */
if ((search->exptime != 0 && search->exptime < current_time) || (search->time <= oldest_live && oldest_live <= current_time)) { // dead by flush STATS_LOCK(); stats.reclaimed++; STATS_UNLOCK(); itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) {
STATS_LOCK();
stats.expired_unfetched++;
STATS_UNLOCK();
itemstats[id].expired_unfetched++;
}
it = search;
slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
/* 把这个 item 从 LRU 队列和哈希表中移除 */
do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
/* Initialize the item block: */
it->slabs_clsid = 0;
/* 没有超时的 item, 那就尝试从 slabclass 分配, 运气不好的话, 分配失败,
那就把 LRU 队列最后一个 item 剔除, 然后分配给用户 */
} else if ((it = slabs_alloc(ntotal, id)) == NULL) {
if (settings.evict_to_free == 0) {
itemstats[id].outofmemory++;
pthread_mutex_unlock(&cache_lock);
return NULL;
}
itemstats[id].evicted++;
itemstats[id].evicted_time = current_time - search->time;
if (search->exptime != 0)
itemstats[id].evicted_nonzero++;
if ((search->it_flags & ITEM_FETCHED) == 0) {
STATS_LOCK();
stats.evicted_unfetched++;
STATS_UNLOCK();
itemstats[id].evicted_unfetched++;
}
STATS_LOCK();
stats.evictions++;
STATS_UNLOCK();
it = search;
slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
/* 把这个 item 从 LRU 队列和哈希表中移除 */
do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
/* Initialize the item block: */
it->slabs_clsid = 0;
} else {
refcount_decr(&search->refcount);
}
/* LRU 队列是空的, 或者锁住了, 那就只能从 slabclass 分配内存 */
} else {
/* If the LRU is empty or locked, attempt to allocate memory */
it = slabs_alloc(ntotal, id);
if (search != NULL)
refcount_decr(&search->refcount);
}
if (it == NULL) {
itemstats[id].outofmemory++;
/* Last ditch effort. There was a very rare bug which caused
* refcount leaks. We leave this just in case they ever happen again.
* We can reasonably assume no item can stay locked for more than
* three hours, so if we find one in the tail which is that old,
* free it anyway.
*/
if (search != NULL &&
search->refcount != 2 &&
search->time + TAIL_REPAIR_TIME < current_time) { itemstats[id].tailrepairs++; search->refcount = 1;
do_item_unlink_nolock(search, hash(ITEM_key(search), search->nkey, 0));
}
pthread_mutex_unlock(&cache_lock);
return NULL;
}
assert(it->slabs_clsid == 0);
assert(it != heads[id]);
/* 顺便对 item 做一些初始化 */
/* Item initialization can happen outside of the lock; the item's already
* been removed from the slab LRU.
*/
it->refcount = 1; /* the caller will have a reference */
pthread_mutex_unlock(&cache_lock);
it->next = it->prev = it->h_next = 0;
it->slabs_clsid = id;
DEBUG_REFCNT(it, '*');
it->it_flags = settings.use_cas ? ITEM_CAS : 0;
it->nkey = nkey;
it->nbytes = nbytes;
memcpy(ITEM_key(it), key, nkey);
it->exptime = exptime;
memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
it->nsuffix = nsuffix;
return it;
}
do_item_link
形成了一个完成的 item 后, 就要把它放入两个数据结构中, 一是 memcached 的哈希表,
memcached 运行过程中只有一个哈希表, 二是 item 所在的 slabclass 的 LRU 队列.
每个 slabclass 都有一个 LRU 队列
int do_item_link(item *it, const uint32_t hv) {
MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
mutex_lock(&cache_lock);
it->it_flags |= ITEM_LINKED;
it->time = current_time;
STATS_LOCK();
stats.curr_bytes += ITEM_ntotal(it);
stats.curr_items += 1;
stats.total_items += 1;
STATS_UNLOCK();
/* Allocate a new CAS ID on link. */
ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
/* 把 item 放入哈希表 */
assoc_insert(it, hv);
/* 把 item 放入 LRU 队列*/
item_link_q(it);
refcount_incr(&it->refcount);
pthread_mutex_unlock(&cache_lock);
return 1;
}
do_item_unlink
do_item_unlink 与 do_item_unlink 做的工作相反, 把 item 从哈希表和 LRU 队列中删除,
而且还释放掉 item 所占的内存 (其实只是把 item 放到空闲链表中).
void do_item_unlink(item *it, const uint32_t hv) {
MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
mutex_lock(&cache_lock);
if ((it->it_flags & ITEM_LINKED) != 0) {
it->it_flags &= ~ITEM_LINKED;
STATS_LOCK();
stats.curr_bytes -= ITEM_ntotal(it);
stats.curr_items -= 1;
STATS_UNLOCK();
/* 从哈希表中删除 item */
assoc_delete(ITEM_key(it), it->nkey, hv);
/* 从 LRU 队列中删除 item */
item_unlink_q(it);
/* 释放 item 所占的内存 */
do_item_remove(it);
}
pthread_mutex_unlock(&cache_lock);
}
item_get
根据 key 找对应的 item
/** wrapper around assoc_find which does the lazy expiration logic */
item *do_item_get(const char *key, const size_t nkey, const uint32_t hv) {
mutex_lock(&cache_lock);
/* 从哈希表中找 item */
item *it = assoc_find(key, nkey, hv);
if (it != NULL) {
refcount_incr(&it->refcount);
/* Optimization for slab reassignment. prevents popular items from
* jamming in busy wait. Can only do this here to satisfy lock order
* of item_lock, cache_lock, slabs_lock. */
if (slab_rebalance_signal &&
((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) { do_item_unlink_nolock(it, hv); do_item_remove(it); it = NULL; } } pthread_mutex_unlock(&cache_lock); int was_found = 0; if (settings.verbose > 2) {
if (it == NULL) {
fprintf(stderr, "> NOT FOUND %s", key);
} else {
fprintf(stderr, "> FOUND KEY %s", ITEM_key(it));
was_found++;
}
}
/* 找到了, 然后检查是否超时 */
if (it != NULL) {
if (settings.oldest_live != 0 && settings.oldest_live <= current_time && it->time <= settings.oldest_live) { do_item_unlink(it, hv); do_item_remove(it); it = NULL; if (was_found) { fprintf(stderr, " -nuked by flush"); } } else if (it->exptime != 0 && it->exptime <= current_time) { do_item_unlink(it, hv); do_item_remove(it); it = NULL; if (was_found) { fprintf(stderr, " -nuked by expire"); } } else { it->it_flags |= ITEM_FETCHED;
DEBUG_REFCNT(it, '+');
}
}
if (settings.verbose > 2)
fprintf(stderr, "\n");
return it;
}
slabs_clsid:
根据大小判定slab class
unsigned int slabs_clsid(const size_t size) {//返回slabclass索引下标值
int res = POWER_SMALLEST;//res的初始值为1
//返回0表示查找失败,因为slabclass数组中,第一个元素是没有使用的
if (size == 0)
return 0;
//因为slabclass数组中各个元素能分配的item大小是升序的
//所以从小到大直接判断即可在数组找到最小但又能满足的元素
while (size > slabclass[res].size)
if (res++ == power_largest) /* won't fit in the biggest slab */
return 0;
return res;
}
Links:
http://blog.csdn.net/luotuo44/article/details/42963793