memcached code analyse

名词

Slab
用于表示存储的最大size数据，仅仅只是用于定义（通俗的讲就是表示可以存储数据大小的范围）。默认情况下，前后两个slab表示存储的size以1.25倍进行增长。例如slab1为96字节，slab2为120字节

Page
分配给Slab的内存空间，默认为1MB。分给Slab后将会根据slab的大小切割成chunk

Chunk
用于缓存记录的内存空间

Slab calss
特定大小的Chunk集合

增长因子

增长因子就是相邻两个chunk之间的增长倍数。 memcache默认是1.25。

2倍 chunk size增长。

struct item:

#define ITEM_LINKED 1 //该item插入到LRU队列了
#define ITEM_CAS 2 //该item使用CAS
#define ITEM_SLABBED 4 //该item还在slab的空闲队列里面，没有分配出去
#define ITEM_FETCHED 8 //该item插入到LRU队列后，被worker线程访问过

typedef struct _stritem {
    struct _stritem *next;//next指针，用于LRU链表
    struct _stritem *prev;//prev指针，用于LRU链表
    struct _stritem *h_next;//h_next指针，用于哈希表的冲突链
    rel_time_t      time;//最后一次访问时间。绝对时间
    rel_time_t      exptime;//过期失效时间，绝对时间
    int             nbytes;//本item存放的数据的长度   
    unsigned short  refcount;//本item的引用数
    uint8_t         nsuffix;//后缀长度    /* length of flags-and-length string */
    uint8_t         it_flags;//item的属性   /* ITEM_* above */
    uint8_t         slabs_clsid;/* which slab class we're in */
    uint8_t         nkey;//键值的长度       /* key length, w/terminating null and padding */
    /* this odd type prevents type-punning issues when we do
     * the little shuffle to save space when not using CAS. */
    union {
        uint64_t cas;
        char end;
    } data[];
    /* if it_flags & ITEM_CAS we have 8 bytes CAS */
    /* then null-terminated key */
    /* then " flags length\r\n" (no terminating null) */
    /* then data with terminating \r\n (no terminating null; it's binary!) */
} item;

struct slabclass:

typedef struct {
    unsigned int size;      /* sizes of items */
    unsigned int perslab;   /* how many items per slab */

    void *slots;           /* list of item ptrs */
    unsigned int sl_curr;   /* total free items in list */

    void *end_page_ptr;         /* pointer to next free item at end of page, or 0 */
    unsigned int end_page_free; /* number of items remaining at end of last alloced page */

    unsigned int slabs;     /* how many slabs were allocated for this class */

    void **slab_list;       /* array of slab pointers */
    unsigned int list_size; /* size of prev array */

    unsigned int killing;  /* index+1 of dying slab, or zero if none */
    size_t requested; /* The number of requested bytes */
} slabclass_t;

item_link_q:

将item插入到LRU队列的头部

//将item插入到LRU队列的头部
static void item_link_q(item *it) { /* item is the new head */
    item **head, **tail;
    assert(it->slabs_clsid < LARGEST_ID); assert((it->it_flags & ITEM_SLABBED) == 0);

    head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];
    assert(it != *head);
    assert((*head && *tail) || (*head == 0 && *tail == 0));

	//头插法插入该item
    it->prev = 0;
    it->next = *head;
    if (it->next) it->next->prev = it;
    *head = it;//该item作为对应链表的第一个节点

	//如果该item是对应id上的第一个item，那么还会被认为是该id链上的最后一个item
	//因为在head那里使用头插法，所以第一个插入的item，到了后面确实成了最后一个item
    if (*tail == 0) *tail = it;
    sizes[it->slabs_clsid]++;//个数加一
    return;
}

item_unlink_q:

将item从对应的LRU队列中删除

static void item_unlink_q(item *it) {
    item **head, **tail;
    assert(it->slabs_clsid < LARGEST_ID); head = &heads[it->slabs_clsid];
    tail = &tails[it->slabs_clsid];

	
    if (*head == it) {//链表上的第一个节点
        assert(it->prev == 0);
        *head = it->next;
    }
    if (*tail == it) {//链表上的最后一个节点
        assert(it->next == 0);
        *tail = it->prev;
    }
    assert(it->next != it);
    assert(it->prev != it);

	//把item的前驱节点和后驱节点连接起来
    if (it->next) it->next->prev = it->prev;
    if (it->prev) it->prev->next = it->next;
    sizes[it->slabs_clsid]--;//个数减一
    return;
}

do_item_update:

更新item

#define ITEM_UPDATE_INTERVAL 60 //更新频率为60秒

void do_item_update(item *it) {
	//下面的代码可以看到update操作是耗时的。如果这个item频繁被访问，
	//那么会导致过多的update，过多的一系列费时操作。此时更新间隔就应运而生
	//了。如果上一次的访问时间(也可以说是update时间)距离现在(current_time)
	//还在更新间隔内的，就不更新。超出了才更新。
    if (it->time < current_time - ITEM_UPDATE_INTERVAL) { mutex_lock(&cache_lock); if ((it->it_flags & ITEM_LINKED) != 0) {
            item_unlink_q(it);//从LRU队列中删除
            it->time = current_time;//更新访问时间
            item_link_q(it);//插入到LRU队列的头部
        }
        mutex_unlock(&cache_lock);
    }
}

do_slabs_newslab:

分配一个新的 slab

static int do_slabs_newslab(const unsigned int id) {
    slabclass_t *p = &slabclass[id];
    int len = settings.slab_reassign ? settings.item_size_max
        : p->size * p->perslab;
    char *ptr;

    if ((mem_limit && mem_malloced + len > mem_limit && p->slabs > 0) ||
        (grow_slab_list(id) == 0) ||
        ((ptr = memory_allocate((size_t)len)) == 0)) {

        MEMCACHED_SLABS_SLABCLASS_ALLOCATE_FAILED(id);
        return 0;
    }

    memset(ptr, 0, (size_t)len);
    p->end_page_ptr = ptr;
    p->end_page_free = p->perslab;

    p->slab_list[p->slabs++] = ptr;
    mem_malloced += len;
    MEMCACHED_SLABS_SLABCLASS_ALLOCATE(id);

    return 1;
}

do_slabs_alloc:

从指定的 slabclass, 即 slabclass[id], 分配大小为 size 的内存块供申请者使用.

分配的原则是, 优先从 slots 指向的空闲链表中分配, 空闲链表没有, 才从 slab 中分配一个空闲的 chunk.

static void *do_slabs_alloc(const size_t size, unsigned int id) {
    slabclass_t *p;
    void *ret = NULL;
    item *it = NULL;

    if (id < POWER_SMALLEST || id > power_largest) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, 0);
        return NULL;
    }

    p = &slabclass[id];
    assert(p->sl_curr == 0 || ((item *)p->slots)->slabs_clsid == 0);

 /* 如果不使用 slab 机制, 则直接从操作系统分配 */
#ifdef USE_SYSTEM_MALLOC
    if (mem_limit && mem_malloced + size > mem_limit) {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
        return 0;
    }
    mem_malloced += size;
    ret = malloc(size);
    MEMCACHED_SLABS_ALLOCATE(size, id, 0, ret);
    return ret;
#endif

    /* fail unless we have space at the end of a recently allocated page,
       we have something on our freelist, or we could allocate a new page */
 /* 确保最后一个slab 有空闲的chunk */
    if (! (p->end_page_ptr != 0 || p->sl_curr != 0 ||
           do_slabs_newslab(id) != 0)) {
        /* We don't have more memory available */
        ret = NULL;
    } else if (p->sl_curr != 0) {
  /* 从空闲list分配一个item */
        /* return off our freelist */
        it = (item *)p->slots;
        p->slots = it->next;
        if (it->next) it->next->prev = 0;
        p->sl_curr--;
        ret = (void *)it;
    } else {
  /* 从最后一个slab中分配一个空闲chunk */
        /* if we recently allocated a whole page, return from that */
        assert(p->end_page_ptr != NULL);
        ret = p->end_page_ptr;
        if (--p->end_page_free != 0) {
            p->end_page_ptr = ((caddr_t)p->end_page_ptr) + p->size;
        } else {
            p->end_page_ptr = 0;
        }
    }

    if (ret) {
        p->requested += size;
        MEMCACHED_SLABS_ALLOCATE(size, id, p->size, ret);
    } else {
        MEMCACHED_SLABS_ALLOCATE_FAILED(size, id);
    }

    return ret;
}

do_item_alloc:

从 slab 系统分配一个空闲 item

item *do_item_alloc(char *key, const size_t nkey, const int flags, const rel_time_t exptime, const int nbytes) {
    uint8_t nsuffix;
    item *it = NULL;
    char suffix[40];
    size_t ntotal = item_make_header(nkey + 1, flags, nbytes, suffix, &nsuffix);
    if (settings.use_cas) {
        ntotal += sizeof(uint64_t);
    }

    unsigned int id = slabs_clsid(ntotal);
    if (id == 0)
        return 0;

    mutex_lock(&cache_lock);
    /* do a quick check if we have any expired items in the tail.. */
    item *search;
    rel_time_t oldest_live = settings.oldest_live;

    search = tails[id];
    if (search != NULL && (refcount_incr(&search->refcount) == 2)) {
  /* 先检查 LRU 队列最后一个 item 是否超时, 超时的话就把这个 item 分配给用户 */
        if ((search->exptime != 0 && search->exptime < current_time) || (search->time <= oldest_live && oldest_live <= current_time)) { // dead by flush STATS_LOCK(); stats.reclaimed++; STATS_UNLOCK(); itemstats[id].reclaimed++; if ((search->it_flags & ITEM_FETCHED) == 0) {
                STATS_LOCK();
                stats.expired_unfetched++;
                STATS_UNLOCK();
                itemstats[id].expired_unfetched++;
            }
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
   /* 把这个 item 从 LRU 队列和哈希表中移除 */
            do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
            /* Initialize the item block: */
            it->slabs_clsid = 0;
  /* 没有超时的 item, 那就尝试从 slabclass 分配, 运气不好的话, 分配失败,
     那就把 LRU 队列最后一个 item 剔除, 然后分配给用户 */
        } else if ((it = slabs_alloc(ntotal, id)) == NULL) {
            if (settings.evict_to_free == 0) {
                itemstats[id].outofmemory++;
                pthread_mutex_unlock(&cache_lock);
                return NULL;
            }
            itemstats[id].evicted++;
            itemstats[id].evicted_time = current_time - search->time;
            if (search->exptime != 0)
                itemstats[id].evicted_nonzero++;
            if ((search->it_flags & ITEM_FETCHED) == 0) {
                STATS_LOCK();
                stats.evicted_unfetched++;
                STATS_UNLOCK();
                itemstats[id].evicted_unfetched++;
            }
            STATS_LOCK();
            stats.evictions++;
            STATS_UNLOCK();
            it = search;
            slabs_adjust_mem_requested(it->slabs_clsid, ITEM_ntotal(it), ntotal);
   /* 把这个 item 从 LRU 队列和哈希表中移除 */
            do_item_unlink_nolock(it, hash(ITEM_key(it), it->nkey, 0));
            /* Initialize the item block: */
            it->slabs_clsid = 0;
        } else {
            refcount_decr(&search->refcount);
        }
 /* LRU 队列是空的, 或者锁住了, 那就只能从 slabclass 分配内存 */
    } else {
        /* If the LRU is empty or locked, attempt to allocate memory */
        it = slabs_alloc(ntotal, id);
        if (search != NULL)
            refcount_decr(&search->refcount);
    }

    if (it == NULL) {
        itemstats[id].outofmemory++;
        /* Last ditch effort. There was a very rare bug which caused
         * refcount leaks. We leave this just in case they ever happen again.
         * We can reasonably assume no item can stay locked for more than
         * three hours, so if we find one in the tail which is that old,
         * free it anyway.
         */
        if (search != NULL &&
            search->refcount != 2 &&
            search->time + TAIL_REPAIR_TIME < current_time) { itemstats[id].tailrepairs++; search->refcount = 1;
            do_item_unlink_nolock(search, hash(ITEM_key(search), search->nkey, 0));
        }
        pthread_mutex_unlock(&cache_lock);
        return NULL;
    }

    assert(it->slabs_clsid == 0);
    assert(it != heads[id]);

 /* 顺便对 item 做一些初始化 */
    /* Item initialization can happen outside of the lock; the item's already
     * been removed from the slab LRU.
     */
    it->refcount = 1;     /* the caller will have a reference */
    pthread_mutex_unlock(&cache_lock);
    it->next = it->prev = it->h_next = 0;
    it->slabs_clsid = id;

    DEBUG_REFCNT(it, '*');
    it->it_flags = settings.use_cas ? ITEM_CAS : 0;
    it->nkey = nkey;
    it->nbytes = nbytes;
    memcpy(ITEM_key(it), key, nkey);
    it->exptime = exptime;
    memcpy(ITEM_suffix(it), suffix, (size_t)nsuffix);
    it->nsuffix = nsuffix;
    return it;
}

do_item_link

形成了一个完成的 item 后, 就要把它放入两个数据结构中, 一是 memcached 的哈希表,
memcached 运行过程中只有一个哈希表, 二是 item 所在的 slabclass 的 LRU 队列.
每个 slabclass 都有一个 LRU 队列

int do_item_link(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_LINK(ITEM_key(it), it->nkey, it->nbytes);
    assert((it->it_flags & (ITEM_LINKED|ITEM_SLABBED)) == 0);
    mutex_lock(&cache_lock);
    it->it_flags |= ITEM_LINKED;
    it->time = current_time;

    STATS_LOCK();
    stats.curr_bytes += ITEM_ntotal(it);
    stats.curr_items += 1;
    stats.total_items += 1;
    STATS_UNLOCK();

    /* Allocate a new CAS ID on link. */
    ITEM_set_cas(it, (settings.use_cas) ? get_cas_id() : 0);
 /* 把 item 放入哈希表 */
    assoc_insert(it, hv);
 /* 把 item 放入 LRU 队列*/
    item_link_q(it);
    refcount_incr(&it->refcount);
    pthread_mutex_unlock(&cache_lock);

    return 1;
}

do_item_unlink

do_item_unlink 与 do_item_unlink 做的工作相反, 把 item 从哈希表和 LRU 队列中删除,
而且还释放掉 item 所占的内存 (其实只是把 item 放到空闲链表中).

void do_item_unlink(item *it, const uint32_t hv) {
    MEMCACHED_ITEM_UNLINK(ITEM_key(it), it->nkey, it->nbytes);
    mutex_lock(&cache_lock);
    if ((it->it_flags & ITEM_LINKED) != 0) {
        it->it_flags &= ~ITEM_LINKED;
        STATS_LOCK();
        stats.curr_bytes -= ITEM_ntotal(it);
        stats.curr_items -= 1;
        STATS_UNLOCK();
  /* 从哈希表中删除 item */
        assoc_delete(ITEM_key(it), it->nkey, hv);
  /* 从 LRU 队列中删除 item */
        item_unlink_q(it);
  /* 释放 item 所占的内存 */
        do_item_remove(it);
    }
    pthread_mutex_unlock(&cache_lock);
}

item_get

根据 key 找对应的 item

/** wrapper around assoc_find which does the lazy expiration logic */
item *do_item_get(const char *key, const size_t nkey, const uint32_t hv) {
    mutex_lock(&cache_lock);
 /* 从哈希表中找 item */
    item *it = assoc_find(key, nkey, hv);
    if (it != NULL) {
        refcount_incr(&it->refcount);
        /* Optimization for slab reassignment. prevents popular items from
         * jamming in busy wait. Can only do this here to satisfy lock order
         * of item_lock, cache_lock, slabs_lock. */
        if (slab_rebalance_signal &&
            ((void *)it >= slab_rebal.slab_start && (void *)it < slab_rebal.slab_end)) { do_item_unlink_nolock(it, hv); do_item_remove(it); it = NULL; } } pthread_mutex_unlock(&cache_lock); int was_found = 0; if (settings.verbose > 2) {
        if (it == NULL) {
            fprintf(stderr, "> NOT FOUND %s", key);
        } else {
            fprintf(stderr, "> FOUND KEY %s", ITEM_key(it));
            was_found++;
        }
    }

 /* 找到了, 然后检查是否超时 */
    if (it != NULL) {
        if (settings.oldest_live != 0 && settings.oldest_live <= current_time && it->time <= settings.oldest_live) { do_item_unlink(it, hv); do_item_remove(it); it = NULL; if (was_found) { fprintf(stderr, " -nuked by flush"); } } else if (it->exptime != 0 && it->exptime <= current_time) { do_item_unlink(it, hv); do_item_remove(it); it = NULL; if (was_found) { fprintf(stderr, " -nuked by expire"); } } else { it->it_flags |= ITEM_FETCHED;
            DEBUG_REFCNT(it, '+');
        }
    }

    if (settings.verbose > 2)
        fprintf(stderr, "\n");

    return it;
}

slabs_clsid：

根据大小判定slab class

unsigned int slabs_clsid(const size_t size) {//返回slabclass索引下标值
    int res = POWER_SMALLEST;//res的初始值为1

	//返回0表示查找失败，因为slabclass数组中，第一个元素是没有使用的
    if (size == 0)
        return 0;
	
	//因为slabclass数组中各个元素能分配的item大小是升序的
	//所以从小到大直接判断即可在数组找到最小但又能满足的元素
    while (size > slabclass[res].size)
        if (res++ == power_largest)     /* won't fit in the biggest slab */
            return 0;
    return res;
}

Links:
http://blog.csdn.net/luotuo44/article/details/42963793