Redis 字典 API 的性能调优秘籍

Redis 字典简介

Redis 字典（dict）是 Redis 中最基本的数据结构之一，它实现了一个高效的键值对存储。在 Redis 中，许多数据结构如数据库、哈希表等都是基于字典来实现的。

字典由 dict 结构体表示，其定义如下：

typedef struct dict {
    dictType *type;
    void *privdata;
    dictht ht[2];
    long rehashidx; /* rehashing not in progress if rehashidx == -1 */
    unsigned long iterators; /* number of iterators currently running */
} dict;

type 字段指向一个 dictType 结构体，该结构体定义了针对特定类型键值对的操作函数，如计算哈希值、比较键等。

typedef struct dictType {
    unsigned int (*hashFunction)(const void *key);
    void *(*keyDup)(void *privdata, const void *key);
    void *(*valDup)(void *privdata, const void *obj);
    int (*keyCompare)(void *privdata, const void *key1, const void *key2);
    void (*keyDestructor)(void *privdata, void *key);
    void (*valDestructor)(void *privdata, void *obj);
} dictType;

privdata 是一个指针，用于传递给 dictType 中函数的私有数据。
ht 是一个包含两个 dictht 结构体的数组，dictht 表示哈希表。

typedef struct dictht {
    dictEntry **table;
    unsigned long size;
    unsigned long sizemask;
    unsigned long used;
} dictht;

- `table` 是一个指向 `dictEntry` 指针数组的指针，`dictEntry` 代表哈希表中的一个键值对。

typedef struct dictEntry {
    void *key;
    union {
        void *val;
        uint64_t u64;
        int64_t s64;
        double d;
    } v;
    struct dictEntry *next;
} dictEntry;

- `size` 表示哈希表的大小。
- `sizemask` 用于计算哈希值的掩码，通常为 `size - 1`。
- `used` 表示哈希表中已使用的节点数。

rehashidx 用于记录 rehash 过程的进度，当 rehashidx == -1 时，表示没有进行 rehash。
iterators 记录当前正在运行的迭代器数量。

Redis 字典 API 概述

Redis 提供了一系列操作字典的 API，主要包括以下几类：

创建与销毁
- dictCreate：用于创建一个新的字典。它接受一个 dictType 指针和一个私有数据指针作为参数，并初始化字典的各个字段。
```
dict *dictCreate(dictType *type, void *privDataPtr) {
    dict *d = zmalloc(sizeof(*d));
    _dictInit(d, type, privDataPtr);
    return d;
}
```
- dictRelease：用于释放一个字典及其所有键值对。它会遍历字典，调用键值对的析构函数（如果有），并释放相关内存。
```
void dictRelease(dict *d) {
    _dictClear(d, 1);
    zfree(d);
}
```

添加与查找

dictAdd：用于向字典中添加一个新的键值对。它首先计算键的哈希值，然后根据哈希值找到对应的哈希表槽位。如果该槽位为空，则直接插入；如果不为空，则通过链表解决冲突。

int dictAdd(dict *d, void *key, void *val) {
    dictEntry *entry = dictAddRaw(d, key);
    if (!entry) return DICT_ERR;
    dictSetVal(d, entry, val);
    return DICT_OK;
}

dictFind：用于在字典中查找一个键对应的值。它同样计算键的哈希值，找到对应的槽位，然后在链表中查找键。

dictEntry *dictFind(dict *d, const void *key) {
    dictEntry *he;
    unsigned int h, idx, table;

    if (d->ht[0].used == 0) return NULL; /* dict is empty */
    h = dictHashKey(d, key);
    for (table = 0; table <= 1; table++) {
        idx = h & d->ht[table].sizemask;
        he = d->ht[table].table[idx];
        while(he) {
            if (dictCompareKeys(d, key, he->key))
                return he;
            he = he->next;
        }
        if (d->rehashidx == -1) break;
    }
    return NULL;
}

删除

dictDelete：用于从字典中删除一个键值对。它找到要删除的键值对后，调整链表结构，并调用键值对的析构函数（如果有）。

int dictDelete(dict *d, const void *key) {
    dictEntry *he, *prevHe;
    unsigned int h, idx, table;

    if (d->ht[0].used == 0) return DICT_ERR;
    h = dictHashKey(d, key);
    for (table = 0; table <= 1; table++) {
        idx = h & d->ht[table].sizemask;
        he = d->ht[table].table[idx];
        prevHe = NULL;
        while(he) {
            if (dictCompareKeys(d, key, he->key)) {
                if (prevHe)
                    prevHe->next = he->next;
                else
                    d->ht[table].table[idx] = he->next;
                dictFreeEntry(d, he);
                d->ht[table].used--;
                return DICT_OK;
            }
            prevHe = he;
            he = he->next;
        }
        if (d->rehashidx == -1) break;
    }
    return DICT_ERR;
}

迭代

dictGetIterator：用于获取一个字典迭代器。迭代器可以按顺序遍历字典中的所有键值对。

dictIterator *dictGetIterator(dict *d) {
    dictIterator *iter = zmalloc(sizeof(*iter));
    iter->d = d;
    iter->table = 0;
    iter->index = -1;
    iter->safe = 0;
    iter->entry = NULL;
    iter->nextEntry = NULL;
    return iter;
}

dictNext：用于移动迭代器到下一个键值对。

dictEntry *dictNext(dictIterator *iter) {
    while (1) {
        if (iter->entry == NULL) {
            if (iter->index == -1) {
                iter->table = 0;
                iter->index = 0;
            } else {
                iter->index++;
                if (iter->index >= (signed)iter->d->ht[iter->table].size) {
                    if (iter->d->rehashidx != -1) {
                        iter->table = 1;
                        iter->index = 0;
                    } else {
                        break;
                    }
                }
            }
            iter->entry = iter->d->ht[iter->table].table[iter->index];
        } else {
            iter->entry = iter->nextEntry;
        }
        if (iter->entry) {
            iter->nextEntry = iter->entry->next;
            return iter->entry;
        }
    }
    return NULL;
}

dictReleaseIterator：用于释放迭代器。

void dictReleaseIterator(dictIterator *iter) {
    zfree(iter);
}

性能调优秘籍

合理设置初始容量

原理：在创建字典时，合理设置初始容量可以减少 rehash 的次数。如果初始容量过小，随着键值对的不断添加，很快就会达到哈希表的负载因子，触发 rehash 操作，这会消耗额外的 CPU 和内存资源。反之，如果初始容量过大，会浪费内存空间。
示例：假设我们要创建一个存储用户信息的字典，预计有 1000 个用户。

#include <stdio.h>
#include <stdlib.h>
#include "dict.h"

// 定义键值对的操作函数
static unsigned int myHashFunction(const void *key) {
    // 简单的哈希函数，假设键是字符串
    const char *str = (const char *)key;
    unsigned int hash = 0;
    while (*str) {
        hash = (hash << 5) + hash + *str++;
    }
    return hash;
}

static void *myKeyDup(void *privdata, const void *key) {
    // 复制键，假设键是字符串
    char *newKey = (char *)malloc(strlen((const char *)key) + 1);
    strcpy(newKey, (const char *)key);
    return newKey;
}

static void *myValDup(void *privdata, const void *val) {
    // 复制值，假设值是字符串
    char *newVal = (char *)malloc(strlen((const char *)val) + 1);
    strcpy(newVal, (const char *)val);
    return newVal;
}

static int myKeyCompare(void *privdata, const void *key1, const void *key2) {
    // 比较键，假设键是字符串
    return strcmp((const char *)key1, (const char *)key2) == 0;
}

static void myKeyDestructor(void *privdata, void *key) {
    // 释放键，假设键是字符串
    free(key);
}

static void myValDestructor(void *privdata, void *val) {
    // 释放值，假设值是字符串
    free(val);
}

int main() {
    dictType myType = {
       .hashFunction = myHashFunction,
       .keyDup = myKeyDup,
       .valDup = myValDup,
       .keyCompare = myKeyCompare,
       .keyDestructor = myKeyDestructor,
       .valDestructor = myValDestructor
    };

    // 创建字典，设置初始容量为 1024
    dict *userDict = dictCreate(&myType, NULL);
    if (!userDict) {
        printf("Failed to create dictionary\n");
        return 1;
    }

    // 添加用户信息
    for (int i = 0; i < 1000; i++) {
        char key[20];
        char val[20];
        sprintf(key, "user%d", i);
        sprintf(val, "info%d", i);
        if (dictAdd(userDict, key, val) != DICT_OK) {
            printf("Failed to add user %d\n", i);
        }
    }

    // 查找用户信息
    char searchKey[20];
    sprintf(searchKey, "user500");
    dictEntry *entry = dictFind(userDict, searchKey);
    if (entry) {
        printf("Found user: %s, info: %s\n", (char *)entry->key, (char *)entry->v.val);
    } else {
        printf("User not found\n");
    }

    // 释放字典
    dictRelease(userDict);
    return 0;
}

注意事项：在实际应用中，要根据数据的增长趋势来预估初始容量。如果数据量增长较为稳定，可以适当设置较大的初始容量；如果数据量增长不确定，可以采用动态调整的策略，但要注意 rehash 带来的性能开销。

选择合适的哈希函数
- 原理：哈希函数的质量直接影响字典的性能。一个好的哈希函数应该能够将键均匀地分布在哈希表中，减少哈希冲突的发生。如果哈希函数设计不合理，会导致大量的键映射到同一个槽位，形成长链表，从而降低查找、插入和删除的效率。
- 示例：Redis 默认使用的是 MurmurHash2 哈希函数，它在速度和分布均匀性上表现较好。但在某些特定场景下，我们可能需要自定义哈希函数。比如，对于整数类型的键，可以使用更简单高效的哈希函数。
```
static unsigned int intHashFunction(const void *key) {
    // 假设键是整数
    int num = *(int *)key;
    return num ^ (num >> 16);
}
```
- 注意事项：在自定义哈希函数时，要充分考虑键的类型和分布特点，确保哈希函数能够均匀地散列键值对。同时，也要注意哈希函数的计算效率，避免过于复杂的计算导致性能下降。
控制负载因子
- 原理：负载因子是指哈希表中已使用的节点数与哈希表大小的比值。Redis 中默认的负载因子阈值是 1，当负载因子超过这个阈值时，会触发 rehash 操作。通过调整负载因子阈值，可以控制 rehash 的时机。较小的负载因子可以减少哈希冲突，但会浪费更多的内存空间；较大的负载因子可以提高内存利用率，但会增加哈希冲突的概率，降低性能。
- 示例：我们可以通过修改 Redis 源码中的 dictExpandIfNeeded 函数来调整负载因子阈值。假设我们将负载因子阈值调整为 0.75。
```
int dictExpandIfNeeded(dict *d) {
    /* Incremental rehashing already in progress. Return. */
    if (dictIsRehashing(d)) return DICT_OK;

    /* If the hash table is empty expand it to the initial size. */
    if (d->ht[0].size == 0) return dictExpand(d, DICT_HT_INITIAL_SIZE);

    /* If we reached the 1:1 ratio, and we are allowed to resize the hash
     * table (global setting) or we should avoid it but the ratio between
     * elements/buckets is over the "safe" threshold, we resize doubling
     * the number of buckets. */
    if (d->ht[0].used >= d->ht[0].size * 0.75) {
        return dictExpand(d, d->ht[0].used * 2);
    }
    return DICT_OK;
}
```
- 注意事项：调整负载因子阈值需要谨慎，要综合考虑应用场景对内存和性能的需求。在内存敏感的场景下，可以适当提高负载因子；在性能要求较高的场景下，应降低负载因子。
避免频繁 rehash
- 原理：rehash 操作是一个比较耗时的过程，它需要重新分配内存，重新计算键的哈希值并重新插入键值对。频繁的 rehash 会导致系统性能下降。
- 策略：
  - 预分配内存：在数据量增长可预测的情况下，提前分配足够的内存，避免频繁的内存重新分配。
  - 分批 rehash：Redis 采用了渐进式 rehash 的策略，即每次操作时（如插入、删除、查找），都会顺带处理一些 rehash 工作，而不是一次性完成所有 rehash。我们可以借鉴这种思想，在应用层实现类似的分批处理机制。
- 示例：假设我们在应用层实现一个简单的分批 rehash 机制。
```
void batchRehash(dict *d) {
    const int batchSize = 10;
    if (dictIsRehashing(d)) {
        for (int i = 0; i < batchSize; i++) {
            if (!dictRehash(d, 1)) {
                break;
            }
        }
    }
}
```
- 注意事项：在实现分批 rehash 时，要注意对 rehash 进度的管理，确保所有键值对最终都能被正确 rehash。同时，也要避免在高并发场景下由于分批 rehash 导致的数据一致性问题。

优化迭代操作

原理：迭代操作在遍历字典时，如果处理不当，可能会影响性能。例如，在迭代过程中进行插入或删除操作，可能会破坏迭代器的状态，导致未定义行为。
优化方法：
- 只读迭代：如果只是遍历字典读取数据，使用普通的迭代器即可。在迭代过程中，尽量避免对字典进行修改操作。
- 读写迭代：如果需要在迭代过程中进行修改操作，可以使用安全迭代器（Redis 中未提供，需自行实现）。安全迭代器在迭代开始时，会记录字典的状态，在迭代过程中，对字典的修改会记录在一个临时结构中，迭代结束后，再将修改应用到字典中。
示例：假设我们实现一个简单的安全迭代器。

typedef struct safeDictIterator {
    dict *d;
    dictEntry **table;
    unsigned long size;
    unsigned long index;
    dict *tempDict;
} safeDictIterator;

safeDictIterator *safeDictGetIterator(dict *d) {
    safeDictIterator *iter = zmalloc(sizeof(*iter));
    iter->d = d;
    iter->table = d->ht[0].table;
    iter->size = d->ht[0].size;
    iter->index = 0;
    iter->tempDict = dictCreate(d->type, d->privdata);
    return iter;
}

dictEntry *safeDictNext(safeDictIterator *iter) {
    while (iter->index < iter->size) {
        dictEntry *entry = iter->table[iter->index];
        while (entry) {
            dictEntry *next = entry->next;
            // 这里可以对 entry 进行操作
            // 如果要修改，先添加到 tempDict 中
            if (shouldModify(entry)) {
                dictAdd(iter->tempDict, entry->key, newValForEntry(entry));
            }
            entry = next;
        }
        iter->index++;
    }
    // 处理完所有数据后，将 tempDict 中的修改应用到原字典
    dictIterator *tempIter = dictGetIterator(iter->tempDict);
    dictEntry *tempEntry;
    while ((tempEntry = dictNext(tempIter))) {
        dictReplace(iter->d, tempEntry->key, tempEntry->v.val);
    }
    dictReleaseIterator(tempIter);
    dictRelease(iter->tempDict);
    return NULL;
}

void safeDictReleaseIterator(safeDictIterator *iter) {
    zfree(iter);
}

注意事项：安全迭代器的实现较为复杂，需要注意内存管理和数据一致性问题。在使用安全迭代器时，要评估性能开销，确保其在可接受范围内。

使用哈希表的批量操作
- 原理：批量操作可以减少函数调用开销和内存分配次数。例如，批量插入多个键值对比单个插入多个键值对效率更高。
- 示例：假设我们要批量插入 1000 个键值对。
```
void batchInsert(dict *d, void **keys, void **vals, int count) {
    for (int i = 0; i < count; i++) {
        dictAdd(d, keys[i], vals[i]);
    }
}
```
- 注意事项：在进行批量操作时，要注意内存使用情况，避免一次性分配过多内存导致内存不足。同时，也要考虑批量操作对系统资源的占用，合理控制批量大小。

总结性能调优要点

容量设置：根据数据预估合理设置初始容量，减少 rehash 次数。
哈希函数：选择或设计合适的哈希函数，确保键值对均匀分布。
负载因子：根据应用场景调整负载因子阈值，平衡内存与性能。
rehash 控制：通过预分配内存和分批 rehash 避免频繁 rehash。
迭代优化：根据读写需求选择合适的迭代方式，避免影响性能。
批量操作：利用批量操作减少函数调用和内存分配开销。

通过以上性能调优秘籍，可以显著提升 Redis 字典 API 的性能，使其更好地满足各种应用场景的需求。在实际应用中，要根据具体情况灵活运用这些方法，不断优化系统性能。