IPVS调度算法之LBLC

最新推荐文章于 2022-05-27 13:38:29 发布

redwingz

最新推荐文章于 2022-05-27 13:38:29 发布

阅读量1.1k

点赞数

分类专栏：负载均衡　文章标签：ipvs

本文链接：https://blog.csdn.net/sinat_20184565/article/details/100865954

版权

负载均衡专栏收录该内容

42 篇文章 10 订阅

订阅专栏

LBLC（Locality-Based Least-Connection）调度算法，是基于LC算法的一个变种：它将LC算法为某个目的IP选出的目的服务器缓存起来，对于后续的连接，如果根据其目的IP能在缓存中找到对应的目的服务器，则继续使用该服务器处理新连接。

调度器注册

LBLC调度器的定义结构为ip_vs_lblc_scheduler，使用函数register_ip_vs_scheduler注册到IPVS的调度器系统中。

/*
 * Scheduler descriptor for LBLC. It carries the scheduler name ("lblc"),
 * an initial reference count of zero, the owning module, its own list
 * linkage, and the three callbacks implemented by this scheduler.
 */
static struct ip_vs_scheduler ip_vs_lblc_scheduler = {
        .name =                 "lblc",
        .refcnt =               ATOMIC_INIT(0),
        .module =               THIS_MODULE,
        .n_list =               LIST_HEAD_INIT(ip_vs_lblc_scheduler.n_list),
        .init_service =         ip_vs_lblc_init_svc,    /* allocates the per-service table */
        .done_service =         ip_vs_lblc_done_svc,    /* tears the per-service table down */
        .schedule =             ip_vs_lblc_schedule,    /* picks a real server for a packet */
};

/*
 * Module entry point.
 *
 * Registers the per-network-namespace operations (ip_vs_lblc_ops) and then
 * the LBLC scheduler itself. If the scheduler cannot be registered, the
 * pernet registration is rolled back so that nothing is left half set up.
 *
 * Returns 0 on success or the negative error code of the step that failed.
 */
static int __init ip_vs_lblc_init(void)
{
        int ret;

        ret = register_pernet_subsys(&ip_vs_lblc_ops);
        if (ret)
                return ret;

        ret = register_ip_vs_scheduler(&ip_vs_lblc_scheduler);
        if (ret)
                unregister_pernet_subsys(&ip_vs_lblc_ops);
        return ret;
}

虚拟服务初始化

如下命令，在添加虚拟服务时，指定使用lblc调度器：

# ipvsadm -A -t 207.175.44.110:80 -s lblc

内核在将虚拟服务（ip_vs_bind_scheduler函数）绑定调度器时，调用调度器的init_service指针函数。对于LBLC调度器，即以下的ip_vs_lblc_init_svc函数。在此函数中，分配一个ip_vs_lblc_table结构作为虚拟服务的调度私有数据（sched_data）。除此之外，启动一个60秒钟（CHECK_EXPIRE_INTERVAL）的定时器，用于垃圾项回收，超时处理函数为ip_vs_lblc_check_expire。

/*
 * init_service callback: build the per-service LBLC state.
 *
 * Allocates an ip_vs_lblc_table, stores it in svc->sched_data, empties every
 * hash bucket, resets the bookkeeping fields and arms the periodic
 * garbage-collection timer (handler: ip_vs_lblc_check_expire).
 *
 * Returns 0 on success, -ENOMEM if the table cannot be allocated.
 */
static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
{
    int i;
    struct ip_vs_lblc_table *tbl;

    /* Allocate the ip_vs_lblc_table for this service
     */
    tbl = kmalloc(sizeof(*tbl), GFP_KERNEL);
    if (tbl == NULL)
            return -ENOMEM;

    svc->sched_data = tbl;

    /* Initialize the hash buckets
     */
    for (i = 0; i < IP_VS_LBLC_TAB_SIZE; i++) {
            INIT_HLIST_HEAD(&tbl->bucket[i]);
    }
    /* kmalloc() does not zero the allocation, so the entry counter must be
     * set explicitly before the expiry timer ever reads it.
     */
    atomic_set(&tbl->entries, 0);
    tbl->max_size = IP_VS_LBLC_TAB_SIZE*16;
    tbl->rover = 0;
    tbl->counter = 1;
    tbl->dead = 0;
    tbl->svc = svc;

    /* Hook periodic timer for garbage collection
     */
    timer_setup(&tbl->periodic_timer, ip_vs_lblc_check_expire, 0);
    mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

    return 0;
}

在删除虚拟服务或者修改虚拟服务所使用的调度器时，内核需在函数ip_vs_unbind_scheduler中解绑调度器，此时如果调度器实现了done_service回调指针函数，将在此函数中被调用。对于LBLC调度器，为以下函数：

/*
 * done_service callback: undo what ip_vs_lblc_init_svc set up for @svc.
 * The steps run in a fixed order: stop the timer, flush the cache, then
 * release the table.
 */
static void ip_vs_lblc_done_svc(struct ip_vs_service *svc)
{
        struct ip_vs_lblc_table *table;

        table = svc->sched_data;

        /* Stop the periodic garbage-collection timer */
        del_timer_sync(&table->periodic_timer);

        /* Drop every cached entry still held for this service */
        ip_vs_lblc_flush(svc);

        /* Hand the table to kfree_rcu via its rcu_head member */
        kfree_rcu(table, rcu_head);
}

以上函数首先删除init_service函数中启动的定时器（periodic_timer），然后调用ip_vs_lblc_flush清空所有缓存表项，最后通过kfree_rcu释放ip_vs_lblc_table结构所占用的内存。

调度数据

在LBLC算法中，私有调度数据sched_data指向ip_vs_lblc_table类型的结构。

/*
 * Per-service LBLC state, stored in svc->sched_data.
 * Holds the hash table of cached destination-IP -> real-server entries
 * together with the bookkeeping used by the periodic expiry timer.
 */
struct ip_vs_lblc_table {
        struct rcu_head         rcu_head;       /* used by kfree_rcu() when the table is released */
        struct hlist_head       bucket[IP_VS_LBLC_TAB_SIZE];  /* hash buckets of ip_vs_lblc_entry */
        struct timer_list       periodic_timer; /* collect stale entries */
        struct ip_vs_service    *svc;           /* pointer back to service */
        atomic_t                entries;        /* number of entries */
        int                     max_size;       /* maximum size of entries */
        int                     rover;          /* bucket index where the last expiry scan stopped */
        int                     counter;        /* timer ticks since the last full expiration check */
        bool                    dead;           /* when set, the schedule path adds no new entries */
};

其中bucket链表数组的大小默认为1K（IP_VS_LBLC_TAB_SIZE）。可通过内核配置CONFIG_IP_VS_LBLC_TAB_BITS进行修改。

/*
 * Hash table sizing. The number of buckets is 2^IP_VS_LBLC_TAB_BITS;
 * the bit count falls back to 10 (1024 buckets) when the kernel
 * configuration does not define CONFIG_IP_VS_LBLC_TAB_BITS.
 */
#ifndef CONFIG_IP_VS_LBLC_TAB_BITS
#define CONFIG_IP_VS_LBLC_TAB_BITS      10
#endif
#define IP_VS_LBLC_TAB_BITS     CONFIG_IP_VS_LBLC_TAB_BITS
#define IP_VS_LBLC_TAB_SIZE     (1 << IP_VS_LBLC_TAB_BITS)

结构ip_vs_lblc_entry为链表中的表项结构。

/*
 * One cache entry: maps a destination IP address to the real server that
 * was last chosen for it. Entries are linked into a bucket of
 * ip_vs_lblc_table.
 */
struct ip_vs_lblc_entry {
        struct hlist_node       list;           /* linkage into the hash bucket */
        int                     af;             /* address family */
        union nf_inet_addr      addr;           /* destination IP address */
        struct ip_vs_dest       *dest;          /* real server (cache) */
        unsigned long           lastuse;        /* last used time, in jiffies */
        struct rcu_head         rcu_head;       /* NOTE(review): presumably for RCU-deferred freeing - confirm in ip_vs_lblc_del */
};

调度处理

LBLC调度函数的处理大致分成三个部分： a）在虚拟服务的调度数据（sched_data）中，根据报文的目的地址（daddr）查找缓存的表项（ip_vs_lblc_entry），如果表项中的真实目的服务器可用，其权重值大于零，并且没有超负荷，当前连接仍调度到此真实目的服务器上。

/*
 * schedule callback: choose a real server for the packet described by @iph.
 *
 * Step a) Look up the packet's destination address in the per-service
 * cache. On a hit the entry's last-use time is refreshed, and if the cached
 * server is available, has a positive weight and is not overloaded, it is
 * used directly (jump to "out").
 *
 * @skb is not referenced in the portion of the function shown here.
 */
static struct ip_vs_dest *ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, struct ip_vs_iphdr *iph)
{
    struct ip_vs_lblc_table *tbl = svc->sched_data;
    struct ip_vs_dest *dest = NULL;
    struct ip_vs_lblc_entry *en;

    /* First look in our cache */
    en = ip_vs_lblc_get(svc->af, tbl, &iph->daddr);
    if (en) {
        /* We only hold a read lock, but this is atomic */
        en->lastuse = jiffies;
        dest = en->dest;
        /* Reuse the cached server only if it is still a valid choice */
        if ((dest->flags & IP_VS_DEST_F_AVAILABLE) && atomic_read(&dest->weight) > 0 && !is_overloaded(dest, svc))
                goto out;
    }

b）如果在第一步中没有命中合适的真实服务器，此时使用LC算法进行选择，参见函数__ip_vs_lblc_schedule。

    /* Step b) No cache entry or it is invalid, time to schedule:
     * fall back to __ip_vs_lblc_schedule. If it returns no server, an
     * error is reported for the service and NULL is returned.
     */
    dest = __ip_vs_lblc_schedule(svc);
    if (!dest) {
        ip_vs_scheduler_err(svc, "no destination available");
        return NULL;
    }

c）在第三步中，将第二步选择的真实服务器缓存到调度数据结构ip_vs_lblc_table中。

    /* If we fail to create a cache entry, we'll just use the valid dest */
    spin_lock_bh(&svc->sched_lock);
    if (!tbl->dead)
        ip_vs_lblc_new(tbl, &iph->daddr, svc->af, dest);
    spin_unlock_bh(&svc->sched_lock);

    return dest;

先来看一下以上第二步中使用的最少连接选择函数__ip_vs_lblc_schedule。它按权重比较各真实服务器的连接开销，其算法与在WLC调度器中介绍的完全相同。具体可参见： https://blog.csdn.net/sinat_20184565/article/details/100567193 中的介绍。

/*
 * Pick the real server of @svc with the lowest connection overhead relative
 * to its weight.
 *
 * The first loop finds the first server that is not flagged overloaded and
 * has a positive weight; it becomes the initial candidate. The second loop
 * continues from that position and replaces the candidate whenever another
 * non-overloaded server has a strictly smaller overhead/weight ratio.
 *
 * Returns the chosen server, or NULL when no server qualifies.
 */
static inline struct ip_vs_dest *__ip_vs_lblc_schedule(struct ip_vs_service *svc)
{
    struct ip_vs_dest *dest, *least;
    int loh, doh;   /* overhead of the current candidate / of the server examined */

    list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
        if (dest->flags & IP_VS_DEST_F_OVERLOAD)
                continue;
        if (atomic_read(&dest->weight) > 0) {
            least = dest;
            loh = ip_vs_dest_conn_overhead(least);
            goto nextstage;
        }
    }
    return NULL;

    /* Find the destination with the least load.
     */
nextstage:
    list_for_each_entry_continue_rcu(dest, &svc->destinations, n_list) {
        if (dest->flags & IP_VS_DEST_F_OVERLOAD)
                continue;

        doh = ip_vs_dest_conn_overhead(dest);
        /* loh/least->weight > doh/dest->weight, cross-multiplied to avoid
         * division; the 64-bit cast widens the product.
         */
        if ((__s64)loh * atomic_read(&dest->weight) > (__s64)doh * atomic_read(&least->weight)) {
            least = dest;
            loh = doh;
        }
    }
    return least;
}

其次看一下目的服务器缓存的添加，如在以上的__ip_vs_lblc_schedule函数中找到了合适的真实服务器，使用函数ip_vs_lblc_new将其添加到LBLC调度算法的缓存链表中。在此函数中会分配一个ip_vs_lblc_entry表项结构，用来保存真实服务器的指针和当前连接的目的IP地址。注意表项结构的lastuse成员用来保存表项最后使用的时间戳，在超时处理中将用到此值。

/*
 * Cache @dest as the real server for destination address @daddr.
 *
 * If an entry for @daddr already points at @dest it is returned unchanged.
 * If it points at a different server, the old entry is deleted and a new
 * one is allocated, filled in and hashed into @tbl.
 *
 * Returns the entry now caching @daddr, or NULL if allocation failed.
 */
static inline struct ip_vs_lblc_entry *
ip_vs_lblc_new(struct ip_vs_lblc_table *tbl, const union nf_inet_addr *daddr, u16 af, struct ip_vs_dest *dest)
{
    struct ip_vs_lblc_entry *entry = ip_vs_lblc_get(af, tbl, daddr);

    if (entry != NULL) {
        /* Same server already cached: nothing to do */
        if (entry->dest == dest)
            return entry;
        /* Cached for another server: discard the old mapping */
        ip_vs_lblc_del(entry);
    }

    /* GFP_ATOMIC allocation */
    entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
    if (entry == NULL)
        return NULL;

    entry->lastuse = jiffies;
    entry->af = af;
    ip_vs_addr_copy(af, &entry->addr, daddr);

    /* The entry keeps its own reference on the real server */
    ip_vs_dest_hold(dest);
    entry->dest = dest;

    ip_vs_lblc_hash(tbl, entry);
    return entry;
}

最后，使用函数ip_vs_lblc_hash按照目的地址计算的hash值，在bucket中选择合适的链表，将新初始化的表项结构ip_vs_lblc_entry添加到链表中。同时，更新缓存表项数量值。

/*
 * Link @en into the bucket of @tbl selected by hashing its address, and
 * bump the table's entry counter.
 */
static void ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en)
{
        unsigned int idx;

        idx = ip_vs_lblc_hashkey(en->af, &en->addr);
        hlist_add_head_rcu(&en->list, &tbl->bucket[idx]);
        atomic_inc(&tbl->entries);
}

链表查找函数由ip_vs_lblc_get函数实现，根据目的IP地址计算hash值，据此找到合适的bucket，遍历链表通过比较IP地址，找到对应的表项结构ip_vs_lblc_entry。由于真实目的服务器缓存在表项结构中，也即找到了目的服务器。

static inline struct ip_vs_lblc_entry *ip_vs_lblc_get(int af, struct ip_vs_lblc_table *tbl, const union nf_inet_addr *addr)
{
    unsigned int hash = ip_vs_lblc_hashkey(af, addr);
    struct ip_vs_lblc_entry *en;

    hlist_for_each_entry_rcu(en, &tbl->bucket[hash], list)
        if (ip_vs_addr_equal(af, &en->addr, addr))
            return en;

    return NULL;
}

超时处理

如下超时处理函数，如果ip_vs_lblc_table结构中的counter为宏COUNT_FOR_FULL_EXPIRATION（30）的倍数，由于超时时长为60秒，意味着经过了30分钟，调用一次ip_vs_lblc_full_check，进行整个缓存的检查。

/*
 * Periodic timer handler for the LBLC table that owns timer @t.
 *
 * When the tick counter reaches a multiple of COUNT_FOR_FULL_EXPIRATION, a
 * full expiration check is run and the counter restarts at 1. Otherwise, if
 * the table holds no more than max_size entries, the tick is only counted.
 * In both cases control jumps to "out", where the timer is re-armed. Only an
 * over-full table falls through to the partial clean-up that follows.
 */
static void ip_vs_lblc_check_expire(struct timer_list *t)
{
    struct ip_vs_lblc_table *tbl = from_timer(tbl, t, periodic_timer);
    struct ip_vs_service *svc = tbl->svc;
    unsigned long now = jiffies;
    int goal;       /* number of entries the partial clean-up tries to remove */
    int i, j;       /* buckets visited so far / current bucket index */
    struct ip_vs_lblc_entry *en;
    struct hlist_node *next;

    if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) {
        /* do full expiration check */
        ip_vs_lblc_full_check(svc);
        tbl->counter = 1;
        goto out;
    }

    if (atomic_read(&tbl->entries) <= tbl->max_size) {
        tbl->counter++;
        goto out;
    }

如果缓存数量已经超过最大值（默认为16K），此值可通过内核配置进行修改（CONFIG_IP_VS_LBLC_TAB_BITS），执行以下的清理操作。清理数量（goal）的计算为两个部分：第一清理掉已经超出max_size的值；第二再清理掉超出值的三分之一，最多清理掉max_size值一半的缓存项。

    /* Removal target: the excess over max_size plus one third of that
     * excess (integer arithmetic), capped at half of max_size.
     */
    goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3;
    if (goal > tbl->max_size/2)
            goal = tbl->max_size/2;

在确定了清理数量之后遍历整个缓存链表，删除已经超时的缓存项，即超过ENTRY_TIMEOUT（6分钟）时长没有使用过的项。结构ip_vs_lblc_table的成员rover用来保存本次遍历结束时的位置，以便下次执行时由此结束位置开始。

    for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
        j = (j + 1) & IP_VS_LBLC_TAB_MASK;

        spin_lock(&svc->sched_lock);
        hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
            if (time_before(now, en->lastuse + ENTRY_TIMEOUT))
                    continue;

            ip_vs_lblc_del(en);
            atomic_dec(&tbl->entries);
            goal--;
        }
        spin_unlock(&svc->sched_lock);
        if (goal <= 0)
                break;
    }
    tbl->rover = j;
out:
    mod_timer(&tbl->periodic_timer, jiffies + CHECK_EXPIRE_INTERVAL);

全局清理函数ip_vs_lblc_full_check，每30分钟执行一次，用来清理超过一天（24小时）未使用的缓存项。此超时时间控制参见函数sysctl_lblc_expiration。

/*
 * Full expiration pass over the LBLC table of @svc.
 *
 * Visits every bucket once, starting just after tbl->rover, and deletes each
 * entry whose last use is at least sysctl_lblc_expiration(svc) in the past.
 * Each bucket is processed under svc->sched_lock. The final bucket index is
 * stored back into tbl->rover.
 */
static inline void ip_vs_lblc_full_check(struct ip_vs_service *svc)
{
    struct ip_vs_lblc_table *tbl = svc->sched_data;
    struct ip_vs_lblc_entry *en;
    struct hlist_node *next;
    unsigned long now = jiffies;
    int i, j;       /* buckets visited so far / current bucket index */

    for (i = 0, j = tbl->rover; i < IP_VS_LBLC_TAB_SIZE; i++) {
        j = (j + 1) & IP_VS_LBLC_TAB_MASK;

        spin_lock(&svc->sched_lock);
        hlist_for_each_entry_safe(en, next, &tbl->bucket[j], list) {
            /* Entry used recently enough: keep it */
            if (time_before(now, en->lastuse + sysctl_lblc_expiration(svc)))
                    continue;

            ip_vs_lblc_del(en);
            atomic_dec(&tbl->entries);
        }
        spin_unlock(&svc->sched_lock);
    }
    tbl->rover = j;
}

默认情况下此超时时间为DEFAULT_EXPIRATION（24小时），也可通过proc文件/proc/sys/net/ipv4/vs/lblc_expiration进行修改。

/*
 * Expiration time applied by the full check for @svc.
 * With CONFIG_SYSCTL the per-netns value svc->ipvs->sysctl_lblc_expiration
 * is returned; without it the compile-time DEFAULT_EXPIRATION is used.
 */
static int sysctl_lblc_expiration(struct ip_vs_service *svc)
{
        int expiration = DEFAULT_EXPIRATION;

#ifdef CONFIG_SYSCTL
        expiration = svc->ipvs->sysctl_lblc_expiration;
#endif
        return expiration;
}

内核版本 4.15