IPVS Source Code Analysis --- Implementation of Connection Backup

Much of this material is adapted from other sources, combined with my own understanding.

IPVS supports connection synchronization: two IPVS directors can run as MASTER and BACKUP respectively, with the MASTER process replicating connection state to the BACKUP machine, so that when the master dies the backup can take over seamlessly.
Alternatively, MASTER and BACKUP processes can run on each director at the same time, so that the directors back each other up and the load can be balanced between them.
The synchronization code lives in net/ipv4/ipvs/ip_vs_sync.c.
A sync message has the following format: a 4-byte header followed by zero or more IPVS connection sync entries, so the size of each message varies:
       0                   1                   2                   3
       0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |  Count Conns  |    SyncID     |            Size               |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (1)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                            .                                  |
      |                            .                                  |
      |                            .                                  |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
      |                                                               |
      |                    IPVS Sync Connection (n)                   |
      +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
Sync messages are sent over UDP to a multicast group (224.0.0.81, UDP port 8848). When the sync thread is started from user space (the synchronization itself is done in the kernel; user space only issues the command to start it), a syncid parameter is passed in. Only two machines configured with the same syncid act as a master/backup pair; that is, a backup director only records the connection messages multicast by a master whose syncid matches its own. When the sync thread starts, it calls join_mcast_group() to join the multicast group.
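For reference, here is join_mcast_group() from ip_vs_sync.c, shown slightly abridged (the exact device-lookup signature varies across kernel versions):

static int
join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname)
{
      struct ip_mreqn mreq;
      struct net_device *dev;
      int ret;

      memset(&mreq, 0, sizeof(mreq));
      memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr));

      // resolve the multicast interface named by the user (e.g. eth0)
      if ((dev = __dev_get_by_name(&init_net, ifname)) == NULL)
            return -ENODEV;
      mreq.imr_ifindex = dev->ifindex;

      // join the IPVS sync multicast group on that interface
      lock_sock(sk);
      ret = ip_mc_join_group(sk, &mreq);
      release_sock(sk);

      return ret;
}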

Data structures
#define SYNC_MESG_HEADER_LEN    4
The message header structure:
struct ip_vs_sync_mesg {
        __u8                    nr_conns; // number of connection entries
        __u8                    syncid;   // synchronization ID
        __u16                   size;     // total size of the message
        /* ip_vs_sync_conn entries start here */
};
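Note that size is the only header field that needs byte-order conversion: the sender flips it to network order with htons() just before transmission, and ip_vs_process_message() (below) flips it back with ntohs(). This is ip_vs_send_sync_msg() from the same file:

static void
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
      int msize;

      msize = msg->size;

      /* Put size in network byte order */
      msg->size = htons(msg->size);

      if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
            IP_VS_ERR("ip_vs_send_async error\n");
}

The addresses and ports inside each sync entry need no such conversion, because ip_vs_conn already stores them in network byte order; only flags and state are converted explicitly.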
The IPVS connection sync entry structure:
struct ip_vs_sync_conn {
        __u8                    reserved;
        /* Protocol, addresses and port numbers */
        __u8                    protocol;       /* Which protocol (TCP/UDP) */
        __u16                   cport;
        __u16                   vport;
        __u16                   dport;
        __u32                   caddr;          /* client address */
        __u32                   vaddr;          /* virtual address */
        __u32                   daddr;          /* destination address */
        /* Flags and state transition */
        __u16                   flags;          /* status flags */
        __u16                   state;          /* state info */
        /* The sequence options (TCP sequence/ack state) may start here */
};
The IPVS connection sync options structure, carrying the TCP sequence-number state seen in the incoming and outgoing directions:
struct ip_vs_sync_conn_options {
        struct ip_vs_seq        in_seq;         /* incoming seq. struct */
        struct ip_vs_seq        out_seq;        /* outgoing seq. struct */
};
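For reference, ip_vs_seq (defined in the IPVS headers) holds the sequence-number adjustment state, and two macros in ip_vs_sync.c give the two possible sizes of a sync entry, with and without the options:

struct ip_vs_seq {
        __u32                   init_seq;       /* Add delta from the original */
        __u32                   delta;          /* Delta in sequence numbers */
        __u32                   previous_delta; /* Delta in seq numbers before last resized pkt */
};

#define SIMPLE_CONN_SIZE  (sizeof(struct ip_vs_sync_conn))
#define FULL_CONN_SIZE  \
        (sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options))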

The sync buffer control block structure:
struct ip_vs_sync_buff {
        struct list_head        list; // links the buffer into the send queue
        unsigned long           firstuse;
        // the actual sync message
        /* pointers for the message data */
        struct ip_vs_sync_mesg  *mesg;
        unsigned char           *head;
        unsigned char           *end;
};
The ip_vs_sync_mesg lives inside an ip_vs_sync_buff: each buffer holds exactly one ip_vs_sync_mesg, but one mesg can hold multiple connection entries. The head and end pointers track how full the mesg is; the buffer's size is fixed at the moment the ip_vs_sync_buff is allocated. An ip_vs_sync_mesg is the unit transmitted in one send, and IPVS sizes it according to the interface MTU so that a mesg never needs to be fragmented (the Ethernet MTU is 1500 bytes).
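The sizing is done by set_sync_mesg_maxlen(), called from start_sync_thread() below. The master branch, slightly abridged (the backup branch analogously sets sync_recv_mesg_maxlen to the MTU minus the IP and UDP headers):

static void set_sync_mesg_maxlen(int sync_state)
{
      struct net_device *dev;
      int num;

      if (sync_state == IP_VS_STATE_MASTER) {
            if ((dev = __dev_get_by_name(&init_net, ip_vs_master_mcast_ifn)) == NULL)
                  return;

            // how many simple entries fit into one unfragmented datagram,
            // leaving 20 bytes of slack for IP options
            num = (dev->mtu - sizeof(struct iphdr) -
                   sizeof(struct udphdr) -
                   SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE;
            // nr_conns is a __u8, so at most 255 entries per message
            sync_send_mesg_maxlen = SYNC_MESG_HEADER_LEN +
                  SIMPLE_CONN_SIZE * min(num, MAX_CONNS_PER_SYNCBUFF);
      }
      /* ... the IP_VS_STATE_BACKUP branch sets sync_recv_mesg_maxlen ... */
}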
 
The IPVS sync thread is a kernel thread, started on demand by ipvsadm.

int start_sync_thread(int state, char *mcast_ifn, __u8 syncid)
{
	struct ip_vs_sync_thread_data *tinfo;
	struct task_struct **realtask, *task;
	struct socket *sock;
	char *name, *buf = NULL;
	int (*threadfn)(void *data);
	int result = -ENOMEM;

	IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
	IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
		  sizeof(struct ip_vs_sync_conn));

	if (state == IP_VS_STATE_MASTER) {
		/* only one master sync thread may run at a time */
		if (sync_master_thread)
			return -EEXIST;

		strlcpy(ip_vs_master_mcast_ifn, mcast_ifn,
			sizeof(ip_vs_master_mcast_ifn));
		ip_vs_master_syncid = syncid;
		realtask = &sync_master_thread;
		name = "ipvs_syncmaster";
		threadfn = sync_thread_master;
		sock = make_send_sock();	/* multicast sending socket */
	} else if (state == IP_VS_STATE_BACKUP) {
		/* only one backup sync thread may run at a time */
		if (sync_backup_thread)
			return -EEXIST;

		strlcpy(ip_vs_backup_mcast_ifn, mcast_ifn,
			sizeof(ip_vs_backup_mcast_ifn));
		ip_vs_backup_syncid = syncid;
		realtask = &sync_backup_thread;
		name = "ipvs_syncbackup";
		threadfn = sync_thread_backup;
		sock = make_receive_sock();	/* multicast receiving socket */
	} else {
		return -EINVAL;
	}

	if (IS_ERR(sock)) {
		result = PTR_ERR(sock);
		goto out;
	}

	/* size the sync message according to the interface MTU */
	set_sync_mesg_maxlen(state);
	if (state == IP_VS_STATE_BACKUP) {
		/* the backup side needs a receive buffer */
		buf = kmalloc(sync_recv_mesg_maxlen, GFP_KERNEL);
		if (!buf)
			goto outsocket;
	}

	tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
	if (!tinfo)
		goto outbuf;

	tinfo->sock = sock;
	tinfo->buf = buf;

	/* start the kernel thread running threadfn */
	task = kthread_run(threadfn, tinfo, name);
	if (IS_ERR(task)) {
		result = PTR_ERR(task);
		goto outtinfo;
	}

	/* mark as active */
	*realtask = task;
	ip_vs_sync_state |= state;

	/* increase the module use count */
	ip_vs_use_count_inc();

	return 0;

outtinfo:
	kfree(tinfo);
outbuf:
	kfree(buf);
outsocket:
	sock_release(sock);
out:
	return result;
}
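From user space, these threads are typically started with ipvsadm; for example (syncid 10 chosen arbitrarily here, see ipvsadm(8)):

      # on the master director
      ipvsadm --start-daemon master --mcast-interface eth0 --syncid 10

      # on the backup director: the same syncid, or it will ignore the messages
      ipvsadm --start-daemon backup --mcast-interface eth0 --syncid 10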

The MASTER loop: periodically sends out queued sync messages.
static void sync_master_loop(void)
{
      struct socket *sock;
      struct ip_vs_sync_buff *sb;

      /* create the sending multicast socket */
      sock = make_send_sock();  // sync messages go out through this socket
      if (!sock)
            return;
      for (;;) {  // main loop
           // drain the send queue
           while ((sb = sb_dequeue())) {
                 ip_vs_send_sync_msg(sock, sb->mesg);  // send the sync message
                 ip_vs_sync_buff_release(sb);          // free the buffer
           }
           /* check if entries stay in curr_sb for 2 seconds */
           // If the current buffer has not filled up within 2 seconds, send it
           // out even though it is incomplete. In the worst case it carries no
           // connection entries at all, only a header, which then serves as a
           // heartbeat telling the backup that the MASTER is still alive.
           if ((sb = get_curr_sync_buff(2*HZ))) {
                 ip_vs_send_sync_msg(sock, sb->mesg);
                 ip_vs_sync_buff_release(sb);
           }
           // break out of the loop if asked to stop the MASTER thread
           if (stop_master_sync)
                 break;
           // sleep for one second
           ssleep(1);
      }
      // on exit, release everything still sitting in the send queue
      while ((sb = sb_dequeue())) {
            ip_vs_sync_buff_release(sb);
      }

      // release the current buffer, i.e. the buffer still being filled
      // that has not yet been queued for sending
      if ((sb = get_curr_sync_buff(0))) {
            ip_vs_sync_buff_release(sb);
      }

      /* release the sending multicast socket */
      sock_release(sock);
}
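sb_dequeue() used above simply pops the oldest buffer off the global send queue under the sync lock:

static inline struct ip_vs_sync_buff *sb_dequeue(void)
{
      struct ip_vs_sync_buff *sb;

      spin_lock_bh(&ip_vs_sync_lock);
      if (list_empty(&ip_vs_sync_queue)) {
            sb = NULL;
      } else {
            // take the first buffer off the queue
            sb = list_entry(ip_vs_sync_queue.next,
                            struct ip_vs_sync_buff, list);
            list_del(&sb->list);
      }
      spin_unlock_bh(&ip_vs_sync_lock);

      return sb;
}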

The BACKUP loop: receives messages and calls ip_vs_process_message() to process them.
static void sync_backup_loop(void)
{
      struct socket *sock;
      char *buf;
      int len;

      // allocate the receive buffer
      if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) {
            IP_VS_ERR("sync_backup_loop: kmalloc error\n");
            return;
      }
      // create the UDP receiving socket and join it to the multicast group
      sock = make_receive_sock();
      if (!sock)
            goto out;
      for (;;) {
            // as long as the receive queue is not empty
            while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) {
                  // ip_vs_receive() is a thin wrapper around kernel_recvmsg()
                  if ((len = ip_vs_receive(sock, buf, sync_recv_mesg_maxlen)) <= 0) {
                        IP_VS_ERR("receiving message error\n");
                        break;
                  }
                  /* disable bottom half, because it accessed the data shared by softirq while getting/creating conns */
                  local_bh_disable();  // keep softirqs away while touching shared connection state
                  ip_vs_process_message(buf, len);  // process the received message
                  local_bh_enable();
            }
            // check whether the thread has been asked to stop
            if (stop_backup_sync)
                  break;
            ssleep(1);  // sleep for one second
      }
      sock_release(sock);
out:
      kfree(buf);
}
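ip_vs_receive() is indeed little more than a call into kernel_recvmsg(); shown here slightly abridged:

static int
ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
{
      struct msghdr msg = {NULL,};
      struct kvec iov;
      int len;

      /* Receive a packet */
      iov.iov_base = buffer;
      iov.iov_len  = (size_t)buflen;

      len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);

      if (len < 0)
            return -1;

      return len;
}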

The receive-processing function:
static void ip_vs_process_message(const char *buffer, const size_t buflen)
{
      struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer;
      struct ip_vs_sync_conn *s;
      struct ip_vs_sync_conn_options *opt;
      struct ip_vs_conn *cp;
      char *p;
      int i;

      m->size = ntohs(m->size);
      // sanity check: the received length must match the header
      if (buflen != m->size) {
            IP_VS_ERR("bogus message\n");
            return;
      }
      // check that the sync ID matches ours
      if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) {
            IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", m->syncid);
            return;
      }

      // the connection sync entries follow directly after the header;
      // p now points at the first entry
      p = (char *)buffer + sizeof(struct ip_vs_sync_mesg);

      for (i = 0; i < m->nr_conns; i++) {  // walk all entries in the buffer
            unsigned flags;
            s = (struct ip_vs_sync_conn *)p;
            flags = ntohs(s->flags);

            // look up the connection described by this entry
            if (!(flags & IP_VS_CONN_F_TEMPLATE))
                  cp = ip_vs_conn_in_get(s->protocol, s->caddr, s->cport, s->vaddr, s->vport);
            else
                  cp = ip_vs_ct_in_get(s->protocol, s->caddr, s->cport, s->vaddr, s->vport);
            if (!cp) {
                  // not found: the MASTER must have created this connection.
                  // Create it locally with dest == NULL, marking it as a
                  // synced connection rather than one the BACKUP created itself.
                  cp = ip_vs_conn_new(s->protocol, s->caddr, s->cport, s->vaddr, s->vport, s->daddr, s->dport, flags, NULL);
                  if (!cp) {
                        IP_VS_ERR("ip_vs_conn_new failed\n");
                        return;
                  }
                  cp->state = ntohs(s->state);  // set the connection state
             } else if (!cp->dest) {
                  // found, but with no dest pointer: the connection was created
                  // by synchronization, not by the BACKUP itself, so update it
                  cp->state = ntohs(s->state);
                  cp->flags = flags | IP_VS_CONN_F_HASHED;
             } /* Note that we don't touch its state and flags if it is a normal entry. */
             if (flags & IP_VS_CONN_F_SEQ_MASK) {
                  // copy the connection options (TCP sequence numbers)
                  opt = (struct ip_vs_sync_conn_options *)&s[1];
                  memcpy(&cp->in_seq, opt, sizeof(*opt));
                  p += FULL_CONN_SIZE;
             } else
                  p += SIMPLE_CONN_SIZE;
             // reset the packet counter to a fixed value
             atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]);
             cp->timeout = IP_VS_SYNC_CONN_TIMEOUT;
             ip_vs_conn_put(cp);

             if (p > buffer + buflen) {  // make sure we have not run past the buffer
                  IP_VS_ERR("bogus message\n");
                  return;
             }
      }
}

The connection sync function ip_vs_sync_conn() is called from ip_vs_in(). It builds an ip_vs_sync_conn entry from the current cp and appends it to the current ip_vs_sync_mesg, as sketched below.
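A simplified sketch of the call site in ip_vs_in() (the exact condition varies across kernel versions, and also forces a sync on TCP state changes such as FIN_WAIT/CLOSE):

      // Simplified: a master director re-syncs a connection once every
      // sysctl_ip_vs_sync_threshold[1] packets, starting from packet
      // number sysctl_ip_vs_sync_threshold[0].
      atomic_inc(&cp->in_pkts);
      if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
          (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
           == sysctl_ip_vs_sync_threshold[0]))
            ip_vs_sync_conn(cp);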

void ip_vs_sync_conn(struct ip_vs_conn *cp)
{
      struct ip_vs_sync_mesg *m;
      struct ip_vs_sync_conn *s;
      int len;

      spin_lock(&curr_sb_lock);
      if (!curr_sb) {
            // no current buffer: allocate a fresh one
            if (!(curr_sb = ip_vs_sync_buff_create())) {
                  spin_unlock(&curr_sb_lock);
                  IP_VS_ERR("ip_vs_sync_buff_create failed.\n");
                  return;
            }
      }
      // does this entry carry the options (TCP sequence numbers)?
      len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE : SIMPLE_CONN_SIZE;
      m = curr_sb->mesg;
      // the free space at head becomes the next sync entry
      s = (struct ip_vs_sync_conn *)curr_sb->head;

      /* copy members */
      s->protocol = cp->protocol;
      s->cport = cp->cport;
      s->vport = cp->vport;
      s->dport = cp->dport;
      s->caddr = cp->caddr;
      s->vaddr = cp->vaddr;
      s->daddr = cp->daddr;
      s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED);
      s->state = htons(cp->state);

      if (cp->flags & IP_VS_CONN_F_SEQ_MASK) {  // append the options, i.e. the TCP sequence numbers
            struct ip_vs_sync_conn_options *opt = (struct ip_vs_sync_conn_options *)&s[1];
            memcpy(opt, &cp->in_seq, sizeof(*opt));
      }

      m->nr_conns++;
      m->size += len;        // grow the payload length
      curr_sb->head += len;  // advance the free-space pointer

      // check whether the remaining space can still hold a full entry
      if (curr_sb->head + FULL_CONN_SIZE > curr_sb->end) {
            sb_queue_tail(curr_sb);  // if not, queue the buffer for sending
            curr_sb = NULL;
      }
      spin_unlock(&curr_sb_lock);

      // if there is a controlling (master) connection,
      // recurse to synchronize it as well
      if (cp->control)
             ip_vs_sync_conn(cp->control);
}
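Finally, ip_vs_sync_buff_create() shows how mesg, head and end are initialized: the message is allocated at exactly sync_send_mesg_maxlen bytes, head starts right after the 4-byte header, and end marks the hard limit:

static inline struct ip_vs_sync_buff *ip_vs_sync_buff_create(void)
{
      struct ip_vs_sync_buff *sb;

      if (!(sb = kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC)))
            return NULL;

      // the message itself is sized to fit one unfragmented datagram
      if (!(sb->mesg = kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) {
            kfree(sb);
            return NULL;
      }
      sb->mesg->nr_conns = 0;
      sb->mesg->syncid = ip_vs_master_syncid;
      sb->mesg->size = SYNC_MESG_HEADER_LEN;
      sb->head = (unsigned char *)sb->mesg + SYNC_MESG_HEADER_LEN;
      sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen;
      sb->firstuse = jiffies;
      return sb;
}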
