-
arp_init
- 初始化neigh_table:每个lcore各有一张邻居表,逐个初始化其中的hash桶
- 注册ARP协议的pkt_type,用于处理接收的arp数据包
- 注册loop任务neigh_process_ring,arp数据包会通过ring广播到所有的lcore,取出其中的arp数据包处理
static int arp_init(void) { int i, j; int err; //初始化neigh_table哈希表,每个lcore维护自己本地的arp缓存表 for (i = 0; i < DPVS_MAX_LCORE; i++) { for (j = 0; j < NEIGH_TAB_SIZE; j++) { INIT_LIST_HEAD(&neigh_table[i][j]); } } master_cid = rte_lcore_id(); arp_pkt_type.type = rte_cpu_to_be_16(ETHER_TYPE_ARP); //注册pkt_type,主要注册arp类型数据处理函数neigh_resolve_input if ((err = netif_register_pkt(&arp_pkt_type)) != EDPVS_OK) return err; //注册ctrl信息回调 if ((err = sockopt_register(&neigh_sockopts)) != EDPVS_OK) return err; //创建neigh_ring,arp信息需要广播至每个lcore neigh_ring_init(); //注册 loop 任务,其中slave lcore注册为LCORE_JOB_SLOW类型任务,间隔100次loop执行 snprintf(neigh_jobs[0].name, sizeof(neigh_jobs[0].name) - 1, "%s", "neigh_sync"); neigh_jobs[0].func = neigh_process_ring; neigh_jobs[0].data = NULL; neigh_jobs[0].type = LCORE_JOB_SLOW; neigh_jobs[0].skip_loops = NEIGH_PROCESS_MAC_RING_INTERVAL; if ((err = dpvs_lcore_job_register(&neigh_jobs[0], LCORE_ROLE_FWD_WORKER)) != EDPVS_OK) return err; snprintf(neigh_jobs[1].name, sizeof(neigh_jobs[1].name) - 1, "%s", "neigh_sync"); neigh_jobs[1].func = neigh_process_ring; neigh_jobs[1].data = NULL; neigh_jobs[1].type = LCORE_JOB_LOOP; if ((err = dpvs_lcore_job_register(&neigh_jobs[1], LCORE_ROLE_MASTER)) != EDPVS_OK) return err; return EDPVS_OK; }
-
看下netif_deliver_mbuf中对ARP包的特殊处理
- 如果是arp包,并且不是从其他lcore中投递过来的数据包,则进入处理逻辑,此处防止类似arp风暴,重复在dpvs中循环投递
- 如果是对ARP的回复数据报,投递到其他lcore中,因为dpvs中每个lcore独立维护自己的邻居子系统状态,但是arp回复包不一定在发送arp请求的lcore上接收到
- 接着接收到arp的lcore上也继续往下处理,会查找pkt_table,找到arp消息处理函数
/*
 * netif_deliver_mbuf (excerpt) - the "..." markers stand for code that this
 * excerpt leaves out; only the ARP special case is shown.
 *
 * For an ARP frame that did not arrive through a ring (pkts_from_ring is
 * false), an ARP reply is cloned and enqueued onto the arp_ring of every
 * other forwarding lcore.
 */
static inline int netif_deliver_mbuf(struct rte_mbuf *mbuf, uint16_t eth_type, struct netif_port *dev, struct netif_queue_conf *qconf, bool forward2kni, lcoreid_t cid, bool pkts_from_ring)
{
    ...
    if (pt->type == rte_cpu_to_be_16(ETHER_TYPE_ARP) && !pkts_from_ring) {
        struct rte_mempool *mbuf_pool;
        struct rte_mbuf * mbuf_clone;
        uint8_t i;
        struct arp_hdr * arp;
        unsigned socket_id;

        socket_id = rte_socket_id();
        mbuf_pool = pktmbuf_pool[socket_id];

        /* step over the Ethernet header to locate the ARP header, then restore it */
        rte_pktmbuf_adj(mbuf, sizeof(struct ether_hdr));
        arp = rte_pktmbuf_mtod(mbuf, struct arp_hdr *);
        rte_pktmbuf_prepend(mbuf, (uint16_t)sizeof(struct ether_hdr));

        /* only ARP replies are cloned and fanned out to the other lcores */
        if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REPLY) {
            for (i = 0; i < DPVS_MAX_LCORE; i++) {
                /* skip the current lcore, non-forwarding lcores and the master */
                if ((i == cid) || (!is_lcore_id_fwd(i)) || (i == rte_get_master_lcore())) {
                    continue;
                }
                /*rte_pktmbuf_clone will not clone pkt.data, just copy pointer!*/
                mbuf_clone = rte_pktmbuf_clone(mbuf, mbuf_pool);
                if (mbuf_clone) {
                    int ret = rte_ring_enqueue(arp_ring[i], mbuf_clone);
                    /*
                     * NOTE(review): the clone is not freed on -EDQUOT, presumably
                     * because the object was still enqueued - confirm against the
                     * DPDK ring documentation for the version in use.
                     * NOTE(review): "\\n" in the messages below emits a literal
                     * backslash-n rather than a newline.
                     */
                    if (unlikely(-EDQUOT == ret)) {
                        RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d quota exceeded\\n", __func__, i);
                    } else if (ret < 0) {
                        RTE_LOG(WARNING, NETIF, "%s: arp ring of lcore %d enqueue failed\\n", __func__, i);
                        rte_pktmbuf_free(mbuf_clone);
                    }
                }
            }
        }
    }
    ...
}
-
ARP协议pkt_type
/* Packet-type descriptor that hands received ARP frames to neigh_resolve_input(). */
static struct pkt_type arp_pkt_type = {
    .port = NULL,
    .func = neigh_resolve_input,
    .type = rte_cpu_to_be_16(ETHER_TYPE_ARP),
};
-
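ARP数据包(以下是新版DPDK中的rte_arp_hdr定义;本文的dpvs代码使用旧版命名struct arp_hdr,其字段arp_hrd/arp_pro/arp_hln/arp_pln/arp_op分别对应这里的arp_hardware/arp_protocol/arp_hlen/arp_plen/arp_opcode)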
/**
 * ARP header IPv4 payload.
 *
 * Packed, 2-byte aligned overlay for the address part of an ARP message
 * carrying Ethernet hardware addresses and IPv4 protocol addresses.
 */
struct rte_arp_ipv4 {
    struct rte_ether_addr arp_sha; /**< sender hardware address */
    uint32_t arp_sip; /**< sender IP address */
    struct rte_ether_addr arp_tha; /**< target hardware address */
    uint32_t arp_tip; /**< target IP address */
} __attribute__((__packed__)) __attribute__((aligned(2)));

/**
 * ARP header.
 *
 * Fixed ARP fields followed by the IPv4 payload above. The RTE_ARP_OP_*
 * constants are the values carried in arp_opcode.
 */
struct rte_arp_hdr {
    uint16_t arp_hardware; /* format of hardware address */
#define RTE_ARP_HRD_ETHER 1 /* ARP Ethernet address format */
    uint16_t arp_protocol; /* format of protocol address */
    uint8_t arp_hlen; /* length of hardware address */
    uint8_t arp_plen; /* length of protocol address */
    uint16_t arp_opcode; /* ARP opcode (command) */
#define RTE_ARP_OP_REQUEST 1 /* request to resolve address */
#define RTE_ARP_OP_REPLY 2 /* response to previous request */
#define RTE_ARP_OP_REVREQUEST 3 /* request proto addr given hardware */
#define RTE_ARP_OP_REVREPLY 4 /* response giving protocol address */
#define RTE_ARP_OP_INVREQUEST 8 /* request to identify peer */
#define RTE_ARP_OP_INVREPLY 9 /* response identifying peer */
    struct rte_arp_ipv4 arp_data;
} __attribute__((__packed__)) __attribute__((aligned(2)));
-
neigh_resolve_input
int neigh_resolve_input(struct rte_mbuf *m, struct netif_port *port) { //arp指向接收包的arp首部 struct arp_hdr *arp = rte_pktmbuf_mtod(m, struct arp_hdr *); struct ether_hdr *eth; uint32_t ipaddr; struct neighbour_entry *neighbour = NULL; unsigned int hashkey; struct inet_ifaddr *ifa; //根据ARP数据包中的arp_tip获取对应的IP信息配置块 ifa = inet_addr_ifa_get(AF_INET, port, (union inet_addr*)&arp->arp_data.arp_tip); if (!ifa) return EDPVS_KNICONTINUE; inet_addr_ifa_put(ifa); //eth指向L2层首部 eth = (struct ether_hdr *)rte_pktmbuf_prepend(m, (uint16_t)sizeof(struct ether_hdr)); //判断ARP请求类型,如果是ARP_OP_REQUEST,生成ARP应答包(此处复用接收到的ARP请求包),调用netif_xmit发送出去 if (rte_be_to_cpu_16(arp->arp_op) == ARP_OP_REQUEST) { //填充回复包的L2层地址 ether_addr_copy(ð->s_addr, ð->d_addr); rte_memcpy(ð->s_addr, &port->addr, 6); //arp包中的操作类型变为ARP_OP_REPLY arp->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY); ether_addr_copy(&arp->arp_data.arp_sha, &arp->arp_data.arp_tha); ether_addr_copy(ð->s_addr, &arp->arp_data.arp_sha); ipaddr = arp->arp_data.arp_sip; arp->arp_data.arp_sip = arp->arp_data.arp_tip; arp->arp_data.arp_tip = ipaddr; m->l2_len = sizeof(struct ether_hdr); m->l3_len = sizeof(struct arp_hdr); netif_xmit(m, port); return EDPVS_OK; } else if (arp->arp_op == htons(ARP_OP_REPLY)) { ipaddr = arp->arp_data.arp_sip; //如果数据包是ARP_OP_REPLY,根据源ip和网卡生成hashkey hashkey = neigh_hashkey(AF_INET, (union inet_addr *)&ipaddr, port); //查询neighbour条目 neighbour = neigh_lookup_entry(AF_INET, (union inet_addr *)&ipaddr, port, hashkey); //如果查找到邻居项缓存,并且不是STATIC类型,则首先更新缓存项中的邻居mac地址 if (neighbour && !(neighbour->flag & NEIGHBOUR_STATIC)) { neigh_edit(neighbour, &arp->arp_data.arp_sha); } else { //否则,创建新的邻居缓存项并加入邻居项hash表中 neighbour = neigh_add_table(AF_INET, (union inet_addr *)&ipaddr, &arp->arp_data.arp_sha, port, hashkey, 0); if (!neighbour) { RTE_LOG(ERR, NEIGHBOUR, "%s: add neighbour wrong\\n", __func__); rte_pktmbuf_free(m); return EDPVS_NOMEM; } } //更新邻居项状态 neigh_entry_state_trans(neighbour, 1); //将缓存在邻居项等待队列中的数据报发送出去 neigh_send_mbuf_cach(neighbour); 
return EDPVS_KNICONTINUE; } else { //其他操作类型,dpvs不处理 rte_pktmbuf_free(m); return EDPVS_DROP; } }
-
邻居项定义
/* One neighbour (ARP / ND) cache entry. */
struct neighbour_entry {
    /* address family */
    int af;
    struct list_head neigh_list;
    /* neighbour IP address */
    union inet_addr ip_addr;
    /* neighbour MAC address */
    struct ether_addr eth_addr;
    /* egress port */
    struct netif_port *port;
    /* timer that drives the neighbour state machine */
    struct dpvs_timer timer;
    /* queue of neighbour_mbuf_entry items (each holding an mbuf pointer) waiting to be sent via this neighbour */
    struct list_head queue_list;
    uint32_t que_num;
    /* neighbour state */
    uint32_t state;
    uint32_t ts;
    uint8_t flag;
} __rte_cache_aligned;
-
邻居项状态迁移
- 迁移表的组织方式类似TCP状态机的迁移表:每一行对应一种事件(发送arp、收到arp、确认、mbuf引用、超时),每一列对应邻居项的当前状态,表项是迁移后的状态
/* Neighbour (NUD) states. */
enum {
    DPVS_NUD_S_NONE = 0,
    DPVS_NUD_S_SEND,
    DPVS_NUD_S_REACHABLE,
    DPVS_NUD_S_PROBE,
    DPVS_NUD_S_DELAY,
    DPVS_NUD_S_MAX /* Reserved */
};

/*
 * DPVS_NUD_S_KEEP is not a real state: a table cell holding it means "stay in
 * the current state". It reuses the reserved value so it can never collide
 * with a real state. Guarded in case it is already defined elsewhere.
 */
#ifndef DPVS_NUD_S_KEEP
#define DPVS_NUD_S_KEEP DPVS_NUD_S_MAX
#endif

/* short aliases used to keep the transition table readable */
#define sNNO DPVS_NUD_S_NONE
#define sNSD DPVS_NUD_S_SEND
#define sNRE DPVS_NUD_S_REACHABLE
#define sNPR DPVS_NUD_S_PROBE
#define sNDE DPVS_NUD_S_DELAY
#ifndef sNKP
#define sNKP DPVS_NUD_S_KEEP
#endif

/* One row of the transition table: next state indexed by current state. */
struct nud_state {
    int next_state[DPVS_NUD_S_MAX];
};

/*
 * Transition table: row = event index, column = current state.
 * Row indices: 0 send arp, 1 recv arp, 2 ack confirm, 3 mbuf ref, 4 timeout.
 */
static struct nud_state nud_states[] = {
/*                  sNNO, sNSD, sNRE, sNPR, sNDE */
/* send arp    */ {{sNSD, sNSD, sNKP, sNDE, sNDE}},
/* recv arp    */ {{sNRE, sNRE, sNRE, sNRE, sNRE}},
/* ack confirm */ {{sNKP, sNKP, sNRE, sNRE, sNRE}},
/* mbuf ref    */ {{sNKP, sNKP, sNKP, sNPR, sNKP}},
/* timeout     */ {{sNNO, sNNO, sNPR, sNNO, sNNO}},
};
-
邻居项不同状态超时时间
#define DPVS_NEIGH_TIMEOUT_DEF 60 //单位为seconds static int nud_timeouts[DPVS_NUD_S_MAX] = { [DPVS_NUD_S_NONE] = 2, [DPVS_NUD_S_SEND] = 3, [DPVS_NUD_S_REACHABLE] = DPVS_NEIGH_TIMEOUT_DEF, [DPVS_NUD_S_PROBE] = 30, [DPVS_NUD_S_DELAY] = 3, };
-
状态迁移处理
void neigh_entry_state_trans(struct neighbour_entry *neighbour, int idx) { struct timeval timeout; /* DPVS_NUD_S_KEEP is not a real state, just use it to keep original state */ //如果状态迁移后保持原有状态或者邻居项的状态为STATIC(一般是系统管理员配置的),则不作任何处理 if ((nud_states[idx].next_state[neighbour->state] != DPVS_NUD_S_KEEP) && !(neighbour->flag & NEIGHBOUR_STATIC)) { //首先获取原有状态 int old_state = neighbour->state; struct timespec now = { 0 }; //设置邻居项的新状态 neighbour->state = nud_states[idx].next_state[neighbour->state]; if (neighbour->state == old_state) { if (likely(clock_gettime(CLOCK_REALTIME_COARSE, &now)) == 0) /* frequent timer updates hurt performance, * do not update timer unless half timeout passed */ if ((now.tv_sec - neighbour->ts) * 2 < nud_timeouts[old_state]) return; } //重新获取邻居项的超时时间,更新超时定时器 timeout.tv_sec = nud_timeouts[neighbour->state]; timeout.tv_usec = 0; dpvs_time_rand_delay(&timeout, 200000); /* delay 200ms randomly to avoid timer performance problem */ dpvs_timer_update_nolock(&neighbour->timer, &timeout, false); neighbour->ts = now.tv_sec; #ifdef CONFIG_DPVS_NEIGH_DEBUG if (neighbour->state != old_state) { char buf[512]; dump_neigh_entry(neighbour, buf, sizeof(buf)); RTE_LOG(INFO, NEIGHBOUR, "[%02d] neighbor (%s) trans state: %s -> %s, idx:%d.\\n", rte_lcore_id(), buf, nud_state_name(old_state), nud_state_name(neighbour->state), idx); } #endif } }
-
创建新的邻居项
-
neigh_add_table
struct neighbour_entry *neigh_add_table(int af, const union inet_addr *ipaddr, const struct ether_addr *eth_addr, struct netif_port *port, unsigned int hashkey, int flag) { struct neighbour_entry *new_neighbour=NULL; struct timeval delay; lcoreid_t cid = rte_lcore_id(); //创建新的neighbour_entry arp条目,从缓存池中创建 new_neighbour = dpvs_mempool_get(neigh_mempool, sizeof(struct neighbour_entry)); if (unlikely(new_neighbour == NULL)) return NULL; //邻居项赋值 rte_memcpy(&new_neighbour->ip_addr, ipaddr, sizeof(union inet_addr)); new_neighbour->flag = flag; new_neighbour->af = af; //eth_addr为空时,标识是新建项,新建时邻居项状态为DPVS_NUD_S_NONE;否则是接收到邻居项的回复(但是没有查找到邻居项)时新建 if (eth_addr) { rte_memcpy(&new_neighbour->eth_addr, eth_addr, 6); new_neighbour->state = DPVS_NUD_S_REACHABLE; } else { new_neighbour->state = DPVS_NUD_S_NONE; } new_neighbour->port = port; new_neighbour->que_num = 0; //根据邻居项的状态,确定定时器时间 delay.tv_sec = nud_timeouts[new_neighbour->state]; delay.tv_usec = 0; INIT_LIST_HEAD(&new_neighbour->queue_list); //加到定时器,如果处于 DPVS_NUD_S_NONE 状态,neighbour_timer_event 会将条目删除 if (!(new_neighbour->flag & NEIGHBOUR_STATIC)) { dpvs_time_rand_delay(&delay, 200000); /* delay 200ms randomly to avoid timer performance problem */ dpvs_timer_sched(&new_neighbour->timer, &delay, neighbour_timer_event, new_neighbour, false); } //将arp条目添加到arp表中 neigh_hash(new_neighbour, hashkey); neigh_nums[cid]++; #ifdef CONFIG_DPVS_NEIGH_DEBUG { char buf[512]; dump_neigh_entry(new_neighbour, buf, sizeof(buf)); RTE_LOG(INFO, NEIGHBOUR, "[%02d] add neigh entry: %s\\n", cid, buf); } #endif return new_neighbour; }
-
邻居项超时处理
- 在新建邻居项时设置处理函数为neighbour_timer_event
static int neighbour_timer_event(void *data) { struct neighbour_entry *neighbour = data; //如果处于DPVS_NUD_S_NONE状态时超时,则需要清理邻居项 if (neighbour->state == DPVS_NUD_S_NONE) { return neigh_entry_expire(neighbour); } //更新邻居项状态 neigh_entry_state_trans(neighbour, 4); return DTIMER_OK; }
-
邻居项清理
static int neigh_entry_expire(struct neighbour_entry *neighbour) { struct neighbour_mbuf_entry *mbuf, *mbuf_next; lcoreid_t cid = rte_lcore_id(); assert(cid != master_cid); //首先取消定时器操作 dpvs_timer_cancel_nolock(&neighbour->timer, false); //将邻居项从hash表中解除 neigh_unhash(neighbour); #ifdef CONFIG_DPVS_NEIGH_DEBUG { char buf[512]; dump_neigh_entry(neighbour, buf, sizeof(buf)); RTE_LOG(INFO, NEIGHBOUR, "%s:[%02d] del neigh entry: %s\\n", __func__, cid, buf); } #endif /* release pkts saved in neighbour entry */ //释放缓存发送队列上的数据包 list_for_each_entry_safe(mbuf, mbuf_next, &neighbour->queue_list, neigh_mbuf_list) { list_del(&mbuf->neigh_mbuf_list); rte_pktmbuf_free(mbuf->m); dpvs_mempool_put(neigh_mempool, mbuf); } //释放邻居项资源 dpvs_mempool_put(neigh_mempool, neighbour); neigh_nums[cid]--; return DTIMER_STOP; }
-
邻居项确认
- 传输层收到数据包时,对nexthop邻居项的确认
/*
 * neigh_confirm - deliver the "ack confirm" event (index 2) to the neighbour
 * entries for a next hop.
 *
 * @af:      address family of @nexthop.
 * @nexthop: next-hop address to confirm.
 * @port:    port the next hop is reached through.
 *
 * Walks the matching bucket of the calling lcore's neighbour table and
 * advances every non-static entry that matches (@af, @nexthop, @port).
 */
void neigh_confirm(int af, union inet_addr *nexthop, struct netif_port *port)
{
    struct neighbour_entry *entry;
    lcoreid_t cid = rte_lcore_id();
    /* find the next hop to confirm, whether or not it is the inbound route */
    unsigned int hashkey = neigh_hashkey(af, nexthop, port);

    list_for_each_entry(entry, &neigh_table[cid][hashkey], neigh_list) {
        if (!neigh_key_cmp(af, entry, nexthop, port))
            continue;
        if (entry->flag & NEIGHBOUR_STATIC)
            continue;
        neigh_entry_state_trans(entry, 2);
    }
}
-
arp请求
static void neigh_state_confirm(struct neighbour_entry *neighbour) { union inet_addr saddr, daddr; memset(&saddr, 0, sizeof(saddr)); if (neighbour->af == AF_INET) { daddr.in.s_addr = neighbour->ip_addr.in.s_addr; //选择出口saddr inet_addr_select(AF_INET, neighbour->port, &daddr, 0, &saddr); if (!saddr.in.s_addr) RTE_LOG(ERR, NEIGHBOUR, "%s: no source ip\\n", __func__); //发送ARP请求 if (neigh_send_arp(neighbour->port, saddr.in.s_addr, daddr.in.s_addr) != EDPVS_OK) RTE_LOG(ERR, NEIGHBOUR, "%s: send arp failed\\n", __func__); } else if (neighbour->af == AF_INET6) { ipv6_addr_copy(&daddr.in6, &neighbour->ip_addr.in6); inet_addr_select(AF_INET6, neighbour->port, &daddr, 0, &saddr); if (ipv6_addr_any(&saddr.in6)) RTE_LOG(ERR, NEIGHBOUR, "%s: no source ip\\n", __func__); ndisc_solicit(neighbour, &saddr.in6); } } //构造ARP请求包,发送ARP请求 static int neigh_send_arp(struct netif_port *port, uint32_t src_ip, uint32_t dst_ip) { struct rte_mbuf *m; struct ether_hdr *eth; struct arp_hdr *arp; uint32_t addr; m = rte_pktmbuf_alloc(port->mbuf_pool); if (unlikely(m == NULL)) { return EDPVS_NOMEM; } m->userdata = NULL; eth = rte_pktmbuf_mtod(m, struct ether_hdr *); arp = (struct arp_hdr *)ð[1]; memset(ð->d_addr, 0xFF, 6); ether_addr_copy(&port->addr, ð->s_addr); eth->ether_type = htons(ETHER_TYPE_ARP); memset(arp, 0, sizeof(struct arp_hdr)); rte_memcpy(&arp->arp_data.arp_sha, &port->addr, 6); addr = src_ip; inetAddrCopy(&arp->arp_data.arp_sip, &addr); memset(&arp->arp_data.arp_tha, 0, 6); addr = dst_ip; inetAddrCopy(&arp->arp_data.arp_tip, &addr); arp->arp_hrd = htons(ARP_HRD_ETHER); arp->arp_pro = htons(ETHER_TYPE_IPv4); arp->arp_hln = 6; arp->arp_pln = 4; arp->arp_op = htons(ARP_OP_REQUEST); m->pkt_len = 60; m->data_len = 60; m->l2_len = sizeof(struct ether_hdr); m->l3_len = sizeof(struct arp_hdr); memset(&arp[1], 0, 18); #ifdef CONFIG_DPVS_NEIGH_DEBUG dump_arp_hdr("send", arp, port->id); #endif netif_xmit(m, port); return EDPVS_OK; }
-
dpvs中ARP协议
最新推荐文章于 2022-03-17 16:29:25 发布