NAT模式限制
-
NAT为双臂模式
-
拓扑实例
- 如上图所示,各节点的IP地址如下:
- client: 192.168.0.46
- VIP: 192.168.0.89
- DPVS local ip: 192.168.0.66, 10.140.31.48
- RS1: 10.140.18.33
- RS2: 10.140.18.34
- 可见,client与VIP位于外部网络192.168.0.0/24,而DPVS的内部local ip与两台RS位于内部网络10.140.16.0/20,这正是典型的双臂NAT组网:两侧流量都必须经过DPVS
-
NAT模式约束
- DPVS-NAT模式只能在单个lcore中工作。由于以下几点原因,dpvs很难支持多lcore NAT转发模式:
- DPVS会话条目通过RSS在lcore上进行拆分和分发
- NAT转发要求正、反向流量都通过DPVS
- NAT转发只转换dest IP/端口,不更改源IP/端口
- NIC的fdir规则设置有限
- 因此,如果没有对流量的其他控制,则出站数据包可能到达与入站数据包不同的lcore。如果是,出站数据包将被丢弃,因为会话查找未命中
- FNAT通过使用Flow Director(FDIR)解决了这个问题。但是,对于NIC,可以添加的规则非常有限,例如,对于Intel X540,大约可以添加8k条。与FNAT不同,NAT没有(本地ip,端口),因此只能在(源ip,端口)上设置FDIR规则,这意味着只支持数千个并发。因此,FDIR不适用于NAT
- 注意:从v1.7.2开始,就为多lcore NAT模式转发提供了解决方案。其原理是通过全局重定向表和一些无锁环将出站数据包重定向到其会话项所在的正确的lcore。当然,这在一定程度上会损害性能。如果要使用它,请在/etc/dpvs.conf中打开配置开关“ipvs_defs/conn/redirect”
- 综上,在v1.7.2引入重定向方案之前,DPVS-NAT模式只能在单个lcore中工作,难以支持多lcore NAT转发
NAT模式原理
-
对于inbound方向的流量,实际上做的是dnat,将目标ip由lb ip转换成真正的rs ip,此时后端rs是能拿到client ip的。outbound方向的流量做snat,将源地址换成lb ip
-
三层处理ipv4_rcv
- 由于nat不做syn_proxy,所以直接看dp_vs_in
-
dp_vs_conn_bind_dest
static int dp_vs_conn_bind_dest(struct dp_vs_conn *conn, struct dp_vs_dest *dest) { ... switch (dest->fwdmode) { case DPVS_FWD_MODE_NAT: conn->packet_xmit = dp_vs_xmit_nat; conn->packet_out_xmit = dp_vs_out_xmit_nat; break; ... } conn->dest = dest; return(EDPVS_OK); }
inbound方向处理
-
dp_vs_xmit_nat
/** * 包裹函数,根据协议族执行不同的内部函数 * 参数说明: * proto: 传输层dp_vs_proto结构,对于tcp协议为tcp_proto * conn: 对应的连接 * mbuf: inbound接收的mbuf */ int dp_vs_xmit_nat(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { int af = conn->af; assert(af == AF_INET || af == AF_INET6); //根据协议族执行内部函数 return(af == AF_INET ? __dp_vs_xmit_nat4(proto, conn, mbuf) : __dp_vs_xmit_nat6(proto, conn, mbuf)); }
-
__dp_vs_xmit_nat4
/** * nat模式下,inbound流量处理,主要做dnat,将目的ip由DPVS转换成真正的RS ip,此时后端RS是能够拿到正确的client ip的 * 参数说明: * proto: 传输层dp_vs_proto结构,对于tcp协议为tcp_proto * conn: 对应的连接 * mbuf: inbound接收的mbuf */ static int __dp_vs_xmit_nat4(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct flow4 fl4; struct ipv4_hdr * iph = ip4_hdr(mbuf); struct route_entry *rt; int err, mtu; if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) { dp_vs_save_xmit_info(mbuf, proto, conn); if (!dp_vs_fast_xmit_nat(proto, conn, mbuf)) { return(EDPVS_OK); } } /* * drop old route. just for safe, because * NAT is PREROUTING, should not have route. */ //释放old路由缓存信息 if (unlikely(mbuf->userdata != NULL)) { RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\\n", __func__, mbuf->userdata); route4_put((struct route_entry *)mbuf->userdata); } //查找出口路由,此处查找时fl4_daddr设置成RS的daddr,因为需要dpvs将mbuf中目的ip转换成真正的rs ip memset(&fl4, 0, sizeof(struct flow4)); fl4.fl4_daddr = conn->daddr.in; //nat模式下client ip不变 fl4.fl4_saddr = conn->caddr.in; fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); //如果未找到出口路由,则丢弃数据报返回 if (!rt) { err = EDPVS_NOROUTE; goto errout; } //通过出口路由设置conn缓存字段 dp_vs_conn_cache_rt(conn, rt, true); mtu = rt->mtu; if (mbuf->pkt_len > mtu && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); err = EDPVS_FRAG; goto errout; } //设置mbuf的路由缓存项 mbuf->userdata = rt; /* after route lookup and before translation */ //递减ttl,如果递减后ttl=0,则丢弃数据报,并通过icmp通知错误 if (xmit_ttl) { if (unlikely(iph->time_to_live <= 1)) { icmp_send(mbuf, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); err = EDPVS_DROP; goto errout; } iph->time_to_live--; } /* L3 translation before l4 re-csum */ //清零ip层chksum,因为ip层首部字段会发生变化,需要重新计算校验和 iph->hdr_checksum = 0; //将数据报中目的ip地址替换为后端rs地址 iph->dst_addr = conn->daddr.in.s_addr; /* L4 NAT translation */ //执行nat_in_handler,tcp中为tcp_snat_in_handler if 
(proto->nat_in_handler) { err = proto->nat_in_handler(proto, conn, mbuf); if (err != EDPVS_OK) { goto errout; } } //如果网卡支持硬件校验和,则设置校验和为0由硬件计算,否则软件计算校验和 if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) { iph->hdr_checksum = 0; } else { ip4_send_csum(iph); } //回调 INET_HOOK_LOCAL_OUT 链注册的回调,dpvs中目前没有注册hook函数,所以实际就是调用ipv4_output,发送mbuf return(INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output)); errout: if (rt) { route4_put(rt); } rte_pktmbuf_free(mbuf); return(err); }
-
tcp_snat_in_handler
- 主要执行tcp层校验和计算
- 由于nat模式下会修改ip首部源地址或者目的地址,tcp首部中伪首部会发生变化,导致tcp校验和会发生变化
/** * 主要执行tcp层校验和计算,由于nat模式下会修改ip首部源地址或者目的地址,tcp首部中伪首部会发生变化,导致tcp校验和会发生变化 */ static int tcp_snat_in_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; int af = conn->af; int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf) : ip4_hdrlen(mbuf)); //长度校验 if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) { return(EDPVS_INVPKT); } //th指向mbuf中tcp首部 th = tcp_hdr(mbuf); if (unlikely(!th)) { return(EDPVS_INVPKT); } //长度校验 if (mbuf_may_pull(mbuf, iphdrlen + (th->doff << 2)) != 0) { return(EDPVS_INVPKT); } /* L4 translation */ //将目的端口更改成rs server的实际端口 th->dest = conn->dport; /* L4 re-checksum */ //重新计算校验和 return(tcp_send_csum(af, iphdrlen, th, conn, mbuf)); }
outbound方向处理
-
dp_vs_out_xmit_nat
//包裹函数,根据协议族执行不同的内部函数 int dp_vs_out_xmit_nat(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { int af = conn->af; assert(af == AF_INET || af == AF_INET6); return(af == AF_INET ? __dp_vs_out_xmit_nat4(proto, conn, mbuf) : __dp_vs_out_xmit_nat6(proto, conn, mbuf)); }
-
__dp_vs_out_xmit_nat4
/** * nat模式下,outbound方向传输函数,流量做snat,将源地址换成lb ip * proto: 传输层协议实例 * conn: 连接信息 * mbuf: 从RS server--->LB的数据报,需要做snat后传输至外部客户端 */ static int __dp_vs_out_xmit_nat4(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct flow4 fl4; //iph指向mbuf中ip首部 struct ipv4_hdr * iph = ip4_hdr(mbuf); struct route_entry *rt; int err, mtu; if (!fast_xmit_close && !(conn->flags & DPVS_CONN_F_NOFASTXMIT)) { dp_vs_save_outxmit_info(mbuf, proto, conn); if (!dp_vs_fast_outxmit_nat(proto, conn, mbuf)) { return(EDPVS_OK); } } /* * drop old route. just for safe, because * NAT is PREROUTING, should not have route. */ //mbuf->userdata中存放路由缓存项 if (unlikely(mbuf->userdata != NULL)) { RTE_LOG(WARNING, IPVS, "%s: NAT have route %p ?\\n", __func__, mbuf->userdata); route4_put((struct route_entry *)mbuf->userdata); } //查找出口路由,目的地址为client ip,源地址更改为LB vip memset(&fl4, 0, sizeof(struct flow4)); fl4.fl4_daddr = conn->caddr.in; fl4.fl4_saddr = conn->vaddr.in; fl4.fl4_tos = iph->type_of_service; rt = route4_output(&fl4); if (!rt) { err = EDPVS_NOROUTE; goto errout; } //根据rt信息更新conn cache dp_vs_conn_cache_rt(conn, rt, false); //如果数据报长度超过出口路由的mtu,并且数据报中有DF标记,则丢弃数据报,并通过icmp回复错误消息 mtu = rt->mtu; if (mbuf->pkt_len > mtu && (iph->fragment_offset & htons(IPV4_HDR_DF_FLAG))) { RTE_LOG(DEBUG, IPVS, "%s: frag needed.\\n", __func__); icmp_send(mbuf, ICMP_DEST_UNREACH, ICMP_UNREACH_NEEDFRAG, htonl(rt->mtu)); err = EDPVS_FRAG; goto errout; } //更新mbuf的路由缓存项 mbuf->userdata = rt; /* after route lookup and before translation */ //递减ttl并做出错处理 if (xmit_ttl) { if (unlikely(iph->time_to_live <= 1)) { icmp_send(mbuf, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); err = EDPVS_DROP; goto errout; } iph->time_to_live--; } /* L3 translation before l4 re-csum */ //ip首部校验和清零,snat会修改ip首部源ip地址 iph->hdr_checksum = 0; //源ip地址更改为vip地址 iph->src_addr = conn->vaddr.in.s_addr; /* L4 NAT translation */ //执行nat_out_handler,主要重新计算传输层的校验和,tcp中为tcp_snat_out_handler if (proto->nat_out_handler) { err = proto->nat_out_handler(proto, 
conn, mbuf); if (err != EDPVS_OK) { goto errout; } } //重新计算IP层校验和 if (likely(mbuf->ol_flags & PKT_TX_IP_CKSUM)) { iph->hdr_checksum = 0; } else { ip4_send_csum(iph); } //INET_HOOK_LOCAL_OUT处暂时没有hook_ops,等于直接调用ipv4_output return(INET_HOOK(AF_INET, INET_HOOK_LOCAL_OUT, mbuf, NULL, rt->port, ipv4_output)); errout: if (rt) { route4_put(rt); } rte_pktmbuf_free(mbuf); return(err); }
-
tcp_snat_out_handler
//snat中重新计算tcp首部校验和 static int tcp_snat_out_handler(struct dp_vs_proto *proto, struct dp_vs_conn *conn, struct rte_mbuf *mbuf) { struct tcphdr *th; int af = conn->af; int iphdrlen = ((AF_INET6 == af) ? ip6_hdrlen(mbuf) : ip4_hdrlen(mbuf)); if (mbuf_may_pull(mbuf, iphdrlen + sizeof(*th)) != 0) { return(EDPVS_INVPKT); } th = tcp_hdr(mbuf); if (unlikely(!th)) { return(EDPVS_INVPKT); } if (mbuf_may_pull(mbuf, iphdrlen + (th->doff << 2)) != 0) { return(EDPVS_INVPKT); } /* L4 translation */ th->source = conn->vport; /* L4 re-checksum */ return(tcp_send_csum(af, iphdrlen, th, conn, mbuf)); }
nat 模式配置实例
-
拓扑,如开头
-
配置
-
DPVS configs
## DPVS configs ## # config LAN network on bond0, routes will generate automatically ./dpip addr add 192.168.0.66/24 dev bond0 ./dpip addr add 10.140.31.48/20 dev bond0 # add service <VIP:vport> to forwarding, scheduling mode is RR ./ipvsadm -A -t 192.168.0.89:80 -r 10.140.18.33 -m ./ipvsadm -A -t 192.168.0.89:80 -r 10.140.18.34 -m # add VIP and the route will generate automatically ./dpip addr add 192.168.0.89/32 dev bond0
-
keepalived configs
static_ipaddress { 192.168.0.66/24 dev bond0 10.140.31.48/20 dev bond0 } virtual_server_group vip_nat { 192.168.0.89 80 } virtual_server group vip_nat { protocol tcp lb_algo rr lb_kind NAT real server 10.140.18.33 80 { weight 100 inhibit_on_failure TCP_CHECK { nb_sock_retry 2 connect_timeout 3 connect_port 80 } } real server 10.140.18.34 80 { weight 100 inhibit_on_failure TCP_CHECK { nb_sock_retry 2 connect_timeout 3 connect_port 80 } } }
-
On RSs, back routes should be pointed to DPVS
## for each real server ip route add 192.168.0.0/24 via 10.140.31.48 dev eth0
-
-
Now you can test DPVS NAT mode
client$ curl 192.168.0.89:80 Hi, I am 10.140.18.33 client$ curl 192.168.0.89:80 Hi, I am 10.140.18.34