Linux下NAT功能的实现 | |
| |
来源: ChinaUnix博客 日期: 2009.04.30 11:58 (共有条评论) 我要评论 | |
本文档的Copyleft归yfydz所有,使用GPL发布,可以自由拷贝、转载,转载时请保持文档的完整性,严禁用于任何商业用途。 msn: yfydz_no1@hotmail.com 来源: http://yfydz.cublog.cn 1. 前言 在2.4/2.6内核的Linux中的防火墙代码netfilter中支持源NAT(SNAT)和目的NAT (DNAT),基本可以满足各种类型的NAT需求,本文介绍Linux下的NAT的具体实现过程,所引的内核代码版本2.4.26,NAT原理部分不在此介绍,有兴趣者可先看我的另一篇NAT原理介绍的文章。 2. NAT hook NAT操作也是以netfilter节点形式挂接在相应的处理点上的,DNAT挂接在NF_IP_PRE_ROUTING点上,优先级高于FILTER低于MANGLE,表示在mangle表后处理,但在filter表前处理数据包;SNAT挂接在NF_IP_POST_ROUTING点上,优先级低于FILTER,表示在filter表后面处理数据包。 在net/ipv4/netfilter/ip_nat_standalone.c中: 目的NAT的hook节点: /* Before packet filtering, change destination */ static struct nf_hook_ops ip_nat_in_ops = { { NULL, NULL }, ip_nat_fn, PF_INET, NF_IP_PRE_ROUTING, NF_IP_PRI_NAT_DST }; 源NAT的hook节点: /* After packet filtering, change source */ static struct nf_hook_ops ip_nat_out_ops = { { NULL, NULL }, ip_nat_out, PF_INET, NF_IP_POST_ROUTING, NF_IP_PRI_NAT_SRC}; include/linux/netfilter_ipv4.h enum nf_ip_hook_priorities { NF_IP_PRI_FIRST = INT_MIN, NF_IP_PRI_CONNTRACK = -200, // 连接跟踪 NF_IP_PRI_MANGLE = -150, // mangle table NF_IP_PRI_NAT_DST = -100, // DNAT NF_IP_PRI_FILTER = 0, // filter table NF_IP_PRI_NAT_SRC = 100, // SNAT NF_IP_PRI_LAST = INT_MAX, }; ip_nat_fn()是NAT hook的主处理函数,ip_nat_out()函数也是在数据合法性检查后调用ip_nat_fn()函数。 3. NAT处理相关结构 在状态连接结构struct ip_conntrack中包含了关于NAT的相关结构(include/linux/netfilter/ip_conntrack.h): struct ip_conntrack { ...... #ifdef CONFIG_IP_NF_NAT_NEEDED struct { struct ip_nat_info info; union ip_conntrack_nat_help help; #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \ defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) int masq_index; #endif } nat; #endif /* CONFIG_IP_NF_NAT_NEEDED */ }; 其中比较重要的是struct ip_nat_info结构,而union ip_conntrack_nat_help是各协议NAT时需要特殊处理的结构描述,不过在2.4.26内核中都没定义,联合为空。 #define IP_NAT_MAX_MANIPS (2*3) // 此结构描述数据包中要修改部分的信息 struct ip_nat_info_manip { /* The direction. */ u_int8_t direction; /* Which hook the manipulation happens on. */ u_int8_t hooknum; /* The manipulation type. 
*/ u_int8_t maniptype; // 修改类型: SNAT / DNAT // 连接的数据包要修改的信息,包括地址和上层的协议信息 /* Manipulations to occur at each conntrack in this dirn. */ struct ip_conntrack_manip manip; }; /* The structure embedded in the conntrack structure. */ struct ip_nat_info { /* Set to zero when conntrack created: bitmask of maniptypes */ int initialized; // 实际最多用两位 unsigned int num_manips; /* Manipulations to be done on this conntrack. */ // 每个最多可以记录6个NAT信息 struct ip_nat_info_manip manips[IP_NAT_MAX_MANIPS]; struct ip_nat_hash bysource, byipsproto; // 按地址和协议的HASH表 /* Helper (NULL if none). */ struct ip_nat_helper *helper; // 多连接协议的NAT时的helper struct ip_nat_seq seq[IP_CT_DIR_MAX]; // 描述两个方向的序列号变化情况 }; 4. ip_nat_fn()函数 ip_nat_fn()是NAT hook的基本处理函数(net/ipv4/netfilter/ip_nat_standalone.c),目的是建立连接的NAT info信息, 并修改数据包中的相应部分。 static unsigned int ip_nat_fn(unsigned int hooknum, struct sk_buff **pskb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; struct ip_nat_info *info; /* maniptype == SRC for postrouting. */ // 根据hooknum来确定进行哪种方式的NAT,netfilter在hook点是能进行哪种NAT是固定的: // NF_IP_PRE_ROUTING点进行的是DNAT,maniptype=1 // NF_IP_POST_ROUTING点进行的是SNAT,maniptype=0 enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); /* We never see fragments: conntrack defrags on pre-routing and local-out, and ip_nat_out protects post-routing. */ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))); (*pskb)->nfcache |= NFC_UNKNOWN; /* If we had a hardware checksum before, it's now invalid */ if ((*pskb)->ip_summed == CHECKSUM_HW) (*pskb)->ip_summed = CHECKSUM_NONE; // 进行NAT的包必须都经过的连接跟踪处理,如果找不到该包对应的连接,不对其进行NAT处理 // 连接跟踪优先级最高,是数据包一进入netfilter就要进行处理的 ct = ip_conntrack_get(*pskb, &ctinfo); /* Can't track? It's not due to stress, or conntrack would have dropped it. Hence it's the user's responsibilty to packet filter it out, or implement conntrack/NAT for that protocol. 
8) --RR */ if (!ct) { /* Exception: ICMP redirect to new connection (not in hash table yet). We must not let this through, in case we're doing NAT to the same network. */ struct iphdr *iph = (*pskb)->nh.iph; struct icmphdr *hdr = (struct icmphdr *) ((u_int32_t *)iph + iph->ihl); if (iph->protocol == IPPROTO_ICMP && hdr->type == ICMP_REDIRECT) return NF_DROP; return NF_ACCEPT; } switch (ctinfo) { //对于相关连接、相关连接的回复、新连接的包进行NAT信息的构建 case IP_CT_RELATED: case IP_CT_RELATED+IP_CT_IS_REPLY: if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { return icmp_reply_translation(*pskb, ct, hooknum, CTINFO2DIR(ctinfo)); } /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ case IP_CT_NEW: info = &ct->nat.info; WRITE_LOCK(&ip_nat_lock); /* Seen it before? This can happen for loopback, retrans, or local packets.. */ // 检查是否已经进行相应方向的初始化,注意初始化可以是两个方向同时进行的 // 这就是说一个数据包可以同时修改源和目的, 这在服务器和内网在相同网段时会用到, // netfilter已经能自动处理这种情况,根本不需要进行修改,以前我的理解有误,以为 // 只能修改一个方向的数据 if (!(info->initialized & (1 << maniptype)) #ifndef CONFIG_IP_NF_NAT_LOCAL /* If this session has already been confirmed we must not * touch it again even if there is no mapping set up. * Can only happen on local->local traffic with * CONFIG_IP_NF_NAT_LOCAL disabled. */ && !(ct->status & IPS_CONFIRMED) #endif ) { unsigned int ret; if (ct->master && master_ct(ct)->nat.info.helper && master_ct(ct)->nat.info.helper->expect) { // 多连接协议情况, 如果是子连接, 调用主连接相关的expect函数处理填写NAT info信息 ret = call_expect(master_ct(ct), pskb, hooknum, ct, info); } else { #ifdef CONFIG_IP_NF_NAT_LOCAL /* LOCAL_IN hook doesn't have a chain! */ if (hooknum == NF_IP_LOCAL_IN) ret = alloc_null_binding(ct, info, hooknum); else #endif // 否则根据NAT规则表查找规则, 执行规则的动作: SNAT或DNAT, 填写NAT info信息 ret = ip_nat_rule_find(pskb, hooknum, in, out, ct, info); } // 返回值不是接受的话直接返回, 数据包将被丢弃 if (ret != NF_ACCEPT) { WRITE_UNLOCK(&ip_nat_lock); return ret; } } else DEBUGP("Already setup manip %s for ct %p\n", maniptype == IP_NAT_MANIP_SRC ? 
"SRC" : "DST", ct); WRITE_UNLOCK(&ip_nat_lock); break; default: // 连接的NAT信息已经填好, 直接使用 /* ESTABLISHED */ IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); info = &ct->nat.info; } IP_NF_ASSERT(info); // 根据NAT info信息对数据包的相应部分进行修改 return do_bindings(ct, ctinfo, info, hooknum, pskb); } 5. do_bindings()函数 do_bindings()是完成具体的NAT操作部分的函数(net/ipv4/netfilter/ip_nat_core.c),修改地址端口等信息,必要时修改数据内容部分信息(这种情况下可能数据包长度会变,序列号/确认号相应会改变,这些都累计进NAT info参数中),并重新计算各种校验和(TCP/UDP/ICMP校验和,IP头校验和): /* Do packet manipulations according to binding. */ unsigned int do_bindings(struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, struct ip_nat_info *info, unsigned int hooknum, struct sk_buff **pskb) { unsigned int i; struct ip_nat_helper *helper; // 数据方向:original or reply enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); // 是否是TCP协议,TCP协议要处理序列号/确认号 int is_tcp = (*pskb)->nh.iph->protocol == IPPROTO_TCP; /* Need nat lock to protect against modification, but neither conntrack (referenced) and helper (deleted with synchronize_bh()) can vanish. */ READ_LOCK(&ip_nat_lock); for (i = 0; i < info->num_manips; i++) { /* raw socket (tcpdump) may have clone of incoming skb: don't disturb it --RR */ if (skb_cloned(*pskb) && !(*pskb)->sk) { struct sk_buff *nskb = skb_copy(*pskb, GFP_ATOMIC); if (!nskb) { READ_UNLOCK(&ip_nat_lock); return NF_DROP; } kfree_skb(*pskb); *pskb = nskb; } // 检查数据包方向和hooknum是否是与NAT info中规定的一致 if (info->manips[i].direction == dir && info->manips[i].hooknum == hooknum) { DEBUGP("Mangling %p: %s to %u.%u.%u.%u %u\n", *pskb, info->manips[i].maniptype == IP_NAT_MANIP_SRC ? 
"SRC" : "DST", NIPQUAD(info->manips[i].manip.ip), htons(info->manips[i].manip.u.all)); // 进行具体的NAT操作,修改IP头的地址、TCP、UDP等的端口 manip_pkt((*pskb)->nh.iph->protocol, (*pskb)->nh.iph, (*pskb)->len, &info->manips[i].manip, info->manips[i].maniptype, &(*pskb)->nfcache); } } helper = info->helper; READ_UNLOCK(&ip_nat_lock); // 多连接协议 if (helper) { struct ip_conntrack_expect *exp = NULL; struct list_head *cur_item; int ret = NF_ACCEPT; int helper_called = 0; DEBUGP("do_bindings: helper existing for (%p)\n", ct); /* Always defragged for helpers */ IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET))); /* Have to grab read lock before sibling_list traversal */ READ_LOCK(&ip_conntrack_lock); // 主连接的子连接链表是倒着搜索的 list_for_each_prev(cur_item, &ct->sibling_list) { // 取得期待的连接信息 exp = list_entry(cur_item, struct ip_conntrack_expect, expected_list); /* if this expectation is already established, skip */ // 期待的子连接已经到了,不用再处理 if (exp->sibling) continue; // 检查数据包是否是要修改的数据包,对于UDP、ICMP函数返回始终是1,TCP协议时才可能返回0 if (exp_for_packet(exp, pskb)) { /* FIXME: May be true multiple times in the * case of UDP!! 
*/ DEBUGP("calling nat helper (exp=%p) for packet\n", exp); // 调用多连接协议的help函数修改内容部分的相关数据 ret = helper->help(ct, exp, info, ctinfo, hooknum, pskb); if (ret != NF_ACCEPT) { READ_UNLOCK(&ip_conntrack_lock); return ret; } helper_called = 1; } } /* Helper might want to manip the packet even when there is no * matching expectation for this packet */ if (!helper_called && helper->flags & IP_NAT_HELPER_F_ALWAYS) { DEBUGP("calling nat helper for packet without expectation\n"); ret = helper->help(ct, NULL, info, ctinfo, hooknum, pskb); if (ret != NF_ACCEPT) { READ_UNLOCK(&ip_conntrack_lock); return ret; } } READ_UNLOCK(&ip_conntrack_lock); /* Adjust sequence number only once per packet * (helper is called at all hooks) */ // 调整TCP的序列号 if (is_tcp && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) { DEBUGP("ip_nat_core: adjusting sequence number\n"); /* future: put this in a l4-proto specific function, * and call this function here. */ ip_nat_seq_adjust(*pskb, ct, ctinfo); } return ret; } else return NF_ACCEPT; /* not reached */ } manip_pkt()函数(net/ipv4/netfilter/ip_nat_core.c)相对就比较简单了,先修改传输层部分的数据参数(如TCP、UDP端口),再修改IP头中的地址: static void manip_pkt(u_int16_t proto, struct iphdr *iph, size_t len, const struct ip_conntrack_manip *manip, enum ip_nat_manip_type maniptype, __u32 *nfcache) { *nfcache |= NFC_ALTERED; // find_nat_proto函数始终会返回一个协议,因为如果不是能处理的协议,将 // 返回缺省的未知协议处理,由此也可知在IP上层协议NAT处理结构中的 // manip_pkt()函数不能为空,这个函数可以什么都不作,但不能为NULL find_nat_proto(proto)->manip_pkt(iph, len, manip, maniptype); // 根据NAT类型,修改源或目的IP地址 if (maniptype == IP_NAT_MANIP_SRC) { iph->check = ip_nat_cheat_check(~iph->saddr, manip->ip, iph->check); iph->saddr = manip->ip; } else { iph->check = ip_nat_cheat_check(~iph->daddr, manip->ip, iph->check); iph->daddr = manip->ip; } #if 0 if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) DEBUGP("IP: checksum on packet bad.\n"); if (proto == IPPROTO_TCP) { void *th = (u_int32_t *)iph + iph->ihl; if (tcp_v4_check(th, len - 4*iph->ihl, iph->saddr, iph->daddr, 
csum_partial((char *)th, len-4*iph->ihl, 0))) DEBUGP("TCP: checksum on packet bad\n"); } #endif } 6. SNAT、DNAT目标函数 前面在ip_nat_fn()函数中调用的ip_nat_rule_find()用来查找NAT规则,执行规则的动作,规则目标不是SNAT就是DNAT,该目标的具体实现在net/ipv4/netfilter/ip_nat_rule.c中。不论是SNAT还是DNAT规则,其目标函数最终都是调用ip_nat_setup_info()函数来建立连接的NAT info信息。 net/ipv4/netfilter/ip_nat_rule.c: /* Source NAT */ static unsigned int ipt_snat_target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, const struct net_device *out, const void *targinfo, void *userinfo) { struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); ct = ip_conntrack_get(*pskb, &ctinfo); /* Connection must be valid and new. */ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); IP_NF_ASSERT(out); // 只有新连接才进行NAT info的建立 // targinfo实际是struct ip_nat_multi_range结构指针,记录转换后的 // 地址、端口等信息, 一个NAT规则可以转换到可以转换到多个地址端口上 return ip_nat_setup_info(ct, targinfo, hooknum); } static unsigned int ipt_dnat_target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, const struct net_device *out, const void *targinfo, void *userinfo) { struct ip_conntrack *ct; enum ip_conntrack_info ctinfo; #ifdef CONFIG_IP_NF_NAT_LOCAL IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_LOCAL_OUT); #else IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING); #endif ct = ip_conntrack_get(*pskb, &ctinfo); /* Connection must be valid and new. */ IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); // 只有新连接才进行NAT info的建立 // targinfo实际是struct ip_nat_multi_range结构指针,记录转换后的 // 地址、端口等信息, 一个NAT规则可以转换到可以转换到多个地址端口上 return ip_nat_setup_info(ct, targinfo, hooknum); } ...... 
int ip_nat_rule_find(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, const struct net_device *out, struct ip_conntrack *ct, struct ip_nat_info *info) { int ret; ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL); if (ret == NF_ACCEPT) { // 数据接受但还没有初始化,分配一个NULL binding,实际不作任何修改,也就是 // 说对该包没有相应的NAT规则对应,不需要进行NAT处理 if (!(info->initialized & (1 << HOOK2MANIP(hooknum)))) /* NUL mapping */ ret = alloc_null_binding(ct, info, hooknum); } return ret; } 7. ip_nat_setup_info()函数 ip_nat_setup_info()函数(net/ipv4/netfilter/ip_nat_core.c)根据NAT规则给出的地址端口范围建立连接的NAT info信息: unsigned int ip_nat_setup_info(struct ip_conntrack *conntrack, const struct ip_nat_multi_range *mr, unsigned int hooknum) { struct ip_conntrack_tuple new_tuple, inv_tuple, reply; struct ip_conntrack_tuple orig_tp; struct ip_nat_info *info = &conntrack->nat.info; // 如果info->initialized不为0,表示已经初始化过了 int in_hashes = info->initialized; MUST_BE_WRITE_LOCKED(&ip_nat_lock); IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_OUT); IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS); IP_NF_ASSERT(!(info->initialized & (1 << HOOK2MANIP(hooknum)))); /* What we've got will look like inverse of reply. Normally this is what is in the conntrack, except for prior manipulations (future optimization: if num_manips == 0, orig_tp = conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ // 根据连接的回应方向的tuple进行反转得到原始方向的tuple invert_tuplepr(&orig_tp, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple); #if 0 { unsigned int i; DEBUGP("Hook %u (%s), ", hooknum, HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST"); DUMP_TUPLE(&orig_tp); DEBUGP("Range %p: ", mr); for (i = 0; i < mr->rangesize; i++) { DEBUGP("%u:%s%s%s %u.%u.%u.%u - %u.%u.%u.%u %u - %u\n", i, (mr->range[i].flags & IP_NAT_RANGE_MAP_IPS) ? " MAP_IPS" : "", (mr->range[i].flags & IP_NAT_RANGE_PROTO_SPECIFIED) ? " PROTO_SPECIFIED" : "", (mr->range[i].flags & IP_NAT_RANGE_FULL) ? " FULL" : "", NIPQUAD(mr->range[i].min_ip), NIPQUAD(mr->range[i].max_ip), mr->range[i].min.all, mr->range[i].max.all); } } #endif do { // 找一个未使用的进行了转换后的tuple结构参数,mr是NAT规则确定的要转换后的 // 地址端口参数, new_tuple保持转换后的连接原始方向的tuple if (!get_unique_tuple(&new_tuple, &orig_tp, mr, conntrack, hooknum)) { DEBUGP("ip_nat_setup_info: Can't get unique for %p.\n", conntrack); return NF_DROP; } #if 0 DEBUGP("Hook %u (%s) %p\n", hooknum, HOOK2MANIP(hooknum)==IP_NAT_MANIP_SRC ? "SRC" : "DST", conntrack); DEBUGP("Original: "); DUMP_TUPLE(&orig_tp); DEBUGP("New: "); DUMP_TUPLE(&new_tuple); #endif /* We now have two tuples (SRCIP/SRCPT/DSTIP/DSTPT): the original (A/B/C/D') and the mangled one (E/F/G/H'). 
We're only allowed to work with the SRC per-proto part, so we create inverses of both to start, then derive the other fields we need. */ /* Reply connection: simply invert the new tuple (G/H/E/F') */ // 建立连接地址转换后的反向的tuple,这使netfilter能自动对连接的反方向数据 // 进行处理,也就是说定义了一条SNAT规则后,并不需要再定义一条DNAT规则来处理 // 返回的数据,netfilter已经自动处理了 invert_tuplepr(&reply, &new_tuple); /* Alter conntrack table so it recognizes replies. If fail this race (reply tuple now used), repeat. */ // 修改连接参数使能正确识别返回数据,如果reply已经对应一条连接 // ip_conntrack_alter_reply()函数返回0,表示要继续修改转换后的参数值 } while (!ip_conntrack_alter_reply(conntrack, &reply)); /* FIXME: We can simply used existing conntrack reply tuple here --RR */ /* Create inverse of original: C/D/A/B' */ invert_tuplepr(&inv_tuple, &orig_tp); /* Has source changed?. */ // 源NAT if (!ip_ct_tuple_src_equal(&new_tuple, &orig_tp)) { /* In this direction, a source manip. */ // 连接正方向是SNAT info->manips[info->num_manips++] = ((struct ip_nat_info_manip) { IP_CT_DIR_ORIGINAL, hooknum, IP_NAT_MANIP_SRC, new_tuple.src }); IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS); /* In the reverse direction, a destination manip. */ info->manips[info->num_manips++] = ((struct ip_nat_info_manip) { IP_CT_DIR_REPLY, opposite_hook[hooknum], IP_NAT_MANIP_DST, orig_tp.src }); IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS); } /* Has destination changed? */ if (!ip_ct_tuple_dst_equal(&new_tuple, &orig_tp)) { /* In this direction, a destination manip */ info->manips[info->num_manips++] = ((struct ip_nat_info_manip) { IP_CT_DIR_ORIGINAL, hooknum, IP_NAT_MANIP_DST, reply.src }); IP_NF_ASSERT(info->num_manips < IP_NAT_MAX_MANIPS); /* In the reverse direction, a source manip. */ info->manips[info->num_manips++] = ((struct ip_nat_info_manip) { IP_CT_DIR_REPLY, opposite_hook[hooknum], IP_NAT_MANIP_SRC, inv_tuple.src }); IP_NF_ASSERT(info->num_manips <= IP_NAT_MAX_MANIPS); } /* If there's a helper, assign it; based on new tuple. */ if (!conntrack->master) info->helper = LIST_FIND(&helpers, helper_cmp, struct ip_nat_helper *, &reply); /* It's done. */ // 完成该方向的NAT info初始化 info->initialized |= (1 << HOOK2MANIP(hooknum)); if (in_hashes) { IP_NF_ASSERT(info->bysource.conntrack); replace_in_hashes(conntrack, info); } else { place_in_hashes(conntrack, info); } return NF_ACCEPT; } 8. 
结论 Linux下的NAT流程可以大致表示如下: hook_ops | V ip_nat_fn() | V 否 是否是新连接------------+ | | | 是 | | | V | ip_nat_rule_find() | | | ipt_snat_target() | ipt_dnat_target() | | | V | ip_nat_setup_info() | | | | |