The Linux Netfilter Framework: Design of the Framework

Packet mangling lets you set or change a packet's Type of Service (TOS) field, change its Time to Live (TTL) field, and set a mark value on the packet that can later be used for bandwidth limiting and classification lookups.
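As a quick userspace illustration (hypothetical rules, assuming the TOS, TTL and MARK target extensions are compiled in and a tc class hierarchy already exists), the iptables mangle table exposes exactly these operations:

    # rewrite the TOS field for interactive ssh traffic
    iptables -t mangle -A PREROUTING -p tcp --dport 22 -j TOS --set-tos Minimize-Delay
    # rewrite the TTL field
    iptables -t mangle -A PREROUTING -j TTL --ttl-set 64
    # mark web traffic, then classify on the mark for bandwidth control
    iptables -t mangle -A PREROUTING -p tcp --dport 80 -j MARK --set-mark 0x1
    tc filter add dev eth0 parent 1: protocol ip prio 1 handle 0x1 fw flowid 1:10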

With the overview above we now have a rough picture of netfilter and how its modules are divided. Let's take a look at the framework diagram:

[Figure: netfilter hook points, 5 in the IP layer, 5 in the bridge layer, 3 in the ARP layer]

This figure covers essentially all of the hook points: 5 in the IP layer, 5 in the bridge layer, and 3 in the ARP layer. It only marks the hook points; it does not show the concrete function calls (bear with me ^^). Next, let's analyze how this framework is set in motion.
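For reference, the five IP-layer hook points correspond to this enum from include/linux/netfilter.h (the bridge and ARP layers have their own analogous NF_BR_* and NF_ARP_* constants):

    enum nf_inet_hooks {
        NF_INET_PRE_ROUTING,    /* incoming packets, before the routing decision */
        NF_INET_LOCAL_IN,       /* incoming, addressed to this host */
        NF_INET_FORWARD,        /* being routed through this host */
        NF_INET_LOCAL_OUT,      /* generated by a local process */
        NF_INET_POST_ROUTING,   /* all outgoing packets, just before the device */
        NF_INET_NUMHOOKS
    };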

From the previous article on the NAPI mechanism we know that a received packet ultimately goes through netif_receive_skb, which looks up the handler for the packet's Ethernet protocol and pushes it further up the stack. Let's look at that function:

/**
 * netif_receive_skb - process receive buffer from network
 * @skb: buffer to process
 *
 * netif_receive_skb() is the main receive data processing function.
 * It always succeeds. The buffer may be dropped during processing
 * for congestion control or by the protocol layers.
 *
 * This function may only be called from softirq context and interrupts
 * should be enabled.
 *
 * Return values (usually ignored):
 * NET_RX_SUCCESS: no congestion
 * NET_RX_DROP: packet was dropped
 */
int netif_receive_skb(struct sk_buff *skb)
{
    if (netdev_tstamp_prequeue)
        net_timestamp_check(skb);

    if (skb_defer_rx_timestamp(skb))
        return NET_RX_SUCCESS;

#ifdef CONFIG_RPS
    {
        struct rps_dev_flow voidflow, *rflow = &voidflow;
        int cpu, ret;

        rcu_read_lock();

        cpu = get_rps_cpu(skb->dev, skb, &rflow);

        if (cpu >= 0) {
            ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
            rcu_read_unlock();
        } else {
            rcu_read_unlock();
            ret = __netif_receive_skb(skb);
        }
        return ret;
    }
#else
    return __netif_receive_skb(skb);
#endif
}

Notice that the actual packet-processing code has been factored out of this function. And the process_backlog code from section 3.1.1 makes it clear that that path, too, hands packets straight to __netif_receive_skb.
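As a refresher, here is a heavily abridged sketch of that path (reconstructed from memory of a 3.x kernel; the RPS and irq details are trimmed, so treat it as a memory aid rather than verbatim source):

    static int process_backlog(struct napi_struct *napi, int quota)
    {
        struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
        struct sk_buff *skb;
        int work = 0;

        /* ... RPS IPI bookkeeping and irq enable/disable dance trimmed ... */
        while (work < quota && (skb = __skb_dequeue(&sd->process_queue))) {
            __netif_receive_skb(skb);   /* same entry point as the non-RPS path */
            work++;
        }
        /* ... refill process_queue from input_pkt_queue; napi_complete() when empty ... */
        return work;
    }

With both paths converging there, let's look at __netif_receive_skb itself: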


static int __netif_receive_skb(struct sk_buff *skb)
{
    struct packet_type *ptype, *pt_prev;
    rx_handler_func_t *rx_handler;
    struct net_device *orig_dev;
    struct net_device *null_or_dev;
    bool deliver_exact = false;
    int ret = NET_RX_DROP;
    __be16 type;

    if (!netdev_tstamp_prequeue)
        net_timestamp_check(skb);

    trace_netif_receive_skb(skb);

    /* if we've gotten here through NAPI, check netpoll */
    if (netpoll_receive_skb(skb))
        return NET_RX_DROP;

    if (!skb->skb_iif)
        skb->skb_iif = skb->dev->ifindex;
    orig_dev = skb->dev;

    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);
    skb_reset_mac_len(skb);

    pt_prev = NULL;

    rcu_read_lock();

another_round:

    __this_cpu_inc(softnet_data.processed);

    if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
        skb = vlan_untag(skb);
        if (unlikely(!skb))
            goto out;
    }

#ifdef CONFIG_NET_CLS_ACT
    if (skb->tc_verd & TC_NCLS) {
        skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
        goto ncls;
    }
#endif

    list_for_each_entry_rcu(ptype, &ptype_all, list) {
        if (!ptype->dev || ptype->dev == skb->dev) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

#ifdef CONFIG_NET_CLS_ACT
    skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;
ncls:
#endif

    rx_handler = rcu_dereference(skb->dev->rx_handler);
    if (rx_handler) {
        if (pt_prev) {
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }
        switch (rx_handler(&skb)) {
        case RX_HANDLER_CONSUMED:
            goto out;
        case RX_HANDLER_ANOTHER:
            goto another_round;
        case RX_HANDLER_EXACT:
            deliver_exact = true;
        case RX_HANDLER_PASS:
            break;
        default:
            BUG();
        }
    }

    if (vlan_tx_tag_present(skb)) {
        if (pt_prev) {
            ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = NULL;
        }
        if (vlan_do_receive(&skb)) {
            ret = __netif_receive_skb(skb);
            goto out;
        } else if (unlikely(!skb))
            goto out;
    }

    /* deliver only exact match when indicated */
    null_or_dev = deliver_exact ? skb->dev : NULL;

    type = skb->protocol;
    list_for_each_entry_rcu(ptype,
            &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type &&
            (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
             ptype->dev == orig_dev)) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }

    if (pt_prev) {
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
        atomic_long_inc(&skb->dev->rx_dropped);
        kfree_skb(skb);
        /* Jamal, now you will not able to escape explaining
         * me how you were going to use this. :-)
         */
        ret = NET_RX_DROP;
    }

out:
    rcu_read_unlock();
    return ret;
}

Here we won't worry about the rest of the code, only the two lookup loops. The first list_for_each_entry_rcu walks ptype_all, the list of protocol handlers registered for ETH_P_ALL (defined in the if_ether.h header); registration happens through dev_add_pack, and this list is empty by default. The second list_for_each_entry_rcu is where the handler for the concrete protocol is looked up, ip_rcv for example; it walks the ptype_base hash table, whose entries are registered through the same dev_add_pack. Here is the key part once more:

    type = skb->protocol;
    list_for_each_entry_rcu(ptype,
            &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type &&
            (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
             ptype->dev == orig_dev)) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }
    ...
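To see where those ptype_base entries come from, here is how the IPv4 stack registers ip_rcv, lightly abridged from net/ipv4/af_inet.c of a 3.x kernel (the GSO/GRO callbacks and the rest of inet_init are trimmed):

    static struct packet_type ip_packet_type __read_mostly = {
        .type = cpu_to_be16(ETH_P_IP),
        .func = ip_rcv,         /* the second loop above ends up delivering to this */
    };

    static int __init inet_init(void)
    {
        /* ... protocol, socket and proc setup trimmed ... */
        dev_add_pack(&ip_packet_type);  /* hashes the entry into ptype_base */
        /* ... */
    }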

Through deliver_skb we finally reach the concrete protocol's receive module. But notice that __netif_receive_skb has no handle_bridge function, so where do bridge packets get handled? It turns out that between the two list_for_each_entry_rcu loops there is this line: rx_handler = rcu_dereference(skb->dev->rx_handler);

We know rx_handler will be called, but where is the concrete bridge handling function, and how does it get wired in?

The standard bridge input function is br_handle_frame. In br_if.c under net/bridge, when a port is added to a bridge, br_add_if does:

    err = netdev_rx_handler_register(dev, br_handle_frame, p);

So what does netdev_rx_handler_register itself do?

/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Register a receive handler for a device. This handler will then be
 * called from __netif_receive_skb. A negative errno code is returned
 * on a failure.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
                               rx_handler_func_t *rx_handler,
                               void *rx_handler_data)
{
    ASSERT_RTNL();

    if (dev->rx_handler)
        return -EBUSY;

    rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
    rcu_assign_pointer(dev->rx_handler, rx_handler);

    return 0;
}

Haha, seeing this, I think everyone gets it now: dev->rx_handler and rx_handler are tied together right here, and the call through skb->dev->rx_handler in __netif_receive_skb matches up perfectly ^^

We won't go into how the arp module receives packets here. All that groundwork, from interrupt handling up to the point where a real protocol module takes over, was still worth laying.

Time to serve the main course: ip_rcv, the function that handles received IP packets.


/*
 *  Main IP Receive routine.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
    const struct iphdr *iph;
    u32 len;

    /* When the interface is in promisc. mode, drop all the crap
     * that it receives, do not try to analyse it.
     */
    if (skb->pkt_type == PACKET_OTHERHOST)
        goto drop;

    IP_UPD_PO_STATS_BH(dev_net(dev), IPSTATS_MIB_IN, skb->len);

    if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto out;
    }

    if (!pskb_may_pull(skb, sizeof(struct iphdr)))
        goto inhdr_error;

    iph = ip_hdr(skb);

    /*
     *  RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
     *
     *  Is the datagram acceptable?
     *
     *  1.  Length at least the size of an ip header
     *  2.  Version of 4
     *  3.  Checksums correctly. [Speed optimisation for later, skip loopback checksums]
     *  4.  Doesn't have a bogus length
     */
    if (iph->ihl < 5 || iph->version != 4)
        goto inhdr_error;

    if (!pskb_may_pull(skb, iph->ihl*4))
        goto inhdr_error;

    iph = ip_hdr(skb);

    if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
        goto inhdr_error;

    len = ntohs(iph->tot_len);
    if (skb->len < len) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
        goto drop;
    } else if (len < (iph->ihl*4))
        goto inhdr_error;

    /* Our transport medium may have padded the buffer out. Now we know it
     * is IP we can trim to the true length of the frame.
     * Note this now means skb->len holds ntohs(iph->tot_len).
     */
    if (pskb_trim_rcsum(skb, len)) {
        IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
        goto drop;
    }

    /* Remove any debris in the socket control block */
    memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));

    /* Must drop socket now because of tproxy. */
    skb_orphan(skb);

    return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL,
                   ip_rcv_finish);

inhdr_error:
    IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
drop:
    kfree_skb(skb);
out:
    return NET_RX_DROP;
}

Quoting the entire function when only a single line matters here is perhaps overkill:

NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, skb, dev, NULL, ip_rcv_finish);

Matching this against our framework diagram, the guess is that the hook functions hanging at NF_INET_PRE_ROUTING are processed first, and ip_rcv_finish is called at the end. So what exactly does NF_HOOK do? Let's take a look:

static inline int
NF_HOOK(uint8_t pf, unsigned int hook, struct sk_buff *skb,
        struct net_device *in, struct net_device *out,
        int (*okfn)(struct sk_buff *))
{
    return NF_HOOK_THRESH(pf, hook, skb, in, out, okfn, INT_MIN);
}

Through NF_HOOK_THRESH it ends up calling just one function, nf_hook_slow. (NF_HOOK_THRESH first asks nf_hook_thresh whether anything is registered at this hook point; if the chain is empty or accepts the packet, okfn, here ip_rcv_finish, is run directly.)
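That intermediate step looks roughly like this (a sketch based on include/linux/netfilter.h of a 3.x kernel; the exact shape shifts between versions):

    static inline int
    NF_HOOK_THRESH(uint8_t pf, unsigned int hook, struct sk_buff *skb,
                   struct net_device *in, struct net_device *out,
                   int (*okfn)(struct sk_buff *), int thresh)
    {
        int ret = nf_hook_thresh(pf, hook, skb, in, out, okfn, thresh);
        if (ret == 1)
            ret = okfn(skb);    /* chain accepted (or was empty): carry on, e.g. ip_rcv_finish */
        return ret;
    }

    static inline int nf_hook_thresh(u_int8_t pf, unsigned int hook,
                                     struct sk_buff *skb,
                                     struct net_device *indev,
                                     struct net_device *outdev,
                                     int (*okfn)(struct sk_buff *), int thresh)
    {
        /* fast path: nothing registered at this hook point */
        if (list_empty(&nf_hooks[pf][hook]))
            return 1;
        return nf_hook_slow(pf, hook, skb, indev, outdev, okfn, thresh);
    }

And now nf_hook_slow itself: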


/* Returns 1 if okfn() needs to be executed by the caller,
 * -EPERM for NF_DROP, 0 otherwise. */
int nf_hook_slow(u_int8_t pf, unsigned int hook, struct sk_buff *skb,
                 struct net_device *indev,
                 struct net_device *outdev,
                 int (*okfn)(struct sk_buff *),
                 int hook_thresh)
{
    struct list_head *elem;
    unsigned int verdict;
    int ret = 0;

    /* We may already have this, but read-locks nest anyway */
    rcu_read_lock();

    elem = &nf_hooks[pf][hook];
next_hook:
    verdict = nf_iterate(&nf_hooks[pf][hook], skb, hook, indev,
                         outdev, &elem, okfn, hook_thresh);
    if (verdict == NF_ACCEPT || verdict == NF_STOP) {
        ret = 1;
    } else if ((verdict & NF_VERDICT_MASK) == NF_DROP) {
        kfree_skb(skb);
        ret = NF_DROP_GETERR(verdict);
        if (ret == 0)
            ret = -EPERM;
    } else if ((verdict & NF_VERDICT_MASK) == NF_QUEUE) {
        ret = nf_queue(skb, elem, pf, hook, indev, outdev, okfn,
                       verdict >> NF_VERDICT_QBITS);
        if (ret < 0) {
            if (ret == -ECANCELED)
                goto next_hook;
            if (ret == -ESRCH &&
                (verdict & NF_VERDICT_FLAG_QUEUE_BYPASS))
                goto next_hook;
            kfree_skb(skb);
        }
        ret = 0;
    }
    rcu_read_unlock();
    return ret;
}
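Before moving on, note how verdicts are encoded: the low byte carries the verdict itself, while the upper 16 bits smuggle an errno (for NF_DROP) or a queue number (for NF_QUEUE). A sketch of the relevant macros as they appear in 3.x-era include/linux/netfilter.h:

    #define NF_DROP      0
    #define NF_ACCEPT    1
    #define NF_STOLEN    2
    #define NF_QUEUE     3
    #define NF_REPEAT    4
    #define NF_STOP      5

    #define NF_VERDICT_MASK   0x000000ff    /* low byte: the verdict itself */
    #define NF_VERDICT_FLAG_QUEUE_BYPASS 0x00008000 /* accept if nobody listens on the queue */
    #define NF_VERDICT_QMASK  0xffff0000    /* upper 16 bits: errno or queue number */
    #define NF_VERDICT_QBITS  16

    #define NF_QUEUE_NR(x)    ((((x) << 16) & NF_VERDICT_QMASK) | NF_QUEUE)
    #define NF_DROP_GETERR(x) (-(x >> NF_VERDICT_QBITS))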

The line elem = &nf_hooks[pf][hook]; grabs the list head for this particular hook point. The key piece, though, is nf_iterate:

unsigned int nf_iterate(struct list_head *head,
                        struct sk_buff *skb,
                        unsigned int hook,
                        const struct net_device *indev,
                        const struct net_device *outdev,
                        struct list_head **i,
                        int (*okfn)(struct sk_buff *),
                        int hook_thresh)
{
    unsigned int verdict;

    /*
     * The caller must not block between calls to this
     * function because of risk of continuing from deleted element.
     */
    list_for_each_continue_rcu(*i, head) {
        struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

        if (hook_thresh > elem->priority)
            continue;

        /* Optimization: we don't need to hold module
           reference here, since function can't sleep. --RR */
repeat:
        verdict = elem->hook(hook, skb, indev, outdev, okfn);
        if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
            if (unlikely((verdict & NF_VERDICT_MASK)
                            > NF_MAX_VERDICT)) {
                NFDEBUG("Evil return from %p(%u).\n",
                        elem->hook, hook);
                continue;
            }
#endif
            if (verdict != NF_REPEAT)
                return verdict;
            goto repeat;
        }
    }
    return NF_ACCEPT;
}

Looking at the core of this function: list_for_each_continue_rcu walks the hook list, and elem->hook, the hook function registered by the kernel or by ourselves, is called to process the packet. The verdict it returns then drives what happens next: NF_REPEAT reruns the same hook, and anything other than NF_ACCEPT aborts the walk and is handed back to nf_hook_slow.
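To make that concrete, here is a minimal sketch of hanging our own hook at NF_INET_PRE_ROUTING, written against the 3.x-era API used above (demo_hook and demo_ops are made-up names; nf_register_hook was the registration call of this era, later replaced by nf_register_net_hook):

    #include <linux/module.h>
    #include <linux/in.h>
    #include <linux/ip.h>
    #include <linux/netfilter.h>
    #include <linux/netfilter_ipv4.h>

    /* Log ICMP packets at PRE_ROUTING and let everything pass. */
    static unsigned int demo_hook(unsigned int hooknum, struct sk_buff *skb,
                                  const struct net_device *in,
                                  const struct net_device *out,
                                  int (*okfn)(struct sk_buff *))
    {
        const struct iphdr *iph = ip_hdr(skb);

        if (iph->protocol == IPPROTO_ICMP)
            printk(KERN_INFO "demo: ICMP packet on %s\n", in ? in->name : "?");
        return NF_ACCEPT;       /* nf_iterate moves on to the next hook */
    }

    static struct nf_hook_ops demo_ops = {
        .hook     = demo_hook,
        .owner    = THIS_MODULE,
        .pf       = NFPROTO_IPV4,
        .hooknum  = NF_INET_PRE_ROUTING,
        .priority = NF_IP_PRI_FIRST,    /* run before everything else at this point */
    };

    static int __init demo_init(void)
    {
        /* splices demo_ops into nf_hooks[NFPROTO_IPV4][NF_INET_PRE_ROUTING],
         * the very list nf_hook_slow and nf_iterate walk above */
        return nf_register_hook(&demo_ops);
    }

    static void __exit demo_exit(void)
    {
        nf_unregister_hook(&demo_ops);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");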

Whew, that was a long analysis for what seems like not much of a main course. But we've only been laying the foundation here; there is a lot more to analyze and learn. This is just the beginning, so keep at it!
