dev_queue_xmit在/net/core/dev.c中
/*
 * dev_queue_xmit - submit an skb for transmission on skb->dev.
 *
 * Unless GSO will take care of it, the buffer is first linearized and/or
 * checksummed in software when the device features do not cover it.
 * The skb is then either enqueued on the device qdisc (when the qdisc has
 * an enqueue hook) or, for queue-less devices, handed directly to
 * dev_hard_start_xmit() under HARD_TX_LOCK.
 *
 * Returns the qdisc enqueue result (with NET_XMIT_BYPASS mapped to
 * NET_XMIT_SUCCESS), 0 after a successful direct transmit, -ENOMEM when
 * linearizing or software checksumming fails, and -ENETDOWN in every
 * other case. On the -ENOMEM and -ENETDOWN paths the skb is freed here.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
struct net_device *dev = skb->dev;
struct Qdisc *q;
/* Result reported by the early linearize/checksum failure exits. */
int rc = -ENOMEM;
/* GSO will handle the following emulations directly. */
if (netif_needs_gso(dev, skb))
goto gso;
/* A frag_list the device cannot take must be linearized first. */
if (skb_shinfo(skb)->frag_list &&
!(dev->features & NETIF_F_FRAGLIST) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* Fragmented skb is linearized if device does not support SG,
* or if at least one of fragments is in highmem and device
* does not support DMA from it.
*/
if (skb_shinfo(skb)->nr_frags &&
(!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
__skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not support
* checksumming for this protocol, complete checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL)
{
skb_set_transport_header(skb, skb->csum_start -
skb_headroom(skb));
if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
goto out_kfree_skb;
}
gso:
spin_lock_prefetch(&dev->queue_lock);
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
/* Updates of qdisc are serialized by queue_lock.
* The struct Qdisc which is pointed to by qdisc is now a
* rcu structure - it may be accessed without acquiring
* a lock (but the structure may be stale.) The freeing of the
* qdisc will be deferred until it's known that there are no
* more references to it.
*
* If the qdisc has an enqueue function, we still need to
* hold the queue_lock before calling it, since queue_lock
* also serializes access to the device queue.
*/
q = rcu_dereference(dev->qdisc);
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
#endif
if (q->enqueue)
{
/* Grab device queue */
spin_lock(&dev->queue_lock);
/* Re-read under queue_lock: the pointer read above may be stale. */
q = dev->qdisc;
if (q->enqueue)
{
/* reset queue_mapping to zero */
skb_set_queue_mapping(skb, 0);
rc = q->enqueue(skb, q);
qdisc_run(dev);
spin_unlock(&dev->queue_lock);
/* NET_XMIT_BYPASS is reported to the caller as success. */
rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
goto out;
}
spin_unlock(&dev->queue_lock);
}
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
Really, it is unlikely that netif_tx_lock protection is necessary
here. (f.e. loopback and IP tunnels are clean ignoring statistics
counters.)
However, it is possible, that they rely on protection
made by us here.
Check this and shot the lock. It is not prone from deadlocks.
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP)
{
int cpu = smp_processor_id(); /* ok because BHs are off */
/* Owner == this CPU means we are already inside the tx path. */
if (dev->xmit_lock_owner != cpu)
{
HARD_TX_LOCK(dev, cpu);
if (!netif_queue_stopped(dev) &&
!netif_subqueue_stopped(dev, skb))
{
rc = 0;
/* A zero return from the driver path ends the function with rc == 0. */
if (!dev_hard_start_xmit(skb, dev))
{
HARD_TX_UNLOCK(dev);
goto out;
}
}
/* Reached when the queue is stopped or the transmit returned non-zero. */
HARD_TX_UNLOCK(dev);
if (net_ratelimit())
printk(KERN_CRIT "Virtual device %s asks to "
"queue packet!\n", dev->name);
}
else
{
/* Recursion is detected! It is possible,
* unfortunately */
if (net_ratelimit())
printk(KERN_CRIT "Dead loop on virtual device "
"%s, fix it urgently!\n", dev->name);
}
}
/* Nothing was sent: report -ENETDOWN and fall through to free the skb. */
rc = -ENETDOWN;
rcu_read_unlock_bh();
out_kfree_skb:
kfree_skb(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
因为我们的skb->ip_summed为0
所以是不会进入到if (skb->ip_summed == CHECKSUM_PARTIAL)
虽然dev->qdisc是存在的,但是没有enqueue
所以也不会进入到if (q->enqueue)中
IFF_UP这个标志表示设备正在运行,不运行怎么发包呢~ 所以我们是会进入到if (dev->flags & IFF_UP)中
终于来到dev_hard_start_xmit
dev_hard_start_xmit在/net/core/dev.c中
/*
 * dev_hard_start_xmit - pass an skb to the driver's hard_start_xmit hook.
 *
 * With skb->next == NULL the skb is a single packet: it is handed to
 * dev_queue_xmit_nit() when ptype_all is non-empty and, if
 * netif_needs_gso() says so, run through dev_gso_segment(). If that left
 * a chain on skb->next, or the skb arrived with one, the chained skbs are
 * sent one at a time.
 *
 * Returns the driver's return value, NETDEV_TX_BUSY when the queue stops
 * while chained skbs remain, or 0. On the paths that reach out_kfree_skb
 * the skb passed in has been freed.
 */
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev)
{
if (likely(!skb->next)) {
if (!list_empty(&ptype_all))
dev_queue_xmit_nit(skb, dev);
if (netif_needs_gso(dev, skb)) {
/* Segmentation failure: free the skb and return 0. */
if (unlikely(dev_gso_segment(skb)))
goto out_kfree_skb;
/* A chain now hangs off skb->next: send its entries below. */
if (skb->next)
goto gso;
}
return dev->hard_start_xmit(skb, dev);
}
gso:
do {
/* Unlink the first chained skb and send it on its own. */
struct sk_buff *nskb = skb->next;
int rc;
skb->next = nskb->next;
nskb->next = NULL;
rc = dev->hard_start_xmit(nskb, dev);
if (unlikely(rc)) {
/* Driver refused it: put it back at the head of the chain. */
nskb->next = skb->next;
skb->next = nskb;
return rc;
}
/* Stop early if the queue was stopped and entries remain. */
if (unlikely((netif_queue_stopped(dev) ||
netif_subqueue_stopped(dev, skb)) &&
skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
/* Whole chain sent: reinstate the destructor kept in DEV_GSO_CB (presumably saved by dev_gso_segment() - confirm). */
skb->destructor = DEV_GSO_CB(skb)->destructor;
out_kfree_skb:
kfree_skb(skb);
return 0;
}
这里我们的skb的next是为NULL的,进入到if (likely(!skb->next)) 中
最重要就是这条return dev->hard_start_xmit(skb, dev)
我们lo设备的hard_start_xmit为loopback_xmit
loopback_xmit在/drivers/net/loopback.c中
/*
 * loopback_xmit - transmit hook of the loopback device.
 *
 * Nothing is put on a wire: the skb is detached from its socket, has its
 * link-layer header parsed, is accounted in the per-CPU counters and is
 * then fed to netif_rx(). Always returns 0.
 */
static int loopback_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct pcpu_lstats *pcpu_lstats, *lb_stats;
// detach the skb from its owning socket (runs the skb destructor, if any)
skb_orphan(skb);
// parse the link-layer header and record the protocol it carries
skb->protocol = eth_type_trans(skb,dev);
#ifndef LOOPBACK_MUST_CHECKSUM
skb->ip_summed = CHECKSUM_UNNECESSARY;
#endif
#ifdef LOOPBACK_TSO
if (skb_is_gso(skb)) {
BUG_ON(skb->protocol != htons(ETH_P_IP));
BUG_ON(ip_hdr(skb)->protocol != IPPROTO_TCP);
emulate_large_send_offload(skb);
return 0;
}
#endif
// record the time of the most recent receive on this device
dev->last_rx = jiffies;
/* it's OK to use per_cpu_ptr() because BHs are off */
pcpu_lstats = netdev_priv(dev);
// bump this CPU's byte and packet counters
lb_stats = per_cpu_ptr(pcpu_lstats, smp_processor_id());
lb_stats->bytes += skb->len;
lb_stats->packets++;
// hand the skb to the receive path
netif_rx(skb);
return 0;
}首先是skb_orphan
skb_orphan在include/linux/skbuff.h中
/*
 * skb_orphan - cut the link between an skb and its owner.
 *
 * Runs the destructor callback when one is installed, then clears both
 * the socket pointer and the destructor.
 */
static inline void skb_orphan(struct sk_buff *skb)
{
	if (skb->destructor != NULL) {
		skb->destructor(skb);
	}

	skb->sk = NULL;
	skb->destructor = NULL;
}我们的skb是有destructor操作的,他是sock_wfree
sock_wfree在/net/core/sock.c中
/*
 * sock_wfree - skb destructor for buffers charged to a socket's send side.
 *
 * Subtracts skb->truesize from the owner's sk_wmem_alloc, invokes the
 * sk_write_space callback unless SOCK_USE_WRITE_QUEUE is set, and finally
 * calls sock_put() on the socket.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *owner = skb->sk;

	/* Give the memory back first: a writer may be waiting for it. */
	atomic_sub(skb->truesize, &owner->sk_wmem_alloc);

	if (!sock_flag(owner, SOCK_USE_WRITE_QUEUE)) {
		owner->sk_write_space(owner);
	}

	sock_put(owner);
}首先减去数据空间的大小
然后执行sock_put, sock_put会把sock的引用计数减1,只有当引用计数减到0时才会销毁该sock
回到loopback_xmit,来到skb->protocol = eth_type_trans(skb,dev)
eth_type_trans在/net/ethernet/eth.c中
/*
 * eth_type_trans - classify a received Ethernet frame.
 *
 * Side effects on the skb: binds it to dev, marks the current data pointer
 * as the MAC header, pulls ETH_HLEN bytes and sets pkt_type for
 * broadcast, multicast and other-host frames.
 *
 * Returns the frame's h_proto when its host-order value is at least 1536;
 * otherwise ETH_P_802_3 if the payload starts with 0xFFFF, else
 * ETH_P_802_2 (both in network byte order).
 */
__be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev)
{
	struct ethhdr *hdr;
	unsigned char *payload;

	/* Attach the device and step over the link-layer header. */
	skb->dev = dev;
	skb_reset_mac_header(skb);
	skb_pull(skb, ETH_HLEN);
	hdr = eth_hdr(skb);

	if (is_multicast_ether_addr(hdr->h_dest)) {
		/* compare_ether_addr() returns 0 when the addresses match. */
		if (compare_ether_addr(hdr->h_dest, dev->broadcast))
			skb->pkt_type = PACKET_MULTICAST;
		else
			skb->pkt_type = PACKET_BROADCAST;
	} else if (unlikely(compare_ether_addr(hdr->h_dest, dev->dev_addr))) {
		/*
		 * Checked unconditionally; the original kept a disabled
		 * IFF_PROMISC test here.
		 */
		skb->pkt_type = PACKET_OTHERHOST;
	}

	/* Values from 1536 upward are returned as the protocol itself. */
	if (ntohs(hdr->h_proto) >= 1536)
		return hdr->h_proto;

	/*
	 * Raw IPX over 802.3 (no 802.2 LLC layer) is spotted by a leading
	 * FFFF, which is not a used 802.2 SSAP/DSAP.
	 */
	payload = skb->data;
	if (*(unsigned short *)payload == 0xFFFF)
		return htons(ETH_P_802_3);

	/* Everything else is treated as real 802.2 LLC. */
	return htons(ETH_P_802_2);
}
skb_reset_mac_header设置mac_header为data所指的位置
skb_pull弹出硬件报头
结构图如下
回到loopback_xmit中
关于pcpu_lstats这个结构,从代码上看它是lo设备的每CPU统计数据,这里只是累加当前CPU处理的字节数(bytes)和包数(packets),与主流程无关,跳过吧
之后就剩netif_rx了, netif_rx是数据包的接收函数,到了这里发送流程就结束了
接下来是第4部分接收ICMP包,为什么先说第4部分呢
因为从逻辑上来说,发送了ICMP包后需要等待对方机器的应答,本机应当先做好接收准备;而ping本机时则是先产生应答,后做接收准备。这里我们按照逻辑上的顺序进行分析
ICMP包的接收
recvfrom(sockfd,recvpacket,sizeof(recvpacket),0,(struct sockaddr *)&from,&fromlen)
回到sys_socketcall
这次的目标是case SYS_RECVFROM
sys_recvfrom在/net/socket.c中
/*
 * sys_recvfrom - receive one message on socket fd into the user buffer ubuf.
 *
 * Wraps ubuf/size in a single-element iovec and msghdr, receives through
 * sock_recvmsg() and, when the caller supplied addr, copies the sender
 * address out with move_addr_to_user().
 *
 * Returns the sock_recvmsg() result, replaced by the move_addr_to_user()
 * error when the address copy fails. If fd does not resolve to a socket,
 * returns err as left by sockfd_lookup_light().
 */
asmlinkage long sys_recvfrom(int fd, void __user *ubuf, size_t size,
unsigned flags, struct sockaddr __user *addr,
int __user *addr_len)
{
struct socket *sock;
struct iovec iov;
struct msghdr msg;
char address[MAX_SOCK_ADDR];
int err, err2;
int fput_needed;
// look up the socket that belongs to the file descriptor
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_iovlen = 1;
msg.msg_iov = &iov;
// size of the user-space receive buffer
iov.iov_len = size;
// start address of the user-space receive buffer
iov.iov_base = ubuf;
// kernel-side buffer that will hold the peer address
msg.msg_name = address;
msg.msg_namelen = MAX_SOCK_ADDR;
// a file opened O_NONBLOCK makes this a non-waiting receive
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
// receive the data
err = sock_recvmsg(sock, &msg, size, flags);
if (err >= 0 && addr != NULL)
{
// copy the peer address out to user space
err2 = move_addr_to_user(address, msg.msg_namelen, addr, addr_len);
if (err2 < 0)
err = err2;
}
fput_light(sock->file, fput_needed);
out:
return err;
}进入sock_recvmsg
sock_recvmsg在/net/socket.c中
/*
 * sock_recvmsg - synchronous receive on a socket.
 *
 * Sets up an on-stack kiocb whose private pointer refers to an on-stack
 * sock_iocb, runs __sock_recvmsg() with it and, if that reports
 * -EIOCBQUEUED, waits for completion via wait_on_sync_kiocb().
 *
 * Returns the receive result, or the wait result when the request had
 * been queued.
 */
int sock_recvmsg(struct socket *sock, struct msghdr *msg,
		 size_t size, int flags)
{
	struct sock_iocb sock_ctx;
	struct kiocb sync_req;
	int result;

	init_sync_kiocb(&sync_req, NULL);
	sync_req.private = &sock_ctx;

	result = __sock_recvmsg(&sync_req, sock, msg, size, flags);
	if (result == -EIOCBQUEUED) {
		/* The request was queued: block until it completes. */
		result = wait_on_sync_kiocb(&sync_req);
	}

	return result;
}继续深入__sock_recvmsg
__sock_recvmsg在/net/socket.c中
/*
 * __sock_recvmsg - record the request in the sock_iocb and dispatch it.
 *
 * Fills the sock_iocb obtained from iocb with the call parameters, asks
 * security_socket_recvmsg() for permission and, when that returns 0,
 * calls the socket's ops->recvmsg handler.
 *
 * Returns the non-zero security result, or the handler's return value.
 */
static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock,
				 struct msghdr *msg, size_t size, int flags)
{
	struct sock_iocb *ctx = kiocb_to_siocb(iocb);
	int denied;

	ctx->sock = sock;
	ctx->scm = NULL;
	ctx->msg = msg;
	ctx->size = size;
	ctx->flags = flags;

	denied = security_socket_recvmsg(sock, msg, size, flags);
	if (denied != 0) {
		return denied;
	}

	/* Run the protocol family's receive handler. */
	return sock->ops->recvmsg(iocb, sock, msg, size, flags);
}sock->ops->recvmsg为sock_common_recvmsg
sock_common_recvmsg在/net/core/sock.c中
/*
 * sock_common_recvmsg - forward a receive to the protocol's recvmsg hook.
 *
 * MSG_DONTWAIT is split out of flags and passed as the separate noblock
 * argument; the remaining flags go through unchanged. When the hook
 * returns a non-negative value, the address length it reported is stored
 * in msg->msg_namelen.
 *
 * Returns whatever the protocol hook returned.
 */
int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int name_len = 0;
	int result;

	result = sk->sk_prot->recvmsg(iocb, sk, msg, size,
				      flags & MSG_DONTWAIT,
				      flags & ~MSG_DONTWAIT,
				      &name_len);
	if (result < 0) {
		return result;
	}

	msg->msg_namelen = name_len;
	return result;
}sk->sk_prot->recvmsg为raw_recvmsg
raw_recvmsg在/net/ipv4/raw.c中
/*
 * raw_recvmsg - recvmsg hook for raw IPv4 sockets.
 *
 * Takes one datagram from the socket via skb_recv_datagram(), copies at
 * most len bytes of it into msg->msg_iov and, when msg->msg_name is set,
 * fills it with the sender's IPv4 address taken from the IP header.
 * MSG_OOB is rejected with -EOPNOTSUPP; MSG_ERRQUEUE is served by
 * ip_recv_error().
 *
 * Returns a non-zero err if one was set, otherwise the number of bytes
 * copied (or the full datagram length when MSG_TRUNC was requested).
 */
static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
size_t len, int noblock, int flags, int *addr_len)
{
struct inet_sock *inet = inet_sk(sk);
size_t copied = 0;
int err = -EOPNOTSUPP;
struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name;
struct sk_buff *skb;
if (flags & MSG_OOB)
goto out;
// does the caller want the address length reported?
if (addr_len)
// report the size of struct sockaddr_in
*addr_len = sizeof(*sin);
if (flags & MSG_ERRQUEUE)
{
err = ip_recv_error(sk, msg, len);
goto out;
}
// fetch one datagram from the socket
skb = skb_recv_datagram(sk, flags, noblock, &err);
if (!skb)
goto out;
// number of bytes available for copying
copied = skb->len;
// is the caller's buffer smaller than the datagram?
if (len < copied)
{
msg->msg_flags |= MSG_TRUNC;
// copy only as much as the caller asked for
copied = len;
}
// copy the payload into the caller's iovec
err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
if (err)
goto done;
sock_recv_timestamp(msg, sk, skb);
/* Copy the address. */
// fill in the sender address when a name buffer was supplied
if (sin)
{
sin->sin_family = AF_INET;
sin->sin_addr.s_addr = ip_hdr(skb)->saddr;
sin->sin_port = 0;
memset(&sin->sin_zero, 0, sizeof(sin->sin_zero));
}
if (inet->cmsg_flags)
ip_cmsg_recv(msg, skb);
if (flags & MSG_TRUNC)
// with MSG_TRUNC requested, report the full datagram length
copied = skb->len;
done:
skb_free_datagram(sk, skb);
out:
if (err)
return err;
return copied;
}skb_recv_datagram负责数据的接收
skb_recv_datagram在/net/core/datagram.c中
/*
 * skb_recv_datagram - convenience wrapper around __skb_recv_datagram().
 *
 * Folds the separate noblock argument back into the flag word as
 * MSG_DONTWAIT and discards the "peeked" output of the inner call.
 *
 * Returns the inner call's result.
 */
struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags,
				  int noblock, int *err)
{
	int peeked_unused;
	unsigned wait_bit = noblock ? MSG_DONTWAIT : 0;

	return __skb_recv_datagram(sk, flags | wait_bit, &peeked_unused, err);
}进入__skb_recv_datagram
__skb_recv_datagram在/net/core/datagram.c中
/*
 * __skb_recv_datagram - take (or peek at) the first skb on the socket's
 * receive queue, waiting for one to arrive if necessary.
 *
 * flags:  MSG_PEEK leaves the skb queued, marks it peeked and takes an
 *         extra reference; MSG_DONTWAIT is passed to sock_rcvtimeo() to
 *         obtain the wait budget.
 * peeked: receives the skb's previous peeked state.
 * err:    receives the error when NULL is returned.
 *
 * Returns the skb, or NULL with *err set: to a pending socket error, to
 * -EAGAIN when there is no wait budget, or to whatever wait_for_packet()
 * stored when it ended the wait.
 */
struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned flags,
int *peeked, int *err)
{
struct sk_buff *skb;
long timeo;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
*/
int error = sock_error(sk);
if (error)
goto no_packet;
// obtain the receive timeout (depends on whether MSG_DONTWAIT is set)
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
*
* Look at current nfs client by the way...
* However, this function was corrent in any case. 8)
*/
unsigned long cpu_flags;
// take the queue spinlock, saving the interrupt state in cpu_flags and disabling local interrupts
spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags);
// look at the first skb on the receive queue without removing it
skb = skb_peek(&sk->sk_receive_queue);
// was there one?
if (skb)
{
*peeked = skb->peeked;
// MSG_PEEK: leave the skb on the queue
if (flags & MSG_PEEK)
{
// mark it as having been peeked
skb->peeked = 1;
// take an extra reference for the caller
atomic_inc(&skb->users);
}
else
// remove the skb from the receive queue
__skb_unlink(skb, &sk->sk_receive_queue);
}
// release the spinlock and restore the interrupt state saved in cpu_flags
spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags);
// got a packet?
if (skb)
// return it to the caller
return skb;
/* User doesn't want to wait */
error = -EAGAIN;
// no wait budget left
if (!timeo)
// give up without waiting
goto no_packet;
} while (!wait_for_packet(sk, err, &timeo));
return NULL;
no_packet:
*err = error;
return NULL;
}这个do_while循环负责等待skb数据包的到来,同时也可以发现,一次只返回一个skb
wait_for_packet用于等待skb的到来
wait_for_packet在/net/core/datagram.c中
/*
 * wait_for_packet - wait until the receive queue may have data.
 *
 * Registers the current task on sk->sk_sleep, then checks every reason
 * not to sleep (socket error, queue already non-empty, receive side shut
 * down, connection-based socket that is not connected, pending signal).
 * Only if none applies does it sleep in schedule_timeout(), updating
 * *timeo_p with the remaining time.
 *
 * Returns 0 when the caller should look at the queue again. A non-zero
 * return ends the caller's wait loop; *err then holds the error, or 0 in
 * the receive-shutdown case.
 */
static int wait_for_packet(struct sock *sk, int *err, long *timeo_p)
{
int error;
DEFINE_WAIT(wait);
// queue this task on sk->sk_sleep in state TASK_INTERRUPTIBLE; the actual sleep is schedule_timeout() below
prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
/* Socket errors? */
// does the socket have a pending error?
error = sock_error(sk);
if (error)
goto out_err;
// data already queued: no need to sleep
if (!skb_queue_empty(&sk->sk_receive_queue))
goto out;
/* Socket shut down? */
// has the receive side of the socket been shut down?
if (sk->sk_shutdown & RCV_SHUTDOWN)
goto out_noerr;
/* Sequenced packets can come disconnected.
* If so we report the problem
*/
error = -ENOTCONN;
if (connection_based(sk) &&
!(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN))
goto out_err;
/* handle signals */
// non-zero means the current task has a signal to handle
if (signal_pending(current))
goto interrupted;
error = 0;
*timeo_p = schedule_timeout(*timeo_p);
out:
// leave the wait queue again
finish_wait(sk->sk_sleep, &wait);
return error;
interrupted:
error = sock_intr_errno(*timeo_p);
out_err:
*err = error;
goto out;
out_noerr:
// shutdown is not an error, but the caller must stop waiting
*err = 0;
error = 1;
goto out;
}
最主要就是prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE)这一句代码,它把当前进程挂到sk_sleep等待队列上
之后执行到schedule_timeout时进程才真正开始睡眠,怎么唤醒呢?
我们只要记住,之后会在某个地方唤醒sk_sleep上的等待进程就行了
好~ 现在到第3部分接收ICMP包并返回应答了
从我们之前停下来的netif_rx开始看起
netif_rx在/net/core/dev.c中
/*
 * netif_rx - queue a received skb on this CPU's backlog.
 *
 * Packets claimed by netpoll are reported as dropped without further
 * processing. Otherwise the skb gets a timestamp if it has none and, with
 * local interrupts disabled, is appended to the per-CPU input_pkt_queue.
 * When the queue was empty the backlog NAPI instance is scheduled first.
 * If the queue length already exceeds netdev_max_backlog the skb is
 * counted as dropped and freed.
 *
 * Returns NET_RX_SUCCESS when queued, NET_RX_DROP otherwise.
 */
int netif_rx(struct sk_buff *skb)
{
	struct softnet_data *sd;
	unsigned long irq_state;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	/* Stamp the packet unless it already carries a timestamp. */
	if (!skb->tstamp.tv64)
		net_timestamp(skb);

	local_irq_save(irq_state);

	sd = &__get_cpu_var(softnet_data);
	__get_cpu_var(netdev_rx_stat).total++;

	/* Backlog over the limit: account the drop and free the skb. */
	if (!(sd->input_pkt_queue.qlen <= netdev_max_backlog)) {
		__get_cpu_var(netdev_rx_stat).dropped++;
		local_irq_restore(irq_state);
		kfree_skb(skb);
		return NET_RX_DROP;
	}

	/* An empty queue means the backlog poller must be scheduled first. */
	if (!sd->input_pkt_queue.qlen)
		napi_schedule(&sd->backlog);

	/* Hold the device while the skb sits on the queue. */
	dev_hold(skb->dev);
	__skb_queue_tail(&sd->input_pkt_queue, skb);
	local_irq_restore(irq_state);
	return NET_RX_SUCCESS;
}
关于NAPI的工作机制在PING本机中是无法体现出来的,所以也无法分析,目前知道调用了napi_schedule即可
napi_schedule在/include/linux/netdevice.h中
/*
 * napi_schedule - schedule a NAPI instance for polling if allowed.
 *
 * Calls __napi_schedule() only when napi_schedule_prep() returns
 * non-zero; otherwise does nothing.
 */
static inline void napi_schedule(struct napi_struct *n)
{
	if (!napi_schedule_prep(n)) {
		return;
	}

	__napi_schedule(n);
}napi_schedule_prep负责NAPI工作状态的检测
napi_schedule_prep在/include/linux/netdevice.h中
/*
 * napi_schedule_prep - try to claim a NAPI instance for scheduling.
 *
 * Returns 0 when napi_disable_pending() is true, without touching the
 * state. Otherwise sets NAPI_STATE_SCHED with test_and_set_bit() and
 * returns 1 only if the bit was previously clear.
 */
static inline int napi_schedule_prep(struct napi_struct *n)
{
	if (napi_disable_pending(n)) {
		return 0;
	}

	return !test_and_set_bit(NAPI_STATE_SCHED, &n->state);
}然后接着到__napi_schedule
__napi_schedule在/net/core/dev.c中
/*
 * __napi_schedule - put a NAPI instance on this CPU's poll list.
 *
 * With local interrupts disabled, appends n to the tail of the per-CPU
 * softnet_data poll_list and raises NET_RX_SOFTIRQ.
 */
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
// save the interrupt state and disable local interrupts
local_irq_save(flags);
// append this napi_struct to the poll_list of the current CPU's softnet_data
list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
// raise the NET_RX_SOFTIRQ soft interrupt
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
// restore the interrupt state saved above
local_irq_restore(flags);
}
NET_RX_SOFTIRQ是一个软中断,触发这个软中断会执行一个指定的函数,这个函数在软中断注册时作为参数传入
NET_RX_SOFTIRQ的注册在net_dev_init中
open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
可以看见是注册了net_rx_action这个函数,所以来到net_rx_action
net_rx_action在/net/core/dev.c中
/*
 * net_rx_action - NET_RX_SOFTIRQ handler.
 *
 * Walks the napi_struct entries on this CPU's softnet_data poll_list and
 * calls each entry's ->poll() with its weight, charging the work done
 * against netdev_budget. Stops when the list is empty, or when the budget
 * is used up or jiffies has advanced; in that case time_squeeze is
 * counted and the softirq is raised again.
 */
static void net_rx_action(struct softirq_action *h)
{
struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
unsigned long start_time = jiffies;
int budget = netdev_budget;
void *have;
// disable local interrupts
local_irq_disable();
// iterate over the napi_struct entries queued on the poll list
while (!list_empty(list))
{
struct napi_struct *n;
int work, weight;
/* If softirq window is exhuasted then punt.
*
* Note that this is a slight policy change from the
* previous NAPI code, which would allow up to 2
* jiffies to pass before breaking out. The test
* used to be "jiffies - start_time > 1".
*/
if (unlikely(budget <= 0 || jiffies != start_time))
goto softnet_break;
// re-enable local interrupts
local_irq_enable();
/* Even though interrupts have been re-enabled, this
* access is safe because interrupts can only add new
* entries to the tail of this list, and only ->poll()
* calls can remove this head entry from the list.
*/
// take the napi_struct at the head of the list
n = list_entry(list->next, struct napi_struct, poll_list);
have = netpoll_poll_lock(n);
weight = n->weight;
/* This NAPI_STATE_SCHED test is for avoiding a race
* with netpoll's poll_napi(). Only the entity which
* obtains the lock and sees NAPI_STATE_SCHED set will
* actually make the ->poll() call. Therefore we avoid
* accidently calling ->poll() when NAPI is not scheduled.
*/
work = 0;
// only poll an instance that is in the scheduled state
if (test_bit(NAPI_STATE_SCHED, &n->state))
// run the poll callback
work = n->poll(n, weight);
WARN_ON_ONCE(work > weight);
budget -= work;
// disable local interrupts again
local_irq_disable();
/* Drivers must not modify the NAPI state if they
* consume the entire weight. In such cases this code
* still "owns" the NAPI instance and therefore can
* move the instance around on the list at-will.
*/
if (unlikely(work == weight))
{
if (unlikely(napi_disable_pending(n)))
__napi_complete(n);
else
list_move_tail(&n->poll_list, list);
}
netpoll_poll_unlock(have);
}
out:
// re-enable local interrupts
local_irq_enable();
#ifdef CONFIG_NET_DMA
/*
* There may not be any more sk_buffs coming right now, so push
* any pending DMA copies to hardware
*/
if (!cpus_empty(net_dma.channel_mask)) {
int chan_idx;
for_each_cpu_mask(chan_idx, net_dma.channel_mask) {
struct dma_chan *chan = net_dma.channels[chan_idx];
if (chan)
dma_async_memcpy_issue_pending(chan);
}
}
#endif
return;
softnet_break:
// out of budget or time: count the squeeze and ask for another softirq run
__get_cpu_var(netdev_rx_stat).time_squeeze++;
__raise_softirq_irqoff(NET_RX_SOFTIRQ);
goto out;
}这里是历遍所有的softnet_data结构,执行其poll操作
这里的poll操作为process_backlog,随着网卡和其驱动的不同,poll操作也不同
不过我们的lo设备使用的为process_backlog
process_backlog在/net/core/dev.c中
/*
 * process_backlog - poll callback that drains this CPU's input_pkt_queue.
 *
 * Dequeues skbs one at a time (with local interrupts disabled only around
 * the dequeue), passes each to netif_receive_skb() and calls dev_put() on
 * its device. When the queue is found empty the NAPI instance is
 * completed and the loop ends.
 *
 * Returns the number of skbs processed; the loop also stops once that
 * number reaches quota or jiffies has advanced.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
int work = 0;
struct softnet_data *queue = &__get_cpu_var(softnet_data);
unsigned long start_time = jiffies;
napi->weight = weight_p;
do {
struct sk_buff *skb;
struct net_device *dev;
// disable local interrupts
local_irq_disable();
// remove and return the first skb on input_pkt_queue
skb = __skb_dequeue(&queue->input_pkt_queue);
// queue empty?
if (!skb)
{
// nothing left to process: mark this napi_struct as complete
__napi_complete(napi);
// re-enable local interrupts
local_irq_enable();
break;
}
// re-enable local interrupts
local_irq_enable();
// remember the device before the skb is handed on
dev = skb->dev;
// process the skb
netif_receive_skb(skb);
dev_put(dev);
// count the skb and stop once the quota is reached
// or once jiffies has moved past start_time
} while (++work < quota && jiffies == start_time);
return work;
}
ip_packet_type的结构如下
/*
 * Packet-type descriptor for IPv4: .type is ETH_P_IP in network byte
 * order, .func is ip_rcv(), and the two gso_* hooks point at the inet
 * GSO helpers.
 */
static struct packet_type ip_packet_type = {
	.type		= __constant_htons(ETH_P_IP),
	.func		= ip_rcv,
	.gso_segment	= inet_gso_segment,
	.gso_send_check	= inet_gso_send_check,
};
在netif_receive_skb中,首先是复位network_header和transport_header
然后计算硬件帧报头的大小
完成后的结构图如下
ip_packet_type是注册在ptype_base下面的
deliver_skb也是间接的调用了pt_prev->func(skb, skb->dev, pt_prev, orig_dev)
所以最后还是会走入到ip_rcv中
ip_rcv在/net/ipv4/ip_input.c中
/*
 * ip_rcv - entry point for received IPv4 packets.
 *
 * Drops packets addressed to other hosts, unshares the skb, validates the
 * IP header (length, version, header checksum, total length), trims the
 * skb to the length given in the header, clears the IP control block and
 * passes the packet through the NF_INET_PRE_ROUTING hook to
 * ip_rcv_finish().
 *
 * Returns the NF_HOOK result for accepted packets, NET_RX_DROP otherwise.
 * The skb is freed on every drop path except a failed skb_share_check(),
 * where skb has already been set to NULL.
 */
int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
struct iphdr *iph;
u32 len;
/* When the interface is in promisc. mode, drop all the crap
* that it receives, do not try to analyse it.
*/
// check the packet type: frames for other hosts are dropped
if (skb->pkt_type == PACKET_OTHERHOST)
goto drop;
// count the received IP packet
IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES);
// skb_share_check() may hand back a different skb, or NULL on failure
if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
{
IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
goto out;
}
// make sure a basic IP header (without options) can be accessed
if (!pskb_may_pull(skb, sizeof(struct iphdr)))
goto inhdr_error;
// get the IP header
iph = ip_hdr(skb);
/*
* RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
*
* Is the datagram acceptable?
*
* 1. Length at least the size of an ip header
* 2. Version of 4
* 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums]
* 4. Doesn't have a bogus length
*/
// ihl counts 4-byte units (see ihl*4 below): it must be at least 5
// and the version must be 4
if (iph->ihl < 5 || iph->version != 4)
goto inhdr_error;
// make sure the full IP header (including options) can be accessed
if (!pskb_may_pull(skb, iph->ihl*4))
goto inhdr_error;
// fetch the IP header again after the second pull check
iph = ip_hdr(skb);
// verify the header checksum
if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
goto inhdr_error;
// total length from the header, in host byte order
len = ntohs(iph->tot_len);
// the skb holds less data than the header claims
if (skb->len < len)
{
IP_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS);
goto drop;
}
// the total length is smaller than the header length
else if (len < (iph->ihl*4))
goto inhdr_error;
/* Our transport medium may have padded the buffer out. Now we know it
* is IP we can trim to the true length of the frame.
* Note this now means skb->len holds ntohs(iph->tot_len).
*/
// trim the skb to len
if (pskb_trim_rcsum(skb, len))
{
IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS);
goto drop;
}
/* Remove any debris in the socket control block */
// zero the IP control block
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
// pass the validated skb on through the PRE_ROUTING hook
return NF_HOOK(PF_INET, NF_INET_PRE_ROUTING, skb, dev, NULL,
ip_rcv_finish);
inhdr_error:
IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
drop:
kfree_skb(skb);
out:
return NET_RX_DROP;
}
好~ 合格~ 上一层是ip_rcv_finish
ip_rcv_finish在/net/ipv4/ip_input.c中
/*
 * ip_rcv_finish - second stage of IPv4 receive, run after PRE_ROUTING.
 *
 * Attaches a route to the skb via ip_route_input() if it has none,
 * processes IP options when the header is longer than 5 words, updates
 * the multicast/broadcast counters and hands the packet to dst_input().
 *
 * Returns the dst_input() result, or NET_RX_DROP after freeing the skb
 * when routing or option processing fails.
 */
static int ip_rcv_finish(struct sk_buff *skb)
{
// get the IP header
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
/*
* Initialise the virtual path cache for the packet. It describes
* how the packet travels inside Linux networking.
*/
// no dst entry attached to the skb yet?
if (skb->dst == NULL)
{
// perform the input route lookup
int err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
skb->dev);
if (unlikely(err))
{
if (err == -EHOSTUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS);
else if (err == -ENETUNREACH)
IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
goto drop;
}
}
#ifdef CONFIG_NET_CLS_ROUTE
if (unlikely(skb->dst->tclassid)) {
struct ip_rt_acct *st = per_cpu_ptr(ip_rt_acct, smp_processor_id());
u32 idx = skb->dst->tclassid;
st[idx&0xFF].o_packets++;
st[idx&0xFF].o_bytes+=skb->len;
st[(idx>>16)&0xFF].i_packets++;
st[(idx>>16)&0xFF].i_bytes+=skb->len;
}
#endif
// ihl > 5 means the header is longer than the 5-word minimum:
// process the IP options, and drop the packet if that fails
if (iph->ihl > 5 && ip_rcv_options(skb))
goto drop;
// get the route attached to the skb
rt = skb->rtable;
// multicast route?
if (rt->rt_type == RTN_MULTICAST)
// bump the multicast counter
IP_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS);
// broadcast route?
else if (rt->rt_type == RTN_BROADCAST)
// bump the broadcast counter
IP_INC_STATS_BH(IPSTATS_MIB_INBCASTPKTS);
return dst_input(skb);
drop:
kfree_skb(skb);
return NET_RX_DROP;
}继续上一层,到dst_input
dst_input在/include/net/dst.h中
/*
 * dst_input - run the input handler of the skb's dst entry.
 *
 * The handler is called again for as long as it returns
 * NET_XMIT_BYPASS; any other result, including 0, ends the loop.
 *
 * Returns the last handler result.
 */
static inline int dst_input(struct sk_buff *skb)
{
	int rc;

	do {
		rc = skb->dst->input(skb);
	} while (unlikely(rc != 0) && rc == NET_XMIT_BYPASS);

	return rc;
}这里dst->input为ip_local_deliver
ip_local_deliver在/net/ipv4/ip_input.c中
/*
 * ip_local_deliver - deliver an IPv4 packet destined for this host.
 *
 * A packet with IP_MF set or a non-zero fragment offset is first given to
 * ip_defrag(); when that returns non-zero the function returns 0 without
 * going further. Otherwise the packet passes through the
 * NF_INET_LOCAL_IN hook to ip_local_deliver_finish().
 *
 * Returns 0 in the ip_defrag() case, otherwise the NF_HOOK result.
 */
int ip_local_deliver(struct sk_buff *skb)
{
	/* Reassemble IP fragments. */
	if ((ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) &&
	    ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER))
		return 0;

	return NF_HOOK(PF_INET, NF_INET_LOCAL_IN, skb, skb->dev, NULL,
		       ip_local_deliver_finish);
}继续上一层ip_local_deliver_finish
ip_local_deliver_finish在/net/ipv4/ip_input.c中
/*
 * ip_local_deliver_finish - hand a local IPv4 packet to its protocol.
 *
 * Pulls the IP header, resets the transport header and, under
 * rcu_read_lock(), offers the packet to raw sockets via
 * raw_local_deliver() and then to the handler registered in inet_protos[]
 * for the header's protocol number. A negative handler return is treated
 * as a request to resubmit the skb with protocol -ret. With no usable
 * handler and no raw delivery, an ICMP protocol-unreachable is sent
 * (subject to the xfrm policy check) and the skb is freed.
 *
 * Always returns 0.
 */
static int ip_local_deliver_finish(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
// pull the IP header off the front of the data
__skb_pull(skb, ip_hdrlen(skb));
/* Point into the IP datagram, just past the header. */
// reset the transport-layer header pointer
skb_reset_transport_header(skb);
// enter the RCU read-side critical section
rcu_read_lock();
{
// protocol number from the IP header
int protocol = ip_hdr(skb)->protocol;
int hash, raw;
struct net_protocol *ipprot;
resubmit:
// offer the packet to the raw layer
raw = raw_local_deliver(skb, protocol);
// hash slot for this protocol number
hash = protocol & (MAX_INET_PROTOS - 1);
// look up the protocol descriptor
ipprot = rcu_dereference(inet_protos[hash]);
// a handler exists, and either this is init_net
// or the protocol has netns_ok set
if (ipprot != NULL && (net == &init_net || ipprot->netns_ok))
{
int ret;
if (!ipprot->no_policy)
{
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
{
kfree_skb(skb);
goto out;
}
nf_reset(skb);
}
// hand the skb to the protocol handler
ret = ipprot->handler(skb);
// negative return: resubmit as protocol -ret
if (ret < 0)
{
protocol = -ret;
goto resubmit;
}
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
}
else
{
// neither a protocol handler nor a raw delivery took the packet
if (!raw)
{
if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
{
IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS);
icmp_send(skb, ICMP_DEST_UNREACH,
ICMP_PROT_UNREACH, 0);
}
}
else
IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS);
kfree_skb(skb);
}
}
out:
// leave the RCU read-side critical section
rcu_read_unlock();
return 0;
}