Linux TCP/IP 协议栈学习(2)—— 数据帧收发主要函数及net_device 结构

 
 
 
 
 
/**
* netif_rx - post buffer to the network code
* @skb: buffer to post
*
* This function receives a packet from a device driver and queues it for
* the upper (protocol) levels to process. It always succeeds. The buffer
* may be dropped during processing for congestion control or by the
* protocol layers.
*
* return values:
* NET_RX_SUCCESS (no congestion)
* NET_RX_DROP (packet was dropped)
*
*/
int netif_rx( struct sk_buff * skb)
{
struct softnet_data * queue;
unsigned long flags ;
/* if netpoll wants it, pretend we never saw it */
if (netpoll_rx (skb ))
return NET_RX_DROP;
if (!skb->tstamp .tv64 ) //stamp the frame with its reception time, if the driver has not already done so
net_timestamp(skb);
/*
* The code is rearranged so that the path is the most
* short when CPU is congested, but is still operating.
*/
local_irq_save(flags);
queue = &__get_cpu_var (softnet_data );//fetch this CPU's softnet_data (IRQs are off, so the CPU cannot change under us)
__get_cpu_var(netdev_rx_stat ).total ++;//bump the per-CPU count of received frames
if (queue->input_pkt_queue .qlen <= netdev_max_backlog) {
//check whether the backlog queue still has room; if it is full the network is badly congested, so an error is returned and the CPU drops any further frames.
if (queue->input_pkt_queue .qlen ) {
enqueue:
//append the frame to this CPU's softnet_data input queue
__skb_queue_tail(&queue ->input_pkt_queue, skb);
local_irq_restore(flags);
return NET_RX_SUCCESS;
}
//An empty queue means it has not yet been scheduled by the softirq, so add it to the softirq poll list. Note that what is scheduled is the backlog NAPI instance: since netif_rx() is called by non-NAPI drivers, the backlog's poll callback is the process_backlog function installed at init time.
napi_schedule(&queue ->backlog);
goto enqueue;
}
__get_cpu_var(netdev_rx_stat ).dropped ++;
local_irq_restore(flags);
kfree_skb(skb);
return NET_RX_DROP;
}
// 上面代码中用到一个关键的数据结构 softnet_data , 在网卡收发数据的时候,需要维护一个缓冲区队列,来缓存可能存在的突发数据,在协议栈中用一个队列层来表示该缓冲区,队列层位于数据链路层和网络层之间。softnet_data 就是数据链路层中的数据结构,它是一个Per-CPU变量,每个CPU都有一个
/**
* netif_receive_skb - process receive buffer from network
* @skb: buffer to process
*
* netif_receive_skb() is the main receive data processing function.
* It always succeeds. The buffer may be dropped during processing
* for congestion control or by the protocol layers.
*
* This function may only be called from softirq context and interrupts
* should be enabled.
*
* Return values (usually ignored):
* NET_RX_SUCCESS: no congestion
* NET_RX_DROP: packet was dropped
*/
//netif_receive_skb is the NAPI counterpart of netif_rx: it hands one packet up to the kernel. Once a NAPI-compliant driver has exhausted its supply of received packets, it should re-enable interrupts and call netif_rx_complete (now __napi_complete()) to stop polling.
int netif_receive_skb( struct sk_buff * skb)
{
struct packet_type * ptype, *pt_prev ;
struct net_device * orig_dev;
struct net_device * master;
struct net_device * null_or_orig;
struct net_device * null_or_bond;
int ret = NET_RX_DROP;
__be16 type;
if (!skb->tstamp .tv64 )
net_timestamp(skb);
if (vlan_tx_tag_present (skb ) && vlan_hwaccel_do_receive(skb))
return NET_RX_SUCCESS;
/* if we've gotten here through NAPI, check netpoll */
if (netpoll_receive_skb (skb ))
return NET_RX_DROP;
if (!skb->skb_iif )
skb->skb_iif = skb ->dev-> ifindex;// record the index of the interface the frame arrived on
null_or_orig = NULL;
orig_dev = skb->dev;
master = ACCESS_ONCE (orig_dev ->master);
if (master) {
if (skb_bond_should_drop (skb , master ))
null_or_orig = orig_dev ; /* deliver only exact match */
else
skb->dev = master ;
}
__get_cpu_var(netdev_rx_stat ).total ++;
skb_reset_network_header(skb);
skb_reset_transport_header(skb);
skb->mac_len = skb ->network_header - skb->mac_header ;
pt_prev = NULL;
rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
if (skb->tc_verd & TC_NCLS) {
skb->tc_verd = CLR_TC_NCLS( skb->tc_verd );
goto ncls;
}
#endif
// Walk ptype_all and invoke every registered packet_type->func() (protocol taps/sniffers). Handlers are registered with dev_add_pack(); how each packet type maps to its handler is discussed later.
// static struct list_head ptype_all __read_mostly;
list_for_each_entry_rcu(ptype, &ptype_all , list ) {
if (ptype->dev == null_or_orig || ptype->dev == skb-> dev ||
ptype->dev == orig_dev) {
if (pt_prev)
ret = deliver_skb (skb , pt_prev , orig_dev );//deliver the skb to the previously matched handler
pt_prev = ptype;
}
}
#ifdef CONFIG_NET_CLS_ACT
skb = handle_ing (skb , &pt_prev , &ret , orig_dev );
if (!skb)
goto out;
ncls:
#endif
// If the kernel was built with BRIDGE support, the bridging module is run here
skb = handle_bridge (skb , &pt_prev , &ret , orig_dev );
if (!skb)
goto out;
// Only executed when the MAC_VLAN module was selected at kernel build time
skb = handle_macvlan (skb , &pt_prev , &ret , orig_dev );
if (!skb)
goto out;
/*
* Make sure frames received on VLAN interfaces stacked on
* bonding interfaces still make their way to any base bonding
* device that may have registered for a specific ptype. The
* handler may have to adjust skb->dev and orig_dev.
*/
null_or_bond = NULL;
if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
(vlan_dev_real_dev( skb->dev)->priv_flags & IFF_BONDING)) {
null_or_bond = vlan_dev_real_dev (skb ->dev);
}
// Finally, take type = skb->protocol and run every matching packet_type->func() on the hash chain ptype_base[ntohs(type)&15]; the L2 protocol value selects the entry hook — the important ones being ip_rcv() and arp_rcv().
type = skb->protocol ;
list_for_each_entry_rcu(ptype,
&ptype_base[ntohs (type ) & PTYPE_HASH_MASK], list) {
if (ptype->type == type && (ptype ->dev == null_or_orig ||
ptype->dev == skb-> dev || ptype->dev == orig_dev ||
ptype->dev == null_or_bond)) {
if (pt_prev)
ret = deliver_skb (skb , pt_prev , orig_dev );
pt_prev = ptype;
}
}
if (pt_prev) {
// the last matched handler consumes the skb directly (no extra refcount taken for it)
ret = pt_prev ->func( skb, skb->dev, pt_prev , orig_dev );
} else {
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
*/
ret = NET_RX_DROP ;
}
out:
rcu_read_unlock();
return ret;
}
/**
* dev_queue_xmit - transmit a buffer
* @skb: buffer to transmit
*
* Queue a buffer for transmission to a network device. The caller must
* have set the device and priority and built the buffer before calling
* this function. The function can be called from an interrupt.
*
* A negative errno code is returned on a failure. A success does not
* guarantee the frame will be transmitted as it may be dropped due
* to congestion or traffic shaping.
*
* -----------------------------------------------------------------------------------
* I notice this method can also return errors from the queue disciplines,
* including NET_XMIT_DROP, which is a positive value. So, errors can also
* be positive.
*
* Regardless of the return value, the skb is consumed, so it is currently
* difficult to retry a send to this method. (You can bump the ref count
* before sending to hold a reference for retry if you are careful.)
*
* When calling this method, interrupts MUST be enabled. This is because
* the BH enable code must have IRQs enabled so that it will not deadlock.
* --BLG
*/
int dev_queue_xmit( struct sk_buff * skb)
{
struct net_device * dev = skb->dev;
struct netdev_queue * txq;
struct Qdisc * q;
int rc = - ENOMEM;
/* GSO will handle the following emulations directly. */
if (netif_needs_gso (dev , skb ))//skip linearize/checksum emulation if this is a GSO packet and the device supports GSO
goto gso;
/* Convert a paged skb to linear, if required */
if (skb_needs_linearize (skb , dev ) && __skb_linearize(skb))
goto out_kfree_skb;
/* If packet is not checksummed and device does not support
* checksumming for this protocol, complete checksumming here.
*/
if (skb->ip_summed == CHECKSUM_PARTIAL) {
skb_set_transport_header(skb, skb->csum_start -
skb_headroom(skb));
if (!dev_can_checksum (dev , skb ) && skb_checksum_help(skb))
goto out_kfree_skb;
}
gso:
/* Disable soft irqs for various locks below. Also
* stops preemption for RCU.
*/
rcu_read_lock_bh();
txq = dev_pick_tx (dev , skb );
q = rcu_dereference_bh(txq->qdisc );
#ifdef CONFIG_NET_CLS_ACT
skb->tc_verd = SET_TC_AT( skb->tc_verd , AT_EGRESS );
#endif
if (q->enqueue ) {
// device has a queueing discipline: hand the skb to the qdisc path
rc = __dev_xmit_skb (skb , q , dev , txq );
goto out;
}
/* The device has no queue. Common case for software devices:
loopback, all the sorts of tunnels...
Really, it is unlikely that netif_tx_lock protection is necessary
here. (f.e. loopback and IP tunnels are clean ignoring statistics
counters.)
However, it is possible, that they rely on protection
made by us here.
Check this and shot the lock. It is not prone from deadlocks.
Either shot noqueue qdisc, it is even simpler 8)
*/
if (dev->flags & IFF_UP) {
int cpu = smp_processor_id(); /* ok because BHs are off */
if (txq->xmit_lock_owner != cpu) {
HARD_TX_LOCK(dev, txq, cpu);
if (!netif_tx_queue_stopped (txq )) {
rc = dev_hard_start_xmit (skb , dev , txq );
if (dev_xmit_complete (rc )) {
HARD_TX_UNLOCK(dev, txq);
goto out;
}
}
HARD_TX_UNLOCK(dev, txq);
if (net_ratelimit ())
printk(KERN_CRIT "Virtual device %s asks to "
"queue packet!\n" , dev ->name);
} else {
/* Recursion is detected! It is possible,
* unfortunately */
if (net_ratelimit ())
printk(KERN_CRIT "Dead loop on virtual device "
"%s, fix it urgently!\n" , dev ->name);
}
}
rc = -ENETDOWN ;
rcu_read_unlock_bh();
out_kfree_skb:
kfree_skb(skb);
return rc;
out:
rcu_read_unlock_bh();
return rc;
}
数据链路层不得不谈到 struct net_device 相关结构,在2.6.29之后 net_device 结构进行了调整,操作函数被重构到了 net_device_ops 中。下面简要分析一下:
/*
 * struct net_device - kernel-side state for one network interface.
 * One instance exists per interface (eth0, lo, ...). Since 2.6.29 the
 * device operation callbacks live in the separate net_device_ops
 * structure referenced by the netdev_ops field below.
 */
struct net_device
{
/*
This first field, name, is the beginning of the visible part of this structure. It contains the string
that is the name of the interface. By visible, we mean that this part of the data structure is generic
and doesn’t contain any private areas specific to a particular type of device
.
*/
char name[IFNAMSIZ ];
/* device name hash chain */
struct hlist_node name_hlist;
/* snmp alias */
char *ifalias ;
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
*/
unsigned long mem_end; /* shared mem end */
unsigned long mem_start; /* shared mem start */
unsigned long base_addr; /* device I/O address */
unsigned int irq; /* device IRQ number */
/*
* Some hardware also needs these fields, but they are not
* part of the usual set specified in Space.c.
*/
unsigned char if_port; /* Selectable AUI, TP,..*/
unsigned char dma; /* DMA channel */
unsigned long state;
/*
*/
struct list_head dev_list;
struct list_head napi_list;
struct list_head unreg_list;
/* Net device features */
unsigned long features;
/*
* Bit values for the 'features' bitmask above.
*/
#define NETIF_F_SG 1 /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
#define NETIF_F_IPV6_CSUM 16 /* Can checksum TCP/UDP over IPV6 */
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX - deprecated. Please */
/* do not use LLTX in new drivers */
#define NETIF_F_NETNS_LOCAL 8192 /* Does not change network namespaces */
#define NETIF_F_GRO 16384 /* Generic receive offload */
#define NETIF_F_LRO 32768 /* large receive offload */
/* the GSO_MASK reserves bits 16 through 23 */
#define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */
#define NETIF_F_SCTP_CSUM (1 << 25) /* SCTP checksum offload */
#define NETIF_F_FCOE_MTU (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/
#define NETIF_F_NTUPLE (1 << 27) /* N-tuple filters supported */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
#define NETIF_F_GSO_MASK 0x00ff0000
#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
#define NETIF_F_FSO (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
/*
* If one device supports one of these features, then enable them
* for all in netdev_increment_features.
*/
#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
NETIF_F_SG | NETIF_F_HIGHDMA | \
NETIF_F_FRAGLIST )
/* Interface index. Unique device identifier */
int ifindex;
int iflink;
struct net_device_stats stats;
#ifdef CONFIG_WIRELESS_EXT
/* List of functions to handle Wireless Extensions (instead of ioctl).
* See <net/iw_handler.h> for details. Jean II */
const struct iw_handler_def * wireless_handlers;
/* Instance data managed by the core of Wireless Extensions. */
struct iw_public_data * wireless_data;
#endif
/* Management operations */
const struct net_device_ops *netdev_ops ; /* since 2.6.29: all device callbacks (open/stop/xmit/...) live here */
const struct ethtool_ops *ethtool_ops ;
/* Hardware header description */
const struct header_ops *header_ops ;
unsigned int flags; /* interface flags (a la BSD) */
unsigned short gflags;
unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */
unsigned short padded; /* How much padding added by alloc_netdev() */
unsigned char operstate; /* RFC2863 operstate */
unsigned char link_mode; /* mapping policy to operstate */
unsigned mtu; /* interface MTU value */
unsigned short type; /* interface hardware type */
unsigned short hard_header_len; /* hardware hdr length */
/* extra head- and tailroom the hardware may need, but not in all cases
* can this be guaranteed, especially tailroom. Some cases also use
* LL_MAX_HEADER instead to allocate the skb.
*/
unsigned short needed_headroom;
unsigned short needed_tailroom;
struct net_device * master; /* Pointer to master device of a group,
* which this device is member of.
*/
/* Interface address info. */
unsigned char perm_addr[MAX_ADDR_LEN ]; /* permanent hw address */
unsigned char addr_len; /* hardware address length */
unsigned short dev_id; /* for shared network cards */
struct netdev_hw_addr_list uc; /* Secondary unicast
mac addresses */
int uc_promisc;
spinlock_t addr_list_lock ;
struct dev_addr_list *mc_list; /* Multicast mac addresses */
int mc_count; /* Number of installed mcasts */
unsigned int promiscuity;
unsigned int allmulti;
/* Protocol specific pointers */
#ifdef CONFIG_NET_DSA
void *dsa_ptr ; /* dsa specific data */
#endif
void *atalk_ptr ; /* AppleTalk link */
void *ip_ptr ; /* IPv4 specific data */
void *dn_ptr ; /* DECnet specific data */
void *ip6_ptr ; /* IPv6 specific data */
void *ec_ptr ; /* Econet specific data */
void *ax25_ptr ; /* AX.25 specific data */
struct wireless_dev *ieee80211_ptr ; /* IEEE 802.11 specific data,
assign before registering */
/*
* Cache line mostly used on receive path (including eth_type_trans())
*/
unsigned long last_rx; /* Time of last Rx */
/* Interface address info used in eth_type_trans() */
unsigned char *dev_addr ; /* hw address, (before bcast
because most packets are
unicast) */
struct netdev_hw_addr_list dev_addrs; /* list of device
hw addresses */
unsigned char broadcast[MAX_ADDR_LEN ]; /* hw bcast add */
struct netdev_queue rx_queue;
struct netdev_queue *_tx ____cacheline_aligned_in_smp ; /* array of TX queues, cache-aligned for SMP */
/* Number of TX queues allocated at alloc_netdev_mq() time */
unsigned int num_tx_queues;
/* Number of TX queues currently active in device */
unsigned int real_num_tx_queues;
/* root qdisc from userspace point of view */
struct Qdisc *qdisc;
unsigned long tx_queue_len; /* Max frames per queue allowed */
spinlock_t tx_global_lock ;
/*
* One part is mostly used on xmit path (device)
*/
/* These may be needed for future network-power-down code. */
/*
* trans_start here is expensive for high speed devices on SMP,
* please use netdev_queue->trans_start instead.
*/
unsigned long trans_start; /* Time (in jiffies) of last Tx */
int watchdog_timeo; /* used by dev_watchdog() */
struct timer_list watchdog_timer;
/* Number of references to this device */
atomic_t refcnt ____cacheline_aligned_in_smp ;
/* delayed register/unregister */
struct list_head todo_list;
/* device index hash chain */
struct hlist_node index_hlist;
struct list_head link_watch_list;
/* register/unregister state machine */
enum { NETREG_UNINITIALIZED =0,
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
NETREG_DUMMY, /* dummy device for NAPI poll */
} reg_state: 16;
enum {
RTNL_LINK_INITIALIZED,
RTNL_LINK_INITIALIZING,
} rtnl_link_state:16;
/* Called from unregister, can be used to call free_netdev */
void (*destructor )(struct net_device *dev );
#ifdef CONFIG_NETPOLL
struct netpoll_info *npinfo;
#endif
#ifdef CONFIG_NET_NS
/* Network namespace this network device is inside */
struct net *nd_net;
#endif
/* mid-layer private */
void *ml_priv ;
/* bridge stuff */
struct net_bridge_port * br_port;
/* macvlan */
struct macvlan_port *macvlan_port ;
/* GARP */
struct garp_port * garp_port;
/* class/net/name entry */
struct device dev;
/* space for optional device, statistics, and wireless sysfs groups */
const struct attribute_group *sysfs_groups [4];
/* rtnetlink link ops */
const struct rtnl_link_ops *rtnl_link_ops ;
/* VLAN feature mask */
unsigned long vlan_features ;
/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE 65536
unsigned int gso_max_size;
#ifdef CONFIG_DCB
/* Data Center Bridging netlink ops */
const struct dcbnl_rtnl_ops *dcbnl_ops ;
#endif
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/* max exchange id for FCoE LRO by ddp */
unsigned int fcoe_ddp_xid;
#endif
/* n-tuple filter list attached to this device */
struct ethtool_rx_ntuple_list ethtool_ntuple_list;
};
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值