网络适配器是通过软件实现的、能够适配大多数网卡的组件。可以这么理解:网络适配器之于网卡,就相当于类之于实例。而虚拟网络适配器并不对应任何物理网卡,也不与PCI总线上的地址绑定。
直接看实现。
模块实现
还是和之前一样,需要insmod插入自己定义的模块。
/*
 * nic_init - module entry point; creates and registers the two virtual NICs.
 *
 * The __init annotation places this function in the kernel's init.text
 * section (not to be confused with a user-space text segment).
 *
 * Two devices are created so that one can transmit while the other receives.
 * nic_dev is a file-scope array (declared outside this block) holding the
 * pointers returned by nic_alloc_netdev().
 *
 * Return: 0 on success, -ENOMEM if either device cannot be allocated, or the
 * error returned by register_netdev(). On failure everything set up so far
 * is torn down again.
 */
static int __init nic_init(void) {
	int ret;
	struct nic_priv *priv;

	pr_info("%s(#%d): install module\n", __func__, __LINE__);

	nic_dev[0] = nic_alloc_netdev();
	if (!nic_dev[0]) {
		printk("%s(#%d): alloc netdev[0] failed\n", __func__, __LINE__);
		return -ENOMEM;
	}
	nic_dev[1] = nic_alloc_netdev();
	if (!nic_dev[1]) {
		printk("%s(#%d): alloc netdev[1] failed\n", __func__, __LINE__);
		/* Without this the function would report success (ret == 0). */
		ret = -ENOMEM;
		goto alloc_2nd_failed;
	}

	/*
	 * Initialise the private data before registration: once a device is
	 * registered it may be opened, and the callbacks pass priv to the
	 * netif_*() logging helpers.
	 */
	priv = netdev_priv(nic_dev[0]);
	priv->msg_enable = DEF_MSG_ENABLE;
	priv = netdev_priv(nic_dev[1]);
	priv->msg_enable = DEF_MSG_ENABLE;

	ret = register_netdev(nic_dev[0]);
	if (ret) {
		printk("%s(#%d): reg net driver failed. ret: %d\n", __func__, __LINE__, ret);
		goto reg1_failed;
	}
	ret = register_netdev(nic_dev[1]);
	if (ret) {
		printk("%s(#%d): reg net driver failed. ret:%d\n", __func__, __LINE__, ret);
		goto reg2_failed;
	}
	return 0;

	/* Unwind in reverse order of acquisition. */
reg2_failed:
	unregister_netdev(nic_dev[0]);
reg1_failed:
	free_netdev(nic_dev[1]);
alloc_2nd_failed:
	free_netdev(nic_dev[0]);
	return ret;
}
/*
 * nic_exit - module exit hook.
 *
 * Walks every slot of nic_dev in index order; each device is unregistered
 * first and then freed.
 */
static void __exit nic_exit(void) {
	unsigned int idx = 0;

	pr_info("%s(#%d): remove module\n", __func__, __LINE__);

	while (idx < ARRAY_SIZE(nic_dev)) {
		struct net_device *ndev = nic_dev[idx++];

		unregister_netdev(ndev);
		free_netdev(ndev);
	}
}
/* Hook nic_init / nic_exit up as the module's load and unload handlers. */
module_init(nic_init);
module_exit(nic_exit);
这里涉及几个函数。
第一个是自己定义的net_device分配函数nic_alloc_netdev(),除了分配之外,还有一些变量赋值的操作。
/*
 * nic_alloc_netdev - allocate and pre-configure one virtual Ethernet device.
 *
 * Allocates a net_device with room for a struct nic_priv, gives it a random
 * MAC address and installs this driver's netdev and header operations.
 * The device is NOT registered here; the caller does that.
 *
 * Return: the new device, or NULL if allocation failed.
 */
static struct net_device *nic_alloc_netdev(void) {
	/* alloc_etherdev() is provided by the kernel; its argument is the
	 * size of the private area wanted alongside the net_device. */
	struct net_device *netdev = alloc_etherdev(sizeof(struct nic_priv));

	if (!netdev) {
		/* Trailing newline added so the message is terminated properly. */
		pr_err("%s(#%d): alloc dev failed\n", __func__, __LINE__);
		return NULL;
	}

	eth_hw_addr_random(netdev);		/* assign a random MAC address */
	netdev->netdev_ops = &nic_netdev_ops;	/* per-device callbacks */
	netdev->flags |= IFF_NOARP;
	/*
	 * NOTE(review): NETIF_F_HW_CSUM is a feature flag advertised to the
	 * stack; presumably it lets the stack leave checksums to the device.
	 * This driver then handles them in software in its transmit path
	 * (skb_copy_and_csum_dev() / ip_fast_csum()) - confirm intent.
	 */
	netdev->features |= NETIF_F_HW_CSUM;
	netdev->header_ops = &nic_header_ops;	/* link-layer header builder */
	return netdev;
}
这里说一下 struct net_device 私有空间的问题。一般思路,是认为结构体中有个成员(如void *)指向一块空间,这块空间为该实例私有,例如之前介绍的inode结构体。然而,事实不是这样。
//英文注释全贴下来了,就不写中文了
/**
* struct net_device - The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
* almost every data structure used in the INET module.
*
* @name: This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
* of the interface.
*
* @name_hlist: Device name hash chain, please keep it close to name[]
* @ifalias: SNMP alias
* @mem_end: Shared memory end
* @mem_start: Shared memory start
* @base_addr: Device I/O address
* @irq: Device IRQ number
*
* @carrier_changes: Stats to monitor carrier on<->off transitions
*
* @state: Generic network queuing layer state, see netdev_state_t
* @dev_list: The global list of network devices
* @napi_list: List entry used for polling NAPI devices
* @unreg_list: List entry when we are unregistering the
* device; see the function unregister_netdev
* @close_list: List entry used when we are closing the device
* @ptype_all: Device-specific packet handlers for all protocols
* @ptype_specific: Device-specific, protocol-specific packet handlers
*
* @adj_list: Directly linked devices, like slaves for bonding
* @features: Currently active device features
* @hw_features: User-changeable features
*
* @wanted_features: User-requested features
* @vlan_features: Mask of features inheritable by VLAN devices
*
* @hw_enc_features: Mask of features inherited by encapsulating devices
* This field indicates what encapsulation
* offloads the hardware is capable of doing,
* and drivers will need to set them appropriately.
*
* @mpls_features: Mask of features inheritable by MPLS
*
* @ifindex: interface index
* @group: The group the device belongs to
*
* @stats: Statistics struct, which was left as a legacy, use
* rtnl_link_stats64 instead
*
* @rx_dropped: Dropped packets by core network,
* do not use this in drivers
* @tx_dropped: Dropped packets by core network,
* do not use this in drivers
* @rx_nohandler: nohandler dropped packets by core network on
* inactive devices, do not use this in drivers
*
* @wireless_handlers: List of functions to handle Wireless Extensions,
* instead of ioctl,
* see <net/iw_handler.h> for details.
* @wireless_data: Instance data managed by the core of wireless extensions
*
* @netdev_ops: Includes several pointers to callbacks,
* if one wants to override the ndo_*() functions
* @ethtool_ops: Management operations
* @ndisc_ops: Includes callbacks for different IPv6 neighbour
* discovery handling. Necessary for e.g. 6LoWPAN.
* @header_ops: Includes callbacks for creating,parsing,caching,etc
* of Layer 2 headers.
*
* @flags: Interface flags (a la BSD)
* @priv_flags: Like 'flags' but invisible to userspace,
* see if.h for the definitions
* @gflags: Global flags ( kept as legacy )
* @padded: How much padding added by alloc_netdev()
* @operstate: RFC2863 operstate
* @link_mode: Mapping policy to operstate
* @if_port: Selectable AUI, TP, ...
* @dma: DMA channel
* @mtu: Interface MTU value
* @min_mtu: Interface Minimum MTU value
* @max_mtu: Interface Maximum MTU value
* @type: Interface hardware type
* @hard_header_len: Maximum hardware header length.
* @min_header_len: Minimum hardware header length
*
* @needed_headroom: Extra headroom the hardware may need, but not in all
* cases can this be guaranteed
* @needed_tailroom: Extra tailroom the hardware may need, but not in all
* cases can this be guaranteed. Some cases also use
* LL_MAX_HEADER instead to allocate the skb
*
* interface address info:
*
* @perm_addr: Permanent hw address
* @addr_assign_type: Hw address assignment type
* @addr_len: Hardware address length
* @neigh_priv_len: Used in neigh_alloc()
* @dev_id: Used to differentiate devices that share
* the same link layer address
* @dev_port: Used to differentiate devices that share
* the same function
* @addr_list_lock: XXX: need comments on this one
* @uc_promisc: Counter that indicates promiscuous mode
* has been enabled due to the need to listen to
* additional unicast addresses in a device that
* does not implement ndo_set_rx_mode()
* @uc: unicast mac addresses
* @mc: multicast mac addresses
* @dev_addrs: list of device hw addresses
* @queues_kset: Group of all Kobjects in the Tx and RX queues
* @promiscuity: Number of times the NIC is told to work in
* promiscuous mode; if it becomes 0 the NIC will
* exit promiscuous mode
* @allmulti: Counter, enables or disables allmulticast mode
*
* @vlan_info: VLAN info
* @dsa_ptr: dsa specific data
* @tipc_ptr: TIPC specific data
* @atalk_ptr: AppleTalk link
* @ip_ptr: IPv4 specific data
* @dn_ptr: DECnet specific data
* @ip6_ptr: IPv6 specific data
* @ax25_ptr: AX.25 specific data
* @ieee80211_ptr: IEEE 802.11 specific data, assign before registering
*
* @dev_addr: Hw address (before bcast,
* because most packets are unicast)
*
* @_rx: Array of RX queues
* @num_rx_queues: Number of RX queues
* allocated at register_netdev() time
* @real_num_rx_queues: Number of RX queues currently active in device
*
* @rx_handler: handler for received packets
* @rx_handler_data: XXX: need comments on this one
* @ingress_queue: XXX: need comments on this one
* @broadcast: hw bcast address
*
* @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts,
* indexed by RX queue number. Assigned by driver.
* This must only be set if the ndo_rx_flow_steer
* operation is defined
* @index_hlist: Device index hash chain
*
* @_tx: Array of TX queues
* @num_tx_queues: Number of TX queues allocated at alloc_netdev_mq() time
* @real_num_tx_queues: Number of TX queues currently active in device
* @qdisc: Root qdisc from userspace point of view
* @tx_queue_len: Max frames per queue allowed
* @tx_global_lock: XXX: need comments on this one
*
* @xps_maps: XXX: need comments on this one
*
* @watchdog_timeo: Represents the timeout that is used by
* the watchdog (see dev_watchdog())
* @watchdog_timer: List of timers
*
* @pcpu_refcnt: Number of references to this device
* @todo_list: Delayed register/unregister
* @link_watch_list: XXX: need comments on this one
*
* @reg_state: Register/unregister state machine
* @dismantle: Device is going to be freed
* @rtnl_link_state: This enum represents the phases of creating
* a new link
*
* @needs_free_netdev: Should unregister perform free_netdev?
* @priv_destructor: Called from unregister
* @npinfo: XXX: need comments on this one
* @nd_net: Network namespace this network device is inside
*
* @ml_priv: Mid-layer private
* @lstats: Loopback statistics
* @tstats: Tunnel statistics
* @dstats: Dummy statistics
* @vstats: Virtual ethernet statistics
*
* @garp_port: GARP
* @mrp_port: MRP
*
* @dev: Class/net/name entry
* @sysfs_groups: Space for optional device, statistics and wireless
* sysfs groups
*
* @sysfs_rx_queue_group: Space for optional per-rx queue attributes
* @rtnl_link_ops: Rtnl_link_ops
*
* @gso_max_size: Maximum size of generic segmentation offload
* @gso_max_segs: Maximum number of segments that can be passed to the
* NIC for GSO
*
* @dcbnl_ops: Data Center Bridging netlink ops
* @num_tc: Number of traffic classes in the net device
* @tc_to_txq: XXX: need comments on this one
* @prio_tc_map: XXX: need comments on this one
*
* @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp
*
* @priomap: XXX: need comments on this one
* @phydev: Physical device may attach itself
* for hardware timestamping
*
* @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
* @qdisc_running_key: lockdep class annotating Qdisc->running seqcount
*
* @proto_down: protocol port state information can be sent to the
* switch driver and used to set the phys state of the
* switch port.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
/*
 * The DEVICE structure describing one network interface, as quoted in this
 * article. Note that no member points at the driver's private data:
 * netdev_priv() locates that area by adding the aligned size of this
 * structure to the structure's own address.
 */
struct net_device {
char name[IFNAMSIZ];
struct hlist_node name_hlist;
char *ifalias;
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
*/
unsigned long mem_end;
unsigned long mem_start;
unsigned long base_addr;
int irq;
atomic_t carrier_changes;
/*
* Some hardware also needs these fields (state,dev_list,
* napi_list,unreg_list,close_list) but they are not
* part of the usual set specified in Space.c.
*/
unsigned long state;
struct list_head dev_list;
struct list_head napi_list;
struct list_head unreg_list;
struct list_head close_list;
struct list_head ptype_all;
struct list_head ptype_specific;
struct {
struct list_head upper;
struct list_head lower;
} adj_list;
netdev_features_t features;
netdev_features_t hw_features;
netdev_features_t wanted_features;
netdev_features_t vlan_features;
netdev_features_t hw_enc_features;
netdev_features_t mpls_features;
netdev_features_t gso_partial_features;
int ifindex;
int group;
struct net_device_stats stats;
atomic_long_t rx_dropped;
atomic_long_t tx_dropped;
atomic_long_t rx_nohandler;
#ifdef CONFIG_WIRELESS_EXT
const struct iw_handler_def *wireless_handlers;
struct iw_public_data *wireless_data;
#endif
const struct net_device_ops *netdev_ops;
const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_SWITCHDEV
const struct switchdev_ops *switchdev_ops;
#endif
#ifdef CONFIG_NET_L3_MASTER_DEV
const struct l3mdev_ops *l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
const struct ndisc_ops *ndisc_ops;
#endif
#ifdef CONFIG_XFRM
const struct xfrmdev_ops *xfrmdev_ops;
#endif
const struct header_ops *header_ops;
unsigned int flags;
unsigned int priv_flags;
unsigned short gflags;
unsigned short padded;
unsigned char operstate;
unsigned char link_mode;
unsigned char if_port;
unsigned char dma;
unsigned int mtu;
unsigned int min_mtu;
unsigned int max_mtu;
unsigned short type;
unsigned short hard_header_len;
unsigned char min_header_len;
unsigned short needed_headroom;
unsigned short needed_tailroom;
/* Interface address info. */
unsigned char perm_addr[MAX_ADDR_LEN];
unsigned char addr_assign_type;
unsigned char addr_len;
unsigned short neigh_priv_len;
unsigned short dev_id;
unsigned short dev_port;
spinlock_t addr_list_lock;
unsigned char name_assign_type;
bool uc_promisc;
struct netdev_hw_addr_list uc;
struct netdev_hw_addr_list mc;
struct netdev_hw_addr_list dev_addrs;
#ifdef CONFIG_SYSFS
struct kset *queues_kset;
#endif
unsigned int promiscuity;
unsigned int allmulti;
/* Protocol-specific pointers */
#if IS_ENABLED(CONFIG_VLAN_8021Q)
struct vlan_info __rcu *vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
struct dsa_switch_tree *dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
struct tipc_bearer __rcu *tipc_ptr;
#endif
void *atalk_ptr;
struct in_device __rcu *ip_ptr;
struct dn_dev __rcu *dn_ptr;
struct inet6_dev __rcu *ip6_ptr;
void *ax25_ptr;
struct wireless_dev *ieee80211_ptr;
struct wpan_dev *ieee802154_ptr;
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
struct mpls_dev __rcu *mpls_ptr;
#endif
/*
* Cache lines mostly used on receive path (including eth_type_trans())
*/
/* Interface address info used in eth_type_trans() */
unsigned char *dev_addr;
#ifdef CONFIG_SYSFS
struct netdev_rx_queue *_rx;
unsigned int num_rx_queues;
unsigned int real_num_rx_queues;
#endif
struct bpf_prog __rcu *xdp_prog;
unsigned long gro_flush_timeout;
rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data;
#ifdef CONFIG_NET_CLS_ACT
struct tcf_proto __rcu *ingress_cl_list;
#endif
struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
struct nf_hook_entry __rcu *nf_hooks_ingress;
#endif
unsigned char broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
struct cpu_rmap *rx_cpu_rmap;
#endif
struct hlist_node index_hlist;
/*
* Cache lines mostly used on transmit path
*/
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
unsigned int num_tx_queues;
unsigned int real_num_tx_queues;
struct Qdisc *qdisc;
#ifdef CONFIG_NET_SCHED
DECLARE_HASHTABLE (qdisc_hash, 4);
#endif
unsigned long tx_queue_len;
spinlock_t tx_global_lock;
int watchdog_timeo;
#ifdef CONFIG_XPS
struct xps_dev_maps __rcu *xps_maps;
#endif
#ifdef CONFIG_NET_CLS_ACT
struct tcf_proto __rcu *egress_cl_list;
#endif
/* These may be needed for future network-power-down code. */
struct timer_list watchdog_timer;
int __percpu *pcpu_refcnt;
struct list_head todo_list;
struct list_head link_watch_list;
enum { NETREG_UNINITIALIZED=0,
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
NETREG_DUMMY, /* dummy device for NAPI poll */
} reg_state:8;
bool dismantle;
enum {
RTNL_LINK_INITIALIZED,
RTNL_LINK_INITIALIZING,
} rtnl_link_state:16;
bool needs_free_netdev;
void (*priv_destructor)(struct net_device *dev);
#ifdef CONFIG_NETPOLL
struct netpoll_info __rcu *npinfo;
#endif
possible_net_t nd_net;
/* mid-layer private */
union {
void *ml_priv;
struct pcpu_lstats __percpu *lstats;
struct pcpu_sw_netstats __percpu *tstats;
struct pcpu_dstats __percpu *dstats;
struct pcpu_vstats __percpu *vstats;
};
#if IS_ENABLED(CONFIG_GARP)
struct garp_port __rcu *garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
struct mrp_port __rcu *mrp_port;
#endif
struct device dev;
const struct attribute_group *sysfs_groups[4];
const struct attribute_group *sysfs_rx_queue_group;
const struct rtnl_link_ops *rtnl_link_ops;
/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE 65536
unsigned int gso_max_size;
#define GSO_MAX_SEGS 65535
u16 gso_max_segs;
#ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
u8 num_tc;
struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
u8 prio_tc_map[TC_BITMASK + 1];
#if IS_ENABLED(CONFIG_FCOE)
unsigned int fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
struct netprio_map __rcu *priomap;
#endif
struct phy_device *phydev;
struct lock_class_key *qdisc_tx_busylock;
struct lock_class_key *qdisc_running_key;
bool proto_down;
};
结构体代码有点长,不需要每个成员都了解,特别是随着内核的更新,结构体还可能有变化。但是会发现,没有一个刚才提到的指向私有空间的指针。
这就引出了第二个要介绍的函数netdev_priv
/**
 * netdev_priv - access network device private data
 * @dev: network device
 *
 * The private area is not reached through a pointer member; it sits at a
 * fixed offset from @dev, namely sizeof(struct net_device) rounded up to
 * NETDEV_ALIGN.
 *
 * Return: address of the private data belonging to @dev.
 */
static inline void *netdev_priv(const struct net_device *dev)
{
	char *base = (char *)dev;

	base += ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
	return base;
}
意思是通过struct net_device *dev首地址加对齐后的偏移量就得到了私有数据的首地址。可以这么理解,私有空间在结构体之外,怎么拿到呢?通过指针偏移的方法。这一点与之前的结构体有点区别。
贴一下自己定义的私有空间的结构体
/*
 * Per-device private data, stored in the area returned by netdev_priv().
 */
struct nic_priv {
unsigned char *tx_buf;	/* staging buffer for the frame being sent; allocated in nic_open(), freed in nic_stop() */
unsigned int tx_len;	/* length in bytes of the frame currently held in tx_buf */
unsigned char *rx_buf;	/* NOTE(review): not referenced by the code shown in this file */
unsigned int rx_len;	/* NOTE(review): not referenced by the code shown in this file */
u32 msg_enable;		/* set to DEF_MSG_ENABLE in nic_init(); presumably the mask consulted by the netif_*() log helpers */
};
第三个函数register_netdev(),向系统注册该网络设备对象。这个函数不贴代码了,主要要说明的是,注册流程实际是把传进来的参数nic_dev串到链表中。dev_list成员就是这个作用。
后面有释放、取消注册等函数,基本就是逆过程,就不介绍了。
适配器操作
实现了这么几个操作,实际上net_device_ops里面的成员方法(用了面向对象的思想)有很多。
/*
 * Device callbacks installed on every virtual NIC by nic_alloc_netdev().
 */
static const struct net_device_ops nic_netdev_ops = {
/* .ndo_init is deliberately left unset; the author associates it with the `insmod sample.ko` step. */
/* .ndo_open runs when the interface is brought up, e.g. `ifconfig eth2 192.168.3.123 up`. */
.ndo_open = nic_open,
.ndo_stop = nic_stop,
.ndo_validate_addr = nic_validate_addr,
.ndo_start_xmit = nic_start_xmit,
.ndo_change_mtu = nic_change_mtu,
.ndo_set_mac_address = nic_set_mac_addr,
};
这里要做一个区分,ndo_init、ndo_open,还有上面的nic_init()。
- nic_init:是在insmod时调用,其实是module_init()调用的
- ndo_init:是在设备注册时(即register_netdev()内部)被调用一次,用于做设备相关的初始化。本例是虚拟设备,没有需要初始化的硬件,所以注释掉了
- ndo_open:上面也注释了,ifconfig up时会调用
剩下的再看看里面填的函数
/*
 * nic_open - .ndo_open callback.
 *
 * Allocates the MAX_ETH_FRAME_SIZE-byte transmit staging buffer and then
 * starts the transmit queue.
 *
 * Return: 0 on success, -ENOMEM if the buffer cannot be allocated.
 */
static int nic_open(struct net_device *dev) {
	struct nic_priv *np = netdev_priv(dev);
	unsigned char *buf;

	netif_info(np, ifup, dev, "%s(#%d), priv:%p\n",
		__func__, __LINE__, np);

	buf = kmalloc(MAX_ETH_FRAME_SIZE, GFP_KERNEL);
	np->tx_buf = buf;
	if (!buf) {
		netif_info(np, ifup, dev, "%s(#%d), cannot alloc tx buf\n",
			__func__, __LINE__);
		return -ENOMEM;
	}

	netif_start_queue(dev);
	return 0;
}
这里open时才分配了tx_buf,因为如果在模块初始化(insmod)时就分配,那么在接口被ifconfig up启用之前,这块空间一直用不上,白白浪费内存。
还有一个函数netif_start_queue()。net_device工作是通过这一步开始,可以开始接收发送数据,相对应的还有一个netif_stop_queue()函数,在stop函数中
static int nic_stop(struct net_device *dev) {
struct nic_priv *priv = netdev_priv(dev);
netif_info(priv, ifdown, dev, "%s(#%d), priv:%p\n",
__func__, __LINE__, priv);
if (priv->tx_buf) {
kfree(priv->tx_buf);
}
netif_stop_queue(dev);
return 0;
}
接下来是读写操作,一并贴上了
static void nic_rx(struct net_device *dev, int len, unsigned char *buf) {
struct sk_buff *skb;
struct nic_priv *priv = netdev_priv(dev);
netif_info(priv, hw, dev, "%s(#%d), rx:%d\n",
__func__, __LINE__, len);
skb = dev_alloc_skb(len+2);
if (!skb) {
netif_err(priv, rx_err, dev,
"%s(#%d), rx: low on mem - packet dropped\n",
__func__, __LINE__);
dev->stats.rx_dropped++;
return ;
}
skb_reserve(skb, 2);
memcpy(skb_put(skb, len), buf, len);
skb->dev = dev;
skb->protocol = eth_type_trans(skb, dev);
skb->ip_summed = CHECKSUM_UNNECESSARY;
dev->stats.rx_packets ++;
dev->stats.rx_bytes += len;
netif_rx(skb); //接收数据包
}
static void nic_hw_xmit(struct net_device *dev) {
struct nic_priv *priv = netdev_priv(dev);
struct iphdr *iph;
u32 *saddr, *daddr;
struct in_device *in_dev;
struct in_ifaddr *if_info;
if (priv->tx_len < sizeof(struct ethhdr) + sizeof(struct iphdr)) {
netif_info(priv, hw, dev, "%s(#%d), too short\n",
__func__, __LINE__);
return ;
}
iph = (struct iphdr*)(priv->tx_buf + sizeof(struct ethhdr));
saddr = &iph->saddr;
daddr = &iph->daddr;
netif_info(priv, hw, dev, "%s(#%d), orig, src:%pI4, dst:%pI4, len:%d\n",
__func__, __LINE__, saddr, daddr, priv->tx_len);
in_dev = nic_dev[(dev == nic_dev[0] ? 1 : 0)]->ip_ptr;
if (in_dev) {
//if_info = in_dev->ifa_list;
for (if_info = in_dev->ifa_list; if_info; if_info = if_info->ifa_next) {
*saddr = *daddr = if_info->ifa_address;
((u8*)saddr)[3]++;
netif_info(priv, hw, dev, "%s(#%d), new, src:%pI4, dst:%pI4\n",
__func__, __LINE__, saddr, daddr);
break;
}
if (!if_info) {
dev->stats.tx_dropped ++;
netif_info(priv, hw, dev, "%s(#%d), drop packet\n",
__func__, __LINE__);
return ;
}
}
iph->check = 0; //这一步不要漏,因为iph->check参与下面的校验和运算
iph->check = ip_fast_csum((unsigned char*)iph, iph->ihl); //tcp、udp也是这样
dev->stats.tx_packets ++;
dev->stats.tx_bytes += priv->tx_len;
nic_rx(nic_dev[(dev == nic_dev[0] ? 1 : 0)], priv->tx_len, priv->tx_buf);
}
/*
 * nic_start_xmit - .ndo_start_xmit callback.
 * @skb: packet handed down by the stack; always consumed here
 * @dev: device asked to transmit it
 *
 * Frames of MAX_ETH_FRAME_SIZE bytes or more are dropped and counted in
 * tx_dropped. Anything else is copied (with checksum fix-up) into the
 * private tx_buf, zero-padded up to ETH_ZLEN if shorter, and then passed to
 * nic_hw_xmit().
 *
 * Return: always NETDEV_TX_OK.
 */
static netdev_tx_t nic_start_xmit(struct sk_buff *skb, struct net_device *dev) {
	struct nic_priv *np = netdev_priv(dev);

	netif_info(np, drv, dev, "%s(#%d), orig, src:%pI4, dst:%pI4\n",
		__func__, __LINE__, &(ip_hdr(skb)->saddr), &(ip_hdr(skb)->daddr));

	np->tx_len = skb->len;

	/* Oversized frame: release it and report it as dropped. */
	if (unlikely(!(np->tx_len < MAX_ETH_FRAME_SIZE))) {
		dev_kfree_skb_any(skb);
		dev->stats.tx_dropped ++;
		return NETDEV_TX_OK;
	}

	/* Short frame: pre-zero the buffer so the padding bytes are zero. */
	if (np->tx_len < ETH_ZLEN) {
		memset(np->tx_buf, 0, ETH_ZLEN);
		np->tx_len = ETH_ZLEN;
	}

	skb_copy_and_csum_dev(skb, np->tx_buf);
	dev_kfree_skb_any(skb); /* the skb is no longer needed once copied */

	nic_hw_xmit(dev);
	return NETDEV_TX_OK;
}
读写过程中,重要的是sk_buff结构体。sk_buff结构体用于承载通过网卡收发的报文。
/**
* struct sk_buff - socket buffer
* @next: Next buffer in list
* @prev: Previous buffer in list
* @tstamp: Time we arrived/left
* @rbnode: RB tree node, alternative to next/prev for netem/tcp
* @sk: Socket we are owned by
* @dev: Device we arrived on/are leaving by
* @cb: Control buffer. Free for use by every layer. Put private vars here
* @_skb_refdst: destination entry (with norefcount bit)
* @sp: the security path, used for xfrm
* @len: Length of actual data
* @data_len: Data length
* @mac_len: Length of link layer header
* @hdr_len: writable header length of cloned skb
* @csum: Checksum (must include start/offset pair)
* @csum_start: Offset from skb->head where checksumming should start
* @csum_offset: Offset from csum_start where checksum should be stored
* @priority: Packet queueing priority
* @ignore_df: allow local fragmentation
* @cloned: Head may be cloned (check refcnt to be sure)
* @ip_summed: Driver fed us an IP checksum
* @nohdr: Payload reference only, must not modify header
* @pkt_type: Packet class
* @fclone: skbuff clone status
* @ipvs_property: skbuff is owned by ipvs
* @tc_skip_classify: do not classify packet. set by IFB device
* @tc_at_ingress: used within tc_classify to distinguish in/egress
* @tc_redirected: packet was redirected by a tc action
* @tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect
* @peeked: this packet has been seen already, so stats have been
* done for it, don't do them again
* @nf_trace: netfilter packet trace flag
* @protocol: Packet protocol from driver
* @destructor: Destruct function
* @_nfct: Associated connection, if any (with nfctinfo bits)
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
* @tc_index: Traffic control index
* @hash: the packet hash
* @queue_mapping: Queue mapping for multiqueue devices
* @xmit_more: More SKBs are pending for this queue
* @ndisc_nodetype: router type (from link layer)
* @ooo_okay: allow the mapping of a socket to a queue to be changed
* @l4_hash: indicate hash is a canonical 4-tuple hash over transport
* ports.
* @sw_hash: indicates hash was computed in software stack
* @wifi_acked_valid: wifi_acked was set
* @wifi_acked: whether frame was acked on wifi or not
* @no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
* @dst_pending_confirm: need to confirm neighbour
* @napi_id: id of the NAPI struct this skb came from
* @secmark: security marking
* @mark: Generic packet mark
* @vlan_proto: vlan encapsulation protocol
* @vlan_tci: vlan tag control information
* @inner_protocol: Protocol (encapsulation)
* @inner_transport_header: Inner transport layer header (encapsulation)
* @inner_network_header: Network layer header (encapsulation)
* @inner_mac_header: Link layer header (encapsulation)
* @transport_header: Transport layer header
* @network_header: Network layer header
* @mac_header: Link layer header
* @tail: Tail pointer
* @end: End pointer
* @head: Head of buffer
* @data: Data head pointer
* @truesize: Buffer size
* @users: User count - see {datagram,tcp}.c
*/
struct sk_buff {
union {
struct {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
union {
ktime_t tstamp;
struct skb_mstamp skb_mstamp;
};
};
// Author's note: sk_buffs may also be organised in a red-black tree, keyed by the packet's arrival or send time so that they are ordered by time (not verifiable from this definition alone)
struct rb_node rbnode; /* used in netem & tcp stack */
};
struct sock *sk;
union {
struct net_device *dev;
/* Some protocols might use this space to store information,
* while device pointer would be NULL.
* UDP receive path is one user.
*/
unsigned long dev_scratch;
};
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8);
unsigned long _skb_refdst;
void (*destructor)(struct sk_buff *skb);
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
unsigned long _nfct;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
struct nf_bridge_info *nf_bridge;
#endif
unsigned int len,
data_len;
__u16 mac_len,
hdr_len;
/* Following fields are _not_ copied in __copy_skb_header()
* Note that queue_mapping is here mostly to fill a hole.
*/
kmemcheck_bitfield_begin(flags1);
__u16 queue_mapping;
/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK (1 << 7)
#else
#define CLONED_MASK 1
#endif
#define CLONED_OFFSET() offsetof(struct sk_buff, __cloned_offset)
__u8 __cloned_offset[0];
__u8 cloned:1,
nohdr:1,
fclone:2,
peeked:1,
head_frag:1,
xmit_more:1,
__unused:1; /* one bit hole */
kmemcheck_bitfield_end(flags1);
/* fields enclosed in headers_start/headers_end are copied
* using a single memcpy() in __copy_skb_header()
*/
/* private: */
__u32 headers_start[0];
/* public: */
/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX (7 << 5)
#else
#define PKT_TYPE_MAX 7
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
__u8 __pkt_type_offset[0];
__u8 pkt_type:3;
__u8 pfmemalloc:1;
__u8 ignore_df:1;
__u8 nf_trace:1;
__u8 ip_summed:2;
__u8 ooo_okay:1;
__u8 l4_hash:1;
__u8 sw_hash:1;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
/* Indicates the inner headers are valid in the skbuff. */
__u8 encapsulation:1;
__u8 encap_hdr_csum:1;
__u8 csum_valid:1;
__u8 csum_complete_sw:1;
__u8 csum_level:2;
__u8 csum_bad:1;
__u8 dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ipvs_property:1;
__u8 inner_protocol_type:1;
__u8 remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
__u8 offload_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
__u8 tc_skip_classify:1;
__u8 tc_at_ingress:1;
__u8 tc_redirected:1;
__u8 tc_from_ingress:1;
#endif
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#endif
union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
__u32 priority;
int skb_iif;
__u32 hash;
__be16 vlan_proto;
__u16 vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
union {
unsigned int napi_id;
unsigned int sender_cpu;
};
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 reserved_tailroom;
};
union {
__be16 inner_protocol;
__u8 inner_ipproto;
};
__u16 inner_transport_header;
__u16 inner_network_header;
__u16 inner_mac_header;
__be16 protocol;
// Positions of the transport-layer, network-layer and link-layer headers.
// An sk_buff holds one complete packet, which is why the headers are part of it.
__u16 transport_header;
__u16 network_header;
__u16 mac_header;
/* private: */
__u32 headers_end[0];
/* public: */
/* These elements must be at the end, see alloc_skb() for details. */
// head and end mark the start and end of the buffer; data and tail mark the start and end of the actual data.
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head,
*data;
unsigned int truesize;
atomic_t users;
};
还有一些操作
/*
 * nic_validate_addr - .ndo_validate_addr callback.
 *
 * Checks whether dev->dev_addr is valid: logs the call, then returns
 * whatever eth_validate_addr() reports.
 */
static int nic_validate_addr(struct net_device *dev) {
	struct nic_priv *np = netdev_priv(dev);
	int rc;

	netif_info(np, drv, dev, "%s(#%d), priv:%p\n",
		__func__, __LINE__, np);

	rc = eth_validate_addr(dev);
	return rc;
}
/*
 * nic_change_mtu - .ndo_change_mtu callback.
 * @dev: device whose MTU is being changed
 * @new_mtu: requested MTU
 *
 * Logs the request and returns the result of eth_change_mtu().
 *
 * NOTE(review): eth_change_mtu() is reportedly deprecated in newer
 * kernels - confirm against the targeted kernel version.
 */
static int nic_change_mtu(struct net_device *dev, int new_mtu) {
	struct nic_priv *np = netdev_priv(dev);
	int rc;

	netif_info(np, drv, dev, "%s(#%d), priv:%p, mtu%d\n",
		__func__, __LINE__, np, new_mtu);

	rc = eth_change_mtu(dev, new_mtu);
	return rc;
}
/*
 * nic_set_mac_addr - .ndo_set_mac_address callback.
 * @dev: device whose hardware address is being changed
 * @addr: new address, passed through untouched to eth_mac_addr()
 *
 * Logs the request and returns the result of eth_mac_addr().
 */
static int nic_set_mac_addr(struct net_device *dev, void *addr) {
	struct nic_priv *np = netdev_priv(dev);
	int rc;

	netif_info(np, drv, dev, "%s(#%d), priv:%p\n",
		__func__, __LINE__, np);

	rc = eth_mac_addr(dev, addr);
	return rc;
}
/*
 * nic_header_create - build the Ethernet header for an outgoing packet.
 * @skb: packet; ETH_HLEN bytes are pushed onto its front for the header
 * @dev: sending device
 * @type: protocol number, stored in network byte order
 * @daddr: ignored - the destination is always the peer device's address
 * @saddr: source address, or NULL to use @dev's own address
 * @len: unused
 *
 * Return: dev->hard_header_len.
 */
static int nic_header_create (struct sk_buff *skb, struct net_device *dev,
		unsigned short type, const void *daddr,
		const void *saddr, unsigned int len) {
	struct nic_priv *np = netdev_priv(dev);
	struct ethhdr *hdr = (struct ethhdr*)skb_push(skb, ETH_HLEN);
	struct net_device *peer;
	const void *src;

	netif_info(np, drv, dev, "%s(#%d)\n",
		__func__, __LINE__);

	/* The peer is whichever of the two devices is not @dev. */
	peer = nic_dev[(dev == nic_dev[0] ? 1 : 0)];

	if (saddr)
		src = saddr;
	else
		src = dev->dev_addr;

	hdr->h_proto = htons(type);
	memcpy(hdr->h_source, src, dev->addr_len);
	memcpy(hdr->h_dest, peer->dev_addr, peer->addr_len);

	return dev->hard_header_len;
}
/* Link-layer header operations; only header creation is provided. */
static const struct header_ops nic_header_ops = {
.create = nic_header_create,
};