虚拟网络适配器的实现

网络适配器是通过软件实现的,能够适配大多数网卡的组件。可以这么理解,网络适配器之于网卡,就相当于类之于实例。而虚拟网络适配器,则是并不对应一个物理网卡,叫做虚拟网络适配器,不与pci上的地址绑定。
直接看实现。

模块实现

还是和之前一样,需要insmod插入自己定义的模块。

//__init宏表示把函数代码放到内核中的代码段init.text中。注意与用户空间的代码段不同
static int __init nic_init(void) {

	int ret = 0;
	struct nic_priv *priv;

	pr_info("%s(#%d): install module\n", __func__, __LINE__);

    //定义两个,是为了可以实现一个发一个收
	nic_dev[0] = nic_alloc_netdev();  //定义全局变量struct net_device nic_dev[2];
	if (!nic_dev[0]) {
		printk("%s(#%d): alloc netdev[0] failed\n", __func__, __LINE__);
		return -ENOMEM;
	}
	
	nic_dev[1] = nic_alloc_netdev();
	if (!nic_dev[1]) {
		printk("%s(#%d): alloc netdev[1] failed\n", __func__, __LINE__);
		goto alloc_2nd_failed;
	}

	ret = register_netdev(nic_dev[0]);
	if (ret) {
		printk("%s(#%d): reg net driver failed. ret: %d\n", __func__, __LINE__, ret);
		goto reg1_failed;
	}

	ret = register_netdev(nic_dev[1]);
    if (ret) {
        printk("%s(#%d): reg net driver failed. ret:%d\n", __func__, __LINE__, ret);
        goto reg2_failed;
    }

	priv = netdev_priv(nic_dev[0]);
	priv->msg_enable = DEF_MSG_ENABLE;
	priv = netdev_priv(nic_dev[1]);
	priv->msg_enable = DEF_MSG_ENABLE;
	
	return 0;

reg2_failed:
	unregister_netdev(nic_dev[0]);
reg1_failed:
	free_netdev(nic_dev[1]);
alloc_2nd_failed:
	free_netdev(nic_dev[0]);
	
	return ret;
}


static void __exit nic_exit(void) {

	int i = 0;
	pr_info("%s(#%d): remove module\n", __func__, __LINE__);
	for (i = 0;i < ARRAY_SIZE(nic_dev);i ++) {
		unregister_netdev(nic_dev[i]);
		free_netdev(nic_dev[i]);
	}
}

module_init(nic_init);
module_exit(nic_exit);

这里涉及几个函数。
第一个是自己定义的net_device分配函数nic_alloc_netdev(),除了分配之外,还有一些变量赋值的操作。

static struct net_device *nic_alloc_netdev(void) {

    //内核实现的,分配net_device的函数,参数是netdev所需私有空间的大小
	struct net_device *netdev = alloc_etherdev(sizeof(struct nic_priv));
	if (!netdev) {
		pr_err("%s(#%d): alloc dev failed", __func__, __LINE__);
		return NULL;
	}

	eth_hw_addr_random(netdev);  //赋予netdev随机mac地址
	netdev->netdev_ops = &nic_netdev_ops;  //netdev内部各种操作,下面会介绍
	netdev->flags |= IFF_NOARP;

	//这个特征值,表示使用系统自带的,汇编实现的校验和
    netdev->features |= NETIF_F_HW_CSUM;  
	netdev->header_ops = &nic_header_ops;

	return netdev;
	
}

这里说一下 struct net_device 私有空间的问题。一般思路,是认为结构体中有个成员(如void *)指向一块空间,这块空间为该实例私有,例如之前介绍的inode结构体。然而,事实不是这样。

//英文注释全贴下来了,就不写中文了
/**
 *	struct net_device - The DEVICE structure.
 *		Actually, this whole structure is a big mistake.  It mixes I/O
 *		data with strictly "high-level" data, and it has to know about
 *		almost every data structure used in the INET module.
 *
 *	@name:	This is the first field of the "visible" part of this structure
 *		(i.e. as seen by users in the "Space.c" file).  It is the name
 *	 	of the interface.
 *
 *	@name_hlist: 	Device name hash chain, please keep it close to name[]
 *	@ifalias:	SNMP alias
 *	@mem_end:	Shared memory end
 *	@mem_start:	Shared memory start
 *	@base_addr:	Device I/O address
 *	@irq:		Device IRQ number
 *
 *	@carrier_changes:	Stats to monitor carrier on<->off transitions
 *
 *	@state:		Generic network queuing layer state, see netdev_state_t
 *	@dev_list:	The global list of network devices
 *	@napi_list:	List entry used for polling NAPI devices
 *	@unreg_list:	List entry  when we are unregistering the
 *			device; see the function unregister_netdev
 *	@close_list:	List entry used when we are closing the device
 *	@ptype_all:     Device-specific packet handlers for all protocols
 *	@ptype_specific: Device-specific, protocol-specific packet handlers
 *
 *	@adj_list:	Directly linked devices, like slaves for bonding
 *	@features:	Currently active device features
 *	@hw_features:	User-changeable features
 *
 *	@wanted_features:	User-requested features
 *	@vlan_features:		Mask of features inheritable by VLAN devices
 *
 *	@hw_enc_features:	Mask of features inherited by encapsulating devices
 *				This field indicates what encapsulation
 *				offloads the hardware is capable of doing,
 *				and drivers will need to set them appropriately.
 *
 *	@mpls_features:	Mask of features inheritable by MPLS
 *
 *	@ifindex:	interface index
 *	@group:		The group the device belongs to
 *
 *	@stats:		Statistics struct, which was left as a legacy, use
 *			rtnl_link_stats64 instead
 *
 *	@rx_dropped:	Dropped packets by core network,
 *			do not use this in drivers
 *	@tx_dropped:	Dropped packets by core network,
 *			do not use this in drivers
 *	@rx_nohandler:	nohandler dropped packets by core network on
 *			inactive devices, do not use this in drivers
 *
 *	@wireless_handlers:	List of functions to handle Wireless Extensions,
 *				instead of ioctl,
 *				see <net/iw_handler.h> for details.
 *	@wireless_data:	Instance data managed by the core of wireless extensions
 *
 *	@netdev_ops:	Includes several pointers to callbacks,
 *			if one wants to override the ndo_*() functions
 *	@ethtool_ops:	Management operations
 *	@ndisc_ops:	Includes callbacks for different IPv6 neighbour
 *			discovery handling. Necessary for e.g. 6LoWPAN.
 *	@header_ops:	Includes callbacks for creating,parsing,caching,etc
 *			of Layer 2 headers.
 *
 *	@flags:		Interface flags (a la BSD)
 *	@priv_flags:	Like 'flags' but invisible to userspace,
 *			see if.h for the definitions
 *	@gflags:	Global flags ( kept as legacy )
 *	@padded:	How much padding added by alloc_netdev()
 *	@operstate:	RFC2863 operstate
 *	@link_mode:	Mapping policy to operstate
 *	@if_port:	Selectable AUI, TP, ...
 *	@dma:		DMA channel
 *	@mtu:		Interface MTU value
 *	@min_mtu:	Interface Minimum MTU value
 *	@max_mtu:	Interface Maximum MTU value
 *	@type:		Interface hardware type
 *	@hard_header_len: Maximum hardware header length.
 *	@min_header_len:  Minimum hardware header length
 *
 *	@needed_headroom: Extra headroom the hardware may need, but not in all
 *			  cases can this be guaranteed
 *	@needed_tailroom: Extra tailroom the hardware may need, but not in all
 *			  cases can this be guaranteed. Some cases also use
 *			  LL_MAX_HEADER instead to allocate the skb
 *
 *	interface address info:
 *
 * 	@perm_addr:		Permanent hw address
 * 	@addr_assign_type:	Hw address assignment type
 * 	@addr_len:		Hardware address length
 *	@neigh_priv_len:	Used in neigh_alloc()
 * 	@dev_id:		Used to differentiate devices that share
 * 				the same link layer address
 * 	@dev_port:		Used to differentiate devices that share
 * 				the same function
 *	@addr_list_lock:	XXX: need comments on this one
 *	@uc_promisc:		Counter that indicates promiscuous mode
 *				has been enabled due to the need to listen to
 *				additional unicast addresses in a device that
 *				does not implement ndo_set_rx_mode()
 *	@uc:			unicast mac addresses
 *	@mc:			multicast mac addresses
 *	@dev_addrs:		list of device hw addresses
 *	@queues_kset:		Group of all Kobjects in the Tx and RX queues
 *	@promiscuity:		Number of times the NIC is told to work in
 *				promiscuous mode; if it becomes 0 the NIC will
 *				exit promiscuous mode
 *	@allmulti:		Counter, enables or disables allmulticast mode
 *
 *	@vlan_info:	VLAN info
 *	@dsa_ptr:	dsa specific data
 *	@tipc_ptr:	TIPC specific data
 *	@atalk_ptr:	AppleTalk link
 *	@ip_ptr:	IPv4 specific data
 *	@dn_ptr:	DECnet specific data
 *	@ip6_ptr:	IPv6 specific data
 *	@ax25_ptr:	AX.25 specific data
 *	@ieee80211_ptr:	IEEE 802.11 specific data, assign before registering
 *
 *	@dev_addr:	Hw address (before bcast,
 *			because most packets are unicast)
 *
 *	@_rx:			Array of RX queues
 *	@num_rx_queues:		Number of RX queues
 *				allocated at register_netdev() time
 *	@real_num_rx_queues: 	Number of RX queues currently active in device
 *
 *	@rx_handler:		handler for received packets
 *	@rx_handler_data: 	XXX: need comments on this one
 *	@ingress_queue:		XXX: need comments on this one
 *	@broadcast:		hw bcast address
 *
 *	@rx_cpu_rmap:	CPU reverse-mapping for RX completion interrupts,
 *			indexed by RX queue number. Assigned by driver.
 *			This must only be set if the ndo_rx_flow_steer
 *			operation is defined
 *	@index_hlist:		Device index hash chain
 *
 *	@_tx:			Array of TX queues
 *	@num_tx_queues:		Number of TX queues allocated at alloc_netdev_mq() time
 *	@real_num_tx_queues: 	Number of TX queues currently active in device
 *	@qdisc:			Root qdisc from userspace point of view
 *	@tx_queue_len:		Max frames per queue allowed
 *	@tx_global_lock: 	XXX: need comments on this one
 *
 *	@xps_maps:	XXX: need comments on this one
 *
 *	@watchdog_timeo:	Represents the timeout that is used by
 *				the watchdog (see dev_watchdog())
 *	@watchdog_timer:	List of timers
 *
 *	@pcpu_refcnt:		Number of references to this device
 *	@todo_list:		Delayed register/unregister
 *	@link_watch_list:	XXX: need comments on this one
 *
 *	@reg_state:		Register/unregister state machine
 *	@dismantle:		Device is going to be freed
 *	@rtnl_link_state:	This enum represents the phases of creating
 *				a new link
 *
 *	@needs_free_netdev:	Should unregister perform free_netdev?
 *	@priv_destructor:	Called from unregister
 *	@npinfo:		XXX: need comments on this one
 * 	@nd_net:		Network namespace this network device is inside
 *
 * 	@ml_priv:	Mid-layer private
 * 	@lstats:	Loopback statistics
 * 	@tstats:	Tunnel statistics
 * 	@dstats:	Dummy statistics
 * 	@vstats:	Virtual ethernet statistics
 *
 *	@garp_port:	GARP
 *	@mrp_port:	MRP
 *
 *	@dev:		Class/net/name entry
 *	@sysfs_groups:	Space for optional device, statistics and wireless
 *			sysfs groups
 *
 *	@sysfs_rx_queue_group:	Space for optional per-rx queue attributes
 *	@rtnl_link_ops:	Rtnl_link_ops
 *
 *	@gso_max_size:	Maximum size of generic segmentation offload
 *	@gso_max_segs:	Maximum number of segments that can be passed to the
 *			NIC for GSO
 *
 *	@dcbnl_ops:	Data Center Bridging netlink ops
 *	@num_tc:	Number of traffic classes in the net device
 *	@tc_to_txq:	XXX: need comments on this one
 *	@prio_tc_map:	XXX: need comments on this one
 *
 *	@fcoe_ddp_xid:	Max exchange id for FCoE LRO by ddp
 *
 *	@priomap:	XXX: need comments on this one
 *	@phydev:	Physical device may attach itself
 *			for hardware timestamping
 *
 *	@qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
 *	@qdisc_running_key: lockdep class annotating Qdisc->running seqcount
 *
 *	@proto_down:	protocol port state information can be sent to the
 *			switch driver and used to set the phys state of the
 *			switch port.
 *
 *	FIXME: cleanup struct net_device such that network protocol info
 *	moves out.
 */

struct net_device {
	char			name[IFNAMSIZ];
	struct hlist_node	name_hlist;
	char 			*ifalias;
	/*
	 *	I/O specific fields
	 *	FIXME: Merge these and struct ifmap into one
	 */
	unsigned long		mem_end;
	unsigned long		mem_start;
	unsigned long		base_addr;
	int			irq;

	atomic_t		carrier_changes;

	/*
	 *	Some hardware also needs these fields (state,dev_list,
	 *	napi_list,unreg_list,close_list) but they are not
	 *	part of the usual set specified in Space.c.
	 */

	unsigned long		state;

	struct list_head	dev_list;
	struct list_head	napi_list;
	struct list_head	unreg_list;
	struct list_head	close_list;
	struct list_head	ptype_all;
	struct list_head	ptype_specific;

	struct {
		struct list_head upper;
		struct list_head lower;
	} adj_list;

	netdev_features_t	features;
	netdev_features_t	hw_features;
	netdev_features_t	wanted_features;
	netdev_features_t	vlan_features;
	netdev_features_t	hw_enc_features;
	netdev_features_t	mpls_features;
	netdev_features_t	gso_partial_features;

	int			ifindex;
	int			group;

	struct net_device_stats	stats;

	atomic_long_t		rx_dropped;
	atomic_long_t		tx_dropped;
	atomic_long_t		rx_nohandler;

#ifdef CONFIG_WIRELESS_EXT
	const struct iw_handler_def *wireless_handlers;
	struct iw_public_data	*wireless_data;
#endif
	const struct net_device_ops *netdev_ops;
	const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_SWITCHDEV
	const struct switchdev_ops *switchdev_ops;
#endif
#ifdef CONFIG_NET_L3_MASTER_DEV
	const struct l3mdev_ops	*l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
	const struct ndisc_ops *ndisc_ops;
#endif

#ifdef CONFIG_XFRM
	const struct xfrmdev_ops *xfrmdev_ops;
#endif

	const struct header_ops *header_ops;

	unsigned int		flags;
	unsigned int		priv_flags;

	unsigned short		gflags;
	unsigned short		padded;

	unsigned char		operstate;
	unsigned char		link_mode;

	unsigned char		if_port;
	unsigned char		dma;

	unsigned int		mtu;
	unsigned int		min_mtu;
	unsigned int		max_mtu;
	unsigned short		type;
	unsigned short		hard_header_len;
	unsigned char		min_header_len;

	unsigned short		needed_headroom;
	unsigned short		needed_tailroom;

	/* Interface address info. */
	unsigned char		perm_addr[MAX_ADDR_LEN];
	unsigned char		addr_assign_type;
	unsigned char		addr_len;
	unsigned short		neigh_priv_len;
	unsigned short          dev_id;
	unsigned short          dev_port;
	spinlock_t		addr_list_lock;
	unsigned char		name_assign_type;
	bool			uc_promisc;
	struct netdev_hw_addr_list	uc;
	struct netdev_hw_addr_list	mc;
	struct netdev_hw_addr_list	dev_addrs;

#ifdef CONFIG_SYSFS
	struct kset		*queues_kset;
#endif
	unsigned int		promiscuity;
	unsigned int		allmulti;


	/* Protocol-specific pointers */

#if IS_ENABLED(CONFIG_VLAN_8021Q)
	struct vlan_info __rcu	*vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
	struct dsa_switch_tree	*dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
	struct tipc_bearer __rcu *tipc_ptr;
#endif
	void 			*atalk_ptr;
	struct in_device __rcu	*ip_ptr;
	struct dn_dev __rcu     *dn_ptr;
	struct inet6_dev __rcu	*ip6_ptr;
	void			*ax25_ptr;
	struct wireless_dev	*ieee80211_ptr;
	struct wpan_dev		*ieee802154_ptr;
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
	struct mpls_dev __rcu	*mpls_ptr;
#endif

/*
 * Cache lines mostly used on receive path (including eth_type_trans())
 */
	/* Interface address info used in eth_type_trans() */
	unsigned char		*dev_addr;

#ifdef CONFIG_SYSFS
	struct netdev_rx_queue	*_rx;

	unsigned int		num_rx_queues;
	unsigned int		real_num_rx_queues;
#endif

	struct bpf_prog __rcu	*xdp_prog;
	unsigned long		gro_flush_timeout;
	rx_handler_func_t __rcu	*rx_handler;
	void __rcu		*rx_handler_data;

#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto __rcu  *ingress_cl_list;
#endif
	struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
	struct nf_hook_entry __rcu *nf_hooks_ingress;
#endif

	unsigned char		broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
	struct cpu_rmap		*rx_cpu_rmap;
#endif
	struct hlist_node	index_hlist;

/*
 * Cache lines mostly used on transmit path
 */
	struct netdev_queue	*_tx ____cacheline_aligned_in_smp;
	unsigned int		num_tx_queues;
	unsigned int		real_num_tx_queues;
	struct Qdisc		*qdisc;
#ifdef CONFIG_NET_SCHED
	DECLARE_HASHTABLE	(qdisc_hash, 4);
#endif
	unsigned long		tx_queue_len;
	spinlock_t		tx_global_lock;
	int			watchdog_timeo;

#ifdef CONFIG_XPS
	struct xps_dev_maps __rcu *xps_maps;
#endif
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto __rcu  *egress_cl_list;
#endif

	/* These may be needed for future network-power-down code. */
	struct timer_list	watchdog_timer;

	int __percpu		*pcpu_refcnt;
	struct list_head	todo_list;

	struct list_head	link_watch_list;

	enum { NETREG_UNINITIALIZED=0,
	       NETREG_REGISTERED,	/* completed register_netdevice */
	       NETREG_UNREGISTERING,	/* called unregister_netdevice */
	       NETREG_UNREGISTERED,	/* completed unregister todo */
	       NETREG_RELEASED,		/* called free_netdev */
	       NETREG_DUMMY,		/* dummy device for NAPI poll */
	} reg_state:8;

	bool dismantle;

	enum {
		RTNL_LINK_INITIALIZED,
		RTNL_LINK_INITIALIZING,
	} rtnl_link_state:16;

	bool needs_free_netdev;
	void (*priv_destructor)(struct net_device *dev);

#ifdef CONFIG_NETPOLL
	struct netpoll_info __rcu	*npinfo;
#endif

	possible_net_t			nd_net;

	/* mid-layer private */
	union {
		void					*ml_priv;
		struct pcpu_lstats __percpu		*lstats;
		struct pcpu_sw_netstats __percpu	*tstats;
		struct pcpu_dstats __percpu		*dstats;
		struct pcpu_vstats __percpu		*vstats;
	};

#if IS_ENABLED(CONFIG_GARP)
	struct garp_port __rcu	*garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
	struct mrp_port __rcu	*mrp_port;
#endif

	struct device		dev;
	const struct attribute_group *sysfs_groups[4];
	const struct attribute_group *sysfs_rx_queue_group;

	const struct rtnl_link_ops *rtnl_link_ops;

	/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE		65536
	unsigned int		gso_max_size;
#define GSO_MAX_SEGS		65535
	u16			gso_max_segs;

#ifdef CONFIG_DCB
	const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
	u8			num_tc;
	struct netdev_tc_txq	tc_to_txq[TC_MAX_QUEUE];
	u8			prio_tc_map[TC_BITMASK + 1];

#if IS_ENABLED(CONFIG_FCOE)
	unsigned int		fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
	struct netprio_map __rcu *priomap;
#endif
	struct phy_device	*phydev;
	struct lock_class_key	*qdisc_tx_busylock;
	struct lock_class_key	*qdisc_running_key;
	bool			proto_down;
};

结构体代码有点长,不需要每个成员都了解,特别是随着内核的更新,结构体还可能有变化。但是会发现,没有一个刚才提到的指向私有空间的指针。
这就引出了第二个要介绍的函数netdev_priv

/**
 *	netdev_priv - access network device private data
 *	@dev: network device
 *
 * Get network device private data
 */
static inline void *netdev_priv(const struct net_device *dev)
{
	return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}

意思是通过struct net_device *dev首地址加对齐后的偏移量就得到了私有数据的首地址。可以这么理解,私有空间在结构体之外,怎么拿到呢?通过指针偏移的方法。这一点与之前的结构体有点区别。
贴一下自己定义的私有空间的结构体

struct nic_priv {
	unsigned char *tx_buf;
	unsigned int tx_len;
	unsigned char *rx_buf;
	unsigned int rx_len;
	u32 msg_enable;
};

第三个函数register_netdev(),向系统注册该网络设备对象。这个函数不贴代码了,主要要说明的是,注册流程实际是把传进来的参数nic_dev串到链表中。dev_list成员就是这个作用。
后面有释放、取消注册等函数,基本就是逆过程,就不介绍了。

适配器操作

实现了这么几个操作,实际上net_device_ops里面的成员方法(用了面向对象的思想)有很多。

static const struct net_device_ops nic_netdev_ops = {
	//.ndo_init =  insmod sample.ko
	// open =  ifconfig eth2 192.168.3.123 up
	.ndo_open = nic_open,
	.ndo_stop = nic_stop,
	.ndo_validate_addr = nic_validate_addr,

	.ndo_start_xmit = nic_start_xmit,
	.ndo_change_mtu = nic_change_mtu,
	.ndo_set_mac_address = nic_set_mac_addr,
};

这里要做一个区分,ndo_init、ndo_open,还有上面的nic_init()。

  • nic_init:是在insmod时调用,其实是module_init()调用的
  • ndo_init:是在网卡设备通电后开始工作时要进行的操作。因为是虚拟设备,没有这个过程,所以注释掉了
  • ndo_open:上面也注释了,ifconfig up时会调用

剩下的再看看里面填的函数

static int	nic_open(struct net_device *dev) {

	struct nic_priv *priv = netdev_priv(dev);
	netif_info(priv, ifup, dev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);

	priv->tx_buf = kmalloc(MAX_ETH_FRAME_SIZE, GFP_KERNEL);
	if (priv->tx_buf == NULL) {
		netif_info(priv, ifup, dev, "%s(#%d), cannot alloc tx buf\n",
                    __func__, __LINE__);
		return -ENOMEM;
	}

	netif_start_queue(dev);
	return 0;
}

这里open时才分配了tx_buf,因为如果在初始化时就分配,那么在insmod之前,分配的空间是没有用的,白白浪费内存。
还有一个函数netif_start_queue()。net_device工作是通过这一步开始,可以开始接收发送数据,相对应的还有一个netif_stop_queue()函数,在stop函数中

static int nic_stop(struct net_device *dev) {

	struct nic_priv *priv = netdev_priv(dev);
	netif_info(priv, ifdown, dev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
	
	if (priv->tx_buf) {
		kfree(priv->tx_buf);
	}

	netif_stop_queue(dev);

	return 0;
}

接下来是读写操作,一并贴上了

static void nic_rx(struct net_device *dev, int len, unsigned char *buf) {
	struct sk_buff *skb;
	struct nic_priv *priv = netdev_priv(dev);

	netif_info(priv, hw, dev, "%s(#%d), rx:%d\n",
                __func__, __LINE__, len);
	
	skb = dev_alloc_skb(len+2);
	if (!skb) {
		netif_err(priv, rx_err, dev,
                  "%s(#%d), rx: low on mem - packet dropped\n",
                  __func__, __LINE__);
		dev->stats.rx_dropped++;
		return ;
	}

	skb_reserve(skb, 2);
	memcpy(skb_put(skb, len), buf, len);

	skb->dev = dev;
	skb->protocol = eth_type_trans(skb, dev);
	skb->ip_summed = CHECKSUM_UNNECESSARY;
	
	dev->stats.rx_packets ++;
	dev->stats.rx_bytes += len;

	netif_rx(skb);  //接收数据包
	
}

static void nic_hw_xmit(struct net_device *dev) {

	struct nic_priv *priv = netdev_priv(dev);
	struct iphdr *iph;
	u32 *saddr, *daddr;
	struct in_device *in_dev;
	struct in_ifaddr *if_info;
	
	if (priv->tx_len < sizeof(struct ethhdr) + sizeof(struct iphdr)) {
		netif_info(priv, hw, dev, "%s(#%d), too short\n",
                   __func__, __LINE__);
		return ;
	}
	iph = (struct iphdr*)(priv->tx_buf + sizeof(struct ethhdr));
	saddr = &iph->saddr;
	daddr = &iph->daddr;

	netif_info(priv, hw, dev, "%s(#%d), orig, src:%pI4, dst:%pI4, len:%d\n",
                __func__, __LINE__, saddr, daddr, priv->tx_len);

	in_dev = nic_dev[(dev == nic_dev[0] ? 1 : 0)]->ip_ptr;
	if (in_dev) {
		//if_info = in_dev->ifa_list;
		for (if_info = in_dev->ifa_list; if_info; if_info = if_info->ifa_next) {
			*saddr = *daddr = if_info->ifa_address;
			((u8*)saddr)[3]++;

			netif_info(priv, hw, dev, "%s(#%d), new, src:%pI4, dst:%pI4\n",
                        __func__, __LINE__, saddr, daddr);
			break;
		}
		if (!if_info) {
			dev->stats.tx_dropped ++;
			netif_info(priv, hw, dev, "%s(#%d), drop packet\n",
                        __func__, __LINE__);
			return ;
		}
	}

	iph->check = 0;  //这一步不要漏,因为iph->check参与下面的校验和运算
	iph->check = ip_fast_csum((unsigned char*)iph, iph->ihl);  //tcp、udp也是这样

	dev->stats.tx_packets ++;
	dev->stats.tx_bytes += priv->tx_len;

	nic_rx(nic_dev[(dev == nic_dev[0] ? 1 : 0)], priv->tx_len, priv->tx_buf);
	
}

static netdev_tx_t nic_start_xmit(struct sk_buff *skb, struct net_device *dev) {

	struct nic_priv *priv = netdev_priv(dev);
	netif_info(priv, drv, dev, "%s(#%d), orig, src:%pI4, dst:%pI4\n",
                __func__, __LINE__, &(ip_hdr(skb)->saddr), &(ip_hdr(skb)->daddr));
	priv->tx_len = skb->len;
	if (likely(priv->tx_len < MAX_ETH_FRAME_SIZE)) {
		if (priv->tx_len < ETH_ZLEN) {
			memset(priv->tx_buf, 0, ETH_ZLEN);
			priv->tx_len = ETH_ZLEN;
		}
		skb_copy_and_csum_dev(skb, priv->tx_buf);
		dev_kfree_skb_any(skb);  //释放skb
	} else {
		dev_kfree_skb_any(skb);
		dev->stats.tx_dropped ++;

		return NETDEV_TX_OK;
	}

	nic_hw_xmit(dev);

	return NETDEV_TX_OK;
}

读写过程中,重要的是sk_buff结构体。sk_buff结构体接收通过网卡收发的报文。

/** 
 *	struct sk_buff - socket buffer
 *	@next: Next buffer in list
 *	@prev: Previous buffer in list
 *	@tstamp: Time we arrived/left
 *	@rbnode: RB tree node, alternative to next/prev for netem/tcp
 *	@sk: Socket we are owned by
 *	@dev: Device we arrived on/are leaving by
 *	@cb: Control buffer. Free for use by every layer. Put private vars here
 *	@_skb_refdst: destination entry (with norefcount bit)
 *	@sp: the security path, used for xfrm
 *	@len: Length of actual data
 *	@data_len: Data length
 *	@mac_len: Length of link layer header
 *	@hdr_len: writable header length of cloned skb
 *	@csum: Checksum (must include start/offset pair)
 *	@csum_start: Offset from skb->head where checksumming should start
 *	@csum_offset: Offset from csum_start where checksum should be stored
 *	@priority: Packet queueing priority
 *	@ignore_df: allow local fragmentation
 *	@cloned: Head may be cloned (check refcnt to be sure)
 *	@ip_summed: Driver fed us an IP checksum
 *	@nohdr: Payload reference only, must not modify header
 *	@pkt_type: Packet class
 *	@fclone: skbuff clone status
 *	@ipvs_property: skbuff is owned by ipvs
 *	@tc_skip_classify: do not classify packet. set by IFB device
 *	@tc_at_ingress: used within tc_classify to distinguish in/egress
 *	@tc_redirected: packet was redirected by a tc action
 *	@tc_from_ingress: if tc_redirected, tc_at_ingress at time of redirect
 *	@peeked: this packet has been seen already, so stats have been
 *		done for it, don't do them again
 *	@nf_trace: netfilter packet trace flag
 *	@protocol: Packet protocol from driver
 *	@destructor: Destruct function
 *	@_nfct: Associated connection, if any (with nfctinfo bits)
 *	@nf_bridge: Saved data about a bridged frame - see br_netfilter.c
 *	@skb_iif: ifindex of device we arrived on
 *	@tc_index: Traffic control index
 *	@hash: the packet hash
 *	@queue_mapping: Queue mapping for multiqueue devices
 *	@xmit_more: More SKBs are pending for this queue
 *	@ndisc_nodetype: router type (from link layer)
 *	@ooo_okay: allow the mapping of a socket to a queue to be changed
 *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
 *		ports.
 *	@sw_hash: indicates hash was computed in software stack
 *	@wifi_acked_valid: wifi_acked was set
 *	@wifi_acked: whether frame was acked on wifi or not
 *	@no_fcs:  Request NIC to treat last 4 bytes as Ethernet FCS
 *	@dst_pending_confirm: need to confirm neighbour
  *	@napi_id: id of the NAPI struct this skb came from
 *	@secmark: security marking
 *	@mark: Generic packet mark
 *	@vlan_proto: vlan encapsulation protocol
 *	@vlan_tci: vlan tag control information
 *	@inner_protocol: Protocol (encapsulation)
 *	@inner_transport_header: Inner transport layer header (encapsulation)
 *	@inner_network_header: Network layer header (encapsulation)
 *	@inner_mac_header: Link layer header (encapsulation)
 *	@transport_header: Transport layer header
 *	@network_header: Network layer header
 *	@mac_header: Link layer header
 *	@tail: Tail pointer
 *	@end: End pointer
 *	@head: Head of buffer
 *	@data: Data head pointer
 *	@truesize: Buffer size
 *	@users: User count - see {datagram,tcp}.c
 */

struct sk_buff {
	union {
		struct {
			/* These two members must be first. */
			struct sk_buff		*next;
			struct sk_buff		*prev;

			union {
				ktime_t		tstamp;
				struct skb_mstamp skb_mstamp;
			};
		};
        //sk_buff也可能由红黑树组织,key值是packet的到达时间或发送时间,这样就可以按时间组织sk_buff
		struct rb_node	rbnode; /* used in netem & tcp stack */
	};
	struct sock		*sk;

	union {
		struct net_device	*dev;
		/* Some protocols might use this space to store information,
		 * while device pointer would be NULL.
		 * UDP receive path is one user.
		 */
		unsigned long		dev_scratch;
	};
	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char			cb[48] __aligned(8);

	unsigned long		_skb_refdst;
	void			(*destructor)(struct sk_buff *skb);
#ifdef CONFIG_XFRM
	struct	sec_path	*sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	unsigned long		 _nfct;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	struct nf_bridge_info	*nf_bridge;
#endif
	unsigned int		len,
				data_len;
	__u16			mac_len,
				hdr_len;

	/* Following fields are _not_ copied in __copy_skb_header()
	 * Note that queue_mapping is here mostly to fill a hole.
	 */
	kmemcheck_bitfield_begin(flags1);
	__u16			queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK	(1 << 7)
#else
#define CLONED_MASK	1
#endif
#define CLONED_OFFSET()		offsetof(struct sk_buff, __cloned_offset)

	__u8			__cloned_offset[0];
	__u8			cloned:1,
				nohdr:1,
				fclone:2,
				peeked:1,
				head_frag:1,
				xmit_more:1,
				__unused:1; /* one bit hole */
	kmemcheck_bitfield_end(flags1);

	/* fields enclosed in headers_start/headers_end are copied
	 * using a single memcpy() in __copy_skb_header()
	 */
	/* private: */
	__u32			headers_start[0];
	/* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX	(7 << 5)
#else
#define PKT_TYPE_MAX	7
#endif
#define PKT_TYPE_OFFSET()	offsetof(struct sk_buff, __pkt_type_offset)

	__u8			__pkt_type_offset[0];
	__u8			pkt_type:3;
	__u8			pfmemalloc:1;
	__u8			ignore_df:1;

	__u8			nf_trace:1;
	__u8			ip_summed:2;
	__u8			ooo_okay:1;
	__u8			l4_hash:1;
	__u8			sw_hash:1;
	__u8			wifi_acked_valid:1;
	__u8			wifi_acked:1;

	__u8			no_fcs:1;
	/* Indicates the inner headers are valid in the skbuff. */
	__u8			encapsulation:1;
	__u8			encap_hdr_csum:1;
	__u8			csum_valid:1;
	__u8			csum_complete_sw:1;
	__u8			csum_level:2;
	__u8			csum_bad:1;

	__u8			dst_pending_confirm:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	__u8			ndisc_nodetype:2;
#endif
	__u8			ipvs_property:1;
	__u8			inner_protocol_type:1;
	__u8			remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
	__u8			offload_fwd_mark:1;
#endif
#ifdef CONFIG_NET_CLS_ACT
	__u8			tc_skip_classify:1;
	__u8			tc_at_ingress:1;
	__u8			tc_redirected:1;
	__u8			tc_from_ingress:1;
#endif

#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#endif

	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;
		};
	};
	__u32			priority;
	int			skb_iif;
	__u32			hash;
	__be16			vlan_proto;
	__u16			vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
	union {
		unsigned int	napi_id;
		unsigned int	sender_cpu;
	};
#endif
#ifdef CONFIG_NETWORK_SECMARK
	__u32		secmark;
#endif

	union {
		__u32		mark;
		__u32		reserved_tailroom;
	};

	union {
		__be16		inner_protocol;
		__u8		inner_ipproto;
	};

	__u16			inner_transport_header;
	__u16			inner_network_header;
	__u16			inner_mac_header;

	__be16			protocol;
    //传输层、网络层、链路层的包头的位置
    //sk_buff中存储的是一包完整的数据,所以有包头
	__u16			transport_header;
	__u16			network_header;
	__u16			mac_header;

	/* private: */
	__u32			headers_end[0];
	/* public: */

	/* These elements must be at the end, see alloc_skb() for details.  */
    //head和end指向缓冲区的头部和尾部,data和 tail指向实际数据的头部和尾部。
	sk_buff_data_t		tail;
	sk_buff_data_t		end;
	unsigned char		*head,
				*data;
	unsigned int		truesize;
	atomic_t		users;
};

还有一些操作

//验证dev->dev_addr是否合法
static int	nic_validate_addr(struct net_device *dev) {
	struct nic_priv *priv = netdev_priv(dev);
	netif_info(priv, drv, dev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);

	return eth_validate_addr(dev);
}

static int nic_change_mtu(struct net_device *dev, int new_mtu) {
	struct nic_priv *priv = netdev_priv(dev);
	netif_info(priv, drv, dev, "%s(#%d), priv:%p, mtu%d\n",
                __func__, __LINE__, priv, new_mtu);

	return eth_change_mtu(dev, new_mtu);
}

static int nic_set_mac_addr(struct net_device *dev, void *addr) {

	struct nic_priv *priv = netdev_priv(dev);
	netif_info(priv, drv, dev, "%s(#%d), priv:%p\n",
                __func__, __LINE__, priv);
	
	return eth_mac_addr(dev, addr);	
}

//构造包头的函数
static int nic_header_create (struct sk_buff *skb, struct net_device *dev,
			   unsigned short type, const void *daddr,
			   const void *saddr, unsigned int len) {

	struct nic_priv *priv = netdev_priv(dev);
	struct ethhdr *eth = (struct ethhdr*)skb_push(skb, ETH_HLEN);
	struct net_device *dst_netdev;

	netif_info(priv, drv, dev, "%s(#%d)\n",
                __func__, __LINE__);

	dst_netdev = nic_dev[(dev == nic_dev[0] ? 1 : 0)];
	eth->h_proto = htons(type);

	memcpy(eth->h_source, saddr ? saddr : dev->dev_addr, dev->addr_len);
	memcpy(eth->h_dest, dst_netdev->dev_addr, dst_netdev->addr_len);
	
	return dev->hard_header_len;
}


static const struct header_ops nic_header_ops = {
	.create = nic_header_create,
};
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值