本文探讨3点:
硬件抽象视图[由各种网卡驱动程序提供的、由网络实现代码使用的接口]
主机到网络层-->network层
network层 -->主机到网络层
0 网络信息流:
-A驱动程序 --A 硬件 <--> B特定硬件[以太网适配器] -- B特定硬件驱动程序 -- 通用网络设备结构 -- 内核IP层-
注:内核提供两个驱动程序框架(isa-skeleton.c和pci-skeleton.c),用于网络驱动程序的模板,可参考编程
1 硬件抽象视图
网络设备在内核中由通用接口net_device表示,该接口屏蔽了不同类型网络设备之间的差异,给内核提供一个一致的接口,不同的网络设备依靠 特定的设备驱动程序 完成接口和硬件设备[以太网适配器]之间的交互,内核支持的所有的网络设备通过链表指针挂载到命名空间struct net结构中,该结构在/sys/class/net中对应于各个设备:
1.1 struct net_device结构体
/*
* The DEVICE structure.
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
* almost every data structure used in the INET module.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
struct net_device {
/*
* This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
* the interface.
*/
char name[IFNAMSIZ]; 如ethX[以太网适配器],lo[回环设备],atmX[高速网卡]
struct pm_qos_request_list *pm_qos_req;
/* device name hash chain */
struct hlist_node name_hlist; //通过名称挂在到网络命名空间struct net中
/* snmp alias */
char *ifalias;
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
*/
unsigned long mem_end; /* shared mem end */ 共享内存结束位置
unsigned long mem_start; /* shared mem start */ 共享内存起始位置
unsigned long base_addr; /* device I/O address */ 设备I/0地址
unsigned int irq; /* device IRQ number */ 设备IRQ号
/*
* Some hardware also needs these fields, but they are not
* part of the usual set specified in Space.c.
*/
unsigned char if_port; /* Selectable AUI, TP,..*/
unsigned char dma; /* DMA channel */
unsigned long state;
/*
enum netdev_state_t{
__LINK_STATE_START
__LINK_STATE_PRESENT
__LINK_STATE_NOCARRIER
__LINK_STATE_LINKWATCH_PENDING
__LINK_STATE_DORMANT
}
*/
struct list_head dev_list;
struct list_head napi_list;
struct list_head unreg_list;
/* Net device features */
unsigned long features;
#define NETIF_F_SG 1 /* Scatter/gather IO. */
#define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
#define NETIF_F_IPV6_CSUM 16 /* Can checksum TCP/UDP over IPV6 */
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
#define NETIF_F_GSO 2048 /* Enable software GSO. */
#define NETIF_F_LLTX 4096 /* LockLess TX - deprecated. Please */
/* do not use LLTX in new drivers */
#define NETIF_F_NETNS_LOCAL 8192 /* Does not change network namespaces */
#define NETIF_F_GRO 16384 /* Generic receive offload */
#define NETIF_F_LRO 32768 /* large receive offload */
/* the GSO_MASK reserves bits 16 through 23 */
#define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */
#define NETIF_F_SCTP_CSUM (1 << 25) /* SCTP checksum offload */
#define NETIF_F_FCOE_MTU (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/
#define NETIF_F_NTUPLE (1 << 27) /* N-tuple filters supported */
#define NETIF_F_RXHASH (1 << 28) /* Receive hashing offload */
/* Segmentation offload features */
#define NETIF_F_GSO_SHIFT 16
#define NETIF_F_GSO_MASK 0x00ff0000
#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
#define NETIF_F_FSO (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)
/* List of features with software fallbacks. */
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
/*
* If one device supports one of these features, then enable them
* for all in netdev_increment_features.
*/
#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
NETIF_F_SG | NETIF_F_HIGHDMA | \
NETIF_F_FRAGLIST)
/* Interface index. Unique device identifier */
int ifindex; //接口索引,唯一的设备描述符
int iflink;
struct net_device_stats stats;//该网络设备发送和接收情况的统计数据,对应/sys/class/net/eth0/statistics
#ifdef CONFIG_WIRELESS_EXT
/* List of functions to handle Wireless Extensions (instead of ioctl).
* See for details. Jean II */
const struct iw_handler_def * wireless_handlers;
/* Instance data managed by the core of Wireless Extensions. */
struct iw_public_data * wireless_data;
#endif
/* Management operations */
const struct net_device_ops *netdev_ops; //该网络设备的函数指针,由设备驱动程序实现
const struct ethtool_ops *ethtool_ops;
/* Hardware header description */
const struct header_ops *header_ops; //用于处理链路层头部的函数指针
unsigned int flags; /* interface flags (a la BSD) */
unsigned short gflags;
unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */
unsigned short padded; /* How much padding added by alloc_netdev() */
unsigned char operstate; /* RFC2863 operstate */
unsigned char link_mode; /* mapping policy to operstate */
unsigned int mtu; /* interface MTU value */
最大传输单元,帧的最大长度,不同的网络设备该值不一样,以太网适配器一般为1500字节。所以网络层才为满足该要求可能对传输层的数据进行分组
unsigned short type; /* interface hardware type */ 以太网适配器该值为1
/*
/* ARP protocol HARDWARE identifiers. */
#define ARPHRD_NETROM 0 /* from KA9Q: NET/ROM pseudo */
#define ARPHRD_ETHER 1 /* Ethernet 10Mbps */ 10M以太网
#define ARPHRD_EETHER 2 /* Experimental Ethernet */ 802.2以太网
#define ARPHRD_AX25 3 /* AX.25 Level 2 */
#define ARPHRD_PRONET 4 /* PROnet token ring */
#define ARPHRD_CHAOS 5 /* Chaosnet */
#define ARPHRD_IEEE802 6 /* IEEE 802.2 Ethernet/TR/TB */
#define ARPHRD_ARCNET 7 /* ARCnet */
#define ARPHRD_APPLETLK 8 /* APPLEtalk */
#define ARPHRD_DLCI 15 /* Frame Relay DLCI */
#define ARPHRD_ATM 19 /* ATM */
#define ARPHRD_METRICOM 23 /* Metricom STRIP (new IANA id) */
#define ARPHRD_IEEE1394 24 /* IEEE 1394 IPv4 - RFC 2734 */
#define ARPHRD_EUI64 27 /* EUI-64 */
#define ARPHRD_INFINIBAND 32 /* InfiniBand */
*/
unsigned short hard_header_len; /* hardware hdr length */ 帧头部长度
/* extra head- and tailroom the hardware may need, but not in all cases
* can this be guaranteed, especially tailroom. Some cases also use
* LL_MAX_HEADER instead to allocate the skb.
*/
unsigned short needed_headroom;
unsigned short needed_tailroom;
struct net_device *master; /* Pointer to master device of a group,
* which this device is member of.
*/
/* Interface address info. */
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */ 该网络设备的物理地址
unsigned char addr_len; /* hardware address length */ 地址长度6字节
unsigned short dev_id; /* for shared network cards */
spinlock_t addr_list_lock;
struct netdev_hw_addr_list uc; /* Unicast mac addresses */
struct netdev_hw_addr_list mc; /* Multicast mac addresses */
int uc_promisc;
unsigned int promiscuity;
unsigned int allmulti;
/* Protocol specific pointers */
#ifdef CONFIG_NET_DSA
void *dsa_ptr; /* dsa specific data */
#endif
void *atalk_ptr; /* AppleTalk link */
void *ip_ptr; /* IPv4 specific data */
void *dn_ptr; /* DECnet specific data */
void *ip6_ptr; /* IPv6 specific data */
void *ec_ptr; /* Econet specific data */
void *ax25_ptr; /* AX.25 specific data */
struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
assign before registering */
/*
* Cache line mostly used on receive path (including eth_type_trans())
*/
unsigned long last_rx; /* Time of last Rx */ 最近接收时间
/* Interface address info used in eth_type_trans() */
unsigned char *dev_addr; /* hw address, (before bcast because most packets are unicast) */
struct netdev_hw_addr_list dev_addrs; /* list of device hw addresses */
unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */ 硬件多播地址
#ifdef CONFIG_RPS
struct kset *queues_kset;
struct netdev_rx_queue *_rx;
/* Number of RX queues allocated at alloc_netdev_mq() time */
unsigned int num_rx_queues;
#endif
struct netdev_queue rx_queue;
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
/* Number of TX queues allocated at alloc_netdev_mq() time */
unsigned int num_tx_queues;
/* Number of TX queues currently active in device */
unsigned int real_num_tx_queues;
/* root qdisc from userspace point of view */
struct Qdisc *qdisc;
unsigned long tx_queue_len; /* Max frames per queue allowed */
spinlock_t tx_global_lock;
/*
* One part is mostly used on xmit path (device)
*/
/* These may be needed for future network-power-down code. */
/*
* trans_start here is expensive for high speed devices on SMP,
* please use netdev_queue->trans_start instead.
*/
unsigned long trans_start; /* Time (in jiffies) of last Tx */
int watchdog_timeo; /* used by dev_watchdog() */
struct timer_list watchdog_timer;
/* Number of references to this device */
atomic_t refcnt ____cacheline_aligned_in_smp;
/* delayed register/unregister */
struct list_head todo_list;
/* device index hash chain */
struct hlist_node index_hlist;
struct list_head link_watch_list;
/* register/unregister state machine */
enum { NETREG_UNINITIALIZED=0,
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
NETREG_DUMMY, /* dummy device for NAPI poll */
} reg_state:16;
enum {
RTNL_LINK_INITIALIZED,
RTNL_LINK_INITIALIZING,
} rtnl_link_state:16;
/* Called from unregister, can be used to call free_netdev */
void (*destructor)(struct net_device *dev);
#ifdef CONFIG_NETPOLL
struct netpoll_info *npinfo;
#endif
#ifdef CONFIG_NET_NS
/* Network namespace this network device is inside */
struct net *nd_net; //命名空间
#endif
/* mid-layer private */
void *ml_priv;
/* bridge stuff */
struct net_bridge_port *br_port;
/* macvlan */
struct macvlan_port *macvlan_port;
/* GARP */
struct garp_port *garp_port;
/* class/net/name entry */
struct device dev;
/* space for optional device, statistics, and wireless sysfs groups */
const struct attribute_group *sysfs_groups[4];
/* rtnetlink link ops */
const struct rtnl_link_ops *rtnl_link_ops;
/* VLAN feature mask */
unsigned long vlan_features;
/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE 65536
unsigned int gso_max_size;
#ifdef CONFIG_DCB
/* Data Center Bridging netlink ops */
const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/* max exchange id for FCoE LRO by ddp */
unsigned int fcoe_ddp_xid;
#endif
/* n-tuple filter list attached to this device */
struct ethtool_rx_ntuple_list ethtool_ntuple_list;
};
1.2 网络设备对应的接口服务程序,由驱动程序实现:
参考pci-skeleton.c网卡驱动程序模板实现,源代码中pci网卡设备各个函数的实现:
struct net_device_ops {
int (*ndo_init)(struct net_device *dev); //网络设备初始化
void (*ndo_uninit)(struct net_device *dev); //网络设备注销或失效
int (*ndo_open)(struct net_device *dev); //开启网络设备
int (*ndo_stop)(struct net_device *dev); //关闭网络设备
netdev_tx_t (*ndo_start_xmit) (struct sk_buff *skb,
struct net_device *dev); //将数据帧从等待队列中删除,并且发送出去
u16 (*ndo_select_queue)(struct net_device *dev,struct sk_buff *skb);
//decide which queue to when device supports multiple
* transmit queues.
void (*ndo_change_rx_flags)(struct net_device *dev,
int flags);
void (*ndo_set_rx_mode)(struct net_device *dev);
void (*ndo_set_multicast_list)(struct net_device *dev);
int (*ndo_set_mac_address)(struct net_device *dev,
void *addr);
int (*ndo_validate_addr)(struct net_device *dev);
int (*ndo_do_ioctl)(struct net_device *dev,
struct ifreq *ifr, int cmd); //将特定的命令发送到网卡设备中去
int (*ndo_set_config)(struct net_device *dev,
struct ifmap *map);
int (*ndo_change_mtu)(struct net_device *dev,
int new_mtu); //更改该网络设备的MTU单元大小
int (*ndo_neigh_setup)(struct net_device *dev,
struct neigh_parms *);
void (*ndo_tx_timeout) (struct net_device *dev); //分组传输失效处理
struct net_device_stats* (*ndo_get_stats)(struct net_device *dev);//获得该网络设备的发送/接收统计
void (*ndo_vlan_rx_register)(struct net_device *dev,
struct vlan_group *grp);
void (*ndo_vlan_rx_add_vid)(struct net_device *dev,
unsigned short vid);
void (*ndo_vlan_rx_kill_vid)(struct net_device *dev,
unsigned short vid);
#ifdef CONFIG_NET_POLL_CONTROLLER
void (*ndo_poll_controller)(struct net_device *dev);
void (*ndo_netpoll_cleanup)(struct net_device *dev);
#endif
int (*ndo_set_vf_mac)(struct net_device *dev,
int queue, u8 *mac);
int (*ndo_set_vf_vlan)(struct net_device *dev,
int queue, u16 vlan, u8 qos);
int (*ndo_set_vf_tx_rate)(struct net_device *dev,
int vf, int rate);
int (*ndo_get_vf_config)(struct net_device *dev,
int vf,
struct ifla_vf_info *ivf);
int (*ndo_set_vf_port)(struct net_device *dev,
int vf,
struct nlattr *port[]);
int (*ndo_get_vf_port)(struct net_device *dev,
int vf, struct sk_buff *skb);
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
int (*ndo_fcoe_enable)(struct net_device *dev);
int (*ndo_fcoe_disable)(struct net_device *dev);
int (*ndo_fcoe_ddp_setup)(struct net_device *dev,
u16 xid,
struct scatterlist *sgl,
unsigned int sgc);
int (*ndo_fcoe_ddp_done)(struct net_device *dev,
u16 xid);
#define NETDEV_FCOE_WWNN 0
#define NETDEV_FCOE_WWPN 1
int (*ndo_fcoe_get_wwn)(struct net_device *dev,
u64 *wwn, int type);
#endif
};
1.3 网络设备rx和tx的数据信息的统计
参考/sys/class/net/eth0/statistics目录
struct net_device_stats {
unsigned long rx_packets; /* total packets received */
unsigned long tx_packets; /* total packets transmitted */
unsigned long rx_bytes; /* total bytes received */
unsigned long tx_bytes; /* total bytes transmitted */
unsigned long rx_errors; /* bad packets received */
unsigned long tx_errors; /* packet transmit problems */
unsigned long rx_dropped; /* no space in linux buffers */
unsigned long tx_dropped; /* no space available in linux */
unsigned long multicast; /* multicast packets received */
unsigned long collisions;
/* detailed rx_errors: */
unsigned long rx_length_errors;
unsigned long rx_over_errors; /* receiver ring buff overflow */
unsigned long rx_crc_errors; /* recved pkt with crc error */
unsigned long rx_frame_errors; /* recv'd frame alignment error */
unsigned long rx_fifo_errors; /* recv'r fifo overrun */
unsigned long rx_missed_errors; /* receiver missed packet */
/* detailed tx_errors */
unsigned long tx_aborted_errors;
unsigned long tx_carrier_errors;
unsigned long tx_fifo_errors;
unsigned long tx_heartbeat_errors;
unsigned long tx_window_errors;
/* for cslip etc */
unsigned long rx_compressed;
unsigned long tx_compressed;
};
1.4 网络设备注册全过程
分配并初始化一个网络设备结构net_device:
struct net_device *alloc_netdev(int sizeof_priv, const char *name,void (*setup)(struct net_device *),
unsigned int queue_count)
/**
* alloc_netdev_mq - allocate network device
* @sizeof_priv: size of private data to allocate space for
* @name: device name format string
* @setup: callback to initialize device, 对以太网为:ether_setup,负责填充该结构
* @queue_count: the number of subqueues to allocate
*
* Allocates a struct net_device with private data area for driver use
* and performs basic initialization. Also allocates subquue structs
* for each queue on the device at the end of the netdevice.
*/
注册该net_device 备到内核net命名空间:
如果网络命名空间未指定,则默认为全局网络命名空间init_net
int register_netdevice(struct net_device *dev)
{
int ret;
struct net *net = dev_net(dev);
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
might_sleep();
/* When net_device's are persistent, this will be fatal. */
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
netdev_init_queue_locks(dev);
dev->iflink = -1;
#ifdef CONFIG_RPS
if (!dev->num_rx_queues) {
/*
* Allocate a single RX queue if driver never called
* alloc_netdev_mq
*/
dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
if (!dev->_rx) {
ret = -ENOMEM;
goto out;
}
dev->_rx->first = dev->_rx;
atomic_set(&dev->_rx->count, 1);
dev->num_rx_queues = 1;
}
#endif
/* Init, if this function is available */
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev); //网络设备初始化
if (ret) {
if (ret > 0)
ret = -EIO;
goto out;
}
}
ret = dev_get_valid_name(dev, dev->name, 0); //形成网络设备在网络命名空间中的名字
if (ret)
goto err_uninit;
dev->ifindex = dev_new_index(net); //生成命名空间中的索引
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
/* Fix illegal checksum combinations */
if ((dev->features & NETIF_F_HW_CSUM) &&
(dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
dev->name);
dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
if ((dev->features & NETIF_F_NO_CSUM) &&
(dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
dev->name);
dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
}
dev->features = netdev_fix_features(dev->features, dev->name);
/* Enable software GSO if SG is supported. */
if (dev->features & NETIF_F_SG)
dev->features |= NETIF_F_GSO;
ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
ret = notifier_to_errno(ret);
if (ret)
goto err_uninit;
ret = netdev_register_kobject(dev);//将新网络设备加入到内核对象模型/sys/class/net中
if (ret)
goto err_uninit;
dev->reg_state = NETREG_REGISTERED;
/*
* Default initial state at registry is that the
* device is present.
*/
set_bit(__LINK_STATE_PRESENT, &dev->state);
dev_init_scheduler(dev);
dev_hold(dev);
list_netdevice(dev); //将设备加入到net链表中
/* Notify protocols, that a new device appeared. */
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
if (ret) {
rollback_registered(dev);
dev->reg_state = NETREG_UNREGISTERED;
}
/*
* Prevent userspace races by waiting until the network
* device is fully setup before sending notifications.
*/
if (!dev->rtnl_link_ops ||
dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
out:
return ret;
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
goto out;
}