文章目录
如《net_device的分配与释放》一文所述,net_device对象只有通过register_netdev()注册到系统后才能被外部感知到。类似地,在释放之前需要先通过unregister_netdev()将net_device对象从系统中注销。这篇笔记分析这两个过程的实现。
RTNL锁
在分析注册和注销过程之前,先来看个RTNL锁,因为这两个过程都由这把锁保护,代码如下:
/* The mutex backing the RTNL lock; all helpers below operate on it. */
static DEFINE_MUTEX(rtnl_mutex);
/*
 * rtnl_lock - acquire the RTNL lock.
 *
 * Takes rtnl_mutex with mutex_lock(), so the caller may block here.
 */
void rtnl_lock(void)
{
mutex_lock(&rtnl_mutex);
}
/*
 * __rtnl_unlock - release the RTNL lock and do nothing else.
 *
 * Only drops rtnl_mutex; unlike rtnl_unlock() it does not call
 * netdev_run_todo().
 */
void __rtnl_unlock(void)
{
mutex_unlock(&rtnl_mutex);
}
/*
 * rtnl_unlock - release the RTNL lock and process deferred unregistrations.
 *
 * The mutex is deliberately NOT released here. netdev_run_todo() first
 * snapshots the pending todo list and then drops rtnl_mutex itself through
 * __rtnl_unlock(). Calling mutex_unlock() here as well would release the
 * mutex twice and would let the todo-list snapshot run without RTNL held.
 */
void rtnl_unlock(void)
{
	/* netdev_run_todo() performs the one and only unlock of rtnl_mutex. */
	netdev_run_todo();
}
这是一个互斥锁,获取它时可能会休眠,这意味着net_device的注册和注销不能在原子上下文中执行。
注册net_device
分配好net_device对象并进行初始化后,驱动程序就可以通过register_netdev()向系统注册该net_device对象了。
/**
 * register_netdev - register a network device with the kernel
 * @dev: device to register
 *
 * Convenience wrapper around register_netdevice() that holds the RTNL
 * lock for the whole registration. If @dev->name contains a '%' character
 * it is treated as a format string (for example "eth%d") and
 * dev_alloc_name() is asked to expand it into a concrete name first.
 *
 * Returns the negative value from dev_alloc_name() when name expansion
 * fails; otherwise returns whatever register_netdevice() returns.
 */
int register_netdev(struct net_device *dev)
{
	int rc = 0;

	rtnl_lock();

	/* A '%' in the name means the driver passed a name template. */
	if (strchr(dev->name, '%'))
		rc = dev_alloc_name(dev, dev->name);

	/* Attempt the real registration only if name expansion did not fail. */
	if (rc >= 0)
		rc = register_netdevice(dev);

	rtnl_unlock();
	return rc;
}
register_netdevice()
- 继续初始化net_device的一些字段;
- 如果有指定那么执行ndo_init()回调函数,如果返回失败那么也会导致注册过程失败;
- 确保net_device的名称在namespace内是唯一的。在namespace内分配一个唯一的ifindex;
- 将net_device对象添加到全局数据结构中;
- 发送设备注册通知事件;
/*
 * register_netdevice - register a network device; the caller holds RTNL.
 *
 * Steps, in order:
 *  - initialise the address-list lock and the queue locks;
 *  - run the driver's ndo_init() callback if one is set;
 *  - validate the name, assign an ifindex and reject duplicate names;
 *  - normalise the checksum and GSO feature flags;
 *  - register the kobject, mark the device REGISTERED and PRESENT;
 *  - take a reference, list the device and send NETDEV_REGISTER.
 *
 * Returns 0 on success or a negative errno. A positive ndo_init() result
 * is mapped to -EIO. If a notifier rejects the device, the registration
 * is rolled back and reg_state is set to NETREG_UNREGISTERED.
 */
int register_netdevice(struct net_device *dev)
{
struct hlist_head *head;
struct hlist_node *p;
int ret;
struct net *net = dev_net(dev);
// Must not run while dev_boot_phase is still set (the original note says
// this means net_dev_init() has finished -- confirm against that function).
BUG_ON(dev_boot_phase);
ASSERT_RTNL(); // the caller must hold the RTNL lock
might_sleep();
// Only a device still in the UNINITIALIZED state may be registered
BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
BUG_ON(!net);
// Initialise the address-list lock and the queue locks
spin_lock_init(&dev->addr_list_lock);
netdev_set_addr_lockdep_class(dev);
netdev_init_queue_locks(dev);
dev->iflink = -1;
// Run the driver-supplied ndo_init() callback, if any
if (dev->netdev_ops->ndo_init) {
ret = dev->netdev_ops->ndo_init(dev);
if (ret) {
// Positive return values are normalised to -EIO
if (ret > 0)
ret = -EIO;
// ndo_init() failed, so ndo_uninit() is skipped on this path
goto out;
}
}
// Validate the device name
if (!dev_valid_name(dev->name)) {
ret = -EINVAL;
goto err_uninit;
}
// Allocate the interface index; iflink defaults to it unless ndo_init()
// changed iflink from the -1 set above
dev->ifindex = dev_new_index(net);
if (dev->iflink == -1)
dev->iflink = dev->ifindex;
// Reject the device if its name is already present in the name hash
head = dev_name_hash(net, dev->name);
hlist_for_each(p, head) {
struct net_device *d = hlist_entry(p, struct net_device, name_hlist);
if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
ret = -EEXIST;
goto err_uninit;
}
}
/* Fix illegal checksum combinations */
if ((dev->features & NETIF_F_HW_CSUM) &&
(dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n", dev->name);
dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
if ((dev->features & NETIF_F_NO_CSUM) &&
(dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n", dev->name);
dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
}
dev->features = netdev_fix_features(dev->features, dev->name);
// Enable software GSO whenever scatter-gather is supported
if (dev->features & NETIF_F_SG)
dev->features |= NETIF_F_GSO;
// Register the device with the device model (per the original note this
// creates a directory named after the device under /sys/class/net)
netdev_initialize_kobject(dev);
ret = netdev_register_kobject(dev);
if (ret)
goto err_uninit;
// Mark the device as registered
dev->reg_state = NETREG_REGISTERED;
/*
* Default initial state at registry is that the
* device is present.
*/
// Set the initial link state bit to PRESENT
set_bit(__LINK_STATE_PRESENT, &dev->state);
// Initialise the transmit queueing discipline
dev_init_scheduler(dev);
// Take a reference on the device; rollback_registered() drops it again
dev_hold(dev);
// Add the device to the global tables (name table, index table and
// device list, per the original note)
list_netdevice(dev);
// Send NETDEV_REGISTER to announce the new device
ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
ret = notifier_to_errno(ret);
// If a notifier reported an error, undo the registration and mark the
// device UNREGISTERED
if (ret) {
rollback_registered(dev);
dev->reg_state = NETREG_UNREGISTERED;
}
out:
return ret;
// Failure after ndo_init() succeeded: give the driver a chance to undo it
err_uninit:
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
goto out;
}
注销net_device
/**
* unregister_netdev - remove device from the kernel
* @dev: device
*
* This function shuts down a device interface and removes it
* from the kernel tables.
*
* This is just a wrapper for unregister_netdevice that takes
* the rtnl mutex. In general you want to use this and not
* unregister_netdevice.
*/
void unregister_netdev(struct net_device *dev)
{
// Take the RTNL lock, then run the real unregistration
rtnl_lock();
unregister_netdevice(dev);
// rtnl_unlock() also runs netdev_run_todo(), which finishes the teardown
rtnl_unlock();
}
/*
 * unregister_netdevice - first phase of unregistration; caller holds RTNL.
 *
 * Undoes the registration steps and queues the device on the todo list.
 * The remaining cleanup happens later in netdev_run_todo().
 */
void unregister_netdevice(struct net_device *dev)
{
ASSERT_RTNL();
// Undo what registration did
rollback_registered(dev);
// Queue the device on the todo list; the rest of the cleanup runs from
// rtnl_unlock()
net_set_todo(dev);
}
// The global net_todo_list holds devices whose final teardown is deferred
// until after unregistration; net_todo_list_lock guards insertions into it.
static DEFINE_SPINLOCK(net_todo_list_lock);
static LIST_HEAD(net_todo_list);
/* net_set_todo - append @dev to the tail of net_todo_list under the spinlock. */
static void net_set_todo(struct net_device *dev)
{
spin_lock(&net_todo_list_lock);
list_add_tail(&dev->todo_list, &net_todo_list);
spin_unlock(&net_todo_list_lock);
}
这里需要注意的是todo_list的使用。为了缩短持有RTNL锁的时间,将注销过程分为两个阶段,第一阶段在持有RTNL锁的情况下执行,第二阶段主要是执行一些更加耗时的操作,将其放到todo_list中,在释放RTNL锁的时候执行。
rollback_registered()
/*
 * rollback_registered - undo the steps performed by register_netdevice().
 *
 * Requires RTNL to be held. Warns and returns early for a device that was
 * never registered, and BUGs on any state other than NETREG_REGISTERED.
 * On return reg_state is NETREG_UNREGISTERING and the reference taken at
 * registration has been dropped.
 */
static void rollback_registered(struct net_device *dev)
{
// Preconditions: dev_boot_phase is cleared and the RTNL lock is held
BUG_ON(dev_boot_phase);
ASSERT_RTNL();
// A device that was never registered cannot be unregistered; this is the
// state of a device that has only been allocated
if (dev->reg_state == NETREG_UNINITIALIZED) {
printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
"was registered\n", dev->name, dev);
WARN_ON(1);
return;
}
BUG_ON(dev->reg_state != NETREG_REGISTERED);
// The device may still be up; close it first
dev_close(dev);
// Remove the device from the global tables it was listed in
unlist_netdevice(dev);
// Mark the device as being unregistered
dev->reg_state = NETREG_UNREGISTERING;
// Wait in synchronize_net() before continuing (presumably so that other
// CPUs no longer see the unlisted device -- confirm)
synchronize_net();
// Shut down the transmit queues
dev_shutdown(dev);
// Send NETDEV_UNREGISTER so other modules can release the device
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
// Flush the unicast and multicast chains
dev_addr_discard(dev);
// Call the driver's ndo_uninit() hook, if any
if (dev->netdev_ops->ndo_uninit)
dev->netdev_ops->ndo_uninit(dev);
/* Notifier chain MUST detach us from master device. */
WARN_ON(dev->master);
// Remove the device from the device model
netdev_unregister_kobject(dev);
synchronize_net();
// Drop the reference taken by dev_hold() during registration
dev_put(dev);
}
清理todo list
unregister_netdevice()在末尾调用net_set_todo(),将待注销的net_device对象添加到全局的net_todo_list中;在释放RTNL锁的时候,再由netdev_run_todo()处理该todo list。
/* The sequence is:
*
* rtnl_lock();
* ...
* register_netdevice(x1);
* register_netdevice(x2);
* ...
* unregister_netdevice(y1);
* unregister_netdevice(y2);
* ...
* rtnl_unlock();
* free_netdev(y1);
* free_netdev(y2);
*
* We are invoked by rtnl_unlock().
* This allows us to deal with problems:
* 1) We can delete sysfs objects which invoke hotplug
* without deadlocking with linkwatch via keventd.
* 2) Since we run with the RTNL mutex not held, we can sleep
* safely in order to wait for the netdev refcnt to drop to zero.
*
* We must not return until all unregister events added during
* the interval the lock was held have been completed.
*/
void netdev_run_todo(void)
{
struct list_head list;
// Move the pending entries of net_todo_list onto a private list so the
// slow work below can run on that snapshot after the lock is dropped
list_replace_init(&net_todo_list, &list);
// The RTNL lock can be released now
__rtnl_unlock();
// Walk the snapshot and finish unregistering each device
while (!list_empty(&list)) {
struct net_device *dev = list_entry(list.next, struct net_device, todo_list);
list_del(&dev->todo_list);
// Every queued device is expected to be in the UNREGISTERING state;
// anything else is reported and skipped
if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
printk(KERN_ERR "network todo '%s' but state %d\n",
dev->name, dev->reg_state);
dump_stack();
continue;
}
// Mark unregistration as complete
dev->reg_state = NETREG_UNREGISTERED;
// Run flush_backlog for this device on every CPU
on_each_cpu(flush_backlog, dev, 1);
// Wait until all references to this device have been dropped
netdev_wait_allrefs(dev);
/* paranoia */
BUG_ON(atomic_read(&dev->refcnt));
WARN_ON(dev->ip_ptr);
WARN_ON(dev->ip6_ptr);
WARN_ON(dev->dn_ptr);
// Call the destructor() callback, if one is set
if (dev->destructor)
dev->destructor(dev);
// Drop the device-model reference on the embedded kobject
kobject_put(&dev->dev.kobj);
}
}
netdev_wait_allrefs()
注销net_device对象时,设备接口层框架必须负责通知到所有引用了该net_device对象的模块。为了实现这一目标,需要设备接口层代码和引用该net_device对象的模块互相配合:
- 其它模块监听设备接口层的net_device通知事件,在REGISTER事件处理中用dev_hold()递增该net_device对象的引用计数,在UNREGISTER事件处理时用dev_put()递减引用计数;
- 设备接口层代码在net_device对象注销过程中等待所有外部模块释放引用计数。该等待过程就是通过netdev_wait_allrefs()函数实现的。
/*
* netdev_wait_allrefs - wait until all references are gone.
*
* This is called when unregistering network devices.
*
* Any protocol or device that holds a reference should register
* for netdevice notification, and cleanup and put back the
* reference if they receive an UNREGISTER event.
* We can get stuck here if buggy protocols don't correctly
* call dev_put.
*
* The loop sleeps 250ms per iteration, rebroadcasts NETDEV_UNREGISTER
* once more than 1s has passed since the last broadcast, and logs a
* warning once more than 10s have passed since the last warning.
*/
static void netdev_wait_allrefs(struct net_device *dev)
{
unsigned long rebroadcast_time, warning_time;
rebroadcast_time = warning_time = jiffies;
// Loop until the device's reference count has dropped to zero
while (atomic_read(&dev->refcnt) != 0) {
// Once more than 1s has passed since the last broadcast, resend
// NETDEV_UNREGISTER to prompt other modules to drop their references
if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
rtnl_lock();
/* Rebroadcast unregister notification */
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
// Link-state handling is covered in a separate note
if (test_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) {
/* We must not have linkwatch events
* pending on unregister. If this
* happens, we simply run the queue
* unscheduled, resulting in a noop
* for this device.
*/
linkwatch_run_queue();
}
// Plain unlock: uses __rtnl_unlock(), so netdev_run_todo() is not
// invoked from here
__rtnl_unlock();
rebroadcast_time = jiffies;
}
// Sleep 250ms before checking again
msleep(250);
// Log a warning each time more than 10s have passed since the last one
if (time_after(jiffies, warning_time + 10 * HZ)) {
printk(KERN_EMERG "unregister_netdevice: "
"waiting for %s to become free. Usage count = %d\n",
dev->name, atomic_read(&dev->refcnt));
warning_time = jiffies;
}
}
}
由于netdev_wait_allrefs()会休眠等待,所以去注册过程可能会阻塞一段时间。
net_device注册状态
从前面的代码分析中可以看出,在net_device对象的分配、注册、注销和释放过程中,net_device.reg_state的取值非常关键,它控制了net_device对象的整个生命周期流程。定义的注册状态有如下几种:
/*
 * Excerpt of struct net_device showing only the reg_state field; the
 * "..." below stands for the members that are omitted here.
 */
struct net_device {
enum {
NETREG_UNINITIALIZED=0, /* after alloc_netdev_mq() */
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING,/* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
} reg_state;
...
}
在整个流程中,注册状态的变迁关系见下图: