1 简介
内核功能划分
- 进程管理
- 内存管理
- 文件系统
- 设备控制
- 网络功能
设备和模块分类
- 字符设备
- 块设备
- 网络接口
内核中的并发
- linux系统运行多个并发进程,可能有多个进程同时使用我们的驱动程序。
- 大多数设备能够中断处理器,而中断处理程序异步运行,可能在驱动程序正试图处理其他任务时被调用。
- linux可以运行在对称多处理器(Symmetric multiprocessor,SMP)系统上,因此可能同时有不止一个CPU在运行我们的驱动程序。
Linux内核代码(包括驱动程序代码)必须是可重入的。
2 构造和运行模块
当前进程
<linux/sched.h>头文件中定义了task_struct
/*
* Define 'struct task_struct' and provide the main scheduler
* APIs (schedule(), wakeup variants, etc.)
*/
#include <asm/current.h>
struct task_struct {
...
}
直接访问全局项current
来获得当前进程
#include <linux/thread_info.h>
#define get_current() (current_thread_info()->task)
#define current get_current()
Hello World模块
#include <linux/init.h>
#include <linux/module.h>
MODULE_LICENSE("Dual BSD/GPL");
static int hello_init(void) {
printk(KERN_ALERT "Hello, world\n");
return 0;
}
static void hello_exit(void) {
printk(KERN_ALERT "Goodbye, cruel world\n");
}
module_init(hello_init);
module_exit(hello_exit);
装载和卸载模块
insmod
它将模块的代码和数据装入内核,然后使用内核的符号表解析模块中任何未解析的符号。insmod可以接受一些命令行选项。
内核是如何支持insmod工作的?
实际上它依赖于定义在kernel/module.c中的一个系统调用。函数sys_init_module给模块分配内核内存以便装载模块,然后,该系统调用将模块正文复制到内存区域,并通过内核符号表解析模块中内核引用,最后调用模块的初始化函数。
modprobe
也用来将模块装载到内核中。
modprobe和insmod的区别在于:
modprobe会考虑要装载的模块是否引用了一些当前内核不存在的符号。如果有这类引用,modprobe会在当前模块搜索路径中查找定义了这些符号的其他模块。如果modprobe找到了这些模块(即要装载模块所依赖的模块),它会同时将这些模块装载到内核。
在这种情况下使用insmod,则该命令会失败,并在系统日志文件中记录“unresolved symbols”消息。
rmmod
从内核中移除模块
/sys/module
是sysfs目录层次结构中包含当前已装载模块信息的目录。
/proc/modules
是早期用法,只是在单个文件中包括这些信息。
内核符号表
// 导出符号
EXPORT_SYMBOL(name);
// 导出的符号只能被GPL许可证下的模块使用
EXPORT_SYMBOL_GPL(name);
模块参数
// linux/moduleparam.h
#define module_param(name, type, perm) \
module_param_named(name, name, type, perm)
3 字符设备驱动程序
主设备号和次设备号
内核中,dev_t
类型(在<linux/types.h>
中定义)用来保存设备编号。
dev_t
是一个32位数,12位用来表示主设备号
,其余20位用来表示次设备号
。
// linux/kdev_t.h
#define MINORBITS 20
#define MINORMASK ((1U << MINORBITS) - 1)
// 获取dev_t的主设备号和次设备号
#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS))
#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))
// 将主(次)设备号转换成dev_t类型
#define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi))
分配和释放设备编号
int register_chrdev_region(dev_t, unsigned, const char *);
int alloc_chrdev_region(dev_t *, unsigned, unsigned, const char *);
void unregister_chrdev_region(dev_t, unsigned);
驱动程序应该始终使用alloc_chrdev_region
而不是register_chrdev_region
动态分配的缺点:由于分配的主设备号不能保证始终一致,所以无法预先创建设备节点。
示例:获取主设备号
/*
 * Obtain a range of device numbers.  When a major number was chosen in
 * advance, register that fixed range; otherwise let the kernel allocate a
 * major dynamically and record it in `major`.
 */
if (major) {
    dev = MKDEV(major, minor);
    result = register_chrdev_region(dev, nr_devs, "name");
} else {
    /* alloc_chrdev_region() takes the device name as its fourth argument. */
    result = alloc_chrdev_region(&dev, minor, nr_devs, "name");
    major = MAJOR(dev);
}
if (result < 0) {
    printk(KERN_WARNING "can't get major %d\n", major);
    return result;
}
重要的数据结构
- file_operations
- file
- inode
文件操作
// 来自<linux/fs.h>
struct file_operations {
/*
* 指向拥有该结构的模块的指针
* 内核使用这个字段避免在模块的操作正在被使用时卸载该模块
* 几乎在所有的情况下,该成员会被初始化为THIS_MODULE,定义在<linux/module.h>
*/
struct module *owner;
/*
* 修改文件的当前读写位置
*/
loff_t (*llseek) (struct file *, loff_t, int);
/*
* 用来从设备中读取数据
*/
ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
/*
* 向设备发送数据
*/
ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
int (*iterate_shared) (struct file *, struct dir_context *);
/*
* poll、epoll和select这三个系统调用的后端实现
*/
unsigned int (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
/*
* mmap用于请求将设备内存映射到进程地址空间
*/
int (*mmap) (struct file *, struct vm_area_struct *);
unsigned long mmap_supported_flags;
/*
* 始终是对设备执行的第一个操作,不要求驱动程序一定要声明一个相应的方法。
* 如果这个入口为NULL,设备的打开操作永远成功,但系统不会通知驱动程序。
*/
int (*open) (struct inode *, struct file *);
/*
* 对flush操作的调用发生在进程关闭设备文件描述符副本的时候,它应该执行设备上尚未完结的操作。
*/
int (*flush) (struct file *, fl_owner_t id);
/*
* 当file结构被释放时,将调用这个操作。
*/
int (*release) (struct inode *, struct file *);
/*
* fsync系统调用的后端实现,用户调用它来刷新待处理的数据
*/
int (*fsync) (struct file *, loff_t, loff_t, int datasync);
/*
* 用来通知设备其FASYNC标志发生了变化
*/
int (*fasync) (int, struct file *, int);
/*
* 用于实现文件锁定
*/
int (*lock) (struct file *, int, struct file_lock *);
/*
* sendpage是sendfile系统调用的另外一半,它由内核调用以将数据发送到对应的文件,每次一个数据页
*/
ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
int (*check_flags)(int);
int (*setfl)(struct file *, unsigned long);
int (*flock) (struct file *, int, struct file_lock *);
ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long, struct file_lock **, void **);
long (*fallocate)(struct file *file, int mode, loff_t offset,
loff_t len);
void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
loff_t, size_t, unsigned int);
int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t,
u64);
ssize_t (*dedupe_file_range)(struct file *, u64, u64, struct file *,
u64);
} __randomize_layout;
示例:file_operations结构初始化
struct file_operations xx_fops = {
.owner = THIS_MODULE,
.open = xx_open,
.read = xx_read,
.write = xx_write,
.ioctl = xx_ioctl,
.release = xx_release,
}
file结构
// 来自<linux/fs.h>
struct file {
union {
struct llist_node fu_llist;
struct rcu_head fu_rcuhead;
} f_u;
struct path f_path;
struct inode *f_inode; /* cached value */
/*
* 与文件操作相关的操作
*/
const struct file_operations *f_op;
/*
* Protects f_ep_links, f_flags.
* Must not be taken from IRQ context.
*/
spinlock_t f_lock;
enum rw_hint f_write_hint;
atomic_long_t f_count;
/*
* 文件标志。如O_RDONLY、O_NONBLOCK和O_SYNC。
*/
unsigned int f_flags;
/*
* 文件模式。通过FMODE_READ和FMODE_WRITE来标识文件是否可读或可写或可读写。
* 由于内核在调用驱动程序的read和write方法前已经检查了权限,所以不必为这两个方法检查权限。
* 在没有获得对应访问权限而打开文件的情况下,对文件的读写操作都将被内核拒绝,驱动程序无需为此而作额外的判断。
*/
fmode_t f_mode;
struct mutex f_pos_lock;
/*
* 当前的读/写位置
*/
loff_t f_pos;
struct fown_struct f_owner;
const struct cred *f_cred;
struct file_ra_state f_ra;
u64 f_version;
#ifdef CONFIG_SECURITY
void *f_security;
#endif
/*
* 驱动程序可以将这个字段用于任何目的或者忽略,指向已分配的数据。
* private_data是跨系统调用时保存状态信息的非常有用的资源。
*/
/* needed for tty driver, and maybe others */
void *private_data;
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct list_head f_ep_links;
struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
errseq_t f_wb_err;
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
inode结构
内核用inode结构在内部表示文件。
对于单个文件,可能会有许多个表示打开的文件描述符的file
结构,但它们都指向单个inode
结构。
// 来自<linux/fs.h>
/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
* of the 'struct inode'
*/
struct inode {
...
/*
* 对表示设备文件的inode结构,该字段包含了真正的设备编号
*/
dev_t i_rdev;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
/*
* struct cdev是表示字符设备的内核的内部结构。
* 当inode指向一个字符设备文件时,该字段包含了指向struct cdev结构的指针。
*/
struct cdev *i_cdev;
char *i_link;
unsigned i_dir_seq;
};
...
}
从inode中获取主设备号和次设备号
static inline unsigned iminor(const struct inode *inode)
{
return MINOR(inode->i_rdev);
}
static inline unsigned imajor(const struct inode *inode)
{
return MAJOR(inode->i_rdev);
}
字符设备的注册
// 来自<linux/cdev.h>
struct cdev {
struct kobject kobj;
struct module *owner;
const struct file_operations *ops;
struct list_head list;
dev_t dev;
unsigned int count;
} __randomize_layout;
void cdev_init(struct cdev *, const struct file_operations *);
struct cdev *cdev_alloc(void);
void cdev_put(struct cdev *p);
int cdev_add(struct cdev *, dev_t, unsigned);
void cdev_del(struct cdev *);
container_of
可用于从包含在某个结构体中的指针获得结构本身的指针
// <linux/kernel.h>
/**
* container_of - cast a member of a structure out to the containing structure
* @ptr: the pointer to the member.
* @type: the type of the container struct this is embedded in.
* @member: the name of the member within the struct.
*
*/
#define container_of(ptr, type, member) ({ \
void *__mptr = (void *)(ptr); \
BUILD_BUG_ON_MSG(!__same_type(*(ptr), ((type *)0)->member) && \
!__same_type(*(ptr), void), \
"pointer type mismatch in container_of()"); \
((type *)(__mptr - offsetof(type, member))); })
内存使用
// <linux/slab.h>
void *kmalloc(size_t size, int flags);
void kfree(void *ptr);
在内核空间和用户空间之间拷贝数据
// <linux/uaccess.h>
// 该头文件声明了在内核代码和用户空间之间移动数据的函数
static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n);
static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n);
4 调试技术
打印调试
// <linux/kern_levels.h>
#define KERN_EMERG KERN_SOH "0" /* system is unusable */
#define KERN_ALERT KERN_SOH "1" /* action must be taken immediately */
#define KERN_CRIT KERN_SOH "2" /* critical conditions */
#define KERN_ERR KERN_SOH "3" /* error conditions */
#define KERN_WARNING KERN_SOH "4" /* warning conditions */
#define KERN_NOTICE KERN_SOH "5" /* normal but significant condition */
#define KERN_INFO KERN_SOH "6" /* informational */
#define KERN_DEBUG KERN_SOH "7" /* debug-level messages */
可以使所有内核消息显示到控制台上:
# Write 8 to the printk sysctl file; the redirection target must be the file, not "8".
echo 8 > /proc/sys/kernel/printk
printk_ratelimit
通过跟踪发送到控制台的消息数量工作。如果输出的速度超过一个阈值,printk_ratelimit将返回零,从而避免发送重复消息。
int printk_ratelimit(void);
/*
if (printk_ratelimit()) {
printk(KERN_NOTICE "This message will be printed\n");
}
*/
定制printk_ratelimit
的行为:
/proc/sys/kernel/printk_ratelimit
/proc/sys/kernel/printk_ratelimit_burst
打印设备编号:
// <linux/kdev_t.h>
#define print_dev_t(buffer, dev) \
sprintf((buffer), "%u:%u\n", MAJOR(dev), MINOR(dev))
#define format_dev_t(buffer, dev) \
({ \
sprintf(buffer, "%u:%u", MAJOR(dev), MINOR(dev)); \
buffer; \
})
查询调试
// <linux/seq_file.h>
struct seq_operations {
void * (*start) (struct seq_file *m, loff_t *pos);
void (*stop) (struct seq_file *m, void *v);
void * (*next) (struct seq_file *m, void *v, loff_t *pos);
int (*show) (struct seq_file *m, void *v);
};
监视调试
strace
命令,它可以显示由用户空间程序所发出的所有系统调用。
5 并发和竞态
信号量和互斥体
Linux信号量的实现
// <linux/semaphore.h>
/* Please don't access any members of this structure directly */
struct semaphore {
raw_spinlock_t lock;
unsigned int count;
struct list_head wait_list;
};
#define __SEMAPHORE_INITIALIZER(name, n) \
{ \
.lock = __RAW_SPIN_LOCK_UNLOCKED((name).lock), \
.count = n, \
.wait_list = LIST_HEAD_INIT((name).wait_list), \
}
#define DEFINE_SEMAPHORE(name) \
struct semaphore name = __SEMAPHORE_INITIALIZER(name, 1)
static inline void sema_init(struct semaphore *sem, int val)
{
static struct lock_class_key __key;
*sem = (struct semaphore) __SEMAPHORE_INITIALIZER(*sem, val);
lockdep_init_map(&sem->lock.dep_map, "semaphore->lock", &__key, 0);
}
/*
* 减小了信号量的值
*/
extern void down(struct semaphore *sem);
/*
* 减小了信号量的值
* 但操作是可中断的,它允许等待在某个信号量上的用户空间进程可被用户中断
*/
extern int __must_check down_interruptible(struct semaphore *sem);
/*
* 调用up之后,调用者不再拥有该信号量
* 任何拿到信号量的线程都必须通过一次(只有一次)对up的调用而释放该信号量。
*/
extern void up(struct semaphore *sem);
读写信号量
// <linux/rwsem.h>
/* All arch specific implementations share the same struct */
struct rw_semaphore {
long count;
struct list_head wait_list;
raw_spinlock_t wait_lock;
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
struct optimistic_spin_queue osq; /* spinner MCS lock */
/*
* Write owner. Used as a speculative check to see
* if the owner is running on the cpu.
*/
struct task_struct *owner;
#endif
#ifdef CONFIG_DEBUG_LOCK_ALLOC
struct lockdep_map dep_map;
#endif
};
extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
struct lock_class_key *key);
#define init_rwsem(sem) \
do { \
static struct lock_class_key __key; \
\
__init_rwsem((sem), #sem, &__key); \
} while (0)
/*
* lock for reading
*/
extern void down_read(struct rw_semaphore *sem);
/*
* trylock for reading -- returns 1 if successful, 0 if contention
*/
extern int down_read_trylock(struct rw_semaphore *sem);
/*
* lock for writing
*/
extern void down_write(struct rw_semaphore *sem);
/*
* trylock for writing -- returns 1 if successful, 0 if contention
*/
extern int down_write_trylock(struct rw_semaphore *sem);
/*
* release a read lock
*/
extern void up_read(struct rw_semaphore *sem);
/*
* release a write lock
*/
extern void up_write(struct rw_semaphore *sem);
completion
completion
是一种轻量级的机制,它允许一个线程告诉另外一个线程某个工作已经完成。
// <linux/completion.h>
struct completion {
unsigned int done;
wait_queue_head_t wait;
};
/*
* 创建completion
*/
#define DECLARE_COMPLETION(work) \
struct completion work = COMPLETION_INITIALIZER(work)
/**
* init_completion - Initialize a dynamically allocated completion
* @x: pointer to completion structure that is to be initialized
*
* This inline function will initialize a dynamically created completion
* structure.
*/
static inline void init_completion(struct completion *x)
{
x->done = 0;
init_waitqueue_head(&x->wait);
}
/**
* reinit_completion - reinitialize a completion structure
* @x: pointer to completion structure that is to be reinitialized
*
* This inline function should be used to reinitialize a completion structure so it can
* be reused. This is especially important after complete_all() is used.
*/
static inline void reinit_completion(struct completion *x)
{
x->done = 0;
}
/*
* 等待completion
*/
extern void wait_for_completion(struct completion *);
/*
* complete只会唤醒一个等待线程,complete_all允许唤醒所有等待线程
*/
extern void complete(struct completion *);
extern void complete_all(struct completion *);
completion
机制的典型使用是 模块退出时的内核线程终止
自旋锁
一个自旋锁是一个互斥设备,它只能有两个值:锁定
和 解锁
。
如果锁被其他人获得,则代码进入忙循环并重复检查这个锁,直到该锁可用为止。这个循环就是自旋锁的 自旋
部分。
// <linux/spinlock.h>
#define spin_lock_init(_lock) \
do { \
spinlock_check(_lock); \
raw_spin_lock_init(&(_lock)->rlock); \
} while (0)
static __always_inline void spin_lock(spinlock_t *lock)
{
raw_spin_lock(&lock->rlock);
}
static __always_inline void spin_unlock(spinlock_t *lock)
{
raw_spin_unlock(&lock->rlock);
}
原子变量
Lock()
index++;
UnLock()
完整的锁机制对一个简单的整数来讲却显得有些浪费。
针对这种情况,内核提供了一种原子的整数类型。atomic_t
// <linux/types.h>
typedef struct {
int counter;
} atomic_t;
#ifdef CONFIG_64BIT
typedef struct {
long counter;
} atomic64_t;
#endif
// <linux/atomic.h>
void atomic_set(atomic_t *v, int i);
int atomic_read(atomic_t *v);
void atomic_add(int i, atomic_t *v);
void atomic_sub(int i, atomic_t *v);
void atomic_inc(atomic_t *v);
void atomic_dec(atomic_t *v);
int atomic_add_return(int i, atomic_t *v);
int atomic_sub_return(int i, atomic_t *v);
int atomic_inc_return(atomic_t *v);
int atomic_dec_return(atomic_t *v);
注意:只有当被操作的量本身确实是原子的(即只涉及单个变量)时,atomic_t变量才能正常工作。需要同时操作多个atomic_t变量的场景,仍然需要某种类型的锁。
位操作
void set_bit(nr, void *addr);
void clear_bit(nr, void *addr);
void change_bit(nr, void *addr);
seqlock
提供对共享资源的快速、免锁访问。
当要保护的资源很小,很简单,会频繁被访问而且写入访问很少发生且必须快速时,就可以使用seqlock
。
// <linux/seqlock.h>
typedef struct {
struct seqcount seqcount;
spinlock_t lock;
} seqlock_t;
#define seqlock_init(x) \
do { \
seqcount_init(&(x)->seqcount); \
spin_lock_init(&(x)->lock); \
} while (0)
#define DEFINE_SEQLOCK(x) \
seqlock_t x = __SEQLOCK_UNLOCKED(x)
读取–复制–更新
读取–复制–更新 (read-copy-update,RCU)是一种高级的互斥机制。
// <linux/rcupdate.h>
void rcu_read_lock(void);
void rcu_read_unlock(void);
void call_rcu(struct rcu_head *head, rcu_callback_t func);
6 高级字符驱动程序操作
ioctl
// 用户空间,ioctl系统调用
int ioctl(int fildes, int request, ... /* arg */);
// 驱动程序的ioctl方法
long (*unlocked_ioctl) (struct file *filp, unsigned int cmd, unsigned long arg);
long (*compat_ioctl) (struct file *filp, unsigned int cmd, unsigned long arg);
linux提供了参数cmd统一格式
// <uapi/asm-generic/ioctl.h>
#define _IOC_NRBITS 8
#define _IOC_TYPEBITS 8
#ifndef _IOC_SIZEBITS
# define _IOC_SIZEBITS 14
#endif
#ifndef _IOC_DIRBITS
# define _IOC_DIRBITS 2
#endif
#ifndef _IOC_NONE
# define _IOC_NONE 0U
#endif
#ifndef _IOC_WRITE
# define _IOC_WRITE 1U
#endif
#ifndef _IOC_READ
# define _IOC_READ 2U
#endif
#define _IOC(dir,type,nr,size) \
(((dir) << _IOC_DIRSHIFT) | \
((type) << _IOC_TYPESHIFT) | \
((nr) << _IOC_NRSHIFT) | \
((size) << _IOC_SIZESHIFT))
#ifndef __KERNEL__
#define _IOC_TYPECHECK(t) (sizeof(t))
#endif
/* used to create numbers */
#define _IO(type,nr) _IOC(_IOC_NONE,(type),(nr),0)
#define _IOR(type,nr,size) _IOC(_IOC_READ,(type),(nr),(_IOC_TYPECHECK(size)))
#define _IOW(type,nr,size) _IOC(_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size)))
#define _IOWR(type,nr,size) _IOC(_IOC_READ|_IOC_WRITE,(type),(nr),(_IOC_TYPECHECK(size)))
/* used to decode ioctl numbers.. */
#define _IOC_DIR(nr) (((nr) >> _IOC_DIRSHIFT) & _IOC_DIRMASK)
#define _IOC_TYPE(nr) (((nr) >> _IOC_TYPESHIFT) & _IOC_TYPEMASK)
#define _IOC_NR(nr) (((nr) >> _IOC_NRSHIFT) & _IOC_NRMASK)
#define _IOC_SIZE(nr) (((nr) >> _IOC_SIZESHIFT) & _IOC_SIZEMASK)
type
幻数
nr
顺序编号
dir
如果相关命令涉及数据传输,则该字段定义数据传输的方向。_IOC_NONE
,_IOC_READ
,_IOC_WRITE
size
用户数据大小
使用ioctl参数
当使用一个指针指向用户空间时,必须确保指向的用户空间是合法的。
驱动程序应该负责对每个用到的用户空间地址做适当的检查。
// <asm-generic/uaccess.h>
#define VERIFY_READ 0
#define VERIFY_WRITE 1
#define access_ok(type, addr, size) __access_ok((unsigned long)(addr),(size))
access_ok使用示例
/*
 * Validate the user-space pointer carried in arg before moving any data.
 * The direction and size are decoded with _IOC_DIR() and _IOC_SIZE()
 * (single leading underscore, as defined in <uapi/asm-generic/ioctl.h>).
 * A command flagged _IOC_READ is checked with VERIFY_WRITE, and one flagged
 * _IOC_WRITE is checked with VERIFY_READ.
 */
if (_IOC_DIR(cmd) & _IOC_READ)
    err = !access_ok(VERIFY_WRITE, (void __user *)arg, _IOC_SIZE(cmd));
else if (_IOC_DIR(cmd) & _IOC_WRITE)
    err = !access_ok(VERIFY_READ, (void __user *)arg, _IOC_SIZE(cmd));
/* NOTE(review): err is assumed to be initialised to 0 by the enclosing function - confirm. */
if (err)
    return -EFAULT;
在调用access_ok后,驱动程序就可以安全地进行实际的数据传送了。
除了copy_from_user
和copy_to_user
函数外,还可以使用已经为==最常用的数据大小(1,2,4,8字节)==优化过的一组函数。
// 把x写到用户空间
#define put_user(x, ptr) \
({ \
void *__p = (ptr); \
might_fault(); \
access_ok(VERIFY_WRITE, __p, sizeof(*ptr)) ? \
__put_user((x), ((__typeof__(*(ptr)) *)__p)) : \
-EFAULT; \
})
// 从用户空间接收一个数据
#define get_user(x, ptr) \
({ \
const void *__p = (ptr); \
might_fault(); \
access_ok(VERIFY_READ, __p, sizeof(*ptr)) ? \
__get_user((x), (__typeof__(*(ptr)) *)__p) : \
((x) = (__typeof__(*(ptr)))0,-EFAULT); \
})
权限与受限操作
linux内核提供了一个更为灵活的系统,称为权能(capability)。
内核提供了两个系统调用capget
和capset
,这样就可以从用户空间来管理权能。
#include <sys/capability.h>
int capget(cap_user_header_t header, cap_user_data_t data);
int capset(cap_user_header_t header, const cap_user_data_t data);
权能操作定义:
// <uapi/linux/capability.h>
/**
** POSIX-draft defined capabilities.
**/
/* In a system with the [_POSIX_CHOWN_RESTRICTED] option defined, this
overrides the restriction of changing file ownership and group
ownership. */
#define CAP_CHOWN 0
/* Override all DAC access, including ACL execute access if
[_POSIX_ACL] is defined. Excluding DAC access covered by
CAP_LINUX_IMMUTABLE. */
#define CAP_DAC_OVERRIDE 1
/* Allow interface configuration */
/* Allow administration of IP firewall, masquerading and accounting */
/* Allow setting debug option on sockets */
/* Allow modification of routing tables */
/* Allow setting arbitrary process / process group ownership on
sockets */
/* Allow binding to any address for transparent proxying (also via NET_RAW) */
/* Allow setting TOS (type of service) */
/* Allow setting promiscuous mode */
/* Allow clearing driver statistics */
/* Allow multicasting */
/* Allow read/write of device-specific registers */
/* Allow activation of ATM control sockets */
#define CAP_NET_ADMIN 12
/* Insert and remove kernel modules - modify kernel without limit */
#define CAP_SYS_MODULE 16
/* Allow ioperm/iopl access */
/* Allow sending USB messages to any device via /proc/bus/usb */
#define CAP_SYS_RAWIO 17
/* Allow configuration of the secure attention key */
/* Allow administration of the random device */
/* Allow examination and configuration of disk quotas */
/* Allow setting the domainname */
/* Allow setting the hostname */
/* Allow calling bdflush() */
/* Allow mount() and umount(), setting up new smb connection */
/* Allow some autofs root ioctls */
/* Allow nfsservctl */
/* Allow VM86_REQUEST_IRQ */
/* Allow to read/write pci config on alpha */
/* Allow irix_prctl on mips (setstacksize) */
/* Allow flushing all cache on m68k (sys_cacheflush) */
/* Allow removing semaphores */
/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
and shared memory */
/* Allow locking/unlocking of shared memory segment */
/* Allow turning swap on/off */
/* Allow forged pids on socket credentials passing */
/* Allow setting readahead and flushing buffers on block devices */
/* Allow setting geometry in floppy driver */
/* Allow turning DMA on/off in xd driver */
/* Allow administration of md devices (mostly the above, but some
extra ioctls) */
/* Allow tuning the ide driver */
/* Allow access to the nvram device */
/* Allow administration of apm_bios, serial and bttv (TV) device */
/* Allow manufacturer commands in isdn CAPI support driver */
/* Allow reading non-standardized portions of pci configuration space */
/* Allow DDI debug ioctl on sbpcd driver */
/* Allow setting up serial ports */
/* Allow sending raw qic-117 commands */
/* Allow enabling/disabling tagged queuing on SCSI controllers and sending
arbitrary SCSI commands */
/* Allow setting encryption key on loopback filesystem */
/* Allow setting zone reclaim policy */
#define CAP_SYS_ADMIN 21
/* Allow configuration of tty devices */
/* Allow vhangup() of tty */
#define CAP_SYS_TTY_CONFIG 26
权能的检查通过capable
函数实现:
int capable(int capability);
阻塞型I/O
休眠
当一个进程被置入休眠时,它会被标记为一种特殊状态并从调度器的运行队列中移走。直到某些情况下修改了这个状态,进程才会在任意CPU上调度,也即运行该进程。休眠中的进程会被搁置在一边,等待将来的某个事件发生。对Linux设备驱动程序来讲,让一个进程进入休眠状态很容易。但是,为了将进程以一种安全的方式进入休眠,我们需要牢记两条规则:
-
永远不要在原子上下文中进入休眠
-
当我们被唤醒时,我们永远无法知道休眠了多长时间,或者休眠期间都发生了什么事。
// <linux/wait.h>
typedef struct __wait_queue wait_queue_t;
struct __wait_queue {
unsigned int flags;
void *private;
wait_queue_func_t func;
struct list_head task_list;
};
struct __wait_queue_head {
spinlock_t lock;
struct list_head task_list;
};
typedef struct __wait_queue_head wait_queue_head_t;
#define __WAITQUEUE_INITIALIZER(name, tsk) { \
.private = tsk, \
.func = default_wake_function, \
.task_list = { NULL, NULL } }
#define DECLARE_WAITQUEUE(name, tsk) \
wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
extern void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *);
#define init_waitqueue_head(q) \
do { \
static struct lock_class_key __key; \
\
__init_waitqueue_head((q), #q, &__key); \
} while (0)
在执行阻塞型操作的情况下,应该实现下列动作以保持和标准语义一致:
1、如果一个进程调用了read但是还没有数据可读,此进程必须阻塞。数据到达时进程被唤醒,并把数据返回给调用者。即使数据数目少于count参数指定的数目也是如此。
2、如果一个进程调用了write但缓冲区没有空间,此进程必须阻塞,而且必须休眠在与读取进程不同的等待队列上。当向硬件设备写入一些数据,从而腾出了部分输出缓冲区后,进程即被唤醒,write调用成功。即使缓冲区中可能没有所要求的count字节的空间而只写入了部分数据,也是如此。
poll和select
unsigned int (*poll) (struct file *filp, poll_table *wait);
当用户空间程序在驱动程序关联的文件描述符上执行poll、select或epoll系统调用时,该驱动程序方法将被调用。
该设备方法分为两步处理:
1) 在一个或多个可指示poll状态变化的等待队列上调用poll_wait。如果当前没有文件描述符可用来执行I/O,则内核将使进程在传递到该系统调用的所有文件描述符对应的等待队列上等待。
2) 返回一个用来描述操作是否可以立即无阻塞执行的位掩码。
通过poll_wait
函数,驱动程序向poll_table结构添加一个等待队列:
typedef struct poll_table_struct {
poll_queue_proc _qproc;
unsigned long _key;
} poll_table;
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
if (p && p->_qproc && wait_address)
p->_qproc(filp, wait_address, p);
}
标志用来指明可能的操作:
/* 无阻塞地读取 */
#define POLLIN 0x0001
/* 无阻塞地读取高优先级的数据 */
#define POLLPRI 0x0002
/* 无阻塞地写入 */
#define POLLOUT 0x0004
/* 设备发生错误 */
#define POLLERR 0x0008
/* 当读取设备的进程到达文件尾时,驱动程序必须设置POLLHUP(挂起)位。 */
#define POLLHUP 0x0010
/* 描述字不是一个打开的文件 */
#define POLLNVAL 0x0020
/* 如果普通数据已经就绪,就可以读取,就设置该位。一个可读设备返回(POLLIN | POLLRDNORM) */
#define POLLRDNORM 0x0040
/* 优先级带数据可读 */
#define POLLRDBAND 0x0080
#ifndef POLLWRNORM
/* 普通数据可写 */
#define POLLWRNORM 0x0100
#endif
#ifndef POLLWRBAND
/* 优先级带数据可写 */
#define POLLWRBAND 0x0200
#endif
如果使用设备的程序需要保证输出缓冲区中的数据确实已经被传送出去,驱动程序就必须提供一个fsync方法。
int (*fsync) (struct file *, loff_t, loff_t, int datasync);
异步通知
通过使用异步通知,应用程序可以在数据可用时收到一个信号,而不需要不停地使用轮询来关注数据。
驱动程序怎样实现异步信号:
- F_SETOWN被调用时对filp->f_owner赋值。
- 在执行F_SETFL启用FASYNC时,调用驱动程序的fasync方法。
只要filp->f_flags中的FASYNC标志发生了变化,就会调用fasync方法,以便把这个变化通知驱动程序,使其能正确响应。
文件打开时,FASYNC标志被默认为是清除的。
- 当数据到达时,所有注册为异步通知的进程都会被发送一个SIGIO信号。
int (*fasync) (int, struct file *, int);
// 驱动程序要调用的两个函数:
int fasync_helper(int fd, struct file *, int mode, struct fasync_struct **);
void kill_fasync(struct fasync_struct **, int sig, int band);
当一个打开的文件的FASYNC标志被修改时,调用fasync_helper
以便从相关进程列表中增加或删除文件。
在数据到达时,可使用kill_fasync
通知所有的相关进程。它的参数包括要发送的信号(通常是SIGIO)和带(band),后者几乎总是POLL_IN。
定位设备
如果设备操作未定义llseek
方法,内核默认通过修改filp->f_pos
而执行定位。filp->f_pos
是文件的当前读写位置。
loff_t (*llseek) (struct file *filp, loff_t off, int whence);
// 通过调用nonseekable_open,可以通知内核设备不支持llseek
int nonseekable_open(struct inode * inode, struct file * filp);
设备文件的访问控制
- 独享设备
- 限制每次只由一个用户访问
- 替代EBUSY的阻塞型open
- 在打开时复制设备
17 网络驱动程序
net_device结构细节
网络驱动程序在其模块初始化函数中的注册方法,和字符驱动程序以及块驱动程序不同。
// <linux/netdevice.h>
int register_netdev(struct net_device *dev);
void unregister_netdev(struct net_device *dev);
int register_netdevice_notifier(struct notifier_block *nb);
int unregister_netdevice_notifier(struct notifier_block *nb);
每个接口由一个net_device
结构描述,其定义在<linux/netdevice.h>中。
// <linux/netdevice.h>
/**
* struct net_device - The DEVICE structure.
*
* Actually, this whole structure is a big mistake. It mixes I/O
* data with strictly "high-level" data, and it has to know about
* almost every data structure used in the INET module.
*
* @name: This is the first field of the "visible" part of this structure
* (i.e. as seen by users in the "Space.c" file). It is the name
* of the interface.
*
* @name_hlist: Device name hash chain, please keep it close to name[]
* @ifalias: SNMP alias
* @mem_end: Shared memory end
* @mem_start: Shared memory start
* @base_addr: Device I/O address
* @irq: Device IRQ number
*
* @carrier_changes: Stats to monitor carrier on<->off transitions
*
* @state: Generic network queuing layer state, see netdev_state_t
* @dev_list: The global list of network devices
* @napi_list: List entry used for polling NAPI devices
* @unreg_list: List entry when we are unregistering the
* device; see the function unregister_netdev
* @close_list: List entry used when we are closing the device
* @ptype_all: Device-specific packet handlers for all protocols
* @ptype_specific: Device-specific, protocol-specific packet handlers
*
* @adj_list: Directly linked devices, like slaves for bonding
* @features: Currently active device features
* @hw_features: User-changeable features
*
* @wanted_features: User-requested features
* @vlan_features: Mask of features inheritable by VLAN devices
*
* @hw_enc_features: Mask of features inherited by encapsulating devices
* This field indicates what encapsulation
* offloads the hardware is capable of doing,
* and drivers will need to set them appropriately.
*
* @mpls_features: Mask of features inheritable by MPLS
*
* @ifindex: interface index
* @group: The group the device belongs to
*
* @stats: Statistics struct, which was left as a legacy, use
* rtnl_link_stats64 instead
*
* @rx_dropped: Dropped packets by core network,
* do not use this in drivers
* @tx_dropped: Dropped packets by core network,
* do not use this in drivers
* @rx_nohandler: nohandler dropped packets by core network on
* inactive devices, do not use this in drivers
*
* @wireless_handlers: List of functions to handle Wireless Extensions,
* instead of ioctl,
* see <net/iw_handler.h> for details.
* @wireless_data: Instance data managed by the core of wireless extensions
*
* @netdev_ops: Includes several pointers to callbacks,
* if one wants to override the ndo_*() functions
* @ethtool_ops: Management operations
* @ndisc_ops: Includes callbacks for different IPv6 neighbour
* discovery handling. Necessary for e.g. 6LoWPAN.
* @header_ops: Includes callbacks for creating,parsing,caching,etc
* of Layer 2 headers.
*
* @flags: Interface flags (a la BSD)
* @priv_flags: Like 'flags' but invisible to userspace,
* see if.h for the definitions
* @gflags: Global flags ( kept as legacy )
* @padded: How much padding added by alloc_netdev()
* @operstate: RFC2863 operstate
* @link_mode: Mapping policy to operstate
* @if_port: Selectable AUI, TP, ...
* @dma: DMA channel
* @mtu: Interface MTU value
* @min_mtu: Interface Minimum MTU value
* @max_mtu: Interface Maximum MTU value
* @type: Interface hardware type
* @hard_header_len: Maximum hardware header length.
* @min_header_len: Minimum hardware header length
*
* @needed_headroom: Extra headroom the hardware may need, but not in all
* cases can this be guaranteed
* @needed_tailroom: Extra tailroom the hardware may need, but not in all
* cases can this be guaranteed. Some cases also use
* LL_MAX_HEADER instead to allocate the skb
*
* interface address info:
*
* @perm_addr: Permanent hw address
* @addr_assign_type: Hw address assignment type
* @addr_len: Hardware address length
* @neigh_priv_len: Used in neigh_alloc()
* @dev_id: Used to differentiate devices that share
* the same link layer address
* @dev_port: Used to differentiate devices that share
* the same function
* @addr_list_lock: XXX: need comments on this one
* @uc_promisc: Counter that indicates promiscuous mode
* has been enabled due to the need to listen to
* additional unicast addresses in a device that
* does not implement ndo_set_rx_mode()
* @uc: unicast mac addresses
* @mc: multicast mac addresses
* @dev_addrs: list of device hw addresses
* @queues_kset: Group of all Kobjects in the Tx and RX queues
* @promiscuity: Number of times the NIC is told to work in
* promiscuous mode; if it becomes 0 the NIC will
* exit promiscuous mode
* @allmulti: Counter, enables or disables allmulticast mode
*
* @vlan_info: VLAN info
* @dsa_ptr: dsa specific data
* @tipc_ptr: TIPC specific data
* @atalk_ptr: AppleTalk link
* @ip_ptr: IPv4 specific data
* @dn_ptr: DECnet specific data
* @ip6_ptr: IPv6 specific data
* @ax25_ptr: AX.25 specific data
* @ieee80211_ptr: IEEE 802.11 specific data, assign before registering
*
* @dev_addr: Hw address (before bcast,
* because most packets are unicast)
*
* @_rx: Array of RX queues
* @num_rx_queues: Number of RX queues
* allocated at register_netdev() time
* @real_num_rx_queues: Number of RX queues currently active in device
*
* @rx_handler: handler for received packets
* @rx_handler_data: XXX: need comments on this one
* @miniq_ingress: ingress/clsact qdisc specific data for
* ingress processing
* @ingress_queue: XXX: need comments on this one
* @broadcast: hw bcast address
*
* @rx_cpu_rmap: CPU reverse-mapping for RX completion interrupts,
* indexed by RX queue number. Assigned by driver.
* This must only be set if the ndo_rx_flow_steer
* operation is defined
* @index_hlist: Device index hash chain
*
* @_tx: Array of TX queues
* @num_tx_queues: Number of TX queues allocated at alloc_netdev_mq() time
* @real_num_tx_queues: Number of TX queues currently active in device
* @qdisc: Root qdisc from userspace point of view
* @tx_queue_len: Max frames per queue allowed
* @tx_global_lock: XXX: need comments on this one
*
* @xps_maps: XXX: need comments on this one
* @miniq_egress: clsact qdisc specific data for
* egress processing
* @watchdog_timeo: Represents the timeout that is used by
* the watchdog (see dev_watchdog())
* @watchdog_timer: List of timers
*
* @pcpu_refcnt: Number of references to this device
* @todo_list: Delayed register/unregister
* @link_watch_list: XXX: need comments on this one
*
* @reg_state: Register/unregister state machine
* @dismantle: Device is going to be freed
* @rtnl_link_state: This enum represents the phases of creating
* a new link
*
* @needs_free_netdev: Should unregister perform free_netdev?
* @priv_destructor: Called from unregister
* @npinfo: XXX: need comments on this one
* @nd_net: Network namespace this network device is inside
*
* @ml_priv: Mid-layer private
* @lstats: Loopback statistics
* @tstats: Tunnel statistics
* @dstats: Dummy statistics
* @vstats: Virtual ethernet statistics
*
* @garp_port: GARP
* @mrp_port: MRP
*
* @dev: Class/net/name entry
* @sysfs_groups: Space for optional device, statistics and wireless
* sysfs groups
*
* @sysfs_rx_queue_group: Space for optional per-rx queue attributes
* @rtnl_link_ops: Rtnl_link_ops
*
* @gso_max_size: Maximum size of generic segmentation offload
* @gso_max_segs: Maximum number of segments that can be passed to the
* NIC for GSO
*
* @dcbnl_ops: Data Center Bridging netlink ops
* @num_tc: Number of traffic classes in the net device
* @tc_to_txq: XXX: need comments on this one
* @prio_tc_map: XXX: need comments on this one
*
* @fcoe_ddp_xid: Max exchange id for FCoE LRO by ddp
*
* @priomap: XXX: need comments on this one
* @phydev: Physical device may attach itself
* for hardware timestamping
*
* @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock
* @qdisc_running_key: lockdep class annotating Qdisc->running seqcount
*
* @proto_down: protocol port state information can be sent to the
* switch driver and used to set the phys state of the
* switch port.
*
* FIXME: cleanup struct net_device such that network protocol info
* moves out.
*/
struct net_device {
// Device (interface) name
char name[IFNAMSIZ];
struct hlist_node name_hlist;
struct dev_ifalias __rcu *ifalias;
/*
* I/O specific fields
* FIXME: Merge these and struct ifmap into one
*/
// Device memory information
unsigned long mem_end;
unsigned long mem_start;
// I/O base address of the network interface
unsigned long base_addr;
// Assigned interrupt number
int irq;
/*
* Some hardware also needs these fields (state,dev_list,
* napi_list,unreg_list,close_list) but they are not
* part of the usual set specified in Space.c.
*/
// Device state
unsigned long state;
struct list_head dev_list;
struct list_head napi_list;
struct list_head unreg_list;
struct list_head close_list;
struct list_head ptype_all;
struct list_head ptype_specific;
struct {
struct list_head upper;
struct list_head lower;
} adj_list;
netdev_features_t features;
netdev_features_t hw_features;
netdev_features_t wanted_features;
netdev_features_t vlan_features;
netdev_features_t hw_enc_features;
netdev_features_t mpls_features;
netdev_features_t gso_partial_features;
int ifindex;
int group;
struct net_device_stats stats;
atomic_long_t rx_dropped;
atomic_long_t tx_dropped;
atomic_long_t rx_nohandler;
/* Stats to monitor link on/off, flapping */
atomic_t carrier_up_count;
atomic_t carrier_down_count;
#ifdef CONFIG_WIRELESS_EXT
const struct iw_handler_def *wireless_handlers;
struct iw_public_data *wireless_data;
#endif
const struct net_device_ops *netdev_ops;
const struct ethtool_ops *ethtool_ops;
#ifdef CONFIG_NET_SWITCHDEV
const struct switchdev_ops *switchdev_ops;
#endif
#ifdef CONFIG_NET_L3_MASTER_DEV
const struct l3mdev_ops *l3mdev_ops;
#endif
#if IS_ENABLED(CONFIG_IPV6)
const struct ndisc_ops *ndisc_ops;
#endif
#ifdef CONFIG_XFRM
const struct xfrmdev_ops *xfrmdev_ops;
#endif
const struct header_ops *header_ops;
// Interface flags
unsigned int flags;
unsigned int priv_flags;
unsigned short gflags;
unsigned short padded;
unsigned char operstate;
unsigned char link_mode;
// Selects which port is used on a multi-port device
unsigned char if_port;
// DMA channel allocated to the device
unsigned char dma;
// Maximum transmission unit (MTU)
unsigned int mtu;
unsigned int min_mtu;
unsigned int max_mtu;
// Hardware type of the interface
unsigned short type;
// Length of the hardware header
unsigned short hard_header_len;
unsigned char min_header_len;
unsigned short needed_headroom;
unsigned short needed_tailroom;
/* Interface address info. */
unsigned char perm_addr[MAX_ADDR_LEN];
unsigned char addr_assign_type;
unsigned char addr_len;
unsigned short neigh_priv_len;
unsigned short dev_id;
unsigned short dev_port;
spinlock_t addr_list_lock;
unsigned char name_assign_type;
bool uc_promisc;
struct netdev_hw_addr_list uc;
struct netdev_hw_addr_list mc;
struct netdev_hw_addr_list dev_addrs;
#ifdef CONFIG_SYSFS
struct kset *queues_kset;
#endif
unsigned int promiscuity;
unsigned int allmulti;
/* Protocol-specific pointers */
#if IS_ENABLED(CONFIG_VLAN_8021Q)
struct vlan_info __rcu *vlan_info;
#endif
#if IS_ENABLED(CONFIG_NET_DSA)
struct dsa_port *dsa_ptr;
#endif
#if IS_ENABLED(CONFIG_TIPC)
struct tipc_bearer __rcu *tipc_ptr;
#endif
void *atalk_ptr;
struct in_device __rcu *ip_ptr;
struct dn_dev __rcu *dn_ptr;
struct inet6_dev __rcu *ip6_ptr;
void *ax25_ptr;
struct wireless_dev *ieee80211_ptr;
struct wpan_dev *ieee802154_ptr;
#if IS_ENABLED(CONFIG_MPLS_ROUTING)
struct mpls_dev __rcu *mpls_ptr;
#endif
/*
* Cache lines mostly used on receive path (including eth_type_trans())
*/
/* Interface address info used in eth_type_trans() */
unsigned char *dev_addr;
#ifdef CONFIG_SYSFS
struct netdev_rx_queue *_rx;
unsigned int num_rx_queues;
unsigned int real_num_rx_queues;
#endif
struct bpf_prog __rcu *xdp_prog;
unsigned long gro_flush_timeout;
rx_handler_func_t __rcu *rx_handler;
void __rcu *rx_handler_data;
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_ingress;
#endif
struct netdev_queue __rcu *ingress_queue;
#ifdef CONFIG_NETFILTER_INGRESS
struct nf_hook_entries __rcu *nf_hooks_ingress;
#endif
unsigned char broadcast[MAX_ADDR_LEN];
#ifdef CONFIG_RFS_ACCEL
struct cpu_rmap *rx_cpu_rmap;
#endif
struct hlist_node index_hlist;
/*
* Cache lines mostly used on transmit path
*/
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
unsigned int num_tx_queues;
unsigned int real_num_tx_queues;
struct Qdisc *qdisc;
#ifdef CONFIG_NET_SCHED
DECLARE_HASHTABLE (qdisc_hash, 4);
#endif
// Maximum number of frames that can be queued on the device's transmit queue.
unsigned int tx_queue_len;
spinlock_t tx_global_lock;
int watchdog_timeo;
#ifdef CONFIG_XPS
struct xps_dev_maps __rcu *xps_maps;
#endif
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
#endif
/* These may be needed for future network-power-down code. */
struct timer_list watchdog_timer;
int __percpu *pcpu_refcnt;
struct list_head todo_list;
struct list_head link_watch_list;
enum { NETREG_UNINITIALIZED=0,
NETREG_REGISTERED, /* completed register_netdevice */
NETREG_UNREGISTERING, /* called unregister_netdevice */
NETREG_UNREGISTERED, /* completed unregister todo */
NETREG_RELEASED, /* called free_netdev */
NETREG_DUMMY, /* dummy device for NAPI poll */
} reg_state:8;
bool dismantle;
enum {
RTNL_LINK_INITIALIZED,
RTNL_LINK_INITIALIZING,
} rtnl_link_state:16;
bool needs_free_netdev;
void (*priv_destructor)(struct net_device *dev);
#ifdef CONFIG_NETPOLL
struct netpoll_info __rcu *npinfo;
#endif
possible_net_t nd_net;
/* mid-layer private */
union {
void *ml_priv;
struct pcpu_lstats __percpu *lstats;
struct pcpu_sw_netstats __percpu *tstats;
struct pcpu_dstats __percpu *dstats;
struct pcpu_vstats __percpu *vstats;
};
#if IS_ENABLED(CONFIG_GARP)
struct garp_port __rcu *garp_port;
#endif
#if IS_ENABLED(CONFIG_MRP)
struct mrp_port __rcu *mrp_port;
#endif
struct device dev;
const struct attribute_group *sysfs_groups[4];
const struct attribute_group *sysfs_rx_queue_group;
const struct rtnl_link_ops *rtnl_link_ops;
/* for setting kernel sock attribute on TCP connection setup */
#define GSO_MAX_SIZE 65536
unsigned int gso_max_size;
#define GSO_MAX_SEGS 65535
u16 gso_max_segs;
#ifdef CONFIG_DCB
const struct dcbnl_rtnl_ops *dcbnl_ops;
#endif
u8 num_tc;
struct netdev_tc_txq tc_to_txq[TC_MAX_QUEUE];
u8 prio_tc_map[TC_BITMASK + 1];
#if IS_ENABLED(CONFIG_FCOE)
unsigned int fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
struct netprio_map __rcu *priomap;
#endif
struct phy_device *phydev;
struct lock_class_key *qdisc_tx_busylock;
struct lock_class_key *qdisc_running_key;
bool proto_down;
};
/*
* @param sizeof_priv size of the driver's private data area
* @param name name of the interface, visible in user space. The name may contain a printf-style %d, which the kernel replaces with the next available interface number
* @param setup initialization function used to set up the rest of the net_device structure
*
* NOTE(review): this three-argument form matches older kernels; newer kernels
* appear to define alloc_netdev as a macro taking an additional
* name_assign_type argument - confirm against the target kernel version.
*/
struct net_device *alloc_netdev(int sizeof_priv, const char *name, void (*setup)(struct net_device *));
int register_netdev(struct net_device *dev);
void unregister_netdev(struct net_device *dev);
// <linux/etherdevice.h>
/*
* This function assigns the network device a name of the form eth%d
*/
struct net_device *alloc_etherdev(int sizeof_priv);
数据包传输
内核处理的每个数据包位于一个套接字缓冲区结构(sk_buff)中。
指向sk_buff的指针通常称为skb。
// <linux/skbuff.h>
// Excerpt of the socket buffer structure; "..." marks fields omitted from these notes.
struct sk_buff {
...
// skb->len is the length in octets
unsigned int len;
// skb->data points to the packet to be transmitted
unsigned char *data;
...
}
// Allocates a socket buffer of the requested length (described further in the socket-buffer section of these notes)
struct sk_buff *dev_alloc_skb(unsigned int length);
数据包接收
网络驱动程序可以通过两种模式接收数据包:中断驱动方式和轮询方式。
链路状态的改变
// <linux/netdevice.h>
// When the driver detects that a carrier is present on the device, it calls netif_carrier_on to notify the kernel
void netif_carrier_on(struct net_device *dev);
// When the driver detects that no carrier is present on the device, it calls netif_carrier_off to notify the kernel
void netif_carrier_off(struct net_device *dev);
// Tests the current carrier state
bool netif_carrier_ok(const struct net_device *dev);
套接字缓冲区
// <linux/skbuff.h>
// Excerpt of the socket buffer structure; "..." marks fields omitted from these notes.
struct sk_buff {
...
// The device receiving or sending this buffer
struct net_device *dev;
// Checksum policy for the packet
unsigned char ip_summed;
// Packet classification used while delivering the packet.
// PACKET_HOST this packet is for me
// PACKET_OTHERHOST this packet is not for me
// PACKET_BROADCAST
// PACKET_MULTICAST
unsigned char pkt_type;
// len is the length of all the data in the packet; data_len is the length of the portion stored in separate fragments.
unsigned int len, data_len;
// head points to the beginning of the allocated space;
// data is the beginning of the valid octets;
// tail is the end of the valid octets;
// end points to the maximum address tail can reach;
// available buffer space is skb->end - skb->head
// data space currently in use is skb->tail - skb->data
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head, *data;
...
}
// Allocates a buffer and initializes skb->data and skb->tail to skb->head
struct sk_buff *alloc_skb(unsigned int size, gfp_t priority);
// Calls alloc_skb with GFP_ATOMIC priority and reserves some space between skb->head and skb->data; the network layer uses this space for optimizations, and drivers should not access it.
struct sk_buff *dev_alloc_skb(unsigned int length);
// Frees a buffer; kfree_skb is called internally by the kernel
void kfree_skb(struct sk_buff *skb);
// Variant to be called by drivers
void dev_kfree_skb(struct sk_buff *skb);
// NOTE(review): the summaries below follow the kernel's skbuff API documentation, not these notes - confirm against the target kernel version.
// Presumably extends the used data area at the tail by len bytes and returns a pointer to the newly added region
void *skb_put(struct sk_buff *skb, unsigned int len);
// Presumably extends the used data area at the head by len bytes (moving skb->data back) and returns the new skb->data
void *skb_push(struct sk_buff *skb, unsigned int len);
// Presumably the free space in front of skb->data (between skb->head and skb->data)
unsigned int skb_headroom(const struct sk_buff *skb);
// Presumably the free space after skb->tail
int skb_tailroom(const struct sk_buff *skb);
// Presumably like skb_tailroom but excluding reserved tailroom - TODO confirm
int skb_availroom(const struct sk_buff *skb);
// Presumably advances both skb->data and skb->tail by len to create headroom in an empty buffer
void skb_reserve(struct sk_buff *skb, int len);