深入理解linux内核的阻塞和非阻塞
感悟
这里还是要写下笔记,再次复习知识点,一遍遍的复习,一遍遍的加深,可能工作中用不到,精力有限也很难再去换一门语言去学习了,但是工作问题不大,这几天通过学习这个再巩固加深知识点吧!!
linux 内核的阻塞和非阻塞操作
在<linux/fs.h>中,记载着非阻塞标志位filp->f_flags的O_NONBLOCK,这个标志位还有一个别的名字叫做O_NDELAY标志位
阻塞操作出现的场景
1.调用read时候套接字缓冲区里没有数据可以读,或者小于count的参数,直到条件成立才会被唤醒
2.调用write的时候套接字缓冲区里没有足够空间可以写,也可能可写入数据小于count,直到条件成立才会被唤醒。
缓冲区存在于大多数系统中,用来保证数据暂时无人读取的时候不会丢失;write的时候即使对端暂时不能读取,也可以保证数据不丢失。
缓冲区
驱动程序实现输出缓冲区可以提高性能,从而减少用户态和内核态的切换次数。
假如一个慢速设备没有输出缓冲区,那么每次系统调用write只能接收一个或者几个字符,然后进程在write上休眠,另一个进程开始运行(这里有一次上下文切换);当前一个进程被唤醒后,它重新开始运行(另一次上下文切换),write返回(内核态切换回用户态),接着进程再次调用write写入更多数据(用户态切换到内核态),接着write又阻塞,然后再次进入上面的循环。如果有输出缓冲区的话,write首次调用就能成功,缓冲的数据稍后在中断的时候发送给设备,而不必返回用户空间后第二次、第三次调用write。显然输出缓冲区适合大多数设备。
总结:
通过缓冲区技术减少write和read的调用,减少内核态和用户态的切换。
非阻塞模式下,在没有数据就绪的时候调用read,或者在没有足够空间的时候调用write,驱动会返回-EAGAIN(用户态表现为系统调用返回-1,errno为EAGAIN),所以非阻塞调用返回后总是要检查errno。
open如果设置了O_NONBLOCK可能会立马返回-EAGAIN,因为可能并没有完成打开就立马返回了,open用非阻塞打开后,会让read和write全部变为了非阻塞的。
一个阻塞IO的例子
首先对着书本写一个IO阻塞的例子
#include <linux/init.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/cdev.h>
#include <linux/fs.h>
#include <linux/uaccess.h> /* copy_*_user */
#include <linux/slab.h>
#include <linux/wait.h>
/*
 * Per-device state of one scull pipe: a circular byte buffer together with
 * the wait queues and the semaphore used by the open/read/write methods.
 */
struct scull_pipe
{
    wait_queue_head_t inq;   /* readers sleep here while the buffer is empty */
    wait_queue_head_t outq;  /* writers sleep here while the buffer is full */
    char *buffer;            /* start of the circular buffer */
    char *end;               /* one past the last byte of the buffer */
    int buffersize;          /* buffer size in bytes */
    char *rp;                /* position the next read starts from */
    char *wp;                /* position the next write starts from */
    int nreaders;            /* incremented on every open for reading */
    int nwriters;            /* incremented on every open for writing */
    struct semaphore sem;    /* held while the buffer pointers are examined or changed */
    struct cdev cdev;        /* embedded char device; used with container_of() in open */
};
/* Size in bytes of the buffer allocated for each pipe in scull_p_open(). */
#define SCULL_P_BUFFER 4000
/* Default major number; 0 makes scull_p_init() request a dynamically allocated major. */
#define SCULL_MAJOR 0
/* Number of pipe devices (and minor numbers) this module creates. */
static int scull_pipe_number = 4;
/* Major number in use; overwritten by scull_p_init() on dynamic allocation. */
int scull_major = SCULL_MAJOR;
/* First minor number requested for the device range. */
int scull_minor = 0;
dev_t scull_pipe_devno; /* Our first device number */
/* Array of device structures, allocated in scull_p_init(). */
static struct scull_pipe *scull_pipe_ptr;
/* Forward declarations of the functions defined further down. */
int scull_p_init(dev_t firstdev);
static int scull_p_open(struct inode* inode, struct file* filp);
ssize_t scull_p_read(struct file *filp, char __user *buf, size_t count,
loff_t *f_pos);
static ssize_t scull_p_write(struct file *filp, const char __user *buf, size_t count,
loff_t *f_pos);
static int scull_getwritespace(struct scull_pipe* dev, struct file* filep);
static int spacefree(struct scull_pipe *dev);
/*
 * File operations installed on every pipe cdev by scull_pipe_setup_dev().
 * NOTE(review): no .release handler is set here, so nothing visible in this
 * file decrements nreaders/nwriters on close -- confirm this is intended.
 */
struct file_operations scull_pipe_fops = {
.owner = THIS_MODULE,
.open = scull_p_open,
.read = scull_p_read,
.write = scull_p_write,
};
static int scull_p_open(struct inode* inode, struct file* filp)
{
struct scull_pipe *dev;
dev = container_of(inode->i_cdev, struct scull_pipe, cdev);
filp->private_data = dev;
if (down_interruptible(&dev->sem))
return -EINTR;
if (!dev->buffer)
{
dev->buffer = kmalloc(SCULL_P_BUFFER, GFP_KERNEL);
if (!dev->buffer)
{
up(&dev->sem);
return -ENOMEM;
}
}
dev->buffersize = SCULL_P_BUFFER;
dev->end = dev->buffer + SCULL_P_BUFFER;
dev->rp = dev->wp = dev->buffer;
if (filp->f_mode & FMODE_READ)
{
dev->nreaders++;
}
if (filp->f_mode & FMODE_WRITE)
{
dev->nwriters++;
}
up(&dev->sem);
return nonseekable_open(inode, filp);
}
/*
 * Read method: copy up to count bytes from the pipe buffer to user space.
 *
 * Blocks while the buffer is empty unless the file was opened with
 * O_NONBLOCK. A single call never reads past the end of the buffer, so it
 * may return fewer bytes than requested.
 *
 * Returns the number of bytes copied, -EAGAIN if the buffer is empty and
 * the file is non-blocking, -ERESTARTSYS if interrupted by a signal, or
 * -EFAULT if the user buffer cannot be written.
 */
ssize_t scull_p_read(struct file *filp, char __user *buf, size_t count,
                loff_t *f_pos)
{
    struct scull_pipe *dev = filp->private_data;

    if (down_interruptible(&dev->sem))
        return -ERESTARTSYS;

    while (dev->rp == dev->wp) { /* nothing to read */
        up(&dev->sem); /* never sleep (or bail out) holding the lock */
        if (filp->f_flags & O_NONBLOCK)
            return -EAGAIN;
        /* all interrupted paths return -ERESTARTSYS, like the first one */
        if (wait_event_interruptible(dev->inq, (dev->rp != dev->wp)))
            return -ERESTARTSYS;
        /* re-take the lock, then loop to re-check the condition under it */
        if (down_interruptible(&dev->sem))
            return -ERESTARTSYS;
    }

    /* data is there: read up to wp, or up to the end if wp has wrapped */
    if (dev->wp > dev->rp)
        count = min(count, (size_t)(dev->wp - dev->rp));
    else
        count = min(count, (size_t)(dev->end - dev->rp));

    if (copy_to_user(buf, dev->rp, count)) {
        up(&dev->sem);
        return -EFAULT;
    }

    dev->rp += count;
    if (dev->rp == dev->end)
        dev->rp = dev->buffer; /* wrapped */
    up(&dev->sem);

    /* space was freed: wake any writer sleeping on outq */
    wake_up_interruptible(&dev->outq);
    return count;
}
/*
 * Number of bytes that can still be written into the pipe buffer.
 * One slot is always kept unused, so an empty buffer reports
 * buffersize - 1 and a full buffer reports 0.
 */
static int spacefree(struct scull_pipe *dev)
{
    int size = dev->buffersize;

    if (dev->rp != dev->wp)
        return ((dev->rp - dev->wp + size) % size) - 1;

    /* rp == wp: the buffer is empty */
    return size - 1;
}
static int scull_getwritespace(struct scull_pipe* dev, struct file* filp)
{
/* Wait for space for writing; caller must hold device semaphore. On
* error the semaphore will be released before returning. */
while (spacefree(dev) == 0) { /* full */
DEFINE_WAIT(wait);
up(&dev->sem);
if (filp->f_flags & O_NONBLOCK)
return -EAGAIN;
prepare_to_wait(&dev->outq, &wait, TASK_INTERRUPTIBLE);
if (spacefree(dev) == 0)
schedule();
finish_wait(&dev->outq, &wait);
if (signal_pending(current))
return -EINTR; /* signal: tell the fs layer to handle it */
if (down_interruptible(&dev->sem))
return -EINTR;
}
return 0;
}
/*
 * Write method: copy up to count bytes from user space into the pipe.
 *
 * Waits for free space through scull_getwritespace(), then accepts as
 * much as fits in one contiguous stretch of the circular buffer, so fewer
 * bytes than requested may be written.
 *
 * Returns the number of bytes accepted, the error reported by
 * scull_getwritespace(), -ERESTARTSYS if interrupted while taking the
 * semaphore, or -EFAULT if the user buffer cannot be read.
 */
static ssize_t scull_p_write(struct file *filp, const char __user *buf, size_t count,
                loff_t *f_pos)
{
    struct scull_pipe *pipe = filp->private_data;
    size_t contiguous;
    int err;

    if (down_interruptible(&pipe->sem))
        return -ERESTARTSYS;

    /* on failure scull_getwritespace() has already dropped the semaphore */
    err = scull_getwritespace(pipe, filp);
    if (err)
        return err;

    /* never accept more than the free space ... */
    count = min(count, (size_t)spacefree(pipe));

    /* ... nor more than the contiguous room starting at wp */
    if (pipe->wp < pipe->rp)
        contiguous = (size_t)(pipe->rp - pipe->wp - 1); /* wrapped: up to rp-1 */
    else
        contiguous = (size_t)(pipe->end - pipe->wp);    /* up to end of buffer */
    count = min(count, contiguous);

    if (copy_from_user(pipe->wp, buf, count)) {
        up(&pipe->sem);
        return -EFAULT;
    }

    pipe->wp += count;
    if (pipe->wp == pipe->end)
        pipe->wp = pipe->buffer; /* wrap around */
    up(&pipe->sem);

    /* data is available now: wake readers sleeping on inq */
    wake_up_interruptible(&pipe->inq);
    return count;
}
static void scull_pipe_setup_dev(struct scull_pipe *dev_ptr, int i)
{
int err;
int dev_no = scull_pipe_devno + i;
cdev_init(&dev_ptr->cdev, &scull_pipe_fops);
dev_ptr->cdev.owner = THIS_MODULE;
err = cdev_add(&dev_ptr->cdev, dev_no, 1);
printk("err:%d\n", err);
if (err)
printk(KERN_NOTICE "Error %d adding scull%d", err, i);
}
/*
 * Reserve the device numbers and set up all pipe devices.
 *
 * firstdev: only used as local storage; it is overwritten with the first
 *           device number of the reserved range.
 *
 * When scull_major is non-zero that major is registered statically,
 * otherwise a major is allocated dynamically and stored in scull_major.
 *
 * Returns the number of devices created (scull_pipe_number) on success,
 * the negative error from the region registration, or -ENOMEM if the
 * device array cannot be allocated.
 */
int scull_p_init(dev_t firstdev)
{
    int i;
    int result;

    printk("number:%d\n", scull_pipe_number);
    printk("scull_major:%d\n", scull_major);

    /* use scull_pipe_number everywhere so the reserved range, the array
     * size and the later unregister call always agree */
    if (scull_major) {
        firstdev = MKDEV(scull_major, scull_minor);
        result = register_chrdev_region(firstdev, scull_pipe_number, "scull_pipe");
        printk("11111\n");
    } else {
        result = alloc_chrdev_region(&firstdev, scull_minor, scull_pipe_number, "scull_pipe");
        scull_major = MAJOR(firstdev);
        printk("22222\n");
    }
    printk("scull_minor:%d\n", scull_minor);
    if (result < 0) {
        printk(KERN_WARNING "scull: can't get major %d\n", scull_major);
        return result;
    }

    firstdev = MKDEV(scull_major, scull_minor);
    scull_pipe_devno = firstdev;
    printk("scull_pipe_devno:%d\n", scull_pipe_devno);

    /* kcalloc returns zeroed memory, so every buffer pointer starts NULL */
    scull_pipe_ptr = kcalloc(scull_pipe_number, sizeof(*scull_pipe_ptr), GFP_KERNEL);
    if (scull_pipe_ptr == NULL) {
        unregister_chrdev_region(firstdev, scull_pipe_number);
        return -ENOMEM; /* report the failure instead of returning 0 */
    }

    for (i = 0; i < scull_pipe_number; i++) {
        init_waitqueue_head(&(scull_pipe_ptr[i].inq));
        init_waitqueue_head(&(scull_pipe_ptr[i].outq));
        sema_init(&(scull_pipe_ptr[i].sem), 1);
        scull_pipe_setup_dev(scull_pipe_ptr + i, i);
    }
    return scull_pipe_number;
}
void scull_p_cleanup(void)
{
int i;
if (!scull_pipe_ptr)
{
return;
}
for(i = 0; i < scull_pipe_number; i++) {
cdev_del(&scull_pipe_ptr[i].cdev);
kfree(scull_pipe_ptr[i].buffer);
}
kfree(scull_pipe_ptr);
printk("devno:%d\n", scull_pipe_devno);
printk("pipe:%d\n", scull_pipe_number);
unregister_chrdev_region(scull_pipe_devno, scull_pipe_number);
scull_pipe_ptr = NULL;
}
/* Module exit point: releases all pipe devices via scull_p_cleanup(). */
static void __exit hello_exit(void)
{
    scull_p_cleanup();
}
/*
 * Module entry point: sets up the pipe devices.
 *
 * Returns 0 on success, or the negative error reported by scull_p_init()
 * so that a failed setup is not reported as a successful module load.
 */
static int __init hello_init(void)
{
    dev_t dev = 0;
    int ret;

    ret = scull_p_init(dev);
    if (ret < 0)
        return ret;

    /* a non-negative result is a device count, not an error */
    return 0;
}
/*
 * Register the module entry and exit points.
 * NOTE(review): no MODULE_LICENSE() declaration is visible in this listing;
 * confirm whether one is needed for the target kernel build.
 */
module_init(hello_init);
module_exit(hello_exit);
首先还是要看
pipe_module_init
字符设备编号复习
这个函数主要作用是申请主设备编号和次设备编号,申请完设备号之后,再为设备号绑定字符设备
申请设备号的核心函数有两个,书中说linux内核更希望我们动态申请设备编号
主要有两个函数
register_chrdev_region(dev, 4, "scull_p");
这里其实是申请了一个主设备号下 4 个连续次设备号的范围,适用于我们事先已经确定要使用哪个主设备号和起始次设备号的情况。
如果我们事先不知道该用哪个主设备号,就让内核动态分配。看似只是申请了一个主设备号,其实内核的chrdevs哈希表有255个槽,每个槽都指向一个链表,已注册的设备号范围就分布在链表上,并且它们不会重复
我们使用
int alloc_chrdev_region(dev_t *dev, unsigned int firstminor, unsigned int count, char *name)
dev 是输出参数,调用成功后保存已分配范围的第一个设备号;firstminor 是我们请求的第一个次设备号;count 是请求的连续设备号个数;name 是与该范围关联的设备名
所以我们便看到了这段程序
if (scull_major)
{
firstdev = MKDEV(scull_major, scull_minor);
result = register_chrdev_region(firstdev, 4, "scull_pipe");
printk("11111\n");
} else {
result = alloc_chrdev_region(&firstdev, scull_minor, 4, "scull_pipe");
scull_major = MAJOR(firstdev);
printk("22222\n");
}
printk("scull_major:%d\n", scull_major);
if (result < 0) {
printk(KERN_WARNING "scull: can't get major %d\n", scull_major);
return result;
}
firstdev = MKDEV(scull_major, scull_minor);
申请完成后我们在内核上申请我们设备驱动对应的结构体
scull_pipe_ptr = kmalloc(scull_pipe_number * sizeof(struct scull_pipe), GFP_KERNEL);
if (scull_pipe_ptr == NULL)
{
unregister_chrdev_region(firstdev, scull_pipe_number);
return 0;
}
memset(scull_pipe_ptr, 0, sizeof(struct scull_pipe) * 4);
for(i = 0; i < scull_pipe_number; i++)
{
//初始化输入的等待队列
init_waitqueue_head(&(scull_pipe_ptr[i].inq));
//初始化输出的等待队列
init_waitqueue_head(&(scull_pipe_ptr[i].outq));
//初始化信号量
sema_init(&(scull_pipe_ptr[i].sem), 1);
scull_pipe_setup_dev(scull_pipe_ptr + i, i);
}
return scull_pipe_number;
再看一个比较关键的scull_pipe_setup_dev,这一步是注册四个字符设备
static void scull_pipe_setup_dev(struct scull_pipe *dev_ptr, int i)
{
int err;
int dev_no = scull_pipe_devno + i;
cdev_init(&dev_ptr->cdev, &scull_pipe_fops);
dev_ptr->cdev.owner = THIS_MODULE;
err = cdev_add(&dev_ptr->cdev, dev_no, 1);
printk("err:%d\n", err);
if (err)
printk(KERN_NOTICE "Error %d adding scull%d", err, i);
}
scull_pipe_devno 很关键:它是用主设备号和第一个次设备号通过 firstdev = MKDEV(scull_major, scull_minor); 生成的起始设备号,第 i 个设备的设备号就是 scull_pipe_devno + i。到此为止我们注册了4个字符设备
申请的主设备号我们可以从
/proc/devices
去查看
挂载的文件我们可以写脚本挂载到 /dev/scull_pipe * 去查看
一张宝图说明流程!
字符设备 的 inode 和 file 结构体复习
inode 主要有两个重要的成员, i_rdev、i_cdev
i_rdev 对应的是设备号
i_cdev 对应的是当时注册时候的cdev 结构体
另一个file 结构是描述符相关信息
读写位置、标志等等。
open函数的玄妙
书中的open函数我认为十分玄妙,经典代码片段莫过于对于((ptr*)0)的使用,不止一次看到这种代码的艺术,这种令人沸腾的美
struct scull_pipe *dev;
dev = container_of(inode->i_cdev, struct scull_pipe, cdev);
filp->private_data = dev;
此中玄妙,尽在container_of
0 指针的使用
#include<stdio.h>
/* Small struct whose member addresses and offset are printed by main(). */
struct test
{
    char i;
    int j;
    char k;
};

/*
 * Prints the address of a local struct test, the address of its member k,
 * and the offset of k obtained by taking the member's address through a
 * null struct pointer.
 */
int main()
{
    struct test temp;

    /* %p expects a void * argument */
    printf("&temp = %p\n", (void *)&temp);
    printf("&temp.k = %p\n", (void *)&temp.k);
    /* size_t with %zu: casting the pointer to int can truncate it on 64-bit targets */
    printf("&((struct test *)0)->k = %zu\n", (size_t)&((struct test *)0)->k);
    return 0;
}
通过0指针巧妙得到了偏移量,然后计算出scull_pipe* scull_pipe_ptr 4个中对应的一个地址
再看read函数
read 函数毫无疑问,比较容易帮我们理解阻塞和非阻塞编程
/*
 * Read method: copy up to count bytes from the pipe buffer to user space.
 *
 * Blocks while the buffer is empty unless the file was opened with
 * O_NONBLOCK. A single call never reads past the end of the buffer, so it
 * may return fewer bytes than requested.
 *
 * Returns the number of bytes copied, -EAGAIN if the buffer is empty and
 * the file is non-blocking, -ERESTARTSYS if interrupted by a signal, or
 * -EFAULT if the user buffer cannot be written.
 */
ssize_t scull_p_read(struct file *filp, char __user *buf, size_t count,
                loff_t *f_pos)
{
    struct scull_pipe *dev = filp->private_data;

    if (down_interruptible(&dev->sem))
        return -ERESTARTSYS;

    while (dev->rp == dev->wp) { /* nothing to read */
        up(&dev->sem); /* never sleep (or bail out) holding the lock */
        if (filp->f_flags & O_NONBLOCK)
            return -EAGAIN;
        /* all interrupted paths return -ERESTARTSYS, like the first one */
        if (wait_event_interruptible(dev->inq, (dev->rp != dev->wp)))
            return -ERESTARTSYS;
        /* re-take the lock, then loop to re-check the condition under it */
        if (down_interruptible(&dev->sem))
            return -ERESTARTSYS;
    }

    /* data is there: read up to wp, or up to the end if wp has wrapped */
    if (dev->wp > dev->rp)
        count = min(count, (size_t)(dev->wp - dev->rp));
    else
        count = min(count, (size_t)(dev->end - dev->rp));

    if (copy_to_user(buf, dev->rp, count)) {
        up(&dev->sem);
        return -EFAULT;
    }

    dev->rp += count;
    if (dev->rp == dev->end)
        dev->rp = dev->buffer; /* wrapped */
    up(&dev->sem);

    /* space was freed: wake any writer sleeping on outq */
    wake_up_interruptible(&dev->outq);
    return count;
}
代码解析
struct scull_pipe *dev = filp->private_data;
if (down_interruptible(&dev->sem))
return -ERESTARTSYS;
这段代码先从 filp->private_data 取出 open 时保存的那个 scull_pipe(4个中的一个),然后获取该设备的信号量。每个设备有自己的信号量,因此对同一个设备(scull_pipe0、scull_pipe1、scull_pipe2、scull_pipe3 中的任意一个)缓冲区的读写是互斥的,不同设备之间互不影响
while(dev->rp == dev->wp) {
up(&dev->sem);
if (filp->f_flags & O_NONBLOCK)
{
return -EAGAIN;
}
if (wait_event_interruptible(dev->inq, (dev->rp != dev->wp)))
{
return -EINTR;
}
if (down_interruptible(&dev->sem))
{
return -EINTR;
}
}
缓冲区为空(dev->rp == dev->wp)时没有数据可读:先释放信号量,如果是非阻塞模式直接返回-EAGAIN;如果是阻塞模式,则用wait_event_interruptible让进程陷入睡眠,直到有数据写入;如果睡眠被信号打断,则直接返回错误码。被唤醒后再次用down_interruptible上锁,并回到while重新检查,确保dev->rp == dev->wp这个判断是在持有锁的情况下做出的。最后重点分析一下
if (dev->wp > dev->rp)
count = min(count, (size_t)(dev->wp - dev->rp));
else
count = min(count, (size_t)(dev->end - dev->rp));
什么时候会出现dev->wp > dev->rp ?
比如说写指针已经写到第 500 个字节处,而我们只读了100个字节,也就是说缓冲区里还有400个字节没读过,此时最多可读 dev->wp - dev->rp 个字节。
count = min(count, (size_t)(dev->wp - dev->rp));
那我们思考什么时候会出现 dev->wp <= dev->rp ?
写指针写到缓冲区末尾后回绕到了缓冲区开头(wp被复位到buffer),而读指针还在它后面。此时一次最多只能读到缓冲区末尾,即缓冲区末尾减去当前读的位置;回绕之后的数据要等下一次read再读
count = min(count, (size_t)(dev->end - dev->rp));
再看write函数
我们要进行write,第一步肯定是要知道是否有空间可以写
/*
 * Number of bytes that can still be written into the pipe buffer.
 * One slot is always kept unused, so an empty buffer reports
 * buffersize - 1 and a full buffer reports 0.
 */
static int spacefree(struct scull_pipe *dev)
{
    int size = dev->buffersize;

    if (dev->rp != dev->wp)
        return ((dev->rp - dev->wp + size) % size) - 1;

    /* rp == wp: the buffer is empty */
    return size - 1;
}
/* Wait for space for writing; caller must hold device semaphore. On
* error the semaphore will be released before returning. */
static int scull_getwritespace(struct scull_pipe* dev, struct file* filp)
{
/* Wait for space for writing; caller must hold device semaphore. On
* error the semaphore will be released before returning. */
while (spacefree(dev) == 0) { /* full */
DEFINE_WAIT(wait);
up(&dev->sem);
if (filp->f_flags & O_NONBLOCK)
return -EAGAIN;
prepare_to_wait(&dev->outq, &wait, TASK_INTERRUPTIBLE);
if (spacefree(dev) == 0)
schedule();
finish_wait(&dev->outq, &wait);
if (signal_pending(current))
return -EINTR; /* signal: tell the fs layer to handle it */
if (down_interruptible(&dev->sem))
return -EINTR;
}
return 0;
}
这里有一个非常重要的技术点:之前read里用的wait_event_interruptible是内核封装好的休眠宏,检查条件、加入等待队列、调用schedule()让出CPU都由它一次完成,我们无法控制中间的步骤。那么我们如何手动完成一次休眠(也就是"高级休眠")呢?
定义一个等待队列的入口
DEFINE_WAIT(wait);
将进程加入等待队列的入口,并且设置进程状态为 TASK_INTERRUPTIBLE
prepare_to_wait(&dev->outq, &wait, TASK_INTERRUPTIBLE);
好了,我们可以进行调度了(调度前要再检查一次条件,避免错过唤醒)
if (spacefree(dev) == 0)
schedule();
调度完成后我们要清理调度任务
finish_wait(&dev->outq, &wait);
查看write函数主干道:
struct scull_pipe *dev = filp->private_data;
int result;
if (down_interruptible(&dev->sem))
return -ERESTARTSYS;
/* Make sure there's space to write */
result = scull_getwritespace(dev, filp);
if (result)
return result; /* scull_getwritespace called up(&dev->sem) */
/* ok, space is there, accept something */
count = min(count, (size_t)spacefree(dev));
if (dev->wp >= dev->rp)
count = min(count, (size_t)(dev->end - dev->wp)); /* to end-of-buf */
else /* the write pointer has wrapped, fill up to rp-1 */
count = min(count, (size_t)(dev->rp - dev->wp - 1));
if (copy_from_user(dev->wp, buf, count)) {
up (&dev->sem);
return -EFAULT;
}
dev->wp += count;
if (dev->wp == dev->end)
dev->wp = dev->buffer; /* wrapped */
up(&dev->sem);
/* finally, awake any reader */
wake_up_interruptible(&dev->inq); /* blocked in read() and select() */
return count;
编写例子测试:
读取程序
#include <iostream>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <cstring>
// Reader test: opens /dev/scull_pipe0, performs one read() and prints the
// bytes received. Returns 0 on success, 1 if open() or read() fails.
int main ()
{
    int fd = open("/dev/scull_pipe0", O_RDWR);
    std::cout << fd << std::endl;
    if (fd < 0) {
        std::cerr << "open /dev/scull_pipe0 failed" << std::endl;
        return 1;
    }

    char data[6] = "haha";
    // Leave room for a terminator so printing cannot run past the array.
    ssize_t res = read(fd, data, sizeof(data) - 1);
    if (res < 0) {
        std::cerr << "read failed" << std::endl;
        close(fd);
        return 1;
    }
    data[res] = '\0';
    std::cout << data << std::endl;

    close(fd);
    return 0;
}
写入程序:
#include <iostream>
#include <unistd.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <cstring>
// Writer test: opens /dev/scull_pipe0 and writes one small buffer to it.
// Returns 0 on success, 1 if open() or write() fails.
int main ()
{
    int fd = open("/dev/scull_pipe0", O_RDWR);
    std::cout << fd << std::endl;
    if (fd < 0) {
        std::cout << strerror(errno) << std::endl;
        return 1;
    }

    char data[6] = "haha";
    ssize_t res = write(fd, data, sizeof(data));
    // errno is only meaningful after a failed call, so report it only then.
    if (res < 0)
        std::cout << strerror(errno) << std::endl;
    else
        std::cout << "wrote " << res << " bytes" << std::endl;

    close(fd);
    return res < 0 ? 1 : 0;
}
编译二进制 运行
zhanglei@ubuntu:~/ourc$ g++ test_read.cpp -o test_read
zhanglei@ubuntu:~/ourc$ g++ test_write.cpp -o test_write
运行demo
执行sudo ./test_read 我们会发现程序发生了阻塞
执行sudo ./test_write 我们会发现read的进程被唤醒!!
实验结束thanks