linux中文件读写的底层实现（基于linux0.11源码）

本文链接：https://blog.csdn.net/qq_66399980/article/details/127016527

最近在重温linux部分的知识，回过头来看的时候，不禁又产生了新的疑惑：为什么系统IO中的 read/write 函数可以对各种不同类型的文件进行操作呢？它们实现读写的原理和方式又是什么呢？

有的时候难免会好奇，于是乎就动手去追本溯源——直接查看linux源码

****************************标注：以下下载的是linux0.11版本源码****************************************

从linux kernel官网下载了linux0.11版本的源码。

## Linux-0.11 ##

### RTFSC ###

Linus曾经说过：RTFSC - Read The Fucking Source Code.

该代码是目前能够找到的最早的Linux的内核版本。

代码中的注释99%都来源于赵炯老师的那本[Linux-0.11源码完全注释]

第一步

可能有小伙伴会问了，我们第一步该干嘛？下了源码也不知道怎么去看呀！！！

第一步，当然是先查man手册，确认 read/write 函数声明所在的头文件

（注：查看man手册的linux版本为：Linux version 5.10.16.3）
READ(2)                                          Linux Programmer's Manual                                         READ(2)    

NAME
       read - read from a file descriptor

SYNOPSIS
       #include <unistd.h>

       ssize_t read(int fd, void *buf, size_t count);
于是，我们首先得去追溯linux0.11 中unistd的位置了

使用我们最爱的Ctrl+F大法

找到了unistd.h的位置

这个时候，我们可以查看一下其中关于read/write的描写
/* Prototypes only, as found in unistd.h: no function bodies are given here. */
int write(int fildes, const char * buf, off_t count);
int read(int fildes, char * buf, off_t count);
很显然，在unistd中并没有直接对write/read 函数进行定义，而只是进行了声明。

这时候我们查阅赵炯老师的那本[Linux-0.11源码完全注释]，上面写道：

行吧，那我们就再去查sys.h这个文件

第二步

直接跳到sys.h 查看其定义

我们可以看到，这里只有对sys_read和sys_write的声明，而sys又是系统（system）的意思，不出意外的话，它们的实现就是我们苦苦追寻的read/write源码了

打开fs目录下的read_write.c文件

第三步

很快奥，我们一眼就看到了我们要找的源码-------


#include <sys/stat.h>
#include <errno.h>
#include <sys/types.h>

#include <linux/kernel.h>
#include <linux/sched.h>
#include <asm/segment.h>

// Character-device read/write routine.
extern int rw_char(int rw, int dev, char *buf, int count, off_t *pos);
// Pipe read routine.
extern int read_pipe(struct m_inode *inode, char *buf, int count);
// Pipe write routine.
extern int write_pipe(struct m_inode *inode, char *buf, int count);
// Block-device read routine.
extern int block_read(int dev, off_t *pos, char *buf, int count);
// Block-device write routine.
extern int block_write(int dev, off_t *pos, char *buf, int count);
// Regular-file read routine.
extern int file_read(struct m_inode *inode, struct file *filp,
					 char *buf, int count);
// Regular-file write routine.
extern int file_write(struct m_inode *inode, struct file *filp,
					  char *buf, int count);

//// 读文件系统调用
// sys_read - the read() system call.
//
// fd is the file handle, buf the buffer that receives the data and count the
// number of bytes requested.  The call validates its arguments, fetches the
// inode behind the handle and hands the request to the reader matching the
// inode type.  It returns that reader's result, 0 when nothing is to be read,
// -EINVAL for a bad argument or an unrecognised inode type, and -EIO for a
// pipe handle that lacks the read mode bit.
//
// For reference, the two structures consulted below (as quoted by the
// article's author):
//
// struct file {
// 	unsigned short f_mode;
// 	unsigned short f_flags;
// 	unsigned short f_count;
// 	struct m_inode * f_inode;
// 	off_t f_pos;
// };
//
// struct m_inode {
// 	unsigned short i_mode;
// 	unsigned short i_uid;
// 	unsigned long i_size;
// 	unsigned long i_mtime;
// 	unsigned char i_gid;
// 	unsigned char i_nlinks;
// 	unsigned short i_zone[9];
// 	(the fields below exist only in memory)
// 	struct task_struct * i_wait;
// 	unsigned long i_atime;
// 	unsigned long i_ctime;
// 	unsigned short i_dev;
// 	unsigned short i_num;
// 	unsigned short i_count;
// 	unsigned char i_lock;
// 	unsigned char i_dirt;
// 	unsigned char i_pipe;
// 	unsigned char i_mount;
// 	unsigned char i_seek;
// 	unsigned char i_update;
// };
int sys_read(unsigned int fd, char *buf, int count)
{
	struct file *f;
	struct m_inode *ino;

	// Reject an out-of-range handle or a negative count before touching
	// the per-process file table, then reject an unused slot.
	if (fd >= NR_OPEN || count < 0)
		return -EINVAL;
	f = current->filp[fd];
	if (!f)
		return -EINVAL;

	// A zero-length read succeeds trivially.
	if (count == 0)
		return 0;

	// Check the destination buffer, then dispatch on the inode.
	verify_area(buf, count);
	ino = f->f_inode;

	// Pipe: only a handle carrying the read mode bit (1) may read.
	if (ino->i_pipe) {
		if (f->f_mode & 1)
			return read_pipe(ino, buf, count);
		return -EIO;
	}

	// Character and block devices take their device number from i_zone[0].
	if (S_ISCHR(ino->i_mode))
		return rw_char(READ, ino->i_zone[0], buf, count, &f->f_pos);
	if (S_ISBLK(ino->i_mode))
		return block_read(ino->i_zone[0], &f->f_pos, buf, count);

	// Anything that is neither a directory nor a regular file is unknown
	// here: log its mode and fail.
	if (!S_ISDIR(ino->i_mode) && !S_ISREG(ino->i_mode)) {
		printk("(Read)inode->i_mode=%06o\n\r", ino->i_mode);
		return -EINVAL;
	}

	// Directory or regular file: clamp the request so it does not run past
	// the end of the file; nothing left means a 0-byte read.
	if (count + f->f_pos > ino->i_size)
		count = ino->i_size - f->f_pos;
	if (count <= 0)
		return 0;
	return file_read(ino, f, buf, count);
}

//// 写文件系统调用
// sys_write - the write() system call.
//
// fd is the file handle, buf the user buffer holding the data and count the
// number of bytes to write.  After validating the arguments the call fetches
// the inode behind the handle and forwards the request to the writer matching
// the inode type.  It returns that writer's result, 0 for a zero-length
// request, -EINVAL for a bad argument or an unrecognised inode type, and -EIO
// for a pipe handle that lacks the write mode bit.
int sys_write(unsigned int fd, char *buf, int count)
{
	struct file *f;
	struct m_inode *ino;

	// Reject an out-of-range handle or a negative count before touching
	// the per-process file table, then reject an unused slot.
	if (fd >= NR_OPEN || count < 0)
		return -EINVAL;
	f = current->filp[fd];
	if (!f)
		return -EINVAL;

	// A zero-length write succeeds trivially.
	if (count == 0)
		return 0;

	ino = f->f_inode;

	// Pipe: only a handle carrying the write mode bit (2) may write.
	if (ino->i_pipe) {
		if (f->f_mode & 2)
			return write_pipe(ino, buf, count);
		return -EIO;
	}

	// Character and block devices take their device number from i_zone[0].
	if (S_ISCHR(ino->i_mode))
		return rw_char(WRITE, ino->i_zone[0], buf, count, &f->f_pos);
	if (S_ISBLK(ino->i_mode))
		return block_write(ino->i_zone[0], &f->f_pos, buf, count);

	// Any other type that is not a regular file is unknown here: log its
	// mode and fail.
	if (!S_ISREG(ino->i_mode)) {
		printk("(Write)inode->i_mode=%06o\n\r", ino->i_mode);
		return -EINVAL;
	}

	return file_write(ino, f, buf, count);
}

注：源文件中还有sys_lseek()函数的源码，被我去除了

分析代码我们可以看出，其实所谓的read/write函数仍不是最底层的实现，它只是把针对几种不同类型文件的读写操作封装起来的上层接口。

但这无伤大雅，不影响我们理解read,write函数

我们也可以看看以上几个文件读写的操作

从中选两个

在linux进程通信中经常会用到管道，那就用管道读写来做例子吧

/*
 *  linux/fs/pipe.c
 *
 *  (C) 1991  Linus Torvalds
 */

#include <signal.h>

#include <linux/sched.h>
#include <linux/mm.h>	/* for get_free_page */
#include <asm/segment.h>

//// 管道读操作函数
// Read up to count bytes from a pipe into the user buffer.
// inode - the pipe's inode; buf - user buffer pointer; count - bytes wanted.
// Returns the number of bytes actually copied out.
int read_pipe(struct m_inode * inode, char * buf, int count)
{
	int chars, size, read = 0;

    // Keep going while bytes are still wanted. Whenever the pipe is empty
    // (PIPE_SIZE yields 0) wake whoever waits on this inode -- normally the
    // writer. If the inode's reference count is not 2 there is no writer
    // left, so return what has been read so far; otherwise sleep on the
    // inode until woken. (Original note: PIPE_SIZE is defined in fs.h.)
	while (count>0) {
		while (!(size=PIPE_SIZE(*inode))) {
			wake_up(&inode->i_wait);
			if (inode->i_count != 2) /* are there any writers? */
				return read;
			sleep_on(&inode->i_wait);
		}
        // Data is available. chars starts as the distance from the tail
        // index to the end of the page, then is capped by the bytes still
        // wanted (count) and by the bytes present (size). Account for this
        // chunk in count and in the running total.
		chars = PAGE_SIZE-PIPE_TAIL(*inode);
		if (chars > count)
			chars = count;
		if (chars > size)
			chars = size;
		count -= chars;
		read += chars;
        // Reuse size as the copy start index (the old tail), advance the
        // tail by chars and wrap it with the PAGE_SIZE-1 mask, then copy
        // the chunk byte by byte to user space. For a pipe inode, i_size
        // is used as the pointer to the buffer page.
		size = PIPE_TAIL(*inode);
		PIPE_TAIL(*inode) += chars;
		PIPE_TAIL(*inode) &= (PAGE_SIZE-1);
		while (chars-->0)
			put_fs_byte(((char *)inode->i_size)[size++],buf++);
	}
    // Request satisfied: wake any process waiting on the pipe and report
    // the byte count.
	wake_up(&inode->i_wait);
	return read;
}

//// 管道写操作函数。
// Write count bytes from the user buffer into a pipe.
// inode - the pipe's inode; buf - user buffer pointer; count - bytes to write.
// Returns the number of bytes written, or -1 if the pipe has no reader and
// nothing was written.
int write_pipe(struct m_inode * inode, char * buf, int count)
{
	int chars, size, written = 0;

    // Keep going while bytes remain. Whenever the pipe is full (free space
    // size == 0) wake whoever waits on this inode -- normally the reader.
    // If the inode's reference count is not 2 there is no reader left: set
    // the SIGPIPE bit in the current process's signal mask and return the
    // bytes written so far, or -1 if none. Otherwise sleep on the inode
    // until a reader frees some space. (Original note: PIPE_SIZE(),
    // PIPE_HEAD() etc. are defined in fs.h.)
	while (count>0) {
		while (!(size=(PAGE_SIZE-1)-PIPE_SIZE(*inode))) {
			wake_up(&inode->i_wait);
			if (inode->i_count != 2) { /* no readers */
				current->signal |= (1<<(SIGPIPE-1));
				return written?written:-1;
			}
			sleep_on(&inode->i_wait);
		}
        // There is free space (size). chars starts as the distance from the
        // head index to the end of the page -- writing begins at the head --
        // then is capped by the bytes still to write (count) and by the free
        // space (size). Account for this chunk in count and in written.
		chars = PAGE_SIZE-PIPE_HEAD(*inode);
		if (chars > count)
			chars = count;
		if (chars > size)
			chars = size;
		count -= chars;
		written += chars;
        // Reuse size as the copy start index (the old head), advance the
        // head by chars and wrap it with the PAGE_SIZE-1 mask, then copy
        // chars bytes from user space into the buffer. For a pipe inode,
        // i_size is used as the pointer to the buffer page.
		size = PIPE_HEAD(*inode);
		PIPE_HEAD(*inode) += chars;
		PIPE_HEAD(*inode) &= (PAGE_SIZE-1);
		while (chars-->0)
			((char *)inode->i_size)[size++]=get_fs_byte(buf++);
	}
    // Everything written: wake any process waiting on the pipe and report
    // the byte count.
	wake_up(&inode->i_wait);
	return written;
}

//// 创建管道系统调用。
// sys_pipe - the pipe() system call.
//
// Creates a pair of file handles that share one pipe inode and stores them
// in the user-space array fildes: fildes[0] is the read end, fildes[1] the
// write end.  Returns 0 on success and -1 on failure; on failure everything
// claimed so far is released again.
int sys_pipe(unsigned long * fildes)
{
	struct m_inode *pipe_inode;
	struct file *ends[2];
	int handles[2];
	int idx, got;

	// Step 1: claim two unused entries (f_count == 0) of the system file
	// table by bumping their reference counts.
	got = 0;
	for (idx = 0; got < 2 && idx < NR_FILE; idx++) {
		if (file_table[idx].f_count)
			continue;
		ends[got] = &file_table[idx];
		ends[got]->f_count++;
		got++;
	}
	if (got < 2) {
		// Only one entry found: give it back before failing.
		if (got == 1)
			ends[0]->f_count = 0;
		return -1;
	}

	// Step 2: claim two free slots in the current process's handle array
	// and point them at the two file-table entries; the slot index is the
	// file handle.
	got = 0;
	for (idx = 0; got < 2 && idx < NR_OPEN; idx++) {
		if (current->filp[idx])
			continue;
		handles[got] = idx;
		current->filp[idx] = ends[got];
		got++;
	}
	if (got < 2) {
		// Only one slot found: clear it, then release both table entries.
		if (got == 1)
			current->filp[handles[0]] = NULL;
		ends[1]->f_count = 0;
		ends[0]->f_count = 0;
		return -1;
	}

	// Step 3: obtain the pipe inode; on failure undo steps 1 and 2.
	pipe_inode = get_pipe_inode();
	if (!pipe_inode) {
		current->filp[handles[1]] = NULL;
		current->filp[handles[0]] = NULL;
		ends[1]->f_count = 0;
		ends[0]->f_count = 0;
		return -1;
	}

	// Step 4: both ends share the inode and start at position 0; the first
	// is opened for reading, the second for writing.
	ends[1]->f_inode = pipe_inode;
	ends[0]->f_inode = pipe_inode;
	ends[1]->f_pos = 0;
	ends[0]->f_pos = 0;
	ends[0]->f_mode = 1;		/* read */
	ends[1]->f_mode = 2;		/* write */

	// Step 5: hand the two handles back to user space.
	put_fs_long(handles[0], 0 + fildes);
	put_fs_long(handles[1], 1 + fildes);
	return 0;
}

一堆代码写的神乎其神，说的牛逼哄哄。。

是不是都快看晕了？

说白了，在不谈论操作系统内核实现细节的情况下，linux底层对文件的读取并没有那么吓人：

用大白话说就是，先定义

struct file *file;

struct m_inode *inode;

两个指针，然后进行参数有效性判断，验证完缓冲区限制等一系列操作后，取出文件对应的i节点指针，判断文件类型，再针对不同的类型调用不同的底层读写函数罢了。

用我拙劣的画工画一下吧...

最后，还有一个有趣，并且上文没有提到的地方:

在lib目录下的write.c文件中，write函数是通过一个系统调用宏 _syscall3 生成的，这个宏的定义是这样的:


/*
 * _syscall3 - generate a user-space stub for a three-argument system call.
 *
 * Expands to the definition of `type name(atype a, btype b, ctype c)`.
 * The stub executes `int $0x80` with __NR_<name> in the same register that
 * receives the result (the "a" constraint) and the three arguments, cast to
 * long, in the registers selected by the "b", "c" and "d" constraints.
 * A non-negative result is returned cast to `type`; otherwise errno is set
 * to the negated result and -1 is returned.
 */
#define _syscall3(type,name,atype,a,btype,b,ctype,c) \
type name(atype a,btype b,ctype c) \
{ \
long __res; \
__asm__ volatile ("int $0x80" \
	: "=a" (__res) \
	: "0" (__NR_##name),"b" ((long)(a)),"c" ((long)(b)),"d" ((long)(c))); \
if (__res>=0) \
	return (type) __res; \
errno=-__res; \
return -1; \
}

而write.c文件的内容是这样的：

/*
 *  linux/lib/write.c
 *
 *  (C) 1991  Linus Torvalds
 */

// __LIBRARY__ is defined ahead of the include below. Per the original
// annotation, unistd.h is the standard header that defines symbolic
// constants and types and declares functions; when __LIBRARY__ is defined
// it additionally provides the system-call numbers and the inline-assembly
// system-call macros.
#define __LIBRARY__
#include <unistd.h>

// Expands, through the three-argument system-call macro, to the definition
// of int write(int fd, const char *buf, off_t count).
_syscall3(int,write,int,fd,const char *,buf,off_t,count)

所以上面这段代码到底有什么用呢？？