In the earlier post “linux 内核协议栈原理分析之 tcp 服务器端的 send 过程” (analysis of the TCP server-side send path) we mentioned that sys_socket => sock_create calls sock_alloc, but never explained what it actually does.
Here we walk through how it works; most of the material concerns the file system layer.
Along the way we will also see why, when sending data, write is equivalent to send.
struct socket *sock_alloc(void)
{
struct inode * inode;
struct socket * sock;
inode = get_empty_inode();//get a free inode from the inode cache
if (!inode)
return NULL;
inode->i_sb = sock_mnt->mnt_sb;//point at the sockfs super block
sock = socki_lookup(inode);
inode->i_mode = S_IFSOCK|S_IRWXUGO;
inode->i_sock = 1;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
sock->inode = inode;//the socket keeps a reference to its inode
init_waitqueue_head(&sock->wait);//initialize the wait queue
sock->fasync_list = NULL;
sock->state = SS_UNCONNECTED;//mark it as not yet connected
sock->flags = 0;
sock->ops = NULL;
sock->sk = NULL;
sock->file = NULL;
sockets_in_use[smp_processor_id()].counter++;
return sock;
}
So where does sock_mnt come from? It is a vfsmount (mount) object, initialized in sock_init.
init=>do_basic_setup=>sock_init
void __init sock_init(void)
{
......
register_filesystem(&sock_fs_type);
sock_mnt = kern_mount(&sock_fs_type);
......
}
#define DECLARE_FSTYPE(var,type,read,flags) \
struct file_system_type var = { \
name: type, \
read_super: read, \
fs_flags: flags, \
owner: THIS_MODULE, \
}
static DECLARE_FSTYPE(sock_fs_type, "sockfs", sockfs_read_super,
FS_NOMOUNT|FS_SINGLE);
//after macro expansion this becomes:
static struct file_system_type sock_fs_type = {
name: "sockfs",
read_super: sockfs_read_super,
fs_flags: FS_NOMOUNT|FS_SINGLE,
owner: THIS_MODULE,
};
Registering the file system:
Before the socket file system can be mounted, its type must first be registered with the kernel, i.e. sock_fs_type must be inserted into the global file_systems list;
init=>do_basic_setup=>sock_init=>register_filesystem
int register_filesystem(struct file_system_type * fs)
{
int res = 0;
struct file_system_type ** p;
if (!fs)
return -EINVAL;
if (fs->next)
return -EBUSY;
write_lock(&file_systems_lock);
p = find_filesystem(fs->name);//file system types are keyed by name; fail if one with this name already exists
if (*p)
res = -EBUSY;
else
*p = fs;
write_unlock(&file_systems_lock);
return res;
}
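find_filesystem is not shown above; in this kernel era it is roughly the following walk over the file_systems list (a sketch, not copied from the post), returning the address of the matching slot, or of the terminating NULL slot where a new type can be linked in:
static struct file_system_type **find_filesystem(const char *name)
{
struct file_system_type **p;
for (p = &file_systems; *p; p = &(*p)->next)//walk the singly linked list of registered types
if (strcmp((*p)->name, name) == 0)//stop at the first type with the same name
break;
return p;//*p is non-NULL only if the name is already registered
}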
Mounting the file system:
Now the mount itself. This is done by the kernel internally; user space cannot trigger it:
init=>do_basic_setup=>sock_init=>kern_mount
struct vfsmount *kern_mount(struct file_system_type *type)
{
kdev_t dev = get_unnamed_dev();
struct super_block *sb;//super block pointer
struct vfsmount *mnt;//mount object pointer
if (!dev)
return ERR_PTR(-EMFILE);
sb = read_super(dev, NULL, type, 0, NULL, 0);//read in the super block
if (!sb) {
put_unnamed_dev(dev);
return ERR_PTR(-EINVAL);
}
mnt = add_vfsmnt(NULL, sb->s_root, NULL);//create the mount
if (!mnt) {
kill_super(sb, 0);
return ERR_PTR(-ENOMEM);
}
type->kern_mnt = mnt;
return mnt;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super
static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
struct file_system_type *type, int flags,
void *data, int silent)
{
struct super_block * s;
s = get_empty_super();//allocate a super block object
if (!s)
goto out;
s->s_dev = dev;
s->s_bdev = bdev;
s->s_flags = flags;
s->s_dirt = 0;
sema_init(&s->s_vfs_rename_sem,1);
sema_init(&s->s_nfsd_free_path_sem,1);
s->s_type = type;
sema_init(&s->s_dquot.dqio_sem, 1);
sema_init(&s->s_dquot.dqoff_sem, 1);
s->s_dquot.flags = 0;
lock_super(s);
if (!type->read_super(s, data, silent))//calls sock_fs_type's sockfs_read_super to fill in the super block
goto out_fail;
unlock_super(s);
/* tell bdcache that we are going to keep this one */
if (bdev)
atomic_inc(&bdev->bd_count);
......
out:
return s;//success: return the filled-in super block
out_fail:
......
return NULL;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super
static struct super_operations sockfs_ops = {
statfs: sockfs_statfs,
};
static struct super_block * sockfs_read_super(struct super_block *sb, void *data, int silent)
{
struct inode *root = new_inode(sb);//allocate the root inode; it points back at sb
if (!root)
return NULL;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;//mark it as a directory
root->i_uid = root->i_gid = 0;
root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
sb->s_blocksize = 1024;
sb->s_blocksize_bits = 10;
sb->s_magic = SOCKFS_MAGIC;//super block magic number
sb->s_op = &sockfs_ops;//install the super block operations table
sb->s_root = d_alloc(NULL, &(const struct qstr) { "socket:", 7, 0 });//create the root dentry
if (!sb->s_root) {
iput(root);
return NULL;
}
sb->s_root->d_sb = sb;//every dentry keeps a pointer to its super block for later accesses
sb->s_root->d_parent = sb->s_root;//this is the root of the socket file system; it has no parent, so it points to itself
d_instantiate(sb->s_root, root);//bind the dentry to the inode
return sb;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super=>d_alloc
struct dentry * d_alloc(struct dentry * parent, const struct qstr *name)
{
char * str;
struct dentry *dentry;
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); //allocate a dentry from the dentry slab cache
if (!dentry)
return NULL;
if (name->len > DNAME_INLINE_LEN-1) {//name too long for the inline buffer, allocate it separately
str = kmalloc(NAME_ALLOC_LEN(name->len), GFP_KERNEL);
if (!str) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
} else
str = dentry->d_iname;
memcpy(str, name->name, name->len);//copy the name into the chosen buffer (d_iname or the kmalloc'd one)
str[name->len] = 0;
atomic_set(&dentry->d_count, 1);//reference count starts at 1
dentry->d_flags = 0;
dentry->d_inode = NULL;
dentry->d_parent = NULL;
dentry->d_sb = NULL;
dentry->d_name.name = str;
dentry->d_name.len = name->len;
dentry->d_name.hash = name->hash;
dentry->d_op = NULL;
dentry->d_fsdata = NULL;
INIT_LIST_HEAD(&dentry->d_vfsmnt);//mount list; a directory may be mounted on more than once
INIT_LIST_HEAD(&dentry->d_hash);//every dentry eventually gets linked into the dentry_hashtable
INIT_LIST_HEAD(&dentry->d_lru);//a dentry may be placed on the least-recently-used list
INIT_LIST_HEAD(&dentry->d_subdirs);//a directory may have several children; each child hangs off the parent's d_subdirs via its d_child
INIT_LIST_HEAD(&dentry->d_alias);//one inode may have several dentries (aliases); each hangs off the inode's i_dentry via d_alias
if (parent) {
dentry->d_parent = dget(parent);
dentry->d_sb = parent->d_sb;
spin_lock(&dcache_lock);
list_add(&dentry->d_child, &parent->d_subdirs);
spin_unlock(&dcache_lock);
} else
INIT_LIST_HEAD(&dentry->d_child);
dentry_stat.nr_dentry++;
return dentry;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super=>d_instantiate
void d_instantiate(struct dentry *entry, struct inode * inode)
{
spin_lock(&dcache_lock);
if (inode)
list_add(&entry->d_alias, &inode->i_dentry);//link the entry into the inode's alias list: a file has only one inode but may be reached through several dentries
entry->d_inode = inode;//point the dentry at the inode so that later lookups through this dentry can reach it
spin_unlock(&dcache_lock);
}
With the super block read in, let's look at the mount step itself:
static struct vfsmount *add_vfsmnt(struct nameidata *nd,
struct dentry *root,
const char *dev_name)
{
struct vfsmount *mnt;
struct super_block *sb = root->d_inode->i_sb;//get the super block from the root dentry's inode
char *name;
mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);//allocate the vfsmount object
if (!mnt)
goto out;
memset(mnt, 0, sizeof(struct vfsmount));
if (nd || dev_name)
mnt->mnt_flags = MNT_VISIBLE;
/* It may be NULL, but who cares? */
if (dev_name) {
name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
if (name) {
strcpy(name, dev_name);
mnt->mnt_devname = name;
}
}
mnt->mnt_owner = current->uid;
atomic_set(&mnt->mnt_count,1);//reference count starts at 1
mnt->mnt_sb = sb;//save the super block pointer; this is the sock_mnt->mnt_sb we saw in sock_alloc
spin_lock(&dcache_lock);
if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
goto fail;
mnt->mnt_root = dget(root);//the mount's root dentry
mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);//no mount point was given (nd is NULL), so the mount point is the root dentry itself
mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;//no parent mount either, so this mount is its own parent
if (nd) {
list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
} else {
INIT_LIST_HEAD(&mnt->mnt_child);//mnt_child would link this mount into its parent's mnt_mounts list; there is no parent here, so it stays an empty list head
INIT_LIST_HEAD(&mnt->mnt_clash);
}
INIT_LIST_HEAD(&mnt->mnt_mounts);
list_add(&mnt->mnt_instances, &sb->s_mounts);//link this mount into the super block's list of mounts
list_add(&mnt->mnt_list, vfsmntlist.prev);
spin_unlock(&dcache_lock);
out:
return mnt;
fail:
spin_unlock(&dcache_lock);
if (mnt->mnt_devname)
kfree(mnt->mnt_devname);
kfree(mnt);
return NULL;
}
That completes the mount: the kernel created and read in a super block, which in practice means creating a root dentry named "socket:" together with its inode and linking the two, and then creating a vfsmount object tied to the super block, the root inode and the root dentry.
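To summarize those relationships in code form, here is a small sanity-check sketch (check_sockfs_mount is not kernel code, just an illustration); each assertion follows from the functions above:
static void check_sockfs_mount(struct vfsmount *mnt)
{
struct super_block *sb = mnt->mnt_sb;//set in add_vfsmnt
BUG_ON(sb->s_magic != SOCKFS_MAGIC);//set in sockfs_read_super
BUG_ON(mnt->mnt_root != sb->s_root);//kern_mount passed sb->s_root to add_vfsmnt
BUG_ON(sb->s_root->d_parent != sb->s_root);//the "socket:" dentry is its own parent
BUG_ON(sb->s_root->d_inode->i_sb != sb);//new_inode(sb) pointed the root inode at sb
BUG_ON(mnt->mnt_parent != mnt);//no parent mount was given, so it points to itself
}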
Now back to socki_lookup from the code at the top: it returns the socket object, and sock_alloc stores the freshly allocated inode's address in that socket's inode field.
struct inode {
......
union {
struct minix_inode_info minix_i;
struct ext2_inode_info ext2_i;
......
struct socket socket_i;
struct usbdev_inode_info usbdev_i;
void *generic_ip;
} u;
};
extern __inline__ struct socket *socki_lookup(struct inode *inode)
{
return &inode->u.socket_i;//the struct socket was allocated together with the inode (as a union member), so we can just take its address
}
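In other words, the inode and the socket live in one allocation, so converting between them is just field access. A tiny illustration (illustrative only, not a real kernel code path):
struct socket *sock = sock_alloc();//allocates the inode and, inside it, the embedded socket
struct inode *inode = sock->inode;//sock_alloc stored the back pointer
BUG_ON(socki_lookup(inode) != sock);//&inode->u.socket_i is the very same object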
Creating a socket:
In sys_socket there is one more step closely tied to the socket file system: sock_map_fd. Let's look at its implementation:
static int sock_map_fd(struct socket *sock)
{
int fd;
struct qstr this;
char name[32];
/*
* Find a file descriptor suitable for return to the user.
*/
fd = get_unused_fd();//grab a free file descriptor
if (fd >= 0) {
struct file *file = get_empty_filp();//grab a free file object
if (!file) {
put_unused_fd(fd);
fd = -ENFILE;
goto out;
}
sprintf(name, "[%lu]", sock->inode->i_ino);//the file name is built from the inode number
this.name = name;
this.len = strlen(name);
this.hash = sock->inode->i_ino;
file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);//create a dentry with that name under the sockfs root
if (!file->f_dentry) {
put_filp(file);
put_unused_fd(fd);
fd = -ENOMEM;
goto out;
}
file->f_dentry->d_op = &sockfs_dentry_operations;//install the dentry operations table
d_add(file->f_dentry, sock->inode);//bind the dentry to the inode and hash the dentry into dentry_hashtable
file->f_vfsmnt = mntget(sock_mnt);//the file points at the sockfs mount object
sock->file = file;
file->f_op = sock->inode->i_fop = &socket_file_ops;//install the file operations table; this is why a user-space write() can also send data: it eventually calls sock->ops->sendmsg, which for the inet family is inet_sendmsg
file->f_mode = 3;
file->f_flags = O_RDWR;
file->f_pos = 0;
fd_install(fd, file);//bind the fd to the file
}
out:
return fd;
}
sys_socket=>sock_map_fd=>d_add
static __inline__ void d_add(struct dentry * entry, struct inode * inode)
{
d_instantiate(entry, inode);//bind the dentry to the inode
d_rehash(entry);//compute the dentry's hash and link it into dentry_hashtable
}
sys_socket=>sock_map_fd=>d_add=>d_rehash
void d_rehash(struct dentry * entry)
{
struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);//compute the hash and find the matching bucket
spin_lock(&dcache_lock);
list_add(&entry->d_hash, list);//link the dentry into that bucket's chain
spin_unlock(&dcache_lock);
}
sys_socket=>sock_map_fd=>d_add=>d_rehash=>d_hash
A neat trick:
The hash mixes in the address of the parent dentry, which keeps collisions down. Consider ./zhangsan/project/src and ./lisi/project/src: if only the current name (say "project" or "src") were used as the key, the two entries would collide immediately; mixing in the parent's name would still collide easily, and hashing the full path would be too slow. By using the address of the parent dentry as part of the hash, the chance that the two "src" entries collide drops dramatically (see the short illustration after the function below).
static inline struct list_head * d_hash(struct dentry * parent, unsigned long hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
hash = hash ^ (hash >> D_HASHBITS) ^ (hash >> D_HASHBITS*2);
return dentry_hashtable + (hash & D_HASHMASK);
}
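A quick illustration of that point (project_of_zhangsan and project_of_lisi are hypothetical dentries for the two "project" directories, src_name_hash is the name hash of "src"):
struct list_head *bucket1 = d_hash(project_of_zhangsan, src_name_hash);// ./zhangsan/project/src
struct list_head *bucket2 = d_hash(project_of_lisi, src_name_hash);// ./lisi/project/src
/* same name hash, different parent addresses => almost always different buckets */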
sockfs_dentry_operations: most entries are left NULL, so the default VFS handling applies
static struct dentry_operations sockfs_dentry_operations = {
d_delete: sockfs_delete_dentry,
};
socket_file_ops:
static struct file_operations socket_file_ops = {
llseek: sock_lseek,
read: sock_read,
write: sock_write,
poll: sock_poll,
ioctl: sock_ioctl,
mmap: sock_mmap,
open: sock_no_open, /* special open code to disallow open via /proc */
release: sock_close,
fasync: sock_fasync,
readv: sock_readv,
writev: sock_writev
};
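Before turning to write, here is a hedged sketch of the chain that sock_map_fd has just built and that the write path will walk back down to reach the socket (a fragment, not kernel code):
int fd = sock_map_fd(sock);//fd_install(): current->files->fd[fd] = file
struct file *file = fget(fd);//sys_write starts from the fd
struct inode *inode = file->f_dentry->d_inode;//the dentry that d_add bound to sock->inode
BUG_ON(socki_lookup(inode) != sock);//sock_write recovers the original socket this way
fput(file);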
write vs. send:
Let's take write as an example and check that it has the same effect as sending data with send from user space.
asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
ssize_t ret;
struct file * file;
ret = -EBADF;
file = fget(fd);
if (file) {
if (file->f_mode & FMODE_WRITE) {
struct inode *inode = file->f_dentry->d_inode;//find the corresponding inode
ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file,
file->f_pos, count);
if (!ret) {
ssize_t (*write)(struct file *, const char *, size_t, loff_t *);
ret = -EINVAL;
if (file->f_op && (write = file->f_op->write) != NULL)
ret = write(file, buf, count, &file->f_pos);//for a socket file this is sock_write
}
}
if (ret > 0)
inode_dir_notify(file->f_dentry->d_parent->d_inode,
DN_MODIFY);
fput(file);
}
return ret;
}
sys_write=>sock_write
static ssize_t sock_write(struct file *file, const char *ubuf,
size_t size, loff_t *ppos)
{
struct socket *sock;
struct msghdr msg;
struct iovec iov;
if (ppos != &file->f_pos)
return -ESPIPE;
if(size==0) /* Match SYS5 behaviour */
return 0;
sock = socki_lookup(file->f_dentry->d_inode); //recover the socket object from the inode, as shown earlier
msg.msg_name=NULL;
msg.msg_namelen=0;
msg.msg_iov=&iov;
msg.msg_iovlen=1;
msg.msg_control=NULL;
msg.msg_controllen=0;
msg.msg_flags=!(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
if (sock->type == SOCK_SEQPACKET)
msg.msg_flags |= MSG_EOR;
iov.iov_base=(void *)ubuf;
iov.iov_len=size;
return sock_sendmsg(sock, &msg, size);//send the data
}
sys_write=>sock_write=>sock_sendmsg
int sock_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
int err;
struct scm_cookie scm;
err = scm_send(sock, msg, &scm);
if (err >= 0) {
err = sock->ops->sendmsg(sock, msg, size, &scm);//by now this is familiar: for the inet family this calls inet_sendmsg from inet_stream_ops
scm_destroy(&scm);
}
return err;
}
At this point we have seen how the socket file system is mounted, and why calling send and write from user space has the same effect: sock_write ultimately reaches the same protocol-specific send routine.
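As a final check from user space, here is a minimal sketch (send_twice and connected_fd are hypothetical names, not from the post): on a connected TCP socket, write and send with flags 0 take the two paths traced above and end up in the same place:
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

ssize_t send_twice(int connected_fd)
{
const char *msg = "hello";
ssize_t a = write(connected_fd, msg, strlen(msg));//sys_write -> sock_write -> sock_sendmsg
ssize_t b = send(connected_fd, msg, strlen(msg), 0);//sys_send -> sock_sendmsg
return (a < 0 || b < 0) ? -1 : a + b;
}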