In the earlier post “linux 内核协议栈原理分析之 tcp 服务器端的 send 过程” (analysis of the TCP server-side send path) we mentioned that sys_socket => sock_create calls sock_alloc, but never explained what it actually does.
Here we walk through how it works; most of the material concerns the file system layer.
Along the way we will also see why, when sending data, write is equivalent to send.
struct socket *sock_alloc(void)
{
struct inode * inode;
struct socket * sock;
inode = get_empty_inode();//get a free inode from the inode cache
if (!inode)
return NULL;
inode->i_sb = sock_mnt->mnt_sb;//point at the sockfs super block
sock = socki_lookup(inode);
inode->i_mode = S_IFSOCK|S_IRWXUGO;
inode->i_sock = 1;
inode->i_uid = current->fsuid;
inode->i_gid = current->fsgid;
sock->inode = inode;//the socket keeps a reference to its inode
init_waitqueue_head(&sock->wait);//initialize the wait queue
sock->fasync_list = NULL;
sock->state = SS_UNCONNECTED;//mark it as not yet connected
sock->flags = 0;
sock->ops = NULL;
sock->sk = NULL;
sock->file = NULL;
sockets_in_use[smp_processor_id()].counter++;
return sock;
}
So where does sock_mnt come from? It is a vfsmount (mount) object, initialized in sock_init.
init=>do_basic_setup=>sock_init
void __init sock_init(void)
{
......
register_filesystem(&sock_fs_type);
sock_mnt = kern_mount(&sock_fs_type);
......
}
#define DECLARE_FSTYPE(var,type,read,flags) \
struct file_system_type var = { \
name: type, \
read_super: read, \
fs_flags: flags, \
owner: THIS_MODULE, \
}
static DECLARE_FSTYPE(sock_fs_type, "sockfs", sockfs_read_super,
FS_NOMOUNT|FS_SINGLE);
//after macro expansion this becomes:
static struct file_system_type sock_fs_type = {
name: "sockfs",
read_super: sockfs_read_super,
fs_flags: FS_NOMOUNT|FS_SINGLE,
owner: THIS_MODULE,
};
Registering the file system:
Before the socket file system can be mounted, its type must first be registered with the kernel, i.e. sock_fs_type must be inserted into the global file_systems list;
init=>do_basic_setup=>sock_init=>register_filesystem
int register_filesystem(struct file_system_type * fs)
{
int res = 0;
struct file_system_type ** p;
if (!fs)
return -EINVAL;
if (fs->next)
return -EBUSY;
write_lock(&file_systems_lock);
p = find_filesystem(fs->name);//file system types are keyed by name; fail if one with this name already exists
if (*p)
res = -EBUSY;
else
*p = fs;
write_unlock(&file_systems_lock);
return res;
}
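find_filesystem is not shown above; in this kernel era it is roughly the following walk over the file_systems list (a sketch, not copied from the post), returning the address of the matching slot, or of the terminating NULL slot where a new type can be linked in:
static struct file_system_type **find_filesystem(const char *name)
{
struct file_system_type **p;
for (p = &file_systems; *p; p = &(*p)->next)//walk the singly linked list of registered types
if (strcmp((*p)->name, name) == 0)//stop at the first type with the same name
break;
return p;//*p is non-NULL only if the name is already registered
}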
Mounting the file system:
Now the mount itself. This is done by the kernel internally; user space cannot trigger it:
init=>do_basic_setup=>sock_init=>kern_mount
struct vfsmount *kern_mount(struct file_system_type *type)
{
kdev_t dev = get_unnamed_dev();
struct super_block *sb;//super block pointer
struct vfsmount *mnt;//mount object pointer
if (!dev)
return ERR_PTR(-EMFILE);
sb = read_super(dev, NULL, type, 0, NULL, 0);//read in the super block
if (!sb) {
put_unnamed_dev(dev);
return ERR_PTR(-EINVAL);
}
mnt = add_vfsmnt(NULL, sb->s_root, NULL);//create the mount
if (!mnt) {
kill_super(sb, 0);
return ERR_PTR(-ENOMEM);
}
type->kern_mnt = mnt;
return mnt;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super
static struct super_block * read_super(kdev_t dev, struct block_device *bdev,
struct file_system_type *type, int flags,
void *data, int silent)
{
struct super_block * s;
s = get_empty_super();//allocate a super block object
if (!s)
goto out;
s->s_dev = dev;
s->s_bdev = bdev;
s->s_flags = flags;
s->s_dirt = 0;
sema_init(&s->s_vfs_rename_sem,1);
sema_init(&s->s_nfsd_free_path_sem,1);
s->s_type = type;
sema_init(&s->s_dquot.dqio_sem, 1);
sema_init(&s->s_dquot.dqoff_sem, 1);
s->s_dquot.flags = 0;
lock_super(s);
if (!type->read_super(s, data, silent))//calls sock_fs_type's sockfs_read_super to fill in the super block
goto out_fail;
unlock_super(s);
/* tell bdcache that we are going to keep this one */
if (bdev)
atomic_inc(&bdev->bd_count);
......
out:
return s;//success: return the filled-in super block
out_fail:
......
return NULL;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super
static struct super_operations sockfs_ops = {
statfs: sockfs_statfs,
};
static struct super_block * sockfs_read_super(struct super_block *sb, void *data, int silent)
{
struct inode *root = new_inode(sb);//allocate the root inode; it points back at sb
if (!root)
return NULL;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;//mark it as a directory
root->i_uid = root->i_gid = 0;
root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
sb->s_blocksize = 1024;
sb->s_blocksize_bits = 10;
sb->s_magic = SOCKFS_MAGIC;//super block magic number
sb->s_op = &sockfs_ops;//install the super block operations table
sb->s_root = d_alloc(NULL, &(const struct qstr) { "socket:", 7, 0 });//create the root dentry
if (!sb->s_root) {
iput(root);
return NULL;
}
sb->s_root->d_sb = sb;//every dentry keeps a pointer to its super block for later accesses
sb->s_root->d_parent = sb->s_root;//this is the root of the socket file system; it has no parent, so it points to itself
d_instantiate(sb->s_root, root);//bind the dentry to the inode
return sb;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super=>d_alloc
struct dentry * d_alloc(struct dentry * parent, const struct qstr *name)
{
char * str;
struct dentry *dentry;
dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); //allocate a dentry from the dentry slab cache
if (!dentry)
return NULL;
if (name->len > DNAME_INLINE_LEN-1) {//name too long for the inline buffer, allocate it separately
str = kmalloc(NAME_ALLOC_LEN(name->len), GFP_KERNEL);
if (!str) {
kmem_cache_free(dentry_cache, dentry);
return NULL;
}
} else
str = dentry->d_iname;
memcpy(str, name->name, name->len);//copy the name into the chosen buffer (d_iname or the kmalloc'd one)
str[name->len] = 0;
atomic_set(&dentry->d_count, 1);//reference count starts at 1
dentry->d_flags = 0;
dentry->d_inode = NULL;
dentry->d_parent = NULL;
dentry->d_sb = NULL;
dentry->d_name.name = str;
dentry->d_name.len = name->len;
dentry->d_name.hash = name->hash;
dentry->d_op = NULL;
dentry->d_fsdata = NULL;
INIT_LIST_HEAD(&dentry->d_vfsmnt);//mount list; a directory may be mounted on more than once
INIT_LIST_HEAD(&dentry->d_hash);//every dentry eventually gets linked into the dentry_hashtable
INIT_LIST_HEAD(&dentry->d_lru);//a dentry may be placed on the least-recently-used list
INIT_LIST_HEAD(&dentry->d_subdirs);//a directory may have several children; each child hangs off the parent's d_subdirs via its d_child
INIT_LIST_HEAD(&dentry->d_alias);//one inode may have several dentries (aliases); each hangs off the inode's i_dentry via d_alias
if (parent) {
dentry->d_parent = dget(parent);
dentry->d_sb = parent->d_sb;
spin_lock(&dcache_lock);
list_add(&dentry->d_child, &parent->d_subdirs);
spin_unlock(&dcache_lock);
} else
INIT_LIST_HEAD(&dentry->d_child);
dentry_stat.nr_dentry++;
return dentry;
}
init=>do_basic_setup=>sock_init=>kern_mount=>read_super=>sockfs_read_super=>d_instantiate
void d_instantiate(struct dentry *entry, struct inode * inode)
{
spin_lock(&dcache_lock);
if (inode)
list_add(&entry->d_alias, &inode->i_dentry);//link the entry into the inode's alias list: a file has only one inode but may be reached through several dentries
entry->d_inode = inode;//point the dentry at the inode so that later lookups through this dentry can reach it
spin_unlock(&dcache_lock);
}
With the super block read in, let's look at the mount step itself:
static struct vfsmount *add_vfsmnt(struct nameidata *nd,
struct dentry *root,
const char *dev_name)
{
struct vfsmount *mnt;
struct super_block *sb = root->d_inode->i_sb;//get the super block from the root dentry's inode
char *name;
mnt = kmalloc(sizeof(struct vfsmount), GFP_KERNEL);//allocate the vfsmount object
if (!mnt)
goto out;
memset(mnt, 0, sizeof(struct vfsmount));
if (nd || dev_name)
mnt->mnt_flags = MNT_VISIBLE;
/* It may be NULL, but who cares? */
if (dev_name) {
name = kmalloc(strlen(dev_name)+1, GFP_KERNEL);
if (name) {
strcpy(name, dev_name);
mnt->mnt_devname = name;
}
}
mnt->mnt_owner = current->uid;
atomic_set(&mnt->mnt_count,1);//reference count starts at 1
mnt->mnt_sb = sb;//save the super block pointer; this is the sock_mnt->mnt_sb we saw in sock_alloc
spin_lock(&dcache_lock);
if (nd && !IS_ROOT(nd->dentry) && d_unhashed(nd->dentry))
goto fail;
mnt->mnt_root = dget(root);//the mount's root dentry
mnt->mnt_mountpoint = nd ? dget(nd->dentry) : dget(root);//no mount point was given (nd is NULL), so the mount point is the root dentry itself
mnt->mnt_parent = nd ? mntget(nd->mnt) : mnt;//no parent mount either, so this mount is its own parent
if (nd) {
list_add(&mnt->mnt_child, &nd->mnt->mnt_mounts);
list_add(&mnt->mnt_clash, &nd->dentry->d_vfsmnt);
} else {
INIT_LIST_HEAD(&mnt->mnt_child);//mnt_child would link this mount into its parent's mnt_mounts list; there is no parent here, so it stays an empty list head
INIT_LIST_HEAD(&mnt->mnt_clash);
}
INIT_LIST_HEAD(&mnt->mnt_mounts);
list_add(&mnt->mnt_instances, &sb->s_mounts);//link this mount into the super block's list of mounts
list_add(&mnt->mnt_list, vfsmntlist.prev);
spin_unlock(&dcache_lock);
out:
return mnt;
fail:
spin_unlock(&dcache_lock);
if (mnt->mnt_devname)
kfree(mnt->mnt_devname);
kfree(mnt);
return NULL;
}
That completes the mount: the kernel created and read in a super block, which in practice means creating a root dentry named "socket:" together with its inode and linking the two, and then creating a vfsmount object tied to the super block, the root inode and the root dentry.
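To summarize those relationships in code form, here is a small sanity-check sketch (check_sockfs_mount is not kernel code, just an illustration); each assertion follows from the functions above:
static void check_sockfs_mount(struct vfsmount *mnt)
{
struct super_block *sb = mnt->mnt_sb;//set in add_vfsmnt
BUG_ON(sb->s_magic != SOCKFS_MAGIC);//set in sockfs_read_super
BUG_ON(mnt->mnt_root != sb->s_root);//kern_mount passed sb->s_root to add_vfsmnt
BUG_ON(sb->s_root->d_parent != sb->s_root);//the "socket:" dentry is its own parent
BUG_ON(sb->s_root->d_inode->i_sb != sb);//new_inode(sb) pointed the root inode at sb
BUG_ON(mnt->mnt_parent != mnt);//no parent mount was given, so it points to itself
}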
Now back to socki_lookup from the code at the top: it returns the socket object, and sock_alloc stores the freshly allocated inode's address in that socket's inode field.
struct inode {
......
union {
struct minix_inode_info minix_i;
struct ext2_inode_info ext2_i;
......
struct socket socket_i;
struct usbdev_inode_info usbdev_i;
void *generic_ip;
} u;
};
extern __inline__ struct socket *socki_lookup(struct inode *inode)
{
return &inode->u.socket_i;//the struct socket was allocated together with the inode (as a union member), so we can just take its address
}
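In other words, the inode and the socket live in one allocation, so converting between them is just field access. A tiny illustration (illustrative only, not a real kernel code path):
struct socket *sock = sock_alloc();//allocates the inode and, inside it, the embedded socket
struct inode *inode = sock->inode;//sock_alloc stored the back pointer
BUG_ON(socki_lookup(inode) != sock);//&inode->u.socket_i is the very same object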
Creating a socket:
In sys_socket there is one more step closely tied to the socket file system: sock_map_fd. Let's look at its implementation:
static int sock_map_fd(struct socket *sock)
{
int fd;
struct qstr this;
char name[32];
/*
* Find a file descriptor suitable for return to the user.
*/
fd = get_unused_fd();//grab a free file descriptor
if (fd >= 0) {
struct file *file = get_empty_filp();//grab a free file object
if (!file) {
put_unused_fd(fd);
fd = -ENFILE;
goto out;
}
sprintf(name, "[%lu]", sock->inode->i_ino);//the file name is built from the inode number
this.name = name;
this.len = strlen(name);
this.hash = sock->inode->i_ino;
file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this);//create a dentry with that name under the sockfs root
if (!file->f_dentry) {
put_filp(file);
put_unused_fd(fd);
fd = -ENOMEM;
goto out;
}
file->f_dentry->d_op = &sockfs_dentry_operations;//install the dentry operations table
d_add(file->f_dentry, sock->inode);//bind the dentry to the inode and hash the dentry into dentry_hashtable
file->f_vfsmnt = mntget(sock_mnt);//the file points at the sockfs mount object
sock->file = file;
file->f_op = sock->inode->i_fop = &socket_file_ops;//install the file operations table; this is why a user-space write() can also send data: it eventually calls sock->ops->sendmsg, which for the inet family is inet_sendmsg
file->f_mode = 3;
file->f_flags = O_RDWR;
file->f_pos = 0;
fd_install(fd, file);//bind the fd to the file
}
out:
return fd;
}
sys_socket=>sock_map_fd=>d_add
static __inline__ void d_add(struct dentry * entry, struct inode * inode)
{
d_instantiate(entry, inode);//bind the dentry to the inode
d_rehash(entry);//compute the dentry's hash and link it into dentry_hashtable
}
sys_socket=>sock_map_fd=>d_add=>d_rehash
void d_rehash(struct dentry * entry)
{
struct list_head *list = d_hash(entry->d_parent, entry->d_name.hash);//compute the hash and find the matching bucket
spin_lock(&dcache_lock);
list_add(&entry->d_hash, list);//link the dentry into that bucket's chain
spin_unlock(&dcache_lock);
}
sys_socket=>sock_map_fd=>d_add=>d_rehash=>d_hash
A neat trick:
The hash mixes in the address of the parent dentry, which keeps collisions down. Consider ./zhangsan/project/src and ./lisi/project/src: if only the current name (say "project" or "src") were used as the key, the two entries would collide immediately; mixing in the parent's name would still collide easily, and hashing the full path would be too slow. By using the address of the parent dentry as part of the hash, the chance that the two "src" entries collide drops dramatically (see the short illustration after the function below).
static inline struct list_head * d_hash(struct dentry * parent, unsigned long hash)
{
hash += (unsigned long) parent / L1_CACHE_BYTES;
hash = hash ^ (hash >> D_HASHBITS) ^ (hash >> D_HASHBITS*2);
return dentry_hashtable + (hash & D_HASHMASK);
}
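A quick illustration of that point (project_of_zhangsan and project_of_lisi are hypothetical dentries for the two "project" directories, src_name_hash is the name hash of "src"):
struct list_head *bucket1 = d_hash(project_of_zhangsan, src_name_hash);// ./zhangsan/project/src
struct list_head *bucket2 = d_hash(project_of_lisi, src_name_hash);// ./lisi/project/src
/* same name hash, different parent addresses => almost always different buckets */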
sockfs_dentry_operations: most entries are left NULL, so the default VFS handling applies
static struct dentry_operations sockfs_dentry_operations = {
d_delete: sockfs_delete_dentry,
};
socket_file_ops:
static struct file_operations socket_file_ops = {
llseek: sock_lseek,
read: sock_read,
write: sock_write,
poll: sock_poll,
ioctl: sock_ioctl,
mmap: sock_mmap,
open: sock_no_open, /* special open code to disallow open via /proc */
release: sock_close,
fasync: sock_fasync,
readv: sock_readv,
writev: sock_writev
};
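Before turning to write, here is a hedged sketch of the chain that sock_map_fd has just built and that the write path will walk back down to reach the socket (a fragment, not kernel code):
int fd = sock_map_fd(sock);//fd_install(): current->files->fd[fd] = file
struct file *file = fget(fd);//sys_write starts from the fd
struct inode *inode = file->f_dentry->d_inode;//the dentry that d_add bound to sock->inode
BUG_ON(socki_lookup(inode) != sock);//sock_write recovers the original socket this way
fput(file);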
write vs. send:
Let's take write as an example and check that it has the same effect as sending data with send from user space.
asmlinkage ssize_t sys_write(unsigned int fd, const char * buf, size_t count)
{
ssize_t ret;
struct file * file;
ret = -EBADF;
file = fget(fd);
if (file) {
if (file->f_mode & FMODE_WRITE) {
struct inode *inode = file->f_dentry->d_inode;//find the corresponding inode
ret = locks_verify_area(FLOCK_VERIFY_WRITE, inode, file,
file->f_pos, count);
if (!ret) {
ssize_t (*write)(struct file *, const char *, size_t, loff_t *);
ret = -EINVAL;
if (file->f_op && (write = file->f_op->write) != NULL)
ret = write(file, buf, count, &file->f_pos);//for a socket file this is sock_write
}
}
if (ret > 0)
inode_dir_notify(file->f_dentry->d_parent->d_inode,
DN_MODIFY);
fput(file);
}
return ret;
}
sys_write=>sock_write
static ssize_t sock_write(struct file *file, const char *ubuf,
size_t size, loff_t *ppos)
{
struct socket *sock;
struct msghdr msg;
struct iovec iov;
if (ppos != &file->f_pos)
return -ESPIPE;
if(size==0) /* Match SYS5 behaviour */
return 0;
sock = socki_lookup(file->f_dentry->d_inode); //recover the socket object from the inode, as shown earlier
msg.msg_name=NULL;
msg.msg_namelen=0;
msg.msg_iov=&iov;
msg.msg_iovlen=1;
msg.msg_control=NULL;
msg.msg_controllen=0;
msg.msg_flags=!(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT;
if (sock->type == SOCK_SEQPACKET)
msg.msg_flags |= MSG_EOR;
iov.iov_base=(void *)ubuf;
iov.iov_len=size;
return sock_sendmsg(sock, &msg, size);//send the data
}
sys_write=>sock_write=>sock_sendmsg
int sock_sendmsg(struct socket *sock, struct msghdr *msg, int size)
{
int err;
struct scm_cookie scm;
err = scm_send(sock, msg, &scm);
if (err >= 0) {
err = sock->ops->sendmsg(sock, msg, size, &scm);//by now this is familiar: for the inet family this calls inet_sendmsg from inet_stream_ops
scm_destroy(&scm);
}
return err;
}
At this point we have seen how the socket file system is mounted, and why calling send and write from user space has the same effect: sock_write ultimately reaches the same protocol-specific send routine.
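As a final check from user space, here is a minimal sketch (send_twice and connected_fd are hypothetical names, not from the post): on a connected TCP socket, write and send with flags 0 take the two paths traced above and end up in the same place:
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

ssize_t send_twice(int connected_fd)
{
const char *msg = "hello";
ssize_t a = write(connected_fd, msg, strlen(msg));//sys_write -> sock_write -> sock_sendmsg
ssize_t b = send(connected_fd, msg, strlen(msg), 0);//sys_send -> sock_sendmsg
return (a < 0 || b < 0) ? -1 : a + b;
}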