Reading the Kernel Source (Linux 4.9.9): The Implementation of Shared Memory

1 Using Shared Memory

1.1 Shared Memory Header Files

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>   /* declares shmget(), shmat(), shmdt(), shmctl() */

1.2 Shared Memory API

int shmget(key_t key, size_t size, int shmflg);
@key    : a key that effectively names the shared memory segment
@size   : the amount of memory to share, in bytes
@shmflg : permission flags that work like the mode argument of open(); OR in IPC_CREAT to create the segment named by key if it does not already exist
@return : on success, a shared memory identifier (a non-negative integer) associated with key, used by the other shared memory functions; on failure, -1



void* shmat(int shmid, const void* shm_addr, int shmflg);
@shmid    : the shared memory identifier returned by shmget
@shm_addr : the address at which to attach the segment in the current process; usually NULL, which lets the system choose the address
@shmflg   : a set of flag bits, usually 0
@return   : on success, the address at which the segment is attached; on failure, (void *)-1


int shmdt(const void* shm_addr);
@shm_addr : the address returned by shmat; shmdt returns 0 on success and -1 on failure


int shmctl(int shmid, int cmd, struct shmid_ds *buf);
@shmid : the shared memory identifier returned by shmget
@cmd   : the operation to perform; it can take the following three values:
         @IPC_STAT : copy the segment's current association values into the shmid_ds structure, i.e. overwrite the contents of shmid_ds with the segment's current values
         @IPC_SET  : if the process has sufficient permission, set the segment's association values to those given in the shmid_ds structure
         @IPC_RMID : delete the shared memory segment
@buf   : a pointer to a structure describing the segment's mode and access permissions

The shmid_ds structure contains at least the following members:

struct shmid_ds
{
    struct ipc_perm shm_perm;   /* shm_perm.uid, shm_perm.gid and shm_perm.mode
                                   hold the owning uid, gid and access mode */
    ...
};
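
To make the API above concrete, here is a minimal user-space sketch that creates a segment, attaches it, writes a string into it, then detaches and removes it. The key 0x1234 and the 4096-byte size are arbitrary values chosen for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	/* create (or open) a 4096-byte segment named by the key 0x1234 */
	int shmid = shmget((key_t)0x1234, 4096, IPC_CREAT | 0666);
	if (shmid == -1) { perror("shmget"); exit(EXIT_FAILURE); }

	/* attach it at an address chosen by the kernel */
	void *addr = shmat(shmid, NULL, 0);
	if (addr == (void *)-1) { perror("shmat"); exit(EXIT_FAILURE); }

	strcpy(addr, "hello from shared memory");
	printf("segment %d attached at %p: %s\n", shmid, addr, (char *)addr);

	/* detach, then ask the kernel to remove the segment */
	if (shmdt(addr) == -1) { perror("shmdt"); exit(EXIT_FAILURE); }
	if (shmctl(shmid, IPC_RMID, NULL) == -1) { perror("shmctl"); exit(EXIT_FAILURE); }
	return 0;
}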

2 Shared Memory Subsystem Initialization

2.1 Creating the proc File

void __init shm_init(void)
{
	//create the proc file node
	ipc_init_proc_interface("sysvipc/shm",
#if BITS_PER_LONG <= 32
				"       key      shmid perms       size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime        rss       swap\n",
#else
				"       key      shmid perms                  size  cpid  lpid nattch   uid   gid  cuid  cgid      atime      dtime      ctime                   rss                  swap\n",
#endif
				IPC_SHM_IDS, sysvipc_shm_proc_show);
}
This is straightforward: by calling ipc_init_proc_interface we register the sysvipc_shm_proc_show function, so that cat /proc/sysvipc/shm displays usage information about the shared memory segments in the system. The details are not the focus of this analysis.
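
For completeness, a tiny user-space sketch that simply dumps this proc file (the path /proc/sysvipc/shm comes from the registration above):

#include <stdio.h>

int main(void)
{
	/* print the table exported by sysvipc_shm_proc_show */
	FILE *fp = fopen("/proc/sysvipc/shm", "r");
	if (!fp) { perror("fopen"); return 1; }

	char line[512];
	while (fgets(line, sizeof(line), fp))
		fputs(line, stdout);

	fclose(fp);
	return 0;
}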

2.2 Resource Limits and Permission Initialization

static int __init ipc_ns_init(void)
{
	shm_init_ns(&init_ipc_ns);
	return 0;
}
void shm_init_ns(struct ipc_namespace *ns)
{
	ns->shm_ctlmax = SHMMAX;   //upper limit on the size of a single segment
	ns->shm_ctlall = SHMALL;   //limit on the total shared memory usage (in pages)
	ns->shm_ctlmni = SHMMNI;   //limit on the number of segments, 4096 by default
	ns->shm_rmid_forced = 0;
	ns->shm_tot = 0;
	ipc_init_ids(&shm_ids(ns)); 
}
Here we mainly care about the three parameters that limit shared memory resource usage.
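
These three limits are exposed to user space as the sysctls kernel.shmmax, kernel.shmall and kernel.shmmni. A minimal sketch that reads them from /proc/sys/kernel (tuning them requires root and a write to the same files):

#include <stdio.h>

/* read one numeric sysctl value exported under /proc/sys/kernel */
static void print_limit(const char *path)
{
	unsigned long long val = 0;
	FILE *fp = fopen(path, "r");
	if (!fp) { perror(path); return; }
	if (fscanf(fp, "%llu", &val) == 1)
		printf("%-28s %llu\n", path, val);
	fclose(fp);
}

int main(void)
{
	print_limit("/proc/sys/kernel/shmmax");   /* ns->shm_ctlmax */
	print_limit("/proc/sys/kernel/shmall");   /* ns->shm_ctlall, in pages */
	print_limit("/proc/sys/kernel/shmmni");   /* ns->shm_ctlmni */
	return 0;
}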

2.3 tmpfs Filesystem Initialization

Shared memory is built on top of file sharing, except that the file is a special file in the in-memory tmpfs filesystem and cannot be seen or accessed directly the way an ordinary file can. Before analyzing how tmpfs is registered, let's review a few filesystem concepts.

Superblock object (super_block): every filesystem must implement a superblock object. It stores information about the filesystem and usually corresponds to the filesystem superblock or filesystem control block stored in a specific sector on disk (the tmpfs superblock is not stored on disk; at mount time it is not read from disk but filled in directly by shmem_fill_super, analyzed below).

Inode object (inode): contains all the information the kernel needs when operating on a file or directory; an inode represents one file in the filesystem.

Dentry object (dentry): each dentry represents one particular component of a path; every component of a path, ordinary files included, is a dentry object.

File object (file): represents a file that a process has opened.

Filesystem type (file_system_type): describes a particular type of filesystem.

Mount point (vfsmount): describes one mounted instance of a filesystem.

Having reviewed these filesystem concepts, let's briefly analyze the initialization of the tmpfs filesystem, in mm/shmem.c:

int __init shmem_init(void)
{
	int error;

	/* If rootfs called this, don't re-init */
	if (shmem_inode_cachep)
		return 0;

	//set up the slab cache for inodes
	error = shmem_init_inodecache();
	if (error)
		goto out3;
	
	//register the tmpfs filesystem
	error = register_filesystem(&shmem_fs_type);
	if (error) {
		pr_err("Could not register tmpfs\n");
		goto out2;
	}

    // vfs_kern_mount --> mount_fs -->  (shmem_fs_type --> mount)

    //shm_mnt is the vfsmount * mount-point object

	// the super_block is allocated by alloc_super

	//mount the tmpfs filesystem; this builds and fills in the superblock object
	shm_mnt = kern_mount(&shmem_fs_type);
	if (IS_ERR(shm_mnt)) {
		error = PTR_ERR(shm_mnt);
		pr_err("Could not kern_mount tmpfs\n");
		goto out1;
	}

#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
	if (has_transparent_hugepage() && shmem_huge < SHMEM_HUGE_DENY)
		SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
	else
		shmem_huge = 0; /* just in case it was patched */
#endif
	return 0;

out1:
	unregister_filesystem(&shmem_fs_type);
out2:
	shmem_destroy_inodecache();
out3:
	shm_mnt = ERR_PTR(error);
	return error;
}
The code above does three main things.
First, it allocates a slab cache for inodes. From the basic filesystem concepts we know that an inode describes an actual file; concretely, creating a new shared memory segment means creating a new inode object in tmpfs, and that inode object is allocated from the slab cache named shmem_inode_cache.
static int shmem_init_inodecache(void)
{
	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
				sizeof(struct shmem_inode_info),
				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
	return 0;
}
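Each object in this cache is a struct shmem_inode_info with the VFS struct inode (vfs_inode) embedded inside it: the VFS layer is handed a pointer to the embedded inode, and the shmem code recovers the enclosing structure with container_of. Below is a minimal user-space sketch of that embed-and-recover pattern; the struct and function names mirror the kernel ones, but the code itself is only illustrative.

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* stand-in for the VFS inode */
struct inode { unsigned long i_ino; };

/* stand-in for shmem_inode_info: shmem-private fields plus the embedded inode */
struct shmem_inode_info {
	unsigned long flags;
	struct inode  vfs_inode;   /* what the VFS layer sees */
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* mimics shmem_alloc_inode(): allocate the outer object, hand out the inner inode */
static struct inode *alloc_inode(void)
{
	struct shmem_inode_info *info = calloc(1, sizeof(*info));
	return info ? &info->vfs_inode : NULL;
}

/* mimics SHMEM_I(): recover the outer object from the inode pointer */
static struct shmem_inode_info *SHMEM_I(struct inode *inode)
{
	return container_of(inode, struct shmem_inode_info, vfs_inode);
}

int main(void)
{
	struct inode *inode = alloc_inode();
	if (!inode)
		return 1;
	inode->i_ino = 42;
	printf("inode %lu is embedded in a shmem_inode_info at %p\n",
	       inode->i_ino, (void *)SHMEM_I(inode));
	free(SHMEM_I(inode));
	return 0;
}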
Second, it registers the filesystem type, which essentially adds a tmpfs struct file_system_type object to a global list in the kernel. Two such objects are defined in shmem.c; since CONFIG_SHMEM is defined, we only look at the CONFIG_SHMEM one:

static struct file_system_type shmem_fs_type = {
	.owner		= THIS_MODULE,		//module that owns the filesystem
	.name		= "tmpfs",		//filesystem name
	.mount		= shmem_mount,		//called when the filesystem is mounted
	.kill_sb	= kill_litter_super,	//tears down the superblock
	.fs_flags	= FS_USERNS_MOUNT,
};
The member function to note here is shmem_mount, which will be called when the filesystem is mounted.

Third, it mounts the tmpfs filesystem. Following the call chain kern_mount --> kern_mount_data --> vfs_kern_mount --> mount_fs --> shmem_mount, the mount function of tmpfs, shmem_mount, is eventually called (for other filesystems the mount function passed in at registration time is called in the same way):
static struct dentry *shmem_mount(struct file_system_type *fs_type,
	int flags, const char *dev_name, void *data)
{
    //sget/alloc_super allocates a super_block

	//the superblock operation set is shmem_ops

	// shmem_fill_super fills in sb->s_root and installs the superblock operation set
	return mount_nodev(fs_type, flags, data, shmem_fill_super);
}
It passes in the superblock fill function shmem_fill_super:

struct dentry *mount_nodev(struct file_system_type *fs_type,
	int flags, void *data,
	int (*fill_super)(struct super_block *, void *, int))
{
	int error;
	// allocated via alloc_super
	struct super_block *s = sget(fs_type, NULL, set_anon_super, flags, NULL);

	if (IS_ERR(s))
		return ERR_CAST(s);

    // shmem_fill_super
	error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
	if (error) {
		deactivate_locked_super(s);
		return ERR_PTR(error);
	}
	s->s_flags |= MS_ACTIVE;
	//take a reference on the root dentry
	return dget(s->s_root);
}
In mount_nodev a superblock object is built and shmem_fill_super is then called to fill it in:

//this fills in sb->s_root
int shmem_fill_super(struct super_block *sb, void *data, int silent)
{
	struct inode *inode;
	struct shmem_sb_info *sbinfo;
	int err = -ENOMEM;

	/* Round up to L1_CACHE_BYTES to resist false sharing */
	sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
				L1_CACHE_BYTES), GFP_KERNEL);
	if (!sbinfo)
		return -ENOMEM;

	sbinfo->mode = S_IRWXUGO | S_ISVTX;
	sbinfo->uid = current_fsuid();
	sbinfo->gid = current_fsgid();
	sb->s_fs_info = sbinfo;

#ifdef CONFIG_TMPFS
	/*
	 * Per default we only allow half of the physical ram per
	 * tmpfs instance, limiting inodes to one per page of lowmem;
	 * but the internal instance is left unlimited.
	 */
	if (!(sb->s_flags & MS_KERNMOUNT)) {
		sbinfo->max_blocks = shmem_default_max_blocks();
		sbinfo->max_inodes = shmem_default_max_inodes();
		if (shmem_parse_options(data, sbinfo, false)) {
			err = -EINVAL;
			goto failed;
		}
	} else {
		sb->s_flags |= MS_NOUSER;
	}
	sb->s_export_op = &shmem_export_ops;
	sb->s_flags |= MS_NOSEC;
#else
	sb->s_flags |= MS_NOUSER;
#endif

	spin_lock_init(&sbinfo->stat_lock);
	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
		goto failed;
	sbinfo->free_inodes = sbinfo->max_inodes;
	spin_lock_init(&sbinfo->shrinklist_lock);
	INIT_LIST_HEAD(&sbinfo->shrinklist);

	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_blocksize = PAGE_SIZE;
	sb->s_blocksize_bits = PAGE_SHIFT;
	sb->s_magic = TMPFS_MAGIC;
	
	//superblock operation set
	sb->s_op = &shmem_ops;
	sb->s_time_gran = 1;
#ifdef CONFIG_TMPFS_XATTR
	sb->s_xattr = shmem_xattr_handlers;
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
	sb->s_flags |= MS_POSIXACL;
#endif

	inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE);
	if (!inode)
		goto failed;
	inode->i_uid = sbinfo->uid;
	inode->i_gid = sbinfo->gid;
	//
	sb->s_root = d_make_root(inode);
	if (!sb->s_root)
		goto failed;
	return 0;

failed:
	shmem_put_super(sb);
	return err;
}
Here we mainly pay attention to the superblock operation set shmem_ops, which contains the member functions called to create a new inode when a shared memory segment is created and to tear it down when the segment is destroyed:
static const struct super_operations shmem_ops = {
	.alloc_inode	= shmem_alloc_inode,	//allocate an inode object
	.destroy_inode	= shmem_destroy_inode,	//destroy an inode object
#ifdef CONFIG_TMPFS
	.statfs		= shmem_statfs,
	.remount_fs	= shmem_remount_fs,
	.show_options	= shmem_show_options,
#endif
	.evict_inode	= shmem_evict_inode,
	.drop_inode	= generic_delete_inode,
	.put_super	= shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
	.nr_cached_objects	= shmem_unused_huge_count,
	.free_cached_objects	= shmem_unused_huge_scan,
#endif
};

3 Kernel Implementation of the Shared Memory API

3.1 Analysis of the shmget Kernel Implementation

Entry via the system call:
//returns the shared memory IPC id
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
{
	struct ipc_namespace *ns;

	//instantiate the ipc operation set used below
	static const struct ipc_ops shm_ops = {
		.getnew = newseg,			//called later to create a new shared memory segment
		.associate = shm_security,
		.more_checks = shm_more_checks,
	};
	struct ipc_params shm_params;

	ns = current->nsproxy->ipc_ns;

	shm_params.key = key;
	shm_params.flg = shmflg;
	shm_params.u.size = size;

	return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
}
Here an ipc operation set is initialized and then passed as an argument to ipcget:
/**
 * ipcget - Common sys_*get() code
 * @ns: namespace
 * @ids: ipc identifier set
 * @ops: operations to be called on ipc object creation, permission checks
 *       and further checks
 * @params: the parameters needed by the previous operations.
 *
 * Common routine called by sys_msgget(), sys_semget() and sys_shmget().
 */
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
			const struct ipc_ops *ops, struct ipc_params *params)
{
    //key passed in as 0 (IPC_PRIVATE)
	if (params->key == IPC_PRIVATE)
		return ipcget_new(ns, ids, ops, params);
	//non-zero key
	else
		return ipcget_public(ns, ids, ops, params);
}
A key of 0 (IPC_PRIVATE) means private sharing, i.e. within a process or between parent and child; a non-zero key means public sharing, i.e. across unrelated processes. We take the cross-process case as our example, ipcget_public:
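
In practice, unrelated processes usually derive the same non-zero key from a path with ftok(3), while IPC_PRIVATE always creates a fresh segment. A small sketch of both cases (the path /tmp and the project id 'A' are arbitrary):

#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	/* private segment: IPC_PRIVATE takes the ipcget_new() path */
	int priv_id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);

	/* public segment: a key derived from a path takes the ipcget_public() path */
	key_t key = ftok("/tmp", 'A');
	int pub_id = shmget(key, 4096, IPC_CREAT | 0666);

	printf("private shmid = %d, public shmid = %d (key = 0x%lx)\n",
	       priv_id, pub_id, (unsigned long)key);

	/* clean up so the example leaves nothing behind */
	if (priv_id != -1) shmctl(priv_id, IPC_RMID, NULL);
	if (pub_id  != -1) shmctl(pub_id,  IPC_RMID, NULL);
	return 0;
}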
/**
 * ipcget_public - get an ipc object or create a new one
 * @ns: ipc namespace
 * @ids: ipc identifier set
 * @ops: the actual creation routine to call
 * @params: its parameters
 *
 * This routine is called by sys_msgget, sys_semget() and sys_shmget()
 * when the key is not IPC_PRIVATE.
 * It adds a new entry if the key is not found and does some permission
 * / security checkings if the key is found.
 *
 * On success, the ipc id is returned.
 */
static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
		const struct ipc_ops *ops, struct ipc_params *params)
{

//   static const struct ipc_ops shm_ops = {
//	   .getnew = newseg,
//	   .associate = shm_security,
//	   .more_checks = shm_more_checks,
//   };

//
//	shm_params.key = key;
//	shm_params.flg = shmflg;
//	shm_params.u.size = size;

	struct kern_ipc_perm *ipcp;
	int flg = params->flg;
	int err;

	/*
	 * Take the lock as a writer since we are potentially going to add
	 * a new entry + read locks are not "upgradable"
	 */
	down_write(&ids->rwsem);
	
	//look up whether a segment with this key already exists
	ipcp = ipc_findkey(ids, params->key);
	
	//if it does not exist, create it
	if (ipcp == NULL) {
		/* key not used */
		if (!(flg & IPC_CREAT))
			err = -ENOENT;
		else
			// newseg
			err = ops->getnew(ns, params);
	} else {
		/* ipc object has been locked by ipc_findkey() */

		if (flg & IPC_CREAT && flg & IPC_EXCL)
			err = -EEXIST;
		else {
			err = 0;
			if (ops->more_checks)
				err = ops->more_checks(ipcp, params);
			if (!err)
				/*
				 * ipc_check_perms returns the IPC id on
				 * success
				 */
				err = ipc_check_perms(ns, ipcp, ops, params);
		}
		ipc_unlock(ipcp);
	}
	up_write(&ids->rwsem);

	return err;
}
Here the code first checks whether a segment with the given key already exists. If it does, that segment is returned; if not, err = ops->getnew creates a new one, i.e. calls newseg:
//create a new shared memory segment

//1. check the parameters against the shared memory limits
//2. allocate the shared memory management structure shmid_kernel
//3. create the shared memory file in tmpfs to obtain physical memory
//4. add shmid_kernel to the shared memory radix tree and obtain its id
//5. initialize the shmid_kernel structure
//6. return the shared memory IPC id

// called from ipcget_public
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
{
	key_t key = params->key;
	int shmflg = params->flg;
	size_t size = params->u.size;
	int error;
	struct shmid_kernel *shp;
	
	//convert the size to a number of pages
	size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
	struct file *file;
	char name[13];
	int id;
	vm_flags_t acctflag = 0;
	
	//fail if the requested size is smaller than SHMMIN or larger than ns->shm_ctlmax, i.e. the value of /proc/sys/kernel/shmmax
	if (size < SHMMIN || size > ns->shm_ctlmax)
		return -EINVAL;

	if (numpages << PAGE_SHIFT < size)
		return -ENOSPC;

	//fail if the total number of pages used by shared memory would overflow or exceed the system-wide limit
	if (ns->shm_tot + numpages < ns->shm_tot ||
			ns->shm_tot + numpages > ns->shm_ctlall)
		return -ENOSPC;

    //allocate memory for the shmid_kernel object
    //allocated from the slab
	shp = ipc_rcu_alloc(sizeof(*shp));
	if (!shp)
		return -ENOMEM;

	shp->shm_perm.key = key;
	shp->shm_perm.mode = (shmflg & S_IRWXUGO);
	shp->mlock_user = NULL;

	shp->shm_perm.security = NULL;
	error = security_shm_alloc(shp);
	if (error) {
		ipc_rcu_putref(shp, ipc_rcu_free);
		return error;
	}

	sprintf(name, "SYSV%08x", key);
	//huge pages
	if (shmflg & SHM_HUGETLB) {
		struct hstate *hs;
		size_t hugesize;


		hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
		if (!hs) {
			error = -EINVAL;
			goto no_file;
		}
		hugesize = ALIGN(size, huge_page_size(hs));

		/* hugetlb_file_setup applies strict accounting */
		if (shmflg & SHM_NORESERVE)
			acctflag = VM_NORESERVE;
		file = hugetlb_file_setup(name, hugesize, acctflag,
				  &shp->mlock_user, HUGETLB_SHMFS_INODE,
				(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
	} else {
		/*
		 * Do not allow no accounting for OVERCOMMIT_NEVER, even
		 * if it's asked for.
		 */
		if  ((shmflg & SHM_NORESERVE) &&
				sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			acctflag = VM_NORESERVE;
		//create a file (inode) in shm's tmpfs and return a struct file for it.
		//Where does the file live? It is an unlinked file that user space cannot see.

		// shmem_ops
		//name = SYSV%08x  --> key
		file = shmem_kernel_file_setup(name, size, acctflag);
	}
	error = PTR_ERR(file);
	if (IS_ERR(file))
		goto no_file;

	shp->shm_cprid = task_tgid_vnr(current);
	shp->shm_lprid = 0;
	shp->shm_atim = shp->shm_dtim = 0;
	shp->shm_ctim = get_seconds();
	shp->shm_segsz = size;
	shp->shm_nattch = 0;
	
	//points to the file backing this shared memory segment; its f_mapping field points to the pages (physical memory) used by the segment ---> actually the address space object, struct address_space
	shp->shm_file = file;
	
	shp->shm_creator = current;

	//add shp to the ipc radix tree and return the corresponding id
	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
	if (id < 0) {
		error = id;
		goto no_id;
	}

	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);

	/*
	 * shmid gets reported as "inode#" in /proc/pid/maps.
	 * proc-ps tools use this. Changing this will break them.
	 */
	file_inode(file)->i_ino = shp->shm_perm.id;

	ns->shm_tot += numpages;
	error = shp->shm_perm.id;

	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();
	return error;

no_id:
	if (is_file_hugepages(file) && shp->mlock_user)
		user_shm_unlock(size, shp->mlock_user);
	fput(file);
no_file:
	ipc_rcu_putref(shp, shm_rcu_free);
	return error;
}


This function does several main things:
1 checks the size and the permissions

2 allocates a shmid_kernel object from the slab

3 takes different branches depending on whether the huge page flag SHM_HUGETLB was passed, ultimately creating a file in tmpfs, i.e. a file, dentry and inode

4 fills in the shmid_kernel object with the resulting file and the other parameters

5 adds the shmid_kernel object to the ipc radix tree to obtain the shmid, adds it to the sysvshm.shm_clist list, and returns the shmid (a small user-space sketch follows)
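
The shmid returned by newseg is what user space later passes to shmat and shmctl, and the segment it names can be inspected with shmctl(IPC_STAT). A small sketch (the key and size are again arbitrary):

#include <stdio.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	int shmid = shmget((key_t)0x1234, 8192, IPC_CREAT | 0666);
	if (shmid == -1) { perror("shmget"); return 1; }

	/* query the segment the kernel just created in tmpfs */
	struct shmid_ds ds;
	if (shmctl(shmid, IPC_STAT, &ds) == -1) { perror("shmctl"); return 1; }

	printf("shmid %d: size=%zu bytes, attaches=%lu, creator pid=%d, mode=%o\n",
	       shmid, (size_t)ds.shm_segsz, (unsigned long)ds.shm_nattch,
	       (int)ds.shm_cpid, (unsigned)(ds.shm_perm.mode & 0777));

	shmctl(shmid, IPC_RMID, NULL);   /* clean up */
	return 0;
}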

Next, taking the non-huge-page case as our example, we analyze shmem_kernel_file_setup, which is already inside the tmpfs filesystem code:
/**
 * shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
 * 	kernel internal.  There will be NO LSM permission checks against the
 * 	underlying inode.  So users of this interface must do LSM checks at a
 *	higher layer.  The users are the big_key and shm implementations.  LSM
 *	checks are provided at the key or shm level rather than the inode.
 * @name: name for dentry (to be seen in /proc/<pid>/maps
 * @size: size to be set for the file
 * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
    // shmem_ops
	return __shmem_file_setup(name, size, flags, S_PRIVATE);
}
//name for dentry (to be seen in /proc/<pid>/maps
static struct file *__shmem_file_setup(const char *name, loff_t size,
				       unsigned long flags, unsigned int i_flags)
{
	struct file *res;
	struct inode *inode;
	struct path path;
	struct super_block *sb;
	struct qstr this;

	if (IS_ERR(shm_mnt))
		return ERR_CAST(shm_mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	res = ERR_PTR(-ENOMEM);
	this.name = name;
	this.len = strlen(name);
	this.hash = 0; /* will go */
	//get the filesystem's superblock object directly through the global variable
	sb = shm_mnt->mnt_sb;
	
	//get the filesystem mount point
	path.mnt = mntget(shm_mnt);
	
	//a pseudo directory entry

	//create the dentry under the root directory of the filesystem mounted at shm_mnt
	path.dentry = d_alloc_pseudo(sb, &this);
	if (!path.dentry)
		goto put_memory;
	d_set_d_op(path.dentry, &anon_ops);

	res = ERR_PTR(-ENOSPC);

	//allocate an inode object
	//with or without CONFIG_SHMEM this goes through new_inode  --> alloc_inode   -> ( shmem_ops --> shmem_alloc_inode )

	//what gets allocated is an inode embedded in a shmem_inode_info
	
	inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
	if (!inode)
		goto put_memory;

	inode->i_flags |= i_flags;

	//associate the dentry with the inode
	d_instantiate(path.dentry, inode);
	//file size
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */

	//returns 0 when CONFIG_MMU is enabled
	res = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (IS_ERR(res))
		goto put_path;
	
    //build the file object
    //allocate a struct file pointing at this inode, with shmem_file_operations as its file operations
	res = alloc_file(&path, FMODE_WRITE | FMODE_READ,
		  &shmem_file_operations);

		  
	if (IS_ERR(res))
		goto put_path;

	return res;

put_memory:
	shmem_unacct_size(flags, size);
put_path:
	path_put(&path);
	return res;
}


The function is fairly simple: it obtains the tmpfs mount point described in section 2.3 directly through the global variable, i.e. the pseudo directory and the superblock object it contains, and then allocates a special inode via shmem_alloc_inode, a member of the tmpfs superblock operation set shmem_ops; with that, the file exists in tmpfs. That is not all: with the inode in hand it also builds the dentry, and the dentry is then used to build the file object that is returned. If you are familiar with the VFS this is very easy to follow. What we should pay attention to here is the file object's operation set, shmem_file_operations:
static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};





To summarize what shmget produces: it is really just an ordinary file-dentry-inode triple; what is special is that the inode is a special inode in tmpfs.

3.2 Analysis of the shmat Kernel Implementation

Entry via the system call:
SYSCALL_DEFINE3(shmat, int, shmid, char __user *, shmaddr, int, shmflg)
{
	unsigned long ret;
	long err;

	err = do_shmat(shmid, shmaddr, shmflg, &ret, SHMLBA);
	if (err)
		return err;
	force_successful_syscall_return();
	return (long)ret;
}
Then we enter do_shmat:
/*
 * Fix shmaddr, allocate descriptor, map shm, add attach descriptor to lists.
 *
 * NOTE! Despite the name, this is NOT a direct system call entrypoint. The
 * "raddr" thing points to kernel space, and there has to be a wrapper around
 * this.
 */
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
	      unsigned long shmlba)
{
	//shp is the structure containing the segment's file object
	struct shmid_kernel *shp;

	//addr holds the user-supplied address shmaddr after correction
	unsigned long addr;
	
	unsigned long size;
	struct file *file;
	int    err;
	unsigned long flags;
	unsigned long prot;
	int acc_mode;
	struct ipc_namespace *ns;

	//contains a file object and a vm_operations_struct object
	struct shm_file_data *sfd;
	struct path path;
	fmode_t f_mode;
	unsigned long populate = 0;

	err = -EINVAL;
	if (shmid < 0)
		goto out;

	//correct the user-supplied address shmaddr as needed
	else if ((addr = (ulong)shmaddr)) {
		if (addr & (shmlba - 1)) {
			if (shmflg & SHM_RND)
				addr &= ~(shmlba - 1);	   /* round down */
			else
#ifndef __ARCH_FORCE_SHMLBA
				if (addr & ~PAGE_MASK)
#endif
					goto out;
		}
		flags = MAP_SHARED | MAP_FIXED;
	} else {
		if ((shmflg & SHM_REMAP))
			goto out;

		flags = MAP_SHARED;
	}

	if (shmflg & SHM_RDONLY) {
		prot = PROT_READ;
		acc_mode = S_IRUGO;
		f_mode = FMODE_READ;
	} else {
		prot = PROT_READ | PROT_WRITE;
		acc_mode = S_IRUGO | S_IWUGO;
		f_mode = FMODE_READ | FMODE_WRITE;
	}
	if (shmflg & SHM_EXEC) {
		prot |= PROT_EXEC;
		acc_mode |= S_IXUGO;
	}

	/*
	 * We cannot rely on the fs check since SYSV IPC does have an
	 * additional creator id...
	 */
	ns = current->nsproxy->ipc_ns;
	rcu_read_lock();
	
	//look up the radix tree by shmid to get the shmid_kernel object
	//the most important field in that structure is shm_file, which points to the file backing the shared memory;
	//the file's f_mapping field (address_space) points to the radix tree that organizes the pages used by the segment
	
	shp = shm_obtain_object_check(ns, shmid);
	if (IS_ERR(shp)) {
		err = PTR_ERR(shp);
		goto out_unlock;
	}

	err = -EACCES;
	if (ipcperms(ns, &shp->shm_perm, acc_mode))
		goto out_unlock;

	err = security_shm_shmat(shp, shmaddr, shmflg);
	if (err)
		goto out_unlock;

	ipc_lock_object(&shp->shm_perm);

	/* check if shm_destroy() is tearing down shp */
	if (!ipc_valid_object(&shp->shm_perm)) {
		ipc_unlock_object(&shp->shm_perm);
		err = -EIDRM;
		goto out_unlock;
	}

	path = shp->shm_file->f_path;
	path_get(&path);
	
	//increment the segment's attach count
	shp->shm_nattch++;

	//file size
	size = i_size_read(d_inode(path.dentry));
	
	ipc_unlock_object(&shp->shm_perm);
	rcu_read_unlock();

	err = -ENOMEM;

	// allocate the shm_file_data structure from the slab
	sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
	if (!sfd) {
		path_put(&path);
		goto out_nattch;
	}

    //why allocate a new file object instead of reusing the one in shmid_kernel? ------> because one shared memory file can be mapped by many processes, and each mapping needs its own file object to represent it

    //allocate a new struct file pointing at the shared file, with shm_file_operations as its file operations ----> which contains the mmap member function
	file = alloc_file(&path, f_mode,
			  is_file_hugepages(shp->shm_file) ?
				&shm_file_operations_huge :
				&shm_file_operations);                      
	err = PTR_ERR(file);
	if (IS_ERR(file)) {
		kfree(sfd);
		path_put(&path);
		goto out_nattch;
	}

	file->private_data = sfd;
	
	//point at the shared file's address_space --> what is really shared is the address_space --> a radix tree of memory pages
	file->f_mapping = shp->shm_file->f_mapping;
	
	sfd->id = shp->shm_perm.id;
	sfd->ns = get_ipc_ns(ns);

	//point at the shared file's struct file
	sfd->file = shp->shm_file;
	sfd->vm_ops = NULL;

	err = security_mmap_file(file, prot, flags);
	if (err)
		goto out_fput;

	if (down_write_killable(&current->mm->mmap_sem)) {
		err = -EINTR;
		goto out_fput;
	}

	if (addr && !(shmflg & SHM_REMAP)) {
		err = -EINVAL;
		if (addr + size < addr)
			goto invalid;

		if (find_vma_intersection(current->mm, addr, addr + size))
			goto invalid;
	}
	
	//finally perform the memory mapping, completing the attach

	//do_mmap_pgoff --> do_mmap --> mmap_region --> file->f_op->mmap(file, vma) --> shm_mmap
	addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
	*raddr = addr;
	err = 0;
	if (IS_ERR_VALUE(addr))
		err = (long)addr;
invalid:
	up_write(&current->mm->mmap_sem);
	if (populate)
		mm_populate(addr, populate);

out_fput:
	fput(file);

out_nattch:
	down_write(&shm_ids(ns).rwsem);
	shp = shm_lock(ns, shmid);
	shp->shm_nattch--;
	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
	up_write(&shm_ids(ns).rwsem);
	return err;

out_unlock:
	rcu_read_unlock();
out:
	return err;
}
This function does several main things:
1 if the user passed in a mapping address, adjusts it as needed
2 looks up the radix tree with the given shmid to get shp, the shmid_kernel object describing the segment
3 builds a struct shm_file_data object sfd and initializes it with the file object from the shmid_kernel object and other fields
4 builds the file object used for the mapping and gives it the operation set shm_file_operations or shm_file_operations_huge, depending on whether the segment uses huge pages
5 calls do_mmap_pgoff to perform the mapping (a user-space sketch follows this list)
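
Because newseg stored the shmid in the tmpfs inode's i_ino, the mapping created by do_mmap_pgoff shows up in /proc/<pid>/maps as a /SYSV... entry whose inode number is the shmid. A small sketch that attaches a segment and then greps its own maps:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	int shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
	if (shmid == -1) { perror("shmget"); return 1; }

	void *addr = shmat(shmid, NULL, 0);
	if (addr == (void *)-1) { perror("shmat"); return 1; }
	printf("shmid %d attached at %p\n", shmid, addr);

	/* the mapping appears as /SYSV... with inode number == shmid */
	FILE *fp = fopen("/proc/self/maps", "r");
	char line[512];
	while (fp && fgets(line, sizeof(line), fp))
		if (strstr(line, "SYSV"))
			fputs(line, stdout);
	if (fp) fclose(fp);

	shmdt(addr);
	shmctl(shmid, IPC_RMID, NULL);
	return 0;
}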

Here we focus on the operation set shm_file_operations:
static const struct file_operations shm_file_operations = {
	.mmap		= shm_mmap,	//called when the file is mapped
	.fsync		= shm_fsync,
	.release	= shm_release,
	.get_unmapped_area	= shm_get_unmapped_area,
	.llseek		= noop_llseek,
	.fallocate	= shm_fallocate,
};

Executing do_mmap_pgoff follows the call chain do_mmap_pgoff --> do_mmap --> mmap_region --> file->f_op->mmap(file, vma) --> shm_mmap, which finally reaches shm_mmap:

static int shm_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct shm_file_data *sfd = shm_file_data(file);
	int ret;

	/*
	 * In case of remap_file_pages() emulation, the file can represent
	 * removed IPC ID: propogate shm_lock() error to caller.
	 */
	//bump the attach count (takes and drops the ipc lock)
	ret = __shm_open(vma);
	if (ret)
		return ret;
	//call the shared file's mmap, i.e. shmem_mmap
	ret = sfd->file->f_op->mmap(sfd->file, vma);
	if (ret) {
		shm_close(vma);
		return ret;
	}
	sfd->vm_ops = vma->vm_ops;
#ifdef CONFIG_MMU
	WARN_ON(!sfd->vm_ops->fault);
#endif
	//install the operation set shm_vm_ops into the vm_ops field of the vm_area_struct we obtained (newly created or already on the rbtree)
	vma->vm_ops = &shm_vm_ops;
	return 0;
}



Before continuing, let's briefly cover process address spaces and virtual memory areas. shmat maps the shared file into some virtual memory areas of the process address space (specifically in the MMAP region); these areas are described by vm_area_struct. Mapping a file into the process address space (an mmap operation) essentially creates a new vm_area_struct object and adds it to the red-black tree referenced by the process's memory descriptor mm, or, if the region happens to be covered already, reuses a vm_area_struct that is already in the tree.


/*
 * This struct defines a memory VMM memory area. There is one of these
 * per VM-area/task.  A VM area is any part of the process virtual memory
 * space that has a special rule for the page-fault handlers (ie a shared
 * library, the executable area etc).
 */
struct vm_area_struct {
	/* The first cache line has the info for VMA tree walking. */

	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next, *vm_prev;

	struct rb_node vm_rb;

	/*
	 * Largest free memory gap in bytes to the left of this VMA.
	 * Either between this VMA and vma->vm_prev, or between one of the
	 * VMAs below us in the VMA rbtree and its ->vm_prev. This helps
	 * get_unmapped_area find a free area of the right size.
	 */
	unsigned long rb_subtree_gap;

	/* Second cache line starts here. */

	struct mm_struct *vm_mm;	/* The address space we belong to. */
	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	unsigned long vm_flags;		/* Flags, see mm.h. */

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap interval tree.
	 */
	struct {
		struct rb_node rb;
		unsigned long rb_subtree_last;
	} shared;

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */

#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
};
The definition of vm_area_struct is shown above; we will not analyze it in detail, and interested readers can explore it on their own. What we care about here is the operation set of a virtual memory area, the vm_operations_struct object, and specifically shm_vm_ops:
static const struct vm_operations_struct shm_vm_ops = {
	.open	= shm_open,	//called when the area is added to an address space
	.close	= shm_close,	//called when the area is removed from an address space
	.fault	= shm_fault,	//called when a page not present in physical memory is accessed
#if defined(CONFIG_NUMA)
	.set_policy = shm_set_policy,
	.get_policy = shm_get_policy,
#endif
};
Now we go back to shm_mmap. The function does two things.

First, it calls the mmap member of the shared file's operation set, i.e. shmem_mmap:

static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
	file_accessed(file);
	vma->vm_ops = &shmem_vm_ops;
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
			((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
			(vma->vm_end & HPAGE_PMD_MASK)) {
		khugepaged_enter(vma, vma->vm_flags);
	}
	return 0;
}
This is simple: it assigns the memory area operation set shmem_vm_ops to the vm_ops field of the vm_area_struct we obtained (newly created or already on the rbtree).

Second, it then replaces the operation set installed by shmem_mmap with shm_vm_ops.

So the operation set of the vma object that describes our newly mapped memory area ends up being shm_vm_ops, the set listed above. When the area is added to an address space, shm_open is called; when it is removed, shm_close is called; and when a page not present in physical memory is accessed, i.e. on a page fault, shm_fault is called.
static int __shm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct shmid_kernel *shp;

	shp = shm_lock(sfd->ns, sfd->id);

	if (IS_ERR(shp))
		return PTR_ERR(shp);

	shp->shm_atim = get_seconds();
	shp->shm_lprid = task_tgid_vnr(current);
	shp->shm_nattch++;
	shm_unlock(shp);
	return 0;
}

/* This is called by fork, once for every shm attach. */
static void shm_open(struct vm_area_struct *vma)
{
	int err = __shm_open(vma);
	/*
	 * We raced in the idr lookup or with shm_destroy().
	 * Either way, the ID is busted.
	 */
	WARN_ON_ONCE(err);
}

static void shm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct shmid_kernel *shp;
	struct ipc_namespace *ns = sfd->ns;

	down_write(&shm_ids(ns).rwsem);
	/* remove from the list of attaches of the shm segment */
	shp = shm_lock(ns, sfd->id);

	/*
	 * We raced in the idr lookup or with shm_destroy().
	 * Either way, the ID is busted.
	 */
	if (WARN_ON_ONCE(IS_ERR(shp)))
		goto done; /* no-op */

	shp->shm_lprid = task_tgid_vnr(current);
	shp->shm_dtim = get_seconds();
	shp->shm_nattch--;
	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
done:
	up_write(&shm_ids(ns).rwsem);
}

Here we focus on shm_fault:

static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);

	return sfd->vm_ops->fault(vma, vmf);
}
It turns out to be just a wrapper around the sfd->vm_ops->fault call, i.e. shmem_fault:
static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vma->vm_file);
	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
	enum sgp_type sgp;
	int error;
	int ret = VM_FAULT_LOCKED;

	/*
	 * Trinity finds that probing a hole which tmpfs is punching can
	 * prevent the hole-punch from ever completing: which in turn
	 * locks writers out with its hold on i_mutex.  So refrain from
	 * faulting pages into the hole while it's being punched.  Although
	 * shmem_undo_range() does remove the additions, it may be unable to
	 * keep up, as each new page needs its own unmap_mapping_range() call,
	 * and the i_mmap tree grows ever slower to scan if new vmas are added.
	 *
	 * It does not matter if we sometimes reach this check just before the
	 * hole-punch begins, so that one fault then races with the punch:
	 * we just need to make racing faults a rare case.
	 *
	 * The implementation below would be much simpler if we just used a
	 * standard mutex or completion: but we cannot take i_mutex in fault,
	 * and bloating every shmem inode for this unlikely case would be sad.
	 */
	if (unlikely(inode->i_private)) {
		struct shmem_falloc *shmem_falloc;

		spin_lock(&inode->i_lock);
		shmem_falloc = inode->i_private;
		if (shmem_falloc &&
		    shmem_falloc->waitq &&
		    vmf->pgoff >= shmem_falloc->start &&
		    vmf->pgoff < shmem_falloc->next) {
			wait_queue_head_t *shmem_falloc_waitq;
			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

			ret = VM_FAULT_NOPAGE;
			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
			   !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
				/* It's polite to up mmap_sem if we can */
				up_read(&vma->vm_mm->mmap_sem);
				ret = VM_FAULT_RETRY;
			}

			shmem_falloc_waitq = shmem_falloc->waitq;
			prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
					TASK_UNINTERRUPTIBLE);
			spin_unlock(&inode->i_lock);
			schedule();

			/*
			 * shmem_falloc_waitq points into the shmem_fallocate()
			 * stack of the hole-punching task: shmem_falloc_waitq
			 * is usually invalid by the time we reach here, but
			 * finish_wait() does not dereference it in that case;
			 * though i_lock needed lest racing with wake_up_all().
			 */
			spin_lock(&inode->i_lock);
			finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
			spin_unlock(&inode->i_lock);
			return ret;
		}
		spin_unlock(&inode->i_lock);
	}

	sgp = SGP_CACHE;
	if (vma->vm_flags & VM_HUGEPAGE)
		sgp = SGP_HUGE;
	else if (vma->vm_flags & VM_NOHUGEPAGE)
		sgp = SGP_NOHUGE;
	//get a new page and add it to the radix tree pointed to by the inode's i_mapping field
	error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
				  gfp, vma->vm_mm, &ret);
	if (error)
		return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
	return ret;
}
The main work is calling shmem_getpage_gfp to find a new memory page and add it to the radix tree pointed to by the inode's i_mapping field, so that the faulting address is now backed by a physical page. We will not analyze further down that path.
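
This fault-driven allocation means physical pages are only assigned when the mapping is actually touched. A small sketch that watches the process's resident set size (the VmRSS line of /proc/self/status) before and after writing to an attached segment makes the effect visible; the 16 MB size is arbitrary:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

/* print the VmRSS line of /proc/self/status */
static void print_rss(const char *when)
{
	FILE *fp = fopen("/proc/self/status", "r");
	char line[256];
	while (fp && fgets(line, sizeof(line), fp))
		if (strncmp(line, "VmRSS:", 6) == 0)
			printf("%s %s", when, line);
	if (fp) fclose(fp);
}

int main(void)
{
	size_t size = 16UL * 1024 * 1024;
	int shmid = shmget(IPC_PRIVATE, size, IPC_CREAT | 0600);
	if (shmid == -1) { perror("shmget"); return 1; }

	char *p = shmat(shmid, NULL, 0);
	if (p == (void *)-1) { perror("shmat"); return 1; }

	print_rss("after shmat:  ");   /* no pages faulted in yet */
	memset(p, 0xab, size);         /* each page triggers shm_fault -> shmem_fault */
	print_rss("after memset: ");

	shmdt(p);
	shmctl(shmid, IPC_RMID, NULL);
	return 0;
}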

3.3 Analysis of the shmdt Kernel Implementation

Entry via the system call:
/*
 * detach and kill segment if marked destroyed.
 * The work is done in shm_close.
 */
SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long addr = (unsigned long)shmaddr;
	int retval = -EINVAL;
#ifdef CONFIG_MMU
	loff_t size = 0;
	struct file *file;
	struct vm_area_struct *next;
#endif

	if (addr & ~PAGE_MASK)
		return retval;

	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;

	/*
	 * This function tries to be smart and unmap shm segments that
	 * were modified by partial mlock or munmap calls:
	 * - It first determines the size of the shm segment that should be
	 *   unmapped: It searches for a vma that is backed by shm and that
	 *   started at address shmaddr. It records it's size and then unmaps
	 *   it.
	 * - Then it unmaps all shm vmas that started at shmaddr and that
	 *   are within the initially determined size and that are from the
	 *   same shm segment from which we determined the size.
	 * Errors from do_munmap are ignored: the function only fails if
	 * it's called with invalid parameters or if it's called to unmap
	 * a part of a vma. Both calls in this function are for full vmas,
	 * the parameters are directly copied from the vma itself and always
	 * valid - therefore do_munmap cannot fail. (famous last words?)
	 */
	/*
	 * If it had been mremap()'d, the starting address would not
	 * match the usual checks anyway. So assume all vma's are
	 * above the starting address given.
	 */

	//find the virtual memory area corresponding to the given start address
	vma = find_vma(mm, addr);

#ifdef CONFIG_MMU
	while (vma) {
		next = vma->vm_next;

		/*
		 * Check if the starting address would match, i.e. it's
		 * a fragment created by mprotect() and/or munmap(), or it
		 * otherwise it starts at this address with no hassles.
		 */
		 
		//shm_vm_ops contains the page fault callback; matching it identifies a shm mapping
		if ((vma->vm_ops == &shm_vm_ops) &&
			(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {

			/*
			 * Record the file of the shm segment being
			 * unmapped.  With mremap(), someone could place
			 * page from another segment but with equal offsets
			 * in the range we are unmapping.
			 */
			file = vma->vm_file;
			size = i_size_read(file_inode(vma->vm_file));
			
			//perform the unmapping
			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
			/*
			 * We discovered the size of the shm segment, so
			 * break out of here and fall through to the next
			 * loop that uses the size information to stop
			 * searching for matching vma's.
			 */
			retval = 0;
			vma = next;
			break;
		}
		vma = next;
	}

	/*
	 * We need look no further than the maximum address a fragment
	 * could possibly have landed at. Also cast things to loff_t to
	 * prevent overflows and make comparisons vs. equal-width types.
	 */
	size = PAGE_ALIGN(size);
	while (vma && (loff_t)(vma->vm_end - addr) <= size) {
		next = vma->vm_next;

		/* finding a matching vma now does not alter retval */
		if ((vma->vm_ops == &shm_vm_ops) &&
		    ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
		    (vma->vm_file == file))
			do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		vma = next;
	}

#else /* CONFIG_MMU */
	/* under NOMMU conditions, the exact address to be destroyed must be
	 * given */
	if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
		do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
		retval = 0;
	}

#endif

	up_write(&mm->mmap_sem);
	return retval;
}

The work here mostly relies on do_munmap, which removes the given memory area from the current address space; there does not seem to be much that is specific to shared memory, so we will not analyze it in detail. However, as noted in section 3.2 when analyzing shmat, the memory area registered with the process carries the operation set shm_vm_ops, whose member shm_close is called back when the area is removed from the process address space:
/*
 * remove the attach descriptor vma.
 * free memory for segment if it is marked destroyed.
 * The descriptor has already been removed from the current->mm->mmap list
 * and will later be kfree()d.
 */
static void shm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct shm_file_data *sfd = shm_file_data(file);
	struct shmid_kernel *shp;
	struct ipc_namespace *ns = sfd->ns;

	down_write(&shm_ids(ns).rwsem);
	/* remove from the list of attaches of the shm segment */
	shp = shm_lock(ns, sfd->id);

	/*
	 * We raced in the idr lookup or with shm_destroy().
	 * Either way, the ID is busted.
	 */
	if (WARN_ON_ONCE(IS_ERR(shp)))
		goto done; /* no-op */

	shp->shm_lprid = task_tgid_vnr(current);
	shp->shm_dtim = get_seconds();
	shp->shm_nattch--;
	//every time the shared memory is detached from a process address space, check whether the segment can be destroyed
	if (shm_may_destroy(ns, shp))
		shm_destroy(ns, shp);
	else
		shm_unlock(shp);
done:
	up_write(&shm_ids(ns).rwsem);
}
Each time, it checks whether the shared memory segment can be destroyed:
/*
 * shm_may_destroy - identifies whether shm segment should be destroyed now
 *
 * Returns true if and only if there are no active users of the segment and
 * one of the following is true:
 *
 * 1) shmctl(id, IPC_RMID, NULL) was called for this shp
 *
 * 2) sysctl kernel.shm_rmid_forced is set to 1.
 */
static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	return (shp->shm_nattch == 0) &&
	       (ns->shm_rmid_forced ||
	      //check the SHM_DEST flag
		(shp->shm_perm.mode & SHM_DEST));
}
If the shared memory segment has been marked SHM_DEST and the other conditions are met, shm_destroy is called to delete the segment.
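
A common user-space pattern relies on exactly this behavior: call shmctl(IPC_RMID) right after attaching, which only sets SHM_DEST while nattch > 0, and let the kernel free the segment automatically on the last detach. A small sketch:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
	int shmid = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
	if (shmid == -1) { perror("shmget"); return 1; }

	char *p = shmat(shmid, NULL, 0);
	if (p == (void *)-1) { perror("shmat"); return 1; }

	/* mark the segment for destruction; with nattch > 0 this only sets SHM_DEST */
	if (shmctl(shmid, IPC_RMID, NULL) == -1) { perror("shmctl"); return 1; }

	/* the mapping is still usable until we detach */
	strcpy(p, "still alive after IPC_RMID");
	printf("%s\n", p);

	/* the last detach drops nattch to 0 and shm_may_destroy() lets shm_destroy() run */
	shmdt(p);
	return 0;
}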

3.4 Analysis of the shmctl Kernel Implementation

Entry via the system call:

SYSCALL_DEFINE3(shmctl, int, shmid, int, cmd, struct shmid_ds __user *, buf)
{
	struct shmid_kernel *shp;
	int err, version;
	struct ipc_namespace *ns;

	if (cmd < 0 || shmid < 0)
		return -EINVAL;

	version = ipc_parse_version(&cmd);
	//the ipc namespace the current process belongs to --> the foundation of containers such as docker
	ns = current->nsproxy->ipc_ns;

	switch (cmd) {
	case IPC_INFO:
	case SHM_INFO:
	case SHM_STAT:
	case IPC_STAT:
		//get information about the shared memory
		return shmctl_nolock(ns, shmid, cmd, version, buf);
	case IPC_RMID:
	case IPC_SET:
		//set attributes or remove the id
		return shmctl_down(ns, shmid, cmd, buf, version);
	case SHM_LOCK:
	case SHM_UNLOCK:
	{
		struct file *shm_file;

		rcu_read_lock();
		
		//look up the radix tree to get the corresponding struct shmid_kernel object
		shp = shm_obtain_object_check(ns, shmid);
		if (IS_ERR(shp)) {
			err = PTR_ERR(shp);
			goto out_unlock1;
		}

		audit_ipc_obj(&(shp->shm_perm));
		err = security_shm_shmctl(shp, cmd);
		if (err)
			goto out_unlock1;

		ipc_lock_object(&shp->shm_perm);

		/* check if shm_destroy() is tearing down shp */
		if (!ipc_valid_object(&shp->shm_perm)) {
			err = -EIDRM;
			goto out_unlock0;
		}

		if (!ns_capable(ns->user_ns, CAP_IPC_LOCK)) {
			kuid_t euid = current_euid();
			if (!uid_eq(euid, shp->shm_perm.uid) &&
			    !uid_eq(euid, shp->shm_perm.cuid)) {
				err = -EPERM;
				goto out_unlock0;
			}
			if (cmd == SHM_LOCK && !rlimit(RLIMIT_MEMLOCK)) {
				err = -EPERM;
				goto out_unlock0;
			}
		}

		shm_file = shp->shm_file;
		if (is_file_hugepages(shm_file))
			goto out_unlock0;

		if (cmd == SHM_LOCK) {
			struct user_struct *user = current_user();
			
			err = shmem_lock(shm_file, 1, user);
			if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
				//the lock succeeded and the segment was not yet in locked mode: mark it locked
				shp->shm_perm.mode |= SHM_LOCKED;
				shp->mlock_user = user;
			}
			goto out_unlock0;
		}

		/* SHM_UNLOCK */
		if (!(shp->shm_perm.mode & SHM_LOCKED))
			goto out_unlock0;
		shmem_lock(shm_file, 0, shp->mlock_user);
		shp->shm_perm.mode &= ~SHM_LOCKED;
		shp->mlock_user = NULL;
		get_file(shm_file);
		ipc_unlock_object(&shp->shm_perm);
		rcu_read_unlock();
		
		shmem_unlock_mapping(shm_file->f_mapping);

		fput(shm_file);
		return err;
	}
	default:
		return -EINVAL;
	}

out_unlock0:
	ipc_unlock_object(&shp->shm_perm);
out_unlock1:
	rcu_read_unlock();
	return err;
}
Here the various shared memory / IPC commands are handled. We first analyze IPC_RMID, which removes a shared memory segment; it is handled by shmctl_down, which looks up the segment's shmid_kernel object itself:
/*
 * This function handles some shmctl commands which require the rwsem
 * to be held in write mode.
 * NOTE: no locks must be held, the rwsem is taken inside this function.
 */
static int shmctl_down(struct ipc_namespace *ns, int shmid, int cmd,
		       struct shmid_ds __user *buf, int version)
{
	struct kern_ipc_perm *ipcp;
	struct shmid64_ds shmid64;
	struct shmid_kernel *shp;
	int err;

	if (cmd == IPC_SET) {
		if (copy_shmid_from_user(&shmid64, buf, version))
			return -EFAULT;
	}

	down_write(&shm_ids(ns).rwsem);
	rcu_read_lock();

	ipcp = ipcctl_pre_down_nolock(ns, &shm_ids(ns), shmid, cmd,
				      &shmid64.shm_perm, 0);
	if (IS_ERR(ipcp)) {
		err = PTR_ERR(ipcp);
		goto out_unlock1;
	}

	//get the corresponding shmid_kernel object
	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	err = security_shm_shmctl(shp, cmd);
	if (err)
		goto out_unlock1;

	switch (cmd) {
	case IPC_RMID:
		//handle the command that removes the shared memory segment
		ipc_lock_object(&shp->shm_perm);
		/* do_shm_rmid unlocks the ipc object and rcu */
		do_shm_rmid(ns, ipcp);
		goto out_up;
	case IPC_SET:
		ipc_lock_object(&shp->shm_perm);
		err = ipc_update_perm(&shmid64.shm_perm, ipcp);
		if (err)
			goto out_unlock0;
		shp->shm_ctim = get_seconds();
		break;
	default:
		err = -EINVAL;
		goto out_unlock1;
	}

out_unlock0:
	ipc_unlock_object(&shp->shm_perm);
out_unlock1:
	rcu_read_unlock();
out_up:
	up_write(&shm_ids(ns).rwsem);
	return err;
}

Next we enter do_shm_rmid:

/*
 * Called with shm_ids.rwsem (writer) and the shp structure locked.
 * Only shm_ids.rwsem remains locked on exit.
 */
static void do_shm_rmid(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
{
	struct shmid_kernel *shp;
	shp = container_of(ipcp, struct shmid_kernel, shm_perm);

	//if processes are still attached to the segment, set the destroy flag and make the key private
	if (shp->shm_nattch) {
		shp->shm_perm.mode |= SHM_DEST;
		/* Do not find it any more */
		shp->shm_perm.key = IPC_PRIVATE;
		shm_unlock(shp);
	} else
		//otherwise destroy the segment now
		shm_destroy(ns, shp);
}
There are two cases here. If processes are still attached to the segment, only the destroy flag SHM_DEST is set and nothing is actually deleted yet. Otherwise the segment is deleted right away by shm_destroy:

/*
 * shm_destroy - free the struct shmid_kernel
 *
 * @ns: namespace
 * @shp: struct to free
 *
 * It has to be called with shp and shm_ids.rwsem (writer) locked,
 * but returns with shp unlocked and freed.
 */
static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
{
	struct file *shm_file;

	shm_file = shp->shm_file;
	shp->shm_file = NULL;
	//subtract the segment's pages from the total in use
	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
	//remove the id
	shm_rmid(ns, shp);
	shm_unlock(shp);
	if (!is_file_hugepages(shm_file))
		shmem_lock(shm_file, 0, shp->mlock_user);
	else if (shp->mlock_user)
		user_shm_unlock(i_size_read(file_inode(shm_file)),
				shp->mlock_user);
	//drop the reference count on the file
	fput(shm_file);
	ipc_rcu_putref(shp, shm_rcu_free);
}

Here we only see the page count being reduced and the id being removed; we do not see the inode and related resources being reclaimed. I will come back and study that in detail later, when I have time and have studied the ext filesystems.



