介绍
我看的是linux-4.2.3的源码。参考了《边干边学——Linux内核指导》(鬼畜的书名)第16章内容,他们用的是2.6.15的内核源码。
现在linux中可以使用共享内存的方式有两种
- POSIX的shm_open()在/dev/shm/下打开一个文件,用mmap()映射到进程自己的内存地址
- System V的shmget()得到一个共享内存对象的id,用shmat()映射到进程自己的内存地址
POSIX的实现是基于tmpfs的,函数都写在libc里,没什么好说的,主要还是看System V的实现方式。在System V中共享内存属于IPC子系统。所谓ipc,就是InterProcess Communication即进程间通信的意思,System V比前面的Unix增加了3种进程间通信的方式,共享内存、消息队列、信号量,统称IPC。主要代码在以下文件中
-
ipc/shm.c
-
include/linux/shm.h
-
ipc/util.c
-
ipc/util.h
-
include/linux/ipc.h
同一块共享内存在内核中至少有3个标识符
- IPC对象id(IPC对象是保存IPC信息的数据结构)
- 进程虚拟内存中文件的inode,即每个进程中的共享内存也是以文件的方式存在的,但并不是显式的。可以通过某个vm_area_struct->vm_file->f_dentry->d_inode->i_ino表示
- IPC对象的key。如果在shmget()中传入同一个key可以获取到同一块共享内存。但由于key是用户指定的,可能重复,而且也很少有程序在编写之前会约定一个key,所以这种方法不是很常用。通常System V这种共享内存的方式是用于有父子关系的进程的。或者用ftok()函数用路径名来生成一个key。
首先看一下在内核中表示一块共享内存的数据结构,在include/linux/shm.h
中
/* */
是内核源码的注释,//
是我的注释
-
struct shmid_kernel /* private to the kernel */
-
{
-
struct kern_ipc_perm shm_perm; // 权限,这个结构体中还有一些重要的内容,后面会提到
-
struct file *shm_file; // 表示这块共享内存的内核文件,文件内容即共享内存的内容
-
unsigned long shm_nattch; // 连接到这块共享内存的进程数
-
unsigned long shm_segsz; // 大小,字节为单位
-
time_t shm_atim; // 最后一次连接时间
-
time_t shm_dtim; // 最后一次断开时间
-
time_t shm_ctim; // 最后一次更改信息的时间
-
pid_t shm_cprid; // 创建者进程id
-
pid_t shm_lprid; // 最后操作者进程id
-
struct user_struct *mlock_user;
-
/* The task created the shm object. NULL if the task is dead. */
-
struct task_struct *shm_creator;
-
struct list_head shm_clist; /* list by creator */
-
};
再看一下struct shmid_kernel
中存储权限信息的shm_perm
,在include/linux/ipc.h
中
-
/* used by in-kernel data structures */
-
struct kern_ipc_perm
-
{
-
spinlock_t lock;
-
bool deleted;
-
int id; // IPC对象id
-
key_t key; // IPC对象键值,即创建共享内存时用户指定的
-
kuid_t uid; // IPC对象拥有者id
-
kgid_t gid; // 组id
-
kuid_t cuid; // 创建者id
-
kgid_t cgid;
-
umode_t mode;
-
unsigned long seq;
-
void *security;
-
};
为啥有这样一个struct呢?因为这些权限、id、key是IPC对象都有的属性,所以比如表示semaphore的结构struct semid_kernel
中也有一个这样的struct kern_ipc_perm
。然后在传递IPC对象的时候,传的也是struct kern_ipc_perm
的指针,再用container_of
这样的宏获得外面的struct,这样就能用同一个函数操作3种IPC对象,达到较好的代码重用。
接下来我们看一下共享内存相关函数。首先它们都是系统调用,对应的用户API在libc里面,参数是相同的,只是libc中的API做了一些调用系统调用需要的日常工作(保护现场、恢复现场之类的),所以就直接看这个系统调用了。
声明在include/linux/syscalls.h
中
-
asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
-
asmlinkage long sys_shmget(key_t key, size_t size, int flag);
-
asmlinkage long sys_shmdt(char __user *shmaddr);
-
asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
定义在ipc/shm.c
中
shmget
-
SYSCALL_DEFINE3(shmget, key_t, key, size_t, size, int, shmflg)
-
{
-
struct ipc_namespace *ns;
-
static const struct ipc_ops shm_ops = {
-
.getnew = newseg,
-
.associate = shm_security,
-
.more_checks = shm_more_checks,
-
};
-
struct ipc_params shm_params;
-
ns = current->nsproxy->ipc_ns;
-
shm_params.key = key;
-
shm_params.flg = shmflg;
-
shm_params.u.size = size;
-
return ipcget(ns, &shm_ids(ns), &shm_ops, &shm_params);
-
}
首先看到这个函数定义可能会很奇怪,不过这个SYSCALL_DEFINE3
的宏展开来最后形式肯定和.h文件中声明的一样,即还是long sys_shmget(key_t key, size_t size, int flag)。
这个宏是为了修一个bug,纯粹黑科技,这里不提它。
然后这里实际调用的函数是ipcget()
。为了统一一个ipc的接口也是煞费苦心,共享内存、信号量、消息队列三种对象创建的时候都会调用这个函数,但其实创建的逻辑并不在这里,而在shm_ops
中的三个函数里。
namespace
顺便提一下其中的current->nsproxy->ipc_ns
。这个的类型是struct ipc_namespace
。它是啥呢?我们知道,共享内存这些进程间通信的数据结构是全局的,但有时候需要把它们隔离开,即某一组进程并不知道另外的进程的共享内存,它们只希望在组内共用这些东西,这样就不会与其他进程冲突。于是就煞费苦心在内核中加了一个namespace。只要在clone()
函数中加入CLONE_NEWIPC
标志就能创建一个新的IPC namespace。
那么这个IPC namespace和我们的共享内存的数据结构有什么关系呢,可以看一下结构体
-
struct ipc_ids {
-
int in_use;
-
unsigned short seq;
-
struct rw_semaphore rwsem;
-
struct idr ipcs_idr;
-
int next_id;
-
};
-
struct ipc_namespace {
-
atomic_t count;
-
struct ipc_ids ids[3];
-
...
-
};
比较重要的是其中的ids
,它存的是所有IPC对象的id,其中共享内存都存在ids[2]
中。而在ids[2]
中真正负责管理数据的是ipcs_idr
,它也是内核中一个煞费苦心弄出来的id管理机制,一个id可以对应任意唯一确定的对象。把它理解成一个数组就好。它们之间的关系大概如下图所示。
-
[0] struct kern_ipc_perm <==> struct shmid_kernel
-
struct ipc_namespace => struct ipc_ids => struct idr => [1] struct kern_ipc_perm <==> struct shmid_kernel
-
[2] struct kern_ipc_perm <==> struct shmid_kernel
回到shmget
好的,我们回头来看看shmget()
究竟干了啥,首先看一下ipcget()
-
int ipcget(struct ipc_namespace *ns, struct ipc_ids *ids,
-
const struct ipc_ops *ops, struct ipc_params *params)
-
{
-
if (params->key == IPC_PRIVATE)
-
return ipcget_new(ns, ids, ops, params);
-
else
-
return ipcget_public(ns, ids, ops, params);
-
}
如果传进来的参数是IPC_PRIVATE
(这个宏的值是0)的话,无论是什么mode,都会创建一块新的共享内存。如果非0,则会去已有的共享内存中找有没有这个key的,有就返回;没有的话,若指定了IPC_CREAT标志就新建,否则返回-ENOENT。
首先看一下新建的函数newseg()
-
static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
-
{
-
key_t key = params->key;
-
int shmflg = params->flg;
-
size_t size = params->u.size;
-
int error;
-
struct shmid_kernel *shp;
-
size_t numpages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
-
struct file *file;
-
char name[13];
-
int id;
-
vm_flags_t acctflag = 0;
-
if (size < SHMMIN || size > ns->shm_ctlmax)
-
return -EINVAL;
-
if (numpages << PAGE_SHIFT < size)
-
return -ENOSPC;
-
if (ns->shm_tot + numpages < ns->shm_tot ||
-
ns->shm_tot + numpages > ns->shm_ctlall)
-
return -ENOSPC;
-
shp = ipc_rcu_alloc(sizeof(*shp));
-
if (!shp)
-
return -ENOMEM;
-
shp->shm_perm.key = key;
-
shp->shm_perm.mode = (shmflg & S_IRWXUGO);
-
shp->mlock_user = NULL;
-
shp->shm_perm.security = NULL;
-
error = security_shm_alloc(shp);
-
if (error) {
-
ipc_rcu_putref(shp, ipc_rcu_free);
-
return error;
-
}
-
sprintf(name, "SYSV%08x", key);
-
if (shmflg & SHM_HUGETLB) {
-
struct hstate *hs;
-
size_t hugesize;
-
hs = hstate_sizelog((shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
-
if (!hs) {
-
error = -EINVAL;
-
goto no_file;
-
}
-
hugesize = ALIGN(size, huge_page_size(hs));
-
/* hugetlb_file_setup applies strict accounting */
-
if (shmflg & SHM_NORESERVE)
-
acctflag = VM_NORESERVE;
-
file = hugetlb_file_setup(name, hugesize, acctflag,
-
&shp->mlock_user, HUGETLB_SHMFS_INODE,
-
(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
-
} else {
-
/*
-
* Do not allow no accounting for OVERCOMMIT_NEVER, even
-
* if it's asked for.
-
*/
-
if ((shmflg & SHM_NORESERVE) &&
-
sysctl_overcommit_memory != OVERCOMMIT_NEVER)
-
acctflag = VM_NORESERVE;
-
file = shmem_kernel_file_setup(name, size, acctflag);
-
}
-
error = PTR_ERR(file);
-
if (IS_ERR(file))
-
goto no_file;
-
id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
-
if (id < 0) {
-
error = id;
-
goto no_id;
-
}
-
shp->shm_cprid = task_tgid_vnr(current);
-
shp->shm_lprid = 0;
-
shp->shm_atim = shp->shm_dtim = 0;
-
shp->shm_ctim = get_seconds();
-
shp->shm_segsz = size;
-
shp->shm_nattch = 0;
-
shp->shm_file = file;
-
shp->shm_creator = current;
-
list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
-
/*
-
* shmid gets reported as "inode#" in /proc/pid/maps.
-
* proc-ps tools use this. Changing this will break them.
-
*/
-
file_inode(file)->i_ino = shp->shm_perm.id;
-
ns->shm_tot += numpages;
-
error = shp->shm_perm.id;
-
ipc_unlock_object(&shp->shm_perm);
-
rcu_read_unlock();
-
return error;
-
no_id:
-
if (is_file_hugepages(file) && shp->mlock_user)
-
user_shm_unlock(size, shp->mlock_user);
-
fput(file);
-
no_file:
-
ipc_rcu_putref(shp, shm_rcu_free);
-
return error;
-
}
这个函数首先几个if检查size是不是合法的参数,并且检查有没有足够的pages。然后调用ipc_rcu_alloc()
函数给共享内存数据结构shp分配空间。然后把一些参数写到shp的shm_perm成员中。然后sprintf下面那个大的if-else是为表示共享内存内容的file分配空间。再然后ipc_addid()
是一个比较重要的函数,它把刚才新建的这个共享内存的数据结构的指针加入到namespace的ids里,即可以想象成加入到数组里,并获得一个可以找到它的id。这里的id并不完全是数组的下标,因为要避免重复,所以这里有一个简单的机制来保证生成的id几乎是unique的,即ids里面有个seq变量,每次新加入共享内存对象时都会加1,而真正的id是这样生成的SEQ_MULTIPLIER * seq + id
。然后初始化一些成员,再把这个数据结构的指针加到当前进程的一个list里。这个函数的工作就基本完成了。
接下来我们再看一下如果创建时传入一个已有的key,即ipcget_public()
的逻辑
-
static int ipcget_public(struct ipc_namespace *ns, struct ipc_ids *ids,
-
const struct ipc_ops *ops, struct ipc_params *params)
-
{
-
struct kern_ipc_perm *ipcp;
-
int flg = params->flg;
-
int err;
-
/*
-
* Take the lock as a writer since we are potentially going to add
-
* a new entry + read locks are not "upgradable"
-
*/
-
down_write(&ids->rwsem);
-
ipcp = ipc_findkey(ids, params->key);
-
if (ipcp == NULL) {
-
/* key not used */
-
if (!(flg & IPC_CREAT))
-
err = -ENOENT;
-
else
-
err = ops->getnew(ns, params);
-
} else {
-
/* ipc object has been locked by ipc_findkey() */
-
if (flg & IPC_CREAT && flg & IPC_EXCL)
-
err = -EEXIST;
-
else {
-
err = 0;
-
if (ops->more_checks)
-
err = ops->more_checks(ipcp, params);
-
if (!err)
-
/*
-
* ipc_check_perms returns the IPC id on
-
* success
-
*/
-
err = ipc_check_perms(ns, ipcp, ops, params);
-
}
-
ipc_unlock(ipcp);
-
}
-
up_write(&ids->rwsem);
-
return err;
-
}
逻辑非常简单,先去找有没有这个key。没有的话,如果指定了IPC_CREAT标志就创建一个新的(否则返回-ENOENT),注意ops->getnew()
对应的就是刚才的newseg()
函数。如果找到了就判断一下权限有没有问题,没有问题就直接返回IPC id。
可以再看下ipc_findkey()
这个函数
-
static struct kern_ipc_perm *ipc_findkey(struct ipc_ids *ids, key_t key)
-
{
-
struct kern_ipc_perm *ipc;
-
int next_id;
-
int total;
-
for (total = 0, next_id = 0; total < ids->in_use; next_id++) {
-
ipc = idr_find(&ids->ipcs_idr, next_id);
-
if (ipc == NULL)
-
continue;
-
if (ipc->key != key) {
-
total++;
-
continue;
-
}
-
rcu_read_lock();
-
ipc_lock_object(ipc);
-
return ipc;
-
}
-
return NULL;
-
}
逻辑也很简单,注意到ids->ipcs_idr
就是之前提到的Integer ID Management机制,里面存的就是shmid和对象一一对应的关系。然后这里可以看到ids->in_use
表示的是共享内存的个数,由于中间的有些可能删掉了,所以total在找到一个不为空的共享内存的时候才++。然后我们也可以看到,这里对重复的key并没有做任何处理。所以我们在编程的时候也应该避免直接约定用某一个数字当key。
shmat
接下来我们看一下shmat()
,它的逻辑全在do_shmat()
中,所以我们直接看这个函数。
-
long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
-
unsigned long shmlba)
-
{
-
struct shmid_kernel *shp;
-
unsigned long addr;
-
unsigned long size;
-
struct file *file;
-
int err;
-
unsigned long flags;
-
unsigned long prot;
-
int acc_mode;
-
struct ipc_namespace *ns;
-
struct shm_file_data *sfd;
-
struct path path;
-
fmode_t f_mode;
-
unsigned long populate = 0;
-
err = -EINVAL;
-
if (shmid < 0)
-
goto out;
-
else if ((addr = (ulong)shmaddr)) {
-
if (addr & (shmlba - 1)) {
-
if (shmflg & SHM_RND)
-
addr &= ~(shmlba - 1); /* round down */
-
else
-
#ifndef __ARCH_FORCE_SHMLBA
-
if (addr & ~PAGE_MASK)
-
#endif
-
goto out;
-
}
-
flags = MAP_SHARED | MAP_FIXED;
-
} else {
-
if ((shmflg & SHM_REMAP))
-
goto out;
-
flags = MAP_SHARED;
-
}
-
if (shmflg & SHM_RDONLY) {
-
prot = PROT_READ;
-
acc_mode = S_IRUGO;
-
f_mode = FMODE_READ;
-
} else {
-
prot = PROT_READ | PROT_WRITE;
-
acc_mode = S_IRUGO | S_IWUGO;
-
f_mode = FMODE_READ | FMODE_WRITE;
-
}
-
if (shmflg & SHM_EXEC) {
-
prot |= PROT_EXEC;
-
acc_mode |= S_IXUGO;
-
}
-
/*
-
* We cannot rely on the fs check since SYSV IPC does have an
-
* additional creator id...
-
*/
-
ns = current->nsproxy->ipc_ns;
-
rcu_read_lock();
-
shp = shm_obtain_object_check(ns, shmid);
-
if (IS_ERR(shp)) {
-
err = PTR_ERR(shp);
-
goto out_unlock;
-
}
-
err = -EACCES;
-
if (ipcperms(ns, &shp->shm_perm, acc_mode))
-
goto out_unlock;
-
err = security_shm_shmat(shp, shmaddr, shmflg);
-
if (err)
-
goto out_unlock;
-
ipc_lock_object(&shp->shm_perm);
-
/* check if shm_destroy() is tearing down shp */
-
if (!ipc_valid_object(&shp->shm_perm)) {
-
ipc_unlock_object(&shp->shm_perm);
-
err = -EIDRM;
-
goto out_unlock;
-
}
-
path = shp->shm_file->f_path;
-
path_get(&path);
-
shp->shm_nattch++;
-
size = i_size_read(d_inode(path.dentry));
-
ipc_unlock_object(&shp->shm_perm);
-
rcu_read_unlock();
-
err = -ENOMEM;
-
sfd = kzalloc(sizeof(*sfd), GFP_KERNEL);
-
if (!sfd) {
-
path_put(&path);
-
goto out_nattch;
-
}
-
file = alloc_file(&path, f_mode,
-
is_file_hugepages(shp->shm_file) ?
-
&shm_file_operations_huge :
-
&shm_file_operations);
-
err = PTR_ERR(file);
-
if (IS_ERR(file)) {
-
kfree(sfd);
-
path_put(&path);
-
goto out_nattch;
-
}
-
file->private_data = sfd;
-
file->f_mapping = shp->shm_file->f_mapping;
-
sfd->id = shp->shm_perm.id;
-
sfd->ns = get_ipc_ns(ns);
-
sfd->file = shp->shm_file;
-
sfd->vm_ops = NULL;
-
err = security_mmap_file(file, prot, flags);
-
if (err)
-
goto out_fput;
-
down_write(&current->mm->mmap_sem);
-
if (addr && !(shmflg & SHM_REMAP)) {
-
err = -EINVAL;
-
if (addr + size < addr)
-
goto invalid;
-
if (find_vma_intersection(current->mm, addr, addr + size))
-
goto invalid;
-
}
-
addr = do_mmap_pgoff(file, addr, size, prot, flags, 0, &populate);
-
*raddr = addr;
-
err = 0;
-
if (IS_ERR_VALUE(addr))
-
err = (long)addr;
-
invalid:
-
up_write(&current->mm->mmap_sem);
-
if (populate)
-
mm_populate(addr, populate);
-
out_fput:
-
fput(file);
-
out_nattch:
-
down_write(&shm_ids(ns).rwsem);
-
shp = shm_lock(ns, shmid);
-
shp->shm_nattch--;
-
if (shm_may_destroy(ns, shp))
-
shm_destroy(ns, shp);
-
else
-
shm_unlock(shp);
-
up_write(&shm_ids(ns).rwsem);
-
return err;
-
out_unlock:
-
rcu_read_unlock();
-
out:
-
return err;
-
}
首先检查shmaddr的合法性并进行对齐,即调整为shmlba的整数倍。如果传入addr是0,前面检查部分只会加上一个MAP_SHARED标志,因为后面的mmap会自动为其分配地址。然后从那一段两行的注释开始,函数通过shmid尝试获取共享内存对象,并进行权限检查。然后修改shp中的一些数据,比如连接进程数加一。然后是通过alloc_file()
创建真正的要做mmap的file。在mmap之前还要对地址空间进行检查,检查是否和别的地址重叠,是否够用。实际的映射工作就在do_mmap_pgoff()
函数中做了。
shmdt
-
SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
-
{
-
struct mm_struct *mm = current->mm;
-
struct vm_area_struct *vma;
-
unsigned long addr = (unsigned long)shmaddr;
-
int retval = -EINVAL;
-
#ifdef CONFIG_MMU
-
loff_t size = 0;
-
struct file *file;
-
struct vm_area_struct *next;
-
#endif
-
if (addr & ~PAGE_MASK)
-
return retval;
-
down_write(&mm->mmap_sem);
-
/*
-
* This function tries to be smart and unmap shm segments that
-
* were modified by partial mlock or munmap calls:
-
* - It first determines the size of the shm segment that should be
-
* unmapped: It searches for a vma that is backed by shm and that
-
* started at address shmaddr. It records it's size and then unmaps
-
* it.
-
* - Then it unmaps all shm vmas that started at shmaddr and that
-
* are within the initially determined size and that are from the
-
* same shm segment from which we determined the size.
-
* Errors from do_munmap are ignored: the function only fails if
-
* it's called with invalid parameters or if it's called to unmap
-
* a part of a vma. Both calls in this function are for full vmas,
-
* the parameters are directly copied from the vma itself and always
-
* valid - therefore do_munmap cannot fail. (famous last words?)
-
*/
-
/*
-
* If it had been mremap()'d, the starting address would not
-
* match the usual checks anyway. So assume all vma's are
-
* above the starting address given.
-
*/
-
vma = find_vma(mm, addr);
-
#ifdef CONFIG_MMU
-
while (vma) {
-
next = vma->vm_next;
-
/*
-
* Check if the starting address would match, i.e. it's
-
* a fragment created by mprotect() and/or munmap(), or it
-
* otherwise it starts at this address with no hassles.
-
*/
-
if ((vma->vm_ops == &shm_vm_ops) &&
-
(vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) {
-
/*
-
* Record the file of the shm segment being
-
* unmapped. With mremap(), someone could place
-
* page from another segment but with equal offsets
-
* in the range we are unmapping.
-
*/
-
file = vma->vm_file;
-
size = i_size_read(file_inode(vma->vm_file));
-
do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
-
/*
-
* We discovered the size of the shm segment, so
-
* break out of here and fall through to the next
-
* loop that uses the size information to stop
-
* searching for matching vma's.
-
*/
-
retval = 0;
-
vma = next;
-
break;
-
}
-
vma = next;
-
}
-
/*
-
* We need look no further than the maximum address a fragment
-
* could possibly have landed at. Also cast things to loff_t to
-
* prevent overflows and make comparisons vs. equal-width types.
-
*/
-
size = PAGE_ALIGN(size);
-
while (vma && (loff_t)(vma->vm_end - addr) <= size) {
-
next = vma->vm_next;
-
/* finding a matching vma now does not alter retval */
-
if ((vma->vm_ops == &shm_vm_ops) &&
-
((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) &&
-
(vma->vm_file == file))
-
do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
-
vma = next;
-
}
-
#else /* CONFIG_MMU */
-
/* under NOMMU conditions, the exact address to be destroyed must be
-
* given */
-
if (vma && vma->vm_start == addr && vma->vm_ops == &shm_vm_ops) {
-
do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start);
-
retval = 0;
-
}
-
#endif
-
up_write(&mm->mmap_sem);
-
return retval;
-
}
接下来是shmdt()
,这个函数非常简单,找到传入的shmaddr对应的虚拟内存数据结构vma,检查它的地址是不是正确的,然后调用do_munmap()
函数断开对共享内存的连接。注意此操作并不会销毁共享内存,即使没有进程连接到它也不会,只有手动调用shmctl(id, IPC_RMID, NULL)
才能销毁。
shmctl()
总体就是一个switch语句,大多数做的是读取信息的或者设置标志位的工作,这里不赘述。