Linux内存管理之mmap原理

mmap原理

在这里插入图片描述
mmap完成的是将物理内存映射到用户态虚拟内存,中间不需要任何的内存拷贝。文件映射实质上就是在内核中创建文件的时候,给文件挂上一个mmap钩子,下面将讲解mmap系统调用是如何调用到文件的mmap钩子函数的。

首先是系统调用。由于mmap是对物理内存的映射,因此需要遵从MMU在不同CPU架构上的差异,这里选择了arm64架构的函数实现,系统调用函数入口在arch/arm64/kernel/sys.c中,不同架构的实现方式基本相同。对于不同的内核版本,有不同的实现方式:对于4.X内核,系统调用内部调用的是一个隐藏的系统调用函数sys_mmap_pgoff,实质上就是SYSCALL_DEFINE6(mmap_pgoff…),逻辑上没有差异;新版本内核也有该函数,调用的也是ksys_mmap_pgoff()。下面主要以5.12.1版本的内核进行讲解:

/*
 * mmap() system call entry point (arm64 flavour).
 *
 * @addr:  address hint from user space; forwarded unchanged. User space
 *         usually passes NULL and lets the kernel choose the address.
 * @len:   length of the mapping in bytes; forwarded unchanged (do_mmap()
 *         later rounds it up to whole pages).
 * @prot:  requested protection, a combination of PROT_EXEC, PROT_READ and
 *         PROT_WRITE, or PROT_NONE.
 * @flags: mapping flags such as MAP_SHARED, MAP_PRIVATE or MAP_HUGETLB;
 *         see the mmap(2) manual page for their meaning.
 * @fd:    file descriptor obtained from open().
 * @off:   byte offset into the file. It must be a multiple of the page
 *         size (0, 4096, 8192, ... with 4K pages); otherwise the call
 *         fails with -EINVAL.
 *
 * Return: the value produced by ksys_mmap_pgoff(), or -EINVAL for an
 *         unaligned @off. (The C library wrapper presumably turns errors
 *         into MAP_FAILED plus errno; that is not visible here.)
 */
SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, off)
{
	unsigned long pgoff;

	/* Only whole-page offsets can be expressed as a page offset. */
	if (offset_in_page(off))
		return -EINVAL;

	/* Convert the byte offset into a page index for the common helper. */
	pgoff = off >> PAGE_SHIFT;

	return ksys_mmap_pgoff(addr, len, prot, flags, fd, pgoff);
}

/*
 * Resolve the backing file for an mmap request and forward the request to
 * vm_mmap_pgoff().
 *
 * - Without MAP_ANONYMOUS the mapping is file backed: @fd is looked up, and
 *   for hugetlbfs files @len is rounded up to the huge page size.
 *   MAP_HUGETLB on a file that is not a hugetlbfs file yields -EINVAL.
 * - With MAP_ANONYMOUS | MAP_HUGETLB a temporary hugetlb file is created
 *   to back the mapping.
 * - Plain anonymous mappings carry no file at all.
 *
 * Any file reference taken here is dropped before returning.
 */
unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
			      unsigned long prot, unsigned long flags,
			      unsigned long fd, unsigned long pgoff)
{
	struct file *file = NULL;
	unsigned long retval;

	if (flags & MAP_ANONYMOUS) {
		if (flags & MAP_HUGETLB) {
			/* Anonymous huge page mapping. */
			struct user_struct *user = NULL;
			struct hstate *hs;

			hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
			if (!hs)
				return -EINVAL;

			len = ALIGN(len, huge_page_size(hs));
			/*
			 * VM_NORESERVE is used because the reservations will be
			 * taken when vm_ops->mmap() is called
			 * A dummy user value is used because we are not locking
			 * memory so no accounting is necessary
			 */
			file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
					VM_NORESERVE,
					&user, HUGETLB_ANONHUGE_INODE,
					(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
			if (IS_ERR(file))
				return PTR_ERR(file);
		}
	} else {
		/* File-backed mapping: take a reference on the descriptor. */
		audit_mmap_fd(fd, flags);
		file = fget(fd);
		if (!file)
			return -EBADF;

		if (is_file_hugepages(file)) {
			/* hugetlbfs file: map whole huge pages. */
			len = ALIGN(len, huge_page_size(hstate_file(file)));
		} else if (unlikely(flags & MAP_HUGETLB)) {
			/* MAP_HUGETLB is only valid on hugetlbfs files. */
			fput(file);
			return -EINVAL;
		}
	}

	/* These two flags are ignored. */
	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);

	if (file)
		fput(file);
	return retval;
}

unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm;
	unsigned long populate;
	LIST_HEAD(uf);

    /*
     * 检查映射文件的安全性,需要安全钩子函数支持mmap_file的检查,并通过
     * ima_file_mmap来完成进程文件测量的收集与存储。
     */
	ret = security_mmap_file(file, prot, flag);
	if (!ret) {
		if (mmap_write_lock_killable(mm))
			return -EINTR;
		//执行mmap过程
		ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
			      &uf);
		mmap_write_unlock(mm);
		userfaultfd_unmap_complete(mm, &uf);
		if (populate)
			mm_populate(ret, populate);
	}
	return ret;
}

/*
 * do_mmap - validate an mmap request and hand it over to mmap_region().
 *
 * Checks @len and @pgoff for overflow, selects the target address with
 * get_unmapped_area(), derives vm_flags from @prot and @flags, applies the
 * rules of the requested mapping type and finally calls mmap_region().
 *
 * *@populate starts at 0 and is set to the (page aligned) length when the
 * mapping succeeded and is either VM_LOCKED or was requested with
 * MAP_POPULATE but without MAP_NONBLOCK.
 *
 * Returns the mapped address, or a negative error code carried in the
 * unsigned long return value.
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, unsigned long pgoff,
			unsigned long *populate, struct list_head *uf)
{
	struct mm_struct *mm = current->mm;
	vm_flags_t vm_flags;
	int pkey = 0;

	*populate = 0;

	if (!len)
		return -EINVAL;

	/*
	 * Does the application expect PROT_READ to imply PROT_EXEC?
	 *
	 * (the exception is when the underlying filesystem is noexec
	 *  mounted, in which case we dont add PROT_EXEC.)
	 */
	if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
		if (!(file && path_noexec(&file->f_path)))
			prot |= PROT_EXEC;

	/* force arch specific MAP_FIXED handling in get_unmapped_area */
	if (flags & MAP_FIXED_NOREPLACE)
		flags |= MAP_FIXED;

	if (!(flags & MAP_FIXED))
		addr = round_hint_to_min(addr);

	/* Careful about overflows.. */
	/* Round up to whole pages; a result of 0 means the length wrapped. */
	len = PAGE_ALIGN(len);
	if (!len)
		return -ENOMEM;

	/* offset overflow? */
	/* Reject a page offset that wraps once the mapping length is added. */
	if ((pgoff + (len >> PAGE_SHIFT)) < pgoff)
		return -EOVERFLOW;

	/* Too many mappings? */
	/* mm->map_count is bounded by sysctl_max_map_count. */
	if (mm->map_count > sysctl_max_map_count)
		return -ENOMEM;

	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	/* Find an address range that is not mapped yet. */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);
	if (IS_ERR_VALUE(addr))
		return addr;

	/* MAP_FIXED_NOREPLACE: fail if an existing vma overlaps the range. */
	if (flags & MAP_FIXED_NOREPLACE) {
		struct vm_area_struct *vma = find_vma(mm, addr);

		if (vma && vma->vm_start < addr + len)
			return -EEXIST;
	}

	/* Exec-only request: ask for an execute-only pkey, fall back to 0. */
	if (prot == PROT_EXEC) {
		pkey = execute_only_pkey(mm);
		if (pkey < 0)
			pkey = 0;
	}

	/* Do simple checking here so the lower-level routines won't have
	 * to. we assume access permissions have been handled by the open
	 * of the memory object, so we don't do any here.
	 */
	vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;

	if (flags & MAP_LOCKED)
		if (!can_do_mlock())
			return -EPERM;

	if (mlock_future_check(mm, vm_flags, len))
		return -EAGAIN;

	if (file) {
		struct inode *inode = file_inode(file);
		unsigned long flags_mask;

		/* Check, based on the file and its inode, that this range can be mapped. */
		if (!file_mmap_ok(file, inode, pgoff, len))
			return -EOVERFLOW;

		flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;

		/* NOTE: the case bodies marked "..." are elided in this excerpt. */
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			/*
			 * Force use of MAP_SHARED_VALIDATE with non-legacy
			 * flags. E.g. MAP_SYNC is dangerous to use with
			 * MAP_SHARED as you don't know which consistency model
			 * you will get. We silently ignore unsupported flags
			 * with MAP_SHARED to preserve backward compatibility.
			 */
			flags &= LEGACY_MAP_MASK;
			fallthrough;
		case MAP_SHARED_VALIDATE:
			...
		case MAP_PRIVATE:
			...
			break;

		default:
			return -EINVAL;
		}
	} else {
		/*
		 * No struct file behind the mapping: only MAP_SHARED and
		 * MAP_PRIVATE are accepted.
		 */
		switch (flags & MAP_TYPE) {
		case MAP_SHARED:
			if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
				return -EINVAL;
			/*
			 * Ignore pgoff.
			 */
			pgoff = 0;
			vm_flags |= VM_SHARED | VM_MAYSHARE;
			break;
		case MAP_PRIVATE:
			/*
			 * Set pgoff according to addr for anon_vma.
			 */
			pgoff = addr >> PAGE_SHIFT;
			break;
		default:
			return -EINVAL;
		}
	}

	/*
	 * Set 'VM_NORESERVE' if we should not account for the
	 * memory use of this mapping.
	 */
	if (flags & MAP_NORESERVE) {
		/* We honor MAP_NORESERVE if allowed to overcommit */
		if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
			vm_flags |= VM_NORESERVE;

		/* hugetlb applies strict overcommit unless MAP_NORESERVE */
		if (file && is_file_hugepages(file))
			vm_flags |= VM_NORESERVE;
	}

	/* This call performs the actual mapping of the region. */
	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
	if (!IS_ERR_VALUE(addr) &&
	    ((vm_flags & VM_LOCKED) ||
	     (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE))
		*populate = len;
	return addr;
}

/*
 * mmap_region - create the user-space virtual memory area for a mapping.
 *
 * Either extends an existing vma through vma_merge() or allocates a new
 * struct vm_area_struct and fills in the start address, end address, flags,
 * page protection and page offset of the mapping. When a struct file is
 * present, the file's mmap hook is invoked through call_mmap(), which lets
 * the file implement its own mapping logic (for example mapping memory that
 * was allocated in the kernel into user space). Without a file, shared
 * mappings are set up by shmem_zero_setup() and private ones are marked
 * anonymous.
 *
 * Returns the mapped address on success or a negative error code.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
		struct list_head *uf)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev, *merge;
	int error;
	struct rb_node **rb_link, *rb_parent;
	unsigned long charged = 0;

	/* Check against address space limit. */
	/* NOTE: the body of this check is elided in this excerpt. */
	if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
		...
	}

	/* Clear old maps, set up prev, rb_link, rb_parent, and uf */
	if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf))
		return -ENOMEM;
	/*
	 * Private writable mapping: check memory availability
	 */
	/* NOTE: the accounting body is elided in this excerpt. */
	if (accountable_mapping(file, vm_flags)) {
		..
	}

	/*
	 * Can we just expand an old mapping? If vma_merge() succeeds no new
	 * vma has to be allocated.
	 */
	vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
			NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
	if (vma)
		goto out;

	/*
	 * Determine the object being mapped and call the appropriate
	 * specific mapper. the address has already been validated, but
	 * not unmapped, but the maps are removed from the list.
	 */
	vma = vm_area_alloc(mm);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}

	/* Describe the new region. */
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;

	if (file) {
		if (vm_flags & VM_DENYWRITE) {
			error = deny_write_access(file);
			if (error)
				goto free_vma;
		}
		if (vm_flags & VM_SHARED) {
			error = mapping_map_writable(file->f_mapping);
			if (error)
				goto allow_write_and_free_vma;
		}

		/* ->mmap() can change vma->vm_file, but must guarantee that
		 * vma_link() below can deny write-access if VM_DENYWRITE is set
		 * and map writably if VM_SHARED is set. This usually means the
		 * new file must not have been exposed to user-space, yet.
		 */
		/* Record the mapped file in the vma (via get_file()). */
		vma->vm_file = get_file(file);
		/*
		 * Invoke the mmap hook attached to the file: this enters the
		 * mmap member of the file's struct file_operations, which
		 * carries out the file-specific mapping. See the example
		 * driver later in this article for a concrete implementation.
		 */
		error = call_mmap(file, vma);
		if (error)
			goto unmap_and_free_vma;

		/* Can addr have changed??
		 *
		 * Answer: Yes, several device drivers can do it in their
		 *         f_op->mmap method. -DaveM
		 * Bug: If addr is changed, prev, rb_link, rb_parent should
		 *      be updated for vma_link()
		 */
		WARN_ON_ONCE(addr != vma->vm_start);

		addr = vma->vm_start;

		/* If vm_flags changed after call_mmap(), we should try merge vma again
		 * as we may succeed this time.
		 */
		if (unlikely(vm_flags != vma->vm_flags && prev)) {
			merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
				NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
			if (merge) {
				/* ->mmap() can change vma->vm_file and fput the original file. So
				 * fput the vma->vm_file here or we would add an extra fput for file
				 * and cause general protection fault ultimately.
				 */
				fput(vma->vm_file);
				vm_area_free(vma);
				vma = merge;
				/* Update vm_flags to pick up the change. */
				vm_flags = vma->vm_flags;
				goto unmap_writable;
			}
		}

		vm_flags = vma->vm_flags;
	} else if (vm_flags & VM_SHARED) {
		/* Shared anonymous mapping: back it with a shmem file. */
		error = shmem_zero_setup(vma);
		if (error)
			goto free_vma;
	} else {
		/* Private anonymous mapping. */
		vma_set_anonymous(vma);
	}

	/* Allow architectures to sanity-check the vm_flags */
	if (!arch_validate_flags(vma->vm_flags)) {
		error = -EINVAL;
		if (file)
			goto unmap_and_free_vma;
		else
			goto free_vma;
	}

	vma_link(mm, vma, prev, rb_link, rb_parent);
	/* Once vma denies write, undo our temporary denial count */
	if (file) {
unmap_writable:
		if (vm_flags & VM_SHARED)
			mapping_unmap_writable(file->f_mapping);
		if (vm_flags & VM_DENYWRITE)
			allow_write_access(file);
	}
	file = vma->vm_file;
out:
	perf_event_mmap(vma);

	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
	if (vm_flags & VM_LOCKED) {
		if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
					is_vm_hugetlb_page(vma) ||
					vma == get_gate_vma(current->mm))
			vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
		else
			mm->locked_vm += (len >> PAGE_SHIFT);
	}

	if (file)
		uprobe_mmap(vma);

	/*
	 * New (or expanded) vma always get soft dirty status.
	 * Otherwise user-space soft-dirty page tracker won't
	 * be able to distinguish situation when vma area unmapped,
	 * then new mapped in-place (which must be aimed as
	 * a completely new data area).
	 */
	vma->vm_flags |= VM_SOFTDIRTY;

	vma_set_page_prot(vma);

	return addr;

	/* Error unwinding: each label undoes one more step of the setup above. */
unmap_and_free_vma:
	fput(vma->vm_file);
	vma->vm_file = NULL;

	/* Undo any partial mapping done by a device driver. */
	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
	charged = 0;
	if (vm_flags & VM_SHARED)
		mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
	if (vm_flags & VM_DENYWRITE)
		allow_write_access(file);
free_vma:
	vm_area_free(vma);
unacct_error:
	if (charged)
		vm_unacct_memory(charged);
	return error;
}

mmap映射类型

上面先讲解了mmap的内核实现原理及其过程。由此可以看出,mmap映射的类型取决于是否与文件关联:当file参数为空时,表示不关联文件;当file存在时,表示关联了文件。因此可以将Linux内核中的映射分为匿名映射(不关联文件的映射)和文件映射(需要内核文件的struct file_operations挂上mmap钩子,实现自定义逻辑),详述如下:

匿名映射

私有匿名映射

私有匿名映射通常用于内存分配。调用mmap时,若传入的fd为-1,且flags为MAP_ANONYMOUS | MAP_PRIVATE,创建的映射就是私有匿名映射。私有匿名映射最常见的用途是glibc分配大块内存:通常情况下,malloc先在已有的空闲内存中查找可用的部分(该部分不一定是连续的),不够时再通过brk扩展堆来分配剩余的部分;当分配的内存大于128KB(即默认的M_MMAP_THRESHOLD)时,glibc会使用mmap代替默认的brk来分配内存。需要注意的是,小于128KB时,在第一次读写之前,用户态分配的内存只有虚拟内存,还不存在物理内存,第一次读写之后,才会通过伙伴系统分配对应的物理内存;大于128KB时,不再使用brk而是通过mmap分配内存,此时同样是在第一次访问时才分配物理页面,除非指定了MAP_POPULATE或映射被锁定(参见上文do_mmap()中对populate的设置)。

共享匿名映射

共享匿名映射通常用于进程间共享内存。调用mmap时,若传入的fd为-1,且flags为MAP_ANONYMOUS | MAP_SHARED,创建的映射就是共享匿名映射。共享匿名映射能够让相关进程共享一块内存,通常用于父子进程间通信,创建共享匿名映射的方式有以下两种:
(1)若没有通过open打开/dev/zero设备,而是直接以共享匿名映射的方式调用mmap,则在do_mmap()->mmap_region()中,会通过shmem_zero_setup()函数来完成映射,这里实质上还是借助一个名为"dev/zero"的shmem内部文件来完成最终的映射,该方式映射出来的是一块可读写的内存区域。shmem_zero_setup函数实现如下:

/*
 * File operations installed on shmem files (see alloc_file_pseudo() in
 * __shmem_file_setup()).
 *
 * According to the original author's note, shmem_mmap handles the page
 * mapping logic and also attaches the global struct vm_operations_struct
 * variable shmem_vm_ops to the vm_ops member of the struct vm_area_struct
 * (shmem_mmap itself is not shown in this excerpt). shmem_zero_setup()
 * installs the same shmem_vm_ops on the vma; the author notes that the
 * hooks below see little use when the mapping was not created by mapping
 * the /dev/zero file directly.
 */
static const struct file_operations shmem_file_operations = {
	.mmap		= shmem_mmap,
	.get_unmapped_area = shmem_get_unmapped_area,
#ifdef CONFIG_TMPFS
	/* Regular file I/O hooks are only provided when tmpfs is configured. */
	.llseek		= shmem_file_llseek,
	.read_iter	= shmem_file_read_iter,
	.write_iter	= generic_file_write_iter,
	.fsync		= noop_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.fallocate	= shmem_fallocate,
#endif
};

/*
 * Create an unlinked shmem file of @size bytes on mount @mnt.
 *
 * The size is validated and accounted first, then an inode is obtained
 * from the mount's superblock and a read/write struct file using
 * shmem_file_operations is built on top of it.
 *
 * Returns the new file or an ERR_PTR() value; on failure the inode
 * reference is dropped again.
 */
static struct file *__shmem_file_setup(struct vfsmount *mnt, const char *name, loff_t size,
				       unsigned long flags, unsigned int i_flags)
{
	struct inode *inode;
	struct file *filp;

	if (IS_ERR(mnt))
		return ERR_CAST(mnt);

	if (size < 0 || size > MAX_LFS_FILESIZE)
		return ERR_PTR(-EINVAL);

	if (shmem_acct_size(flags, size))
		return ERR_PTR(-ENOMEM);

	/* Obtain an inode on the mount's superblock. */
	inode = shmem_get_inode(mnt->mnt_sb, NULL, S_IFREG | S_IRWXUGO, 0,
				flags);
	if (unlikely(!inode)) {
		shmem_unacct_size(flags, size);
		return ERR_PTR(-ENOSPC);
	}

	inode->i_flags |= i_flags;
	inode->i_size = size;
	clear_nlink(inode);	/* It is unlinked */

	filp = ERR_PTR(ramfs_nommu_expand_for_mapping(inode, size));
	if (IS_ERR(filp))
		goto drop_inode;

	/*
	 * Build an O_RDWR file on the inode whose hooks are
	 * shmem_file_operations.
	 */
	filp = alloc_file_pseudo(inode, mnt, name, O_RDWR,
				 &shmem_file_operations);
	if (!IS_ERR(filp))
		return filp;

drop_inode:
	iput(inode);
	return filp;
}

/*
 * Create a kernel-internal shmem file flagged S_PRIVATE.
 *
 * shm_mnt is a global struct vfsmount that records the mount information
 * of the shmem filesystem. struct vfsmount is declared in
 * include/linux/mount.h and, per the original author's note, holds the
 * mount's root, its superblock and its user namespace.
 */
struct file *shmem_kernel_file_setup(const char *name, loff_t size, unsigned long flags)
{
	struct file *filp;

	filp = __shmem_file_setup(shm_mnt, name, size, flags, S_PRIVATE);
	return filp;
}

/*
 * Back a shared anonymous vma with a freshly created shmem file named
 * "dev/zero" that is as large as the vma.
 *
 * Any file previously attached to the vma is released, the new file and
 * shmem_vm_ops are installed, and khugepaged is told about the vma when it
 * spans at least one aligned huge page.
 *
 * Returns 0 on success or the error from shmem_kernel_file_setup().
 */
int shmem_zero_setup(struct vm_area_struct *vma)
{
	loff_t span = vma->vm_end - vma->vm_start;
	struct file *zero_file;

	/*
	 * Cloning a new file under mmap_lock leads to a lock ordering conflict
	 * between XFS directory reading and selinux: since this file is only
	 * accessible to the user through its mapping, use S_PRIVATE flag to
	 * bypass file security, in the same way as shmem_kernel_file_setup().
	 */
	zero_file = shmem_kernel_file_setup("dev/zero", span, vma->vm_flags);
	if (IS_ERR(zero_file))
		return PTR_ERR(zero_file);

	/* Replace whatever file the vma referenced before. */
	if (vma->vm_file)
		fput(vma->vm_file);
	vma->vm_file = zero_file;
	vma->vm_ops = &shmem_vm_ops;

	/* Register with khugepaged only if an aligned huge page fits inside. */
	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
	    ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
	    (vma->vm_end & HPAGE_PMD_MASK))
		khugepaged_enter(vma, vma->vm_flags);

	return 0;
}

文件映射

文件映射实质上就是内核为某个文件实现了struct file_operations钩子函数集,并且在其中挂上了mmap钩子。

私有文件映射

共享文件映射

mmap文件映射用例

内核态代码

#include <cgel.h>
#include <linux/init_task.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/string.h>

#include <asm/uaccess.h>

/* Number of task_info entries requested through the write handler. */
static int task_num = 0;
/* task_num as of the previous write; used to detect a change in size. */
static int old_task_num = 0;
/* Table of task_info records, allocated by the write handler. */
static struct task_info *g_taskinfo;

/*
 * get_bits - number of digits needed to write @num in radix @base.
 * @num:  value to measure; a negative value is measured by its magnitude
 *        (the sign is not counted).
 * @base: radix, must be at least 2.
 *
 * The original compared the signed value against the unsigned radix, so a
 * negative @num was treated as a huge unsigned number; a radix of 0
 * divided by zero and a radix of 1 never terminated.
 *
 * Return: the digit count (at least 1), or 0 when @base is smaller than 2.
 */
static int get_bits(int num, unsigned int base)
{
	unsigned int magnitude;
	int digits = 1;

	if (base < 2)
		return 0;

	/* Negate in unsigned arithmetic so that INT_MIN cannot overflow. */
	magnitude = num < 0 ? 0u - (unsigned int)num : (unsigned int)num;

	while (magnitude >= base) {
		magnitude /= base;
		digits++;
	}

	return digits;
}

/* Number of decimal digits in the current task_num. */
#define NUM_TASKINFO get_bits(task_num, 10)

static int taskinfo_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long taskinfo_pages;
	unsigned long size;
	ssize_t mmap_size;

	if (!g_taskinfo)
		return -EINVAL;
	mmap_size = task_num * sizeof(struct task_info);

	taskinfo_pages = virt_to_phys(g_taskinfo);
	size = (unsigned long)(vma->vm_end - vma->vm_start);

	if ((size >> PAGE_SHIFT) > mmap_size) {
		pr_err("invalid mem range, size %ld, task size :%ld\n",
				size, mmap_size);
		return -ERANGE;
	}

	return remap_pfn_range(vma,
			vma->vm_start,
			taskinfo_pages >> PAGE_SHIFT,
			size,
			vma->vm_page_prot);
}

/*
 * get_taskinfo - read hook: snapshot the process list into the table.
 * @fs:  file being read (unused).
 * @buf: user buffer that receives the number of copied entries as a
 *       decimal string (without a terminating NUL).
 * @len: size of @buf; at most this many bytes are written.
 * @lf:  file position (unused, every read produces a fresh snapshot).
 *
 * Copies the task_info of up to task_num processes into g_taskinfo and
 * reports how many entries were filled.
 *
 * Fixes over the original:
 *  - the destination was computed as g_taskinfo + count * sizeof(...),
 *    which scales the index twice and writes outside the table;
 *  - the table pointer and task_num are validated before use;
 *  - the process list is walked under rcu_read_lock();
 *  - the digit string lives in a fixed buffer instead of a VLA and never
 *    exceeds the length the caller asked for;
 *  - copy_to_user() returns the number of bytes NOT copied, so any
 *    non-zero result is reported as -EFAULT.
 *
 * Return: number of bytes written to @buf, or a negative error code.
 */
static ssize_t get_taskinfo(struct file *fs,
		char __user *buf, size_t len, loff_t *lf)
{
	char statistic[16];	/* large enough for any 32-bit decimal */
	struct task_struct *task;
	size_t digits;
	int count = 0;

	if (!g_taskinfo || task_num <= 0)
		return -EINVAL;

	/* Sanity check kept from the original: init_task must not look like a user address. */
	if (unlikely(access_ok(VERIFY_READ, &init_task,
					sizeof(init_task)))) {
		pr_err("init_task is in user space\n");
		return -ERANGE;
	}

	rcu_read_lock();
	for_each_process(task) {
		if (count >= task_num)
			break;

		if (unlikely(access_ok(VERIFY_READ, task,
						sizeof(*task)))) {
			pr_err("task is in the user-mem space\n");
			continue;
		}

		/* Index the table by element, not by byte offset. */
		memcpy(&g_taskinfo[count], &task->taskinfo,
				sizeof(struct task_info));
		count++;
	}
	rcu_read_unlock();

	digits = get_bits(count, 10);
	snprintf(statistic, sizeof(statistic), "%d", count);

	/* Never write more than the caller's buffer can hold. */
	if (len > digits)
		len = digits;

	if (copy_to_user(buf, statistic, len)) {
		pr_err("Failed to copy to user\n");
		return -EFAULT;
	}

	return len;
}

static ssize_t set_taskinfo(struct file *fs,
		const char __user *buf, size_t len, loff_t *lf)
{
	char num[len];
	char *end;
	int ret;

	if (!buf) {
		return -EINVAL;
	}
	memset(num, 0, len);

	if((ret = copy_from_user(num, buf, len)) < 0) {
		return ret;
	}
	task_num = simple_strtoll(num, &end, 10);

	if (old_task_num != task_num) {
		g_taskinfo = kzalloc(task_num * sizeof(struct task_info),
				GFP_USER);
		if (!g_taskinfo)
			return -ENOMEM;
	}

	old_task_num = task_num;

	return len;
}

static int taskinfo_close(struct inode *node, struct file *fs)
{
	if (g_taskinfo)
		kfree(g_taskinfo);
	return 0;
}

/*
 * File operations of the taskinfo example file: write sets the number of
 * entries, read fills the table and reports the count, mmap exposes the
 * table to user space and release frees it.
 */
static struct file_operations opts = {
	.owner = THIS_MODULE,
	.open = simple_open,
	.read = get_taskinfo,
	.write = set_taskinfo,
	.mmap = taskinfo_mmap,
	.release = taskinfo_close,
};

用户态代码

编译Makefile

# Build the user-space taskinfo test program.
# Set CROSS_COMPILE (for example aarch64-linux-gnu-) to cross compile.
CROSS_COMPILE ?=
ARCH ?=
CC = ${CROSS_COMPILE}gcc

CFLAGS += -g
SRC += taskinfo.c
OUT = taskinfo

# all and clean are commands, not files.
.PHONY: all clean

all: $(OUT)

# Relink only when a source file changed.
$(OUT): $(SRC)
	$(CC) $(CFLAGS) $(SRC) -o $(OUT)

# Remove the built program as well as any object files.
clean:
	rm -fr *.o $(OUT)

源码实现如下

#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>

struct taskinfo {
	int pid;
	int tgid;
};

/*
 * get_bits - number of digits needed to write num in radix base.
 *
 * num:  value to measure; a negative value is measured by its magnitude
 *       (the sign is not counted).
 * base: radix, must be at least 2.
 *
 * The original narrowed num to int and compared it against the unsigned
 * radix, so wide or negative values were miscounted; a radix of 0 divided
 * by zero and a radix of 1 never terminated.
 *
 * Returns the digit count (at least 1), or 0 when base is smaller than 2.
 */
static int get_bits(ssize_t num, unsigned int base)
{
	unsigned long long magnitude;
	int digits = 1;

	if (base < 2)
		return 0;

	/* Negate in unsigned arithmetic so the minimum value cannot overflow. */
	magnitude = num < 0 ? 0ULL - (unsigned long long)num
			    : (unsigned long long)num;

	while (magnitude >= base) {
		magnitude /= base;
		digits++;
	}

	return digits;
}

/*
 * Usage: taskinfo <device file> <task count>
 *
 * Writes the requested task count to the device, reads back how many
 * entries the kernel filled in, maps the table and prints one line per
 * entry.
 *
 * Fixes over the original:
 *  - the mmap() result is checked on `task` (the original tested an
 *    uninitialised `addr`);
 *  - the count string is no longer truncated by one digit, and the
 *    terminating NUL is sent with it;
 *  - the reply from read() is NUL terminated before it is parsed;
 *  - entries are indexed as task[i] (the original advanced the pointer by
 *    count * sizeof(...) elements and then passed the moved pointer to
 *    munmap());
 *  - malloc() is checked, buffers and the descriptor are released on every
 *    path, and the task count argument is validated.
 *
 * Exit status: 0 on success, -1 for bad arguments or a failed write/read,
 * -2 when the device cannot be opened, -3 when mmap() fails.
 */
int main (int argc, char ** argv)
{
	enum { MAX_TASKS = 1 << 20 };	/* sanity bound for the argument */
	struct taskinfo *task;
	const char *file;
	char read_buf[32];
	char *buf;
	char *end;
	long requested;
	size_t buf_len;
	size_t map_len;
	ssize_t ret;
	int task_num;
	int shown;
	int num;
	int fd;
	int i;
	int rc = -1;

	if (argc <= 2) {
		fprintf(stderr, "usage: %s <device> <task count>\n",
				argc > 0 ? argv[0] : "taskinfo");
		return -1;
	}
	file = argv[1];

	/* strtol reports garbage and overflow (LONG_MAX exceeds MAX_TASKS). */
	requested = strtol(argv[2], &end, 10);
	if (end == argv[2] || *end != '\0' ||
	    requested <= 0 || requested > MAX_TASKS) {
		fprintf(stderr, "invalid task count '%s'\n", argv[2]);
		return -1;
	}
	num = (int)requested;

	fd = open(file, O_RDWR);
	if (fd < 0) {
		printf("can't open %s\n", file);
		return -2;
	}

	/* All digits plus the terminating NUL. */
	buf_len = (size_t)get_bits(num, 10) + 1;
	buf = malloc(buf_len);
	if (!buf) {
		perror("malloc failed");
		goto out_close;
	}
	snprintf(buf, buf_len, "%d", num);

	ret = write(fd, buf, buf_len);
	free(buf);
	if (ret < 0) {
		perror("write failed");
		goto out_close;
	}
	sleep(1);

	ret = read(fd, read_buf, sizeof(read_buf) - 1);
	if (ret < 0) {
		perror("read failed");
		goto out_close;
	}
	read_buf[ret] = '\0';
	task_num = atoi(read_buf);

	map_len = (size_t)num * sizeof(*task);
	task = mmap(NULL, map_len, PROT_READ, MAP_SHARED, fd, 0);
	if (task == MAP_FAILED) {
		perror("mmap failed");
		rc = -3;
		goto out_close;
	}

	/* Print only the entries the kernel reported as filled in. */
	shown = task_num < num ? task_num : num;
	printf("pid\t\ttgid\n");
	for (i = 0; i < shown; i++)
		printf("%d\t\t%d\n", task[i].pid, task[i].tgid);

	sleep(1);
	printf("start munmap\n");
	munmap(task, map_len);

	sleep(1);
	printf("start close\n");
	close(fd);
	sleep(1);
	return 0;

out_close:
	close(fd);
	return rc;
}

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
线程概念 什么是线程 LWP:light weight process 轻量级的进程,本质仍是进程(在Linux环境下) 进程:独立地址空间,拥有PCB 线程:也有PCB,但没有独立的地址空间(共享) 区别:在于是否共享地址空间。 独居(进程);合租(线程)。 Linux下: 线程:最小的执行单位 进程:最小分配资源单位,可看成是只有一个线程的进程。 Linux内核线程实现原理 类Unix系统,早期是没有“线程”概念的,80年代才引入,借助进程机制实现出了线程的概念。因此在这类系统,进程和线程关系密切。 1. 轻量级进程(light-weight process),也有PCB,创建线程使用的底层函数和进程一样,都是clone 2. 从内核里看进程和线程是一样的,都有各自不同的PCB,但是PCB指向内存资源的三级页表是相同的 3. 进程可以蜕变成线程 4. 线程可看做寄存器和栈的集合 5. 在linux下,线程最是小的执行单位;进程是最小的分配资源单位 察看LWP号:ps –Lf pid 查看指定线程的lwp号。 三级映射:进程PCB --> 页目录(可看成数组,首地址位于PCB) --> 页表 --> 物理页面 --> 内存单元 参考:《Linux内核源代码情景分析》 ----毛德操 对于进程来说,相同的地址(同一个虚拟地址)在不同的进程,反复使用而不冲突。原因是他们虽虚拟址一样,但,页目录、页表、物理页面各不相同。相同的虚拟址,映射到不同的物理页面内存单元,最终访问不同的物理页面。 但!线程不同!两个线程具有各自独立的PCB,但共享同一个页目录,也就共享同一个页表和物理页面。所以两个PCB共享一个地址空间。 实际上,无论是创建进程的fork,还是创建线程的pthread_create,底层实现都是调用同一个内核函数clone。 如果复制对方的地址空间,那么就产出一个“进程”;如果共享对方的地址空间,就产生一个“线程”。 因此:Linux内核是不区分进程和线程的。只在用户层面上进行区分。所以,线程所有操作函数 pthread_* 是库函数,而非系统调用。 线程共享资源 1.文件描述符表 2.每种信号的处理方式 3.当前工作目录 4.用户ID和组ID 5.内存地址空间 (.text/.data/.bss/heap/共享库) 线程非共享资源 1.线程id 2.处理器现场和栈指针(内核栈) 3.独立的栈空间(用户空间栈) 4.errno变量 5.信号屏蔽字 6.调度优先级 线程优、缺点 优点: 1. 提高程序并发性 2. 开销小 3. 数据通信、共享数据方便 缺点: 1. 库函数,不稳定 2. 调试、编写困难、gdb不支持 3. 对信号支持不好 优点相对突出,缺点均不是硬伤。Linux下由于实现方法导致进程、线程差别不是很大。 线程控制原语 pthread_self函数 获取线程ID。其作用对应进程 getpid() 函数。 pthread_t pthread_self(void); 返回值:成功:0; 失败:无! 线程ID:pthread_t类型,本质:在Linux下为无符号整数(%lu),其他系统可能是结构体实现 线程ID是进程内部,识别标志。(两个进程间,线程ID允许相同) 注意:不应使用全局变量 pthread_t tid,在子线程通过pthread_create传出参数来获取线程ID,而应使用pthread_self。 pthread_create函数 创建一个新线程。 其作用,对应进程fork() 函数。 int pthread_create(pthread_t *thread, const pthread_attr_t *attr, void *(*start_routine) (void *), void *arg); 返回值:成功:0; 失败:错误号 -----Linux环境下,所有线程特点,失败均直接返回错误号。 参数: pthread_t:当前Linux可理解为:typedef unsigned long int pthread_t; 参数1:传出参数,保存系统为我们分配好的线程ID 参数2:通常传NULL,表示使用线程默认属性。若想使用具体属性也可以修改该参数。 参数3:函数指针,指向线程主函数(线程体),该函数运行结束,则线程结束。 参数4:线程主函数执行期间所使用的参数。 在一个线程调用pthread_create()创建新的线程后,当前线程从pthread_create()返回继续往下执行,而新的线程所执行的代码由我们传给pthread_create的函数指针start_routine决定。star

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值