PMFS File System: The File Read and Write Paths in Detail

I. The File Read Process

1. Parameters of the read call

ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) reads len bytes starting at position *ppos of the given file filp into the buffer buf, and returns the number of bytes read.

2. Overall flow

From ppos, compute the starting page number of the read, index = pos >> PAGE_CACHE_SHIFT; from the i_size recorded in the host inode of the file's address space, compute the file's last page number, end_index. Loop from page index through page end_index until len bytes have been read. For each page, derive the page's virtual address from its page number, then perform the read using the in-page offset (the read position) and the number of bytes to read.

The core problem is finding the virtual address backing each page; only then can the data actually be read.
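
To make the index arithmetic concrete, here is a minimal standalone sketch (assuming 4 KB pages, i.e. PAGE_CACHE_SHIFT = 12; the macros are redefined locally to mirror the kernel's):

#include <stdio.h>

/* Local stand-ins for the kernel macros, assuming 4 KB pages. */
#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE  (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK  (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
	unsigned long pos = 10000, isize = 20000;

	unsigned long index = pos >> PAGE_CACHE_SHIFT;             /* starting page: 2 */
	unsigned long offset = pos & ~PAGE_CACHE_MASK;             /* in-page offset: 1808 */
	unsigned long end_index = (isize - 1) >> PAGE_CACHE_SHIFT; /* last page of the file: 4 */

	printf("index=%lu offset=%lu end_index=%lu\n", index, offset, end_index);
	return 0;
}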

3. Looking up a data block: return its virtual address if found, otherwise fall through to block allocation

(1) First, use the inode number recorded in the host of the file's address space, file->mapping->host->i_ino, to locate the file's on-PM inode. My understanding is that this i_ino is the byte offset of the pmfs_inode within the b512 tree maintained by the inode_table (a 512-ary B-tree: each 4 KB meta block holds 512 8-byte pointers).

Starting from the superblock, find the inode_table; the inode_table anchors a b512 tree, and from i_ino we obtain bp, the inode's position relative to the starting virtual address (a byte offset).

(2) With the returned inode, look up the data block via __pmfs_find_data_block. Each inode maintains its own b512 tree; when resolving a block number to a virtual address, the externally visible block number must first be converted into a block number in the inode's own block size, i.e.

blk_shift = data_bits - sb->s_blocksize_bits;
blk_offset = file_blocknr & ((1 << blk_shift) - 1);
blocknr = file_blocknr >> blk_shift;

What remains is a lookup on the b512 tree, which returns the block's logical address.
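
A quick worked example of this conversion (a standalone sketch; data_bits = 21 and s_blocksize_bits = 12 are the values when the inode uses 2 MB blocks on a 4 KB-block superblock, consistent with the author's note further below; adjust for other configurations):

#include <stdio.h>

int main(void)
{
	unsigned int data_bits = 21;         /* inode block size: 2 MB */
	unsigned int sb_blocksize_bits = 12; /* sb->s_blocksize_bits: 4 KB */
	unsigned long file_blocknr = 1000;   /* 4 KB block #1000 of the file */

	unsigned int blk_shift = data_bits - sb_blocksize_bits;             /* 9 */
	unsigned long blk_offset = file_blocknr & ((1UL << blk_shift) - 1); /* 488 */
	unsigned long blocknr = file_blocknr >> blk_shift;                  /* 1 */

	/* 4 KB block #1000 lives in the inode's 2 MB block #1, at byte
	 * offset blk_offset << 12 = 488 * 4096 within it. */
	printf("blocknr=%lu blk_offset=%lu byte_offset=%lu\n",
	       blocknr, blk_offset, blk_offset << sb_blocksize_bits);
	return 0;
}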

4. Allocating data blocks

First locate the corresponding inode; based on the number of data pages to allocate, increase the height of the b512 tree as needed and allocate recursively.

Blocks numbered first_blocknr through last_blocknr are to be allocated.

The idea behind the allocation is as follows:

Recursion base: the tree height is 1; allocate directly. Free block ranges are tracked by a doubly linked circular list.

Split: distribute blocks first_blocknr through last_blocknr over the indices of the current tree node; each index covers a different sub-range of blocks:

node_bits = (height - 1) * meta_bits;
first_index = first_blocknr >> node_bits;
last_index = last_blocknr >> node_bits;

Recurse: with the tree height reduced by 1, recursively allocate the corresponding sub-range of blocks under each index, as shown in the sketch below.
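
The split itself can be demonstrated with a small standalone sketch (assuming META_BLK_SHIFT = 9, i.e. 512 pointers per 4 KB meta block; it only prints the sub-range each child index receives, without allocating anything):

#include <stdio.h>

#define META_BLK_SHIFT 9 /* 512 pointers per 4 KB meta block */

/* Show how [first, last] is distributed over the children of one node. */
static void split_one_level(unsigned int height, unsigned long first,
	unsigned long last)
{
	unsigned int node_bits = (height - 1) * META_BLK_SHIFT;
	unsigned long first_index = first >> node_bits;
	unsigned long last_index = last >> node_bits;
	unsigned long i;

	for (i = first_index; i <= last_index; i++) {
		unsigned long lo = (i == first_index) ?
			(first & ((1UL << node_bits) - 1)) : 0;
		unsigned long hi = (i == last_index) ?
			(last & ((1UL << node_bits) - 1)) :
			(1UL << node_bits) - 1;
		/* recursive_alloc_blocks would now recurse into child i
		 * with the range [lo, hi] at height - 1 */
		printf("index %lu covers blocks [%lu, %lu]\n", i, lo, hi);
	}
}

int main(void)
{
	/* height 2: child 0 gets 500..511, child 1 gets 0..511, child 2 gets 0..6 */
	split_one_level(2, 500, 1030);
	return 0;
}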

II. The Main Functions

1. do_xip_mapping_read

mapping manages a file's mapping onto memory pages; every file has one, tying the file's data in the file system to the memory backing that file.

Function: read len bytes of file filp starting at position ppos; returns the number of bytes actually read. The code is annotated below.

static ssize_t do_xip_mapping_read(struct address_space *mapping,
	struct file_ra_state *_ra,
	struct file *filp,
	char __user *buf,
	size_t len,
	loff_t *ppos)
{
	struct inode *inode = mapping->host;//in-memory inode describing the file
	pgoff_t index, end_index;//first and last page numbers of the read
	unsigned long offset;//offset of the read within page index
	loff_t isize, pos;
	size_t copied = 0, error = 0;
	timing_t memcpy_time;

	pos = *ppos;
	index = pos >> PAGE_CACHE_SHIFT;//pos divided by 4 KB: the starting page number, counted from 0
	offset = pos & ~PAGE_CACHE_MASK;//pos modulo 4 KB: the offset within that page, counted from 0

	isize = i_size_read(inode);//size of filp's data
	if (!isize)
		goto out;

	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;//last page number of filp; note isize is decremented before shifting
	do {
		unsigned long nr, left;
		void *xip_mem;
		unsigned long xip_pfn;
		int zero = 0;

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;//assume a full 4 KB can be read from this page
		if (index >= end_index) {
			if (index > end_index)
				goto out;
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				goto out;
			}
		}
		nr = nr - offset;
		if (nr > len - copied)
			nr = len - copied;
        //find the virtual address and physical frame of page index
		error = pmfs_get_xip_mem(mapping, index, 0,&xip_mem, &xip_pfn);//get the page's virtual address xip_mem
		if (unlikely(error)) {
			if (error == -ENODATA) {
				/* sparse */
				zero = 1;
			}
			else
				goto out;
		}

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			/* address based flush */;

		/*
		 * Ok, we have the mem, so now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		PMFS_START_TIMING(memcpy_r_t, memcpy_time);
		if (!zero)
			left = __copy_to_user(buf + copied, xip_mem + offset, nr);//copy the data out to buf
		else
			left = __clear_user(buf + copied, nr);
		PMFS_END_TIMING(memcpy_r_t, memcpy_time);

		if (left) {
			error = -EFAULT;
			goto out;
		}
        //advance the counters; any page-boundary crossing rolls offset over into index
		copied += (nr - left);
		offset += (nr - left);
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
	} while (copied < len);

out:
	*ppos = pos + copied;
	if (filp)
		file_accessed(filp);//update the file's access time

	return (copied ? copied : error);
}

2. pmfs_get_xip_mem

Obtain the virtual address and physical frame number backing a page.

int pmfs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
	void **kmem, unsigned long *pfn)
{
	int rc;
	sector_t block = 0;
	struct inode *inode = mapping->host;

	rc = __pmfs_get_block(inode, pgoff, create, &block);//get the logical address block of page pgoff; rc carries any error
	if (rc) {
		pmfs_dbg1("[%s:%d] rc(%d), sb->physaddr(0x%llx), block(0x%llx),"
			" pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__,
			__LINE__, rc, PMFS_SB(inode->i_sb)->phys_addr,
			block, pgoff, create, *pfn);
		return rc;
	}

	*kmem = pmfs_get_block(inode->i_sb, block);//virtual address backing pgoff
	*pfn = pmfs_get_pfn(inode->i_sb, block);//physical frame number backing pgoff

	pmfs_dbg_mmapvv("[%s:%d] sb->physaddr(0x%llx), block(0x%lx),"
		" pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__, __LINE__,
		PMFS_SB(inode->i_sb)->phys_addr, block, pgoff, create, *pfn);
	return 0;
}
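
Note that block here is simply a byte offset from the start of the persistent-memory region (the "logical address" in this write-up): pmfs_get_block adds it to the mapped virtual base address, and pmfs_get_pfn derives the physical frame number from the physical base in the same way.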

3. pmfs_find_and_alloc_blocks

Look up a data block, allocating it if necessary: if the lookup fails, allocate the block and then look it up once more.

static int pmfs_find_and_alloc_blocks(struct inode *inode, sector_t iblock,
	sector_t *data_block, int create)
{
	int err = -EIO;
	u64 block;
	pmfs_transaction_t *trans;
	struct pmfs_inode *pi;

	block = pmfs_find_data_block(inode, iblock);//find the logical address of block iblock

    //the block does not exist yet
	if (!block) {
		struct super_block *sb = inode->i_sb;
		if (!create) {//creation not allowed: return an error
			err = -ENODATA;
			goto err;
		}

		pi = pmfs_get_inode(sb, inode->i_ino);//find the pmfs_inode for this inode
		trans = pmfs_current_transaction();
		if (trans) {
			err = pmfs_alloc_blocks(trans, inode, iblock, 1, true);//allocate the block
			if (err) {
				pmfs_dbg_verbose("[%s:%d] Alloc failed!\n",
					__func__, __LINE__);
				goto err;
			}
		}
		else {
			/* 1 lentry for inode, 1 lentry for inode's b-tree */
			trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto err;
			}

			rcu_read_unlock();
			mutex_lock(&inode->i_mutex);

			pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY,
				LE_DATA);
			err = pmfs_alloc_blocks(trans, inode, iblock, 1, true);

			pmfs_commit_transaction(sb, trans);

			mutex_unlock(&inode->i_mutex);
			rcu_read_lock();
			if (err) {
				pmfs_dbg_verbose("[%s:%d] Alloc failed!\n",
					__func__, __LINE__);
				goto err;
			}
		}
		block = pmfs_find_data_block(inode, iblock);//after allocating, look it up again
		if (!block) {
			pmfs_dbg("[%s:%d] But alloc didn't fail!\n",
				__func__, __LINE__);
			err = -ENODATA;
			goto err;
		}
	}
	pmfs_dbg_mmapvv("iblock 0x%lx allocated_block 0x%llx\n", iblock,
		block);

	*data_block = block;
	err = 0;

err:
	return err;
}

4. pmfs_find_data_block

Look up a data block. i_ino locates the inode in the b512 tree anchored at the inode_table, and the block is then found in the inode's own b512 tree. The return value is the block's byte offset from the starting virtual address, i.e. its logical address. Note the conversion from 4 KB blocks to the block size PMFS actually uses for this inode.

u64 pmfs_find_data_block(struct inode *inode, unsigned long file_blocknr)
{
	struct super_block *sb = inode->i_sb;
	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);//locate the pmfs_inode
	u32 blk_shift;
	unsigned long blk_offset, blocknr = file_blocknr;
	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];//presumably 21 here (2 MB blocks)
	unsigned int meta_bits = META_BLK_SHIFT;
	u64 bp;

	/* convert the 4K blocks into the actual blocks the inode is using */
	blk_shift = data_bits - sb->s_blocksize_bits;
	blk_offset = file_blocknr & ((1 << blk_shift) - 1);//offset of file_blocknr within the PMFS block
	blocknr = file_blocknr >> blk_shift;//block number of file_blocknr in PMFS block units

	if (blocknr >= (1UL << (pi->height * meta_bits)))
		return 0;

	bp = __pmfs_find_data_block(sb, pi, blocknr);//look the block up in the b512 tree
	pmfs_dbg1("find_data_block %lx, %x %llx blk_p %p blk_shift %x"
		" blk_offset %lx\n", file_blocknr, pi->height, bp,
		pmfs_get_block(sb, bp), blk_shift, blk_offset);

	if (bp == 0)
		return 0;
	return bp + (blk_offset << sb->s_blocksize_bits);
}

5. __pmfs_find_data_block

Search the b512 tree of a pmfs_inode; the return value is the block's byte offset from the starting virtual address.

static inline u64 __pmfs_find_data_block(struct super_block *sb,
		struct pmfs_inode *pi, unsigned long blocknr)
{
	__le64 *level_ptr;
	u64 bp = 0;
	u32 height, bit_shift;
	unsigned int idx;

	height = pi->height;
	bp = le64_to_cpu(pi->root);

	while (height > 0) {
		level_ptr = pmfs_get_block(sb, bp);
		bit_shift = (height - 1) * META_BLK_SHIFT;
		idx = blocknr >> bit_shift;
		bp = le64_to_cpu(level_ptr[idx]);
		if (bp == 0)
			return 0;
		blocknr = blocknr & ((1 << bit_shift) - 1);
		height--;
	}
	return bp;
}
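
A worked example: with height = 2 and META_BLK_SHIFT = 9, looking up blocknr = 1234 takes idx = 1234 >> 9 = 2 in the root node, then continues with blocknr = 1234 & 511 = 210 as the index into the second-level node; the bp stored there is the data block's byte offset from the start of the PM region.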

6. pmfs_get_inode

Look up an inode in the b512 tree anchored at the inode_table.

static inline struct pmfs_inode *pmfs_get_inode(struct super_block *sb,
						  u64	ino)
{
	struct pmfs_super_block *ps = pmfs_get_super(sb);
	struct pmfs_inode *inode_table = pmfs_get_inode_table(sb);
	u64 bp, block, ino_offset;

	if (ino == 0)
		return NULL;

	block = ino >> pmfs_inode_blk_shift(inode_table);
	bp = __pmfs_find_data_block(sb, inode_table, block);

	if (bp == 0)
		return NULL;
	ino_offset = (ino & (pmfs_inode_blk_size(inode_table) - 1));
	return (struct pmfs_inode *)((void *)ps + bp + ino_offset);
}
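
For example, if the inode table uses 4 KB blocks (so pmfs_inode_blk_shift(inode_table) is 12), ino = 0x2080 resolves to block 2 of the inode table with ino_offset = 0x80; the pmfs_inode sits 0x80 bytes into that block. This is consistent with reading i_ino as a byte offset within the inode table, as noted earlier.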

7. __pmfs_alloc_blocks

Function: allocate num blocks starting at block number file_blocknr.

First convert the 4 KB block numbers into block numbers in the pmfs_inode's block size. If the last block to allocate has a block number larger than the b512 tree can currently address, the tree's height must grow. Allocation then splits into cases:

1. The tree is empty. If height is 0, simply create one block and make it the root of the b512 tree, with no recursive allocation. If height is non-zero, first increase the b512 tree's height, then recursively allocate blocks first_blocknr through last_blocknr.

2. The tree is non-empty. If height is 0, return 0 immediately. If height is non-zero and greater than the tree's current height, first increase the height, then recursively allocate the required blocks.

My guess is that last_blocknr = 0 here would indicate a bogus block-number request.

int __pmfs_alloc_blocks(pmfs_transaction_t *trans, struct super_block *sb,
	struct pmfs_inode *pi, unsigned long file_blocknr, unsigned int num,
	bool zero)
{
	int errval;
	unsigned long max_blocks;
	unsigned int height;
	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
	unsigned int blk_shift, meta_bits = META_BLK_SHIFT;
	unsigned long blocknr, first_blocknr, last_blocknr, total_blocks;
	timing_t alloc_time;

	/* convert the 4K blocks into the actual blocks the inode is using */
	blk_shift = data_bits - sb->s_blocksize_bits;

	PMFS_START_TIMING(alloc_blocks_t, alloc_time);
	first_blocknr = file_blocknr >> blk_shift;
	last_blocknr = (file_blocknr + num - 1) >> blk_shift;

	pmfs_dbg_verbose("alloc_blocks height %d file_blocknr %lx num %x, "
		"first blocknr 0x%lx, last_blocknr 0x%lx\n",
		pi->height, file_blocknr, num, first_blocknr, last_blocknr);

	height = pi->height;

	blk_shift = height * meta_bits;

	max_blocks = 0x1UL << blk_shift;

	if (last_blocknr > max_blocks - 1) {
		/* B-tree height increases as a result of this allocation */
		total_blocks = last_blocknr >> blk_shift;
		while (total_blocks > 0) {
			total_blocks = total_blocks >> meta_bits;
			height++;
		}
		if (height > 3) {
			pmfs_dbg("[%s:%d] Max file size. Cant grow the file\n",
				__func__, __LINE__);
			errval = -ENOSPC;
			goto fail;
		}
	}

	if (!pi->root) {
		if (height == 0) {
			__le64 root;
			errval = pmfs_new_data_block(sb, pi, &blocknr, zero);
			if (errval) {
				pmfs_dbg_verbose("[%s:%d] failed: alloc data"
					" block\n", __func__, __LINE__);
				goto fail;
			}
			root = cpu_to_le64(pmfs_get_block_off(sb, blocknr,
				pi->i_blk_type));
			pmfs_memunlock_inode(sb, pi);
			pi->root = root;
			pi->height = height;
			pmfs_memlock_inode(sb, pi);
		}
		else {
			errval = pmfs_increase_btree_height(sb, pi, height);
			if (errval) {
				pmfs_dbg_verbose("[%s:%d] failed: inc btree"
					" height\n", __func__, __LINE__);
				goto fail;
			}
			errval = recursive_alloc_blocks(trans, sb, pi, pi->root,
				pi->height, first_blocknr, last_blocknr, 1, zero);
			if (errval < 0)
				goto fail;
		}
	}
	else {
		/* Go forward only if the height of the tree is non-zero. */
		if (height == 0)
			return 0;

		if (height > pi->height) {
			errval = pmfs_increase_btree_height(sb, pi, height);
			if (errval) {
				pmfs_dbg_verbose("Err: inc height %x:%x tot %lx"
					"\n", pi->height, height, total_blocks);
				goto fail;
			}
		}
		errval = recursive_alloc_blocks(trans, sb, pi, pi->root, height,
			first_blocknr, last_blocknr, 0, zero);
		if (errval < 0)
			goto fail;
	}
	PMFS_END_TIMING(alloc_blocks_t, alloc_time);
	return 0;
fail:
	PMFS_END_TIMING(alloc_blocks_t, alloc_time);
	return errval;
}

8. pmfs_new_block

Function: allocate a new block of type btype using the doubly linked circular list of in-use block ranges; the new block's number is returned through *blocknr (callers convert it to a logical address with pmfs_get_block_off).

int pmfs_new_block(struct super_block *sb, unsigned long *blocknr,
	unsigned short btype, int zero)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	struct list_head *head = &(sbi->block_inuse_head);
	struct pmfs_blocknode *i, *next_i;
	struct pmfs_blocknode *free_blocknode = NULL;
	void *bp;
	unsigned long num_blocks = 0;
	struct pmfs_blocknode *curr_node;
	int errval = 0;
	bool found = 0;
	unsigned long next_block_low;
	unsigned long new_block_low;
	unsigned long new_block_high;

	num_blocks = pmfs_get_numblocks(btype);

	mutex_lock(&sbi->s_lock);

	list_for_each_entry(i, head, link) {
		if (i->link.next == head) {
			next_i = NULL;
			next_block_low = sbi->block_end;
		}
		else {
			next_i = list_entry(i->link.next, typeof(*i), link);
			next_block_low = next_i->block_low;
		}

		new_block_low = (i->block_high + num_blocks) & ~(num_blocks - 1);
		new_block_high = new_block_low + num_blocks - 1;

		if (new_block_high >= next_block_low) {
			/* Does not fit - skip to next blocknode */
			continue;
		}

		if ((new_block_low == (i->block_high + 1)) &&
			(new_block_high == (next_block_low - 1)))
		{
			/* Fill the gap completely */
			if (next_i) {
				i->block_high = next_i->block_high;
				list_del(&next_i->link);
				free_blocknode = next_i;
				sbi->num_blocknode_allocated--;
			}
			else {
				i->block_high = new_block_high;
			}
			found = 1;
			break;
		}

		if ((new_block_low == (i->block_high + 1)) &&
			(new_block_high < (next_block_low - 1))) {
			/* Aligns to left */
			i->block_high = new_block_high;
			found = 1;
			break;
		}

		if ((new_block_low > (i->block_high + 1)) &&
			(new_block_high == (next_block_low - 1))) {
			/* Aligns to right */
			if (next_i) {
				/* right node exist */
				next_i->block_low = new_block_low;
			}
			else {
				/* right node does NOT exist */
				curr_node = pmfs_alloc_blocknode(sb);
				PMFS_ASSERT(curr_node);
				if (curr_node == NULL) {
					errval = -ENOSPC;
					break;
				}
				curr_node->block_low = new_block_low;
				curr_node->block_high = new_block_high;
				list_add(&curr_node->link, &i->link);
			}
			found = 1;
			break;
		}

		if ((new_block_low > (i->block_high + 1)) &&
			(new_block_high < (next_block_low - 1))) {
			/* Aligns somewhere in the middle */
			curr_node = pmfs_alloc_blocknode(sb);
			PMFS_ASSERT(curr_node);
			if (curr_node == NULL) {
				errval = -ENOSPC;
				break;
			}
			curr_node->block_low = new_block_low;
			curr_node->block_high = new_block_high;
			list_add(&curr_node->link, &i->link);
			found = 1;
			break;
		}
	}

	if (found == 1) {
		sbi->num_free_blocks -= num_blocks;
	}

	mutex_unlock(&sbi->s_lock);

	if (free_blocknode)
		__pmfs_free_blocknode(free_blocknode);

	if (found == 0) {
		return -ENOSPC;
	}

	if (zero) {
		size_t size;
		bp = pmfs_get_block(sb, pmfs_get_block_off(sb, new_block_low, btype));
		pmfs_memunlock_block(sb, bp); //TBDTBD: Need to fix this
		if (btype == PMFS_BLOCK_TYPE_4K)
			size = 0x1 << 12;
		else if (btype == PMFS_BLOCK_TYPE_2M)
			size = 0x1 << 21;
		else
			size = 0x1 << 30;
		memset_nt(bp, 0, size);
		pmfs_memlock_block(sb, bp);
	}
	*blocknr = new_block_low;

	return errval;
}
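
Note the alignment in new_block_low = (i->block_high + num_blocks) & ~(num_blocks - 1): the candidate range is rounded up to a num_blocks boundary, so a 2 MB allocation (num_blocks = 512) always starts on a 512-block boundary. The four cases in the loop then decide whether the new range extends the left neighbor, extends the right neighbor, closes the gap between the two entirely (freeing one pmfs_blocknode), or needs a fresh pmfs_blocknode in the middle.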

9. pmfs_increase_btree_height

Increase the height of the tree.

static int pmfs_increase_btree_height(struct super_block *sb,
	struct pmfs_inode *pi, u32 new_height)
{
	u32 height = pi->height;
	__le64 *root, prev_root = pi->root;
	unsigned long blocknr;
	int errval = 0;

	pmfs_dbg_verbose("increasing tree height %x:%x\n", height, new_height);
	while (height < new_height) {
		/* allocate the meta block */
		errval = pmfs_new_block(sb, &blocknr, PMFS_BLOCK_TYPE_4K, 1);
		if (errval) {
			pmfs_err(sb, "failed to increase btree height\n");
			break;
		}
		blocknr = pmfs_get_block_off(sb, blocknr, PMFS_BLOCK_TYPE_4K);
		root = pmfs_get_block(sb, blocknr);
		pmfs_memunlock_block(sb, root);
		root[0] = prev_root;
		pmfs_memlock_block(sb, root);
		pmfs_flush_buffer(root, sizeof(*root), false);
		prev_root = cpu_to_le64(blocknr);
		height++;
	}
	pmfs_memunlock_inode(sb, pi);
	pi->root = prev_root;
	pi->height = height;
	pmfs_memlock_inode(sb, pi);
	return errval;
}
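
Each iteration allocates a fresh 4 KB meta block and stores the previous root in its slot 0. Because slot 0 is used, all existing block numbers keep resolving correctly after the height increase: the extra high-order index bits of any old blocknr are zero, so the descent in __pmfs_find_data_block simply passes through slot 0 of each newly added level.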

10. recursive_alloc_blocks

Function: recursively allocate blocks first_blocknr through last_blocknr in the b512 tree, following the scheme described above.

static int recursive_alloc_blocks(pmfs_transaction_t *trans,
	struct super_block *sb, struct pmfs_inode *pi, __le64 block, u32 height,
	unsigned long first_blocknr, unsigned long last_blocknr, bool new_node,
	bool zero)
{
	int i, errval;
	unsigned int meta_bits = META_BLK_SHIFT, node_bits;
	__le64 *node;
	bool journal_saved = 0;
	unsigned long blocknr, first_blk, last_blk;
	unsigned int first_index, last_index;
	unsigned int flush_bytes;

	node = pmfs_get_block(sb, le64_to_cpu(block));

	node_bits = (height - 1) * meta_bits;

	first_index = first_blocknr >> node_bits;
	last_index = last_blocknr >> node_bits;

	for (i = first_index; i <= last_index; i++) {
		if (height == 1) {
			if (node[i] == 0) {
				errval = pmfs_new_data_block(sb, pi, &blocknr,
					zero);
				if (errval) {
					pmfs_dbg_verbose("alloc data blk failed"
						" %d\n", errval);
					/* For later recovery in truncate... */
					pmfs_memunlock_inode(sb, pi);
					pi->i_flags |= cpu_to_le32(
						PMFS_EOFBLOCKS_FL);
					pmfs_memlock_inode(sb, pi);
					return errval;
				}
				/* save the meta-data into the journal before
				 * modifying */
				if (new_node == 0 && journal_saved == 0) {
					int le_size = (last_index - i + 1) << 3;
					pmfs_add_logentry(sb, trans, &node[i],
						le_size, LE_DATA);
					journal_saved = 1;
				}
				pmfs_memunlock_block(sb, node);
				node[i] = cpu_to_le64(pmfs_get_block_off(sb,
					blocknr, pi->i_blk_type));
				pmfs_memlock_block(sb, node);
			}
		}
		else {
			if (node[i] == 0) {
				/* allocate the meta block */
				errval = pmfs_new_block(sb, &blocknr,
					PMFS_BLOCK_TYPE_4K, 1);
				if (errval) {
					pmfs_dbg_verbose("alloc meta blk"
						" failed\n");
					goto fail;
				}
				/* save the meta-data into the journal before
				 * modifying */
				if (new_node == 0 && journal_saved == 0) {
					int le_size = (last_index - i + 1) << 3;
					pmfs_add_logentry(sb, trans, &node[i],
						le_size, LE_DATA);
					journal_saved = 1;
				}
				pmfs_memunlock_block(sb, node);
				node[i] = cpu_to_le64(pmfs_get_block_off(sb,
					blocknr, PMFS_BLOCK_TYPE_4K));
				pmfs_memlock_block(sb, node);
				new_node = 1;
			}

			first_blk = (i == first_index) ? (first_blocknr &
				((1 << node_bits) - 1)) : 0;

			last_blk = (i == last_index) ? (last_blocknr &
				((1 << node_bits) - 1)) : (1 << node_bits) - 1;

			errval = recursive_alloc_blocks(trans, sb, pi, node[i],
				height - 1, first_blk, last_blk, new_node, zero);
			if (errval < 0)
				goto fail;
		}
	}
	if (new_node || trans == NULL) {
		/* if the changes were not logged, flush the cachelines we may
		* have modified */
		flush_bytes = (last_index - first_index + 1) * sizeof(node[0]);
		pmfs_flush_buffer(&node[first_index], flush_bytes, false);
	}
	errval = 0;
fail:
	return errval;
}

III. The File Write Process

ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) writes len bytes from buf into file filp starting at position *ppos.

If all the data falls within a single block, no transaction is needed and pmfs_file_write_fast is used; otherwise a new transaction is created, log entries are added, data blocks are allocated, and __pmfs_xip_file_write performs the write.
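
As a worked example of the single-block test: with 2 MB inode blocks (pmfs_inode_blk_shift(pi) = 21), a write of count = 8192 bytes at offset = 100 within the block gives (count + offset - 1) >> 21 == 0, so the whole write fits in one block and the fast path is taken, provided the block is already allocated (block != 0).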

IV. Main Functions in the Write Path

The write entry point:

ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf,
	size_t len, loff_t *ppos)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode    *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	pmfs_transaction_t *trans;
	struct pmfs_inode *pi;
	ssize_t     written = 0;
	loff_t pos;
	u64 block;
	bool new_sblk = false, new_eblk = false;
	size_t count, offset, eblk_offset, ret;
	unsigned long start_blk, end_blk, num_blocks, max_logentries;
	bool same_block;
	timing_t xip_write_time, xip_write_fast_time;

	PMFS_START_TIMING(xip_write_t, xip_write_time);

	sb_start_write(inode->i_sb);
	mutex_lock(&inode->i_mutex);

	if (!access_ok(VERIFY_READ, buf, len)) {
		ret = -EFAULT;
		goto out;
	}
	pos = *ppos;
	count = len;
	if (count == 0) {
		ret = 0;
		goto out;
	}

#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,0,9)
	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
	if (ret || count == 0)
		goto out;
#endif
	pi = pmfs_get_inode(sb, inode->i_ino);

	offset = pos & (sb->s_blocksize - 1);
	num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
	/* offset in the actual block size block */
	offset = pos & (pmfs_inode_blk_size(pi) - 1);
	start_blk = pos >> sb->s_blocksize_bits;
	end_blk = start_blk + num_blocks - 1;

	block = pmfs_find_data_block(inode, start_blk);

	/* Referring to the inode's block size, not 4K */
	same_block = (((count + offset - 1) >>
		pmfs_inode_blk_shift(pi)) == 0) ? 1 : 0;
	if (block && same_block) {
		PMFS_START_TIMING(xip_write_fast_t, xip_write_fast_time);
		ret = pmfs_file_write_fast(sb, inode, pi, buf, count, pos,
			ppos, block);
		PMFS_END_TIMING(xip_write_fast_t, xip_write_fast_time);
		goto out;
	}
	max_logentries = num_blocks / MAX_PTRS_PER_LENTRY + 2;
	if (max_logentries > MAX_METABLOCK_LENTRIES)
		max_logentries = MAX_METABLOCK_LENTRIES;

	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + max_logentries);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}
	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);

	ret = file_remove_privs(filp);
	if (ret) {
		pmfs_abort_transaction(sb, trans);
		goto out;
	}
	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
	pmfs_update_time(inode, pi);

	/* We avoid zeroing the alloc'd range, which is going to be overwritten
	 * by this system call anyway */
	if (offset != 0) {
		if (pmfs_find_data_block(inode, start_blk) == 0)
			new_sblk = true;
	}

	eblk_offset = (pos + count) & (pmfs_inode_blk_size(pi) - 1);
	if ((eblk_offset != 0) &&
		(pmfs_find_data_block(inode, end_blk) == 0))
		new_eblk = true;

	/* don't zero-out the allocated blocks */
	pmfs_alloc_blocks(trans, inode, start_blk, num_blocks, false);

	/* now zero out the edge blocks which will be partially written */
	pmfs_clear_edge_blk(sb, pi, new_sblk, start_blk, offset, false);
	pmfs_clear_edge_blk(sb, pi, new_eblk, end_blk, eblk_offset, true);

	written = __pmfs_xip_file_write(mapping, buf, count, pos, ppos);
	if (written < 0 || written != count)
		pmfs_dbg_verbose("write incomplete/failed: written %ld len %ld"
			" pos %llx start_blk %lx num_blocks %lx\n",
			written, count, pos, start_blk, num_blocks);

	pmfs_commit_transaction(sb, trans);
	ret = written;
out:
	mutex_unlock(&inode->i_mutex);
	sb_end_write(inode->i_sb);
	PMFS_END_TIMING(xip_write_t, xip_write_time);
	return ret;
}

pmfs_file_write_fast: the data to write fits in a single block. Find the block's virtual address, write the data, then update i_size, i_ctime and i_mtime.

static ssize_t pmfs_file_write_fast(struct super_block *sb, struct inode *inode,
	struct pmfs_inode *pi, const char __user *buf, size_t count, loff_t pos,
	loff_t *ppos, u64 block)
{
	void *xmem = pmfs_get_block(sb, block);
	size_t copied, ret = 0, offset;
	timing_t memcpy_time;

	offset = pos & (sb->s_blocksize - 1);

	PMFS_START_TIMING(memcpy_w_t, memcpy_time);
	pmfs_xip_mem_protect(sb, xmem + offset, count, 1);
	copied = memcpy_to_nvmm((char *)xmem, offset, buf, count);
	pmfs_xip_mem_protect(sb, xmem + offset, count, 0);
	PMFS_END_TIMING(memcpy_w_t, memcpy_time);

	pmfs_flush_edge_cachelines(pos, copied, xmem + offset);

	if (likely(copied > 0)) {
		pos += copied;
		ret = copied;
	}
	if (unlikely(copied != count && copied == 0))
		ret = -EFAULT;
	*ppos = pos;
	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
	if (pos > inode->i_size) {
		/* make sure written data is persistent before updating
		* time and size */
		PERSISTENT_MARK();
		i_size_write(inode, pos);
		PERSISTENT_BARRIER();
		pmfs_memunlock_inode(sb, pi);
		pmfs_update_time_and_size(inode, pi);
		pmfs_memlock_inode(sb, pi);
	}
	else {
		u64 c_m_time;
		/* update c_time and m_time atomically. We don't need to make the data
		 * persistent because the expectation is that the close() or an explicit
		 * fsync will do that. */
		c_m_time = (inode->i_ctime.tv_sec & 0xFFFFFFFF);
		c_m_time = c_m_time | (c_m_time << 32);
		pmfs_memunlock_inode(sb, pi);
		pmfs_memcpy_atomic(&pi->i_ctime, &c_m_time, 8);
		pmfs_memlock_inode(sb, pi);
	}
	pmfs_flush_buffer(pi, 1, false);
	return ret;
}
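
The else branch is worth a note: it packs the 32-bit ctime into both halves of a 64-bit value and writes it over pi->i_ctime with pmfs_memcpy_atomic. Assuming i_ctime and i_mtime are adjacent 32-bit fields in struct pmfs_inode (which the 8-byte copy implies), one atomic 8-byte store updates both timestamps, sparing a log entry on this common path.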

__pmfs_xip_file_write: for each page, determine the page number, the in-page offset, and the number of bytes to write to that page; find the virtual address of the page's data block, write the data, and advance the counters.

static ssize_t
__pmfs_xip_file_write(struct address_space *mapping, const char __user *buf,
	size_t count, loff_t pos, loff_t *ppos)
{
	struct inode    *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	long        status = 0;
	size_t      bytes;
	ssize_t     written = 0;
	struct pmfs_inode *pi;
	timing_t memcpy_time, write_time;

	PMFS_START_TIMING(internal_write_t, write_time);
	pi = pmfs_get_inode(sb, inode->i_ino);
	do {
		unsigned long index;
		unsigned long offset;
		size_t copied;
		void *xmem;
		unsigned long xpfn;
		unsigned long block_nr = 0;

		offset = (pos & (sb->s_blocksize - 1)); /* Within page */
		index = pos >> sb->s_blocksize_bits;
		bytes = sb->s_blocksize - offset;
		if (bytes > count)
			bytes = count;

		status = pmfs_get_xip_mem(mapping, index, 1, &xmem, &xpfn);
		if (status)
			break;

		PMFS_START_TIMING(memcpy_w_t, memcpy_time);
		pmfs_xip_mem_protect(sb, xmem + offset, bytes, 1);
		// copy from user's buffer to xmem + offset
		copied = memcpy_to_nvmm((char *)xmem, offset, buf, bytes);
		pmfs_xip_mem_protect(sb, xmem + offset, bytes, 0);
		PMFS_END_TIMING(memcpy_w_t, memcpy_time);

		/* if start or end dest address is not 8 byte aligned,
		 * __copy_from_user_inatomic_nocache uses cacheable instructions
		 * (instead of movnti) to write. So flush those cachelines. */
		pmfs_flush_edge_cachelines(pos, copied, xmem + offset);

		if (likely(copied > 0)) {
			status = copied;

			if (status >= 0) {
				written += status;
				count -= status;
				pos += status;
				buf += status;
			}
		}
		if (unlikely(copied != bytes))
			if (status >= 0)
				status = -EFAULT;
		if (status < 0)
			break;
	} while (count);
	*ppos = pos;
	/*
	* No need to use i_size_read() here, the i_size
	* cannot change under us because we hold i_mutex.
	*/
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		pmfs_update_isize(inode, pi);
	}

	PMFS_END_TIMING(internal_write_t, write_time);
	return written ? written : status;
}
