I. The File Read Path
1. Read parameters
ssize_t pmfs_xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) reads len bytes starting at position *ppos of the file filp, stores them in the buffer buf, and returns the number of bytes read.
2. Overall flow
From ppos, compute the starting page number of the read, index = pos >> PAGE_CACHE_SHIFT, and from the i_size recorded in the host of the file's mapping, compute the file's last page number, end_index. Loop from page index to page end_index until len bytes have been read. For each page, derive the page's virtual address from its page number, then perform the read using the in-page offset (the read position) and the number of bytes to read.
The core problem is finding the virtual address that backs each page; only then can the data actually be read.
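A minimal standalone sketch of this page arithmetic (assuming 4 KB pages, PAGE_CACHE_SHIFT = 12; the program and its values are illustrative, not PMFS code):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
	unsigned long pos = 10000, isize = 20000;	/* illustrative file position and size */

	unsigned long index = pos >> PAGE_CACHE_SHIFT;		/* starting page: 2 */
	unsigned long offset = pos & ~PAGE_CACHE_MASK;		/* in-page offset: 10000 - 8192 = 1808 */
	unsigned long end_index = (isize - 1) >> PAGE_CACHE_SHIFT; /* last page: 4 */

	printf("index=%lu offset=%lu end_index=%lu\n", index, offset, end_index);
	return 0;
}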
3. Looking up a data block: return the block's virtual address if found, otherwise allocate one
(1) First, use the inode number recorded in the host of the file's mapping, file->mapping->host->i_ino, to locate the file's inode. My understanding is that this i_ino is the byte offset of the pmfs_inode's position within the b512 tree maintained by inode_table.
From the superblock, locate inode_table; inode_table anchors a b512 tree, and from i_ino one can find bp, the inode's position relative to the starting virtual address (a byte offset).
(2) With the returned inode, look up the data block via __pmfs_find_data_block. Each inode maintains its own b512 tree; to resolve a block number to a virtual address, the external block number must first be converted to a block number in the inode's own block size, i.e.
blk_shift = data_bits - sb->s_blocksize_bits;
blk_offset = file_blocknr & ((1 << blk_shift) - 1);
blocknr = file_blocknr >> blk_shift;
What remains is a lookup on the b512 tree, which returns the block's logical address.
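A worked example of the conversion (assuming the inode uses 2 MB blocks, data_bits = 21, as the annotated code later assumes, and a 4 KB filesystem block, s_blocksize_bits = 12; the standalone program is illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int data_bits = 21;		/* 2 MB blocks used by this inode (assumed) */
	unsigned int s_blocksize_bits = 12;	/* 4 KB filesystem block */
	unsigned long file_blocknr = 0x1234;	/* external 4 KB block number */

	unsigned int blk_shift = data_bits - s_blocksize_bits;		/* 9 */
	unsigned long blk_offset = file_blocknr & ((1 << blk_shift) - 1); /* 0x34: 4 KB slot inside the 2 MB block */
	unsigned long blocknr = file_blocknr >> blk_shift;		/* 0x9: block number in the inode's tree */

	printf("blk_shift=%u blk_offset=%#lx blocknr=%#lx\n",
	       blk_shift, blk_offset, blocknr);
	return 0;
}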
4. Allocating data blocks
First locate the corresponding inode; based on how many data pages must be allocated, grow the height of the b512 tree as needed, then allocate recursively.
Blocks numbered first_blocknr through last_blocknr are allocated.
The allocation strategy is as follows (a standalone sketch of the divide step appears after this list):
Base case: the tree height is 1; allocate directly. Data blocks are organized by a doubly linked circular list.
Divide: spread blocks first_blocknr through last_blocknr over the index entries of the current tree node, each index covering a different range of blocks:
node_bits = (height - 1) * meta_bits;
first_index = first_blocknr >> node_bits;
last_index = last_blocknr >> node_bits;
Recurse: with the height reduced by 1, recursively allocate the corresponding block range under each index.
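A minimal sketch of the divide step (assuming META_BLK_SHIFT = 9, i.e. 512 pointers per node; it prints the sub-range handed to each child, mirroring the first_blk/last_blk computation in recursive_alloc_blocks; values illustrative):

#include <stdio.h>

#define META_BLK_SHIFT 9

int main(void)
{
	unsigned int height = 2;
	unsigned long first_blocknr = 100, last_blocknr = 1500;

	unsigned int node_bits = (height - 1) * META_BLK_SHIFT;	/* 9 */
	unsigned int first_index = first_blocknr >> node_bits;	/* 0 */
	unsigned int last_index = last_blocknr >> node_bits;	/* 2 */

	for (unsigned int i = first_index; i <= last_index; i++) {
		unsigned long first_blk = (i == first_index) ?
			(first_blocknr & ((1UL << node_bits) - 1)) : 0;
		unsigned long last_blk = (i == last_index) ?
			(last_blocknr & ((1UL << node_bits) - 1)) :
			(1UL << node_bits) - 1;
		/* index 0 -> [100, 511], index 1 -> [0, 511], index 2 -> [0, 476] */
		printf("index %u -> child range [%lu, %lu]\n", i, first_blk, last_blk);
	}
	return 0;
}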
II. The Main Functions
1. do_xip_mapping_read
mapping (a struct address_space) manages how a file maps onto memory pages; every file has one, binding the file's data in the filesystem to the memory that backs it.
Function: read len bytes of file filp starting at position ppos; returns the number of bytes actually read. The annotated code follows.
static ssize_t do_xip_mapping_read(struct address_space *mapping,
				   struct file_ra_state *_ra,
				   struct file *filp,
				   char __user *buf,
				   size_t len,
				   loff_t *ppos)
{
	struct inode *inode = mapping->host;	/* in-memory inode backing this mapping */
	pgoff_t index, end_index;		/* first and last page numbers of the read */
	unsigned long offset;			/* offset of the read within page index */
	loff_t isize, pos;
	size_t copied = 0, error = 0;
	timing_t memcpy_time;

	pos = *ppos;
	index = pos >> PAGE_CACHE_SHIFT;	/* pos / 4 KB: starting page number, counted from 0 */
	offset = pos & ~PAGE_CACHE_MASK;	/* pos mod 4 KB: in-page offset, counted from 0 */

	isize = i_size_read(inode);		/* size of filp's data */
	if (!isize)
		goto out;

	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;	/* last page of filp; note isize is decremented before shifting */
	do {
		unsigned long nr, left;
		void *xip_mem;
		unsigned long xip_pfn;
		int zero = 0;

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;	/* assume a full 4 KB can be read from this page */
		if (index >= end_index) {
			if (index > end_index)
				goto out;
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				goto out;
			}
		}
		nr = nr - offset;
		if (nr > len - copied)
			nr = len - copied;

		/* find the virtual address and physical frame number of page index */
		error = pmfs_get_xip_mem(mapping, index, 0,
					 &xip_mem, &xip_pfn);	/* xip_mem is the page's virtual address */
		if (unlikely(error)) {
			if (error == -ENODATA) {
				/* sparse */
				zero = 1;
			} else
				goto out;
		}

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			/* address based flush */;

		/*
		 * Ok, we have the mem, so now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		PMFS_START_TIMING(memcpy_r_t, memcpy_time);
		if (!zero)
			left = __copy_to_user(buf + copied, xip_mem + offset, nr);	/* copy the data into buf */
		else
			left = __clear_user(buf + copied, nr);
		PMFS_END_TIMING(memcpy_r_t, memcpy_time);

		if (left) {
			error = -EFAULT;
			goto out;
		}

		/* advance the cursors */
		copied += (nr - left);
		offset += (nr - left);
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
	} while (copied < len);

out:
	*ppos = pos + copied;
	if (filp)
		file_accessed(filp);	/* update the file's access time */

	return (copied ? copied : error);
}
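One subtlety worth checking by hand is the valid-byte count on the last page, nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1. A standalone sketch (4 KB pages assumed; values illustrative):

#include <stdio.h>

#define PAGE_CACHE_SHIFT 12
#define PAGE_CACHE_SIZE (1UL << PAGE_CACHE_SHIFT)
#define PAGE_CACHE_MASK (~(PAGE_CACHE_SIZE - 1))

int main(void)
{
	unsigned long isize = 20000;	/* file size in bytes */

	/* the last page holds bytes 16384..19999, i.e. 3616 valid bytes */
	unsigned long nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
	printf("valid bytes on last page: %lu\n", nr);	/* 3616 */

	/* a file ending exactly on a page boundary still yields a full page */
	isize = 16384;
	nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
	printf("valid bytes on last page: %lu\n", nr);	/* 4096 */
	return 0;
}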
2. pmfs_get_xip_mem
Returns the virtual address and the physical frame number of a given page.
int pmfs_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
		     void **kmem, unsigned long *pfn)
{
	int rc;
	sector_t block = 0;
	struct inode *inode = mapping->host;

	rc = __pmfs_get_block(inode, pgoff, create, &block);	/* resolve pgoff to its logical address block; rc is the error code */
	if (rc) {
		pmfs_dbg1("[%s:%d] rc(%d), sb->physaddr(0x%llx), block(0x%llx),"
			  " pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__,
			  __LINE__, rc, PMFS_SB(inode->i_sb)->phys_addr,
			  block, pgoff, create, *pfn);
		return rc;
	}

	*kmem = pmfs_get_block(inode->i_sb, block);	/* virtual address of pgoff */
	*pfn = pmfs_get_pfn(inode->i_sb, block);	/* physical frame number of pgoff */

	pmfs_dbg_mmapvv("[%s:%d] sb->physaddr(0x%llx), block(0x%lx),"
			" pgoff(0x%lx), flag(0x%x), PFN(0x%lx)\n", __func__, __LINE__,
			PMFS_SB(inode->i_sb)->phys_addr, block, pgoff, create, *pfn);
	return 0;
}
3. pmfs_find_and_alloc_blocks
Look up a data block; if it is not found, allocate it, then look it up once more.
static int pmfs_find_and_alloc_blocks(struct inode *inode, sector_t iblock,
				      sector_t *data_block, int create)
{
	int err = -EIO;
	u64 block;
	pmfs_transaction_t *trans;
	struct pmfs_inode *pi;

	block = pmfs_find_data_block(inode, iblock);	/* logical address of the block for iblock */

	if (!block) {	/* the block does not exist yet */
		struct super_block *sb = inode->i_sb;
		if (!create) {	/* not allowed to create: report no data */
			err = -ENODATA;
			goto err;
		}

		pi = pmfs_get_inode(sb, inode->i_ino);	/* the pmfs_inode backing this VFS inode */
		trans = pmfs_current_transaction();
		if (trans) {
			err = pmfs_alloc_blocks(trans, inode, iblock, 1, true);	/* allocate the block */
			if (err) {
				pmfs_dbg_verbose("[%s:%d] Alloc failed!\n",
						 __func__, __LINE__);
				goto err;
			}
		} else {
			/* 1 lentry for inode, 1 lentry for inode's b-tree */
			trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES);
			if (IS_ERR(trans)) {
				err = PTR_ERR(trans);
				goto err;
			}

			rcu_read_unlock();
			mutex_lock(&inode->i_mutex);

			pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY,
					  LE_DATA);
			err = pmfs_alloc_blocks(trans, inode, iblock, 1, true);

			pmfs_commit_transaction(sb, trans);

			mutex_unlock(&inode->i_mutex);
			rcu_read_lock();
			if (err) {
				pmfs_dbg_verbose("[%s:%d] Alloc failed!\n",
						 __func__, __LINE__);
				goto err;
			}
		}
		block = pmfs_find_data_block(inode, iblock);	/* after allocating, look it up again */
		if (!block) {
			pmfs_dbg("[%s:%d] But alloc didn't fail!\n",
				 __func__, __LINE__);
			err = -ENODATA;
			goto err;
		}
	}
	pmfs_dbg_mmapvv("iblock 0x%lx allocated_block 0x%llx\n", iblock,
			block);

	*data_block = block;
	err = 0;

err:
	return err;
}
4. pmfs_find_data_block
Look up a data block. Use i_ino to find the inode in the b512 tree anchored at inode_table, then find the block in the inode's own b512 tree. The return value is the block's byte offset from the starting virtual address, i.e. its logical address. Note the conversion of 4 KB block numbers into the block size PMFS actually uses for this inode.
u64 pmfs_find_data_block(struct inode *inode, unsigned long file_blocknr)
{
	struct super_block *sb = inode->i_sb;
	struct pmfs_inode *pi = pmfs_get_inode(sb, inode->i_ino);	/* the pmfs_inode for this VFS inode */
	u32 blk_shift;
	unsigned long blk_offset, blocknr = file_blocknr;
	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];	/* presumably 21, i.e. 2 MB blocks */
	unsigned int meta_bits = META_BLK_SHIFT;
	u64 bp;

	/* convert the 4K blocks into the actual blocks the inode is using */
	blk_shift = data_bits - sb->s_blocksize_bits;
	blk_offset = file_blocknr & ((1 << blk_shift) - 1);	/* offset of file_blocknr within the inode's block */
	blocknr = file_blocknr >> blk_shift;			/* file_blocknr converted to the inode's block numbering */

	if (blocknr >= (1UL << (pi->height * meta_bits)))
		return 0;

	bp = __pmfs_find_data_block(sb, pi, blocknr);	/* walk the b512 tree */
	pmfs_dbg1("find_data_block %lx, %x %llx blk_p %p blk_shift %x"
		  " blk_offset %lx\n", file_blocknr, pi->height, bp,
		  pmfs_get_block(sb, bp), blk_shift, blk_offset);

	if (bp == 0)
		return 0;
	return bp + (blk_offset << sb->s_blocksize_bits);
}
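Continuing the earlier conversion example: once the tree walk returns bp for the enclosing 2 MB block, the requested 4 KB block lies at bp + (blk_offset << s_blocksize_bits). A tiny illustrative sketch (the value of bp is made up):

#include <stdio.h>

int main(void)
{
	unsigned long long bp = 0x40000000;	/* logical address of the 2 MB block (illustrative) */
	unsigned long blk_offset = 0x34;	/* 4 KB slot inside that block */
	unsigned int s_blocksize_bits = 12;

	unsigned long long addr = bp + ((unsigned long long)blk_offset << s_blocksize_bits);
	printf("logical address of the 4 KB block: 0x%llx\n", addr);	/* 0x40034000 */
	return 0;
}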
5. __pmfs_find_data_block
Walk the b512 tree of a pmfs_inode; the return value is the block's byte offset from the starting virtual address.
static inline u64 __pmfs_find_data_block(struct super_block *sb,
					 struct pmfs_inode *pi, unsigned long blocknr)
{
	__le64 *level_ptr;
	u64 bp = 0;
	u32 height, bit_shift;
	unsigned int idx;

	height = pi->height;
	bp = le64_to_cpu(pi->root);

	while (height > 0) {
		level_ptr = pmfs_get_block(sb, bp);
		bit_shift = (height - 1) * META_BLK_SHIFT;
		idx = blocknr >> bit_shift;
		bp = le64_to_cpu(level_ptr[idx]);
		if (bp == 0)
			return 0;
		blocknr = blocknr & ((1 << bit_shift) - 1);
		height--;
	}
	return bp;
}
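The index arithmetic of this walk, traced by hand (assuming META_BLK_SHIFT = 9 and a tree of height 2; values illustrative):

#include <stdio.h>

#define META_BLK_SHIFT 9

int main(void)
{
	unsigned int height = 2;
	unsigned long blocknr = 1234;

	while (height > 0) {
		unsigned int bit_shift = (height - 1) * META_BLK_SHIFT;
		unsigned int idx = blocknr >> bit_shift;
		/* height 2: idx = 1234 >> 9 = 2; height 1: idx = 1234 & 511 = 210 */
		printf("height %u: idx=%u\n", height, idx);
		blocknr = blocknr & ((1UL << bit_shift) - 1);
		height--;
	}
	return 0;
}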
6. pmfs_get_inode
Look up an inode in the b512 tree anchored at the inode_table node.
static inline struct pmfs_inode *pmfs_get_inode(struct super_block *sb,
						u64 ino)
{
	struct pmfs_super_block *ps = pmfs_get_super(sb);
	struct pmfs_inode *inode_table = pmfs_get_inode_table(sb);
	u64 bp, block, ino_offset;

	if (ino == 0)
		return NULL;

	block = ino >> pmfs_inode_blk_shift(inode_table);
	bp = __pmfs_find_data_block(sb, inode_table, block);

	if (bp == 0)
		return NULL;
	ino_offset = (ino & (pmfs_inode_blk_size(inode_table) - 1));
	return (struct pmfs_inode *)((void *)ps + bp + ino_offset);
}
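This is what supports the earlier reading of i_ino as a byte offset: ino is split into a block number within the inode table's tree and a byte offset inside that block. A worked example (assuming the inode table uses 4 KB blocks, so pmfs_inode_blk_shift would return 12; values illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long long ino = 0x2080;	/* byte offset of the inode in the table */
	unsigned int blk_shift = 12;		/* 4 KB inode-table blocks assumed */

	unsigned long long block = ino >> blk_shift;			/* 2: block within the table's tree */
	unsigned long long ino_offset = ino & ((1ULL << blk_shift) - 1);	/* 0x80: offset inside that block */

	printf("block=%llu ino_offset=0x%llx\n", block, ino_offset);
	return 0;
}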
7. __pmfs_alloc_blocks
Purpose: allocate num blocks starting at block number file_blocknr.
First convert the 4 KB block numbers into block numbers in the pmfs_inode's block size. If the number of the last block to be allocated exceeds the largest block number the b512 tree can currently address, the tree's height must grow. Allocation then splits into cases:
1. The tree is empty. If height is 0, simply create one block and make it the root of the b512 tree, with no recursive allocation; if height is non-zero, first increase the tree's height, then recursively allocate blocks first_blocknr through last_blocknr.
2. The tree is non-empty. If height is 0, return 0 immediately; if height is non-zero and greater than the tree's current height, increase the height first, then recursively allocate the corresponding blocks.
My guess is that last_blocknr = 0 would indicate a bogus block-number request.
int __pmfs_alloc_blocks(pmfs_transaction_t *trans, struct super_block *sb,
	struct pmfs_inode *pi, unsigned long file_blocknr, unsigned int num,
	bool zero)
{
	int errval;
	unsigned long max_blocks;
	unsigned int height;
	unsigned int data_bits = blk_type_to_shift[pi->i_blk_type];
	unsigned int blk_shift, meta_bits = META_BLK_SHIFT;
	unsigned long blocknr, first_blocknr, last_blocknr, total_blocks;
	timing_t alloc_time;

	/* convert the 4K blocks into the actual blocks the inode is using */
	blk_shift = data_bits - sb->s_blocksize_bits;

	PMFS_START_TIMING(alloc_blocks_t, alloc_time);
	first_blocknr = file_blocknr >> blk_shift;
	last_blocknr = (file_blocknr + num - 1) >> blk_shift;

	pmfs_dbg_verbose("alloc_blocks height %d file_blocknr %lx num %x, "
			 "first blocknr 0x%lx, last_blocknr 0x%lx\n",
			 pi->height, file_blocknr, num, first_blocknr, last_blocknr);

	height = pi->height;
	blk_shift = height * meta_bits;
	max_blocks = 0x1UL << blk_shift;

	if (last_blocknr > max_blocks - 1) {
		/* B-tree height increases as a result of this allocation */
		total_blocks = last_blocknr >> blk_shift;
		while (total_blocks > 0) {
			total_blocks = total_blocks >> meta_bits;
			height++;
		}
		if (height > 3) {
			pmfs_dbg("[%s:%d] Max file size. Cant grow the file\n",
				 __func__, __LINE__);
			errval = -ENOSPC;
			goto fail;
		}
	}

	if (!pi->root) {
		if (height == 0) {
			__le64 root;
			errval = pmfs_new_data_block(sb, pi, &blocknr, zero);
			if (errval) {
				pmfs_dbg_verbose("[%s:%d] failed: alloc data"
						 " block\n", __func__, __LINE__);
				goto fail;
			}
			root = cpu_to_le64(pmfs_get_block_off(sb, blocknr,
					   pi->i_blk_type));
			pmfs_memunlock_inode(sb, pi);
			pi->root = root;
			pi->height = height;
			pmfs_memlock_inode(sb, pi);
		} else {
			errval = pmfs_increase_btree_height(sb, pi, height);
			if (errval) {
				pmfs_dbg_verbose("[%s:%d] failed: inc btree"
						 " height\n", __func__, __LINE__);
				goto fail;
			}
			errval = recursive_alloc_blocks(trans, sb, pi, pi->root,
					pi->height, first_blocknr, last_blocknr, 1, zero);
			if (errval < 0)
				goto fail;
		}
	} else {
		/* Go forward only if the height of the tree is non-zero. */
		if (height == 0)
			return 0;

		if (height > pi->height) {
			errval = pmfs_increase_btree_height(sb, pi, height);
			if (errval) {
				pmfs_dbg_verbose("Err: inc height %x:%x tot %lx"
						 "\n", pi->height, height, total_blocks);
				goto fail;
			}
		}
		errval = recursive_alloc_blocks(trans, sb, pi, pi->root, height,
				first_blocknr, last_blocknr, 0, zero);
		if (errval < 0)
			goto fail;
	}
	PMFS_END_TIMING(alloc_blocks_t, alloc_time);
	return 0;
fail:
	PMFS_END_TIMING(alloc_blocks_t, alloc_time);
	return errval;
}
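The height-growth loop can be traced by hand (assuming meta_bits = META_BLK_SHIFT = 9, so a tree of height h addresses 512^h blocks; values illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int meta_bits = 9;
	unsigned int height = 1;		/* current tree height: addresses 512 blocks */
	unsigned long last_blocknr = 100000;	/* beyond 512 - 1, so the tree must grow */

	unsigned int blk_shift = height * meta_bits;
	unsigned long total_blocks = last_blocknr >> blk_shift;	/* 100000 >> 9 = 195 */
	while (total_blocks > 0) {
		total_blocks = total_blocks >> meta_bits;
		height++;
	}
	/* one iteration: 195 >> 9 = 0, so height becomes 2 (512^2 = 262144 > 100000) */
	printf("new height: %u\n", height);
	return 0;
}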
8. pmfs_new_block
Purpose: allocate a new block of type btype by scanning the doubly linked circular list of in-use block ranges; the new block's number is returned through blocknr.
int pmfs_new_block(struct super_block *sb, unsigned long *blocknr,
		   unsigned short btype, int zero)
{
	struct pmfs_sb_info *sbi = PMFS_SB(sb);
	struct list_head *head = &(sbi->block_inuse_head);
	struct pmfs_blocknode *i, *next_i;
	struct pmfs_blocknode *free_blocknode = NULL;
	void *bp;
	unsigned long num_blocks = 0;
	struct pmfs_blocknode *curr_node;
	int errval = 0;
	bool found = 0;
	unsigned long next_block_low;
	unsigned long new_block_low;
	unsigned long new_block_high;

	num_blocks = pmfs_get_numblocks(btype);

	mutex_lock(&sbi->s_lock);

	list_for_each_entry(i, head, link) {
		if (i->link.next == head) {
			next_i = NULL;
			next_block_low = sbi->block_end;
		} else {
			next_i = list_entry(i->link.next, typeof(*i), link);
			next_block_low = next_i->block_low;
		}

		new_block_low = (i->block_high + num_blocks) & ~(num_blocks - 1);
		new_block_high = new_block_low + num_blocks - 1;

		if (new_block_high >= next_block_low) {
			/* Does not fit - skip to next blocknode */
			continue;
		}

		if ((new_block_low == (i->block_high + 1)) &&
		    (new_block_high == (next_block_low - 1))) {
			/* Fill the gap completely */
			if (next_i) {
				i->block_high = next_i->block_high;
				list_del(&next_i->link);
				free_blocknode = next_i;
				sbi->num_blocknode_allocated--;
			} else {
				i->block_high = new_block_high;
			}
			found = 1;
			break;
		}

		if ((new_block_low == (i->block_high + 1)) &&
		    (new_block_high < (next_block_low - 1))) {
			/* Aligns to left */
			i->block_high = new_block_high;
			found = 1;
			break;
		}

		if ((new_block_low > (i->block_high + 1)) &&
		    (new_block_high == (next_block_low - 1))) {
			/* Aligns to right */
			if (next_i) {
				/* right node exist */
				next_i->block_low = new_block_low;
			} else {
				/* right node does NOT exist */
				curr_node = pmfs_alloc_blocknode(sb);
				PMFS_ASSERT(curr_node);
				if (curr_node == NULL) {
					errval = -ENOSPC;
					break;
				}
				curr_node->block_low = new_block_low;
				curr_node->block_high = new_block_high;
				list_add(&curr_node->link, &i->link);
			}
			found = 1;
			break;
		}

		if ((new_block_low > (i->block_high + 1)) &&
		    (new_block_high < (next_block_low - 1))) {
			/* Aligns somewhere in the middle */
			curr_node = pmfs_alloc_blocknode(sb);
			PMFS_ASSERT(curr_node);
			if (curr_node == NULL) {
				errval = -ENOSPC;
				break;
			}
			curr_node->block_low = new_block_low;
			curr_node->block_high = new_block_high;
			list_add(&curr_node->link, &i->link);
			found = 1;
			break;
		}
	}

	if (found == 1) {
		sbi->num_free_blocks -= num_blocks;
	}

	mutex_unlock(&sbi->s_lock);

	if (free_blocknode)
		__pmfs_free_blocknode(free_blocknode);

	if (found == 0) {
		return -ENOSPC;
	}

	if (zero) {
		size_t size;
		bp = pmfs_get_block(sb, pmfs_get_block_off(sb, new_block_low, btype));
		pmfs_memunlock_block(sb, bp); //TBDTBD: Need to fix this
		if (btype == PMFS_BLOCK_TYPE_4K)
			size = 0x1 << 12;
		else if (btype == PMFS_BLOCK_TYPE_2M)
			size = 0x1 << 21;
		else
			size = 0x1 << 30;
		memset_nt(bp, 0, size);
		pmfs_memlock_block(sb, bp);
	}
	*blocknr = new_block_low;

	return errval;
}
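The candidate range is found by rounding up to the next num_blocks-aligned slot after the left neighbor's block_high. A worked example (num_blocks = 512, i.e. a 2 MB block counted in 4 KB units; values illustrative):

#include <stdio.h>

int main(void)
{
	unsigned long num_blocks = 512;		/* 2 MB block in 4 KB units */
	unsigned long block_high = 1000;	/* last block of the in-use range to the left */

	/* first 512-aligned slot strictly above block_high */
	unsigned long new_block_low = (block_high + num_blocks) & ~(num_blocks - 1);
	unsigned long new_block_high = new_block_low + num_blocks - 1;

	printf("candidate range: [%lu, %lu]\n", new_block_low, new_block_high);	/* [1024, 1535] */
	return 0;
}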
9. pmfs_increase_btree_height
Increase the tree's height: each iteration allocates a new meta block whose slot 0 points at the previous root, and that block becomes the new root.
static int pmfs_increase_btree_height(struct super_block *sb,
				      struct pmfs_inode *pi, u32 new_height)
{
	u32 height = pi->height;
	__le64 *root, prev_root = pi->root;
	unsigned long blocknr;
	int errval = 0;

	pmfs_dbg_verbose("increasing tree height %x:%x\n", height, new_height);
	while (height < new_height) {
		/* allocate the meta block */
		errval = pmfs_new_block(sb, &blocknr, PMFS_BLOCK_TYPE_4K, 1);
		if (errval) {
			pmfs_err(sb, "failed to increase btree height\n");
			break;
		}
		blocknr = pmfs_get_block_off(sb, blocknr, PMFS_BLOCK_TYPE_4K);
		root = pmfs_get_block(sb, blocknr);
		pmfs_memunlock_block(sb, root);
		root[0] = prev_root;
		pmfs_memlock_block(sb, root);
		pmfs_flush_buffer(root, sizeof(*root), false);
		prev_root = cpu_to_le64(blocknr);
		height++;
	}
	pmfs_memunlock_inode(sb, pi);
	pi->root = prev_root;
	pi->height = height;
	pmfs_memlock_inode(sb, pi);
	return errval;
}
10. recursive_alloc_blocks
Purpose: recursively allocate blocks first_blocknr through last_blocknr in the b512 tree, following the strategy described above.
static int recursive_alloc_blocks(pmfs_transaction_t *trans,
	struct super_block *sb, struct pmfs_inode *pi, __le64 block, u32 height,
	unsigned long first_blocknr, unsigned long last_blocknr, bool new_node,
	bool zero)
{
	int i, errval;
	unsigned int meta_bits = META_BLK_SHIFT, node_bits;
	__le64 *node;
	bool journal_saved = 0;
	unsigned long blocknr, first_blk, last_blk;
	unsigned int first_index, last_index;
	unsigned int flush_bytes;

	node = pmfs_get_block(sb, le64_to_cpu(block));

	node_bits = (height - 1) * meta_bits;

	first_index = first_blocknr >> node_bits;
	last_index = last_blocknr >> node_bits;

	for (i = first_index; i <= last_index; i++) {
		if (height == 1) {
			if (node[i] == 0) {
				errval = pmfs_new_data_block(sb, pi, &blocknr,
							     zero);
				if (errval) {
					pmfs_dbg_verbose("alloc data blk failed"
							 " %d\n", errval);
					/* For later recovery in truncate... */
					pmfs_memunlock_inode(sb, pi);
					pi->i_flags |= cpu_to_le32(
						PMFS_EOFBLOCKS_FL);
					pmfs_memlock_inode(sb, pi);
					return errval;
				}
				/* save the meta-data into the journal before
				 * modifying */
				if (new_node == 0 && journal_saved == 0) {
					int le_size = (last_index - i + 1) << 3;
					pmfs_add_logentry(sb, trans, &node[i],
							  le_size, LE_DATA);
					journal_saved = 1;
				}
				pmfs_memunlock_block(sb, node);
				node[i] = cpu_to_le64(pmfs_get_block_off(sb,
						blocknr, pi->i_blk_type));
				pmfs_memlock_block(sb, node);
			}
		} else {
			if (node[i] == 0) {
				/* allocate the meta block */
				errval = pmfs_new_block(sb, &blocknr,
							PMFS_BLOCK_TYPE_4K, 1);
				if (errval) {
					pmfs_dbg_verbose("alloc meta blk"
							 " failed\n");
					goto fail;
				}
				/* save the meta-data into the journal before
				 * modifying */
				if (new_node == 0 && journal_saved == 0) {
					int le_size = (last_index - i + 1) << 3;
					pmfs_add_logentry(sb, trans, &node[i],
							  le_size, LE_DATA);
					journal_saved = 1;
				}
				pmfs_memunlock_block(sb, node);
				node[i] = cpu_to_le64(pmfs_get_block_off(sb,
						blocknr, PMFS_BLOCK_TYPE_4K));
				pmfs_memlock_block(sb, node);
				new_node = 1;
			}

			first_blk = (i == first_index) ? (first_blocknr &
				((1 << node_bits) - 1)) : 0;

			last_blk = (i == last_index) ? (last_blocknr &
				((1 << node_bits) - 1)) : (1 << node_bits) - 1;

			errval = recursive_alloc_blocks(trans, sb, pi, node[i],
					height - 1, first_blk, last_blk, new_node, zero);
			if (errval < 0)
				goto fail;
		}
	}
	if (new_node || trans == NULL) {
		/* if the changes were not logged, flush the cachelines we may
		 * have modified */
		flush_bytes = (last_index - first_index + 1) * sizeof(node[0]);
		pmfs_flush_buffer(&node[first_index], flush_bytes, false);
	}
	errval = 0;
fail:
	return errval;
}
III. The File Write Path
ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) writes len bytes from buf into file filp, starting at position *ppos.
If all of the data lands in a single block, no transaction is declared and pmfs_file_write_fast runs; otherwise a new transaction is created, log entries are added, data blocks are allocated, and __pmfs_xip_file_write runs.
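The fast-path test reduces to a single shift. A standalone sketch (assuming 2 MB inode blocks, pmfs_inode_blk_shift = 21; values illustrative):

#include <stdio.h>

int main(void)
{
	unsigned int blk_shift = 21;		/* 2 MB inode blocks assumed */
	unsigned long long pos = 0x100000;	/* write starts 1 MB into a block */
	unsigned long long count = 0x80000;	/* 512 KB: stays inside the block */

	unsigned long long offset = pos & ((1ULL << blk_shift) - 1);
	int same_block = (((count + offset - 1) >> blk_shift) == 0);
	printf("same_block=%d\n", same_block);	/* 1: take pmfs_file_write_fast */

	count = 0x180000;			/* 1.5 MB: crosses into the next block */
	same_block = (((count + offset - 1) >> blk_shift) == 0);
	printf("same_block=%d\n", same_block);	/* 0: take the transactional path */
	return 0;
}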
IV. The Main Write-Path Functions
The write entry point:
ssize_t pmfs_xip_file_write(struct file *filp, const char __user *buf,
			    size_t len, loff_t *ppos)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	pmfs_transaction_t *trans;
	struct pmfs_inode *pi;
	ssize_t written = 0;
	loff_t pos;
	u64 block;
	bool new_sblk = false, new_eblk = false;
	size_t count, offset, eblk_offset, ret;
	unsigned long start_blk, end_blk, num_blocks, max_logentries;
	bool same_block;
	timing_t xip_write_time, xip_write_fast_time;

	PMFS_START_TIMING(xip_write_t, xip_write_time);

	sb_start_write(inode->i_sb);
	mutex_lock(&inode->i_mutex);

	if (!access_ok(VERIFY_READ, buf, len)) {
		ret = -EFAULT;
		goto out;
	}
	pos = *ppos;
	count = len;
	if (count == 0) {
		ret = 0;
		goto out;
	}

#if LINUX_VERSION_CODE <= KERNEL_VERSION(4,0,9)
	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
	if (ret || count == 0)
		goto out;
#endif

	pi = pmfs_get_inode(sb, inode->i_ino);

	offset = pos & (sb->s_blocksize - 1);
	num_blocks = ((count + offset - 1) >> sb->s_blocksize_bits) + 1;
	/* offset in the actual block size block */
	offset = pos & (pmfs_inode_blk_size(pi) - 1);
	start_blk = pos >> sb->s_blocksize_bits;
	end_blk = start_blk + num_blocks - 1;

	block = pmfs_find_data_block(inode, start_blk);

	/* Referring to the inode's block size, not 4K */
	same_block = (((count + offset - 1) >>
		      pmfs_inode_blk_shift(pi)) == 0) ? 1 : 0;
	if (block && same_block) {
		PMFS_START_TIMING(xip_write_fast_t, xip_write_fast_time);
		ret = pmfs_file_write_fast(sb, inode, pi, buf, count, pos,
					   ppos, block);
		PMFS_END_TIMING(xip_write_fast_t, xip_write_fast_time);
		goto out;
	}
	max_logentries = num_blocks / MAX_PTRS_PER_LENTRY + 2;
	if (max_logentries > MAX_METABLOCK_LENTRIES)
		max_logentries = MAX_METABLOCK_LENTRIES;

	trans = pmfs_new_transaction(sb, MAX_INODE_LENTRIES + max_logentries);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}
	pmfs_add_logentry(sb, trans, pi, MAX_DATA_PER_LENTRY, LE_DATA);

	ret = file_remove_privs(filp);
	if (ret) {
		pmfs_abort_transaction(sb, trans);
		goto out;
	}
	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
	pmfs_update_time(inode, pi);

	/* We avoid zeroing the alloc'd range, which is going to be overwritten
	 * by this system call anyway */
	if (offset != 0) {
		if (pmfs_find_data_block(inode, start_blk) == 0)
			new_sblk = true;
	}

	eblk_offset = (pos + count) & (pmfs_inode_blk_size(pi) - 1);
	if ((eblk_offset != 0) &&
	    (pmfs_find_data_block(inode, end_blk) == 0))
		new_eblk = true;

	/* don't zero-out the allocated blocks */
	pmfs_alloc_blocks(trans, inode, start_blk, num_blocks, false);

	/* now zero out the edge blocks which will be partially written */
	pmfs_clear_edge_blk(sb, pi, new_sblk, start_blk, offset, false);
	pmfs_clear_edge_blk(sb, pi, new_eblk, end_blk, eblk_offset, true);

	written = __pmfs_xip_file_write(mapping, buf, count, pos, ppos);
	if (written < 0 || written != count)
		pmfs_dbg_verbose("write incomplete/failed: written %ld len %ld"
				 " pos %llx start_blk %lx num_blocks %lx\n",
				 written, count, pos, start_blk, num_blocks);

	pmfs_commit_transaction(sb, trans);
	ret = written;
out:
	mutex_unlock(&inode->i_mutex);
	sb_end_write(inode->i_sb);
	PMFS_END_TIMING(xip_write_t, xip_write_time);
	return ret;
}
pmfs_file_write_fast: the data to write lies within a single block. Find the block's virtual address, write the data, then update i_size, i_ctime, and i_mtime.
static ssize_t pmfs_file_write_fast(struct super_block *sb, struct inode *inode,
	struct pmfs_inode *pi, const char __user *buf, size_t count, loff_t pos,
	loff_t *ppos, u64 block)
{
	void *xmem = pmfs_get_block(sb, block);
	size_t copied, ret = 0, offset;
	timing_t memcpy_time;

	offset = pos & (sb->s_blocksize - 1);

	PMFS_START_TIMING(memcpy_w_t, memcpy_time);
	pmfs_xip_mem_protect(sb, xmem + offset, count, 1);
	copied = memcpy_to_nvmm((char *)xmem, offset, buf, count);
	pmfs_xip_mem_protect(sb, xmem + offset, count, 0);
	PMFS_END_TIMING(memcpy_w_t, memcpy_time);

	pmfs_flush_edge_cachelines(pos, copied, xmem + offset);

	if (likely(copied > 0)) {
		pos += copied;
		ret = copied;
	}
	if (unlikely(copied != count && copied == 0))
		ret = -EFAULT;
	*ppos = pos;
	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
	if (pos > inode->i_size) {
		/* make sure written data is persistent before updating
		 * time and size */
		PERSISTENT_MARK();
		i_size_write(inode, pos);
		PERSISTENT_BARRIER();
		pmfs_memunlock_inode(sb, pi);
		pmfs_update_time_and_size(inode, pi);
		pmfs_memlock_inode(sb, pi);
	} else {
		u64 c_m_time;
		/* update c_time and m_time atomically. We don't need to make the data
		 * persistent because the expectation is that the close() or an explicit
		 * fsync will do that. */
		c_m_time = (inode->i_ctime.tv_sec & 0xFFFFFFFF);
		c_m_time = c_m_time | (c_m_time << 32);
		pmfs_memunlock_inode(sb, pi);
		pmfs_memcpy_atomic(&pi->i_ctime, &c_m_time, 8);
		pmfs_memlock_inode(sb, pi);
	}
	pmfs_flush_buffer(pi, 1, false);
	return ret;
}
__pmfs_xip_file_write: for each page, determine the page number, the in-page offset, and the number of bytes to write into that page; find the virtual address of that page's data block, write the data, and advance the cursors.
static ssize_t
__pmfs_xip_file_write(struct address_space *mapping, const char __user *buf,
		      size_t count, loff_t pos, loff_t *ppos)
{
	struct inode *inode = mapping->host;
	struct super_block *sb = inode->i_sb;
	long status = 0;
	size_t bytes;
	ssize_t written = 0;
	struct pmfs_inode *pi;
	timing_t memcpy_time, write_time;

	PMFS_START_TIMING(internal_write_t, write_time);
	pi = pmfs_get_inode(sb, inode->i_ino);
	do {
		unsigned long index;
		unsigned long offset;
		size_t copied;
		void *xmem;
		unsigned long xpfn;
		unsigned long block_nr = 0;

		offset = (pos & (sb->s_blocksize - 1)); /* Within page */
		index = pos >> sb->s_blocksize_bits;
		bytes = sb->s_blocksize - offset;
		if (bytes > count)
			bytes = count;

		status = pmfs_get_xip_mem(mapping, index, 1, &xmem, &xpfn);
		if (status)
			break;

		PMFS_START_TIMING(memcpy_w_t, memcpy_time);
		pmfs_xip_mem_protect(sb, xmem + offset, bytes, 1);
		// copy from user's buffer to xmem + offset
		copied = memcpy_to_nvmm((char *)xmem, offset, buf, bytes);
		pmfs_xip_mem_protect(sb, xmem + offset, bytes, 0);
		PMFS_END_TIMING(memcpy_w_t, memcpy_time);

		/* if start or end dest address is not 8 byte aligned,
		 * __copy_from_user_inatomic_nocache uses cacheable instructions
		 * (instead of movnti) to write. So flush those cachelines. */
		pmfs_flush_edge_cachelines(pos, copied, xmem + offset);

		if (likely(copied > 0)) {
			status = copied;

			if (status >= 0) {
				written += status;
				count -= status;
				pos += status;
				buf += status;
			}
		}
		if (unlikely(copied != bytes))
			if (status >= 0)
				status = -EFAULT;
		if (status < 0)
			break;
	} while (count);
	*ppos = pos;
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_mutex.
	 */
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		pmfs_update_isize(inode, pi);
	}

	PMFS_END_TIMING(internal_write_t, write_time);
	return written ? written : status;
}