An Analysis of the UBIFS File System Source Code

Part 1: Code analysis of mounting UBIFS

I have been reading the UBIFS source on and off for a long time without writing anything down, because I wanted to wait until I understood UBIFS systematically before writing. But work keeps me busy, my reading of the UBIFS source is intermittent, and I constantly have to go back and pick things up again, which wastes time. So I decided to write some of it down, as a record.
Before reading the UBIFS source, I found it necessary to read two documents about the design of UBI and UBIFS:

One is "UBI - Unsorted Block Images" (ubidesign.pdf);

the other is "A Brief Introduction to the Design of UBIFS" (A Brief Introduction to the Design of UBIFS.pdf).

These two documents concisely introduce the structures and design decisions behind UBIFS.

We analyze the code following the sequence of steps used to mount ubifs:

1. ubiattach /dev/ubi_ctrl -m 0    (attach an MTD partition to UBI; the MTD number 0 here is illustrative)

2. ubimkvol /dev/ubi0 -N ubifs -s 15MiB

3. mount -t ubifs ubi0:ubifs /mnt

Let us begin with step (1); the corresponding code is the ubi_attach_mtd_dev() function. Below we follow the code to see exactly what it does.

1 ubi_attach_mtd_dev

int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset)

{

//ubi_num and vid_hdr_offset are the parameters passed in from the command line

       struct ubi_device *ubi;

       int i, err, do_free = 1;

 

       /*

        * Check if we already have the same MTD device attached.

        *

        * Note, this function assumes that UBI devices creations and deletions

        * are serialized, so it does not take the &ubi_devices_lock.

        */

       for (i = 0; i < UBI_MAX_DEVICES; i++) {

              ubi = ubi_devices[i];

              if (ubi && mtd->index == ubi->mtd->index) {

                     dbg_err("mtd%d is already attached to ubi%d",

                            mtd->index, i);

                     return -EEXIST;

              }

       }

//As the English comment above says, an MTD device (a partition) cannot be attached twice unless it has first been detached, so the code begins by checking whether the MTD device to be attached is already attached.

 

       if (mtd->type == MTD_UBIVOLUME) {

              ubi_err("refuse attaching mtd%d - it is already emulated on "

                     "top of UBI", mtd->index);

              return -EINVAL;

       }

The code then checks whether the MTD device being attached is itself an MTD volume (a volume emulated on top of UBI); if it is already an MTD volume, it cannot be attached again.

       if (ubi_num == UBI_DEV_NUM_AUTO) {

              /* Search for an empty slot in the @ubi_devices array */

              for (ubi_num = 0; ubi_num < UBI_MAX_DEVICES; ubi_num++)

                    if (!ubi_devices[ubi_num])

                           break;

If the command was entered without a UBI number, one is assigned automatically: the code searches the ubi_devices[] array for an unused slot.

              if (ubi_num == UBI_MAX_DEVICES) {

                    dbg_err("only %d UBI devices may be created",

                           UBI_MAX_DEVICES);

                    return -ENFILE;

              }

       } else {

              if (ubi_num >= UBI_MAX_DEVICES)

                    return -EINVAL;

If the caller did specify a ubi_num but it is >= UBI_MAX_DEVICES, the number is out of range and -EINVAL is returned.

              /* Make sure ubi_num is not busy */

              if (ubi_devices[ubi_num]) {

                    dbg_err("ubi%d already exists", ubi_num);

                    return -EEXIST;

              }

       }

 

       ubi = kzalloc(sizeof(struct ubi_device), GFP_KERNEL);

       if (!ubi)

              return -ENOMEM;

 

       ubi->mtd = mtd;

       ubi->ubi_num = ubi_num;

       ubi->vid_hdr_offset = vid_hdr_offset;

       ubi->autoresize_vol_id = -1;

 

       mutex_init(&ubi->buf_mutex);

       mutex_init(&ubi->ckvol_mutex);

       mutex_init(&ubi->mult_mutex);

       mutex_init(&ubi->volumes_mutex);

       spin_lock_init(&ubi->volumes_lock);

Initialize the mutexes and the spinlock.

       ubi_msg("attaching mtd%d to ubi%d", mtd->index, ubi_num);

 

       err = io_init(ubi);

       if (err)

              goto out_free;

Let us follow io_init() and analyze it:

static int io_init(struct ubi_device *ubi)

{

       if (ubi->mtd->numeraseregions != 0) {

              ubi_err("multiple regions, not implemented");

              return -EINVAL;

       }

numeraseregions comes from the MTD layer; a non-zero value means the device consists of multiple erase regions (eraseblocks of different sizes). UBI only supports devices with a uniform eraseblock size, so such devices are rejected.

       if (ubi->vid_hdr_offset < 0)

              return -EINVAL;

ubi->vid_hdr_offset obviously must not be negative; it is normally one NAND page. On our 4020 board the NAND page size is 512 bytes, so ubi->vid_hdr_offset is 512. A short digression: the EC header and the VID header record UBI's management information. The EC header normally sits in the first page of an eraseblock, so its offset is 0; the VID header sits in the second page, so its offset is 512. On the 4020's NAND an eraseblock is 16 KiB, that is, 32 pages.
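To make the geometry concrete, here is a small stand-alone sketch of the header layout just described (the 512-byte page and 16 KiB eraseblock are the 4020 values quoted above; they are example inputs, not something derived from the kernel):

#include <stdio.h>

int main(void)
{
    int page_size = 512;                 /* min. I/O unit of the NAND */
    int peb_size  = 16 * 1024;           /* physical eraseblock = 32 pages */

    int ec_hdr_offset  = 0;              /* EC header: first page of the PEB */
    int vid_hdr_offset = page_size;      /* VID header: second page */
    int data_offset    = 2 * page_size;  /* user data starts after the headers */

    printf("pages per PEB: %d\n", peb_size / page_size);
    printf("EC@%d  VID@%d  data@%d\n",
           ec_hdr_offset, vid_hdr_offset, data_offset);
    return 0;
}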

Next, the geometry obtained from the MTD layer is copied from the mtd_info structure into the ubi_device structure:

       ubi->peb_size  = ubi->mtd->erasesize;

       ubi->peb_count = ubi->mtd->size / ubi->mtd->erasesize;

peb_count is the number of physical eraseblocks, i.e. the total device size divided by the eraseblock size.

       ubi->flash_size = ubi->mtd->size;

 

       if (ubi->mtd->block_isbad && ubi->mtd->block_markbad)

              ubi->bad_allowed = 1;

 

       ubi->min_io_size = ubi->mtd->writesize;

       ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft;

       if (!is_power_of_2(ubi->min_io_size)) {

              ubi_err("min. I/O unit (%d) is not power of 2",

                    ubi->min_io_size);

              return -EINVAL;

       }

 

       ubi_assert(ubi->hdrs_min_io_size > 0);

       ubi_assert(ubi->hdrs_min_io_size <= ubi->min_io_size);

       ubi_assert(ubi->min_io_size % ubi->hdrs_min_io_size == 0);

 

       /* Calculate default aligned sizes of EC and VID headers */

       ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size);

       ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size);

 

       dbg_msg("min_io_size     %d", ubi->min_io_size);

       dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size);

       dbg_msg("ec_hdr_alsize   %d", ubi->ec_hdr_alsize);

       dbg_msg("vid_hdr_alsize  %d", ubi->vid_hdr_alsize);

       if (ubi->vid_hdr_offset == 0)

              /* Default offset */

              ubi->vid_hdr_offset = ubi->vid_hdr_aloffset =

                                 ubi->ec_hdr_alsize;

       else {

              ubi->vid_hdr_aloffset = ubi->vid_hdr_offset &

                                         ~(ubi->hdrs_min_io_size - 1);

              ubi->vid_hdr_shift = ubi->vid_hdr_offset -

                                         ubi->vid_hdr_aloffset;

       }

The rest of io_init() is straightforward, so we do not analyze it further.
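As a worked example of the alignment arithmetic in io_init() above, the following stand-alone sketch reproduces the vid_hdr_aloffset/vid_hdr_shift computation for two illustrative offsets (512 is our sub-page-aligned case; 520 shows what the shift is for):

#include <stdio.h>

int main(void)
{
    int hdrs_min_io_size = 512;          /* assumed sub-page size */
    int offsets[] = { 512, 520 };        /* illustrative vid_hdr_offset values */

    for (int i = 0; i < 2; i++) {
        int off      = offsets[i];
        int aloffset = off & ~(hdrs_min_io_size - 1);   /* round down */
        int shift    = off - aloffset;                  /* remainder */

        printf("vid_hdr_offset=%d -> aloffset=%d shift=%d\n",
               off, aloffset, shift);
    }
    return 0;
}

For 512 this prints aloffset=512 and shift=0, i.e. the VID header starts exactly on a sub-page boundary.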

Continuing with ubi_attach_mtd_dev():

       ubi->peb_buf1 = vmalloc(ubi->peb_size);

       if (!ubi->peb_buf1)

              goto out_free;

 

       ubi->peb_buf2 = vmalloc(ubi->peb_size);

       if (!ubi->peb_buf2)

               goto out_free;

Two buffers, each the size of one physical eraseblock, are allocated; their exact uses are discussed later.

       err = attach_by_scanning(ubi);

       if (err) {

              dbg_err("failed to attach by scanning, error %d", err);

              goto out_free;

       }

Let us follow attach_by_scanning(ubi) in detail:

static int attach_by_scanning(struct ubi_device *ubi)

{

       int err;

       struct ubi_scan_info *si;

 

       si = ubi_scan(ubi);

**********************************************************************************

Here ubi_scan() scans every block of the MTD partition. Concretely, static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum) reads the EC and VID headers (i.e. the first two pages of each block). While reading each page, check_pattern() is called to decide whether the page is empty; if every page is empty, the MTD partition as a whole is found to be empty.
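A minimal sketch of what a check_pattern-style helper does (illustrative, not the kernel's exact code): it simply reports whether a buffer consists entirely of one byte value, 0xFF in the case of erased NAND.

#include <stdio.h>
#include <stdint.h>

/* Return 1 if all @size bytes of @buf equal @patt, 0 otherwise. */
static int check_pattern(const void *buf, uint8_t patt, int size)
{
    for (int i = 0; i < size; i++)
        if (((const uint8_t *)buf)[i] != patt)
            return 0;
    return 1;
}

int main(void)
{
    uint8_t erased[4]  = { 0xff, 0xff, 0xff, 0xff };
    uint8_t written[4] = { 0xff, 0x12, 0xff, 0xff };

    printf("erased: %d, written: %d\n",
           check_pattern(erased, 0xff, 4), check_pattern(written, 0xff, 4));
    return 0;
}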

**********************************************************************************

       if (IS_ERR(si))

              return PTR_ERR(si);

 

       ubi->bad_peb_count = si->bad_peb_count;

       ubi->good_peb_count = ubi->peb_count - ubi->bad_peb_count;

       ubi->max_ec = si->max_ec;

       ubi->mean_ec = si->mean_ec;

 

       err = ubi_read_volume_table(ubi, si);

       if (err)

              goto out_si;

 

       err = ubi_wl_init_scan(ubi, si);

**********************************************************************************

An excerpt from ubi_wl_init_scan(ubi, si):

list_for_each_entry_safe(seb, tmp, &si->erase, u.list) {

              cond_resched();

 

              e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL);

              if (!e)

                    goto out_free;

 

              e->pnum = seb->pnum;

              e->ec = seb->ec;

              ubi->lookuptbl[e->pnum] = e;

              if (schedule_erase(ubi, e, 0)) {

                    kmem_cache_free(ubi_wl_entry_slab, e);

                    goto out_free;

              }

       }

While the wear-leveling subsystem is being initialized, a struct ubi_work *wl_wrk is created for every PEB on the erase list (its handler is erase_worker(), which erases the block and writes an EC header) and is appended to the ubi->works queue with list_add_tail(&wrk->list, &ubi->works). Here we gradually come to see what the ubi->works queue is for: the background thread ubi_thread loops over this queue and processes the queued work items.

During the first attach the ubi_thread process has not yet been woken up at this point, so these work items are processed only once the thread is woken.
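The producer/consumer pattern behind ubi->works is easy to see in a self-contained user-space sketch (the names and the singly linked list are illustrative; the kernel uses struct ubi_work, list_add_tail() and a kthread):

#include <stdio.h>
#include <stdlib.h>

struct work {
    struct work *next;
    int pnum;                          /* PEB this work item refers to */
    void (*func)(struct work *w);      /* e.g. the erase worker */
};

static struct work *works;             /* stand-in for ubi->works */

static void erase_worker(struct work *w)
{
    printf("erase PEB %d, then write a fresh EC header\n", w->pnum);
}

static void schedule_erase(int pnum)   /* what the loop above does per PEB */
{
    struct work *w = malloc(sizeof(*w));

    w->pnum = pnum;
    w->func = erase_worker;
    w->next = works;                   /* the kernel appends with list_add_tail() */
    works = w;
}

int main(void)
{
    schedule_erase(3);
    schedule_erase(7);

    while (works) {                    /* roughly what ubi_thread's loop does */
        struct work *w = works;

        works = w->next;
        w->func(w);
        free(w);
    }
    return 0;
}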

**********************************************************************************

       if (err)

              goto out_vtbl;

 

       err = ubi_eba_init_scan(ubi, si);

**********************************************************************************

We saw ubi_scan() above. That scan is the foundation on which ubifs is built, because all of the basic information about ubi and ubifs is constructed in memory during the scan. ubi_eba_init_scan() now uses that scanning information to set up the EBA (eraseblock association) subsystem, building the LEB-to-PEB mapping table for every volume.

**********************************************************************************

       if (err)

              goto out_wl;

 

       ubi_scan_destroy_si(si);

       return 0;

 

out_wl:

       ubi_wl_close(ubi);

out_vtbl:

       free_internal_volumes(ubi);

       vfree(ubi->vtbl);

out_si:

       ubi_scan_destroy_si(si);

       return err;

}

 

1.1 ubi_scan

struct ubi_scan_info *ubi_scan(struct ubi_device *ubi)

{

       int err, pnum;

       struct rb_node *rb1, *rb2;

       struct ubi_scan_volume *sv;

       struct ubi_scan_leb *seb;

       struct ubi_scan_info *si;

 

       si = kzalloc(sizeof(struct ubi_scan_info), GFP_KERNEL);

       if (!si)

              return ERR_PTR(-ENOMEM);

       INIT_LIST_HEAD(&si->corr);//initialize si's list of corrupted PEBs

       INIT_LIST_HEAD(&si->free);//initialize si's list of free PEBs

       INIT_LIST_HEAD(&si->erase);//initialize si's list of PEBs to be erased

       INIT_LIST_HEAD(&si->alien);//initialize si's list of alien PEBs

       si->volumes = RB_ROOT;

       si->is_empty = 1;

       err = -ENOMEM;

       ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);//allocate space for an EC header; it temporarily holds the EC header read from each PEB

       if (!ech)

              goto out_si;

 

       vidh = ubi_zalloc_vid_hdr(ubi, GFP_KERNEL);//allocate space for a VID header; it temporarily holds the VID header read from each PEB. Note that the whole point of scanning is to collect the information in the EC and VID headers and build the corresponding state in memory

       if (!vidh)

              goto out_ech;

 

       for (pnum = 0; pnum < ubi->peb_count; pnum++) {

              cond_resched();

 

              dbg_gen("process PEB %d", pnum);

              err = process_eb(ubi, si, pnum);//scan each physical eraseblock in turn

              if (err < 0)

                    goto out_vidh;

       }

 

       dbg_msg("scanning is finished");

 

       /* Calculate mean erase counter */

       if (si->ec_count)//compute the mean erase counter

              si->mean_ec = div_u64(si->ec_sum, si->ec_count);

 

       if (si->is_empty)//check whether this is an empty MTD device; if it is, the later mount will call create_default_filesystem() to create the initial ubifs data

              ubi_msg("empty MTD device detected");

 

       /*

        * Few corrupted PEBs are not a problem and may be just a result of

        * unclean reboots. However, many of them may indicate some problems

        * with the flash HW or driver. Print a warning in this case.

        */

       if (si->corr_count >= 8 || si->corr_count >= ubi->peb_count / 4) {

              ubi_warn("%d PEBs are corrupted", si->corr_count);

              printk(KERN_WARNING "corrupted PEBs are:");

              list_for_each_entry(seb, &si->corr, u.list)

                    printk(KERN_CONT " %d", seb->pnum);

              printk(KERN_CONT "\n");

       }

 

       /*

        * In case of unknown erase counter we use the mean erase counter

        * value.

        */

       ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {

              ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb)

                    if (seb->ec == UBI_SCAN_UNKNOWN_EC)

                           seb->ec = si->mean_ec;

       }

 

       list_for_each_entry(seb, &si->free, u.list) {

              if (seb->ec == UBI_SCAN_UNKNOWN_EC)

                    seb->ec = si->mean_ec;

       }

 

       list_for_each_entry(seb, &si->corr, u.list)

              if (seb->ec == UBI_SCAN_UNKNOWN_EC)

                    seb->ec = si->mean_ec;

 

       list_for_each_entry(seb, &si->erase, u.list)

              if (seb->ec == UBI_SCAN_UNKNOWN_EC)

                    seb->ec = si->mean_ec;

 

       err = paranoid_check_si(ubi, si);

       if (err) {

              if (err > 0)

                    err = -EINVAL;

              goto out_vidh;

       }

 

       ubi_free_vid_hdr(ubi, vidh);

       kfree(ech);

 

       return si;

 

out_vidh:

       ubi_free_vid_hdr(ubi, vidh);

out_ech:

       kfree(ech);

out_si:

       ubi_scan_destroy_si(si);

       return ERR_PTR(err);

}

1.2 process_eb

static int process_eb(struct ubi_device *ubi, struct ubi_scan_info *si, int pnum)

{

       long long uninitialized_var(ec);

       int err, bitflips = 0, vol_id, ec_corr = 0;

 

       dbg_bld("scan PEB %d", pnum);

 

       /* Skip bad physical eraseblocks */

       err = ubi_io_is_bad(ubi, pnum);

This checks whether the block is bad; it calls straight through to the MTD layer's mtd->block_isbad().

       if (err < 0)

              return err;

       else if (err) {

              /*

               * FIXME: this is actually duty of the I/O sub-system to

               * initialize this, but MTD does not provide enough

               * information.

               */

              si->bad_peb_count += 1;

              return 0;

       }

       err = ubi_io_read_ec_hdr(ubi, pnum, ech, 0);//read the EC header, normally the first page of the block

       if (err < 0)

              return err;

       else if (err == UBI_IO_BITFLIPS)

              bitflips = 1;

       else if (err == UBI_IO_PEB_EMPTY)

              return add_to_list(si, pnum, UBI_SCAN_UNKNOWN_EC, &si->erase);

//Note: why is a block that is empty (all 0xFF) still put on the si->erase list? Because what MTD calls empty and what UBI calls empty are not the same thing. To UBI, an empty block is a block that contains only an EC header. So these all-0xFF blocks must be erased and then have an EC header written to them

       else if (err == UBI_IO_BAD_EC_HDR) {

              /*

               * We have to also look at the VID header, possibly it is not

               * corrupted. Set %bitflips flag in order to make this PEB be

               * moved and EC be re-created.

               */

              ec_corr = 1;

              ec = UBI_SCAN_UNKNOWN_EC;

              bitflips = 1;

       }

 

       si->is_empty = 0;

 

       if (!ec_corr) {

              int image_seq;

 

              /* Make sure UBI version is OK */

              if (ech->version != UBI_VERSION) {

                    ubi_err("this UBI version is %d, image version is %d",

                           UBI_VERSION, (int)ech->version);

                    return -EINVAL;

              }

 

              ec = be64_to_cpu(ech->ec);

              if (ec > UBI_MAX_ERASECOUNTER) {

                    /*

                     * Erase counter overflow. The EC headers have 64 bits

                     * reserved, but we anyway make use of only 31 bit

                     * values, as this seems to be enough for any existing

                     * flash. Upgrade UBI and use 64-bit erase counters

                     * internally.

                     */

                    ubi_err("erase counter overflow, max is %d",

                           UBI_MAX_ERASECOUNTER);

                    ubi_dbg_dump_ec_hdr(ech);

                    return -EINVAL;

              }

 

              /*

               * Make sure that all PEBs have the same image sequence number.

               * This allows us to detect situations when users flash UBI

               * images incorrectly, so that the flash has the new UBI image

               * and leftovers from the old one. This feature was added

               * relatively recently, and the sequence number was always

               * zero, because old UBI implementations always set it to zero.

               * For this reasons, we do not panic if some PEBs have zero

               * sequence number, while other PEBs have non-zero sequence

               * number.

               */

              image_seq = be32_to_cpu(ech->image_seq);

              if (!ubi->image_seq && image_seq)

                    ubi->image_seq = image_seq;

              if (ubi->image_seq && image_seq &&

                  ubi->image_seq != image_seq) {

                    ubi_err("bad image sequence number %d in PEB %d, "

                           "expected %d", image_seq, pnum, ubi->image_seq);

                    ubi_dbg_dump_ec_hdr(ech);

                    return -EINVAL;

              }

       }

 

       /* OK, we've done with the EC header, let's look at the VID header */

 

       err = ubi_io_read_vid_hdr(ubi, pnum, vidh, 0);

       if (err < 0)

              return err;

       else if (err == UBI_IO_BITFLIPS)

              bitflips = 1;

       else if (err == UBI_IO_BAD_VID_HDR ||

               (err == UBI_IO_PEB_FREE && ec_corr)) {

//If a block's VID header is corrupted, the block is added to the corr list

              /* VID header is corrupted */

              err = add_to_list(si, pnum, ec, &si->corr);

              if (err)

                    return err;

              goto adjust_mean_ec;

       } else if (err == UBI_IO_PEB_FREE) {

//If the VID header is empty, i.e. the PEB carries only an EC header, add it to the free list so it can be handed out later.

              /* No VID header - the physical eraseblock is free */

              err = add_to_list(si, pnum, ec, &si->free);

              if (err)

                    return err;

              goto adjust_mean_ec;

       }

 

       vol_id = be32_to_cpu(vidh->vol_id);

       if (vol_id > UBI_MAX_VOLUMES && vol_id != UBI_LAYOUT_VOLUME_ID) {

//Check that vol_id is valid; UBI internally keeps a layout volume dedicated to storing the information about the user volumes

UBI maintains internal volumes to store UBI related information e.g. volume information, flash based erase block assignment tables

              int lnum = be32_to_cpu(vidh->lnum);

 

              /* Unsupported internal volume */

              switch (vidh->compat) {

              case UBI_COMPAT_DELETE:

                    ubi_msg("\"delete\" compatible internal volume %d:%d"

                           " found, remove it", vol_id, lnum);

                    err = add_to_list(si, pnum, ec, &si->corr);

                    if (err)

                           return err;

                    break;

 

              case UBI_COMPAT_RO:

                    ubi_msg("read-only compatible internal volume %d:%d"

                           " found, switch to read-only mode",

                           vol_id, lnum);

                    ubi->ro_mode = 1;

                    break;

 

              case UBI_COMPAT_PRESERVE:

                    ubi_msg("\"preserve\" compatible internal volume %d:%d"

                           " found", vol_id, lnum);

                    err = add_to_list(si, pnum, ec, &si->alien);

                    if (err)

                           return err;

                    si->alien_peb_count += 1;

                    return 0;

 

              case UBI_COMPAT_REJECT:

                    ubi_err("incompatible internal volume %d:%d found",

                           vol_id, lnum);

                    return -EINVAL;

              }

       }

 

       if (ec_corr)

              ubi_warn("valid VID header but corrupted EC header at PEB %d",

                     pnum);

//At this point we can conclude that this PEB is a valid UBI block, containing a valid EC header and a valid VID header

       err = ubi_scan_add_used(ubi, si, pnum, ec, vidh, bitflips);

       if (err)

              return err;

 

adjust_mean_ec:

       if (!ec_corr) {

              si->ec_sum += ec;

              si->ec_count += 1;

              if (ec > si->max_ec)

                    si->max_ec = ec;

              if (ec < si->min_ec)

                    si->min_ec = ec;

       }

 

       return 0;

}

1.3 ubi_scan_add_used

int ubi_scan_add_used (struct ubi_device *ubi, struct ubi_scan_info *si,int pnum, int ec, const struct ubi_vid_hdr *vid_hdr,int bitflips)

{

       int err, vol_id, lnum;

       unsigned long long sqnum;

       struct ubi_scan_volume *sv;

       struct ubi_scan_leb *seb;

       struct rb_node **p, *parent = NULL;

 

       vol_id = be32_to_cpu(vid_hdr->vol_id);

       lnum = be32_to_cpu(vid_hdr->lnum);

       sqnum = be64_to_cpu(vid_hdr->sqnum);

 

       dbg_bld("PEB %d, LEB %d:%d, EC %d, sqnum %llu, bitflips %d",

              pnum, vol_id, lnum, ec, sqnum, bitflips);

       sv = add_volume(si, vol_id, pnum, vid_hdr);

add_volume() looks at the volume ID read from pnum's VID header and builds the red-black tree of volumes in memory.

 

       if (IS_ERR(sv))

              return PTR_ERR(sv);

 

       if (si->max_sqnum < sqnum)

              si->max_sqnum = sqnum;

 

       /*

        * Walk the RB-tree of logical eraseblocks of volume @vol_id to look

        * if this is the first instance of this logical eraseblock or not.

        */

       p = &sv->root.rb_node;

       while (*p) {

              int cmp_res;

 

              parent = *p;

              seb = rb_entry(parent, struct ubi_scan_leb, u.rb);

              if (lnum != seb->lnum) {

                    if (lnum < seb->lnum)

                           p = &(*p)->rb_left;

                    else

                           p = &(*p)->rb_right;

                    continue;

              }

Build the red-black tree of ubi_scan_leb entries in memory.

              /*

               * There is already a physical eraseblock describing the same

               * logical eraseblock present.

               */

 

              dbg_bld("this LEB already exists: PEB %d, sqnum %llu, "

                    "EC %d", seb->pnum, seb->sqnum, seb->ec);

 

              /*

               * Make sure that the logical eraseblocks have different

               * sequence numbers. Otherwise the image is bad.

               *

               * However, if the sequence number is zero, we assume it must

               * be an ancient UBI image from the era when UBI did not have

               * sequence numbers. We still can attach these images, unless

               * there is a need to distinguish between old and new

               * eraseblocks, in which case we'll refuse the image in

               * 'compare_lebs()'. In other words, we attach old clean

               * images, but refuse attaching old images with duplicated

               * logical eraseblocks because there was an unclean reboot.

               */

              //Note the scope of the while (*p) loop above: reaching this point means we found, in the seb red-black tree, an entry that already describes the same LEB as pnum. What does that tell us? That several PEBs in UBI point to the same LEB.

              //sqnum is a continuously increasing 64-bit global counter which we assume never overflows, so seb->sqnum == sqnum is clearly impossible in a sane image

              if (seb->sqnum == sqnum && sqnum != 0) {

                    ubi_err("two LEBs with same sequence number %llu",

                           sqnum);

                    ubi_dbg_dump_seb(seb, 0);

                    ubi_dbg_dump_vid_hdr(vid_hdr);

                    return -EINVAL;

              }

 

              /*

               * Now we have to drop the older one and preserve the newer

               * one.

               */

//* @copy_flag: if this logical eraseblock was copied from another physical eraseblock (for wear-leveling reasons)

//If several PEBs point to the same LEB, an unclean reboot most likely happened during wear-leveling or while a file was being modified, and we must find out which of these PEBs is the newest. compare_lebs() does exactly that.

              cmp_res = compare_lebs(ubi, seb, pnum, vid_hdr);

              if (cmp_res < 0)

                    return cmp_res;

 

              if (cmp_res & 1) {

                    /*

                     * This logical eraseblock is newer then the one

                     * found earlier.

                     */

                    err = validate_vid_hdr(vid_hdr, sv, pnum);

                    if (err)

                           return err;

 

                    if (cmp_res & 4)

                           err = add_to_list(si, seb->pnum, seb->ec,

                                           &si->corr);

                    else

                           err = add_to_list(si, seb->pnum, seb->ec,

                                           &si->erase);

                    if (err)

                           return err;

 

                    seb->ec = ec;

                    seb->pnum = pnum;

                    seb->scrub = ((cmp_res & 2) || bitflips);

                    seb->sqnum = sqnum;

 

                    if (sv->highest_lnum == lnum)

                           sv->last_data_size =

                                  be32_to_cpu(vid_hdr->data_size);

 

                    return 0;

              } else {

                    /*

                     * This logical eraseblock is older than the one found

                     * previously.

                     */

                    if (cmp_res & 4)

                           return add_to_list(si, pnum, ec, &si->corr);

                    else

                           return add_to_list(si, pnum, ec, &si->erase);

              }

       }

       /*

        * We've met this logical eraseblock for the first time, add it to the

        * scanning information.

        */

       //Reaching this point means this is the first time we have met this LEB; we simply insert it into the volume's tree

       err = validate_vid_hdr(vid_hdr, sv, pnum);

       if (err)

              return err;

       seb = kmalloc(sizeof(struct ubi_scan_leb), GFP_KERNEL);

       if (!seb)

              return -ENOMEM;

       seb->ec = ec;

       seb->pnum = pnum;

       seb->lnum = lnum;

       seb->sqnum = sqnum;

       seb->scrub = bitflips;

       if (sv->highest_lnum <= lnum) {

              sv->highest_lnum = lnum;

              sv->last_data_size = be32_to_cpu(vid_hdr->data_size);

       }

       sv->leb_count += 1;

       rb_link_node(&seb->u.rb, parent, p);

       rb_insert_color(&seb->u.rb, &sv->root);

       return 0;

}

 

1.4 compare_lebs

static int compare_lebs(struct ubi_device *ubi, const struct ubi_scan_leb *seb,int pnum, const struct ubi_vid_hdr *vid_hdr)

{

       void *buf;

       int len, err, second_is_newer, bitflips = 0, corrupted = 0;

       uint32_t data_crc, crc;

       struct ubi_vid_hdr *vh = NULL;

       unsigned long long sqnum2 = be64_to_cpu(vid_hdr->sqnum);

       //check once more that the two sequence numbers are not equal

       if (sqnum2 == seb->sqnum) {

              /*

               * This must be a really ancient UBI image which has been

               * created before sequence numbers support has been added. At

               * that times we used 32-bit LEB versions stored in logical

               * eraseblocks. That was before UBI got into mainline. We do not

                * support these images anymore. Well, those images still
                * work, but only if no unclean reboots happened.

               */

              ubi_err("unsupported on-flash UBI format\n");

              return -EINVAL;

       }

 

       /* Obviously the LEB with lower sequence counter is older */

       //sqnum increases monotonically and is assumed never to overflow, so the PEB with the larger sqnum is taken to be the newer one
       second_is_newer = !!(sqnum2 > seb->sqnum);

       /*

        * Now we know which copy is newer. If the copy flag of the PEB with

        * newer version is not set, then we just return, otherwise we have to

        * check data CRC. For the second PEB we already have the VID header,

        * for the first one - we'll need to re-read it from flash.

        *

        * Note: this may be optimized so that we wouldn't read twice.

        */

 

       if (second_is_newer) {

              if (!vid_hdr->copy_flag) {

                    /* It is not a copy, so it is newer */

                    dbg_bld("second PEB %d is newer, copy_flag is unset",

                           pnum);

                    return 1;

              }

       } else {

       //If copy_flag is set, we can assume something went wrong during wear-leveling. Because an unclean reboot occurred, we must check whether the data in this newer PEB is complete (the copy may have been interrupted by the unclean reboot)

                    pnum = seb->pnum;

 

              vh = ubi_zalloc_vid_hdr(ubi, GFP_KERNEL);

              if (!vh)

                    return -ENOMEM;

 

              err = ubi_io_read_vid_hdr(ubi, pnum, vh, 0);

              if (err) {

                    if (err == UBI_IO_BITFLIPS)

                           bitflips = 1;

                    else {

                           dbg_err("VID of PEB %d header is bad, but it "

                                  "was OK earlier", pnum);

                           if (err > 0)

                                  err = -EIO;

 

                           goto out_free_vidh;

                    }

              }

 

              if (!vh->copy_flag) {

                    /* It is not a copy, so it is newer */

                    dbg_bld("first PEB %d is newer, copy_flag is unset",

                           pnum);

                    err = bitflips << 1;

                    goto out_free_vidh;

              }

 

              vid_hdr = vh;

       }

 

       /* Read the data of the copy and check the CRC */

 

       len = be32_to_cpu(vid_hdr->data_size);

       buf = vmalloc(len);

       if (!buf) {

              err = -ENOMEM;

              goto out_free_vidh;

       }

//OK, read out the data and verify the CRC

       err = ubi_io_read_data(ubi, buf, pnum, 0, len);

       if (err && err != UBI_IO_BITFLIPS && err != -EBADMSG)

              goto out_free_buf;

 

       data_crc = be32_to_cpu(vid_hdr->data_crc);

       crc = crc32(UBI_CRC32_INIT, buf, len);

       if (crc != data_crc) {

              dbg_bld("PEB %d CRC error: calculated %#08x, must be %#08x",

                    pnum, crc, data_crc);

              corrupted = 1;

              bitflips = 0;

              //If the CRC check fails, keep using the old PEB

              second_is_newer = !second_is_newer;

       } else {

              dbg_bld("PEB %d CRC is OK", pnum);

              bitflips = !!err;

       }

 

       vfree(buf);

       ubi_free_vid_hdr(ubi, vh);

 

       if (second_is_newer)

              dbg_bld("second PEB %d is newer, copy_flag is set", pnum);

       else

              dbg_bld("first PEB %d is newer, copy_flag is set", pnum);

 

       return second_is_newer | (bitflips << 1) | (corrupted << 2);

 

out_free_buf:

       vfree(buf);

out_free_vidh:

       ubi_free_vid_hdr(ubi, vh);

       return err;

}

 

2. Creating a volume

ubimkvol /dev/ubi0 -N ubifs -s 15MiB

The command above creates a volume named ubifs, 15 MiB in size, on UBI device 0.

The command is implemented through ioctl; the relevant code is shown below:

       /* Create volume command */

       case UBI_IOCMKVOL:

       {

              struct ubi_mkvol_req req;

              dbg_gen("create volume");

              err = copy_from_user(&req, argp, sizeof(struct ubi_mkvol_req));

              if (err) {

                     err = -EFAULT;

                     break;

              }

              req.name[req.name_len] = '\0';

              err = verify_mkvol_req(ubi, &req);

              if (err)

                     break;

              mutex_lock(&ubi->device_mutex);

              err = ubi_create_volume(ubi, &req);

              mutex_unlock(&ubi->device_mutex);

              if (err)

                     break;

              err = put_user(req.vol_id, (__user int32_t *)argp);

              if (err)

                     err = -EFAULT;

              break;

       }

The core of the handler is ubi_create_volume(). It is passed a structure of type ubi_mkvol_req:

struct ubi_mkvol_req {

       __s32 vol_id;//ID of the volume to create; may be left unspecified

       __s32 alignment;//The @alignment field specifies the required alignment of the volume logical eraseblock. This means, that the size of logical eraseblocks will be aligned to this number, i.e.,

(UBI device logical eraseblock size) mod (@alignment) = 0.

       __s64 bytes;//size of the volume in bytes

       __s8 vol_type;//volume type: static or dynamic

       __s8 padding1;

       __s16 name_len;//length of the volume name

       __s8 padding2[4];

       char name[UBI_MAX_VOLUME_NAME + 1];

} __attribute__ ((packed));
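As a worked example of how these fields turn into a number of reserved eraseblocks (the arithmetic matches the code in ubi_create_volume() below; the 126 KiB usable LEB size is an assumed, typical value for 128 KiB PEBs with 2 KiB pages):

#include <stdio.h>

int main(void)
{
    long long bytes = 15LL * 1024 * 1024;   /* ubimkvol -s 15MiB */
    int leb_size  = 126 * 1024;             /* assumed usable LEB size */
    int alignment = 1;                      /* default @alignment */

    int usable_leb_size = leb_size - leb_size % alignment;
    int reserved_pebs = (bytes + usable_leb_size - 1) / usable_leb_size;

    printf("usable_leb_size=%d reserved_pebs=%d\n",
           usable_leb_size, reserved_pebs);    /* 129024 and 122 */
    return 0;
}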

int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req)

{

       int i, err, vol_id = req->vol_id, do_free = 1;

       struct ubi_volume *vol;

       struct ubi_vtbl_record vtbl_rec;

       dev_t dev;

       if (ubi->ro_mode)

              return -EROFS;

       vol = kzalloc(sizeof(struct ubi_volume), GFP_KERNEL);

       if (!vol)

              return -ENOMEM;

       spin_lock(&ubi->volumes_lock);

       //If no vol_id was specified, obtain an ID the default way

       if (vol_id == UBI_VOL_NUM_AUTO) {

              /* Find unused volume ID */

              dbg_gen("search for vacant volume ID");

              for (i = 0; i < ubi->vtbl_slots; i++)

                    if (!ubi->volumes[i]) {

                           vol_id = i;

                           break;

                    }

              if (vol_id == UBI_VOL_NUM_AUTO) {

                    dbg_err("out of volume IDs");

                    err = -ENFILE;

                    goto out_unlock;

              }

              req->vol_id = vol_id;

       }

 

       dbg_gen("create device %d, volume %d, %llu bytes, type %d, name %s",

              ubi->ubi_num, vol_id, (unsigned long long)req->bytes,

              (int)req->vol_type, req->name);

 

       /* Ensure that this volume does not exist */

       err = -EEXIST;

       if (ubi->volumes[vol_id]) {

              dbg_err("volume %d already exists", vol_id);

              goto out_unlock;

       }

 

       /* Ensure that the name is unique */

       //Make sure the name of the volume to be created is unique, by comparing it against the existing volumes

       for (i = 0; i < ubi->vtbl_slots; i++)

              if (ubi->volumes[i] &&

                  ubi->volumes[i]->name_len == req->name_len &&

                  !strcmp(ubi->volumes[i]->name, req->name)) {

                    dbg_err("volume \"%s\" exists (ID %d)", req->name, i);

                    goto out_unlock;

              }

       //Compute from req->bytes how many eraseblocks are needed; the basic unit UBI operates on is the eraseblock

       /* Calculate how many eraseblocks are requested */

       vol->usable_leb_size = ubi->leb_size - ubi->leb_size % req->alignment;

       vol->reserved_pebs += div_u64(req->bytes + vol->usable_leb_size - 1,

                                 vol->usable_leb_size);

 

       /* Reserve physical eraseblocks */

       if (vol->reserved_pebs > ubi->avail_pebs) {

              dbg_err("not enough PEBs, only %d available", ubi->avail_pebs);

              err = -ENOSPC;

              goto out_unlock;

       }

       //The UBI device's count of available PEBs shrinks, because PEBs have now been reserved for the newly created volume

       ubi->avail_pebs -= vol->reserved_pebs;

       ubi->rsvd_pebs += vol->reserved_pebs;

       spin_unlock(&ubi->volumes_lock);

       //Initialize the fields of the newly created volume

       vol->vol_id   = vol_id;

       vol->alignment = req->alignment;

       vol->data_pad = ubi->leb_size % vol->alignment;

       vol->vol_type = req->vol_type;

       vol->name_len = req->name_len;

       memcpy(vol->name, req->name, vol->name_len);

       vol->ubi = ubi;

 

       /*

        * Finish all pending erases because there may be some LEBs belonging

        * to the same volume ID.

        */

       //Flush the pending works of the UBI background thread

       err = ubi_wl_flush(ubi);

       if (err)

              goto out_acc;

       //Create the eba_tbl table and initialize it to UBI_LEB_UNMAPPED; the entry for a given LEB is only really filled in when that LEB is first written

       vol->eba_tbl = kmalloc(vol->reserved_pebs * sizeof(int), GFP_KERNEL);

       if (!vol->eba_tbl) {

              err = -ENOMEM;

              goto out_acc;

       }

 

       for (i = 0; i < vol->reserved_pebs; i++)

              vol->eba_tbl[i] = UBI_LEB_UNMAPPED;

 

       if (vol->vol_type == UBI_DYNAMIC_VOLUME) {

              vol->used_ebs = vol->reserved_pebs;

              vol->last_eb_bytes = vol->usable_leb_size;

              vol->used_bytes =

                    (long long)vol->used_ebs * vol->usable_leb_size;

       } else {

              vol->used_ebs = div_u64_rem(vol->used_bytes,

                                      vol->usable_leb_size,

                                      &vol->last_eb_bytes);

              if (vol->last_eb_bytes != 0)

                    vol->used_ebs += 1;

              else

                    vol->last_eb_bytes = vol->usable_leb_size;

       }

 

       /* Register character device for the volume */

       //ubi volume注册字符接口

       cdev_init(&vol->cdev, &ubi_vol_cdev_operations);

       vol->cdev.owner = THIS_MODULE;

       dev = MKDEV(MAJOR(ubi->cdev.dev), vol_id + 1);

       err = cdev_add(&vol->cdev, dev, 1);

       if (err) {

              ubi_err("cannot add character device");

              goto out_mapping;

       }

 

       vol->dev.release = vol_release;

       vol->dev.parent = &ubi->dev;

       vol->dev.devt = dev;

       vol->dev.class = ubi_class;

       dev_set_name(&vol->dev, "%s_%d", ubi->ubi_name, vol->vol_id);

       err = device_register(&vol->dev);

       if (err) {

              ubi_err("cannot register device");

              goto out_cdev;

       }

       err = volume_sysfs_init(ubi, vol);

       if (err)

              goto out_sysfs;

 

       /* Fill volume table record */

       //UBI keeps an internal volume (the layout volume) holding the information of every volume; now that a new volume has been created, this internal volume must be updated

       memset(&vtbl_rec, 0, sizeof(struct ubi_vtbl_record));

       vtbl_rec.reserved_pebs = cpu_to_be32(vol->reserved_pebs);

       vtbl_rec.alignment    = cpu_to_be32(vol->alignment);

       vtbl_rec.data_pad     = cpu_to_be32(vol->data_pad);

       vtbl_rec.name_len     = cpu_to_be16(vol->name_len);

       if (vol->vol_type == UBI_DYNAMIC_VOLUME)

              vtbl_rec.vol_type = UBI_VID_DYNAMIC;

       else

              vtbl_rec.vol_type = UBI_VID_STATIC;

       memcpy(vtbl_rec.name, vol->name, vol->name_len);

       err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);

ubi_change_vtbl_record() performs the update as a ubi_eba_unmap_leb() followed by a ubi_eba_write_leb(), i.e. an out-of-place write, which keeps the on-flash table safe.
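The safety argument can be illustrated with a toy two-copy update (a sketch only; the real layout volume keeps the table in two LEBs and rewrites each copy out-of-place via unmap + write rather than overwriting it):

#include <stdio.h>
#include <string.h>

static char leb[2][64];   /* stand-ins for the layout volume's two LEBs */

/* Rewrite both copies of the volume table one after the other. A power
 * cut between the two writes still leaves one intact copy on flash,
 * which is the point of the unmap-then-write scheme. */
static void change_vtbl_record(const char *new_tbl)
{
    for (int i = 0; i < 2; i++) {
        memset(leb[i], 0, sizeof(leb[i]));              /* "unmap" the LEB */
        strncpy(leb[i], new_tbl, sizeof(leb[i]) - 1);   /* write the new table */
    }
}

int main(void)
{
    change_vtbl_record("vol0: name=ubifs reserved_pebs=122");
    printf("copy0=\"%s\"\ncopy1=\"%s\"\n", leb[0], leb[1]);
    return 0;
}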

 

       if (err)

              goto out_sysfs;

 

       spin_lock(&ubi->volumes_lock);

       ubi->volumes[vol_id] = vol;

       ubi->vol_count += 1;

       spin_unlock(&ubi->volumes_lock);

       //Notify interested modules that UBI has created a new volume so they can react; apparently the only notifier on this chain is gluebi_notifier

       ubi_volume_notify(ubi, vol, UBI_VOLUME_ADDED);

       if (paranoid_check_volumes(ubi))

              dbg_err("check failed while creating volume %d", vol_id);

       return err;

 

out_sysfs:

       /*

        * We have registered our device, we should not free the volume

        * description object in this function in case of an error - it is

        * freed by the release function.

        *

        * Get device reference to prevent the release function from being

        * called just after sysfs has been closed.

        */

       do_free = 0;

       get_device(&vol->dev);

       volume_sysfs_close(vol);

out_cdev:

       cdev_del(&vol->cdev);

out_mapping:

       if (do_free)

              kfree(vol->eba_tbl);

out_acc:

       spin_lock(&ubi->volumes_lock);

       ubi->rsvd_pebs -= vol->reserved_pebs;

       ubi->avail_pebs += vol->reserved_pebs;

out_unlock:

       spin_unlock(&ubi->volumes_lock);

       if (do_free)

              kfree(vol);

       else

              put_device(&vol->dev);

       ubi_err("cannot create volume %d, error %d", vol_id, err);

       return err;

}

 

3. The mount process

static int mount_ubifs(struct ubifs_info *c)

{

       struct super_block *sb = c->vfs_sb;

       int err, mounted_read_only = (sb->s_flags & MS_RDONLY);

       long long x;

       size_t sz;

       err = init_constants_early(c);

       if (err)

              return err;

       err = ubifs_debugging_init(c);

       if (err)

              return err;

       //determine whether the volume is empty, by checking whether any of its LEBs are mapped

       err = check_volume_empty(c);

       if (err)

              goto out_free;

       //If the volume is empty but the media or the mount is read-only, we obviously cannot write anything to it, and therefore cannot mount it

       if (c->empty && (mounted_read_only || c->ro_media)) {

              /*

               * This UBI volume is empty, and read-only, or the file system

               * is mounted read-only - we cannot format it.

               */

              ubifs_err("can't format empty UBI volume: read-only %s",

                      c->ro_media ? "UBI volume" : "mount");

              err = -EROFS;

              goto out_free;

       }

 

       if (c->ro_media && !mounted_read_only) {

              ubifs_err("cannot mount read-write - read-only media");

              err = -EROFS;

              goto out_free;

       }

 

       /*

        * The requirement for the buffer is that it should fit indexing B-tree

        * height amount of integers. We assume the height of the TNC tree will

        * never exceed 64.

        */

       err = -ENOMEM;

//bottom_up_buf: a buffer used by 'dirty_cow_bottom_up()' in tnc.c. As we will see later, dirty_cow_bottom_up() marks all of a znode's ancestors (its parent, its parent's parent, and so on up to the root) dirty, so before the marking it must record all the ancestor znodes; bottom_up_buf exists for that purpose.

       c->bottom_up_buf = kmalloc(BOTTOM_UP_HEIGHT * sizeof(int), GFP_KERNEL);

       if (!c->bottom_up_buf)

              goto out_free;
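A self-contained sketch of the idea (names are illustrative; the real dirty_cow_bottom_up() records the path into c->bottom_up_buf and then copy-on-writes each recorded level from the root down):

#include <stdio.h>

#define BOTTOM_UP_HEIGHT 64      /* assumed upper bound on the TNC height */

struct znode {
    struct znode *parent;
    int iip;                     /* index in the parent's branch array */
};

/* Record the child-index path from @z up to the root in @buf; the
 * caller can then walk top-down and dirty/copy each level. Returns
 * the path length. */
static int record_path_bottom_up(struct znode *z, int *buf)
{
    int n = 0;

    while (z->parent && n < BOTTOM_UP_HEIGHT) {
        buf[n++] = z->iip;
        z = z->parent;
    }
    return n;
}

int main(void)
{
    struct znode root = { NULL, 0 }, mid = { &root, 3 }, leaf = { &mid, 1 };
    int buf[BOTTOM_UP_HEIGHT];
    int n = record_path_bottom_up(&leaf, buf);

    printf("path length %d (leaf iip %d, parent iip %d)\n", n, buf[0], buf[1]);
    return 0;
}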

       //sbuf: LEB-sized buffer to use

       c->sbuf = vmalloc(c->leb_size);

       if (!c->sbuf)

              goto out_free;

 

       if (!mounted_read_only) {

              //@ileb_buf: buffer for commit in-the-gaps method

              c->ileb_buf = vmalloc(c->leb_size);

              if (!c->ileb_buf)

                    goto out_free;

       }

 

       if (c->bulk_read == 1)

              //initialize the bulk-read state; a detailed explanation of bulk-read can be found in the discussion of the VFS read path

              bu_init(c);

 

       /*

        * We have to check all CRCs, even for data nodes, when we mount the FS

        * (specifically, when we are replaying).

        */

       c->always_chk_crc = 1;

       //Read the superblock; if the volume is empty there is obviously no superblock yet, and an initial file system must be created first

       err = ubifs_read_superblock(c);

       if (err)

              goto out_free;

 

       /*

        * Make sure the compressor which is set as default in the superblock

        * or overridden by mount options is actually compiled in.

        */

       if (!ubifs_compr_present(c->default_compr)) {

              ubifs_err("'compressor \"%s\" is not compiled in",

                      ubifs_compr_name(c->default_compr));

              err = -ENOTSUPP;

              goto out_free;

       }

//initialize some ubifs constants

       err = init_constants_sb(c);

       if (err)

              goto out_free;

       sz = ALIGN(c->max_idx_node_sz, c->min_io_size);

       sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);

       c->cbuf = kmalloc(sz, GFP_NOFS);

       if (!c->cbuf) {

              err = -ENOMEM;

              goto out_free;

       }

       sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);

       if (!mounted_read_only) {

              err = alloc_wbufs(c);

              if (err)

                    goto out_cbuf;

 

              /* Create background thread */

              //Create the UBIFS background thread, used mainly for wbuf-based I/O

              c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);

              if (IS_ERR(c->bgt)) {

                    err = PTR_ERR(c->bgt);

                    c->bgt = NULL;

                    ubifs_err("cannot spawn \"%s\", error %d",

                             c->bgt_name, err);

                    goto out_wbufs;

              }

             //wake the thread up

              wake_up_process(c->bgt);

       }


 

       err = ubifs_read_master(c);

       if (err)

              goto out_master;

 

       init_constants_master(c);

 

       if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {

              ubifs_msg("recovery needed");

              c->need_recovery = 1;

              if (!mounted_read_only) {

                    err = ubifs_recover_inl_heads(c, c->sbuf);

                    if (err)

                           goto out_master;

              }

       } else if (!mounted_read_only) {

              /*

               * Set the "dirty" flag so that if we reboot uncleanly we

               * will notice this immediately on the next mount.

               */

              c->mst_node->flags |= cpu_to_le32(UBIFS_MST_DIRTY);

              err = ubifs_write_master(c);

              if (err)

                    goto out_master;

       }

 

       err = ubifs_lpt_init(c, 1, !mounted_read_only);

       if (err)

              goto out_lpt;

 

       err = dbg_check_idx_size(c, c->old_idx_sz);

       if (err)

              goto out_lpt;

 

       err = ubifs_replay_journal(c);

       if (err)

              goto out_journal;

 

       /* Calculate 'min_idx_lebs' after journal replay */

       c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);

 

       err = ubifs_mount_orphans(c, c->need_recovery, mounted_read_only);

       if (err)

              goto out_orphans;

 

       if (!mounted_read_only) {

              int lnum;

 

              err = check_free_space(c);

              if (err)

                    goto out_orphans;

 

              /* Check for enough log space */

              lnum = c->lhead_lnum + 1;

              if (lnum >= UBIFS_LOG_LNUM + c->log_lebs)

                    lnum = UBIFS_LOG_LNUM;

              if (lnum == c->ltail_lnum) {

                    err = ubifs_consolidate_log(c);

                    if (err)

                           goto out_orphans;

              }

 

              if (c->need_recovery) {

                    err = ubifs_recover_size(c);

                    if (err)

                           goto out_orphans;

                    err = ubifs_rcvry_gc_commit(c);

              } else {

                    err = take_gc_lnum(c);

                    if (err)

                           goto out_orphans;

 

                    /*

                     * GC LEB may contain garbage if there was an unclean

                     * reboot, and it should be un-mapped.

                     */

                    err = ubifs_leb_unmap(c, c->gc_lnum);

                    if (err)

                           return err;

              }

 

              err = dbg_check_lprops(c);

              if (err)

                    goto out_orphans;

       } else if (c->need_recovery) {

              err = ubifs_recover_size(c);

              if (err)

                    goto out_orphans;

       } else {

              /*

               * Even if we mount read-only, we have to set space in GC LEB

               * to proper value because this affects UBIFS free space

               * reporting. We do not want to have a situation when

               * re-mounting from R/O to R/W changes amount of free space.

               */

              err = take_gc_lnum(c);

              if (err)

                    goto out_orphans;

       }

 

       spin_lock(&ubifs_infos_lock);

       list_add_tail(&c->infos_list, &ubifs_infos);

       spin_unlock(&ubifs_infos_lock);

 

       if (c->need_recovery) {

              if (mounted_read_only)

                    ubifs_msg("recovery deferred");

              else {

                    c->need_recovery = 0;

                    ubifs_msg("recovery completed");

                    /*

                     * GC LEB has to be empty and taken at this point. But

                     * the journal head LEBs may also be accounted as

                     * "empty taken" if they are empty.

                     */

                    ubifs_assert(c->lst.taken_empty_lebs > 0);

              }

       } else

              ubifs_assert(c->lst.taken_empty_lebs > 0);

 

       err = dbg_check_filesystem(c);

       if (err)

              goto out_infos;

 

       err = dbg_debugfs_init_fs(c);

       if (err)

              goto out_infos;

 

       c->always_chk_crc = 0;

 

       ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",

                c->vi.ubi_num, c->vi.vol_id, c->vi.name);

       if (mounted_read_only)

              ubifs_msg("mounted read-only");

       x = (long long)c->main_lebs * c->leb_size;

       ubifs_msg("file system size:  %lld bytes (%lld KiB, %lld MiB, %d "

                "LEBs)", x, x >> 10, x >> 20, c->main_lebs);

       x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;

       ubifs_msg("journal size:      %lld bytes (%lld KiB, %lld MiB, %d "

                "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);

       ubifs_msg("media format:      w%d/r%d (latest is w%d/r%d)",

                c->fmt_version, c->ro_compat_version,

                UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);

       ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));

       ubifs_msg("reserved for root: %llu bytes (%llu KiB)",

              c->report_rp_size, c->report_rp_size >> 10);

 

       dbg_msg("compiled on:        " __DATE__ " at " __TIME__);

       dbg_msg("min. I/O unit size: %d bytes", c->min_io_size);

       dbg_msg("LEB size:           %d bytes (%d KiB)",

              c->leb_size, c->leb_size >> 10);

       dbg_msg("data journal heads: %d",

              c->jhead_cnt - NONDATA_JHEADS_CNT);

       dbg_msg("UUID:               %02X%02X%02X%02X-%02X%02X"

             "-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X",

             c->uuid[0], c->uuid[1], c->uuid[2], c->uuid[3],

             c->uuid[4], c->uuid[5], c->uuid[6], c->uuid[7],

             c->uuid[8], c->uuid[9], c->uuid[10], c->uuid[11],

             c->uuid[12], c->uuid[13], c->uuid[14], c->uuid[15]);

       dbg_msg("big_lpt             %d", c->big_lpt);

       dbg_msg("log LEBs:           %d (%d - %d)",

              c->log_lebs, UBIFS_LOG_LNUM, c->log_last);

       dbg_msg("LPT area LEBs:      %d (%d - %d)",

              c->lpt_lebs, c->lpt_first, c->lpt_last);

       dbg_msg("orphan area LEBs:   %d (%d - %d)",

              c->orph_lebs, c->orph_first, c->orph_last);

       dbg_msg("main area LEBs:     %d (%d - %d)",

              c->main_lebs, c->main_first, c->leb_cnt - 1);

       dbg_msg("index LEBs:         %d", c->lst.idx_lebs);

       dbg_msg("total index bytes:  %lld (%lld KiB, %lld MiB)",

              c->old_idx_sz, c->old_idx_sz >> 10, c->old_idx_sz >> 20);

       dbg_msg("key hash type:      %d", c->key_hash_type);

       dbg_msg("tree fanout:        %d", c->fanout);

       dbg_msg("reserved GC LEB:    %d", c->gc_lnum);

       dbg_msg("first main LEB:     %d", c->main_first);

       dbg_msg("max. znode size     %d", c->max_znode_sz);

       dbg_msg("max. index node size %d", c->max_idx_node_sz);

       dbg_msg("node sizes:         data %zu, inode %zu, dentry %zu",

              UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);

       dbg_msg("node sizes:         trun %zu, sb %zu, master %zu",

              UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);

       dbg_msg("node sizes:         ref %zu, cmt. start %zu, orph %zu",

              UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);

       dbg_msg("max. node sizes:    data %zu, inode %zu dentry %zu",

              UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,

              UBIFS_MAX_DENT_NODE_SZ);

       dbg_msg("dead watermark:     %d", c->dead_wm);

       dbg_msg("dark watermark:     %d", c->dark_wm);

       dbg_msg("LEB overhead:       %d", c->leb_overhead);

       x = (long long)c->main_lebs * c->dark_wm;

       dbg_msg("max. dark space:    %lld (%lld KiB, %lld MiB)",

              x, x >> 10, x >> 20);

       dbg_msg("maximum bud bytes:  %lld (%lld KiB, %lld MiB)",

              c->max_bud_bytes, c->max_bud_bytes >> 10,

              c->max_bud_bytes >> 20);

       dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",

              c->bg_bud_bytes, c->bg_bud_bytes >> 10,

              c->bg_bud_bytes >> 20);

       dbg_msg("current bud bytes   %lld (%lld KiB, %lld MiB)",

              c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);

       dbg_msg("max. seq. number:   %llu", c->max_sqnum);

       dbg_msg("commit number:      %llu", c->cmt_no);

 

       return 0;

 

out_infos:

       spin_lock(&ubifs_infos_lock);

       list_del(&c->infos_list);

       spin_unlock(&ubifs_infos_lock);

out_orphans:

       free_orphans(c);

out_journal:

       destroy_journal(c);

out_lpt:

       ubifs_lpt_free(c, 0);

out_master:

       kfree(c->mst_node);

       kfree(c->rcvrd_mst_node);

       if (c->bgt)

              kthread_stop(c->bgt);

out_wbufs:

       free_wbufs(c);

out_cbuf:

       kfree(c->cbuf);

out_free:

       kfree(c->bu.buf);

       vfree(c->ileb_buf);

       vfree(c->sbuf);

       kfree(c->bottom_up_buf);

       ubifs_debugging_exit(c);

       return err;

}

 

3.1 ubifs_read_superblock

int ubifs_read_superblock(struct ubifs_info *c)

{

       int err, sup_flags;

       struct ubifs_sb_node *sup;

       //If the earlier scan found that none of the LEBs in this volume are mapped, the volume is empty and holds no information at all. In that case an initial file system must be created, which simply means writing a superblock node (LEB 0), master nodes (LEB 1 and LEB 2), a commit node (LEB 3), an inode node (main_first + 1) and an index node (main_first + 0).

       //These nodes deserve a closer look. Every file system has a superblock holding its basic information; here ubifs writes the superblock onto the flash media as a node of type superblock.

       //As 'A Brief Introduction to the Design of UBIFS' explains, files are organized in a node structure for the sake of garbage collection. In jffs2 the corresponding data structures are built at mount time, which costs a great deal of time and memory; in ubifs this data is kept on the flash media itself, and the master node is the root of that tree of information. The master node is stored in duplicate, one copy in LEB 1 and one in LEB 2. Why two copies?

Because when a file is updated the data in the B+tree changes and the master node must be updated accordingly; to keep the data from being destroyed if an unclean reboot happens while the master node is being updated, two copies are kept, which makes recovery after an unclean reboot possible.
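A toy illustration of why the duplicate helps (a sketch under simplifying assumptions; the real logic in ubifs_read_master() and the recovery code is considerably more careful):

#include <stdio.h>

struct mst_copy {
    int valid;                /* CRC check passed */
    long long cmt_no;         /* commit number recorded in the node */
};

/* Pick the usable master copy: if both are valid take the newer one,
 * otherwise fall back to whichever copy survived the unclean reboot. */
static const struct mst_copy *pick_master(const struct mst_copy *m1,
                                          const struct mst_copy *m2)
{
    if (m1->valid && m2->valid)
        return m1->cmt_no >= m2->cmt_no ? m1 : m2;
    if (m1->valid)
        return m1;
    return m2->valid ? m2 : NULL;
}

int main(void)
{
    struct mst_copy a = { 1, 42 }, b = { 0, 41 };   /* copy 2 got corrupted */
    const struct mst_copy *m = pick_master(&a, &b);

    printf("use the copy with cmt_no %lld\n", m ? m->cmt_no : -1LL);
    return 0;
}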

       if (c->empty) {

              err = create_default_filesystem(c);

              if (err)

                    return err;

       }

       //Read the superblock; it may of course be the one that create_default_filesystem() above has just written.

       sup = ubifs_read_sb_node(c);

       if (IS_ERR(sup))

              return PTR_ERR(sup);

 

       c->fmt_version = le32_to_cpu(sup->fmt_version);

       c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);

 

       /*

        * The software supports all previous versions but not future versions,

        * due to the unavailability of time-travelling equipment.

        */

       if (c->fmt_version > UBIFS_FORMAT_VERSION) {

              struct super_block *sb = c->vfs_sb;

              int mounting_ro = sb->s_flags & MS_RDONLY;

 

              ubifs_assert(!c->ro_media || mounting_ro);

              if (!mounting_ro ||

                  c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {

                    ubifs_err("on-flash format version is w%d/r%d, but "

                             "software only supports up to version "

                             "w%d/r%d", c->fmt_version,

                             c->ro_compat_version, UBIFS_FORMAT_VERSION,

                             UBIFS_RO_COMPAT_VERSION);

                    if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {

                           ubifs_msg("only R/O mounting is possible");

                           err = -EROFS;

                    } else

                           err = -EINVAL;

                    goto out;

              }

 

              /*

               * The FS is mounted R/O, and the media format is

               * R/O-compatible with the UBIFS implementation, so we can

               * mount.

               */

              c->rw_incompat = 1;

       }

 

       if (c->fmt_version < 3) {

              ubifs_err("on-flash format version %d is not supported",

                      c->fmt_version);

              err = -EINVAL;

              goto out;

       }

 

       //which key hash function to use
       switch (sup->key_hash) {

       case UBIFS_KEY_HASH_R5:

              c->key_hash = key_r5_hash;

              c->key_hash_type = UBIFS_KEY_HASH_R5;

              break;

 

       case UBIFS_KEY_HASH_TEST:

              c->key_hash = key_test_hash;

              c->key_hash_type = UBIFS_KEY_HASH_TEST;

              break;

       };

 

       c->key_fmt = sup->key_fmt;

 

       switch (c->key_fmt) {

       case UBIFS_SIMPLE_KEY_FMT:

              c->key_len = UBIFS_SK_LEN;

              break;

       default:

              ubifs_err("unsupported key format");

              err = -EINVAL;

              goto out;

       }

       //Initialize the in-memory ubifs_info structure with the information read from the superblock

       c->leb_cnt      = le32_to_cpu(sup->leb_cnt);

       c->max_leb_cnt  = le32_to_cpu(sup->max_leb_cnt);

       c->max_bud_bytes = le64_to_cpu(sup->max_bud_bytes);

       c->log_lebs     = le32_to_cpu(sup->log_lebs);

       c->lpt_lebs     = le32_to_cpu(sup->lpt_lebs);

       c->orph_lebs    = le32_to_cpu(sup->orph_lebs);

       c->jhead_cnt    = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;

       c->fanout       = le32_to_cpu(sup->fanout);

       c->lsave_cnt    = le32_to_cpu(sup->lsave_cnt);

       c->rp_size      = le64_to_cpu(sup->rp_size);

       c->rp_uid       = le32_to_cpu(sup->rp_uid);

       c->rp_gid       = le32_to_cpu(sup->rp_gid);

       sup_flags       = le32_to_cpu(sup->flags);

       if (!c->mount_opts.override_compr)

              c->default_compr = le16_to_cpu(sup->default_compr);

 

       c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);

       memcpy(&c->uuid, &sup->uuid, 16);

       c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);

 

       /* Automatically increase file system size to the maximum size */

       //A UBI volume can be resized, i.e. its size can change. If it grew, the superblock has to be rewritten

       c->old_leb_cnt = c->leb_cnt;

       if (c->leb_cnt < c->vi.size && c->leb_cnt < c->max_leb_cnt) {

              c->leb_cnt = min_t(int, c->max_leb_cnt, c->vi.size);

              if (c->vfs_sb->s_flags & MS_RDONLY)

                    dbg_mnt("Auto resizing (ro) from %d LEBs to %d LEBs",

                           c->old_leb_cnt,      c->leb_cnt);

              else {

                    dbg_mnt("Auto resizing (sb) from %d LEBs to %d LEBs",

                           c->old_leb_cnt, c->leb_cnt);

                    sup->leb_cnt = cpu_to_le32(c->leb_cnt);

                    err = ubifs_write_sb_node(c, sup);

                    if (err)

                           goto out;

                    c->old_leb_cnt = c->leb_cnt;

              }

       }

 

       c->log_bytes = (long long)c->log_lebs * c->leb_size;

       c->log_last = UBIFS_LOG_LNUM + c->log_lebs - 1;

       c->lpt_first = UBIFS_LOG_LNUM + c->log_lebs;

       c->lpt_last = c->lpt_first + c->lpt_lebs - 1;

       c->orph_first = c->lpt_last + 1;

       c->orph_last = c->orph_first + c->orph_lebs - 1;

       c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;

       c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;

       c->main_first = c->leb_cnt - c->main_lebs;
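       /*
        * Worked example of the layout arithmetic above (hypothetical numbers:
        * leb_cnt = 128 with log_lebs = 5, lpt_lebs = 2, orph_lebs = 1):
        *
        *   LEB 0          superblock           (UBIFS_SB_LEBS = 1)
        *   LEB 1 - 2      master node copies   (UBIFS_MST_LEBS = 2)
        *   LEB 3 - 7      log                  (UBIFS_LOG_LNUM = 3, log_last = 7)
        *   LEB 8 - 9      LPT                  (lpt_first = 8, lpt_last = 9)
        *   LEB 10         orphan area
        *   LEB 11 - 127   main area            (main_lebs = 128-1-2-5-2-1 = 117,
        *                                        main_first = 128 - 117 = 11)
        */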

 

       err = validate_sb(c, sup);

out:

       kfree(sup);

       return err;

}
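As an aside, the R5 hash selected in the key_hash switch above is the reiserfs r5 string hash of the file name. A minimal sketch of the idea, modeled on key_r5_hash() in fs/ubifs/key.h (simplified: the real function also masks the result and keeps the lowest hash values reserved for ".", ".." and the readdir end marker):

static uint32_t r5_hash_sketch(const char *s)
{
       uint32_t a = 0;
       const signed char *str = (const signed char *)s;

       while (*str) {
              a += *str << 4;
              a += *str >> 4;
              a *= 11;
              str++;
       }
       return a;
}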

 

3.2 create_default_filesystem

static int create_default_filesystem(struct ubifs_info *c)

{

       struct ubifs_sb_node *sup;

       struct ubifs_mst_node *mst;

       struct ubifs_idx_node *idx;

       struct ubifs_branch *br;

       struct ubifs_ino_node *ino;

       struct ubifs_cs_node *cs;

       union ubifs_key key;

       int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;

       int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;

       int min_leb_cnt = UBIFS_MIN_LEB_CNT;

       long long tmp64, main_bytes;

       __le64 tmp_le64;

 

       /* Some functions called from here depend on the @c->key_len field */

       c->key_len = UBIFS_SK_LEN;

 

       /*

        * First of all, we have to calculate default file-system geometry -

        * log size, journal size, etc.

        */

       //First compute the sizes of the journal and log areas from the file-system size. The purpose of the journal was mentioned earlier: because the B+tree describing files is kept on the flash media, every file update would otherwise force an update of the related B+tree information, causing frequent flash I/O and hurting performance. With a journal, updates are first recorded as log entries, and only when the journal fills up is the on-flash B+tree updated in one go. This lowers the update frequency and improves file-system performance.

       if (c->leb_cnt < 0x7FFFFFFF / DEFAULT_JNL_PERCENT)

              /* We can first multiply then divide and have no overflow */

              jnl_lebs = c->leb_cnt * DEFAULT_JNL_PERCENT / 100;

       else

              jnl_lebs = (c->leb_cnt / 100) * DEFAULT_JNL_PERCENT;

 

       if (jnl_lebs < UBIFS_MIN_JNL_LEBS)

              jnl_lebs = UBIFS_MIN_JNL_LEBS;

       if (jnl_lebs * c->leb_size > DEFAULT_MAX_JNL)

              jnl_lebs = DEFAULT_MAX_JNL / c->leb_size;

 

       /*

        * The log should be large enough to fit reference nodes for all bud

        * LEBs. Because buds do not have to start from the beginning of LEBs

        * (half of the LEB may contain committed data), the log should

        * generally be larger, make it twice as large.

        */

       tmp = 2 * (c->ref_node_alsz * jnl_lebs) + c->leb_size - 1;

       log_lebs = tmp / c->leb_size;

       /* Plus one LEB reserved for commit */

       log_lebs += 1;

       if (c->leb_cnt - min_leb_cnt > 8) {

              /* And some extra space to allow writes while committing */

              log_lebs += 1;

              min_leb_cnt += 1;

       }

 

       max_buds = jnl_lebs - log_lebs;

       if (max_buds < UBIFS_MIN_BUD_LEBS)

              max_buds = UBIFS_MIN_BUD_LEBS;

 

       /*

        * Orphan nodes are stored in a separate area. One node can store a lot

        * of orphan inode numbers, but when new orphan comes we just add a new

        * orphan node. At some point the nodes are consolidated into one

        * orphan node.

        */

       //An orphan is an inode number whose inode node has been committed to the index with a link count of zero. That happens when an open file is deleted (unlinked) and then a commit is run.

       //The orphan area is a fixed number of LEBs situated between the LPT area and the main area.

       //In other words, an orphan is a victim: when an inode's link count drops to zero, the file has to be deleted. To survive an unclean reboot in the middle of deletion, UBIFS records the files awaiting deletion in the orphan area; after an unclean reboot the file system then knows exactly which files still need deleting without scanning the whole partition, and the GC subsystem can reclaim their space when free space runs out. The information about orphans is kept in the orphan area.

       orph_lebs = UBIFS_MIN_ORPH_LEBS;

#ifdef CONFIG_UBIFS_FS_DEBUG

       if (c->leb_cnt - min_leb_cnt > 1)

              /*

               * For debugging purposes it is better to have at least 2

               * orphan LEBs, because the orphan subsystem would need to do

               * consolidations and would be stressed more.

               */

              orph_lebs += 1;

#endif

 

       main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;

       main_lebs -= orph_lebs;

       //As mentioned above, the orphan area sits between the LPT area and the main area. What is the LPT? LPT = LEB Properties Tree

       lpt_first = UBIFS_LOG_LNUM + log_lebs;

       c->lsave_cnt = DEFAULT_LSAVE_CNT;

       c->max_leb_cnt = c->leb_cnt;

       err =ubifs_create_dflt_lpt(c, &main_lebs, lpt_first, &lpt_lebs,

                               &big_lpt);

*********************************************************************************

ubifs_create_dflt_lpt works out how many LEBs the LPT needs. The LPT describes, for every LEB in UBIFS, its free and dirty bytes. "Dirty" here does not quite mean modified: judging from pnode->lprops[0].dirty = iopos - node_sz, it is roughly space that holds nothing useful but that nobody else can use either. Flash is written in units of pages, so if only half of a page carries data, the other half is dirty: nothing was written there, yet it cannot be used. Per the design document, "Dirty space is the number of bytes taken up by obsolete nodes and padding, that can potentially be reclaimed by garbage collection". Because the LPT area occupies LEBs of its own, it needs a table for itself as well, much like the kernel building its own page tables during boot. Concretely the function:

a) creates LEB properties for the LEBs occupied by the root index node and the root inode node

b) builds the information for all remaining pnodes and writes it out to the flash media

 

**********************************************************************************

if (err)

              return err;

       dbg_gen("LEB Properties Tree created (LEBs %d-%d)", lpt_first,

              lpt_first + lpt_lebs - 1);

 

       main_first = c->leb_cnt - main_lebs;

 

       /* Create default superblock */

       tmp = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);

       sup = kzalloc(tmp, GFP_KERNEL);

       if (!sup)

              return -ENOMEM;

 

       tmp64 = (long long)max_buds * c->leb_size;

       if (big_lpt)

              sup_flags |= UBIFS_FLG_BIGLPT;

       //Initialize the superblock node

       sup->ch.node_type = UBIFS_SB_NODE;

       sup->key_hash     = UBIFS_KEY_HASH_R5;

       sup->flags        = cpu_to_le32(sup_flags);

       sup->min_io_size  = cpu_to_le32(c->min_io_size);

       sup->leb_size     = cpu_to_le32(c->leb_size);

       sup->leb_cnt      = cpu_to_le32(c->leb_cnt);

       sup->max_leb_cnt  = cpu_to_le32(c->max_leb_cnt);

       sup->max_bud_bytes = cpu_to_le64(tmp64);

       sup->log_lebs     = cpu_to_le32(log_lebs);

       sup->lpt_lebs     = cpu_to_le32(lpt_lebs);

       sup->orph_lebs    = cpu_to_le32(orph_lebs);

       sup->jhead_cnt    = cpu_to_le32(DEFAULT_JHEADS_CNT);

       sup->fanout       = cpu_to_le32(DEFAULT_FANOUT);

       sup->lsave_cnt    = cpu_to_le32(c->lsave_cnt);

       sup->fmt_version  = cpu_to_le32(UBIFS_FORMAT_VERSION);

       sup->time_gran    = cpu_to_le32(DEFAULT_TIME_GRAN);

       if (c->mount_opts.override_compr)

              sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);

       else

              sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);

 

       generate_random_uuid(sup->uuid);

 

       main_bytes = (long long)main_lebs * c->leb_size;

       tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);

       if (tmp64 > DEFAULT_MAX_RP_SIZE)

              tmp64 = DEFAULT_MAX_RP_SIZE;

       sup->rp_size = cpu_to_le64(tmp64);

       sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);

       //Write the superblock node to LEB 0

       err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);

       kfree(sup);

       if (err)

              return err;

 

       dbg_gen("default superblock created at LEB 0:0");

 

       /* Create default master node */

       mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);

       if (!mst)

              return -ENOMEM;

       //Initialize the master node

       mst->ch.node_type = UBIFS_MST_NODE;

       mst->log_lnum    = cpu_to_le32(UBIFS_LOG_LNUM);

       mst->highest_inum = cpu_to_le64(UBIFS_FIRST_INO);

       mst->cmt_no      = 0;

       mst->root_lnum   = cpu_to_le32(main_first + DEFAULT_IDX_LEB);

       mst->root_offs   = 0;

       tmp = ubifs_idx_node_sz(c, 1);

       mst->root_len    = cpu_to_le32(tmp);

       mst->gc_lnum     = cpu_to_le32(main_first + DEFAULT_GC_LEB);

       mst->ihead_lnum  = cpu_to_le32(main_first + DEFAULT_IDX_LEB);

       mst->ihead_offs  = cpu_to_le32(ALIGN(tmp, c->min_io_size));

       mst->index_size  = cpu_to_le64(ALIGN(tmp, 8));

       mst->lpt_lnum    = cpu_to_le32(c->lpt_lnum);

       mst->lpt_offs    = cpu_to_le32(c->lpt_offs);

       mst->nhead_lnum  = cpu_to_le32(c->nhead_lnum);

       mst->nhead_offs  = cpu_to_le32(c->nhead_offs);

       mst->ltab_lnum   = cpu_to_le32(c->ltab_lnum);

       mst->ltab_offs   = cpu_to_le32(c->ltab_offs);

       mst->lsave_lnum  = cpu_to_le32(c->lsave_lnum);

       mst->lsave_offs  = cpu_to_le32(c->lsave_offs);

       mst->lscan_lnum  = cpu_to_le32(main_first);

       mst->empty_lebs  = cpu_to_le32(main_lebs - 2);

       mst->idx_lebs    = cpu_to_le32(1);

       mst->leb_cnt     = cpu_to_le32(c->leb_cnt);

 

       /* Calculate lprops statistics */

       tmp64 = main_bytes;

       tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);

       tmp64 -= ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);

       mst->total_free = cpu_to_le64(tmp64);

 

       tmp64 = ALIGN(ubifs_idx_node_sz(c, 1), c->min_io_size);

       ino_waste = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size) -

                      UBIFS_INO_NODE_SZ;

       tmp64 += ino_waste;

       tmp64 -= ALIGN(ubifs_idx_node_sz(c, 1), 8);

       mst->total_dirty = cpu_to_le64(tmp64);

 

       /* The indexing LEB does not contribute to dark space */

       tmp64 = (c->main_lebs - 1) * c->dark_wm;

       mst->total_dark = cpu_to_le64(tmp64);

 

       mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);

        //The master node is written in duplicate

       err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,

                           UBI_UNKNOWN);

       if (err) {

              kfree(mst);

              return err;

       }

       err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0,

                           UBI_UNKNOWN);

       kfree(mst);

       if (err)

              return err;

 

       dbg_gen("default master node created at LEB %d:0", UBIFS_MST_LNUM);

 

       /* Create the root indexing node */

       tmp = ubifs_idx_node_sz(c, 1);

       //The idx node: judging from the descriptions in tnc.c, the zbranch member of idx and the make_idx_node function, idx nodes are how the TNC tree is stored on the flash media.

In memory the kernel represents an on-flash idx node by a struct ubifs_znode. The children of an idx node lead to the real data, although a child may itself be another idx node rather than leaf data.

What is initialized here is the root node of the TNC.

       //《A Brief Introduction to the Design of UBIFS》 says that an inode node and its data are stored separately, and the idx node above is really about locating data. So is a node of type struct ubifs_ino_node what stores the inode? (yes)

       //In UBIFS, inodes have a corresponding inode node which records the number of directory entry links, more simply known as the link count.

       //An inode node is a node that holds the metadata for an inode. Every inode has exactly one (non-obsolete) inode node.

       idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);

       if (!idx)

              return -ENOMEM;

 

       c->key_fmt = UBIFS_SIMPLE_KEY_FMT;

       c->key_hash = key_r5_hash;

 

       idx->ch.node_type = UBIFS_IDX_NODE;

       idx->child_cnt = cpu_to_le16(1);

       ino_key_init(c, &key, UBIFS_ROOT_INO);

       br = ubifs_idx_branch(c, idx, 0);

       key_write_idx(c, &key, &br->key);

       br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);

       br->len = cpu_to_le32(UBIFS_INO_NODE_SZ);

       err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0,

                           UBI_UNKNOWN);

       kfree(idx);

       if (err)

              return err;

 

       dbg_gen("default root indexing node created LEB %d:0",

              main_first + DEFAULT_IDX_LEB);

 

       /* Create default root inode */

       tmp = ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size);

       ino = kzalloc(tmp, GFP_KERNEL);

       if (!ino)

              return -ENOMEM;

 

       ino_key_init_flash(c, &ino->key, UBIFS_ROOT_INO);

       ino->ch.node_type = UBIFS_INO_NODE;

       ino->creat_sqnum = cpu_to_le64(++c->max_sqnum);

       ino->nlink = cpu_to_le32(2);

       tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec);

       ino->atime_sec  = tmp_le64;

       ino->ctime_sec  = tmp_le64;

       ino->mtime_sec  = tmp_le64;

       ino->atime_nsec = 0;

       ino->ctime_nsec = 0;

       ino->mtime_nsec = 0;

       ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO);

       ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ);

 

       /* Set compression enabled by default */

       ino->flags = cpu_to_le32(UBIFS_COMPR_FL);

 

       err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,

                                  main_first + DEFAULT_DATA_LEB, 0,

                           UBI_UNKNOWN);

       kfree(ino);

       if (err)

              return err;

 

       dbg_gen("root inode created at LEB %d:0",

              main_first + DEFAULT_DATA_LEB);

 

       /*

        * The first node in the log has to be the commit start node. This is

        * always the case during normal file-system operation. Write a fake

        * commit start node to the log.

        */

       tmp = ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size);

       cs = kzalloc(tmp, GFP_KERNEL);

       if (!cs)

              return -ENOMEM;

 

       cs->ch.node_type = UBIFS_CS_NODE;

       //log区域写入一个commit start node,每一次commit的时候会向log区域写入两种类型,一种就是commit start类型的节点表示一次commit的开始,两外一种就是referencr节点,里面记录了相应的日志需要操作的leb,和offset

       err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM,

                           0, UBI_UNKNOWN);

       kfree(cs);

       ubifs_msg("default file-system created");

       return 0;

}

 

3.3 ubifs_read_master

The UBIFS master node was discussed above: it is kept in duplicate because it records the most fundamental facts about the index tree and must not be lost. The two copies are never written at the same time, so an unclean reboot cannot corrupt both of them at once.

int ubifs_read_master(struct ubifs_info *c)

{

       int err, old_leb_cnt;

       c->mst_node = kzalloc(c->mst_node_alsz, GFP_KERNEL);

       if (!c->mst_node)

              return -ENOMEM;

       //Scan both copies of the master node and check whether the master data has been corrupted.

       err = scan_for_master(c);

       if (err) {

              if (err == -EUCLEAN)

                     //If it is corrupted, it has to be recovered

                    err = ubifs_recover_master_node(c);

              if (err)

                    /*

                     * Note, we do not free 'c->mst_node' here because the

                     * unmount routine will take care of this.

                     */

                    return err;

       }

 

       /* Make sure that the recovery flag is clear */

       //master节点来初始化ubifs_info结构体中的信息

       c->mst_node->flags &= cpu_to_le32(~UBIFS_MST_RCVRY);

       c->max_sqnum      = le64_to_cpu(c->mst_node->ch.sqnum);

       c->highest_inum   = le64_to_cpu(c->mst_node->highest_inum);

       c->cmt_no         = le64_to_cpu(c->mst_node->cmt_no);

       c->zroot.lnum     = le32_to_cpu(c->mst_node->root_lnum);

       c->zroot.offs     = le32_to_cpu(c->mst_node->root_offs);

       c->zroot.len      = le32_to_cpu(c->mst_node->root_len);

       c->lhead_lnum     = le32_to_cpu(c->mst_node->log_lnum);

       c->gc_lnum        = le32_to_cpu(c->mst_node->gc_lnum);

       c->ihead_lnum     = le32_to_cpu(c->mst_node->ihead_lnum);

       c->ihead_offs     = le32_to_cpu(c->mst_node->ihead_offs);

       c->old_idx_sz     = le64_to_cpu(c->mst_node->index_size);

       c->lpt_lnum       = le32_to_cpu(c->mst_node->lpt_lnum);

       c->lpt_offs       = le32_to_cpu(c->mst_node->lpt_offs);

       c->nhead_lnum     = le32_to_cpu(c->mst_node->nhead_lnum);

       c->nhead_offs     = le32_to_cpu(c->mst_node->nhead_offs);

       c->ltab_lnum      = le32_to_cpu(c->mst_node->ltab_lnum);

       c->ltab_offs      = le32_to_cpu(c->mst_node->ltab_offs);

       c->lsave_lnum     = le32_to_cpu(c->mst_node->lsave_lnum);

       c->lsave_offs     = le32_to_cpu(c->mst_node->lsave_offs);

       c->lscan_lnum     = le32_to_cpu(c->mst_node->lscan_lnum);

       c->lst.empty_lebs = le32_to_cpu(c->mst_node->empty_lebs);

       c->lst.idx_lebs   = le32_to_cpu(c->mst_node->idx_lebs);

       old_leb_cnt       = le32_to_cpu(c->mst_node->leb_cnt);

       c->lst.total_free = le64_to_cpu(c->mst_node->total_free);

       c->lst.total_dirty = le64_to_cpu(c->mst_node->total_dirty);

       c->lst.total_used = le64_to_cpu(c->mst_node->total_used);

       c->lst.total_dead = le64_to_cpu(c->mst_node->total_dead);

       c->lst.total_dark = le64_to_cpu(c->mst_node->total_dark);

 

       c->calc_idx_sz = c->old_idx_sz;

 

       if (c->mst_node->flags & cpu_to_le32(UBIFS_MST_NO_ORPHS))

              c->no_orphs = 1;

 

       if (old_leb_cnt != c->leb_cnt) {

              /* The file system has been resized */

              int growth = c->leb_cnt - old_leb_cnt;

 

              if (c->leb_cnt < old_leb_cnt ||

                  c->leb_cnt < UBIFS_MIN_LEB_CNT) {

                    ubifs_err("bad leb_cnt on master node");

                    dbg_dump_node(c, c->mst_node);

                    return -EINVAL;

              }

              dbg_mnt("Auto resizing (master) from %d LEBs to %d LEBs",

                    old_leb_cnt, c->leb_cnt);

              c->lst.empty_lebs += growth;

              c->lst.total_free += growth * (long long)c->leb_size;

              c->lst.total_dark += growth * (long long)c->dark_wm;

              /*

               * Reflect changes back onto the master node. N.B. the master

               * node gets written immediately whenever mounting (or

               * remounting) in read-write mode, so we do not need to write it

               * here.

               */

              c->mst_node->leb_cnt = cpu_to_le32(c->leb_cnt);

              c->mst_node->empty_lebs = cpu_to_le32(c->lst.empty_lebs);

              c->mst_node->total_free = cpu_to_le64(c->lst.total_free);

              c->mst_node->total_dark = cpu_to_le64(c->lst.total_dark);

       }

       err = validate_master(c);

       if (err)

              return err;

       err = dbg_old_index_check_init(c, &c->zroot);

       return err;

}


(2): The read and write paths through the VFS

1. The write path through the VFS

       After reading UBIFS on and off for so long, it only felt more and more tangled, so let us start from the VFS read/write interfaces and expand outward from there.

       const struct file_operations ubifs_file_operations = {

       .llseek        = generic_file_llseek,

       .read          = do_sync_read,

       .write         = do_sync_write,

       .aio_read      = generic_file_aio_read,

       .aio_write     = ubifs_aio_write,

       .mmap          = ubifs_file_mmap,

       .fsync         = ubifs_fsync,

       .unlocked_ioctl = ubifs_ioctl,

       .splice_read   = generic_file_splice_read,

       .splice_write  = generic_file_splice_write,

#ifdef CONFIG_COMPAT

.compat_ioctl  = ubifs_compat_ioctl,

#endif

};

The code of ubifs_aio_write is quite short.

static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,

                           unsigned long nr_segs, loff_t pos)

{

       int err;

       ssize_t ret;

       struct inode *inode = iocb->ki_filp->f_mapping->host;

       struct ubifs_info *c = inode->i_sb->s_fs_info;

       err = update_mctime(c, inode);

       if (err)

              return err;

       ret = generic_file_aio_write(iocb, iov, nr_segs, pos);

       if (ret < 0)

              return ret;

       if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {

              err = ubifs_sync_wbufs_by_inode(c, inode);

              if (err)

                     return err;

       }

       return ret;

}

In the asynchronous case, generic_file_aio_write is called directly: it writes the data into the page cache and leaves it to the background threads to actually push the data to the flash media. When mounted in sync mode, the flush is not left to the background threads; instead ubifs_sync_wbufs_by_inode is called to write the data to the flash media right away.

Here is the code of that function:

int ubifs_sync_wbufs_by_inode(struct ubifs_info *c, struct inode *inode)

{

       int i, err = 0;

       for (i = 0; i < c->jhead_cnt; i++) {

              struct ubifs_wbuf *wbuf = &c->jheads[i].wbuf;

              if (i == GCHD)

                     /*

                      * GC head is special, do not look at it. Even if the

                      * head contains something related to this inode, it is

                      * a _copy_ of corresponding on-flash node which sits

                      * somewhere else.

                      */

                     continue;

              if (!wbuf_has_ino(wbuf, inode->i_ino))

                     continue;

              mutex_lock_nested(&wbuf->io_mutex, wbuf->jhead);

              if (wbuf_has_ino(wbuf, inode->i_ino))

                     err = ubifs_wbuf_sync_nolock(wbuf);

              mutex_unlock(&wbuf->io_mutex);

              if (err) {

                     ubifs_ro_mode(c, err);

                     return err;

              }

       }

       return 0;

}

This function uses wbuf_has_ino to check whether a write-buffer holds data belonging to the inode. After taking the wbuf's io_mutex, it calls ubifs_wbuf_sync_nolock to sync the data.
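For reference, wbuf_has_ino just walks the small array of inode numbers kept with each write-buffer; a sketch along the lines of fs/ubifs/io.c:

static int wbuf_has_ino(struct ubifs_wbuf *wbuf, ino_t inum)
{
       int i, ret = 0;

       spin_lock(&wbuf->lock);
       for (i = 0; i < wbuf->next_ino; i++)
              if (inum == wbuf->inodes[i]) {
                     ret = 1;
                     break;
              }
       spin_unlock(&wbuf->lock);

       return ret;
}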

struct ubifs_info contains these members:

       int jhead_cnt;

       struct ubifs_jhead *jheads;

They are used for managing the journal. fs/ubifs/ubifs-media.h has these macro definitions:

/* Garbage collector journal head number */

#define UBIFS_GC_HEAD   0

/* Base journal head number */

#define UBIFS_BASE_HEAD 1

/* Data journal head number */

#define UBIFS_DATA_HEAD 2

These are the three journal heads UBIFS keeps for different purposes: garbage collection, non-data (base) nodes, and data reads/writes, respectively.

struct ubifs_jhead {

       struct ubifs_wbuf wbuf;

       struct list_head buds_list;

};

struct ubifs_wbuf {
       struct ubifs_info *c;
       void *buf; //the memory actually allocated to buffer the data
       int lnum; //which LEB the buffered data is destined for
       int offs; //write offset of the buffer within that LEB
       int avail; //bytes still available in the buffer
       int used; //bytes already used in the buffer
       int dtype; //type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
       int jhead;
       int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
       struct mutex io_mutex;
       spinlock_t lock;
       ktime_t softlimit;
       unsigned long long delta;
       struct hrtimer timer;
       unsigned int no_timer:1;
       unsigned int need_sync:1;
       int next_ino;
       ino_t *inodes; //inode numbers owning the data currently in the buffer
};

As the kernel comment says, struct ubifs_wbuf is the UBIFS write-buffer. It is a buffer at the UBIFS level; let us see step by step how it works.

int ubifs_wbuf_sync_nolock(struct ubifs_wbuf *wbuf)

{

       struct ubifs_info *c = wbuf->c;

       int err, dirt;

       cancel_wbuf_timer_nolock(wbuf);

//Cancel the wbuf timer; this timer is what the background machinery uses to flush the buffer periodically

       if (!wbuf->used || wbuf->lnum == -1)

              /* Write-buffer is empty or not seeked */

              return 0;

       dbg_io("LEB %d:%d, %d bytes, jhead %s",

              wbuf->lnum, wbuf->offs, wbuf->used, dbg_jhead(wbuf->jhead));

       ubifs_assert(!(c->vfs_sb->s_flags & MS_RDONLY));

       ubifs_assert(!(wbuf->avail & 7));

       ubifs_assert(wbuf->offs + c->min_io_size <= c->leb_size);

       if (c->ro_media)

              return -EROFS;

       ubifs_pad(c, wbuf->buf + wbuf->used, wbuf->avail);

      //Call ubi_leb_write to write the buffer out to the LEB (logical eraseblock)

       err = ubi_leb_write(c->ubi, wbuf->lnum, wbuf->buf, wbuf->offs,

                        c->min_io_size, wbuf->dtype);

       if (err) {

              ubifs_err("cannot write %d bytes to LEB %d:%d",

                      c->min_io_size, wbuf->lnum, wbuf->offs);

              dbg_dump_stack();

              return err;

       }

       dirt = wbuf->avail;

       spin_lock(&wbuf->lock);

       wbuf->offs += c->min_io_size;

       wbuf->avail = c->min_io_size;

       wbuf->used = 0;

       wbuf->next_ino = 0;

       spin_unlock(&wbuf->lock);

       if (wbuf->sync_callback)

              err = wbuf->sync_callback(c, wbuf->lnum,

                                     c->leb_size - wbuf->offs, dirt);

       return err;

}

The heart of this function is the call into the UBI layer, ubi_leb_write, which writes the data to flash.

The call chain under ubi_leb_write:

->ubi_leb_write(对逻辑块进行读写)

->ubi_eba_write_leb (for every volume the kernel maintains a vol->eba_tbl array holding the logical-to-physical eraseblock mapping; the same mapping is also kept on the physical media in the VID headers, and eba_tbl is built when the MTD partition is attached with ubiattach)

->ubi_io_write_data

->ubi_io_write

->ubi->mtd->write(ubi->mtd, addr, len, &written, buf);

This shows that UBI is built on top of the MTD layer: a UBI read or write ends up calling the MTD read/write routines.
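The final step is plain address arithmetic. Roughly, ubi_io_write does the following (a fragment-level sketch; the real drivers/mtd/ubi/io.c wraps this in paranoia checks, and @written is a size_t reporting how much was written):

       /* byte address of @offset inside physical eraseblock @pnum */
       loff_t addr = (loff_t)pnum * ubi->peb_size + offset;

       err = ubi->mtd->write(ubi->mtd, addr, len, &written, buf);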

generic_file_aio_write itself has not been analyzed yet; it is fairly involved. Before that, here is ubifs_file_address_operations, the address_space_operations structure through which UBIFS plugs into the page cache.

const struct address_space_operations ubifs_file_address_operations = {

       .readpage      = ubifs_readpage,

       .writepage     = ubifs_writepage,

       .write_begin   = ubifs_write_begin,

       .write_end     = ubifs_write_end,

       .invalidatepage = ubifs_invalidatepage,

       .set_page_dirty = ubifs_set_page_dirty,

       .releasepage   = ubifs_releasepage,

};

->generic_file_aio_write

->__generic_file_aio_write

->generic_file_buffered_write

->generic_perform_write

->ubifs_write_begin

->ubifs_write_end

A reference on write_begin and write_end:

http://lwn.net/Articles/254856/

generic_file_aio_write函数结束的时候,整个写过程也就结束了,到这儿的时候,数据已经被写入了buffer_head中去了,等待内核线程pdflush发现radix树上的脏页,并最终调用ubifs_writepages

关于ubifs_writepages,作者有一段注释,大意是说在VFS中,是先写入属于inode的数据,最后才写入inode节点的。但是对ubifs这样的日志文件系统就可能存在问题。设想存在下面的情况:一个原来长度为0inode节点,现在想往该节点写入数据,ubifs提交日志,最终完成了写操作。在没有写入inode之前发生了一次unclear reboot,这时候重新启动的时候就会发现该inode节点还是0字节,但是数据已经写入了,占用了flash media。所以这部分空间就没办法释放了。为了避免这种情况,需要在ubifs中先写入inode节点,然后再用log的形式写入数据,这时候即使发生unclear reboot,由于提交了日志,所以数据还是可以恢复的。

static int ubifs_writepage(struct page *page, struct writeback_control *wbc)

{

       struct inode *inode = page->mapping->host;

       struct ubifs_inode *ui = ubifs_inode(inode);

       loff_t i_size = i_size_read(inode), synced_i_size;

       pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;

       int err, len = i_size & (PAGE_CACHE_SIZE - 1);

       void *kaddr;

       dbg_gen("ino %lu, pg %lu, pg flags %#lx",

              inode->i_ino, page->index, page->flags);

       ubifs_assert(PagePrivate(page));

      /* Is the page fully outside @i_size? (truncate in progress) */

       if (page->index > end_index || (page->index == end_index && !len)) {

              err = 0;

              goto out_unlock;

       }

       spin_lock(&ui->ui_lock);

       synced_i_size = ui->synced_i_size;

       spin_unlock(&ui->ui_lock);

       /* Is the page fully inside @i_size? */

       if (page->index < end_index) {

              if (page->index >= synced_i_size >> PAGE_CACHE_SHIFT) {

                     err = inode->i_sb->s_op->write_inode(inode, 1);

                     if (err)

                            goto out_unlock;

                     /*

                      * The inode has been written, but the write-buffer has

                      * not been synchronized, so in case of an unclean

                      * reboot we may end up with some pages beyond inode

                      * size, but they would be in the journal (because

                      * commit flushes write buffers) and recovery would deal

                      * with this.

                      */

              }

              return do_writepage(page, PAGE_CACHE_SIZE);

       }

       /*

        * The page straddles @i_size. It must be zeroed out on each and every

        * writepage invocation because it may be mmapped. "A file is mapped

        * in multiples of the page size. For a file that is not a multiple of

        * the page size, the remaining memory is zeroed when mapped, and

        * writes to that region are not written out to the file."

        */

       kaddr = kmap_atomic(page, KM_USER0);

       memset(kaddr + len, 0, PAGE_CACHE_SIZE - len);

       flush_dcache_page(page);//flush the D-cache so the zeroed bytes reach memory

       kunmap_atomic(kaddr, KM_USER0);

 

       if (i_size > synced_i_size) {

              err = inode->i_sb->s_op->write_inode(inode, 1);

              if (err)

                     goto out_unlock;

       }

       return do_writepage(page, len);

out_unlock:

       unlock_page(page);

       return err;

}

First inode->i_sb->s_op->write_inode(inode, 1); writes the inode node to the flash media; then do_writepage writes the page data, calling ubifs_jnl_write_data to perform the journaled write. That function first copies the data into a wbuf and leaves the flushing to the background machinery. But how does the journal guarantee recovery after an unclean reboot?

Looking at ubifs_add_bud_to_log (a bud is an eraseblock used by the journal): every time a fresh eraseblock is picked up for a wbuf, a REF node is written into the log (UBIFS reserves a fixed number of eraseblocks for the log). One ref node stands for one LEB of the journal, called a bud, so the log records exactly which LEBs the write operations have touched after the previous commit and before the next one. This also explains why struct ubifs_jhead has a struct ubifs_wbuf wbuf member:

struct ubifs_jhead {

struct ubifs_wbuf wbuf;

struct list_head buds_list;

};
The wbuf is effectively part of the journal: what it buffers is data on its way into the journal.

UBIFS is divided into six areas: superblock, master node, the log area, the LPT area, the orphan area and the main area. The master node records the root of the index tree; "an index node records the on-flash position of its child nodes", and "the UBIFS wandering tree can be viewed as having two parts: a top part consisting of index nodes that create the structure of the tree, and a bottom part consisting of leaf nodes that hold the actual file data". Because the wandering tree is kept on flash, updating data necessarily means updating the tree, and frequent updates would clearly degrade performance; hence the journal. The log is the part of the journal whose purpose is to keep the on-flash index tree from being updated too often: UBIFS records the pending updates in the log and applies them together at commit time, lowering the update frequency of the wandering tree.

When UBIFS modifies a file, it writes the modified data into a new block, points the LEB at that new block, and erases the PEB the LEB used to point at. So if an unclean reboot strikes in the middle of a modification, the attach-time scan will find two PEBs pointing at the same LEB, which reveals that something went wrong; and since the old PEB's data has not been erased yet, it is easy to recover.

 

2. The read path through the VFS

From the file-operations structure above, the VFS entry point for UBIFS reads is generic_file_aio_read.

The call chain:

generic_file_aio_read

->do_generic_file_read

->readpage

Here the readpage function pointer points at ubifs_readpage.

The listing of ubifs_readpage:

static int ubifs_readpage(struct file *file, struct page *page)

{

       if (ubifs_bulk_read(page))

              return 0;

       do_readpage(page);

       unlock_page(page);

       return 0;

}

The code above is short.

static int ubifs_bulk_read(struct page *page)

{

       struct inode *inode = page->mapping->host;

       struct ubifs_info *c = inode->i_sb->s_fs_info;

       struct ubifs_inode *ui = ubifs_inode(inode);

       pgoff_t index = page->index, last_page_read = ui->last_page_read;

       struct bu_info *bu;

       int err = 0, allocated = 0;

       ui->last_page_read = index;

       if (!c->bulk_read)

              return 0;

       /*

        * Bulk-read is protected by @ui->ui_mutex, but it is an optimization,

        * so don't bother if we cannot lock the mutex.

        */

       if (!mutex_trylock(&ui->ui_mutex))

              return 0;

       if (index != last_page_read + 1) {

             /* Turn off bulk-read if we stop reading sequentially */

              ui->read_in_a_row = 1;

              if (ui->bulk_read)

                     ui->bulk_read = 0;

              goto out_unlock;

       }

       if (!ui->bulk_read) {

              ui->read_in_a_row += 1;

              if (ui->read_in_a_row < 3)

                     goto out_unlock;

             /* Three reads in a row, so switch on bulk-read */

              ui->bulk_read = 1;

       }

       /*

        * If possible, try to use pre-allocated bulk-read information, which

        * is protected by @c->bu_mutex.

        */

       if (mutex_trylock(&c->bu_mutex))

              bu = &c->bu;

       else {

              bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);

              if (!bu)

                     goto out_unlock;

              bu->buf = NULL;

              allocated = 1;

       }

       bu->buf_len = c->max_bu_buf_len;

       data_key_init(c, &bu->key, inode->i_ino,

                    page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);

       err = ubifs_do_bulk_read(c, bu, page);

       if (!allocated)

              mutex_unlock(&c->bu_mutex);

       else

              kfree(bu);

out_unlock:

       mutex_unlock(&ui->ui_mutex);

       return err;

}

ubifs中数据都是以LEB的形式的组织的,ubifs层的基本读写也是块读写。所以在ubifs中必然出现很多文件的尾部只占用了一整块的一小部分。当对这文件添加内容的时候,就会以另外一个DATA_NODE的形式附加在后面,这样就会在某一个块中出现在物理上连续,属于同一个节点的不同的DATA_NODE形式的ubifs节点。

ubifs_bulk_read来擦看ubifs是否支持bulk_read ,如果支持,那么就执行bulk-read操作,在ubifs中,存在一个TNC(tree node cache),里面保持的是在内存中的inode树。首先ubifs_bulk_read通过key来从TNC树中查找到znodebulk-read的相关信息保持在znodezbranch数组中。

为什么要采用bulk操作,而不是直接在添加数据到文件的tail后面?因为ubifs中对数据进行了压缩,所以数据不能直接添加的,需要压缩之后以另外一个DATA_NODE的形式接在后面。

上面的两种方式最终都调用到了ubi_leb_read。注意这儿的的名字中lebleblogical erase block ,UBI层虚拟的逻辑块,逻辑块与物理上的块是一一对应的,ubi中是如何实现虚拟块与物理块之间的映射关系的呢?

 

3. The EBA subsystem
3.1 The erase worker
UBI manages the flash with two on-flash headers, the EC (erase counter) header and the VID (volume identifier) header. The names give away their purposes: the EC header counts erases, for wear-levelling, and the VID header records the LEB-to-PEB mapping.

struct ubi_vid_hdr {

ubi32_t magic;

uint8_t version;

uint8_t vol_type;

uint8_t copy_flag;

uint8_t compat;

ubi32_t vol_id;//which volume this PEB belongs to

ubi32_t lnum;//which LEB this PEB is mapped to

ubi32_t leb_ver;

ubi32_t data_size;

ubi32_t used_ebs;

ubi32_t data_pad;

ubi32_t data_crc;

uint8_t padding1[12];

uint8_t ivol_data[UBI_VID_HDR_IVOL_DATA_SIZE];

ubi32_t hdr_crc;

} __attribute__((packed));

The VID header's lnum member gives the LEB number this PEB corresponds to. When attaching an MTD partition, UBI scans every block, collects this information, and builds the volume information and each volume's eba_tbl.

The UBIFS design documents describe the system as doing out-of-place updates: to modify data, the file system reads it out, overwrites it in a buffer, and then writes it into a new block. Why do it this way?

NAND flash must be erased before it can be rewritten (the reasons are not covered here). If only a small part of the content changes, the read-erase-write cycle costs far more than simply writing a new block.

So when UBIFS modifies a file, it writes the modified data directly into a new block, sets the new block's VID header lnum to the original LEB, and then unmaps the old LEB, which hands its block to the UBI background thread for erasure. The erase involves reading out the EC header, updating the erase count, and writing the header back into the freshly erased physical block; the block is then free. This is why a free block in UBI has an EC header but no VID header.
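Putting the pieces together, an out-of-place update of one LEB looks roughly like this (a summary-level sketch, not a single function in the source):

/*
 * Out-of-place update of LEB L, currently mapped to PEB P1:
 *
 *  1. get a free PEB P2 from the WL subsystem (ubi_wl_get_peb)
 *  2. write a VID header carrying lnum = L to P2, then write the new data
 *  3. point vol->eba_tbl[L] at P2
 *  4. return P1 to the WL subsystem for erasure (ubi_wl_put_peb)
 *
 * If power is lost between steps 2 and 4, the attach-time scan finds two
 * PEBs claiming LEB L and uses the sequence numbers in their VID headers
 * to keep the newer copy and erase the stale one.
 */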

Unmapping an eraseblock was just mentioned; here is how the EBA subsystem does it.

int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol,int lnum)

{

       int err, pnum, vol_id = vol->vol_id;

 

       if (ubi->ro_mode)

             return -EROFS;

 

       err = leb_write_lock(ubi, vol_id, lnum);

       if (err)

             return err;

 

       pnum = vol->eba_tbl[lnum];

       if (pnum < 0)

             /* This logical eraseblock is already unmapped */

             goto out_unlock;

 

       dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum);

 

       vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED;

       err = ubi_wl_put_peb(ubi, pnum, 0);

 

out_unlock:

       leb_write_unlock(ubi, vol_id, lnum);

       return err;

}

The function first looks the eraseblock up in the volume's eba_tbl to see whether it is mapped at all; if not, it returns immediately.

ubi_wl_put_peb eventually calls schedule_erase(ubi, e, torture); to carry out the erase.

static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,

                      int torture)

{

       struct ubi_work *wl_wrk;

       dbg_wl("schedule erasure of PEB %d, EC %d, torture %d",

              e->pnum, e->ec, torture);

       wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);

       if (!wl_wrk)

             return -ENOMEM;

       wl_wrk->func = &erase_worker;

       wl_wrk->e = e;

       wl_wrk->torture = torture;

       schedule_ubi_work(ubi, wl_wrk);

       return 0;

}

This function is simple as well: it allocates a ubi_work structure and initializes it.

static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)

{

       spin_lock(&ubi->wl_lock);

       list_add_tail(&wrk->list, &ubi->works);

       ubi_assert(ubi->works_count >= 0);

       ubi->works_count += 1;

       if (ubi->thread_enabled)

             wake_up_process(ubi->bgt_thread);

       spin_unlock(&ubi->wl_lock);

}

The ubi_work structure is appended to the &ubi->works list and the UBI background thread is woken up.

That background thread is ubi_thread, which mainly calls do_work to carry out the queued operations. Before looking at that, struct ubi_work deserves a closer look.

struct ubi_work {
       struct list_head list;//links the work into the queue
       int (*func)(struct ubi_device *ubi, struct ubi_work *wrk, int cancel);
       //function pointer that performs the actual work
       /* The below fields are only relevant to erasure works */
       struct ubi_wl_entry *e; //the wear-levelling entry (physical eraseblock) this work applies to
       int torture;
};
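With struct ubi_work in hand, the background thread's core loop is easy to picture. A simplified sketch of do_work, modeled on drivers/mtd/ubi/wl.c of this era (the ubi->work_sem locking is omitted):

static int do_work(struct ubi_device *ubi)
{
       int err;
       struct ubi_work *wrk;

       spin_lock(&ubi->wl_lock);
       if (list_empty(&ubi->works)) {
              spin_unlock(&ubi->wl_lock);
              return 0;
       }
       /* take the oldest pending work off the queue */
       wrk = list_entry(ubi->works.next, struct ubi_work, list);
       list_del(&wrk->list);
       ubi->works_count -= 1;
       spin_unlock(&ubi->wl_lock);

       /* run it; for erasure this ends up in erase_worker() */
       err = wrk->func(ubi, wrk, 0);
       if (err)
              ubi_err("work failed with error code %d", err);
       return err;
}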

For background erasure, func points at:

static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,int cancel)

->sync_erase (reads the PEB's EC header and obtains the erase counter)

->do_sync_erase (performs the MTD-level erase and checks that the erased block reads back as all 0xff)

->ubi_io_write_ec_hdr (writes the updated EC header back)

At this point the unmap of a block is complete.

3.2 Bit flips

int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,void *buf, int offset, int len, int check)

eba层的读函数,被ubifs层调用。当发生位反转(bit filp)的时候,ubi认为该块不适合继续用来存储数据了,就会进行scrub操作。

具体的scrub操作由ubi_wl_scrub_peb函数执行。与上面的erase一样,也是创建一个ubi_worker,只不是现在的具体的回调函数是wear_leveling_worker

该函数将原来lnum对应的pnum中的数据拷贝到另外一个物理块中,然后将原来的物理块擦除。
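A sketch of how scrubbing is kicked off, simplified from drivers/mtd/ubi/wl.c (the real ubi_wl_scrub_peb also handles blocks that are currently being moved or sitting in the protection queue):

int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum)
{
       struct ubi_wl_entry *e;

       spin_lock(&ubi->wl_lock);
       e = ubi->lookuptbl[pnum];
       /* move the entry from the used tree to the scrub tree */
       rb_erase(&e->u.rb, &ubi->used);
       wl_tree_add(e, &ubi->scrub);
       spin_unlock(&ubi->wl_lock);

       /* the copy to a fresh PEB and the erase happen in the WL worker */
       return ensure_wear_leveling(ubi);
}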

 

4. The init process

 

Nand_scan is used, when attaching an MTD partition, to scan every block of the partition, reading the EC and VID headers to determine the number of volumes and to classify the blocks (free, corrupted, and so on). This scan builds a large red-black tree in which every node stands for one eraseblock.

Later, ubi_eba_init_scan uses this tree when building each volume's EBA table.

       err = register_filesystem(&ubifs_fs_type);

       if (err) {

             ubifs_err("cannot register file system, error %d", err);

             return err;

       }

This is the registration of the ubifs file-system type in ubifs_init (super.c).

static struct file_system_type ubifs_fs_type = {

       .name    = "ubifs",

       .owner   = THIS_MODULE,

       .get_sb  = ubifs_get_sb,

       .kill_sb = kill_anon_super,

};

The key part of a file_system_type is reading the superblock. Every file-system type has its own on-media superblock format, but to hook into the VFS that information must be joined to the superblock the VFS defines, so every file-system type provides such a superblock-reading function (get_sb here).


(3): The six areas of UBIFS

UBIFS is divided into six areas:

superblock area

master node area

journal (or log)area

LPT(LEB properties tree) area

Orphan area

The main area

The first area needs no introduction, since every file system has a superblock.

MASTER AREA: For garbage collection, UBIFS manages files through a node structure. What is a node? An inode node from UBIFS makes a good illustration.

struct ubifs_ino_node {

       struct ubifs_ch ch;

       __u8 key[UBIFS_MAX_KEY_LEN];

       __le64 creat_sqnum;

       __le64 size;

       __le64 atime_sec;

       __le64 ctime_sec;

       __le64 mtime_sec;

       __le32 atime_nsec;

       __le32 ctime_nsec;

       __le32 mtime_nsec;

       __le32 nlink;

       __le32 uid;

       __le32 gid;

       __le32 mode;

       __le32 flags;

       __le32 data_len;

       __le32 xattr_cnt;

       __le32 xattr_size;

       __u8 padding1[4]; /* Watch 'zero_ino_node_unused()' if changing! */

       __le32 xattr_names;

       __le16 compr_type;

       __u8 padding2[26]; /* Watch 'zero_ino_node_unused()' if changing! */

       __u8 data[];

} __attribute__ ((packed));

A node is the combination of file information and file data: everything in the structure above except __u8 data[] can be called file information, and __u8 data[] is the file data. To make garbage collection possible, the file system must manage all files through such a tree structure.

To cut the mount-time scan and the runtime memory consumption, UBIFS keeps this tree structure on flash rather than in memory. But that raises a question: how do we know where the root of the tree is?

That is what the master area is for (though not only that): the root of the tree is stored in the master area.

JOURNAL AREA: We said above that the UBIFS tree structure is kept on flash, which brings a problem: every file update (write, modify or delete) changes the file's information and data, and therefore changes nodes of the tree. NAND flash must be erased before rewriting, so updating the on-flash nodes at that frequency would clearly be inefficient. To lower the frequency of on-flash node updates, UBIFS created the journal area, where modifications to nodes are cached and then written out to the NAND flash in one batch. UBIFS also builds a TNC (tree node cache) in memory as a cache of the on-flash tree, so node information does not have to be read from flash on every access; what is an index node on flash is a znode in the TNC.

LPT AREA: The journal area lowers the update frequency, but how are the updates placed, i.e. where should newly written data go? The file system must know how the space of every block is used, and that is the purpose of the LPT (LEB Properties Tree). The LEB properties contain three important values: free space, dirty space, and whether the eraseblock is an index eraseblock or not. Free space is the number of unused bytes in an eraseblock. Dirty space is the number of bytes taken up by obsolete (truncated) data and by padding (UBIFS has a minimal I/O unit, so short writes are padded out). The master area stores the root of the node tree; where do its branches live? As index nodes in eraseblocks of the main area, which is why an eraseblock must be flagged as holding index nodes or not. The size of the LPT area is derived from the partition size. The LPT even has an LPT of its own: inside the LPT area UBIFS keeps the ltab (LEB properties table; a table suffices because the LPT area spans only a few eraseblocks), describing the LEBs occupied by the LPT itself. The LPT, like the index, is updated only at commit time.

ORPHAN AREA: To understand this area one must first be precise about the role of the inode node in UBIFS. In the words of the design document: a node that holds the metadata for an inode; every inode has exactly one (non-obsolete) inode node. The orphan area is an area for storing the inode numbers of inodes deleted while still open; it is needed for recovery from unclean unmounts.

MAIN AREA: Little needs saying: it holds the file data and the index nodes.


(4): Important data structures

       In leeming's words, the innermost core of any large project is the definition of its data structures. So do not rush into the .c files when reading code; read the documentation and the .h files first to understand the designers' thinking, and you will stay on the right track.

1. struct ubi_device

       UBI abstracts a UBI device as a struct ubi_device, which gathers all the information about that device.

struct ubi_device {
       struct cdev cdev;
       struct device dev;
       int ubi_num;//number of this UBI device, given to ubiattach with the -d option
       char ubi_name[sizeof(UBI_NAME_STR)+5];//name of the UBI device
       int vol_count;//how many volumes this UBI device holds
       struct ubi_volume *volumes[UBI_MAX_VOLUMES+UBI_INT_VOL_COUNT];
       spinlock_t volumes_lock;
       int ref_count;
       int image_seq;
       int rsvd_pebs;//count of reserved physical eraseblocks
       int avail_pebs;//count of available physical eraseblocks
       int beb_rsvd_pebs;//physical eraseblocks reserved for bad-block handling
       int beb_rsvd_level;//normal level of PEBs reserved for bad-block handling
       int autoresize_vol_id;
       int vtbl_slots;
       int vtbl_size;//size of the volume table (in bytes)
       struct ubi_vtbl_record *vtbl;//in-RAM copy of the volume table
       struct mutex device_mutex;
       int max_ec;//highest erase counter
       /* Note, mean_ec is not updated run-time - should be fixed */
       int mean_ec;//mean erase counter
       /* EBA sub-system's stuff */
       unsigned long long global_sqnum;
       spinlock_t ltree_lock;
       struct rb_root ltree;
       struct mutex alc_mutex;
       /* Wear-leveling sub-system's stuff */
       struct rb_root used;//RB-tree of used physical eraseblocks
       struct rb_root erroneous;//RB-tree of erroneous used physical eraseblocks
       struct rb_root free;//RB-tree of free physical eraseblocks
       struct rb_root scrub;//RB-tree of physical eraseblocks that need scrubbing
       struct list_head pq[UBI_PROT_QUEUE_LEN];
       int pq_head;
       spinlock_t wl_lock;
       struct mutex move_mutex;
       struct rw_semaphore work_sem;
       int wl_scheduled;
       struct ubi_wl_entry **lookuptbl;//a table to quickly find a &struct ubi_wl_entry object for any physical eraseblock; an array indexed by pnum, covering every block of the device
       struct ubi_wl_entry *move_from;//physical eraseblock from where the data is being moved
       struct ubi_wl_entry *move_to;//physical eraseblock where the data is being moved to
       int move_to_put;//flag telling whether the destination eraseblock has been put
       struct list_head works;//list of pending works
       int works_count;//count of pending works
       struct task_struct *bgt_thread;//the UBI background thread
       int thread_enabled;
       char bgt_name[sizeof(UBI_BGT_NAME_PATTERN)+2];//name of the background thread
       struct notifier_block reboot_notifier;//kernel notifier chain entry
       /* I/O sub-system's stuff */
       long long flash_size;//size of the MTD partition
       int peb_count;//number of physical eraseblocks
       int peb_size;//size of a physical eraseblock
       int bad_peb_count;//number of bad blocks
       int good_peb_count;//number of usable blocks
       int erroneous_peb_count;
       int max_erroneous;
       int min_io_size;//size of the minimal write unit, i.e. one page
       int hdrs_min_io_size;
       int ro_mode;
       int leb_size;//logical eraseblock size (the physical block size minus the space taken by the UBI headers)
       int leb_start;//offset within a physical block where the logical block starts; the bytes before it are reserved for the headers
       int ec_hdr_alsize;//size of the EC header aligned to @hdrs_min_io_size
       int vid_hdr_alsize;//size of the VID header aligned to @hdrs_min_io_size
       int vid_hdr_offset;//offset of the VID header within a block, usually one page
       int vid_hdr_aloffset;//starting offset of the VID header aligned to @hdrs_min_io_size
       int vid_hdr_shift;//contains @vid_hdr_offset - @vid_hdr_aloffset
       unsigned int bad_allowed:1;
       unsigned int nor_flash:1;//non-zero if working on top of NOR flash
       struct mtd_info *mtd;//the underlying MTD partition; UBI is built on top of MTD
       void *peb_buf1;//a buffer one physical eraseblock in size
       void *peb_buf2;//a buffer one physical eraseblock in size
       struct mutex buf_mutex;
       struct mutex ckvol_mutex;
#ifdef CONFIG_MTD_UBI_DEBUG_PARANOID
       void *dbg_peb_buf;
       struct mutex dbg_buf_mutex;
#endif
};

2. struct ubi_vtbl_record

The next important structure is struct ubi_vtbl_record. Before looking at it, consider the messages printed when a device is attached; the line to notice reads: internal volume.

What is an internal volume? It stands in contrast to the user volumes below it.

An internal volume is used by the kernel to keep its own information. What does it keep? The volume table, stored in the format of struct ubi_vtbl_record.

struct ubi_vtbl_record {
       __be32  reserved_pebs;//how many physical eraseblocks are reserved for this volume
       __be32  alignment;//volume alignment
       __be32  data_pad;//how many bytes are unused at the end of each physical eraseblock to satisfy the requested alignment
       __u8    vol_type;//volume type, dynamic or static; a dynamic volume can change its size
       __u8    upd_marker;
       __be16  name_len;//volume name length
       __u8    name[UBI_VOL_NAME_MAX+1];//volume name
       __u8    flags;
       __u8    padding[23];
       __be32  crc;
} __attribute__ ((packed));

3. struct ubi_volume

struct ubi_volume abstracts a single volume on a UBI device.

struct ubi_volume {
       struct device dev;
       struct cdev cdev;
       struct ubi_device *ubi;//the UBI device this volume lives on
       int vol_id;//volume number
       int ref_count;//reference count
       int readers;//number of users holding this volume in read-only mode
       int writers;//number of users holding this volume in read-write mode
       int exclusive;//whether somebody holds this volume in exclusive mode
       int reserved_pebs;//physical eraseblocks reserved for this volume
       int vol_type;//volume type
       int usable_leb_size;//logical eraseblock size without padding
       int used_ebs;//number of eraseblocks in use by this volume
       int last_eb_bytes;//how many bytes are stored in the last logical eraseblock
       long long used_bytes;//bytes of space in use
       int alignment;
       int data_pad;
       int name_len;//length of the volume name
       char name[UBI_VOL_NAME_MAX + 1];
       int upd_ebs;
       int ch_lnum;//LEB number being changed by the atomic LEB change operation (seen later when LEB data is modified)
       int ch_dtype;
       long long upd_bytes;
       long long upd_received;
       void *upd_buf;
       int *eba_tbl;//EBA table of this volume; crucially, the LEB-to-PEB mapping is obtained by looking up this table
       unsigned int checked:1;
       unsigned int corrupted:1;
       unsigned int upd_marker:1;
       unsigned int updating:1;
       unsigned int changing_leb:1;
       unsigned int direct_writes:1;
};

4. struct ubi_scan_info

This structure is used during attach: the scan must learn the state of every PEB on the device in preparation for mounting the file system.

struct ubi_scan_info {
       struct rb_root volumes;//root of the RB-tree of volumes found by the scan
       //the following four lists receive the scanned blocks once the scan has classified them
       struct list_head corr;
       struct list_head free;
       struct list_head erase;
       struct list_head alien;
       int bad_peb_count;//number of bad blocks
       int vols_found;//number of volumes found
       int highest_vol_id;//highest volume number
       int alien_peb_count;
       int is_empty;//flag set during the scan described above if the UBI device turns out to be empty
       int min_ec;//lowest erase counter
       int max_ec;//highest erase counter
       unsigned long long max_sqnum;//highest 64-bit sqnum seen
       int mean_ec;//mean erase counter
       uint64_t ec_sum;
       int ec_count;
       int corr_count;
};

5. struct ubi_scan_leb

struct ubi_scan_info above mentioned the attach-time scan and its four lists; the information gathered about each scanned block is abstracted as follows before being hung on one of those lists:

struct ubi_scan_leb {
       int ec;//erase counter, used for wear-levelling; covered in detail later
       //each volume's eba_tbl is built from the following two members
       int pnum;//physical eraseblock number
       int lnum;//logical eraseblock number
       int scrub;
       unsigned long long sqnum;
       union {
              struct rb_node rb;
              struct list_head list;
       } u;
};

6. struct ubi_ec_hdr

UBI does wear-levelling, i.e. it evens out the erase load, so struct ubi_ec_hdr is the right structure to begin the wear-levelling discussion with.

struct ubi_ec_hdr {

       __be32  magic;

       __u8    version;

       __u8    padding1[3];

       __be64  ec; /* Warning: the current limit is 31-bit anyway! */

       __be32  vid_hdr_offset;

       __be32  data_offset;

       __be32  image_seq;

       __u8    padding2[32];

       __be32  hdr_crc;

} __attribute__ ((packed));

Note the member __be64 ec: ec means erase counter. Each NAND block tolerates only a limited number of erases; erase it too often and it becomes a bad block. Wear-levelling means that, under the file system's management, no single block is erased disproportionately often.

Consider the function ensure_wear_leveling, which decides whether the UBI device needs wear-levelling treatment. Two questions arise: 1. what is the decision based on? 2. what does it actually do to keep any one eraseblock from being erased too often?

To answer the first question: in the WL subsystem all eraseblocks are managed by WL itself, in RB-trees; each node of such a tree is:

struct ubi_wl_entry {

       union {

              struct rb_node rb;

              struct list_head list;

       } u;

       int ec;

       int pnum;

};

Put bluntly, WL cares about a single thing: the ec value. Here is a piece of the core of wear_leveling_worker:

              e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);

              e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))

used中队里中取出一个LEB,显然EC是最小的(每擦除一次,EC值加一),再从free队列中取出一个EC值最大的LEB

如果两个LEBec差值大于了UBI_WL_THRESHOLD,那么就需要进行WL操作了。

那么多操作是什么呢?

err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);

将内容从一个LEB搬到另外一个LEB中去。
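For instance, assuming the threshold is the usual default of 4096 (CONFIG_MTD_UBI_WL_THRESHOLD): if the least-worn used block has e1->ec = 100 and the free block picked has e2->ec = 5000, then 5000 - 100 >= 4096 holds, so the long-lived data is copied from e1's PEB onto the heavily worn e2 PEB, and e1's barely worn PEB is erased and returned to the free pool, where future writes will gradually wear it down.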

7. struct ubi_vid_hdr

The EC header above has a member vid_hdr_offset, the offset of the VID header on flash; the next most important data structure to analyze is struct ubi_vid_hdr.

struct ubi_vid_hdr {

       __be32  magic;

       __u8    version;

       __u8    vol_type;

       __u8    copy_flag;

       __u8    compat;

       __be32  vol_id;

       __be32  lnum;

       __u8    padding1[4];

       __be32  data_size;

       __be32  used_ebs;

       __be32  data_pad;

       __be32  data_crc;

       __u8    padding2[4];

       __be64  sqnum;

       __u8    padding3[12];

       __be32  hdr_crc;

} __attribute__ ((packed));

The most important members here are __be32 vol_id and __be32 lnum: vol_id marks which volume this block belongs to, and lnum is the LEB corresponding to this PEB. Upper layers operate on logical blocks, i.e. on lnum, but the data must ultimately be written into a pnum; ubi_eba_write_leb contains the line:

pnum = vol->eba_tbl[lnum];

Every volume has an eba_tbl, built during the scan. If the lnum is not yet mapped, ubi_wl_get_peb is called to obtain a pnum and the volume's eba_tbl is modified accordingly.
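How the table gets filled can be seen in the attach path; a simplified sketch of the relevant loop in ubi_eba_init_scan (error handling and the reserved_pebs bound check omitted):

       /* seed this volume's eba_tbl from the scan results */
       ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb)
              vol->eba_tbl[seb->lnum] = seb->pnum;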

8. struct ubi_scan_volume

struct ubi_scan_volume {
       int vol_id;//volume number
       int highest_lnum;//highest logical eraseblock number seen in this volume
       int leb_count;//number of LEBs
       int vol_type;//volume type
       int used_ebs;//number of used eraseblocks
       int last_data_size;
       int data_pad;
       int compat;
       struct rb_node rb;//links this volume into the RB-tree of volumes in ubi_scan_info
       struct rb_root root;
};

Note the struct rb_root root member above: it is the root of an RB-tree linking together the PEBs found during the scan to belong to this volume.

This structure is the temporary per-volume information built by reading the VID headers during scanning.

 

 

ubifs-media.h中定义了很多的结构体,下面简单的解释一下。

在《A brief。。。》中讲到了,UBIFS采用的node-structure。它的所有的数据都是以node的形式处理的。

struct ubifs_ch: ch means common header, the part shared by all the node types below.

struct ubifs_ino_node holds the information of an inode node.

struct ubifs_dent_node holds directory-entry information; for dentries and inodes see the VFS material.

struct ubifs_data_node is the node that carries actual data.

struct ubifs_trun_node is written into the journal on truncation; it exists only in the journal area.

struct ubifs_pad_node is used for data padding.

struct ubifs_sb_node is the superblock node, recording the superblock information; it exists only in the superblock area.

struct ubifs_mst_node records the master node: the root of the node tree plus other information. It exists only in the master area.

struct ubifs_ref_node is written into the journal area when data is updated; the index tree and the LPT are brought up to date at commit time. It exists only in the journal area.

struct ubifs_idx_node is the header of an idx node (see 《A Brief Introduction to the Design of UBIFS》); it exists only in the main area.

struct ubifs_branch is a branch within an idx node.

struct ubifs_cs_node: cs = commit start, marking the beginning of a commit in the journal; it exists only in the journal area. One commit consists of one ubifs_cs_node and some number of ubifs_ref_nodes.

struct ubifs_orph_node records information in the orphan area (for orphans see 《A Brief Introduction to the Design of UBIFS》).


(5): Wear-levelling

      Before this chapter proper, a word about EBA. What is EBA? Eraseblock Association.

It came up briefly when the eba_tbl member of struct ubi_volume was introduced: whenever the file system needs to operate on a logical eraseblock (LEB), it looks up in the corresponding volume's eba_tbl which physical eraseblock (PEB) that LEB maps to.

       The two most important EBA operations are map and unmap. The UBI sources have no dedicated map function, though; mapping is folded into ubi_eba_write_leb. Here is the code:

int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,

                    const void *buf, int offset, int len, int dtype)

{

       int err, pnum, tries = 0, vol_id = vol->vol_id;

       struct ubi_vid_hdr *vid_hdr;

       if (ubi->ro_mode)

              return -EROFS;

       err = leb_write_lock(ubi, vol_id, lnum);

       if (err)

              return err;

Look up the LEB-to-PEB mapping in the volume's eba_tbl; a non-negative pnum means the LEB is already mapped

 

       pnum = vol->eba_tbl[lnum];

       if (pnum >= 0) {

              dbg_eba("write %d bytes at offset %d of LEB %d:%d, PEB %d",

                     len, offset, vol_id, lnum, pnum);

 

              err = ubi_io_write_data(ubi, buf, pnum, offset, len);

              if (err) {

                     ubi_warn("failed to write data to PEB %d", pnum);

                     if (err == -EIO && ubi->bad_allowed)

                            err = recover_peb(ubi, pnum, vol_id, lnum, buf,

                                           offset, len);

                     if (err)

                            ubi_ro_mode(ubi);

              }

              leb_write_unlock(ubi, vol_id, lnum);

              return err;

       }

 

       /*

        * The logical eraseblock is not mapped. We have to get a free physical

        * eraseblock and write the volume identifier header there first.

        */

       vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS);

       if (!vid_hdr) {

              leb_write_unlock(ubi, vol_id, lnum);

              return -ENOMEM;

       }

 

       vid_hdr->vol_type = UBI_VID_DYNAMIC;

       vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));

       vid_hdr->vol_id = cpu_to_be32(vol_id);

       vid_hdr->lnum = cpu_to_be32(lnum);

       vid_hdr->compat = ubi_get_compat(ubi, vol_id);

       vid_hdr->data_pad = cpu_to_be32(vol->data_pad);

retry:

The code above is straightforward and not the point here.

ubi_wl_get_peb obtains a free PEB from the WL subsystem, the volume's eba_tbl is modified, and with that the map operation is complete — easy!

 

       pnum = ubi_wl_get_peb(ubi, dtype);

       if (pnum < 0) {

              ubi_free_vid_hdr(ubi, vid_hdr);

              leb_write_unlock(ubi, vol_id, lnum);

              return pnum;

       }

       dbg_eba("write VID hdr and %d bytes at offset %d of LEB %d:%d, PEB %d",

              len, offset, vol_id, lnum, pnum);

       err = ubi_io_write_vid_hdr(ubi, pnum, vid_hdr);

       if (err) {

              ubi_warn("failed to write VID header to LEB %d:%d, PEB %d",

                      vol_id, lnum, pnum);

              goto write_error;

       }

       if (len) {

              err = ubi_io_write_data(ubi, buf, pnum, offset, len);

              if (err) {

                     ubi_warn("failed to write %d bytes at offset %d of "

                             "LEB %d:%d, PEB %d", len, offset, vol_id,

                             lnum, pnum);

                     goto write_error;

              }

       }

       vol->eba_tbl[lnum] = pnum;

       leb_write_unlock(ubi, vol_id, lnum);

       ubi_free_vid_hdr(ubi, vid_hdr);

       return 0;

 

write_error:

       if (err != -EIO || !ubi->bad_allowed) {

              ubi_ro_mode(ubi);

              leb_write_unlock(ubi, vol_id, lnum);

              ubi_free_vid_hdr(ubi, vid_hdr);

              return err;

       }

       /*

        * Fortunately, this is the first write operation to this physical

        * eraseblock, so just put it and request a new one. We assume that if

        * this physical eraseblock went bad, the erase code will handle that.

        */

       err = ubi_wl_put_peb(ubi, pnum, 1);

       if (err || ++tries > UBI_IO_RETRIES) {

              ubi_ro_mode(ubi);

              leb_write_unlock(ubi, vol_id, lnum);

              ubi_free_vid_hdr(ubi, vid_hdr);

              return err;

       }

       vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));

       ubi_msg("try another PEB");

       goto retry;

}

Next, the unmap path:

int ubi_eba_unmap_leb(struct ubi_device *ubi, struct ubi_volume *vol,

                    int lnum)

{

       int err, pnum, vol_id = vol->vol_id;

       if (ubi->ro_mode)

              return -EROFS;

       err = leb_write_lock(ubi, vol_id, lnum);

       if (err)

              return err;

Again the first step is to consult vol->eba_tbl; if the entry is negative, the block we want to unmap was never mapped, so there is nothing to do

       pnum = vol->eba_tbl[lnum];

       if (pnum < 0)

              /* This logical eraseblock is already unmapped */

              goto out_unlock;

       dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum);

Otherwise the value obtained is a PEB number, and the eba_tbl entry is reset to UBI_LEB_UNMAPPED

 

       vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED;

As noted above, map obtains a PEB from the WL subsystem; now that the LEB is unmapped, the PEB must be handed back to the WL subsystem and erased, which ubi_wl_put_peb takes care of

 

       err = ubi_wl_put_peb(ubi, pnum, 0);

out_unlock:

       leb_write_unlock(ubi, vol_id, lnum);

       return err;

}

This example shows that in UBI every PEB is obtained from the WL subsystem, and every PEB that is released goes back to the WL subsystem. WL is everywhere: any operation that uses an eraseblock necessarily involves the WL subsystem.
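To make the pairing explicit, here is a minimal fragment (not verbatim kernel code) of the borrow-and-return protocol every PEB user follows, built only from the two WL entry points seen above:

       int pnum, err;

       /* Borrow a PEB from the WL subsystem; dtype hints at the expected
        * lifetime of the data (UBI_LONGTERM/UBI_SHORTTERM/UBI_UNKNOWN). */
       pnum = ubi_wl_get_peb(ubi, UBI_UNKNOWN);
       if (pnum < 0)
              return pnum;        /* no free PEB, or an internal error */

       /* ... write a VID header and the data to @pnum ... */

       /* Return the PEB to the WL subsystem; it is queued for erasure by
        * the background thread. A non-zero last argument would request
        * "torture" testing of the PEB. */
       err = ubi_wl_put_peb(ubi, pnum, 0);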

       The main WL data structure involved is the following:

struct ubi_wl_entry {

       union {

              struct rb_node rb;

              struct list_head list;

       } u;

       int ec;

       int pnum;

};

This structure shows that the WL subsystem operates on real physical eraseblocks. The other field of interest is ec, the erase counter from the EC header, which is the basis for all WL decisions.

The union u shows that the WL subsystem manages these entries in red-black trees (or, alternatively, on lists). The RB-tree helpers are only summarized below rather than walked through in source form:

static void wl_tree_add(struct ubi_wl_entry *e, struct rb_root *root) adds e to the RB tree rooted at root.

static int in_wl_tree(struct ubi_wl_entry *e, struct rb_root *root) tests whether e is present in the RB tree rooted at root.

static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max) finds, in the RB tree rooted at root, the PEB whose erase counter approaches max from below as closely as possible.
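Since the tree is keyed by the erase counter, find_wl_entry() amounts to an ordinary bounded descent of the tree. A sketch of the search logic (paraphrased, not verbatim kernel code; note that in some kernel versions the max argument is taken relative to the lowest EC in the tree):

static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)
{
       struct rb_node *p = root->rb_node;
       /* Callers guarantee the tree is non-empty; start with the
        * least-worn entry as the fallback candidate. */
       struct ubi_wl_entry *e = rb_entry(rb_first(root),
                                         struct ubi_wl_entry, u.rb);

       while (p) {
              struct ubi_wl_entry *e1;

              e1 = rb_entry(p, struct ubi_wl_entry, u.rb);
              if (e1->ec >= max)
                     p = p->rb_left;    /* too worn, look at smaller ECs */
              else {
                     e = e1;            /* acceptable, try to get closer to max */
                     p = p->rb_right;
              }
       }
       return e;
}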

So what exactly is WL for? As touched on above, it manages eraseblocks according to their EC values, so that no block is erased so much more often than the others that it wears out and goes bad prematurely. Suppose WL notices that some eraseblock's EC value has become abnormally large (the EC value grows with every erase). Is a block with such a high EC still usable? Yes, it is.

include/mtd/ubi-user.h contains the following enum:

enum {

       UBI_LONGTERM  = 1,

       UBI_SHORTTERM = 2,

       UBI_UNKNOWN   = 3,

};

It defines three flags describing the type of data; as the names suggest, their purpose is to state whether the data is to be kept long-term or short-term.

ubi_wl_get_peb函数中有这样的一段代码:

       case UBI_LONGTERM:

              e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

              break;

When we request a PEB for long-term data, we deliberately take one with a relatively high EC value, i.e. one that has already been erased many times. Since long-term data is rarely rewritten, the worn block gets to rest, which makes the best use of it.
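For comparison, the other dtype cases pick differently. A sketch of the selection policy in ubi_wl_get_peb() (paraphrased; the exact code differs between kernel versions):

       switch (dtype) {
       case UBI_LONGTERM:
              /* Long-lived data: pick a well-worn free PEB (high EC). */
              e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
              break;
       case UBI_SHORTTERM:
              /* Short-lived data: pick the least-worn free PEB, since it
               * will be erased again soon anyway. */
              e = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry, u.rb);
              break;
       case UBI_UNKNOWN:
              /* Unknown lifetime: pick a PEB with a medium erase counter;
               * the tree root is a cheap approximation of the median. */
              e = rb_entry(ubi->free.rb_node, struct ubi_wl_entry, u.rb);
              break;
       }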

According to ubidesign: "UBI select a long term storage block with a low erase count and copies the block contents to the block with the high erase count using the block moving function." However, the source code that follows does not seem to take this long-term aspect into account (or did I simply miss it somewhere?).

The function ensure_wear_leveling() is what decides whether the situation described above has arisen:

static int ensure_wear_leveling(struct ubi_device *ubi)

{

       int err = 0;

       struct ubi_wl_entry *e1;

       struct ubi_wl_entry *e2;

       struct ubi_work *wrk;

       spin_lock(&ubi->wl_lock);

       //If wear-leveling is already in the work queue, there is nothing to decide: whatever we compute here, the WL work is going to run anyway, and re-checking would only disturb the pending work. So simply do nothing.

       if (ubi->wl_scheduled)

              /* Wear-leveling is already in the work queue */

              goto out_unlock;

       /*

        * If the ubi->scrub tree is not empty, scrubbing is needed, and the

        * the WL worker has to be scheduled anyway.

        */

@Case 1: there are no used eraseblocks, i.e. the UBI device has just been attached and holds no data yet.

@Case 2: there are no free eraseblocks. As said above, WL moves the data of one eraseblock into another; with no free target block, the work cannot proceed.

       if (!ubi->scrub.rb_node) {//where do the nodes of this tree come from, i.e. under what circumstances are entries added? (see the note at the end of this section)

              if (!ubi->used.rb_node || !ubi->free.rb_node)

                     /* No physical eraseblocks - no deal */

                     goto out_unlock;

 

              /*

               * We schedule wear-leveling only if the difference between the

               * lowest erase counter of used physical eraseblocks and a high

               * erase counter of free physical eraseblocks is greater than

               * %UBI_WL_THRESHOLD.

               */

As said above, WL moves the data of a used eraseblock into an unused one. So it takes an entry with a very low EC from the used tree (although, according to the design document, this should arguably be a low-EC block holding UBI_LONGTERM data) and then an entry with a high EC from the free tree.

              e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);

              e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

              if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))

                     goto out_unlock;

              dbg_wl("schedule wear-leveling");

       } else

              dbg_wl("schedule scrubbing");

       ubi->wl_scheduled = 1;//note that the wl_scheduled flag is set here

       spin_unlock(&ubi->wl_lock);

       wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);

       if (!wrk) {

              err = -ENOMEM;

              goto out_cancel;

       }

       //Build a worker and add it to the queue; the background thread will run it. The actual work is done by wear_leveling_worker().

       wrk->func = &wear_leveling_worker;

       schedule_ubi_work(ubi, wrk);

       return err;

out_cancel:

       spin_lock(&ubi->wl_lock);

       ubi->wl_scheduled = 0;

out_unlock:

       spin_unlock(&ubi->wl_lock);

       return err;

}
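As an aside, queuing the work item is itself a small operation: the worker is appended to the ubi->works list and the background thread is woken up. A sketch of schedule_ubi_work() (paraphrased; exact code varies between kernel versions):

static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
{
       spin_lock(&ubi->wl_lock);
       list_add_tail(&wrk->list, &ubi->works);    /* enqueue the work item */
       ubi->works_count += 1;
       if (ubi->thread_enabled)
              wake_up_process(ubi->bgt_thread);   /* kick the background thread */
       spin_unlock(&ubi->wl_lock);
}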

Now let us look at the actual work done by wear_leveling_worker():

static int wear_leveling_worker(struct ubi_device *ubi, struct ubi_work *wrk,

                            int cancel)

{

       int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0;

       int vol_id = -1, uninitialized_var(lnum);

       struct ubi_wl_entry *e1, *e2;

       struct ubi_vid_hdr *vid_hdr;

       kfree(wrk);

       if (cancel)

              return 0;

       //Allocate a VID header: while the data is being copied, a new VID header has to be written to the target PEB.

       vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS);

       if (!vid_hdr)

              return -ENOMEM;

       mutex_lock(&ubi->move_mutex);

       spin_lock(&ubi->wl_lock);

       ubi_assert(!ubi->move_from && !ubi->move_to);

       ubi_assert(!ubi->move_to_put);

@The English comment below already explains this well: if there are no free PEBs, that is fine, we can simply wait for a pending erase_worker to produce one. But if there is nothing in the scrub tree either, nothing can be done, and this WL pass is cancelled.

@No used PEBs? Look at ubi_wl_get_peb(), which contains:

       rb_erase(&e->u.rb, &ubi->free);

       prot_queue_add(ubi, e);

while ubi_wl_put_peb() contains:

       prot_queue_del(ubi, e->pnum);

Similar operations presumably exist in other places such as erase_worker as well. In other words, UBI temporarily removes a PEB that is currently being operated on from its tree and protects it in the ubi->pq queue.
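As a further aside, the protection queue itself is just an array of list heads indexed by a rotating head position. A minimal sketch of prot_queue_add() (paraphrased from wl.c; details may vary between kernel versions):

static void prot_queue_add(struct ubi_device *ubi, struct ubi_wl_entry *e)
{
       int pq_tail = ubi->pq_head - 1;

       if (pq_tail < 0)
              pq_tail = UBI_PROT_QUEUE_LEN - 1;   /* wrap around */
       /* The entry lingers here for a while before being moved back to
        * the used tree, so WL will not touch it in the meantime. */
       list_add_tail(&e->u.list, &ubi->pq[pq_tail]);
}

Back in wear_leveling_worker():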

       if (!ubi->free.rb_node ||

           (!ubi->used.rb_node && !ubi->scrub.rb_node)) {

              /*

               * No free physical eraseblocks? Well, they must be waiting in

               * the queue to be erased. Cancel movement - it will be

               * triggered again when a free physical eraseblock appears.

               *

               * No used physical eraseblocks? They must be temporarily

               * protected from being moved. They will be moved to the

               * @ubi->used tree later and the wear-leveling will be

               * triggered again.

               */

              dbg_wl("cancel WL, a list is empty: free %d, used %d",

                     !ubi->free.rb_node, !ubi->used.rb_node);

              goto out_cancel;

       }

       if (!ubi->scrub.rb_node) {

              /*

               * Now pick the least worn-out used physical eraseblock and a

               * highly worn-out free physical eraseblock. If the erase

               * counters differ much enough, start wear-leveling.

               */

              e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);

              //The scrub tree is empty, so the target PEB for the WL operation is taken from the free tree (the entry whose EC approaches WL_FREE_MAX_DIFF from below):

              e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

 

              if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) {

                     dbg_wl("no WL needed: min used EC %d, max free EC %d",

                           e1->ec, e2->ec);

                     goto out_cancel;

              }

              paranoid_check_in_wl_tree(e1, &ubi->used);

               // rb_erase() is a basic red-black tree removal operation (lib/rbtree.c). The data of e1 is about to be moved away, so e1 must be removed from the ubi->used tree.

              rb_erase(&e1->u.rb, &ubi->used);

              dbg_wl("move PEB %d EC %d to PEB %d EC %d",

                     e1->pnum, e1->ec, e2->pnum, e2->ec);

       } else {

              /* Perform scrubbing */

              scrubbing = 1;

//Note that when e1 is taken from the scrub tree here, there is no EC comparison like if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) above. Why? Because PEBs end up in the scrub tree after bit-flips were seen while reading them, so their data must be moved regardless of wear.

              e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb);

              e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);

              paranoid_check_in_wl_tree(e1, &ubi->scrub);

              rb_erase(&e1->u.rb, &ubi->scrub);

              dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum);

       }

       paranoid_check_in_wl_tree(e2, &ubi->free);

       rb_erase(&e2->u.rb, &ubi->free);

       //Note: these two pointers are cleared again once the data has been moved.

       ubi->move_from = e1;

       ubi->move_to = e2;

       spin_unlock(&ubi->wl_lock);

       /*

        * Now we are going to copy physical eraseblock @e1->pnum to @e2->pnum.

        * We so far do not know which logical eraseblock our physical

        * eraseblock (@e1) belongs to. We have to read the volume identifier

        * header first.

        *

        * Note, we are protected from this PEB being unmapped and erased. The

        * 'ubi_wl_put_peb()' would wait for moving to be finished if the PEB

        * which is being moved was unmapped.

        */

       err = ubi_io_read_vid_hdr(ubi, e1->pnum, vid_hdr, 0);

       if (err && err != UBI_IO_BITFLIPS) {

              if (err == UBI_IO_PEB_FREE) {

                     /*

                      * We are trying to move PEB without a VID header. UBI

                      * always write VID headers shortly after the PEB was

                      * given, so we have a situation when it has not yet

                      * had a chance to write it, because it was preempted.

                      * So add this PEB to the protection queue so far,

                      * because presumably more data will be written there

                      * (including the missing VID header), and then we'll

                      * move it.

                      */

                     //Check the VID header further: just because the PEB came from the used tree does not mean its data can be blindly copied over; something may have gone wrong earlier. If the PEB to be moved turns out to be empty, there is nothing worth moving.

                     dbg_wl("PEB %d has no VID header", e1->pnum);

                     protect = 1;

                     goto out_not_moved;

              }

              ubi_err("error %d while reading VID header from PEB %d",

                     err, e1->pnum);

              goto out_error;

       }

       vol_id = be32_to_cpu(vid_hdr->vol_id);

       lnum = be32_to_cpu(vid_hdr->lnum);

       //The actual data transfer is done by ubi_eba_copy_leb(); its implementation is fairly simple and is not repeated here.

       err = ubi_eba_copy_leb(ubi, e1->pnum, e2->pnum, vid_hdr);

       if (err) {

              if (err == MOVE_CANCEL_RACE) {

                     /*

                      * The LEB has not been moved because the volume is

                      * being deleted or the PEB has been put meanwhile. We

                      * should prevent this PEB from being selected for

                      * wear-leveling movement again, so put it to the

                      * protection queue.

                      */

                     protect = 1;

                     goto out_not_moved;

              }

              if (err == MOVE_CANCEL_BITFLIPS || err == MOVE_TARGET_WR_ERR ||

                  err == MOVE_TARGET_RD_ERR) {

                     /*

                      * Target PEB had bit-flips or write error - torture it.

                      */

                     torture = 1;

                     goto out_not_moved;

              }

              if (err == MOVE_SOURCE_RD_ERR) {

                     /*

                      * An error happened while reading the source PEB. Do

                      * not switch to R/O mode in this case, and give the

                      * upper layers a possibility to recover from this,

                      * e.g. by unmapping corresponding LEB. Instead, just

                      * put this PEB to the @ubi->erroneous list to prevent

                      * UBI from trying to move it over and over again.

                      */

                     if (ubi->erroneous_peb_count > ubi->max_erroneous) {

                            ubi_err("too many erroneous eraseblocks (%d)",

                                   ubi->erroneous_peb_count);

                            goto out_error;

                     }

                     erroneous = 1;

                     goto out_not_moved;

              }

              if (err < 0)

                     goto out_error;

              ubi_assert(0);

       }

       /* The PEB has been successfully moved */

       if (scrubbing)

              ubi_msg("scrubbed PEB %d (LEB %d:%d), data moved to PEB %d",

                     e1->pnum, vol_id, lnum, e2->pnum);

       ubi_free_vid_hdr(ubi, vid_hdr);

       spin_lock(&ubi->wl_lock);

       if (!ubi->move_to_put) {

              wl_tree_add(e2, &ubi->used);

              e2 = NULL;

       }

       ubi->move_from = ubi->move_to = NULL;

       ubi->move_to_put = ubi->wl_scheduled = 0;

       spin_unlock(&ubi->wl_lock);

       //e1 is erased by the background thread via erase_worker:

       err = schedule_erase(ubi, e1, 0);

       if (err) {

              kmem_cache_free(ubi_wl_entry_slab, e1);

              if (e2)

                     kmem_cache_free(ubi_wl_entry_slab, e2);

              goto out_ro;

       }

       if (e2) {

              /*

               * Well, the target PEB was put meanwhile, schedule it for

               * erasure.

               */

              dbg_wl("PEB %d (LEB %d:%d) was put meanwhile, erase",

                     e2->pnum, vol_id, lnum);

              err = schedule_erase(ubi, e2, 0);

              if (err) {

                     kmem_cache_free(ubi_wl_entry_slab, e2);

                     goto out_ro;

              }

       }

 

       dbg_wl("done");

       mutex_unlock(&ubi->move_mutex);

       return 0;

 

       /*

        * For some reasons the LEB was not moved, might be an error, might be

        * something else. @e1 was not changed, so return it back. @e2 might

        * have been changed, schedule it for erasure.

        */

out_not_moved:

       if (vol_id != -1)

              dbg_wl("cancel moving PEB %d (LEB %d:%d) to PEB %d (%d)",

                     e1->pnum, vol_id, lnum, e2->pnum, err);

       else

              dbg_wl("cancel moving PEB %d to PEB %d (%d)",

                     e1->pnum, e2->pnum, err);

       spin_lock(&ubi->wl_lock);

       if (protect)

              prot_queue_add(ubi, e1);

       else if (erroneous) {

              wl_tree_add(e1, &ubi->erroneous);

              ubi->erroneous_peb_count += 1;

       } else if (scrubbing)

              wl_tree_add(e1, &ubi->scrub);

       else

              wl_tree_add(e1, &ubi->used);

       ubi_assert(!ubi->move_to_put);

       ubi->move_from = ubi->move_to = NULL;

       ubi->wl_scheduled = 0;

       spin_unlock(&ubi->wl_lock);

       ubi_free_vid_hdr(ubi, vid_hdr);

       err = schedule_erase(ubi, e2, torture);

       if (err) {

              kmem_cache_free(ubi_wl_entry_slab, e2);

              goto out_ro;

       }

       mutex_unlock(&ubi->move_mutex);

       return 0;

out_error:

       if (vol_id != -1)

              ubi_err("error %d while moving PEB %d to PEB %d",

                     err, e1->pnum, e2->pnum);

       else

              ubi_err("error %d while moving PEB %d (LEB %d:%d) to PEB %d",

                     err, e1->pnum, vol_id, lnum, e2->pnum);

       spin_lock(&ubi->wl_lock);

       ubi->move_from = ubi->move_to = NULL;

       ubi->move_to_put = ubi->wl_scheduled = 0;

       spin_unlock(&ubi->wl_lock);

       ubi_free_vid_hdr(ubi, vid_hdr);

       kmem_cache_free(ubi_wl_entry_slab, e1);

       kmem_cache_free(ubi_wl_entry_slab, e2);

out_ro:

       ubi_ro_mode(ubi);

       mutex_unlock(&ubi->move_mutex);

       ubi_assert(err != 0);

       return err < 0 ? err : -EIO;

out_cancel:

       ubi->wl_scheduled = 0;

       spin_unlock(&ubi->wl_lock);

       mutex_unlock(&ubi->move_mutex);

       ubi_free_vid_hdr(ubi, vid_hdr);

       return 0;

}

This completes the overview of WL. The main code lives in drivers/mtd/ubi/wl.c.

So under what circumstances does UBI call ensure_wear_leveling() to decide whether WL is needed? It is called from:

1.      erase_worker

2.      ubi_wl_scrub_peb

3.      ubi_wl_init_scan

One point about WL deserves a closer look: the question raised above, namely where do the nodes in ubi->scrub come from?

ubi_eba_read_leb函数,当发生BIT_FILP的时候,会调用ubi_wl_scrub_peb来进行WL

And as seen in ensure_wear_leveling() above, WL looks at the ubi->scrub tree first.
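To close the loop, here is a sketch of how the scrubbing trigger looks inside ubi_eba_read_leb() (paraphrased, not verbatim kernel code):

       err = ubi_io_read_data(ubi, buf, pnum, offset, len);
       if (err == UBI_IO_BITFLIPS) {
              /* ECC corrected the data, but the PEB is degrading:
               * remember to scrub it and report success to the caller. */
              scrub = 1;
              err = 0;
       }

       /* ... the rest of the read path ... */

       if (scrub)
              err = ubi_wl_scrub_peb(ubi, pnum);  /* put the PEB on ubi->scrub */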
