在Nand Flash上创建UBIFS格式的文件系统,用于存储系统的历史数据。
在实际调试过程中,发现UBIFS文件系统会产生错误,输出信息如下:
[685108.022234] nand_erase_nand: attempt to erase a bad block at page 0x00008580
[685108.050087] nand_erase_nand: attempt to erase a bad block at page 0x00008580
[685108.070232] nand_erase_nand: attempt to erase a bad block at page 0x00008580
[685108.090388] nand_erase_nand: attempt to erase a bad block at page 0x00008580
[685108.097944] UBI error: do_sync_erase: cannot erase PEB 534, error -5
[685108.104750] UBI error: erase_worker: failed to erase PEB 534, error -5
[685108.111725] UBI: mark PEB 534 as bad
[685108.115647] UBI: 17 PEBs left in the reserve
[687322.014321] UBI: scrubbed PEB 1272 (LEB 0:1453), data moved to PEB 1855
[687774.705119] UBI error: ubi_io_read: error -74 (ECC error) while reading 126976 bytes from PEB 456:4096, read 126976 bytes
[687774.733301] UBIFS error (pid 10861): ubifs_check_node: bad CRC: calculated 0x7845a4da, read 0x907d6ad0
[687774.743302] UBIFS error (pid 10861): ubifs_check_node: bad node at LEB 1769:113376
[687774.751381] magic 0x6101831
[687774.751389] crc 0x907d6ad0
[687774.751398] node_type 1 (data node)
[687774.751407] group_type 0 (no node group)
[687774.751417] sqnum 64559590
[687774.751424] len 1667
[687774.751438] key (24037, data, 1463)
[687774.751448] size 4096
[687774.751455] compr_typ 1
[687774.751463] data size 1619
[687774.751469] data:
[687774.751486] 00000000: 0a 61 61 39 0a 30 30 31 65 37 61 61 63 0a c1 01 66 c2 01 62 32 e1 01 35 e1 01 38 e1 01 62 e1 01
[687774.942198] UBIFS error (pid 10861): ubifs_scan: bad node
[687774.947989] UBIFS error (pid 10861): ubifs_scanned_corruption: corruption at LEB 1769:113376
[687774.956997] UBIFS error (pid 10861): ubifs_scanned_corruption: first 8192 bytes from LEB 1769:113376
[687774.966778] 00000000: 06101831 907d6ad0 03d919e6 00000000 00000683 00000001 00005de5 200005b7 1....j}..................].....
[687774.973358] UBIFS error (pid 10861): ubifs_scan: LEB 1769 scanning failed
[687774.980645] UBIFS warning (pid 10861): ubifs_ro_mode: switched to read-only mode, error -117
[687775.162137] UBIFS error (pid 10861): ubifs_budget_space: cannot budget space, error -117
[687804.722203] UBIFS error (pid 11722): make_reservation: cannot reserve 160 bytes in jhead 1, error -30
[687804.732084] UBIFS error (pid 11722): ubifs_write_inode: can't write inode 74, error -30
初步分析为Nand Flash数据写入错误。为复现该问题,编写测试程序对nand flash进行多任务的数据读写操作,代码如下:
nand_test.c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
/*
 * Parameters of the stress test: every pass writes FILE_COUNT files of
 * RECORD_COUNT fixed-size records, reads them back, then removes them.
 */
enum {
    FILE_COUNT = 40,            /* files handled per pass */
    FILE_INDEX_BASE = 0,        /* first index used in the file name */
    RECORD_COUNT = 1024 * 1024, /* records per file */
    RECORD_LEN = 9              /* 8 hex digits plus '\n' */
};

/* Build the path of the test file with the given loop index. */
static void make_test_path(char *path, size_t size, int index)
{
    snprintf(path, size, "/data/test%d.dat", index + FILE_INDEX_BASE);
}

/* Format record number j of the file whose data seed is `seed`. */
static void format_record(char *record, size_t size, int seed, int j)
{
    /* %x expects an unsigned argument; the product stays well below INT_MAX. */
    snprintf(record, size, "%08x\n", (unsigned)(seed * j));
}

/*
 * Write all records of one test file and force them to the storage device.
 * Returns 0 on success, -1 if opening, writing, syncing or closing failed.
 */
static int write_test_file(const char *path, int seed)
{
    char record[16];
    int ok = 1;
    FILE *fp = fopen(path, "wb");

    if (!fp) {
        printf("Cannot open %s\n", path);
        return -1;
    }

    for (int j = 0; j < RECORD_COUNT; j++) {
        format_record(record, sizeof record, seed, j);
        if (fwrite(record, RECORD_LEN, 1, fp) != 1) {
            printf("Write value to file:%s failed \n", path);
            ok = 0;
            break;
        }
    }

    /* Buffered data is only written here, so errors may surface this late. */
    if (fflush(fp) != 0 || fsync(fileno(fp)) != 0) {
        printf("Sync file:%s failed \n", path);
        ok = 0;
    }
    if (fclose(fp) != 0) {
        printf("Close file:%s failed \n", path);
        ok = 0;
    }

    return ok ? 0 : -1;
}

/*
 * Read one test file back and compare every record with the expected value.
 * Each mismatching record is reported; a short read aborts the file.
 * Returns 0 if the whole file matched, -1 otherwise.
 */
static int verify_test_file(const char *path, int seed)
{
    char expected[16];
    char actual[16];
    int ok = 1;
    /* Read-only open: verification must still work on a read-only mount. */
    FILE *fp = fopen(path, "rb");

    if (!fp) {
        printf("Cannot open %s\n", path);
        return -1;
    }

    for (int j = 0; j < RECORD_COUNT; j++) {
        if (fread(actual, RECORD_LEN, 1, fp) != 1) {
            printf("read file:%s failed \n", path);
            ok = 0;
            break;
        }
        format_record(expected, sizeof expected, seed, j);
        if (memcmp(actual, expected, RECORD_LEN) != 0) {
            printf("check file:%s line %d failed \n", path, j);
            ok = 0;
        }
    }

    fclose(fp);
    return ok ? 0 : -1;
}

/*
 * Endless stress test: write, verify and delete FILE_COUNT files per pass.
 * A pass is reported as successful only if no step of it failed.
 */
int main(void)
{
    char path[64];

    while (1) {
        int failures = 0;

        for (int k = 0; k < FILE_COUNT; k++) {
            make_test_path(path, sizeof path, k);
            if (write_test_file(path, k) != 0)
                failures++;
        }

        for (int k = 0; k < FILE_COUNT; k++) {
            make_test_path(path, sizeof path, k);
            if (verify_test_file(path, k) != 0)
                failures++;
        }

        /* Always remove every file so a failed pass cannot leave stale data. */
        for (int k = FILE_COUNT - 1; k >= 0; k--) {
            make_test_path(path, sizeof path, k);
            if (remove(path) != 0) {
                printf("Cannot remove %s\n", path);
                failures++;
            }
        }

        if (failures == 0)
            printf("nand test success!~ \n");
        else
            printf("nand test failed: %d error(s) \n", failures);
    }
    return 0;
}
nand_test1.c
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
/*
 * Parameters of the stress test: every pass writes FILE_COUNT files of
 * RECORD_COUNT fixed-size records, reads them back, then removes them.
 * The file names start at index FILE_INDEX_BASE.
 */
enum {
    FILE_COUNT = 10,            /* files handled per pass */
    FILE_INDEX_BASE = 40,       /* first index used in the file name */
    RECORD_COUNT = 1024 * 1024, /* records per file */
    RECORD_LEN = 9              /* 8 hex digits plus '\n' */
};

/* Build the path of the test file with the given loop index. */
static void make_test_path(char *path, size_t size, int index)
{
    snprintf(path, size, "/data/test%d.dat", index + FILE_INDEX_BASE);
}

/* Format record number j of the file whose data seed is `seed`. */
static void format_record(char *record, size_t size, int seed, int j)
{
    /* %x expects an unsigned argument; the product stays well below INT_MAX. */
    snprintf(record, size, "%08x\n", (unsigned)(seed * j));
}

/*
 * Write all records of one test file and force them to the storage device.
 * Returns 0 on success, -1 if opening, writing, syncing or closing failed.
 */
static int write_test_file(const char *path, int seed)
{
    char record[16];
    int ok = 1;
    FILE *fp = fopen(path, "wb");

    if (!fp) {
        printf("Cannot open %s\n", path);
        return -1;
    }

    for (int j = 0; j < RECORD_COUNT; j++) {
        format_record(record, sizeof record, seed, j);
        if (fwrite(record, RECORD_LEN, 1, fp) != 1) {
            printf("Write value to file:%s failed \n", path);
            ok = 0;
            break;
        }
    }

    /* Buffered data is only written here, so errors may surface this late. */
    if (fflush(fp) != 0 || fsync(fileno(fp)) != 0) {
        printf("Sync file:%s failed \n", path);
        ok = 0;
    }
    if (fclose(fp) != 0) {
        printf("Close file:%s failed \n", path);
        ok = 0;
    }

    return ok ? 0 : -1;
}

/*
 * Read one test file back and compare every record with the expected value.
 * Each mismatching record is reported; a short read aborts the file.
 * Returns 0 if the whole file matched, -1 otherwise.
 */
static int verify_test_file(const char *path, int seed)
{
    char expected[16];
    char actual[16];
    int ok = 1;
    /* Read-only open: verification must still work on a read-only mount. */
    FILE *fp = fopen(path, "rb");

    if (!fp) {
        printf("Cannot open %s\n", path);
        return -1;
    }

    for (int j = 0; j < RECORD_COUNT; j++) {
        if (fread(actual, RECORD_LEN, 1, fp) != 1) {
            printf("read file:%s failed \n", path);
            ok = 0;
            break;
        }
        format_record(expected, sizeof expected, seed, j);
        if (memcmp(actual, expected, RECORD_LEN) != 0) {
            printf("check file:%s line %d failed \n", path, j);
            ok = 0;
        }
    }

    fclose(fp);
    return ok ? 0 : -1;
}

/*
 * Endless stress test: write, verify and delete FILE_COUNT files per pass.
 * The data seed is the loop index k, while the file name uses
 * k + FILE_INDEX_BASE. A pass is reported as successful only if no step
 * of it failed.
 */
int main(void)
{
    char path[64];

    while (1) {
        int failures = 0;

        for (int k = 0; k < FILE_COUNT; k++) {
            make_test_path(path, sizeof path, k);
            if (write_test_file(path, k) != 0)
                failures++;
        }

        for (int k = 0; k < FILE_COUNT; k++) {
            make_test_path(path, sizeof path, k);
            if (verify_test_file(path, k) != 0)
                failures++;
        }

        /* Always remove every file so a failed pass cannot leave stale data. */
        for (int k = FILE_COUNT - 1; k >= 0; k--) {
            make_test_path(path, sizeof path, k);
            if (remove(path) != 0) {
                printf("Cannot remove %s\n", path);
                failures++;
            }
        }

        if (failures == 0)
            printf("nand test 1 success!~ \n");
        else
            printf("nand test 1 failed: %d error(s) \n", failures);
    }
    return 0;
}
测试发现,在多任务操作Nand Flash的过程中,会有比较大的概率导致nand flash错误。
在Linux Kernel的menuconfig中,配置选项 Device Drivers-> Memory Technology Device (MTD) support -> NAND Device Support->Verify NAND page writes 用来配置在nand flash写入时,是否进行额外的校验.
开启Verify NAND page writes后,执行测试程序,输出信息如下:
[270907.433837] UBI error: ubi_io_write: error -5 while writing 2048 bytes to PEB 976:106496, written 0 bytes
[270907.444130] UBI warning: ubi_eba_write_leb: failed to write data to PEB 976
[270907.451584] UBI: recover PEB 976, move data to PEB 1456
[270907.650083] UBI: data was successfully recovered
[270907.656046] UBI: run torture test for PEB 976
[270908.300205] UBI: PEB 976 passed torture test, do not mark it as bad
[271145.827524] UBI error: ubi_io_write: error -5 while writing 2048 bytes to PEB 1048:57344, written 0 bytes
[271145.837783] UBI warning: ubi_eba_write_leb: failed to write data to PEB 1048
[271145.845346] UBI: recover PEB 1048, move data to PEB 1457
[271145.963098] UBI: data was successfully recovered
[271145.968262] UBI: run torture test for PEB 1048
[271146.740653] UBI: PEB 1048 passed torture test, do not mark it as bad
分析可知,开启Verify NAND page writes后,能有效减少nand flash出错导致的程序异常。
在长期稳定性测试中发现,nand flash总是有可能发生读写错误,导致分区被切换为只读状态。由于在实际应用中,该nand flash分区存储的是历史数据信息,所以希望在nand flash发生读写错误后,分区不被切换为只读状态。参照 https://e2e.ti.com/support/embedded/linux/f/354/t/171839 的内容,修改如下:
===================================================================
--- fs/ubifs/scan.c (revision 1897)
+++ fs/ubifs/scan.c (working copy)
@@ -339,7 +339,7 @@
if (!quiet)
ubifs_err("corrupt empty space at LEB %d:%d",
lnum, offs);
- goto corrupted;
+ //goto corrupted;
}
return sleb;
关于nand flash在am335x系列CPU上使用出现的这个问题,ti官方也没有太好的解决方案,具体可以参照ti的官方论坛答复。