GNU系统下内存对齐分配的记录

217 篇文章 29 订阅
98 篇文章 7 订阅

遇到一个内存对齐分配的问题,记录如下:

在 GNU 系统中,malloc 或 realloc 返回的内存块地址都是 8 的倍数(如果是 64 位系统,则为 16 的倍数)。如果你需要更大的粒度,请使用 memalign、valloc 或者 posix_memalign。其中 valloc 和 posix_memalign 在头文件 “stdlib.h” 中声明,memalign 在头文件 “malloc.h” 中声明。

在 GNU 库中,可以使用函数 free 释放 memalign、valloc 和 posix_memalign 返回的内存块。但无法在 BSD 系统中这样使用,而且 BSD 系统中并未提供释放这样的内存块的途径。

函数:void * memalign (size_t boundary, size_t size) 函数 memalign 将分配一个由 size 指定大小,地址是 boundary 的倍数的内存块。参数 boundary 必须是 2 的幂!函数 memalign 可以分配较大的内存块,并且可以为返回的地址指定粒度。

函数:void * valloc (size_t size) 使用函数 valloc 与使用函数 memalign 类似,函数 valloc 的内部实现里,使用页的大小作为对齐长度,使用 memalign 来分配内存。它的实现如下所示:

函数:int posix_memalign (void **memptr, size_t alignment, size_t size);和memalign的主要差别在于函数原型,内部实现流程上,基本一致。

我们可以看一下在musl库中三个函数的实现:

使用时的注意事项

1、alignment 必须是 2 的整数次幂(posix_memalign 还要求它是 sizeof(void *) 的整数倍);如果要申请按PAGE对齐的内存,则需要通过C库的getpagesize函数获取PAGE大小。

而PAGE SIZE的获取更有意思,它最终是由应用加载的时候,从内核中传递出来的:

验证:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>
#include <limits.h>

#define DALLOC_SIZE (1 * 1024 * 1024)

/*
 * Repeatedly allocate DALLOC_SIZE bytes with malloc, memalign,
 * posix_memalign and valloc, printing each returned address so that the
 * alignment produced by the four allocators can be compared with the
 * page size.
 *
 * Returns -1 as soon as any allocation fails; otherwise the loop runs
 * until the process is interrupted.
 */
int main(int argc, char **argv)
{
	void *p = NULL;
	void *p1 = NULL;
	void *p2 = NULL;
	void *p3 = NULL;
	int pagesize = getpagesize();

	printf("%s line %d, pagesize %d.\n", __func__, __LINE__, pagesize);

	while(1)
	{
		printf("==================================================================\n");

		/* Plain malloc: no alignment requested. */
		p = malloc(DALLOC_SIZE);
		if(!p)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		/*
		 * Print the address while the pointer is still valid: the value of
		 * a pointer becomes indeterminate once it has been passed to free().
		 */
		printf("%s line %d malloc success, p=%p.\n", __func__, __LINE__, p);
		free(p);

		/* memalign: page-size alignment requested. */
		p1 = memalign(pagesize, DALLOC_SIZE);
		if(!p1)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		printf("%s line %d malloc success, p1=%p.\n", __func__, __LINE__, p1);
		free(p1);

		/* posix_memalign reports failure through its return value, not through p2. */
		int ret = posix_memalign(&p2, pagesize, DALLOC_SIZE);
		if(ret != 0)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		printf("%s line %d malloc success, p2=%p.\n", __func__, __LINE__, p2);
		free(p2);

		/* valloc: no alignment argument is passed. */
		p3 = valloc(DALLOC_SIZE);
		if(!p3)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		printf("%s line %d malloc success, p3=%p.\n", __func__, __LINE__, p3);
		free(p3);

		printf("==================================================================\n");

		sleep(1);
	}
	return 0;
}

验证:

czl@czl-VirtualBox:~/align$ ./a.out 
main line 16, pagesize 4096.
==================================================================
main line 29 malloc success, p=0x7f4f2360b010.
main line 39 malloc success, p1=0x7f4f2360b000.
main line 49 malloc success, p2=0x7f4f2360b000.
main line 59 malloc success, p3=0x7f4f2360b000.
==================================================================
==================================================================
main line 29 malloc success, p=0x5584f9205670.
main line 39 malloc success, p1=0x5584f9206000.
main line 49 malloc success, p2=0x5584f9206000.
main line 59 malloc success, p3=0x5584f9206000.
==================================================================
==================================================================
main line 29 malloc success, p=0x5584f9205670.
main line 39 malloc success, p1=0x5584f9206000.
main line 49 malloc success, p2=0x5584f9206000.
main line 59 malloc success, p3=0x5584f9206000.
==================================================================
==================================================================
main line 29 malloc success, p=0x5584f9205670.
main line 39 malloc success, p1=0x5584f9206000.
main line 49 malloc success, p2=0x5584f9206000.
main line 59 malloc success, p3=0x5584f9206000.
==================================================================
==================================================================
main line 29 malloc success, p=0x5584f9205670.
main line 39 malloc success, p1=0x5584f9206000.
main line 49 malloc success, p2=0x5584f9206000.
main line 59 malloc success, p3=0x5584f9206000.
==================================================================
==================================================================
main line 29 malloc success, p=0x5584f9205670.
main line 39 malloc success, p1=0x5584f9206000.
main line 49 malloc success, p2=0x5584f9206000.
main line 59 malloc success, p3=0x5584f9206000.
==================================================================
==================================================================
main line 29 malloc success, p=0x5584f9205670.
main line 39 malloc success, p1=0x5584f9206000.
main line 49 malloc success, p2=0x5584f9206000.
main line 59 malloc success, p3=0x5584f9206000.
==================================================================
==================================================================
main line 29 malloc success, p=0x5584f9205670.
main line 39 malloc success, p1=0x5584f9206000.
main line 49 malloc success, p2=0x5584f9206000.
main line 59 malloc success, p3=0x5584f9206000.
==================================================================

2. 和O_DIRECT的关系:

修改代码,增加O_DIRECT模式写的操作,分别作用于四种内存分配上面:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>
#include <limits.h>
#include <string.h>

#define __USE_GNU 1
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <errno.h>

#define DALLOC_SIZE (1 * 1024 * 1024)

/*
 * Write DALLOC_SIZE bytes from the buffer p to ./new.bin through a file
 * descriptor opened with O_DIRECT, then remove the file again.
 *
 * p: start of the buffer to write; at least DALLOC_SIZE bytes are read
 *    from it.
 *
 * Failures are reported on stdout only; nothing is returned. The
 * descriptor is closed and the file unlinked on every path once open()
 * has succeeded, so repeated calls neither leak descriptors nor leave a
 * stale file behind after a failed write.
 */
static void write_odirect_test(unsigned char *p)
{
	int fdno;

	fdno = open("./new.bin", O_DIRECT|O_RDWR|O_CREAT, 0666);
	if(fdno < 0)
	{
		printf("%s line %d, open file failure.\n", __func__, __LINE__);
		return;
	}

	/*
	 * Best-effort preallocation of 100 MiB; mode 1 is FALLOC_FL_KEEP_SIZE.
	 * The result is deliberately not checked.
	 */
	fallocate(fdno, 1, 0, 100*1024*1024);

	if(write(fdno, p, DALLOC_SIZE) != DALLOC_SIZE)
	{
		/* errno is read right after the failing write(), before any other call. */
		printf("%s line %d, write failure, err %s.\n", __func__, __LINE__, strerror(errno));
	}
	else
	{
		fsync(fdno);
	}

	/* Always release the descriptor, including after a failed write. */
	close(fdno);

	if(unlink("./new.bin") < 0)
	{
		printf("%s line %d unlink errpr!\n", __func__, __LINE__);
	}
	return;
}

/*
 * Repeatedly allocate DALLOC_SIZE bytes with malloc, memalign,
 * posix_memalign and valloc, pass each buffer to write_odirect_test(),
 * and print the buffer address.
 *
 * Returns -1 as soon as any allocation fails; otherwise the loop runs
 * until the process is interrupted.
 */
int main(int argc, char **argv)
{
	void *p = NULL;
	void *p1 = NULL;
	void *p2 = NULL;
	void *p3 = NULL;
	int pagesize = getpagesize();

	printf("%s line %d, pagesize %d.\n", __func__, __LINE__, pagesize);

	while(1)
	{
		printf("==================================================================\n");

		/* Plain malloc: no alignment requested. */
		p = malloc(DALLOC_SIZE);
		if(!p)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		write_odirect_test(p);

		/*
		 * Print the address after the write test (same output order as
		 * before) but before free(): a pointer value is indeterminate once
		 * the block has been freed.
		 */
		printf("%s line %d malloc success, p=%p.\n", __func__, __LINE__, p);
		free(p);

		/* memalign: page-size alignment requested. */
		p1 = memalign(pagesize, DALLOC_SIZE);
		if(!p1)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		write_odirect_test(p1);

		printf("%s line %d malloc success, p1=%p.\n", __func__, __LINE__, p1);
		free(p1);

		/* posix_memalign reports failure through its return value, not through p2. */
		int ret = posix_memalign(&p2, pagesize, DALLOC_SIZE);
		if(ret != 0)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		write_odirect_test(p2);

		printf("%s line %d malloc success, p2=%p.\n", __func__, __LINE__, p2);
		free(p2);

		/* valloc: no alignment argument is passed. */
		p3 = valloc(DALLOC_SIZE);
		if(!p3)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		write_odirect_test(p3);

		printf("%s line %d malloc success, p3=%p.\n", __func__, __LINE__, p3);
		free(p3);

		printf("==================================================================\n");

		sleep(1);

	}
	return 0;
}

在UBUNTU上测试,发现打印如下,很明显非对齐(malloc 返回)的缓冲区写失败,其它按页对齐的写都成功。

czl@czl-VirtualBox:~/align$ ./a.out 
main line 52, pagesize 4096.
==================================================================
write_odirect_test line 31, write failure, err Invalid argument.
main line 68 malloc success, p=0x7f435acfd010.
main line 81 malloc success, p1=0x7f435acfd000.
main line 94 malloc success, p2=0x7f435acfd000.
main line 107 malloc success, p3=0x7f435acfd000.
==================================================================
==================================================================
write_odirect_test line 31, write failure, err Invalid argument.
main line 68 malloc success, p=0x56318ac6b670.
main line 81 malloc success, p1=0x56318ac6c000.
main line 94 malloc success, p2=0x56318ac6c000.
main line 107 malloc success, p3=0x56318ac6c000.
==================================================================
==================================================================
write_odirect_test line 31, write failure, err Invalid argument.
main line 68 malloc success, p=0x56318ac6b670.
main line 81 malloc success, p1=0x56318ac6c000.
main line 94 malloc success, p2=0x56318ac6c000.
main line 107 malloc success, p3=0x56318ac6c000.
==================================================================
==================================================================
write_odirect_test line 31, write failure, err Invalid argument.
main line 68 malloc success, p=0x56318ac6b670.
main line 81 malloc success, p1=0x56318ac6c000.
main line 94 malloc success, p2=0x56318ac6c000.
main line 107 malloc success, p3=0x56318ac6c000.
==================================================================
==================================================================
write_odirect_test line 31, write failure, err Invalid argument.
main line 68 malloc success, p=0x56318ac6b670.
main line 81 malloc success, p1=0x56318ac6c000.
main line 94 malloc success, p2=0x56318ac6c000.
main line 107 malloc success, p3=0x56318ac6c000.
==================================================================

进一步用strace追踪,发现这个错误是从系统调用返回的:

​
write(1, "================================"..., 67==================================================================
) = 67
openat(AT_FDCWD, "./new.bin", O_RDWR|O_CREAT|O_DIRECT, 0666) = 7
write(7, "\240\214c<\353\177\0\0\240\214c<\353\177\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = -1 EINVAL (Invalid argument)
write(1, "write_odirect_test line 31, writ"..., 65write_odirect_test line 31, write failure, err Invalid argument.
) = 65
write(1, "main line 68 malloc success, p=0"..., 47main line 68 malloc success, p=0x563b08a0e670.
) = 47
openat(AT_FDCWD, "./new.bin", O_RDWR|O_CREAT|O_DIRECT, 0666) = 8
write(8, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(1, "main line 81 malloc success, p1="..., 48main line 81 malloc success, p1=0x563b08a0f000.
) = 48
openat(AT_FDCWD, "./new.bin", O_RDWR|O_CREAT|O_DIRECT, 0666) = 8
write(8, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(1, "main line 94 malloc success, p2="..., 48main line 94 malloc success, p2=0x563b08a0f000.
) = 48
openat(AT_FDCWD, "./new.bin", O_RDWR|O_CREAT|O_DIRECT, 0666) = 8
write(8, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(1, "main line 107 malloc success, p3"..., 49main line 107 malloc success, p3=0x563b08a0f000.
) = 49
write(1, "================================"..., 67==================================================================
) = 67
write(1, "================================"..., 67==================================================================
) = 67
openat(AT_FDCWD, "./new.bin", O_RDWR|O_CREAT|O_DIRECT, 0666) = 8
write(8, "\240\214c<\353\177\0\0\240\214c<\353\177\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = -1 EINVAL (Invalid argument)
write(1, "write_odirect_test line 31, writ"..., 65write_odirect_test line 31, write failure, err Invalid argument.
) = 65
write(1, "main line 68 malloc success, p=0"..., 47main line 68 malloc success, p=0x563b08a0e670.
) = 47
openat(AT_FDCWD, "./new.bin", O_RDWR|O_CREAT|O_DIRECT, 0666) = 9
write(9, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(1, "main line 81 malloc success, p1="..., 48main line 81 malloc success, p1=0x563b08a0f000.
) = 48
openat(AT_FDCWD, "./new.bin", O_RDWR|O_CREAT|O_DIRECT, 0666) = 9
write(9, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(1, "main line 94 malloc success, p2="..., 48main line 94 malloc success, p2=0x563b08a0f000.
) = 48
openat(AT_FDCWD, "./new.bin", O_RDWR|O_CREAT|O_DIRECT, 0666) = 9
write(9, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
write(1, "main line 107 malloc success, p3"..., 49main line 107 malloc success, p3=0x563b08a0f000.
) = 49
write(1, "================================"..., 67==================================================================
) = 67

​

同样的代码,在Tina上运行试试看,如下LOG,非对齐的BUFFER竟然对O_DIRECT没有影响。

原因未知,下一步打算在虚拟机上看一下上面返回出错的具体原因,再来分析。

root@(none):/mnt/extsd# ./main
main line 71, pagesize 4096.
==================================================================
main line 87 malloc success, p=0xb6da2010.
main line 100 malloc success, p1=0xb6da2000.
main line 113 malloc success, p2=0xb6da2000.
main line 126 malloc success, p3=0xb6da2000.
==================================================================
==================================================================
main line 87 malloc success, p=0xb6da2010.
main line 100 malloc success, p1=0xb6da2000.
main line 113 malloc success, p2=0xb6da2000.
main line 126 malloc success, p3=0xb6da2000.
==================================================================
==================================================================
main line 87 malloc success, p=0xb6da2010.
main line 100 malloc success, p1=0xb6da2000.
main line 113 malloc success, p2=0xb6da2000.
main line 126 malloc success, p3=0xb6da2000.
==================================================================
==================================================================
main line 87 malloc success, p=0xb6da2010.
main line 100 malloc success, p1=0xb6da2000.
main line 113 malloc success, p2=0xb6da2000.
main line 126 malloc success, p3=0xb6da2000.
==================================================================
==================================================================
main line 87 malloc success, p=0xb6da2010.
main line 100 malloc success, p1=0xb6da2000.
main line 113 malloc success, p2=0xb6da2000.
main line 126 malloc success, p3=0xb6da2000.
==================================================================
==================================================================
main line 87 malloc success, p=0xb6da2010.
main line 100 malloc success, p1=0xb6da2000.
main line 113 malloc success, p2=0xb6da2000.
main line 126 malloc success, p3=0xb6da2000.
==================================================================
^C
root@(none):/mnt/extsd#

strace跟踪确实在地址非对齐的情况下,仍然成功写进去了:

root@(none):/mnt/extsd# strace -e trace=open,openat,write ./main
open("/usr/lib/eyesee-mpp/libgcc_s.so.1", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/etc/ld-musl-armhf.path", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = -1 ENOENT (No such file or directory)
open("/lib/libgcc_s.so.1", O_RDONLY|O_LARGEFILE|O_CLOEXEC) = 3
main line 71, pagesize 4096.
==================================================================
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 87 malloc success, p=0xb6df5010.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 100 malloc success, p1=0xb6df5000.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 113 malloc success, p2=0xb6df5000.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 126 malloc success, p3=0xb6df5000.
==================================================================
==================================================================
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 87 malloc success, p=0xb6df5010.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 100 malloc success, p1=0xb6df5000.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 113 malloc success, p2=0xb6df5000.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 126 malloc success, p3=0xb6df5000.
==================================================================
==================================================================
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 87 malloc success, p=0xb6df5010.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 100 malloc success, p1=0xb6df5000.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 113 malloc success, p2=0xb6df5000.
open("./new.bin", O_RDWR|O_CREAT|O_DIRECT|O_LARGEFILE, 0666) = 3
write(3, "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 1048576) = 1048576
main line 126 malloc success, p3=0xb6df5000.
==================================================================
^Cstrace: Process 948 detached

root@(none):/mnt/extsd#

回头给出上面疑问的分析,在内核中加入调试打印,重新编译内核,重启动UBUNTU

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 434cffcc0..c4eea337c 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1190,7 +1190,13 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 			blkbits = blksize_bits(bdev_logical_block_size(bdev));
 		blocksize_mask = (1 << blkbits) - 1;
 		if (align & blocksize_mask)
+		{
+			if(strcmp("a.out", current->comm) == 0)
+			{
+				printk("%s line %d.align 0x%lx.mask 0x%x, blkbits %d\n", __func__, __LINE__, align, blocksize_mask, blkbits);
+			}
 			goto out;
+		}
 	}
 
 	/* watch out for a 0 len io from a tricksy fs */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1513e90fb..298b9ac3d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -226,7 +226,14 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t ret;
 
 	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+	{
+		if(strcmp("a.out", current->comm) == 0)
+		{
+			printk("%s line %d.\n", __func__, __LINE__);
+		}
+
 		return -EIO;
+	}
 
 #ifdef CONFIG_FS_DAX
 	if (IS_DAX(inode))
@@ -235,7 +242,15 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	if (!inode_trylock(inode)) {
 		if (iocb->ki_flags & IOCB_NOWAIT)
+		{
+			if(strcmp("a.out", current->comm) == 0)
+			{
+				printk("%s line %d.\n", __func__, __LINE__);
+			}
+
 			return -EAGAIN;
+		}
+
 		inode_lock(inode);
 	}
 
@@ -268,6 +283,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	ret = __generic_file_write_iter(iocb, from);
+
+	if(strcmp("a.out", current->comm) == 0)
+	{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, ret);
+	}
 	/*
 	 * Unaligned direct AIO must be the only IO in flight. Otherwise
 	 * overlapping aligned IO after unaligned might result in data
@@ -280,10 +300,19 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (ret > 0)
 		ret = generic_write_sync(iocb, ret);
 
+	if(strcmp("a.out", current->comm) == 0)
+	{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, ret);
+	}
+
 	return ret;
 
 out:
 	inode_unlock(inode);
+	if(strcmp("a.out", current->comm) == 0)
+	{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, ret);
+	}
 	return ret;
 }
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 1429d01d8..fd43ed1b4 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3763,9 +3763,17 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 		get_block_func = ext4_dio_get_block_unwritten_async;
 		dio_flags = DIO_LOCKING;
 	}
+	if(strcmp("a.out", current->comm) == 0)
+	{
+		printk("%s line %d.ret %ld.\n", __func__, __LINE__, ret);
+	}
 	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
 				   get_block_func, ext4_end_io_dio, NULL,
 				   dio_flags);
+	if(strcmp("a.out", current->comm) == 0)
+	{
+		printk("%s line %d.ret %ld.\n", __func__, __LINE__, ret);
+	}
 
 	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
diff --git a/fs/read_write.c b/fs/read_write.c
index 7458fccc5..2db56b455 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -490,12 +490,20 @@ static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t
 static ssize_t __vfs_write(struct file *file, const char __user *p,
 			   size_t count, loff_t *pos)
 {
+	ssize_t ret;
 	if (file->f_op->write)
-		return file->f_op->write(file, p, count, pos);
+		ret = file->f_op->write(file, p, count, pos);
 	else if (file->f_op->write_iter)
-		return new_sync_write(file, p, count, pos);
+		ret = new_sync_write(file, p, count, pos);
 	else
-		return -EINVAL;
+		ret = -EINVAL;
+
+	if(strcmp("a.out", current->comm) == 0)
+	{
+		printk("%s line %d. ret = %ld\n", __func__, __LINE__, ret);
+	}
+
+	return ret;
 }
 
 ssize_t __kernel_write(struct file *file, const void *buf, size_t count, loff_t *pos)
diff --git a/mm/filemap.c b/mm/filemap.c
index c10e237cc..2b287f146 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3189,6 +3189,10 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	} else {
 		written = filemap_write_and_wait_range(mapping, pos,
 							pos + write_len - 1);
+		if(strcmp("a.out", current->comm) == 0)
+		{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, written);
+		}
 		if (written)
 			goto out;
 	}
@@ -3208,10 +3212,22 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 	if (written) {
 		if (written == -EBUSY)
 			return 0;
+		if(strcmp("a.out", current->comm) == 0)
+		{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, written);
+		}
 		goto out;
 	}
 
+		if(strcmp("a.out", current->comm) == 0)
+		{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, written);
+		}
 	written = mapping->a_ops->direct_IO(iocb, from);
+		if(strcmp("a.out", current->comm) == 0)
+		{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, written);
+		}
 
 	/*
 	 * Finally, try again to invalidate clean pages which might have been
@@ -3385,16 +3401,34 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	current->backing_dev_info = inode_to_bdi(inode);
 	err = file_remove_privs(file);
 	if (err)
+	{
+		if(strcmp("a.out", current->comm) == 0)
+		{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, err);
+		}
 		goto out;
+	}
 
 	err = file_update_time(file);
 	if (err)
+	{
+		if(strcmp("a.out", current->comm) == 0)
+		{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, err);
+		}
 		goto out;
+	}
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		loff_t pos, endbyte;
 
+
 		written = generic_file_direct_write(iocb, from);
+
+		if(strcmp("a.out", current->comm) == 0)
+		{
+			printk("%s line %d.ret %ld.\n", __func__, __LINE__, written);
+		}
 		/*
 		 * If the write stopped short of completing, fall back to
 		 * buffered writes.  Some filesystems do this for writes to
@@ -3415,6 +3449,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		 */
 		if (unlikely(status < 0)) {
 			err = status;
+			if(strcmp("a.out", current->comm) == 0)
+			{
+				printk("%s line %d.ret %ld.\n", __func__, __LINE__, err);
+			}
 			goto out;
 		}
 		/*
@@ -3435,6 +3473,10 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 			 * We don't know how much we wrote, so just return
 			 * the number of bytes which were direct-written
 			 */
+			if(strcmp("a.out", current->comm) == 0)
+			{
+				printk("%s line %d.ret %ld.\n", __func__, __LINE__, err);
+			}
 		}
 	} else {
 		written = generic_perform_write(file, from, iocb->ki_pos);

应用LOG如下:

DMESG得到LOG如下:

[   31.506164] generic_file_direct_write line 3194.ret 0.
[   31.506166] generic_file_direct_write line 3224.ret 0.
[   31.506171] ext4_direct_IO_write line 3768.ret 0.
[   31.506174] do_blockdev_direct_IO line 1196.align 0x5617a07d2670.mask 0x1ff, blkbits 9
[   31.506176] ext4_direct_IO_write line 3775.ret -22.
[   31.506201] generic_file_direct_write line 3229.ret -22.
[   31.506202] __generic_file_write_iter line 3430.ret -22.
[   31.506203] ext4_file_write_iter line 289.ret -22.
[   31.506205] ext4_file_write_iter line 305.ret -22.
[   31.506206] __vfs_write line 503. ret = -22
[   31.506227] __vfs_write line 503. ret = 65
[   31.506235] __vfs_write line 503. ret = 47
[   32.506385] __vfs_write line 503. ret = 67
[   32.508763] generic_file_direct_write line 3194.ret 0.
[   32.508765] generic_file_direct_write line 3224.ret 0.
[   32.508771] ext4_direct_IO_write line 3768.ret 0.
[   32.508774] do_blockdev_direct_IO line 1196.align 0x5617a07d2670.mask 0x1ff, blkbits 9
[   32.508776] ext4_direct_IO_write line 3775.ret -22.
[   32.508803] generic_file_direct_write line 3229.ret -22.
[   32.508804] __generic_file_write_iter line 3430.ret -22.
[   32.508806] ext4_file_write_iter line 289.ret -22.
[   32.508807] ext4_file_write_iter line 305.ret -22.
[   32.508808] __vfs_write line 503. ret = -22
[   32.508832] __vfs_write line 503. ret = 65
[   32.508840] __vfs_write line 503. ret = 47

可以看到,内核打印的ALIGN和用户态的指针完全相同,对齐的MASK为(1<<9) - 1 = 0x1ff = 511

也就是要求512字节对齐,而 malloc 返回的地址 0x5617a07d2670 并不满足这个条件。返回的位置如下图所示,这说明想用direct io的话,传递给 read / write系统调用的buffer必须按照512字节对齐。

更多的验证:

复杂的对齐逻辑可以看下面这个函数

align的判断逻辑是或逻辑,也就是说,buffer指针、文件offset以及读写大小,都必须按512字节对齐才行,任何一个不对齐,就会触发下面的逻辑。

判断代码:

判断ptr指针对齐的逻辑,文件偏移为0,写1M数据.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>
#include <limits.h>
#include <string.h>
#include <math.h>
 
#define __USE_GNU 1
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <errno.h>
 
#define DALLOC_SIZE (1 * 1024 * 1024)
/*
 * Write DALLOC_SIZE bytes from the buffer p to ./new.bin through a file
 * descriptor opened with O_DIRECT, then remove the file again.
 *
 * p: start of the buffer to write; at least DALLOC_SIZE bytes are read
 *    from it.
 *
 * Returns 0 when the write and the unlink both succeed, -1 when open(),
 * write() or unlink() fails. The descriptor is closed and the file
 * unlinked on every path once open() has succeeded, so a failed write
 * no longer leaks the descriptor or leaves the file behind.
 */
static int write_odirect_test(unsigned char *p)
{
	int fdno;
	int rc = 0;

	fdno = open("./new.bin", O_DIRECT|O_RDWR|O_CREAT, 0666);
	if(fdno < 0)
	{
		printf("%s line %d, open file failure.\n", __func__, __LINE__);
		return -1;
	}

	/*
	 * Best-effort preallocation of 100 MiB; mode 1 is FALLOC_FL_KEEP_SIZE.
	 * The result is deliberately not checked.
	 */
	fallocate(fdno, 1, 0, 100*1024*1024);

	if(write(fdno, p, DALLOC_SIZE) != DALLOC_SIZE)
	{
		/* errno is read right after the failing write(), before any other call. */
		printf("%s line %d, write failure, err %s.\n", __func__, __LINE__, strerror(errno));
		rc = -1;
	}

	/* Always release the descriptor, including after a failed write. */
	close(fdno);

	if(unlink("./new.bin") < 0)
	{
		printf("%s line %d unlink errpr!\n", __func__, __LINE__);
		rc = -1;
	}

	return rc;
}
 
/*
 * For every power-of-two alignment from 8 up to 262144, allocate
 * DALLOC_SIZE bytes with posix_memalign, print the address, and report
 * whether write_odirect_test() accepts a buffer with that alignment.
 *
 * Returns -1 if an allocation fails, 0 after all alignments were tried.
 */
int main(int argc, char **argv)
{
	void *p2 = NULL;
	int align;
	int i;

	printf("%s line %d, pagesize %d.\n", __func__, __LINE__, getpagesize());

	for(i = 0; i < 16; i ++)
	{
		/*
		 * 2^(i+3): 8, 16, ..., 262144. An integer shift is exact, whereas
		 * pow() returns a double that is truncated on conversion to int and
		 * additionally requires linking against libm.
		 */
		align = 1 << (i + 3);
		printf("=============================%d=================================\n", align);
		int ret = posix_memalign(&p2, align, DALLOC_SIZE);
		if(ret != 0)
		{
			printf("%s line %d, malloc failure.\n", __func__, __LINE__);
			return -1;
		}

		printf("%s line %d malloc success, p2=%p.\n", __func__, __LINE__, p2);
		if(write_odirect_test(p2) != 0)
		{
			printf("%s line %d, align %d, failure.\n", __func__, __LINE__, align);
		}
		else
		{
			printf("%s line %d, align %d, success.\n", __func__, __LINE__, align);
		}

		free(p2);


		printf("==================================================================\n");
	}

	sleep(1);
	return 0;
}
caozilong@AwExdroid65:~/WorkSpace/alloc_memory$ ./a.out 
main line 51, pagesize 4096.
=============================8=================================
main line 64 malloc success, p2=0x7f7403891010.
write_odirect_test line 31, write failure, err Invalid argument.
main line 67, align 8, failure.
==================================================================
=============================16=================================
main line 64 malloc success, p2=0x118f010.
write_odirect_test line 31, write failure, err Invalid argument.
main line 67, align 16, failure.
==================================================================
=============================32=================================
main line 64 malloc success, p2=0x118f040.
write_odirect_test line 31, write failure, err Invalid argument.
main line 67, align 32, failure.
==================================================================
=============================64=================================
main line 64 malloc success, p2=0x118f040.
write_odirect_test line 31, write failure, err Invalid argument.
main line 67, align 64, failure.
==================================================================
=============================128=================================
main line 64 malloc success, p2=0x118f080.
write_odirect_test line 31, write failure, err Invalid argument.
main line 67, align 128, failure.
==================================================================
=============================256=================================
main line 64 malloc success, p2=0x118f100.
write_odirect_test line 31, write failure, err Invalid argument.
main line 67, align 256, failure.
==================================================================
=============================512=================================
main line 64 malloc success, p2=0x118f200.
main line 71, align 512, success.
==================================================================
=============================1024=================================
main line 64 malloc success, p2=0x118f400.
main line 71, align 1024, success.
==================================================================
=============================2048=================================
main line 64 malloc success, p2=0x118f800.
main line 71, align 2048, success.
==================================================================
=============================4096=================================
main line 64 malloc success, p2=0x1190000.
main line 71, align 4096, success.
==================================================================
=============================8192=================================
main line 64 malloc success, p2=0x1190000.
main line 71, align 8192, success.
==================================================================
=============================16384=================================
main line 64 malloc success, p2=0x1190000.
main line 71, align 16384, success.
==================================================================
=============================32768=================================
main line 64 malloc success, p2=0x1190000.
main line 71, align 32768, success.
==================================================================
=============================65536=================================
main line 64 malloc success, p2=0x1190000.
main line 71, align 65536, success.
==================================================================
=============================131072=================================
main line 64 malloc success, p2=0x11a0000.
main line 71, align 131072, success.
==================================================================
=============================262144=================================
main line 64 malloc success, p2=0x7f7403880000.
main line 71, align 262144, success.
==================================================================
caozilong@AwExdroid65:~/WorkSpace/alloc_memory$ 

判断文件pos的逻辑,ptr 512对齐,写1M数据.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>
#include <limits.h>
#include <string.h>
#include <math.h>
 
#define __USE_GNU 1
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <errno.h>
 
#define DALLOC_SIZE (1 * 1024 * 1024)
/*
 * Probe which file offsets an O_DIRECT write accepts.
 *
 * Creates ./new.bin, then for every byte offset from 0 through 4096 seeks
 * there and attempts a single DALLOC_SIZE-byte write from p.  Only offsets at
 * which the whole write goes through are printed; rejected offsets are
 * skipped silently so the output lists just the accepted ones.
 *
 * p: source buffer; DALLOC_SIZE bytes are written from it on each attempt.
 *
 * Returns 0 once the probe has run and the scratch file has been removed,
 * -1 if the file cannot be opened or unlinked.
 */
static int write_odirect_test(unsigned char *p)
{
	const int fd = open("./new.bin", O_DIRECT|O_RDWR|O_CREAT, 0666);

	if (fd < 0) {
		printf("%s line %d, open file failure.\n", __func__, __LINE__);
		return -1;
	}

	/* Mode 1 is FALLOC_FL_KEEP_SIZE on Linux; the result is not inspected. */
	fallocate(fd, 1, 0, 100*1024*1024);

	int offset = 0;
	while (offset <= 4096) {
		lseek(fd, offset, SEEK_SET);

		/* Read the position back so the log shows where the write started. */
		const int where = lseek(fd, 0, SEEK_CUR);

		if (write(fd, p, DALLOC_SIZE) == DALLOC_SIZE) {
			printf("%s line %d, write success, off %d, cur %d.\n", __func__, __LINE__, offset, where);
		}

		offset++;
	}

	close(fd);

	if (unlink("./new.bin") >= 0) {
		return 0;
	}

	printf("%s line %d unlink errpr!\n", __func__, __LINE__);
	return -1;
}
 
/*
 * Test driver: obtain a DALLOC_SIZE-byte buffer aligned to 512 bytes with
 * posix_memalign(), hand it to write_odirect_test(), and print whether that
 * call returned zero (success) or non-zero (failure).
 *
 * argc and argv are not used.
 *
 * Returns 0 after the test has run, whatever its outcome, or -1 if the
 * aligned allocation fails.
 */
int main(int argc, char **argv)
{
	const int alignment = 512;
	void *buf = NULL;

	printf("%s line %d, pagesize %d.\n", __func__, __LINE__, getpagesize());
	printf("=============================%d=================================\n", alignment);

	if (posix_memalign(&buf, alignment, DALLOC_SIZE) != 0) {
		printf("%s line %d, malloc failure.\n", __func__, __LINE__);
		return -1;
	}
	printf("%s line %d malloc success, p2=%p.\n", __func__, __LINE__, buf);

	if (write_odirect_test(buf) == 0) {
		printf("%s line %d, align %d, success.\n", __func__, __LINE__, alignment);
	} else {
		printf("%s line %d, align %d, failure.\n", __func__, __LINE__, alignment);
	}

	free(buf);
	printf("==================================================================\n");

	/* One-second pause before exiting, kept from the original program. */
	sleep(1);
	return 0;
}
caozilong@AwExdroid65:~/WorkSpace/alloc_memory$ ./a.out 
main line 64, pagesize 4096.
=============================512=================================
main line 78 malloc success, p2=0x7f6d6ed52200.
write_odirect_test line 43, write success, off 0, cur 0.
write_odirect_test line 43, write success, off 512, cur 512.
write_odirect_test line 43, write success, off 1024, cur 1024.
write_odirect_test line 43, write success, off 1536, cur 1536.
write_odirect_test line 43, write success, off 2048, cur 2048.
write_odirect_test line 43, write success, off 2560, cur 2560.
write_odirect_test line 43, write success, off 3072, cur 3072.
write_odirect_test line 43, write success, off 3584, cur 3584.
write_odirect_test line 43, write success, off 4096, cur 4096.
main line 85, align 512, success.
==================================================================
caozilong@AwExdroid65:~/WorkSpace/alloc_memory$ 

可以看到,position也必须512字节对齐.

长度测试,ptr512对齐,pos512对齐,只有长度变化,可以看到,也要求512字节对齐.

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>
#include <limits.h>
#include <string.h>
#include <math.h>
 
#define __USE_GNU 1
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <errno.h>
 
#define DALLOC_SIZE (1 * 1024 * 1024)
/*
 * Probe which request lengths an O_DIRECT write accepts.
 *
 * Creates ./new.bin and, for every length from 0 through 4096 bytes, rewinds
 * to offset 0 and attempts one write of that many bytes from p.  Only lengths
 * for which write() reports the full count are printed; the others are
 * skipped silently so the output lists just the accepted lengths.
 *
 * p: source buffer; up to 4096 bytes are written from it.
 *
 * Returns 0 once the probe has run and the scratch file has been removed,
 * -1 if the file cannot be opened or unlinked.
 */
static int write_odirect_test(unsigned char *p)
{
	const int fd = open("./new.bin", O_DIRECT|O_RDWR|O_CREAT, 0666);

	if (fd < 0) {
		printf("%s line %d, open file failure.\n", __func__, __LINE__);
		return -1;
	}

	/* Mode 1 is FALLOC_FL_KEEP_SIZE on Linux; the result is not inspected. */
	fallocate(fd, 1, 0, 100*1024*1024);

	int nbytes = 0;
	while (nbytes <= 4096) {
		/* Always start at offset 0 so the length is the only thing varying. */
		lseek(fd, 0, SEEK_SET);

		const int where = lseek(fd, 0, SEEK_CUR);

		if (write(fd, p, nbytes) == nbytes) {
			printf("%s line %d, write success, len %d, cur %d.\n", __func__, __LINE__, nbytes, where);
		}

		nbytes++;
	}

	close(fd);

	if (unlink("./new.bin") >= 0) {
		return 0;
	}

	printf("%s line %d unlink errpr!\n", __func__, __LINE__);
	return -1;
}
 
/*
 * Program entry: allocate a 512-byte-aligned buffer of DALLOC_SIZE bytes,
 * run write_odirect_test() on it, and print the verdict.
 *
 * The command-line arguments are ignored.
 *
 * Returns -1 when posix_memalign() fails; otherwise 0, regardless of what
 * write_odirect_test() returned.
 */
int main(int argc, char **argv)
{
	const int boundary = 512;
	void *mem = NULL;
	int rc;

	printf("%s line %d, pagesize %d.\n", __func__, __LINE__, getpagesize());
	printf("=============================%d=================================\n", boundary);

	rc = posix_memalign(&mem, boundary, DALLOC_SIZE);
	if (rc) {
		printf("%s line %d, malloc failure.\n", __func__, __LINE__);
		return -1;
	}
	printf("%s line %d malloc success, p2=%p.\n", __func__, __LINE__, mem);

	if (write_odirect_test(mem) == 0) {
		printf("%s line %d, align %d, success.\n", __func__, __LINE__, boundary);
	} else {
		printf("%s line %d, align %d, failure.\n", __func__, __LINE__, boundary);
	}

	free(mem);
	printf("==================================================================\n");

	/* One-second pause before exiting, kept from the original program. */
	sleep(1);
	return 0;
}
czl@czl-VirtualBox:~/WorkSpace/changdu$ ./a.out 
main line 64, pagesize 4096.
=============================512=================================
main line 78 malloc success, p2=0x7f070f72b200.
write_odirect_test line 43, write success, len 0, cur 0.
write_odirect_test line 43, write success, len 512, cur 0.
write_odirect_test line 43, write success, len 1024, cur 0.
write_odirect_test line 43, write success, len 1536, cur 0.
write_odirect_test line 43, write success, len 2048, cur 0.
write_odirect_test line 43, write success, len 2560, cur 0.
write_odirect_test line 43, write success, len 3072, cur 0.
write_odirect_test line 43, write success, len 3584, cur 0.
write_odirect_test line 43, write success, len 4096, cur 0.
main line 85, align 512, success.
==================================================================
czl@czl-VirtualBox:~/WorkSpace/changdu$ 

总之,在 Ubuntu 上,O_DIRECT 模式要求写操作的三个参数,即 buffer 地址、文件 position 以及写入长度,都必须按 512 字节对齐(即都是 512 字节的整数倍)。

读应该也是如此,需要验证。下面我们仅验证 LEN 不按 512 字节对齐的情况。注意这里 fallocate 的 mode 要设置为 0,这样才会把文件的 SIZE 设置为 truncate size;否则,mode 为 1(FALLOC_FL_KEEP_SIZE)的话,SIZE 还是 0。这不会对写测试造成影响,但是会对读造成影响。

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>
#include <limits.h>
#include <string.h>
#include <math.h>
 
#define __USE_GNU 1
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <errno.h>
 
#define DALLOC_SIZE (1 * 1024 * 1024)
#define ALLOCA_SIZE (100*1024*1024)
/*
 * Probe which request lengths an O_DIRECT read accepts.
 *
 * Despite its name this variant reads.  It creates ./new.bin, calls
 * fallocate() with mode 0 for ALLOCA_SIZE bytes, and then, for every length
 * from 0 through 4096, rewinds to offset 0 and attempts one read of that many
 * bytes into p.  Only lengths for which read() returns the full count are
 * printed; all others are skipped silently.
 *
 * p: destination buffer; up to 4096 bytes are read into it.
 *
 * Returns 0 once the probe has run and the scratch file has been removed,
 * -1 if the file cannot be opened or unlinked.
 */
static int write_odirect_test(unsigned char *p)
{
	const int fd = open("./new.bin", O_DIRECT|O_RDWR|O_CREAT, 0666);

	if (fd < 0) {
		printf("%s line %d, open file failure.\n", __func__, __LINE__);
		return -1;
	}

	/* Mode 0 here, unlike the write probes; the result is not inspected. */
	fallocate(fd, 0, 0, ALLOCA_SIZE);

	int nbytes = 0;
	while (nbytes <= 4096) {
		/* Always start at offset 0 so the length is the only thing varying. */
		lseek(fd, 0, SEEK_SET);
		const int where = lseek(fd, 0, SEEK_CUR);

		if (read(fd, p, nbytes) == nbytes) {
			printf("%s line %d, read success, len %d. cur %d ptr %p.\n", __func__, __LINE__, nbytes, where, p);
		}

		nbytes++;
	}

	close(fd);

	if (unlink("./new.bin") >= 0) {
		return 0;
	}

	printf("%s line %d unlink errpr!\n", __func__, __LINE__);
	return -1;
}
 
/*
 * Driver for the O_DIRECT probe: get DALLOC_SIZE bytes aligned on a 512-byte
 * boundary from posix_memalign(), call write_odirect_test() with that buffer,
 * and log whether the call returned 0.
 *
 * Neither argc nor argv is consulted.
 *
 * Returns 0 when the probe was executed (even if it reported failure) and -1
 * when the buffer could not be allocated.
 */
int main(int argc, char **argv)
{
	const int alignment = 512;
	void *buffer = NULL;

	printf("%s line %d, pagesize %d.\n", __func__, __LINE__, getpagesize());
	printf("=============================%d=================================\n", alignment);

	const int alloc_rc = posix_memalign(&buffer, alignment, DALLOC_SIZE);
	if (alloc_rc != 0) {
		printf("%s line %d, malloc failure.\n", __func__, __LINE__);
		return -1;
	}
	printf("%s line %d malloc success, p2=%p.\n", __func__, __LINE__, buffer);

	const int test_rc = write_odirect_test(buffer);
	if (test_rc == 0) {
		printf("%s line %d, align %d, success.\n", __func__, __LINE__, alignment);
	} else {
		printf("%s line %d, align %d, failure.\n", __func__, __LINE__, alignment);
	}

	free(buffer);
	printf("==================================================================\n");

	/* One-second pause before exiting, kept from the original program. */
	sleep(1);
	return 0;
}
czl@czl-VirtualBox:~/WorkSpace/changdu$ ./a.out 
main line 67, pagesize 4096.
=============================512=================================
main line 81 malloc success, p2=0x7f6fc17ac200.
write_odirect_test line 46, read success, len 0. cur 0 ptr 0x7f6fc17ac200.
write_odirect_test line 46, read success, len 512. cur 0 ptr 0x7f6fc17ac200.
write_odirect_test line 46, read success, len 1024. cur 0 ptr 0x7f6fc17ac200.
write_odirect_test line 46, read success, len 1536. cur 0 ptr 0x7f6fc17ac200.
write_odirect_test line 46, read success, len 2048. cur 0 ptr 0x7f6fc17ac200.
write_odirect_test line 46, read success, len 2560. cur 0 ptr 0x7f6fc17ac200.
write_odirect_test line 46, read success, len 3072. cur 0 ptr 0x7f6fc17ac200.
write_odirect_test line 46, read success, len 3584. cur 0 ptr 0x7f6fc17ac200.
write_odirect_test line 46, read success, len 4096. cur 0 ptr 0x7f6fc17ac200.
main line 88, align 512, success.
==================================================================
czl@czl-VirtualBox:~/WorkSpace/changdu$ 

可以看到,LEN仍然是要512对齐的!

在Tina下验证:

由于tina下的ftruncate语义貌似不同,所以手工创建一个100M的文件供测试用例读写;否则,一个默认为0 size的文件会触发内核下面的return逻辑。窃以为造成这种差异的主要原因是文件系统的不同:在PC上,测试用例访问的是ext4文件系统,进入的是ext4_direct_IO逻辑;而在Tina上,TF卡是FAT文件系统,访问的是fat_direct_IO,内部会有如下的逻辑判断。而且通过MSDOS_I宏可以明显看出,这个 mmu_private 变量是FAT文件系统私有的,其它的文件系统没有。

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>
#include <limits.h>
#include <string.h>
#include <math.h>
 
#define __USE_GNU 1
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <errno.h>
 
#define DALLOC_SIZE (1 * 1024 * 1024)
/*
 * Probe which file offsets an O_DIRECT write accepts, against a file that
 * already exists.
 *
 * ./new.bin is opened without O_CREAT, so it must be present beforehand; no
 * fallocate() is issued and the file is not unlinked afterwards.  For every
 * byte offset from 0 through 4096 the function seeks there and attempts one
 * DALLOC_SIZE-byte write from p, printing only the offsets at which the whole
 * write goes through.
 *
 * p: source buffer; DALLOC_SIZE bytes are written from it on each attempt.
 *
 * Returns 0 once the probe has run, -1 if the file cannot be opened.
 */
static int write_odirect_test(unsigned char *p)
{
	const int fd = open("./new.bin", O_DIRECT|O_RDWR, 0666);

	if (fd < 0) {
		printf("%s line %d, open file failure.\n", __func__, __LINE__);
		return -1;
	}

	int offset = 0;
	while (offset <= 4096) {
		lseek(fd, offset, SEEK_SET);

		/* Read the position back so the log shows where the write started. */
		const int where = lseek(fd, 0, SEEK_CUR);

		if (write(fd, p, DALLOC_SIZE) == DALLOC_SIZE) {
			printf("%s line %d, write success, off %d, cur %d.\n", __func__, __LINE__, offset, where);
		}

		offset++;
	}

	close(fd);

	/* The file is intentionally left on disk: this variant never unlinks it. */
	return 0;
}
 
/*
 * Entry point of the probe program.  A buffer of DALLOC_SIZE bytes aligned to
 * 512 is requested from posix_memalign() and passed to write_odirect_test();
 * the outcome of that call is printed as "success" or "failure".
 *
 * argc and argv are unused.
 *
 * Returns -1 if the allocation fails, 0 in every other case.
 */
int main(int argc, char **argv)
{
	const int align_bytes = 512;
	void *block = NULL;

	printf("%s line %d, pagesize %d.\n", __func__, __LINE__, getpagesize());
	printf("=============================%d=================================\n", align_bytes);

	if (posix_memalign(&block, align_bytes, DALLOC_SIZE)) {
		printf("%s line %d, malloc failure.\n", __func__, __LINE__);
		return -1;
	}
	printf("%s line %d malloc success, p2=%p.\n", __func__, __LINE__, block);

	if (!write_odirect_test(block)) {
		printf("%s line %d, align %d, success.\n", __func__, __LINE__, align_bytes);
	} else {
		printf("%s line %d, align %d, failure.\n", __func__, __LINE__, align_bytes);
	}

	free(block);
	printf("==================================================================\n");

	/* One-second pause before exiting, kept from the original program. */
	sleep(1);
	return 0;
}


最后的问题:

文件大小如果不是512字节的整数倍,当读最后一笔数据时,len还一定要512对齐吗?

先说答案:要,一定要。前面的逻辑也说明了,判断align的条件只看ptr、length、offset三者按位或的结果,任何一个参数不对齐,都会触发返回,用户态收到的错误是 Invalid argument(EINVAL)。

那怎么解决这个问题的呢?答案是:虽然三个参数务必要对齐,但是,返回的不是对齐后的大小,而是最后一笔数据的实际值,这样就没问题了。

验证用例:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <malloc.h>
#include <limits.h>
#include <string.h>
#include <math.h>
 
#define __USE_GNU 1
#include <fcntl.h>
#include <stdint.h>
#include <sys/mman.h>
#include <errno.h>
 
#define DALLOC_SIZE (1 * 1024 * 1024)
#define ALLOCA_SIZE (100*1024*1024 + 100)
/*
 * Read ./new.bin front to back through O_DIRECT in 512-byte requests, to show
 * what read() returns for the last chunk of a file whose size is not a
 * multiple of 512.
 *
 * Despite its name this variant reads.  The file is created, sized to
 * ALLOCA_SIZE with fallocate() mode 0, read sequentially, then closed and
 * unlinked.  Every chunk is logged; a final chunk shorter than 512 bytes is a
 * normal end-of-file condition and is logged as a success with its real
 * length in retlen.
 *
 * p: destination buffer; 512 bytes are requested into it on each read.
 *
 * Returns 0 when the file was read to its end and removed, -1 if the file
 * cannot be opened, a read fails, or the file cannot be unlinked.  The
 * descriptor is closed and the file unlinked on every path after a
 * successful open.
 */
static int write_odirect_test(unsigned char *p)
{
	int fdno;
	int status = 0;
	int retlen = 0;
	int cur = 0;

	fdno = open("./new.bin", O_DIRECT|O_RDWR|O_CREAT, 0666);
	if(fdno < 0)
	{
		printf("%s line %d, open file failure.\n", __func__, __LINE__);
		return -1;
	}

	/*
	 * Mode 0 so that the file size itself becomes ALLOCA_SIZE.  A failure is
	 * reported but not fatal: the read loop then simply hits end-of-file
	 * earlier.
	 */
	if(fallocate(fdno, 0, 0, ALLOCA_SIZE) != 0)
	{
		printf("%s line %d, fallocate failure, err %s.\n", __func__, __LINE__, strerror(errno));
	}

	lseek(fdno, 0, SEEK_SET);
#if 1
	while(1)
	{
		cur = lseek(fdno, 0, SEEK_CUR);
		retlen = read(fdno, p, 512);
		if(retlen < 0)
		{
			/* Genuine error: errno is meaningful only in this branch. */
			printf("%s line %d, read failure, err %s, cur 0x%x retlen %d.\n", __func__, __LINE__, strerror(errno), cur, retlen);
			status = -1;
			break;
		}

		if(retlen == 0)
		{
			/* End of file exactly on a 512-byte boundary. */
			break;
		}

		printf("%s line %d, read success, len %d. cur 0x%x ptr %p, retlen %d.\n", __func__, __LINE__, 512, cur, (void *)p, retlen);

		if(retlen != 512)
		{
			/*
			 * Short read: this was the unaligned tail of the file.  Stop
			 * here, because the file position is no longer 512-aligned and a
			 * further O_DIRECT read from it may be rejected.
			 */
			break;
		}
	}
#else
	/* Alternative experiment: read only the chunk at the 100 MiB mark. */
	lseek(fdno, 0x6400000, SEEK_SET);
	cur =  lseek(fdno, 0, SEEK_CUR);
	retlen = read(fdno, p, 512);

	printf("%s line %d, cur 0x%x, retlen %d.tips:%s.\n", __func__, __LINE__, cur, retlen, strerror(errno));
#endif

	close(fdno);

	if(unlink("./new.bin") < 0)
	{
		printf("%s line %d unlink errpr!\n", __func__, __LINE__);
		return -1;
	}

	return status;
}
 
/*
 * Program entry: allocate DALLOC_SIZE bytes on a 512-byte boundary via
 * posix_memalign(), run write_odirect_test() with that buffer, and print
 * "success" when it returns 0 or "failure" otherwise.
 *
 * The command-line arguments are not used.
 *
 * Returns 0 after the test has been run, or -1 if the allocation fails.
 */
int main(int argc, char **argv)
{
	const int alignment = 512;
	void *aligned = NULL;
	int rc;

	printf("%s line %d, pagesize %d.\n", __func__, __LINE__, getpagesize());
	printf("=============================%d=================================\n", alignment);

	rc = posix_memalign(&aligned, alignment, DALLOC_SIZE);
	if (rc != 0) {
		printf("%s line %d, malloc failure.\n", __func__, __LINE__);
		return -1;
	}
	printf("%s line %d malloc success, p2=%p.\n", __func__, __LINE__, aligned);

	rc = write_odirect_test(aligned);
	if (rc == 0) {
		printf("%s line %d, align %d, success.\n", __func__, __LINE__, alignment);
	} else {
		printf("%s line %d, align %d, failure.\n", __func__, __LINE__, alignment);
	}

	free(aligned);
	printf("==================================================================\n");

	/* One-second pause before exiting, kept from the original program. */
	sleep(1);
	return 0;
}

运行结果,文件大小是100M+100字节,可以看到最后一笔实际上是读成功了,读出来的长度是100,正好是最后一笔非512对齐的数据大小。但是传递的参数都是512对齐的。

造成fat和ext4表现差异的原因,关键在于不同的文件系统direct_IO函数指针的处理不同,可以按照下图的逻辑去分析。


结束! 

  • 7
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

papaofdoudou

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值