linux中的DMA提速1：各种内存分配验证

千册

已于 2022-11-01 21:39:16 修改

阅读量2.3k

点赞数 5

分类专栏： linux内核文章标签： linux

于 2022-11-01 17:33:57 首次发布

本文链接：https://blog.csdn.net/yueni_zhao/article/details/127623895

版权

linux内核专栏收录该内容

88 篇文章 28 订阅

订阅专栏

前言

需要优化驱动中的DMA速度，在此记录。使用DMA传输一帧1280*800的数据，大约是1M字节（1280×800 = 1024000字节）。

使用外设的DMA功能，就需要给外设相应的DMA缓存寄存器设置一个地址，这个地址在内核中叫总线地址，而我们使用kmalloc分配的地址是虚拟地址，它是通过物理地址经过MMU转换得来的。内核中的三种地址，虚拟地址、物理地址、总线地址。

1）mmap时使用的是物理地址。

2）总线地址（例如：dma_addr_t），是通过虚拟地址得到的。在内核中，用的最多的就是虚拟地址。

3）设置外设寄存器时，使用的是总线地址，例如本次学习的dma_addr_t类型的地址。

看下面这段代码，virt_to_phys(p)就是把虚拟地址转换为物理地址。前面提到过，mmap的时候使用的是物理地址。

virt_to_phys(p) >> PAGE_SHIFT

当前测试环境PAGE_SIZE为4KB，所以PAGE_SHIFT的值是12，这句代码是为了获得该物理地址的页帧号，由此可知，这个地址需要是页（PAGE）对齐的。

/* Number of low address bits that form the in-page offset: 1 << 12 = 4096-byte pages. */
#define PAGE_SHIFT	12

/* Page size in bytes; the non-assembly variant is an unsigned long constant. */
#ifdef __ASSEMBLY__
#define PAGE_SIZE	(1 << PAGE_SHIFT)
#else
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#endif

/* Mask with the in-page offset bits cleared; ANDing an address with it rounds down to a page boundary. */
#define PAGE_MASK	(~(PAGE_SIZE-1))

/*
 * mmap handler sketch: maps the driver's buffer into the calling process.
 *
 * @file: file the mmap() call was issued on (unused in this sketch).
 * @vma:  the user virtual memory area to populate.
 *
 * Returns 0 on success, -EAGAIN if remap_pfn_range() fails.
 *
 * `xxx` and `...` are placeholders from the article, not real code.
 */
static int csi_dev_mmap(struct file *file, struct vm_area_struct *vma) {
	void *p = xxx;	/* placeholder: kernel virtual address of the buffer to expose */
	...
	/*
	 * remap_pfn_range() takes a page frame number, so the physical address
	 * of the buffer is shifted right by PAGE_SHIFT; the buffer therefore
	 * has to start on a page boundary.
	 */
	if (remap_pfn_range(vma, vma->vm_start, virt_to_phys(p) >> PAGE_SHIFT,
			    vma->vm_end - vma->vm_start, vma->vm_page_prot)) {
		printk( "remap_pfn_range error\n");
		return -EAGAIN;
	}
	...
	return 0;
}

相关的宏__phys_to_pfn就是获得物理地址的页帧号，__pfn_to_phys是计算出页帧号对应的物理地址。pfn是page frame number的缩写。

/*
 * Convert a physical address to a Page Frame Number and back
 *
 * __phys_to_pfn() drops the in-page offset by shifting right by PAGE_SHIFT;
 * __pfn_to_phys() shifts a frame number left by PAGE_SHIFT, giving the
 * physical address of the first byte of that page.
 */
#define	__phys_to_pfn(paddr)	((unsigned long)((paddr) >> PAGE_SHIFT))
#define	__pfn_to_phys(pfn)	((phys_addr_t)(pfn) << PAGE_SHIFT)

提到虚拟地址，肯定是有MMU的，MMU是啥，MMU就是月老，负责物理地址与虚拟地址配对，一夫一妻吗？月老出生的年代是一夫一妻吗？没研究过。反正MMU不是一夫一妻的，一个物理地址，可以有多个虚拟地址，当然可以有对应的总线地址，一个虚拟地址同一时间只能映射到一个物理地址。

本次学习的DMA提速探索，就需要全部使用这三个身份，物理地址<=>虚拟地址<=>总线地址（dma_addr_t）。

还有两个函数用于虚拟地址和总线地址的转换，暂时用不到，先记录下来。

/* Kernel virtual address -> bus address. */
unsigned long virt_to_bus(volatile void *address);
/* Bus address -> kernel virtual address. */
void *bus_to_virt(unsigned long address);

一、验证的方案

所谓初级方案，并不是说这种方案不好，而是说这种办法适合于大部分场景，内核中绝大多数代码也是这么写的。这种方案容易理解，在内核中也很容易找到可以借鉴的例子。

方案一:dma_alloc_coherent函数

这种方案操作简单，dma_alloc_coherent返回虚拟地址，并且给dma_addr赋值为对应的总线地址。这个地址就可以设置到外设寄存器中，外设就可以自动向该地址写入数据，写入完毕后就会触发中断。

char *buf;
dma_addr_t dma_addr;
buf_size = 1280*800;
/*
 * Allocate a coherent DMA buffer. The return value is the kernel virtual
 * address; dma_addr receives the bus address to program into the device.
 */
buf = dma_alloc_coherent(dev,
                    PAGE_ALIGN(buf_size),
                    &dma_addr,
                    GFP_DMA | GFP_KERNEL);
/*
 * dma_alloc_coherent() signals failure with NULL, not an ERR_PTR, so
 * IS_ERR() would never catch it.
 */
if (!buf) {
	DEBUG_CM("dma_alloc_coherent error");
	return -ENOMEM;
}

方案二：dma_map_single+kmalloc

kmalloc负责分配空间，然后通过dma_map_single得到对应的总线地址。经过测试，相较于前一种方案，速度上并没有提升，几乎是一样的。

char *buf;
dma_addr_t dma_addr;
buf_size = 1280*800;
buf = kmalloc(buf_size, GFP_KERNEL | GFP_DMA);
/* kmalloc() signals failure with NULL, not an ERR_PTR. */
if (!buf) {
	DEBUG_CM("kmalloc error");
	return -ENOMEM;
}
/*
 * Map exactly the number of bytes that were allocated, so the mapping can
 * never extend past the end of the kmalloc'ed buffer.
 */
new_node->dma_addr = dma_map_single(cm_dev->dev,
				    buf,
				    buf_size,
				    DMA_FROM_DEVICE);
/* A streaming mapping can fail; the handle must be checked before use. */
if (dma_mapping_error(cm_dev->dev, new_node->dma_addr)) {
	DEBUG_CM("dma_map_single error");
	kfree(buf);
	return -ENOMEM;
}

方案三：__get_free_pages+dma_map_single

参数8表示2的8次方，也就是256个页；页大小是4096字节，总共就是1048576字节（1MB）。速度没有明显变化。我都开始怀疑，我这样测试是不是不对啊。

/*
 * Order 8 = 2^8 = 256 pages = 1 MiB with 4 KiB pages.
 * NOTE(review): the error path below releases cm_dev->slock, so this code
 * appears to run with the spinlock held and IRQs off; a GFP_KERNEL
 * allocation may sleep there. Consider allocating before taking the lock.
 */
new_node->buf = (void *)__get_free_pages(GFP_KERNEL|GFP_DMA,8);
/* __get_free_pages() returns 0 on failure, which IS_ERR() does not detect. */
if (!new_node->buf) {
	kfree(new_node);
	DEBUG_CM("__get_free_pages error");
	spin_unlock_irqrestore(&cm_dev->slock,flags);
	return -ENOMEM;
}
new_node->buf_size = CSI_DEV_IMAGE_SIZE;
new_node->dma_addr = dma_map_single(cm_dev->dev, new_node->buf, new_node->buf_size, DMA_FROM_DEVICE);
/* A streaming mapping can fail; undo the allocation if it does. */
if (dma_mapping_error(cm_dev->dev, new_node->dma_addr)) {
	DEBUG_CM("dma_map_single error");
	free_pages((unsigned long)new_node->buf, 8);
	kfree(new_node);
	spin_unlock_irqrestore(&cm_dev->slock,flags);
	return -ENOMEM;
}

也可以写成下面这样,效果是一样的

        int order;
        new_node->buf_size = CSI_DEV_IMAGE_SIZE;
        order = get_order(CSI_DEV_IMAGE_SIZE);
		new_node->buf = (void *)__get_dma_pages(GFP_KERNEL, order);		
		if(IS_ERR(new_node->buf)){
			kfree(new_node);
			DEBUG_CM("kmalloc error");
			spin_unlock_irqrestore(&cm_dev->slock,flags);
			return -ENOMEM;
		}
		new_node->dma_addr = dma_map_single(cm_dev->dev, new_node->buf, new_node->buf_size, DMA_FROM_DEVICE);

二、后来验证的方案

方案一：pgprot_noncached

如下所示，加上这个以后，在用户空间memcpy1280*800的数据，时间增加了20毫秒。

/* mmap handler sketch; the `...` lines are placeholders for code the article omits. */
static int csi_dev_mmap(struct file *file, struct vm_area_struct *vma) {
...
/* Switch the VMA's page protection to the non-cached memory type. */
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
...
}

pgprot_noncached宏有何神奇之处呢？定义如下所示，L_PTE_MT_MASK, L_PTE_MT_UNCACHED这两个宏是和二级页表，三级页表相关的。

/* Wrapper struct around the raw page-protection bits. */
typedef struct {
	unsigned long pgprot;
} pgprot_t;

/* Extract the raw bits from a pgprot_t. */
#define pgprot_val(x)	((x).pgprot)
/* Build a pgprot_t from raw bits. */
#define __pgprot(x)	((pgprot_t) { (x) } )

/* Clear the bits selected by `mask` in `prot`, then OR in `bits`. */
#define __pgprot_modify(prot,mask,bits)		\
	__pgprot((pgprot_val(prot) & ~(mask)) | (bits))

/* Replace the bits under L_PTE_MT_MASK with L_PTE_MT_UNCACHED. */
#define pgprot_noncached(prot) \
	__pgprot_modify(prot, L_PTE_MT_MASK, L_PTE_MT_UNCACHED)

pgprot_noncached（）实际禁止了相关页的Cache和写缓冲（Write Buffer），pgprot_writecombine（）则没有禁止写缓冲。ARM的写缓冲器是一个非常小的FIFO存储器，位于处理器核与主存之间，其目的在于将处理器核和Cache从较慢的主存写操作中解脱出来。写缓冲区与Cache在存储层次上处于同一层次，但是它只作用于写主存。

方案二：dma_mmap_coherent

方案三：dma_alloc_writecombine，执行失败

		DEBUG_CSI_INIT("");
		new_node->buf_size = CSI_DEV_IMAGE_SIZE;
		new_node->buf = dma_alloc_writecombine(cm_dev->dev,
					PAGE_ALIGN(new_node->buf_size),
					&new_node->dma_addr,
					GFP_DMA | GFP_KERNEL);
		if(new_node->buf == NULL || IS_ERR(new_node->buf)){
			//kfree(new_node);
			DEBUG_CSI_INIT("new_node->buf = %p",new_node->buf);
			DEBUG_CSI_INIT("dma_alloc_writecombine error");
			//mutex_unlock(&cm_dev->lock);
			return -ENOMEM;
		}

经过测试，如果要使用这个函数，就需要实现struct dma_map_ops*指针，如下所示，如果没有自定义，就使用默认的arm_dma_ops。

/*
 * Select the DMA operations table for a device: the device's own
 * archdata.dma_ops when dev is non-NULL and has one set, otherwise the
 * default arm_dma_ops.
 */
static inline struct dma_map_ops *__generic_dma_ops(struct device *dev)
{
	if (dev && dev->archdata.dma_ops)
		return dev->archdata.dma_ops;
	return &arm_dma_ops;
}

然后代码改成

        DEBUG_CSI_INIT("");
		new_node->buf_size = CSI_DEV_IMAGE_SIZE;
		cm_dev->dev->archdata.dma_ops = NULL;
		new_node->buf = dma_alloc_writecombine(cm_dev->dev,
					PAGE_ALIGN(new_node->buf_size),
					&new_node->dma_addr,
					GFP_DMA | GFP_KERNEL);
		if(new_node->buf == NULL || IS_ERR(new_node->buf)){
			DEBUG_CSI_INIT("new_node->buf = %p",new_node->buf);
			DEBUG_CSI_INIT("dma_alloc_writecombine error");
			return -ENOMEM;
		}
		DEBUG_CSI_INIT("");

第一次执行失败，原因是在dma_alloc_writecombine调用的两侧加了自旋锁；去掉自旋锁后，就执行成功了。得出结论：dma_alloc_writecombine不能位于自旋锁保护的区间内（使用GFP_KERNEL分配内存时可能睡眠，而持有自旋锁时不允许睡眠）。

这个运行速度，几乎和以前一样。

方案四：dmam_alloc_noncoherent，执行失败

或者dma_alloc_noncoherent，都执行失败。

        DEBUG_CSI_INIT("");
		new_node->buf_size = CSI_DEV_IMAGE_SIZE;
		new_node->buf = dmam_alloc_noncoherent(cm_dev->dev,
					PAGE_ALIGN(new_node->buf_size),
					&new_node->dma_addr,
					GFP_DMA | GFP_KERNEL);
		if(new_node->buf == NULL || IS_ERR(new_node->buf)){
			//kfree(new_node);
			DEBUG_CSI_INIT("new_node->buf = %p",new_node->buf);
			DEBUG_CSI_INIT("dmam_alloc_noncoherent error");
			//mutex_unlock(&cm_dev->lock);
			return -ENOMEM;
		}


[   62.372363] init:/big/csi_driver/csi_ov/csi_dev.c:csi_dev_node_array_init:1769: new_node->buf =   (null)
[   62.381852] init:/big/csi_driver/csi_ov/csi_dev.c:csi_dev_node_array_init:1770: dmam_alloc_noncoherent error

三、后来的后来加的

方案1：cma

Contiguous Memory Allocator, CMA，连续内存分配器，用于分配连续的大块内存。CMA分配器，会Reserve一片物理内存区域：设备驱动不用时，内存管理系统将该区域用于分配和管理可移动类型页面；设备驱动使用时，用于连续内存分配，此时已经分配的页面需要进行迁移；此外，CMA分配器还可以与DMA子系统集成在一起，使用DMA的设备驱动程序无需使用单独的CMA API。

cma的设备树：由内容可知，size为0x8000000，所以cma的大小是 $8 \times 2^{24} = 8 \times 2^{4} \times 1\text{M} = 128\text{M}$ 字节。

/* Reserved-memory node describing the CMA pool used in this test setup. */
reserved-memory {
	#address-cells = <1>;
	#size-cells = <1>;
	ranges;

	linux,cma {
		compatible = "shared-dma-pool";
		reusable;
		/* 0x8000000 bytes = 128 MiB */
		size = <0x8000000>;
		linux,cma-default;
	};
};

相关的头文件

#include <linux/dma-contiguous.h>

dma_alloc_from_contiguous函数原型

/* Upper bound (as a page order) applied to the requested alignment. */
#define CONFIG_CMA_ALIGNMENT 8
/**
 * dma_alloc_from_contiguous() - allocate pages from contiguous area
 * @dev:   Pointer to device for which the allocation is performed.
 * @count: Requested number of pages.
 * @align: Requested alignment of pages (in PAGE_SIZE order).
 *
 * This function allocates memory buffer for specified device. It uses
 * device specific contiguous memory area if available or the default
 * global one. Requires architecture specific dev_get_cma_area() helper
 * function.
 *
 * An @align larger than CONFIG_CMA_ALIGNMENT is clamped to that value.
 *
 * Return: the result of cma_alloc() for the device's CMA area.
 */
struct page *dma_alloc_from_contiguous(struct device *dev, int count,
				       unsigned int align)
{
	if (align > CONFIG_CMA_ALIGNMENT)
		align = CONFIG_CMA_ALIGNMENT;

	return cma_alloc(dev_get_cma_area(dev), count, align);
}

使用这个函数时，编译报错，这不是WARNING吗？函数都找不到，就是错误。

WARNING: "dma_alloc_from_contiguous" [/big/csi_driver/csi_ov/csi_dev.ko] undefined!

dma_alloc_from_contiguous函数默认情况下并没有导出符号，这是不是意味着内核不想让我们使用它呢？修改文件drivers\base\dma-contiguous.c，添加导出符号。

EXPORT_SYMBOL(dma_alloc_from_contiguous);

测试代码，测试结果能工作，相比使用__get_dma_pages分配的内存缓冲区，速度没有提升。

DEBUG_CSI_INIT("CONFIG_CMA_ALIGNMENT =%d",CONFIG_CMA_ALIGNMENT);
new_node->buf_size = CSI_DEV_IMAGE_SIZE;
order = get_order(new_node->buf_size);
/*
 * Round the size up before converting to pages, so a size that is not a
 * multiple of PAGE_SIZE still gets its last, partially used page.
 */
new_node->page = dma_alloc_from_contiguous(cm_dev->dev,
	PAGE_ALIGN(new_node->buf_size) >> PAGE_SHIFT,
	order);

if (!new_node->page) {
	/* Print the page pointer that was actually tested, not new_node->buf. */
	DEBUG_CSI_INIT("new_node->page = %p",new_node->page);
	DEBUG_CSI_INIT("dma_alloc_from_contiguous error");
	return -ENOMEM;
}
/*
 * dma_map_single() expects a kernel virtual address. page_to_phys() yields
 * a physical address, so casting it to a pointer maps the wrong memory.
 * NOTE(review): page_address() only has a mapping for lowmem pages -
 * confirm the CMA area is not in highmem, or use dma_map_page() instead.
 */
new_node->buf = page_address(new_node->page);
new_node->dma_addr = dma_map_single(cm_dev->dev, new_node->buf, new_node->buf_size, DMA_FROM_DEVICE);
/* A streaming mapping can fail; give the pages back to CMA if it does. */
if (dma_mapping_error(cm_dev->dev, new_node->dma_addr)) {
	DEBUG_CSI_INIT("dma_map_single error");
	/* Like dma_alloc_from_contiguous, this symbol may need exporting for module use. */
	dma_release_from_contiguous(cm_dev->dev, new_node->page,
		PAGE_ALIGN(new_node->buf_size) >> PAGE_SHIFT);
	return -ENOMEM;
}