rvv向量化 epi-builtins-ref C语言intrinsics接口学习

最新推荐文章于 2024-09-12 09:39:26 发布

清枫♚

最新推荐文章于 2024-09-12 09:39:26 发布

阅读量884

点赞数 21

文章标签： c语言学习开发语言

本文链接：https://blog.csdn.net/weixin_47100480/article/details/137624498

版权

本文详细介绍了RISC-V向量指令集中的向量类型、掩码类型以及多个基本运算指令如vfadd、vfdiv等的原型和操作过程，为初学者提供了一个学习RISC-V向量化编程的起点。

摘要由CSDN通过智能技术生成

纠结了很久的riscv vector指令集的向量化编程，不知从何入手，那就一点一点学习吧

Vector Types
向量类型命名规则：

__epi_<factor>x<ty>

factor是向量相对于VLEN/ELEN的元素个数。
__epi_2xf32和__epi_2xf64具有相同数量的元素，但不同的元素类型。
ty是元素类型。
__epi_2xf32和__epi_4xf32具有不同数量的元素，但元素类型相同。

向量类型接口定义
Mask Types
掩码类型与LMUL无关，因为它们总是使用单个向量寄存器，但是类型名中的factor值仍然是有用的，掩码的元素类型为i1。

例如，两个__epi_2xf32之间的关系操作将计算__epi_2xi1类型的掩码。

还有一种混合类型，有需要的读者可自行查看官方文档

指令详解
Instruction:
vsetvli
Prototypes:

unsigned long int __builtin_epi_vsetvl(unsigned long int rvl,
							/* constant */ unsigned long int sew,
							/* constant */ unsigned long int lmul);

Operation:

gvl = compute_vector_length(rvl, sew, lmul)
result = gvl

Instruction:
vsetvli
Prototypes:

	unsigned long int
	__builtin_epi_vsetvlmax(/* constant */ unsigned long int sew,	
							/* constant */ unsigned long int lmul);

Operation:

gvl = compute_vlmax(sew, lmul)
result = gvl

Instruction:
vfadd.vv
Prototypes:

	__epi_2xf32 __builtin_epi_vfadd_2xf32(__epi_2xf32 a, __epi_2xf32 b,
											unsigned long int gvl);
											
	__epi_1xf64 __builtin_epi_vfadd_1xf64(__epi_1xf64 a, __epi_1xf64 b,
											unsigned long int gvl);
											
	__epi_4xf32 __builtin_epi_vfadd_4xf32(__epi_4xf32 a, __epi_4xf32 b,		
											unsigned long int gvl);
											
	__epi_2xf64 __builtin_epi_vfadd_2xf64(__epi_2xf64 a, __epi_2xf64 b,		
											unsigned long int gvl);
											
	__epi_8xf32 __builtin_epi_vfadd_8xf32(__epi_8xf32 a, __epi_8xf32 b,	
											unsigned long int gvl);
											
	__epi_4xf64 __builtin_epi_vfadd_4xf64(__epi_4xf64 a, __epi_4xf64 b,	
											unsigned long int gvl);
											
	__epi_16xf32 __builtin_epi_vfadd_16xf32(__epi_16xf32 a, __epi_16xf32 b,
											unsigned long int gvl);
											
	__epi_8xf64 __builtin_epi_vfadd_8xf64(__epi_8xf64 a, __epi_8xf64 b,
											unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = a[element] + b[element]
result[gvl : VLMAX] = 0

部分指令还有掩码操作类型，主要是为了处理分支，在此不作过多介绍，可自行查看文档，后面对于Prototypes也只选取其中一种类型，只要搞懂了向量类型的定义，就很容易能够理解

Instruction:
vfdiv.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfdiv_2xf32(__epi_2xf32 a, __epi_2xf32 b,
										unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = a[element] / b[element]
result[gvl : VLMAX] = 0

Instruction:
vfmacc.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfmacc_2xf32(__epi_2xf32 c, __epi_2xf32 a,
											__epi_2xf32 b, unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = a[element] * b[element] + c[element]
result[gvl : VLMAX] = 0

Instructions:
vfmadd.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfmadd_2xf32(__epi_2xf32 a, __epi_2xf32 b,
									__epi_2xf32 c, unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = a[element] * b[element] + c[element]
result[gvl : VLMAX] = 0

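这里的vfmadd.vv和vfmacc.vv指令操作貌似是一样的，可能这个文档有问题，查看riscv vector spec手册，发现这两条指令并不相同，如下（原文此处为图片，以下内容摘自RVV规范）：
vfmacc.vv vd, vs1, vs2, vm    # vd[i] = +(vs1[i] * vs2[i]) + vd[i]，结果覆盖加数
vfmadd.vv vd, vs1, vs2, vm    # vd[i] = +(vs1[i] * vd[i]) + vs2[i]，结果覆盖被乘数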
Instruction:
vfmax.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfmax_2xf32(__epi_2xf32 a, __epi_2xf32 b,
										unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = max(a[element], b[element])
result[gvl : VLMAX] = 0

Instruction:
vfmin.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfmin_2xf32(__epi_2xf32 a, __epi_2xf32 b,	
										unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = min(a[element], b[element])
result[gvl : VLMAX] = 0

Instruction:
vfmsac.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfmsac_2xf32(__epi_2xf32 c, __epi_2xf32 a,
								__epi_2xf32 b, unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = a[element] * b[element] - c[element]
result[gvl : VLMAX] = 0

Instruction:
vfmsub.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfmsub_2xf32(__epi_2xf32 a, __epi_2xf32 b,
								__epi_2xf32 c, unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = a[element] * b[element] - c[element]
result[gvl : VLMAX] = 0

这两条指令的实现也有问题，与上面的vfmacc.vv与vfmadd.vv类似

Instructions:
vfmul.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfmul_2xf32(__epi_2xf32 a, __epi_2xf32 b,
										unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = a[element] * b[element]
result[gvl : VLMAX] = 0

Instruction:
vfnmacc.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfnmacc_2xf32(__epi_2xf32 c, __epi_2xf32 a,
							__epi_2xf32 b, unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = -( a[element] * b[element] ) - c[element]
result[gvl : VLMAX] = 0

Instruction:
vfnmadd.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfnmadd_2xf32(__epi_2xf32 a, __epi_2xf32 b,
							__epi_2xf32 c, unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = -( a[element] * b[element] ) - c[element]
result[gvl : VLMAX] = 0

Instruction:
vfnmsac.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfnmsac_2xf32(__epi_2xf32 c, __epi_2xf32 a,
							__epi_2xf32 b, unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = -( a[element] * b[element] ) + c[element]
result[gvl : VLMAX] = 0

Instruction:
vfnmsub.vv
Prototypes:

__epi_2xf32 __builtin_epi_vfnmsub_2xf32(__epi_2xf32 a, __epi_2xf32 b,
							__epi_2xf32 c, unsigned long int gvl);

Operation:

for element = 0 to gvl - 1
result[element] = -( a[element] * b[element] ) + c[element]
result[gvl : VLMAX] = 0

Instruction:
vfredmax.vs
Prototypes:

__epi_2xf32 __builtin_epi_vfredmax_2xf32(__epi_2xf32 a, __epi_2xf32 b,
										unsigned long int gvl);

Operation:

if gvl > 0:
	current_max = b[0]
	for element = 0 to gvl - 1
		current_max = max(current_max, a[element])
	result[0] = current_max
	result[1 : VLMAX] = 0

Instruction:
vfredmin.vs
Prototypes:

__epi_2xf32 __builtin_epi_vfredmin_2xf32(__epi_2xf32 a, __epi_2xf32 b,
									unsigned long int gvl);

Operation:

if gvl > 0:
	current_min = b[0]
	for element = 0 to gvl - 1
		current_min = min(current_min, a[element])
	result[0] = current_min
	result[1 : VLMAX] = 0

Instruction:
vfredosum.vs
Prototypes:

__epi_2xf32 __builtin_epi_vfredosum_2xf32(__epi_2xf32 a, __epi_2xf32 b,
									unsigned long int gvl);

Operation:

if gvl > 0:
	current_sum = b[0]
	for element = 0 to gvl - 1
		current_sum = current_sum + a[element]
	result[0] = current_sum
	result[1 : VLMAX] = 0

剩余指令在此不再赘述，想继续学习的同学，可自行阅读pdf（已上传），按照我总结的模板，阅读起来会轻松很多

清枫♚

关注

21
点赞
踩
9

收藏

觉得还不错? 一键收藏
打赏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫