Mapping __idmap_text_start to __idmap_text_end in head.S

Kernel: Linux 4.4

Reference: "Memory initialization code analysis (part 1): identity mapping and kernel image mapping" (wowotech.net)

First, the mapping diagram:

The code is in __create_page_tables in head.S:

	create_pgd_entry x0, x3, x5, x6
	mov	x5, x3				// __pa(__idmap_text_start)
	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
	create_block_map x0, x7, x3, x5, x6

At this point:

x0: idmap_pg_dir = 0x1462000

x3: __pa(__idmap_text_start) = 0xC17000

x5: 0x28

x6: swapper_pg_dir + #SWAPPER_DIR_SIZE = 0x1467000

The register contents can be dumped with the macros from my earlier article on printing 64-bit registers and characters in assembly in arm64 head.S.

The create_pgd_entry code is as follows:

/*
 * Macro to populate the PGD (and possibily PUD) for the corresponding
 * block entry in the next level (tbl) for the given virtual address.
 *
 * Preserves:	tbl, next, virt
 * Corrupts:	tmp1, tmp2
 */
	.macro	create_pgd_entry, tbl, virt, tmp1, tmp2
	create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2
#if SWAPPER_PGTABLE_LEVELS > 3
	create_table_entry \tbl, \virt, PUD_SHIFT, PTRS_PER_PUD, \tmp1, \tmp2
#endif
#if SWAPPER_PGTABLE_LEVELS > 2
	create_table_entry \tbl, \virt, SWAPPER_TABLE_SHIFT, PTRS_PER_PTE, \tmp1, \tmp2
#endif
	.endm

SWAPPER_PGTABLE_LEVELS is 2, so neither #if branch is compiled in; only create_table_entry \tbl, \virt, PGDIR_SHIFT, PTRS_PER_PGD, \tmp1, \tmp2 executes.

This fills one entry in the level 1 table (also called the PUD).
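For illustration, the collapse of those preprocessor guards can be reproduced in a small user-space C program. This is a sketch only; SWAPPER_PGTABLE_LEVELS = 2 and the PGDIR constants are the values from this kernel's configuration as shown in the walkthrough below:

#include <stdio.h>

#define SWAPPER_PGTABLE_LEVELS 2

int main(void)
{
#if SWAPPER_PGTABLE_LEVELS > 3
	puts("create_table_entry at PUD level");	/* not compiled in */
#endif
#if SWAPPER_PGTABLE_LEVELS > 2
	puts("create_table_entry at PMD level");	/* not compiled in */
#endif
	/* only the PGDIR-level create_table_entry remains */
	puts("create_table_entry, shift = PGDIR_SHIFT (30), ptrs = PTRS_PER_PGD (512)");
	return 0;
}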

The create_table_entry code:

/*
 * Macro to create a table entry to the next page.
 *
 *	tbl:	page table address
 *	virt:	virtual address
 *	shift:	#imm page table shift
 *	ptrs:	#imm pointers per table page
 *
 * Preserves:	virt
 * Corrupts:	tmp1, tmp2
 * Returns:	tbl -> next level table page address
 */
	.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
	lsr	\tmp1, \virt, #\shift
	and	\tmp1, \tmp1, #\ptrs - 1	// table index
	add	\tmp2, \tbl, #PAGE_SIZE
	orr	\tmp2, \tmp2, #PMD_TYPE_TABLE	// address of next table and entry type
	str	\tmp2, [\tbl, \tmp1, lsl #3]
	add	\tbl, \tbl, #PAGE_SIZE		// next level table page
	.endm

Substituting the actual arguments, the code becomes:

lsr	x5, x3, #30  // x5 = x3 >> 30; x3 is the virtual address VA; VA bits[38:30] are the index into the PUD (level 1) table; x5 = 0
and	x5, x5, #512 - 1	// table index: x5 = x5 & 511 = x5 & 0x1ff; 512 indexes in total
add	x6, x0, #4096       // x6 = x0 + 4096; x0 is the physical address of the current table page, so x6 is the next page: 0x1463000
orr	x6, x6, #(3<<0)	    // address of next table and entry type: x6 = x6 | 3, a table entry
str	x6, [x0, x5, lsl #3]   // [x0 + x5*8] = x6; address 0x1462000 + 0*8 now holds 0x1463003
add	x0, x0, #4096		// next level table page: x0 = 0x1462000 + 4096 = 0x1463000

The correspondence between virtual address bits and table indexes is shown below.

Bits[38:30] of the virtual address (VA) give the index into the level 1 table.

Then, following the table descriptor format defined below, bits[47:12] of the next-level table's physical address are written into bits[47:12] of the entry at that index (a table descriptor); bits[1:0] = 3 mark it as a table descriptor.

That is, in the level 1 table at 0x1462000, entry 0 is written with 0x1463003.

In pseudocode:

.macro	create_table_entry, tbl, virt, shift, ptrs, tmp1, tmp2
Pseudocode:
{
	tmp1 = virt >> shift
	tmp1 = tmp1 & (ptrs - 1)
	tmp2 = tbl + 4096
	tmp2 = tmp2 | PMD_TYPE_TABLE
	u64 *p = (u64 *)(tbl + tmp1*8)   // each entry is 64 bits, i.e. 8 bytes
	*p = tmp2
	tbl += 4096
}

Here tbl is 0x1462000, the level 1 table.

shift is 30.

The virtual address virt is 0xC17000, i.e. __pa(__idmap_text_start). In vmlinux.lds:

. = ALIGN(0x00001000);
__idmap_text_start = .;
*(.idmap.text)
__idmap_text_end = .;

So __idmap_text_start is the start address of the .idmap.text section; System.map lists the functions in that section.

Its bits[38:30] give the index into the level 1 table, which is 0; each table entry occupies 8 bytes.

PMD_TYPE_TABLE is 3, marking the entry as a table descriptor (bits[1:0] = 1 would mark a block descriptor). The table entry format is shown below.

The page size here is 4KB, so m is 12.

That is, bits[47:12] of the level 2 table address must be written into bits[47:12] of the table descriptor.

Level 1 table address 0x1462000, index 0, entry content 0x1463003.
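Putting the numbers together, here is a minimal user-space C sketch of what create_table_entry computes with this boot's actual values (the table address is treated as plain arithmetic, not dereferenced):

#include <stdio.h>
#include <stdint.h>

#define PMD_TYPE_TABLE 3ULL

int main(void)
{
	uint64_t tbl   = 0x1462000;	/* idmap_pg_dir, the level 1 table */
	uint64_t virt  = 0xC17000;	/* __pa(__idmap_text_start); identity map, so VA == PA */
	unsigned shift = 30;		/* PGDIR_SHIFT */
	unsigned ptrs  = 512;		/* PTRS_PER_PGD */

	uint64_t index = (virt >> shift) & (ptrs - 1);	/* VA bits[38:30] -> 0 */
	uint64_t entry = (tbl + 4096) | PMD_TYPE_TABLE;	/* next page | table type */

	printf("index = %llu, entry = 0x%llx\n",
	       (unsigned long long)index, (unsigned long long)entry);
	/* prints: index = 0, entry = 0x1463003 */
	return 0;
}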

After create_table_entry has created entry 0 of the level 1 table, the code goes on to fill the level 2 table. Each level 2 entry is a block descriptor, and each block address is 2MB-aligned:

mov	x5, x3				// __pa(__idmap_text_start)
adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
create_block_map x0, x7, x3, x5, x6

At this point the arguments are:

x0 = 0x1463000

x3: __pa(__idmap_text_start) = 0xC17000, the address of the .idmap.text section

x5: same as x3

x6: 0xC17448

x7: SWAPPER_MM_MMUFLAGS = 0xf11

The create_block_map code:

/*
 * Macro to populate block entries in the page table for the start..end
 * virtual range (inclusive).
 *
 * Preserves:	tbl, flags
 * Corrupts:	phys, start, end, pstate
 */
	.macro	create_block_map, tbl, flags, phys, start, end
	lsr	\phys, \phys, #SWAPPER_BLOCK_SHIFT
	lsr	\start, \start, #SWAPPER_BLOCK_SHIFT
	and	\start, \start, #PTRS_PER_PTE - 1	// table index
	orr	\phys, \flags, \phys, lsl #SWAPPER_BLOCK_SHIFT	// table entry
	lsr	\end, \end, #SWAPPER_BLOCK_SHIFT
	and	\end, \end, #PTRS_PER_PTE - 1		// table end index
9999:	str	\phys, [\tbl, \start, lsl #3]		// store the entry
	add	\start, \start, #1			// next entry
	add	\phys, \phys, #SWAPPER_BLOCK_SIZE		// next block
	cmp	\start, \end
	b.ls	9999b
	.endm

SWAPPER_BLOCK_SHIFT = 0x15 = 21, so VA bits[29:21] select the entry

SWAPPER_BLOCK_SIZE = 0x200000, i.e. 2MB

PTRS_PER_PTE = 0x200 = 512

Substituting the actual arguments:

	lsr	x3, x3, #21       // x3 = x3 >> 21, keeping bits[63:21]; what matters is output address bits[47:21]; value 6
	lsr	x5, x5, #21       // VA bits[63:21]
	and	x5, x5, #512 - 1	// table index: VA bits[29:21], the starting level 2 index; value 6
	orr	x3, x7, x3, lsl #21	// table entry: x3 = x7 | (x3 << 21); x7 holds the attributes in bits[11:0] = 0xf11
	lsr	x6, x6, #21
	and	x6, x6, #512 - 1		// table end index for the level 2 table; also 6
9999: 	str	x3, [x0, x5, lsl #3]		// store the entry; x0 is the level 2 table address; indexes x5..x6 each get a block entry (descriptor)
	add	x5, x5, #1			// next entry
	add	x3, x3, #0x200000		// next block
	cmp	x5, x6
	b.ls	9999b

In pseudocode:

.macro	create_block_map, tbl, flags, phys, start, end
Pseudocode:
{
	phys  = phys >> 21
	start = start >> 21
	start = start & (512-1)   // bits[29:21] of start
	phys = flags | (phys << 21)
	end = end >> 21
	end = end & (512-1)       // bits[29:21] of end

	int i;
	u64 *p;
	for (i = start; i <= end; i++) {
		p = (u64 *)(tbl + i*8);
		*p = phys;
		phys += 0x200000;     // each block is 2MB
	}
}
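As a sanity check, the same loop can be run in user space with this boot's values; the level 2 table is modeled as a local array standing in for the page at 0x1463000 (a sketch, not kernel code):

#include <stdio.h>
#include <stdint.h>

#define SWAPPER_BLOCK_SHIFT 21
#define SWAPPER_BLOCK_SIZE  (1ULL << SWAPPER_BLOCK_SHIFT)	/* 2MB */
#define PTRS_PER_PTE        512

int main(void)
{
	uint64_t table[PTRS_PER_PTE] = { 0 };	/* stands in for the page at 0x1463000 */
	uint64_t flags = 0xf11;			/* SWAPPER_MM_MMUFLAGS */
	uint64_t phys = 0xC17000, start = 0xC17000, end = 0xC17448;

	phys >>= SWAPPER_BLOCK_SHIFT;					/* 6 */
	start = (start >> SWAPPER_BLOCK_SHIFT) & (PTRS_PER_PTE - 1);	/* start index 6 */
	phys  = flags | (phys << SWAPPER_BLOCK_SHIFT);			/* 0xC00F11 */
	end   = (end >> SWAPPER_BLOCK_SHIFT) & (PTRS_PER_PTE - 1);	/* end index 6 */

	for (uint64_t i = start; i <= end; i++) {
		table[i] = phys;
		phys += SWAPPER_BLOCK_SIZE;	/* next 2MB block */
	}
	printf("entry 6 = 0x%llx\n", (unsigned long long)table[6]);
	/* prints: entry 6 = 0xc00f11 */
	return 0;
}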

The block descriptor:

Here n is 21. The output address is the mapped physical address, so bits[47:21] of the physical address are written into bits[47:21] of the descriptor; the other fields are described below.

The physical range to map is __idmap_text_start to __idmap_text_end, i.e. 0xC17000 to 0xC17448. As a physical address, its bits[47:21] must be computed and written into a level 2 table entry.

Because this is an identity mapping, the physical and virtual addresses are equal, so 0xC17000 to 0xC17448 is also the virtual range; as virtual addresses, these determine the level 2 table index.

Bits[47:21] of 0xC17000 are 6, and bits[47:21] of 0xC17448 are also 6, so a single entry, index 6, suffices.

At this point, in the level 2 table at 0x1463000, entry 6 contains: bits[47:21] = 0x6, corresponding to block address 0xC00000, and bits[11:0] = 0xf11.

To print the level 1 table address, entry 0 of the level 1 table, and entry 6 of the level 2 table, the print code is:

	create_pgd_entry x0, x3, x5, x6
	mov	x5, x3				// __pa(__idmap_text_start)
	adr_l	x6, __idmap_text_end		// __pa(__idmap_text_end)
	create_block_map x0, x7, x3, x5, x6


        print_char x8, x1, x2, #0x3a       // ':'
        print_reg64 x25, x8                // address of the level 1 table
        ldr x9, [x25, #0]                  // content of (level 1 table + index 0)
        print_char x10, x1, x2, #0x3a      // ':'
        print_reg64 x9, x10                // print the content of (level 1 table + index 0)
        lsr x9, x9, #12                    // level 2 table address bits[47:12]
        lsl x9, x9, #12                    // level 2 table address

        ldr x10, [x9, #48]                 // content of (level 2 table + index 6)
        print_char x11, x1, x2, #0x3a      // ':'
        print_reg64 x10, x11               // print the content of (level 2 table + index 6)
 

Test output:

Level 1 table address 0x1462000; entry 0 contains 0x1463003.

To get the level 2 table address: each table occupies one page, so clearing the low 12 bits (4KB alignment) of the descriptor gives 0x1463000.

__idmap_text_start is 0xC17000. Treated as a virtual address, its bits[29:21] = 6, so it occupies entry 6 of the level 2 table, at address 0x1463000 + 6*8.

That entry holds a block descriptor.

__idmap_text_start is also the physical address, so its bits[47:21] (value 0b110) are stored into bits[47:21] of the descriptor.

The printed block descriptor is 0xC00F11, whose bits[47:21] are 0b110: the test checks out.
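To tie the two levels together, here is a minimal sketch of the walk the MMU performs for this identity mapping, using only the values printed above (plain arithmetic, not real page-table memory):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t va = 0xC17000;		/* __pa(__idmap_text_start) used as a VA */
	uint64_t l1_entry = 0x1463003;	/* level 1 table, index (va >> 30) & 511 = 0 */
	uint64_t l2_entry = 0xC00F11;	/* level 2 table, index (va >> 21) & 511 = 6 */

	/* table descriptor: next-level table address in bits[47:12] */
	uint64_t l2_table = l1_entry & 0x0000FFFFFFFFF000ULL;	/* 0x1463000 */

	/* block descriptor: 2MB block address in bits[47:21] */
	uint64_t block = l2_entry & 0x0000FFFFFFE00000ULL;	/* 0xC00000 */
	uint64_t pa = block | (va & (0x200000 - 1));		/* offset within the block */

	printf("level 2 table = 0x%llx, PA = 0x%llx\n",
	       (unsigned long long)l2_table, (unsigned long long)pa);
	/* prints: level 2 table = 0x1463000, PA = 0xc17000 == VA (identity mapping) */
	return 0;
}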
