000000000000e470 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE>:
// ---------------------------------------------------------------------------
// Annotated objdump listing (AArch64) of
//   ncnn::conv5x5s1_neon(const Mat& bottom_blob, Mat& top_blob,
//                        const Mat& kernel, const Mat& bias, const Option& opt)
// (signature demangled from the symbol name). 5x5, stride-1 convolution that
// ACCUMULATES into the output: every store is "out += ...".
//
// Incoming arguments: x0 = &bottom_blob, x1 = &top_blob, x2 = &kernel.
// x3 (bias) is overwritten at e4a4 and x4 (opt) at e4b0 without being read.
//
// Mat field names used below (+0 data, +16 elemsize, +36 w, +40 h, +44 c,
// +48 cstep) are taken from the original annotations; the struct layout is not
// visible in this listing -- TODO confirm against the Mat definition of the build.
//
// Names used in the comments (all element sizes are 4 bytes, float lanes):
//   w, inch       bottom_blob.w / bottom_blob.c            (w8, w28)
//   outw, outh    top_blob.w / top_blob.h                  (w4->w0, w24)
//   outch         top_blob.c                               (w16)
//   p, q, i       output-channel / input-channel / output-row counters (x14, x30, w18)
//   nn, remain    outw >> 2 and outw - 4*nn                (w21, w20)
//   kernel0       25 floats for the current (p, q) pair    (x0 inside the q loop)
//   k[n]          kernel0[n]; vectors: v17=k[0..3] v20=k[4..7] v21=k[8..11]
//                 v22=k[12..15] v23=k[16..19] v18=k[20..23] v19=splat(k[24])
//   r0..r5        input row pointers img0 + n*w            (x2, x3, x4, x5, x6, x8)
//   outptr        output row i                             (x7)
//   outptr2       output row i+1                           (x9, two-row loop only)
//
// Loop structure:
//   for p in [0, outch)                       e534 .. ebcc
//     for q in [0, inch)                      e590 .. eb2c
//       two output rows per pass              e618 .. e8fc  (vector e62c, scalar e7bc)
//       remaining single output rows          e90c .. eb14  (vector e928, scalar ea38)
// ---------------------------------------------------------------------------
e470: a9a97bfd stp x29, x30, [sp,#-368]! // allocate a 368-byte frame; save FP/LR at its base
e474: 910003fd mov x29, sp // x29 = frame pointer
e478: 6d0627e8 stp d8, d9, [sp,#96] // save callee-saved d8/d9
e47c: b9402c30 ldr w16, [x1,#44] // w16 = outch (top_blob.c)
e480: 6d072fea stp d10, d11, [sp,#112] // save callee-saved d10/d11
e484: a90363f7 stp x23, x24, [sp,#48] // save callee-saved x23/x24
e488: 6d0837ec stp d12, d13, [sp,#128] // save callee-saved d12/d13
e48c: a90573fb stp x27, x28, [sp,#80] // save callee-saved x27/x28
e490: 6d093fee stp d14, d15, [sp,#144] // save callee-saved d14/d15
e494: a90153f3 stp x19, x20, [sp,#16] // save callee-saved x19/x20
e498: a9025bf5 stp x21, x22, [sp,#32] // save callee-saved x21/x22
e49c: a9046bf9 stp x25, x26, [sp,#64] // save callee-saved x25/x26
e4a0: 6b1f021f cmp w16, wzr // flags for "outch <= 0", consumed by the b.le at e4b8
e4a4: f9400043 ldr x3, [x2] // x3 = kernel.data (the incoming x3 is discarded)
e4a8: b9402408 ldr w8, [x0,#36] // w8 = w (bottom_blob.w)
e4ac: b9402c1c ldr w28, [x0,#44] // w28 = inch (bottom_blob.c)
e4b0: b9402424 ldr w4, [x1,#36] // w4 = outw (top_blob.w)
e4b4: b9402838 ldr w24, [x1,#40] // w24 = outh (top_blob.h)
e4b8: 540033cd b.le eb30 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x6c0> // outch <= 0: nothing to do, go to the epilogue
// --- loop-invariant strides and counts -------------------------------------
e4bc: 531f7906 lsl w6, w8, #1 // w6 = w*2
e4c0: aa0003fa mov x26, x0 // x26 = &bottom_blob (x0 is reused as scratch)
e4c4: 0b0800c7 add w7, w6, w8 // w7 = w*3
e4c8: 937e7cc0 sbfiz x0, x6, #2, #32 // x0 = sign-extended (w*2)*4 = byte offset of input row 2
e4cc: 531e7505 lsl w5, w8, #2 // w5 = w*4
e4d0: f9008fa0 str x0, [x29,#280] // frame[280] = byte offset of row 2 (w*8)
e4d4: 937e7ce0 sbfiz x0, x7, #2, #32 // x0 = (w*3)*4 = byte offset of input row 3
e4d8: 531b6b82 lsl w2, w28, #5 // w2 = inch*32
e4dc: 0b0800aa add w10, w5, w8 // w10 = w*5
e4e0: f9008ba0 str x0, [x29,#272] // frame[272] = byte offset of row 3 (w*12)
e4e4: 937e7ca0 sbfiz x0, x5, #2, #32 // x0 = (w*4)*4 = byte offset of input row 4
e4e8: 4b1c0c59 sub w25, w2, w28, lsl #3 // w25 = inch*32 - inch*8 = inch*24
e4ec: 13027c95 asr w21, w4, #2 // w21 = nn = outw >> 2 (number of 4-wide vector steps per row)
e4f0: 51000b1e sub w30, w24, #0x2 // w30 = outh - 2 (a subtraction, not a shift)
e4f4: f90087a0 str x0, [x29,#264] // frame[264] = byte offset of row 4 (w*16)
e4f8: 937e7d40 sbfiz x0, x10, #2, #32 // x0 = (w*5)*4 = byte offset of input row 5
e4fc: 937e7d0f sbfiz x15, x8, #2, #32 // x15 = w*4 = byte offset of input row 1 (one row stride)
e500: 121f7bc9 and w9, w30, #0xfffffffe // w9 = (outh - 2) with bit 0 cleared
e504: f90083a0 str x0, [x29,#256] // frame[256] = byte offset of row 5 (w*20)
e508: 0b1c0320 add w0, w25, w28 // w0 = inch*25 = kernel floats per output channel
e50c: aa0303fb mov x27, x3 // x27 = kernel pointer for the current p (starts at kernel.data)
e510: b90127a0 str w0, [x29,#292] // frame[292] = inch*25
e514: 937e7c96 sbfiz x22, x4, #2, #32 // x22 = outw*4 = output row stride in bytes
e518: 4b150894 sub w20, w4, w21, lsl #2 // w20 = remain = outw - nn*4 (scalar columns left after the vector loop)
e51c: 910041f3 add x19, x15, #0x10 // x19 = w*4 + 16 bytes = per-pass input advance in the two-row loop (see e8d8)
e520: 11000937 add w23, w9, #0x2 // w23 = ((outh-2) & ~1) + 2 = value of i at which the two-row loop stops
e524: 2a1803e3 mov w3, w24 // w3 = outh
e528: 2a0403e0 mov w0, w4 // w0 = outw
e52c: d280000e mov x14, #0x0 // #0 -- p = 0
e530: 9104e3b9 add x25, x29, #0x138 // x25 = x29 + 0x138 (312): address of a Mat-shaped local, called "out" below
// --- top of the p loop (e534): build "out" describing channel p of top_blob --
e534: f9400824 ldr x4, [x1,#16] // x4 = top_blob.elemsize
e538: 6b1f039f cmp w28, wzr // flags for "inch <= 0", consumed by the b.le at e584 (nothing in between sets flags)
e53c: f9401827 ldr x7, [x1,#48] // x7 = top_blob.cstep
e540: f9400028 ldr x8, [x1] // x8 = top_blob.data
e544: 1b037c02 mul w2, w0, w3 // w2 = outw * outh
e548: 9b077c87 mul x7, x4, x7 // x7 = elemsize * cstep = bytes per output channel
e54c: f9000b24 str x4, [x25,#16] // out[+16] (elemsize) = top_blob.elemsize
e550: 52800044 mov w4, #0x2 // #2
e554: 93407c42 sxtw x2, w2 // x2 = sign-extended outw*outh
e558: b9002720 str w0, [x25,#36] // out[+36] (w) = outw
e55c: d503201f nop
e560: 9b0e20e7 madd x7, x7, x14, x8 // x7 = top_blob.data + p * bytes-per-channel = outptr for channel p
e564: 52800020 mov w0, #0x1 // #1
e568: f900073f str xzr, [x25,#8] // out[+8] = 0 (field name not shown here; presumably the refcount pointer)
e56c: f9000327 str x7, [x25] top_blob的data到sp+138 // out[+0] (data) = x7. NOTE(review): the untranslated text before this marker is an original annotation that lacked "//"; it is left untouched because it sits outside a comment
e570: f9000f3f str xzr, [x25,#24] // out[+24] = 0 (field name not shown here)
e574: b9002324 str w4, [x25,#32] // out[+32] = 2 (presumably dims -- TODO confirm)
e578: b9002b23 str w3, [x25,#40] // out[+40] (h) = outh
e57c: b9002f20 str w0, [x25,#44] // out[+44] (c) = 1
e580: f9001b22 str x2, [x25,#48] // out[+48] (cstep) = outw*outh
e584: 54002f0d b.le eb64 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x6f4> // inch <= 0: skip the q loop, go to the end-of-p code
e588: aa1b03e0 mov x0, x27 // x0 = kernel0 for (p, q = 0)
e58c: d280001e mov x30, #0x0 // #0 -- q = 0 (x30 is free: LR was saved in the prologue)
// --- top of the q loop (e590): load the 25 weights and set up row pointers ---
e590: aa0003e6 mov x6, x0 // x6 = kernel0
e594: f9401b42 ldr x2, [x26,#48] // x2 = bottom_blob.cstep
e598: f9400b4a ldr x10, [x26,#16] // x10 = bottom_blob.elemsize
e59c: 91010004 add x4, x0, #0x40 // x4 = kernel0 + 64 bytes = &k[16]
e5a0: 3cc104d1 ldr q17, [x6],#16 // v17 = k[0..3]; post-increment x6 to &k[4]
e5a4: 91014005 add x5, x0, #0x50 // x5 = kernel0 + 80 bytes = &k[20]
e5a8: f9400349 ldr x9, [x26] // x9 = bottom_blob.data
e5ac: 9100c003 add x3, x0, #0x30 // x3 = kernel0 + 48 bytes = &k[12]
e5b0: 9b027d42 mul x2, x10, x2 // x2 = elemsize * cstep = bytes per input channel
e5b4: bd406013 ldr s19, [x0,#96] // s19 = k[24]
e5b8: 91008008 add x8, x0, #0x20 // x8 = kernel0 + 32 bytes = &k[8]
e5bc: 3dc00097 ldr q23, [x4] // v23 = k[16..19]
e5c0: 7100071f cmp w24, #0x1 // flags for "outh <= 1", consumed by the b.le at e604
e5c4: 9b1e2442 madd x2, x2, x30, x9 // x2 = bottom_blob.data + q * bytes-per-channel = img0 = r0
e5c8: 3dc000d4 ldr q20, [x6] // v20 = k[4..7]
e5cc: 9100500b add x11, x0, #0x14 // x11 = kernel0 + 20 bytes = &k[5]  (used by the scalar loops)
e5d0: f9408fa4 ldr x4, [x29,#280] // x4 = byte offset of row 2
e5d4: 9100a00c add x12, x0, #0x28 // x12 = kernel0 + 40 bytes = &k[10] (used by the scalar loops)
e5d8: 3dc000b2 ldr q18, [x5] // v18 = k[20..23]
e5dc: 9100f00d add x13, x0, #0x3c // x13 = kernel0 + 60 bytes = &k[15] (used by the scalar loops)
e5e0: f9408ba5 ldr x5, [x29,#272] // x5 = byte offset of row 3
e5e4: 8b040044 add x4, x2, x4 // x4 = r2 = img0 + 2*w floats
e5e8: f94087a6 ldr x6, [x29,#264] // x6 = byte offset of row 4
e5ec: 4e040673 dup v19.4s, v19.s[0] // v19 = k[24] broadcast to all four lanes
e5f0: 3dc00076 ldr q22, [x3] // v22 = k[12..15]
e5f4: 8b050045 add x5, x2, x5 // x5 = r3 = img0 + 3*w floats
e5f8: 3dc00115 ldr q21, [x8] // v21 = k[8..11]
e5fc: 8b0f0043 add x3, x2, x15 // x3 = r1 = img0 + w floats
e600: 8b060046 add x6, x2, x6 // x6 = r4 = img0 + 4*w floats
e604: 54002acd b.le eb5c <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x6ec> // outh <= 1: skip the two-row loop (eb5c sets i = 0 and joins at e904)
e608: f94083a8 ldr x8, [x29,#256] // x8 = byte offset of row 5
e60c: 8b1600e9 add x9, x7, x22 // x9 = outptr2 = outptr + outw floats (next output row)
e610: 52800012 mov w18, #0x0 // #0 -- i = 0
e614: 8b080048 add x8, x2, x8 // x8 = r5 = img0 + 5*w floats
// --- two-row loop (e618): produces output rows i and i+1 per pass ----------
//     accumulators: v7 + v13 -> row i, v8 + v14 -> row i+1
e618: 6b1f02bf cmp w21, wzr // nn <= 0 ?
e61c: 54000c8d b.le e7ac <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x33c> // yes: no vector steps, go to the scalar remainder
e620: 2a1503ea mov w10, w21 // w10 = vector step counter = nn
e624: f98040e0 prfm pldl1keep, [x7,#128] // prefetch outptr + 128 bytes
e628: 4c4078e7 ld1 {v7.4s}, [x7] // v7 = outptr[0..3] (existing output values, accumulated into)
// vector body (e62c): 4 output columns of both rows per iteration
e62c: f9804120 prfm pldl1keep, [x9,#128] // prefetch outptr2 + 128 bytes
e630: 4c407928 ld1 {v8.4s}, [x9] // v8 = outptr2[0..3]
e634: f9808060 prfm pldl1keep, [x3,#256] // prefetch r1 + 256 bytes
e638: 4c40a869 ld1 {v9.4s, v10.4s}, [x3] // v9 = r1[0..3], v10 = r1[4..7]
e63c: 91004063 add x3, x3, #0x10 // r1 += 4 floats
e640: 6e0a212b ext v11.16b, v9.16b, v10.16b, #4 // v11 = r1[1..4]
e644: 4fb4912d fmul v13.4s, v9.4s, v20.s[1] // v13  = r1[0..3] * k[5]   (row i, kernel row 1)
e648: 4f911128 fmla v8.4s, v9.4s, v17.s[0] // v8  += r1[0..3] * k[0]   (row i+1, kernel row 0)
e64c: 6e0a412c ext v12.16b, v9.16b, v10.16b, #8 // v12 = r1[2..5]
e650: 4f941967 fmla v7.4s, v11.4s, v20.s[2] // v7  += r1[1..4] * k[6]
e654: 4fb1916e fmul v14.4s, v11.4s, v17.s[1] // v14  = r1[1..4] * k[1]
e658: 6e0a612b ext v11.16b, v9.16b, v10.16b, #12 // v11 = r1[3..6]
e65c: 4fb4198d fmla v13.4s, v12.4s, v20.s[3] // v13 += r1[2..5] * k[7]
e660: 4f911988 fmla v8.4s, v12.4s, v17.s[2] // v8  += r1[2..5] * k[2]
e664: 4f951167 fmla v7.4s, v11.4s, v21.s[0] // v7  += r1[3..6] * k[8]
e668: 4fb1196e fmla v14.4s, v11.4s, v17.s[3] // v14 += r1[3..6] * k[3]
e66c: f9808080 prfm pldl1keep, [x4,#256] // prefetch r2 + 256 bytes
e670: 4fb5114d fmla v13.4s, v10.4s, v21.s[1] // v13 += r1[4..7] * k[9]
e674: 4f941148 fmla v8.4s, v10.4s, v20.s[0] // v8  += r1[4..7] * k[4]
e678: 4c40a889 ld1 {v9.4s, v10.4s}, [x4] // v9 = r2[0..3], v10 = r2[4..7]
e67c: 91004084 add x4, x4, #0x10 // r2 += 4 floats
e680: 6e0a212b ext v11.16b, v9.16b, v10.16b, #4 // v11 = r2[1..4]
e684: 4f951927 fmla v7.4s, v9.4s, v21.s[2] // v7  += r2[0..3] * k[10]  (row i, kernel row 2)
e688: 4fb4112e fmla v14.4s, v9.4s, v20.s[1] // v14 += r2[0..3] * k[5]   (row i+1, kernel row 1)
e68c: 6e0a412c ext v12.16b, v9.16b, v10.16b, #8 // v12 = r2[2..5]
e690: 4fb5196d fmla v13.4s, v11.4s, v21.s[3] // v13 += r2[1..4] * k[11]
e694: 4f941968 fmla v8.4s, v11.4s, v20.s[2] // v8  += r2[1..4] * k[6]
e698: 6e0a612b ext v11.16b, v9.16b, v10.16b, #12 // v11 = r2[3..6]
e69c: 4f961187 fmla v7.4s, v12.4s, v22.s[0] // v7  += r2[2..5] * k[12]
e6a0: 4fb4198e fmla v14.4s, v12.4s, v20.s[3] // v14 += r2[2..5] * k[7]
e6a4: 4fb6116d fmla v13.4s, v11.4s, v22.s[1] // v13 += r2[3..6] * k[13]
e6a8: 4f951168 fmla v8.4s, v11.4s, v21.s[0] // v8  += r2[3..6] * k[8]
e6ac: f98080a0 prfm pldl1keep, [x5,#256] // prefetch r3 + 256 bytes
e6b0: 4f961947 fmla v7.4s, v10.4s, v22.s[2] // v7  += r2[4..7] * k[14]
e6b4: 4fb5114e fmla v14.4s, v10.4s, v21.s[1] // v14 += r2[4..7] * k[9]
e6b8: 4c40a8a9 ld1 {v9.4s, v10.4s}, [x5] // v9 = r3[0..3], v10 = r3[4..7]
e6bc: 910040a5 add x5, x5, #0x10 // r3 += 4 floats
e6c0: 6e0a212b ext v11.16b, v9.16b, v10.16b, #4 // v11 = r3[1..4]
e6c4: 4fb6192d fmla v13.4s, v9.4s, v22.s[3] // v13 += r3[0..3] * k[15]  (row i, kernel row 3)
e6c8: 4f951928 fmla v8.4s, v9.4s, v21.s[2] // v8  += r3[0..3] * k[10]  (row i+1, kernel row 2)
e6cc: 6e0a412c ext v12.16b, v9.16b, v10.16b, #8 // v12 = r3[2..5]
e6d0: 4f971167 fmla v7.4s, v11.4s, v23.s[0] // v7  += r3[1..4] * k[16]
e6d4: 4fb5196e fmla v14.4s, v11.4s, v21.s[3] // v14 += r3[1..4] * k[11]
e6d8: 6e0a612b ext v11.16b, v9.16b, v10.16b, #12 // v11 = r3[3..6]
e6dc: 4fb7118d fmla v13.4s, v12.4s, v23.s[1] // v13 += r3[2..5] * k[17]
e6e0: 4f961188 fmla v8.4s, v12.4s, v22.s[0] // v8  += r3[2..5] * k[12]
e6e4: 4f971967 fmla v7.4s, v11.4s, v23.s[2] // v7  += r3[3..6] * k[18]
e6e8: 4fb6116e fmla v14.4s, v11.4s, v22.s[1] // v14 += r3[3..6] * k[13]
e6ec: f98080c0 prfm pldl1keep, [x6,#256] // prefetch r4 + 256 bytes
e6f0: 4fb7194d fmla v13.4s, v10.4s, v23.s[3] // v13 += r3[4..7] * k[19]
e6f4: 4f961948 fmla v8.4s, v10.4s, v22.s[2] // v8  += r3[4..7] * k[14]
e6f8: 4c40a8c9 ld1 {v9.4s, v10.4s}, [x6] // v9 = r4[0..3], v10 = r4[4..7]
e6fc: 910040c6 add x6, x6, #0x10 // r4 += 4 floats
e700: 6e0a212b ext v11.16b, v9.16b, v10.16b, #4 // v11 = r4[1..4]
e704: 4f921127 fmla v7.4s, v9.4s, v18.s[0] // v7  += r4[0..3] * k[20]  (row i, kernel row 4)
e708: 4fb6192e fmla v14.4s, v9.4s, v22.s[3] // v14 += r4[0..3] * k[15]  (row i+1, kernel row 3)
e70c: 6e0a412c ext v12.16b, v9.16b, v10.16b, #8 // v12 = r4[2..5]
e710: 4fb2116d fmla v13.4s, v11.4s, v18.s[1] // v13 += r4[1..4] * k[21]
e714: 4f971168 fmla v8.4s, v11.4s, v23.s[0] // v8  += r4[1..4] * k[16]
e718: 6e0a612b ext v11.16b, v9.16b, v10.16b, #12 // v11 = r4[3..6]
e71c: 4f921987 fmla v7.4s, v12.4s, v18.s[2] // v7  += r4[2..5] * k[22]
e720: 4fb7118e fmla v14.4s, v12.4s, v23.s[1] // v14 += r4[2..5] * k[17]
e724: 4fb2196d fmla v13.4s, v11.4s, v18.s[3] // v13 += r4[3..6] * k[23]
e728: 4f971968 fmla v8.4s, v11.4s, v23.s[2] // v8  += r4[3..6] * k[18]
e72c: f9808040 prfm pldl1keep, [x2,#256] // prefetch r0 + 256 bytes
e730: 4f931147 fmla v7.4s, v10.4s, v19.s[0] // v7  += r4[4..7] * k[24]
e734: 4fb7194e fmla v14.4s, v10.4s, v23.s[3] // v14 += r4[4..7] * k[19]
e738: 4c40a849 ld1 {v9.4s, v10.4s}, [x2] // v9 = r0[0..3], v10 = r0[4..7] (row i only, kernel row 0)
e73c: 91004042 add x2, x2, #0x10 // r0 += 4 floats
e740: 6e0a212b ext v11.16b, v9.16b, v10.16b, #4 // v11 = r0[1..4]
e744: 4fb1116d fmla v13.4s, v11.4s, v17.s[1] // v13 += r0[1..4] * k[1]
e748: 6e0a412c ext v12.16b, v9.16b, v10.16b, #8 // v12 = r0[2..5]
e74c: 4f911987 fmla v7.4s, v12.4s, v17.s[2] // v7  += r0[2..5] * k[2]
e750: 6e0a612b ext v11.16b, v9.16b, v10.16b, #12 // v11 = r0[3..6]
e754: f9808100 prfm pldl1keep, [x8,#256] // prefetch r5 + 256 bytes
e758: 4fb1196d fmla v13.4s, v11.4s, v17.s[3] // v13 += r0[3..6] * k[3]
e75c: 4c40a90b ld1 {v11.4s, v12.4s}, [x8] // v11 = r5[0..3], v12 = r5[4..7] (row i+1 only, kernel row 4)
e760: 91004108 add x8, x8, #0x10 // r5 += 4 floats
e764: 4f921168 fmla v8.4s, v11.4s, v18.s[0] // v8  += r5[0..3] * k[20]
e768: 4f93118e fmla v14.4s, v12.4s, v19.s[0] // v14 += r5[4..7] * k[24]
e76c: 4f911127 fmla v7.4s, v9.4s, v17.s[0] // v7  += r0[0..3] * k[0]
e770: 4f94114d fmla v13.4s, v10.4s, v20.s[0] // v13 += r0[4..7] * k[4]
e774: 6e0c2169 ext v9.16b, v11.16b, v12.16b, #4 // v9  = r5[1..4]
e778: 6e0c416a ext v10.16b, v11.16b, v12.16b, #8 // v10 = r5[2..5]
e77c: 4fb2112e fmla v14.4s, v9.4s, v18.s[1] // v14 += r5[1..4] * k[21]
e780: 6e0c6169 ext v9.16b, v11.16b, v12.16b, #12 // v9  = r5[3..6]
e784: 4f921948 fmla v8.4s, v10.4s, v18.s[2] // v8  += r5[2..5] * k[22]
e788: 4fb2192e fmla v14.4s, v9.4s, v18.s[3] // v14 += r5[3..6] * k[23]
e78c: 4e2dd4e7 fadd v7.4s, v7.4s, v13.4s // combine the two partial sums of row i
e790: 4c9f78e7 st1 {v7.4s}, [x7], #16 // store 4 results to outptr; outptr += 4 floats
e794: 4e2ed508 fadd v8.4s, v8.4s, v14.4s // combine the two partial sums of row i+1
e798: f98040e0 prfm pldl1keep, [x7,#128] // prefetch outptr + 128 bytes
e79c: 4c4078e7 ld1 {v7.4s}, [x7] // preload outptr[0..3] for the next iteration (the loop re-enters at e62c, after the initial load)
e7a0: 4c9f7928 st1 {v8.4s}, [x9], #16 // store 4 results to outptr2; outptr2 += 4 floats
e7a4: 7100054a subs w10, w10, #0x1 // one vector step done
e7a8: 54fff421 b.ne e62c <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x1bc> // loop while steps remain
// scalar remainder of the two-row loop: one output column of both rows per iteration
e7ac: 6b1f029f cmp w20, wzr // remain <= 0 ?
e7b0: 5400092d b.le e8d4 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x464> // yes: go to the row advance
e7b4: 2a1403ea mov w10, w20 // w10 = column counter = remain
e7b8: f90097a1 str x1, [x29,#296] // spill &top_blob: w1 is used as scratch below
e7bc: 3dc00062 ldr q2, [x3] // v2 = r1[0..3]
e7c0: 91001063 add x3, x3, #0x4 // r1 += 1 float (so [x3,#12] below is the old r1[4])
e7c4: 3dc0017a ldr q26, [x11] // v26 = k[5..8]
e7c8: 7100054a subs w10, w10, #0x1 // decrement the counter; flags are consumed by the b.ne at e8cc
e7cc: 3dc00099 ldr q25, [x4] // v25 = r2[0..3]
e7d0: 91001084 add x4, x4, #0x4 // r2 += 1 float
e7d4: 3dc00198 ldr q24, [x12] // v24 = k[10..13]
e7d8: 3dc000a1 ldr q1, [x5] // v1 = r3[0..3]
e7dc: 910010a5 add x5, x5, #0x4 // r3 += 1 float
e7e0: 6e22de27 fmul v7.4s, v17.4s, v2.4s // v7 (row i+1) = k[0..3] * r1[0..3]
e7e4: 3dc001a3 ldr q3, [x13] // v3 = k[15..18]
e7e8: 6e22df42 fmul v2.4s, v26.4s, v2.4s // v2 (row i)   = k[5..8] * r1[0..3]
e7ec: 3dc000c0 ldr q0, [x6] // v0 = r4[0..3]
e7f0: 3dc0005b ldr q27, [x2] // v27 = r0[0..3]
e7f4: 91001042 add x2, x2, #0x4 // r0 += 1 float
e7f8: b9401001 ldr w1, [x0,#16] // w1 = bits of k[4]
e7fc: 3dc0011c ldr q28, [x8] // v28 = r5[0..3]
e800: 4e39cf47 fmla v7.4s, v26.4s, v25.4s // v7 += k[5..8]   * r2[0..3]
e804: b9404c11 ldr w17, [x0,#76] // w17 = bits of k[19]
e808: 4e39cf02 fmla v2.4s, v24.4s, v25.4s // v2 += k[10..13] * r2[0..3]
e80c: 910010c6 add x6, x6, #0x4 // r4 += 1 float
e810: 4e041c25 mov v5.s[0], w1 // v5[0] = k[4]
e814: b9400c41 ldr w1, [x2,#12] // w1 = old r0[4]
e818: 91001108 add x8, x8, #0x4 // r5 += 1 float
e81c: 4e21cf07 fmla v7.4s, v24.4s, v1.4s // v7 += k[10..13] * r3[0..3]
e820: bd4000f8 ldr s24, [x7] // s24 = *outptr (v24 is no longer needed as a kernel vector in this iteration)
e824: 4e21cc62 fmla v2.4s, v3.4s, v1.4s // v2 += k[15..18] * r3[0..3]
e828: bd400cc1 ldr s1, [x6,#12] // s1 = old r4[4]
e82c: 4e041c24 mov v4.s[0], w1 // v4[0] = r0[4]
e830: b9402401 ldr w1, [x0,#36] // w1 = bits of k[9]
e834: 4e20cc67 fmla v7.4s, v3.4s, v0.4s // v7 += k[15..18] * r4[0..3]
e838: bd400d03 ldr s3, [x8,#12] // s3 = old r5[4]
e83c: 4e20ce42 fmla v2.4s, v18.4s, v0.4s // v2 += k[20..23] * r4[0..3]
e840: bd406000 ldr s0, [x0,#96] // s0 = k[24]
e844: 4e0c1c25 mov v5.s[1], w1 // v5[1] = k[9]
e848: b9400c61 ldr w1, [x3,#12] // w1 = old r1[4]
e84c: 4e0c1c24 mov v4.s[1], w1 // v4[1] = r1[4]
e850: 4e3bce22 fmla v2.4s, v17.4s, v27.4s // v2 += k[0..3] * r0[0..3]
e854: b9403801 ldr w1, [x0,#56] // w1 = bits of k[14]
e858: 4e141c25 mov v5.s[2], w1 // v5[2] = k[14]
e85c: b9400c81 ldr w1, [x4,#12] // w1 = old r2[4]
e860: 4e3cce47 fmla v7.4s, v18.4s, v28.4s // v7 += k[20..23] * r5[0..3]
e864: 4e141c24 mov v4.s[2], w1 // v4[2] = r2[4]
e868: 4e1c1e25 mov v5.s[3], w17 // v5[3] = k[19]  => v5 = {k[4], k[9], k[14], k[19]} (fifth kernel column, rows 0..3)
e86c: b9400ca1 ldr w1, [x5,#12] // w1 = old r3[4]
e870: 4e1c1c24 mov v4.s[3], w1 // v4 = {r0[4], r1[4], r2[4], r3[4]}
e874: 4e24cca2 fmla v2.4s, v5.4s, v4.4s // v2 += fifth-column terms of kernel rows 0..3 for row i
e878: b9400cc1 ldr w1, [x6,#12] // w1 = old r4[4]
e87c: 6e042084 ext v4.16b, v4.16b, v4.16b, #4 // rotate v4 down one lane: {r1[4], r2[4], r3[4], r0[4]}
e880: 4e1c1c24 mov v4.s[3], w1 // v4 = {r1[4], r2[4], r3[4], r4[4]}
e884: 4e24cca7 fmla v7.4s, v5.4s, v4.4s // v7 += fifth-column terms of kernel rows 0..3 for row i+1
e888: 1e200821 fmul s1, s1, s0 // s1 = r4[4] * k[24] (last tap of row i)
e88c: 5e080459 mov d25, v2.d[0] // d25 = low half of v2
e890: 1e230800 fmul s0, s0, s3 // s0 = k[24] * r5[4] (last tap of row i+1)
e894: 6e084442 mov v2.d[0], v2.d[1] // v2 low half = v2 high half
e898: 6e0844e3 mov v3.d[0], v7.d[1] // v3 low half = v7 high half
e89c: 0e27d467 fadd v7.2s, v3.2s, v7.2s // v7.2s = high half + low half of the row i+1 vector
e8a0: 0e39d442 fadd v2.2s, v2.2s, v25.2s // v2.2s = high half + low half of the row i vector
e8a4: 2e27d442 faddp v2.2s, v2.2s, v7.2s // v2 = {horizontal sum for row i, horizontal sum for row i+1}
e8a8: 5e040443 mov s3, v2.s[0] // s3 = row i sum
e8ac: 5e0c0442 mov s2, v2.s[1] // s2 = row i+1 sum
e8b0: 1e232821 fadd s1, s1, s3 // s1 = full 25-tap sum for row i
e8b4: 1e222802 fadd s2, s0, s2 // s2 = full 25-tap sum for row i+1
e8b8: 1e212b18 fadd s24, s24, s1 // add to the existing *outptr
e8bc: bc0044f8 str s24, [x7],#4 // *outptr = result; outptr += 1 float
e8c0: bd400121 ldr s1, [x9] // s1 = *outptr2
e8c4: 1e222821 fadd s1, s1, s2 // add the row i+1 sum
e8c8: bc004521 str s1, [x9],#4 // *outptr2 = result; outptr2 += 1 float
e8cc: 54fff781 b.ne e7bc <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x34c> // loop while columns remain (flags from the subs at e7c8)
e8d0: f94097a1 ldr x1, [x29,#296] // restore &top_blob
// advance to the next pair of output rows
e8d4: 11000a52 add w18, w18, #0x2 // i += 2
e8d8: 8b130042 add x2, x2, x19 // r0 += w*4 + 16 bytes: the pointers already moved outw floats this pass, so this lands two input rows down provided w == outw + 4 (not checked here)
e8dc: 6b17025f cmp w18, w23 // i == end of the two-row loop ? (flags consumed at e8fc)
e8e0: 8b130063 add x3, x3, x19 // r1: same advance
e8e4: 8b130084 add x4, x4, x19 // r2: same advance
e8e8: 8b1300a5 add x5, x5, x19 // r3: same advance
e8ec: 8b1300c6 add x6, x6, x19 // r4: same advance
e8f0: 8b130108 add x8, x8, x19 // r5: same advance
e8f4: 8b1600e7 add x7, x7, x22 // outptr += outw floats (skip the row just written through outptr2)
e8f8: 8b160129 add x9, x9, x22 // outptr2 += outw floats
e8fc: 54ffe8e1 b.ne e618 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x1a8> // more row pairs: loop
e900: 2a1703f2 mov w18, w23 // i = w23
// --- single-row loop: remaining output rows, one per pass ------------------
e904: 6b12031f cmp w24, w18 // outh <= i ?
e908: 5400108d b.le eb18 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x6a8> // yes: this input channel is finished
e90c: 6b1f02bf cmp w21, wzr // nn <= 0 ?
e910: 540008cd b.le ea28 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x5b8> // yes: go to the scalar remainder
e914: 2a1503e8 mov w8, w21 // w8 = vector step counter = nn (x8/r5 is not used in this loop)
e918: f98040e0 prfm pldl1keep, [x7,#128] // prefetch outptr + 128 bytes
e91c: f9808040 prfm pldl1keep, [x2,#256] // prefetch r0 + 256 bytes
e920: 4c40a848 ld1 {v8.4s, v9.4s}, [x2] // v8 = r0[0..3], v9 = r0[4..7] (preloaded before entering the loop)
e924: 91004042 add x2, x2, #0x10 // r0 += 4 floats
// vector body (e928): v7, v13, v14, v15 are four partial sums for output row i
e928: 4c4078e7 ld1 {v7.4s}, [x7] // v7 = outptr[0..3]
e92c: 6e09210a ext v10.16b, v8.16b, v9.16b, #4 // v10 = r0[1..4]
e930: 6e09410b ext v11.16b, v8.16b, v9.16b, #8 // v11 = r0[2..5]
e934: 6e09610c ext v12.16b, v8.16b, v9.16b, #12 // v12 = r0[3..6]
e938: 4f911107 fmla v7.4s, v8.4s, v17.s[0] // v7  += r0[0..3] * k[0]
e93c: 4fb1914d fmul v13.4s, v10.4s, v17.s[1] // v13  = r0[1..4] * k[1]
e940: f9808060 prfm pldl1keep, [x3,#256] // prefetch r1 + 256 bytes
e944: 4f91996e fmul v14.4s, v11.4s, v17.s[2] // v14  = r0[2..5] * k[2]
e948: 4fb1998f fmul v15.4s, v12.4s, v17.s[3] // v15  = r0[3..6] * k[3]
e94c: 4f941127 fmla v7.4s, v9.4s, v20.s[0] // v7  += r0[4..7] * k[4]
e950: 4c40a868 ld1 {v8.4s, v9.4s}, [x3] // v8 = r1[0..3], v9 = r1[4..7]
e954: 91004063 add x3, x3, #0x10 // r1 += 4 floats
e958: 6e09210a ext v10.16b, v8.16b, v9.16b, #4 // v10 = r1[1..4]
e95c: 6e09410b ext v11.16b, v8.16b, v9.16b, #8 // v11 = r1[2..5]
e960: 6e09610c ext v12.16b, v8.16b, v9.16b, #12 // v12 = r1[3..6]
e964: 4fb41107 fmla v7.4s, v8.4s, v20.s[1] // v7  += r1[0..3] * k[5]
e968: 4f94194d fmla v13.4s, v10.4s, v20.s[2] // v13 += r1[1..4] * k[6]
e96c: f9808080 prfm pldl1keep, [x4,#256] // prefetch r2 + 256 bytes
e970: 4fb4196e fmla v14.4s, v11.4s, v20.s[3] // v14 += r1[2..5] * k[7]
e974: 4f95118f fmla v15.4s, v12.4s, v21.s[0] // v15 += r1[3..6] * k[8]
e978: 4fb51127 fmla v7.4s, v9.4s, v21.s[1] // v7  += r1[4..7] * k[9]
e97c: 4c40a888 ld1 {v8.4s, v9.4s}, [x4] // v8 = r2[0..3], v9 = r2[4..7]
e980: 91004084 add x4, x4, #0x10 // r2 += 4 floats
e984: 6e09210a ext v10.16b, v8.16b, v9.16b, #4 // v10 = r2[1..4]
e988: 6e09410b ext v11.16b, v8.16b, v9.16b, #8 // v11 = r2[2..5]
e98c: 6e09610c ext v12.16b, v8.16b, v9.16b, #12 // v12 = r2[3..6]
e990: 4f951907 fmla v7.4s, v8.4s, v21.s[2] // v7  += r2[0..3] * k[10]
e994: 4fb5194d fmla v13.4s, v10.4s, v21.s[3] // v13 += r2[1..4] * k[11]
e998: f98080a0 prfm pldl1keep, [x5,#256] // prefetch r3 + 256 bytes
e99c: 4f96116e fmla v14.4s, v11.4s, v22.s[0] // v14 += r2[2..5] * k[12]
e9a0: 4fb6118f fmla v15.4s, v12.4s, v22.s[1] // v15 += r2[3..6] * k[13]
e9a4: 4f961927 fmla v7.4s, v9.4s, v22.s[2] // v7  += r2[4..7] * k[14]
e9a8: 4c40a8a8 ld1 {v8.4s, v9.4s}, [x5] // v8 = r3[0..3], v9 = r3[4..7]
e9ac: 910040a5 add x5, x5, #0x10 // r3 += 4 floats
e9b0: 6e09210a ext v10.16b, v8.16b, v9.16b, #4 // v10 = r3[1..4]
e9b4: 6e09410b ext v11.16b, v8.16b, v9.16b, #8 // v11 = r3[2..5]
e9b8: 6e09610c ext v12.16b, v8.16b, v9.16b, #12 // v12 = r3[3..6]
e9bc: 4fb61907 fmla v7.4s, v8.4s, v22.s[3] // v7  += r3[0..3] * k[15]
e9c0: 4f97114d fmla v13.4s, v10.4s, v23.s[0] // v13 += r3[1..4] * k[16]
e9c4: f98080c0 prfm pldl1keep, [x6,#256] // prefetch r4 + 256 bytes
e9c8: 4fb7116e fmla v14.4s, v11.4s, v23.s[1] // v14 += r3[2..5] * k[17]
e9cc: 4f97198f fmla v15.4s, v12.4s, v23.s[2] // v15 += r3[3..6] * k[18]
e9d0: 4fb71927 fmla v7.4s, v9.4s, v23.s[3] // v7  += r3[4..7] * k[19]
e9d4: 4c40a8c8 ld1 {v8.4s, v9.4s}, [x6] // v8 = r4[0..3], v9 = r4[4..7]
e9d8: 910040c6 add x6, x6, #0x10 // r4 += 4 floats
e9dc: 6e09210a ext v10.16b, v8.16b, v9.16b, #4 // v10 = r4[1..4]
e9e0: 6e09410b ext v11.16b, v8.16b, v9.16b, #8 // v11 = r4[2..5]
e9e4: 6e09610c ext v12.16b, v8.16b, v9.16b, #12 // v12 = r4[3..6]
e9e8: 4f921107 fmla v7.4s, v8.4s, v18.s[0] // v7  += r4[0..3] * k[20]
e9ec: 4fb2114d fmla v13.4s, v10.4s, v18.s[1] // v13 += r4[1..4] * k[21]
e9f0: 4f92196e fmla v14.4s, v11.4s, v18.s[2] // v14 += r4[2..5] * k[22]
e9f4: 4fb2198f fmla v15.4s, v12.4s, v18.s[3] // v15 += r4[3..6] * k[23]
e9f8: 4f931127 fmla v7.4s, v9.4s, v19.s[0] // v7  += r4[4..7] * k[24]
e9fc: 4e2fd5ce fadd v14.4s, v14.4s, v15.4s // reduce the four partial sums: v14 += v15
ea00: 4e2dd4e7 fadd v7.4s, v7.4s, v13.4s // v7 += v13
ea04: f9808040 prfm pldl1keep, [x2,#256] // prefetch r0 + 256 bytes
ea08: 4e2ed4e7 fadd v7.4s, v7.4s, v14.4s // v7 = final 4 results
ea0c: 4c40a848 ld1 {v8.4s, v9.4s}, [x2] // preload r0[0..7] for the next iteration
ea10: 91004042 add x2, x2, #0x10 // r0 += 4 floats
ea14: 4c9f78e7 st1 {v7.4s}, [x7], #16 // store 4 results to outptr; outptr += 4 floats
ea18: f98040e0 prfm pldl1keep, [x7,#128] // prefetch outptr + 128 bytes
ea1c: 71000508 subs w8, w8, #0x1 // one vector step done
ea20: 54fff841 b.ne e928 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x4b8> // loop while steps remain
ea24: d1004042 sub x2, x2, #0x10 // undo the r0 advance made by the final, unused preload
// scalar remainder of the single-row loop: one output column per iteration
ea28: 6b1f029f cmp w20, wzr // remain <= 0 ?
ea2c: 5400066d b.le eaf8 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x688> // yes: go to the row advance
ea30: 2a1403e8 mov w8, w20 // w8 = column counter = remain
ea34: f90097a1 str x1, [x29,#296] // spill &top_blob: w1 is used as scratch below
ea38: 3dc00040 ldr q0, [x2] // v0 = r0[0..3]
ea3c: 91001042 add x2, x2, #0x4 // r0 += 1 float (so [x2,#12] below is the old r0[4])
ea40: 3dc00079 ldr q25, [x3] // v25 = r1[0..3]
ea44: 91001063 add x3, x3, #0x4 // r1 += 1 float
ea48: 3dc0017a ldr q26, [x11] // v26 = k[5..8]
ea4c: 71000508 subs w8, w8, #0x1 // decrement the counter; flags are consumed by the b.ne at eaf0
ea50: 3dc00087 ldr q7, [x4] // v7 = r2[0..3]
ea54: 91001084 add x4, x4, #0x4 // r2 += 1 float
ea58: 3dc00198 ldr q24, [x12] // v24 = k[10..13]
ea5c: 6e20de20 fmul v0.4s, v17.4s, v0.4s // v0  = k[0..3] * r0[0..3]
ea60: 3dc000a2 ldr q2, [x5] // v2 = r3[0..3]
ea64: 3dc001a3 ldr q3, [x13] // v3 = k[15..18]
ea68: 910010a5 add x5, x5, #0x4 // r3 += 1 float
ea6c: 3dc000c1 ldr q1, [x6] // v1 = r4[0..3]
ea70: 910010c6 add x6, x6, #0x4 // r4 += 1 float
ea74: b9401001 ldr w1, [x0,#16] // w1 = bits of k[4]
ea78: 4e39cf40 fmla v0.4s, v26.4s, v25.4s // v0 += k[5..8] * r1[0..3]
ea7c: b9402411 ldr w17, [x0,#36] // w17 = bits of k[9]
ea80: 4e041c26 mov v6.s[0], w1 // v6[0] = k[4]
ea84: b9400c41 ldr w1, [x2,#12] // w1 = old r0[4]
ea88: 4e041c30 mov v16.s[0], w1 // v16[0] = r0[4]
ea8c: 4e27cf00 fmla v0.4s, v24.4s, v7.4s // v0 += k[10..13] * r2[0..3]
ea90: b9400c61 ldr w1, [x3,#12] // w1 = old r1[4]
ea94: 4e0c1c30 mov v16.s[1], w1 // v16[1] = r1[4]
ea98: b9400c81 ldr w1, [x4,#12] // w1 = old r2[4]
ea9c: b940380a ldr w10, [x0,#56] // w10 = bits of k[14]
eaa0: 4e22cc60 fmla v0.4s, v3.4s, v2.4s // v0 += k[15..18] * r3[0..3]
eaa4: bd400cc2 ldr s2, [x6,#12] // s2 = old r4[4]
eaa8: b9404c09 ldr w9, [x0,#76] // w9 = bits of k[19] (x9/outptr2 is not used in this loop)
eaac: 4e141c30 mov v16.s[2], w1 // v16[2] = r2[4]
eab0: 4e0c1e26 mov v6.s[1], w17 // v6[1] = k[9]
eab4: 4e21ce40 fmla v0.4s, v18.4s, v1.4s // v0 += k[20..23] * r4[0..3]
eab8: bd406001 ldr s1, [x0,#96] // s1 = k[24]
eabc: b9400ca1 ldr w1, [x5,#12] // w1 = old r3[4]
eac0: 1e210842 fmul s2, s2, s1 // s2 = r4[4] * k[24] (last tap)
eac4: 4e141d46 mov v6.s[2], w10 // v6[2] = k[14]
eac8: 4e1c1c30 mov v16.s[3], w1 // v16 = {r0[4], r1[4], r2[4], r3[4]}
eacc: 4e1c1d26 mov v6.s[3], w9 // v6 = {k[4], k[9], k[14], k[19]}
ead0: 4e30ccc0 fmla v0.4s, v6.4s, v16.4s // v0 += fifth-column terms of kernel rows 0..3
ead4: bd4000e1 ldr s1, [x7] // s1 = *outptr
ead8: 6e084403 mov v3.d[0], v0.d[1] // v3 low half = v0 high half
eadc: 0e20d460 fadd v0.2s, v3.2s, v0.2s // v0.2s = high half + low half
eae0: 2e20d400 faddp v0.2s, v0.2s, v0.2s // s0 = horizontal sum of the four lanes
eae4: 1e202840 fadd s0, s2, s0 // s0 = full 25-tap sum
eae8: 1e202821 fadd s1, s1, s0 // add to the existing *outptr
eaec: bc0044e1 str s1, [x7],#4 // *outptr = result; outptr += 1 float
eaf0: 54fffa41 b.ne ea38 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x5c8> // loop while columns remain (flags from the subs at ea4c)
eaf4: f94097a1 ldr x1, [x29,#296] // restore &top_blob
// advance to the next single output row
eaf8: 11000652 add w18, w18, #0x1 // i += 1
eafc: 91004042 add x2, x2, #0x10 // r0 += 4 floats: pointers already moved outw floats, so this starts the next input row provided w == outw + 4 (not checked here)
eb00: 6b18025f cmp w18, w24 // i == outh ? (flags consumed at eb14)
eb04: 91004063 add x3, x3, #0x10 // r1 += 4 floats
eb08: 91004084 add x4, x4, #0x10 // r2 += 4 floats
eb0c: 910040a5 add x5, x5, #0x10 // r3 += 4 floats
eb10: 910040c6 add x6, x6, #0x10 // r4 += 4 floats
eb14: 54ffefc1 b.ne e90c <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x49c> // more rows: loop
// --- end of the q loop body -------------------------------------------------
eb18: 910007de add x30, x30, #0x1 // q += 1
eb1c: 91019000 add x0, x0, #0x64 // kernel0 += 100 bytes (25 floats)
eb20: 6b1e039f cmp w28, w30 // inch <= q ?
eb24: 5400020d b.le eb64 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x6f4> // yes: all input channels done, go to the end-of-p code
eb28: f9409fa7 ldr x7, [x29,#312] // outptr = out[+0] (data): x29+312 is the same address as x25, so rewind to the start of channel p
eb2c: 17fffe99 b e590 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x120> // next input channel
// --- epilogue (eb30): restore callee-saved registers and return -------------
eb30: a94153f3 ldp x19, x20, [sp,#16]
eb34: 6d4627e8 ldp d8, d9, [sp,#96]
eb38: a9425bf5 ldp x21, x22, [sp,#32]
eb3c: 6d472fea ldp d10, d11, [sp,#112]
eb40: a94363f7 ldp x23, x24, [sp,#48]
eb44: 6d4837ec ldp d12, d13, [sp,#128]
eb48: a9446bf9 ldp x25, x26, [sp,#64]
eb4c: 6d493fee ldp d14, d15, [sp,#144]
eb50: a94573fb ldp x27, x28, [sp,#80]
eb54: a8d77bfd ldp x29, x30, [sp],#368 // restore FP/LR and release the 368-byte frame
eb58: d65f03c0 ret
// --- out-of-line: outh <= 1, so the two-row loop is skipped ------------------
eb5c: 52800012 mov w18, #0x0 // #0 -- i = 0
eb60: 17ffff69 b e904 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x494> // join the single-row loop
// --- end of the p loop body (eb64): spill live registers, call out, next p ---
eb64: f90057a1 str x1, [x29,#168] // spill &top_blob
eb68: 910005ce add x14, x14, #0x1 // p += 1
eb6c: b94127a1 ldr w1, [x29,#292] // w1 = inch*25
eb70: aa1903e0 mov x0, x25 // x0 = &out (first argument of the call below)
eb74: f9007fb0 str x16, [x29,#248] // spill outch
eb78: 8b21cb7b add x27, x27, w1, sxtw #2 // kernel pointer += inch*25 floats (weights of the next output channel)
eb7c: f90097ae str x14, [x29,#296] // spill p
eb80: f90053ae str x14, [x29,#160] // second copy of p (never read back in this listing)
eb84: f9007baf str x15, [x29,#240] // spill x15 (w*4)
eb88: 3d802fb0 str q16, [x29,#176] // spill q16 across the call
eb8c: 3d8033a6 str q6, [x29,#192] // spill q6 across the call
eb90: 3d8037a4 str q4, [x29,#208] // spill q4 across the call
eb94: 3d803ba5 str q5, [x29,#224] // spill q5 across the call
eb98: 94000000 bl 0 <_ZN4ncnnL14conv7x7s2_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE> // call with x0 = &out. NOTE(review): the target is unresolved (encoded offset 0, no relocation shown; the printed symbol is just whatever sits at address 0), so the real callee cannot be identified here -- presumably cleanup of the local Mat; confirm with objdump -r
eb9c: f94097ae ldr x14, [x29,#296] // reload p
eba0: f9407fb0 ldr x16, [x29,#248] // reload outch
eba4: 6b0e021f cmp w16, w14 // outch <= p ?
eba8: 54fffc4d b.le eb30 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0x6c0> // yes: all output channels done, return
ebac: f94057a1 ldr x1, [x29,#168] // reload &top_blob
ebb0: 3dc03ba5 ldr q5, [x29,#224] // reload q5
ebb4: f9407baf ldr x15, [x29,#240] // reload x15 (w*4)
ebb8: 3dc037a4 ldr q4, [x29,#208] // reload q4
ebbc: b9402420 ldr w0, [x1,#36] // w0 = outw (top_blob.w), as expected at e534
ebc0: b9402823 ldr w3, [x1,#40] // w3 = outh (top_blob.h), as expected at e534
ebc4: 3dc033a6 ldr q6, [x29,#192] // reload q6
ebc8: 3dc02fb0 ldr q16, [x29,#176] // reload q16
ebcc: 17fffe5a b e534 <_ZN4ncnnL14conv5x5s1_neonERKNS_3MatERS0_S2_S2_RKNS_6OptionE+0xc4> // next output channel
// The basic approach is now understood. Next step: study and analyze Winograd matrix multiplication, i.e. how NCNN implements it.