class
C_VECTOR4D
... {
public:
union
...{
__declspec(align(16)) __m128 v; // SIMD data type access
float M[4]; // array indexed storage
// explicit names
struct
...{
float x,y,z,w;
}; // end struct
}; // end union
// note: the declspec is redundant since in the type __m128 forces
// the compiler to align in 16-byte boundaries, so as long as __m128 is
// part of the union declspec is NOT needed :) But, it can't
// hurt and when you are defining locals and globals, always put
// declspec(align(16)) to KNOW data is on 16-byte boundaries
// CONSTRUCTORS //
C_VECTOR4D()
...{
// void constructor
// initialize vector to 0.0.0.1
x=y=z=0; w=1.0;
} // end C_VECTOR4D
/**///
C_VECTOR4D(float _x, float _y, float _z, float _w = 1.0)
...{
// initialize vector to sent values
x = _x;
y = _y;
z = _z;
w = _w;
} // end C_VECTOR4D
// FUNCTIONS
void init(float _x, float _y, float _z, float _w = 1.0)
...{
// initialize vector to sent values
x = _x;
y = _y;
z = _z;
w = _w;
} // end init
/**///
void zero(void)
...{
// initialize vector to 0.0.0.1
x=y=z=0; w=1.0;
} // end zero
/**///
float length(void)
...{
// computes the length of the vector
C_VECTOR4D vr = *this;
// set w=0
vr.w = 0;
//float a= SIMD_SHUFFLE(0x02,0x03,0x00,0x01) ;
// compile pure asm version?
#if (SIMD_ASM==1)
// begin inline asm version of SIMD dot product since we need its
// results for the length since length = sqrt(v*v)
_asm
...{
// first we need dot product of this*this
movaps xmm0, vr.v // move left operand into xmm0
mulps xmm0, xmm0 // multiply operands vertically
// at this point, xmm0 =
// [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ]
// or more simply: let xmm0 = [x,y,z,1] =
// [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ]
// we need to sum the x,y,z components into a single scalar
// to compute the final dot product of:
// dp = x + y + z == x1*x2 + y1*y2 + z1*z2
// begin
// xmm0: = [x,y,z,1] (note: all regs in low to hight order)
// xmm1: = [?,?,?,?]
movaps xmm1, xmm0 // copy result into xmm1
// xmm0: = [x,y,z,1]
// xmm1: = [x,y,z,1]
shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02) //y,x,1,z
// xmm0: = [x,y,z,1]
// xmm1: = [z,1,x,y]
addps xmm1, xmm0
// xmm0: = [x ,y ,z ,1]
// xmm1: = [x+z,y+1,x+z,y+1]
shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01) //x+z,y+1,x+z,y+1//因为其实w是为0的,所以y+1=y
// xmm0: = [y ,x ,y+1,x+z]
// xmm1: = [x+z,y+1,x+z,y+1]
// finally we can add!
addps xmm0, xmm1
// xmm0: = [x+y+z,x+y+1,x+y+z+1,x+y+z+1]
// xmm1: = [x+z ,y+1 ,x+z ,y+1]
// xmm0.x contains the dot product
// xmm0.z, xmm0.w contains the dot+1
// now low double word contains dot product, let's take squaroot
sqrtss xmm0, xmm0
movaps vr, xmm0 // save results
} // end asm
#endif // end use inline asm version
// compile intrinsic version?
#if (SIMD_INTRISIC==1)
#endif // end use intrinsic library version
// return result
return(vr.x);
} // end length
// OVERLOADED OPERATORS //
float& operator[](int index)
...{
// return the ith element from the array
return(M[index]);
} // end operator[]
/**///
C_VECTOR4D operator+(C_VECTOR4D &v)
...{
// adds the "this" vector and the sent vector
__declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes
// compile pure asm version?
#if (SIMD_ASM==1)
// begin inline asm version of SIMD add
_asm
...{
mov esi, this // "this" contains a point to the left operand
mov edi, v // v points to the right operand
movaps xmm0, [esi] // esi points to first vector, move into xmm0
addps xmm0, [edi] // edi points to second vector, add it to xmm0
movaps vr, xmm0 // move result into output vector
} // end asm
#endif // end use inline asm version
// compile intrinsic version?
#if (SIMD_INTRISIC==1)
vr.v = _mm_add_ps(this->v, v.v);
#endif // end use intrinsic library version
// always set w=1
vr.w = 1.0;
// return result
return(vr);
} // end operator+
/**///
C_VECTOR4D operator-(C_VECTOR4D &v)
...{
// subtracts the "this" vector and the sent vector
__declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes
// compile pure asm version?
#if (SIMD_ASM==1)
// begin inline asm version of SIMD add
_asm
...{
mov esi, this // "this" contains a point to the left operand
mov edi, v // v points to the right operand
movaps xmm0, [esi] // esi points to first vector, move into xmm0
subps xmm0, [edi] // edi points to second vector, subtract it from xmm0
movaps vr, xmm0 // move result into output vector
} // end asm
#endif // end use inline asm version
// compile intrinsic version?
#if (SIMD_INTRISIC==1)
vr.v = _mm_sub_ps(this->v, v.v);
#endif // end use intrinsic library version
// always set w=1
vr.w = 1.0;
// return result
return(vr);
} // end operator-
/**///
float operator*(C_VECTOR4D &v)
...{
// the dot product will be * since dot product is a more common operation
// computes the dot between between the "this" vector and the sent vector
__declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes
// compile pure asm version?
#if (SIMD_ASM==1)
// begin inline asm version of SIMD dot product
_asm
...{
mov esi, this // "this" contains a point to the left operand
mov edi, v // v points to the right operand
movaps xmm0, [esi] // move left operand into xmm0
mulps xmm0, [edi] // multiply operands vertically
// at this point, xmm0 =
// [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ]
// or more simply: let xmm0 = [x,y,z,1] =
// [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ]
// we need to sum the x,y,z components into a single scalar
// to compute the final dot product of:
// dp = x + y + z where x = x1*x2, y = y1*y2, z = z1*z2
// begin
// xmm0: = [x,y,z,1] (note: all regs in low to hight order)
// xmm1: = [?,?,?,?]
movaps xmm1, xmm0 // copy result into xmm1
// xmm0: = [x,y,z,1]
// xmm1: = [x,y,z,1]
shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02)
// xmm0: = [x,y,z,1]
// xmm1: = [z,1,x,y]
addps xmm1, xmm0
// xmm0: = [x ,y ,z ,1]
// xmm1: = [x+z,y+1,x+z,y+1]
shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01)
// xmm0: = [y ,x ,y+1,x+z]
// xmm1: = [x+z,y+1,x+z,y+1]
// finally we can add!
addps xmm0, xmm1
// xmm0: = [x+y+z,x+y+1,x+y+z+1,x+y+z+1]
// xmm1: = [x+z ,y+1 ,x+z ,y+1]
// xmm0.x contains the dot product
// xmm0.z, xmm0.w contains the dot+1
movaps vr, xmm0
} // end asm
#endif // end use inline asm version
// compile intrinsic version?
#if (SIMD_INTRISIC==1)
vr.v = _mm_mul_ps(this->v, v.v);
return(vr.x + vr.y + vr.z);
#endif // end use intrinsic library version
// return result
return(vr.x);
} // end operator*
/**///
void print(void)
...{
// this member function prints out the vector
printf(" v = [%f, %f, %f, %f]", this->x, this->y, this->z, this->w);
} // end print
/**///
} ; // end class C_VECTOR4D
... {
public:
union
...{
__declspec(align(16)) __m128 v; // SIMD data type access
float M[4]; // array indexed storage
// explicit names
struct
...{
float x,y,z,w;
}; // end struct
}; // end union
// note: the declspec is redundant since in the type __m128 forces
// the compiler to align in 16-byte boundaries, so as long as __m128 is
// part of the union declspec is NOT needed :) But, it can't
// hurt and when you are defining locals and globals, always put
// declspec(align(16)) to KNOW data is on 16-byte boundaries
// CONSTRUCTORS //
C_VECTOR4D()
...{
// void constructor
// initialize vector to 0.0.0.1
x=y=z=0; w=1.0;
} // end C_VECTOR4D
/**///
C_VECTOR4D(float _x, float _y, float _z, float _w = 1.0)
...{
// initialize vector to sent values
x = _x;
y = _y;
z = _z;
w = _w;
} // end C_VECTOR4D
// FUNCTIONS
void init(float _x, float _y, float _z, float _w = 1.0)
...{
// initialize vector to sent values
x = _x;
y = _y;
z = _z;
w = _w;
} // end init
/**///
void zero(void)
...{
// initialize vector to 0.0.0.1
x=y=z=0; w=1.0;
} // end zero
/**///
float length(void)
...{
// computes the length of the vector
C_VECTOR4D vr = *this;
// set w=0
vr.w = 0;
//float a= SIMD_SHUFFLE(0x02,0x03,0x00,0x01) ;
// compile pure asm version?
#if (SIMD_ASM==1)
// begin inline asm version of SIMD dot product since we need its
// results for the length since length = sqrt(v*v)
_asm
...{
// first we need dot product of this*this
movaps xmm0, vr.v // move left operand into xmm0
mulps xmm0, xmm0 // multiply operands vertically
// at this point, xmm0 =
// [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ]
// or more simply: let xmm0 = [x,y,z,1] =
// [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ]
// we need to sum the x,y,z components into a single scalar
// to compute the final dot product of:
// dp = x + y + z == x1*x2 + y1*y2 + z1*z2
// begin
// xmm0: = [x,y,z,1] (note: all regs in low to hight order)
// xmm1: = [?,?,?,?]
movaps xmm1, xmm0 // copy result into xmm1
// xmm0: = [x,y,z,1]
// xmm1: = [x,y,z,1]
shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02) //y,x,1,z
// xmm0: = [x,y,z,1]
// xmm1: = [z,1,x,y]
addps xmm1, xmm0
// xmm0: = [x ,y ,z ,1]
// xmm1: = [x+z,y+1,x+z,y+1]
shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01) //x+z,y+1,x+z,y+1//因为其实w是为0的,所以y+1=y
// xmm0: = [y ,x ,y+1,x+z]
// xmm1: = [x+z,y+1,x+z,y+1]
// finally we can add!
addps xmm0, xmm1
// xmm0: = [x+y+z,x+y+1,x+y+z+1,x+y+z+1]
// xmm1: = [x+z ,y+1 ,x+z ,y+1]
// xmm0.x contains the dot product
// xmm0.z, xmm0.w contains the dot+1
// now low double word contains dot product, let's take squaroot
sqrtss xmm0, xmm0
movaps vr, xmm0 // save results
} // end asm
#endif // end use inline asm version
// compile intrinsic version?
#if (SIMD_INTRISIC==1)
#endif // end use intrinsic library version
// return result
return(vr.x);
} // end length
// OVERLOADED OPERATORS //
float& operator[](int index)
...{
// return the ith element from the array
return(M[index]);
} // end operator[]
/**///
C_VECTOR4D operator+(C_VECTOR4D &v)
...{
// adds the "this" vector and the sent vector
__declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes
// compile pure asm version?
#if (SIMD_ASM==1)
// begin inline asm version of SIMD add
_asm
...{
mov esi, this // "this" contains a point to the left operand
mov edi, v // v points to the right operand
movaps xmm0, [esi] // esi points to first vector, move into xmm0
addps xmm0, [edi] // edi points to second vector, add it to xmm0
movaps vr, xmm0 // move result into output vector
} // end asm
#endif // end use inline asm version
// compile intrinsic version?
#if (SIMD_INTRISIC==1)
vr.v = _mm_add_ps(this->v, v.v);
#endif // end use intrinsic library version
// always set w=1
vr.w = 1.0;
// return result
return(vr);
} // end operator+
/**///
C_VECTOR4D operator-(C_VECTOR4D &v)
...{
// subtracts the "this" vector and the sent vector
__declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes
// compile pure asm version?
#if (SIMD_ASM==1)
// begin inline asm version of SIMD add
_asm
...{
mov esi, this // "this" contains a point to the left operand
mov edi, v // v points to the right operand
movaps xmm0, [esi] // esi points to first vector, move into xmm0
subps xmm0, [edi] // edi points to second vector, subtract it from xmm0
movaps vr, xmm0 // move result into output vector
} // end asm
#endif // end use inline asm version
// compile intrinsic version?
#if (SIMD_INTRISIC==1)
vr.v = _mm_sub_ps(this->v, v.v);
#endif // end use intrinsic library version
// always set w=1
vr.w = 1.0;
// return result
return(vr);
} // end operator-
/**///
float operator*(C_VECTOR4D &v)
...{
// the dot product will be * since dot product is a more common operation
// computes the dot between between the "this" vector and the sent vector
__declspec(align(16)) C_VECTOR4D vr; // used to hold result, aligned on 16 bytes
// compile pure asm version?
#if (SIMD_ASM==1)
// begin inline asm version of SIMD dot product
_asm
...{
mov esi, this // "this" contains a point to the left operand
mov edi, v // v points to the right operand
movaps xmm0, [esi] // move left operand into xmm0
mulps xmm0, [edi] // multiply operands vertically
// at this point, xmm0 =
// [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ]
// or more simply: let xmm0 = [x,y,z,1] =
// [ (v1.x * v2.x), (v1.y * v2.y), (v1.z * v2.z), (1*1) ]
// we need to sum the x,y,z components into a single scalar
// to compute the final dot product of:
// dp = x + y + z where x = x1*x2, y = y1*y2, z = z1*z2
// begin
// xmm0: = [x,y,z,1] (note: all regs in low to hight order)
// xmm1: = [?,?,?,?]
movaps xmm1, xmm0 // copy result into xmm1
// xmm0: = [x,y,z,1]
// xmm1: = [x,y,z,1]
shufps xmm1, xmm0, SIMD_SHUFFLE(0x01,0x00,0x03,0x02)
// xmm0: = [x,y,z,1]
// xmm1: = [z,1,x,y]
addps xmm1, xmm0
// xmm0: = [x ,y ,z ,1]
// xmm1: = [x+z,y+1,x+z,y+1]
shufps xmm0, xmm1, SIMD_SHUFFLE(0x02,0x03,0x00,0x01)
// xmm0: = [y ,x ,y+1,x+z]
// xmm1: = [x+z,y+1,x+z,y+1]
// finally we can add!
addps xmm0, xmm1
// xmm0: = [x+y+z,x+y+1,x+y+z+1,x+y+z+1]
// xmm1: = [x+z ,y+1 ,x+z ,y+1]
// xmm0.x contains the dot product
// xmm0.z, xmm0.w contains the dot+1
movaps vr, xmm0
} // end asm
#endif // end use inline asm version
// compile intrinsic version?
#if (SIMD_INTRISIC==1)
vr.v = _mm_mul_ps(this->v, v.v);
return(vr.x + vr.y + vr.z);
#endif // end use intrinsic library version
// return result
return(vr.x);
} // end operator*
/**///
void print(void)
...{
// this member function prints out the vector
printf(" v = [%f, %f, %f, %f]", this->x, this->y, this->z, this->w);
} // end print
/**///
} ; // end class C_VECTOR4D