原型是别人的代码,我把它改成SSE2版本了。
输入明文和输出哈希值的长度都是5个32位整数;由于使用了SSE2,输入和输出都按4路并行打包(每个__m128i同时存放4组数据的同一个32位字)。
#include <intrin.h>
/*
 * Helper macros for a 4-lane SSE2 SHA-1 round function.  Every operation is
 * applied independently to each of the four 32-bit lanes of an __m128i.
 */

/* Rotate every 32-bit lane of x left by y bits (0 < y < 32).
 * x is expanded twice, so it must be free of side effects. */
#define rol_SSE2(x, y) \
    _mm_or_si128(_mm_slli_epi32((x), (y)), _mm_srli_epi32((x), 32 - (y)))

/* Per-stage additive constants, broadcast to all four lanes.  The explicit
 * cast documents the intended wrap into a signed 32-bit lane value. */
#define K1_SSE2 _mm_set1_epi32((int)0x5A827999u)
#define K2_SSE2 _mm_set1_epi32((int)0x6ED9EBA1u)
#define K3_SSE2 _mm_set1_epi32((int)0x8F1BBCDCu)
#define K4_SSE2 _mm_set1_epi32((int)0xCA62C1D6u)

/* Bitwise select: takes y where x has a 1 bit and z where x has a 0 bit. */
#define F1_SSE2(x, y, z) \
    _mm_xor_si128((z), _mm_and_si128((x), _mm_xor_si128((y), (z))))

/* Bitwise parity: x ^ y ^ z. */
#define F2_SSE2(x, y, z) _mm_xor_si128(_mm_xor_si128((x), (y)), (z))

/* Bitwise majority of x, y and z. */
#define F3_SSE2(x, y, z) \
    _mm_or_si128(_mm_and_si128((x), (y)), \
                 _mm_and_si128((z), _mm_or_si128((x), (y))))

/* The fourth stage uses the same parity function as the second. */
#define F4_SSE2(x, y, z) F2_SSE2((x), (y), (z))

/*
 * Message-schedule step over a 16-entry circular buffer.  Expects an array
 * named p_idata (16 x __m128i) in the calling scope.  Slot (i mod 16) is
 * replaced by rol(slot[i] ^ slot[i-14] ^ slot[i-8] ^ slot[i-3], 1), with all
 * indices taken mod 16, and the new value is also the value of the
 * expression.  The argument is fully parenthesized so that expressions such
 * as MG_SSE2(base + 1) index the intended slot.
 */
#define MG_SSE2(i) \
    (p_idata[(i) & 0x0f] = rol_SSE2( \
        _mm_xor_si128( \
            _mm_xor_si128( \
                _mm_xor_si128(p_idata[(i) & 0x0f], \
                              p_idata[((i) - 14) & 0x0f]), \
                p_idata[((i) - 8) & 0x0f]), \
            p_idata[((i) - 3) & 0x0f]), \
        1))

/*
 * One round: e += rol(a, 5) + f(b, c, d) + k + m;  b = rol(b, 30).
 * a, b, c, d, e must be modifiable __m128i lvalues; f is one of the
 * F*_SSE2 macros, k a constant vector, m the message word for this round.
 * Wrapped in do { } while (0) so the macro behaves as a single statement
 * (safe inside un-braced if/else); a trailing semicolon is required.
 */
#define R_SSE2(a, b, c, d, e, f, k, m) \
    do { \
        (e) = _mm_add_epi32( \
            _mm_add_epi32( \
                _mm_add_epi32(_mm_add_epi32((e), rol_SSE2((a), 5)), \
                              f((b), (c), (d))), \
                (k)), \
            (m)); \
        (b) = rol_SSE2((b), 30); \
    } while (0)
/*
 * Five consecutive rounds using the same round function `fn` and constant
 * `kc`, consuming message words m0..m4 in order.  The roles of the five
 * working variables rotate by one position per round and return to their
 * starting assignment after five rounds, so every group begins with
 * (va, vb, vc, vd, ve).  Only meaningful inside SHA1_Hash_SSE2, whose locals
 * it names directly; undefined again right after the function.
 */
#define SHA1_5ROUNDS_SSE2(fn, kc, m0, m1, m2, m3, m4) \
    do { \
        R_SSE2(va, vb, vc, vd, ve, fn, kc, m0); \
        R_SSE2(ve, va, vb, vc, vd, fn, kc, m1); \
        R_SSE2(vd, ve, va, vb, vc, fn, kc, m2); \
        R_SSE2(vc, vd, ve, va, vb, fn, kc, m3); \
        R_SSE2(vb, vc, vd, ve, va, fn, kc, m4); \
    } while (0)

/*
 * Iterated 4-lane hash of a five-word block.
 *
 * idata: five __m128i values; each 32-bit lane carries an independent input.
 * odata: receives five __m128i values.  May be the same buffer as idata,
 *        because the input is copied into a local block before any output
 *        is written.
 * n:     number of passes.  Each pass feeds the previous pass's five output
 *        words back in as the next input.  With n == 0 the input is copied
 *        to odata unchanged.
 *
 * Per pass, the 16-word block holds the five data words, ten zero words and
 * a final word of 0xa0 (160 = 5 * 32, the bit length of the data words).
 *
 * NOTE(review): standard SHA-1 padding of a 20-byte message would put
 * 0x80000000 in word 5; this code leaves word 5 at zero.  Behaviour is kept
 * exactly as found -- confirm against the implementation this was ported
 * from before treating the result as a standard SHA-1 digest.
 */
void SHA1_Hash_SSE2(__m128i *idata, __m128i *odata, unsigned short n)
{
    /* Initial chaining values, broadcast to all four lanes. */
    const __m128i init_a = _mm_set1_epi32(0x67452301);
    const __m128i init_b = _mm_set1_epi32(0xefcdab89);
    const __m128i init_c = _mm_set1_epi32(0x98badcfe);
    const __m128i init_d = _mm_set1_epi32(0x10325476);
    const __m128i init_e = _mm_set1_epi32(0xc3d2e1f0);

    /* Must keep the name p_idata: MG_SSE2 refers to it by name. */
    __m128i p_idata[16];
    __m128i va, vb, vc, vd, ve;
    unsigned int pass;
    unsigned int w;

    for (w = 0; w < 5; w++) {
        p_idata[w] = idata[w];
    }

    for (pass = 0; pass < n; pass++) {
        va = init_a;
        vb = init_b;
        vc = init_c;
        vd = init_d;
        ve = init_e;

        /* The schedule below overwrites the block in place, so the tail
         * words have to be rebuilt at the start of every pass. */
        for (w = 5; w < 15; w++) {
            p_idata[w] = _mm_set1_epi32(0x00000000);
        }
        p_idata[15] = _mm_set1_epi32(0x000000a0);

        /* Rounds 0-19: the first 16 read the block directly, the rest
         * extend the schedule. */
        SHA1_5ROUNDS_SSE2(F1_SSE2, K1_SSE2,
                          p_idata[0], p_idata[1], p_idata[2], p_idata[3], p_idata[4]);
        SHA1_5ROUNDS_SSE2(F1_SSE2, K1_SSE2,
                          p_idata[5], p_idata[6], p_idata[7], p_idata[8], p_idata[9]);
        SHA1_5ROUNDS_SSE2(F1_SSE2, K1_SSE2,
                          p_idata[10], p_idata[11], p_idata[12], p_idata[13], p_idata[14]);
        SHA1_5ROUNDS_SSE2(F1_SSE2, K1_SSE2,
                          p_idata[15], MG_SSE2(16), MG_SSE2(17), MG_SSE2(18), MG_SSE2(19));

        /* Rounds 20-39. */
        SHA1_5ROUNDS_SSE2(F2_SSE2, K2_SSE2,
                          MG_SSE2(20), MG_SSE2(21), MG_SSE2(22), MG_SSE2(23), MG_SSE2(24));
        SHA1_5ROUNDS_SSE2(F2_SSE2, K2_SSE2,
                          MG_SSE2(25), MG_SSE2(26), MG_SSE2(27), MG_SSE2(28), MG_SSE2(29));
        SHA1_5ROUNDS_SSE2(F2_SSE2, K2_SSE2,
                          MG_SSE2(30), MG_SSE2(31), MG_SSE2(32), MG_SSE2(33), MG_SSE2(34));
        SHA1_5ROUNDS_SSE2(F2_SSE2, K2_SSE2,
                          MG_SSE2(35), MG_SSE2(36), MG_SSE2(37), MG_SSE2(38), MG_SSE2(39));

        /* Rounds 40-59. */
        SHA1_5ROUNDS_SSE2(F3_SSE2, K3_SSE2,
                          MG_SSE2(40), MG_SSE2(41), MG_SSE2(42), MG_SSE2(43), MG_SSE2(44));
        SHA1_5ROUNDS_SSE2(F3_SSE2, K3_SSE2,
                          MG_SSE2(45), MG_SSE2(46), MG_SSE2(47), MG_SSE2(48), MG_SSE2(49));
        SHA1_5ROUNDS_SSE2(F3_SSE2, K3_SSE2,
                          MG_SSE2(50), MG_SSE2(51), MG_SSE2(52), MG_SSE2(53), MG_SSE2(54));
        SHA1_5ROUNDS_SSE2(F3_SSE2, K3_SSE2,
                          MG_SSE2(55), MG_SSE2(56), MG_SSE2(57), MG_SSE2(58), MG_SSE2(59));

        /* Rounds 60-79. */
        SHA1_5ROUNDS_SSE2(F4_SSE2, K4_SSE2,
                          MG_SSE2(60), MG_SSE2(61), MG_SSE2(62), MG_SSE2(63), MG_SSE2(64));
        SHA1_5ROUNDS_SSE2(F4_SSE2, K4_SSE2,
                          MG_SSE2(65), MG_SSE2(66), MG_SSE2(67), MG_SSE2(68), MG_SSE2(69));
        SHA1_5ROUNDS_SSE2(F4_SSE2, K4_SSE2,
                          MG_SSE2(70), MG_SSE2(71), MG_SSE2(72), MG_SSE2(73), MG_SSE2(74));
        SHA1_5ROUNDS_SSE2(F4_SSE2, K4_SSE2,
                          MG_SSE2(75), MG_SSE2(76), MG_SSE2(77), MG_SSE2(78), MG_SSE2(79));

        /* Add the initial values back in; the sums become words 0..4 of the
         * next pass's block (or the final output). */
        p_idata[0] = _mm_add_epi32(va, init_a);
        p_idata[1] = _mm_add_epi32(vb, init_b);
        p_idata[2] = _mm_add_epi32(vc, init_c);
        p_idata[3] = _mm_add_epi32(vd, init_d);
        p_idata[4] = _mm_add_epi32(ve, init_e);
    }

    for (w = 0; w < 5; w++) {
        odata[w] = p_idata[w];
    }
}
#undef SHA1_5ROUNDS_SSE2
200字以内,仅用于支线交流,主线讨论请采用回复功能。