1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
|
/*
u1.h version $Date: 2014/09/08 17:44:28 $
D. J. Bernstein
Romain Dolbeau
Public domain.
*/
// Modified by kerukuro for use in cppcrypto.
// if (!bytes) return;
while (bytes >=64) {
__m128i x_0, x_1, x_2, x_3;
__m128i t_1;
const __m128i rot16 = _mm_set_epi8(13,12,15,14,9,8,11,10,5,4,7,6,1,0,3,2);
const __m128i rot8 = _mm_set_epi8(14,13,12,15,10,9,8,11,6,5,4,7,2,1,0,3);
uint32 in12, in13;
x_0 = _mm_load_si128((__m128i*)(x + 0));
x_1 = _mm_load_si128((__m128i*)(x + 4));
x_2 = _mm_load_si128((__m128i*)(x + 8));
x_3 = _mm_load_si128((__m128i*)(x + 12));
for (i = 0 ; i < r ; ++i) {
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x93);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x39);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_3 = _mm_shuffle_epi8(x_3, rot16);
x_2 = _mm_add_epi32(x_2, x_3);
x_1 = _mm_xor_si128(x_1, x_2);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 12);
t_1 = _mm_srli_epi32(t_1, 20);
x_1 = _mm_xor_si128(x_1, t_1);
x_0 = _mm_add_epi32(x_0, x_1);
x_3 = _mm_xor_si128(x_3, x_0);
x_0 = _mm_shuffle_epi32(x_0, 0x39);
x_3 = _mm_shuffle_epi8(x_3, rot8);
x_2 = _mm_add_epi32(x_2, x_3);
x_3 = _mm_shuffle_epi32(x_3, 0x4e);
x_1 = _mm_xor_si128(x_1, x_2);
x_2 = _mm_shuffle_epi32(x_2, 0x93);
t_1 = x_1;
x_1 = _mm_slli_epi32(x_1, 7);
t_1 = _mm_srli_epi32(t_1, 25);
x_1 = _mm_xor_si128(x_1, t_1);
}
x_0 = _mm_add_epi32(x_0, _mm_loadu_si128((__m128i*)(x + 0)));
x_1 = _mm_add_epi32(x_1, _mm_loadu_si128((__m128i*)(x + 4)));
x_2 = _mm_add_epi32(x_2, _mm_loadu_si128((__m128i*)(x + 8)));
x_3 = _mm_add_epi32(x_3, _mm_loadu_si128((__m128i*)(x + 12)));
x_0 = _mm_xor_si128(x_0, _mm_loadu_si128((__m128i*)(m + 0)));
x_1 = _mm_xor_si128(x_1, _mm_loadu_si128((__m128i*)(m + 16)));
x_2 = _mm_xor_si128(x_2, _mm_loadu_si128((__m128i*)(m + 32)));
x_3 = _mm_xor_si128(x_3, _mm_loadu_si128((__m128i*)(m + 48)));
_mm_storeu_si128((__m128i*)(out + 0), x_0);
_mm_storeu_si128((__m128i*)(out + 16), x_1);
_mm_storeu_si128((__m128i*)(out + 32), x_2);
_mm_storeu_si128((__m128i*)(out + 48), x_3);
in12 = x[12];
in13 = x[13];
in12 ++;
if (in12 == 0)
in13 ++;
x[12] = in12;
x[13] = in13;
bytes -= 64;
out += 64;
m += 64;
}
|