#include "cpusupport.h"
#ifdef CPUSUPPORT_X86_AESNI

#include <stdint.h>
#include <stdlib.h>
#include <wmmintrin.h>

#include "insecure_memzero.h"
#include "warnp.h"

#include "crypto_aes_aesni.h"

/* Expanded-key structure. */
struct crypto_aes_key_aesni {
	/* Room for up to 15 round keys, plus alignment slack. */
	uint8_t rkeys_buf[15 * sizeof(__m128i) + (sizeof(__m128i) - 1)];
	__m128i * rkeys;	/* 16-byte-aligned pointer into rkeys_buf. */
	size_t nr;		/* Number of rounds: 10 (AES-128) or 14 (AES-256). */
};

/* Compute an AES-128 round key. */
#define MKRKEY128(rkeys, i, rcon) do {				\
	__m128i _s = rkeys[i - 1];				\
	__m128i _t = rkeys[i - 1];				\
	_s = _mm_xor_si128(_s, _mm_slli_si128(_s, 4));		\
	_s = _mm_xor_si128(_s, _mm_slli_si128(_s, 8));		\
	_t = _mm_aeskeygenassist_si128(_t, rcon);		\
	_t = _mm_shuffle_epi32(_t, 0xff);			\
	rkeys[i] = _mm_xor_si128(_s, _t);			\
} while (0)
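
/*
 * Word-level sketch of the macro above (illustrative, not compiled):
 * with the previous round key viewed as 32-bit words w0..w3, the two
 * shift-and-xor steps leave _s holding the running XORs
 *	w0, w0^w1, w0^w1^w2, w0^w1^w2^w3
 * in its four lanes; _t broadcasts SubWord(RotWord(w3)) ^ rcon into
 * every lane; and the final XOR yields the four words of the next
 * round key, matching the FIPS-197 key schedule.
 */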

/**
 * crypto_aes_key_expand_128_aesni(key, rkeys):
 * Expand the 128-bit AES key ${key} into the 11 round keys ${rkeys}. This
 * implementation uses x86 AESNI instructions, and should only be used if
 * CPUSUPPORT_X86_AESNI is defined and cpusupport_x86_aesni() returns nonzero.
 */
static void
crypto_aes_key_expand_128_aesni(const uint8_t key[16], __m128i rkeys[11])
{

	/* The first round key is just the key. */
	/**
	 * XXX Compiler breakage:
	 * The intrinsic defined by Intel for _mm_loadu_si128 defines it as
	 * taking a (const __m128i *) parameter. This forces us to write a
	 * bug: The cast to (const __m128i *) is invalid since it increases
	 * the alignment requirement of the pointer. Alas, until compilers
	 * get fixed intrinsics, all we can do is code the bug and require
	 * that alignment-requirement-increasing compiler warnings get
	 * disabled.
	 */
	rkeys[0] = _mm_loadu_si128((const __m128i *)&key[0]);

	/*
	 * Each of the remaining round keys is computed from the preceding
	 * round key: rotword+subword+rcon (provided as aeskeygenassist) to
	 * compute the 'temp' value, then xor with 1, 2, 3, or all 4 of the
	 * 32-bit words from the preceding round key. Unfortunately, 'rcon'
	 * is encoded as an immediate value, so we need to write the loop out
	 * ourselves rather than allowing the compiler to expand it.
	 */
	MKRKEY128(rkeys, 1, 0x01);
	MKRKEY128(rkeys, 2, 0x02);
	MKRKEY128(rkeys, 3, 0x04);
	MKRKEY128(rkeys, 4, 0x08);
	MKRKEY128(rkeys, 5, 0x10);
	MKRKEY128(rkeys, 6, 0x20);
	MKRKEY128(rkeys, 7, 0x40);
	MKRKEY128(rkeys, 8, 0x80);
	MKRKEY128(rkeys, 9, 0x1b);
	MKRKEY128(rkeys, 10, 0x36);
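
	/*
	 * Note: the rcon values above are successive powers of 2 in
	 * GF(2^8) modulo x^8 + x^4 + x^3 + x + 1; doubling 0x80 wraps
	 * around to 0x1b, and doubling once more gives 0x36.
	 */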
}

/* Compute an AES-256 round key. */
#define MKRKEY256(rkeys, i, shuffle, rcon) do {		\
	__m128i _s = rkeys[i - 2];				\
	__m128i _t = rkeys[i - 1];				\
	_s = _mm_xor_si128(_s, _mm_slli_si128(_s, 4));		\
	_s = _mm_xor_si128(_s, _mm_slli_si128(_s, 8));		\
	_t = _mm_aeskeygenassist_si128(_t, rcon);		\
	_t = _mm_shuffle_epi32(_t, shuffle);			\
	rkeys[i] = _mm_xor_si128(_s, _t);			\
} while (0)
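
/*
 * Sketch of the _mm_aeskeygenassist_si128(x, rcon) output, per Intel's
 * definition, with x viewed as 32-bit words x0..x3:
 *	dword 0: SubWord(x1)
 *	dword 1: RotWord(SubWord(x1)) ^ rcon
 *	dword 2: SubWord(x3)
 *	dword 3: RotWord(SubWord(x3)) ^ rcon
 * Shuffling with 0xff broadcasts dword 3 (rotword+subword+rcon, for
 * even rounds); shuffling with 0xaa broadcasts dword 2 (subword only,
 * for odd rounds).
 */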

/**
 * crypto_aes_key_expand_256_aesni(key, rkeys):
 * Expand the 256-bit AES key ${key} into the 15 round keys ${rkeys}. This
 * implementation uses x86 AESNI instructions, and should only be used if
 * CPUSUPPORT_X86_AESNI is defined and cpusupport_x86_aesni() returns nonzero.
 */
static void
crypto_aes_key_expand_256_aesni(const uint8_t key[32], __m128i rkeys[15])
{

	/* The first two round keys are just the key. */
	/**
	 * XXX Compiler breakage:
	 * The intrinsic defined by Intel for _mm_loadu_si128 defines it as
	 * taking a (const __m128i *) parameter. This forces us to write a
	 * bug: The cast to (const __m128i *) is invalid since it increases
	 * the alignment requirement of the pointer. Alas, until compilers
	 * get fixed intrinsics, all we can do is code the bug and require
	 * that alignment-requirement-increasing compiler warnings get
	 * disabled.
	 */
	rkeys[0] = _mm_loadu_si128((const __m128i *)&key[0]);
	rkeys[1] = _mm_loadu_si128((const __m128i *)&key[16]);

	/*
	 * Each of the remaining round keys is computed from the preceding
	 * pair of keys. Even rounds use rotword+subword+rcon, while odd
	 * rounds just use subword; the aeskeygenassist instruction computes
	 * both, and we use 0xff or 0xaa to select the one we need. The rcon
	 * value used is irrelevant for odd rounds since we ignore the value
	 * which it feeds into. Unfortunately, the 'shuffle' and 'rcon'
	 * values are encoded into the instructions as immediates, so we need
	 * to write the loop out ourselves rather than allowing the compiler
	 * to expand it.
	 */
	MKRKEY256(rkeys, 2, 0xff, 0x01);
	MKRKEY256(rkeys, 3, 0xaa, 0x00);
	MKRKEY256(rkeys, 4, 0xff, 0x02);
	MKRKEY256(rkeys, 5, 0xaa, 0x00);
	MKRKEY256(rkeys, 6, 0xff, 0x04);
	MKRKEY256(rkeys, 7, 0xaa, 0x00);
	MKRKEY256(rkeys, 8, 0xff, 0x08);
	MKRKEY256(rkeys, 9, 0xaa, 0x00);
	MKRKEY256(rkeys, 10, 0xff, 0x10);
	MKRKEY256(rkeys, 11, 0xaa, 0x00);
	MKRKEY256(rkeys, 12, 0xff, 0x20);
	MKRKEY256(rkeys, 13, 0xaa, 0x00);
	MKRKEY256(rkeys, 14, 0xff, 0x40);
}

/**
 * crypto_aes_key_expand_aesni(key, len):
 * Expand the ${len}-byte AES key ${key} into a structure which can be passed
 * to crypto_aes_encrypt_block_aesni. The length must be 16 or 32. This
 * implementation uses x86 AESNI instructions, and should only be used if
 * CPUSUPPORT_X86_AESNI is defined and cpusupport_x86_aesni() returns nonzero.
 */
void *
crypto_aes_key_expand_aesni(const uint8_t * key, size_t len)
{
	struct crypto_aes_key_aesni * kexp;
	size_t rkey_offset;

	/* Allocate structure. */
	if ((kexp = malloc(sizeof(struct crypto_aes_key_aesni))) == NULL)
		goto err0;

	/* Figure out where to put the round keys. */
	rkey_offset = (uintptr_t)(&kexp->rkeys_buf[0]) % sizeof(__m128i);
	rkey_offset = (sizeof(__m128i) - rkey_offset) % sizeof(__m128i);
	kexp->rkeys = (void *)&kexp->rkeys_buf[rkey_offset];
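
	/*
	 * Worked example (illustrative): if rkeys_buf happens to start at
	 * an address which is 9 mod 16, the first modulus above yields 9,
	 * the second yields (16 - 9) % 16 = 7, and rkeys points 7 bytes
	 * into rkeys_buf -- a 16-byte-aligned address. The extra
	 * (sizeof(__m128i) - 1) bytes in rkeys_buf guarantee that all 15
	 * aligned round keys still fit after any such offset.
	 */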

	/* Compute round keys. */
	if (len == 16) {
		kexp->nr = 10;
		crypto_aes_key_expand_128_aesni(key, kexp->rkeys);
	} else if (len == 32) {
		kexp->nr = 14;
		crypto_aes_key_expand_256_aesni(key, kexp->rkeys);
	} else {
		warn0("Unsupported AES key length: %zu bytes", len);
		goto err1;
	}

	/* Success! */
	return (kexp);

err1:
	free(kexp);
err0:
	/* Failure! */
	return (NULL);
}

/**
 * crypto_aes_encrypt_block_aesni(in, out, key):
 * Using the expanded AES key ${key}, encrypt the block ${in} and write the
 * resulting ciphertext to ${out}. This implementation uses x86 AESNI
 * instructions, and should only be used if CPUSUPPORT_X86_AESNI is defined
 * and cpusupport_x86_aesni() returns nonzero.
 */
void
crypto_aes_encrypt_block_aesni(const uint8_t * in, uint8_t * out,
    const void * key)
{
	const struct crypto_aes_key_aesni * _key = key;
	const __m128i * aes_key = _key->rkeys;
	__m128i aes_state;
	size_t nr = _key->nr;

	aes_state = _mm_loadu_si128((const __m128i *)in);
	aes_state = _mm_xor_si128(aes_state, aes_key[0]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[1]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[2]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[3]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[4]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[5]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[6]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[7]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[8]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[9]);
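
	/*
	 * Rounds 10 and up are only needed for longer keys: a 14-round
	 * (256-bit) key takes both branches below, while a hypothetical
	 * 12-round (192-bit) key, which the expansion above does not
	 * currently produce, would take only the first.
	 */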
	if (nr > 10) {
		aes_state = _mm_aesenc_si128(aes_state, aes_key[10]);
		aes_state = _mm_aesenc_si128(aes_state, aes_key[11]);

		if (nr > 12) {
			aes_state = _mm_aesenc_si128(aes_state, aes_key[12]);
			aes_state = _mm_aesenc_si128(aes_state, aes_key[13]);
		}
	}

	aes_state = _mm_aesenclast_si128(aes_state, aes_key[nr]);
	_mm_storeu_si128((__m128i *)out, aes_state);
}

/**
 * crypto_aes_key_free_aesni(key):
 * Free the expanded AES key ${key}.
 */
void
crypto_aes_key_free_aesni(void * key)
{

	/* Behave consistently with free(NULL). */
	if (key == NULL)
		return;

	/* Attempt to zero the expanded key. */
	insecure_memzero(key, sizeof(struct crypto_aes_key_aesni));

	/* Free the key. */
	free(key);
}
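
/*
 * Usage sketch (illustrative only; keybuf, plaintext, and ciphertext are
 * hypothetical caller-supplied buffers of 32, 16, and 16 bytes):
 *	void * k;
 *
 *	if ((k = crypto_aes_key_expand_aesni(keybuf, 32)) == NULL)
 *		... handle error ...
 *	crypto_aes_encrypt_block_aesni(plaintext, ciphertext, k);
 *	crypto_aes_key_free_aesni(k);
 */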

#endif /* CPUSUPPORT_X86_AESNI */