#include "cpusupport.h"
#ifdef CPUSUPPORT_X86_AESNI

#include <stdint.h>
#include <stdlib.h>
#include <wmmintrin.h>

#include "insecure_memzero.h"
#include "warnp.h"

#include "crypto_aes_aesni.h"

/* Expanded-key structure. */
struct crypto_aes_key_aesni {
	uint8_t rkeys_buf[15 * sizeof(__m128i) + (sizeof(__m128i) - 1)];
	__m128i * rkeys;
	size_t nr;
};
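
/*
 * Worked example of the alignment adjustment performed in
 * crypto_aes_key_expand_aesni below (the concrete address is an assumption
 * for illustration only): rkeys_buf is large enough for 15 round keys plus
 * up to 15 bytes of slack.  If rkeys_buf happens to start at address
 * 0x1009, then 0x1009 % 16 == 9, so we skip (16 - 9) % 16 == 7 bytes and
 * place rkeys at 0x1010, which has the 16-byte alignment __m128i requires.
 */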

/* Compute an AES-128 round key. */
#define MKRKEY128(rkeys, i, rcon)	do {				\
	__m128i _s = rkeys[i - 1];					\
	__m128i _t = rkeys[i - 1];					\
	_s = _mm_xor_si128(_s, _mm_slli_si128(_s, 4));			\
	_s = _mm_xor_si128(_s, _mm_slli_si128(_s, 8));			\
	_t = _mm_aeskeygenassist_si128(_t, rcon);			\
	_t = _mm_shuffle_epi32(_t, 0xff);				\
	rkeys[i] = _mm_xor_si128(_s, _t);				\
} while (0)
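
/*
 * For reference, the scalar FIPS-197 recurrence which MKRKEY128 computes,
 * written as an illustrative sketch (w[] is a hypothetical uint32_t view of
 * the round keys, with w[4*i+j] being word j of round key i):
 *
 *	w[4*i+0] = w[4*(i-1)+0] ^ SubWord(RotWord(w[4*i-1])) ^ rcon;
 *	w[4*i+1] = w[4*(i-1)+1] ^ w[4*i+0];
 *	w[4*i+2] = w[4*(i-1)+2] ^ w[4*i+1];
 *	w[4*i+3] = w[4*(i-1)+3] ^ w[4*i+2];
 *
 * The two _mm_slli_si128/_mm_xor_si128 steps compute the running xor of the
 * previous round key's words, while aeskeygenassist provides
 * SubWord(RotWord(...)) ^ rcon in its top 32-bit lane; the 0xff shuffle
 * broadcasts that lane to all four words.
 */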

/**
 * crypto_aes_key_expand_128_aesni(key, rkeys):
 * Expand the 128-bit AES key ${key} into the 11 round keys ${rkeys}. This
 * implementation uses x86 AESNI instructions, and should only be used if
 * CPUSUPPORT_X86_AESNI is defined and cpusupport_x86_aesni() returns nonzero.
 */
static void
crypto_aes_key_expand_128_aesni(const uint8_t key[16], __m128i rkeys[11])
{

	/* The first round key is just the key. */
	/**
	 * XXX Compiler breakage:
	 * The intrinsic defined by Intel for _mm_loadu_si128 defines it as
	 * taking a (const __m128i *) parameter. This forces us to write a
	 * bug: The cast to (const __m128i *) is invalid since it increases
	 * the alignment requirement of the pointer. Alas, until compilers
	 * get fixed intrinsics, all we can do is code the bug and require
	 * that alignment-requirement-increasing compiler warnings get
	 * disabled.
	 */
	rkeys[0] = _mm_loadu_si128((const __m128i *)&key[0]);

	/*
	 * Each of the remaining round keys is computed from the preceding
	 * round key: rotword+subword+rcon (provided as aeskeygenassist) to
	 * compute the 'temp' value, then xor with 1, 2, 3, or all 4 of the
	 * 32-bit words from the preceding round key. Unfortunately, 'rcon'
	 * is encoded as an immediate value, so we need to write the loop out
	 * ourselves rather than allowing the compiler to expand it.
	 */
	MKRKEY128(rkeys, 1, 0x01);
	MKRKEY128(rkeys, 2, 0x02);
	MKRKEY128(rkeys, 3, 0x04);
	MKRKEY128(rkeys, 4, 0x08);
	MKRKEY128(rkeys, 5, 0x10);
	MKRKEY128(rkeys, 6, 0x20);
	MKRKEY128(rkeys, 7, 0x40);
	MKRKEY128(rkeys, 8, 0x80);
	MKRKEY128(rkeys, 9, 0x1b);
	MKRKEY128(rkeys, 10, 0x36);
}

/* Compute an AES-256 round key. */
#define MKRKEY256(rkeys, i, shuffle, rcon)	do {			\
	__m128i _s = rkeys[i - 2];					\
	__m128i _t = rkeys[i - 1];					\
	_s = _mm_xor_si128(_s, _mm_slli_si128(_s, 4));			\
	_s = _mm_xor_si128(_s, _mm_slli_si128(_s, 8));			\
	_t = _mm_aeskeygenassist_si128(_t, rcon);			\
	_t = _mm_shuffle_epi32(_t, shuffle);				\
	rkeys[i] = _mm_xor_si128(_s, _t);				\
} while (0)
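
/*
 * A note on the 'shuffle' values, based on the documented AESKEYGENASSIST
 * lane layout: the instruction places RotWord(SubWord(X3)) ^ rcon in 32-bit
 * lane 3 and SubWord(X3) in lane 2, where X3 is the top word of its input.
 * _mm_shuffle_epi32 with 0xff broadcasts lane 3 (rotword+subword+rcon, used
 * for even rounds), while 0xaa broadcasts lane 2 (subword only, used for
 * odd rounds).
 */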

/**
 * crypto_aes_key_expand_256_aesni(key, rkeys):
 * Expand the 256-bit AES key ${key} into the 15 round keys ${rkeys}. This
 * implementation uses x86 AESNI instructions, and should only be used if
 * CPUSUPPORT_X86_AESNI is defined and cpusupport_x86_aesni() returns nonzero.
 */
static void
crypto_aes_key_expand_256_aesni(const uint8_t key[32], __m128i rkeys[15])
{

	/* The first two round keys are just the key. */
	/**
	 * XXX Compiler breakage:
	 * The intrinsic defined by Intel for _mm_loadu_si128 defines it as
	 * taking a (const __m128i *) parameter. This forces us to write a
	 * bug: The cast to (const __m128i *) is invalid since it increases
	 * the alignment requirement of the pointer. Alas, until compilers
	 * get fixed intrinsics, all we can do is code the bug and require
	 * that alignment-requirement-increasing compiler warnings get
	 * disabled.
	 */
	rkeys[0] = _mm_loadu_si128((const __m128i *)&key[0]);
	rkeys[1] = _mm_loadu_si128((const __m128i *)&key[16]);

	/*
	 * Each of the remaining round keys is computed from the preceding
	 * pair of round keys. Even rounds use rotword+subword+rcon, while
	 * odd rounds just use subword; the aeskeygenassist instruction
	 * computes both, and we use 0xff or 0xaa to select the one we need.
	 * The rcon value used is irrelevant for odd rounds since we ignore
	 * the value which it feeds into. Unfortunately, the 'shuffle' and
	 * 'rcon' values are encoded into the instructions as immediates, so
	 * we need to write the loop out ourselves rather than allowing the
	 * compiler to expand it.
	 */
	MKRKEY256(rkeys, 2, 0xff, 0x01);
	MKRKEY256(rkeys, 3, 0xaa, 0x00);
	MKRKEY256(rkeys, 4, 0xff, 0x02);
	MKRKEY256(rkeys, 5, 0xaa, 0x00);
	MKRKEY256(rkeys, 6, 0xff, 0x04);
	MKRKEY256(rkeys, 7, 0xaa, 0x00);
	MKRKEY256(rkeys, 8, 0xff, 0x08);
	MKRKEY256(rkeys, 9, 0xaa, 0x00);
	MKRKEY256(rkeys, 10, 0xff, 0x10);
	MKRKEY256(rkeys, 11, 0xaa, 0x00);
	MKRKEY256(rkeys, 12, 0xff, 0x20);
	MKRKEY256(rkeys, 13, 0xaa, 0x00);
	MKRKEY256(rkeys, 14, 0xff, 0x40);
}

/**
 * crypto_aes_key_expand_aesni(key, len):
 * Expand the ${len}-byte AES key ${key} into a structure which can be passed
 * to crypto_aes_encrypt_block_aesni. The length must be 16 or 32. This
 * implementation uses x86 AESNI instructions, and should only be used if
 * CPUSUPPORT_X86_AESNI is defined and cpusupport_x86_aesni() returns nonzero.
 */
void *
crypto_aes_key_expand_aesni(const uint8_t * key, size_t len)
{
	struct crypto_aes_key_aesni * kexp;
	size_t rkey_offset;

	/* Allocate structure. */
	if ((kexp = malloc(sizeof(struct crypto_aes_key_aesni))) == NULL)
		goto err0;

	/* Figure out where to put the round keys. */
	rkey_offset = (uintptr_t)(&kexp->rkeys_buf[0]) % sizeof(__m128i);
	rkey_offset = (sizeof(__m128i) - rkey_offset) % sizeof(__m128i);
	kexp->rkeys = (void *)&kexp->rkeys_buf[rkey_offset];

	/* Compute round keys. */
	if (len == 16) {
		kexp->nr = 10;
		crypto_aes_key_expand_128_aesni(key, kexp->rkeys);
	} else if (len == 32) {
		kexp->nr = 14;
		crypto_aes_key_expand_256_aesni(key, kexp->rkeys);
	} else {
		warn0("Unsupported AES key length: %zu bytes", len);
		goto err1;
	}

	/* Success! */
	return (kexp);

err1:
	free(kexp);
err0:
	/* Failure! */
	return (NULL);
}

/**
 * crypto_aes_encrypt_block_aesni(in, out, key):
 * Using the expanded AES key ${key}, encrypt the block ${in} and write the
 * resulting ciphertext to ${out}. This implementation uses x86 AESNI
 * instructions, and should only be used if CPUSUPPORT_X86_AESNI is defined
 * and cpusupport_x86_aesni() returns nonzero.
 */
void
crypto_aes_encrypt_block_aesni(const uint8_t * in, uint8_t * out,
    const void * key)
{
	const struct crypto_aes_key_aesni * _key = key;
	const __m128i * aes_key = _key->rkeys;
	__m128i aes_state;
	size_t nr = _key->nr;

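	/* Load the plaintext and perform the initial AddRoundKey. */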
	aes_state = _mm_loadu_si128((const __m128i *)in);
	aes_state = _mm_xor_si128(aes_state, aes_key[0]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[1]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[2]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[3]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[4]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[5]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[6]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[7]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[8]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[9]);
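
	/* Keys longer than 128 bits need extra rounds; here nr is 10 or 14. */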
	if (nr > 10) {
		aes_state = _mm_aesenc_si128(aes_state, aes_key[10]);
		aes_state = _mm_aesenc_si128(aes_state, aes_key[11]);

		if (nr > 12) {
			aes_state = _mm_aesenc_si128(aes_state, aes_key[12]);
			aes_state = _mm_aesenc_si128(aes_state, aes_key[13]);
		}
	}

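	/* The final round uses aesenclast, which omits MixColumns. */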
	aes_state = _mm_aesenclast_si128(aes_state, aes_key[nr]);
	_mm_storeu_si128((__m128i *)out, aes_state);
}

/**
 * crypto_aes_key_free_aesni(key):
 * Free the expanded AES key ${key}.
 */
void
crypto_aes_key_free_aesni(void * key)
{

	/* Behave consistently with free(NULL). */
	if (key == NULL)
		return;

	/* Attempt to zero the expanded key. */
	insecure_memzero(key, sizeof(struct crypto_aes_key_aesni));

	/* Free the key. */
	free(key);
}
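
/**
 * Example usage (an illustrative sketch, not part of the build; it assumes
 * the caller has already checked that cpusupport_x86_aesni() returns
 * nonzero, and omits the error handling a real caller would need):
 *
 *	uint8_t key[32];	// key material
 *	uint8_t pt[16];		// plaintext block
 *	uint8_t ct[16];		// ciphertext block
 *	void * kexp;
 *
 *	if ((kexp = crypto_aes_key_expand_aesni(key, 32)) == NULL)
 *		goto err;
 *	crypto_aes_encrypt_block_aesni(pt, ct, kexp);
 *	crypto_aes_key_free_aesni(kexp);
 */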

#endif /* CPUSUPPORT_X86_AESNI */