/*
 * version 20110505
 * D. J. Bernstein
 * Public domain.
 *
 * Based on crypto_core/salsa208/armneon/core.c from SUPERCOP 20130419
 */

#include <arm_neon.h>	/* NEON vector types and intrinsics used below; harmless if the including file already provides it */

#define ROUNDS 8
static void
salsa20_8_intrinsic(void * input)
{
	int i;

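	/*
	 * Bit-select mask for the vbslq_u32 calls below: -1 sets all 32 bits of a
	 * lane, so lanes 0 and 2 are taken from the first operand and lanes 1 and 3
	 * from the second.
	 */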
	const uint32x4_t abab = {-1,0,-1,0};

	/*
	 * This is modified since we only have one argument. Usually you'd rearrange
	 * the constant, key, and input bytes, but here we just have one linear array
	 * to rearrange, which is a bit easier.
	 */

	/*
	 * Change the input to be diagonals as if it's a 4x4 matrix of 32-bit values.
	 */
	uint32x4_t x0x5x10x15;
	uint32x4_t x12x1x6x11;
	uint32x4_t x8x13x2x7;
	uint32x4_t x4x9x14x3;

	uint32x4_t x0x1x10x11;
	uint32x4_t x12x13x6x7;
	uint32x4_t x8x9x2x3;
	uint32x4_t x4x5x14x15;

	uint32x4_t x0x1x2x3;
	uint32x4_t x4x5x6x7;
	uint32x4_t x8x9x10x11;
	uint32x4_t x12x13x14x15;

	x0x1x2x3 = vreinterpretq_u32_u8(vld1q_u8((uint8_t *) input));
	x4x5x6x7 = vreinterpretq_u32_u8(vld1q_u8(16 + (uint8_t *) input));
	x8x9x10x11 = vreinterpretq_u32_u8(vld1q_u8(32 + (uint8_t *) input));
	x12x13x14x15 = vreinterpretq_u32_u8(vld1q_u8(48 + (uint8_t *) input));

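	/*
	 * First shuffle stage: combine the low 64 bits of each row with the high
	 * 64 bits of the row two positions below it (wrapping around).
	 */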
	x0x1x10x11 = vcombine_u32(vget_low_u32(x0x1x2x3), vget_high_u32(x8x9x10x11));
	x4x5x14x15 = vcombine_u32(vget_low_u32(x4x5x6x7), vget_high_u32(x12x13x14x15));
	x8x9x2x3 = vcombine_u32(vget_low_u32(x8x9x10x11), vget_high_u32(x0x1x2x3));
	x12x13x6x7 = vcombine_u32(vget_low_u32(x12x13x14x15), vget_high_u32(x4x5x6x7));

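	/*
	 * Second shuffle stage: interleave lanes with the abab mask so each vector
	 * holds one wrapped diagonal of the 4x4 matrix, e.g. x0x5x10x15 is the
	 * main diagonal x0, x5, x10, x15.
	 */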
	x0x5x10x15 = vbslq_u32(abab,x0x1x10x11,x4x5x14x15);
	x8x13x2x7 = vbslq_u32(abab,x8x9x2x3,x12x13x6x7);
	x4x9x14x3 = vbslq_u32(abab,x4x5x14x15,x8x9x2x3);
	x12x1x6x11 = vbslq_u32(abab,x12x13x6x7,x0x1x10x11);

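	/* Keep a copy of the rearranged input for the feed-forward addition after
	 * the rounds (the Salsa20 core adds the input to the permuted state). */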
	uint32x4_t start0 = x0x5x10x15;
	uint32x4_t start1 = x12x1x6x11;
	uint32x4_t start3 = x4x9x14x3;
	uint32x4_t start2 = x8x13x2x7;

	/* From here on this should be the same as the SUPERCOP version. */

	uint32x4_t diag0 = start0;
	uint32x4_t diag1 = start1;
	uint32x4_t diag2 = start2;
	uint32x4_t diag3 = start3;

	uint32x4_t a0;
	uint32x4_t a1;
	uint32x4_t a2;
	uint32x4_t a3;

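	/*
	 * Each loop iteration is one Salsa20 double round: a column round followed
	 * by a row round, so ROUNDS/2 = 4 iterations give Salsa20/8. Each
	 * vshlq_n_u32/vsriq_n_u32 pair below is a 32-bit rotate left (shift left
	 * by 7 and shift-right-insert by 25 rotates by 7); the rotation amounts
	 * 7, 9, 13, 18 are the standard Salsa20 quarter-round constants.
	 */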
	for (i = ROUNDS;i > 0;i -= 2) {
		a0 = diag1 + diag0;
		diag3 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
		a1 = diag0 + diag3;
		diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
		a2 = diag3 + diag2;
		diag1 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
		a3 = diag2 + diag1;
		diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

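		/* Rotate lanes so the following row round can reuse the same register
		 * pattern as the column round above. */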
		diag3 = vextq_u32(diag3,diag3,3);
		diag2 = vextq_u32(diag2,diag2,2);
		diag1 = vextq_u32(diag1,diag1,1);

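		/* Second half of the double round (the row round), with the same
		 * rotation constants. */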
		a0 = diag3 + diag0;
		diag1 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
		a1 = diag0 + diag1;
		diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
		a2 = diag1 + diag2;
		diag3 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
		a3 = diag2 + diag3;
		diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);

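		/* Undo the lane rotations so the next iteration starts from the
		 * original diagonal alignment. */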
		diag1 = vextq_u32(diag1,diag1,3);
		diag2 = vextq_u32(diag2,diag2,2);
		diag3 = vextq_u32(diag3,diag3,1);
	}

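	/* Feed-forward: add the original (rearranged) input words to the permuted
	 * state, the final step of the Salsa20 core. */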
	x0x5x10x15 = diag0 + start0;
	x12x1x6x11 = diag1 + start1;
	x8x13x2x7 = diag2 + start2;
	x4x9x14x3 = diag3 + start3;

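	/*
	 * Invert the initial rearrangement: the vbslq/vcombine stages below undo
	 * the diagonalizing shuffle applied to the input, restoring linear word
	 * order before the stores.
	 */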
	x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
	x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
	x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
	x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);

	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));

	vst1q_u8((uint8_t *) input,vreinterpretq_u8_u32(x0x1x2x3));
	vst1q_u8(16 + (uint8_t *) input,vreinterpretq_u8_u32(x4x5x6x7));
	vst1q_u8(32 + (uint8_t *) input,vreinterpretq_u8_u32(x8x9x10x11));
	vst1q_u8(48 + (uint8_t *) input,vreinterpretq_u8_u32(x12x13x14x15));
}