/*
 * Copyright (C) 2010 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * This is a fast and accurate implementation of the inverse Discrete Cosine
 * Transform (IDCT) for ARMv6+. Like the other IDCT methods, it also
 * performs dequantization of the input coefficients.
 *
 * This implementation is based on the scaled 1-D DCT algorithm proposed by
 * Arai, Agui, and Nakajima. The code follows Figure 4-8 on page 52 of the
 * JPEG textbook by Pennebaker and Mitchell. Coefficients are (almost)
 * directly mapped into registers.
 *
 * The accuracy comes from the SMULWy and SMLAWy instructions, which
 * multiply 32 bits by 16 bits and keep the top 32 bits of the result. They
 * make 32-bit fixed-point arithmetic possible without overflow, something
 * that jpeg_idct_ifast(), being written in portable C, cannot achieve.
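 *
 * As a rough illustration (a hedged C sketch, not part of this file's
 * build), SMULWB behaves like the function below; the name is ours, and a
 * 64-bit intermediate stands in for the hardware's 48-bit product:
 *
 *   static inline int32_t smulwb(int32_t a, int32_t b) {
 *     // Multiply 32 bits by the bottom 16 bits of b; keep the top 32 bits.
 *     return (int32_t)(((int64_t)a * (int16_t)b) >> 16);
 *   }
 *
 * SMLAWB simply adds an accumulator: smlawb(a, b, acc) = smulwb(a, b) + acc.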
 *
 * More tricks are used to gain extra speed. First of all, we use as many
 * registers as possible. An ARM processor has 16 registers, including sp
 * (r13) and pc (r15), so only 14 registers can be used without limitations.
 * In general, we let r0 to r7 hold the coefficients; r10 and r11 hold four
 * 16-bit constants; r12 and r14 hold two of the four arguments; and r8
 * holds an intermediate value. In the second pass, r9 is the loop counter.
 * In the first pass, r8 to r11 are used to hold quantization values, so
 * the loop counter is held by sp. Yes, the stack pointer. Since it must be
 * aligned to a 4-byte boundary at all times, we align it to a 32-byte
 * boundary and use bit 3 to bit 5. As a result, we actually use 14.1
 * registers. :-)
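 *
 * Conceptually, the sp trick looks like this hedged C sketch (align_down
 * and store_column are hypothetical helpers): the store's write-back
 * decrements sp by 4 each iteration, so the loop ends after exactly eight
 * iterations, when sp reaches the next 32-byte boundary.
 *
 *   unsigned char *p = align_down(stack, 32);
 *   do {
 *     p -= 4;                            // str r0, [sp, #-4]!
 *     store_column(p);                   // eight words, 32 bytes apart
 *   } while (((uintptr_t)p & 31) != 0);  // ands r9, sp, #31; bne pass1_head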
 *
 * Second, we rearrange the quantization values so they can be accessed
 * sequentially. The table is first transposed, and the new columns are
 * placed in the order 7, 5, 1, 3, 0, 2, 4, 6, so we can use LDMDB to load
 * four values at a time. Rearranging the coefficients would also help, but
 * that requires changing a dozen files, which does not seem worth it. In
 * addition, we choose to scale the quantization values up by 13 bits, so
 * the coefficients are scaled up by 16 bits after both passes. Then we can
 * pack and saturate them two at a time using the PKHTB and USAT16
 * instructions.
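 *
 * As a point of reference (an illustrative C model assuming <stdint.h>,
 * not the actual code), USAT16 #8 clamps each halfword to the 0..255
 * range, and PKHTB with ASR #16 combines the top halfword of one register
 * with bits 31:16 of another:
 *
 *   static inline uint16_t sat8(int16_t v) {
 *     // Clamp one halfword to 0..255, as USAT16 #8 does per halfword.
 *     return v < 0 ? 0 : (v > 255 ? 255 : (uint16_t)v);
 *   }
 *
 *   static inline uint32_t pkhtb_asr16(uint32_t top, int32_t bottom) {
 *     // PKHTB rd, rt, rb, ASR #16: keep rt's top half, take rb >> 16.
 *     return (top & 0xffff0000u) | ((uint32_t)(bottom >> 16) & 0xffffu);
 *   }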
 *
 * Third, we reorder the instructions to avoid bubbles in the pipeline. This
 * is done by hand according to the cycle timings and the interlock behavior
 * described in the technical reference manual of the ARM1136JF-S. We also
 * take advantage of dual-issue processors by interleaving instructions from
 * independent dependency chains. The code has been benchmarked on four
 * devices, and every result showed a clear improvement. Note that PLD
 * instructions actually slowed things down, so they were removed at the
 * last minute. In the future, this might be further improved with a system
 * profiler.
 */
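
/*
 * The overall shape, as a hedged C sketch (idct_column and idct_row are
 * hypothetical helpers): the first pass transforms one column at a time
 * into the scratch frame on the stack, and the second pass transforms one
 * row at a time into the output rows.
 *
 *   for (int i = 0; i < 8; i++) idct_column(coefs, quans, scratch, i);
 *   for (int i = 0; i < 8; i++) idct_row(scratch, i, rows, col);
 */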

#ifdef __arm__
#include <machine/cpu-features.h>
#endif

#if __ARM_ARCH__ >= 6

// void armv6_idct(short *coefs, int *quans, unsigned char *rows, int col)
  .arm
  .text
  .align
  .global armv6_idct
  .func armv6_idct

armv6_idct:
  // Push everything except sp (r13) and pc (r15).
  stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}

  // Set up an aligned scratch frame, saving rows, col, and the unaligned
  // frame address at sp + 224.
  // r12 = quans + 256 (one past the table), r14 = coefs + 16.
  sub r4, sp, #236
  bic sp, r4, #31
  add r5, sp, #224
  add r12, r1, #256
  stm r5, {r2, r3, r4}
  add r14, r0, #16

pass1_head:
  // Load quantization values. (q[0, 2, 4, 6])
  ldmdb r12!, {r8, r9, r10, r11}

  // Load coefficients. (c[4, 1, 2, 3, 0, 5, 6, 7])
  ldrsh r4, [r14, #-2]!
  ldrsh r1, [r14, #16]
  ldrsh r2, [r14, #32]
  ldrsh r3, [r14, #48]
  ldrsh r0, [r14, #64]
  ldrsh r5, [r14, #80]
  ldrsh r6, [r14, #96]
  ldrsh r7, [r14, #112]

  // r4 = q[0] * c[0];
  mul r4, r8, r4

  // Check if the ACs are all zero. Each conditional ORR executes only
  // while the Z flag is still set, so one nonzero AC skips the branch.
  cmp r0, #0
  orreqs r8, r1, r2
  orreqs r8, r3, r5
  orreqs r8, r6, r7
  beq pass1_zero

  // Step 1: Dequantizations.

  // r2 = q[2] * c[2];
  // r0 = q[4] * c[4] + r4;
  // r6 = q[6] * c[6] + r2;
  mul r2, r9, r2
  mla r0, r10, r0, r4
  mla r6, r11, r6, r2

  // Load quantization values. (q[7, 5, 1, 3])
  ldmdb r12!, {r8, r9, r10, r11}

  // r4 = r4 * 2 - r0 = -(r0 - r4 * 2);
  // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
  rsb r4, r0, r4, lsl #1
  rsb r2, r6, r2, lsl #1

  // r7 = q[7] * c[7];
  // r5 = q[5] * c[5];
  // r1 = q[1] * c[1] + r7;
  // r3 = q[3] * c[3] + r5;
  mul r7, r8, r7
  mul r5, r9, r5
  mla r1, r10, r1, r7
  mla r3, r11, r3, r5

  // Load constants.
  ldrd r10, constants

  // Step 2: Rotations and Butterflies.

  // r7 = r1 - r7 * 2;
  // r1 = r1 - r3;
  // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
  // r3 = r1 + r3 * 2;
  // r8 = r5 + r7;
  sub r7, r1, r7, lsl #1
  sub r1, r1, r3
  rsb r5, r3, r5, lsl #1
  add r3, r1, r3, lsl #1
  add r8, r5, r7

  // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
  // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
  // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
  smlawt r2, r2, r10, r2
  smulwb r8, r8, r10
  smlawt r1, r1, r10, r1

  // r0 = r0 + r6;
  // r2 = r2 - r6;
  // r6 = r0 - r6 * 2;
  add r0, r0, r6
  sub r2, r2, r6
  sub r6, r0, r6, lsl #1

  // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
  // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
  smlawt r5, r5, r11, r8
  smlawb r8, r7, r11, r8

  // r4 = r4 + r2;
  // r0 = r0 + r3;
  // r2 = r4 - r2 * 2;
  add r4, r4, r2
  add r0, r0, r3
  sub r2, r4, r2, lsl #1

  // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
  // r3 = r0 - r3 * 2;
  // r1 = r1 - r7;
  // r4 = r4 + r7;
  // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
  // r7 = r4 - r7 * 2;
  rsb r7, r3, r5, lsl #3
  sub r3, r0, r3, lsl #1
  sub r1, r1, r7
  add r4, r4, r7
  rsb r5, r1, r8, lsl #3
  sub r7, r4, r7, lsl #1

  // r2 = r2 + r1;
  // r6 = r6 + r5;
  // r1 = r2 - r1 * 2;
  // r5 = r6 - r5 * 2;
  add r2, r2, r1
  add r6, r6, r5
  sub r1, r2, r1, lsl #1
  sub r5, r6, r5, lsl #1

  // Step 3: Reorder and Save.

  str r0, [sp, #-4]!
  str r4, [sp, #32]
  str r2, [sp, #64]
  str r6, [sp, #96]
  str r5, [sp, #128]
  str r1, [sp, #160]
  str r7, [sp, #192]
  str r3, [sp, #224]
  b pass1_tail

  // Precomputed 16-bit constants: 27146, 15137, -21407, -8867.
  // They are placed in the middle of the function because LDRD only
  // accepts offsets from -255 to 255.
  .align 3
constants:
  .word 0x6a0a3b21  // 27146 (top), 15137 (bottom)
  .word 0xac61dd5d  // -21407 (top), -8867 (bottom)

pass1_zero:
  // All the ACs were zero: replicate the scaled DC down the column.
  str r4, [sp, #-4]!
  str r4, [sp, #32]
  str r4, [sp, #64]
  str r4, [sp, #96]
  str r4, [sp, #128]
  str r4, [sp, #160]
  str r4, [sp, #192]
  str r4, [sp, #224]
  // Skip the unused quantization values. (q[7, 5, 1, 3])
  sub r12, r12, #16

pass1_tail:
  // Loop until sp reaches the next 32-byte boundary; this also leaves
  // r9 at zero for use as the second-pass loop counter.
  ands r9, sp, #31
  bne pass1_head

  // r12 = rows, r14 = col.
  ldr r12, [sp, #256]
  ldr r14, [sp, #260]

  // Load constants.
  ldrd r10, constants

pass2_head:
  // Load coefficients. (c[0, 1, 2, 3, 4, 5, 6, 7])
  ldmia sp!, {r0, r1, r2, r3, r4, r5, r6, r7}

  // r0 = r0 + 0x00808000;
  // Biasing the DC adds the +128 level shift plus a 0.5 rounding term
  // (both scaled up by 16 bits) to all eight outputs at once.
  add r0, r0, #0x00800000
  add r0, r0, #0x00008000

  // Step 1: Analogous to the first pass.

  // r0 = r0 + r4;
  // r6 = r6 + r2;
  add r0, r0, r4
  add r6, r6, r2

  // r4 = r0 - r4 * 2;
  // r2 = r2 * 2 - r6 = -(r6 - r2 * 2);
  sub r4, r0, r4, lsl #1
  rsb r2, r6, r2, lsl #1

  // r1 = r1 + r7;
  // r3 = r3 + r5;
  add r1, r1, r7
  add r3, r3, r5

  // Step 2: Rotations and Butterflies.

  // r7 = r1 - r7 * 2;
  // r1 = r1 - r3;
  // r5 = r5 * 2 - r3 = -(r3 - r5 * 2);
  // r3 = r1 + r3 * 2;
  // r8 = r5 + r7;
  sub r7, r1, r7, lsl #1
  sub r1, r1, r3
  rsb r5, r3, r5, lsl #1
  add r3, r1, r3, lsl #1
  add r8, r5, r7

  // r2 = r2 * 1.41421 = r2 * 27146 / 65536 + r2;
  // r8 = r8 * 1.84776 / 8 = r8 * 15137 / 65536;
  // r1 = r1 * 1.41421 = r1 * 27146 / 65536 + r1;
  smlawt r2, r2, r10, r2
  smulwb r8, r8, r10
  smlawt r1, r1, r10, r1

  // r0 = r0 + r6;
  // r2 = r2 - r6;
  // r6 = r0 - r6 * 2;
  add r0, r0, r6
  sub r2, r2, r6
  sub r6, r0, r6, lsl #1

  // r5 = r5 * -2.61313 / 8 + r8 = r5 * -21407 / 65536 + r8;
  // r8 = r7 * -1.08239 / 8 + r8 = r7 * -8867 / 65536 + r8;
  smlawt r5, r5, r11, r8
  smlawb r8, r7, r11, r8

  // r4 = r4 + r2;
  // r0 = r0 + r3;
  // r2 = r4 - r2 * 2;
  add r4, r4, r2
  add r0, r0, r3
  sub r2, r4, r2, lsl #1

  // r7 = r5 * 8 - r3 = -(r3 - r5 * 8);
  // r3 = r0 - r3 * 2;
  // r1 = r1 - r7;
  // r4 = r4 + r7;
  // r5 = r8 * 8 - r1 = -(r1 - r8 * 8);
  // r7 = r4 - r7 * 2;
  rsb r7, r3, r5, lsl #3
  sub r3, r0, r3, lsl #1
  sub r1, r1, r7
  add r4, r4, r7
  rsb r5, r1, r8, lsl #3
  sub r7, r4, r7, lsl #1

  // r2 = r2 + r1;
  // r6 = r6 + r5;
  // r1 = r2 - r1 * 2;
  // r5 = r6 - r5 * 2;
  add r2, r2, r1
  add r6, r6, r5
  sub r1, r2, r1, lsl #1
  sub r5, r6, r5, lsl #1

  // Step 3: Reorder and Save.

  // Load output pointer.
  ldr r8, [r12], #4

  // For little endian: r6, r2, r4, r0, r3, r7, r1, r5.
  pkhtb r6, r6, r4, asr #16
  pkhtb r2, r2, r0, asr #16
  pkhtb r3, r3, r1, asr #16
  pkhtb r7, r7, r5, asr #16
  usat16 r6, #8, r6
  usat16 r2, #8, r2
  usat16 r3, #8, r3
  usat16 r7, #8, r7
  orr r0, r2, r6, lsl #8
  orr r1, r7, r3, lsl #8

#ifdef __ARMEB__
  // Reverse bytes for big endian.
  rev r0, r0
  rev r1, r1
#endif

  // Use STR instead of STRD to support unaligned access.
  str r0, [r8, r14]!
  str r1, [r8, #4]

pass2_tail:
  // r9 was zeroed after the first pass; eight additions of 0x10000000
  // set the sign bit, which ends the loop.
  adds r9, r9, #0x10000000
  bpl pass2_head

  // Recover the original stack pointer from the saved frame address.
  ldr sp, [sp, #8]
  add sp, sp, #236

  ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
  bx lr
  .endfunc

#endif