Dees_Troy | 51a0e82 | 2012-09-05 15:24:24 -0400 | [diff] [blame] | 1 | /*========================================================================= |
| 2 | * jdidct-armv7.s |
| 3 | * |
| 4 | * Copyright (c) 2010, Code Aurora Forum. All rights reserved. |
| 5 | * |
| 6 | * Redistribution and use in source and binary forms, with or without |
| 7 | * modification, are permitted provided that the following conditions are |
| 8 | * met: |
| 9 | * * Redistributions of source code must retain the above copyright |
| 10 | * notice, this list of conditions and the following disclaimer. |
| 11 | * * Redistributions in binary form must reproduce the above |
| 12 | * copyright notice, this list of conditions and the following |
| 13 | * disclaimer in the documentation and/or other materials provided |
| 14 | * with the distribution. |
| 15 | * * Neither the name of Code Aurora Forum, Inc. nor the names of its |
| 16 | * contributors may be used to endorse or promote products derived |
| 17 | * from this software without specific prior written permission. |
| 18 | * |
| 19 | * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED |
| 20 | * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF |
| 21 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT |
| 22 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS |
| 23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
| 24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
| 25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR |
| 26 | * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, |
| 27 | * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE |
| 28 | * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN |
| 29 | * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 30 | *========================================================================== |
| 31 | |
| 32 | *========================================================================== |
| 33 | * FUNCTION LIST |
| 34 | *-------------------------------------------------------------------------- |
| 35 | * - idct_1x1_venum |
| 36 | * - idct_2x2_venum |
| 37 | * - idct_4x4_venum |
| 38 | * - idct_8x8_venum |
| 39 | * |
| 40 | *========================================================================== |
| 41 | */ |
| 42 | |
| 43 | @========================================================================== |
| 44 | @ MACRO DEFINITION |
| 45 | @========================================================================== |
.macro Transpose8x8
        @------------------------------------------------------------------
        @ In-place transpose of an 8 x 8 matrix of 16-bit elements.
        @   Input    : q8..q15, one matrix row per q register
        @   Output   : q8..q15, transposed
        @   Clobbers : nothing outside q8..q15
        @
        @ Step 1 exchanges the two off-diagonal 4x4 quadrants (upper half
        @ of rows 0-3 with lower half of rows 4-7).  Steps 2 and 3 then
        @ transpose every 4x4 quadrant, first at 32-bit and then at 16-bit
        @ granularity; each vtrn on a q register handles both halves at
        @ once.
        @------------------------------------------------------------------

        @ Step 1: quadrant exchange (the four swaps are independent)
        vswp            d17, d24                @ q8  high <-> q12 low
        vswp            d19, d26                @ q9  high <-> q13 low
        vswp            d21, d28                @ q10 high <-> q14 low
        vswp            d23, d30                @ q11 high <-> q15 low

        @ Step 2: transpose pairs of 32-bit elements between rows n, n+2
        vtrn.32         q8,  q10
        vtrn.32         q9,  q11
        vtrn.32         q12, q14
        vtrn.32         q13, q15

        @ Step 3: transpose 16-bit elements between rows n, n+1
        vtrn.16         q8,  q9
        vtrn.16         q10, q11
        vtrn.16         q12, q13
        vtrn.16         q14, q15
.endm
| 70 | |
.macro IDCT1D
        @------------------------------------------------------------------
        @ One-dimensional 8-point inverse DCT applied to all eight lanes
        @ at once (64 elements in total).
        @   Input    : q8..q15 = vx0..vx7 (16-bit lanes)
        @              q0      = scalar constants, addressed by lane:
        @                        d0[0]=c4  d0[1]=a0  d0[2]=a1  d0[3]=a2
        @                        d1[1]=ma2
        @   Output   : q8..q15 = vy0..vy7
        @   Clobbers : q2..q7 (scratch); q0 is only read
        @ All arithmetic is saturating; vqrdmulh is a rounding, doubling
        @ multiply that keeps the high half of each product.
        @ The t-names below follow the original author's notation.
        @------------------------------------------------------------------

        @ ---- Stage 1: products (all six are mutually independent) -------
        vqrdmulh.s16    q2,  q14, d0[1]         @ q2  = a0  * vx6
        vqrdmulh.s16    q3,  q10, d0[1]         @ q3  = a0  * vx2
        vqrdmulh.s16    q4,  q15, d0[2]         @ q4  = a1  * vx7
        vqrdmulh.s16    q5,  q9,  d0[2]         @ q5  = a1  * vx1
        vqrdmulh.s16    q6,  q13, d0[3]         @ q6  = a2  * vx5
        vqrdmulh.s16    q7,  q11, d1[1]         @ q7  = ma2 * vx3

        @ Odd-input butterflies.  Order matters from here on: each
        @ destination is an input register that is dead after this line.
        vqadd.s16       q9,  q4,  q9            @ t1 = a1*vx7  + vx1
        vqsub.s16       q5,  q5,  q15           @ t8 = a1*vx1  - vx7
        vqadd.s16       q15, q6,  q11           @ t7 = a2*vx5  + vx3
        vqadd.s16       q11, q7,  q13           @ t3 = ma2*vx3 + vx5

        @ ---- Stage 2 ----------------------------------------------------
        vqadd.s16       q13, q8,  q12           @ t5 = vx0 + vx4
        vqsub.s16       q8,  q8,  q12           @ t0 = vx0 - vx4
        vqadd.s16       q10, q2,  q10           @ t2 = a0*vx6 + vx2
        vqsub.s16       q12, q3,  q14           @ t4 = a0*vx2 - vx6
        vqadd.s16       q14, q5,  q11           @ t6 = t8 + t3
        vqsub.s16       q11, q5,  q11           @ t3 = t8 - t3
        vqsub.s16       q5,  q9,  q15           @ t8 = t1 - t7
        vqadd.s16       q9,  q9,  q15           @ t1 = t1 + t7

        @ ---- Stage 3 ----------------------------------------------------
        vqadd.s16       q15, q13, q10           @ t7 = t5 + t2
        vqsub.s16       q10, q13, q10           @ t2 = t5 - t2
        vqadd.s16       q13, q8,  q12           @ t5 = t0 + t4
        vqsub.s16       q7,  q8,  q12           @ t0 = t0 - t4   (kept in q7)
        vqsub.s16       q12, q5,  q11           @ t4 = t8 - t3
        vqadd.s16       q11, q5,  q11           @ t3 = t8 + t3

        @ ---- Stage 4: final butterflies ---------------------------------
        vqadd.s16       q8,  q15, q9            @ vy0 = t7 + t1
        vqsub.s16       q15, q15, q9            @ vy7 = t7 - t1
        vqrdmulh.s16    q6,  q12, d0[0]         @ q6  = c4 * t4
        vqrdmulh.s16    q4,  q11, d0[0]         @ q4  = c4 * t3
        vqsub.s16       q12, q10, q14           @ vy4 = t2 - t6
        vqadd.s16       q11, q10, q14           @ vy3 = t2 + t6
        vqadd.s16       q10, q7,  q6            @ vy2 = t0 + c4*t4
        vqsub.s16       q14, q13, q4            @ vy6 = t5 - c4*t3
        vqadd.s16       q9,  q13, q4            @ vy1 = t5 + c4*t3
        vqsub.s16       q13, q7,  q6            @ vy5 = t0 - c4*t4
.endm
| 124 | |
.macro PART1
        @------------------------------------------------------------------
        @ Fetch the 8x8 block of 16-bit coefficients and pre-shift it.
        @   Input    : r0       = address of row 0 (rows are contiguous)
        @              r7,r8,r9 = addresses of constants[2], [3], [4]
        @   Output   : q8..q15  = row0..row7, each lane << 4 (saturating)
        @              q2,q3,q4 = constants[2], [3], [4]
        @              r0       = advanced by 128 bytes
        @ NOTE(review): the original author questioned whether a shift of
        @ 4 is too large ("maximum MPEG input is 2047/-2048", "shift 1
        @ instead of 4"); the saturating shift is kept exactly as it was.
        @------------------------------------------------------------------

        @ Two rows per load; lane and register layout is the same as
        @ loading each row separately.
        vld1.16         {d16, d17, d18, d19}, [r0]!     @ q8  = row0, q9  = row1
        vld1.16         {d20, d21, d22, d23}, [r0]!     @ q10 = row2, q11 = row3
        vld1.16         {d24, d25, d26, d27}, [r0]!     @ q12 = row4, q13 = row5
        vld1.16         {d28, d29, d30, d31}, [r0]!     @ q14 = row6, q15 = row7

        vqshl.s16       q8,  q8,  #4
        vqshl.s16       q9,  q9,  #4
        vqshl.s16       q10, q10, #4
        vqshl.s16       q11, q11, #4
        vqshl.s16       q12, q12, #4
        vqshl.s16       q13, q13, #4
        vqshl.s16       q14, q14, #4
        vqshl.s16       q15, q15, #4

        @ q2..q4 double as IDCT1D scratch, so the prescale tables have to
        @ be (re)loaded every time a block is processed.
        vld1.16         {d4, d5}, [r7]          @ q2 = constants[2]
        vld1.16         {d6, d7}, [r8]          @ q3 = constants[3]
        vld1.16         {d8, d9}, [r9]          @ q4 = constants[4]
.endm
| 158 | |
.macro PART2
        @------------------------------------------------------------------
        @ Two-dimensional IDCT proper: prescale, first 1-D pass, transpose,
        @ bias, second 1-D pass, descale, clamp and store.
        @   Input    : q8..q15 = shifted and transposed coefficients
        @              q0      = scalar constants (constants[0])
        @              q1..q4  = prescale tables constants[1]..[4]
        @              r1      = address of output row 0
        @              r2      = output row stride in bytes
        @   Output   : eight rows of eight 16-bit samples in [0, 255]
        @   Clobbers : q2..q15, r1, r5
        @------------------------------------------------------------------

        @ ---- Prescale: rows k and 8-k share a table (row 0 pairs with 4)
        vqrdmulh.s16    q8,  q8,  q1            @ vx0 = row0 * constants[1]
        vqrdmulh.s16    q9,  q9,  q2            @ vx1 = row1 * constants[2]
        vqrdmulh.s16    q10, q10, q3            @ vx2 = row2 * constants[3]
        vqrdmulh.s16    q11, q11, q4            @ vx3 = row3 * constants[4]
        vqrdmulh.s16    q12, q12, q1            @ vx4 = row4 * constants[1]
        vqrdmulh.s16    q13, q13, q4            @ vx5 = row5 * constants[4]
        vqrdmulh.s16    q14, q14, q3            @ vx6 = row6 * constants[3]
        vqrdmulh.s16    q15, q15, q2            @ vx7 = row7 * constants[2]

        @ ---- First 1-D pass ---------------------------------------------
        IDCT1D

        @ ---- Transpose the intermediate result for the second pass ------
        @ Same sequence as Transpose8x8, with the bias load slotted in
        @ after the swaps (the original author notes this saves a cycle).
        vswp            d17, d24                @ q8  high <-> q12 low
        vswp            d23, d30                @ q11 high <-> q15 low
        vswp            d21, d28                @ q10 high <-> q14 low
        vswp            d19, d26                @ q9  high <-> q13 low

        vdup.32         q4, d1[1]               @ q4 = top 32 bits of q0 (two
                                                @ 16-bit bias values), repeated

        vtrn.32         q8,  q10
        vtrn.32         q9,  q11
        vtrn.32         q12, q14
        vtrn.32         q13, q15
        vtrn.16         q8,  q9
        vtrn.16         q10, q11
        vtrn.16         q12, q13
        vtrn.16         q14, q15

        @ ---- Bias: added to vx0 only.  IDCT1D passes vx0 to every one
        @      of its outputs with unit gain, so this offsets all outputs.
        vqadd.s16       q8, q8, q4

        @ ---- Second 1-D pass --------------------------------------------
        IDCT1D

        @ ---- Descale (>> 6), clamp to [0, 255], store -------------------
        @ Stores are issued in groups of four; the original author notes
        @ the store queue is four entries deep.
        ldr             r5, =constants+5*16     @ r5 -> constants[5] (clamp limits)
        vld1.16         {d10}, [r5]             @ d10 = 0 | 255 | 0 | 0
        vdup.16         q6, d10[0]              @ q6 = 0   in every lane
        vdup.16         q7, d10[1]              @ q7 = 255 in every lane

        vshr.s16        q8,  q8,  #6            @ vy0
        vmax.s16        q8,  q8,  q6            @   >= 0
        vmin.s16        q8,  q8,  q7            @   <= 255

        vshr.s16        q9,  q9,  #6            @ vy1
        vmax.s16        q9,  q9,  q6
        vmin.s16        q9,  q9,  q7

        vshr.s16        q10, q10, #6            @ vy2
        vmax.s16        q10, q10, q6
        vmin.s16        q10, q10, q7

        vshr.s16        q11, q11, #6            @ vy3
        vmax.s16        q11, q11, q6
        vmin.s16        q11, q11, q7

        vst1.16         {d16, d17}, [r1], r2    @ row 0
        vst1.16         {d18, d19}, [r1], r2    @ row 1
        vst1.16         {d20, d21}, [r1], r2    @ row 2
        vst1.16         {d22, d23}, [r1], r2    @ row 3

        vshr.s16        q12, q12, #6            @ vy4
        vmax.s16        q12, q12, q6
        vmin.s16        q12, q12, q7

        vshr.s16        q13, q13, #6            @ vy5
        vmax.s16        q13, q13, q6
        vmin.s16        q13, q13, q7

        vshr.s16        q14, q14, #6            @ vy6
        vmax.s16        q14, q14, q6
        vmin.s16        q14, q14, q7

        vshr.s16        q15, q15, #6            @ vy7
        vmax.s16        q15, q15, q6
        vmin.s16        q15, q15, q7

        vst1.16         {d24, d25}, [r1], r2    @ row 4
        vst1.16         {d26, d27}, [r1], r2    @ row 5
        vst1.16         {d28, d29}, [r1], r2    @ row 6
        vst1.16         {d30, d31}, [r1]        @ row 7 (no write-back)
.endm
| 272 | |
.macro BIG_BODY_TRANSPOSE_INPUT
        @------------------------------------------------------------------
        @ Complete 8x8 IDCT body for one block.  Expects the register
        @ state established by IDCT_ENTRY plus r0/r1/r2 = input pointer,
        @ output pointer and output stride.
        @------------------------------------------------------------------
        PART1                                   @ load + pre-shift, reload q2..q4
        Transpose8x8                            @ transpose the input block
        PART2                                   @ prescale, 2-D IDCT, clamp, store
.endm
| 281 | |
.macro IDCT_ENTRY
        @------------------------------------------------------------------
        @ Set up constant-table pointers and the two tables that stay
        @ resident for the whole transform.
        @   Output : r5..r9 = addresses of constants[0]..constants[4]
        @            q0     = constants[0] (scalar multipliers and bias)
        @            q1     = constants[1] (prescale for rows 0 and 4)
        @ constants[2]..[4] are not loaded here: their registers are
        @ reused as scratch, so PART1 loads them through r7..r9.
        @------------------------------------------------------------------
        ldr             r5, =constants+0*16     @ r5 -> constants[0]
        vld1.16         {d0, d1}, [r5]          @ q0 =  constants[0]

        ldr             r6, =constants+1*16     @ r6 -> constants[1]
        vld1.16         {d2, d3}, [r6]          @ q1 =  constants[1]

        ldr             r7, =constants+2*16     @ r7 -> constants[2]
        ldr             r8, =constants+3*16     @ r8 -> constants[3]
        ldr             r9, =constants+4*16     @ r9 -> constants[4]
.endm
| 299 | @========================================================================== |
| 300 | @ END of MACRO DEFINITION |
| 301 | @========================================================================== |
| 302 | |
| 303 | |
| 304 | .section idct_func, "x" @ ARE |
| 305 | .text @ idct_func, CODE, READONLY |
| 306 | .align 2 |
| 307 | .code 32 @ CODE32 |
| 308 | |
| 309 | @========================================================================== |
| 310 | @ Main Routine |
| 311 | @========================================================================== |
| 312 | |
| 313 | .global idct_1x1_venum |
| 314 | .global idct_2x2_venum |
| 315 | .global idct_4x4_venum |
| 316 | .global idct_8x8_venum |
| 317 | |
@==========================================================================
@ FUNCTION     : idct_1x1_venum
@--------------------------------------------------------------------------
@ DESCRIPTION  : ARM (integer-only) inverse DCT of a block in which only
@                the DC coefficient is used.  Produces one output sample:
@                    clamp((input[0] + 4 + (128 << 3)) >> 3, 0, 255)
@--------------------------------------------------------------------------
@ C PROTOTYPE  : void idct_1x1_venum(int16 * input,
@                                    int16 * output,
@                                    int32   stride)
@--------------------------------------------------------------------------
@ REG INPUT    : r0  pointer to input  (int16); only input[0] is read
@                r1  pointer to output (int16); only output[0] is written
@                r2  block stride (ignored; the register is reused as
@                    scratch)
@--------------------------------------------------------------------------
@ STACK ARG    : None
@ MEM INPUT    : None
@ REG OUTPUT   : None
@ MEM OUTPUT   : output[0]
@--------------------------------------------------------------------------
@ REG AFFECTED : r2, r3, condition flags
@ STACK USAGE  : none
@ CYCLES       : 17 cycles (original author's figure)
@--------------------------------------------------------------------------
@ NOTES        :
@ The result is stored with a halfword store so that exactly one int16
@ sample is written.  A word store here would also overwrite output[1]
@ with zero.
@==========================================================================
        .type           idct_1x1_venum, %function
idct_1x1_venum:

        ldrsh           r3, [r0]                @ r3 = DC coefficient, sign-extended
        ldr             r2, =1028               @ 1028 = 4 + (128 << 3):
                                                @   4   rounds the divide by 8,
                                                @   128 is the level offset, pre-scaled
        add             r2, r3, r2
        asrs            r2, r2, #3              @ divide by 8; N set if negative
        movmi           r2, #0                  @ clamp: negative -> 0
        cmp             r2, #255
        movgt           r2, #255                @ clamp: above 255 -> 255
        strh            r2, [r1]                @ write one int16 sample only
        bx              lr                      @ return to caller

@ end of idct_1x1_venum
| 369 | |
| 370 | |
@==========================================================================
@ FUNCTION     : idct_2x2_venum
@--------------------------------------------------------------------------
@ DESCRIPTION  : NEON inverse DCT using only the top-left 2x2 coefficients
@                of a block.  With y0,y1 = input row 0 and y2,y3 = input
@                row 1 (first two columns), it computes
@                    x0 = (y0+y2)+(y1+y3)     x1 = (y0+y2)-(y1+y3)
@                    x2 = (y0-y2)+(y1-y3)     x3 = (y0-y2)-(y1-y3)
@                then rounds each /8, adds 128 and clamps to [0, 255].
@--------------------------------------------------------------------------
@ C PROTOTYPE  : void idct_2x2_venum(int16 * input,
@                                    int16 * output,
@                                    int32   stride)
@--------------------------------------------------------------------------
@ REG INPUT    : r0  pointer to input (int16); 32 bytes are read and the
@                    second row is taken from byte offset 16
@                r1  pointer to output (int16)
@                r2  output row stride in bytes
@--------------------------------------------------------------------------
@ STACK ARG    : None
@ MEM INPUT    : None
@ REG OUTPUT   : None
@ MEM OUTPUT   : x0,x1 at [r1]; x2,x3 at [r1 + r2]
@--------------------------------------------------------------------------
@ REG AFFECTED : r1, q0, q8 - q13
@ STACK USAGE  : none
@ CYCLES       : 27 cycles (original author's figure, measured with the
@                earlier vstr-based stores)
@--------------------------------------------------------------------------
@ NOTES        :
@ Each output row is written with a single 32-bit lane store, so exactly
@ two int16 samples per row are touched.  Storing the whole d register
@ would also write two don't-care lanes into output columns 2 and 3.
@
@ NEON REGISTER ALLOCATION
@ =================================================
@ q0        : result x0..x3 (plus don't-care lanes)
@ q8        : input y0..y3 (d16, d17)
@ q9        : first butterfly  (d18, d19)
@ q10       : second butterfly (d20, d21)
@ q11       : offset 128
@ q12       : lower clamp, 0
@ q13       : upper clamp, 255
@ q1 - q7, q14, q15 : not used
@==========================================================================
        .type           idct_2x2_venum, %function
idct_2x2_venum:

        vld4.32         {d16, d17, d18, d19}, [r0]
                                                @ 4-way de-interleave of 32-bit words:
                                                @ d16 = y0 | y1 | y2 | y3   (LSB | MSB)

        vtrn.32         d16, d17                @ d16 = y0 | y1 | X | X
                                                @ d17 = y2 | y3 | X | X

        vqadd.s16       d18, d16, d17           @ d18 = y0+y2 | y1+y3 | X | X
        vqsub.s16       d19, d16, d17           @ d19 = y0-y2 | y1-y3 | X | X

        vtrn.16         d18, d19                @ d18 = y0+y2 | y0-y2 | X | X
                                                @ d19 = y1+y3 | y1-y3 | X | X

        vqadd.s16       d20, d18, d19           @ d20 = x0 | x2 | X | X
        vqsub.s16       d21, d18, d19           @ d21 = x1 | x3 | X | X

        vtrn.16         d20, d21                @ d20 = x0 | x1 | X | X
                                                @ d21 = x2 | x3 | X | X

        vrshr.s16       q10, q10, #3            @ rounding divide by 8

        vmov.i16        q11, #128               @ q11 = 128 in every lane
        vqadd.s16       q0, q10, q11            @ add the level offset

        vmov.i16        q12, #0                 @ q12 = 0   in every lane
        vmov.i16        q13, #255               @ q13 = 255 in every lane

        vmax.s16        q0, q0, q12             @ clamp >= 0
        vmin.s16        q0, q0, q13             @ clamp <= 255

        vst1.32         {d0[0]}, [r1], r2       @ row 0: x0 | x1, then r1 += stride
        vst1.32         {d1[0]}, [r1]           @ row 1: x2 | x3
        bx              lr                      @ return to caller

@ end of idct_2x2_venum
| 468 | |
| 469 | |
@==========================================================================
@ FUNCTION     : idct_4x4_venum
@--------------------------------------------------------------------------
@ DESCRIPTION  : NEON inverse DCT using the top-left 4x4 coefficients of a
@                block; adds 128 and clamps the result to [0, 255].
@--------------------------------------------------------------------------
@ C PROTOTYPE  : void idct_4x4_venum(int16 * input,
@                                    int16 * output,
@                                    int32   stride)
@--------------------------------------------------------------------------
@ REG INPUT    : r0  pointer to input (int16); four rows of four values,
@                    rows 16 bytes apart
@                r1  pointer to output (int16)
@                r2  output row stride in bytes
@--------------------------------------------------------------------------
@ STACK ARG    : None
@ MEM INPUT    : coefficient[0..3] (defined at the end of this file)
@ REG OUTPUT   : None
@ MEM OUTPUT   : four rows of four int16 samples, r2 bytes apart
@--------------------------------------------------------------------------
@ REG AFFECTED : r0, r1, r3, r12, q0 - q3, q8 - q15
@ STACK USAGE  : none
@ CYCLES       : 56 cycles (original author's figure)
@--------------------------------------------------------------------------
@ NOTES        :
@ Input rows are fetched at a fixed 16-byte pitch (one row of an 8x8
@ int16 coefficient block).  This is the same input layout the 2x2 and
@ 8x8 kernels in this file read; r2 describes the output buffer only.
@ NOTE(review): earlier revisions stepped the input pointer by r2 as
@ well; this is identical whenever the caller passes a stride of 16.
@
@ vqrdmulh doubles each product, rounds, and keeps the high 16 bits,
@ i.e. it multiplies by a Q15 coefficient and returns a Q0 result.
@
@ ARM REGISTER ALLOCATION
@ ==========================================
@ r0  : pointer to input data
@ r1  : pointer to output area
@ r2  : stride in the output buffer
@ r3  : pointer to coefficient sets, then the input row pitch (16)
@ r12 : pointer to coefficient sets
@
@ NEON REGISTER ALLOCATION
@ =================================================
@ q0        : coefficient[0], later the offset 128
@ q1 - q3   : coefficient[1] - coefficient[3]
@ q4 - q7   : not used
@ q8, q9    : input y0-y7 / y8-y15, then intermediate and final values
@ q10 - q15 : intermediate values (q10/q11 end as clamp limits)
@==========================================================================
        .type           idct_4x4_venum, %function
idct_4x4_venum:

        @ First two coefficient sets (horizontal pass)
        ldr             r3,  =coefficient+0*16
        ldr             r12, =coefficient+1*16
        vld1.16         {d0, d1}, [r3]          @ q0 = C4 | C2 | C4 | C6 | C4 | C2 | C4 | C6
        vld1.16         {d2, d3}, [r12]         @ q1 = C4 | C6 | C4 | C2 | C4 | C6 | C4 | C2

        @ Second two coefficient sets (vertical pass)
        ldr             r3,  =coefficient+2*16
        ldr             r12, =coefficient+3*16
        vld1.16         {d4, d5}, [r3]          @ q2 = C4 | C4 | C4 | C4 | C2 | C2 | C2 | C2
        vld1.16         {d6, d7}, [r12]         @ q3 = C4 | C4 | C4 | C4 | C6 | C6 | C6 | C6

        @ Input: four values from each of the first four rows
        mov             r3, #16                 @ input row pitch: 8 x int16
        vld1.16         {d16}, [r0], r3         @ d16 = y0  | y1  | y2  | y3
        vld1.16         {d17}, [r0], r3         @ d17 = y4  | y5  | y6  | y7
        vld1.16         {d18}, [r0], r3         @ d18 = y8  | y9  | y10 | y11
        vld1.16         {d19}, [r0], r3         @ d19 = y12 | y13 | y14 | y15

        @------------------------------------------------------------------
        @ Horizontal pass
        @   q8 = y0..y7 (rows 0,1)      q9 = y8..y15 (rows 2,3)
        @------------------------------------------------------------------
        vqrdmulh.s16    q10, q8, q0             @ C4*y0 | C2*y1 | C4*y2  | C6*y3  | C4*y4  | C2*y5  | C4*y6  | C6*y7
        vqrdmulh.s16    q11, q8, q1             @ C4*y0 | C6*y1 | C4*y2  | C2*y3  | C4*y4  | C6*y5  | C4*y6  | C2*y7
        vqrdmulh.s16    q12, q9, q0             @ C4*y8 | C2*y9 | C4*y10 | C6*y11 | C4*y12 | C2*y13 | C4*y14 | C6*y15
        vqrdmulh.s16    q13, q9, q1             @ C4*y8 | C6*y9 | C4*y10 | C2*y11 | C4*y12 | C6*y13 | C4*y14 | C2*y15

        vtrn.32         q10, q12                @ q10: C4*y0 | C2*y1 | C4*y8  | C2*y9  | C4*y4 | C2*y5 | C4*y12 | C2*y13
                                                @ q12: C4*y2 | C6*y3 | C4*y10 | C6*y11 | C4*y6 | C6*y7 | C4*y14 | C6*y15
        vtrn.32         q11, q13                @ q11: C4*y0 | C6*y1 | C4*y8  | C6*y9  | C4*y4 | C6*y5 | C4*y12 | C6*y13
                                                @ q13: C4*y2 | C2*y3 | C4*y10 | C2*y11 | C4*y6 | C2*y7 | C4*y14 | C2*y15

        vqadd.s16       q14, q10, q12           @ per row: C4*(a+c) | C2*b + C6*d   (a,b,c,d = the row's 4 inputs)
                                                @ q14: S0 | S2 | S8 | S10 | S4 | S6 | S12 | S14
        vqsub.s16       q15, q11, q13           @ per row: C4*(a-c) | C6*b - C2*d
                                                @ q15: S1 | S3 | S9 | S11 | S5 | S7 | S13 | S15

        vtrn.16         q14, q15                @ q14: S0 | S1 | S8  | S9  | S4 | S5 | S12 | S13
                                                @ q15: S2 | S3 | S10 | S11 | S6 | S7 | S14 | S15

        vqadd.s16       q8, q14, q15            @ q8:  Z0 | Z1 | Z8  | Z9  | Z4 | Z5 | Z12 | Z13
        vqsub.s16       q9, q14, q15            @ q9:  Z3 | Z2 | Z11 | Z10 | Z7 | Z6 | Z15 | Z14
        vrev32.16       q9, q9                  @ q9:  Z2 | Z3 | Z10 | Z11 | Z6 | Z7 | Z14 | Z15

        @------------------------------------------------------------------
        @ Vertical pass
        @------------------------------------------------------------------
        vtrn.32         q8, q9                  @ q8: Z0 | Z1 | Z2  | Z3  | Z4  | Z5  | Z6  | Z7   (rows 0,1)
                                                @ q9: Z8 | Z9 | Z10 | Z11 | Z12 | Z13 | Z14 | Z15  (rows 2,3)

        vqrdmulh.s16    q10, q8, q2             @ C4*row0 | C2*row1
        vqrdmulh.s16    q11, q8, q3             @ C4*row0 | C6*row1
        vqrdmulh.s16    q12, q9, q2             @ C4*row2 | C2*row3
        vqrdmulh.s16    q13, q9, q3             @ C4*row2 | C6*row3

        vqadd.s16       q14, q10, q13           @ C4*(row0+row2) | C2*row1 + C6*row3
        vqsub.s16       q15, q11, q12           @ C4*(row0-row2) | C6*row1 - C2*row3

        vswp            d29, d30                @ q14: C4*(row0+row2)    | C4*(row0-row2)
                                                @ q15: C2*row1 + C6*row3 | C6*row1 - C2*row3

        vqadd.s16       q8, q14, q15            @ q8: output row 0 | output row 1
        vqsub.s16       q9, q14, q15            @ q9: output row 3 | output row 2

        @------------------------------------------------------------------
        @ Level offset, clamp and store
        @------------------------------------------------------------------
        vmov.i16        q10, #0                 @ q10 = 0   in every lane
        vmov.i16        q11, #255               @ q11 = 255 in every lane
        vmov.i16        q0,  #128               @ q0  = 128 in every lane

        vqadd.s16       q8, q8, q0              @ add the offset
        vqadd.s16       q9, q9, q0

        vmax.s16        q8, q8, q10             @ clamp >= 0
        vmin.s16        q8, q8, q11             @ clamp <= 255
        vmax.s16        q9, q9, q10
        vmin.s16        q9, q9, q11

        vst1.16         {d16}, [r1], r2         @ output row 0
        vst1.16         {d17}, [r1], r2         @ output row 1
        vst1.16         {d19}, [r1], r2         @ output row 2 (high half of q9)
        vst1.16         {d18}, [r1], r2         @ output row 3 (low half of q9)

        bx              lr                      @ return to caller

@ end of idct_4x4_venum
| 638 | |
@==========================================================================
@ FUNCTION     : idct_8x8_venum
@--------------------------------------------------------------------------
@ DESCRIPTION  : NEON inverse DCT of one full 8x8 block.
@--------------------------------------------------------------------------
@ C PROTOTYPE  : void idct_8x8_venum(int16 * input,
@                                    int16 * output,
@                                    int32   stride)
@--------------------------------------------------------------------------
@ REG INPUT    : r0  pointer to input (int16), 64 contiguous values
@                r1  pointer to output (int16)
@                r2  output row stride in bytes
@--------------------------------------------------------------------------
@ STACK ARG    : None
@ MEM INPUT    : constants[0..5] (defined at the end of this file)
@ REG OUTPUT   : None
@ MEM OUTPUT   : eight rows of eight int16 samples in [0, 255]
@--------------------------------------------------------------------------
@ REG AFFECTED : r0, r1, q0 - q3, q8 - q15
@                (r5 - r9 and d8 - d15 are used but saved and restored)
@ STACK USAGE  : 84 bytes (r5 - r9 = 20 bytes, d8 - d15 = 64 bytes)
@ CYCLES       : 177 cycles (original author's figure)
@--------------------------------------------------------------------------
@ NOTES        :
@ The original author reports this routine was tested to be IEEE 1180
@ compliant and, that being the stricter requirement, MPEG-4 compliant.
@
@ CODE STRUCTURE:
@   (i)   Transpose8x8 transposes an 8x8 matrix held in q8..q15
@   (ii)  IDCT1D is the four-stage one-dimensional transform
@   (iii) PART1 / PART2 / IDCT_ENTRY assemble the two-dimensional code
@   (iv)  constants live in the data area at the end of the file
@
@ PROGRAM FLOW (a single block per call; there is no loop):
@   save r5-r9 and d8-d15
@   IDCT_ENTRY : load table addresses, q0 and q1
@   PART1      : load the input matrix, shift left by 4, reload q2-q4
@   Transpose8x8
@   PART2      : prescale with the constant tables
@                1-D IDCT
@                transpose, add the bias
@                1-D IDCT
@                shift right by 6, clamp to [0, 255], store
@   restore d8-d15 and r5-r9, return
@
@ ARM REGISTER ALLOCATION
@ ==========================================
@ r0  : pointer to input data
@ r1  : pointer to output area
@ r2  : stride in the output buffer
@ r3  : not used
@ r4  : not used
@ r5  : pointer to constants[0], later constants[5]
@ r6  : pointer to constants[1]
@ r7  : pointer to constants[2]
@ r8  : pointer to constants[3]
@ r9  : pointer to constants[4]
@
@ NEON REGISTER ALLOCATION
@ =================================================
@ q0        : constants[0]
@ q1        : constants[1]
@ q2        : constants[2], IDCT1D scratch
@ q3        : constants[3], IDCT1D scratch
@ q4        : constants[4], IDCT1D scratch, bias
@ q5        : IDCT1D scratch, clamp-limit load
@ q6, q7    : IDCT1D scratch, clamp limits 0 / 255
@ q8 - q15  : Matrix[0] - Matrix[7], transformed in place
@==========================================================================
        .type           idct_8x8_venum, %function
idct_8x8_venum:

        stmfd           sp!, {r5-r9}            @ save the callee-saved core registers the macros overwrite
        vstmdb          sp!, {d8-d15}           @ save callee-saved q4-q7
        IDCT_ENTRY                              @ r5-r9, q0, q1 <- constant tables
        BIG_BODY_TRANSPOSE_INPUT                @ full 2-D transform and store
        vldmia          sp!, {d8-d15}           @ restore q4-q7
        ldmfd           sp!, {r5-r9}            @ restore r5-r9
        bx              lr                      @ return to caller
@ end of idct_8x8_venum
| 740 | |
| 741 | @========================================================================== |
| 742 | @ Constants Definition AREA: define idct kernel, bias |
| 743 | @========================================================================== |
| 744 | .section ro_data_area @ AREA RODataArea |
| 745 | .data @ DATA, READONLY |
| 746 | .align 5 @ ALIGN=5 |
| 747 | |
| 748 | constants: |
| 749 | .hword 23170, 13573, 6518, 21895, -23170, -21895, 8223, 8224 |
| 750 | .hword 16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725 |
| 751 | .hword 22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521 |
| 752 | .hword 21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692 |
| 753 | .hword 19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722 |
| 754 | .hword 0, 255, 0, 0 |
| 755 | |
| 756 | coefficient: @ These are the coefficent used by 4x4 iDCT in Q15 format |
| 757 | .hword 11585, 15137, 11585, 6270, 11585, 15137, 11585, 6270 @ C4, C2, C4, C6, C4, C2, C4, C6 /2 |
| 758 | .hword 11585, 6270, 11585, 15137, 11585, 6270, 11585, 15137 @ C4, C6, C4, C2, C4, C6, C4, C2 /2 |
| 759 | .hword 11585, 11585, 11585, 11585, 15137, 15137, 15137, 15137 @ C4, C4, C4, C4, C2, C2, C2, C2 /2 |
| 760 | .hword 11585, 11585, 11585, 11585, 6270, 6270, 6270, 6270 @ C4, C4, C4, C4, C6, C6, C6, C6 /2 |
| 761 | |
| 762 | .end |