/*=========================================================================
2* jdidct-armv7.s
3*
4* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
5*
6* Redistribution and use in source and binary forms, with or without
7* modification, are permitted provided that the following conditions are
8* met:
9* * Redistributions of source code must retain the above copyright
10* notice, this list of conditions and the following disclaimer.
11* * Redistributions in binary form must reproduce the above
12* copyright notice, this list of conditions and the following
13* disclaimer in the documentation and/or other materials provided
14* with the distribution.
15* * Neither the name of Code Aurora Forum, Inc. nor the names of its
16* contributors may be used to endorse or promote products derived
17* from this software without specific prior written permission.
18*
19* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
20* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
21* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
22* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
23* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
29* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30*==========================================================================
31
32*==========================================================================
33* FUNCTION LIST
34*--------------------------------------------------------------------------
35* - idct_1x1_venum
36* - idct_2x2_venum
37* - idct_4x4_venum
38* - idct_8x8_venum
39*
40*==========================================================================
41*/
42
43@==========================================================================
44@ MACRO DEFINITION
45@==========================================================================
.macro Transpose8x8
    @------------------------------------------------------------------
    @ In-place transpose of an 8x8 matrix of 16-bit elements.
    @   In / Out : q8 .. q15 (one matrix row per q register)
    @   Clobbers : nothing besides q8 .. q15
    @
    @ Three steps: exchange the off-diagonal 4x4 quadrants with d-register
    @ swaps, then transpose 2x2 groups of 32-bit pairs, then transpose
    @ the 16-bit elements inside every pair.
    @------------------------------------------------------------------

    @ Step 1: high half of q8..q11 <-> low half of q12..q15.
    vswp        d17, d24                @ q8  <-> q12
    vswp        d19, d26                @ q9  <-> q13
    vswp        d21, d28                @ q10 <-> q14
    vswp        d23, d30                @ q11 <-> q15

    @ Step 2: 32-bit transposes between rows two apart.
    vtrn.32     q8,  q10
    vtrn.32     q12, q14
    vtrn.32     q9,  q11
    vtrn.32     q13, q15

    @ Step 3: 16-bit transposes between adjacent rows.
    vtrn.16     q8,  q9
    vtrn.16     q12, q13
    vtrn.16     q10, q11
    vtrn.16     q14, q15
.endm
70
.macro IDCT1D
    @------------------------------------------------------------------
    @ One 1-D inverse DCT pass over eight vectors (8 lanes each, so
    @ 64 elements per invocation).
    @   In       : q8 .. q15 = vx0 .. vx7 (signed 16-bit lanes)
    @              q0        = scalar constants; lanes used below:
    @                          d0[0]=c4  d0[1]=a0  d0[2]=a1  d0[3]=a2
    @                          d1[1]=ma2
    @   Out      : q8 .. q15 = vy0 .. vy7
    @   Clobbers : q2 .. q7 (scratch), q8 .. q15
    @
    @ All adds/subs saturate (vqadd/vqsub); all multiplies are rounding
    @ doubling high-half multiplies (vqrdmulh).
    @ The instruction order is dependency-sensitive: several registers
    @ are overwritten as soon as their last reader has issued.
    @------------------------------------------------------------------

    @ ---- stage 1 ----------------------------------------------------
    vqrdmulh.s16    q4,  q15, d0[2]     @ q4  = a1  * vx7
    vqrdmulh.s16    q5,  q9,  d0[2]     @ q5  = a1  * vx1
    vqrdmulh.s16    q6,  q13, d0[3]     @ q6  = a2  * vx5
    vqrdmulh.s16    q7,  q11, d1[1]     @ q7  = ma2 * vx3
    vqrdmulh.s16    q2,  q14, d0[1]     @ q2  = a0  * vx6
    vqrdmulh.s16    q3,  q10, d0[1]     @ q3  = a0  * vx2
    vqadd.s16       q9,  q4,  q9        @ t1 (q9)  = a1*vx7  + vx1
    vqsub.s16       q5,  q5,  q15       @ t8 (q5)  = a1*vx1  - vx7
    vqadd.s16       q15, q6,  q11       @ t7 (q15) = a2*vx5  + vx3
    vqadd.s16       q11, q7,  q13       @ t3 (q11) = ma2*vx3 + vx5

    @ ---- stage 2 ----------------------------------------------------
    vqadd.s16       q13, q8,  q12       @ t5 (q13) = vx0 + vx4
    vqsub.s16       q8,  q8,  q12       @ t0 (q8)  = vx0 - vx4
    vqadd.s16       q10, q2,  q10       @ t2 (q10) = a0*vx6 + vx2
    vqsub.s16       q12, q3,  q14       @ t4 (q12) = a0*vx2 - vx6
    vqadd.s16       q14, q5,  q11       @ t6 (q14) = t8 + t3
    vqsub.s16       q11, q5,  q11       @ t3 (q11) = t8 - t3
    vqsub.s16       q5,  q9,  q15       @ t8 (q5)  = t1 - t7
    vqadd.s16       q9,  q9,  q15       @ t1 (q9)  = t1 + t7

    @ ---- stage 3 ----------------------------------------------------
    vqadd.s16       q15, q13, q10       @ t7 (q15) = t5 + t2
    vqsub.s16       q10, q13, q10       @ t2 (q10) = t5 - t2
    vqadd.s16       q13, q8,  q12       @ t5 (q13) = t0 + t4
    vqsub.s16       q7,  q8,  q12       @ t0 (q7)  = t0 - t4
    vqsub.s16       q12, q5,  q11       @ t4 (q12) = t8 - t3
    vqadd.s16       q11, q5,  q11       @ t3 (q11) = t8 + t3

    @ ---- stage 4 ----------------------------------------------------
    vqadd.s16       q8,  q15, q9        @ vy0 = t7 + t1
    vqsub.s16       q15, q15, q9        @ vy7 = t7 - t1
    vqrdmulh.s16    q6,  q12, d0[0]     @ q6  = c4 * t4
    vqrdmulh.s16    q4,  q11, d0[0]     @ q4  = c4 * t3
    vqsub.s16       q12, q10, q14       @ vy4 = t2 - t6
    vqadd.s16       q11, q10, q14       @ vy3 = t2 + t6
    vqadd.s16       q10, q7,  q6        @ vy2 = t0 + c4*t4
    vqsub.s16       q14, q13, q4        @ vy6 = t5 - c4*t3
    vqadd.s16       q9,  q13, q4        @ vy1 = t5 + c4*t3
    vqsub.s16       q13, q7,  q6        @ vy5 = t0 - c4*t4
.endm
124
.macro PART1
    @------------------------------------------------------------------
    @ Fetch the 8x8 block of 16-bit coefficients and pre-shift it.
    @   In       : r0 = input pointer (advanced by 128 bytes on exit)
    @              r7, r8, r9 = addresses of constants[2], [3], [4]
    @   Out      : q8 .. q15 = rows 0 .. 7, each lane saturating-shifted
    @              left by 4
    @              q2, q3, q4 = constants[2], constants[3], constants[4]
    @
    @ Note carried over from the original author: the shift of 4 was
    @ questioned ("input data too big?"; maximum MPEG input is
    @ 2047/-2048; "shift 1 instead of 4").
    @------------------------------------------------------------------

    @ Two rows (32 bytes) per load, then the matching shifts.
    vld1.16     {d16, d17, d18, d19}, [r0]!     @ q8  = row0, q9  = row1
    vqshl.s16   q8,  q8,  #4
    vqshl.s16   q9,  q9,  #4

    vld1.16     {d20, d21, d22, d23}, [r0]!     @ q10 = row2, q11 = row3
    vqshl.s16   q10, q10, #4
    vqshl.s16   q11, q11, #4

    vld1.16     {d24, d25, d26, d27}, [r0]!     @ q12 = row4, q13 = row5
    vqshl.s16   q12, q12, #4
    vqshl.s16   q13, q13, #4

    vld1.16     {d28, d29, d30, d31}, [r0]!     @ q14 = row6, q15 = row7
    vqshl.s16   q14, q14, #4
    vqshl.s16   q15, q15, #4

    @ q2..q4 double as IDCT1D scratch, so the prescale vectors they hold
    @ have to be (re)loaded for every block.
    vld1.16     {d4, d5}, [r7]                  @ q2 = constants[2]
    vld1.16     {d6, d7}, [r8]                  @ q3 = constants[3]
    vld1.16     {d8, d9}, [r9]                  @ q4 = constants[4]
.endm
158
.macro DESCALE_CLAMP row
    @ Helper for PART2: arithmetic shift right by 6, then clamp every
    @ lane of \row to [q6, q7] (lower bound in q6, upper bound in q7).
    vshr.s16    \row, \row, #6
    vmax.s16    \row, \row, q6
    vmin.s16    \row, \row, q7
.endm

.macro PART2
    @------------------------------------------------------------------
    @ Prescale, run both 1-D IDCT passes, descale, clamp and store.
    @   In       : q8 .. q15 = block to transform
    @              q0 = constants[0], q1 = constants[1],
    @              q2 .. q4 = constants[2] .. constants[4]
    @              r1 = output pointer, r2 = output row stride in bytes
    @   Out      : eight rows of eight 16-bit samples written at r1,
    @              r1 + r2, ...; r1 is left pointing at the last row
    @   Clobbers : r5, q2 .. q15
    @------------------------------------------------------------------

    @ ---- prescale: each row times its per-lane constant vector ------
    vqrdmulh.s16    q12, q12, q1        @ row4 * constants[1] -> vx4
    vqrdmulh.s16    q15, q15, q2        @ row7 * constants[2] -> vx7
    vqrdmulh.s16    q9,  q9,  q2        @ row1 * constants[2] -> vx1
    vqrdmulh.s16    q13, q13, q4        @ row5 * constants[4] -> vx5
    vqrdmulh.s16    q11, q11, q4        @ row3 * constants[4] -> vx3
    vqrdmulh.s16    q14, q14, q3        @ row6 * constants[3] -> vx6
    vqrdmulh.s16    q10, q10, q3        @ row2 * constants[3] -> vx2
    vqrdmulh.s16    q8,  q8,  q1        @ row0 * constants[1] -> vx0

    @ ---- first 1-D pass ---------------------------------------------
    IDCT1D

    @ ---- transpose for the second pass ------------------------------
    @ Same sequence as Transpose8x8, written out so the bias load can
    @ be slotted in between the swaps and the vtrn steps.
    vswp        d17, d24                @ q8  <-> q12
    vswp        d23, d30                @ q11 <-> q15
    vswp        d21, d28                @ q10 <-> q14
    vswp        d19, d26                @ q9  <-> q13

    vdup.32     q4, d1[1]               @ q4 = last 32 bits of constants[0]
                                        @ (the bias pair) in every word

    vtrn.32     q8,  q10
    vtrn.32     q9,  q11
    vtrn.32     q12, q14
    vtrn.32     q13, q15
    vtrn.16     q8,  q9
    vtrn.16     q10, q11
    vtrn.16     q12, q13
    vtrn.16     q14, q15

    @ ---- bias goes into q8 only -------------------------------------
    vqadd.s16   q8, q8, q4

    @ ---- second 1-D pass --------------------------------------------
    IDCT1D

    @ ---- descale by 2^6, clamp, store -------------------------------
    @ constants[5] starts with the halfwords 0 and 255: the clamp bounds.
    @ Stores are issued in groups of four (the original author cites a
    @ store queue depth of 4).
    ldr         r5, =constants+5*16     @ r5 -> constants[5]
    vld1.16     {d10}, [r5]
    vdup.16     q6, d10[0]              @ q6 = 0   in every lane
    vdup.16     q7, d10[1]              @ q7 = 255 in every lane

    DESCALE_CLAMP   q8                  @ vy0
    DESCALE_CLAMP   q9                  @ vy1
    DESCALE_CLAMP   q10                 @ vy2
    DESCALE_CLAMP   q11                 @ vy3

    vst1.16     {d16, d17}, [r1], r2    @ row0
    vst1.16     {d18, d19}, [r1], r2    @ row1
    vst1.16     {d20, d21}, [r1], r2    @ row2
    vst1.16     {d22, d23}, [r1], r2    @ row3

    DESCALE_CLAMP   q12                 @ vy4
    DESCALE_CLAMP   q13                 @ vy5
    DESCALE_CLAMP   q14                 @ vy6
    DESCALE_CLAMP   q15                 @ vy7

    vst1.16     {d24, d25}, [r1], r2    @ row4
    vst1.16     {d26, d27}, [r1], r2    @ row5
    vst1.16     {d28, d29}, [r1], r2    @ row6
    vst1.16     {d30, d31}, [r1]        @ row7 (no post-increment)
.endm
272
.macro BIG_BODY_TRANSPOSE_INPUT
    @------------------------------------------------------------------
    @ Complete 8x8 IDCT body: load and pre-shift the block, transpose
    @ it, then prescale / transform / clamp / store.
    @ Register requirements are those of the three macros it expands.
    @------------------------------------------------------------------
    PART1                               @ load + shift, reload q2..q4
    Transpose8x8                        @ transpose q8..q15 in place
    PART2                               @ prescale, 2 x IDCT1D, store
.endm
281
.macro IDCT_ENTRY
    @------------------------------------------------------------------
    @ Set up the constant-table pointers and the two constant vectors
    @ that stay live for the whole transform.
    @   Out : r5 .. r9 = addresses of constants[0] .. constants[4]
    @         q0 = constants[0] (scalar lanes), q1 = constants[1]
    @ constants[2] .. [4] are not loaded here; PART1 loads them through
    @ r7 .. r9.
    @------------------------------------------------------------------
    ldr         r5, =constants+0*16     @ r5 -> constants[0]
    ldr         r6, =constants+1*16     @ r6 -> constants[1]
    vld1.16     {d0, d1}, [r5]          @ q0 = constants[0]
    vld1.16     {d2, d3}, [r6]          @ q1 = constants[1]

    ldr         r7, =constants+2*16     @ r7 -> constants[2]
    ldr         r8, =constants+3*16     @ r8 -> constants[3]
    ldr         r9, =constants+4*16     @ r9 -> constants[4]
.endm
299@==========================================================================
300@ END of MACRO DEFINITION
301@==========================================================================
302
303
304 .section idct_func, "x" @ ARE
305 .text @ idct_func, CODE, READONLY
306 .align 2
307 .code 32 @ CODE32
308
309@==========================================================================
310@ Main Routine
311@==========================================================================
312
313 .global idct_1x1_venum
314 .global idct_2x2_venum
315 .global idct_4x4_venum
316 .global idct_8x8_venum
317
@==========================================================================
@ FUNCTION     : idct_1x1_venum
@--------------------------------------------------------------------------
@ DESCRIPTION  : ARM (integer-only) inverse DCT of a 1x1 block.
@                output[0] = clamp(((input[0] + 4) >> 3) + 128, 0, 255)
@--------------------------------------------------------------------------
@ C PROTOTYPE  : void idct_1x1_venum(int16 * input,
@                                    int16 * output,
@                                    int32   stride)
@--------------------------------------------------------------------------
@ REG INPUT    : r0  pointer to input  (int16), only input[0] is read
@                r1  pointer to output (int16), only output[0] is written
@                r2  block stride (ignored; r2 is reused as scratch)
@--------------------------------------------------------------------------
@ REG OUTPUT   : none
@ MEM OUTPUT   : one halfword at [r1]
@ REG AFFECTED : r2, r3, condition flags
@ STACK USAGE  : none
@--------------------------------------------------------------------------
@ NOTES        : The result is stored with STRH. An earlier revision used
@                a 32-bit STR, which also overwrote output[1] with zero.
@==========================================================================
    .type   idct_1x1_venum, %function
idct_1x1_venum:

    ldrsh   r3, [r0]                    @ r3 = (int32) input[0]
    ldr     r2, =1028                   @ 1028 = 4 + (128 << 3):
                                        @   4   rounds the divide by 8,
                                        @   128 << 3 is the +128 offset
                                        @   expressed before the shift
    add     r2, r3, r2
    asrs    r2, r2, #3                  @ divide by 8; N flag = sign
    movmi   r2, #0                      @ clamp below at 0
    cmp     r2, #255
    movgt   r2, #255                    @ clamp above at 255
    strh    r2, [r1]                    @ output[0] only (16-bit store)
    bx      lr
367
368 @ end of idct_1x1_venum
369
370
@==========================================================================
@ FUNCTION     : idct_2x2_venum
@--------------------------------------------------------------------------
@ DESCRIPTION  : NEON inverse DCT of a 2x2 block.
@--------------------------------------------------------------------------
@ C PROTOTYPE  : void idct_2x2_venum(int16 * input,
@                                    int16 * output,
@                                    int32   stride)
@--------------------------------------------------------------------------
@ REG INPUT    : r0  pointer to input  (int16). 32 bytes are read; the
@                    values used are halfwords 0,1 (y0,y1) and 8,9 (y2,y3)
@                r1  pointer to output (int16)
@                r2  byte offset between the two output rows
@--------------------------------------------------------------------------
@ REG OUTPUT   : none
@ MEM OUTPUT   : x0,x1 at [r1] and x2,x3 at [r1 + r2] (4 bytes per row)
@ REG AFFECTED : r1 (advanced by r2), q0, q8 - q13
@ STACK USAGE  : none
@--------------------------------------------------------------------------
@ NOTES        : Each row is written with a 32-bit lane store, so exactly
@                two samples per row are touched. An earlier revision
@                stored a full d register (8 bytes) per row and could
@                write past the end of a tightly sized output buffer.
@
@ NEON REGISTER USE
@   q8  (d16,d17) : input y0..y3
@   q9  (d18,d19) : first butterfly
@   q10 (d20,d21) : second butterfly / result before offset
@   q11           : +128 offset
@   q12, q13      : clamp bounds 0 and 255
@   q0            : final samples
@==========================================================================
    .type   idct_2x2_venum, %function
idct_2x2_venum:

    vld4.32     {d16, d17, d18, d19}, [r0]
                                        @ d16 = y0 | y1 | y2 | y3
                                        @ (32-bit de-interleave gathers
                                        @  words 0 and 4 of the input)

    vtrn.32     d16, d17                @ d16 = y0 | y1 | X | X
                                        @ d17 = y2 | y3 | X | X

    vqadd.s16   d18, d16, d17           @ d18 = y0+y2 | y1+y3 | X | X
    vqsub.s16   d19, d16, d17           @ d19 = y0-y2 | y1-y3 | X | X

    vtrn.16     d18, d19                @ d18 = y0+y2 | y0-y2 | X | X
                                        @ d19 = y1+y3 | y1-y3 | X | X

    vqadd.s16   d20, d18, d19           @ d20 = x0 | x2 | X | X
    vqsub.s16   d21, d18, d19           @ d21 = x1 | x3 | X | X

    vtrn.16     d20, d21                @ d20 = x0 | x1 | X | X
                                        @ d21 = x2 | x3 | X | X

    vrshr.s16   q10, q10, #3            @ rounding divide by 8

    vmov.i16    q11, #128
    vqadd.s16   q0, q10, q11            @ add +128 offset

    vmov.i16    q12, #0                 @ lower clamp bound, every lane
    vmov.i16    q13, #255               @ upper clamp bound, every lane
    vmax.s16    q0, q0, q12             @ clamp below at 0
    vmin.s16    q0, q0, q13             @ clamp above at 255

    @ Only the low 32 bits of d0 / d1 hold valid samples; store just those.
    vst1.32     {d0[0]}, [r1], r2       @ row 0: x0 | x1, then r1 += r2
    vst1.32     {d1[0]}, [r1]           @ row 1: x2 | x3
    bx          lr
466
467 @ end of idct_2x2_venum
468
469
@==========================================================================
@ FUNCTION     : idct_4x4_venum
@--------------------------------------------------------------------------
@ DESCRIPTION  : NEON inverse DCT of a 4x4 block.
@--------------------------------------------------------------------------
@ C PROTOTYPE  : void idct_4x4_venum(int16 * input,
@                                    int16 * output,
@                                    int32   stride)
@--------------------------------------------------------------------------
@ REG INPUT    : r0  pointer to input  (int16)
@                r1  pointer to output (int16)
@                r2  row stride in bytes
@--------------------------------------------------------------------------
@ REG OUTPUT   : none
@ MEM OUTPUT   : four rows of four 16-bit samples at r1, r1+r2, ...
@ REG AFFECTED : r0 and r1 (each advanced by 4 * r2), r3, r12,
@                q0 - q3, q8 - q15
@ STACK USAGE  : none
@--------------------------------------------------------------------------
@ NOTES        : NOTE(review): the input rows are fetched with the same
@                stride (r2) as the output rows - confirm that callers
@                lay the input out that way.
@
@ NEON REGISTER USE
@   q0 .. q3   : coefficient[0] .. coefficient[3] (q0 later holds +128)
@   q8, q9     : input y0..y15, later Z0..Z15, later the final samples
@   q10 .. q15 : intermediates (q10/q11 later hold the clamp bounds)
@==========================================================================
    .type   idct_4x4_venum, %function
idct_4x4_venum:

    @ ---- input: one 4-sample row per d register ---------------------
    vld1.16     {d16}, [r0], r2         @ d16 = y0  .. y3
    vld1.16     {d17}, [r0], r2         @ d17 = y4  .. y7
    vld1.16     {d18}, [r0], r2         @ d18 = y8  .. y11
    vld1.16     {d19}, [r0], r2         @ d19 = y12 .. y15

    @ ---- coefficient tables (r3 / r12 are reused for each pair) -----
    ldr         r3,  =coefficient+0*16
    ldr         r12, =coefficient+1*16
    vld1.16     {d0, d1}, [r3]          @ q0 = C4 C2 C4 C6 C4 C2 C4 C6
    vld1.16     {d2, d3}, [r12]         @ q1 = C4 C6 C4 C2 C4 C6 C4 C2

    ldr         r3,  =coefficient+2*16
    ldr         r12, =coefficient+3*16
    vld1.16     {d4, d5}, [r3]          @ q2 = C4 C4 C4 C4 C2 C2 C2 C2
    vld1.16     {d6, d7}, [r12]         @ q3 = C4 C4 C4 C4 C6 C6 C6 C6

    @ ---- horizontal pass --------------------------------------------
    @ vqrdmulh keeps the rounded high half of the doubled product, i.e.
    @ a right shift by 15, so a Q15 coefficient times an integer sample
    @ yields an integer result.
    vqrdmulh.s16    q10, q8, q0         @ rows 0,1 * (C4 C2 C4 C6 ...)
    vqrdmulh.s16    q11, q8, q1         @ rows 0,1 * (C4 C6 C4 C2 ...)
    vqrdmulh.s16    q12, q9, q0         @ rows 2,3 * (C4 C2 C4 C6 ...)
    vqrdmulh.s16    q13, q9, q1         @ rows 2,3 * (C4 C6 C4 C2 ...)

    vtrn.32     q10, q12                @ q10 = products of y(4r+0), y(4r+1)
                                        @ q12 = products of y(4r+2), y(4r+3)
    vtrn.32     q11, q13                @ same regrouping for the q1 set

    vqadd.s16   q14, q10, q12           @ S0  S2  S8  S10 S4  S6  S12 S14
                                        @ (C4*ya + C4*yc | C2*yb + C6*yd)
    vqsub.s16   q15, q11, q13           @ S1  S3  S9  S11 S5  S7  S13 S15
                                        @ (C4*ya - C4*yc | C6*yb - C2*yd)

    vtrn.16     q14, q15                @ q14 = S0 S1 S8  S9  S4 S5 S12 S13
                                        @ q15 = S2 S3 S10 S11 S6 S7 S14 S15

    vqadd.s16   q8, q14, q15            @ Z0 Z1 Z8  Z9  Z4 Z5 Z12 Z13
    vqsub.s16   q9, q14, q15            @ Z3 Z2 Z11 Z10 Z7 Z6 Z15 Z14
    vrev32.16   q9, q9                  @ Z2 Z3 Z10 Z11 Z6 Z7 Z14 Z15

    @ ---- vertical pass ----------------------------------------------
    vtrn.32     q8, q9                  @ q8 = Z0 .. Z7, q9 = Z8 .. Z15

    vqrdmulh.s16    q10, q8, q2         @ C4*Z0..3  | C2*Z4..7
    vqrdmulh.s16    q11, q8, q3         @ C4*Z0..3  | C6*Z4..7
    vqrdmulh.s16    q12, q9, q2         @ C4*Z8..11 | C2*Z12..15
    vqrdmulh.s16    q13, q9, q3         @ C4*Z8..11 | C6*Z12..15

    vqadd.s16   q14, q10, q13           @ C4*Z0..3 + C4*Z8..11 |
                                        @ C2*Z4..7 + C6*Z12..15
    vqsub.s16   q15, q11, q12           @ C4*Z0..3 - C4*Z8..11 |
                                        @ C6*Z4..7 - C2*Z12..15

    vswp        d29, d30                @ q14 = both C4 terms (sum | diff)
                                        @ q15 = both C2/C6 terms

    vqadd.s16   q8, q14, q15            @ d16 = output row 0, d17 = row 1
    vqsub.s16   q9, q14, q15            @ d18 = output row 3, d19 = row 2

    @ ---- offset and clamp -------------------------------------------
    vmov.i16    q0,  #128               @ +128 offset, every lane
    vmov.i16    q10, #0                 @ lower clamp bound
    vmov.i16    q11, #255               @ upper clamp bound

    vqadd.s16   q8, q8, q0
    vqadd.s16   q9, q9, q0

    vmax.s16    q8, q8, q10             @ clamp below at 0
    vmin.s16    q8, q8, q11             @ clamp above at 255
    vmax.s16    q9, q9, q10
    vmin.s16    q9, q9, q11

    @ ---- store; d19 goes before d18 because q9 holds rows 3 then 2 ---
    vst1.16     {d16}, [r1], r2         @ output row 0
    vst1.16     {d17}, [r1], r2         @ output row 1
    vst1.16     {d19}, [r1], r2         @ output row 2
    vst1.16     {d18}, [r1], r2         @ output row 3

    bx          lr
636
637 @ end of idct_4x4_venum
638
@==========================================================================
@ FUNCTION     : idct_8x8_venum
@--------------------------------------------------------------------------
@ DESCRIPTION  : NEON inverse DCT of one 8x8 block.
@--------------------------------------------------------------------------
@ C PROTOTYPE  : void idct_8x8_venum(int16 * input,
@                                    int16 * output,
@                                    int32   stride)
@--------------------------------------------------------------------------
@ REG INPUT    : r0  pointer to input  (int16), 64 coefficients
@                r1  pointer to output (int16)
@                r2  output row stride in bytes
@--------------------------------------------------------------------------
@ REG OUTPUT   : none
@ MEM OUTPUT   : eight rows of eight 16-bit samples, each in [0, 255]
@ REG AFFECTED : r0 (advanced by 128), r1 (advanced by 7 * r2),
@                q0 - q3, q8 - q15.
@                r5 - r9 and d8 - d15 are used but saved and restored.
@ STACK USAGE  : 84 bytes (5 core registers + 8 d registers)
@--------------------------------------------------------------------------
@ NOTES        : The original header states the routine was tested to be
@                IEEE 1180 compliant (and hence MPEG-4 compliant).
@
@ FLOW (one block per call, no loop):
@   1. IDCT_ENTRY                 r5..r9 -> constants[0..4]; load q0, q1
@   2. BIG_BODY_TRANSPOSE_INPUT   PART1: load rows, shift left 4,
@                                        load q2..q4
@                                 Transpose8x8
@                                 PART2: prescale, IDCT1D, transpose,
@                                        add bias, IDCT1D, shift right 6,
@                                        clamp, store
@
@ ARM REGISTER USE
@   r0 : input pointer          r5 : constants[0], later constants[5]
@   r1 : output pointer         r6 : constants[1]
@   r2 : output stride          r7 : constants[2]
@                               r8 : constants[3]
@                               r9 : constants[4]
@
@ NEON REGISTER USE
@   q0        : constants[0] (scalar lanes)
@   q1        : constants[1]
@   q2, q3    : constants[2], constants[3]; IDCT1D scratch
@   q4        : constants[4]; IDCT1D scratch; bias
@   q5 .. q7  : IDCT1D scratch (q6, q7 also hold the clamp bounds)
@   q8 .. q15 : matrix rows 0 .. 7, transformed in place
@==========================================================================
    .type   idct_8x8_venum, %function
idct_8x8_venum:

    stmfd       sp!, {r5-r9}            @ save the core registers used
                                        @ as constant-table pointers
    vstmdb      sp!, {d8-d15}           @ save q4..q7
    IDCT_ENTRY
    BIG_BODY_TRANSPOSE_INPUT
    vldmia      sp!, {d8-d15}           @ restore q4..q7
    ldmfd       sp!, {r5-r9}
    bx          lr
739 @ end of idct_8x8_venum
740
@==========================================================================
@ Constant tables for the IDCT kernels.
@ Nothing in this file writes to them, so they are placed in .rodata
@ (they previously ended up in writable .data).
@==========================================================================
    .section .rodata
    .align 5                            @ 32-byte alignment

@ Tables for idct_8x8_venum; each full row is 16 bytes.
@   [0]      scalar lanes read by IDCT1D (d0[0..3], d1[1]); the last two
@            halfwords are the bias pair replicated by PART2
@   [1]..[4] per-lane prescale vectors applied to the input rows
@   [5]      clamp bounds 0 and 255 (row is only 4 halfwords long)
constants:
    .hword  23170, 13573,  6518, 21895, -23170, -21895,  8223,  8224
    .hword  16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725
    .hword  22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521
    .hword  21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692
    .hword  19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722
    .hword      0,   255,     0,     0

@ Tables for idct_4x4_venum, Q15 format (annotated "/2" by the original
@ author).
coefficient:
    .hword  11585, 15137, 11585,  6270, 11585, 15137, 11585,  6270  @ C4 C2 C4 C6 C4 C2 C4 C6
    .hword  11585,  6270, 11585, 15137, 11585,  6270, 11585, 15137  @ C4 C6 C4 C2 C4 C6 C4 C2
    .hword  11585, 11585, 11585, 11585, 15137, 15137, 15137, 15137  @ C4 C4 C4 C4 C2 C2 C2 C2
    .hword  11585, 11585, 11585, 11585,  6270,  6270,  6270,  6270  @ C4 C4 C4 C4 C6 C6 C6 C6

.end