Blame - libjpegtwrp/asm/armv7/jdidct-armv7.S - android_bootable_recovery

blob: d61e219f738b000dc2cfa53b393644822b78b200 [file] [log] [blame]

Dees_Troy	51a0e82	2012-09-05 15:24:24 -0400	[diff] [blame]	1	/*=========================================================================
				2	* jdidct-armv7.s
				3	*
				4	* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
				5	*
				6	* Redistribution and use in source and binary forms, with or without
				7	* modification, are permitted provided that the following conditions are
				8	* met:
				9	* * Redistributions of source code must retain the above copyright
				10	* notice, this list of conditions and the following disclaimer.
				11	* * Redistributions in binary form must reproduce the above
				12	* copyright notice, this list of conditions and the following
				13	* disclaimer in the documentation and/or other materials provided
				14	* with the distribution.
				15	* * Neither the name of Code Aurora Forum, Inc. nor the names of its
				16	* contributors may be used to endorse or promote products derived
				17	* from this software without specific prior written permission.
				18	*
				19	* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
				20	* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
				21	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
				22	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
				23	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
				24	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
				25	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
				26	* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
				27	* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
				28	* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
				29	* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
				30	*==========================================================================
				31
				32	*==========================================================================
				33	* FUNCTION LIST
				34	*--------------------------------------------------------------------------
				35	* - idct_1x1_venum
				36	* - idct_2x2_venum
				37	* - idct_4x4_venum
				38	* - idct_8x8_venum
				39	*
				40	*==========================================================================
				41	*/
				42
				43	@==========================================================================
				44	@ MACRO DEFINITION
				45	@==========================================================================
				46	.macro Transpose8x8
				47	@==================================================================
				48	@ Transpose an 8 x 8 matrix of 16-bit elements in place
				49	@ Input: q8 to q15, one 8-element row per register
				50	@ Output: q8 to q15, holding the transposed matrix
				51	@ Registers used: q8 to q15 only (no scratch register, no memory access)
				52	@ Method: exchange 64-bit halves, then 32-bit pairs, then 16-bit elements
				53	@==================================================================
				54
				55	vswp d17, d24 @high half of q8 <-> low half of q12
				56	vswp d23, d30 @high half of q11 <-> low half of q15
				57	vswp d21, d28 @high half of q10 <-> low half of q14
				58	vswp d19, d26 @high half of q9 <-> low half of q13
				59
				60	vtrn.32 q8, q10 @transpose 32-bit pairs between registers two apart
				61	vtrn.32 q9, q11
				62	vtrn.32 q12, q14
				63	vtrn.32 q13, q15
				64
				65	vtrn.16 q8, q9 @transpose 16-bit elements between adjacent registers
				66	vtrn.16 q10, q11
				67	vtrn.16 q12, q13
				68	vtrn.16 q14, q15
				69	.endm
				70
				71	.macro IDCT1D
				72	@==================================================================
				73	@ One dimensional 64 element inverse DCT (four butterfly stages)
				74	@ Input: q8 to q15 loaded with data (vx0 to vx7 in the comments below)
				75	@ q0 loaded with constants: d0[0]=c4, d0[1]=a0, d0[2]=a1, d0[3]=a2, d1[1]=ma2
				76	@ Output: q8 to q15 (vy0 to vy7)
				77	@ Registers used: q0 (read only), q2 to q7 (scratch, overwritten), q8 to q15
				78	@ Assumptions: 16 bit data, first elements in least significant
				79	@ halfwords; all adds, subtracts and multiplies are the saturating forms
				80	@==================================================================
				81
				82	@1st stage
				83	vqrdmulh.s16 q4, q15, d0[2] @q4 = a1*vx7
				84	vqrdmulh.s16 q5, q9, d0[2] @q5 = a1*vx1
				85	vqrdmulh.s16 q6, q13, d0[3] @q6 = a2*vx5
				86	vqrdmulh.s16 q7, q11, d1[1] @q7 = ma2*vx3
				87	vqrdmulh.s16 q2, q14, d0[1] @q2 = a0*vx6
				88	vqrdmulh.s16 q3, q10, d0[1] @q3 = a0*vx2
				89	vqadd.s16 q9, q4, q9 @q9 = t1 = a1*vx7 + vx1
				90	vqsub.s16 q5, q5, q15 @q5 = t8 = a1*vx1 - vx7
				91	vqadd.s16 q15, q6, q11 @q15 = t7 = a2*vx5 + vx3
				92	vqadd.s16 q11, q7, q13 @q11 = t3 = ma2*vx3 + vx5
				93
				94	@2nd stage
				95	vqadd.s16 q13, q8, q12 @q13 = t5 = vx0 + vx4
				96	vqsub.s16 q8, q8, q12 @q8 = t0 = vx0 - vx4
				97	vqadd.s16 q10, q2, q10 @q10 = t2 = a0*vx6 + vx2
				98	vqsub.s16 q12, q3, q14 @q12 = t4 = a0*vx2 - vx6
				99	vqadd.s16 q14, q5, q11 @q14 = t6 = t8 + t3
				100	vqsub.s16 q11, q5, q11 @q11 = t3 = t8 - t3
				101	vqsub.s16 q5, q9, q15 @q5 = t8 = t1 - t7
				102	vqadd.s16 q9, q9, q15 @q9 = t1 = t1 + t7
				103
				104	@3rd stage
				105	vqadd.s16 q15, q13, q10 @q15 = t7 = t5 + t2
				106	vqsub.s16 q10, q13, q10 @q10 = t2 = t5 - t2
				107	vqadd.s16 q13, q8, q12 @q13 = t5 = t0 + t4
				108	vqsub.s16 q7, q8, q12 @q7 = t0 = t0 - t4 (kept in q7 because q8 is reused for vy0)
				109	vqsub.s16 q12, q5, q11 @q12 = t4 = t8 - t3
				110	vqadd.s16 q11, q5, q11 @q11 = t3 = t8 + t3
				111
				112	@4th stage
				113	vqadd.s16 q8, q15, q9 @q8 = vy0 = t7 + t1
				114	vqsub.s16 q15, q15, q9 @q15 = vy7 = t7 - t1
				115	vqrdmulh.s16 q6, q12, d0[0] @q6 = c4*t4
				116	vqrdmulh.s16 q4, q11, d0[0] @q4 = c4*t3
				117	vqsub.s16 q12, q10, q14 @q12 = vy4 = t2 - t6
				118	vqadd.s16 q11, q10, q14 @q11 = vy3 = t2 + t6
				119	vqadd.s16 q10, q7, q6 @q10 = vy2 = t0 + c4*t4
				120	vqsub.s16 q14, q13, q4 @q14 = vy6 = t5 - c4*t3
				121	vqadd.s16 q9, q13, q4 @q9 = vy1 = t5 + c4*t3
				122	vqsub.s16 q13, q7, q6 @q13 = vy5 = t0 - c4*t4
				123	.endm
				124
				125	.macro PART1
				126	@==================================================================
				127	@ Load the 8 input rows from [r0] (r0 post-incremented) and shift each left by 4; expects r7-r9 = addresses of constants[2..4]
				128	@==================================================================
				129	vld1.16 {d16, d17},[r0]! @q8 =row0
				130	vqshl.s16 q8, q8, #4 @saturating shift left by 4. Original author's question: input data too big?!!
				131	@(original note) Maximum MPEG input is 2047/-2048; such a value shifted left by 4 still fits in 16 bits.
				132	vld1.16 {d18, d19},[r0]! @q9 =row1
				133	vqshl.s16 q9, q9, #4 @(original note) Shift 1 instead of 4
				134
				135	vld1.16 {d20, d21},[r0]! @q10=row2
				136	vqshl.s16 q10, q10, #4
				137
				138	vld1.16 {d22, d23},[r0]! @q11=row3
				139	vqshl.s16 q11, q11, #4
				140
				141	vld1.16 {d24, d25},[r0]! @q12=row4
				142	vqshl.s16 q12, q12, #4
				143
				144	vld1.16 {d26, d27},[r0]! @q13=row5
				145	vqshl.s16 q13, q13, #4
				146	vld1.16 {d28, d29},[r0]! @q14=row6
				147	vqshl.s16 q14, q14, #4
				148	vld1.16 {d30, d31},[r0]! @q15=row7
				149	vqshl.s16 q15, q15, #4
				150
				151	@==================================================================
				152	@ Refresh the constants that IDCT1D overwrites (it uses q2-q4 as scratch)
				153	@==================================================================
				154	vld1.16 {d4, d5},[r7] @q2 =constants[2]
				155	vld1.16 {d6, d7},[r8] @q3 =constants[3]
				156	vld1.16 {d8, d9},[r9] @q4 =constants[4]
				157	.endm
				158
				159	.macro PART2
				160	@==================================================================
				161	@ Prescale the input. Expects q8-q15 = matrix, q0-q4 = constants[0..4], r1 = output pointer, r2 = output stride in bytes; clobbers r5 and q2-q7
				162	@==================================================================
				163	vqrdmulh.s16 q12, q12, q1 @q12=row4 * constants[1] = vx4
				164	vqrdmulh.s16 q15, q15, q2 @q15=row7 * constants[2] = vx7
				165	vqrdmulh.s16 q9, q9, q2 @q9 =row1 * constants[2] = vx1
				166	vqrdmulh.s16 q13, q13, q4 @q13=row5 * constants[4] = vx5
				167	vqrdmulh.s16 q11, q11, q4 @q11=row3 * constants[4] = vx3
				168	vqrdmulh.s16 q14, q14, q3 @q14=row6 * constants[3] = vx6
				169	vqrdmulh.s16 q10, q10, q3 @q10=row2 * constants[3] = vx2
				170	vqrdmulh.s16 q8, q8, q1 @q8 =row0 * constants[1] = vx0
				171
				172	@==================================================================
				173	@ At this point, the input 8x8 x 16 bit coefficients are
				174	@ transposed, prescaled, and loaded in q8 to q15
				175	@ q0 loaded with scalar constants
				176	@ Perform 1D IDCT
				177	@==================================================================
				178	IDCT1D @perform 1d idct (first pass); overwrites q2-q7
				179
				180	@==================================================================
				181	@ Transpose the intermediate results to get ready for the second
				182	@ 1D transformation (same sequence as the Transpose8x8 macro)
				183	@==================================================================
				184	vswp d17, d24 @q8, q12
				185	vswp d23, d30 @q11, q15
				186	vswp d21, d28 @q10, q14
				187	vswp d19, d26 @q9, q13
				188
				189	@==================================================================
				190	@ Load the bias
				191	@==================================================================
				192	vdup.32 q4, d1[1] @replicate the upper 32 bits of d1 into q4; a cycle is saved by loading
				193	@the bias at this point
				194
				195	@==================================================================
				196	@ Finish the transposition
				197	@==================================================================
				198	vtrn.32 q8, q10
				199	vtrn.32 q9, q11
				200	vtrn.32 q12, q14
				201	vtrn.32 q13, q15
				202	vtrn.16 q8, q9
				203	vtrn.16 q10, q11
				204	vtrn.16 q12, q13
				205	vtrn.16 q14, q15
				206
				207	@==================================================================
				208	@ Add bias (to q8 only)
				209	@==================================================================
				210	vqadd.s16 q8, q8, q4
				211
				212	@==================================================================
				213	@ IDCT 2nd half
				214	@==================================================================
				215	IDCT1D @perform 1d idct (second pass)
				216
				217	@==================================================================
				218	@ Scale and clamp the output to correct range and save to memory
				219	@ 1. scale to 8 bits by arithmetic right shift 6
				220	@ 2. clamp output to [0, 255] by max/min
				221	@ 3. use multiple stores. Each store will save one row of output.
				222	@ The st queue size is 4, so do no more than 4 stores in sequence.
				223	@==================================================================
				224	ldr r5, =constants+5*16 @r5 = address of constants[5] (r5 is overwritten here)
				225	vld1.16 d10, [r5] @load clamping parameters
				226	vdup.s16 q6, d10[0] @q6 = first clamp halfword in every lane (0 in the constants table)
				227	vdup.s16 q7, d10[1] @q7 = second clamp halfword in every lane (255 in the constants table)
				228
				229	@Save the results
				230	vshr.s16 q8, q8, #6 @q8 = vy0
				231	vmax.s16 q8, q8, q6 @clamp to at least q6
				232	vmin.s16 q8, q8, q7 @clamp to at most q7
				233
				234	vshr.s16 q9, q9, #6 @q9 = vy1
				235	vmax.s16 q9, q9, q6 @clamp to at least q6
				236	vmin.s16 q9, q9, q7 @clamp to at most q7
				237
				238	vshr.s16 q10, q10, #6 @q10 = vy2
				239	vmax.s16 q10, q10, q6 @clamp to at least q6
				240	vmin.s16 q10, q10, q7 @clamp to at most q7
				241
				242	vshr.s16 q11, q11, #6 @q11 = vy3
				243	vmax.s16 q11, q11, q6 @clamp to at least q6
				244	vmin.s16 q11, q11, q7 @clamp to at most q7
				245
				246	vst1.16 {d16, d17},[r1],r2 @q8 =row0, then r1 += r2
				247	vst1.16 {d18, d19},[r1],r2 @q9 =row1
				248	vst1.16 {d20, d21},[r1],r2 @q10=row2
				249	vst1.16 {d22, d23},[r1],r2 @q11=row3
				250
				251	vshr.s16 q12, q12, #6 @q12 = vy4
				252	vmax.s16 q12, q12, q6 @clamp to at least q6
				253	vmin.s16 q12, q12, q7 @clamp to at most q7
				254
				255	vshr.s16 q13, q13, #6 @q13 = vy5
				256	vmax.s16 q13, q13, q6 @clamp to at least q6
				257	vmin.s16 q13, q13, q7 @clamp to at most q7
				258
				259	vshr.s16 q14, q14, #6 @q14 = vy6
				260	vmax.s16 q14, q14, q6 @clamp to at least q6
				261	vmin.s16 q14, q14, q7 @clamp to at most q7
				262
				263	vshr.s16 q15, q15, #6 @q15 = vy7
				264	vmax.s16 q15, q15, q6 @clamp to at least q6
				265	vmin.s16 q15, q15, q7 @clamp to at most q7
				266
				267	vst1.16 {d24, d25},[r1],r2 @q12=row4
				268	vst1.16 {d26, d27},[r1],r2 @q13=row5
				269	vst1.16 {d28, d29},[r1],r2 @q14=row6
				270	vst1.16 {d30, d31},[r1] @q15=row7 (no post-increment: r1 is left pointing at row 7)
				271	.endm
				272
				273	.macro BIG_BODY_TRANSPOSE_INPUT
				274	@==================================================================
				275	@ Main body of idct: load, transpose the input, then transform and store
				276	@==================================================================
				277	PART1 @load and shift the 8 input rows, reload q2-q4
				278	Transpose8x8 @transpose the input held in q8-q15
				279	PART2 @prescale, 2 x IDCT1D, scale, clamp and store
				280	.endm
				281
				282	.macro IDCT_ENTRY
				283	@==================================================================
				284	@ Load the locations of the constants (r5-r9 are overwritten)
				285	@==================================================================
				286	ldr r5, =constants+0*16 @r5 = address of constants[0]
				287	ldr r6, =constants+1*16 @r6 = address of constants[1]
				288	ldr r7, =constants+2*16 @r7 = address of constants[2]
				289	ldr r8, =constants+3*16 @r8 = address of constants[3]
				290	ldr r9, =constants+4*16 @r9 = address of constants[4]
				291
				292	@==================================================================
				293	@ Load the coefficients
				294	@ Only q0 and q1 are loaded here due to register constraints; PART1 loads q2-q4 from r7-r9
				295	@==================================================================
				296	vld1.16 {d0, d1},[r5] @q0 =constants[0] (scalars)
				297	vld1.16 {d2, d3},[r6] @q1 =constants[1]
				298	.endm
				299	@==========================================================================
				300	@ END of MACRO DEFINITION
				301	@==========================================================================
				302
				303
				304	.section idct_func, "x" @ ARE: executable section named idct_func
				305	.text @ CODE, READONLY. NOTE(review): this switches to .text, so idct_func appears to stay empty - confirm intent
				306	.align 2 @ NOTE(review): presumably 2^2 = 4-byte alignment on ARM gas - confirm
				307	.code 32 @ CODE32: assemble as ARM (not Thumb) instructions
				308
				309	@==========================================================================
				310	@ Main Routine: the four exported entry points
				311	@==========================================================================
				312
				313	.global idct_1x1_venum
				314	.global idct_2x2_venum
				315	.global idct_4x4_venum
				316	.global idct_8x8_venum
				317
				318	@==========================================================================
				319	@ FUNCTION : idct_1x1_venum
				320	@--------------------------------------------------------------------------
				321	@ DESCRIPTION : ARM optimization of one 1x1 block iDCT
				322	@--------------------------------------------------------------------------
				323	@ C PROTOTYPE : void idct_1x1_venum(int16 * input,
				324	@ int16 * output,
				325	@ int32 stride)
				326	@--------------------------------------------------------------------------
				327	@ REG INPUT : R0 pointer to input (int16), only input[0] is read
				328	@ R1 pointer to output (int16), only output[0] is written
				329	@ R2 block stride (ignored; the register is reused as scratch)
				330	@--------------------------------------------------------------------------
				331	@ STACK ARG : None
				332	@--------------------------------------------------------------------------
				333	@ MEM INPUT : None
				334	@--------------------------------------------------------------------------
				335	@ REG OUTPUT : None
				336	@--------------------------------------------------------------------------
				337	@ MEM OUTPUT : output[0] = clamp((input[0] + 1028) >> 3, 0, 255)
				338	@--------------------------------------------------------------------------
				339	@ REG AFFECTED : R2, R3, condition flags
				340	@--------------------------------------------------------------------------
				341	@ STACK USAGE : none
				342	@--------------------------------------------------------------------------
				343	@ CYCLES : 17 cycles
				344	@--------------------------------------------------------------------------
				345	@ NOTES :
				346	@ This idct_1x1_venum code was developed with ARM instruction set.
				347	@ The result is stored with strh (16 bits) so output[1] is not overwritten.
				348	@ ARM REGISTER ALLOCATION
				349	@ =========================================================================
				350	@ r0 : pointer to input data
				351	@ r1 : pointer to output area
				352	@ r2 : stride on entry (unused), then rounding/offset constant and result
				353	@==========================================================================
				354	.type idct_1x1_venum, %function
				355	idct_1x1_venum:
				356
				357	ldrsh r3, [r0] @ Load signed half word (int16)
				358	ldr r2, =1028 @ 1028 = 4 + 128*8
				359	@ 4 for rounding, 128 for offset (both pre-multiplied for the shift by 3)
				360	add r2, r3, r2
				361	asrs r2, r2, #3 @ Divide by 8, and set status bit
				362	movmi r2, #0 @ Negative result: clamp to 0
				363	cmp r2, #255
				364	movgt r2, #255 @ Clamp to be at most 255
				365	strh r2, [r1] @ Save output as one int16 (was a 32-bit str that also wrote output[1])
				366	bx lr @ Return to caller
				367
				368	@ end of idct_1x1_venum
				369
				370
				371	@==========================================================================
				372	@ FUNCTION : idct_2x2_venum
				373	@--------------------------------------------------------------------------
				374	@ DESCRIPTION : VeNum optimization of one 2x2 block iDCT
				375	@--------------------------------------------------------------------------
				376	@ C PROTOTYPE : void idct_2x2_venum(int16 * input,
				377	@ int16 * output,
				378	@ int32 stride)
				379	@--------------------------------------------------------------------------
				380	@ REG INPUT : R0 pointer to input (int16)
				381	@ R1 pointer to output (int16)
				382	@ R2 block stride (added to R1 as a byte offset between the two output rows)
				383	@--------------------------------------------------------------------------
				384	@ STACK ARG : None
				385	@--------------------------------------------------------------------------
				386	@ MEM INPUT : None
				387	@--------------------------------------------------------------------------
				388	@ REG OUTPUT : None
				389	@--------------------------------------------------------------------------
				390	@ MEM OUTPUT : 8 bytes at output and 8 bytes at output + stride
				391	@--------------------------------------------------------------------------
				392	@ REG AFFECTED : R1 (advanced by the stride); NEON q0, q8 - q13
				393	@--------------------------------------------------------------------------
				394	@ STACK USAGE : none
				395	@--------------------------------------------------------------------------
				396	@ CYCLES : 27 cycles
				397	@--------------------------------------------------------------------------
				398	@ NOTES : Output buffer must be an 8x8 16-bit buffer
				399	@ (each row store writes 4 halfwords although only 2 are results)
				400	@ ARM REGISTER ALLOCATION
				401	@ ==========================================
				402	@ r0 : pointer to input data
				403	@ r1 : pointer to output area
				404	@ r2 : stride in the output buffer
				405	@ -------------------------------------------
				406	@
				407	@ VENUM REGISTER ALLOCATION
				408	@ =================================================
				409	@ q0 : output x0 - x3
				410	@ q1 : not used
				411	@ q2 : not used
				412	@ q3 : not used
				413	@ q4 : not used
				414	@ q5 : not used
				415	@ q6 : not used
				416	@ q7 : not used
				417	@ q8 : input y0 - y3
				418	@ q9 : load destination (d18, d19), then intermediate value
				419	@ q10 : intermediate value
				420	@ q11 : offset value
				421	@ q12 : lower clamp value (0)
				422	@ q13 : upper clamp value (255)
				423	@ q14 : not used
				424	@ q15 : not used
				425	@==========================================================================
				426	.type idct_2x2_venum, %function
				427	idct_2x2_venum:
				428
				429	vld4.32 {d16, d17, d18, d19}, [r0] @ de-interleaving load of eight 32-bit words: d16 gets words 0 and 4
				430	@ d16: y0 | y1 | y2 | y3 (LSB | MSB); assumes input rows are 8 halfwords apart - TODO confirm
				431
				432	vtrn.32 d16, d17 @ d16: y0 | y1 | X | X
				433	@ d17: y2 | y3 | X | X
				434
				435	vqadd.s16 d18, d16, d17 @ d18: y0+y2 | y1+y3 | X | X q: saturated
				436	vqsub.s16 d19, d16, d17 @ d19: y0-y2 | y1-y3 | X | X q: saturated
				437
				438	vtrn.16 d18, d19 @ d18: y0+y2 | y0-y2 | X | X
				439	@ d19: y1+y3 | y1-y3 | X | X
				440
				441	vqadd.s16 d20, d18, d19 @ d20: (y0+y2)+(y1+y3) | (y0-y2)+(y1-y3)
				442	@ x0 | x2 | X | X
				443	vqsub.s16 d21, d18, d19 @ d21: (y0+y2)-(y1+y3) | (y0-y2)-(y1-y3)
				444	@ x1 | x3 | X | X
				445
				446	vtrn.16 d20, d21 @ d20: x0 | x1 | X | X
				447	@ d21: x2 | x3 | X | X
				448
				449	vrshr.s16 q10, q10, #3 @ Divide by 8 with rounding
				450
				451	vmov.i16 q11, #128 @ q11 = 128|128|128|128|128|128|128|128
				452	vqadd.s16 q0, q10, q11 @ Add the 128 offset before clamping to [0,255]
				453
				454	vmov.i16 q12, #0 @ q12 = 0 in every 16-bit lane
				455	vmov.i16 q13, #255 @ q13 = 255 (0x00FF) in every 16-bit lane
				456
				457	vmax.s16 q0, q0, q12 @ Clamp to at least 0
				458	vmin.s16 q0, q0, q13 @ Clamp to at most 255
				459
				460	vstr d0, [r1] @ Store x0 | x1 | X | X (8 bytes)
				461	@ Potential out of boundary issue: the two X halfwords are written too
				462	add r1, r1, r2 @ Add the offset to the output pointer
				463	vstr d1, [r1] @ Store x2 | x3 | X | X (8 bytes)
				464	@ Potential out of boundary issue: the two X halfwords are written too
				465	bx lr @ Return to caller
				466
				467	@ end of idct_2x2_venum
				468
				469
				470	@==========================================================================
				471	@ FUNCTION : idct_4x4_venum
				472	@--------------------------------------------------------------------------
				473	@ DESCRIPTION : VeNum optimization of one 4x4 block iDCT
				474	@--------------------------------------------------------------------------
				475	@ C PROTOTYPE : void idct_4x4_venum(int16 * input,
				476	@ int16 * output,
				477	@ int32 stride)
				478	@--------------------------------------------------------------------------
				479	@ REG INPUT : R0 pointer to input (int16)
				480	@ R1 pointer to output (int16)
				481	@ R2 block stride (used for both the input rows and the output rows)
				482	@--------------------------------------------------------------------------
				483	@ STACK ARG : None
				484	@--------------------------------------------------------------------------
				485	@ MEM INPUT : None
				486	@--------------------------------------------------------------------------
				487	@ REG OUTPUT : None
				488	@--------------------------------------------------------------------------
				489	@ MEM OUTPUT : four 8-byte rows, written stride bytes apart starting at output
				490	@--------------------------------------------------------------------------
				491	@ REG AFFECTED : R0, R1 (each advanced by 4 * stride), R3, R12
				492	@--------------------------------------------------------------------------
				493	@ STACK USAGE : none
				494	@--------------------------------------------------------------------------
				495	@ CYCLES : 56 cycles
				496	@--------------------------------------------------------------------------
				497	@ NOTES : only q0-q3 and q8-q15 are written
				498	@
				499	@ ARM REGISTER ALLOCATION
				500	@ ==========================================
				501	@ r0 : pointer to input data
				502	@ r1 : pointer to output area
				503	@ r2 : stride in the output buffer
				504	@ r3 : pointer to the coefficient set
				505	@ r12 : pointer to the coefficient set
				506	@ -------------------------------------------
				507	@
				508	@ VENUM REGISTER ALLOCATION
				509	@ =================================================
				510	@ q0 : coefficients[0], later the 128 offset
				511	@ q1 : coefficients[1]
				512	@ q2 : coefficients[2]
				513	@ q3 : coefficients[3]
				514	@ q4 : not used
				515	@ q5 : not used
				516	@ q6 : not used
				517	@ q7 : not used
				518	@ q8 : input y0 - y7
				519	@ q9 : input y8 - y15
				520	@ q10 : intermediate value
				521	@ q11 : intermediate value
				522	@ q12 : intermediate value
				523	@ q13 : intermediate value
				524	@ q14 : intermediate value
				525	@ q15 : intermediate value
				526	@==========================================================================
				527	.type idct_4x4_venum, %function
				528	idct_4x4_venum:
				529
				530	@ Load the locations of the first 2 sets of coefficients
				531	ldr r3, =coefficient+0*16 @ coefficient[0]
				532	ldr r12, =coefficient+1*16 @ coefficient[1]
				533
				534	@ Load the first 2 sets of coefficients
				535	vld1.16 {d0, d1},[r3] @ q0 = C4 | C2 | C4 | C6 | C4 | C2 | C4 | C6
				536	vld1.16 {d2, d3},[r12] @ q1 = C4 | C6 | C4 | C2 | C4 | C6 | C4 | C2
				537
				538	@ Load the locations of the second 2 sets of coefficients
				539	ldr r3, =coefficient+2*16 @ coefficient[2]
				540	ldr r12, =coefficient+3*16 @ coefficient[3]
				541
				542	@ Load the second 2 sets of coefficients
				543	vld1.16 {d4, d5},[r3] @ q2 = C4 | C4 | C4 | C4 | C2 | C2 | C2 | C2
				544	vld1.16 {d6, d7},[r12] @ q3 = C4 | C4 | C4 | C4 | C6 | C6 | C6 | C6
				545
				546	@ Load the input values (4 halfwords per row, r0 += r2 after each row)
				547	vld1.16 {d16}, [r0], r2 @ d16: y0 | y1 | y2 | y3 (LSB | MSB)
				548	vld1.16 {d17}, [r0], r2 @ d17: y4 | y5 | y6 | y7 (LSB | MSB)
				549	vld1.16 {d18}, [r0], r2 @ d18: y8 | y9 | y10 | y11 (LSB | MSB)
				550	vld1.16 {d19}, [r0], r2 @ d19: y12 | y13 | y14 | y15 (LSB | MSB)
				551
				552	@ Apply iDCT Horizontally
				553
				554	@ q8: y0 |y1 |y2 |y3 |y4 |y5 |y6 |y7
				555	@ q9: y8 |y9 |y10|y11|y12|y13|y14|y15
				556
				557	@======================================================================
				558	@ vqrdmulh doubles the result and saves the high 16 bits of the result,
				559	@ this is equivalent to right shift by 15 bits.
				560	@ since coefficients are in Q15 format, the right shift by 15 cancels
				561	@ that scaling, so the final result is in Q0 format
				562	@
				563	@ vqrdmulh will also round the result
				564	@======================================================================
				565
				566	vqrdmulh.s16 q10, q8, q0 @ q10: C4y0 | C2y1 | C4y2 | C6y3 | C4y4 | C2y5 | C4y6 | C6y7
				567	vqrdmulh.s16 q11, q8, q1 @ q11: C4y0 | C6y1 | C4y2 | C2y3 | C4y4 | C6y5 | C4y6 | C2y7
				568
				569	vqrdmulh.s16 q12, q9, q0 @ q12: C4y8 | C2y9 | C4y10 | C6y11 | C4y12 | C2y13 | C4y14 | C6y15
				570	vqrdmulh.s16 q13, q9, q1 @ q13: C4y8 | C6y9 | C4y10 | C2y11 | C4y12 | C6y13 | C4y14 | C2y15
				571
				572	vtrn.32 q10, q12 @ q10: C4y0 | C2y1 | C4y8 | C2y9 | C4y4 | C2y5 | C4y12 | C2y13
				573	@ q12: C4y2 | C6y3 | C4y10 | C6y11 | C4y6 | C6y7 | C4y14 | C6y15
				574
				575	vtrn.32 q11, q13 @ q11: C4y0 | C6y1 | C4y8 | C6y9 | C4y4 | C6y5 | C4y12 | C6y13
				576	@ q13: C4y2 | C2y3 | C4y10 | C2y11 | C4y6 | C2y7 | C4y14 | C2y15
				577
				578	vqadd.s16 q14, q10, q12 @ q14: C4y0 + C4y2 | C2y1 + C6y3 | C4y8 + C4y10 | C2y9 + C6y11 | C4y4 + C4y6 | C2y5 + C6y7 | C4y12 + C4y14 | C2y13 + C6y15
				579	@ S0 | S2 | S8 | S10 | S4 | S6 | S12 | S14
				580
				581	vqsub.s16 q15, q11, q13 @ q15: C4y0 - C4y2 | C6y1 - C2y3 | C4y8 - C4y10 | C6y9 - C2y11 | C4y4 - C4y6 | C6y5 - C2y7 | C4y12 - C4y14 | C6y13 - C2y15
				582	@ S1 | S3 | S9 | S11 | S5 | S7 | S13 | S15
				583
				584	vtrn.16 q14, q15 @ q14: S0 | S1 | S8 | S9 | S4 | S5 | S12 | S13
				585	@ q15: S2 | S3 | S10 | S11 | S6 | S7 | S14 | S15
				586
				587	vqadd.s16 q8, q14, q15 @ q8: Z0 | Z1 | Z8 | Z9 | Z4 | Z5 | Z12 | Z13
				588	vqsub.s16 q9, q14, q15 @ q9: Z3 | Z2 | Z11 | Z10 | Z7 | Z6 | Z15 | Z14
				589	vrev32.16 q9, q9 @ q9: Z2 | Z3 | Z10 | Z11 | Z6 | Z7 | Z14 | Z15
				590
				591
				592	@ Apply iDCT Vertically
				593
				594	vtrn.32 q8, q9 @ q8: Z0 | Z1 | Z2 | Z3 | Z4 | Z5 | Z6 | Z7
				595	@ q9: Z8 | Z9 | Z10 | Z11 | Z12 | Z13 | Z14 | Z15
				596
				597
				598	vqrdmulh.s16 q10, q8, q2 @ q10: C4Z0 | C4Z1 | C4Z2 | C4Z3 | C2Z4 | C2Z5 | C2Z6 | C2Z7
				599	vqrdmulh.s16 q11, q8, q3 @ q11: C4Z0 | C4Z1 | C4Z2 | C4Z3 | C6Z4 | C6Z5 | C6Z6 | C6Z7
				600
				601	vqrdmulh.s16 q12, q9, q2 @ q12: C4Z8 | C4Z9 | C4Z10 | C4Z11 | C2Z12 | C2Z13 | C2Z14 | C2Z15
				602	vqrdmulh.s16 q13, q9, q3 @ q13: C4Z8 | C4Z9 | C4Z10 | C4Z11 | C6Z12 | C6Z13 | C6Z14 | C6Z15
				603
				604	vqadd.s16 q14, q10, q13 @ q14: C4Z0+C4Z8 | C4Z1+C4Z9 | C4Z2+C4Z10 | C4Z3+C4Z11 | C2Z4+C6Z12 | C2Z5+C6Z13 | C2Z6+C6Z14 | C2Z7+C6Z15
				605	@ s0 | s4 | s8 | s12 | s2 | s6 | s10 | s14
				606
				607	vqsub.s16 q15, q11, q12 @ q15: C4Z0-C4Z8 | C4Z1-C4Z9 | C4Z2-C4Z10 | C4Z3-C4Z11 | C6Z4-C2Z12 | C6Z5-C2Z13 | C6Z6-C2Z14 | C6Z7-C2Z15
				608	@ s1 | s5 | s9 | s13 | s3 | s7 | s11 | s15
				609
				610	vswp d29, d30 @ q14: s0 | s4 | s8 | s12 | s1 | s5 | s9 | s13
				611	@ q15: s2 | s6 | s10 | s14 | s3 | s7 | s11 | s15
				612
				613	vqadd.s16 q8, q14, q15 @ q8: x0 | x4 | x8 | x12 | x1 | x5 | x9 | x13
				614	vqsub.s16 q9, q14, q15 @ q9: x3 | x7 | x11 | x15 | x2 | x6 | x10 | x14
				615
				616	vmov.i16 q10, #0 @ q10 = 0 in every 16-bit lane
				617	vmov.i16 q11, #255 @ q11 = 255 (0x00FF) in every 16-bit lane
				618
				619	vmov.i16 q0, #128 @ q0 = 128|128|128|128|128|128|128|128
				620
				621	vqadd.s16 q8, q8, q0 @ Add the offset
				622	vqadd.s16 q9, q9, q0 @ Add the offset
				623
				624	vmax.s16 q8, q8, q10 @ clamp to at least 0
				625	vmin.s16 q8, q8, q11 @ clamp to at most 255
				626
				627	vmax.s16 q9, q9, q10 @ clamp to at least 0
				628	vmin.s16 q9, q9, q11 @ clamp to at most 255
				629
				630	vst1.16 {d16}, [r1], r2 @ d16: x0 | x4 | x8 | x12 per the lane labels above (LSB | MSB)
				631	vst1.16 {d17}, [r1], r2 @ d17: x1 | x5 | x9 | x13 per the lane labels above (LSB | MSB)
				632	vst1.16 {d19}, [r1], r2 @ d19: x2 | x6 | x10 | x14; d19 is stored before d18 because q9 holds the x3 group in its low half
				633	vst1.16 {d18}, [r1], r2 @ d18: x3 | x7 | x11 | x15 per the lane labels above (LSB | MSB)
				634
				635	bx lr @ Return to caller
				636
				637	@ end of idct_4x4_venum
				638
				639	@==========================================================================
				640	@ FUNCTION : idct_8x8_venum
				641	@--------------------------------------------------------------------------
				642	@ DESCRIPTION : VeNum optimization of one 8x8 block iDCT
				643	@--------------------------------------------------------------------------
				644	@ C PROTOTYPE : void idct_8x8_venum(int16 * input,
				645	@ int16 * output,
				646	@ int32 stride)
				647	@--------------------------------------------------------------------------
				648	@ REG INPUT : R0 pointer to input (int16)
				649	@ R1 pointer to output (int16)
				650	@ R2 block stride
				651	@--------------------------------------------------------------------------
				652	@ STACK ARG : None
				653	@--------------------------------------------------------------------------
				654	@ MEM INPUT : None
				655	@--------------------------------------------------------------------------
				656	@ REG OUTPUT : None
				657	@--------------------------------------------------------------------------
				658	@ MEM OUTPUT : eight 16-byte rows, written stride bytes apart starting at output
				659	@--------------------------------------------------------------------------
				660	@ REG AFFECTED : R0, R1 (advanced); R5 - R9 and d8 - d15 are used but saved and restored
				661	@--------------------------------------------------------------------------
				662	@ STACK USAGE : 84 bytes (r5-r9 = 20 bytes, d8-d15 = 64 bytes)
				663	@--------------------------------------------------------------------------
				664	@ CYCLES : 177 cycles
				665	@--------------------------------------------------------------------------
				666	@ NOTES :
				667	@
				668	@ It was tested to be IEEE 1180 compliant. Since IEEE 1180 compliance is more stringent
				669	@ than MPEG-4 compliance, this version is also MPEG-4 compliant.
				670	@
				671	@ CODE STRUCTURE:
				672	@ (i) Macro for transposing an 8x8 matrix is defined (Transpose8x8)
				673	@ (ii) Macro for IDCT in one dimension is defined as four stages (IDCT1D)
				674	@ (iii) The two dimensional code begins (PART1, PART2)
				675	@ (iv) constants are defined in the data area at the end of the file
				676	@
				677	@ PROGRAM FLOW:
				678	@
				679	@ r5-r9 and d8-d15 are saved on the stack
				680	@ The locations of the constants are loaded into r5-r9
				681	@ constants[0] and constants[1] are loaded into q0 and q1
				682	@ (no loop or loop counter is present in this listing: one block per call)
				683	@ The input matrix is loaded and shifted left by 4
				684	@ constants[2] to constants[4] are loaded into q2 to q4
				685	@ The matrix is transposed
				686	@ The input is prescaled using the constants
				687	@ IDCT is performed in one dimension (first pass)
				688	@ The matrix is transposed; a bias is loaded and added to q8
				689	@ IDCT is performed in one dimension (second pass)
				690	@ The matrix is post-scaled (shift right by 6) and clamped
				691	@ The matrix is saved
				692	@ The saved registers are restored
				693	@ return
				694	@
				695	@
				696	@ ARM REGISTER ALLOCATION
				697	@ ==========================================
				698	@ r0 : pointer to input data
				699	@ r1 : pointer to output area
				700	@ r2 : stride in the output buffer
				701	@ r3 :
				702	@ r4 :
				703	@ r5 : pointer to constants[0], later to constants[5]
				704	@ r6 : pointer to constants[1]
				705	@ r7 : pointer to constants[2]
				706	@ r8 : pointer to constants[3]
				707	@ r9 : pointer to constants[4]
				708	@ -------------------------------------------
				709	@
				710	@ VENUM REGISTER ALLOCATION
				711	@ =================================================
				712	@ q0 : constants[0]
				713	@ q1 : constants[1]
				714	@ q2 : constants[2], IDCT1D in-place scratch
				715	@ q3 : constants[3], IDCT1D in-place scratch
				716	@ q4 : constants[4], IDCT1D in-place scratch, and bias compensation
				717	@ q5 : IDCT1D in-place scratch
				718	@ q6 : IDCT1D in-place scratch
				719	@ q7 : IDCT1D in-place scratch
				720	@ q8 : Matrix[0] IDCT1D in-place scratch
				721	@ q9 : Matrix[1] IDCT1D in-place scratch
				722	@ q10 : Matrix[2] IDCT1D in-place scratch
				723	@ q11 : Matrix[3] IDCT1D in-place scratch
				724	@ q12 : Matrix[4] IDCT1D in-place scratch
				725	@ q13 : Matrix[5] IDCT1D in-place scratch
				726	@ q14 : Matrix[6] IDCT1D in-place scratch
				727	@ q15 : Matrix[7] IDCT1D in-place scratch
				728	@==========================================================================
				729	.type idct_8x8_venum, %function
				730	idct_8x8_venum:
				731
				732	push {r5-r9} @ save the core registers used as constant pointers. NOTE(review): 5 words leave sp 4 mod 8 inside this leaf function - confirm acceptable
				733	vpush {d8-d15} @ save q4-q7, which the macros below overwrite
				734	IDCT_ENTRY @ r5-r9 = constant addresses, q0/q1 = constants[0]/[1]
				735	BIG_BODY_TRANSPOSE_INPUT @ load, transpose, 2-D IDCT, clamp and store
				736	vpop {d8-d15} @ restore in reverse order of the saves
				737	pop {r5-r9}
				738	bx lr @ Return to caller
				739	@ end of idct_8x8_venum
				740
				741	@==========================================================================
				742	@ Constants Definition AREA: define idct kernel, bias, clamp limits
				743	@==========================================================================
				744	.section .rodata @ read-only data: these tables are only ever loaded, never written
				745	@ (previously ".section ro_data_area" followed by ".data", which put the tables in writable .data)
				746	.align 5 @ ALIGN=5
				747
				748	constants:
				749	.hword 23170, 13573, 6518, 21895, -23170, -21895, 8223, 8224 @ [0] IDCT1D scalars c4, a0, a1, a2, -c4, ma2, then the bias pair read as d1[1] (32-bit)
				750	.hword 16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725 @ [1] prescale for rows 0 and 4
				751	.hword 22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521 @ [2] prescale for rows 1 and 7
				752	.hword 21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692 @ [3] prescale for rows 2 and 6
				753	.hword 19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722 @ [4] prescale for rows 3 and 5
				754	.hword 0, 255, 0, 0 @ [5] clamp limits: lower bound 0, upper bound 255 (only 4 halfwords)
				755
				756	coefficient: @ These are the coefficients used by 4x4 iDCT in Q15 format
				757	.hword 11585, 15137, 11585, 6270, 11585, 15137, 11585, 6270 @ C4, C2, C4, C6, C4, C2, C4, C6 /2
				758	.hword 11585, 6270, 11585, 15137, 11585, 6270, 11585, 15137 @ C4, C6, C4, C2, C4, C6, C4, C2 /2
				759	.hword 11585, 11585, 11585, 11585, 15137, 15137, 15137, 15137 @ C4, C4, C4, C4, C2, C2, C2, C2 /2
				760	.hword 11585, 11585, 11585, 11585, 6270, 6270, 6270, 6270 @ C4, C4, C4, C4, C6, C6, C6, C6 /2
				761
				762	.end