/*------------------------------------------------------------------------
2* jdcolor-android-armv7.S
3*
4* Copyright (c) 2010, Code Aurora Forum. All rights reserved.
5*
6* Redistribution and use in source and binary forms, with or without
7* modification, are permitted provided that the following conditions are
8* met:
9* * Redistributions of source code must retain the above copyright
10* notice, this list of conditions and the following disclaimer.
11* * Redistributions in binary form must reproduce the above
12* copyright notice, this list of conditions and the following
13* disclaimer in the documentation and/or other materials provided
14* with the distribution.
15* * Neither the name of Code Aurora Forum, Inc. nor the names of its
16* contributors may be used to endorse or promote products derived
17* from this software without specific prior written permission.
18*
19* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
20* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
21* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
22* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
23* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
26* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
28* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
29* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30*--------------------------------------------------------------------------
31
32*--------------------------------------------------------------------------
33* FUNCTION LIST
34*--------------------------------------------------------------------------
35*
36* - yvup2rgb565_venum
37* - yyvup2rgb565_venum
38* - yvup2abgr8888_venum
39* - yyvup2abgr8888_venum
40*
41*--------------------------------------------------------------------------
42*/
43
44 .section yvu_plain_to_rgb_android, "x" @ AREA
45 .text @ |.text|, CODE, READONLY
46 .align 2
47 .code 32 @ CODE32
48
49/*-----------------------------------------------------------------------------
50 * ARM Registers
51 * ---------------------------------------------------------------------------- */
52p_y .req r0
53p_cr .req r1
54p_cb .req r2
55p_rgb .req r3
56p_bgr .req r3
57length .req r12
58
59 .global yvup2rgb565_venum
60 .global yyvup2rgb565_venum
61 .global yvup2abgr8888_venum
62 .global yyvup2abgr8888_venum
63
64@ coefficients in color conversion matrix multiplication
65.equ COEFF_Y, 256 @ contribution of Y
66.equ COEFF_V_RED, 359 @ contribution of V for red
67.equ COEFF_U_GREEN, -88 @ contribution of U for green
68.equ COEFF_V_GREEN, -183 @ contribution of V for green
69.equ COEFF_U_BLUE, 454 @ contribution of U for blue
70
71@ Clamping constants 0x0 and 0xFF
72.equ COEFF_0, 0
73.equ COEFF_255, 255
74
75@ Bias coefficients for red, green and blue
76.equ COEFF_BIAS_R, -45824 @ Red bias = -359*128 + 128
77.equ COEFF_BIAS_G, 34816 @ Green bias = (88+183)*128 + 128
78.equ COEFF_BIAS_B, -57984 @ Blue bias = -454*128 + 128
79
80
/*--------------------------------------------------------------------------
* FUNCTION     : yvup2rgb565_venum
*--------------------------------------------------------------------------
* DESCRIPTION  : Convert one line of planar Y / Cr / Cb samples (one Cr and
*                one Cb sample per Y sample) to RGB565, 8 pixels per loop
*                iteration.
*--------------------------------------------------------------------------
* C PROTOTYPE  : void yvup2rgb565_venum(uint8_t  *p_y,
*                                       uint8_t  *p_cr,
*                                       uint8_t  *p_cb,
*                                       uint8_t  *p_rgb565,
*                                       uint32_t  length)
*--------------------------------------------------------------------------
* REG INPUT    : R0: uint8_t *p_y       pointer to the input Y line
*                R1: uint8_t *p_cr      pointer to the input Cr line
*                R2: uint8_t *p_cb      pointer to the input Cb line
*                R3: uint8_t *p_rgb565  pointer to the output RGB565 line
*--------------------------------------------------------------------------
* STACK ARG    : [SP] on entry: uint32_t length - line width in pixels
*                (loaded into R12 after the prologue)
*--------------------------------------------------------------------------
* REG OUTPUT   : None
*--------------------------------------------------------------------------
* MEM INPUT    : p_y  - a line of Y pixels
*                p_cr - a line of Cr pixels
*                p_cb - a line of Cb pixels
*--------------------------------------------------------------------------
* MEM OUTPUT   : p_rgb565 - `length` converted pixels, 2 bytes each;
*                low byte = G[4:2]:B[7:3], high byte = R[7:3]:G[7:5]
*--------------------------------------------------------------------------
* REG AFFECTED : ARM:  R0-R3, R12, condition flags
*                NEON: Q0-Q7, Q9-Q15 are written; D8-D15 are saved on
*                      entry and restored on exit
*--------------------------------------------------------------------------
* STACK USAGE  : 68 bytes (LR + D8-D15)
*--------------------------------------------------------------------------
* CYCLES       : none
*--------------------------------------------------------------------------
* NOTES        : - Every iteration loads 8 bytes from each input plane, even
*                  when fewer than 8 pixels remain, so up to 7 bytes past
*                  `length` are read from each plane on a partial tail.
*                - Only `length` pixels are ever stored.
*                - length == 0 returns without touching memory.
*--------------------------------------------------------------------------
*/
.type yvup2rgb565_venum, %function
yvup2rgb565_venum:
    /*-------------------------------------------------------------------------
     * Prologue: save LR and the callee-saved NEON registers used below
     * ------------------------------------------------------------------------ */
    STMFD       SP!, {LR}
    VPUSH       {D8-D15}

    PLD         [p_y]                       @ preload the start of the luma line
                                            @ (was [R0, R3]: p_y + p_rgb, a
                                            @ meaningless address)

    ADR         R12, constants

    VLD1.S16    {D6, D7}, [R12]!            @ D6, D7: 359 | -88 | -183 | 454 | 256 | 0 | 255 | 0
    VLD1.S32    {D30, D31}, [R12]           @ Q15   : -45824 | 34816 | -57984 | X

    /*-------------------------------------------------------------------------
     * Load the 5th parameter from the stack.
     * LR (4 bytes) and D8-D15 (8 * 8 = 64 bytes) were pushed above, so the
     * caller's first stack argument now lives at SP + 68.
     * ------------------------------------------------------------------------ */
    LDR         length, [SP, #68]

    /*-------------------------------------------------------------------------
     * Nothing to do for an empty line.  Without this check length would reach
     * the tail code as -1, no BEQ there would ever fire, and 7 pixels would
     * be written.
     * ------------------------------------------------------------------------ */
    CMP         length, #0
    BEQ         end_yvup2rgb565

    /*-------------------------------------------------------------------------
     * Broadcast the clamp limits
     * ------------------------------------------------------------------------ */
    VDUP.S16    Q4, D7[1]                   @ Q4: 0   in all 8 lanes
    VDUP.S16    Q5, D7[2]                   @ Q5: 255 in all 8 lanes

    /*-------------------------------------------------------------------------
     * Broadcast the per-channel bias
     * ------------------------------------------------------------------------ */
    VDUP.S32    Q0, D30[0]                  @ Q0: -45824 (red)   in all 4 lanes
    VDUP.S32    Q1, D30[1]                  @ Q1:  34816 (green) in all 4 lanes
    VDUP.S32    Q2, D31[0]                  @ Q2: -57984 (blue)  in all 4 lanes

    /*-------------------------------------------------------------------------
     * Main loop: 8 pixels per iteration
     * ------------------------------------------------------------------------ */
loop_yvup2rgb565:

    /*-------------------------------------------------------------------------
     * Load input from Y, V and U
     * D12 : Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
     * D14 : V0 V1 V2 V3 V4 V5 V6 V7
     * D15 : U0 U1 U2 U3 U4 U5 U6 U7
     * ------------------------------------------------------------------------ */
    VLD1.U8     {D12}, [p_y]!               @ 8 Y  bytes
    VLD1.U8     {D14}, [p_cr]!              @ 8 Cr bytes
    VLD1.U8     {D15}, [p_cb]!              @ 8 Cb bytes

    /*-------------------------------------------------------------------------
     * Widen uint8 to 16 bit
     * Q9  (D18, D19): Y0..Y7
     * Q10 (D20, D21): V0..V7
     * Q11 (D22, D23): U0..U7
     * ------------------------------------------------------------------------ */
    VMOVL.U8    Q9,  D12
    VMOVL.U8    Q10, D14
    VMOVL.U8    Q11, D15

    /*-------------------------------------------------------------------------
     * Pixels 0-3: chroma contribution, 32-bit results
     * ------------------------------------------------------------------------ */
    VMULL.S16   Q12, D20, D6[0]             @ Q12:  359*V            (red)
    VMULL.S16   Q13, D22, D6[1]             @ Q13:  -88*U            (green)
    VMLAL.S16   Q13, D20, D6[2]             @ Q13:  -88*U - 183*V
    VMULL.S16   Q14, D22, D6[3]             @ Q14:  454*U            (blue)

    VADD.S32    Q12, Q0                     @ + red bias   -45824
    VADD.S32    Q13, Q1                     @ + green bias  34816
    VADD.S32    Q14, Q2                     @ + blue bias  -57984

    VMLAL.S16   Q12, D18, D7[0]             @ Q12: R0..R3, 32-bit Q8 (+ 256*Y)
    VMLAL.S16   Q13, D18, D7[0]             @ Q13: G0..G3, 32-bit Q8
    VMLAL.S16   Q14, D18, D7[0]             @ Q14: B0..B3, 32-bit Q8

    /* Drop the 8 fractional bits.  VSHRN truncates; rounding comes from the
     * +128 already folded into each bias. */
    VSHRN.S32   D18, Q12, #8                @ D18: R0..R3, 16-bit
    VSHRN.S32   D20, Q13, #8                @ D20: G0..G3, 16-bit
    VSHRN.S32   D22, Q14, #8                @ D22: B0..B3, 16-bit

    /*-------------------------------------------------------------------------
     * Pixels 4-7: same computation on the upper halves
     * ------------------------------------------------------------------------ */
    VMULL.S16   Q12, D21, D6[0]             @ Q12:  359*V            (red)
    VMULL.S16   Q13, D23, D6[1]             @ Q13:  -88*U            (green)
    VMLAL.S16   Q13, D21, D6[2]             @ Q13:  -88*U - 183*V
    VMULL.S16   Q14, D23, D6[3]             @ Q14:  454*U            (blue)

    VADD.S32    Q12, Q0                     @ + red bias   -45824
    VADD.S32    Q13, Q1                     @ + green bias  34816
    VADD.S32    Q14, Q2                     @ + blue bias  -57984

    VMLAL.S16   Q12, D19, D7[0]             @ Q12: R4..R7, 32-bit Q8
    VMLAL.S16   Q13, D19, D7[0]             @ Q13: G4..G7, 32-bit Q8
    VMLAL.S16   Q14, D19, D7[0]             @ Q14: B4..B7, 32-bit Q8

    VSHRN.S32   D19, Q12, #8                @ D19: R4..R7, 16-bit
    VSHRN.S32   D21, Q13, #8                @ D21: G4..G7, 16-bit
    VSHRN.S32   D23, Q14, #8                @ D23: B4..B7, 16-bit

    /*-------------------------------------------------------------------------
     * Clamp to [0, 255] and narrow to 8 bit
     * ------------------------------------------------------------------------ */
    VMAX.S16    Q9, Q9, Q4                  @ red   >= 0
    VMIN.S16    Q9, Q9, Q5                  @ red   <= 255
    VQMOVUN.S16 D28, Q9                     @ D28: R0..R7 as uint8

    VMAX.S16    Q10, Q10, Q4                @ green >= 0
    VMIN.S16    Q10, Q10, Q5                @ green <= 255
    VQMOVUN.S16 D27, Q10                    @ D27: G0..G7 as uint8

    VMAX.S16    Q11, Q11, Q4                @ blue  >= 0
    VMIN.S16    Q11, Q11, Q5                @ blue  <= 255
    VQMOVUN.S16 D26, Q11                    @ D26: B0..B7 as uint8

    /*-------------------------------------------------------------------------
     * Pack to RGB565
     * D27: low 3 bits of the 6-bit green + 5 bits of blue   (low byte)
     * D28: 5 bits of red + high 3 bits of the 6-bit green   (high byte)
     * ------------------------------------------------------------------------ */
    VSRI.8      D28, D27, #5                @ insert G >> 5 below the top 5 red bits
    VSHL.U8     D27, D27, #3                @ move G[4:2] to the top of the low byte
    VSRI.8      D27, D26, #3                @ insert B >> 3 below them

    SUBS        length, length, #8          @ 8 pixels are ready

    BMI         trailing_yvup2rgb565        @ fewer than 8 were actually wanted

    VST2.U8     {D27, D28}, [p_rgb]!        @ store 8 pixels, low byte first
                                            @ (vector stores leave the flags alone)

    BHI         loop_yvup2rgb565            @ more pixels remain

    B           end_yvup2rgb565             @ exactly 8 were left: done


trailing_yvup2rgb565:
    /*-------------------------------------------------------------------------
     * 1 to 7 pixels remain and length is currently (remaining - 8).
     * Adding 7 gives (remaining - 1), i.e. 0..6: the number of pixels still
     * to store after the first one, which is stored unconditionally.
     * ------------------------------------------------------------------------ */
    ADDS        length, length, #7

    VST2.U8     {D27[0], D28[0]}, [p_rgb]!  @ pixel 0
    BEQ         end_yvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D27[1], D28[1]}, [p_rgb]!  @ pixel 1
    BEQ         end_yvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D27[2], D28[2]}, [p_rgb]!  @ pixel 2
    BEQ         end_yvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D27[3], D28[3]}, [p_rgb]!  @ pixel 3
    BEQ         end_yvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D27[4], D28[4]}, [p_rgb]!  @ pixel 4
    BEQ         end_yvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D27[5], D28[5]}, [p_rgb]!  @ pixel 5
    BEQ         end_yvup2rgb565

    VST2.U8     {D27[6], D28[6]}, [p_rgb]!  @ pixel 6 (7 pixels is the maximum)

end_yvup2rgb565:
    VPOP        {D8-D15}
    LDMFD       SP!, {PC}

    @ end of yvup2rgb565
333
334
/*--------------------------------------------------------------------------
* FUNCTION     : yyvup2rgb565_venum
*--------------------------------------------------------------------------
* DESCRIPTION  : Convert one line of planar Y / Cr / Cb samples, where each
*                Cr/Cb sample is shared by two consecutive Y samples, to
*                RGB565.  One loop iteration consumes 16 Y, 8 Cr and 8 Cb
*                bytes and produces up to 16 pixels in two groups of 8.
*--------------------------------------------------------------------------
* C PROTOTYPE  : void yyvup2rgb565_venum(uint8_t  *p_y,
*                                        uint8_t  *p_cr,
*                                        uint8_t  *p_cb,
*                                        uint8_t  *p_rgb565,
*                                        uint32_t  length)
*--------------------------------------------------------------------------
* REG INPUT    : R0: uint8_t *p_y       pointer to the input Y line
*                R1: uint8_t *p_cr      pointer to the input Cr line
*                R2: uint8_t *p_cb      pointer to the input Cb line
*                R3: uint8_t *p_rgb565  pointer to the output RGB565 line
*--------------------------------------------------------------------------
* STACK ARG    : [SP] on entry: uint32_t length - line width in pixels
*                (loaded into R12 after the prologue)
*--------------------------------------------------------------------------
* REG OUTPUT   : None
*--------------------------------------------------------------------------
* MEM INPUT    : p_y  - a line of Y pixels
*                p_cr - a line of Cr pixels
*                p_cb - a line of Cb pixels
*--------------------------------------------------------------------------
* MEM OUTPUT   : p_rgb565 - `length` converted pixels, 2 bytes each;
*                low byte = G[4:2]:B[7:3], high byte = R[7:3]:G[7:5]
*--------------------------------------------------------------------------
* REG AFFECTED : ARM:  R0-R3, R12, condition flags
*                NEON: Q0-Q15 are written; D8-D15 are saved on entry and
*                      restored on exit
*--------------------------------------------------------------------------
* STACK USAGE  : 68 bytes (LR + D8-D15)
*--------------------------------------------------------------------------
* CYCLES       : none
*--------------------------------------------------------------------------
* NOTES        : - Every iteration loads 16 Y bytes and 8 bytes from each
*                  chroma plane regardless of how many pixels remain, so the
*                  input planes are read past `length` on a partial tail.
*                - Only `length` pixels are ever stored.
*                - length == 0 returns without touching memory.
*--------------------------------------------------------------------------
*/
.type yyvup2rgb565_venum, %function
yyvup2rgb565_venum:
    /*-------------------------------------------------------------------------
     * Prologue: save LR and the callee-saved NEON registers used below
     * ------------------------------------------------------------------------ */
    STMFD       SP!, {LR}
    VPUSH       {D8-D15}

    PLD         [p_y]                       @ preload the start of the luma line
                                            @ (was [R0, R3]: p_y + p_rgb, a
                                            @ meaningless address)

    ADR         R12, constants

    VLD1.S16    {D6, D7}, [R12]!            @ D6, D7: 359 | -88 | -183 | 454 | 256 | 0 | 255 | 0
    VLD1.S32    {D30, D31}, [R12]           @ Q15   : -45824 | 34816 | -57984 | X

    /*-------------------------------------------------------------------------
     * Load the 5th parameter from the stack.
     * LR (4 bytes) and D8-D15 (8 * 8 = 64 bytes) were pushed above, so the
     * caller's first stack argument now lives at SP + 68.
     * ------------------------------------------------------------------------ */
    LDR         length, [SP, #68]

    /*-------------------------------------------------------------------------
     * Nothing to do for an empty line.  Without this check length would reach
     * the tail code as -1, no BEQ there would ever fire, and 7 pixels would
     * be written.
     * ------------------------------------------------------------------------ */
    CMP         length, #0
    BEQ         end_yyvup2rgb565

    /*-------------------------------------------------------------------------
     * Broadcast the clamp limits
     * ------------------------------------------------------------------------ */
    VDUP.S16    Q4, D7[1]                   @ Q4: 0   in all 8 lanes
    VDUP.S16    Q5, D7[2]                   @ Q5: 255 in all 8 lanes

    /*-------------------------------------------------------------------------
     * Broadcast the per-channel bias (Q15 is free for reuse afterwards)
     * ------------------------------------------------------------------------ */
    VDUP.S32    Q0, D30[0]                  @ Q0: -45824 (red)   in all 4 lanes
    VDUP.S32    Q1, D30[1]                  @ Q1:  34816 (green) in all 4 lanes
    VDUP.S32    Q2, D31[0]                  @ Q2: -57984 (blue)  in all 4 lanes

    /*-------------------------------------------------------------------------
     * Main loop: up to 16 pixels per iteration
     * ------------------------------------------------------------------------ */
loop_yyvup2rgb565:

    /*-------------------------------------------------------------------------
     * Load input from Y, V and U.  VLD2 de-interleaves luma into even / odd.
     * D12 : Y0 Y2 Y4 Y6 Y8 Y10 Y12 Y14
     * D13 : Y1 Y3 Y5 Y7 Y9 Y11 Y13 Y15
     * D14 : V0 V1 V2 V3 V4 V5 V6 V7
     * D15 : U0 U1 U2 U3 U4 U5 U6 U7
     * ------------------------------------------------------------------------ */
    VLD2.U8     {D12, D13}, [p_y]!          @ 16 Y bytes
    VLD1.U8     {D14}, [p_cr]!              @ 8 Cr bytes
    VLD1.U8     {D15}, [p_cb]!              @ 8 Cb bytes

    /*-------------------------------------------------------------------------
     * Widen uint8 to 16 bit
     * Q12 (D24, D25): Y0 Y2 Y4 Y6 | Y8 Y10 Y12 Y14
     * Q13 (D26, D27): Y1 Y3 Y5 Y7 | Y9 Y11 Y13 Y15
     * Q14 (D28, D29): V0 V1 V2 V3 | V4 V5 V6 V7
     * Q15 (D30, D31): U0 U1 U2 U3 | U4 U5 U6 U7
     * ------------------------------------------------------------------------ */
    VMOVL.U8    Q12, D12
    VMOVL.U8    Q13, D13
    VMOVL.U8    Q14, D14
    VMOVL.U8    Q15, D15

    /*-------------------------------------------------------------------------
     * Pixels 0-7: chroma contribution of V0..V3 / U0..U3, 32-bit results
     * ------------------------------------------------------------------------ */
    VMULL.S16   Q6, D28, D6[0]              @ Q6:  359*V             (red)
    VMULL.S16   Q7, D30, D6[1]              @ Q7:  -88*U             (green)
    VMLAL.S16   Q7, D28, D6[2]              @ Q7:  -88*U - 183*V
    VMULL.S16   Q8, D30, D6[3]              @ Q8:  454*U             (blue)

    VADD.S32    Q6, Q0                      @ + red bias   -45824
    VADD.S32    Q7, Q1                      @ + green bias  34816
    VADD.S32    Q8, Q2                      @ + blue bias  -57984

    /* Each chroma term is shared by an even and an odd pixel: copy it, then
     * add 256*Y of the even luma to one copy and of the odd luma to the other. */
    VMOV.S32    Q9, Q6
    VMLAL.S16   Q6, D24, D7[0]              @ Q6:  R0 R2 R4 R6, 32-bit Q8
    VMLAL.S16   Q9, D26, D7[0]              @ Q9:  R1 R3 R5 R7, 32-bit Q8

    VMOV.S32    Q10, Q7
    VMLAL.S16   Q7,  D24, D7[0]             @ Q7:  G0 G2 G4 G6, 32-bit Q8
    VMLAL.S16   Q10, D26, D7[0]             @ Q10: G1 G3 G5 G7, 32-bit Q8

    VMOV.S32    Q11, Q8
    VMLAL.S16   Q8,  D24, D7[0]             @ Q8:  B0 B2 B4 B6, 32-bit Q8
    VMLAL.S16   Q11, D26, D7[0]             @ Q11: B1 B3 B5 B7, 32-bit Q8

    /* Drop the 8 fractional bits (VSHRN truncates; rounding comes from the
     * +128 folded into each bias), then re-interleave even / odd pixels. */
    VSHRN.S32   D12, Q6, #8                 @ D12: R0 R2 R4 R6, 16-bit
    VSHRN.S32   D13, Q9, #8                 @ D13: R1 R3 R5 R7, 16-bit
    VZIP.16     D12, D13                    @ Q6 : R0 R1 R2 R3 R4 R5 R6 R7

    VSHRN.S32   D18, Q7, #8                 @ D18: G0 G2 G4 G6, 16-bit
    VSHRN.S32   D19, Q10, #8                @ D19: G1 G3 G5 G7, 16-bit
    VZIP.16     D18, D19                    @ Q9 : G0 G1 G2 G3 G4 G5 G6 G7

    VSHRN.S32   D20, Q8, #8                 @ D20: B0 B2 B4 B6, 16-bit
    VSHRN.S32   D21, Q11, #8                @ D21: B1 B3 B5 B7, 16-bit
    VZIP.16     D20, D21                    @ Q10: B0 B1 B2 B3 B4 B5 B6 B7

    /*-------------------------------------------------------------------------
     * Clamp to [0, 255] and narrow to 8 bit
     * ------------------------------------------------------------------------ */
    VMAX.S16    Q6, Q6, Q4                  @ red   >= 0
    VMIN.S16    Q6, Q6, Q5                  @ red   <= 255
    VQMOVUN.S16 D23, Q6                     @ D23: R0..R7 as uint8

    VMAX.S16    Q9, Q9, Q4                  @ green >= 0
    VMIN.S16    Q9, Q9, Q5                  @ green <= 255
    VQMOVUN.S16 D22, Q9                     @ D22: G0..G7 as uint8

    VMAX.S16    Q10, Q10, Q4                @ blue  >= 0
    VMIN.S16    Q10, Q10, Q5                @ blue  <= 255
    VQMOVUN.S16 D21, Q10                    @ D21: B0..B7 as uint8

    /*-------------------------------------------------------------------------
     * Pack to RGB565
     * D22: low 3 bits of the 6-bit green + 5 bits of blue   (low byte)
     * D23: 5 bits of red + high 3 bits of the 6-bit green   (high byte)
     * ------------------------------------------------------------------------ */
    VSRI.8      D23, D22, #5                @ insert G >> 5 below the top 5 red bits
    VSHL.U8     D22, D22, #3                @ move G[4:2] to the top of the low byte
    VSRI.8      D22, D21, #3                @ insert B >> 3 below them

    SUBS        length, length, #8          @ 8 pixels are ready

    BMI         trailing_yyvup2rgb565       @ fewer than 8 were actually wanted

    VST2.U8     {D22, D23}, [p_rgb]!        @ store 8 pixels, low byte first
                                            @ (vector stores leave the flags alone)

    BEQ         end_yyvup2rgb565            @ exactly 8 were left: done


    /*-------------------------------------------------------------------------
     * Pixels 8-15: chroma contribution of V4..V7 / U4..U7
     * ------------------------------------------------------------------------ */
    VMULL.S16   Q6, D29, D6[0]              @ Q6:  359*V             (red)
    VMULL.S16   Q7, D31, D6[1]              @ Q7:  -88*U             (green)
    VMLAL.S16   Q7, D29, D6[2]              @ Q7:  -88*U - 183*V
    VMULL.S16   Q8, D31, D6[3]              @ Q8:  454*U             (blue)

    VADD.S32    Q6, Q0                      @ + red bias   -45824
    VADD.S32    Q7, Q1                      @ + green bias  34816
    VADD.S32    Q8, Q2                      @ + blue bias  -57984

    VMOV.S32    Q9, Q6
    VMLAL.S16   Q6, D25, D7[0]              @ Q6:  R8 R10 R12 R14, 32-bit Q8
    VMLAL.S16   Q9, D27, D7[0]              @ Q9:  R9 R11 R13 R15, 32-bit Q8

    VMOV.S32    Q10, Q7
    VMLAL.S16   Q7,  D25, D7[0]             @ Q7:  G8 G10 G12 G14, 32-bit Q8
    VMLAL.S16   Q10, D27, D7[0]             @ Q10: G9 G11 G13 G15, 32-bit Q8

    VMOV.S32    Q11, Q8
    VMLAL.S16   Q8,  D25, D7[0]             @ Q8:  B8 B10 B12 B14, 32-bit Q8
    VMLAL.S16   Q11, D27, D7[0]             @ Q11: B9 B11 B13 B15, 32-bit Q8

    VSHRN.S32   D12, Q6, #8                 @ D12: R8 R10 R12 R14, 16-bit
    VSHRN.S32   D13, Q9, #8                 @ D13: R9 R11 R13 R15, 16-bit
    VZIP.16     D12, D13                    @ Q6 : R8 R9 R10 R11 R12 R13 R14 R15

    VSHRN.S32   D18, Q7, #8                 @ D18: G8 G10 G12 G14, 16-bit
    VSHRN.S32   D19, Q10, #8                @ D19: G9 G11 G13 G15, 16-bit
    VZIP.16     D18, D19                    @ Q9 : G8 G9 G10 G11 G12 G13 G14 G15

    VSHRN.S32   D20, Q8, #8                 @ D20: B8 B10 B12 B14, 16-bit
    VSHRN.S32   D21, Q11, #8                @ D21: B9 B11 B13 B15, 16-bit
    VZIP.16     D20, D21                    @ Q10: B8 B9 B10 B11 B12 B13 B14 B15

    /*-------------------------------------------------------------------------
     * Clamp to [0, 255] and narrow to 8 bit
     * ------------------------------------------------------------------------ */
    VMAX.S16    Q6, Q6, Q4                  @ red   >= 0
    VMIN.S16    Q6, Q6, Q5                  @ red   <= 255
    VQMOVUN.S16 D23, Q6                     @ D23: R8..R15 as uint8

    VMAX.S16    Q9, Q9, Q4                  @ green >= 0
    VMIN.S16    Q9, Q9, Q5                  @ green <= 255
    VQMOVUN.S16 D22, Q9                     @ D22: G8..G15 as uint8

    VMAX.S16    Q10, Q10, Q4                @ blue  >= 0
    VMIN.S16    Q10, Q10, Q5                @ blue  <= 255
    VQMOVUN.S16 D21, Q10                    @ D21: B8..B15 as uint8

    /*-------------------------------------------------------------------------
     * Pack to RGB565 (same layout as above)
     * ------------------------------------------------------------------------ */
    VSRI.8      D23, D22, #5                @ insert G >> 5 below the top 5 red bits
    VSHL.U8     D22, D22, #3                @ move G[4:2] to the top of the low byte
    VSRI.8      D22, D21, #3                @ insert B >> 3 below them

    SUBS        length, length, #8          @ 8 more pixels are ready

    BMI         trailing_yyvup2rgb565       @ fewer than 8 were actually wanted

    VST2.U8     {D22, D23}, [p_rgb]!        @ store 8 pixels, low byte first

    BHI         loop_yyvup2rgb565           @ more pixels remain

    B           end_yyvup2rgb565            @ exactly 8 were left: done


trailing_yyvup2rgb565:
    /*-------------------------------------------------------------------------
     * 1 to 7 pixels remain and length is currently (remaining - 8).
     * Adding 7 gives (remaining - 1), i.e. 0..6: the number of pixels still
     * to store after the first one, which is stored unconditionally.
     * ------------------------------------------------------------------------ */
    ADDS        length, length, #7

    VST2.U8     {D22[0], D23[0]}, [p_rgb]!  @ pixel 0
    BEQ         end_yyvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D22[1], D23[1]}, [p_rgb]!  @ pixel 1
    BEQ         end_yyvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D22[2], D23[2]}, [p_rgb]!  @ pixel 2
    BEQ         end_yyvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D22[3], D23[3]}, [p_rgb]!  @ pixel 3
    BEQ         end_yyvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D22[4], D23[4]}, [p_rgb]!  @ pixel 4
    BEQ         end_yyvup2rgb565

    SUBS        length, length, #1
    VST2.U8     {D22[5], D23[5]}, [p_rgb]!  @ pixel 5
    BEQ         end_yyvup2rgb565

    VST2.U8     {D22[6], D23[6]}, [p_rgb]!  @ pixel 6 (7 pixels is the maximum)

end_yyvup2rgb565:
    VPOP        {D8-D15}
    LDMFD       SP!, {PC}

    @ end of yyvup2rgb565
654
/*--------------------------------------------------------------------------
 * Coefficient table read through `ADR R12, constants` by the conversion
 * routines in this file:
 *   bytes  0-15 : eight 16-bit matrix / clamp coefficients (-> D6, D7)
 *   bytes 16-31 : three 32-bit channel biases plus one pad word (-> D30, D31)
 * The bias half is fetched with a 16-byte vector load, so the fourth word is
 * defined explicitly; otherwise that lane would hold whatever bytes follow
 * the table.  The pad lane is not used by the visible code.
 *--------------------------------------------------------------------------*/
constants:
    .hword (COEFF_V_RED), (COEFF_U_GREEN), (COEFF_V_GREEN), (COEFF_U_BLUE)  @ 359 | -88 | -183 | 454
    .hword (COEFF_Y), (COEFF_0), (COEFF_255), (COEFF_0)                     @ 256 | 0 | 255 | 0
    .word  (COEFF_BIAS_R), (COEFF_BIAS_G), (COEFF_BIAS_B), 0                @ -45824 | 34816 | -57984 | pad
659
/*--------------------------------------------------------------------------
* FUNCTION     : yvup2abgr8888_venum
*--------------------------------------------------------------------------
* DESCRIPTION  : Convert one line of planar Y, Cr (V) and Cb (U) samples,
*                one chroma pair per luma sample, into 4-byte pixels that
*                are stored in memory as R, G, B, 0xFF.
*--------------------------------------------------------------------------
* C PROTOTYPE  : void yvup2abgr8888_venum(uint8_t  *p_y,
*                                         uint8_t  *p_cr,
*                                         uint8_t  *p_cb,
*                                         uint8_t  *p_abgr8888,
*                                         uint32_t  length)
*--------------------------------------------------------------------------
* REG INPUT    : R0: uint8_t *p_y
*                    pointer to the input Y line
*                R1: uint8_t *p_cr
*                    pointer to the input Cr line
*                R2: uint8_t *p_cb
*                    pointer to the input Cb line
*                R3: uint8_t *p_abgr8888
*                    pointer to the output ABGR line
*--------------------------------------------------------------------------
* STACK ARG    : [SP] at entry: uint32_t length
*                    width of the line in pixels; it is loaded into R12
*                    once the prologue has pushed LR and D8-D15
*--------------------------------------------------------------------------
* REG OUTPUT   : None
*--------------------------------------------------------------------------
* MEM INPUT    : p_y    - a line of Y pixels
*                p_cr   - a line of Cr pixels
*                p_cb   - a line of Cb pixels
*                length - the width of the input line
*--------------------------------------------------------------------------
* MEM OUTPUT   : p_abgr8888 - the converted pixels, 4 bytes each
*--------------------------------------------------------------------------
* REG AFFECTED : ARM : R0-R3 (post-incremented), R12, condition flags
*                NEON: Q0-Q3, Q9-Q15
*                      (Q4-Q7 are used as well but saved and restored)
*--------------------------------------------------------------------------
* STACK USAGE  : 68 bytes (LR plus D8-D15)
*--------------------------------------------------------------------------
* CYCLES       : none
*
*--------------------------------------------------------------------------
* NOTES        : - Returns without touching any buffer when length is 0.
*                - Every loop iteration loads 8 bytes from each input
*                  plane, even when fewer than 8 pixels remain; only the
*                  remaining pixels are stored.
*                - The names p_y, p_cr, p_cb, p_bgr and length used below
*                  are register aliases defined outside this block
*                  (presumably R0, R1, R2, R3 and R12).
*--------------------------------------------------------------------------
*/
.type yvup2abgr8888_venum, %function
yvup2abgr8888_venum:
    /*-------------------------------------------------------------------------
     * Prologue: save the return address and the callee-saved NEON registers
     * ------------------------------------------------------------------------ */
    STMFD       SP!, {LR}

    VPUSH       {D8-D15}

    PLD         [R0]                    @ preload hint for the start of the luma line

    ADR         R12, constants

    VLD1.S16    {D6, D7}, [R12]!        @ D6, D7: 359 | -88 | -183 | 454 | 256 | 0 | 255 | 0
    VLD1.S32    {D30, D31}, [R12]       @ Q15   : -45824 | 34816 | -57984 | X

    /*-------------------------------------------------------------------------
     * Load the 5th parameter via stack
     * R0 ~ R3 are used to pass the first 4 parameters, the 5th and above
     * parameters are passed via stack
     * ------------------------------------------------------------------------ */
    LDR         R12, [SP, #68]          @ LR was pushed (4 bytes) and D8-D15 were
                                        @ pushed (8 registers * 8 bytes = 64 bytes),
                                        @ so the first stack argument now sits at
                                        @ SP + 64 + 4 = SP + 68

    /*-------------------------------------------------------------------------
     * Nothing to convert for an empty line. Without this check the trailing
     * path below would store 7 pixels, because 0 - 8 + 7 never reaches zero.
     * ------------------------------------------------------------------------ */
    CMP         R12, #0
    BEQ         end_yvup2abgr

    /*-------------------------------------------------------------------------
     * Load clamping parameters to duplicate vector elements
     * ------------------------------------------------------------------------ */
    VDUP.S16    Q4, D7[1]               @ Q4: 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
    VDUP.S16    Q5, D7[2]               @ Q5: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255

    /*-------------------------------------------------------------------------
     * Read bias
     * ------------------------------------------------------------------------ */
    VDUP.S32    Q0, D30[0]              @ Q0: -45824 | -45824 | -45824 | -45824
    VDUP.S32    Q1, D30[1]              @ Q1:  34816 |  34816 |  34816 |  34816
    VDUP.S32    Q2, D31[0]              @ Q2: -57984 | -57984 | -57984 | -57984


    /*-------------------------------------------------------------------------
     * The main loop: 8 pixels per iteration
     * ------------------------------------------------------------------------ */
loop_yvup2abgr:

    /*-------------------------------------------------------------------------
     * Load input from Y, V and U
     * D12 : Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
     * D14 : V0 V1 V2 V3 V4 V5 V6 V7
     * D15 : U0 U1 U2 U3 U4 U5 U6 U7
     * ------------------------------------------------------------------------ */
    VLD1.U8     {D12}, [p_y]!           @ Load 8 Luma elements (uint8) to D12
    VLD1.U8     {D14}, [p_cr]!          @ Load 8 Cr elements (uint8) to D14
    VLD1.U8     {D15}, [p_cb]!          @ Load 8 Cb elements (uint8) to D15

    /*-------------------------------------------------------------------------
     * Expand uint8 value to uint16
     * D18, D19: Y0 Y1 Y2 Y3 Y4 Y5 Y6 Y7
     * D20, D21: V0 V1 V2 V3 V4 V5 V6 V7
     * D22, D23: U0 U1 U2 U3 U4 U5 U6 U7
     * ------------------------------------------------------------------------ */
    VMOVL.U8    Q9,  D12
    VMOVL.U8    Q10, D14
    VMOVL.U8    Q11, D15

    /*-------------------------------------------------------------------------
     * First 4 pixels.
     * Multiply contribution from chrominance, results are in 32-bit
     * ------------------------------------------------------------------------ */
    VMULL.S16   Q12, D20, D6[0]         @ Q12:  359*(V0,V1,V2,V3)                      Red
    VMULL.S16   Q13, D22, D6[1]         @ Q13:  -88*(U0,U1,U2,U3)                      Green
    VMLAL.S16   Q13, D20, D6[2]         @ Q13:  -88*(U0,U1,U2,U3) - 183*(V0,V1,V2,V3)
    VMULL.S16   Q14, D22, D6[3]         @ Q14:  454*(U0,U1,U2,U3)                      Blue

    /*-------------------------------------------------------------------------
     * Add bias
     * ------------------------------------------------------------------------ */
    VADD.S32    Q12, Q0                 @ Q12 add Red   bias -45824
    VADD.S32    Q13, Q1                 @ Q13 add Green bias  34816
    VADD.S32    Q14, Q2                 @ Q14 add Blue  bias -57984

    /*-------------------------------------------------------------------------
     * Calculate Red, Green, Blue: add 256 * Y to each channel
     * ------------------------------------------------------------------------ */
    VMLAL.S16   Q12, D18, D7[0]         @ Q12: R0, R1, R2, R3 in 32-bit Q8 format
    VMLAL.S16   Q13, D18, D7[0]         @ Q13: G0, G1, G2, G3 in 32-bit Q8 format
    VMLAL.S16   Q14, D18, D7[0]         @ Q14: B0, B1, B2, B3 in 32-bit Q8 format

    /*-------------------------------------------------------------------------
     * Right shift eight bits and narrow to 16-bit. VSHRN itself truncates;
     * going by the values listed for the constants table, each bias already
     * contains +128 (0.5 in Q8), which is what rounds the result.
     * ------------------------------------------------------------------------ */
    VSHRN.S32   D18, Q12, #8            @ D18: R0, R1, R2, R3 in 16-bit Q0 format
    VSHRN.S32   D20, Q13, #8            @ D20: G0, G1, G2, G3 in 16-bit Q0 format
    VSHRN.S32   D22, Q14, #8            @ D22: B0, B1, B2, B3 in 16-bit Q0 format

    /*-------------------------------------------------------------------------
     * Done with the first 4 elements, continue on the next 4 elements
     * ------------------------------------------------------------------------ */

    /*-------------------------------------------------------------------------
     * Multiply contribution from chrominance, results are in 32-bit
     * ------------------------------------------------------------------------ */
    VMULL.S16   Q12, D21, D6[0]         @ Q12:  359*(V4,V5,V6,V7)                      Red
    VMULL.S16   Q13, D23, D6[1]         @ Q13:  -88*(U4,U5,U6,U7)                      Green
    VMLAL.S16   Q13, D21, D6[2]         @ Q13:  -88*(U4,U5,U6,U7) - 183*(V4,V5,V6,V7)
    VMULL.S16   Q14, D23, D6[3]         @ Q14:  454*(U4,U5,U6,U7)                      Blue

    /*-------------------------------------------------------------------------
     * Add bias
     * ------------------------------------------------------------------------ */
    VADD.S32    Q12, Q0                 @ Q12 add Red   bias -45824
    VADD.S32    Q13, Q1                 @ Q13 add Green bias  34816
    VADD.S32    Q14, Q2                 @ Q14 add Blue  bias -57984

    /*-------------------------------------------------------------------------
     * Calculate Red, Green, Blue: add 256 * Y to each channel
     * ------------------------------------------------------------------------ */
    VMLAL.S16   Q12, D19, D7[0]         @ Q12: R4, R5, R6, R7 in 32-bit Q8 format
    VMLAL.S16   Q13, D19, D7[0]         @ Q13: G4, G5, G6, G7 in 32-bit Q8 format
    VMLAL.S16   Q14, D19, D7[0]         @ Q14: B4, B5, B6, B7 in 32-bit Q8 format

    /*-------------------------------------------------------------------------
     * Right shift eight bits and narrow to 16-bit (see the note above)
     * ------------------------------------------------------------------------ */
    VSHRN.S32   D19, Q12, #8            @ D19: R4, R5, R6, R7 in 16-bit Q0 format
    VSHRN.S32   D21, Q13, #8            @ D21: G4, G5, G6, G7 in 16-bit Q0 format
    VSHRN.S32   D23, Q14, #8            @ D23: B4, B5, B6, B7 in 16-bit Q0 format

    /*-------------------------------------------------------------------------
     * Clamp the value to be within [0~255]
     * Q9 = R0..R7, Q10 = G0..G7, Q11 = B0..B7 at this point
     * ------------------------------------------------------------------------ */
    VMAX.S16    Q11, Q11, Q4            @ if Q11 < 0, Q11 = 0
    VMIN.S16    Q11, Q11, Q5            @ if Q11 > 255, Q11 = 255
    VQMOVUN.S16 D28, Q11                @ store Blue to D28, narrow the value from int16 to uint8

    VMAX.S16    Q10, Q10, Q4            @ if Q10 < 0, Q10 = 0
    VMIN.S16    Q10, Q10, Q5            @ if Q10 > 255, Q10 = 255
    VQMOVUN.S16 D27, Q10                @ store Green to D27, narrow the value from int16 to uint8

    VMAX.S16    Q9, Q9, Q4              @ if Q9 < 0, Q9 = 0
    VMIN.S16    Q9, Q9, Q5              @ if Q9 > 255, Q9 = 255
    VQMOVUN.S16 D26, Q9                 @ store Red to D26, narrow the value from int16 to uint8

    /*-------------------------------------------------------------------------
     * Alpha channel: constant 0xFF, stored as the 4th byte of every pixel
     * ------------------------------------------------------------------------ */
    VMOVN.I16   D29, Q5                 @ D29: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255

    SUBS        length, length, #8      @ check if the length is less than 8

    BMI         trailing_yvup2abgr      @ jump to trailing processing if remaining length is less than 8

    VST4.U8     {D26,D27,D28,D29}, [p_bgr]!  @ interleaved store of 8 pixels; per pixel the
                                             @ byte order in memory is Red, Green, Blue, 0xFF
                                             @ (Red at the lowest address)

    BHI         loop_yvup2abgr          @ loop if more than 8 pixels left

    BEQ         end_yvup2abgr           @ done if exactly 8 pixel processed in the loop


trailing_yvup2abgr:
    /*-------------------------------------------------------------------------
     * There are from 1 ~ 7 pixels left in the trailing part.
     * First adding 7 to the length so the length would be from 0 ~ 6.
     * eg: 1 pixel left in the trailing part, so 1-8+7 = 0.
     * Then save 1 pixel unconditionally since at least 1 pixels left in the
     * trailing part. The NEON stores do not change the flags, so each BEQ
     * tests the result of the preceding ADDS or SUBS.
     * ------------------------------------------------------------------------ */
    ADDS        length, length, #7      @ there are 7 or less in the trailing part

    VST4.U8     {D26[0], D27[0], D28[0], D29[0]}, [p_bgr]!  @ at least 1 pixel left in the trailing part
    BEQ         end_yvup2abgr           @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D26[1], D27[1], D28[1], D29[1]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yvup2abgr           @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D26[2], D27[2], D28[2], D29[2]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yvup2abgr           @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D26[3], D27[3], D28[3], D29[3]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yvup2abgr           @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D26[4], D27[4], D28[4], D29[4]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yvup2abgr           @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D26[5], D27[5], D28[5], D29[5]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yvup2abgr           @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D26[6], D27[6], D28[6], D29[6]}, [p_bgr]!  @ store the 7th and last trailing pixel

end_yvup2abgr:
    VPOP        {D8-D15}                @ restore the callee-saved NEON registers
    LDMFD       SP!, {PC}               @ return by popping the saved LR into PC
                                        @ end of yvup2abgr
908
/*--------------------------------------------------------------------------
* FUNCTION     : yyvup2abgr8888_venum
*--------------------------------------------------------------------------
* DESCRIPTION  : Convert one line of planar Y, Cr (V) and Cb (U) samples,
*                where each chroma pair is shared by two neighbouring luma
*                samples, into 4-byte pixels that are stored in memory as
*                R, G, B, 0xFF.
*--------------------------------------------------------------------------
* C PROTOTYPE  : void yyvup2abgr8888_venum(uint8_t  *p_y,
*                                          uint8_t  *p_cr,
*                                          uint8_t  *p_cb,
*                                          uint8_t  *p_abgr8888,
*                                          uint32_t  length)
*--------------------------------------------------------------------------
* REG INPUT    : R0: uint8_t *p_y
*                    pointer to the input Y line
*                R1: uint8_t *p_cr
*                    pointer to the input Cr line
*                R2: uint8_t *p_cb
*                    pointer to the input Cb line
*                R3: uint8_t *p_abgr8888
*                    pointer to the output ABGR line
*--------------------------------------------------------------------------
* STACK ARG    : [SP] at entry: uint32_t length
*                    width of the line in pixels; it is loaded into R12
*                    once the prologue has pushed LR and D8-D15
*--------------------------------------------------------------------------
* REG OUTPUT   : None
*--------------------------------------------------------------------------
* MEM INPUT    : p_y    - a line of Y pixels
*                p_cr   - a line of Cr pixels
*                p_cb   - a line of Cb pixels
*                length - the width of the input line
*--------------------------------------------------------------------------
* MEM OUTPUT   : p_abgr8888 - the converted pixels, 4 bytes each
*--------------------------------------------------------------------------
* REG AFFECTED : ARM : R0-R3 (post-incremented), R12, condition flags
*                NEON: Q0-Q3, Q8-Q15
*                      (Q4-Q7 are used as well but saved and restored)
*--------------------------------------------------------------------------
* STACK USAGE  : 68 bytes (LR plus D8-D15)
*--------------------------------------------------------------------------
* CYCLES       : none
*
*--------------------------------------------------------------------------
* NOTES        : - Returns without touching any buffer when length is 0.
*                - Every loop iteration loads 16 bytes of luma and 8 bytes
*                  from each chroma plane, even when fewer than 16 pixels
*                  remain; only the remaining pixels are stored.
*                - The names p_y, p_cr, p_cb, p_bgr and length used below
*                  are register aliases defined outside this block
*                  (presumably R0, R1, R2, R3 and R12).
*--------------------------------------------------------------------------
*/
.type yyvup2abgr8888_venum, %function
yyvup2abgr8888_venum:
    /*-------------------------------------------------------------------------
     * Prologue: save the return address and the callee-saved NEON registers
     * ------------------------------------------------------------------------ */
    STMFD       SP!, {LR}

    VPUSH       {D8-D15}

    PLD         [R0]                    @ preload hint for the start of the luma line

    ADR         R12, constants

    VLD1.S16    {D6, D7}, [R12]!        @ D6, D7: 359 | -88 | -183 | 454 | 256 | 0 | 255 | 0
    VLD1.S32    {D30, D31}, [R12]       @ Q15   : -45824 | 34816 | -57984 | X

    /*-------------------------------------------------------------------------
     * Load the 5th parameter via stack
     * R0 ~ R3 are used to pass the first 4 parameters, the 5th and above
     * parameters are passed via stack
     * ------------------------------------------------------------------------ */
    LDR         R12, [SP, #68]          @ LR was pushed (4 bytes) and D8-D15 were
                                        @ pushed (8 registers * 8 bytes = 64 bytes),
                                        @ so the first stack argument now sits at
                                        @ SP + 64 + 4 = SP + 68

    /*-------------------------------------------------------------------------
     * Nothing to convert for an empty line. Without this check the trailing
     * path below would store 7 pixels, because 0 - 8 + 7 never reaches zero.
     * ------------------------------------------------------------------------ */
    CMP         R12, #0
    BEQ         end_yyvup2abgr

    /*-------------------------------------------------------------------------
     * Load clamping parameters to duplicate vector elements
     * ------------------------------------------------------------------------ */
    VDUP.S16    Q4, D7[1]               @ Q4: 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0
    VDUP.S16    Q5, D7[2]               @ Q5: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255

    /*-------------------------------------------------------------------------
     * Read bias. Q15 is free to be reused inside the loop afterwards.
     * ------------------------------------------------------------------------ */
    VDUP.S32    Q0, D30[0]              @ Q0: -45824 | -45824 | -45824 | -45824
    VDUP.S32    Q1, D30[1]              @ Q1:  34816 |  34816 |  34816 |  34816
    VDUP.S32    Q2, D31[0]              @ Q2: -57984 | -57984 | -57984 | -57984


    /*-------------------------------------------------------------------------
     * The main loop: up to 16 pixels per iteration, in two halves of 8
     * ------------------------------------------------------------------------ */
loop_yyvup2abgr:

    /*-------------------------------------------------------------------------
     * Load input from Y, V and U
     * D12, D13: Y0 Y2 Y4 Y6 Y8 Y10 Y12 Y14, Y1 Y3 Y5 Y7 Y9 Y11 Y13 Y15
     * D14     : V0 V1 V2 V3 V4 V5 V6 V7
     * D15     : U0 U1 U2 U3 U4 U5 U6 U7
     * ------------------------------------------------------------------------ */
    VLD2.U8     {D12,D13}, [p_y]!       @ Load 16 Luma elements (uint8), de-interleaved
                                        @ into even (D12) and odd (D13) samples
    VLD1.U8     {D14}, [p_cr]!          @ Load 8 Cr elements (uint8) to D14
    VLD1.U8     {D15}, [p_cb]!          @ Load 8 Cb elements (uint8) to D15

    /*-------------------------------------------------------------------------
     * Expand uint8 value to uint16
     * D24, D25: Y0 Y2 Y4 Y6 Y8 Y10 Y12 Y14
     * D26, D27: Y1 Y3 Y5 Y7 Y9 Y11 Y13 Y15
     * D28, D29: V0 V1 V2 V3 V4 V5 V6 V7
     * D30, D31: U0 U1 U2 U3 U4 U5 U6 U7
     * ------------------------------------------------------------------------ */
    VMOVL.U8    Q12, D12
    VMOVL.U8    Q13, D13
    VMOVL.U8    Q14, D14
    VMOVL.U8    Q15, D15

    /*-------------------------------------------------------------------------
     * First 8 pixels (Y0..Y7 with V0..V3 and U0..U3).
     * Multiply contribution from chrominance, results are in 32-bit
     * ------------------------------------------------------------------------ */
    VMULL.S16   Q6, D28, D6[0]          @ Q6:  359*(V0,V1,V2,V3)                      Red
    VMULL.S16   Q7, D30, D6[1]          @ Q7:  -88*(U0,U1,U2,U3)                      Green
    VMLAL.S16   Q7, D28, D6[2]          @ Q7:  -88*(U0,U1,U2,U3) - 183*(V0,V1,V2,V3)
    VMULL.S16   Q8, D30, D6[3]          @ Q8:  454*(U0,U1,U2,U3)                      Blue

    /*-------------------------------------------------------------------------
     * Add bias
     * ------------------------------------------------------------------------ */
    VADD.S32    Q6, Q0                  @ Q6 add Red   bias -45824
    VADD.S32    Q7, Q1                  @ Q7 add Green bias  34816
    VADD.S32    Q8, Q2                  @ Q8 add Blue  bias -57984

    /*-------------------------------------------------------------------------
     * Calculate Red, Green, Blue. The chroma term is copied so that the even
     * and the odd luma sample of each pair share it; then 256 * Y is added.
     * ------------------------------------------------------------------------ */
    VMOV.S32    Q9, Q6
    VMLAL.S16   Q6, D24, D7[0]          @ Q6:  R0, R2, R4, R6 in 32-bit Q8 format
    VMLAL.S16   Q9, D26, D7[0]          @ Q9:  R1, R3, R5, R7 in 32-bit Q8 format

    VMOV.S32    Q10, Q7
    VMLAL.S16   Q7, D24, D7[0]          @ Q7:  G0, G2, G4, G6 in 32-bit Q8 format
    VMLAL.S16   Q10, D26, D7[0]         @ Q10: G1, G3, G5, G7 in 32-bit Q8 format

    VMOV.S32    Q11, Q8
    VMLAL.S16   Q8, D24, D7[0]          @ Q8:  B0, B2, B4, B6 in 32-bit Q8 format
    VMLAL.S16   Q11, D26, D7[0]         @ Q11: B1, B3, B5, B7 in 32-bit Q8 format

    /*-------------------------------------------------------------------------
     * Right shift eight bits and narrow to 16-bit, then interleave the even
     * and odd results back into pixel order. VSHRN itself truncates; going by
     * the values listed for the constants table, each bias already contains
     * +128 (0.5 in Q8), which is what rounds the result.
     * ------------------------------------------------------------------------ */
    VSHRN.S32   D12, Q6, #8             @ D12: R0 R2 R4 R6 in 16-bit Q0 format
    VSHRN.S32   D13, Q9, #8             @ D13: R1 R3 R5 R7 in 16-bit Q0 format
    VZIP.16     D12, D13                @ Q6 : R0 R1 R2 R3 R4 R5 R6 R7

    VSHRN.S32   D18, Q7, #8             @ D18: G0 G2 G4 G6 in 16-bit Q0 format
    VSHRN.S32   D19, Q10, #8            @ D19: G1 G3 G5 G7 in 16-bit Q0 format
    VZIP.16     D18, D19                @ Q9 : G0 G1 G2 G3 G4 G5 G6 G7

    VSHRN.S32   D20, Q8, #8             @ D20: B0 B2 B4 B6 in 16-bit Q0 format
    VSHRN.S32   D21, Q11, #8            @ D21: B1 B3 B5 B7 in 16-bit Q0 format
    VZIP.16     D20, D21                @ Q10: B0 B1 B2 B3 B4 B5 B6 B7

    /*-------------------------------------------------------------------------
     * Clamp the value to be within [0~255]
     * ------------------------------------------------------------------------ */
    VMAX.S16    Q10, Q10, Q4            @ if Q10 < 0, Q10 = 0
    VMIN.S16    Q10, Q10, Q5            @ if Q10 > 255, Q10 = 255
    VQMOVUN.S16 D23, Q10                @ store Blue to D23, narrow the value from int16 to uint8

    VMAX.S16    Q9, Q9, Q4              @ if Q9 < 0, Q9 = 0
    VMIN.S16    Q9, Q9, Q5              @ if Q9 > 255, Q9 = 255
    VQMOVUN.S16 D22, Q9                 @ store Green to D22, narrow the value from int16 to uint8

    VMAX.S16    Q6, Q6, Q4              @ if Q6 < 0, Q6 = 0
    VMIN.S16    Q6, Q6, Q5              @ if Q6 > 255, Q6 = 255
    VQMOVUN.S16 D21, Q6                 @ store Red to D21, narrow the value from int16 to uint8

    /*-------------------------------------------------------------------------
     * Alpha channel: constant 0xFF, stored as the 4th byte of every pixel.
     * D24 (Y0 Y2 Y4 Y6) is overwritten here; the second half only needs
     * D25 and D27.
     * ------------------------------------------------------------------------ */
    VMOVN.I16   D24, Q5                 @ D24: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255

    SUBS        length, length, #8      @ check if the length is less than 8

    BMI         trailing_yyvup2abgr     @ jump to trailing processing if remaining length is less than 8

    VST4.U8     {D21,D22,D23,D24}, [p_bgr]!  @ interleaved store of 8 pixels; per pixel the
                                             @ byte order in memory is Red, Green, Blue, 0xFF
                                             @ (Red at the lowest address)

    BEQ         end_yyvup2abgr          @ done if exactly 8 pixel processed in the loop


    /*-------------------------------------------------------------------------
     * Done with the first 8 elements, continue on the next 8 elements
     * (Y8..Y15 with V4..V7 and U4..U7)
     * ------------------------------------------------------------------------ */

    /*-------------------------------------------------------------------------
     * Multiply contribution from chrominance, results are in 32-bit
     * ------------------------------------------------------------------------ */
    VMULL.S16   Q6, D29, D6[0]          @ Q6:  359*(V4,V5,V6,V7)                      Red
    VMULL.S16   Q7, D31, D6[1]          @ Q7:  -88*(U4,U5,U6,U7)                      Green
    VMLAL.S16   Q7, D29, D6[2]          @ Q7:  -88*(U4,U5,U6,U7) - 183*(V4,V5,V6,V7)
    VMULL.S16   Q8, D31, D6[3]          @ Q8:  454*(U4,U5,U6,U7)                      Blue

    /*-------------------------------------------------------------------------
     * Add bias
     * ------------------------------------------------------------------------ */
    VADD.S32    Q6, Q0                  @ Q6 add Red   bias -45824
    VADD.S32    Q7, Q1                  @ Q7 add Green bias  34816
    VADD.S32    Q8, Q2                  @ Q8 add Blue  bias -57984

    /*-------------------------------------------------------------------------
     * Calculate Red, Green, Blue
     * ------------------------------------------------------------------------ */
    VMOV.S32    Q9, Q6
    VMLAL.S16   Q6, D25, D7[0]          @ Q6:  R8 R10 R12 R14 in 32-bit Q8 format
    VMLAL.S16   Q9, D27, D7[0]          @ Q9:  R9 R11 R13 R15 in 32-bit Q8 format

    VMOV.S32    Q10, Q7
    VMLAL.S16   Q7, D25, D7[0]          @ Q7:  G8 G10 G12 G14 in 32-bit Q8 format
    VMLAL.S16   Q10, D27, D7[0]         @ Q10: G9 G11 G13 G15 in 32-bit Q8 format

    VMOV.S32    Q11, Q8
    VMLAL.S16   Q8, D25, D7[0]          @ Q8:  B8 B10 B12 B14 in 32-bit Q8 format
    VMLAL.S16   Q11, D27, D7[0]         @ Q11: B9 B11 B13 B15 in 32-bit Q8 format

    /*-------------------------------------------------------------------------
     * Right shift eight bits, narrow to 16-bit and interleave (as above)
     * ------------------------------------------------------------------------ */
    VSHRN.S32   D12, Q6, #8             @ D12: R8 R10 R12 R14 in 16-bit Q0 format
    VSHRN.S32   D13, Q9, #8             @ D13: R9 R11 R13 R15 in 16-bit Q0 format
    VZIP.16     D12, D13                @ Q6 : R8 R9 R10 R11 R12 R13 R14 R15

    VSHRN.S32   D18, Q7, #8             @ D18: G8 G10 G12 G14 in 16-bit Q0 format
    VSHRN.S32   D19, Q10, #8            @ D19: G9 G11 G13 G15 in 16-bit Q0 format
    VZIP.16     D18, D19                @ Q9 : G8 G9 G10 G11 G12 G13 G14 G15

    VSHRN.S32   D20, Q8, #8             @ D20: B8 B10 B12 B14 in 16-bit Q0 format
    VSHRN.S32   D21, Q11, #8            @ D21: B9 B11 B13 B15 in 16-bit Q0 format
    VZIP.16     D20, D21                @ Q10: B8 B9 B10 B11 B12 B13 B14 B15

    /*-------------------------------------------------------------------------
     * Clamp the value to be within [0~255]
     * ------------------------------------------------------------------------ */
    VMAX.S16    Q10, Q10, Q4            @ if Q10 < 0, Q10 = 0
    VMIN.S16    Q10, Q10, Q5            @ if Q10 > 255, Q10 = 255
    VQMOVUN.S16 D23, Q10                @ store Blue to D23, narrow the value from int16 to uint8

    VMAX.S16    Q9, Q9, Q4              @ if Q9 < 0, Q9 = 0
    VMIN.S16    Q9, Q9, Q5              @ if Q9 > 255, Q9 = 255
    VQMOVUN.S16 D22, Q9                 @ store Green to D22, narrow the value from int16 to uint8

    VMAX.S16    Q6, Q6, Q4              @ if Q6 < 0, Q6 = 0
    VMIN.S16    Q6, Q6, Q5              @ if Q6 > 255, Q6 = 255
    VQMOVUN.S16 D21, Q6                 @ store Red to D21, narrow the value from int16 to uint8

    /*-------------------------------------------------------------------------
     * Alpha channel: constant 0xFF, stored as the 4th byte of every pixel
     * ------------------------------------------------------------------------ */
    VMOVN.I16   D24, Q5                 @ D24: 255 | 255 | 255 | 255 | 255 | 255 | 255 | 255

    SUBS        length, length, #8      @ check if the length is less than 8

    BMI         trailing_yyvup2abgr     @ jump to trailing processing if remaining length is less than 8

    VST4.U8     {D21,D22,D23,D24}, [p_bgr]!  @ interleaved store of 8 pixels; per pixel the
                                             @ byte order in memory is Red, Green, Blue, 0xFF
                                             @ (Red at the lowest address)

    BHI         loop_yyvup2abgr         @ loop if more than 8 pixels left

    BEQ         end_yyvup2abgr          @ done if exactly 8 pixel processed in the loop


trailing_yyvup2abgr:
    /*-------------------------------------------------------------------------
     * There are from 1 ~ 7 pixels left in the trailing part.
     * First adding 7 to the length so the length would be from 0 ~ 6.
     * eg: 1 pixel left in the trailing part, so 1-8+7 = 0.
     * Then save 1 pixel unconditionally since at least 1 pixels left in the
     * trailing part. The NEON stores do not change the flags, so each BEQ
     * tests the result of the preceding ADDS or SUBS.
     * ------------------------------------------------------------------------ */
    ADDS        length, length, #7      @ there are 7 or less in the trailing part

    VST4.U8     {D21[0],D22[0],D23[0],D24[0]}, [p_bgr]!  @ at least 1 pixel left in the trailing part
    BEQ         end_yyvup2abgr          @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D21[1],D22[1],D23[1],D24[1]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yyvup2abgr          @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D21[2],D22[2],D23[2],D24[2]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yyvup2abgr          @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D21[3],D22[3],D23[3],D24[3]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yyvup2abgr          @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D21[4],D22[4],D23[4],D24[4]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yyvup2abgr          @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D21[5],D22[5],D23[5],D24[5]}, [p_bgr]!  @ store one more pixel
    BEQ         end_yyvup2abgr          @ done if 0 pixel left

    SUBS        length, length, #1      @ update length counter
    VST4.U8     {D21[6],D22[6],D23[6],D24[6]}, [p_bgr]!  @ store the 7th and last trailing pixel

end_yyvup2abgr:
    VPOP        {D8-D15}                @ restore the callee-saved NEON registers
    LDMFD       SP!, {PC}               @ return by popping the saved LR into PC
                                        @ end of yyvup2abgr
1221
1222.end
1223