Blame - libpixelflinger/codeflinger/texturing.cpp - android_bootable_recovery

blob: e6997bd4be903c3457dea92998ec638a78964a66 [file] [log] [blame]

bigbiff	673c7ae	2020-12-02 19:44:56 -0500	[diff] [blame]	1	/* libs/pixelflinger/codeflinger/texturing.cpp
				2	**
				3	** Copyright 2006, The Android Open Source Project
				4	**
				5	** Licensed under the Apache License, Version 2.0 (the "License");
				6	** you may not use this file except in compliance with the License.
				7	** You may obtain a copy of the License at
				8	**
				9	** http://www.apache.org/licenses/LICENSE-2.0
				10	**
				11	** Unless required by applicable law or agreed to in writing, software
				12	** distributed under the License is distributed on an "AS IS" BASIS,
				13	** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	** See the License for the specific language governing permissions and
				15	** limitations under the License.
				16	*/
				17
				18	#define LOG_TAG "pixelflinger-code"
				19
				20	#include <assert.h>
				21	#include <stdint.h>
				22	#include <stdio.h>
				23	#include <stdlib.h>
				24	#include <sys/types.h>
				25
				26	#include <log/log.h>
				27
				28	#include "GGLAssembler.h"
				29
				30	namespace android {
				31
				32	// ---------------------------------------------------------------------------
				33
				34	// iterators are initialized like this:
				35	// (intToFixedCenter(x) * dx)>>16 + x0
				36	// ((x<<16 + 0x8000) * dx)>>16 + x0
				37	// ((x<<16)*dx + (0x8000*dx))>>16 + x0
				38	// ( (x*dx) + dx>>1 ) + x0
				39	// (x*dx) + (dx>>1 + x0)
				40
				41	void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
				42	{
				43	context_t const* c = mBuilderContext.c;
				44
				45	if (mSmooth) {
				46	// NOTE: we could take this case in the mDithering + !mSmooth case,
				47	// but this would use up to 4 more registers for the color components
				48	// for only a little added quality.
				49	// Currently, this causes the system to run out of registers in
				50	// some case (see issue #719496)
				51
				52	comment("compute initial iterated color (smooth and/or dither case)");
				53
				54	parts.iterated_packed = 0;
				55	parts.packed = 0;
				56
				57	// 0x1: color component
				58	// 0x2: iterators
				59	const int optReload = mOptLevel >> 1;
				60	if (optReload >= 3) parts.reload = 0; // reload nothing
				61	else if (optReload == 2) parts.reload = 2; // reload iterators
				62	else if (optReload == 1) parts.reload = 1; // reload colors
				63	else if (optReload <= 0) parts.reload = 3; // reload both
				64
				65	if (!mSmooth) {
				66	// we're not smoothing (just dithering), we never have to
				67	// reload the iterators
				68	parts.reload &= ~2;
				69	}
				70
				71	Scratch scratches(registerFile());
				72	const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
				73	const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
				74	for (int i=0 ; i<4 ; i++) {
				75	if (!mInfo[i].iterated)
				76	continue;
				77
				78	// this component exists in the destination and is not replaced
				79	// by a texture unit.
				80	const int c = (parts.reload & 1) ? t0 : obtainReg();
				81	if (i==0) CONTEXT_LOAD(c, iterators.ydady);
				82	if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
				83	if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
				84	if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
				85	parts.argb[i].reg = c;
				86
				87	if (mInfo[i].smooth) {
				88	parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
				89	const int dvdx = parts.argb_dx[i].reg;
				90	CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
				91	MLA(AL, 0, c, x.reg, dvdx, c);
				92
				93	// adjust the color iterator to make sure it won't overflow
				94	if (!mAA) {
				95	// this is not needed when we're using anti-aliasing
				96	// because we will (have to) clamp the components
				97	// anyway.
				98	int end = scratches.obtain();
				99	MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
				100	MLA(AL, 1, end, dvdx, end, c);
				101	SUB(MI, 0, c, c, end);
				102	BIC(AL, 0, c, c, reg_imm(c, ASR, 31));
				103	scratches.recycle(end);
				104	}
				105	}
				106
				107	if (parts.reload & 1) {
				108	CONTEXT_STORE(c, generated_vars.argb[i].c);
				109	}
				110	}
				111	} else {
				112	// We're not smoothed, so we can
				113	// just use a packed version of the color and extract the
				114	// components as needed (or not at all if we don't blend)
				115
				116	// figure out if we need the iterated color
				117	int load = 0;
				118	for (int i=0 ; i<4 ; i++) {
				119	component_info_t& info = mInfo[i];
				120	if ((info.inDest \|\| info.needed) && !info.replaced)
				121	load \|= 1;
				122	}
				123
				124	parts.iterated_packed = 1;
				125	parts.packed = (!mTextureMachine.mask && !mBlending
				126	&& !mFog && !mDithering);
				127	parts.reload = 0;
				128	if (load \|\| parts.packed) {
				129	if (mBlending \|\| mDithering \|\| mInfo[GGLFormat::ALPHA].needed) {
				130	comment("load initial iterated color (8888 packed)");
				131	parts.iterated.setTo(obtainReg(),
				132	&(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
				133	CONTEXT_LOAD(parts.iterated.reg, packed8888);
				134	} else {
				135	comment("load initial iterated color (dest format packed)");
				136
				137	parts.iterated.setTo(obtainReg(), &mCbFormat);
				138
				139	// pre-mask the iterated color
				140	const int bits = parts.iterated.size();
				141	const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
				142	uint32_t mask = 0;
				143	if (mMasking) {
				144	for (int i=0 ; i<4 ; i++) {
				145	const int component_mask = 1<<i;
				146	const int h = parts.iterated.format.c[i].h;
				147	const int l = parts.iterated.format.c[i].l;
				148	if (h && (!(mMasking & component_mask))) {
				149	mask \|= ((1<<(h-l))-1) << l;
				150	}
				151	}
				152	}
				153
				154	if (mMasking && ((mask & size)==0)) {
				155	// none of the components are present in the mask
				156	} else {
				157	CONTEXT_LOAD(parts.iterated.reg, packed);
				158	if (mCbFormat.size == 1) {
				159	AND(AL, 0, parts.iterated.reg,
				160	parts.iterated.reg, imm(0xFF));
				161	} else if (mCbFormat.size == 2) {
				162	MOV(AL, 0, parts.iterated.reg,
				163	reg_imm(parts.iterated.reg, LSR, 16));
				164	}
				165	}
				166
				167	// pre-mask the iterated color
				168	if (mMasking) {
				169	build_and_immediate(parts.iterated.reg, parts.iterated.reg,
				170	mask, bits);
				171	}
				172	}
				173	}
				174	}
				175	}
				176
				177	void GGLAssembler::build_iterated_color(
				178	component_t& fragment,
				179	const fragment_parts_t& parts,
				180	int component,
				181	Scratch& regs)
				182	{
				183	fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE);
				184
				185	if (!mInfo[component].iterated)
				186	return;
				187
				188	if (parts.iterated_packed) {
				189	// iterated colors are packed, extract the one we need
				190	extract(fragment, parts.iterated, component);
				191	} else {
				192	fragment.h = GGL_COLOR_BITS;
				193	fragment.l = GGL_COLOR_BITS - 8;
				194	fragment.flags \|= CLEAR_LO;
				195	// iterated colors are held in their own register,
				196	// (smooth and/or dithering case)
				197	if (parts.reload==3) {
				198	// this implies mSmooth
				199	Scratch scratches(registerFile());
				200	int dx = scratches.obtain();
				201	CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
				202	CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
				203	ADD(AL, 0, dx, fragment.reg, dx);
				204	CONTEXT_STORE(dx, generated_vars.argb[component].c);
				205	} else if (parts.reload & 1) {
				206	CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
				207	} else {
				208	// we don't reload, so simply rename the register and mark as
				209	// non CORRUPTIBLE so that the texture env or blending code
				210	// won't modify this (renamed) register
				211	regs.recycle(fragment.reg);
				212	fragment.reg = parts.argb[component].reg;
				213	fragment.flags &= ~CORRUPTIBLE;
				214	}
				215	if (mInfo[component].smooth && mAA) {
				216	// when using smooth shading AND anti-aliasing, we need to clamp
				217	// the iterators because there is always an extra pixel on the
				218	// edges, which most of the time will cause an overflow
				219	// (since technically its outside of the domain).
				220	BIC(AL, 0, fragment.reg, fragment.reg,
				221	reg_imm(fragment.reg, ASR, 31));
				222	component_sat(fragment);
				223	}
				224	}
				225	}
				226
				227	// ---------------------------------------------------------------------------
				228
				229	void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
				230	{
				231	// gather some informations about the components we need to process...
				232	const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) \| GGL_CLEAR;
				233	switch(opcode) {
				234	case GGL_COPY:
				235	mLogicOp = 0;
				236	break;
				237	case GGL_CLEAR:
				238	case GGL_SET:
				239	mLogicOp = LOGIC_OP;
				240	break;
				241	case GGL_AND:
				242	case GGL_AND_REVERSE:
				243	case GGL_AND_INVERTED:
				244	case GGL_XOR:
				245	case GGL_OR:
				246	case GGL_NOR:
				247	case GGL_EQUIV:
				248	case GGL_OR_REVERSE:
				249	case GGL_OR_INVERTED:
				250	case GGL_NAND:
				251	mLogicOp = LOGIC_OP\|LOGIC_OP_SRC\|LOGIC_OP_DST;
				252	break;
				253	case GGL_NOOP:
				254	case GGL_INVERT:
				255	mLogicOp = LOGIC_OP\|LOGIC_OP_DST;
				256	break;
				257	case GGL_COPY_INVERTED:
				258	mLogicOp = LOGIC_OP\|LOGIC_OP_SRC;
				259	break;
				260	};
				261	}
				262
				263	void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
				264	{
				265	uint8_t replaced=0;
				266	mTextureMachine.mask = 0;
				267	mTextureMachine.activeUnits = 0;
				268	for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
				269	texture_unit_t& tmu = mTextureMachine.tmu[i];
				270	if (replaced == 0xF) {
				271	// all components are replaced, skip this TMU.
				272	tmu.format_idx = 0;
				273	tmu.mask = 0;
				274	tmu.replaced = replaced;
				275	continue;
				276	}
				277	tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
				278	tmu.format = c->formats[tmu.format_idx];
				279	tmu.bits = tmu.format.size*8;
				280	tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
				281	tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
				282	tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
				283	tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
				284	tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
				285	&& tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
				286
				287	// 5551 linear filtering is not supported
				288	if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
				289	tmu.linear = 0;
				290
				291	tmu.mask = 0;
				292	tmu.replaced = replaced;
				293
				294	if (tmu.format_idx) {
				295	mTextureMachine.activeUnits++;
				296	if (tmu.format.c[0].h) tmu.mask \|= 0x1;
				297	if (tmu.format.c[1].h) tmu.mask \|= 0x2;
				298	if (tmu.format.c[2].h) tmu.mask \|= 0x4;
				299	if (tmu.format.c[3].h) tmu.mask \|= 0x8;
				300	if (tmu.env == GGL_REPLACE) {
				301	replaced \|= tmu.mask;
				302	} else if (tmu.env == GGL_DECAL) {
				303	if (!tmu.format.c[GGLFormat::ALPHA].h) {
				304	// if we don't have alpha, decal does nothing
				305	tmu.mask = 0;
				306	} else {
				307	// decal always ignores At
				308	tmu.mask &= ~(1<<GGLFormat::ALPHA);
				309	}
				310	}
				311	}
				312	mTextureMachine.mask \|= tmu.mask;
				313	//printf("%d: mask=%08lx, replaced=%08lx\n",
				314	// i, int(tmu.mask), int(tmu.replaced));
				315	}
				316	mTextureMachine.replaced = replaced;
				317	mTextureMachine.directTexture = 0;
				318	//printf("replaced=%08lx\n", mTextureMachine.replaced);
				319	}
				320
				321
				322	void GGLAssembler::init_textures(
				323	tex_coord_t* coords,
				324	const reg_t& x, const reg_t& y)
				325	{
				326	const needs_t& needs = mBuilderContext.needs;
				327	int Rx = x.reg;
				328	int Ry = y.reg;
				329
				330	if (mTextureMachine.mask) {
				331	comment("compute texture coordinates");
				332	}
				333
				334	// init texture coordinates for each tmu
				335	const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
				336	const bool multiTexture = mTextureMachine.activeUnits > 1;
				337	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
				338	const texture_unit_t& tmu = mTextureMachine.tmu[i];
				339	if (tmu.format_idx == 0)
				340	continue;
				341	if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
				342	(tmu.twrap == GGL_NEEDS_WRAP_11))
				343	{
				344	// 1:1 texture
				345	pointer_t& txPtr = coords[i].ptr;
				346	txPtr.setTo(obtainReg(), tmu.bits);
				347	CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
				348	ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16)); // x += (s>>16)
				349	CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
				350	ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16)); // y += (t>>16)
				351	// merge base & offset
				352	CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
				353	SMLABB(AL, Rx, Ry, txPtr.reg, Rx); // x+y*stride
				354	CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
				355	base_offset(txPtr, txPtr, Rx);
				356	} else {
				357	Scratch scratches(registerFile());
				358	reg_t& s = coords[i].s;
				359	reg_t& t = coords[i].t;
				360	// s = (x * dsdx)>>16 + ydsdy
				361	// s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
				362	// t = (x * dtdx)>>16 + ydtdy
				363	// t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
				364	s.setTo(obtainReg());
				365	t.setTo(obtainReg());
				366	const int need_w = GGL_READ_NEEDS(W, needs.n);
				367	if (need_w) {
				368	CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
				369	CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
				370	} else {
				371	int ydsdy = scratches.obtain();
				372	int ydtdy = scratches.obtain();
				373	CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
				374	CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
				375	CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
				376	CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
				377	MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
				378	MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
				379	}
				380
				381	if ((mOptLevel&1)==0) {
				382	CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
				383	CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
				384	recycleReg(s.reg);
				385	recycleReg(t.reg);
				386	}
				387	}
				388
				389	// direct texture?
				390	if (!multiTexture && !mBlending && !mDithering && !mFog &&
				391	cb_format_idx == tmu.format_idx && !tmu.linear &&
				392	mTextureMachine.replaced == tmu.mask)
				393	{
				394	mTextureMachine.directTexture = i + 1;
				395	}
				396	}
				397	}
				398
				399	void GGLAssembler::build_textures( fragment_parts_t& parts,
				400	Scratch& regs)
				401	{
				402	// We don't have a way to spill registers automatically
				403	// spill depth and AA regs, when we know we may have to.
				404	// build the spill list...
				405	uint32_t spill_list = 0;
				406	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
				407	const texture_unit_t& tmu = mTextureMachine.tmu[i];
				408	if (tmu.format_idx == 0)
				409	continue;
				410	if (tmu.linear) {
				411	// we may run out of register if we have linear filtering
				412	// at 1 or 4 bytes / pixel on any texture unit.
				413	if (tmu.format.size == 1) {
				414	// if depth and AA enabled, we'll run out of 1 register
				415	if (parts.z.reg > 0 && parts.covPtr.reg > 0)
				416	spill_list \|= 1<<parts.covPtr.reg;
				417	}
				418	if (tmu.format.size == 4) {
				419	// if depth or AA enabled, we'll run out of 1 or 2 registers
				420	if (parts.z.reg > 0)
				421	spill_list \|= 1<<parts.z.reg;
				422	if (parts.covPtr.reg > 0)
				423	spill_list \|= 1<<parts.covPtr.reg;
				424	}
				425	}
				426	}
				427
				428	Spill spill(registerFile(), *this, spill_list);
				429
				430	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
				431	const texture_unit_t& tmu = mTextureMachine.tmu[i];
				432	if (tmu.format_idx == 0)
				433	continue;
				434
				435	pointer_t& txPtr = parts.coords[i].ptr;
				436	pixel_t& texel = parts.texel[i];
				437
				438	// repeat...
				439	if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
				440	(tmu.twrap == GGL_NEEDS_WRAP_11))
				441	{ // 1:1 textures
				442	comment("fetch texel");
				443	texel.setTo(regs.obtain(), &tmu.format);
				444	load(txPtr, texel, WRITE_BACK);
				445	} else {
				446	Scratch scratches(registerFile());
				447	reg_t& s = parts.coords[i].s;
				448	reg_t& t = parts.coords[i].t;
				449	if ((mOptLevel&1)==0) {
				450	comment("reload s/t (multitexture or linear filtering)");
				451	s.reg = scratches.obtain();
				452	t.reg = scratches.obtain();
				453	CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
				454	CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
				455	}
				456
				457	if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
				458	return;
				459
				460	comment("compute repeat/clamp");
				461	int u = scratches.obtain();
				462	int v = scratches.obtain();
				463	int width = scratches.obtain();
				464	int height = scratches.obtain();
				465	int U = 0;
				466	int V = 0;
				467
				468	if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
				469	return;
				470
				471	CONTEXT_LOAD(width, generated_vars.texture[i].width);
				472	CONTEXT_LOAD(height, generated_vars.texture[i].height);
				473
				474	int FRAC_BITS = 0;
				475	if (tmu.linear) {
				476	// linear interpolation
				477	if (tmu.format.size == 1) {
				478	// for 8-bits textures, we can afford
				479	// 7 bits of fractional precision at no
				480	// additional cost (we can't do 8 bits
				481	// because filter8 uses signed 16 bits muls)
				482	FRAC_BITS = 7;
				483	} else if (tmu.format.size == 2) {
				484	// filter16() is internally limited to 4 bits, so:
				485	// FRAC_BITS=2 generates less instructions,
				486	// FRAC_BITS=3,4,5 creates unpleasant artifacts,
				487	// FRAC_BITS=6+ looks good
				488	FRAC_BITS = 6;
				489	} else if (tmu.format.size == 4) {
				490	// filter32() is internally limited to 8 bits, so:
				491	// FRAC_BITS=4 looks good
				492	// FRAC_BITS=5+ looks better, but generates 3 extra ipp
				493	FRAC_BITS = 6;
				494	} else {
				495	// for all other cases we use 4 bits.
				496	FRAC_BITS = 4;
				497	}
				498	}
				499	wrapping(u, s.reg, width, tmu.swrap, FRAC_BITS);
				500	wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
				501
				502	if (tmu.linear) {
				503	comment("compute linear filtering offsets");
				504	// pixel size scale
				505	const int shift = 31 - gglClz(tmu.format.size);
				506	U = scratches.obtain();
				507	V = scratches.obtain();
				508
				509	if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
				510	return;
				511
				512	// sample the texel center
				513	SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
				514	SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
				515
				516	// get the fractionnal part of U,V
				517	AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
				518	AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
				519
				520	// compute width-1 and height-1
				521	SUB(AL, 0, width, width, imm(1));
				522	SUB(AL, 0, height, height, imm(1));
				523
				524	// get the integer part of U,V and clamp/wrap
				525	// and compute offset to the next texel
				526	if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
				527	// u has already been REPEATed
				528	MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
				529	MOV(MI, 0, u, width);
				530	CMP(AL, u, width);
				531	MOV(LT, 0, width, imm(1 << shift));
				532	if (shift)
				533	MOV(GE, 0, width, reg_imm(width, LSL, shift));
				534	RSB(GE, 0, width, width, imm(0));
				535	} else {
				536	// u has not been CLAMPed yet
				537	// algorithm:
				538	// if ((u>>4) >= width)
				539	// u = width<<4
				540	// width = 0
				541	// else
				542	// width = 1<<shift
				543	// u = u>>4; // get integer part
				544	// if (u<0)
				545	// u = 0
				546	// width = 0
				547	// generated_vars.rt = width
				548
				549	CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
				550	MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
				551	MOV(LE, 0, width, imm(0));
				552	MOV(GT, 0, width, imm(1 << shift));
				553	MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
				554	MOV(MI, 0, u, imm(0));
				555	MOV(MI, 0, width, imm(0));
				556	}
				557	CONTEXT_STORE(width, generated_vars.rt);
				558
				559	const int stride = width;
				560	CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
				561	if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
				562	// v has already been REPEATed
				563	MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
				564	MOV(MI, 0, v, height);
				565	CMP(AL, v, height);
				566	MOV(LT, 0, height, imm(1 << shift));
				567	if (shift)
				568	MOV(GE, 0, height, reg_imm(height, LSL, shift));
				569	RSB(GE, 0, height, height, imm(0));
				570	MUL(AL, 0, height, stride, height);
				571	} else {
				572	// v has not been CLAMPed yet
				573	CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
				574	MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
				575	MOV(LE, 0, height, imm(0));
				576	if (shift) {
				577	MOV(GT, 0, height, reg_imm(stride, LSL, shift));
				578	} else {
				579	MOV(GT, 0, height, stride);
				580	}
				581	MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
				582	MOV(MI, 0, v, imm(0));
				583	MOV(MI, 0, height, imm(0));
				584	}
				585	CONTEXT_STORE(height, generated_vars.lb);
				586	}
				587
				588	scratches.recycle(width);
				589	scratches.recycle(height);
				590
				591	// iterate texture coordinates...
				592	comment("iterate s,t");
				593	int dsdx = scratches.obtain();
				594	int dtdx = scratches.obtain();
				595
				596	if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
				597	return;
				598
				599	CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
				600	CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
				601	ADD(AL, 0, s.reg, s.reg, dsdx);
				602	ADD(AL, 0, t.reg, t.reg, dtdx);
				603	if ((mOptLevel&1)==0) {
				604	CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
				605	CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
				606	scratches.recycle(s.reg);
				607	scratches.recycle(t.reg);
				608	}
				609	scratches.recycle(dsdx);
				610	scratches.recycle(dtdx);
				611
				612	// merge base & offset...
				613	comment("merge base & offset");
				614	texel.setTo(regs.obtain(), &tmu.format);
				615	txPtr.setTo(texel.reg, tmu.bits);
				616	int stride = scratches.obtain();
				617
				618	if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
				619	return;
				620
				621	CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
				622	CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
				623	SMLABB(AL, u, v, stride, u); // u+v*stride
				624	base_offset(txPtr, txPtr, u);
				625
				626	// load texel
				627	if (!tmu.linear) {
				628	comment("fetch texel");
				629	load(txPtr, texel, 0);
				630	} else {
				631	// recycle registers we don't need anymore
				632	scratches.recycle(u);
				633	scratches.recycle(v);
				634	scratches.recycle(stride);
				635
				636	comment("fetch texel, bilinear");
				637	switch (tmu.format.size) {
				638	case 1: filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
				639	case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
				640	case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
				641	case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
				642	}
				643	}
				644	}
				645	}
				646	}
				647
				648	void GGLAssembler::build_iterate_texture_coordinates(
				649	const fragment_parts_t& parts)
				650	{
				651	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
				652	const texture_unit_t& tmu = mTextureMachine.tmu[i];
				653	if (tmu.format_idx == 0)
				654	continue;
				655
				656	if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
				657	(tmu.twrap == GGL_NEEDS_WRAP_11))
				658	{ // 1:1 textures
				659	const pointer_t& txPtr = parts.coords[i].ptr;
				660	ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
				661	} else {
				662	Scratch scratches(registerFile());
				663	int s = parts.coords[i].s.reg;
				664	int t = parts.coords[i].t.reg;
				665	if ((mOptLevel&1)==0) {
				666	s = scratches.obtain();
				667	t = scratches.obtain();
				668	CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
				669	CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
				670	}
				671	int dsdx = scratches.obtain();
				672	int dtdx = scratches.obtain();
				673	CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
				674	CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
				675	ADD(AL, 0, s, s, dsdx);
				676	ADD(AL, 0, t, t, dtdx);
				677	if ((mOptLevel&1)==0) {
				678	CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
				679	CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
				680	}
				681	}
				682	}
				683	}
				684
				685	void GGLAssembler::filter8(
				686	const fragment_parts_t& /parts/,
				687	pixel_t& texel, const texture_unit_t& tmu,
				688	int U, int V, pointer_t& txPtr,
				689	int FRAC_BITS)
				690	{
				691	if (tmu.format.components != GGL_ALPHA &&
				692	tmu.format.components != GGL_LUMINANCE)
				693	{
				694	// this is a packed format, and we don't support
				695	// linear filtering (it's probably RGB 332)
				696	// Should not happen with OpenGL\|ES
				697	LDRB(AL, texel.reg, txPtr.reg);
				698	return;
				699	}
				700
				701	// ------------------------
				702	// about ~22 cycles / pixel
				703	Scratch scratches(registerFile());
				704
				705	int pixel= scratches.obtain();
				706	int d = scratches.obtain();
				707	int u = scratches.obtain();
				708	int k = scratches.obtain();
				709	int rt = scratches.obtain();
				710	int lb = scratches.obtain();
				711
				712	// RB -> U * V
				713
				714	CONTEXT_LOAD(rt, generated_vars.rt);
				715	CONTEXT_LOAD(lb, generated_vars.lb);
				716
				717	int offset = pixel;
				718	ADD(AL, 0, offset, lb, rt);
				719	LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
				720	SMULBB(AL, u, U, V);
				721	SMULBB(AL, d, pixel, u);
				722	RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
				723
				724	// LB -> (1-U) * V
				725	RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
				726	LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
				727	SMULBB(AL, u, U, V);
				728	SMLABB(AL, d, pixel, u, d);
				729	SUB(AL, 0, k, k, u);
				730
				731	// LT -> (1-U)*(1-V)
				732	RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
				733	LDRB(AL, pixel, txPtr.reg);
				734	SMULBB(AL, u, U, V);
				735	SMLABB(AL, d, pixel, u, d);
				736
				737	// RT -> U*(1-V)
				738	LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
				739	SUB(AL, 0, u, k, u);
				740	SMLABB(AL, texel.reg, pixel, u, d);
				741
				742	for (int i=0 ; i<4 ; i++) {
				743	if (!texel.format.c[i].h) continue;
				744	texel.format.c[i].h = FRAC_BITS*2+8;
				745	texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits in enough
				746	}
				747	texel.format.size = 4;
				748	texel.format.bitsPerPixel = 32;
				749	texel.flags \|= CLEAR_LO;
				750	}
				751
				752	void GGLAssembler::filter16(
				753	const fragment_parts_t& /parts/,
				754	pixel_t& texel, const texture_unit_t& tmu,
				755	int U, int V, pointer_t& txPtr,
				756	int FRAC_BITS)
				757	{
				758	// compute the mask
				759	// XXX: it would be nice if the mask below could be computed
				760	// automatically.
				761	uint32_t mask = 0;
				762	int shift = 0;
				763	int prec = 0;
				764	switch (tmu.format_idx) {
				765	case GGL_PIXEL_FORMAT_RGB_565:
				766	// source: 00000ggg.ggg00000 \| rrrrr000.000bbbbb
				767	// result: gggggggg.gggrrrrr \| rrrrr0bb.bbbbbbbb
				768	mask = 0x07E0F81F;
				769	shift = 16;
				770	prec = 5;
				771	break;
				772	case GGL_PIXEL_FORMAT_RGBA_4444:
				773	// 0000,1111,0000,1111 \| 0000,1111,0000,1111
				774	mask = 0x0F0F0F0F;
				775	shift = 12;
				776	prec = 4;
				777	break;
				778	case GGL_PIXEL_FORMAT_LA_88:
				779	// 0000,0000,1111,1111 \| 0000,0000,1111,1111
				780	// AALL -> 00AA \| 00LL
				781	mask = 0x00FF00FF;
				782	shift = 8;
				783	prec = 8;
				784	break;
				785	default:
				786	// unsupported format, do something sensical...
				787	ALOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx);
				788	LDRH(AL, texel.reg, txPtr.reg);
				789	return;
				790	}
				791
				792	const int adjust = FRAC_BITS*2 - prec;
				793	const int round = 0;
				794
				795	// update the texel format
				796	texel.format.size = 4;
				797	texel.format.bitsPerPixel = 32;
				798	texel.flags \|= CLEAR_HI\|CLEAR_LO;
				799	for (int i=0 ; i<4 ; i++) {
				800	if (!texel.format.c[i].h) continue;
				801	const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
				802	texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
				803	texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
				804	}
				805
				806	// ------------------------
				807	// about ~40 cycles / pixel
				808	Scratch scratches(registerFile());
				809
				810	int pixel= scratches.obtain();
				811	int d = scratches.obtain();
				812	int u = scratches.obtain();
				813	int k = scratches.obtain();
				814
				815	// RB -> U * V
				816	int offset = pixel;
				817	CONTEXT_LOAD(offset, generated_vars.rt);
				818	CONTEXT_LOAD(u, generated_vars.lb);
				819	ADD(AL, 0, offset, offset, u);
				820
				821	LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
				822	SMULBB(AL, u, U, V);
				823	ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
				824	build_and_immediate(pixel, pixel, mask, 32);
				825	if (adjust) {
				826	if (round)
				827	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				828	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				829	}
				830	MUL(AL, 0, d, pixel, u);
				831	RSB(AL, 0, k, u, imm(1<<prec));
				832
				833	// LB -> (1-U) * V
				834	CONTEXT_LOAD(offset, generated_vars.lb);
				835	RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
				836	LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
				837	SMULBB(AL, u, U, V);
				838	ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
				839	build_and_immediate(pixel, pixel, mask, 32);
				840	if (adjust) {
				841	if (round)
				842	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				843	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				844	}
				845	MLA(AL, 0, d, pixel, u, d);
				846	SUB(AL, 0, k, k, u);
				847
				848	// LT -> (1-U)*(1-V)
				849	RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
				850	LDRH(AL, pixel, txPtr.reg);
				851	SMULBB(AL, u, U, V);
				852	ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
				853	build_and_immediate(pixel, pixel, mask, 32);
				854	if (adjust) {
				855	if (round)
				856	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				857	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				858	}
				859	MLA(AL, 0, d, pixel, u, d);
				860
				861	// RT -> U*(1-V)
				862	CONTEXT_LOAD(offset, generated_vars.rt);
				863	LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
				864	SUB(AL, 0, u, k, u);
				865	ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
				866	build_and_immediate(pixel, pixel, mask, 32);
				867	MLA(AL, 0, texel.reg, pixel, u, d);
				868	}
				869
				870	void GGLAssembler::filter24(
				871	const fragment_parts_t& /parts/,
				872	pixel_t& texel, const texture_unit_t& /tmu/,
				873	int /U/, int /V/, pointer_t& txPtr,
				874	int /FRAC_BITS/)
				875	{
				876	// not supported yet (currently disabled)
				877	load(txPtr, texel, 0);
				878	}
				879
				880	void GGLAssembler::filter32(
				881	const fragment_parts_t& /parts/,
				882	pixel_t& texel, const texture_unit_t& /tmu/,
				883	int U, int V, pointer_t& txPtr,
				884	int FRAC_BITS)
				885	{
				886	const int adjust = FRAC_BITS*2 - 8;
				887	const int round = 0;
				888
				889	// ------------------------
				890	// about ~38 cycles / pixel
				891	Scratch scratches(registerFile());
				892
				893	int pixel= scratches.obtain();
				894	int dh = scratches.obtain();
				895	int u = scratches.obtain();
				896	int k = scratches.obtain();
				897
				898	int temp = scratches.obtain();
				899	int dl = scratches.obtain();
				900	int mask = scratches.obtain();
				901
				902	MOV(AL, 0, mask, imm(0xFF));
				903	ORR(AL, 0, mask, mask, imm(0xFF0000));
				904
				905	// RB -> U * V
				906	int offset = pixel;
				907	CONTEXT_LOAD(offset, generated_vars.rt);
				908	CONTEXT_LOAD(u, generated_vars.lb);
				909	ADD(AL, 0, offset, offset, u);
				910
				911	LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
				912	SMULBB(AL, u, U, V);
				913	AND(AL, 0, temp, mask, pixel);
				914	if (adjust) {
				915	if (round)
				916	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				917	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				918	}
				919	MUL(AL, 0, dh, temp, u);
				920	AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
				921	MUL(AL, 0, dl, temp, u);
				922	RSB(AL, 0, k, u, imm(0x100));
				923
				924	// LB -> (1-U) * V
				925	CONTEXT_LOAD(offset, generated_vars.lb);
				926	RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
				927	LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
				928	SMULBB(AL, u, U, V);
				929	AND(AL, 0, temp, mask, pixel);
				930	if (adjust) {
				931	if (round)
				932	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				933	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				934	}
				935	MLA(AL, 0, dh, temp, u, dh);
				936	AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
				937	MLA(AL, 0, dl, temp, u, dl);
				938	SUB(AL, 0, k, k, u);
				939
				940	// LT -> (1-U)*(1-V)
				941	RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
				942	LDR(AL, pixel, txPtr.reg);
				943	SMULBB(AL, u, U, V);
				944	AND(AL, 0, temp, mask, pixel);
				945	if (adjust) {
				946	if (round)
				947	ADD(AL, 0, u, u, imm(1<<(adjust-1)));
				948	MOV(AL, 0, u, reg_imm(u, LSR, adjust));
				949	}
				950	MLA(AL, 0, dh, temp, u, dh);
				951	AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
				952	MLA(AL, 0, dl, temp, u, dl);
				953
				954	// RT -> U*(1-V)
				955	CONTEXT_LOAD(offset, generated_vars.rt);
				956	LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
				957	SUB(AL, 0, u, k, u);
				958	AND(AL, 0, temp, mask, pixel);
				959	MLA(AL, 0, dh, temp, u, dh);
				960	AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
				961	MLA(AL, 0, dl, temp, u, dl);
				962
				963	AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
				964	AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
				965	ORR(AL, 0, texel.reg, dh, dl);
				966	}
				967
				968	void GGLAssembler::build_texture_environment(
				969	component_t& fragment,
				970	const fragment_parts_t& parts,
				971	int component,
				972	Scratch& regs)
				973	{
				974	const uint32_t component_mask = 1<<component;
				975	const bool multiTexture = mTextureMachine.activeUnits > 1;
				976	for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
				977	texture_unit_t& tmu = mTextureMachine.tmu[i];
				978
				979	if (tmu.mask & component_mask) {
				980	// replace or modulate with this texture
				981	if ((tmu.replaced & component_mask) == 0) {
				982	// not replaced by a later tmu...
				983
				984	Scratch scratches(registerFile());
				985	pixel_t texel(parts.texel[i]);
				986
				987	if (multiTexture &&
				988	tmu.swrap == GGL_NEEDS_WRAP_11 &&
				989	tmu.twrap == GGL_NEEDS_WRAP_11)
				990	{
				991	texel.reg = scratches.obtain();
				992	texel.flags \|= CORRUPTIBLE;
				993	comment("fetch texel (multitexture 1:1)");
				994	load(parts.coords[i].ptr, texel, WRITE_BACK);
				995	}
				996
				997	component_t incoming(fragment);
				998	modify(fragment, regs);
				999
				1000	switch (tmu.env) {
				1001	case GGL_REPLACE:
				1002	extract(fragment, texel, component);
				1003	break;
				1004	case GGL_MODULATE:
				1005	modulate(fragment, incoming, texel, component);
				1006	break;
				1007	case GGL_DECAL:
				1008	decal(fragment, incoming, texel, component);
				1009	break;
				1010	case GGL_BLEND:
				1011	blend(fragment, incoming, texel, component, i);
				1012	break;
				1013	case GGL_ADD:
				1014	add(fragment, incoming, texel, component);
				1015	break;
				1016	}
				1017	}
				1018	}
				1019	}
				1020	}
				1021
				1022	// ---------------------------------------------------------------------------
				1023
				1024	void GGLAssembler::wrapping(
				1025	int d,
				1026	int coord, int size,
				1027	int tx_wrap, int tx_linear)
				1028	{
				1029	// notes:
				1030	// if tx_linear is set, we need 4 extra bits of precision on the result
				1031	// SMULL/UMULL is 3 cycles
				1032	Scratch scratches(registerFile());
				1033	int c = coord;
				1034	if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
				1035	// UMULL takes 4 cycles (interlocked), and we can get away with
				1036	// 2 cycles using SMULWB, but we're loosing 16 bits of precision
				1037	// out of 32 (this is not a problem because the iterator keeps
				1038	// its full precision)
				1039	// UMULL(AL, 0, size, d, c, size);
				1040	// note: we can't use SMULTB because it's signed.
				1041	MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
				1042	SMULWB(AL, d, d, size);
				1043	} else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
				1044	if (tx_linear) {
				1045	// 1 cycle
				1046	MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
				1047	} else {
				1048	// 4 cycles (common case)
				1049	MOV(AL, 0, d, reg_imm(coord, ASR, 16));
				1050	BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
				1051	CMP(AL, d, size);
				1052	SUB(GE, 0, d, size, imm(1));
				1053	}
				1054	}
				1055	}
				1056
				1057	// ---------------------------------------------------------------------------
				1058
				1059	void GGLAssembler::modulate(
				1060	component_t& dest,
				1061	const component_t& incoming,
				1062	const pixel_t& incomingTexel, int component)
				1063	{
				1064	Scratch locals(registerFile());
				1065	integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
				1066	extract(texel, incomingTexel, component);
				1067
				1068	const int Nt = texel.size();
				1069	// Nt should always be less than 10 bits because it comes
				1070	// from the TMU.
				1071
				1072	int Ni = incoming.size();
				1073	// Ni could be big because it comes from previous MODULATEs
				1074
				1075	if (Nt == 1) {
				1076	// texel acts as a bit-mask
				1077	// dest = incoming & ((texel << incoming.h)-texel)
				1078	RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
				1079	AND(AL, 0, dest.reg, dest.reg, incoming.reg);
				1080	dest.l = incoming.l;
				1081	dest.h = incoming.h;
				1082	dest.flags \|= (incoming.flags & CLEAR_LO);
				1083	} else if (Ni == 1) {
				1084	MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
				1085	AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
				1086	dest.l = 0;
				1087	dest.h = Nt;
				1088	} else {
				1089	int inReg = incoming.reg;
				1090	int shift = incoming.l;
				1091	if ((Nt + Ni) > 32) {
				1092	// we will overflow, reduce the precision of Ni to 8 bits
				1093	// (Note Nt cannot be more than 10 bits which happens with
				1094	// 565 textures and GGL_LINEAR)
				1095	shift += Ni-8;
				1096	Ni = 8;
				1097	}
				1098
				1099	// modulate by the component with the lowest precision
				1100	if (Nt >= Ni) {
				1101	if (shift) {
				1102	// XXX: we should be able to avoid this shift
				1103	// when shift==16 && Nt<16 && Ni<16, in which
				1104	// we could use SMULBT below.
				1105	MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
				1106	inReg = dest.reg;
				1107	shift = 0;
				1108	}
				1109	// operation: (Cf*Ct)/((1<<Ni)-1)
				1110	// approximated with: Cf*(Ct + Ct>>(Ni-1))>>Ni
				1111	// this operation doesn't change texel's size
				1112	ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
				1113	if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
				1114	else MUL(AL, 0, dest.reg, texel.reg, dest.reg);
				1115	dest.l = Ni;
				1116	dest.h = Nt + Ni;
				1117	} else {
				1118	if (shift && (shift != 16)) {
				1119	// if shift==16, we can use 16-bits mul instructions later
				1120	MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
				1121	inReg = dest.reg;
				1122	shift = 0;
				1123	}
				1124	// operation: (Cf*Ct)/((1<<Nt)-1)
				1125	// approximated with: Ct*(Cf + Cf>>(Nt-1))>>Nt
				1126	// this operation doesn't change incoming's size
				1127	Scratch scratches(registerFile());
				1128	int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
				1129	if (t == inReg)
				1130	t = scratches.obtain();
				1131	ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
				1132	if (Nt<16 && Ni<16) {
				1133	if (shift==16) SMULBT(AL, dest.reg, t, inReg);
				1134	else SMULBB(AL, dest.reg, t, inReg);
				1135	} else MUL(AL, 0, dest.reg, t, inReg);
				1136	dest.l = Nt;
				1137	dest.h = Nt + Ni;
				1138	}
				1139
				1140	// low bits are not valid
				1141	dest.flags \|= CLEAR_LO;
				1142
				1143	// no need to keep more than 8 bits/component
				1144	if (dest.size() > 8)
				1145	dest.l = dest.h-8;
				1146	}
				1147	}
				1148
				1149	void GGLAssembler::decal(
				1150	component_t& dest,
				1151	const component_t& incoming,
				1152	const pixel_t& incomingTexel, int component)
				1153	{
				1154	// RGBA:
				1155	// Cv = Cf(1 - At) + CtAt = Cf + (Ct - Cf)*At
				1156	// Av = Af
				1157	Scratch locals(registerFile());
				1158	integer_t texel(locals.obtain(), 32, CORRUPTIBLE);
				1159	integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
				1160	extract(texel, incomingTexel, component);
				1161	extract(factor, incomingTexel, GGLFormat::ALPHA);
				1162
				1163	// no need to keep more than 8-bits for decal
				1164	int Ni = incoming.size();
				1165	int shift = incoming.l;
				1166	if (Ni > 8) {
				1167	shift += Ni-8;
				1168	Ni = 8;
				1169	}
				1170	integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
				1171	if (shift) {
				1172	MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
				1173	incomingNorm.reg = dest.reg;
				1174	incomingNorm.flags \|= CORRUPTIBLE;
				1175	}
				1176	ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
				1177	build_blendOneMinusFF(dest, factor, incomingNorm, texel);
				1178	}
				1179
				1180	void GGLAssembler::blend(
				1181	component_t& dest,
				1182	const component_t& incoming,
				1183	const pixel_t& incomingTexel, int component, int tmu)
				1184	{
				1185	// RGBA:
				1186	// Cv = (1 - Ct)Cf + CtCc = Cf + (Cc - Cf)*Ct
				1187	// Av = At*Af
				1188
				1189	if (component == GGLFormat::ALPHA) {
				1190	modulate(dest, incoming, incomingTexel, component);
				1191	return;
				1192	}
				1193
				1194	Scratch locals(registerFile());
				1195	integer_t color(locals.obtain(), 8, CORRUPTIBLE);
				1196	integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
				1197	LDRB(AL, color.reg, mBuilderContext.Rctx,
				1198	immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
				1199	extract(factor, incomingTexel, component);
				1200
				1201	// no need to keep more than 8-bits for blend
				1202	int Ni = incoming.size();
				1203	int shift = incoming.l;
				1204	if (Ni > 8) {
				1205	shift += Ni-8;
				1206	Ni = 8;
				1207	}
				1208	integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
				1209	if (shift) {
				1210	MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
				1211	incomingNorm.reg = dest.reg;
				1212	incomingNorm.flags \|= CORRUPTIBLE;
				1213	}
				1214	ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
				1215	build_blendOneMinusFF(dest, factor, incomingNorm, color);
				1216	}
				1217
				1218	void GGLAssembler::add(
				1219	component_t& dest,
				1220	const component_t& incoming,
				1221	const pixel_t& incomingTexel, int component)
				1222	{
				1223	// RGBA:
				1224	// Cv = Cf + Ct;
				1225	Scratch locals(registerFile());
				1226
				1227	component_t incomingTemp(incoming);
				1228
				1229	// use "dest" as a temporary for extracting the texel, unless "dest"
				1230	// overlaps "incoming".
				1231	integer_t texel(dest.reg, 32, CORRUPTIBLE);
				1232	if (dest.reg == incomingTemp.reg)
				1233	texel.reg = locals.obtain();
				1234	extract(texel, incomingTexel, component);
				1235
				1236	if (texel.s < incomingTemp.size()) {
				1237	expand(texel, texel, incomingTemp.size());
				1238	} else if (texel.s > incomingTemp.size()) {
				1239	if (incomingTemp.flags & CORRUPTIBLE) {
				1240	expand(incomingTemp, incomingTemp, texel.s);
				1241	} else {
				1242	incomingTemp.reg = locals.obtain();
				1243	expand(incomingTemp, incoming, texel.s);
				1244	}
				1245	}
				1246
				1247	if (incomingTemp.l) {
				1248	ADD(AL, 0, dest.reg, texel.reg,
				1249	reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
				1250	} else {
				1251	ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
				1252	}
				1253	dest.l = 0;
				1254	dest.h = texel.size();
				1255	component_sat(dest);
				1256	}
				1257
				1258	// ----------------------------------------------------------------------------
				1259
				1260	}; // namespace android
				1261