Merge tag 'android-11.0.0_r16' into android-10.0

Android 11.0.0 release 16 - TWRP bringup patch
diff --git a/libpixelflinger/codeflinger/ARMAssembler.cpp b/libpixelflinger/codeflinger/ARMAssembler.cpp
new file mode 100644
index 0000000..f47b6e4
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssembler.cpp
@@ -0,0 +1,579 @@
+/* libs/pixelflinger/codeflinger/ARMAssembler.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "ARMAssembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cutils/properties.h>
+#include <log/log.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "ARMAssembler.h"
+#include "CodeCache.h"
+#include "disassem.h"
+
+// ----------------------------------------------------------------------------
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark ARMAssembler...
+#endif
+
+ARMAssembler::ARMAssembler(const sp<Assembly>& assembly)
+    :   ARMAssemblerInterface(),
+        mAssembly(assembly)
+{
+    mBase = mPC = (uint32_t *)assembly->base();
+    mDuration = ggl_system_time();
+}
+
+ARMAssembler::~ARMAssembler()
+{
+}
+
+uint32_t* ARMAssembler::pc() const
+{
+    return mPC;
+}
+
+uint32_t* ARMAssembler::base() const
+{
+    return mBase;
+}
+
+void ARMAssembler::reset()
+{
+    mBase = mPC = (uint32_t *)mAssembly->base();
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+}
+
+int ARMAssembler::getCodegenArch()
+{
+    return CODEGEN_ARCH_ARM;
+}
+
+// ----------------------------------------------------------------------------
+
+void ARMAssembler::disassemble(const char* name)
+{
+    if (name) {
+        printf("%s:\n", name);
+    }
+    size_t count = pc()-base();
+    uint32_t* i = base();
+    while (count--) {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(i);
+        if (label >= 0) {
+            printf("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(i);
+        if (comment >= 0) {
+            printf("; %s\n", mComments.valueAt(comment));
+        }
+        printf("%08x:    %08x    ", uintptr_t(i), int(i[0]));
+        ::disassemble((uintptr_t)i);
+        i++;
+    }
+}
+
+void ARMAssembler::comment(const char* string)
+{
+    mComments.add(mPC, string);
+}
+
+void ARMAssembler::label(const char* theLabel)
+{
+    mLabels.add(theLabel, mPC);
+    mLabelsInverseMapping.add(mPC, theLabel);
+}
+
+void ARMAssembler::B(int cc, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
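+    // Emit the branch with a zero 24-bit offset field; generate()
+    // backpatches it once the label address is resolved.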
+    *mPC++ = (cc<<28) | (0xA<<24) | 0;
+}
+
+void ARMAssembler::BL(int cc, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (cc<<28) | (0xB<<24) | 0;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Prolog/Epilog & Generate...
+#endif
+
+
+void ARMAssembler::prolog()
+{
+    // write dummy prolog code
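+    // (a worst-case STM placeholder: epilog() rewrites this slot to
+    // save only the registers that were actually touched)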
+    mPrologPC = mPC;
+    STM(AL, FD, SP, 1, LSAVED);
+}
+
+void ARMAssembler::epilog(uint32_t touched)
+{
+    touched &= LSAVED;
+    if (touched) {
+        // write prolog code
+        uint32_t* pc = mPC;
+        mPC = mPrologPC;
+        STM(AL, FD, SP, 1, touched | LLR);
+        mPC = pc;
+        // write epilog code
+        LDM(AL, FD, SP, 1, touched | LLR);
+        BX(AL, LR);
+    } else {   // heh, no registers to save!
+        // write prolog code
+        uint32_t* pc = mPC;
+        mPC = mPrologPC;
+        MOV(AL, 0, R0, R0); // NOP
+        mPC = pc;
+        // write epilog code
+        BX(AL, LR);
+    }
+}
+
+int ARMAssembler::generate(const char* name)
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--) {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - (bt.pc+2));
+        *bt.pc |= offset & 0xFFFFFF;
+    }
+
+    mAssembly->resize( int(pc()-base())*4 );
+
+    // the instruction cache is flushed by CodeCache
+    const int64_t duration = ggl_system_time() - mDuration;
+    const char * const format = "generated %s (%d ins) at [%p:%p] in %lld ns\n";
+    ALOGI(format, name, int(pc()-base()), base(), pc(), duration);
+
+    char value[PROPERTY_VALUE_MAX];
+    property_get("debug.pf.disasm", value, "0");
+    if (atoi(value) != 0) {
+        printf(format, name, int(pc()-base()), base(), pc(), duration);
+        disassemble(name);
+    }
+
+    return OK;
+}
+
+uint32_t* ARMAssembler::pcForLabel(const char* label)
+{
+    return mLabels.valueFor(label);
+}
+
+// ----------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Data Processing...
+#endif
+
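+// Generic ARM data-processing encoder: condition in bits [31:28],
+// opcode in [24:21], S flag in bit 20, then Rn, Rd and the operand-2
+// field produced by the shifter/immediate helpers further below.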
+void ARMAssembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    *mPC++ = (cc<<28) | (opcode<<21) | (s<<20) | (Rn<<16) | (Rd<<12) | Op2;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Multiply...
+#endif
+
+// multiply...
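+// Rd and Rm must not be the same register (the result is unpredictable
+// on older ARM cores), so the operands are swapped where possible;
+// multiplication is commutative, which makes the swap safe.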
+void ARMAssembler::MLA(int cc, int s,
+        int Rd, int Rm, int Rs, int Rn) {
+    if (Rd == Rm) { int t = Rm; Rm=Rs; Rs=t; }
+    LOG_FATAL_IF(Rd==Rm, "MLA(r%u,r%u,r%u,r%u)", Rd,Rm,Rs,Rn);
+    *mPC++ =    (cc<<28) | (1<<21) | (s<<20) |
+                (Rd<<16) | (Rn<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::MUL(int cc, int s,
+        int Rd, int Rm, int Rs) {
+    if (Rd == Rm) { int t = Rm; Rm=Rs; Rs=t; }
+    LOG_FATAL_IF(Rd==Rm, "MUL(r%u,r%u,r%u)", Rd,Rm,Rs);
+    *mPC++ = (cc<<28) | (s<<20) | (Rd<<16) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::UMULL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::UMUAL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<21) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::SMULL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::SMUAL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Branches...
+#endif
+
+// branches...
+void ARMAssembler::B(int cc, uint32_t* pc)
+{
+    int32_t offset = int32_t(pc - (mPC+2));
+    *mPC++ = (cc<<28) | (0xA<<24) | (offset & 0xFFFFFF);
+}
+
+void ARMAssembler::BL(int cc, uint32_t* pc)
+{
+    int32_t offset = int32_t(pc - (mPC+2));
+    *mPC++ = (cc<<28) | (0xB<<24) | (offset & 0xFFFFFF);
+}
+
+void ARMAssembler::BX(int cc, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x12FFF10 | Rn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Data Transfer...
+#endif
+
+// data transfer...
+void ARMAssembler::LDR(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<20) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::LDRB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<22) | (1<<20) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::STR(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::STRB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<22) | (Rn<<16) | (Rd<<12) | offset;
+}
+
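+// The halfword/signed transfers encode S and H in bits [7:4]:
+// 0xB0 selects an unsigned halfword, 0xD0 a signed byte and 0xF0 a
+// signed halfword.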
+void ARMAssembler::LDRH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xB0 | offset;
+}
+void ARMAssembler::LDRSB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xD0 | offset;
+}
+void ARMAssembler::LDRSH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xF0 | offset;
+}
+void ARMAssembler::STRH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (Rn<<16) | (Rd<<12) | 0xB0 | offset;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Block Data Transfer...
+#endif
+
+// block data transfer...
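+// The stack-oriented mnemonics (ED/FD/EA/FA) map onto different P/U
+// (pre/post-index, up/down) bits for loads and stores, hence the
+// separate lookup tables in LDM and STM; e.g. a full-descending (FD)
+// stack pops with LDMIA and pushes with STMDB.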
+void ARMAssembler::LDM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{   //                    ED FD EA FA      IB IA DB DA
+    const uint8_t P[8] = { 1, 0, 1, 0,      1, 0, 1, 0 };
+    const uint8_t U[8] = { 1, 1, 0, 0,      1, 1, 0, 0 };
+    *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+            (uint32_t(U[dir])<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list;
+}
+
+void ARMAssembler::STM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{   //                    ED FD EA FA      IB IA DB DA
+    const uint8_t P[8] = { 0, 1, 0, 1,      1, 0, 1, 0 };
+    const uint8_t U[8] = { 0, 0, 1, 1,      1, 1, 0, 0 };
+    *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+            (uint32_t(U[dir])<<23) | (0<<20) | (W<<21) | (Rn<<16) | reg_list;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Special...
+#endif
+
+// special...
+void ARMAssembler::SWP(int cc, int Rn, int Rd, int Rm) {
+    *mPC++ = (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+}
+void ARMAssembler::SWPB(int cc, int Rn, int Rd, int Rm) {
+    *mPC++ = (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+}
+void ARMAssembler::SWI(int cc, uint32_t comment) {
+    *mPC++ = (cc<<28) | (0xF<<24) | comment;
+}
+
+#if 0
+#pragma mark -
+#pragma mark DSP instructions...
+#endif
+
+// DSP instructions...
+void ARMAssembler::PLD(int Rn, uint32_t offset) {
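+    // PLD is encoded unconditionally (cond field = 0xF) and supports
+    // only offset addressing: P=1 with no writeback (W=0).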
+    LOG_ALWAYS_FATAL_IF(!((offset&(1<<24)) && !(offset&(1<<21))),
+                        "PLD only P=1, W=0");
+    *mPC++ = 0xF550F000 | (Rn<<16) | offset;
+}
+
+void ARMAssembler::CLZ(int cc, int Rd, int Rm)
+{
+    *mPC++ = (cc<<28) | 0x16F0F10| (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QADD(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QDADD(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QSUB(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QDSUB(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs)
+{
+    *mPC++ = (cc<<28) | 0x1600080 | (Rd<<16) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMULW(int cc, int y,
+                int Rd, int Rm, int Rs)
+{
+    *mPC++ = (cc<<28) | 0x12000A0 | (Rd<<16) | (Rs<<8) | (y<<4) | Rm;
+}
+
+void ARMAssembler::SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1000080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm)
+{
+    *mPC++ = (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Byte/half word extract and extend (ARMv6+ only)...
+#endif
+
+void ARMAssembler::UXTB16(int cc, int Rd, int Rm, int rotate)
+{
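+    // The rotate argument is given in bits (0, 8, 16 or 24); the
+    // instruction encodes it in byte units in bits [11:10].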
+    *mPC++ = (cc<<28) | 0x6CF0070 | (Rd<<12) | ((rotate >> 3) << 10) | Rm;
+}
+#if 0
+#pragma mark -
+#pragma mark Bit manipulation (ARMv7+ only)...
+#endif
+
+// Bit manipulation (ARMv7+ only)...
+void ARMAssembler::UBFX(int cc, int Rd, int Rn, int lsb, int width)
+{
+    *mPC++ = (cc<<28) | 0x7E00000 | ((width-1)<<16) | (Rd<<12) | (lsb<<7) | 0x50 | Rn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Addressing modes...
+#endif
+
+int ARMAssembler::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
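+    // ARM data-processing immediates are an 8-bit value rotated right
+    // by an even amount (0..30): rotate the value two bits at a time
+    // until it fits in 8 bits, then derive the 4-bit rotate field.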
+    rot = 0;
+    imm = immediate;
+    if (imm > 0x7F) { // skip the easy cases
+        while (!(imm&3)  || (imm&0xFC000000)) {
+            uint32_t newval;
+            newval = imm >> 2;
+            newval |= (imm&3) << 30;
+            imm = newval;
+            rot += 2;
+            if (rot == 32) {
+                rot = 0;
+                break;
+            }
+        }
+    }
+    rot = (16 - (rot>>1)) & 0xF;
+
+    if (imm>=0x100)
+        return -EINVAL;
+
+    if (((imm>>(rot<<1)) | (imm<<(32-(rot<<1)))) != immediate)
+        return -1;
+
+    return 0;
+}
+
+// shifters...
+
+bool ARMAssembler::isValidImmediate(uint32_t immediate)
+{
+    uint32_t rot, imm;
+    return buildImmediate(immediate, rot, imm) == 0;
+}
+
+uint32_t ARMAssembler::imm(uint32_t immediate)
+{
+    uint32_t rot, imm;
+    int err = buildImmediate(immediate, rot, imm);
+
+    LOG_ALWAYS_FATAL_IF(err==-EINVAL,
+                        "immediate %08x cannot be encoded",
+                        immediate);
+
+    LOG_ALWAYS_FATAL_IF(err,
+                        "immediate (%08x) encoding bogus!",
+                        immediate);
+
+    return (1<<25) | (rot<<8) | imm;
+}
+
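+// Operand-2 shifter encodings: the shift amount goes in bits [11:7]
+// and the shift type in bits [6:5]; reg_reg() sets bit 4 so the shift
+// amount is taken from register Rs instead.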
+uint32_t ARMAssembler::reg_imm(int Rm, int type, uint32_t shift)
+{
+    return ((shift&0x1F)<<7) | ((type&0x3)<<5) | (Rm&0xF);
+}
+
+uint32_t ARMAssembler::reg_rrx(int Rm)
+{
+    return (ROR<<5) | (Rm&0xF);
+}
+
+uint32_t ARMAssembler::reg_reg(int Rm, int type, int Rs)
+{
+    return ((Rs&0xF)<<8) | ((type&0x3)<<5) | (1<<4) | (Rm&0xF);
+}
+
+// addressing modes...
+// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicates U=0)
+uint32_t ARMAssembler::immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    return (1<<24) | (((uint32_t(immed12)>>31)^1)<<23) |
+            ((W&1)<<21) | (abs(immed12)&0x7FF);
+}
+
+uint32_t ARMAssembler::immed12_post(int32_t immed12)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+
+    return (((uint32_t(immed12)>>31)^1)<<23) | (abs(immed12)&0x7FF);
+}
+
+uint32_t ARMAssembler::reg_scale_pre(int Rm, int type,
+        uint32_t shift, int W)
+{
+    return  (1<<25) | (1<<24) |
+            (((uint32_t(Rm)>>31)^1)<<23) | ((W&1)<<21) |
+            reg_imm(abs(Rm), type, shift);
+}
+
+uint32_t ARMAssembler::reg_scale_post(int Rm, int type, uint32_t shift)
+{
+    return (1<<25) | (((uint32_t(Rm)>>31)^1)<<23) | reg_imm(abs(Rm), type, shift);
+}
+
+// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicates U=0)
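+// The 8-bit offset is split across the instruction word: the high
+// nibble goes in bits [11:8] and the low nibble in bits [3:0].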
+uint32_t ARMAssembler::immed8_pre(int32_t immed8, int W)
+{
+    uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    return  (1<<24) | (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) |
+            ((W&1)<<21) | (((offset&0xF0)<<4)|(offset&0xF));
+}
+
+uint32_t ARMAssembler::immed8_post(int32_t immed8)
+{
+    uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    return (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) |
+            (((offset&0xF0)<<4) | (offset&0xF));
+}
+
+uint32_t ARMAssembler::reg_pre(int Rm, int W)
+{
+    return (1<<24) | (((uint32_t(Rm)>>31)^1)<<23) | ((W&1)<<21) | (abs(Rm)&0xF);
+}
+
+uint32_t ARMAssembler::reg_post(int Rm)
+{
+    return (((uint32_t(Rm)>>31)^1)<<23) | (abs(Rm)&0xF);
+}
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/ARMAssembler.h b/libpixelflinger/codeflinger/ARMAssembler.h
new file mode 100644
index 0000000..76acf7e
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssembler.h
@@ -0,0 +1,187 @@
+/* libs/pixelflinger/codeflinger/ARMAssembler.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#ifndef ANDROID_ARMASSEMBLER_H
+#define ANDROID_ARMASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "tinyutils/smartpointer.h"
+#include "utils/Vector.h"
+#include "utils/KeyedVector.h"
+
+#include "ARMAssemblerInterface.h"
+#include "CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssembler : public ARMAssemblerInterface
+{
+public:
+    explicit    ARMAssembler(const sp<Assembly>& assembly);
+    virtual     ~ARMAssembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+
+
+    void        disassemble(const char* name);
+
+    // ------------------------------------------------------------------------
+    // ARMAssemblerInterface...
+    // ------------------------------------------------------------------------
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+
+
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+private:
+                ARMAssembler(const ARMAssembler& rhs);
+                ARMAssembler& operator = (const ARMAssembler& rhs);
+
+    sp<Assembly>    mAssembly;
+    uint32_t*       mBase;
+    uint32_t*       mPC;
+    uint32_t*       mPrologPC;
+    int64_t         mDuration;
+    
+    struct branch_target_t {
+        inline branch_target_t() : label(0), pc(0) { }
+        inline branch_target_t(const char* l, uint32_t* p)
+            : label(l), pc(p) { }
+        const char* label;
+        uint32_t*   pc;
+    };
+    
+    Vector<branch_target_t>                 mBranchTargets;
+    KeyedVector< const char*, uint32_t* >   mLabels;
+    KeyedVector< uint32_t*, const char* >   mLabelsInverseMapping;
+    KeyedVector< uint32_t*, const char* >   mComments;
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp b/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp
new file mode 100644
index 0000000..c96cf4b
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp
@@ -0,0 +1,90 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+#define LOG_TAG "pixelflinger-code"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <log/log.h>
+
+#include "ARMAssemblerInterface.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+ARMAssemblerInterface::~ARMAssemblerInterface()
+{
+}
+
+// --------------------------------------------------------------------
+
+// The following two functions are static and are used as default-argument
+// initializers in the original ARM code. The versions without the __
+// prefix are now virtual and can be overridden in the MIPS code, but
+// since these are needed at initialization time they must remain static.
+// Not thrilled with this implementation, but it works...
+
+uint32_t ARMAssemblerInterface::__immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    return (1<<24) | (((uint32_t(immed12)>>31)^1)<<23) |
+            ((W&1)<<21) | (abs(immed12)&0x7FF);
+}
+
+uint32_t ARMAssemblerInterface::__immed8_pre(int32_t immed8, int W)
+{
+    uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    return  (1<<24) | (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) |
+            ((W&1)<<21) | (((offset&0xF0)<<4)|(offset&0xF));
+}
+
+// The following four functions are required for address manipulation.
+// They are virtual, so architectures that need special handling of
+// address values (e.g. a 64-bit arch) can override them.
+
+void ARMAssemblerInterface::ADDR_LDR(int cc, int Rd,
+     int Rn, uint32_t offset)
+{
+    LDR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerInterface::ADDR_STR(int cc, int Rd,
+     int Rn, uint32_t offset)
+{
+    STR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerInterface::ADDR_ADD(int cc, int s,
+     int Rd, int Rn, uint32_t Op2)
+{
+    dataProcessing(opADD, cc, s, Rd, Rn, Op2);
+}
+void ARMAssemblerInterface::ADDR_SUB(int cc, int s,
+     int Rd, int Rn, uint32_t Op2)
+{
+    dataProcessing(opSUB, cc, s, Rd, Rn, Op2);
+}
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.h b/libpixelflinger/codeflinger/ARMAssemblerInterface.h
new file mode 100644
index 0000000..72935ac
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.h
@@ -0,0 +1,349 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_ARMASSEMBLER_INTERFACE_H
+#define ANDROID_ARMASSEMBLER_INTERFACE_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssemblerInterface
+{
+public:
+    virtual ~ARMAssemblerInterface();
+
+    enum {
+        EQ, NE, CS, CC, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, AL, NV,
+        HS = CS,
+        LO = CC
+    };
+    enum {
+        S = 1
+    };
+    enum {
+        LSL, LSR, ASR, ROR
+    };
+    enum {
+        ED, FD, EA, FA,
+        IB, IA, DB, DA
+    };
+    enum {
+        R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15,
+        SP = R13,
+        LR = R14,
+        PC = R15
+    };
+    enum {
+        #define LIST(rr) L##rr=1<<rr
+        LIST(R0), LIST(R1), LIST(R2), LIST(R3), LIST(R4), LIST(R5), LIST(R6),
+        LIST(R7), LIST(R8), LIST(R9), LIST(R10), LIST(R11), LIST(R12),
+        LIST(R13), LIST(R14), LIST(R15),
+        LIST(SP), LIST(LR), LIST(PC),
+        #undef LIST
+        LSAVED = LR4|LR5|LR6|LR7|LR8|LR9|LR10|LR11 | LLR
+    };
+
+    enum {
+        CODEGEN_ARCH_ARM = 1, CODEGEN_ARCH_MIPS, CODEGEN_ARCH_ARM64, CODEGEN_ARCH_MIPS64
+    };
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // these static versions are used as default-argument initializers
+    // for the LDxx/STxx declarations below
+    static uint32_t    __immed12_pre(int32_t immed12, int W=0);
+    static uint32_t    __immed8_pre(int32_t immed8, int W=0);
+
+    virtual bool        isValidImmediate(uint32_t immed) = 0;
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm) = 0;
+
+    virtual uint32_t    imm(uint32_t immediate) = 0;
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift) = 0;
+    virtual uint32_t    reg_rrx(int Rm) = 0;
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs) = 0;
+
+    // addressing modes... 
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0) = 0;
+    virtual uint32_t    immed12_post(int32_t immed12) = 0;
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0) = 0;
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0) = 0;
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0) = 0;
+    virtual uint32_t    immed8_post(int32_t immed8) = 0;
+    virtual uint32_t    reg_pre(int Rm, int W=0) = 0;
+    virtual uint32_t    reg_post(int Rm) = 0;
+
+    // -----------------------------------------------------------------------
+    // basic instructions & code generation
+    // -----------------------------------------------------------------------
+
+    // generate the code
+    virtual void reset() = 0;
+    virtual int  generate(const char* name) = 0;
+    virtual void disassemble(const char* name) = 0;
+    virtual int  getCodegenArch() = 0;
+    
+    // construct prolog and epilog
+    virtual void prolog() = 0;
+    virtual void epilog(uint32_t touched) = 0;
+    virtual void comment(const char* string) = 0;
+
+    // data processing...
+    enum {
+        opAND, opEOR, opSUB, opRSB, opADD, opADC, opSBC, opRSC, 
+        opTST, opTEQ, opCMP, opCMN, opORR, opMOV, opBIC, opMVN,
+        opADD64, opSUB64
+    };
+
+    virtual void
+            dataProcessing( int opcode, int cc, int s,
+                            int Rd, int Rn,
+                            uint32_t Op2) = 0;
+    
+    // multiply...
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+
+    // branches...
+    virtual void B(int cc, uint32_t* pc) = 0;
+    virtual void BL(int cc, uint32_t* pc) = 0;
+    virtual void BX(int cc, int Rn) = 0;
+
+    virtual void label(const char* theLabel) = 0;
+    virtual void B(int cc, const char* label) = 0;
+    virtual void BL(int cc, const char* label) = 0;
+
+    // valid only after generate() has been called
+    virtual uint32_t* pcForLabel(const char* label) = 0;
+
+    // data transfer...
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0)) = 0;
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0)) = 0;
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0)) = 0;
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0)) = 0;
+
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0)) = 0;
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = __immed8_pre(0)) = 0;
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0)) = 0;
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0)) = 0;
+
+    // block data transfer...
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list) = 0;
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list) = 0;
+
+    // special...
+    virtual void SWP(int cc, int Rn, int Rd, int Rm) = 0;
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm) = 0;
+    virtual void SWI(int cc, uint32_t comment) = 0;
+
+    // DSP instructions...
+    enum {
+        // B=0, T=1
+        //     yx
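+        // 'x' selects the bottom/top half of Rm (bit 5 of the
+        // encoding) and 'y' the half of Rs (bit 6).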
+        xyBB = 0, // 0000,
+        xyTB = 2, // 0010,
+        xyBT = 4, // 0100,
+        xyTT = 6, // 0110,
+        yB   = 0, // 0000,
+        yT   = 4, // 0100
+    };
+
+    virtual void PLD(int Rn, uint32_t offset) = 0;
+
+    virtual void CLZ(int cc, int Rd, int Rm) = 0;
+    
+    virtual void QADD(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn) = 0;
+    
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm) = 0;
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+
+    // byte/half word extract...
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate) = 0;
+
+    // bit manipulation...
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width) = 0;
+
+    // -----------------------------------------------------------------------
+    // convenience...
+    // -----------------------------------------------------------------------
+    inline void
+    ADC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opADC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    ADD(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opADD, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    AND(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opAND, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    BIC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opBIC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    EOR(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opEOR, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    MOV(int cc, int s, int Rd, uint32_t Op2) {
+        dataProcessing(opMOV, cc, s, Rd, 0, Op2);
+    }
+    inline void
+    MVN(int cc, int s, int Rd, uint32_t Op2) {
+        dataProcessing(opMVN, cc, s, Rd, 0, Op2);
+    }
+    inline void
+    ORR(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opORR, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    RSB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opRSB, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    RSC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opRSC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    SBC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opSBC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    SUB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opSUB, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    TEQ(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opTEQ, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    TST(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opTST, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    CMP(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opCMP, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    CMN(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opCMN, cc, 1, 0, Rn, Op2);
+    }
+
+    inline void SMULBB(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyBB, Rd, Rm, Rs);    }
+    inline void SMULTB(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyTB, Rd, Rm, Rs);    }
+    inline void SMULBT(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyBT, Rd, Rm, Rs);    }
+    inline void SMULTT(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyTT, Rd, Rm, Rs);    }
+
+    inline void SMULWB(int cc, int Rd, int Rm, int Rs) {
+        SMULW(cc, yB, Rd, Rm, Rs);    }
+    inline void SMULWT(int cc, int Rd, int Rm, int Rs) {
+        SMULW(cc, yT, Rd, Rm, Rs);    }
+
+    inline void
+    SMLABB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyBB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLATB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyTB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLABT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyBT, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLATT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyTT, Rd, Rm, Rs, Rn);    }
+
+    inline void
+    SMLALBB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyBB, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALTB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyTB, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALBT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyBT, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALTT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyTT, RdHi, RdLo, Rs, Rm);    }
+
+    inline void
+    SMLAWB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLAW(cc, yB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLAWT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLAW(cc, yT, Rd, Rm, Rs, Rn);    }
+
+    // Address loading/storing/manipulation
+    virtual void ADDR_LDR(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_ADD(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+    virtual void ADDR_SUB(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_INTERFACE_H
diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp
new file mode 100644
index 0000000..816de48
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp
@@ -0,0 +1,311 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "ARMAssemblerProxy.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+ARMAssemblerProxy::ARMAssemblerProxy()
+    : mTarget(0)
+{
+}
+
+ARMAssemblerProxy::ARMAssemblerProxy(ARMAssemblerInterface* target)
+    : mTarget(target)
+{
+}
+
+ARMAssemblerProxy::~ARMAssemblerProxy()
+{
+    delete mTarget;
+}
+
+void ARMAssemblerProxy::setTarget(ARMAssemblerInterface* target)
+{
+    delete mTarget;
+    mTarget = target;
+}
+
+void ARMAssemblerProxy::reset() {
+    mTarget->reset();
+}
+int ARMAssemblerProxy::generate(const char* name) {
+    return mTarget->generate(name);
+}
+void ARMAssemblerProxy::disassemble(const char* name) {
+    return mTarget->disassemble(name);
+}
+int ARMAssemblerProxy::getCodegenArch()
+{
+    return mTarget->getCodegenArch();
+}
+void ARMAssemblerProxy::prolog() {
+    mTarget->prolog();
+}
+void ARMAssemblerProxy::epilog(uint32_t touched) {
+    mTarget->epilog(touched);
+}
+void ARMAssemblerProxy::comment(const char* string) {
+    mTarget->comment(string);
+}
+
+
+
+// addressing modes
+
+bool ARMAssemblerProxy::isValidImmediate(uint32_t immed)
+{
+    return mTarget->isValidImmediate(immed);
+}
+
+int ARMAssemblerProxy::buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm)
+{
+    return mTarget->buildImmediate(i, rot, imm);
+}
+
+
+
+uint32_t ARMAssemblerProxy::imm(uint32_t immediate)
+{
+    return mTarget->imm(immediate);
+}
+
+uint32_t ARMAssemblerProxy::reg_imm(int Rm, int type, uint32_t shift)
+{
+    return mTarget->reg_imm(Rm, type, shift);
+}
+
+uint32_t ARMAssemblerProxy::reg_rrx(int Rm)
+{
+    return mTarget->reg_rrx(Rm);
+}
+
+uint32_t ARMAssemblerProxy::reg_reg(int Rm, int type, int Rs)
+{
+    return mTarget->reg_reg(Rm, type, Rs);
+}
+
+
+// addressing modes...
+// LDR(B)/STR(B)/PLD
+// (immediate and Rm can be negative, which indicates U=0)
+uint32_t ARMAssemblerProxy::immed12_pre(int32_t immed12, int W)
+{
+    return mTarget->immed12_pre(immed12, W);
+}
+
+uint32_t ARMAssemblerProxy::immed12_post(int32_t immed12)
+{
+    return mTarget->immed12_post(immed12);
+}
+
+uint32_t ARMAssemblerProxy::reg_scale_pre(int Rm, int type, uint32_t shift, int W)
+{
+    return mTarget->reg_scale_pre(Rm, type, shift, W);
+}
+
+uint32_t ARMAssemblerProxy::reg_scale_post(int Rm, int type, uint32_t shift)
+{
+    return mTarget->reg_scale_post(Rm, type, shift);
+}
+
+
+// LDRH/LDRSB/LDRSH/STRH
+// (immediate and Rm can be negative, which indicates U=0)
+uint32_t ARMAssemblerProxy::immed8_pre(int32_t immed8, int W)
+{
+    return mTarget->immed8_pre(immed8, W);
+}
+
+uint32_t ARMAssemblerProxy::immed8_post(int32_t immed8)
+{
+    return mTarget->immed8_post(immed8);
+}
+
+uint32_t ARMAssemblerProxy::reg_pre(int Rm, int W)
+{
+    return mTarget->reg_pre(Rm, W);
+}
+
+uint32_t ARMAssemblerProxy::reg_post(int Rm)
+{
+    return mTarget->reg_post(Rm);
+}
+
+
+//------------------------------------------------------------------------
+
+
+
+void ARMAssemblerProxy::dataProcessing( int opcode, int cc, int s,
+                                        int Rd, int Rn, uint32_t Op2)
+{
+    mTarget->dataProcessing(opcode, cc, s, Rd, Rn, Op2);
+}
+
+void ARMAssemblerProxy::MLA(int cc, int s, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->MLA(cc, s, Rd, Rm, Rs, Rn);
+}
+void ARMAssemblerProxy::MUL(int cc, int s, int Rd, int Rm, int Rs) {
+    mTarget->MUL(cc, s, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::UMULL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->UMULL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::UMUAL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->UMUAL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::SMULL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->SMULL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::SMUAL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->SMUAL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+
+void ARMAssemblerProxy::B(int cc, uint32_t* pc) {
+    mTarget->B(cc, pc); 
+}
+void ARMAssemblerProxy::BL(int cc, uint32_t* pc) {
+    mTarget->BL(cc, pc); 
+}
+void ARMAssemblerProxy::BX(int cc, int Rn) {
+    mTarget->BX(cc, Rn); 
+}
+void ARMAssemblerProxy::label(const char* theLabel) {
+    mTarget->label(theLabel);
+}
+void ARMAssemblerProxy::B(int cc, const char* label) {
+    mTarget->B(cc, label);
+}
+void ARMAssemblerProxy::BL(int cc, const char* label) {
+    mTarget->BL(cc, label);
+}
+
+uint32_t* ARMAssemblerProxy::pcForLabel(const char* label) {
+    return mTarget->pcForLabel(label);
+}
+
+void ARMAssemblerProxy::LDR(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STR(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STRB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STRB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRSB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRSB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRSH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRSH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STRH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STRH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDM(int cc, int dir, int Rn, int W, uint32_t reg_list) {
+    mTarget->LDM(cc, dir, Rn, W, reg_list);
+}
+void ARMAssemblerProxy::STM(int cc, int dir, int Rn, int W, uint32_t reg_list) {
+    mTarget->STM(cc, dir, Rn, W, reg_list);
+}
+
+void ARMAssemblerProxy::SWP(int cc, int Rn, int Rd, int Rm) {
+    mTarget->SWP(cc, Rn, Rd, Rm);
+}
+void ARMAssemblerProxy::SWPB(int cc, int Rn, int Rd, int Rm) {
+    mTarget->SWPB(cc, Rn, Rd, Rm);
+}
+void ARMAssemblerProxy::SWI(int cc, uint32_t comment) {
+    mTarget->SWI(cc, comment);
+}
+
+
+void ARMAssemblerProxy::PLD(int Rn, uint32_t offset) {
+    mTarget->PLD(Rn, offset);
+}
+void ARMAssemblerProxy::CLZ(int cc, int Rd, int Rm) {
+    mTarget->CLZ(cc, Rd, Rm);
+}
+void ARMAssemblerProxy::QADD(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QADD(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QDADD(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QDADD(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QSUB(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QSUB(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QDSUB(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QDSUB(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::SMUL(int cc, int xy, int Rd, int Rm, int Rs) {
+    mTarget->SMUL(cc, xy, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::SMULW(int cc, int y, int Rd, int Rm, int Rs) {
+    mTarget->SMULW(cc, y, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::SMLA(int cc, int xy, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->SMLA(cc, xy, Rd, Rm, Rs, Rn);
+}
+void ARMAssemblerProxy::SMLAL(  int cc, int xy,
+                                int RdHi, int RdLo, int Rs, int Rm) {
+    mTarget->SMLAL(cc, xy, RdHi, RdLo, Rs, Rm);
+}
+void ARMAssemblerProxy::SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->SMLAW(cc, y, Rd, Rm, Rs, Rn);
+}
+
+void ARMAssemblerProxy::UXTB16(int cc, int Rd, int Rm, int rotate) {
+    mTarget->UXTB16(cc, Rd, Rm, rotate);
+}
+
+void ARMAssemblerProxy::UBFX(int cc, int Rd, int Rn, int lsb, int width) {
+    mTarget->UBFX(cc, Rd, Rn, lsb, width);
+}
+
+void ARMAssemblerProxy::ADDR_LDR(int cc, int Rd, int Rn, uint32_t offset) {
+     mTarget->ADDR_LDR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::ADDR_STR(int cc, int Rd, int Rn, uint32_t offset) {
+     mTarget->ADDR_STR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::ADDR_ADD(int cc, int s, int Rd, int Rn, uint32_t Op2){
+     mTarget->ADDR_ADD(cc, s, Rd, Rn, Op2);
+}
+void ARMAssemblerProxy::ADDR_SUB(int cc, int s, int Rd, int Rn, uint32_t Op2){
+     mTarget->ADDR_SUB(cc, s, Rd, Rn, Op2);
+}
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.h b/libpixelflinger/codeflinger/ARMAssemblerProxy.h
new file mode 100644
index 0000000..10d0390
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.h
@@ -0,0 +1,164 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_ARMASSEMBLER_PROXY_H
+#define ANDROID_ARMASSEMBLER_PROXY_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "ARMAssemblerInterface.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssemblerProxy : public ARMAssemblerInterface
+{
+public:
+    // ARMAssemblerProxy takes ownership of the target
+
+                ARMAssemblerProxy();
+    explicit    ARMAssemblerProxy(ARMAssemblerInterface* target);
+    virtual     ~ARMAssemblerProxy();
+
+    void setTarget(ARMAssemblerInterface* target);
+
+    virtual void    reset();
+    virtual int     generate(const char* name);
+    virtual void    disassemble(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+    virtual void ADDR_LDR(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_ADD(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+    virtual void ADDR_SUB(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+
+private:
+    ARMAssemblerInterface*  mTarget;
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_PROXY_H
diff --git a/libpixelflinger/codeflinger/Arm64Assembler.cpp b/libpixelflinger/codeflinger/Arm64Assembler.cpp
new file mode 100644
index 0000000..271a9b9
--- /dev/null
+++ b/libpixelflinger/codeflinger/Arm64Assembler.cpp
@@ -0,0 +1,1240 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#define LOG_TAG "ArmToArm64Assembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cutils/properties.h>
+#include <log/log.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "codeflinger/Arm64Assembler.h"
+#include "codeflinger/Arm64Disassembler.h"
+#include "codeflinger/CodeCache.h"
+
+/*
+** --------------------------------------------
+** Support for Arm64 in GGLAssembler JIT
+** --------------------------------------------
+**
+** Approach
+** - GGLAssembler and associated files are largely unchanged.
+** - A translator class maps ArmAssemblerInterface calls to
+**   generate Arm64 instructions.
+**
+** ----------------------
+** ArmToArm64Assembler
+** ----------------------
+**
+** - Subclassed from ArmAssemblerInterface
+**
+** - Translates each ArmAssemblerInterface call into one or more
+**   Arm64 instructions as necessary.
+**
+** - Does not implement the portions of ArmAssemblerInterface that are
+**   unused by GGLAssembler. Such calls hit NOT_IMPLEMENTED(), which in
+**   turn logs a fatal message.
+**
+** - Uses the A64_* family of functions to generate the machine code
+**   for Arm64 instructions. These functions also log each instruction
+**   if the ARM64_ASM_DEBUG define is set to 1.
+**
+** - Dumps the machine code and equivalent assembly if the
+**   "debug.pf.disasm" property is set, using arm64_disassemble to
+**   perform the disassembly.
+**
+** - Uses registers 13 (SP in ARM), 15 (PC in ARM), 16 and 17 for
+**   storing intermediate results. GGLAssembler never uses SP and PC, as
+**   these registers are marked reserved. The temporary registers are not
+**   saved/restored on the stack, since they are caller-saved in Arm64.
+**
+** - Uses the CSEL instruction to support conditional execution. The
+**   result is computed into a temporary register and then copied to the
+**   target register only if the condition is true.
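+**   For example, "ADDNE r0, r1, r2" is emitted roughly as:
+**     ADD  W15, W1, W2          ; result into temporary W15
+**     CSEL W0, W15, W0, NE      ; keep the old W0 unless NE holds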
+**
+** - For conditional data transfer instructions, a conditional branch is
+**   used to skip over the instruction when the condition is false.
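+**   For example, a conditional "LDRNE r0, [r1]" is emitted roughly as:
+**     B.EQ  #.+8                      ; inverted condition skips the load
+**     LDR   W0, [X1, W13, SXTW #0]    ; W13 holds zero (set in prolog)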
+**
+** - Wherever possible, immediate values are moved into a temporary
+**   register before processing. This simplifies the implementation, as
+**   instructions requiring immediate values become a move-immediate
+**   followed by a register-register instruction.
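+**   For example, "ADD r0, r1, #0x12345678" is emitted roughly as:
+**     MOVZ W16, #0x5678, LSL #0
+**     MOVK W16, #0x1234, LSL #16
+**     ADD  W0, W1, W16, LSL #0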
+**
+** --------------------------------------------
+** ArmToArm64Assembler unit test bench
+** --------------------------------------------
+**
+** - Tests the ArmToArm64Assembler interface for all the possible ways
+**   in which GGLAssembler uses the ArmAssemblerInterface.
+**
+** - Uses a test jacket (written in assembly) to set the registers and
+**   condition flags prior to calling the generated instruction. It also
+**   copies the registers and flags back at the end of execution. The
+**   caller then checks whether the generated code performed the correct
+**   operation based on the output registers and flags.
+**
+** - Broadly contains three types of tests: (i) data operation tests,
+**   (ii) data transfer tests and (iii) LDM/STM tests.
+**
+** ----------------------
+** Arm64 disassembler
+** ----------------------
+** - This disassembler handles only the machine code that can be
+**   generated by ArmToArm64Assembler. It has a unit testbench which
+**   tests all the instructions supported by the disassembler.
+**
+** ------------------------------------------------------------------
+** ARMAssembler/ARMAssemblerInterface/ARMAssemblerProxy changes
+** ------------------------------------------------------------------
+**
+** - In the existing code, addresses were being handled as 32-bit values
+**   in certain places.
+**
+** - Added a new set of functions for address load/store/manipulation.
+**   These are ADDR_LDR, ADDR_STR, ADDR_ADD, ADDR_SUB, and they map to
+**   default 32-bit implementations in ARMAssemblerInterface.
+**
+** - ArmToArm64Assembler maps these functions to the appropriate 64-bit
+**   forms.
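+**   For example, an ADDR_ADD with a plain register operand is emitted
+**   as "ADD Xd, Xn, Wm, SXTW #0", i.e. a 64-bit base register plus a
+**   sign-extended 32-bit index.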
+**
+** ----------------------
+** GGLAssembler changes
+** ----------------------
+** - Since ArmToArm64Assembler can generate up to 4 Arm64 instructions
+**   for each call in the worst case, the memory reserved is 4 times the
+**   ARM amount.
+**
+** - Address load/store/manipulation were changed to use new functions
+**   added in the ARMAssemblerInterface.
+**
+*/
+
+
+#define NOT_IMPLEMENTED()  LOG_FATAL("Arm instruction %s not yet implemented\n", __func__)
+
+#define ARM64_ASM_DEBUG 0
+
+#if ARM64_ASM_DEBUG
+    #define LOG_INSTR(...) ALOGD("\t" __VA_ARGS__)
+    #define LOG_LABEL(...) ALOGD(__VA_ARGS__)
+#else
+    #define LOG_INSTR(...) ((void)0)
+    #define LOG_LABEL(...) ((void)0)
+#endif
+
+namespace android {
+
+static __unused const char* shift_codes[] =
+{
+    "LSL", "LSR", "ASR", "ROR"
+};
+static __unused const char *cc_codes[] =
+{
+    "EQ", "NE", "CS", "CC", "MI",
+    "PL", "VS", "VC", "HI", "LS",
+    "GE", "LT", "GT", "LE", "AL", "NV"
+};
+
+ArmToArm64Assembler::ArmToArm64Assembler(const sp<Assembly>& assembly)
+    :   ARMAssemblerInterface(),
+        mAssembly(assembly)
+{
+    mBase = mPC = (uint32_t *)assembly->base();
+    mDuration = ggl_system_time();
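+    // Register 13 is kept at zero (materialized in prolog());
+    // registers 15, 16 and 17 are scratch temporaries.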
+    mZeroReg = 13;
+    mTmpReg1 = 15;
+    mTmpReg2 = 16;
+    mTmpReg3 = 17;
+}
+
+ArmToArm64Assembler::ArmToArm64Assembler(void *base)
+    :   ARMAssemblerInterface(), mAssembly(NULL)
+{
+    mBase = mPC = (uint32_t *)base;
+    mDuration = ggl_system_time();
+    // Reg 13 holds zero; regs 15, 16, 17 are used as temporary registers
+    mZeroReg = 13;
+    mTmpReg1 = 15;
+    mTmpReg2 = 16;
+    mTmpReg3 = 17;
+}
+
+ArmToArm64Assembler::~ArmToArm64Assembler()
+{
+}
+
+uint32_t* ArmToArm64Assembler::pc() const
+{
+    return mPC;
+}
+
+uint32_t* ArmToArm64Assembler::base() const
+{
+    return mBase;
+}
+
+void ArmToArm64Assembler::reset()
+{
+    if(mAssembly == NULL)
+        mPC = mBase;
+    else
+        mBase = mPC = (uint32_t *)mAssembly->base();
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+#if ARM64_ASM_DEBUG
+    ALOGI("RESET\n");
+#endif
+}
+
+int ArmToArm64Assembler::getCodegenArch()
+{
+    return CODEGEN_ARCH_ARM64;
+}
+
+// ----------------------------------------------------------------------------
+
+void ArmToArm64Assembler::disassemble(const char* name)
+{
+    if(name)
+    {
+        printf("%s:\n", name);
+    }
+    size_t count = pc()-base();
+    uint32_t* i = base();
+    while (count--)
+    {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(i);
+        if (label >= 0)
+        {
+            printf("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(i);
+        if (comment >= 0)
+        {
+            printf("; %s\n", mComments.valueAt(comment));
+        }
+        printf("%p:    %08x    ", i, uint32_t(i[0]));
+        {
+            char instr[256];
+            ::arm64_disassemble(*i, instr);
+            printf("%s\n", instr);
+        }
+        i++;
+    }
+}
+
+void ArmToArm64Assembler::comment(const char* string)
+{
+    mComments.add(mPC, string);
+    LOG_INSTR("//%s\n", string);
+}
+
+void ArmToArm64Assembler::label(const char* theLabel)
+{
+    mLabels.add(theLabel, mPC);
+    mLabelsInverseMapping.add(mPC, theLabel);
+    LOG_LABEL("%s:\n", theLabel);
+}
+
+void ArmToArm64Assembler::B(int cc, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    LOG_INSTR("B%s %s\n", cc_codes[cc], label );
+    *mPC++ = (0x54 << 24) | cc;
+}
+
+void ArmToArm64Assembler::BL(int /*cc*/, const char* /*label*/)
+{
+    NOT_IMPLEMENTED(); //Not Required
+}
+
+// ----------------------------------------------------------------------------
+//Prolog/Epilog & Generate...
+// ----------------------------------------------------------------------------
+
+void ArmToArm64Assembler::prolog()
+{
+    // write prolog code: materialize zero into mZeroReg (register 13)
+    mPrologPC = mPC;
+    *mPC++ = A64_MOVZ_X(mZeroReg,0,0);
+}
+
+void ArmToArm64Assembler::epilog(uint32_t /*touched*/)
+{
+    // write epilog code
+    static const int XLR = 30;
+    *mPC++ = A64_RET(XLR);
+}
+
+int ArmToArm64Assembler::generate(const char* name)
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--)
+    {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - bt.pc);
+        *bt.pc |= (offset & 0x7FFFF) << 5;
+    }
+
+    if(mAssembly != NULL)
+        mAssembly->resize( int(pc()-base())*4 );
+
+    // the instruction cache is flushed by CodeCache
+    const int64_t duration = ggl_system_time() - mDuration;
+    const char * const format = "generated %s (%d ins) at [%p:%p] in %" PRId64 "ns\n";
+    ALOGI(format, name, int(pc()-base()), base(), pc(), duration);
+
+
+    char value[PROPERTY_VALUE_MAX];
+    property_get("debug.pf.disasm", value, "0");
+    if (atoi(value) != 0)
+    {
+        printf(format, name, int(pc()-base()), base(), pc(), duration);
+        disassemble(name);
+    }
+    return OK;
+}
+
+uint32_t* ArmToArm64Assembler::pcForLabel(const char* label)
+{
+    return mLabels.valueFor(label);
+}
+
+// ----------------------------------------------------------------------------
+// Data Processing...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::dataProcessingCommon(int opcode,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    if(opcode != opSUB && s == 1)
+    {
+        NOT_IMPLEMENTED(); //Not required
+        return;
+    }
+
+    if(opcode != opSUB && opcode != opADD && opcode != opAND &&
+       opcode != opORR && opcode != opMVN)
+    {
+        NOT_IMPLEMENTED(); //Not required
+        return;
+    }
+
+    if(Op2 == OPERAND_REG_IMM && mAddrMode.reg_imm_shift > 31)
+    {
+        NOT_IMPLEMENTED();
+        return;
+    }
+
+    //Store immediate in temporary register and convert
+    //immediate operation into register operation
+    if(Op2 == OPERAND_IMM)
+    {
+        int imm = mAddrMode.immediate;
+        *mPC++ = A64_MOVZ_W(mTmpReg2, imm & 0x0000FFFF, 0);
+        *mPC++ = A64_MOVK_W(mTmpReg2, (imm >> 16) & 0x0000FFFF, 16);
+        Op2 = mTmpReg2;
+    }
+
+
+    {
+        uint32_t shift;
+        uint32_t amount;
+        uint32_t Rm;
+
+        if(Op2 == OPERAND_REG_IMM)
+        {
+            shift   = mAddrMode.reg_imm_type;
+            amount  = mAddrMode.reg_imm_shift;
+            Rm      = mAddrMode.reg_imm_Rm;
+        }
+        else if(Op2 < OPERAND_REG)
+        {
+            shift   = 0;
+            amount  = 0;
+            Rm      = Op2;
+        }
+        else
+        {
+            NOT_IMPLEMENTED(); //Not required
+            return;
+        }
+
+        switch(opcode)
+        {
+            case opADD: *mPC++ = A64_ADD_W(Rd, Rn, Rm, shift, amount); break;
+            case opAND: *mPC++ = A64_AND_W(Rd, Rn, Rm, shift, amount); break;
+            case opORR: *mPC++ = A64_ORR_W(Rd, Rn, Rm, shift, amount); break;
+            case opMVN: *mPC++ = A64_ORN_W(Rd, Rn, Rm, shift, amount); break;
+            case opSUB: *mPC++ = A64_SUB_W(Rd, Rn, Rm, shift, amount, s);break;
+        };
+
+    }
+}
+
+void ArmToArm64Assembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    uint32_t Wd;
+
+    if(cc != AL)
+        Wd = mTmpReg1;
+    else
+        Wd = Rd;
+
+    if(opcode == opADD || opcode == opAND || opcode == opORR ||opcode == opSUB)
+    {
+        dataProcessingCommon(opcode, s, Wd, Rn, Op2);
+    }
+    else if(opcode == opCMP)
+    {
+        dataProcessingCommon(opSUB, 1, mTmpReg3, Rn, Op2);
+    }
+    else if(opcode == opRSB)
+    {
+        dataProcessingCommon(opSUB, s, Wd, Rn, Op2);
+        dataProcessingCommon(opSUB, s, Wd, mZeroReg, Wd);
+    }
+    else if(opcode == opMOV)
+    {
+        dataProcessingCommon(opORR, 0, Wd, mZeroReg, Op2);
+        if(s == 1)
+        {
+            dataProcessingCommon(opSUB, 1, mTmpReg3, Wd, mZeroReg);
+        }
+    }
+    else if(opcode == opMVN)
+    {
+        dataProcessingCommon(opMVN, s, Wd, mZeroReg, Op2);
+    }
+    else if(opcode == opBIC)
+    {
+        dataProcessingCommon(opMVN, s, mTmpReg3, mZeroReg, Op2);
+        dataProcessingCommon(opAND, s, Wd, Rn, mTmpReg3);
+    }
+    else
+    {
+        NOT_IMPLEMENTED();
+        return;
+    }
+
+    if(cc != AL)
+    {
+        *mPC++ = A64_CSEL_W(Rd, mTmpReg1, Rd, cc);
+    }
+}
+// ----------------------------------------------------------------------------
+// Address Processing...
+// ----------------------------------------------------------------------------
+
+void ArmToArm64Assembler::ADDR_ADD(int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+
+
+    if(Op2 == OPERAND_REG_IMM && mAddrMode.reg_imm_type == LSL)
+    {
+        int Rm = mAddrMode.reg_imm_Rm;
+        int amount = mAddrMode.reg_imm_shift;
+        *mPC++ = A64_ADD_X_Wm_SXTW(Rd, Rn, Rm, amount);
+    }
+    else if(Op2 < OPERAND_REG)
+    {
+        int Rm = Op2;
+        int amount = 0;
+        *mPC++ = A64_ADD_X_Wm_SXTW(Rd, Rn, Rm, amount);
+    }
+    else if(Op2 == OPERAND_IMM)
+    {
+        int imm = mAddrMode.immediate;
+        *mPC++ = A64_MOVZ_W(mTmpReg1, imm & 0x0000FFFF, 0);
+        *mPC++ = A64_MOVK_W(mTmpReg1, (imm >> 16) & 0x0000FFFF, 16);
+
+        int Rm = mTmpReg1;
+        int amount = 0;
+        *mPC++ = A64_ADD_X_Wm_SXTW(Rd, Rn, Rm, amount);
+    }
+    else
+    {
+        NOT_IMPLEMENTED(); //Not required
+    }
+}
+
+void ArmToArm64Assembler::ADDR_SUB(int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+
+    if(Op2 == OPERAND_REG_IMM && mAddrMode.reg_imm_type == LSR)
+    {
+        *mPC++ = A64_ADD_W(mTmpReg1, mZeroReg, mAddrMode.reg_imm_Rm,
+                           LSR, mAddrMode.reg_imm_shift);
+        *mPC++ = A64_SUB_X_Wm_SXTW(Rd, Rn, mTmpReg1, 0);
+    }
+    else
+    {
+        NOT_IMPLEMENTED(); //Not required
+    }
+}
+
+// ----------------------------------------------------------------------------
+// multiply...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::MLA(int cc, int s,int Rd, int Rm, int Rs, int Rn)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+
+    *mPC++ = A64_MADD_W(Rd, Rm, Rs, Rn);
+    if(s == 1)
+        dataProcessingCommon(opSUB, 1, mTmpReg1, Rd, mZeroReg);
+}
+void ArmToArm64Assembler::MUL(int cc, int s, int Rd, int Rm, int Rs)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+    *mPC++ = A64_MADD_W(Rd, Rm, Rs, mZeroReg);
+}
+void ArmToArm64Assembler::UMULL(int /*cc*/, int /*s*/,
+        int /*RdLo*/, int /*RdHi*/, int /*Rm*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::UMUAL(int /*cc*/, int /*s*/,
+        int /*RdLo*/, int /*RdHi*/, int /*Rm*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::SMULL(int /*cc*/, int /*s*/,
+        int /*RdLo*/, int /*RdHi*/, int /*Rm*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::SMUAL(int /*cc*/, int /*s*/,
+        int /*RdLo*/, int /*RdHi*/, int /*Rm*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+// ----------------------------------------------------------------------------
+// branches relative to PC...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::B(int /*cc*/, uint32_t* /*pc*/){
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::BL(int /*cc*/, uint32_t* /*pc*/){
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::BX(int /*cc*/, int /*Rn*/){
+    NOT_IMPLEMENTED(); //Not required
+}
+
+// ----------------------------------------------------------------------------
+// data transfer...
+// ----------------------------------------------------------------------------
+enum dataTransferOp
+{
+    opLDR,opLDRB,opLDRH,opSTR,opSTRB,opSTRH
+};
+
+void ArmToArm64Assembler::dataTransfer(int op, int cc,
+                            int Rd, int Rn, uint32_t op_type, uint32_t size)
+{
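+    // op_type is one of the OPERAND_* tags returned by the addressing-mode
+    // helpers (imm(), immed12_pre(), reg_pre(), ...); the operand details
+    // themselves live in mAddrMode.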
+    const int XSP = 31;
+    if(Rn == SP)
+        Rn = XSP;
+
+    if(op_type == OPERAND_IMM)
+    {
+        int addrReg;
+        int imm = mAddrMode.immediate;
+        if(imm >= 0 && imm < (1<<12))
+            *mPC++ = A64_ADD_IMM_X(mTmpReg1, mZeroReg, imm, 0);
+        else if(imm < 0 && -imm < (1<<12))
+            *mPC++ = A64_SUB_IMM_X(mTmpReg1, mZeroReg, -imm, 0);
+        else
+        {
+            NOT_IMPLEMENTED();
+            return;
+        }
+
+        addrReg = Rn;
+        if(mAddrMode.preindex == true || mAddrMode.postindex == true)
+        {
+            *mPC++ = A64_ADD_X(mTmpReg2, addrReg, mTmpReg1);
+            if(mAddrMode.preindex == true)
+                addrReg = mTmpReg2;
+        }
+
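+        // ARM condition codes pair up as complements (EQ/NE, CS/CC, ...),
+        // so cc^1 inverts cc; the branch skips the single following
+        // instruction when the original condition is false.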
+        if(cc != AL)
+            *mPC++ = A64_B_COND(cc^1, 8);
+
+        *mPC++ = A64_LDRSTR_Wm_SXTW_0(op, size, Rd, addrReg, mZeroReg);
+
+        if(mAddrMode.writeback == true)
+            *mPC++ = A64_CSEL_X(Rn, mTmpReg2, Rn, cc);
+    }
+    else if(op_type == OPERAND_REG_OFFSET)
+    {
+        if(cc != AL)
+            *mPC++ = A64_B_COND(cc^1, 8);
+        *mPC++ = A64_LDRSTR_Wm_SXTW_0(op, size, Rd, Rn, mAddrMode.reg_offset);
+
+    }
+    else if(op_type > OPERAND_UNSUPPORTED)
+    {
+        if(cc != AL)
+            *mPC++ = A64_B_COND(cc^1, 8);
+        *mPC++ = A64_LDRSTR_Wm_SXTW_0(op, size, Rd, Rn, mZeroReg);
+    }
+    else
+    {
+        NOT_IMPLEMENTED(); // Not required
+    }
+    return;
+
+}
+void ArmToArm64Assembler::ADDR_LDR(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opLDR, cc, Rd, Rn, op_type, 64);
+}
+void ArmToArm64Assembler::ADDR_STR(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opSTR, cc, Rd, Rn, op_type, 64);
+}
+void ArmToArm64Assembler::LDR(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opLDR, cc, Rd, Rn, op_type);
+}
+void ArmToArm64Assembler::LDRB(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opLDRB, cc, Rd, Rn, op_type);
+}
+void ArmToArm64Assembler::STR(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opSTR, cc, Rd, Rn, op_type);
+}
+
+void ArmToArm64Assembler::STRB(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opSTRB, cc, Rd, Rn, op_type);
+}
+
+void ArmToArm64Assembler::LDRH(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opLDRH, cc, Rd, Rn, op_type);
+}
+void ArmToArm64Assembler::LDRSB(int /*cc*/, int /*Rd*/, int /*Rn*/, uint32_t /*offset*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::LDRSH(int /*cc*/, int /*Rd*/, int /*Rn*/, uint32_t /*offset*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::STRH(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opSTRH, cc, Rd, Rn, op_type);
+}
+
+// ----------------------------------------------------------------------------
+// block data transfer...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::LDM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{
+    const int XSP = 31;
+    if(cc != AL || dir != IA || W == 0 || Rn != SP)
+    {
+        NOT_IMPLEMENTED();
+        return;
+    }
+
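+    // Each register is popped with a 16-byte post-increment (vs. ARM's
+    // 4-byte LDMIA stride), keeping SP 16-byte aligned as Arm64 requires.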
+    for(int i = 0; i < 32; ++i)
+    {
+        if((reg_list & (1 << i)))
+        {
+            int reg = i;
+            int size = 16;
+            *mPC++ = A64_LDR_IMM_PostIndex(reg, XSP, size);
+        }
+    }
+}
+
+void ArmToArm64Assembler::STM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{
+    const int XSP = 31;
+    if(cc != AL || dir != DB || W == 0 || Rn != SP)
+    {
+        NOT_IMPLEMENTED();
+        return;
+    }
+
+    for(int i = 31; i >= 0; --i)
+    {
+        if((reg_list & (1 << i)))
+        {
+            int size = -16;
+            int reg  = i;
+            *mPC++ = A64_STR_IMM_PreIndex(reg, XSP, size);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// special...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::SWP(int /*cc*/, int /*Rn*/, int /*Rd*/, int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::SWPB(int /*cc*/, int /*Rn*/, int /*Rd*/, int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::SWI(int /*cc*/, uint32_t /*comment*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+// ----------------------------------------------------------------------------
+// DSP instructions...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::PLD(int /*Rn*/, uint32_t /*offset*/) {
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::CLZ(int /*cc*/, int /*Rd*/, int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::QADD(int /*cc*/, int /*Rd*/, int /*Rm*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::QDADD(int /*cc*/, int /*Rd*/, int /*Rm*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::QSUB(int /*cc*/, int /*Rd*/, int /*Rm*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::QDSUB(int /*cc*/, int /*Rd*/, int /*Rm*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+// ----------------------------------------------------------------------------
+// 16 x 16 multiplication
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+
+    if (xy & xyTB)
+        *mPC++ = A64_SBFM_W(mTmpReg1, Rm, 16, 31);
+    else
+        *mPC++ = A64_SBFM_W(mTmpReg1, Rm, 0, 15);
+
+    if (xy & xyBT)
+        *mPC++ = A64_SBFM_W(mTmpReg2, Rs, 16, 31);
+    else
+        *mPC++ = A64_SBFM_W(mTmpReg2, Rs, 0, 15);
+
+    *mPC++ = A64_MADD_W(Rd,mTmpReg1,mTmpReg2, mZeroReg);
+}
+// ----------------------------------------------------------------------------
+// 32 x 16 multiplication
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::SMULW(int cc, int y, int Rd, int Rm, int Rs)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+
+    if (y & yT)
+        *mPC++ = A64_SBFM_W(mTmpReg1, Rs, 16, 31);
+    else
+        *mPC++ = A64_SBFM_W(mTmpReg1, Rs, 0, 15);
+
+    *mPC++ = A64_SBFM_W(mTmpReg2, Rm, 0, 31);
+    *mPC++ = A64_SMADDL(mTmpReg3,mTmpReg1,mTmpReg2, mZeroReg);
+    *mPC++ = A64_UBFM_X(Rd,mTmpReg3, 16, 47);
+}
+// ----------------------------------------------------------------------------
+// 16 x 16 multiplication and accumulate
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::SMLA(int cc, int xy, int Rd, int Rm, int Rs, int Rn)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    if(xy != xyBB) { NOT_IMPLEMENTED(); return;} //Not required
+
+    *mPC++ = A64_SBFM_W(mTmpReg1, Rm, 0, 15);
+    *mPC++ = A64_SBFM_W(mTmpReg2, Rs, 0, 15);
+    *mPC++ = A64_MADD_W(Rd, mTmpReg1, mTmpReg2, Rn);
+}
+
+void ArmToArm64Assembler::SMLAL(int /*cc*/, int /*xy*/,
+                int /*RdHi*/, int /*RdLo*/, int /*Rs*/, int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return;
+}
+
+void ArmToArm64Assembler::SMLAW(int /*cc*/, int /*y*/,
+                int /*Rd*/, int /*Rm*/, int /*Rs*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return;
+}
+
+// ----------------------------------------------------------------------------
+// Byte/half word extract and extend
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::UXTB16(int cc, int Rd, int Rm, int rotate)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+
+    *mPC++ = A64_EXTR_W(mTmpReg1, Rm, Rm, rotate * 8);
+
+    uint32_t imm = 0x00FF00FF;
+    *mPC++ = A64_MOVZ_W(mTmpReg2, imm & 0xFFFF, 0);
+    *mPC++ = A64_MOVK_W(mTmpReg2, (imm >> 16) & 0x0000FFFF, 16);
+    *mPC++ = A64_AND_W(Rd,mTmpReg1, mTmpReg2);
+}
+
+// ----------------------------------------------------------------------------
+// Bit manipulation
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::UBFX(int cc, int Rd, int Rn, int lsb, int width)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    *mPC++ = A64_UBFM_W(Rd, Rn, lsb, lsb + width - 1);
+}
+// ----------------------------------------------------------------------------
+// Shifters...
+// ----------------------------------------------------------------------------
+int ArmToArm64Assembler::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    rot = 0;
+    imm = immediate;
+    return 0; // 0 means valid; every immediate is accepted here
+}
+
+
+bool ArmToArm64Assembler::isValidImmediate(uint32_t immediate)
+{
+    uint32_t rot, imm;
+    return buildImmediate(immediate, rot, imm) == 0;
+}
+
+uint32_t ArmToArm64Assembler::imm(uint32_t immediate)
+{
+    mAddrMode.immediate = immediate;
+    mAddrMode.writeback = false;
+    mAddrMode.preindex  = false;
+    mAddrMode.postindex = false;
+    return OPERAND_IMM;
+
+}
+
+uint32_t ArmToArm64Assembler::reg_imm(int Rm, int type, uint32_t shift)
+{
+    mAddrMode.reg_imm_Rm = Rm;
+    mAddrMode.reg_imm_type = type;
+    mAddrMode.reg_imm_shift = shift;
+    return OPERAND_REG_IMM;
+}
+
+uint32_t ArmToArm64Assembler::reg_rrx(int /*Rm*/)
+{
+    NOT_IMPLEMENTED();
+    return OPERAND_UNSUPPORTED;
+}
+
+uint32_t ArmToArm64Assembler::reg_reg(int /*Rm*/, int /*type*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return OPERAND_UNSUPPORTED;
+}
+// ----------------------------------------------------------------------------
+// Addressing modes...
+// ----------------------------------------------------------------------------
+uint32_t ArmToArm64Assembler::immed12_pre(int32_t immed12, int W)
+{
+    mAddrMode.immediate = immed12;
+    mAddrMode.writeback = W;
+    mAddrMode.preindex  = true;
+    mAddrMode.postindex = false;
+    return OPERAND_IMM;
+}
+
+uint32_t ArmToArm64Assembler::immed12_post(int32_t immed12)
+{
+    mAddrMode.immediate = immed12;
+    mAddrMode.writeback = true;
+    mAddrMode.preindex  = false;
+    mAddrMode.postindex = true;
+    return OPERAND_IMM;
+}
+
+uint32_t ArmToArm64Assembler::reg_scale_pre(int Rm, int type,
+        uint32_t shift, int W)
+{
+    if(type != 0 || shift != 0 || W != 0)
+    {
+        NOT_IMPLEMENTED(); //Not required
+        return OPERAND_UNSUPPORTED;
+    }
+    else
+    {
+        mAddrMode.reg_offset = Rm;
+        return OPERAND_REG_OFFSET;
+    }
+}
+
+uint32_t ArmToArm64Assembler::reg_scale_post(int /*Rm*/, int /*type*/, uint32_t /*shift*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return OPERAND_UNSUPPORTED;
+}
+
+uint32_t ArmToArm64Assembler::immed8_pre(int32_t immed8, int W)
+{
+    mAddrMode.immediate = immed8;
+    mAddrMode.writeback = W;
+    mAddrMode.preindex  = true;
+    mAddrMode.postindex = false;
+    return OPERAND_IMM;
+}
+
+uint32_t ArmToArm64Assembler::immed8_post(int32_t immed8)
+{
+    mAddrMode.immediate = immed8;
+    mAddrMode.writeback = true;
+    mAddrMode.preindex  = false;
+    mAddrMode.postindex = true;
+    return OPERAND_IMM;
+}
+
+uint32_t ArmToArm64Assembler::reg_pre(int Rm, int W)
+{
+    if(W != 0)
+    {
+        NOT_IMPLEMENTED(); //Not required
+        return OPERAND_UNSUPPORTED;
+    }
+    else
+    {
+        mAddrMode.reg_offset = Rm;
+        return OPERAND_REG_OFFSET;
+    }
+}
+
+uint32_t ArmToArm64Assembler::reg_post(int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return OPERAND_UNSUPPORTED;
+}
+
+// ----------------------------------------------------------------------------
+// A64 instructions
+// ----------------------------------------------------------------------------
+
+static __unused const char * dataTransferOpName[] =
+{
+    "LDR","LDRB","LDRH","STR","STRB","STRH"
+};
+
+static const uint32_t dataTransferOpCode [] =
+{
+    ((0xB8u << 24) | (0x3 << 21) | (0x6 << 13) | (0x0 << 12) |(0x1 << 11)),
+    ((0x38u << 24) | (0x3 << 21) | (0x6 << 13) | (0x1 << 12) |(0x1 << 11)),
+    ((0x78u << 24) | (0x3 << 21) | (0x6 << 13) | (0x0 << 12) |(0x1 << 11)),
+    ((0xB8u << 24) | (0x1 << 21) | (0x6 << 13) | (0x0 << 12) |(0x1 << 11)),
+    ((0x38u << 24) | (0x1 << 21) | (0x6 << 13) | (0x1 << 12) |(0x1 << 11)),
+    ((0x78u << 24) | (0x1 << 21) | (0x6 << 13) | (0x0 << 12) |(0x1 << 11))
+};
+uint32_t ArmToArm64Assembler::A64_LDRSTR_Wm_SXTW_0(uint32_t op,
+                            uint32_t size, uint32_t Rt,
+                            uint32_t Rn, uint32_t Rm)
+{
+    if(size == 32)
+    {
+        LOG_INSTR("%s W%d, [X%d, W%d, SXTW #0]\n",
+                   dataTransferOpName[op], Rt, Rn, Rm);
+        return(dataTransferOpCode[op] | (Rm << 16) | (Rn << 5) | Rt);
+    }
+    else
+    {
+        LOG_INSTR("%s X%d, [X%d, W%d, SXTW #0]\n",
+                  dataTransferOpName[op], Rt, Rn, Rm);
+        return(dataTransferOpCode[op] | (0x1<<30) | (Rm<<16) | (Rn<<5)|Rt);
+    }
+}
+
+uint32_t ArmToArm64Assembler::A64_STR_IMM_PreIndex(uint32_t Rt,
+                            uint32_t Rn, int32_t simm)
+{
+    if(Rn == 31)
+        LOG_INSTR("STR W%d, [SP, #%d]!\n", Rt, simm);
+    else
+        LOG_INSTR("STR W%d, [X%d, #%d]!\n", Rt, Rn, simm);
+
+    uint32_t imm9 = (unsigned)(simm) & 0x01FF;
+    return (0xB8 << 24) | (imm9 << 12) | (0x3 << 10) | (Rn << 5) | Rt;
+}
+
+uint32_t ArmToArm64Assembler::A64_LDR_IMM_PostIndex(uint32_t Rt,
+                            uint32_t Rn, int32_t simm)
+{
+    if(Rn == 31)
+        LOG_INSTR("LDR W%d, [SP], #%d\n",Rt,simm);
+    else
+        LOG_INSTR("LDR W%d, [X%d], #%d\n",Rt, Rn, simm);
+
+    uint32_t imm9 = (unsigned)(simm) & 0x01FF;
+    return (0xB8 << 24) | (0x1 << 22) |
+             (imm9 << 12) | (0x1 << 10) | (Rn << 5) | Rt;
+
+}
+uint32_t ArmToArm64Assembler::A64_ADD_X_Wm_SXTW(uint32_t Rd,
+                               uint32_t Rn,
+                               uint32_t Rm,
+                               uint32_t amount)
+{
+    LOG_INSTR("ADD X%d, X%d, W%d, SXTW #%d\n", Rd, Rn, Rm, amount);
+    return ((0x8Bu << 24) | (0x1 << 21) |(Rm << 16) |
+              (0x6 << 13) | (amount << 10) | (Rn << 5) | Rd);
+
+}
+
+uint32_t ArmToArm64Assembler::A64_SUB_X_Wm_SXTW(uint32_t Rd,
+                               uint32_t Rn,
+                               uint32_t Rm,
+                               uint32_t amount)
+{
+    LOG_INSTR("SUB X%d, X%d, W%d, SXTW #%d\n", Rd, Rn, Rm, amount);
+    return ((0xCBu << 24) | (0x1 << 21) |(Rm << 16) |
+            (0x6 << 13) | (amount << 10) | (Rn << 5) | Rd);
+
+}
+
+uint32_t ArmToArm64Assembler::A64_B_COND(uint32_t cc, uint32_t offset)
+{
+    LOG_INSTR("B.%s #.+%d\n", cc_codes[cc], offset);
+    return (0x54 << 24) | ((offset/4) << 5) | (cc);
+
+}
+uint32_t ArmToArm64Assembler::A64_ADD_X(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("ADD X%d, X%d, X%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x8Bu << 24) | (shift << 22) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+uint32_t ArmToArm64Assembler::A64_ADD_IMM_X(uint32_t Rd, uint32_t Rn,
+                                          uint32_t imm, uint32_t shift)
+{
+    LOG_INSTR("ADD X%d, X%d, #%d, LSL #%d\n", Rd, Rn, imm, shift);
+    return (0x91u << 24) | ((shift/12) << 22) | (imm << 10) | (Rn << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_SUB_IMM_X(uint32_t Rd, uint32_t Rn,
+                                          uint32_t imm, uint32_t shift)
+{
+    LOG_INSTR("SUB X%d, X%d, #%d, LSL #%d\n", Rd, Rn, imm, shift);
+    return (0xD1u << 24) | ((shift/12) << 22) | (imm << 10) | (Rn << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_ADD_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("ADD W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x0B << 24) | (shift << 22) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_SUB_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount,
+                                          uint32_t setflag)
+{
+    if(setflag == 0)
+    {
+        LOG_INSTR("SUB W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+        return ((0x4B << 24) | (shift << 22) | ( Rm << 16) |
+                (amount << 10) |(Rn << 5) | Rd);
+    }
+    else
+    {
+        LOG_INSTR("SUBS W%d, W%d, W%d, %s #%d\n",
+                   Rd, Rn, Rm, shift_codes[shift], amount);
+        return ((0x6B << 24) | (shift << 22) | ( Rm << 16) |
+                (amount << 10) |(Rn << 5) | Rd);
+    }
+}
+
+uint32_t ArmToArm64Assembler::A64_AND_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("AND W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x0A << 24) | (shift << 22) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_ORR_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("ORR W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x2A << 24) | (shift << 22) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_ORN_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("ORN W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x2A << 24) | (shift << 22) | (0x1 << 21) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_CSEL_X(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t cond)
+{
+    LOG_INSTR("CSEL X%d, X%d, X%d, %s\n", Rd, Rn, Rm, cc_codes[cond]);
+    return ((0x9Au << 24)|(0x1 << 23)|(Rm << 16) |(cond << 12)| (Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_CSEL_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t cond)
+{
+    LOG_INSTR("CSEL W%d, W%d, W%d, %s\n", Rd, Rn, Rm, cc_codes[cond]);
+    return ((0x1A << 24)|(0x1 << 23)|(Rm << 16) |(cond << 12)| (Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_RET(uint32_t Rn)
+{
+    LOG_INSTR("RET X%d\n", Rn);
+    return ((0xD6u << 24) | (0x1 << 22) | (0x1F << 16) | (Rn << 5));
+}
+
+uint32_t ArmToArm64Assembler::A64_MOVZ_X(uint32_t Rd, uint32_t imm,
+                                         uint32_t shift)
+{
+    LOG_INSTR("MOVZ X%d, #0x%x, LSL #%d\n", Rd, imm, shift);
+    return (0xD2u << 24) | (0x1 << 23) | ((shift/16) << 21) | (imm << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_MOVK_W(uint32_t Rd, uint32_t imm,
+                                         uint32_t shift)
+{
+    LOG_INSTR("MOVK W%d, #0x%x, LSL #%d\n", Rd, imm, shift);
+    return (0x72 << 24) | (0x1 << 23) | ((shift/16) << 21) | (imm << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_MOVZ_W(uint32_t Rd, uint32_t imm,
+                                         uint32_t shift)
+{
+    LOG_INSTR("MOVZ W%d, #0x%x, LSL #%d\n", Rd, imm, shift);
+    return (0x52 << 24) | (0x1 << 23) | ((shift/16) << 21) | (imm << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_SMADDL(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t Ra)
+{
+    LOG_INSTR("SMADDL X%d, W%d, W%d, X%d\n",Rd, Rn, Rm, Ra);
+    return ((0x9Bu << 24) | (0x1 << 21) | (Rm << 16)|(Ra << 10)|(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_MADD_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t Ra)
+{
+    LOG_INSTR("MADD W%d, W%d, W%d, W%d\n",Rd, Rn, Rm, Ra);
+    return ((0x1B << 24) | (Rm << 16) | (Ra << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_SBFM_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t immr, uint32_t imms)
+{
+    LOG_INSTR("SBFM W%d, W%d, #%d, #%d\n", Rd, Rn, immr, imms);
+    return ((0x13 << 24) | (immr << 16) | (imms << 10) | (Rn << 5) | Rd);
+
+}
+uint32_t ArmToArm64Assembler::A64_UBFM_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t immr, uint32_t imms)
+{
+    LOG_INSTR("UBFM W%d, W%d, #%d, #%d\n", Rd, Rn, immr, imms);
+    return ((0x53 << 24) | (immr << 16) | (imms << 10) | (Rn << 5) | Rd);
+
+}
+uint32_t ArmToArm64Assembler::A64_UBFM_X(uint32_t Rd, uint32_t Rn,
+                                           uint32_t immr, uint32_t imms)
+{
+    LOG_INSTR("UBFM X%d, X%d, #%d, #%d\n", Rd, Rn, immr, imms);
+    return ((0xD3u << 24) | (0x1 << 22) |
+            (immr << 16) | (imms << 10) | (Rn << 5) | Rd);
+
+}
+uint32_t ArmToArm64Assembler::A64_EXTR_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t lsb)
+{
+    LOG_INSTR("EXTR W%d, W%d, W%d, #%d\n", Rd, Rn, Rm, lsb);
+    return (0x13 << 24)|(0x1 << 23) | (Rm << 16) | (lsb << 10)|(Rn << 5) | Rd;
+}
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/Arm64Assembler.h b/libpixelflinger/codeflinger/Arm64Assembler.h
new file mode 100644
index 0000000..527c757
--- /dev/null
+++ b/libpixelflinger/codeflinger/Arm64Assembler.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef ANDROID_ARMTOARM64ASSEMBLER_H
+#define ANDROID_ARMTOARM64ASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "tinyutils/smartpointer.h"
+#include "utils/Vector.h"
+#include "utils/KeyedVector.h"
+
+#include "tinyutils/smartpointer.h"
+#include "codeflinger/ARMAssemblerInterface.h"
+#include "codeflinger/CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ArmToArm64Assembler : public ARMAssemblerInterface
+{
+public:
+    explicit    ArmToArm64Assembler(const sp<Assembly>& assembly);
+    explicit    ArmToArm64Assembler(void *base);
+    virtual     ~ArmToArm64Assembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+
+
+    void        disassemble(const char* name);
+
+    // ------------------------------------------------------------------------
+    // ARMAssemblerInterface...
+    // ------------------------------------------------------------------------
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void ADDR_LDR(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void ADDR_ADD(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+    virtual void ADDR_SUB(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+    virtual void ADDR_STR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+
+
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+private:
+    ArmToArm64Assembler(const ArmToArm64Assembler& rhs);
+    ArmToArm64Assembler& operator = (const ArmToArm64Assembler& rhs);
+
+    // -----------------------------------------------------------------------
+    // helper functions
+    // -----------------------------------------------------------------------
+
+    void dataTransfer(int operation, int cc, int Rd, int Rn,
+                      uint32_t operand_type, uint32_t size = 32);
+    void dataProcessingCommon(int opcode, int s,
+                      int Rd, int Rn, uint32_t Op2);
+
+    // -----------------------------------------------------------------------
+    // Arm64 instructions
+    // -----------------------------------------------------------------------
+    uint32_t A64_B_COND(uint32_t cc, uint32_t offset);
+    uint32_t A64_RET(uint32_t Rn);
+
+    uint32_t A64_LDRSTR_Wm_SXTW_0(uint32_t operation,
+                                uint32_t size, uint32_t Rt,
+                                uint32_t Rn, uint32_t Rm);
+
+    uint32_t A64_STR_IMM_PreIndex(uint32_t Rt, uint32_t Rn, int32_t simm);
+    uint32_t A64_LDR_IMM_PostIndex(uint32_t Rt,uint32_t Rn, int32_t simm);
+
+    uint32_t A64_ADD_X_Wm_SXTW(uint32_t Rd, uint32_t Rn, uint32_t Rm,
+                               uint32_t amount);
+    uint32_t A64_SUB_X_Wm_SXTW(uint32_t Rd, uint32_t Rn, uint32_t Rm,
+                               uint32_t amount);
+
+    uint32_t A64_ADD_IMM_X(uint32_t Rd, uint32_t Rn,
+                           uint32_t imm, uint32_t shift = 0);
+    uint32_t A64_SUB_IMM_X(uint32_t Rd, uint32_t Rn,
+                           uint32_t imm, uint32_t shift = 0);
+
+    uint32_t A64_ADD_X(uint32_t Rd, uint32_t Rn,
+                       uint32_t Rm, uint32_t shift = 0, uint32_t amount = 0);
+    uint32_t A64_ADD_W(uint32_t Rd, uint32_t Rn, uint32_t Rm,
+                       uint32_t shift = 0, uint32_t amount = 0);
+    uint32_t A64_SUB_W(uint32_t Rd, uint32_t Rn, uint32_t Rm,
+                       uint32_t shift = 0, uint32_t amount = 0,
+                       uint32_t setflag = 0);
+    uint32_t A64_AND_W(uint32_t Rd, uint32_t Rn,
+                       uint32_t Rm, uint32_t shift = 0, uint32_t amount = 0);
+    uint32_t A64_ORR_W(uint32_t Rd, uint32_t Rn,
+                       uint32_t Rm, uint32_t shift = 0, uint32_t amount = 0);
+    uint32_t A64_ORN_W(uint32_t Rd, uint32_t Rn,
+                       uint32_t Rm, uint32_t shift = 0, uint32_t amount = 0);
+
+    uint32_t A64_MOVZ_W(uint32_t Rd, uint32_t imm, uint32_t shift);
+    uint32_t A64_MOVZ_X(uint32_t Rd, uint32_t imm, uint32_t shift);
+    uint32_t A64_MOVK_W(uint32_t Rd, uint32_t imm, uint32_t shift);
+
+    uint32_t A64_SMADDL(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t Ra);
+    uint32_t A64_MADD_W(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t Ra);
+
+    uint32_t A64_SBFM_W(uint32_t Rd, uint32_t Rn,
+                        uint32_t immr, uint32_t imms);
+    uint32_t A64_UBFM_W(uint32_t Rd, uint32_t Rn,
+                        uint32_t immr, uint32_t imms);
+    uint32_t A64_UBFM_X(uint32_t Rd, uint32_t Rn,
+                        uint32_t immr, uint32_t imms);
+
+    uint32_t A64_EXTR_W(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t lsb);
+    uint32_t A64_CSEL_X(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t cond);
+    uint32_t A64_CSEL_W(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t cond);
+
+    uint32_t*       mBase;
+    uint32_t*       mPC;
+    uint32_t*       mPrologPC;
+    int64_t         mDuration;
+    uint32_t        mTmpReg1, mTmpReg2, mTmpReg3, mZeroReg;
+
+    struct branch_target_t {
+        inline branch_target_t() : label(0), pc(0) { }
+        inline branch_target_t(const char* l, uint32_t* p)
+            : label(l), pc(p) { }
+        const char* label;
+        uint32_t*   pc;
+    };
+
+    sp<Assembly>    mAssembly;
+    Vector<branch_target_t>                 mBranchTargets;
+    KeyedVector< const char*, uint32_t* >   mLabels;
+    KeyedVector< uint32_t*, const char* >   mLabelsInverseMapping;
+    KeyedVector< uint32_t*, const char* >   mComments;
+
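+    // Values below OPERAND_REG (0x20) are plain register numbers; the
+    // named constants tag the addressing-mode forms whose details are
+    // stashed in mAddrMode (see dataProcessingCommon/dataTransfer).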
+    enum operand_type_t
+    {
+        OPERAND_REG = 0x20,
+        OPERAND_IMM,
+        OPERAND_REG_IMM,
+        OPERAND_REG_OFFSET,
+        OPERAND_UNSUPPORTED
+    };
+
+    struct addr_mode_t {
+        int32_t         immediate;
+        bool            writeback;
+        bool            preindex;
+        bool            postindex;
+        int32_t         reg_imm_Rm;
+        int32_t         reg_imm_type;
+        uint32_t        reg_imm_shift;
+        int32_t         reg_offset;
+    } mAddrMode;
+
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMTOARM64ASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/Arm64Disassembler.cpp b/libpixelflinger/codeflinger/Arm64Disassembler.cpp
new file mode 100644
index 0000000..70f1ff1
--- /dev/null
+++ b/libpixelflinger/codeflinger/Arm64Disassembler.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+
+struct disasm_table_entry_t
+{
+    uint32_t       mask;
+    uint32_t       value;
+    const char*    instr_template;
+};
+
+
+static disasm_table_entry_t disasm_table[] =
+{
+    {0xff000000, 0x91000000, "add <xd|sp>, <xn|sp>, #<imm1>, <shift1>"},
+    {0xff000000, 0xd1000000, "sub <xd|sp>, <xn|sp>, #<imm1>, <shift1>"},
+    {0xff200000, 0x8b000000, "add <xd>, <xn>, <xm>, <shift2> #<amt1>"},
+    {0xff200000, 0x0b000000, "add <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x4b000000, "sub <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x6b000000, "subs <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x0a000000, "and <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x2a000000, "orr <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x2a200000, "orn <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff800000, 0x72800000, "movk <wd>, #<imm2>, lsl #<shift3>"},
+    {0xff800000, 0x52800000, "movz <wd>, #<imm2>, lsl #<shift3>"},
+    {0xff800000, 0xd2800000, "movz <xd>, #<imm2>, lsl #<shift3>"},
+    {0xffe00c00, 0x1a800000, "csel <wd>, <wn>, <wm>, <cond1>"},
+    {0xffe00c00, 0x9a800000, "csel <xd>, <xn>, <xm>, <cond1>"},
+    {0xffe00c00, 0x5a800000, "csinv <wd>, <wn>, <wm>, <cond1>"},
+    {0xffe08000, 0x1b000000, "madd <wd>, <wn>, <wm>, <wa>"},
+    {0xffe08000, 0x9b200000, "smaddl <xd>, <wn>, <wm>, <xa>"},
+    {0xffe04c00, 0xb8604800, "ldr <wt>, [<xn|sp>, <r1><m1>, <ext1> #<amt2>]"},
+    {0xffe04c00, 0xb8204800, "str <wt>, [<xn|sp>, <r1><m1>, <ext1> #<amt2>]"},
+    {0xffe04c00, 0xf8604800, "ldr <xt>, [<xn|sp>, <r1><m1>, <ext1> #<amt3>]"},
+    {0xffe04c00, 0xf8204800, "str <xt>, [<xn|sp>, <r1><m1>, <ext1> #<amt3>]"},
+    {0xffe04c00, 0x38604800, "ldrb <wt>, [<xn|sp>, <r1><m1>, <ext1> <amt5>]"},
+    {0xffe04c00, 0x38204800, "strb <wt>, [<xn|sp>, <r1><m1>, <ext1> <amt5>]"},
+    {0xffe04c00, 0x78604800, "ldrh <wt>, [<xn|sp>, <r1><m1>, <ext1> #<amt6>]"},
+    {0xffe04c00, 0x78204800, "strh <wt>, [<xn|sp>, <r1><m1>, <ext1> #<amt6>]"},
+    {0xffe00c00, 0xb8400400, "ldr <wt>, [<xn|sp>], #<simm1>"},
+    {0xffe00c00, 0xb8000c00, "str <wt>, [<xn|sp>, #<simm1>]!"},
+    {0xffc00000, 0x13000000, "sbfm <wd>, <wn>, #<immr1>, #<imms1>"},
+    {0xffc00000, 0x53000000, "ubfm <wd>, <wn>, #<immr1>, #<imms1>"},
+    {0xffc00000, 0xd3400000, "ubfm <xd>, <xn>, #<immr1>, #<imms1>"},
+    {0xffe00000, 0x13800000, "extr <wd>, <wn>, <wm>, #<lsb1>"},
+    {0xff000000, 0x54000000, "b.<cond2> <label1>"},
+    {0xfffffc1f, 0xd65f0000, "ret <xn>"},
+    {0xffe00000, 0x8b200000, "add <xd|sp>, <xn|sp>, <r2><m1>, <ext2> #<amt4>"},
+    {0xffe00000, 0xcb200000, "sub <xd|sp>, <xn|sp>, <r2><m1>, <ext2> #<amt4>"}
+};
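+
+// A machine word is matched against this table with (code & mask) == value;
+// the matching entry's template is emitted with each <token> placeholder
+// expanded from the instruction bits (see get_token/decode_token below).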
+
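+// Extract bits [msb:lsb] as a signed value: shift the field up to bit 31,
+// then arithmetic-shift back down so the sign bit propagates.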
+static int32_t bits_signed(uint32_t instr, uint32_t msb, uint32_t lsb)
+{
+    int32_t value;
+    value   = ((int32_t)instr) << (31 - msb);
+    value >>= (31 - msb);
+    value >>= lsb;
+    return value;
+}
+static uint32_t bits_unsigned(uint32_t instr, uint32_t msb, uint32_t lsb)
+{
+    uint32_t width = msb - lsb + 1;
+    uint32_t mask  = (1 << width) - 1;
+    return ((instr >> lsb) & mask);
+}
+
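+// Copy the token starting at instr[index] into 'token': either a complete
+// "<...>" placeholder (brackets included) or the literal text up to the
+// next '<'.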
+static void get_token(const char *instr, uint32_t index, char *token)
+{
+    uint32_t i, j;
+    for(i = index, j = 0; i < strlen(instr); ++i)
+    {
+        if(instr[index] == '<' && instr[i] == '>')
+        {
+            token[j++] = instr[i];
+            break;
+        }
+        else if(instr[index] != '<' && instr[i] == '<')
+        {
+            break;
+        }
+        else
+        {
+            token[j++] = instr[i];
+        }
+    }
+    token[j] = '\0';
+    return;
+}
+
+
+static const char * token_cc_table[] =
+{
+    "eq", "ne", "cs", "cc", "mi",
+    "pl", "vs", "vc", "hi", "ls",
+    "ge", "lt", "gt", "le", "al", "nv"
+};
+
+static void decode_rx_zr_token(uint32_t reg, const char *prefix, char *instr_part)
+{
+    if(reg == 31)
+        sprintf(instr_part, "%s%s", prefix, "zr");
+    else
+        sprintf(instr_part, "%s%d", prefix, reg);
+}
+
+static void decode_token(uint32_t code, char *token, char *instr_part)
+{
+    if(strcmp(token, "<imm1>") == 0)
+        sprintf(instr_part, "0x%x", bits_unsigned(code, 21,10));
+    else if(strcmp(token, "<imm2>") == 0)
+        sprintf(instr_part, "0x%x", bits_unsigned(code, 20,5));
+    else if(strcmp(token, "<shift1>") == 0)
+        sprintf(instr_part, "lsl #%d", bits_unsigned(code, 23,22) * 12);
+    else if(strcmp(token, "<shift2>") == 0)
+    {
+        static const char * shift2_table[] = { "lsl", "lsr", "asr", "ror"};
+        sprintf(instr_part, "%s", shift2_table[bits_unsigned(code, 23,22)]);
+    }
+    else if(strcmp(token, "<shift3>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 22,21) * 16);
+    else if(strcmp(token, "<amt1>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 15,10));
+    else if(strcmp(token, "<amt2>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 12,12) * 2);
+    else if(strcmp(token, "<amt3>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 12,12) * 3);
+    else if(strcmp(token, "<amt4>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 12,10));
+    else if(strcmp(token, "<amt5>") == 0)
+    {
+        static const char * amt5_table[] = {"", "#0"};
+        sprintf(instr_part, "%s", amt5_table[bits_unsigned(code, 12,12)]);
+    }
+    else if(strcmp(token, "<amt6>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 12,12));
+    else if(strcmp(token, "<simm1>") == 0)
+        sprintf(instr_part, "%d", bits_signed(code, 20,12));
+    else if(strcmp(token, "<immr1>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 21,16));
+    else if(strcmp(token, "<imms1>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 15,10));
+    else if(strcmp(token, "<lsb1>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 15,10));
+    else if(strcmp(token, "<cond1>") == 0)
+        sprintf(instr_part, "%s", token_cc_table[bits_unsigned(code, 15,12)]);
+    else if(strcmp(token, "<cond2>") == 0)
+        sprintf(instr_part, "%s", token_cc_table[bits_unsigned(code, 4,0)]);
+    else if(strcmp(token, "<r1>") == 0)
+    {
+        const char * token_r1_table[] =
+        {
+            "reserved", "reserved", "w", "x",
+            "reserved", "reserved", "w", "x"
+        };
+        sprintf(instr_part, "%s", token_r1_table[bits_unsigned(code, 15,13)]);
+    }
+    else if(strcmp(token, "<r2>") == 0)
+    {
+        static const char * token_r2_table[] =
+        {
+                "w","w","w", "x", "w", "w", "w", "x"
+        };
+        sprintf(instr_part, "%s", token_r2_table[bits_unsigned(code, 15,13)]);
+    }
+    else if(strcmp(token, "<m1>") == 0)
+    {
+        uint32_t reg = bits_unsigned(code, 20,16);
+        if(reg == 31)
+            sprintf(instr_part, "%s", "zr");
+        else
+            sprintf(instr_part, "%d", reg);
+    }
+    else if(strcmp(token, "<ext1>") == 0)
+    {
+        static const char * token_ext1_table[] =
+        {
+             "reserved","reserved","uxtw", "lsl",
+             "reserved","reserved", "sxtw", "sxtx"
+        };
+        sprintf(instr_part, "%s", token_ext1_table[bits_unsigned(code, 15,13)]);
+    }
+    else if(strcmp(token, "<ext2>") == 0)
+    {
+        static const char * token_ext2_table[] =
+        {
+                "uxtb","uxth","uxtw","uxtx",
+                "sxtb","sxth","sxtw","sxtx"
+        };
+        sprintf(instr_part, "%s", token_ext2_table[bits_unsigned(code, 15,13)]);
+    }
+    else if (strcmp(token, "<label1>") == 0)
+    {
+        int32_t offset = bits_signed(code, 23,5) * 4;
+        if(offset > 0)
+            sprintf(instr_part, "#.+%d", offset);
+        else
+            sprintf(instr_part, "#.-%d", -offset);
+    }
+    else if (strcmp(token, "<xn|sp>") == 0)
+    {
+        uint32_t reg = bits_unsigned(code, 9, 5);
+        if(reg == 31)
+            sprintf(instr_part, "%s", "sp");
+        else
+            sprintf(instr_part, "x%d", reg);
+    }
+    else if (strcmp(token, "<xd|sp>") == 0)
+    {
+        uint32_t reg = bits_unsigned(code, 4, 0);
+        if(reg == 31)
+            sprintf(instr_part, "%s", "sp");
+        else
+            sprintf(instr_part, "x%d", reg);
+    }
+    else if (strcmp(token, "<xn>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 9, 5), "x", instr_part);
+    else if (strcmp(token, "<xd>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 4, 0), "x", instr_part);
+    else if (strcmp(token, "<xm>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 20, 16), "x", instr_part);
+    else if (strcmp(token, "<xa>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 14, 10), "x", instr_part);
+    else if (strcmp(token, "<xt>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 4, 0), "x", instr_part);
+    else if (strcmp(token, "<wn>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 9, 5), "w", instr_part);
+    else if (strcmp(token, "<wd>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 4, 0), "w", instr_part);
+    else if (strcmp(token, "<wm>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 20, 16), "w", instr_part);
+    else if (strcmp(token, "<wa>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 14, 10), "w", instr_part);
+    else if (strcmp(token, "<wt>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 4, 0), "w", instr_part);
+    else
+    {
+        sprintf(instr_part, "error");
+    }
+    return;
+}
+
+int arm64_disassemble(uint32_t code, char* instr)
+{
+    uint32_t i;
+    char token[256];
+    char instr_part[256];
+
+    if(instr == NULL)
+        return -1;
+
+    bool matched = false;
+    disasm_table_entry_t *entry = NULL;
+    for(i = 0; i < sizeof(disasm_table)/sizeof(disasm_table_entry_t); ++i)
+    {
+        entry = &disasm_table[i];
+        if((code & entry->mask) == entry->value)
+        {
+            matched = true;
+            break;
+        }
+    }
+    if(matched == false)
+    {
+        strcpy(instr, "Unknown Instruction");
+        return -1;
+    }
+    else
+    {
+        uint32_t index = 0;
+        uint32_t length = strlen(entry->instr_template);
+        instr[0] = '\0';
+        do
+        {
+            get_token(entry->instr_template, index, token);
+            if(token[0] == '<')
+            {
+                decode_token(code, token, instr_part);
+                strcat(instr, instr_part);
+            }
+            else
+            {
+                strcat(instr, token);
+            }
+            index += strlen(token);
+        }while(index < length);
+        return 0;
+    }
+}
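+
+// Usage sketch (illustrative, not part of the original file):
+//
+//     char text[256];
+//     if (arm64_disassemble(0xd65f03c0, text) == 0)
+//         printf("%s\n", text);    // prints "ret x30"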
diff --git a/libpixelflinger/codeflinger/Arm64Disassembler.h b/libpixelflinger/codeflinger/Arm64Disassembler.h
new file mode 100644
index 0000000..86f3aba
--- /dev/null
+++ b/libpixelflinger/codeflinger/Arm64Disassembler.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef ANDROID_ARM64DISASSEMBLER_H
+#define ANDROID_ARM64DISASSEMBLER_H
+
+#include <inttypes.h>
+int arm64_disassemble(uint32_t code, char* instr);
+
+#endif //ANDROID_ARM64DISASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/CodeCache.cpp b/libpixelflinger/codeflinger/CodeCache.cpp
new file mode 100644
index 0000000..32691a3
--- /dev/null
+++ b/libpixelflinger/codeflinger/CodeCache.cpp
@@ -0,0 +1,217 @@
+/* libs/pixelflinger/codeflinger/CodeCache.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "CodeCache"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <cutils/ashmem.h>
+#include <log/log.h>
+
+#include "CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+#if defined(__arm__) || defined(__aarch64__)
+#include <unistd.h>
+#include <errno.h>
+#endif
+
+#if defined(__mips__)
+#include <asm/cachectl.h>
+#include <errno.h>
+#endif
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// A dlmalloc mspace is used to manage the code cache over an mmap'ed region.
+#define HAVE_MMAP 0
+#define HAVE_MREMAP 0
+#define HAVE_MORECORE 0
+#define MALLOC_ALIGNMENT 16
+#define MSPACES 1
+#define NO_MALLINFO 1
+#define ONLY_MSPACES 1
+// Custom heap error handling.
+#define PROCEED_ON_ERROR 0
+static void heap_error(const char* msg, const char* function, void* p);
+#define CORRUPTION_ERROR_ACTION(m) \
+    heap_error("HEAP MEMORY CORRUPTION", __FUNCTION__, NULL)
+#define USAGE_ERROR_ACTION(m,p) \
+    heap_error("ARGUMENT IS INVALID HEAP ADDRESS", __FUNCTION__, p)
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wexpansion-to-defined"
+#pragma GCC diagnostic ignored "-Wnull-pointer-arithmetic"
+#include "../../../../external/dlmalloc/malloc.c"
+#pragma GCC diagnostic pop
+
+static void heap_error(const char* msg, const char* function, void* p) {
+    ALOG(LOG_FATAL, LOG_TAG, "@@@ ABORTING: CODE FLINGER: %s IN %s addr=%p",
+         msg, function, p);
+    /* So that we can get a memory dump around p */
+    *((int **) 0xdeadbaad) = (int *) p;
+}
+
+// ----------------------------------------------------------------------------
+
+static void* gExecutableStore = NULL;
+static mspace gMspace = NULL;
+const size_t kMaxCodeCacheCapacity = 1024 * 1024;
+
+static mspace getMspace()
+{
+    if (gExecutableStore == NULL) {
+        int fd = ashmem_create_region("CodeFlinger code cache",
+                                      kMaxCodeCacheCapacity);
+        LOG_ALWAYS_FATAL_IF(fd < 0,
+                            "Creating code cache, ashmem_create_region "
+                            "failed with error '%s'", strerror(errno));
+        gExecutableStore = mmap(NULL, kMaxCodeCacheCapacity,
+                                PROT_READ | PROT_WRITE | PROT_EXEC,
+                                MAP_PRIVATE, fd, 0);
+        LOG_ALWAYS_FATAL_IF(gExecutableStore == MAP_FAILED,
+                            "Creating code cache, mmap failed with error "
+                            "'%s'", strerror(errno));
+        close(fd);
+        gMspace = create_mspace_with_base(gExecutableStore, kMaxCodeCacheCapacity,
+                                          /*locked=*/ false);
+        mspace_set_footprint_limit(gMspace, kMaxCodeCacheCapacity);
+    }
+    return gMspace;
+}
+
+Assembly::Assembly(size_t size)
+    : mCount(0), mSize(0)
+{
+    mBase = (uint32_t*)mspace_malloc(getMspace(), size);
+    LOG_ALWAYS_FATAL_IF(mBase == NULL,
+                        "Failed to create Assembly of size %zd in executable "
+                        "store of size %zd", size, kMaxCodeCacheCapacity);
+    mSize = size;
+}
+
+Assembly::~Assembly()
+{
+    mspace_free(getMspace(), mBase);
+}
+
+void Assembly::incStrong(const void*) const
+{
+    mCount.fetch_add(1, std::memory_order_relaxed);
+}
+
+void Assembly::decStrong(const void*) const
+{
+    if (mCount.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+        delete this;
+    }
+}
+
+ssize_t Assembly::size() const
+{
+    if (!mBase) return NO_MEMORY;
+    return mSize;
+}
+
+uint32_t* Assembly::base() const
+{
+    return mBase;
+}
+
+ssize_t Assembly::resize(size_t newSize)
+{
+    mBase = (uint32_t*)mspace_realloc(getMspace(), mBase, newSize);
+    LOG_ALWAYS_FATAL_IF(mBase == NULL,
+                        "Failed to resize Assembly to %zd in code cache "
+                        "of size %zd", newSize, kMaxCodeCacheCapacity);
+    mSize = newSize;
+    return size();
+}
+
+// ----------------------------------------------------------------------------
+
+CodeCache::CodeCache(size_t size)
+    : mWhen(0), mCacheSize(size), mCacheInUse(0)
+{
+    pthread_mutex_init(&mLock, 0);
+}
+
+CodeCache::~CodeCache()
+{
+    pthread_mutex_destroy(&mLock);
+}
+
+sp<Assembly> CodeCache::lookup(const AssemblyKeyBase& keyBase) const
+{
+    pthread_mutex_lock(&mLock);
+    sp<Assembly> r;
+    ssize_t index = mCacheData.indexOfKey(key_t(keyBase));
+    if (index >= 0) {
+        const cache_entry_t& e = mCacheData.valueAt(index);
+        e.when = mWhen++;
+        r = e.entry;
+    }
+    pthread_mutex_unlock(&mLock);
+    return r;
+}
+
+int CodeCache::cache(  const AssemblyKeyBase& keyBase,
+                            const sp<Assembly>& assembly)
+{
+    pthread_mutex_lock(&mLock);
+
+    const ssize_t assemblySize = assembly->size();
+    while (mCacheInUse + assemblySize > mCacheSize) {
+        // evict the LRU
+        size_t lru = 0;
+        size_t count = mCacheData.size();
+        for (size_t i=0 ; i<count ; i++) {
+            const cache_entry_t& e = mCacheData.valueAt(i);
+            if (e.when < mCacheData.valueAt(lru).when) {
+                lru = i;
+            }
+        }
+        const cache_entry_t& e = mCacheData.valueAt(lru);
+        mCacheInUse -= e.entry->size();
+        mCacheData.removeItemsAt(lru);
+    }
+
+    ssize_t err = mCacheData.add(key_t(keyBase), cache_entry_t(assembly, mWhen));
+    if (err >= 0) {
+        mCacheInUse += assemblySize;
+        mWhen++;
+        // synchronize caches...
+        char* base = reinterpret_cast<char*>(assembly->base());
+        char* curr = reinterpret_cast<char*>(base + assembly->size());
+        __builtin___clear_cache(base, curr);
+    }
+
+    pthread_mutex_unlock(&mLock);
+    return err;
+}
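+
+// Typical caller flow (illustrative sketch; MyKey and codeSize are
+// hypothetical, not from this file):
+//
+//     CodeCache cache(32 * 1024);
+//     AssemblyKey<MyKey> key(k);
+//     sp<Assembly> assembly = cache.lookup(key);
+//     if (assembly == 0) {
+//         assembly = new Assembly(codeSize);
+//         // ... emit instructions into assembly->base() ...
+//         cache.cache(key, assembly);   // may first evict LRU entries
+//     }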
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/CodeCache.h b/libpixelflinger/codeflinger/CodeCache.h
new file mode 100644
index 0000000..9326453
--- /dev/null
+++ b/libpixelflinger/codeflinger/CodeCache.h
@@ -0,0 +1,136 @@
+/* libs/pixelflinger/codeflinger/CodeCache.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_CODECACHE_H
+#define ANDROID_CODECACHE_H
+
+#include <atomic>
+#include <stdint.h>
+#include <pthread.h>
+#include <sys/types.h>
+
+#include "utils/KeyedVector.h"
+#include "tinyutils/smartpointer.h"
+
+namespace android {
+
+using namespace tinyutils;
+
+// ----------------------------------------------------------------------------
+
+class AssemblyKeyBase {
+public:
+    virtual ~AssemblyKeyBase() { }
+    virtual int compare_type(const AssemblyKeyBase& key) const = 0;
+};
+
+template  <typename T>
+class AssemblyKey : public AssemblyKeyBase
+{
+public:
+    explicit AssemblyKey(const T& rhs) : mKey(rhs) { }
+    virtual int compare_type(const AssemblyKeyBase& key) const {
+        const T& rhs = static_cast<const AssemblyKey&>(key).mKey;
+        return android::compare_type(mKey, rhs);
+    }
+private:
+    T mKey;
+};
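+
+// Illustrative note (not part of the original header): any type with an
+// android::compare_type() overload can serve as the key, e.g.
+//
+//     struct MyKey { uint32_t a, b; };   // hypothetical
+//     inline int compare_type(const MyKey& lhs, const MyKey& rhs) {
+//         return memcmp(&lhs, &rhs, sizeof(MyKey));
+//     }
+//     AssemblyKey<MyKey> key(k);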
+
+// ----------------------------------------------------------------------------
+
+class Assembly
+{
+public:
+    explicit    Assembly(size_t size);
+    virtual     ~Assembly();
+
+    ssize_t     size() const;
+    uint32_t*   base() const;
+    ssize_t     resize(size_t size);
+
+    // protocol for sp<>
+            void    incStrong(const void* id) const;
+            void    decStrong(const void* id) const;
+    typedef void    weakref_type;
+
+private:
+    mutable std::atomic<int32_t>     mCount;
+            uint32_t*   mBase;
+            size_t      mSize;
+};
+
+// ----------------------------------------------------------------------------
+
+class CodeCache
+{
+public:
+// pretty simple cache API...
+    explicit            CodeCache(size_t size);
+                        ~CodeCache();
+
+    sp<Assembly>        lookup(const AssemblyKeyBase& key) const;
+
+    int                 cache(const AssemblyKeyBase& key,
+                              const sp<Assembly>& assembly);
+
+private:
+    // nothing to see here...
+    struct cache_entry_t {
+        inline cache_entry_t() { }
+        inline cache_entry_t(const sp<Assembly>& a, int64_t w)
+                : entry(a), when(w) { }
+        sp<Assembly>            entry;
+        mutable int64_t         when;
+    };
+
+    class key_t {
+        friend int compare_type(
+            const key_value_pair_t<key_t, cache_entry_t>&,
+            const key_value_pair_t<key_t, cache_entry_t>&);
+        const AssemblyKeyBase* mKey;
+    public:
+        key_t() { };
+        explicit key_t(const AssemblyKeyBase& k) : mKey(&k)  { }
+    };
+
+    mutable pthread_mutex_t             mLock;
+    mutable int64_t                     mWhen;
+    size_t                              mCacheSize;
+    size_t                              mCacheInUse;
+    KeyedVector<key_t, cache_entry_t>   mCacheData;
+
+    friend int compare_type(
+        const key_value_pair_t<key_t, cache_entry_t>&,
+        const key_value_pair_t<key_t, cache_entry_t>&);
+};
+
+// KeyedVector uses compare_type(), which is more efficient than
+// just using operator < ()
+inline int compare_type(
+    const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& lhs,
+    const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& rhs)
+{
+    return lhs.key.mKey->compare_type(*(rhs.key.mKey));
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
+#endif //ANDROID_CODECACHE_H
diff --git a/libpixelflinger/codeflinger/GGLAssembler.cpp b/libpixelflinger/codeflinger/GGLAssembler.cpp
new file mode 100644
index 0000000..04e285d
--- /dev/null
+++ b/libpixelflinger/codeflinger/GGLAssembler.cpp
@@ -0,0 +1,1195 @@
+/* libs/pixelflinger/codeflinger/GGLAssembler.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "GGLAssembler"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <log/log.h>
+
+#include "GGLAssembler.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+GGLAssembler::GGLAssembler(ARMAssemblerInterface* target)
+    : ARMAssemblerProxy(target),
+      RegisterAllocator(ARMAssemblerProxy::getCodegenArch()), mOptLevel(7)
+{
+}
+
+GGLAssembler::~GGLAssembler()
+{
+}
+
+void GGLAssembler::prolog()
+{
+    ARMAssemblerProxy::prolog();
+}
+
+void GGLAssembler::epilog(uint32_t touched)
+{
+    ARMAssemblerProxy::epilog(touched);
+}
+
+void GGLAssembler::reset(int opt_level)
+{
+    ARMAssemblerProxy::reset();
+    RegisterAllocator::reset();
+    mOptLevel = opt_level;
+}
+
+// ---------------------------------------------------------------------------
+
+int GGLAssembler::scanline(const needs_t& needs, context_t const* c)
+{
+    int err = 0;
+    int opt_level = mOptLevel;
+    while (opt_level >= 0) {
+        reset(opt_level);
+        err = scanline_core(needs, c);
+        if (err == 0)
+            break;
+        opt_level--;
+    }
+    
+    // XXX: in theory, pcForLabel is not valid before generate()
+    uint32_t* fragment_start_pc = pcForLabel("fragment_loop");
+    uint32_t* fragment_end_pc = pcForLabel("epilog");
+    const int per_fragment_ops = int(fragment_end_pc - fragment_start_pc);
+    
+    // build a name for our pipeline
+    char name[64];    
+    sprintf(name,
+            "scanline__%08X:%08X_%08X_%08X [%3d ipp]",
+            needs.p, needs.n, needs.t[0], needs.t[1], per_fragment_ops);
+
+    if (err) {
+        ALOGE("Error while generating ""%s""\n", name);
+        disassemble(name);
+        return -1;
+    }
+
+    return generate(name);
+}
+
+int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
+{
+    mBlendFactorCached = 0;
+    mBlending = 0;
+    mMasking = 0;
+    mAA        = GGL_READ_NEEDS(P_AA, needs.p);
+    mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
+    mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
+    mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
+    mFog       = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
+    mSmooth    = GGL_READ_NEEDS(SHADE, needs.n) != 0;
+    mBuilderContext.needs = needs;
+    mBuilderContext.c = c;
+    mBuilderContext.Rctx = reserveReg(R0); // context always in R0
+    mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];
+
+    // ------------------------------------------------------------------------
+
+    decodeLogicOpNeeds(needs);
+
+    decodeTMUNeeds(needs, c);
+
+    mBlendSrc  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
+    mBlendDst  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
+    mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
+    mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));
+
+    if (!mCbFormat.c[GGLFormat::ALPHA].h) {
+        if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendSrc == GGL_DST_ALPHA)) {
+            mBlendSrc = GGL_ONE;
+        }
+        if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendSrcA == GGL_DST_ALPHA)) {
+            mBlendSrcA = GGL_ONE;
+        }
+        if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendDst == GGL_DST_ALPHA)) {
+            mBlendDst = GGL_ONE;
+        }
+        if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendDstA == GGL_DST_ALPHA)) {
+            mBlendDstA = GGL_ONE;
+        }
+    }
+
+    // if we need the framebuffer, read it now
+    const int blending =    blending_codes(mBlendSrc, mBlendDst) |
+                            blending_codes(mBlendSrcA, mBlendDstA);
+
+    // XXX: handle special cases, destination not modified...
+    if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
+        (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
+        // Destination unmodified (beware of logic ops)
+    } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
+        (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
+        // Destination is zero (beware of logic ops)
+    }
+    
+    int fbComponents = 0;
+    const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
+    for (int i=0 ; i<4 ; i++) {
+        const int mask = 1<<i;
+        component_info_t& info = mInfo[i];
+        int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+        int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+        if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
+            fs = GGL_ONE;
+        info.masked =   !!(masking & mask);
+        info.inDest =   !info.masked && mCbFormat.c[i].h && 
+                        ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
+        if (mCbFormat.components >= GGL_LUMINANCE &&
+                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
+            info.inDest = false;
+        }
+        info.needed =   (i==GGLFormat::ALPHA) && 
+                        (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
+        info.replaced = !!(mTextureMachine.replaced & mask);
+        info.iterated = (!info.replaced && (info.inDest || info.needed)); 
+        info.smooth =   mSmooth && info.iterated;
+        info.fog =      mFog && info.inDest && (i != GGLFormat::ALPHA);
+        info.blend =    (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
+
+        mBlending |= (info.blend ? mask : 0);
+        mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
+        fbComponents |= mCbFormat.c[i].h ? mask : 0;
+    }
+
+    mAllMasked = (mMasking == fbComponents);
+    if (mAllMasked) {
+        mDithering = 0;
+    }
+    
+    fragment_parts_t parts;
+
+    // ------------------------------------------------------------------------
+    prolog();
+    // ------------------------------------------------------------------------
+
+    build_scanline_prolog(parts, needs);
+
+    if (registerFile().status())
+        return registerFile().status();
+
+    // ------------------------------------------------------------------------
+    label("fragment_loop");
+    // ------------------------------------------------------------------------
+    {
+        Scratch regs(registerFile());
+
+        if (mDithering) {
+            // update the dither index.
+            MOV(AL, 0, parts.count.reg,
+                    reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
+            ADD(AL, 0, parts.count.reg, parts.count.reg,
+                    imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
+            MOV(AL, 0, parts.count.reg,
+                    reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
+        }
+
+        // XXX: could we do an early alpha-test here in some cases?
+        // It would probably be used only with smooth-alpha and no texture
+        // (or no alpha component in the texture).
+
+        // Early z-test
+        if (mAlphaTest==GGL_ALWAYS) {
+            build_depth_test(parts, Z_TEST|Z_WRITE);
+        } else {
+            // we cannot do the z-write here, because
+            // it might be killed by the alpha-test later
+            build_depth_test(parts, Z_TEST);
+        }
+
+        { // texture coordinates
+            Scratch scratches(registerFile());
+
+            // texel generation
+            build_textures(parts, regs);
+            if (registerFile().status())
+                return registerFile().status();
+        }
+
+        if ((blending & (FACTOR_DST|BLEND_DST)) || 
+                (mMasking && !mAllMasked) ||
+                (mLogicOp & LOGIC_OP_DST)) 
+        {
+            // blending / logic_op / masking need the framebuffer
+            mDstPixel.setTo(regs.obtain(), &mCbFormat);
+
+            // load the framebuffer pixel
+            comment("fetch color-buffer");
+            load(parts.cbPtr, mDstPixel);
+        }
+
+        if (registerFile().status())
+            return registerFile().status();
+
+        pixel_t pixel;
+        int directTex = mTextureMachine.directTexture;
+        if (directTex | parts.packed) {
+            // note: we can't have both here
+            // iterated color or direct texture
+            pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
+            pixel.flags &= ~CORRUPTIBLE;
+        } else {
+            if (mDithering) {
+                const int ctxtReg = mBuilderContext.Rctx;
+                const int mask = GGL_DITHER_SIZE-1;
+                parts.dither = reg_t(regs.obtain());
+                AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
+                ADDR_ADD(AL, 0, parts.dither.reg, ctxtReg, parts.dither.reg);
+                LDRB(AL, parts.dither.reg, parts.dither.reg,
+                        immed12_pre(GGL_OFFSETOF(ditherMatrix)));
+            }
+        
+            // allocate a register for the resulting pixel
+            pixel.setTo(regs.obtain(), &mCbFormat, FIRST);
+
+            build_component(pixel, parts, GGLFormat::ALPHA,    regs);
+
+            if (mAlphaTest!=GGL_ALWAYS) {
+                // only handle the z-write part here. We know z-test
+                // was successful, as well as alpha-test.
+                build_depth_test(parts, Z_WRITE);
+            }
+
+            build_component(pixel, parts, GGLFormat::RED,      regs);
+            build_component(pixel, parts, GGLFormat::GREEN,    regs);
+            build_component(pixel, parts, GGLFormat::BLUE,     regs);
+
+            pixel.flags |= CORRUPTIBLE;
+        }
+
+        if (registerFile().status())
+            return registerFile().status();
+        
+        if (pixel.reg == -1) {
+            // be defensive here. if we're here it's probably
+            // that this whole fragment is a no-op.
+            pixel = mDstPixel;
+        }
+        
+        if (!mAllMasked) {
+            // logic operation
+            build_logic_op(pixel, regs);
+    
+            // masking
+            build_masking(pixel, regs); 
+    
+            comment("store");
+            store(parts.cbPtr, pixel, WRITE_BACK);
+        }
+    }
+
+    if (registerFile().status())
+        return registerFile().status();
+
+    // update the iterated color...
+    if (parts.reload != 3) {
+        build_smooth_shade(parts);
+    }
+
+    // update iterated z
+    build_iterate_z(parts);
+
+    // update iterated fog
+    build_iterate_f(parts);
+
+    SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
+    B(PL, "fragment_loop");
+    label("epilog");
+    epilog(registerFile().touched());
+
+    if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
+        if (mDepthTest!=GGL_ALWAYS) {
+            label("discard_before_textures");
+            build_iterate_texture_coordinates(parts);
+        }
+        label("discard_after_textures");
+        build_smooth_shade(parts);
+        build_iterate_z(parts);
+        build_iterate_f(parts);
+        if (!mAllMasked) {
+            ADDR_ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
+        }
+        SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
+        B(PL, "fragment_loop");
+        epilog(registerFile().touched());
+    }
+
+    return registerFile().status();
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_scanline_prolog(
+    fragment_parts_t& parts, const needs_t& needs)
+{
+    Scratch scratches(registerFile());
+
+    // compute count
+    comment("compute ct (# of pixels to process)");
+    parts.count.setTo(obtainReg());
+    int Rx = scratches.obtain();    
+    int Ry = scratches.obtain();
+    CONTEXT_LOAD(Rx, iterators.xl);
+    CONTEXT_LOAD(parts.count.reg, iterators.xr);
+    CONTEXT_LOAD(Ry, iterators.y);
+
+    // parts.count = iterators.xr - Rx
+    SUB(AL, 0, parts.count.reg, parts.count.reg, Rx);
+    SUB(AL, 0, parts.count.reg, parts.count.reg, imm(1));
+
+    if (mDithering) {
+        // parts.count.reg = 0xNNNNXXDD
+        // NNNN = count-1
+        // DD   = dither offset
+        // XX   = 0xxxxxxx (x = garbage)
+        Scratch scratches(registerFile());
+        int tx = scratches.obtain();
+        int ty = scratches.obtain();
+        AND(AL, 0, tx, Rx, imm(GGL_DITHER_MASK));
+        AND(AL, 0, ty, Ry, imm(GGL_DITHER_MASK));
+        ADD(AL, 0, tx, tx, reg_imm(ty, LSL, GGL_DITHER_ORDER_SHIFT));
+        ORR(AL, 0, parts.count.reg, tx, reg_imm(parts.count.reg, LSL, 16));
+    } else {
+        // parts.count.reg = 0xNNNN0000
+        // NNNN = count-1
+        MOV(AL, 0, parts.count.reg, reg_imm(parts.count.reg, LSL, 16));
+    }
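+    // Worked example (illustrative, assuming GGL_DITHER_ORDER_SHIFT == 3,
+    // i.e. an 8x8 dither matrix): with xl=3, xr=10, y=5 and dithering on,
+    // count-1 = 10-3-1 = 6 and the dither offset is ((5&7)<<3)|(3&7) = 0x2b,
+    // so parts.count.reg ends up holding 0x0006002b.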
+
+    if (!mAllMasked) {
+        // compute dst ptr
+        comment("compute color-buffer pointer");
+        const int cb_bits = mCbFormat.size*8;
+        int Rs = scratches.obtain();
+        parts.cbPtr.setTo(obtainReg(), cb_bits);
+        CONTEXT_LOAD(Rs, state.buffers.color.stride);
+        CONTEXT_ADDR_LOAD(parts.cbPtr.reg, state.buffers.color.data);
+        SMLABB(AL, Rs, Ry, Rs, Rx);  // Rs = Rx + Ry*Rs
+        base_offset(parts.cbPtr, parts.cbPtr, Rs);
+        scratches.recycle(Rs);
+    }
+    
+    // init fog
+    const int need_fog = GGL_READ_NEEDS(P_FOG, needs.p);
+    if (need_fog) {
+        comment("compute initial fog coordinate");
+        Scratch scratches(registerFile());
+        int dfdx = scratches.obtain();
+        int ydfdy = scratches.obtain();
+        int f = ydfdy;
+        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);
+        CONTEXT_LOAD(ydfdy, iterators.ydfdy);
+        MLA(AL, 0, f, Rx, dfdx, ydfdy);
+        CONTEXT_STORE(f, generated_vars.f);
+    }
+
+    // init Z coordinate
+    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
+        parts.z = reg_t(obtainReg());
+        comment("compute initial Z coordinate");
+        Scratch scratches(registerFile());
+        int dzdx = scratches.obtain();
+        int ydzdy = parts.z.reg;
+        CONTEXT_LOAD(dzdx,  generated_vars.dzdx);   // 1.31 fixed-point
+        CONTEXT_LOAD(ydzdy, iterators.ydzdy);       // 1.31 fixed-point
+        MLA(AL, 0, parts.z.reg, Rx, dzdx, ydzdy);
+
+        // we're going to index zbase by parts.count
+        // zbase = base + (xl-count + stride*y)*2
+        int Rs = dzdx;
+        int zbase = scratches.obtain();
+        CONTEXT_LOAD(Rs, state.buffers.depth.stride);
+        CONTEXT_ADDR_LOAD(zbase, state.buffers.depth.data);
+        SMLABB(AL, Rs, Ry, Rs, Rx);
+        ADD(AL, 0, Rs, Rs, reg_imm(parts.count.reg, LSR, 16));
+        ADDR_ADD(AL, 0, zbase, zbase, reg_imm(Rs, LSL, 1));
+        CONTEXT_ADDR_STORE(zbase, generated_vars.zbase);
+    }
+
+    // init texture coordinates
+    init_textures(parts.coords, reg_t(Rx), reg_t(Ry));
+    scratches.recycle(Ry);
+
+    // iterated color
+    init_iterated_color(parts, reg_t(Rx));
+
+    // init coverage factor application (anti-aliasing)
+    if (mAA) {
+        parts.covPtr.setTo(obtainReg(), 16);
+        CONTEXT_ADDR_LOAD(parts.covPtr.reg, state.buffers.coverage);
+        ADDR_ADD(AL, 0, parts.covPtr.reg, parts.covPtr.reg, reg_imm(Rx, LSL, 1));
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_component( pixel_t& pixel,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& regs)
+{
+    static char const * comments[] = {"alpha", "red", "green", "blue"};
+    comment(comments[component]);
+
+    // local register file
+    Scratch scratches(registerFile());
+    const int dst_component_size = pixel.component_size(component);
+
+    component_t temp(-1);
+    build_incoming_component( temp, dst_component_size,
+            parts, component, scratches, regs);
+
+    if (mInfo[component].inDest) {
+
+        // blending...
+        build_blending( temp, mDstPixel, component, scratches );
+
+        // downshift component and rebuild pixel...
+        downshift(pixel, component, temp, parts.dither);
+    }
+}
+
+void GGLAssembler::build_incoming_component(
+                                    component_t& temp,
+                                    int dst_size,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& scratches,
+                                    Scratch& global_regs)
+{
+    const uint32_t component_mask = 1<<component;
+
+    // Figure out what we need for the blending stage...
+    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) {
+        fs = GGL_ONE;
+    }
+
+    // Figure out what we need to extract and for what reason
+    const int blending = blending_codes(fs, fd);
+
+    // Are we actually going to blend?
+    const int need_blending = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
+    
+    // expand the source if the destination has more bits
+    int need_expander = false;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT-1 ; i++) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if ((tmu.format_idx) &&
+            (parts.texel[i].component_size(component) < dst_size)) {
+            need_expander = true;
+        }
+    }
+
+    // do we need to extract this component?
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    const int blend_needs_alpha_source = (component==GGLFormat::ALPHA) &&
+                                        (isAlphaSourceNeeded());
+    int need_extract = mInfo[component].needed;
+    if (mInfo[component].inDest)
+    {
+        need_extract |= ((need_blending ?
+                (blending & (BLEND_SRC|FACTOR_SRC)) : need_expander));
+        need_extract |= (mTextureMachine.mask != mTextureMachine.replaced);
+        need_extract |= mInfo[component].smooth;
+        need_extract |= mInfo[component].fog;
+        need_extract |= mDithering;
+        need_extract |= multiTexture;
+    }
+
+    if (need_extract) {
+        Scratch& regs = blend_needs_alpha_source ? global_regs : scratches;
+        component_t fragment;
+
+        // iterated color
+        build_iterated_color(fragment, parts, component, regs);
+
+        // texture environment (decal, modulate, replace)
+        build_texture_environment(fragment, parts, component, regs);
+
+        // expand the source if the destination has more bits
+        if (need_expander && (fragment.size() < dst_size)) {
+            // we're here only if we fetched a texel
+            // (so we know for sure fragment is CORRUPTIBLE)
+            expand(fragment, fragment, dst_size);
+        }
+
+        // We have a few specific things to do for the alpha-channel
+        if ((component==GGLFormat::ALPHA) &&
+            (mInfo[component].needed || fragment.size()<dst_size))
+        {
+            // convert to integer_t first and make sure
+            // we don't corrupt a needed register
+            if (fragment.l) {
+                component_t incoming(fragment);
+                modify(fragment, regs);
+                MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSR, incoming.l));
+                fragment.h -= fragment.l;
+                fragment.l = 0;
+            }
+
+            // coverage factor application
+            build_coverage_application(fragment, parts, regs);
+
+            // alpha-test
+            build_alpha_test(fragment, parts);
+
+            if (blend_needs_alpha_source) {
+                // We keep only 8 bits for the blending stage
+                const int shift = fragment.h <= 8 ? 0 : fragment.h-8;
+                if (fragment.flags & CORRUPTIBLE) {
+                    fragment.flags &= ~CORRUPTIBLE;
+                    mAlphaSource.setTo(fragment.reg,
+                            fragment.size(), fragment.flags);
+                    if (shift) {
+                        MOV(AL, 0, mAlphaSource.reg,
+                            reg_imm(mAlphaSource.reg, LSR, shift));
+                    }
+                } else {
+                    // XXX: it would be better to do this in build_blend_factor()
+                    // so we can avoid the extra MOV below.
+                    mAlphaSource.setTo(regs.obtain(),
+                            fragment.size(), CORRUPTIBLE);
+                    if (shift) {
+                        MOV(AL, 0, mAlphaSource.reg,
+                            reg_imm(fragment.reg, LSR, shift));
+                    } else {
+                        MOV(AL, 0, mAlphaSource.reg, fragment.reg);
+                    }
+                }
+                mAlphaSource.s -= shift;
+            }
+        }
+
+        // fog...
+        build_fog( fragment, component, regs );
+
+        temp = fragment;
+    } else {
+        if (mInfo[component].inDest) {
+            // extraction not needed and replace
+            // we just select the right component
+            if ((mTextureMachine.replaced & component_mask) == 0) {
+                // component wasn't replaced, so use it!
+                temp = component_t(parts.iterated, component);
+            }
+            for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+                const texture_unit_t& tmu = mTextureMachine.tmu[i];
+                if ((tmu.mask & component_mask) &&
+                    ((tmu.replaced & component_mask) == 0)) {
+                    temp = component_t(parts.texel[i], component);
+                }
+            }
+        }
+    }
+}
+
+bool GGLAssembler::isAlphaSourceNeeded() const
+{
+    // XXX: also needed for alpha-test
+    const int bs = mBlendSrc;
+    const int bd = mBlendDst;
+    return  bs==GGL_SRC_ALPHA_SATURATE ||
+            bs==GGL_SRC_ALPHA || bs==GGL_ONE_MINUS_SRC_ALPHA ||
+            bd==GGL_SRC_ALPHA || bd==GGL_ONE_MINUS_SRC_ALPHA ; 
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_smooth_shade(const fragment_parts_t& parts)
+{
+    if (mSmooth && !parts.iterated_packed) {
+        // update the iterated color in a pipelined way...
+        comment("update iterated color");
+        Scratch scratches(registerFile());
+
+        const int reload = parts.reload;
+        for (int i=0 ; i<4 ; i++) {
+            if (!mInfo[i].iterated) 
+                continue;
+                
+            int c = parts.argb[i].reg;
+            int dx = parts.argb_dx[i].reg;
+            
+            if (reload & 1) {
+                c = scratches.obtain();
+                CONTEXT_LOAD(c, generated_vars.argb[i].c);
+            }
+            if (reload & 2) {
+                dx = scratches.obtain();
+                CONTEXT_LOAD(dx, generated_vars.argb[i].dx);
+            }
+            
+            if (mSmooth) {
+                ADD(AL, 0, c, c, dx);
+            }
+            
+            if (reload & 1) {
+                CONTEXT_STORE(c, generated_vars.argb[i].c);
+                scratches.recycle(c);
+            }
+            if (reload & 2) {
+                scratches.recycle(dx);
+            }
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_coverage_application(component_t& fragment,
+        const fragment_parts_t& parts, Scratch& regs)
+{
+    // here fragment.l is guaranteed to be 0
+    if (mAA) {
+        // coverages are 1.15 fixed-point numbers
+        comment("coverage application");
+
+        component_t incoming(fragment);
+        modify(fragment, regs);
+
+        Scratch scratches(registerFile());
+        int cf = scratches.obtain();
+        LDRH(AL, cf, parts.covPtr.reg, immed8_post(2));
+        if (fragment.h > 31) {
+            fragment.h--;
+            SMULWB(AL, fragment.reg, incoming.reg, cf);
+        } else {
+            MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSL, 1));
+            SMULWB(AL, fragment.reg, fragment.reg, cf);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_alpha_test(component_t& fragment,
+                                    const fragment_parts_t& /*parts*/)
+{
+    if (mAlphaTest != GGL_ALWAYS) {
+        comment("Alpha Test");
+        Scratch scratches(registerFile());
+        int ref = scratches.obtain();
+        const int shift = GGL_COLOR_BITS-fragment.size();
+        CONTEXT_LOAD(ref, state.alpha_test.ref);
+        if (shift) CMP(AL, fragment.reg, reg_imm(ref, LSR, shift));
+        else       CMP(AL, fragment.reg, ref);
+        int cc = NV;
+        switch (mAlphaTest) {
+        case GGL_NEVER:     cc = NV;    break;
+        case GGL_LESS:      cc = LT;    break;
+        case GGL_EQUAL:     cc = EQ;    break;
+        case GGL_LEQUAL:    cc = LS;    break;
+        case GGL_GREATER:   cc = HI;    break;
+        case GGL_NOTEQUAL:  cc = NE;    break;
+        case GGL_GEQUAL:    cc = HS;    break;
+        }
+        B(cc^1, "discard_after_textures");
+    }
+}
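+
+// Note (illustrative): ARM condition codes come in even/odd pairs, so
+// cc^1 yields the logical inverse (EQ^1 == NE, HI^1 == LS, LT^1 == GE),
+// which is why build_alpha_test() branches with B(cc^1, ...) when the
+// test fails.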
+
+// ---------------------------------------------------------------------------
+            
+void GGLAssembler::build_depth_test(
+        const fragment_parts_t& parts, uint32_t mask)
+{
+    mask &= Z_TEST|Z_WRITE;
+    const needs_t& needs = mBuilderContext.needs;
+    const int zmask = GGL_READ_NEEDS(P_MASK_Z, needs.p);
+    Scratch scratches(registerFile());
+
+    if (mDepthTest != GGL_ALWAYS || zmask) {
+        int cc=AL, ic=AL;
+        switch (mDepthTest) {
+        case GGL_LESS:      ic = HI;    break;
+        case GGL_EQUAL:     ic = EQ;    break;
+        case GGL_LEQUAL:    ic = HS;    break;
+        case GGL_GREATER:   ic = LT;    break;
+        case GGL_NOTEQUAL:  ic = NE;    break;
+        case GGL_GEQUAL:    ic = LS;    break;
+        case GGL_NEVER:
+            // this never happens, because it's taken care of when
+            // computing the needs. But we keep it for completeness.
+            comment("Depth Test (NEVER)");
+            B(AL, "discard_before_textures");
+            return;
+        case GGL_ALWAYS:
+            // we're here because zmask is enabled
+            mask &= ~Z_TEST;    // test always passes.
+            break;
+        }
+        
+        // invert the condition
+        cc = ic^1;
+        
+        if ((mask & Z_WRITE) && !zmask) {
+            mask &= ~Z_WRITE;
+        }
+        
+        if (!mask)
+            return;
+
+        comment("Depth Test");
+
+        int zbase = scratches.obtain();
+        int depth = scratches.obtain();
+        int z = parts.z.reg;
+        
+        CONTEXT_ADDR_LOAD(zbase, generated_vars.zbase);  // stall
+        ADDR_SUB(AL, 0, zbase, zbase, reg_imm(parts.count.reg, LSR, 15));
+            // above does zbase = zbase - ((count >> 16) << 1)
+
+        if (mask & Z_TEST) {
+            LDRH(AL, depth, zbase);  // stall
+            CMP(AL, depth, reg_imm(z, LSR, 16));
+            B(cc, "discard_before_textures");
+        }
+        if (mask & Z_WRITE) {
+            if (mask == Z_WRITE) {
+                // only z-write asked, cc is meaningless
+                ic = AL;
+            }
+            MOV(AL, 0, depth, reg_imm(z, LSR, 16));
+            STRH(ic, depth, zbase);
+        }
+    }
+}
+
+void GGLAssembler::build_iterate_z(const fragment_parts_t& parts)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
+        Scratch scratches(registerFile());
+        int dzdx = scratches.obtain();
+        CONTEXT_LOAD(dzdx, generated_vars.dzdx);    // stall
+        ADD(AL, 0, parts.z.reg, parts.z.reg, dzdx); 
+    }
+}
+
+void GGLAssembler::build_iterate_f(const fragment_parts_t& /*parts*/)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    if (GGL_READ_NEEDS(P_FOG, needs.p)) {
+        Scratch scratches(registerFile());
+        int dfdx = scratches.obtain();
+        int f = scratches.obtain();
+        CONTEXT_LOAD(f,     generated_vars.f);
+        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);   // stall
+        ADD(AL, 0, f, f, dfdx);
+        CONTEXT_STORE(f,    generated_vars.f);
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_logic_op(pixel_t& pixel, Scratch& regs)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+    if (opcode == GGL_COPY)
+        return;
+    
+    comment("logic operation");
+
+    pixel_t s(pixel);
+    if (!(pixel.flags & CORRUPTIBLE)) {
+        pixel.reg = regs.obtain();
+        pixel.flags |= CORRUPTIBLE;
+    }
+    
+    pixel_t d(mDstPixel);
+    switch(opcode) {
+    case GGL_CLEAR:         MOV(AL, 0, pixel.reg, imm(0));          break;
+    case GGL_AND:           AND(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_AND_REVERSE:   BIC(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_COPY:                                                  break;
+    case GGL_AND_INVERTED:  BIC(AL, 0, pixel.reg, d.reg, s.reg);    break;
+    case GGL_NOOP:          MOV(AL, 0, pixel.reg, d.reg);           break;
+    case GGL_XOR:           EOR(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_OR:            ORR(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_NOR:           ORR(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_EQUIV:         EOR(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_INVERT:        MVN(AL, 0, pixel.reg, d.reg);           break;
+    case GGL_OR_REVERSE:    // s | ~d == ~(~s & d)
+                            BIC(AL, 0, pixel.reg, d.reg, s.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_COPY_INVERTED: MVN(AL, 0, pixel.reg, s.reg);           break;
+    case GGL_OR_INVERTED:   // ~s | d == ~(s & ~d)
+                            BIC(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_NAND:          AND(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_SET:           MVN(AL, 0, pixel.reg, imm(0));          break;
+    };        
+}
+
+// ---------------------------------------------------------------------------
+
+static uint32_t find_bottom(uint32_t val)
+{
+    uint32_t i = 0;
+    while (!(val & (3<<i)))
+        i+= 2;
+    return i;
+}
+
+static void normalize(uint32_t& val, uint32_t& rot)
+{
+    rot = 0;
+    while (!(val&3)  || (val & 0xFC000000)) {
+        uint32_t newval;
+        newval = val >> 2;
+        newval |= (val&3) << 30;
+        val = newval;
+        rot += 2;
+        if (rot == 32) {
+            rot = 0;
+            break;
+        }
+    }
+}
+
+void GGLAssembler::build_and_immediate(int d, int s, uint32_t mask, int bits)
+{
+    uint32_t rot;
+    uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
+    mask &= size;
+
+    if (mask == size) {
+        if (d != s)
+            MOV( AL, 0, d, s);
+        return;
+    }
+    
+    if ((getCodegenArch() == CODEGEN_ARCH_MIPS) ||
+        (getCodegenArch() == CODEGEN_ARCH_MIPS64)) {
+        // MIPS can do a 16-bit immediate in 1 instruction, a 32-bit one in 3.
+        // The 'while (mask)' code below is buggy on MIPS: isValidImmediate()
+        // returns true there, so we would emit multiple AND instructions
+        // (positive logic).
+        AND( AL, 0, d, s, imm(mask) );
+        return;
+    }
+    else if (getCodegenArch() == CODEGEN_ARCH_ARM64) {
+        AND( AL, 0, d, s, imm(mask) );
+        return;
+    }
+
+    int negative_logic = !isValidImmediate(mask);
+    if (negative_logic) {
+        mask = ~mask & size;
+    }
+    normalize(mask, rot);
+
+    if (mask) {
+        while (mask) {
+            uint32_t bitpos = find_bottom(mask);
+            int shift = rot + bitpos;
+            uint32_t m = mask & (0xff << bitpos);
+            mask &= ~m;
+            m >>= bitpos;
+            int32_t newMask =  (m<<shift) | (m>>(32-shift));
+            if (!negative_logic) {
+                AND( AL, 0, d, s, imm(newMask) );
+            } else {
+                BIC( AL, 0, d, s, imm(newMask) );
+            }
+            s = d;
+        }
+    } else {
+        MOV( AL, 0, d, imm(0));
+    }
+}
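+
+// Worked example (illustrative): AND d, s, #0x00ff00ff is not encodable
+// as a single ARM rotated 8-bit immediate, so build_and_immediate()
+// switches to negative logic (~mask = 0xff00ff00) and emits
+//     BIC d, s, #0x0000ff00
+//     BIC d, d, #0xff000000
+// leaving d = s & 0x00ff00ff.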
+
+void GGLAssembler::build_masking(pixel_t& pixel, Scratch& regs)
+{
+    if (!mMasking || mAllMasked) {
+        return;
+    }
+
+    comment("color mask");
+
+    pixel_t fb(mDstPixel);
+    pixel_t s(pixel);
+    if (!(pixel.flags & CORRUPTIBLE)) {
+        pixel.reg = regs.obtain();
+        pixel.flags |= CORRUPTIBLE;
+    }
+
+    int mask = 0;
+    for (int i=0 ; i<4 ; i++) {
+        const int component_mask = 1<<i;
+        const int h = fb.format.c[i].h;
+        const int l = fb.format.c[i].l;
+        if (h && (!(mMasking & component_mask))) {
+            mask |= ((1<<(h-l))-1) << l;
+        }
+    }
+
+    // There is no need to clear the masked components of the source
+    // (unless we applied a logic op), because they're already zeroed 
+    // by construction (masked components are not computed)
+
+    if (mLogicOp) {
+        const needs_t& needs = mBuilderContext.needs;
+        const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+        if (opcode != GGL_CLEAR) {
+            // clear masked component of source
+            build_and_immediate(pixel.reg, s.reg, mask, fb.size());
+            s = pixel;
+        }
+    }
+
+    // clear non masked components of destination
+    build_and_immediate(fb.reg, fb.reg, ~mask, fb.size()); 
+
+    // or back the channels that were masked
+    if (s.reg == fb.reg) {
+         // this is in fact a MOV
+        if (s.reg == pixel.reg) {
+            // ugh. this is in fact a nop
+        } else {
+            MOV(AL, 0, pixel.reg, fb.reg);
+        }
+    } else {
+        ORR(AL, 0, pixel.reg, s.reg, fb.reg);
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::base_offset(
+        const pointer_t& d, const pointer_t& b, const reg_t& o)
+{
+    switch (b.size) {
+    case 32:
+        ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 2));
+        break;
+    case 24:
+        if (d.reg == b.reg) {
+            ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
+            ADDR_ADD(AL, 0, d.reg, d.reg, o.reg);
+        } else {
+            ADDR_ADD(AL, 0, d.reg, o.reg, reg_imm(o.reg, LSL, 1));
+            ADDR_ADD(AL, 0, d.reg, d.reg, b.reg);
+        }
+        break;
+    case 16:
+        ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
+        break;
+    case 8:
+        ADDR_ADD(AL, 0, d.reg, b.reg, o.reg);
+        break;
+    }
+}
+
+// ----------------------------------------------------------------------------
+// cheezy register allocator...
+// ----------------------------------------------------------------------------
+
+// Modified to support MIPS processors, in a very simple way. We retain the
+// (Arm) limit of 16 total registers, but shift the mapping of those registers
+// from 0-15 to 2-17. (Register 0 on MIPS cannot be used as a GP register,
+// and register 1 has a traditional use as a temp.)
+
+RegisterAllocator::RegisterAllocator(int arch) : mRegs(arch)
+{
+}
+
+void RegisterAllocator::reset()
+{
+    mRegs.reset();
+}
+
+int RegisterAllocator::reserveReg(int reg)
+{
+    return mRegs.reserve(reg);
+}
+
+int RegisterAllocator::obtainReg()
+{
+    return mRegs.obtain();
+}
+
+void RegisterAllocator::recycleReg(int reg)
+{
+    mRegs.recycle(reg);
+}
+
+RegisterAllocator::RegisterFile& RegisterAllocator::registerFile()
+{
+    return mRegs;
+}
+
+// ----------------------------------------------------------------------------
+
+RegisterAllocator::RegisterFile::RegisterFile(int codegen_arch)
+    : mRegs(0), mTouched(0), mStatus(0), mArch(codegen_arch), mRegisterOffset(0)
+{
+    if ((mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) ||
+        (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS64)) {
+        mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
+    }
+    reserve(ARMAssemblerInterface::SP);
+    reserve(ARMAssemblerInterface::PC);
+}
+
+RegisterAllocator::RegisterFile::RegisterFile(const RegisterFile& rhs, int codegen_arch)
+    : mRegs(rhs.mRegs), mTouched(rhs.mTouched), mStatus(rhs.mStatus),
+      mArch(codegen_arch), mRegisterOffset(0)
+{
+    if ((mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) ||
+        (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS64)) {
+        mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
+    }
+}
+
+RegisterAllocator::RegisterFile::~RegisterFile()
+{
+}
+
+bool RegisterAllocator::RegisterFile::operator == (const RegisterFile& rhs) const
+{
+    return (mRegs == rhs.mRegs);
+}
+
+void RegisterAllocator::RegisterFile::reset()
+{
+    mRegs = mTouched = mStatus = 0;
+    reserve(ARMAssemblerInterface::SP);
+    reserve(ARMAssemblerInterface::PC);
+}
+
+// RegisterFile::reserve() takes a register parameter in the
+// range 0-15 (Arm compatible), but on a Mips processor will
+// return the actual allocated register in the range 2-17.
+int RegisterAllocator::RegisterFile::reserve(int reg)
+{
+    reg += mRegisterOffset;
+    LOG_ALWAYS_FATAL_IF(isUsed(reg),
+                        "reserving register %d, but already in use",
+                        reg);
+    mRegs |= (1<<reg);
+    mTouched |= mRegs;
+    return reg;
+}
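+
+// Worked example (illustrative): on MIPS, mRegisterOffset == 2, so
+//
+//     RegisterAllocator::RegisterFile rf(ARMAssemblerInterface::CODEGEN_ARCH_MIPS);
+//     int r = rf.reserve(0);   // r == 2: bit 2 of mRegs (and mTouched) is set
+//
+// while on ARM the same call returns 0 and sets bit 0.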
+
+// This interface uses regMask in range 2-17 on MIPS, no translation.
+void RegisterAllocator::RegisterFile::reserveSeveral(uint32_t regMask)
+{
+    mRegs |= regMask;
+    mTouched |= regMask;
+}
+
+int RegisterAllocator::RegisterFile::isUsed(int reg) const
+{
+    LOG_ALWAYS_FATAL_IF(reg>=16+(int)mRegisterOffset, "invalid register %d", reg);
+    return mRegs & (1<<reg);
+}
+
+int RegisterAllocator::RegisterFile::obtain()
+{
+    const char priorityList[14] = {  0,  1, 2, 3, 
+                                    12, 14, 4, 5, 
+                                     6,  7, 8, 9,
+                                    10, 11 };
+    const int nbreg = sizeof(priorityList);
+    int i, r, reg;
+    for (i=0 ; i<nbreg ; i++) {
+        r = priorityList[i];
+        if (!isUsed(r + mRegisterOffset)) {
+            break;
+        }
+    }
+    // this is not an error anymore, because we'll try again with
+    // a lower optimization level.
+    //ALOGE_IF(i >= nbreg, "pixelflinger ran out of registers\n");
+    if (i >= nbreg) {
+        mStatus |= OUT_OF_REGISTERS;
+        // we return SP so we can more easily debug things
+        // the code will never be run anyway.
+        return ARMAssemblerInterface::SP; 
+    }
+    reg = reserve(r);  // Param in Arm range 0-15, returns range 2-17 on Mips.
+    return reg;
+}
+
+bool RegisterAllocator::RegisterFile::hasFreeRegs() const
+{
+    uint32_t regs = mRegs >> mRegisterOffset;   // MIPS fix.
+    return (regs & 0xFFFF) != 0xFFFF;
+}
+
+int RegisterAllocator::RegisterFile::countFreeRegs() const
+{
+    uint32_t regs = mRegs >> mRegisterOffset;   // MIPS fix.
+    int f = ~regs & 0xFFFF;
+    // now count the number of 1 bits
+    f = (f & 0x5555) + ((f>>1) & 0x5555);
+    f = (f & 0x3333) + ((f>>2) & 0x3333);
+    f = (f & 0x0F0F) + ((f>>4) & 0x0F0F);
+    f = (f & 0x00FF) + ((f>>8) & 0x00FF);
+    return f;
+}
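+
+// Worked example (illustrative): with only SP and PC reserved on ARM,
+// mRegs == (1<<13)|(1<<15) == 0xA000, so f starts as ~0xA000 & 0xFFFF
+// == 0x5FFF, and the four pairwise-sum (SWAR popcount) steps above
+// reduce it to the number of set bits: countFreeRegs() returns 14.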
+
+void RegisterAllocator::RegisterFile::recycle(int reg)
+{
+    // commented out, since the common failure of running out of regs
+    // triggers this assertion. Since the code is not executed
+    // in that case, it does not matter. No reason for a FATAL error.
+    // LOG_FATAL_IF(!isUsed(reg),
+    //         "recycling unallocated register %d",
+    //         reg);
+    mRegs &= ~(1<<reg);
+}
+
+void RegisterAllocator::RegisterFile::recycleSeveral(uint32_t regMask)
+{
+    // commented out, since the common failure of running out of regs
+    // triggers this assertion. Since the code is not executed
+    // in that case, it does not matter. No reason for a FATAL error.
+    // LOG_FATAL_IF((mRegs & regMask)!=regMask,
+    //         "recycling unallocated registers "
+    //         "(recycle=%08x, allocated=%08x, unallocated=%08x)",
+    //         regMask, mRegs, mRegs&regMask);
+    mRegs &= ~regMask;
+}
+
+uint32_t RegisterAllocator::RegisterFile::touched() const
+{
+    return mTouched;
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/GGLAssembler.h b/libpixelflinger/codeflinger/GGLAssembler.h
new file mode 100644
index 0000000..47dbf3a
--- /dev/null
+++ b/libpixelflinger/codeflinger/GGLAssembler.h
@@ -0,0 +1,572 @@
+/* libs/pixelflinger/codeflinger/GGLAssembler.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_GGLASSEMBLER_H
+#define ANDROID_GGLASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <private/pixelflinger/ggl_context.h>
+
+#include "ARMAssemblerProxy.h"
+
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+#define CONTEXT_ADDR_LOAD(REG, FIELD) \
+    ADDR_LDR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+#define CONTEXT_ADDR_STORE(REG, FIELD) \
+    ADDR_STR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+#define CONTEXT_LOAD(REG, FIELD) \
+    LDR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+#define CONTEXT_STORE(REG, FIELD) \
+    STR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
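+
+// Illustration (hypothetical register and field names): these macros emit a
+// single load/store relative to the context pointer held in Rctx, e.g.
+//
+//     CONTEXT_LOAD(Rx, iterators.y);
+//
+// expands to
+//
+//     LDR(AL, Rx, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(iterators.y)));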
+
+
+class RegisterAllocator
+{
+public:
+    class RegisterFile;
+    
+                    RegisterAllocator(int arch);  // NOLINT, implicit
+    RegisterFile&   registerFile();
+    int             reserveReg(int reg);
+    int             obtainReg();
+    void            recycleReg(int reg);
+    void            reset();
+
+    class RegisterFile
+    {
+    public:
+                            RegisterFile(int arch);  // NOLINT, implicit
+                            RegisterFile(const RegisterFile& rhs, int arch);
+                            ~RegisterFile();
+
+                void        reset();
+
+                bool operator == (const RegisterFile& rhs) const;
+                bool operator != (const RegisterFile& rhs) const {
+                    return !operator == (rhs);
+                }
+
+                int         reserve(int reg);
+                void        reserveSeveral(uint32_t regMask);
+
+                void        recycle(int reg);
+                void        recycleSeveral(uint32_t regMask);
+
+                int         obtain();
+        inline  int         isUsed(int reg) const;
+
+                bool        hasFreeRegs() const;
+                int         countFreeRegs() const;                
+
+                uint32_t    touched() const;
+        inline  uint32_t    status() const { return mStatus; }
+        
+        enum {
+            OUT_OF_REGISTERS = 0x1
+        };
+
+    private:
+        uint32_t    mRegs;
+        uint32_t    mTouched;
+        uint32_t    mStatus;
+        int         mArch;
+        uint32_t    mRegisterOffset;    // lets reg alloc use 2..17 for mips
+                                        // while arm uses 0..15
+    };
+ 
+    class Scratch
+    {
+    public:
+            explicit Scratch(RegisterFile& regFile)
+                : mRegFile(regFile), mScratch(0) { 
+            }
+            ~Scratch() {
+                mRegFile.recycleSeveral(mScratch);
+            }
+        int obtain() { 
+            int reg = mRegFile.obtain();
+            mScratch |= 1<<reg;
+            return reg;
+        }
+        void recycle(int reg) {
+            mRegFile.recycle(reg);
+            mScratch &= ~(1<<reg);
+        }
+        bool isUsed(int reg) {
+            return (mScratch & (1<<reg));
+        }
+        int countFreeRegs() {
+            return mRegFile.countFreeRegs();
+        }
+    private:
+        RegisterFile&   mRegFile;
+        uint32_t        mScratch;
+    };
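+
+    // Usage sketch (illustrative): registers obtained through a Scratch
+    // object are recycled automatically when it goes out of scope:
+    //
+    //     {
+    //         Scratch scratches(registerFile());
+    //         int t = scratches.obtain();   // temporary register
+    //         /* ... emit code using t ... */
+    //     }   // ~Scratch() recycles t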
+
+    class Spill
+    {
+    public:
+        Spill(RegisterFile& regFile, ARMAssemblerInterface& gen, uint32_t reglist)
+            : mRegFile(regFile), mGen(gen), mRegList(reglist), mCount(0)
+        {
+            if (reglist) {
+                int count = 0;
+                while (reglist) {
+                    count++;
+                    reglist &= ~(1 << (31 - __builtin_clz(reglist)));
+                }
+                if (count == 1) {
+                    int reg = 31 - __builtin_clz(mRegList);
+                    mGen.STR(mGen.AL, reg, mGen.SP, mGen.immed12_pre(-4, 1));
+                } else {
+                    mGen.STM(mGen.AL, mGen.DB, mGen.SP, 1, mRegList);
+                }
+                mRegFile.recycleSeveral(mRegList);
+                mCount = count;
+            }
+        }
+        ~Spill() {
+            if (mRegList) {
+                if (mCount == 1) {
+                    int reg = 31 - __builtin_clz(mRegList);
+                    mGen.LDR(mGen.AL, reg, mGen.SP, mGen.immed12_post(4));
+                } else {
+                    mGen.LDM(mGen.AL, mGen.IA, mGen.SP, 1, mRegList);
+                }
+                mRegFile.reserveSeveral(mRegList);
+            }
+        }
+    private:
+        RegisterFile&           mRegFile;
+        ARMAssemblerInterface&  mGen;
+        uint32_t                mRegList;
+        int                     mCount;
+    };
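+
+    // Usage sketch (illustrative, LSAVED being a hypothetical register
+    // mask): Spill stores the listed registers on the stack and recycles
+    // them, so they can be re-obtained inside the scope; its destructor
+    // reloads and re-reserves them:
+    //
+    //     {
+    //         Spill spill(registerFile(), *this, LSAVED);
+    //         /* ... registers in LSAVED are free for reuse here ... */
+    //     }   // ~Spill() restores them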
+    
+private:
+    RegisterFile    mRegs;
+};
+
+// ----------------------------------------------------------------------------
+
+class GGLAssembler : public ARMAssemblerProxy, public RegisterAllocator
+{
+public:
+
+    explicit    GGLAssembler(ARMAssemblerInterface* target);
+    virtual     ~GGLAssembler();
+
+    uint32_t*   base() const { return 0; } // XXX
+    uint32_t*   pc() const { return 0; } // XXX
+
+    void        reset(int opt_level);
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+
+        // generate scanline code for given needs
+    int         scanline(const needs_t& needs, context_t const* c);
+    int         scanline_core(const needs_t& needs, context_t const* c);
+
+        enum {
+            CLEAR_LO    = 0x0001,
+            CLEAR_HI    = 0x0002,
+            CORRUPTIBLE = 0x0004,
+            FIRST       = 0x0008
+        };
+
+        enum { //load/store flags
+            WRITE_BACK  = 0x0001
+        };
+
+        struct reg_t {
+            reg_t() : reg(-1), flags(0) {
+            }
+            reg_t(int r, int f=0)  // NOLINT, implicit
+                : reg(r), flags(f) {
+            }
+            void setTo(int r, int f=0) {
+                reg=r; flags=f;
+            }
+            int         reg;
+            uint16_t    flags;
+        };
+
+        struct integer_t : public reg_t {
+            integer_t() : reg_t(), s(0) {
+            }
+            integer_t(int r, int sz=32, int f=0)  // NOLINT, implicit
+                : reg_t(r, f), s(sz) {
+            }
+            void setTo(int r, int sz=32, int f=0) {
+                reg_t::setTo(r, f); s=sz;
+            }
+            int8_t s;
+            inline int size() const { return s; }
+        };
+        
+        struct pixel_t : public reg_t {
+            pixel_t() : reg_t() {
+                memset(&format, 0, sizeof(GGLFormat));
+            }
+            pixel_t(int r, const GGLFormat* fmt, int f=0)
+                : reg_t(r, f), format(*fmt) {
+            }
+            void setTo(int r, const GGLFormat* fmt, int f=0) {
+                reg_t::setTo(r, f); format = *fmt;
+            }
+            GGLFormat format;
+            inline int hi(int c) const { return format.c[c].h; }
+            inline int low(int c) const { return format.c[c].l; }
+            inline int mask(int c) const { return ((1<<size(c))-1) << low(c); }
+            inline int size() const { return format.size*8; }
+            inline int size(int c) const { return component_size(c); }
+            inline int component_size(int c) const { return hi(c) - low(c); }
+        };
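+
+        // Example (illustrative): for an RGB565 pixel the green component
+        // (ARGB index 2) has h=11, l=5, so size(2) == 6,
+        // mask(2) == ((1<<6)-1) << 5 == 0x07E0, and size() == 16.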
+
+        struct component_t : public reg_t {
+            component_t() : reg_t(), h(0), l(0) {
+            }
+            component_t(int r, int f=0)  // NOLINT, implicit
+                : reg_t(r, f), h(0), l(0) {
+            }
+            component_t(int r, int lo, int hi, int f=0)
+                : reg_t(r, f), h(hi), l(lo) {
+            }
+            explicit component_t(const integer_t& rhs)
+                : reg_t(rhs.reg, rhs.flags), h(rhs.s), l(0) {
+            }
+            explicit component_t(const pixel_t& rhs, int component) {
+                setTo(  rhs.reg, 
+                        rhs.format.c[component].l,
+                        rhs.format.c[component].h,
+                        rhs.flags|CLEAR_LO|CLEAR_HI);
+            }
+            void setTo(int r, int lo=0, int hi=0, int f=0) {
+                reg_t::setTo(r, f); h=hi; l=lo;
+            }
+            int8_t h;
+            int8_t l;
+            inline int size() const { return h-l; }
+        };
+
+        struct pointer_t : public reg_t {
+            pointer_t() : reg_t(), size(0) {
+            }
+            pointer_t(int r, int s, int f=0)
+                : reg_t(r, f), size(s) {
+            }
+            void setTo(int r, int s, int f=0) {
+                reg_t::setTo(r, f); size=s;
+            }
+            int8_t size;
+        };
+
+
+private:
+    // GGLAssembler hides RegisterAllocator's and ARMAssemblerProxy's reset
+    // methods by providing a reset method with a different parameter set. The
+    // intent of GGLAssembler's reset method is to wrap the inherited reset
+    // methods, so make these methods private in order to prevent direct calls
+    // to these methods from clients.
+    using RegisterAllocator::reset;
+    using ARMAssemblerProxy::reset;
+
+    struct tex_coord_t {
+        reg_t       s;
+        reg_t       t;
+        pointer_t   ptr;
+    };
+
+    struct fragment_parts_t {
+        uint32_t    packed  : 1;
+        uint32_t    reload  : 2;
+        uint32_t    iterated_packed  : 1;
+        pixel_t     iterated;
+        pointer_t   cbPtr;
+        pointer_t   covPtr;
+        reg_t       count;
+        reg_t       argb[4];
+        reg_t       argb_dx[4];
+        reg_t       z;
+        reg_t       dither;
+        pixel_t     texel[GGL_TEXTURE_UNIT_COUNT];
+        tex_coord_t coords[GGL_TEXTURE_UNIT_COUNT];
+    };
+    
+    struct texture_unit_t {
+        int         format_idx;
+        GGLFormat   format;
+        int         bits;
+        int         swrap;
+        int         twrap;
+        int         env;
+        int         pot;
+        int         linear;
+        uint8_t     mask;
+        uint8_t     replaced;
+    };
+
+    struct texture_machine_t {
+        texture_unit_t  tmu[GGL_TEXTURE_UNIT_COUNT];
+        uint8_t         mask;
+        uint8_t         replaced;
+        uint8_t         directTexture;
+        uint8_t         activeUnits;
+    };
+
+    struct component_info_t {
+        bool    masked      : 1;
+        bool    inDest      : 1;
+        bool    needed      : 1;
+        bool    replaced    : 1;
+        bool    iterated    : 1;
+        bool    smooth      : 1;
+        bool    blend       : 1;
+        bool    fog         : 1;
+    };
+
+    struct builder_context_t {
+        context_t const*    c;
+        needs_t             needs;
+        int                 Rctx;
+    };
+
+    template <typename T>
+    void modify(T& r, Scratch& regs)
+    {
+        if (!(r.flags & CORRUPTIBLE)) {
+            r.reg = regs.obtain();
+            r.flags |= CORRUPTIBLE;
+        }
+    }
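+
+    // Illustration: modify() makes a value safe to overwrite in place.
+    // If r is not CORRUPTIBLE (its register still holds a live value),
+    // a fresh scratch register is substituted first, e.g.:
+    //
+    //     component_t temp(fragment);     // hypothetical live value
+    //     modify(temp, scratches);        // temp.reg may now be clobbered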
+
+    // helpers
+    void    base_offset(const pointer_t& d, const pointer_t& b, const reg_t& o);
+
+    // texture environement
+    void    modulate(   component_t& dest,
+                        const component_t& incoming,
+                        const pixel_t& texel, int component);
+
+    void    decal(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component);
+
+    void    blend(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component, int tmu);
+
+    void    add(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component);
+
+    // load/store stuff
+    void    store(const pointer_t& addr, const pixel_t& src, uint32_t flags=0);
+    void    load(const pointer_t& addr, const pixel_t& dest, uint32_t flags=0);
+    void    extract(integer_t& d, const pixel_t& s, int component);    
+    void    extract(component_t& d, const pixel_t& s, int component);    
+    void    extract(integer_t& d, int s, int h, int l, int bits=32);
+    void    expand(integer_t& d, const integer_t& s, int dbits);
+    void    expand(integer_t& d, const component_t& s, int dbits);
+    void    expand(component_t& d, const component_t& s, int dbits);
+    void    downshift(pixel_t& d, int component, component_t s, const reg_t& dither);
+
+
+    void    mul_factor( component_t& d,
+                        const integer_t& v,
+                        const integer_t& f);
+
+    void    mul_factor_add( component_t& d,
+                            const integer_t& v,
+                            const integer_t& f,
+                            const component_t& a);
+
+    void    component_add(  component_t& d,
+                            const integer_t& dst,
+                            const integer_t& src);
+
+    void    component_sat(  const component_t& v);
+
+
+    void    build_scanline_prolog(  fragment_parts_t& parts,
+                                    const needs_t& needs);
+
+    void    build_smooth_shade(const fragment_parts_t& parts);
+
+    void    build_component(    pixel_t& pixel,
+                                const fragment_parts_t& parts,
+                                int component,
+                                Scratch& global_scratches);
+                                
+    void    build_incoming_component(
+                                component_t& temp,
+                                int dst_size,
+                                const fragment_parts_t& parts,
+                                int component,
+                                Scratch& scratches,
+                                Scratch& global_scratches);
+
+    void    init_iterated_color(fragment_parts_t& parts, const reg_t& x);
+
+    void    build_iterated_color(   component_t& fragment,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& regs);
+
+    void    decodeLogicOpNeeds(const needs_t& needs);
+    
+    void    decodeTMUNeeds(const needs_t& needs, context_t const* c);
+
+    void    init_textures(  tex_coord_t* coords,
+                            const reg_t& x,
+                            const reg_t& y);
+
+    void    build_textures( fragment_parts_t& parts,
+                            Scratch& regs);
+
+    void    filter8(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter16(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter24(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter32(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    build_texture_environment(  component_t& fragment,
+                                        const fragment_parts_t& parts,
+                                        int component,
+                                        Scratch& regs);
+
+    void    wrapping(   int d,
+                        int coord, int size,
+                        int tx_wrap, int tx_linear);
+
+    void    build_fog(  component_t& temp,
+                        int component,
+                        Scratch& parent_scratches);
+
+    void    build_blending(     component_t& in_out,
+                                const pixel_t& pixel,
+                                int component,
+                                Scratch& parent_scratches);
+
+    void    build_blend_factor(
+                integer_t& factor, int f, int component,
+                const pixel_t& dst_pixel,
+                integer_t& fragment,
+                integer_t& fb,
+                Scratch& scratches);
+
+    void    build_blendFOneMinusF(  component_t& temp,
+                                    const integer_t& factor, 
+                                    const integer_t& fragment,
+                                    const integer_t& fb);
+
+    void    build_blendOneMinusFF(  component_t& temp,
+                                    const integer_t& factor, 
+                                    const integer_t& fragment,
+                                    const integer_t& fb);
+
+    void build_coverage_application(component_t& fragment,
+                                    const fragment_parts_t& parts,
+                                    Scratch& regs);
+
+    void build_alpha_test(component_t& fragment, const fragment_parts_t& parts);
+
+    enum { Z_TEST=1, Z_WRITE=2 }; 
+    void build_depth_test(const fragment_parts_t& parts, uint32_t mask);
+    void build_iterate_z(const fragment_parts_t& parts);
+    void build_iterate_f(const fragment_parts_t& parts);
+    void build_iterate_texture_coordinates(const fragment_parts_t& parts);
+
+    void build_logic_op(pixel_t& pixel, Scratch& regs);
+
+    void build_masking(pixel_t& pixel, Scratch& regs);
+
+    void build_and_immediate(int d, int s, uint32_t mask, int bits);
+
+    bool    isAlphaSourceNeeded() const;
+
+    enum {
+        FACTOR_SRC=1, FACTOR_DST=2, BLEND_SRC=4, BLEND_DST=8 
+    };
+    
+    enum {
+        LOGIC_OP=1, LOGIC_OP_SRC=2, LOGIC_OP_DST=4
+    };
+
+    static int blending_codes(int fs, int fd);
+
+    builder_context_t   mBuilderContext;
+    texture_machine_t   mTextureMachine;
+    component_info_t    mInfo[4];
+    int                 mBlending;
+    int                 mMasking;
+    int                 mAllMasked;
+    int                 mLogicOp;
+    int                 mAlphaTest;
+    int                 mAA;
+    int                 mDithering;
+    int                 mDepthTest;
+
+    int             mSmooth;
+    int             mFog;
+    pixel_t         mDstPixel;
+    
+    GGLFormat       mCbFormat;
+    
+    int             mBlendFactorCached;
+    integer_t       mAlphaSource;
+    
+    int             mBaseRegister;
+    
+    int             mBlendSrc;
+    int             mBlendDst;
+    int             mBlendSrcA;
+    int             mBlendDstA;
+    
+    int             mOptLevel;
+};
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
+#endif // ANDROID_GGLASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/MIPS64Assembler.cpp b/libpixelflinger/codeflinger/MIPS64Assembler.cpp
new file mode 100644
index 0000000..d6d2156
--- /dev/null
+++ b/libpixelflinger/codeflinger/MIPS64Assembler.cpp
@@ -0,0 +1,1447 @@
+/* libs/pixelflinger/codeflinger/MIPS64Assembler.cpp
+**
+** Copyright 2015, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+
+/* MIPS64 assembler and ARM->MIPS64 assembly translator
+**
+** The approach is to utilize the MIPSAssembler generator, using an inherited
+** MIPS64Assembler that overrides just the specific MIPS64r6 instructions.
+** For now, ArmToMips64Assembler is copied over from the ArmToMipsAssembler
+** class, changing some MIPS64r6-related stuff.
+*/
+
+#define LOG_TAG "MIPS64Assembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cutils/properties.h>
+#include <log/log.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "MIPS64Assembler.h"
+#include "CodeCache.h"
+#include "mips64_disassem.h"
+
+#define NOT_IMPLEMENTED()  LOG_ALWAYS_FATAL("Arm instruction %s not yet implemented\n", __func__)
+#define __unused __attribute__((__unused__))
+
+// ----------------------------------------------------------------------------
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark ArmToMips64Assembler...
+#endif
+
+ArmToMips64Assembler::ArmToMips64Assembler(const sp<Assembly>& assembly,
+                                           char *abuf, int linesz, int instr_count)
+    :   ARMAssemblerInterface(),
+        mArmDisassemblyBuffer(abuf),
+        mArmLineLength(linesz),
+        mArmInstrCount(instr_count),
+        mInum(0),
+        mAssembly(assembly)
+{
+    mMips = new MIPS64Assembler(assembly, this);
+    mArmPC = (uint32_t **) malloc(ARM_MAX_INSTUCTIONS * sizeof(uint32_t *));
+    init_conditional_labels();
+}
+
+ArmToMips64Assembler::ArmToMips64Assembler(void* assembly)
+    :   ARMAssemblerInterface(),
+        mArmDisassemblyBuffer(NULL),
+        mInum(0),
+        mAssembly(NULL)
+{
+    mMips = new MIPS64Assembler(assembly, this);
+    mArmPC = (uint32_t **) malloc(ARM_MAX_INSTUCTIONS * sizeof(uint32_t *));
+    init_conditional_labels();
+}
+
+ArmToMips64Assembler::~ArmToMips64Assembler()
+{
+    delete mMips;
+    free((void *) mArmPC);
+}
+
+uint32_t* ArmToMips64Assembler::pc() const
+{
+    return mMips->pc();
+}
+
+uint32_t* ArmToMips64Assembler::base() const
+{
+    return mMips->base();
+}
+
+void ArmToMips64Assembler::reset()
+{
+    cond.labelnum = 0;
+    mInum = 0;
+    mMips->reset();
+}
+
+int ArmToMips64Assembler::getCodegenArch()
+{
+    return CODEGEN_ARCH_MIPS64;
+}
+
+void ArmToMips64Assembler::comment(const char* string)
+{
+    mMips->comment(string);
+}
+
+void ArmToMips64Assembler::label(const char* theLabel)
+{
+    mMips->label(theLabel);
+}
+
+void ArmToMips64Assembler::disassemble(const char* name)
+{
+    mMips->disassemble(name);
+}
+
+void ArmToMips64Assembler::init_conditional_labels()
+{
+    int i;
+    for (i=0;i<99; ++i) {
+        sprintf(cond.label[i], "cond_%d", i);
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Prolog/Epilog & Generate...
+#endif
+
+void ArmToMips64Assembler::prolog()
+{
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->DADDIU(R_sp, R_sp, -(5 * 8));
+    mMips->SD(R_s0, R_sp, 0);
+    mMips->SD(R_s1, R_sp, 8);
+    mMips->SD(R_s2, R_sp, 16);
+    mMips->SD(R_s3, R_sp, 24);
+    mMips->SD(R_s4, R_sp, 32);
+    mMips->MOVE(R_v0, R_a0);    // move context * passed in a0 to v0 (arm r0)
+}
+
+void ArmToMips64Assembler::epilog(uint32_t touched __unused)
+{
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->LD(R_s0, R_sp, 0);
+    mMips->LD(R_s1, R_sp, 8);
+    mMips->LD(R_s2, R_sp, 16);
+    mMips->LD(R_s3, R_sp, 24);
+    mMips->LD(R_s4, R_sp, 32);
+    mMips->DADDIU(R_sp, R_sp, (5 * 8));
+    mMips->JR(R_ra);
+}
+
+int ArmToMips64Assembler::generate(const char* name)
+{
+    return mMips->generate(name);
+}
+
+void ArmToMips64Assembler::fix_branches()
+{
+    mMips->fix_branches();
+}
+
+uint32_t* ArmToMips64Assembler::pcForLabel(const char* label)
+{
+    return mMips->pcForLabel(label);
+}
+
+void ArmToMips64Assembler::set_condition(int mode, int R1, int R2) {
+    if (mode == 2) {
+        cond.type = SBIT_COND;
+    } else {
+        cond.type = CMP_COND;
+    }
+    cond.r1 = R1;
+    cond.r2 = R2;
+}
+
+//----------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Addressing modes & shifters...
+#endif
+
+
+// do not need this for MIPS, but it is in the Interface (virtual)
+int ArmToMips64Assembler::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    // for MIPS, any 32-bit immediate is OK
+    rot = 0;
+    imm = immediate;
+    return 0;
+}
+
+// shifters...
+
+bool ArmToMips64Assembler::isValidImmediate(uint32_t immediate __unused)
+{
+    // for MIPS, any 32-bit immediate is OK
+    return true;
+}
+
+uint32_t ArmToMips64Assembler::imm(uint32_t immediate)
+{
+    amode.value = immediate;
+    return AMODE_IMM;
+}
+
+uint32_t ArmToMips64Assembler::reg_imm(int Rm, int type, uint32_t shift)
+{
+    amode.reg = Rm;
+    amode.stype = type;
+    amode.value = shift;
+    return AMODE_REG_IMM;
+}
+
+uint32_t ArmToMips64Assembler::reg_rrx(int Rm __unused)
+{
+    // reg_rrx mode is not used in the GGLAssembler code at this time
+    return AMODE_UNSUPPORTED;
+}
+
+uint32_t ArmToMips64Assembler::reg_reg(int Rm __unused, int type __unused,
+                                       int Rs __unused)
+{
+    // reg_reg mode is not used in the GGLAssembler code at this time
+    return AMODE_UNSUPPORTED;
+}
+
+
+// addressing modes...
+// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicate U=0)
+uint32_t ArmToMips64Assembler::immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    amode.value = immed12;
+    amode.writeback = W;
+    return AMODE_IMM_12_PRE;
+}
+
+uint32_t ArmToMips64Assembler::immed12_post(int32_t immed12)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+
+    amode.value = immed12;
+    return AMODE_IMM_12_POST;
+}
+
+uint32_t ArmToMips64Assembler::reg_scale_pre(int Rm, int type,
+        uint32_t shift, int W)
+{
+    LOG_ALWAYS_FATAL_IF(W | type | shift, "reg_scale_pre adv modes not yet implemented");
+
+    amode.reg = Rm;
+    // amode.stype = type;      // more advanced modes not used in GGLAssembler yet
+    // amode.value = shift;
+    // amode.writeback = W;
+    return AMODE_REG_SCALE_PRE;
+}
+
+uint32_t ArmToMips64Assembler::reg_scale_post(int Rm __unused, int type __unused,
+                                              uint32_t shift __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode reg_scale_post not yet implemented\n");
+    return AMODE_UNSUPPORTED;
+}
+
+// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicate U=0)
+uint32_t ArmToMips64Assembler::immed8_pre(int32_t immed8, int W __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode immed8_pre not yet implemented\n");
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+    return AMODE_IMM_8_PRE;
+}
+
+uint32_t ArmToMips64Assembler::immed8_post(int32_t immed8)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+    amode.value = immed8;
+    return AMODE_IMM_8_POST;
+}
+
+uint32_t ArmToMips64Assembler::reg_pre(int Rm, int W)
+{
+    LOG_ALWAYS_FATAL_IF(W, "reg_pre writeback not yet implemented");
+    amode.reg = Rm;
+    return AMODE_REG_PRE;
+}
+
+uint32_t ArmToMips64Assembler::reg_post(int Rm __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode reg_post not yet implemented\n");
+    return AMODE_UNSUPPORTED;
+}
+
+
+
+// ----------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Data Processing...
+#endif
+
+// check if the operand registers from a previous CMP or S-bit instruction
+// would be overwritten by this instruction. If so, move the value to a
+// safe register.
+// Note that we cannot tell at _this_ instruction time if a future (conditional)
+// instruction will _also_ use this value (a defect of the simple 1-pass, one-
+// instruction-at-a-time translation). Therefore we must be conservative and
+// save the value before it is overwritten. This costs an extra MOVE instr.
+
+void ArmToMips64Assembler::protectConditionalOperands(int Rd)
+{
+    if (Rd == cond.r1) {
+        mMips->MOVE(R_cmp, cond.r1);
+        cond.r1 = R_cmp;
+    }
+    if (cond.type == CMP_COND && Rd == cond.r2) {
+        mMips->MOVE(R_cmp2, cond.r2);
+        cond.r2 = R_cmp2;
+    }
+}
+
+
+// interprets the addressing mode, and generates the common code
+// used by the majority of data-processing ops. Many MIPS instructions
+// have a register-based form and a different immediate form. See
+// opAND below for an example. (this could be inlined)
+//
+// this works with the imm(), reg_imm() methods above, which are directly
+// called by the GGLAssembler.
+// note: _signed parameter defaults to false (unsigned)
+// note: tmpReg parameter defaults to 1, MIPS register AT
+int ArmToMips64Assembler::dataProcAdrModes(int op, int& source, bool _signed, int tmpReg)
+{
+    if (op < AMODE_REG) {
+        source = op;
+        return SRC_REG;
+    } else if (op == AMODE_IMM) {
+        if ((!_signed && amode.value > 0xffff)
+                || (_signed && ((int)amode.value < -32768 || (int)amode.value > 32767) )) {
+            mMips->LUI(tmpReg, (amode.value >> 16));
+            if (amode.value & 0x0000ffff) {
+                mMips->ORI(tmpReg, tmpReg, (amode.value & 0x0000ffff));
+            }
+            source = tmpReg;
+            return SRC_REG;
+        } else {
+            source = amode.value;
+            return SRC_IMM;
+        }
+    } else if (op == AMODE_REG_IMM) {
+        switch (amode.stype) {
+            case LSL: mMips->SLL(tmpReg, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(tmpReg, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(tmpReg, amode.reg, amode.value); break;
+            case ROR: mMips->ROTR(tmpReg, amode.reg, amode.value); break;
+        }
+        source = tmpReg;
+        return SRC_REG;
+    } else {  // adr mode RRX is not used in GGL Assembler at this time
+        // we are screwed, this should be exception, assert-fail or something
+        LOG_ALWAYS_FATAL("adr mode reg_rrx not yet implemented\n");
+        return SRC_ERROR;
+    }
+}
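+
+// Worked example (illustrative): an ARM operand built with imm(0x12345678)
+// does not fit a 16-bit MIPS immediate, so it is materialized into the
+// temp register and reported as SRC_REG:
+//
+//     LUI  at, 0x1234          // at = 0x12340000
+//     ORI  at, at, 0x5678      // at = 0x12345678
+//
+// whereas imm(0x1234) comes back unchanged as SRC_IMM.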
+
+
+void ArmToMips64Assembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    int src;    // src is modified by dataProcAdrModes() - passed as int&
+
+    if (cc != AL) {
+        protectConditionalOperands(Rd);
+        // the branch tests register(s) set by prev CMP or instr with 'S' bit set
+        // invert the condition to jump past this conditional instruction
+        ArmToMips64Assembler::B(cc^1, cond.label[++cond.labelnum]);
+    } else {
+        mArmPC[mInum++] = pc();  // save starting PC for this instr
+    }
+
+    switch (opcode) {
+    case opAND:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->AND(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ANDI(Rd, Rn, src);
+        }
+        break;
+
+    case opADD:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->ADDU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ADDIU(Rd, Rn, src);
+        }
+        break;
+
+    case opSUB:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->SUBU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->SUBIU(Rd, Rn, src);
+        }
+        break;
+
+    case opADD64:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->DADDU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->DADDIU(Rd, Rn, src);
+        }
+        break;
+
+    case opSUB64:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->DSUBU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->DSUBIU(Rd, Rn, src);
+        }
+        break;
+
+    case opEOR:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->XOR(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->XORI(Rd, Rn, src);
+        }
+        break;
+
+    case opORR:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->OR(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ORI(Rd, Rn, src);
+        }
+        break;
+
+    case opBIC:
+        if (dataProcAdrModes(Op2, src) == SRC_IMM) {
+            // if it is a 16-bit immediate, load it into the AT reg
+            mMips->ORI(R_at, 0, src);
+            src = R_at;
+        }
+        mMips->NOT(R_at, src);
+        mMips->AND(Rd, Rn, R_at);
+        break;
+
+    case opRSB:
+        if (dataProcAdrModes(Op2, src) == SRC_IMM) {
+            // if it is a 16-bit immediate, load it into the AT reg
+            mMips->ORI(R_at, 0, src);
+            src = R_at;
+        }
+        mMips->SUBU(Rd, src, Rn);   // subu with the parameters reversed
+        break;
+
+    case opMOV:
+        if (Op2 < AMODE_REG) {  // op2 is reg # in this case
+            mMips->MOVE(Rd, Op2);
+        } else if (Op2 == AMODE_IMM) {
+            if (amode.value > 0xffff) {
+                mMips->LUI(Rd, (amode.value >> 16));
+                if (amode.value & 0x0000ffff) {
+                    mMips->ORI(Rd, Rd, (amode.value & 0x0000ffff));
+                }
+            } else {
+                mMips->ORI(Rd, 0, amode.value);
+            }
+        } else if (Op2 == AMODE_REG_IMM) {
+            switch (amode.stype) {
+            case LSL: mMips->SLL(Rd, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(Rd, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(Rd, amode.reg, amode.value); break;
+            case ROR: mMips->ROTR(Rd, amode.reg, amode.value); break;
+            }
+        } else {
+            // adr mode RRX is not used in GGL Assembler at this time
+            mMips->UNIMPL();
+        }
+        break;
+
+    case opMVN:     // this is a 1's complement: NOT
+        if (Op2 < AMODE_REG) {  // op2 is reg # in this case
+            mMips->NOR(Rd, Op2, 0);     // NOT is NOR with 0
+            break;
+        } else if (Op2 == AMODE_IMM) {
+            if (amode.value > 0xffff) {
+                mMips->LUI(Rd, (amode.value >> 16));
+                if (amode.value & 0x0000ffff) {
+                    mMips->ORI(Rd, Rd, (amode.value & 0x0000ffff));
+                }
+            } else {
+                mMips->ORI(Rd, 0, amode.value);
+            }
+        } else if (Op2 == AMODE_REG_IMM) {
+            switch (amode.stype) {
+            case LSL: mMips->SLL(Rd, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(Rd, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(Rd, amode.reg, amode.value); break;
+            case ROR: mMips->ROTR(Rd, amode.reg, amode.value); break;
+            }
+        } else {
+            // adr mode RRX is not used in GGL Assembler at this time
+            mMips->UNIMPL();
+        }
+        mMips->NOR(Rd, Rd, 0);     // NOT is NOR with 0
+        break;
+
+    case opCMP:
+        // Either operand of a CMP instr could get overwritten by a subsequent
+        // conditional instruction, which is ok, _UNLESS_ there is a _second_
+        // conditional instruction. Under MIPS, this requires doing the comparison
+        // again (SLT), and the original operands must be available. (and this
+        // pattern of multiple conditional instructions from same CMP _is_ used
+        // in GGL-Assembler)
+        //
+        // For now, if a conditional instr overwrites the operands, we will
+        // move them to dedicated temp regs. This is ugly, and inefficient,
+        // and should be optimized.
+        //
+        // WARNING: making an _Assumption_ that CMP operand regs will NOT be
+        // trashed by intervening NON-conditional instructions. In the general
+        // case this is legal, but it is NOT currently done in GGL-Assembler.
+
+        cond.type = CMP_COND;
+        cond.r1 = Rn;
+        if (dataProcAdrModes(Op2, src, false, R_cmp2) == SRC_REG) {
+            cond.r2 = src;
+        } else {                        // adr mode was SRC_IMM
+            mMips->ORI(R_cmp2, R_zero, src);
+            cond.r2 = R_cmp2;
+        }
+
+        break;
+
+
+    case opTST:
+    case opTEQ:
+    case opCMN:
+    case opADC:
+    case opSBC:
+    case opRSC:
+        mMips->UNIMPL(); // currently unused in GGL Assembler code
+        break;
+    }
+
+    if (cc != AL) {
+        mMips->label(cond.label[cond.labelnum]);
+    }
+    if (s && opcode != opCMP) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
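+
+// Worked example (illustrative): MIPS has no conditional execution, so a
+// conditional ARM op is translated as a branch (on the inverted condition)
+// around the unconditional MIPS instruction. After a CMP that set
+// cond.r1/cond.r2, MOV(NE, 0, Rd, Rm) becomes roughly:
+//
+//     BEQ   cond.r1, cond.r2, cond_N   // NE inverted to EQ
+//     MOVE  Rd, Rm
+// cond_N: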
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Multiply...
+#endif
+
+// multiply, accumulate
+void ArmToMips64Assembler::MLA(int cc __unused, int s,
+        int Rd, int Rm, int Rs, int Rn) {
+
+    //ALOGW("MLA");
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->MUL(R_at, Rm, Rs);
+    mMips->ADDU(Rd, R_at, Rn);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+void ArmToMips64Assembler::MUL(int cc __unused, int s,
+        int Rd, int Rm, int Rs) {
+    mArmPC[mInum++] = pc();
+    mMips->MUL(Rd, Rm, Rs);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+void ArmToMips64Assembler::UMULL(int cc __unused, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    mArmPC[mInum++] = pc();
+    mMips->MUH(RdHi, Rm, Rs);
+    mMips->MUL(RdLo, Rm, Rs);
+
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on UMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMips64Assembler::UMUAL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<21) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on UMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMips64Assembler::SMULL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on SMULL must be on 64-bit result\n");
+    }
+}
+void ArmToMips64Assembler::SMUAL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on SMUAL must be on 64-bit result\n");
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Branches...
+#endif
+
+// branches...
+
+void ArmToMips64Assembler::B(int cc, const char* label)
+{
+    mArmPC[mInum++] = pc();
+    if (cond.type == SBIT_COND) { cond.r2 = R_zero; }
+
+    switch(cc) {
+        case EQ: mMips->BEQ(cond.r1, cond.r2, label); break;
+        case NE: mMips->BNE(cond.r1, cond.r2, label); break;
+        case HS: mMips->BGEU(cond.r1, cond.r2, label); break;
+        case LO: mMips->BLTU(cond.r1, cond.r2, label); break;
+        case MI: mMips->BLT(cond.r1, cond.r2, label); break;
+        case PL: mMips->BGE(cond.r1, cond.r2, label); break;
+
+        case HI: mMips->BGTU(cond.r1, cond.r2, label); break;
+        case LS: mMips->BLEU(cond.r1, cond.r2, label); break;
+        case GE: mMips->BGE(cond.r1, cond.r2, label); break;
+        case LT: mMips->BLT(cond.r1, cond.r2, label); break;
+        case GT: mMips->BGT(cond.r1, cond.r2, label); break;
+        case LE: mMips->BLE(cond.r1, cond.r2, label); break;
+        case AL: mMips->B(label); break;
+        case NV: /* B Never - no instruction */ break;
+
+        case VS:
+        case VC:
+        default:
+            LOG_ALWAYS_FATAL("Unsupported cc: %02x\n", cc);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::BL(int cc __unused, const char* label __unused)
+{
+    LOG_ALWAYS_FATAL("branch-and-link not supported yet\n");
+    mArmPC[mInum++] = pc();
+}
+
+// no use for Branches with integer PC, but they're in the Interface class ....
+void ArmToMips64Assembler::B(int cc __unused, uint32_t* to_pc __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+void ArmToMips64Assembler::BL(int cc __unused, uint32_t* to_pc __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+void ArmToMips64Assembler::BX(int cc __unused, int Rn __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Data Transfer...
+#endif
+
+// data transfer...
+void ArmToMips64Assembler::LDR(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR via Arm SP to LW via Mips SP
+            }
+            mMips->LW(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->DADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert STR thru Arm SP to STR thru Mips SP
+            }
+            mMips->LW(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->LW(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::LDRB(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            mMips->LBU(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->DADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->LBU(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->LBU(Rd, R_at, 0);
+            break;
+    }
+
+}
+
+void ArmToMips64Assembler::STR(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;  // convert STR thru Arm SP to SW thru Mips SP
+            }
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                // If we will writeback, then update the index reg, then store.
+                // This correctly handles stack-push case.
+                mMips->DADDIU(Rn, Rn, amode.value);
+                mMips->SW(Rd, Rn, 0);
+            } else {
+                // No writeback so store offset by value
+                mMips->SW(Rd, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SW(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);  // post index always writes back
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->SW(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::STRB(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            mMips->SB(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->DADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SB(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->SB(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::LDRH(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed8_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_8_PRE:      // no support yet for writeback
+            mMips->LHU(Rd, Rn, amode.value);
+            break;
+        case AMODE_IMM_8_POST:
+            mMips->LHU(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_PRE:
+            // we only support simple base +/- index
+            if (amode.reg >= 0) {
+                mMips->DADDU(R_at, Rn, amode.reg);
+            } else {
+                mMips->DSUBU(R_at, Rn, abs(amode.reg));
+            }
+            mMips->LHU(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::LDRSB(int cc __unused, int Rd __unused,
+                                 int Rn __unused, uint32_t offset __unused)
+{
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::LDRSH(int cc __unused, int Rd __unused,
+                                 int Rn __unused, uint32_t offset __unused)
+{
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::STRH(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed8_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_8_PRE:      // no support yet for writeback
+            mMips->SH(Rd, Rn, amode.value);
+            break;
+        case AMODE_IMM_8_POST:
+            mMips->SH(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_PRE:
+            // we only support simple base +/- index
+            if (amode.reg >= 0) {
+                mMips->DADDU(R_at, Rn, amode.reg);
+            } else {
+                mMips->DSUBU(R_at, Rn, abs(amode.reg));
+            }
+            mMips->SH(Rd, R_at, 0);
+            break;
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Block Data Transfer...
+#endif
+
+// block data transfer...
+void ArmToMips64Assembler::LDM(int cc __unused, int dir __unused,
+        int Rn __unused, int W __unused, uint32_t reg_list __unused)
+{   //                        ED FD EA FA      IB IA DB DA
+    // const uint8_t P[8] = { 1, 0, 1, 0,      1, 0, 1, 0 };
+    // const uint8_t U[8] = { 1, 1, 0, 0,      1, 1, 0, 0 };
+    // *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+    //         (uint32_t(U[dir])<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::STM(int cc __unused, int dir __unused,
+        int Rn __unused, int W __unused, uint32_t reg_list __unused)
+{   //                        FA EA FD ED      IB IA DB DA
+    // const uint8_t P[8] = { 0, 1, 0, 1,      1, 0, 1, 0 };
+    // const uint8_t U[8] = { 0, 0, 1, 1,      1, 1, 0, 0 };
+    // *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+    //         (uint32_t(U[dir])<<23) | (0<<20) | (W<<21) | (Rn<<16) | reg_list;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Special...
+#endif
+
+// special...
+void ArmToMips64Assembler::SWP(int cc __unused, int Rn __unused,
+                               int Rd __unused, int Rm __unused) {
+    // *mPC++ = (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::SWPB(int cc __unused, int Rn __unused,
+                                int Rd __unused, int Rm __unused) {
+    // *mPC++ = (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::SWI(int cc __unused, uint32_t comment __unused) {
+    // *mPC++ = (cc<<28) | (0xF<<24) | comment;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+
+#if 0
+#pragma mark -
+#pragma mark DSP instructions...
+#endif
+
+// DSP instructions...
+void ArmToMips64Assembler::PLD(int Rn __unused, uint32_t offset) {
+    LOG_ALWAYS_FATAL_IF(!((offset&(1<<24)) && !(offset&(1<<21))),
+                        "PLD only P=1, W=0");
+    // *mPC++ = 0xF550F000 | (Rn<<16) | offset;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::CLZ(int cc __unused, int Rd, int Rm)
+{
+    mArmPC[mInum++] = pc();
+    mMips->CLZ(Rd, Rm);
+}
+
+void ArmToMips64Assembler::QADD(int cc __unused, int Rd __unused,
+                                int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::QDADD(int cc __unused, int Rd __unused,
+                                 int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::QSUB(int cc __unused, int Rd __unused,
+                                int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::QDSUB(int cc __unused, int Rd __unused,
+                                 int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+// 16 x 16 signed multiply (like SMLAxx without the accumulate)
+void ArmToMips64Assembler::SMUL(int cc __unused, int xy,
+                int Rd, int Rm, int Rs)
+{
+    mArmPC[mInum++] = pc();
+
+    // the 16 bits may be in the top or bottom half of 32-bit source reg,
+    // as defined by the codes BB, BT, TB, TT (compressed param xy)
+    // where x corresponds to Rm and y to Rs
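+    // as coded below, xy & xyTB selects the top half of Rm and xy & xyBT the
+    // top half of Rs, so the TT code selects both top halves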
+
+    // select half-reg for Rm
+    if (xy & xyTB) {
+        // use top 16-bits
+        mMips->SRA(R_at, Rm, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        mMips->SEH(R_at, Rm);
+    }
+    // select half-reg for Rs
+    if (xy & xyBT) {
+        // use top 16-bits
+        mMips->SRA(R_at2, Rs, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        mMips->SEH(R_at2, Rs);
+    }
+    mMips->MUL(Rd, R_at, R_at2);
+}
+
+// signed 32b x 16b multiply, save top 32-bits of 48-bit result
+void ArmToMips64Assembler::SMULW(int cc __unused, int y,
+                int Rd, int Rm, int Rs)
+{
+    mArmPC[mInum++] = pc();
+
+    // the selector yT or yB refers to reg Rs
+    if (y & yT) {
+        // zero the bottom 16 bits with 2 shifts, since they can affect the result
+        mMips->SRL(R_at, Rs, 16);
+        mMips->SLL(R_at, R_at, 16);
+
+    } else {
+        // move low 16-bit half, to high half
+        mMips->SLL(R_at, Rs, 16);
+    }
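+    // with the selected 16 bits in the upper half of R_at, the 64-bit product
+    // is (Rm * Rs16) << 16, so MUH's high 32 bits are exactly the top 32 bits
+    // of the 48-bit result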
+    mMips->MUH(Rd, Rm, R_at);
+}
+
+// 16 x 16 signed multiply, accumulate: Rd = Rm{16} * Rs{16} + Rn
+void ArmToMips64Assembler::SMLA(int cc __unused, int xy,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    mArmPC[mInum++] = pc();
+
+    // the 16 bits may be in the top or bottom half of 32-bit source reg,
+    // as defined by the codes BB, BT, TB, TT (compressed param xy)
+    // where x corresponds to Rm and y to Rs
+
+    // select half-reg for Rm
+    if (xy & xyTB) {
+        // use top 16-bits
+        mMips->SRA(R_at, Rm, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        mMips->SEH(R_at, Rm);
+    }
+    // select half-reg for Rs
+    if (xy & xyBT) {
+        // use top 16-bits
+        mMips->SRA(R_at2, Rs, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        mMips->SEH(R_at2, Rs);
+    }
+
+    mMips->MUL(R_at, R_at, R_at2);
+    mMips->ADDU(Rd, R_at, Rn);
+}
+
+void ArmToMips64Assembler::SMLAL(int cc __unused, int xy __unused,
+                                 int RdHi __unused, int RdLo __unused,
+                                 int Rs __unused, int Rm __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::SMLAW(int cc __unused, int y __unused,
+                                 int Rd __unused, int Rm __unused,
+                                 int Rs __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+// used by ARMv6 version of GGLAssembler::filter32
+void ArmToMips64Assembler::UXTB16(int cc __unused, int Rd, int Rm, int rotate)
+{
+    mArmPC[mInum++] = pc();
+
+    //Rd[31:16] := ZeroExtend((Rm ROR (8 * sh))[23:16]),
+    //Rd[15:0] := ZeroExtend((Rm ROR (8 * sh))[7:0]). sh 0-3.
+
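+    // build the constant 0x00FF00FF in R_at (LUI sets bits 23:16, ORI sets
+    // bits 7:0); ANDing with the rotated value extracts bytes 0 and 2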
+    mMips->ROTR(R_at2, Rm, rotate * 8);
+    mMips->LUI(R_at, 0xFF);
+    mMips->ORI(R_at, R_at, 0xFF);
+    mMips->AND(Rd, R_at2, R_at);
+}
+
+void ArmToMips64Assembler::UBFX(int cc __unused, int Rd __unused, int Rn __unused,
+                                int lsb __unused, int width __unused)
+{
+     /* Placeholder for UBFX */
+     mArmPC[mInum++] = pc();
+
+     mMips->NOP2();
+     NOT_IMPLEMENTED();
+}
+
+// ----------------------------------------------------------------------------
+// Address Processing...
+// ----------------------------------------------------------------------------
+
+void ArmToMips64Assembler::ADDR_ADD(int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+//    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+//    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+    dataProcessing(opADD64, cc, s, Rd, Rn, Op2);
+}
+
+void ArmToMips64Assembler::ADDR_SUB(int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+//    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+//    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+    dataProcessing(opSUB64, cc, s, Rd, Rn, Op2);
+}
+
+void ArmToMips64Assembler::ADDR_LDR(int cc __unused, int Rd,
+                                    int Rn, uint32_t offset) {
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR via Arm SP to LD via Mips SP
+            }
+            mMips->LD(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->DADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR thru Arm SP to LD thru Mips SP
+            }
+            mMips->LD(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->LD(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::ADDR_STR(int cc __unused, int Rd,
+                                    int Rn, uint32_t offset) {
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;  // convert STR thru Arm SP to SW thru Mips SP
+            }
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                // If we will writeback, then update the index reg, then store.
+                // This correctly handles stack-push case.
+                mMips->DADDIU(Rn, Rn, amode.value);
+                mMips->SD(Rd, Rn, 0);
+            } else {
+                // No writeback so store offset by value
+                mMips->SD(Rd, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SD(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);  // post index always writes back
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->SD(Rd, R_at, 0);
+            break;
+    }
+}
+
+#if 0
+#pragma mark -
+#pragma mark MIPS Assembler...
+#endif
+
+
+//**************************************************************************
+//**************************************************************************
+//**************************************************************************
+
+
+/* MIPS64 assembler
+** this is a subset of mips64r6, targeted specifically at ARM instruction
+** replacement in the pixelflinger/codeflinger code.
+**
+** This class is extended from MIPSAssembler class and overrides only
+** MIPS64r6 specific stuff.
+*/
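+// For example, the 64-bit DADDU/DADDIU and LD/SD overrides below replace the
+// base class's 32-bit ADDU/ADDIU and LW/SW where 64-bit addresses are involved.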
+
+MIPS64Assembler::MIPS64Assembler(const sp<Assembly>& assembly, ArmToMips64Assembler *parent)
+    : MIPSAssembler::MIPSAssembler(assembly, NULL), mParent(parent)
+{
+}
+
+MIPS64Assembler::MIPS64Assembler(void* assembly, ArmToMips64Assembler *parent)
+    : MIPSAssembler::MIPSAssembler(assembly), mParent(parent)
+{
+}
+
+MIPS64Assembler::~MIPS64Assembler()
+{
+}
+
+void MIPS64Assembler::reset()
+{
+    if (mAssembly != NULL) {
+        mBase = mPC = (uint32_t *)mAssembly->base();
+    } else {
+        mPC = mBase = base();
+    }
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+}
+
+
+void MIPS64Assembler::disassemble(const char* name __unused)
+{
+    char di_buf[140];
+
+    bool arm_disasm_fmt = (mParent->mArmDisassemblyBuffer != NULL);
+
+    typedef char dstr[40];
+    dstr *lines = (dstr *)mParent->mArmDisassemblyBuffer;
+
+    if (mParent->mArmDisassemblyBuffer != NULL) {
+        for (int i=0; i<mParent->mArmInstrCount; ++i) {
+            string_detab(lines[i]);
+        }
+    }
+
+    size_t count = pc()-base();
+    uint32_t* mipsPC = base();
+
+    while (count--) {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(mipsPC);
+        if (label >= 0) {
+            ALOGW("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(mipsPC);
+        if (comment >= 0) {
+            ALOGW("; %s\n", mComments.valueAt(comment));
+        }
+        ::mips_disassem(mipsPC, di_buf, arm_disasm_fmt);
+        string_detab(di_buf);
+        string_pad(di_buf, 30);
+        ALOGW("%08lx:    %08x    %s", uintptr_t(mipsPC), uint32_t(*mipsPC), di_buf);
+        mipsPC++;
+    }
+}
+
+void MIPS64Assembler::fix_branches()
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--) {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - (bt.pc+1));
+        *bt.pc |= offset & 0x00FFFF;
+    }
+}
+
+void MIPS64Assembler::DADDU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (daddu_fn<<FUNC_SHF)
+                    | (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+void MIPS64Assembler::DADDIU(int Rt, int Rs, int16_t imm)
+{
+    *mPC++ = (daddiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+void MIPS64Assembler::DSUBU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (dsubu_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+void MIPS64Assembler::DSUBIU(int Rt, int Rs, int16_t imm)   // really addiu(d, s, -j)
+{
+    *mPC++ = (daddiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | ((-imm) & MSK_16);
+}
+
+void MIPS64Assembler::MUL(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mul_fn<<RE_SHF) | (sop30_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+void MIPS64Assembler::MUH(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (muh_fn<<RE_SHF) | (sop30_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+void MIPS64Assembler::CLO(int Rd, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (17<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (1<<RE_SHF);
+}
+
+void MIPS64Assembler::CLZ(int Rd, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (16<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (1<<RE_SHF);
+}
+
+void MIPS64Assembler::LD(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (ld_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPS64Assembler::SD(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (sd_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPS64Assembler::LUI(int Rt, int16_t offset)
+{
+    *mPC++ = (aui_op<<OP_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+
+void MIPS64Assembler::JR(int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (Rs<<RS_SHF) | (jalr_fn << FUNC_SHF);
+    MIPS64Assembler::NOP();
+}
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/MIPS64Assembler.h b/libpixelflinger/codeflinger/MIPS64Assembler.h
new file mode 100644
index 0000000..b43e5da
--- /dev/null
+++ b/libpixelflinger/codeflinger/MIPS64Assembler.h
@@ -0,0 +1,404 @@
+/* libs/pixelflinger/codeflinger/MIPS64Assembler.h
+**
+** Copyright 2015, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef ANDROID_MIPS64ASSEMBLER_H
+#define ANDROID_MIPS64ASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "utils/KeyedVector.h"
+#include "utils/Vector.h"
+#include "tinyutils/smartpointer.h"
+
+#include "ARMAssemblerInterface.h"
+#include "MIPSAssembler.h"
+#include "CodeCache.h"
+
+namespace android {
+
+class MIPS64Assembler;    // forward reference
+
+// this class mimics ARMAssembler interface
+// intent is to translate each ARM instruction to 1 or more MIPS instr
+// implementation calls MIPS64Assembler class to generate mips code
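+// e.g. an ARM add of a large immediate typically expands to LUI/ORI to build
+// the constant plus a DADDU, while a small immediate becomes a single DADDIU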
+class ArmToMips64Assembler : public ARMAssemblerInterface
+{
+public:
+                ArmToMips64Assembler(const sp<Assembly>& assembly,
+                        char *abuf = 0, int linesz = 0, int instr_count = 0);
+                ArmToMips64Assembler(void* assembly);
+    virtual     ~ArmToMips64Assembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+    void        disassemble(const char* name);
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+    // for testing purposes
+    void        fix_branches();
+    void        set_condition(int mode, int R1, int R2);
+
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+    // byte/half word extract...
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+
+    // bit manipulation...
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+    // Address loading/storing/manipulation
+    virtual void ADDR_LDR(int cc, int Rd, int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_STR(int cc, int Rd, int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_ADD(int cc, int s, int Rd, int Rn, uint32_t Op2);
+    virtual void ADDR_SUB(int cc, int s, int Rd, int Rn, uint32_t Op2);
+
+    // debug state shared with the MIPS64Assembler class (accessed via mParent)
+    char *      mArmDisassemblyBuffer;
+    int         mArmLineLength;
+    int         mArmInstrCount;
+
+    int         mInum;      // current arm instruction number (0..n)
+    uint32_t**  mArmPC;     // array: PC for 1st mips instr of
+                            //      each translated ARM instr
+
+
+private:
+    ArmToMips64Assembler(const ArmToMips64Assembler& rhs);
+    ArmToMips64Assembler& operator = (const ArmToMips64Assembler& rhs);
+
+    void init_conditional_labels(void);
+
+    void protectConditionalOperands(int Rd);
+
+    // reg_tmp defaults to MIPS AT, reg 1
+    int dataProcAdrModes(int op, int& source, bool sign = false, int reg_tmp = 1);
+
+    sp<Assembly>        mAssembly;
+    MIPS64Assembler*    mMips;
+
+
+    enum misc_constants_t {
+        ARM_MAX_INSTUCTIONS = 512  // based on ASSEMBLY_SCRATCH_SIZE
+    };
+
+    enum {
+        SRC_REG = 0,
+        SRC_IMM,
+        SRC_ERROR = -1
+    };
+
+    enum addr_modes {
+        // start above the range of legal mips reg #'s (0-31)
+        AMODE_REG = 0x20,
+        AMODE_IMM, AMODE_REG_IMM,               // for data processing
+        AMODE_IMM_12_PRE, AMODE_IMM_12_POST,    // for load/store
+        AMODE_REG_SCALE_PRE, AMODE_IMM_8_PRE,
+        AMODE_IMM_8_POST, AMODE_REG_PRE,
+        AMODE_UNSUPPORTED
+    };
+
+    struct addr_mode_t {    // address modes for current ARM instruction
+        int         reg;
+        int         stype;
+        uint32_t    value;
+        bool        writeback;  // writeback the adr reg after modification
+    } amode;
+
+    enum cond_types {
+        CMP_COND = 1,
+        SBIT_COND
+    };
+
+    struct cond_mode_t {    // conditional-execution info for current ARM instruction
+        cond_types  type;
+        int         r1;
+        int         r2;
+        int         labelnum;
+        char        label[100][10];
+    } cond;
+};
+
+
+
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// This is the basic MIPS64 assembler, which just creates the opcodes in memory.
+// All the more complicated work is done in ArmToMips64Assembler above.
+// Inherits MIPSAssembler class, and overrides only MIPS64r6 specific stuff
+
+class MIPS64Assembler : public MIPSAssembler
+{
+public:
+                MIPS64Assembler(const sp<Assembly>& assembly, ArmToMips64Assembler *parent);
+                MIPS64Assembler(void* assembly, ArmToMips64Assembler *parent);
+    virtual     ~MIPS64Assembler();
+
+    virtual void        reset();
+    virtual void        disassemble(const char* name);
+
+    void        fix_branches();
+
+    // ------------------------------------------------------------------------
+    // MIPS64AssemblerInterface...
+    // ------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Arithmetic...
+#endif
+
+    void DADDU(int Rd, int Rs, int Rt);
+    void DADDIU(int Rt, int Rs, int16_t imm);
+    void DSUBU(int Rd, int Rs, int Rt);
+    void DSUBIU(int Rt, int Rs, int16_t imm);
+    virtual void MUL(int Rd, int Rs, int Rt);
+    void MUH(int Rd, int Rs, int Rt);
+
+#if 0
+#pragma mark -
+#pragma mark Logical...
+#endif
+
+    virtual void CLO(int Rd, int Rs);
+    virtual void CLZ(int Rd, int Rs);
+
+#if 0
+#pragma mark -
+#pragma mark Load/store...
+#endif
+
+    void LD(int Rt, int Rbase, int16_t offset);
+    void SD(int Rt, int Rbase, int16_t offset);
+    virtual void LUI(int Rt, int16_t offset);
+
+#if 0
+#pragma mark -
+#pragma mark Branch...
+#endif
+
+    void JR(int Rs);
+
+
+protected:
+    ArmToMips64Assembler *mParent;
+
+    // opcode field of all instructions
+    enum opcode_field {
+        spec_op, regimm_op, j_op, jal_op,                  // 0x00 - 0x03
+        beq_op, bne_op, pop06_op, pop07_op,                // 0x04 - 0x07
+        pop10_op, addiu_op, slti_op, sltiu_op,             // 0x08 - 0x0b
+        andi_op, ori_op, xori_op, aui_op,                  // 0x0c - 0x0f
+        cop0_op, cop1_op, cop2_op, rsrv_opc_0,             // 0x10 - 0x13
+        rsrv_opc_1, rsrv_opc_2, pop26_op, pop27_op,        // 0x14 - 0x17
+        pop30_op, daddiu_op, rsrv_opc_3, rsrv_opc_4,       // 0x18 - 0x1b
+        rsrv_opc_5, daui_op, msa_op, spec3_op,             // 0x1c - 0x1f
+        lb_op, lh_op, rsrv_opc_6, lw_op,                   // 0x20 - 0x23
+        lbu_op, lhu_op, rsrv_opc_7, lwu_op,                // 0x24 - 0x27
+        sb_op, sh_op, rsrv_opc_8, sw_op,                   // 0x28 - 0x2b
+        rsrv_opc_9, rsrv_opc_10, rsrv_opc_11, rsrv_opc_12, // 0x2c - 0x2f
+        rsrv_opc_13, lwc1_op, bc_op, rsrv_opc_14,          // 0x30 - 0x33
+        rsrv_opc_15, ldc1_op, pop66_op, ld_op,             // 0x34 - 0x37
+        rsrv_opc_16, swc1_op, balc_op, pcrel_op,           // 0x38 - 0x3b
+        rsrv_opc_17, sdc1_op, pop76_op, sd_op              // 0x3c - 0x3f
+    };
+
+
+    // func field for special opcode
+    enum func_spec_op {
+        sll_fn, rsrv_spec_0, srl_fn, sra_fn,
+        sllv_fn, lsa_fn, srlv_fn, srav_fn,
+        rsrv_spec_1, jalr_fn, rsrv_spec_2, rsrv_spec_3,
+        syscall_fn, break_fn, sdbbp_fn, sync_fn,
+        clz_fn, clo_fn, dclz_fn, dclo_fn,
+        dsllv_fn, dlsa_fn, dsrlv_fn, dsrav_fn,
+        sop30_fn, sop31_fn, sop32_fn, sop33_fn,
+        sop34_fn, sop35_fn, sop36_fn, sop37_fn,
+        add_fn, addu_fn, sub_fn, subu_fn,
+        and_fn, or_fn, xor_fn, nor_fn,
+        rsrv_spec_4, rsrv_spec_5, slt_fn, sltu_fn,
+        dadd_fn, daddu_fn, dsub_fn, dsubu_fn,
+        tge_fn, tgeu_fn, tlt_fn, tltu_fn,
+        teq_fn, seleqz_fn, tne_fn, selnez_fn,
+        dsll_fn, rsrv_spec_6, dsrl_fn, dsra_fn,
+        dsll32_fn, rsrv_spec_7, dsrl32_fn, dsra32_fn
+    };
+
+    // func field for spec3 opcode
+    enum func_spec3_op {
+        ext_fn, dextm_fn, dextu_fn, dext_fn,
+        ins_fn, dinsm_fn, dinsu_fn, dins_fn,
+        cachee_fn = 0x1b, sbe_fn, she_fn, sce_fn, swe_fn,
+        bshfl_fn, prefe_fn = 0x23, dbshfl_fn, cache_fn, sc_fn, scd_fn,
+        lbue_fn, lhue_fn, lbe_fn = 0x2c, lhe_fn, lle_fn, lwe_fn,
+        pref_fn = 0x35, ll_fn, lld_fn, rdhwr_fn = 0x3b
+    };
+
+    // sa field for spec3 opcodes, with BSHFL function
+    enum func_spec3_bshfl {
+        bitswap_fn,
+        wsbh_fn = 0x02,
+        dshd_fn = 0x05,
+        seb_fn = 0x10,
+        seh_fn = 0x18
+    };
+
+    // rt field of regimm opcodes.
+    enum regimm_fn {
+        bltz_fn, bgez_fn,
+        dahi_fn = 0x6,
+        nal_fn = 0x10, bal_fn, bltzall_fn, bgezall_fn,
+        sigrie_fn = 0x17,
+        dati_fn = 0x1e, synci_fn
+    };
+
+    enum muldiv_fn {
+        mul_fn = 0x02, muh_fn
+    };
+
+    enum mips_inst_shifts {
+        OP_SHF       = 26,
+        JTARGET_SHF  = 0,
+        RS_SHF       = 21,
+        RT_SHF       = 16,
+        RD_SHF       = 11,
+        RE_SHF       = 6,
+        SA_SHF       = RE_SHF,  // synonym
+        IMM_SHF      = 0,
+        FUNC_SHF     = 0,
+
+        // mask values
+        MSK_16       = 0xffff,
+
+
+        CACHEOP_SHF  = 18,
+        CACHESEL_SHF = 16,
+    };
+};
+
+
+}; // namespace android
+
+#endif //ANDROID_MIPS64ASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/MIPSAssembler.cpp b/libpixelflinger/codeflinger/MIPSAssembler.cpp
new file mode 100644
index 0000000..7de8cc1
--- /dev/null
+++ b/libpixelflinger/codeflinger/MIPSAssembler.cpp
@@ -0,0 +1,1955 @@
+/* libs/pixelflinger/codeflinger/MIPSAssembler.cpp
+**
+** Copyright 2012, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+
+/* MIPS assembler and ARM->MIPS assembly translator
+**
+** The approach is to leave the GGLAssembler and associated files largely
+** unchanged, still utilizing all Arm instruction generation. Via the
+** ArmToMipsAssembler (subclassed from ARMAssemblerInterface) each Arm
+** instruction is translated to one or more Mips instructions as necessary. This
+** is clearly less efficient than a direct implementation within the
+** GGLAssembler, but is far cleaner, more maintainable, and has yielded very
+** significant performance gains on Mips compared to the generic pixel pipeline.
+**
+**
+** GGLAssembler changes
+**
+** - The register allocator has been modified to re-map Arm registers 0-15 to mips
+** registers 2-17. Mips register 0 cannot be used as a general-purpose register,
+** and register 1 has traditional uses as a short-term temporary.
+**
+** - Added some early bailouts for OUT_OF_REGISTERS in texturing.cpp and
+** GGLAssembler.cpp, since this is not fatal, and can be retried at lower
+** optimization level.
+**
+**
+** ARMAssembler and ARMAssemblerInterface changes
+**
+** Refactored ARM address-mode static functions (imm(), reg_imm(), imm12_pre(), etc.)
+** to virtual, so they can be overridden in MIPSAssembler. The implementation of these
+** functions on ARM is moved from ARMAssemblerInterface.cpp to ARMAssembler.cpp, and
+** is unchanged from the original. (This required duplicating 2 of these as static
+** functions in ARMAssemblerInterface.cpp so they could be used as static initializers).
+*/
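+
+/* Illustrative sketch (see dataProcessing() below): a conditional ARM
+** instruction such as ADDNE r0, r1, r2 is emitted as an inverted-condition
+** branch around the MIPS operation:
+**
+**      beq   cmp1, cmp2, cond_N    # inverse condition skips the instr
+**      addu  r0, r1, r2
+**  cond_N:
+**
+** where cmp1/cmp2 are the operand registers recorded by the preceding CMP.
+*/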
+
+#define LOG_TAG "MIPSAssembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <cutils/properties.h>
+#include <log/log.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "CodeCache.h"
+#include "MIPSAssembler.h"
+#include "mips_disassem.h"
+
+#define __unused __attribute__((__unused__))
+
+// Choose MIPS arch variant following gcc flags
+#if defined(__mips__) && __mips==32 && __mips_isa_rev>=2
+#define mips32r2 1
+#else
+#define mips32r2 0
+#endif
+
+
+#define NOT_IMPLEMENTED()  LOG_ALWAYS_FATAL("Arm instruction %s not yet implemented\n", __func__)
+
+
+
+// ----------------------------------------------------------------------------
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark ArmToMipsAssembler...
+#endif
+
+ArmToMipsAssembler::ArmToMipsAssembler(const sp<Assembly>& assembly,
+                                       char *abuf, int linesz, int instr_count)
+    :   ARMAssemblerInterface(),
+        mArmDisassemblyBuffer(abuf),
+        mArmLineLength(linesz),
+        mArmInstrCount(instr_count),
+        mInum(0),
+        mAssembly(assembly)
+{
+    mMips = new MIPSAssembler(assembly, this);
+    mArmPC = (uint32_t **) malloc(ARM_MAX_INSTUCTIONS * sizeof(uint32_t *));
+    init_conditional_labels();
+}
+
+ArmToMipsAssembler::~ArmToMipsAssembler()
+{
+    delete mMips;
+    free((void *) mArmPC);
+}
+
+uint32_t* ArmToMipsAssembler::pc() const
+{
+    return mMips->pc();
+}
+
+uint32_t* ArmToMipsAssembler::base() const
+{
+    return mMips->base();
+}
+
+void ArmToMipsAssembler::reset()
+{
+    cond.labelnum = 0;
+    mInum = 0;
+    mMips->reset();
+}
+
+int ArmToMipsAssembler::getCodegenArch()
+{
+    return CODEGEN_ARCH_MIPS;
+}
+
+void ArmToMipsAssembler::comment(const char* string)
+{
+    mMips->comment(string);
+}
+
+void ArmToMipsAssembler::label(const char* theLabel)
+{
+    mMips->label(theLabel);
+}
+
+void ArmToMipsAssembler::disassemble(const char* name)
+{
+    mMips->disassemble(name);
+}
+
+void ArmToMipsAssembler::init_conditional_labels()
+{
+    int i;
+    for (i=0;i<99; ++i) {
+        sprintf(cond.label[i], "cond_%d", i);
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Prolog/Epilog & Generate...
+#endif
+
+void ArmToMipsAssembler::prolog()
+{
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->ADDIU(R_sp, R_sp, -(5 * 4));
+    mMips->SW(R_s0, R_sp, 0);
+    mMips->SW(R_s1, R_sp, 4);
+    mMips->SW(R_s2, R_sp, 8);
+    mMips->SW(R_s3, R_sp, 12);
+    mMips->SW(R_s4, R_sp, 16);
+    mMips->MOVE(R_v0, R_a0);    // move context * passed in a0 to v0 (arm r0)
+}
+
+void ArmToMipsAssembler::epilog(uint32_t touched __unused)
+{
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->LW(R_s0, R_sp, 0);
+    mMips->LW(R_s1, R_sp, 4);
+    mMips->LW(R_s2, R_sp, 8);
+    mMips->LW(R_s3, R_sp, 12);
+    mMips->LW(R_s4, R_sp, 16);
+    mMips->ADDIU(R_sp, R_sp, (5 * 4));
+    mMips->JR(R_ra);
+
+}
+
+int ArmToMipsAssembler::generate(const char* name)
+{
+    return mMips->generate(name);
+}
+
+uint32_t* ArmToMipsAssembler::pcForLabel(const char* label)
+{
+    return mMips->pcForLabel(label);
+}
+
+
+
+//----------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Addressing modes & shifters...
+#endif
+
+
+// do not need this for MIPS, but it is in the Interface (virtual)
+int ArmToMipsAssembler::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    // for MIPS, any 32-bit immediate is OK
+    rot = 0;
+    imm = immediate;
+    return 0;
+}
+
+// shifters...
+
+bool ArmToMipsAssembler::isValidImmediate(uint32_t immediate __unused)
+{
+    // for MIPS, any 32-bit immediate is OK
+    return true;
+}
+
+uint32_t ArmToMipsAssembler::imm(uint32_t immediate)
+{
+    // ALOGW("immediate value %08x at pc %08x\n", immediate, (int)pc());
+    amode.value = immediate;
+    return AMODE_IMM;
+}
+
+uint32_t ArmToMipsAssembler::reg_imm(int Rm, int type, uint32_t shift)
+{
+    amode.reg = Rm;
+    amode.stype = type;
+    amode.value = shift;
+    return AMODE_REG_IMM;
+}
+
+uint32_t ArmToMipsAssembler::reg_rrx(int Rm __unused)
+{
+    // reg_rrx mode is not used in the GGLAssembler code at this time
+    return AMODE_UNSUPPORTED;
+}
+
+uint32_t ArmToMipsAssembler::reg_reg(int Rm __unused, int type __unused,
+                                     int Rs __unused)
+{
+    // reg_reg mode is not used in the GGLAssembler code at this time
+    return AMODE_UNSUPPORTED;
+}
+
+
+// addressing modes...
+// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicates U=0)
+uint32_t ArmToMipsAssembler::immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    amode.value = immed12;
+    amode.writeback = W;
+    return AMODE_IMM_12_PRE;
+}
+
+uint32_t ArmToMipsAssembler::immed12_post(int32_t immed12)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+
+    amode.value = immed12;
+    return AMODE_IMM_12_POST;
+}
+
+uint32_t ArmToMipsAssembler::reg_scale_pre(int Rm, int type,
+        uint32_t shift, int W)
+{
+    LOG_ALWAYS_FATAL_IF(W | type | shift, "reg_scale_pre adv modes not yet implemented");
+
+    amode.reg = Rm;
+    // amode.stype = type;      // more advanced modes not used in GGLAssembler yet
+    // amode.value = shift;
+    // amode.writeback = W;
+    return AMODE_REG_SCALE_PRE;
+}
+
+uint32_t ArmToMipsAssembler::reg_scale_post(int Rm __unused, int type __unused,
+                                            uint32_t shift __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode reg_scale_post not yet implemented\n");
+    return AMODE_UNSUPPORTED;
+}
+
+// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicates U=0)
+uint32_t ArmToMipsAssembler::immed8_pre(int32_t immed8, int W __unused)
+{
+    // uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL("adr mode immed8_pre not yet implemented\n");
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+    return AMODE_IMM_8_PRE;
+}
+
+uint32_t ArmToMipsAssembler::immed8_post(int32_t immed8)
+{
+    // uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+    amode.value = immed8;
+    return AMODE_IMM_8_POST;
+}
+
+uint32_t ArmToMipsAssembler::reg_pre(int Rm, int W)
+{
+    LOG_ALWAYS_FATAL_IF(W, "reg_pre writeback not yet implemented");
+    amode.reg = Rm;
+    return AMODE_REG_PRE;
+}
+
+uint32_t ArmToMipsAssembler::reg_post(int Rm __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode reg_post not yet implemented\n");
+    return AMODE_UNSUPPORTED;
+}
+
+
+
+// ----------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Data Processing...
+#endif
+
+// check if the operand registers from a previous CMP or S-bit instruction
+// would be overwritten by this instruction. If so, move the value to a
+// safe register.
+// Note that we cannot tell at _this_ instruction time if a future (conditional)
+// instruction will _also_ use this value (a defect of the simple 1-pass, one-
+// instruction-at-a-time translation). Therefore we must be conservative and
+// save the value before it is overwritten. This costs an extra MOVE instr.
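+// e.g. after CMP r0, r1, a conditional MOV to r0 would clobber r0, so r0 is
+// first copied to the dedicated R_cmp register for any later re-comparison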
+
+void ArmToMipsAssembler::protectConditionalOperands(int Rd)
+{
+    if (Rd == cond.r1) {
+        mMips->MOVE(R_cmp, cond.r1);
+        cond.r1 = R_cmp;
+    }
+    if (cond.type == CMP_COND && Rd == cond.r2) {
+        mMips->MOVE(R_cmp2, cond.r2);
+        cond.r2 = R_cmp2;
+    }
+}
+
+
+// interprets the addressing mode, and generates the common code
+// used by the majority of data-processing ops. Many MIPS instructions
+// have a register-based form and a different immediate form. See
+// opAND below for an example. (this could be inlined)
+//
+// this works with the imm(), reg_imm() methods above, which are directly
+// called by the GGLAssembler.
+// note: _signed parameter defaults to false (unsigned)
+// note: tmpReg parameter defaults to 1, MIPS register AT
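+// e.g. imm(0x12345678) makes dataProcAdrModes() emit LUI/ORI into the temp
+// reg and return SRC_REG, while imm(0x1234) simply returns SRC_IMM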
+int ArmToMipsAssembler::dataProcAdrModes(int op, int& source, bool _signed, int tmpReg)
+{
+    if (op < AMODE_REG) {
+        source = op;
+        return SRC_REG;
+    } else if (op == AMODE_IMM) {
+        if ((!_signed && amode.value > 0xffff)
+                || (_signed && ((int)amode.value < -32768 || (int)amode.value > 32767) )) {
+            mMips->LUI(tmpReg, (amode.value >> 16));
+            if (amode.value & 0x0000ffff) {
+                mMips->ORI(tmpReg, tmpReg, (amode.value & 0x0000ffff));
+            }
+            source = tmpReg;
+            return SRC_REG;
+        } else {
+            source = amode.value;
+            return SRC_IMM;
+        }
+    } else if (op == AMODE_REG_IMM) {
+        switch (amode.stype) {
+            case LSL: mMips->SLL(tmpReg, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(tmpReg, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(tmpReg, amode.reg, amode.value); break;
+            case ROR: if (mips32r2) {
+                          mMips->ROTR(tmpReg, amode.reg, amode.value);
+                      } else {
+                          mMips->RORIsyn(tmpReg, amode.reg, amode.value);
+                      }
+                      break;
+        }
+        source = tmpReg;
+        return SRC_REG;
+    } else {  // adr mode RRX is not used in GGL Assembler at this time
+        // we are screwed, this should be exception, assert-fail or something
+        LOG_ALWAYS_FATAL("adr mode reg_rrx not yet implemented\n");
+        return SRC_ERROR;
+    }
+}
+
+
+void ArmToMipsAssembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    int src;    // src is modified by dataProcAdrModes() - passed as int&
+
+
+    if (cc != AL) {
+        protectConditionalOperands(Rd);
+        // the branch tests register(s) set by prev CMP or instr with 'S' bit set
+        // invert the condition to jump past this conditional instruction
+        ArmToMipsAssembler::B(cc^1, cond.label[++cond.labelnum]);
+    } else {
+        mArmPC[mInum++] = pc();  // save starting PC for this instr
+    }
+
+    switch (opcode) {
+    case opAND:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->AND(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ANDI(Rd, Rn, src);
+        }
+        break;
+
+    case opADD:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->ADDU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ADDIU(Rd, Rn, src);
+        }
+        break;
+
+    case opSUB:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->SUBU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->SUBIU(Rd, Rn, src);
+        }
+        break;
+
+    case opEOR:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->XOR(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->XORI(Rd, Rn, src);
+        }
+        break;
+
+    case opORR:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->OR(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ORI(Rd, Rn, src);
+        }
+        break;
+
+    case opBIC:
+        if (dataProcAdrModes(Op2, src) == SRC_IMM) {
+            // if op2 is a 16-bit immediate, load it into the AT reg
+            mMips->ORI(R_at, 0, src);
+            src = R_at;
+        }
+        mMips->NOT(R_at, src);
+        mMips->AND(Rd, Rn, R_at);
+        break;
+
+    case opRSB:
+        if (dataProcAdrModes(Op2, src) == SRC_IMM) {
+            // if op2 is a 16-bit immediate, load it into the AT reg
+            mMips->ORI(R_at, 0, src);
+            src = R_at;
+        }
+        mMips->SUBU(Rd, src, Rn);   // subu with the parameters reversed
+        break;
+
+    case opMOV:
+        if (Op2 < AMODE_REG) {  // op2 is reg # in this case
+            mMips->MOVE(Rd, Op2);
+        } else if (Op2 == AMODE_IMM) {
+            if (amode.value > 0xffff) {
+                mMips->LUI(Rd, (amode.value >> 16));
+                if (amode.value & 0x0000ffff) {
+                    mMips->ORI(Rd, Rd, (amode.value & 0x0000ffff));
+                }
+             } else {
+                mMips->ORI(Rd, 0, amode.value);
+            }
+        } else if (Op2 == AMODE_REG_IMM) {
+            switch (amode.stype) {
+            case LSL: mMips->SLL(Rd, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(Rd, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(Rd, amode.reg, amode.value); break;
+            case ROR: if (mips32r2) {
+                          mMips->ROTR(Rd, amode.reg, amode.value);
+                      } else {
+                          mMips->RORIsyn(Rd, amode.reg, amode.value);
+                      }
+                      break;
+            }
+        }
+        else {
+            // adr mode RRX is not used in GGL Assembler at this time
+            mMips->UNIMPL();
+        }
+        break;
+
+    case opMVN:     // this is a 1's complement: NOT
+        if (Op2 < AMODE_REG) {  // op2 is reg # in this case
+            mMips->NOR(Rd, Op2, 0);     // NOT is NOR with 0
+            break;
+        } else if (Op2 == AMODE_IMM) {
+            if (amode.value > 0xffff) {
+                mMips->LUI(Rd, (amode.value >> 16));
+                if (amode.value & 0x0000ffff) {
+                    mMips->ORI(Rd, Rd, (amode.value & 0x0000ffff));
+                }
+             } else {
+                mMips->ORI(Rd, 0, amode.value);
+             }
+        } else if (Op2 == AMODE_REG_IMM) {
+            switch (amode.stype) {
+            case LSL: mMips->SLL(Rd, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(Rd, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(Rd, amode.reg, amode.value); break;
+            case ROR: if (mips32r2) {
+                          mMips->ROTR(Rd, amode.reg, amode.value);
+                      } else {
+                          mMips->RORIsyn(Rd, amode.reg, amode.value);
+                      }
+                      break;
+            }
+        }
+        else {
+            // adr mode RRX is not used in GGL Assembler at this time
+            mMips->UNIMPL();
+        }
+        mMips->NOR(Rd, Rd, 0);     // NOT is NOR with 0
+        break;
+
+    case opCMP:
+        // Either operand of a CMP instr could get overwritten by a subsequent
+        // conditional instruction, which is ok, _UNLESS_ there is a _second_
+        // conditional instruction. Under MIPS, this requires doing the comparison
+        // again (SLT), and the original operands must be available. (and this
+        // pattern of multiple conditional instructions from same CMP _is_ used
+        // in GGL-Assembler)
+        //
+        // For now, if a conditional instr overwrites the operands, we will
+        // move them to dedicated temp regs. This is ugly, and inefficient,
+        // and should be optimized.
+        //
+        // WARNING: making an _Assumption_ that CMP operand regs will NOT be
+        // trashed by intervening NON-conditional instructions. In the general
+        // case this is legal, but it is NOT currently done in GGL-Assembler.
+
+        cond.type = CMP_COND;
+        cond.r1 = Rn;
+        if (dataProcAdrModes(Op2, src, false, R_cmp2) == SRC_REG) {
+            cond.r2 = src;
+        } else {                        // adr mode was SRC_IMM
+            mMips->ORI(R_cmp2, R_zero, src);
+            cond.r2 = R_cmp2;
+        }
+
+        break;
+
+
+    case opTST:
+    case opTEQ:
+    case opCMN:
+    case opADC:
+    case opSBC:
+    case opRSC:
+        mMips->UNIMPL(); // currently unused in GGL Assembler code
+        break;
+    }
+
+    if (cc != AL) {
+        mMips->label(cond.label[cond.labelnum]);
+    }
+    if (s && opcode != opCMP) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Multiply...
+#endif
+
+// multiply, accumulate
+void ArmToMipsAssembler::MLA(int cc __unused, int s,
+        int Rd, int Rm, int Rs, int Rn) {
+
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->MUL(R_at, Rm, Rs);
+    mMips->ADDU(Rd, R_at, Rn);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+void ArmToMipsAssembler::MUL(int cc __unused, int s,
+        int Rd, int Rm, int Rs) {
+    mArmPC[mInum++] = pc();
+    mMips->MUL(Rd, Rm, Rs);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+void ArmToMipsAssembler::UMULL(int cc __unused, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    mArmPC[mInum++] = pc();
+    mMips->MULT(Rm, Rs);
+    mMips->MFHI(RdHi);
+    mMips->MFLO(RdLo);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on UMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMipsAssembler::UMUAL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<21) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on UMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMipsAssembler::SMULL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on SMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMipsAssembler::SMUAL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on SMUAL must be on 64-bit result\n");
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Branches...
+#endif
+
+// branches...
+
+void ArmToMipsAssembler::B(int cc, const char* label)
+{
+    mArmPC[mInum++] = pc();
+    if (cond.type == SBIT_COND) { cond.r2 = R_zero; }
+
+    switch(cc) {
+        case EQ: mMips->BEQ(cond.r1, cond.r2, label); break;
+        case NE: mMips->BNE(cond.r1, cond.r2, label); break;
+        case HS: mMips->BGEU(cond.r1, cond.r2, label); break;
+        case LO: mMips->BLTU(cond.r1, cond.r2, label); break;
+        case MI: mMips->BLT(cond.r1, cond.r2, label); break;
+        case PL: mMips->BGE(cond.r1, cond.r2, label); break;
+
+        case HI: mMips->BGTU(cond.r1, cond.r2, label); break;
+        case LS: mMips->BLEU(cond.r1, cond.r2, label); break;
+        case GE: mMips->BGE(cond.r1, cond.r2, label); break;
+        case LT: mMips->BLT(cond.r1, cond.r2, label); break;
+        case GT: mMips->BGT(cond.r1, cond.r2, label); break;
+        case LE: mMips->BLE(cond.r1, cond.r2, label); break;
+        case AL: mMips->B(label); break;
+        case NV: /* B Never - no instruction */ break;
+
+        case VS:
+        case VC:
+        default:
+            LOG_ALWAYS_FATAL("Unsupported cc: %02x\n", cc);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::BL(int cc __unused, const char* label __unused)
+{
+    LOG_ALWAYS_FATAL("branch-and-link not supported yet\n");
+    mArmPC[mInum++] = pc();
+}
+
+// no use for Branches with integer PC, but they're in the Interface class ....
+void ArmToMipsAssembler::B(int cc __unused, uint32_t* to_pc __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+void ArmToMipsAssembler::BL(int cc __unused, uint32_t* to_pc __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+void ArmToMipsAssembler::BX(int cc __unused, int Rn __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Data Transfer...
+#endif
+
+// data transfer...
+void ArmToMipsAssembler::LDR(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR via Arm SP to LW via Mips SP
+            }
+            mMips->LW(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->ADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR thru Arm SP to LW thru Mips SP
+            }
+            mMips->LW(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->ADDU(R_at, Rn, amode.reg);
+            mMips->LW(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::LDRB(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            mMips->LBU(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->ADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->LBU(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->ADDU(R_at, Rn, amode.reg);
+            mMips->LBU(Rd, R_at, 0);
+            break;
+    }
+
+}
+
+void ArmToMipsAssembler::STR(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;  // convert STR thru Arm SP to SW thru Mips SP
+            }
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                // If we will writeback, then update the index reg, then store.
+                // This correctly handles stack-push case.
+                mMips->ADDIU(Rn, Rn, amode.value);
+                mMips->SW(Rd, Rn, 0);
+            } else {
+                // No writeback so store offset by value
+                mMips->SW(Rd, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SW(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);  // post index always writes back
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->ADDU(R_at, Rn, amode.reg);
+            mMips->SW(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::STRB(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            mMips->SB(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->ADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SB(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->ADDU(R_at, Rn, amode.reg);
+            mMips->SB(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::LDRH(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed8_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_8_PRE:      // no support yet for writeback
+            mMips->LHU(Rd, Rn, amode.value);
+            break;
+        case AMODE_IMM_8_POST:
+            mMips->LHU(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_PRE:
+            // we only support simple base +/- index
+            if (amode.reg >= 0) {
+                mMips->ADDU(R_at, Rn, amode.reg);
+            } else {
+                mMips->SUBU(R_at, Rn, abs(amode.reg));
+            }
+            mMips->LHU(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::LDRSB(int cc __unused, int Rd __unused,
+                               int Rn __unused, uint32_t offset __unused)
+{
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::LDRSH(int cc __unused, int Rd __unused,
+                               int Rn __unused, uint32_t offset __unused)
+{
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::STRH(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed8_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_8_PRE:      // no support yet for writeback
+            mMips->SH(Rd, Rn, amode.value);
+            break;
+        case AMODE_IMM_8_POST:
+            mMips->SH(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_PRE:
+            // we only support simple base +/- index
+            if (amode.reg >= 0) {
+                mMips->ADDU(R_at, Rn, amode.reg);
+            } else {
+                mMips->SUBU(R_at, Rn, abs(amode.reg));
+            }
+            mMips->SH(Rd, R_at, 0);
+            break;
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Block Data Transfer...
+#endif
+
+// block data transfer...
+void ArmToMipsAssembler::LDM(int cc __unused, int dir __unused,
+        int Rn __unused, int W __unused, uint32_t reg_list __unused)
+{   //                        ED FD EA FA      IB IA DB DA
+    // const uint8_t P[8] = { 1, 0, 1, 0,      1, 0, 1, 0 };
+    // const uint8_t U[8] = { 1, 1, 0, 0,      1, 1, 0, 0 };
+    // *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+    //         (uint32_t(U[dir])<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::STM(int cc __unused, int dir __unused,
+        int Rn __unused, int W __unused, uint32_t reg_list __unused)
+{   //                        FA EA FD ED      IB IA DB DA
+    // const uint8_t P[8] = { 0, 1, 0, 1,      1, 0, 1, 0 };
+    // const uint8_t U[8] = { 0, 0, 1, 1,      1, 1, 0, 0 };
+    // *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+    //         (uint32_t(U[dir])<<23) | (0<<20) | (W<<21) | (Rn<<16) | reg_list;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Special...
+#endif
+
+// special...
+void ArmToMipsAssembler::SWP(int cc __unused, int Rn __unused,
+                             int Rd __unused, int Rm __unused) {
+    // *mPC++ = (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::SWPB(int cc __unused, int Rn __unused,
+                              int Rd __unused, int Rm __unused) {
+    // *mPC++ = (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::SWI(int cc __unused, uint32_t comment __unused) {
+    // *mPC++ = (cc<<28) | (0xF<<24) | comment;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+
+#if 0
+#pragma mark -
+#pragma mark DSP instructions...
+#endif
+
+// DSP instructions...
+void ArmToMipsAssembler::PLD(int Rn __unused, uint32_t offset) {
+    LOG_ALWAYS_FATAL_IF(!((offset&(1<<24)) && !(offset&(1<<21))),
+                        "PLD only P=1, W=0");
+    // *mPC++ = 0xF550F000 | (Rn<<16) | offset;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::CLZ(int cc __unused, int Rd, int Rm)
+{
+    mArmPC[mInum++] = pc();
+    mMips->CLZ(Rd, Rm);
+}
+
+void ArmToMipsAssembler::QADD(int cc __unused,  int Rd __unused,
+                              int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::QDADD(int cc __unused,  int Rd __unused,
+                               int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::QSUB(int cc __unused,  int Rd __unused,
+                              int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::QDSUB(int cc __unused,  int Rd __unused,
+                               int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+// 16 x 16 signed multiply (like SMLAxx without the accumulate)
+void ArmToMipsAssembler::SMUL(int cc __unused, int xy,
+                int Rd, int Rm, int Rs)
+{
+    mArmPC[mInum++] = pc();
+
+    // the 16 bits may be in the top or bottom half of 32-bit source reg,
+    // as defined by the codes BB, BT, TB, TT (compressed param xy)
+    // where x corresponds to Rm and y to Rs
+
+    // select half-reg for Rm
+    if (xy & xyTB) {
+        // use top 16-bits
+        mMips->SRA(R_at, Rm, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        if (mips32r2) {
+            mMips->SEH(R_at, Rm);
+        } else {
+            mMips->SLL(R_at, Rm, 16);
+            mMips->SRA(R_at, R_at, 16);
+        }
+    }
+    // select half-reg for Rs
+    if (xy & xyBT) {
+        // use top 16-bits
+        mMips->SRA(R_at2, Rs, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        if (mips32r2) {
+            mMips->SEH(R_at2, Rs);
+        } else {
+            mMips->SLL(R_at2, Rs, 16);
+            mMips->SRA(R_at2, R_at2, 16);
+        }
+    }
+    mMips->MUL(Rd, R_at, R_at2);
+}
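+// Sketch of one case (assuming xyTB selects the top half of Rm and xyBT
+// the top half of Rs, per the tests above): ARM "smultb rd, rm, rs" on
+// mips32r2 becomes "sra at, rm, 16; seh at2, rs; mul rd, at, at2".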
+
+// signed 32b x 16b multiply, save top 32 bits of the 48-bit result
+void ArmToMipsAssembler::SMULW(int cc __unused, int y,
+                int Rd, int Rm, int Rs)
+{
+    mArmPC[mInum++] = pc();
+
+    // the selector yT or yB refers to reg Rs
+    if (y & yT) {
+        // zero the bottom 16 bits with 2 shifts; leaving them set could affect the result
+        mMips->SRL(R_at, Rs, 16);
+        mMips->SLL(R_at, R_at, 16);
+
+    } else {
+        // move low 16-bit half, to high half
+        mMips->SLL(R_at, Rs, 16);
+    }
+    mMips->MULT(Rm, R_at);
+    mMips->MFHI(Rd);
+}
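+// Why MFHI gives the right answer: with the 16-bit half of Rs placed in
+// the top half of at, MULT computes the 64-bit product Rm * (s16 << 16),
+// whose high 32 bits equal (Rm * s16) >> 16 -- i.e. the top 32 bits of
+// the 48-bit product, matching ARM SMULW semantics.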
+
+// 16 x 16 signed multiply, accumulate: Rd = Rm{16} * Rs{16} + Rn
+void ArmToMipsAssembler::SMLA(int cc __unused, int xy,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    mArmPC[mInum++] = pc();
+
+    // the 16 bits may be in the top or bottom half of 32-bit source reg,
+    // as defined by the codes BB, BT, TB, TT (compressed param xy)
+    // where x corresponds to Rm and y to Rs
+
+    // select half-reg for Rm
+    if (xy & xyTB) {
+        // use top 16-bits
+        mMips->SRA(R_at, Rm, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        if (mips32r2) {
+            mMips->SEH(R_at, Rm);
+        } else {
+            mMips->SLL(R_at, Rm, 16);
+            mMips->SRA(R_at, R_at, 16);
+        }
+    }
+    // select half-reg for Rs
+    if (xy & xyBT) {
+        // use top 16-bits
+        mMips->SRA(R_at2, Rs, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        if (mips32r2) {
+            mMips->SEH(R_at2, Rs);
+        } else {
+            mMips->SLL(R_at2, Rs, 16);
+            mMips->SRA(R_at2, R_at2, 16);
+        }
+    }
+
+    mMips->MUL(R_at, R_at, R_at2);
+    mMips->ADDU(Rd, R_at, Rn);
+}
+
+void ArmToMipsAssembler::SMLAL(int cc __unused, int xy __unused,
+                               int RdHi __unused, int RdLo __unused,
+                               int Rs __unused, int Rm __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::SMLAW(int cc __unused, int y __unused,
+                               int Rd __unused, int Rm __unused,
+                               int Rs __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+// used by ARMv6 version of GGLAssembler::filter32
+void ArmToMipsAssembler::UXTB16(int cc __unused, int Rd, int Rm, int rotate)
+{
+    mArmPC[mInum++] = pc();
+
+    //Rd[31:16] := ZeroExtend((Rm ROR (8 * sh))[23:16]),
+    //Rd[15:0] := ZeroExtend((Rm ROR (8 * sh))[7:0]). sh 0-3.
+
+    mMips->ROTR(Rm, Rm, rotate * 8);
+    // 0x00FF00FF does not fit ANDI's 16-bit immediate, and AND() takes a
+    // register, not a constant -- build the mask in the at2 scratch reg first
+    mMips->LUI(R_at2, 0x00FF);
+    mMips->ORI(R_at2, R_at2, 0x00FF);
+    mMips->AND(Rd, Rm, R_at2);
+}
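+// Worked example of the spec above: with rotate == 0, an Rm value of
+// 0xAABBCCDD yields Rd == 0x00BB00DD (bytes 2 and 0, zero-extended into
+// the two half-words).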
+
+void ArmToMipsAssembler::UBFX(int cc __unused, int Rd __unused,
+                              int Rn __unused, int lsb __unused,
+                              int width __unused)
+{
+     /* Placeholder for UBFX */
+     mArmPC[mInum++] = pc();
+
+     mMips->NOP2();
+     NOT_IMPLEMENTED();
+}
+
+
+
+
+
+#if 0
+#pragma mark -
+#pragma mark MIPS Assembler...
+#endif
+
+
+//**************************************************************************
+//**************************************************************************
+//**************************************************************************
+
+
+/* mips assembler
+** this is a subset of mips32r2, targeted specifically at ARM instruction
+** replacement in the pixelflinger/codeflinger code.
+**
+** To that end, there is no need for floating point or privileged
+** instructions: this all runs in user space, with no float.
+**
+** The syntax makes no attempt to be as complete as a full assembler,
+** with synthetic instructions, automatic recognition of immediate
+** operands (callers must use the immediate form of the instruction), etc.
+**
+** We start with mips32r1, and may add r2 and dsp extensions if the cpu
+** supports them. The decision is made at compile time, based on gcc
+** options (which makes sense, since android is built for a specific
+** device).
+*/
+
+MIPSAssembler::MIPSAssembler(const sp<Assembly>& assembly, ArmToMipsAssembler *parent)
+    : mParent(parent),
+    mAssembly(assembly)
+{
+    mBase = mPC = (uint32_t *)assembly->base();
+    mDuration = ggl_system_time();
+}
+
+MIPSAssembler::MIPSAssembler(void* assembly)
+    : mParent(NULL), mAssembly(NULL)
+{
+    mBase = mPC = (uint32_t *)assembly;
+}
+
+MIPSAssembler::~MIPSAssembler()
+{
+}
+
+
+uint32_t* MIPSAssembler::pc() const
+{
+    return mPC;
+}
+
+uint32_t* MIPSAssembler::base() const
+{
+    return mBase;
+}
+
+void MIPSAssembler::reset()
+{
+    mBase = mPC = (uint32_t *)mAssembly->base();
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+}
+
+
+// convert tabs to spaces, and remove any newline
+// works with strings of limited size (makes a temp copy)
+#define TABSTOP 8
+void MIPSAssembler::string_detab(char *s)
+{
+    char *os = s;
+    char temp[100];
+    char *t = temp;
+    int len = 99;
+    int i = TABSTOP;
+
+    while (*s && len-- > 0) {
+        if (*s == '\n') { s++; continue; }
+        if (*s == '\t') {
+            s++;
+            for ( ; i>0; i--) {*t++ = ' '; len--; }
+        } else {
+            *t++ = *s++;
+        }
+        if (i <= 0) i = TABSTOP;
+        i--;
+    }
+    *t = '\0';
+    strcpy(os, temp);
+}
+
+void MIPSAssembler::string_pad(char *s, int padded_len)
+{
+    int len = strlen(s);
+    s += len;
+    for (int i = padded_len - len; i > 0; --i) {
+        *s++ = ' ';
+    }
+    *s = '\0';
+}
+
+// ----------------------------------------------------------------------------
+
+void MIPSAssembler::disassemble(const char* name)
+{
+    char di_buf[140];
+
+    if (name) {
+        ALOGW("%s:\n", name);
+    }
+
+    bool arm_disasm_fmt = (mParent->mArmDisassemblyBuffer != NULL);
+
+    typedef char dstr[40];
+    dstr *lines = (dstr *)mParent->mArmDisassemblyBuffer;
+
+    if (mParent->mArmDisassemblyBuffer != NULL) {
+        for (int i=0; i<mParent->mArmInstrCount; ++i) {
+            string_detab(lines[i]);
+        }
+    }
+
+    size_t count = pc()-base();
+    uint32_t* mipsPC = base();
+    while (count--) {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(mipsPC);
+        if (label >= 0) {
+            ALOGW("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(mipsPC);
+        if (comment >= 0) {
+            ALOGW("; %s\n", mComments.valueAt(comment));
+        }
+        // ALOGW("%08x:    %08x    ", int(i), int(i[0]));
+        ::mips_disassem(mipsPC, di_buf, arm_disasm_fmt);
+        string_detab(di_buf);
+        string_pad(di_buf, 30);
+        ALOGW("0x%p:    %08x    %s", mipsPC, uint32_t(*mipsPC), di_buf);
+        mipsPC++;
+    }
+}
+
+void MIPSAssembler::comment(const char* string)
+{
+    mComments.add(pc(), string);
+}
+
+void MIPSAssembler::label(const char* theLabel)
+{
+    mLabels.add(theLabel, pc());
+    mLabelsInverseMapping.add(pc(), theLabel);
+}
+
+
+void MIPSAssembler::prolog()
+{
+    // empty - done in ArmToMipsAssembler
+}
+
+void MIPSAssembler::epilog(uint32_t touched __unused)
+{
+    // empty - done in ArmToMipsAssembler
+}
+
+int MIPSAssembler::generate(const char* name)
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--) {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - (bt.pc+1));
+        *bt.pc |= offset & 0x00FFFF;
+    }
+
+    mAssembly->resize( int(pc()-base())*4 );
+
+    // the instruction & data caches are flushed by CodeCache
+    const int64_t duration = ggl_system_time() - mDuration;
+    const char * const format = "generated %s (%d ins) at [%p:%p] in %" PRId64 " ns\n";
+    ALOGI(format, name, int(pc()-base()), base(), pc(), duration);
+
+    char value[PROPERTY_VALUE_MAX];
+    value[0] = '\0';
+
+    property_get("debug.pf.disasm", value, "0");
+
+    if (atoi(value) != 0) {
+        disassemble(name);
+    }
+
+    return OK;
+}
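+// Branch fixup sketch: offsets are in words, relative to the instruction
+// after the branch (the delay slot). If the branch word sits at index 10
+// and its label at index 14, the encoded 16-bit offset is 14 - (10+1) = 3.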
+
+uint32_t* MIPSAssembler::pcForLabel(const char* label)
+{
+    return mLabels.valueFor(label);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Arithmetic...
+#endif
+
+void MIPSAssembler::ADDU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (addu_fn<<FUNC_SHF)
+                    | (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+// MD00086 pdf says this is: ADDIU rt, rs, imm -- they do not use Rd
+void MIPSAssembler::ADDIU(int Rt, int Rs, int16_t imm)
+{
+    *mPC++ = (addiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+
+void MIPSAssembler::SUBU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (subu_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+
+void MIPSAssembler::SUBIU(int Rt, int Rs, int16_t imm)   // really addiu(d, s, -j)
+{
+    *mPC++ = (addiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | ((-imm) & MSK_16);
+}
+
+
+void MIPSAssembler::NEGU(int Rd, int Rs)    // really subu(d, zero, s)
+{
+    MIPSAssembler::SUBU(Rd, 0, Rs);
+}
+
+void MIPSAssembler::MUL(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (mul_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+void MIPSAssembler::MULT(int Rs, int Rt)    // dest is hi,lo
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mult_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MULTU(int Rs, int Rt)    // dest is hi,lo
+{
+    *mPC++ = (spec_op<<OP_SHF) | (multu_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MADD(int Rs, int Rt)    // hi,lo = hi,lo + Rs * Rt
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (madd_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MADDU(int Rs, int Rt)    // hi,lo = hi,lo + Rs * Rt
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (maddu_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+
+void MIPSAssembler::MSUB(int Rs, int Rt)    // hi,lo = hi,lo - Rs * Rt
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (msub_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MSUBU(int Rs, int Rt)    // hi,lo = hi,lo - Rs * Rt
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (msubu_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+
+void MIPSAssembler::SEB(int Rd, int Rt)    // sign-extend byte (mips32r2)
+{
+    *mPC++ = (spec3_op<<OP_SHF) | (bshfl_fn<<FUNC_SHF) | (seb_fn << SA_SHF) |
+                    (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+void MIPSAssembler::SEH(int Rd, int Rt)    // sign-extend half-word (mips32r2)
+{
+    *mPC++ = (spec3_op<<OP_SHF) | (bshfl_fn<<FUNC_SHF) | (seh_fn << SA_SHF) |
+                    (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Comparisons...
+#endif
+
+void MIPSAssembler::SLT(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (slt_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::SLTI(int Rt, int Rs, int16_t imm)
+{
+    *mPC++ = (slti_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+
+void MIPSAssembler::SLTU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (sltu_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::SLTIU(int Rt, int Rs, int16_t imm)
+{
+    *mPC++ = (sltiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Logical...
+#endif
+
+void MIPSAssembler::AND(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (and_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::ANDI(int Rt, int Rs, uint16_t imm)      // todo: support larger immediate
+{
+    *mPC++ = (andi_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+
+void MIPSAssembler::OR(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (or_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::ORI(int Rt, int Rs, uint16_t imm)
+{
+    *mPC++ = (ori_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+void MIPSAssembler::NOR(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (nor_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::NOT(int Rd, int Rs)
+{
+    MIPSAssembler::NOR(Rd, Rs, 0);  // NOT(d,s) = NOR(d,s,zero)
+}
+
+void MIPSAssembler::XOR(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (xor_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::XORI(int Rt, int Rs, uint16_t imm)  // todo: support larger immediate
+{
+    *mPC++ = (xori_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+void MIPSAssembler::SLL(int Rd, int Rt, int shft)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (sll_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rt<<RT_SHF) | (shft<<RE_SHF);
+}
+
+void MIPSAssembler::SLLV(int Rd, int Rt, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (sllv_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::SRL(int Rd, int Rt, int shft)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (srl_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rt<<RT_SHF) | (shft<<RE_SHF);
+}
+
+void MIPSAssembler::SRLV(int Rd, int Rt, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (srlv_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::SRA(int Rd, int Rt, int shft)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (sra_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rt<<RT_SHF) | (shft<<RE_SHF);
+}
+
+void MIPSAssembler::SRAV(int Rd, int Rt, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (srav_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::ROTR(int Rd, int Rt, int shft)      // mips32r2
+{
+    // note: encoded as SRL with 1 in the (otherwise unused) RS field
+    *mPC++ = (spec_op<<OP_SHF) | (srl_fn<<FUNC_SHF) |
+                        (1<<RS_SHF) | (Rd<<RD_SHF) | (Rt<<RT_SHF) | (shft<<RE_SHF);
+}
+
+void MIPSAssembler::ROTRV(int Rd, int Rt, int Rs)       // mips32r2
+{
+    // note: encoded as SRLV with 1 in the (otherwise unused) shift-amount field
+    *mPC++ = (spec_op<<OP_SHF) | (srlv_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF) | (1<<RE_SHF);
+}
+
+// uses at2 register (mapped to some appropriate mips reg)
+void MIPSAssembler::RORsyn(int Rd, int Rt, int Rs)
+{
+    // synthetic: d = t rotated by s
+    MIPSAssembler::NEGU(R_at2, Rs);
+    MIPSAssembler::SLLV(R_at2, Rt, R_at2);
+    MIPSAssembler::SRLV(Rd, Rt, Rs);
+    MIPSAssembler::OR(Rd, Rd, R_at2);
+}
+
+// immediate version - uses at2 register (mapped to some appropriate mips reg)
+void MIPSAssembler::RORIsyn(int Rd, int Rt, int rot)
+{
+    // synthetic: d = t rotated by immed rot
+    // d = s >> rot | s << (32-rot)
+    MIPSAssembler::SLL(R_at2, Rt, 32-rot);
+    MIPSAssembler::SRL(Rd, Rt, rot);
+    MIPSAssembler::OR(Rd, Rd, R_at2);
+}
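+// Worked example: RORIsyn with rot == 8 on 0x12345678 computes
+// (0x12345678 >> 8) | (0x12345678 << 24) == 0x78123456. Note rot must be
+// 1..31, since a 32-bit shift amount is not encodable in SLL/SRL.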
+
+void MIPSAssembler::CLO(int Rd, int Rs)
+{
+    // Rt field must have same gpr # as Rd
+    *mPC++ = (spec2_op<<OP_SHF) | (clo_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rd<<RT_SHF);
+}
+
+void MIPSAssembler::CLZ(int Rd, int Rs)
+{
+    // Rt field must have same gpr # as Rd
+    *mPC++ = (spec2_op<<OP_SHF) | (clz_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rd<<RT_SHF);
+}
+
+void MIPSAssembler::WSBH(int Rd, int Rt)      // mips32r2
+{
+    *mPC++ = (spec3_op<<OP_SHF) | (bshfl_fn<<FUNC_SHF) | (wsbh_fn << SA_SHF) |
+                        (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Load/store...
+#endif
+
+void MIPSAssembler::LW(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lw_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::SW(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (sw_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+// lb is sign-extended
+void MIPSAssembler::LB(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lb_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::LBU(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lbu_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::SB(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (sb_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+// lh is sign-extended
+void MIPSAssembler::LH(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lh_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::LHU(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lhu_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::SH(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (sh_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::LUI(int Rt, int16_t offset)
+{
+    *mPC++ = (lui_op<<OP_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Register move...
+#endif
+
+void MIPSAssembler::MOVE(int Rd, int Rs)
+{
+    // encoded as "or rd, rs, zero"
+    *mPC++ = (spec_op<<OP_SHF) | (or_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (0<<RT_SHF);
+}
+
+void MIPSAssembler::MOVN(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (movn_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::MOVZ(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (movz_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::MFHI(int Rd)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mfhi_fn<<FUNC_SHF) | (Rd<<RD_SHF);
+}
+
+void MIPSAssembler::MFLO(int Rd)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mflo_fn<<FUNC_SHF) | (Rd<<RD_SHF);
+}
+
+void MIPSAssembler::MTHI(int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mthi_fn<<FUNC_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MTLO(int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mtlo_fn<<FUNC_SHF) | (Rs<<RS_SHF);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Branch...
+#endif
+
+// temporarily forcing a NOP into the branch-delay slot, just to be safe
+// todo: remove NOP, optimize use of delay slots
+void MIPSAssembler::B(const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+
+    // encoded as BEQ zero, zero, offset
+    *mPC++ = (beq_op<<OP_SHF) | (0<<RT_SHF)
+                        | (0<<RS_SHF) | 0;  // offset filled in later
+
+    MIPSAssembler::NOP();
+}
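+// With beq_op == 4, the placeholder word above is 0x10000000 ("beq zero,
+// zero, 0"); generate() later ORs the real 16-bit word offset into it.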
+
+void MIPSAssembler::BEQ(int Rs, int Rt, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (beq_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BNE(int Rs, int Rt, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (bne_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BLEZ(int Rs, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (blez_op<<OP_SHF) | (0<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BLTZ(int Rs, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (regimm_op<<OP_SHF) | (bltz_fn<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BGTZ(int Rs, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (bgtz_op<<OP_SHF) | (0<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+
+void MIPSAssembler::BGEZ(int Rs, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (regimm_op<<OP_SHF) | (bgez_fn<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::JR(int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (Rs<<RS_SHF) | (jr_fn << FUNC_SHF);
+    MIPSAssembler::NOP();
+}
+
+
+#if 0
+#pragma mark -
+#pragma mark Synthesized Branch...
+#endif
+
+// synthetic variants of branches (using slt & friends)
+void MIPSAssembler::BEQZ(int Rs, const char* label)
+{
+    BEQ(Rs, R_zero, label);
+}
+
+void MIPSAssembler::BNEZ(int Rs __unused, const char* label)
+{
+    // note: as written this tests R_at, not Rs; callers must already have
+    // the condition value in the at scratch register
+    BNE(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BGE(int Rs, int Rt, const char* label)
+{
+    SLT(R_at, Rs, Rt);
+    BEQ(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BGEU(int Rs, int Rt, const char* label)
+{
+    SLTU(R_at, Rs, Rt);
+    BEQ(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BGT(int Rs, int Rt, const char* label)
+{
+    SLT(R_at, Rt, Rs);   // rev
+    BNE(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BGTU(int Rs, int Rt, const char* label)
+{
+    SLTU(R_at, Rt, Rs);   // rev
+    BNE(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BLE(int Rs, int Rt, const char* label)
+{
+    SLT(R_at, Rt, Rs);   // rev
+    BEQ(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BLEU(int Rs, int Rt, const char* label)
+{
+    SLTU(R_at, Rt, Rs);  // rev
+    BEQ(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BLT(int Rs, int Rt, const char* label)
+{
+    SLT(R_at, Rs, Rt);
+    BNE(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BLTU(int Rs, int Rt, const char* label)
+{
+    SLTU(R_at, Rs, Rt);
+    BNE(R_at, R_zero, label);
+}
+
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Misc...
+#endif
+
+void MIPSAssembler::NOP(void)
+{
+    // encoded as "sll zero, zero, 0", which is all zero
+    *mPC++ = (spec_op<<OP_SHF) | (sll_fn<<FUNC_SHF);
+}
+
+// using this as special opcode for not-yet-implemented ARM instruction
+void MIPSAssembler::NOP2(void)
+{
+    // encoded as "sll zero, zero, 2", still a nop, but a unique code
+    *mPC++ = (spec_op<<OP_SHF) | (sll_fn<<FUNC_SHF) | (2 << RE_SHF);
+}
+
+// using this as special opcode for purposefully NOT implemented ARM instruction
+void MIPSAssembler::UNIMPL(void)
+{
+    // encoded as "sll zero, zero, 3", still a nop, but a unique code
+    *mPC++ = (spec_op<<OP_SHF) | (sll_fn<<FUNC_SHF) | (3 << RE_SHF);
+}
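+// For reference, the three nop flavors assemble to distinct words:
+// NOP == 0x00000000, NOP2 == 0x00000080 (sll zero, zero, 2), and
+// UNIMPL == 0x000000c0 (sll zero, zero, 3) -- easy to spot in a dump.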
+
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/MIPSAssembler.h b/libpixelflinger/codeflinger/MIPSAssembler.h
new file mode 100644
index 0000000..c1178b6
--- /dev/null
+++ b/libpixelflinger/codeflinger/MIPSAssembler.h
@@ -0,0 +1,557 @@
+/* libs/pixelflinger/codeflinger/MIPSAssembler.h
+**
+** Copyright 2012, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef ANDROID_MIPSASSEMBLER_H
+#define ANDROID_MIPSASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "tinyutils/smartpointer.h"
+#include "utils/KeyedVector.h"
+#include "utils/Vector.h"
+
+#include "ARMAssemblerInterface.h"
+#include "CodeCache.h"
+
+namespace android {
+
+class MIPSAssembler;    // forward reference
+
+// this class mimics ARMAssembler interface
+//  intent is to translate each ARM instruction to 1 or more MIPS instr
+//  implementation calls MIPSAssembler class to generate mips code
+class ArmToMipsAssembler : public ARMAssemblerInterface
+{
+public:
+                ArmToMipsAssembler(const sp<Assembly>& assembly,
+                        char *abuf = 0, int linesz = 0, int instr_count = 0);
+    virtual     ~ArmToMipsAssembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+    void        disassemble(const char* name);
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+    // byte/half word extract...
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+
+    // bit manipulation...
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+    // debug state shared with the MIPSAssembler class
+    char *      mArmDisassemblyBuffer;
+    int         mArmLineLength;
+    int         mArmInstrCount;
+
+    int         mInum;      // current arm instruction number (0..n)
+    uint32_t**  mArmPC;     // array: PC for 1st mips instr of
+                            //      each translated ARM instr
+
+
+private:
+    ArmToMipsAssembler(const ArmToMipsAssembler& rhs);
+    ArmToMipsAssembler& operator = (const ArmToMipsAssembler& rhs);
+
+    void init_conditional_labels(void);
+
+    void protectConditionalOperands(int Rd);
+
+    // reg_tmp defaults to MIPS AT, reg 1
+    int dataProcAdrModes(int op, int& source, bool sign = false, int reg_tmp = 1);
+
+    sp<Assembly>        mAssembly;
+    MIPSAssembler*      mMips;
+
+
+    enum misc_constants_t {
+        ARM_MAX_INSTUCTIONS = 512  // based on ASSEMBLY_SCRATCH_SIZE
+    };
+
+    enum {
+        SRC_REG = 0,
+        SRC_IMM,
+        SRC_ERROR = -1
+    };
+
+    enum addr_modes {
+        // start above the range of legal mips reg #'s (0-31)
+        AMODE_REG = 0x20,
+        AMODE_IMM, AMODE_REG_IMM,               // for data processing
+        AMODE_IMM_12_PRE, AMODE_IMM_12_POST,    // for load/store
+        AMODE_REG_SCALE_PRE, AMODE_IMM_8_PRE,
+        AMODE_IMM_8_POST, AMODE_REG_PRE,
+        AMODE_UNSUPPORTED
+    };
+
+    struct addr_mode_t {    // address modes for current ARM instruction
+        int         reg;
+        int         stype;
+        uint32_t    value;
+        bool        writeback;  // writeback the adr reg after modification
+    } amode;
+
+    enum cond_types {
+        CMP_COND = 1,
+        SBIT_COND
+    };
+
+    struct cond_mode_t {    // conditional-execution info for current ARM instruction
+        cond_types  type;
+        int         r1;
+        int         r2;
+        int         labelnum;
+        char        label[100][10];
+    } cond;
+
+};
+
+
+
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// This is the basic MIPS assembler, which just creates the opcodes in memory.
+// All the more complicated work is done in ArmToMipsAssembler above.
+
+class MIPSAssembler
+{
+public:
+                MIPSAssembler(const sp<Assembly>& assembly, ArmToMipsAssembler *parent);
+                MIPSAssembler(void* assembly);
+    virtual     ~MIPSAssembler();
+
+    virtual uint32_t*   base() const;
+    virtual uint32_t*   pc() const;
+    virtual void        reset();
+
+    virtual void        disassemble(const char* name);
+
+    virtual void        prolog();
+    virtual void        epilog(uint32_t touched);
+    virtual int         generate(const char* name);
+    virtual void        comment(const char* string);
+    virtual void        label(const char* string);
+
+    // valid only after generate() has been called
+    virtual uint32_t*   pcForLabel(const char* label);
+
+
+    // ------------------------------------------------------------------------
+    // MIPSAssemblerInterface...
+    // ------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Arithmetic...
+#endif
+
+    void ADDU(int Rd, int Rs, int Rt);
+    void ADDIU(int Rt, int Rs, int16_t imm);
+    void SUBU(int Rd, int Rs, int Rt);
+    void SUBIU(int Rt, int Rs, int16_t imm);
+    void NEGU(int Rd, int Rs);
+    void MUL(int Rd, int Rs, int Rt);
+    void MULT(int Rs, int Rt);      // dest is hi,lo
+    void MULTU(int Rs, int Rt);     // dest is hi,lo
+    void MADD(int Rs, int Rt);      // hi,lo = hi,lo + Rs * Rt
+    void MADDU(int Rs, int Rt);     // hi,lo = hi,lo + Rs * Rt
+    void MSUB(int Rs, int Rt);      // hi,lo = hi,lo - Rs * Rt
+    void MSUBU(int Rs, int Rt);     // hi,lo = hi,lo - Rs * Rt
+    void SEB(int Rd, int Rt);       // sign-extend byte (mips32r2)
+    void SEH(int Rd, int Rt);       // sign-extend half-word (mips32r2)
+
+
+#if 0
+#pragma mark -
+#pragma mark Comparisons...
+#endif
+
+    void SLT(int Rd, int Rs, int Rt);
+    void SLTI(int Rt, int Rs, int16_t imm);
+    void SLTU(int Rd, int Rs, int Rt);
+    void SLTIU(int Rt, int Rs, int16_t imm);
+
+
+#if 0
+#pragma mark -
+#pragma mark Logical...
+#endif
+
+    void AND(int Rd, int Rs, int Rt);
+    void ANDI(int Rt, int Rs, uint16_t imm);
+    void OR(int Rd, int Rs, int Rt);
+    void ORI(int Rt, int Rs, uint16_t imm);
+    void NOR(int Rd, int Rs, int Rt);
+    void NOT(int Rd, int Rs);
+    void XOR(int Rd, int Rs, int Rt);
+    void XORI(int Rt, int Rs, uint16_t imm);
+
+    void SLL(int Rd, int Rt, int shft);
+    void SLLV(int Rd, int Rt, int Rs);
+    void SRL(int Rd, int Rt, int shft);
+    void SRLV(int Rd, int Rt, int Rs);
+    void SRA(int Rd, int Rt, int shft);
+    void SRAV(int Rd, int Rt, int Rs);
+    void ROTR(int Rd, int Rt, int shft);    // mips32r2
+    void ROTRV(int Rd, int Rt, int Rs);     // mips32r2
+    void RORsyn(int Rd, int Rt, int Rs);    // synthetic: d = t rotated by s
+    void RORIsyn(int Rd, int Rt, int rot);  // synthetic: d = t rotated by immed rot
+
+    void CLO(int Rd, int Rs);
+    void CLZ(int Rd, int Rs);
+    void WSBH(int Rd, int Rt);
+
+
+#if 0
+#pragma mark -
+#pragma mark Load/store...
+#endif
+
+    void LW(int Rt, int Rbase, int16_t offset);
+    void SW(int Rt, int Rbase, int16_t offset);
+    void LB(int Rt, int Rbase, int16_t offset);
+    void LBU(int Rt, int Rbase, int16_t offset);
+    void SB(int Rt, int Rbase, int16_t offset);
+    void LH(int Rt, int Rbase, int16_t offset);
+    void LHU(int Rt, int Rbase, int16_t offset);
+    void SH(int Rt, int Rbase, int16_t offset);
+    void LUI(int Rt, int16_t offset);
+
+#if 0
+#pragma mark -
+#pragma mark Register moves...
+#endif
+
+    void MOVE(int Rd, int Rs);
+    void MOVN(int Rd, int Rs, int Rt);
+    void MOVZ(int Rd, int Rs, int Rt);
+    void MFHI(int Rd);
+    void MFLO(int Rd);
+    void MTHI(int Rs);
+    void MTLO(int Rs);
+
+#if 0
+#pragma mark -
+#pragma mark Branch...
+#endif
+
+    void B(const char* label);
+    void BEQ(int Rs, int Rt, const char* label);
+    void BNE(int Rs, int Rt, const char* label);
+    void BGEZ(int Rs, const char* label);
+    void BGTZ(int Rs, const char* label);
+    void BLEZ(int Rs, const char* label);
+    void BLTZ(int Rs, const char* label);
+    void JR(int Rs);
+
+
+#if 0
+#pragma mark -
+#pragma mark Synthesized Branch...
+#endif
+
+    // synthetic variants of above (using slt & friends)
+    void BEQZ(int Rs, const char* label);
+    void BNEZ(int Rs, const char* label);
+    void BGE(int Rs, int Rt, const char* label);
+    void BGEU(int Rs, int Rt, const char* label);
+    void BGT(int Rs, int Rt, const char* label);
+    void BGTU(int Rs, int Rt, const char* label);
+    void BLE(int Rs, int Rt, const char* label);
+    void BLEU(int Rs, int Rt, const char* label);
+    void BLT(int Rs, int Rt, const char* label);
+    void BLTU(int Rs, int Rt, const char* label);
+
+#if 0
+#pragma mark -
+#pragma mark Misc...
+#endif
+
+    void NOP(void);
+    void NOP2(void);
+    void UNIMPL(void);
+
+
+
+
+
+protected:
+    virtual void string_detab(char *s);
+    virtual void string_pad(char *s, int padded_len);
+
+    ArmToMipsAssembler *mParent;
+    sp<Assembly>    mAssembly;
+    uint32_t*       mBase;
+    uint32_t*       mPC;
+    uint32_t*       mPrologPC;
+    int64_t         mDuration;
+
+    struct branch_target_t {
+        inline branch_target_t() : label(0), pc(0) { }
+        inline branch_target_t(const char* l, uint32_t* p)
+            : label(l), pc(p) { }
+        const char* label;
+        uint32_t*   pc;
+    };
+
+    Vector<branch_target_t>                 mBranchTargets;
+    KeyedVector< const char*, uint32_t* >   mLabels;
+    KeyedVector< uint32_t*, const char* >   mLabelsInverseMapping;
+    KeyedVector< uint32_t*, const char* >   mComments;
+
+
+
+
+    // opcode field of all instructions
+    enum opcode_field {
+        spec_op, regimm_op, j_op, jal_op,           // 00
+        beq_op, bne_op, blez_op, bgtz_op,
+        addi_op, addiu_op, slti_op, sltiu_op,       // 08
+        andi_op, ori_op, xori_op, lui_op,
+        cop0_op, cop1_op, cop2_op, cop1x_op,        // 10
+        beql_op, bnel_op, blezl_op, bgtzl_op,
+        daddi_op, daddiu_op, ldl_op, ldr_op,        // 18
+        spec2_op, jalx_op, mdmx_op, spec3_op,
+        lb_op, lh_op, lwl_op, lw_op,                // 20
+        lbu_op, lhu_op, lwr_op, lwu_op,
+        sb_op, sh_op, swl_op, sw_op,                // 28
+        sdl_op, sdr_op, swr_op, cache_op,
+        ll_op, lwc1_op, lwc2_op, pref_op,           // 30
+        lld_op, ldc1_op, ldc2_op, ld_op,
+        sc_op, swc1_op, swc2_op, rsrv_3b_op,        // 38
+        scd_op, sdc1_op, sdc2_op, sd_op
+    };
+
+
+    // func field for special opcode
+    enum func_spec_op {
+        sll_fn, movc_fn, srl_fn, sra_fn,            // 00
+        sllv_fn, pmon_fn, srlv_fn, srav_fn,
+        jr_fn, jalr_fn, movz_fn, movn_fn,           // 08
+        syscall_fn, break_fn, spim_fn, sync_fn,
+        mfhi_fn, mthi_fn, mflo_fn, mtlo_fn,         // 10
+        dsllv_fn, rsrv_spec_2, dsrlv_fn, dsrav_fn,
+        mult_fn, multu_fn, div_fn, divu_fn,         // 18
+        dmult_fn, dmultu_fn, ddiv_fn, ddivu_fn,
+        add_fn, addu_fn, sub_fn, subu_fn,           // 20
+        and_fn, or_fn, xor_fn, nor_fn,
+        rsrv_spec_3, rsrv_spec_4, slt_fn, sltu_fn,  // 28
+        dadd_fn, daddu_fn, dsub_fn, dsubu_fn,
+        tge_fn, tgeu_fn, tlt_fn, tltu_fn,           // 30
+        teq_fn, rsrv_spec_5, tne_fn, rsrv_spec_6,
+        dsll_fn, rsrv_spec_7, dsrl_fn, dsra_fn,     // 38
+        dsll32_fn, rsrv_spec_8, dsrl32_fn, dsra32_fn
+    };
+
+    // func field for spec2 opcode
+    enum func_spec2_op {
+        madd_fn, maddu_fn, mul_fn, rsrv_spec2_3,
+        msub_fn, msubu_fn,
+        clz_fn = 0x20, clo_fn,
+        dclz_fn = 0x24, dclo_fn,
+        sdbbp_fn = 0x3f
+    };
+
+    // func field for spec3 opcode
+    enum func_spec3_op {
+        ext_fn, dextm_fn, dextu_fn, dext_fn,
+        ins_fn, dinsm_fn, dinsu_fn, dins_fn,
+        bshfl_fn = 0x20,
+        dbshfl_fn = 0x24,
+        rdhwr_fn = 0x3b
+    };
+
+    // sa field for spec3 opcodes, with BSHFL function
+    enum func_spec3_bshfl {
+        wsbh_fn = 0x02,
+        seb_fn = 0x10,
+        seh_fn = 0x18
+    };
+
+    // rt field of regimm opcodes.
+    enum regimm_fn {
+        bltz_fn, bgez_fn, bltzl_fn, bgezl_fn,
+        rsrv_ri_fn4, rsrv_ri_fn5, rsrv_ri_fn6, rsrv_ri_fn7,
+        tgei_fn, tgeiu_fn, tlti_fn, tltiu_fn,
+        teqi_fn, rsrv_ri_fn_0d, tnei_fn, rsrv_ri_fn0f,
+        bltzal_fn, bgezal_fn, bltzall_fn, bgezall_fn,
+        bposge32_fn= 0x1c,
+        synci_fn = 0x1f
+    };
+
+
+    // func field for mad opcodes (MIPS IV).
+    enum mad_func {
+        madd_fp_op      = 0x08, msub_fp_op      = 0x0a,
+        nmadd_fp_op     = 0x0c, nmsub_fp_op     = 0x0e
+    };
+
+
+    enum mips_inst_shifts {
+        OP_SHF       = 26,
+        JTARGET_SHF  = 0,
+        RS_SHF       = 21,
+        RT_SHF       = 16,
+        RD_SHF       = 11,
+        RE_SHF       = 6,
+        SA_SHF       = RE_SHF,  // synonym
+        IMM_SHF      = 0,
+        FUNC_SHF     = 0,
+
+        // mask values
+        MSK_16       = 0xffff,
+
+
+        CACHEOP_SHF  = 18,
+        CACHESEL_SHF = 16,
+    };
+};
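+// Field-packing sketch (reference only): "addu v0, a0, a1" assembles
+// from the enums above as
+//   (spec_op<<OP_SHF) | (4<<RS_SHF) | (5<<RT_SHF) | (2<<RD_SHF) | addu_fn
+// == (4<<21) | (5<<16) | (2<<11) | 0x21 == 0x00851021.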
+
+enum mips_regnames {
+    R_zero = 0,
+    R_at,   R_v0,   R_v1,   R_a0,   R_a1,   R_a2,   R_a3,
+#if __mips_isa_rev < 6
+    R_t0,   R_t1,   R_t2,   R_t3,   R_t4,   R_t5,   R_t6,   R_t7,
+#else
+    R_a4,   R_a5,   R_a6,   R_a7,   R_t0,   R_t1,   R_t2,   R_t3,
+#endif
+    R_s0,   R_s1,   R_s2,   R_s3,   R_s4,   R_s5,   R_s6,   R_s7,
+    R_t8,   R_t9,   R_k0,   R_k1,   R_gp,   R_sp,   R_s8,   R_ra,
+    R_lr = R_s8,
+
+    // arm regs 0-15 are mips regs 2-17 (meaning s0 & s1 are used)
+    R_at2  = R_s2,    // R_at2 = 18 = s2
+    R_cmp  = R_s3,    // R_cmp = 19 = s3
+    R_cmp2 = R_s4     // R_cmp2 = 20 = s4
+};
+
+
+
+}; // namespace android
+
+#endif //ANDROID_MIPSASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/armreg.h b/libpixelflinger/codeflinger/armreg.h
new file mode 100644
index 0000000..fde81ba
--- /dev/null
+++ b/libpixelflinger/codeflinger/armreg.h
@@ -0,0 +1,300 @@
+/*	$NetBSD: armreg.h,v 1.28 2003/10/31 16:30:15 scw Exp $	*/
+
+/*-
+ * Copyright (c) 1998, 2001 Ben Harris
+ * Copyright (c) 1994-1996 Mark Brinicombe.
+ * Copyright (c) 1994 Brini.
+ * All rights reserved.
+ *
+ * This code is derived from software written for Brini by Mark Brinicombe
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Brini.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/armreg.h,v 1.3 2005/11/21 19:06:25 cognet Exp $
+ */
+
+#ifndef MACHINE_ARMREG_H
+#define MACHINE_ARMREG_H
+#define INSN_SIZE	4
+#define INSN_COND_MASK	0xf0000000	/* Condition mask */
+#define PSR_MODE        0x0000001f      /* mode mask */
+#define PSR_USR26_MODE  0x00000000
+#define PSR_FIQ26_MODE  0x00000001
+#define PSR_IRQ26_MODE  0x00000002
+#define PSR_SVC26_MODE  0x00000003
+#define PSR_USR32_MODE  0x00000010
+#define PSR_FIQ32_MODE  0x00000011
+#define PSR_IRQ32_MODE  0x00000012
+#define PSR_SVC32_MODE  0x00000013
+#define PSR_ABT32_MODE  0x00000017
+#define PSR_UND32_MODE  0x0000001b
+#define PSR_SYS32_MODE  0x0000001f
+#define PSR_32_MODE     0x00000010
+#define PSR_FLAGS	0xf0000000    /* flags */
+
+#define PSR_C_bit (1 << 29)       /* carry */
+
+/* The high-order byte is always the implementor */
+#define CPU_ID_IMPLEMENTOR_MASK	0xff000000
+#define CPU_ID_ARM_LTD		0x41000000 /* 'A' */
+#define CPU_ID_DEC		0x44000000 /* 'D' */
+#define CPU_ID_INTEL		0x69000000 /* 'i' */
+#define	CPU_ID_TI		0x54000000 /* 'T' */
+
+/* How to decide what format the CPUID is in. */
+#define CPU_ID_ISOLD(x)		(((x) & 0x0000f000) == 0x00000000)
+#define CPU_ID_IS7(x)		(((x) & 0x0000f000) == 0x00007000)
+#define CPU_ID_ISNEW(x)		(!CPU_ID_ISOLD(x) && !CPU_ID_IS7(x))
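+/* For example: CPU_ID_ARM610 (0x41560610) has bits [15:12] == 0, so
+ * CPU_ID_ISOLD() matches; CPU_ID_ARM710 (0x41007100) has 7 there, so
+ * CPU_ID_IS7() matches; anything else (e.g. ARM920T) is CPU_ID_ISNEW(). */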
+
+/* On ARM3 and ARM6, this byte holds the foundry ID. */
+#define CPU_ID_FOUNDRY_MASK	0x00ff0000
+#define CPU_ID_FOUNDRY_VLSI	0x00560000
+
+/* On ARM7 it holds the architecture and variant (sub-model) */
+#define CPU_ID_7ARCH_MASK	0x00800000
+#define CPU_ID_7ARCH_V3		0x00000000
+#define CPU_ID_7ARCH_V4T	0x00800000
+#define CPU_ID_7VARIANT_MASK	0x007f0000
+
+/* On more recent ARMs, it does the same, but in a different format */
+#define CPU_ID_ARCH_MASK	0x000f0000
+#define CPU_ID_ARCH_V3		0x00000000
+#define CPU_ID_ARCH_V4		0x00010000
+#define CPU_ID_ARCH_V4T		0x00020000
+#define CPU_ID_ARCH_V5		0x00030000
+#define CPU_ID_ARCH_V5T		0x00040000
+#define CPU_ID_ARCH_V5TE	0x00050000
+#define CPU_ID_VARIANT_MASK	0x00f00000
+
+/* Next three nybbles are part number */
+#define CPU_ID_PARTNO_MASK	0x0000fff0
+
+/* Intel XScale has sub fields in part number */
+#define CPU_ID_XSCALE_COREGEN_MASK	0x0000e000 /* core generation */
+#define CPU_ID_XSCALE_COREREV_MASK	0x00001c00 /* core revision */
+#define CPU_ID_XSCALE_PRODUCT_MASK	0x000003f0 /* product number */
+
+/* And finally, the revision number. */
+#define CPU_ID_REVISION_MASK	0x0000000f
+
+/* Individual CPUs are probably best IDed by everything but the revision. */
+#define CPU_ID_CPU_MASK		0xfffffff0
+
+/* Fake CPU IDs for ARMs without CP15 */
+#define CPU_ID_ARM2		0x41560200
+#define CPU_ID_ARM250		0x41560250
+
+/* Pre-ARM7 CPUs -- [15:12] == 0 */
+#define CPU_ID_ARM3		0x41560300
+#define CPU_ID_ARM600		0x41560600
+#define CPU_ID_ARM610		0x41560610
+#define CPU_ID_ARM620		0x41560620
+
+/* ARM7 CPUs -- [15:12] == 7 */
+#define CPU_ID_ARM700		0x41007000 /* XXX This is a guess. */
+#define CPU_ID_ARM710		0x41007100
+#define CPU_ID_ARM7500		0x41027100 /* XXX This is a guess. */
+#define CPU_ID_ARM710A		0x41047100 /* inc ARM7100 */
+#define CPU_ID_ARM7500FE	0x41077100
+#define CPU_ID_ARM710T		0x41807100
+#define CPU_ID_ARM720T		0x41807200
+#define CPU_ID_ARM740T8K	0x41807400 /* XXX no MMU, 8KB cache */
+#define CPU_ID_ARM740T4K	0x41817400 /* XXX no MMU, 4KB cache */
+
+/* Post-ARM7 CPUs */
+#define CPU_ID_ARM810		0x41018100
+#define CPU_ID_ARM920T		0x41129200
+#define CPU_ID_ARM920T_ALT	0x41009200
+#define CPU_ID_ARM922T		0x41029220
+#define CPU_ID_ARM940T		0x41029400 /* XXX no MMU */
+#define CPU_ID_ARM946ES		0x41049460 /* XXX no MMU */
+#define	CPU_ID_ARM966ES		0x41049660 /* XXX no MMU */
+#define	CPU_ID_ARM966ESR1	0x41059660 /* XXX no MMU */
+#define CPU_ID_ARM1020E		0x4115a200 /* (AKA arm10 rev 1) */
+#define CPU_ID_ARM1022ES	0x4105a220
+#define CPU_ID_SA110		0x4401a100
+#define CPU_ID_SA1100		0x4401a110
+#define	CPU_ID_TI925T		0x54029250
+#define CPU_ID_SA1110		0x6901b110
+#define CPU_ID_IXP1200		0x6901c120
+#define CPU_ID_80200		0x69052000
+#define CPU_ID_PXA250    	0x69052100 /* sans core revision */
+#define CPU_ID_PXA210    	0x69052120
+#define CPU_ID_PXA250A		0x69052100 /* 1st version Core */
+#define CPU_ID_PXA210A		0x69052120 /* 1st version Core */
+#define CPU_ID_PXA250B		0x69052900 /* 3rd version Core */
+#define CPU_ID_PXA210B		0x69052920 /* 3rd version Core */
+#define CPU_ID_PXA250C		0x69052d00 /* 4th version Core */
+#define CPU_ID_PXA210C		0x69052d20 /* 4th version Core */
+#define	CPU_ID_80321_400	0x69052420
+#define	CPU_ID_80321_600	0x69052430
+#define	CPU_ID_80321_400_B0	0x69052c20
+#define	CPU_ID_80321_600_B0	0x69052c30
+#define	CPU_ID_IXP425_533	0x690541c0
+#define	CPU_ID_IXP425_400	0x690541d0
+#define	CPU_ID_IXP425_266	0x690541f0
+
+/* ARM3-specific coprocessor 15 registers */
+#define ARM3_CP15_FLUSH		1
+#define ARM3_CP15_CONTROL	2
+#define ARM3_CP15_CACHEABLE	3
+#define ARM3_CP15_UPDATEABLE	4
+#define ARM3_CP15_DISRUPTIVE	5	
+
+/* ARM3 Control register bits */
+#define ARM3_CTL_CACHE_ON	0x00000001
+#define ARM3_CTL_SHARED		0x00000002
+#define ARM3_CTL_MONITOR	0x00000004
+
+/*
+ * Post-ARM3 CP15 registers:
+ *
+ *	1	Control register
+ *
+ *	2	Translation Table Base
+ *
+ *	3	Domain Access Control
+ *
+ *	4	Reserved
+ *
+ *	5	Fault Status
+ *
+ *	6	Fault Address
+ *
+ *	7	Cache/write-buffer Control
+ *
+ *	8	TLB Control
+ *
+ *	9	Cache Lockdown
+ *
+ *	10	TLB Lockdown
+ *
+ *	11	Reserved
+ *
+ *	12	Reserved
+ *
+ *	13	Process ID (for FCSE)
+ *
+ *	14	Reserved
+ *
+ *	15	Implementation Dependent
+ */
+
+/* Some of the definitions below need cleaning up for V3/V4 architectures */
+
+/* CPU control register (CP15 register 1) */
+#define CPU_CONTROL_MMU_ENABLE	0x00000001 /* M: MMU/Protection unit enable */
+#define CPU_CONTROL_AFLT_ENABLE	0x00000002 /* A: Alignment fault enable */
+#define CPU_CONTROL_DC_ENABLE	0x00000004 /* C: IDC/DC enable */
+#define CPU_CONTROL_WBUF_ENABLE 0x00000008 /* W: Write buffer enable */
+#define CPU_CONTROL_32BP_ENABLE 0x00000010 /* P: 32-bit exception handlers */
+#define CPU_CONTROL_32BD_ENABLE 0x00000020 /* D: 32-bit addressing */
+#define CPU_CONTROL_LABT_ENABLE 0x00000040 /* L: Late abort enable */
+#define CPU_CONTROL_BEND_ENABLE 0x00000080 /* B: Big-endian mode */
+#define CPU_CONTROL_SYST_ENABLE 0x00000100 /* S: System protection bit */
+#define CPU_CONTROL_ROM_ENABLE	0x00000200 /* R: ROM protection bit */
+#define CPU_CONTROL_CPCLK	0x00000400 /* F: Implementation defined */
+#define CPU_CONTROL_BPRD_ENABLE 0x00000800 /* Z: Branch prediction enable */
+#define CPU_CONTROL_IC_ENABLE   0x00001000 /* I: IC enable */
+#define CPU_CONTROL_VECRELOC	0x00002000 /* V: Vector relocation */
+#define CPU_CONTROL_ROUNDROBIN	0x00004000 /* RR: Predictable replacement */
+#define CPU_CONTROL_V4COMPAT	0x00008000 /* L4: ARMv4 compat LDR R15 etc */
+
+#define CPU_CONTROL_IDC_ENABLE	CPU_CONTROL_DC_ENABLE
+
+/* XScale Auxiliary Control Register (CP15 register 1, opcode2 1) */
+#define	XSCALE_AUXCTL_K		0x00000001 /* dis. write buffer coalescing */
+#define	XSCALE_AUXCTL_P		0x00000002 /* ECC protect page table access */
+#define	XSCALE_AUXCTL_MD_WB_RA	0x00000000 /* mini-D$ wb, read-allocate */
+#define	XSCALE_AUXCTL_MD_WB_RWA	0x00000010 /* mini-D$ wb, read/write-allocate */
+#define	XSCALE_AUXCTL_MD_WT	0x00000020 /* mini-D$ wt, read-allocate */
+#define	XSCALE_AUXCTL_MD_MASK	0x00000030
+
+/* Cache type register definitions */
+#define	CPU_CT_ISIZE(x)		((x) & 0xfff)		/* I$ info */
+#define	CPU_CT_DSIZE(x)		(((x) >> 12) & 0xfff)	/* D$ info */
+#define	CPU_CT_S		(1U << 24)		/* split cache */
+#define	CPU_CT_CTYPE(x)		(((x) >> 25) & 0xf)	/* cache type */
+
+#define	CPU_CT_CTYPE_WT		0	/* write-through */
+#define	CPU_CT_CTYPE_WB1	1	/* write-back, clean w/ read */
+#define	CPU_CT_CTYPE_WB2	2	/* w/b, clean w/ cp15,7 */
+#define	CPU_CT_CTYPE_WB6	6	/* w/b, cp15,7, lockdown fmt A */
+#define	CPU_CT_CTYPE_WB7	7	/* w/b, cp15,7, lockdown fmt B */
+
+#define	CPU_CT_xSIZE_LEN(x)	((x) & 0x3)		/* line size */
+#define	CPU_CT_xSIZE_M		(1U << 2)		/* multiplier */
+#define	CPU_CT_xSIZE_ASSOC(x)	(((x) >> 3) & 0x7)	/* associativity */
+#define	CPU_CT_xSIZE_SIZE(x)	(((x) >> 6) & 0x7)	/* size */
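+
+/*
+ * Hypothetical decoding sketch for a cache type register value 'ct':
+ *
+ *	u_int dinfo = CPU_CT_DSIZE(ct);
+ *	u_int assoc = CPU_CT_xSIZE_ASSOC(dinfo);
+ *	int   split = (ct & CPU_CT_S) != 0;
+ */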
+
+/* Fault status register definitions */
+
+#define FAULT_TYPE_MASK 0x0f
+#define FAULT_USER      0x10
+
+#define FAULT_WRTBUF_0  0x00 /* Vector Exception */
+#define FAULT_WRTBUF_1  0x02 /* Terminal Exception */
+#define FAULT_BUSERR_0  0x04 /* External Abort on Linefetch -- Section */
+#define FAULT_BUSERR_1  0x06 /* External Abort on Linefetch -- Page */
+#define FAULT_BUSERR_2  0x08 /* External Abort on Non-linefetch -- Section */
+#define FAULT_BUSERR_3  0x0a /* External Abort on Non-linefetch -- Page */
+#define FAULT_BUSTRNL1  0x0c /* External abort on Translation -- Level 1 */
+#define FAULT_BUSTRNL2  0x0e /* External abort on Translation -- Level 2 */
+#define FAULT_ALIGN_0   0x01 /* Alignment */
+#define FAULT_ALIGN_1   0x03 /* Alignment */
+#define FAULT_TRANS_S   0x05 /* Translation -- Section */
+#define FAULT_TRANS_P   0x07 /* Translation -- Page */
+#define FAULT_DOMAIN_S  0x09 /* Domain -- Section */
+#define FAULT_DOMAIN_P  0x0b /* Domain -- Page */
+#define FAULT_PERM_S    0x0d /* Permission -- Section */
+#define FAULT_PERM_P    0x0f /* Permission -- Page */
+
+#define	FAULT_IMPRECISE	0x400	/* Imprecise exception (XSCALE) */
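+
+/*
+ * Hypothetical sketch: an abort handler could classify a fault with
+ *
+ *	switch (fsr & FAULT_TYPE_MASK) {
+ *	case FAULT_PERM_S:
+ *		...
+ *	}
+ */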
+
+/*
+ * Address of the vector page, low and high versions.
+ */
+#define	ARM_VECTORS_LOW		0x00000000U
+#define	ARM_VECTORS_HIGH	0xffff0000U
+
+/*
+ * ARM Instructions
+ *
+ *       3 3 2 2 2                              
+ *       1 0 9 8 7                                                     0
+ *      +-------+-------------------------------------------------------+
+ *      | cond  |              instruction dependent                    |
+ *      |c c c c|                                                       |
+ *      +-------+-------------------------------------------------------+
+ */
+
+#define INSN_SIZE		4		/* Always 4 bytes */
+#define INSN_COND_MASK		0xf0000000	/* Condition mask */
+#define INSN_COND_AL		0xe0000000	/* Always condition */
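+
+/*
+ * Sketch (for reference): the condition field of a fetched instruction
+ * can be tested with these masks, e.g. an always-executed (AL)
+ * instruction satisfies
+ *
+ *	(insn & INSN_COND_MASK) == INSN_COND_AL
+ */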
+
+#endif /* !MACHINE_ARMREG_H */
diff --git a/libpixelflinger/codeflinger/blending.cpp b/libpixelflinger/codeflinger/blending.cpp
new file mode 100644
index 0000000..2cbb00f
--- /dev/null
+++ b/libpixelflinger/codeflinger/blending.cpp
@@ -0,0 +1,674 @@
+/* libs/pixelflinger/codeflinger/blending.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "pixelflinger-code"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <android-base/macros.h>
+#include <log/log.h>
+
+#include "GGLAssembler.h"
+
+namespace android {
+
+void GGLAssembler::build_fog(
+                        component_t& temp,      // incoming fragment / output
+                        int component,
+                        Scratch& regs)
+{
+    if (mInfo[component].fog) {
+        Scratch scratches(registerFile());
+        comment("fog");
+
+        integer_t fragment(temp.reg, temp.h, temp.flags);
+        if (!(temp.flags & CORRUPTIBLE)) {
+            temp.reg = regs.obtain();
+            temp.flags |= CORRUPTIBLE;
+        }
+
+        integer_t fogColor(scratches.obtain(), 8, CORRUPTIBLE); 
+        LDRB(AL, fogColor.reg, mBuilderContext.Rctx,
+                immed12_pre(GGL_OFFSETOF(state.fog.color[component])));
+
+        integer_t factor(scratches.obtain(), 16, CORRUPTIBLE);
+        CONTEXT_LOAD(factor.reg, generated_vars.f);
+
+        // clamp fog factor (TODO: see if there is a way to guarantee
+        // we won't overflow when setting the iterators)
+        BIC(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, ASR, 31));
+        CMP(AL, factor.reg, imm( 0x10000 ));
+        MOV(HS, 0, factor.reg, imm( 0x10000 ));
+
+        build_blendFOneMinusF(temp, factor, fragment, fogColor);
+    }
+}
+
+void GGLAssembler::build_blending(
+                        component_t& temp,      // incoming fragment / output
+                        const pixel_t& pixel,   // framebuffer
+                        int component,
+                        Scratch& regs)
+{
+    if (!mInfo[component].blend)
+        return;
+
+    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA)
+        fs = GGL_ONE;
+    const int blending = blending_codes(fs, fd);
+    if (!temp.size()) {
+        // here, blending will produce something which doesn't depend on
+        // that component (e.g. GL_ZERO:GL_*), so the register has not been
+        // allocated yet. It will never be used as a source.
+        temp = component_t(regs.obtain(), CORRUPTIBLE);
+    }
+
+    // we are doing real blending...
+    // fb:          extracted dst
+    // fragment:    extracted src
+    // temp:        component_t(fragment) and result
+
+    // scoped register allocator
+    Scratch scratches(registerFile());
+    comment("blending");
+
+    // we can optimize these cases a bit...
+    // (1) saturation is not needed
+    // (2) we can use only one multiply instead of 2
+    // (3) we can reduce the register pressure
+    //      R = S*f + D*(1-f) = (S-D)*f + D
+    //      R = S*(1-f) + D*f = (D-S)*f + S
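+    //      (these identities are what build_blendFOneMinusF and
+    //      build_blendOneMinusFF below implement: one subtract followed
+    //      by a multiply-accumulate)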
+
+    const bool same_factor_opt1 =
+        (fs==GGL_DST_COLOR && fd==GGL_ONE_MINUS_DST_COLOR) ||
+        (fs==GGL_SRC_COLOR && fd==GGL_ONE_MINUS_SRC_COLOR) ||
+        (fs==GGL_DST_ALPHA && fd==GGL_ONE_MINUS_DST_ALPHA) ||
+        (fs==GGL_SRC_ALPHA && fd==GGL_ONE_MINUS_SRC_ALPHA);
+
+    const bool same_factor_opt2 =
+        (fs==GGL_ONE_MINUS_DST_COLOR && fd==GGL_DST_COLOR) ||
+        (fs==GGL_ONE_MINUS_SRC_COLOR && fd==GGL_SRC_COLOR) || 
+        (fs==GGL_ONE_MINUS_DST_ALPHA && fd==GGL_DST_ALPHA) ||
+        (fs==GGL_ONE_MINUS_SRC_ALPHA && fd==GGL_SRC_ALPHA);
+
+
+    // XXX: we could also optimize these cases:
+    // R = S*f + D*f = (S+D)*f
+    // R = S*(1-f) + D*(1-f) = (S+D)*(1-f)
+    // R = S*D + D*S = 2*S*D
+
+
+    // see if we need to extract 'component' from the destination (fb)
+    integer_t fb;
+    if (blending & (BLEND_DST|FACTOR_DST)) { 
+        fb.setTo(scratches.obtain(), 32); 
+        extract(fb, pixel, component);
+        if (mDithering) {
+            // XXX: maybe what we should do instead, is simply
+            // expand fb -or- fragment to the larger of the two
+            if (fb.size() < temp.size()) {
+                // for now we expand 'fb' to min(fragment, 8)
+                int new_size = temp.size() < 8 ? temp.size() : 8;
+                expand(fb, fb, new_size);
+            }
+        }
+    }
+
+
+    // convert input fragment to integer_t
+    if (temp.l && (temp.flags & CORRUPTIBLE)) {
+        MOV(AL, 0, temp.reg, reg_imm(temp.reg, LSR, temp.l));
+        temp.h -= temp.l;
+        temp.l = 0;
+    }
+    integer_t fragment(temp.reg, temp.size(), temp.flags);
+
+    // if not done yet, convert input fragment to integer_t
+    if (temp.l) {
+        // here we know temp is not CORRUPTIBLE
+        fragment.reg = scratches.obtain();
+        MOV(AL, 0, fragment.reg, reg_imm(temp.reg, LSR, temp.l));
+        fragment.flags |= CORRUPTIBLE;
+    }
+
+    if (!(temp.flags & CORRUPTIBLE)) {
+        // temp is not corruptible, but since it's the destination it
+        // will be modified, so we need to allocate a new register.
+        temp.reg = regs.obtain();
+        temp.flags &= ~CORRUPTIBLE;
+        fragment.flags &= ~CORRUPTIBLE;
+    }
+
+    if ((blending & BLEND_SRC) && !same_factor_opt1) {
+        // source (fragment) is needed for the blending stage
+        // so it's not CORRUPTIBLE (unless we're doing same_factor_opt1)
+        fragment.flags &= ~CORRUPTIBLE;
+    }
+
+
+    if (same_factor_opt1) {
+        //  R = S*f + D*(1-f) = (S-D)*f + D
+        integer_t factor;
+        build_blend_factor(factor, fs, 
+                component, pixel, fragment, fb, scratches);
+        // fb is always corruptible from this point
+        fb.flags |= CORRUPTIBLE;
+        build_blendFOneMinusF(temp, factor, fragment, fb);
+    } else if (same_factor_opt2) {
+        //  R = S*(1-f) + D*f = (D-S)*f + S
+        integer_t factor;
+        // fb is always corruptible here
+        fb.flags |= CORRUPTIBLE;
+        build_blend_factor(factor, fd,
+                component, pixel, fragment, fb, scratches);
+        build_blendOneMinusFF(temp, factor, fragment, fb);
+    } else {
+        integer_t src_factor;
+        integer_t dst_factor;
+
+        // if destination (fb) is not needed for the blending stage, 
+        // then it can be marked as CORRUPTIBLE
+        if (!(blending & BLEND_DST)) {
+            fb.flags |= CORRUPTIBLE;
+        }
+
+        // XXX: try to mark some registers as CORRUPTIBLE
+        // in most cases we could make those corruptible
+        // when we're processing the last component
+        // but not always, for instance
+        //    when fragment is constant and not reloaded
+        //    when fb is needed for logic-ops or masking
+        //    when a register is aliased (for instance with mAlphaSource)
+
+        // blend away...
+        if (fs==GGL_ZERO) {
+            if (fd==GGL_ZERO) {         // R = 0
+                // already taken care of
+            } else if (fd==GGL_ONE) {   // R = D
+                // already taken care of
+            } else {                    // R = D*fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor(temp, fb, dst_factor);
+            }
+        } else if (fs==GGL_ONE) {
+            if (fd==GGL_ZERO) {         // R = S
+                // NOP, taken care of
+            } else if (fd==GGL_ONE) {   // R = S + D
+                component_add(temp, fb, fragment); // args order matters
+                component_sat(temp);
+            } else {                    // R = S + D*fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor_add(temp, fb, dst_factor, component_t(fragment));
+                component_sat(temp);
+            }
+        } else {
+            // compute fs
+            build_blend_factor(src_factor, fs, 
+                    component, pixel, fragment, fb, scratches);
+            if (fd==GGL_ZERO) {         // R = S*fs
+                mul_factor(temp, fragment, src_factor);
+            } else if (fd==GGL_ONE) {   // R = S*fs + D
+                mul_factor_add(temp, fragment, src_factor, component_t(fb));
+                component_sat(temp);
+            } else {                    // R = S*fs + D*fd
+                mul_factor(temp, fragment, src_factor);
+                if (scratches.isUsed(src_factor.reg))
+                    scratches.recycle(src_factor.reg);
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor_add(temp, fb, dst_factor, temp);
+                if (!same_factor_opt1 && !same_factor_opt2) {
+                    component_sat(temp);
+                }
+            }
+        }
+    }
+
+    // now we can be corrupted (it's the dest)
+    temp.flags |= CORRUPTIBLE;
+}
+
+void GGLAssembler::build_blend_factor(
+        integer_t& factor, int f, int component,
+        const pixel_t& dst_pixel,
+        integer_t& fragment,
+        integer_t& fb,
+        Scratch& scratches)
+{
+    integer_t src_alpha(fragment);
+
+    // src_factor/dst_factor won't be used after blending,
+    // so it's fine to mark them as CORRUPTIBLE (if not aliased)
+    factor.flags |= CORRUPTIBLE;
+
+    switch(f) {
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        if (component==GGLFormat::ALPHA && !isAlphaSourceNeeded()) {
+            // we're processing alpha, so we already have
+            // src-alpha in fragment, and we need src-alpha just this time.
+        } else {
+            // alpha-src will be needed for other components
+            if (!mBlendFactorCached || mBlendFactorCached==f) {
+                src_alpha = mAlphaSource;
+                factor = mAlphaSource;
+                factor.flags &= ~CORRUPTIBLE;           
+                // we already computed the blend factor before, nothing to do.
+                if (mBlendFactorCached)
+                    return;
+                // this is the first time, make sure to compute the blend
+                // factor properly.
+                mBlendFactorCached = f;
+                break;
+            } else {
+                // we have a cached alpha blend factor, but we want another
+                // one. This should really not happen because, by construction,
+                // we cannot have BOTH the source and destination blend
+                // factors use ALPHA *and* ONE_MINUS_ALPHA (the blending
+                // stage uses the f/(1-f) optimization).
+
+                // for completeness, we handle this case though. Since there
+                // are only 2 choices, this means we want "the other one",
+                // i.e. (1-factor)
+                factor = mAlphaSource;
+                factor.flags &= ~CORRUPTIBLE;           
+                RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s)));
+                mBlendFactorCached = f;
+                return;
+            }                
+        }
+        FALLTHROUGH_INTENDED;
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+    case GGL_SRC_ALPHA_SATURATE:
+        // figure out which register we can use for the blend-factor:
+        // CORRUPTIBLE registers are chosen first, or a new one is allocated.
+        if (fragment.flags & CORRUPTIBLE) {
+            factor.setTo(fragment.reg, 32, CORRUPTIBLE);
+            fragment.flags &= ~CORRUPTIBLE;
+        } else if (fb.flags & CORRUPTIBLE) {
+            factor.setTo(fb.reg, 32, CORRUPTIBLE);
+            fb.flags &= ~CORRUPTIBLE;
+        } else {
+            factor.setTo(scratches.obtain(), 32, CORRUPTIBLE);
+        } 
+        break;
+    }
+
+    // XXX: doesn't work if size==1
+
+    switch(f) {
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        factor.s = fb.s;
+        ADD(AL, 0, factor.reg, fb.reg, reg_imm(fb.reg, LSR, fb.s-1));
+        break;
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+        factor.s = fragment.s;
+        ADD(AL, 0, factor.reg, fragment.reg,
+            reg_imm(fragment.reg, LSR, fragment.s-1));
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        factor.s = src_alpha.s;
+        ADD(AL, 0, factor.reg, src_alpha.reg,
+                reg_imm(src_alpha.reg, LSR, src_alpha.s-1));
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        // XXX: should be precomputed
+        extract(factor, dst_pixel, GGLFormat::ALPHA);
+        ADD(AL, 0, factor.reg, factor.reg,
+                reg_imm(factor.reg, LSR, factor.s-1));
+        break;
+    case GGL_SRC_ALPHA_SATURATE:
+        // XXX: should be precomputed
+        // XXX: f = min(As, 1-Ad)
+        // btw, we're guaranteed that Ad's size is <= 8, because
+        // it's extracted from the framebuffer
+        break;
+    }
+
+    switch(f) {
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_ONE_MINUS_SRC_ALPHA:
+        RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s)));
+    }
+    
+    // don't need more than 8-bits for the blend factor
+    // and this will prevent overflows in the multiplies later
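+    // (mul_factor() logs an error once v.size()+f.size() reaches 32, so an
+    // 8-bit factor leaves room for values up to 23 bits)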
+    if (factor.s > 8) {
+        MOV(AL, 0, factor.reg, reg_imm(factor.reg, LSR, factor.s-8));
+        factor.s = 8;
+    }
+}
+
+int GGLAssembler::blending_codes(int fs, int fd)
+{
+    int blending = 0;
+    switch(fs) {
+    case GGL_ONE:
+        blending |= BLEND_SRC;
+        break;
+
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        blending |= FACTOR_DST|BLEND_SRC;
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        // no need to extract 'component' from the destination
+        // for the blend factor, because we need ALPHA only.
+        blending |= BLEND_SRC;
+        break;
+
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:    
+        blending |= FACTOR_SRC|BLEND_SRC;
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+    case GGL_SRC_ALPHA_SATURATE:
+        blending |= FACTOR_SRC|BLEND_SRC;
+        break;
+    }
+    switch(fd) {
+    case GGL_ONE:
+        blending |= BLEND_DST;
+        break;
+
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        blending |= FACTOR_DST|BLEND_DST;
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        blending |= FACTOR_DST|BLEND_DST;
+        break;
+
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:    
+        blending |= FACTOR_SRC|BLEND_DST;
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        // no need to extract 'component' from the source
+        // for the blend factor, because we need ALPHA only.
+        blending |= BLEND_DST;
+        break;
+    }
+    return blending;
+}
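+
+// For reference: classic transparency, fs=GGL_SRC_ALPHA with
+// fd=GGL_ONE_MINUS_SRC_ALPHA, yields FACTOR_SRC|BLEND_SRC|BLEND_DST:
+// both source and destination take part in the blend, and the source
+// is also needed to build the blend factor.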
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_blendFOneMinusF(
+        component_t& temp,
+        const integer_t& factor, 
+        const integer_t& fragment,
+        const integer_t& fb)
+{
+    //  R = S*f + D*(1-f) = (S-D)*f + D
+    Scratch scratches(registerFile());
+    // compute S-D
+    integer_t diff(fragment.flags & CORRUPTIBLE ?
+            fragment.reg : scratches.obtain(), fb.size(), CORRUPTIBLE);
+    const int shift = fragment.size() - fb.size();
+    if (shift>0)        RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, shift));
+    else if (shift<0)   RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL,-shift));
+    else                RSB(AL, 0, diff.reg, fb.reg, fragment.reg);
+    mul_factor_add(temp, diff, factor, component_t(fb));
+}
+
+void GGLAssembler::build_blendOneMinusFF(
+        component_t& temp,
+        const integer_t& factor, 
+        const integer_t& fragment,
+        const integer_t& fb)
+{
+    //  R = S*(1-f) + D*f = (D-S)*f + S
+    Scratch scratches(registerFile());
+    // compute D-S
+    integer_t diff(fb.flags & CORRUPTIBLE ?
+            fb.reg : scratches.obtain(), fb.size(), CORRUPTIBLE);
+    const int shift = fragment.size() - fb.size();
+    if (shift>0)        SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, shift));
+    else if (shift<0)   SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL,-shift));
+    else                SUB(AL, 0, diff.reg, fb.reg, fragment.reg);
+    mul_factor_add(temp, diff, factor, component_t(fragment));
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::mul_factor(  component_t& d,
+                                const integer_t& v,
+                                const integer_t& f)
+{
+    int vs = v.size();
+    int fs = f.size();
+    int ms = vs+fs;
+
+    // XXX: we could have special cases for 1 bit mul
+
+    // all the code below selects the best multiply instruction
+    // with respect to the operand sizes. We take advantage of the fact
+    // that the 16-bit multiplies allow a 16-bit shift.
+    // The trick is that we just make sure that we keep at least 8 bits
+    // per component (which is enough for an 8-bit display).
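+    //
+    // For reference: SMULxy multiplies two signed 16-bit halves (B=bottom,
+    // T=top) of its operands; SMULWy multiplies a 32-bit operand by a
+    // 16-bit half and keeps the top 32 bits of the 48-bit product (an
+    // implicit >>16). That implicit shift is why 'ms' is reduced by 16
+    // whenever a top half or SMULW is selected below.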
+
+    int xy;
+    int vshift = 0;
+    int fshift = 0;
+    int smulw = 0;
+
+    if (vs<16) {
+        if (fs<16) {
+            xy = xyBB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            ms -= 16;
+            xy = xyTB;
+        } else {
+            // eg: 15 * 18  ->  15 * 15
+            fshift = fs - 15;
+            ms -= fshift;
+            xy = xyBB;
+        }
+    } else if (GGL_BETWEEN(vs, 24, 31)) {
+        if (fs<16) {
+            ms -= 16;
+            xy = xyTB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            ms -= 32;
+            xy = xyTT;
+        } else {
+            // eg: 24 * 18  ->  8 * 18
+            fshift = fs - 15;
+            ms -= 16 + fshift;
+            xy = xyTB;
+        }
+    } else {
+        if (fs<16) {
+            // eg: 18 * 15  ->  15 * 15
+            vshift = vs - 15;
+            ms -= vshift;
+            xy = xyBB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            // eg: 18 * 24  ->  15 * 8
+            vshift = vs - 15;
+            ms -= 16 + vshift;
+            xy = xyBT;
+        } else {
+            // eg: 18 * 18  ->  (15 * 18)>>16
+            fshift = fs - 15;
+            ms -= 16 + fshift;
+            xy = yB;    //XXX SMULWB
+            smulw = 1;
+        }
+    }
+
+    ALOGE_IF(ms>=32, "mul_factor overflow vs=%d, fs=%d", vs, fs);
+
+    int vreg = v.reg;
+    int freg = f.reg;
+    if (vshift) {
+        MOV(AL, 0, d.reg, reg_imm(vreg, LSR, vshift));
+        vreg = d.reg;
+    }
+    if (fshift) {
+        MOV(AL, 0, d.reg, reg_imm(freg, LSR, fshift));
+        freg = d.reg;
+    }
+    if (smulw)  SMULW(AL, xy, d.reg, vreg, freg);
+    else        SMUL(AL, xy, d.reg, vreg, freg);
+
+    d.h = ms;
+    if (mDithering) {
+        d.l = 0; 
+    } else {
+        d.l = fs; 
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::mul_factor_add(  component_t& d,
+                                    const integer_t& v,
+                                    const integer_t& f,
+                                    const component_t& a)
+{
+    // XXX: we could have special cases for 1 bit mul
+    Scratch scratches(registerFile());
+
+    int vs = v.size();
+    int fs = f.size();
+    int as = a.h;
+    int ms = vs+fs;
+
+    ALOGE_IF(ms>=32, "mul_factor_add overflow vs=%d, fs=%d, as=%d", vs, fs, as);
+
+    integer_t add(a.reg, a.h, a.flags);
+
+    // 'a' is a component_t but it is guaranteed to have
+    // its high bits set to 0. However, in the dithering case,
+    // we can't get away with truncating the potentially bad bits,
+    // so extraction is needed.
+
+    if (mDithering && (a.size() < ms)) {
+        // we need to expand a
+        if (!(a.flags & CORRUPTIBLE)) {
+            // ... but it's not corruptible, so we need to pick a
+            // temporary register.
+            // Try to use the destination register first (it's likely
+            // to be usable, unless it aliases an input).
+            if (d.reg!=a.reg && d.reg!=v.reg && d.reg!=f.reg) {
+                add.reg = d.reg;
+            } else {
+                add.reg = scratches.obtain();
+            }
+        }
+        expand(add, a, ms); // extracts and expands
+        as = ms;
+    }
+
+    if (ms == as) {
+        if (vs<16 && fs<16) SMLABB(AL, d.reg, v.reg, f.reg, add.reg);
+        else                MLA(AL, 0, d.reg, v.reg, f.reg, add.reg);
+    } else {
+        int temp = d.reg;
+        if (temp == add.reg) {
+            // the mul will modify add.reg, we need an intermediary reg
+            if (v.flags & CORRUPTIBLE)      temp = v.reg;
+            else if (f.flags & CORRUPTIBLE) temp = f.reg;
+            else                            temp = scratches.obtain();
+        }
+
+        if (vs<16 && fs<16) SMULBB(AL, temp, v.reg, f.reg);
+        else                MUL(AL, 0, temp, v.reg, f.reg);
+
+        if (ms>as) {
+            ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSL, ms-as));
+        } else if (ms<as) {
+            // not sure if we should expand the mul instead?
+            ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSR, as-ms));
+        }
+    }
+
+    d.h = ms;
+    if (mDithering) {
+        d.l = a.l; 
+    } else {
+        d.l = fs>a.l ? fs : a.l;
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::component_add(component_t& d,
+        const integer_t& dst, const integer_t& src)
+{
+    // here we're guaranteed that src.size() >= dst.size()
+    // (i.e. fragment.size() >= fb.size() at the call site)
+    const int shift = src.size() - dst.size();
+    if (!shift) {
+        ADD(AL, 0, d.reg, src.reg, dst.reg);
+    } else {
+        ADD(AL, 0, d.reg, src.reg, reg_imm(dst.reg, LSL, shift));
+    }
+
+    d.h = src.size();
+    if (mDithering) {
+        d.l = 0;
+    } else {
+        d.l = shift;
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::component_sat(const component_t& v)
+{
+    const int one = ((1<<v.size())-1)<<v.l;
+    CMP(AL, v.reg, imm( 1<<v.h ));
+    if (isValidImmediate(one)) {
+        MOV(HS, 0, v.reg, imm( one ));
+    } else if (isValidImmediate(~one)) {
+        MVN(HS, 0, v.reg, imm( ~one ));
+    } else {
+        MOV(HS, 0, v.reg, imm( 1<<v.h ));
+        SUB(HS, 0, v.reg, v.reg, imm( 1<<v.l ));
+    }
+}
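+
+// For reference: with v.h == 8 and v.l == 0 the sequence above reduces to
+//      if (v >= 0x100) v = 0xff;
+// i.e. a CMP followed by a conditional move of the saturated value.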
+
+// ----------------------------------------------------------------------------
+
+} // namespace android
+
diff --git a/libpixelflinger/codeflinger/disassem.c b/libpixelflinger/codeflinger/disassem.c
new file mode 100644
index 0000000..5cbd63d
--- /dev/null
+++ b/libpixelflinger/codeflinger/disassem.c
@@ -0,0 +1,714 @@
+/*	$NetBSD: disassem.c,v 1.14 2003/03/27 16:58:36 mycroft Exp $	*/
+
+/*-
+ * Copyright (c) 1996 Mark Brinicombe.
+ * Copyright (c) 1996 Brini.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Brini.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * RiscBSD kernel project
+ *
+ * db_disasm.c
+ *
+ * Kernel disassembler
+ *
+ * Created      : 10/02/96
+ *
+ * Structured after the sparc/sparc/db_disasm.c by David S. Miller &
+ * Paul Kranenburg
+ *
+ * This code is not complete. Not all instructions are disassembled.
+ */
+
+#include <sys/cdefs.h>
+//__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/arm/arm/disassem.c,v 1.2 2005/01/05 21:58:47 imp Exp $");
+#include <sys/param.h>
+#include <stdio.h>
+
+#include "disassem.h"
+#include "armreg.h"
+//#include <ddb/ddb.h>
+
+/*
+ * General instruction format
+ *
+ *	insn[cc][mod]	[operands]
+ *
+ * Those fields with an uppercase format code indicate that the field
+ * follows directly after the instruction, before the separator, i.e.
+ * they modify the instruction rather than just being an operand to
+ * the instruction. The only exception is the writeback flag, which
+ * follows an operand.
+ *
+ *
+ * 2 - print Operand 2 of a data processing instruction
+ * d - destination register (bits 12-15)
+ * n - n register (bits 16-19)
+ * s - s register (bits 8-11)
+ * o - indirect register rn (bits 16-19) (used by swap)
+ * m - m register (bits 0-3)
+ * a - address operand of ldr/str instruction
+ * e - address operand of ldrh/strh instruction
+ * l - register list for ldm/stm instruction
+ * f - 1st fp operand (register) (bits 12-14)
+ * g - 2nd fp operand (register) (bits 16-18)
+ * h - 3rd fp operand (register/immediate) (bits 0-4)
+ * j - xtb rotate literal (bits 10-11)
+ * i - bfx lsb literal (bits 7-11)
+ * w - bfx width literal (bits 16-20)
+ * b - branch address
+ * t - thumb branch address (bits 24, 0-23)
+ * k - breakpoint comment (bits 0-3, 8-19)
+ * X - block transfer type
+ * Y - block transfer type (r13 base)
+ * c - comment field bits(0-23)
+ * p - saved or current status register
+ * F - PSR transfer fields
+ * D - destination-is-r15 (P) flag on TST, TEQ, CMP, CMN
+ * L - co-processor transfer size
+ * S - set status flag
+ * P - fp precision
+ * Q - fp precision (for ldf/stf)
+ * R - fp rounding
+ * v - co-processor data transfer registers + addressing mode
+ * W - writeback flag
+ * x - instruction in hex
+ * # - co-processor number
+ * y - co-processor data processing registers
+ * z - co-processor register transfer registers
+ */
+
+struct arm32_insn {
+	u_int mask;
+	u_int pattern;
+	char* name;
+	char* format;
+};
+
+static const struct arm32_insn arm32_i[] = {
+    { 0x0fffffff, 0x0ff00000, "imb",	"c" },		/* Before swi */
+    { 0x0fffffff, 0x0ff00001, "imbrange",	"c" },	/* Before swi */
+    { 0x0f000000, 0x0f000000, "swi",	"c" },
+    { 0xfe000000, 0xfa000000, "blx",	"t" },		/* Before b and bl */
+    { 0x0f000000, 0x0a000000, "b",	"b" },
+    { 0x0f000000, 0x0b000000, "bl",	"b" },
+    { 0x0fe000f0, 0x00000090, "mul",	"Snms" },
+    { 0x0fe000f0, 0x00200090, "mla",	"Snmsd" },
+    { 0x0fe000f0, 0x00800090, "umull",	"Sdnms" },
+    { 0x0fe000f0, 0x00c00090, "smull",	"Sdnms" },
+    { 0x0fe000f0, 0x00a00090, "umlal",	"Sdnms" },
+    { 0x0fe000f0, 0x00e00090, "smlal",	"Sdnms" },
+    { 0x0fff03f0, 0x06cf0070, "uxtb16", "dmj" },
+    { 0x0fe00070, 0x07e00050, "ubfx",   "dmiw" },
+    { 0x0d700000, 0x04200000, "strt",	"daW" },
+    { 0x0d700000, 0x04300000, "ldrt",	"daW" },
+    { 0x0d700000, 0x04600000, "strbt",	"daW" },
+    { 0x0d700000, 0x04700000, "ldrbt",	"daW" },
+    { 0x0c500000, 0x04000000, "str",	"daW" },
+    { 0x0c500000, 0x04100000, "ldr",	"daW" },
+    { 0x0c500000, 0x04400000, "strb",	"daW" },
+    { 0x0c500000, 0x04500000, "ldrb",	"daW" },
+    { 0x0e1f0000, 0x080d0000, "stm",	"YnWl" },/* separate out r13 base */
+    { 0x0e1f0000, 0x081d0000, "ldm",	"YnWl" },/* separate out r13 base */    
+    { 0x0e100000, 0x08000000, "stm",	"XnWl" },
+    { 0x0e100000, 0x08100000, "ldm",	"XnWl" },    
+    { 0x0e1000f0, 0x00100090, "ldrb",	"deW" },
+    { 0x0e1000f0, 0x00000090, "strb",	"deW" },
+    { 0x0e1000f0, 0x001000d0, "ldrsb",	"deW" },
+    { 0x0e1000f0, 0x001000b0, "ldrh",	"deW" },
+    { 0x0e1000f0, 0x000000b0, "strh",	"deW" },
+    { 0x0e1000f0, 0x001000f0, "ldrsh",	"deW" },
+    { 0x0f200090, 0x00200090, "und",	"x" },	/* Before data processing */
+    { 0x0e1000d0, 0x000000d0, "und",	"x" },	/* Before data processing */
+    { 0x0ff00ff0, 0x01000090, "swp",	"dmo" },
+    { 0x0ff00ff0, 0x01400090, "swpb",	"dmo" },
+    { 0x0fbf0fff, 0x010f0000, "mrs",	"dp" },	/* Before data processing */
+    { 0x0fb0fff0, 0x0120f000, "msr",	"pFm" },/* Before data processing */
+    { 0x0fb0f000, 0x0320f000, "msr",	"pF2" },/* Before data processing */
+    { 0x0ffffff0, 0x012fff10, "bx",     "m" },
+    { 0x0fff0ff0, 0x016f0f10, "clz",	"dm" },
+    { 0x0ffffff0, 0x012fff30, "blx",	"m" },
+    { 0xfff000f0, 0xe1200070, "bkpt",	"k" },
+    { 0x0de00000, 0x00000000, "and",	"Sdn2" },
+    { 0x0de00000, 0x00200000, "eor",	"Sdn2" },
+    { 0x0de00000, 0x00400000, "sub",	"Sdn2" },
+    { 0x0de00000, 0x00600000, "rsb",	"Sdn2" },
+    { 0x0de00000, 0x00800000, "add",	"Sdn2" },
+    { 0x0de00000, 0x00a00000, "adc",	"Sdn2" },
+    { 0x0de00000, 0x00c00000, "sbc",	"Sdn2" },
+    { 0x0de00000, 0x00e00000, "rsc",	"Sdn2" },
+    { 0x0df00000, 0x01100000, "tst",	"Dn2" },
+    { 0x0df00000, 0x01300000, "teq",	"Dn2" },
+    { 0x0df00000, 0x01500000, "cmp",	"Dn2" },
+    { 0x0df00000, 0x01700000, "cmn",	"Dn2" },
+    { 0x0de00000, 0x01800000, "orr",	"Sdn2" },
+    { 0x0de00000, 0x01a00000, "mov",	"Sd2" },
+    { 0x0de00000, 0x01c00000, "bic",	"Sdn2" },
+    { 0x0de00000, 0x01e00000, "mvn",	"Sd2" },
+    { 0x0ff08f10, 0x0e000100, "adf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e100100, "muf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e200100, "suf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e300100, "rsf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e400100, "dvf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e500100, "rdf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e600100, "pow",	"PRfgh" },
+    { 0x0ff08f10, 0x0e700100, "rpw",	"PRfgh" },
+    { 0x0ff08f10, 0x0e800100, "rmf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e900100, "fml",	"PRfgh" },
+    { 0x0ff08f10, 0x0ea00100, "fdv",	"PRfgh" },
+    { 0x0ff08f10, 0x0eb00100, "frd",	"PRfgh" },
+    { 0x0ff08f10, 0x0ec00100, "pol",	"PRfgh" },
+    { 0x0f008f10, 0x0e000100, "fpbop",	"PRfgh" },
+    { 0x0ff08f10, 0x0e008100, "mvf",	"PRfh" },
+    { 0x0ff08f10, 0x0e108100, "mnf",	"PRfh" },
+    { 0x0ff08f10, 0x0e208100, "abs",	"PRfh" },
+    { 0x0ff08f10, 0x0e308100, "rnd",	"PRfh" },
+    { 0x0ff08f10, 0x0e408100, "sqt",	"PRfh" },
+    { 0x0ff08f10, 0x0e508100, "log",	"PRfh" },
+    { 0x0ff08f10, 0x0e608100, "lgn",	"PRfh" },
+    { 0x0ff08f10, 0x0e708100, "exp",	"PRfh" },
+    { 0x0ff08f10, 0x0e808100, "sin",	"PRfh" },
+    { 0x0ff08f10, 0x0e908100, "cos",	"PRfh" },
+    { 0x0ff08f10, 0x0ea08100, "tan",	"PRfh" },
+    { 0x0ff08f10, 0x0eb08100, "asn",	"PRfh" },
+    { 0x0ff08f10, 0x0ec08100, "acs",	"PRfh" },
+    { 0x0ff08f10, 0x0ed08100, "atn",	"PRfh" },
+    { 0x0f008f10, 0x0e008100, "fpuop",	"PRfh" },
+    { 0x0e100f00, 0x0c000100, "stf",	"QLv" },
+    { 0x0e100f00, 0x0c100100, "ldf",	"QLv" },
+    { 0x0ff00f10, 0x0e000110, "flt",	"PRgd" },
+    { 0x0ff00f10, 0x0e100110, "fix",	"PRdh" },
+    { 0x0ff00f10, 0x0e200110, "wfs",	"d" },
+    { 0x0ff00f10, 0x0e300110, "rfs",	"d" },
+    { 0x0ff00f10, 0x0e400110, "wfc",	"d" },
+    { 0x0ff00f10, 0x0e500110, "rfc",	"d" },
+    { 0x0ff0ff10, 0x0e90f110, "cmf",	"PRgh" },
+    { 0x0ff0ff10, 0x0eb0f110, "cnf",	"PRgh" },
+    { 0x0ff0ff10, 0x0ed0f110, "cmfe",	"PRgh" },
+    { 0x0ff0ff10, 0x0ef0f110, "cnfe",	"PRgh" },
+    { 0xff100010, 0xfe000010, "mcr2",	"#z" },
+    { 0x0f100010, 0x0e000010, "mcr",	"#z" },
+    { 0xff100010, 0xfe100010, "mrc2",	"#z" },
+    { 0x0f100010, 0x0e100010, "mrc",	"#z" },
+    { 0xff000010, 0xfe000000, "cdp2",	"#y" },
+    { 0x0f000010, 0x0e000000, "cdp",	"#y" },
+    { 0xfe100090, 0xfc100000, "ldc2",	"L#v" },
+    { 0x0e100090, 0x0c100000, "ldc",	"L#v" },
+    { 0xfe100090, 0xfc000000, "stc2",	"L#v" },
+    { 0x0e100090, 0x0c000000, "stc",	"L#v" },
+    { 0xf550f000, 0xf550f000, "pld",	"ne" },
+    { 0x0ff00ff0, 0x01000050, "qaad",	"dmn" },
+    { 0x0ff00ff0, 0x01400050, "qdaad",	"dmn" },
+    { 0x0ff00ff0, 0x01600050, "qdsub",	"dmn" },
+    { 0x0ff00ff0, 0x01200050, "dsub",	"dmn" },
+    { 0x0ff000f0, 0x01000080, "smlabb",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000a0, "smlatb",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000c0, "smlabt",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000e0, "smlatt",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x01400080, "smlalbb","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000a0, "smlaltb","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000c0, "smlalbt","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000e0, "smlaltt","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x01200080, "smlawb", "nmsd" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x012000a0, "smulwb","nms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x012000c0, "smlawt", "nmsd" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x012000e0, "smulwt","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x01600080, "smulbb","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000a0, "smultb","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000c0, "smulbt","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000e0, "smultt","nms" },   // d & n inverted!!
+    { 0x00000000, 0x00000000, NULL,	NULL }
+};
+
+static char const arm32_insn_conditions[][4] = {
+	"eq", "ne", "cs", "cc",
+	"mi", "pl", "vs", "vc",
+	"hi", "ls", "ge", "lt",
+	"gt", "le", "",   "nv"
+};
+
+static char const insn_block_transfers[][4] = {
+	"da", "ia", "db", "ib"
+};
+
+static char const insn_stack_block_transfers[][4] = {
+	"ed", "ea", "fd", "fa"
+};
+
+static char const op_shifts[][4] = {
+	"lsl", "lsr", "asr", "ror"
+};
+
+static char const insn_fpa_rounding[][2] = {
+	"", "p", "m", "z"
+};
+
+static char const insn_fpa_precision[][2] = {
+	"s", "d", "e", "p"
+};
+
+static char const insn_fpaconstants[][8] = {
+	"0.0", "1.0", "2.0", "3.0",
+	"4.0", "5.0", "0.5", "10.0"
+};
+
+#define insn_condition(x)	arm32_insn_conditions[((x) >> 28) & 0x0f]
+#define insn_blktrans(x)	insn_block_transfers[((x) >> 23) & 3]
+#define insn_stkblktrans(x)	insn_stack_block_transfers[(3*(((x) >> 20)&1))^(((x) >> 23)&3)]
+#define op2_shift(x)		op_shifts[((x) >> 5) & 3]
+#define insn_fparnd(x)		insn_fpa_rounding[((x) >> 5) & 0x03]
+#define insn_fpaprec(x)		insn_fpa_precision[((((x) >> 18) & 2)|((x) >> 7)) & 1]
+#define insn_fpaprect(x)	insn_fpa_precision[((((x) >> 21) & 2)|((x) >> 15)) & 1]
+#define insn_fpaimm(x)		insn_fpaconstants[(x) & 0x07]
+
+/* Local prototypes */
+static void disasm_register_shift(const disasm_interface_t *di, u_int insn);
+static void disasm_print_reglist(const disasm_interface_t *di, u_int insn);
+static void disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static void disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static void disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static u_int disassemble_readword(u_int address);
+static void disassemble_printaddr(u_int address);
+
+u_int
+disasm(const disasm_interface_t *di, u_int loc, int __unused altfmt)
+{
+	const struct arm32_insn *i_ptr = &arm32_i[0];
+	u_int insn = di->di_readword(loc);
+	int matchp = 0;
+	int branch;
+	char* f_ptr;
+	int fmt = 0;
+
+/*	di->di_printf("loc=%08x insn=%08x : ", loc, insn);*/
+
+	while (i_ptr->name) {
+		if ((insn & i_ptr->mask) ==  i_ptr->pattern) {
+			matchp = 1;
+			break;
+		}
+		i_ptr++;
+	}
+
+	if (!matchp) {
+		di->di_printf("und%s\t%08x\n", insn_condition(insn), insn);
+		return(loc + INSN_SIZE);
+	}
+
+	/* If instruction forces condition code, don't print it. */
+	if ((i_ptr->mask & 0xf0000000) == 0xf0000000)
+		di->di_printf("%s", i_ptr->name);
+	else
+		di->di_printf("%s%s", i_ptr->name, insn_condition(insn));
+
+	f_ptr = i_ptr->format;
+
+	/* Insert tab if there are no instruction modifiers */
+
+	if (*(f_ptr) < 'A' || *(f_ptr) > 'Z') {
+		++fmt;
+		di->di_printf("\t");
+	}
+
+	while (*f_ptr) {
+		switch (*f_ptr) {
+		/* 2 - print Operand 2 of a data processing instruction */
+		case '2':
+			if (insn & 0x02000000) {
+				int rotate= ((insn >> 7) & 0x1e);
+
+				di->di_printf("#0x%08x",
+					      (insn & 0xff) << (32 - rotate) |
+					      (insn & 0xff) >> rotate);
+			} else {  
+				disasm_register_shift(di, insn);
+			}
+			break;
+		/* d - destination register (bits 12-15) */
+		case 'd':
+			di->di_printf("r%d", ((insn >> 12) & 0x0f));
+			break;
+		/* D - insert 'p' if Rd is R15 */
+		case 'D':
+			if (((insn >> 12) & 0x0f) == 15)
+				di->di_printf("p");
+			break;
+		/* n - n register (bits 16-19) */
+		case 'n':
+			di->di_printf("r%d", ((insn >> 16) & 0x0f));
+			break;
+		/* s - s register (bits 8-11) */
+		case 's':
+			di->di_printf("r%d", ((insn >> 8) & 0x0f));
+			break;
+		/* o - indirect register rn (bits 16-19) (used by swap) */
+		case 'o':
+			di->di_printf("[r%d]", ((insn >> 16) & 0x0f));
+			break;
+		/* m - m register (bits 0-3) */
+		case 'm':
+			di->di_printf("r%d", ((insn >> 0) & 0x0f));
+			break;
+		/* a - address operand of ldr/str instruction */
+		case 'a':
+			disasm_insn_ldrstr(di, insn, loc);
+			break;
+		/* e - address operand of ldrh/strh instruction */
+		case 'e':
+			disasm_insn_ldrhstrh(di, insn, loc);
+			break;
+		/* l - register list for ldm/stm instruction */
+		case 'l':
+			disasm_print_reglist(di, insn);
+			break;
+		/* f - 1st fp operand (register) (bits 12-14) */
+		case 'f':
+			di->di_printf("f%d", (insn >> 12) & 7);
+			break;
+		/* g - 2nd fp operand (register) (bits 16-18) */
+		case 'g':
+			di->di_printf("f%d", (insn >> 16) & 7);
+			break;
+		/* h - 3rd fp operand (register/immediate) (bits 0-4) */
+		case 'h':
+			if (insn & (1 << 3))
+				di->di_printf("#%s", insn_fpaimm(insn));
+			else
+				di->di_printf("f%d", insn & 7);
+			break;
+		/* j - xtb rotate literal (bits 10-11) */
+		case 'j':
+			di->di_printf("ror #%d", ((insn >> 10) & 3) << 3);
+			break;
+		/* i - bfx lsb literal (bits 7-11) */
+		case 'i':
+			di->di_printf("#%d", (insn >> 7) & 31);
+			break;
+		/* w - bfx width literal (bits 16-20) */
+		case 'w':
+			di->di_printf("#%d", 1 + ((insn >> 16) & 31));
+			break;
+		/* b - branch address */
+		case 'b':
+			branch = ((insn << 2) & 0x03ffffff);
+			if (branch & 0x02000000)
+				branch |= 0xfc000000;
+			di->di_printaddr(loc + 8 + branch);
+			break;
+		/* t - blx address */
+		case 't':
+			branch = ((insn << 2) & 0x03ffffff) |
+			    (insn >> 23 & 0x00000002);
+			if (branch & 0x02000000)
+				branch |= 0xfc000000;
+			di->di_printaddr(loc + 8 + branch);
+			break;
+		/* X - block transfer type */
+		case 'X':
+			di->di_printf("%s", insn_blktrans(insn));
+			break;
+		/* Y - block transfer type (r13 base) */
+		case 'Y':
+			di->di_printf("%s", insn_stkblktrans(insn));
+			break;
+		/* c - comment field bits(0-23) */
+		case 'c':
+			di->di_printf("0x%08x", (insn & 0x00ffffff));
+			break;
+		/* k - breakpoint comment (bits 0-3, 8-19) */
+		case 'k':
+			di->di_printf("0x%04x",
+			    (insn & 0x000fff00) >> 4 | (insn & 0x0000000f));
+			break;
+		/* p - saved or current status register */
+		case 'p':
+			if (insn & 0x00400000)
+				di->di_printf("spsr");
+			else
+				di->di_printf("cpsr");
+			break;
+		/* F - PSR transfer fields */
+		case 'F':
+			di->di_printf("_");
+			if (insn & (1 << 16))
+				di->di_printf("c");
+			if (insn & (1 << 17))
+				di->di_printf("x");
+			if (insn & (1 << 18))
+				di->di_printf("s");
+			if (insn & (1 << 19))
+				di->di_printf("f");
+			break;
+		/* B - byte transfer flag */
+		case 'B':
+			if (insn & 0x00400000)
+				di->di_printf("b");
+			break;
+		/* L - co-processor transfer size */
+		case 'L':
+			if (insn & (1 << 22))
+				di->di_printf("l");
+			break;
+		/* S - set status flag */
+		case 'S':
+			if (insn & 0x00100000)
+				di->di_printf("s");
+			break;
+		/* P - fp precision */
+		case 'P':
+			di->di_printf("%s", insn_fpaprec(insn));
+			break;
+		/* Q - fp precision (for ldf/stf) */
+		case 'Q':
+			break;
+		/* R - fp rounding */
+		case 'R':
+			di->di_printf("%s", insn_fparnd(insn));
+			break;
+		/* W - writeback flag */
+		case 'W':
+			if (insn & (1 << 21))
+				di->di_printf("!");
+			break;
+		/* # - co-processor number */
+		case '#':
+			di->di_printf("p%d", (insn >> 8) & 0x0f);
+			break;
+		/* v - co-processor data transfer registers+addressing mode */
+		case 'v':
+			disasm_insn_ldcstc(di, insn, loc);
+			break;
+		/* x - instruction in hex */
+		case 'x':
+			di->di_printf("0x%08x", insn);
+			break;
+		/* y - co-processor data processing registers */
+		case 'y':
+			di->di_printf("%d, ", (insn >> 20) & 0x0f);
+
+			di->di_printf("c%d, c%d, c%d", (insn >> 12) & 0x0f,
+			    (insn >> 16) & 0x0f, insn & 0x0f);
+
+			di->di_printf(", %d", (insn >> 5) & 0x07);
+			break;
+		/* z - co-processor register transfer registers */
+		case 'z':
+			di->di_printf("%d, ", (insn >> 21) & 0x07);
+			di->di_printf("r%d, c%d, c%d, %d",
+			    (insn >> 12) & 0x0f, (insn >> 16) & 0x0f,
+			    insn & 0x0f, (insn >> 5) & 0x07);
+
+/*			if (((insn >> 5) & 0x07) != 0)
+				di->di_printf(", %d", (insn >> 5) & 0x07);*/
+			break;
+		default:
+			di->di_printf("[%c - unknown]", *f_ptr);
+			break;
+		}
+		if (*(f_ptr+1) >= 'A' && *(f_ptr+1) <= 'Z')
+			++f_ptr;
+		else if (*(++f_ptr)) {
+			++fmt;
+			if (fmt == 1)
+				di->di_printf("\t");
+			else
+				di->di_printf(", ");
+		}
+	}
+
+	di->di_printf("\n");
+
+	return(loc + INSN_SIZE);
+}
+
+
+static void
+disasm_register_shift(const disasm_interface_t *di, u_int insn)
+{
+	di->di_printf("r%d", (insn & 0x0f));
+	if ((insn & 0x00000ff0) == 0)
+		;
+	else if ((insn & 0x00000ff0) == 0x00000060)
+		di->di_printf(", rrx");
+	else {
+		if (insn & 0x10)
+			di->di_printf(", %s r%d", op2_shift(insn),
+			    (insn >> 8) & 0x0f);
+		else
+			di->di_printf(", %s #%d", op2_shift(insn),
+			    (insn >> 7) & 0x1f);
+	}
+}
+
+
+static void
+disasm_print_reglist(const disasm_interface_t *di, u_int insn)
+{
+	int loop;
+	int start;
+	int comma;
+
+	di->di_printf("{");
+	start = -1;
+	comma = 0;
+
+	for (loop = 0; loop < 17; ++loop) {
+		if (start != -1) {
+			if (loop == 16 || !(insn & (1 << loop))) {
+				if (comma)
+					di->di_printf(", ");
+				else
+					comma = 1;
+				if (start == loop - 1)
+					di->di_printf("r%d", start);
+				else
+					di->di_printf("r%d-r%d", start, loop - 1);
+				start = -1;
+			}
+		} else {
+			if (insn & (1 << loop))
+				start = loop;
+		}
+	}
+	di->di_printf("}");
+
+	if (insn & (1 << 22))
+		di->di_printf("^");
+}
+
+static void
+disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	int offset;
+
+	offset = insn & 0xfff;
+	if ((insn & 0x032f0000) == 0x010f0000) {
+		/* rA = pc, immediate index */
+		if (insn & 0x00800000)
+			loc += offset;
+		else
+			loc -= offset;
+		di->di_printaddr(loc + 8);
+ 	} else {
+		di->di_printf("[r%d", (insn >> 16) & 0x0f);
+		if ((insn & 0x03000fff) != 0x01000000) {
+			di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+			if (!(insn & 0x00800000))
+				di->di_printf("-");
+			if (insn & (1 << 25))
+				disasm_register_shift(di, insn);
+			else
+				di->di_printf("#0x%03x", offset);
+		}
+		if (insn & (1 << 24))
+			di->di_printf("]");
+	}
+}
+
+static void
+disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	int offset;
+
+	offset = ((insn & 0xf00) >> 4) | (insn & 0xf);
+	if ((insn & 0x004f0000) == 0x004f0000) {
+		/* rA = pc, immediate index */
+		if (insn & 0x00800000)
+			loc += offset;
+		else
+			loc -= offset;
+		di->di_printaddr(loc + 8);
+ 	} else {
+		di->di_printf("[r%d", (insn >> 16) & 0x0f);
+		if ((insn & 0x01400f0f) != 0x01400000) {
+			di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+			if (!(insn & 0x00800000))
+				di->di_printf("-");
+			if (insn & (1 << 22))
+				di->di_printf("#0x%02x", offset);
+			else
+				di->di_printf("r%d", (insn & 0x0f));
+		}
+		if (insn & (1 << 24))
+			di->di_printf("]");
+	}
+}
+
+static void
+disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn, u_int __unused loc)
+{
+	if (((insn >> 8) & 0xf) == 1)
+		di->di_printf("f%d, ", (insn >> 12) & 0x07);
+	else
+		di->di_printf("c%d, ", (insn >> 12) & 0x0f);
+
+	di->di_printf("[r%d", (insn >> 16) & 0x0f);
+
+	di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+
+	if (!(insn & (1 << 23)))
+		di->di_printf("-");
+
+	di->di_printf("#0x%03x", (insn & 0xff) << 2);
+
+	if (insn & (1 << 24))
+		di->di_printf("]");
+
+	if (insn & (1 << 21))
+		di->di_printf("!");
+}
+
+static u_int
+disassemble_readword(u_int address)
+{
+	return(*((u_int *)address));
+}
+
+static void
+disassemble_printaddr(u_int address)
+{
+	printf("0x%08x", address);
+}
+
+static const disasm_interface_t disassemble_di = {
+	disassemble_readword, disassemble_printaddr, printf
+};
+
+void
+disassemble(u_int address)
+{
+
+	(void)disasm(&disassemble_di, address, 0);
+}
+
+/* End of disassem.c */
diff --git a/libpixelflinger/codeflinger/disassem.h b/libpixelflinger/codeflinger/disassem.h
new file mode 100644
index 0000000..c7c60b6
--- /dev/null
+++ b/libpixelflinger/codeflinger/disassem.h
@@ -0,0 +1,65 @@
+/*	$NetBSD: disassem.h,v 1.4 2001/03/04 04:15:58 matt Exp $	*/
+
+/*-
+ * Copyright (c) 1997 Mark Brinicombe.
+ * Copyright (c) 1997 Causality Limited.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Mark Brinicombe.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Define the interface structure required by the disassembler.
+ *
+ * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/disassem.h,v 1.2 2005/01/05 21:58:48 imp Exp $
+ */
+
+#ifndef ANDROID_MACHINE_DISASSEM_H
+#define ANDROID_MACHINE_DISASSEM_H
+
+#include <sys/types.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+	u_int	(*di_readword)(u_int);
+	void	(*di_printaddr)(u_int);
+	int	(*di_printf)(const char *, ...);
+} disasm_interface_t;
+
+/* Prototypes for callable functions */
+
+u_int disasm(const disasm_interface_t *, u_int, int);
+void disassemble(u_int);
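+
+/*
+ * Usage sketch (mirrors the default interface in disassem.c): supply the
+ * three hooks and step through the code one instruction at a time.
+ * my_readword/my_printaddr are placeholder names:
+ *
+ *	static const disasm_interface_t di = {
+ *		my_readword, my_printaddr, printf
+ *	};
+ *	loc = disasm(&di, loc, 0);
+ */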
+
+#if __cplusplus
+}
+#endif
+
+#endif /* !ANDROID_MACHINE_DISASSEM_H */
diff --git a/libpixelflinger/codeflinger/load_store.cpp b/libpixelflinger/codeflinger/load_store.cpp
new file mode 100644
index 0000000..4db0a49
--- /dev/null
+++ b/libpixelflinger/codeflinger/load_store.cpp
@@ -0,0 +1,384 @@
+/* libs/pixelflinger/codeflinger/load_store.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "pixelflinger-code"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <log/log.h>
+
+#include "GGLAssembler.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+void GGLAssembler::store(const pointer_t& addr, const pixel_t& s, uint32_t flags)
+{
+    const int bits = addr.size;
+    const int inc = (flags & WRITE_BACK)?1:0;
+    switch (bits) {
+    case 32:
+        if (inc)    STR(AL, s.reg, addr.reg, immed12_post(4));
+        else        STR(AL, s.reg, addr.reg);
+        break;
+    case 24:
+        // 24-bit formats are a little special and used only for RGB:
+        // 0x00BBGGRR is unpacked as R,G,B
+        STRB(AL, s.reg, addr.reg, immed12_pre(0));
+        MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8));
+        STRB(AL, s.reg, addr.reg, immed12_pre(1));
+        MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8));
+        STRB(AL, s.reg, addr.reg, immed12_pre(2));
+        if (!(s.flags & CORRUPTIBLE)) {
+            MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 16));
+        }
+        if (inc)
+            ADD(AL, 0, addr.reg, addr.reg, imm(3));
+        break;
+    case 16:
+        if (inc)    STRH(AL, s.reg, addr.reg, immed8_post(2));
+        else        STRH(AL, s.reg, addr.reg);
+        break;
+    case  8:
+        if (inc)    STRB(AL, s.reg, addr.reg, immed12_post(1));
+        else        STRB(AL, s.reg, addr.reg);
+        break;
+    }
+}
+
+void GGLAssembler::load(const pointer_t& addr, const pixel_t& s, uint32_t flags)
+{
+    Scratch scratches(registerFile());
+    int s0;
+
+    const int bits = addr.size;
+    const int inc = (flags & WRITE_BACK)?1:0;
+    switch (bits) {
+    case 32:
+        if (inc)    LDR(AL, s.reg, addr.reg, immed12_post(4));
+        else        LDR(AL, s.reg, addr.reg);
+        break;
+    case 24:
+        // 24-bit formats are a little special and used only for RGB:
+        // R,G,B is packed as 0x00BBGGRR
+        s0 = scratches.obtain();
+        if (s.reg != addr.reg) {
+            LDRB(AL, s.reg, addr.reg, immed12_pre(0));      // R
+            LDRB(AL, s0, addr.reg, immed12_pre(1));         // G
+            ORR(AL, 0, s.reg, s.reg, reg_imm(s0, LSL, 8));
+            LDRB(AL, s0, addr.reg, immed12_pre(2));         // B
+            ORR(AL, 0, s.reg, s.reg, reg_imm(s0, LSL, 16));
+        } else {
+            int s1 = scratches.obtain();
+            LDRB(AL, s1, addr.reg, immed12_pre(0));         // R
+            LDRB(AL, s0, addr.reg, immed12_pre(1));         // G
+            ORR(AL, 0, s1, s1, reg_imm(s0, LSL, 8));
+            LDRB(AL, s0, addr.reg, immed12_pre(2));         // B
+            ORR(AL, 0, s.reg, s1, reg_imm(s0, LSL, 16));
+        }
+        if (inc)
+            ADD(AL, 0, addr.reg, addr.reg, imm(3));
+        break;
+    case 16:
+        if (inc)    LDRH(AL, s.reg, addr.reg, immed8_post(2));
+        else        LDRH(AL, s.reg, addr.reg);
+        break;
+    case  8:
+        if (inc)    LDRB(AL, s.reg, addr.reg, immed12_post(1));
+        else        LDRB(AL, s.reg, addr.reg);
+        break;
+    }
+}
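+
+// Conversely (illustrative C only), the 24-bit load above reassembles
+//
+//     v = p[0] | (p[1] << 8) | (p[2] << 16);
+//
+// In the aliased case (s.reg == addr.reg) the value is accumulated in a
+// second scratch and the destination is written only by the final ORR,
+// since writing it earlier would clobber the address register before
+// the last byte load.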
+
+void GGLAssembler::extract(integer_t& d, int s, int h, int l, int bits)
+{
+    const int maskLen = h-l;
+
+#ifdef __mips__
+    assert(maskLen<=11);
+#else
+    assert(maskLen<=8);
+#endif
+    assert(h);
+
+    if (h != bits) {
+        const int mask = ((1<<maskLen)-1) << l;
+        if (isValidImmediate(mask)) {
+            AND(AL, 0, d.reg, s, imm(mask));    // component = packed & mask;
+        } else if (isValidImmediate(~mask)) {
+            BIC(AL, 0, d.reg, s, imm(~mask));   // component = packed & mask;
+        } else {
+            MOV(AL, 0, d.reg, reg_imm(s, LSL, 32-h));
+            l += 32-h;
+            h = 32;
+        }
+        s = d.reg;
+    }
+
+    if (l) {
+        MOV(AL, 0, d.reg, reg_imm(s, LSR, l));  // component = packed >> l;
+        s = d.reg;
+    }
+
+    if (s != d.reg) {
+        MOV(AL, 0, d.reg, s);
+    }
+
+    d.s = maskLen;
+}
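+
+// Worked example (for illustration): extracting the green field of an
+// RGB565 pixel means h=11, l=5, bits=16; maskLen is 6 and the mask
+// ((1<<6)-1)<<5 == 0x7E0 is a valid ARM immediate, so this emits just
+//
+//     AND d, s, #0x7E0        // component = packed & mask
+//     MOV d, d, LSR #5        // component >>= 5
+//
+// leaving the 6-bit component in d with d.s == 6.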
+
+void GGLAssembler::extract(integer_t& d, const pixel_t& s, int component)
+{
+    extract(d,  s.reg,
+                s.format.c[component].h,
+                s.format.c[component].l,
+                s.size());
+}
+
+void GGLAssembler::extract(component_t& d, const pixel_t& s, int component)
+{
+    integer_t r(d.reg, 32, d.flags);
+    extract(r,  s.reg,
+                s.format.c[component].h,
+                s.format.c[component].l,
+                s.size());
+    d = component_t(r);
+}
+
+
+void GGLAssembler::expand(integer_t& d, const component_t& s, int dbits)
+{
+    if (s.l || (s.flags & CLEAR_HI)) {
+        extract(d, s.reg, s.h, s.l, 32);
+        expand(d, d, dbits);
+    } else {
+        expand(d, integer_t(s.reg, s.size(), s.flags), dbits);
+    }
+}
+
+void GGLAssembler::expand(component_t& d, const component_t& s, int dbits)
+{
+    integer_t r(d.reg, 32, d.flags);
+    expand(r, s, dbits);
+    d = component_t(r);
+}
+
+void GGLAssembler::expand(integer_t& dst, const integer_t& src, int dbits)
+{
+    assert(src.size());
+
+    int sbits = src.size();
+    int s = src.reg;
+    int d = dst.reg;
+
+    // be sure to set 'dst' after we read 'src' as they may be identical
+    dst.s = dbits;
+    dst.flags = 0;
+
+    if (dbits<=sbits) {
+        if (s != d) {
+            MOV(AL, 0, d, s);
+        }
+        return;
+    }
+
+    if (sbits == 1) {
+        RSB(AL, 0, d, s, reg_imm(s, LSL, dbits));
+            // d = (s<<dbits) - s;
+        return;
+    }
+
+    if (dbits % sbits) {
+        MOV(AL, 0, d, reg_imm(s, LSL, dbits-sbits));
+            // d = s << (dbits-sbits);
+        dbits -= sbits;
+        do {
+            ORR(AL, 0, d, d, reg_imm(d, LSR, sbits));
+                // d |= d >> sbits;
+            dbits -= sbits;
+            sbits *= 2;
+        } while(dbits>0);
+        return;
+    }
+
+    dbits -= sbits;
+    do {
+        ORR(AL, 0, d, s, reg_imm(s, LSL, sbits));
+            // d |= d<<sbits;
+        s = d;
+        dbits -= sbits;
+        if (sbits*2 < dbits) {
+            sbits *= 2;
+        }
+    } while(dbits>0);
+}
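+
+// Example (for illustration): widening a 5-bit component to 8 bits takes
+// the (dbits % sbits) branch above, since 8 % 5 != 0, and emits
+//
+//     MOV d, s, LSL #3        // d = v << 3
+//     ORR d, d, d, LSR #5     // d |= d >> 5
+//
+// i.e. the classic (v<<3)|(v>>2) replication, so 0x1F expands to 0xFF
+// exactly.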
+
+void GGLAssembler::downshift(
+        pixel_t& d, int component, component_t s, const reg_t& dither)
+{
+    Scratch scratches(registerFile());
+
+    int sh = s.h;
+    int sl = s.l;
+    int maskHiBits = (sh!=32) ? ((s.flags & CLEAR_HI)?1:0) : 0;
+    int maskLoBits = (sl!=0)  ? ((s.flags & CLEAR_LO)?1:0) : 0;
+    int sbits = sh - sl;
+
+    int dh = d.format.c[component].h;
+    int dl = d.format.c[component].l;
+    int dbits = dh - dl;
+    int dithering = 0;
+
+    ALOGE_IF(sbits<dbits, "sbits (%d) < dbits (%d) in downshift", sbits, dbits);
+
+    if (sbits>dbits) {
+        // see if we need to dither
+        dithering = mDithering;
+    }
+
+    int ireg = d.reg;
+    if (!(d.flags & FIRST)) {
+        if (s.flags & CORRUPTIBLE)  {
+            ireg = s.reg;
+        } else {
+            ireg = scratches.obtain();
+        }
+    }
+    d.flags &= ~FIRST;
+
+    if (maskHiBits) {
+        // we need to mask the high bits (and possibly the lowbits too)
+        // and we might be able to use immediate mask.
+        if (!dithering) {
+            // we don't do this if we only have maskLoBits because we can
+            // do it more efficiently below (in the case where dl=0)
+            const int offset = sh - dbits;
+            if (dbits<=8 && offset >= 0) {
+                const uint32_t mask = ((1<<dbits)-1) << offset;
+                if (isValidImmediate(mask) || isValidImmediate(~mask)) {
+                    build_and_immediate(ireg, s.reg, mask, 32);
+                    sl = offset;
+                    s.reg = ireg;
+                    sbits = dbits;
+                    maskLoBits = maskHiBits = 0;
+                }
+            }
+        } else {
+            // in the dithering case though, we need to preserve the lower bits
+            const uint32_t mask = ((1<<sbits)-1) << sl;
+            if (isValidImmediate(mask) || isValidImmediate(~mask)) {
+                build_and_immediate(ireg, s.reg, mask, 32);
+                s.reg = ireg;
+                maskLoBits = maskHiBits = 0;
+            }
+        }
+    }
+
+    // XXX: we could special case (maskHiBits & !maskLoBits)
+    // like we do for maskLoBits below, but it happens very rarely
+    // that we have maskHiBits only and the conditions necessary to lead
+    // to better code (like doing d |= s << 24)
+
+    if (maskHiBits) {
+        MOV(AL, 0, ireg, reg_imm(s.reg, LSL, 32-sh));
+        sl += 32-sh;
+        sh = 32;
+        s.reg = ireg;
+        maskHiBits = 0;
+    }
+
+    //  Downsampling should be performed as follows:
+    //      V * ((1<<dbits)-1) / ((1<<sbits)-1)
+    //      V * [(1<<dbits)/((1<<sbits)-1) - 1/((1<<sbits)-1)]
+    //      V * [1/((1<<sbits)-1)>>dbits - 1/((1<<sbits)-1)]
+    //      V/((1<<(sbits-dbits))-(1>>dbits)) - (V>>sbits)/((1<<sbits)-1)>>sbits
+    //      V/((1<<(sbits-dbits))-(1>>dbits)) - (V>>sbits)/(1-(1>>sbits))
+    //
+    //  By approximating (1>>dbits) and (1>>sbits) to 0:
+    //
+    //      V>>(sbits-dbits) - V>>sbits
+    //
+    //  A good approximation is V>>(sbits-dbits),
+    //  but a better one (needed for dithering) is:
+    //
+    //      (V>>(sbits-dbits)<<sbits - V)>>sbits
+    //      (V<<dbits - V)>>sbits
+    //      (V - V>>dbits)>>(sbits-dbits)
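+    //
+    //  Numeric check (for illustration) with sbits=8, dbits=5:
+    //  V=0xFF gives (0xFF - (0xFF>>5)) >> 3 = 0xF8>>3 = 0x1F, so full
+    //  scale survives exactly; with dithering enabled the dither term
+    //  is added before that final shift (see below).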
+
+    // Dithering is done here
+    if (dithering) {
+        comment("dithering");
+        if (sl) {
+            MOV(AL, 0, ireg, reg_imm(s.reg, LSR, sl));
+            sh -= sl;
+            sl = 0;
+            s.reg = ireg;
+        }
+        // scaling (V-V>>dbits)
+        SUB(AL, 0, ireg, s.reg, reg_imm(s.reg, LSR, dbits));
+        const int shift = (GGL_DITHER_BITS - (sbits-dbits));
+        if (shift>0)        ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSR, shift));
+        else if (shift<0)   ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSL,-shift));
+        else                ADD(AL, 0, ireg, ireg, dither.reg);
+        s.reg = ireg;
+    }
+
+    if ((maskLoBits|dithering) && (sh > dbits)) {
+        int shift = sh-dbits;
+        if (dl) {
+            MOV(AL, 0, ireg, reg_imm(s.reg, LSR, shift));
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(ireg, LSL, dl));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(ireg, LSL, dl));
+            }
+        } else {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift));
+            }
+        }
+    } else {
+        int shift = sh-dh;
+        if (shift>0) {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift));
+            }
+        } else if (shift<0) {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSL, -shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSL, -shift));
+            }
+        } else {
+            if (ireg == d.reg) {
+                if (s.reg != d.reg) {
+                    MOV(AL, 0, d.reg, s.reg);
+                }
+            } else {
+                ORR(AL, 0, d.reg, d.reg, s.reg);
+            }
+        }
+    }
+}
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/mips64_disassem.c b/libpixelflinger/codeflinger/mips64_disassem.c
new file mode 100644
index 0000000..8528299
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips64_disassem.c
@@ -0,0 +1,584 @@
+/*  $NetBSD: db_disasm.c,v 1.19 2007/02/28 04:21:53 thorpej Exp $   */
+
+/*-
+ * Copyright (c) 1991, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  from: @(#)kadb.c    8.1 (Berkeley) 6/10/93
+ */
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#include <android/log.h>
+
+#include "mips_opcode.h"
+
+#define __unused __attribute__((__unused__))
+
+static char *sprintf_buffer;
+static int sprintf_buf_len;
+
+typedef uint64_t db_addr_t;
+static void db_printf(const char* fmt, ...);
+
+static const char * const op_name[64] = {
+/* 0 */ "spec", "bcond", "j", "jal", "beq", "bne", "blez", "bgtz",
+/* 8 */ "pop10", "addiu", "slti", "sltiu", "andi", "ori", "xori", "aui",
+/*16 */ "cop0", "cop1", "cop2", "?", "?", "?", "pop26", "pop27",
+/*24 */ "pop30", "daddiu", "?", "?", "?", "daui", "msa", "op37",
+/*32 */ "lb", "lh", "?",  "lw", "lbu", "lhu", "?", "lwu",
+/*40 */ "sb", "sh", "?", "sw", "?", "?", "?", "?",
+/*48 */ "?", "lwc1", "bc", "?", "?",  "ldc1", "pop66", "ld",
+/*56 */ "?", "swc1", "balc", "pcrel", "?", "sdc1", "pop76", "sd"
+};
+
+static const char * const spec_name[64] = {
+/* 0 */ "sll", "?", "srl", "sra", "sllv", "?", "srlv", "srav",
+/* 8 */ "?", "jalr", "?", "?", "syscall", "break", "sdbpp", "sync",
+/*16 */ "clz", "clo", "dclz", "dclo", "dsllv", "dlsa", "dsrlv", "dsrav",
+/*24 */ "sop30", "sop31", "sop32", "sop33", "sop34", "sop35", "sop36", "sop37",
+/*32 */ "add", "addu", "sub", "subu", "and", "or", "xor", "nor",
+/*40 */ "?", "?", "slt", "sltu", "dadd", "daddu", "dsub", "dsubu",
+/*48 */ "tge", "tgeu", "tlt", "tltu", "teq", "seleqz", "tne", "selnez",
+/*56 */ "dsll", "?", "dsrl", "dsra", "dsll32", "?", "dsrl32", "dsra32"
+};
+
+static const char * const bcond_name[32] = {
+/* 0 */ "bltz", "bgez", "?", "?", "?", "?", "dahi", "?",
+/* 8 */ "?", "?", "?", "?", "?", "?", "?", "?",
+/*16 */ "nal", "bal", "?", "?", "?", "?", "?", "sigrie",
+/*24 */ "?", "?", "?", "?", "?", "?", "dati", "synci",
+};
+
+static const char * const cop1_name[64] = {
+/* 0 */ "fadd",  "fsub", "fmpy", "fdiv", "fsqrt","fabs", "fmov", "fneg",
+/* 8 */ "fop08","fop09","fop0a","fop0b","fop0c","fop0d","fop0e","fop0f",
+/*16 */ "fop10","fop11","fop12","fop13","fop14","fop15","fop16","fop17",
+/*24 */ "fop18","fop19","fop1a","fop1b","fop1c","fop1d","fop1e","fop1f",
+/*32 */ "fcvts","fcvtd","fcvte","fop23","fcvtw","fop25","fop26","fop27",
+/*40 */ "fop28","fop29","fop2a","fop2b","fop2c","fop2d","fop2e","fop2f",
+/*48 */ "fcmp.f","fcmp.un","fcmp.eq","fcmp.ueq","fcmp.olt","fcmp.ult",
+    "fcmp.ole","fcmp.ule",
+/*56 */ "fcmp.sf","fcmp.ngle","fcmp.seq","fcmp.ngl","fcmp.lt","fcmp.nge",
+    "fcmp.le","fcmp.ngt"
+};
+
+static const char * const fmt_name[16] = {
+    "s",    "d",    "e",    "fmt3",
+    "w",    "fmt5", "fmt6", "fmt7",
+    "fmt8", "fmt9", "fmta", "fmtb",
+    "fmtc", "fmtd", "fmte", "fmtf"
+};
+
+static char * const mips_reg_name[32] = {
+    "zero", "at",   "v0",   "v1",   "a0",   "a1",   "a2",   "a3",
+    "a4",   "a5",   "a6",   "a7",   "t0",   "t1",   "t2",   "t3",
+    "s0",   "s1",   "s2",   "s3",   "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+
+static char * alt_arm_reg_name[32] = {  // hacked names for comparison with ARM code
+    "zero", "at",   "r0",   "r1",   "r2",   "r3",   "r4",   "r5",
+    "r6",   "r7",   "r8",   "r9",   "r10",  "r11",  "r12",  "r13",
+    "r14",  "r15",  "at2",  "cmp",  "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+
+static char * const * reg_name =  &mips_reg_name[0];
+
+static const char * const c0_opname[64] = {
+    "c0op00","tlbr",  "tlbwi", "c0op03","c0op04","c0op05","tlbwr", "c0op07",
+    "tlbp",  "c0op11","c0op12","c0op13","c0op14","c0op15","c0op16","c0op17",
+    "rfe",   "c0op21","c0op22","c0op23","c0op24","c0op25","c0op26","c0op27",
+    "eret",  "c0op31","c0op32","c0op33","c0op34","c0op35","c0op36","c0op37",
+    "c0op40","c0op41","c0op42","c0op43","c0op44","c0op45","c0op46","c0op47",
+    "c0op50","c0op51","c0op52","c0op53","c0op54","c0op55","c0op56","c0op57",
+    "c0op60","c0op61","c0op62","c0op63","c0op64","c0op65","c0op66","c0op67",
+    "c0op70","c0op71","c0op72","c0op73","c0op74","c0op75","c0op77","c0op77",
+};
+
+static const char * const c0_reg[32] = {
+    "index",    "random",   "tlblo0",  "tlblo1",
+    "context",  "pagemask", "wired",   "cp0r7",
+    "badvaddr", "count",    "tlbhi",   "compare",
+    "status",   "cause",    "epc",     "prid",
+    "config",   "lladdr",   "watchlo", "watchhi",
+    "xcontext", "cp0r21",   "cp0r22",  "debug",
+    "depc",     "perfcnt",  "ecc",     "cacheerr",
+    "taglo",    "taghi",    "errepc",  "desave"
+};
+
+static void print_addr(db_addr_t);
+db_addr_t mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format);
+
+
+/*
+ * Disassemble instruction 'insn' nominally at 'loc'.
+ * 'loc' may in fact contain a breakpoint instruction.
+ */
+static db_addr_t
+db_disasm_insn(int insn, db_addr_t loc, bool altfmt __unused)
+{
+    bool bdslot = false;
+    InstFmt i;
+
+    i.word = insn;
+
+    switch (i.JType.op) {
+    case OP_SPECIAL:
+        if (i.word == 0) {
+            db_printf("nop");
+            break;
+        }
+        if (i.word == 0x0080) {
+            db_printf("NIY");
+            break;
+        }
+        if (i.word == 0x00c0) {
+            db_printf("NOT IMPL");
+            break;
+        }
+        /* Special cases --------------------------------------------------
+         * "addu" is a "move" only in 32-bit mode.  What's the correct
+         * answer - never decode addu/daddu as "move"?
+         */
+        if ( (i.RType.func == OP_ADDU && i.RType.rt == 0)  ||
+             (i.RType.func == OP_OR   && i.RType.rt == 0) ) {
+            db_printf("move\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs]);
+            break;
+        }
+
+        if (i.RType.func == OP_SRL && (i.RType.rs & 1) == 1) {
+            db_printf("rotr\t%s,%s,%d", reg_name[i.RType.rd],
+                reg_name[i.RType.rt], i.RType.shamt);
+            break;
+        }
+        if (i.RType.func == OP_SRLV && (i.RType.shamt & 1) == 1) {
+            db_printf("rotrv\t%s,%s,%s", reg_name[i.RType.rd],
+                reg_name[i.RType.rt], reg_name[i.RType.rs]);
+            break;
+        }
+
+        if (i.RType.func == OP_SOP30) {
+            if (i.RType.shamt == OP_MUL) {
+                db_printf("mul");
+            } else if (i.RType.shamt == OP_MUH) {
+                db_printf("muh");
+            }
+            db_printf("\t%s,%s,%s", reg_name[i.RType.rd],
+                reg_name[i.RType.rs], reg_name[i.RType.rt]);
+            break;
+        }
+        if (i.RType.func == OP_SOP31) {
+            if (i.RType.shamt == OP_MUL) {
+                db_printf("mulu");
+            } else if (i.RType.shamt == OP_MUH) {
+                db_printf("muhu");
+            }
+            db_printf("\t%s,%s,%s", reg_name[i.RType.rd],
+                reg_name[i.RType.rs], reg_name[i.RType.rt]);
+            break;
+        }
+
+        if (i.RType.func == OP_JALR && i.RType.rd == 0) {
+            db_printf("jr\t%s", reg_name[i.RType.rs]);
+            bdslot = true;
+            break;
+        }
+
+        db_printf("%s", spec_name[i.RType.func]);
+        switch (i.RType.func) {
+        case OP_SLL:
+        case OP_SRL:
+        case OP_SRA:
+        case OP_DSLL:
+
+        case OP_DSRL:
+        case OP_DSRA:
+        case OP_DSLL32:
+        case OP_DSRL32:
+        case OP_DSRA32:
+            db_printf("\t%s,%s,%d",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt],
+                i.RType.shamt);
+            break;
+
+        case OP_SLLV:
+        case OP_SRLV:
+        case OP_SRAV:
+        case OP_DSLLV:
+        case OP_DSRLV:
+        case OP_DSRAV:
+            db_printf("\t%s,%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt],
+                reg_name[i.RType.rs]);
+            break;
+
+        case OP_CLZ:
+        case OP_CLO:
+        case OP_DCLZ:
+        case OP_DCLO:
+            db_printf("\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs]);
+            break;
+
+        case OP_JALR:
+            db_printf("\t");
+            if (i.RType.rd != 31) {
+                db_printf("%s,", reg_name[i.RType.rd]);
+            }
+            db_printf("%s", reg_name[i.RType.rs]);
+            bdslot = true;
+            break;
+
+        case OP_SYSCALL:
+        case OP_SYNC:
+            break;
+
+        case OP_BREAK:
+            db_printf("\t%d", (i.RType.rs << 5) | i.RType.rt);
+            break;
+
+        default:
+            db_printf("\t%s,%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs],
+                reg_name[i.RType.rt]);
+        }
+        break;
+
+    case OP_SPECIAL3:
+        if (i.RType.func == OP_EXT)
+            db_printf("ext\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd+1);
+        else if (i.RType.func == OP_DEXT)
+            db_printf("dext\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd+1);
+        else if (i.RType.func == OP_DEXTM)
+            db_printf("dextm\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd+33);
+        else if (i.RType.func == OP_DEXTU)
+            db_printf("dextu\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt+32,
+                    i.RType.rd+1);
+        else if (i.RType.func == OP_INS)
+            db_printf("ins\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd-i.RType.shamt+1);
+        else if (i.RType.func == OP_DINS)
+            db_printf("dins\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd-i.RType.shamt+1);
+        else if (i.RType.func == OP_DINSM)
+            db_printf("dinsm\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd-i.RType.shamt+33);
+        else if (i.RType.func == OP_DINSU)
+            db_printf("dinsu\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt+32,
+                    i.RType.rd-i.RType.shamt+1);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_WSBH)
+            db_printf("wsbh\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_SEB)
+            db_printf("seb\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_SEH)
+            db_printf("seh\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_RDHWR)
+            db_printf("rdhwr\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else
+            db_printf("Unknown");
+        break;
+
+    case OP_BCOND:
+        db_printf("%s\t%s,", bcond_name[i.IType.rt],
+            reg_name[i.IType.rs]);
+        goto pr_displ;
+
+    case OP_BLEZ:
+    case OP_BGTZ:
+        db_printf("%s\t%s,", op_name[i.IType.op],
+            reg_name[i.IType.rs]);
+        goto pr_displ;
+
+    case OP_BEQ:
+        if (i.IType.rs == 0 && i.IType.rt == 0) {
+            db_printf("b\t");
+            goto pr_displ;
+        }
+        /* FALLTHROUGH */
+    case OP_BNE:
+        db_printf("%s\t%s,%s,", op_name[i.IType.op],
+            reg_name[i.IType.rs],
+            reg_name[i.IType.rt]);
+    pr_displ:
+        print_addr(loc + 4 + ((short)i.IType.imm << 2));
+        bdslot = true;
+        break;
+
+    case OP_COP0:
+        switch (i.RType.rs) {
+        case OP_BCx:
+        case OP_BCy:
+
+            db_printf("bc0%c\t",
+                "ft"[i.RType.rt & COPz_BC_TF_MASK]);
+            goto pr_displ;
+
+        case OP_MT:
+            db_printf("mtc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_DMT:
+            db_printf("dmtc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_MF:
+            db_printf("mfc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_DMF:
+            db_printf("dmfc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        default:
+            db_printf("%s", c0_opname[i.FRType.func]);
+        }
+        break;
+
+    case OP_COP1:
+        switch (i.RType.rs) {
+        case OP_BCx:
+        case OP_BCy:
+            db_printf("bc1%c\t",
+                "ft"[i.RType.rt & COPz_BC_TF_MASK]);
+            goto pr_displ;
+
+        case OP_MT:
+            db_printf("mtc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_MF:
+            db_printf("mfc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_CT:
+            db_printf("ctc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_CF:
+            db_printf("cfc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        default:
+            db_printf("%s.%s\tf%d,f%d,f%d",
+                cop1_name[i.FRType.func],
+                fmt_name[i.FRType.fmt],
+                i.FRType.fd, i.FRType.fs, i.FRType.ft);
+        }
+        break;
+
+    case OP_J:
+    case OP_JAL:
+        db_printf("%s\t", op_name[i.JType.op]);
+        print_addr((loc & 0xFFFFFFFFF0000000) | (i.JType.target << 2));
+        bdslot = true;
+        break;
+
+    case OP_LWC1:
+    case OP_SWC1:
+        db_printf("%s\tf%d,", op_name[i.IType.op],
+            i.IType.rt);
+        goto loadstore;
+
+    case OP_LB:
+    case OP_LH:
+    case OP_LW:
+    case OP_LD:
+    case OP_LBU:
+    case OP_LHU:
+    case OP_LWU:
+    case OP_SB:
+    case OP_SH:
+    case OP_SW:
+    case OP_SD:
+        db_printf("%s\t%s,", op_name[i.IType.op],
+            reg_name[i.IType.rt]);
+    loadstore:
+        db_printf("%d(%s)", (short)i.IType.imm,
+            reg_name[i.IType.rs]);
+        break;
+
+    case OP_ORI:
+    case OP_XORI:
+        if (i.IType.rs == 0) {
+            db_printf("li\t%s,0x%x",
+                reg_name[i.IType.rt],
+                i.IType.imm);
+            break;
+        }
+        /* FALLTHROUGH */
+    case OP_ANDI:
+        db_printf("%s\t%s,%s,0x%x", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            reg_name[i.IType.rs],
+            i.IType.imm);
+        break;
+
+    case OP_AUI:
+        if (i.IType.rs == 0) {
+            db_printf("lui\t%s,0x%x", reg_name[i.IType.rt],
+                i.IType.imm);
+        } else {
+            db_printf("%s\t%s,%s,%d", op_name[i.IType.op],
+            reg_name[i.IType.rt], reg_name[i.IType.rs],
+            (short)i.IType.imm);
+        }
+        break;
+
+    case OP_ADDIU:
+    case OP_DADDIU:
+        if (i.IType.rs == 0) {
+            db_printf("li\t%s,%d",
+                reg_name[i.IType.rt],
+                (short)i.IType.imm);
+            break;
+        }
+        /* FALLTHROUGH */
+    default:
+        db_printf("%s\t%s,%s,%d", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            reg_name[i.IType.rs],
+            (short)i.IType.imm);
+    }
+    // db_printf("\n");
+    // if (bdslot) {
+    //     db_printf("   bd: ");
+    //     mips_disassem(loc+4);
+    //     return (loc + 8);
+    // }
+    return (loc + 4);
+}
+
+static void
+print_addr(db_addr_t loc)
+{
+    db_printf("0x%08lx", loc);
+}
+
+static void db_printf(const char* fmt, ...)
+{
+    int cnt;
+    va_list argp;
+    va_start(argp, fmt);
+    if (sprintf_buffer) {
+        cnt = vsnprintf(sprintf_buffer, sprintf_buf_len, fmt, argp);
+        // vsnprintf() returns the length that would have been written;
+        // clamp it so a truncated write cannot advance past the buffer.
+        if (cnt < 0 || cnt > sprintf_buf_len)
+            cnt = sprintf_buf_len;
+        sprintf_buffer += cnt;
+        sprintf_buf_len -= cnt;
+    } else {
+        vprintf(fmt, argp);
+    }
+    va_end(argp);
+}
+
+/*
+ * Disassemble instruction at 'loc'.
+ * Return address of start of next instruction.
+ * Since this function is used by 'examine' and by 'step',
+ * "next instruction" does NOT mean the next instruction to
+ * be executed but the 'linear' next instruction.
+ */
+db_addr_t
+mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format)
+{
+    u_int32_t instr;
+
+    if (alt_dis_format) {   // use ARM register names for disassembly
+        reg_name = &alt_arm_reg_name[0];
+    }
+
+    sprintf_buffer = di_buffer;     // quick 'n' dirty printf() vs sprintf()
+    sprintf_buf_len = 39;           // should be passed in
+
+    instr =  *(u_int32_t *)loc;
+    return (db_disasm_insn(instr, loc, false));
+}
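+
+/*
+ * Usage sketch (hypothetical, for illustration): with a NULL di_buffer,
+ * db_printf() falls through to vprintf(), so a caller can dump a code
+ * block to stdout one instruction at a time (the trailing newline is
+ * left to the caller, as the commented-out code in db_disasm_insn()
+ * notes):
+ *
+ *     db_addr_t pc = (db_addr_t)(uintptr_t)code;
+ *     while (pc < end) {
+ *         pc = mips_disassem(pc, NULL, 0);
+ *         printf("\n");
+ *     }
+ */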
diff --git a/libpixelflinger/codeflinger/mips64_disassem.h b/libpixelflinger/codeflinger/mips64_disassem.h
new file mode 100644
index 0000000..c94f04f
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips64_disassem.h
@@ -0,0 +1,56 @@
+/*  $NetBSD: db_disasm.c,v 1.19 2007/02/28 04:21:53 thorpej Exp $   */
+
+/*-
+ * Copyright (c) 1991, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  from: @(#)kadb.c    8.1 (Berkeley) 6/10/93
+ */
+
+
+
+#ifndef ANDROID_MIPS64_DISASSEM_H
+#define ANDROID_MIPS64_DISASSEM_H
+
+#include <sys/types.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+/* Prototypes for callable functions */
+
+void mips_disassem(uint32_t *location, char *di_buffer, int alt_fmt);
+
+#if __cplusplus
+}
+#endif
+
+#endif /* !ANDROID_MIPS64_DISASSEM_H */
diff --git a/libpixelflinger/codeflinger/mips_disassem.c b/libpixelflinger/codeflinger/mips_disassem.c
new file mode 100644
index 0000000..1fe6806
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips_disassem.c
@@ -0,0 +1,592 @@
+/*  $NetBSD: db_disasm.c,v 1.19 2007/02/28 04:21:53 thorpej Exp $   */
+
+/*-
+ * Copyright (c) 1991, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  from: @(#)kadb.c    8.1 (Berkeley) 6/10/93
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <sys/cdefs.h>
+
+#include <sys/types.h>
+#include "mips_opcode.h"
+
+
+// #include <sys/systm.h>
+// #include <sys/param.h>
+
+// #include <machine/reg.h>
+// #include <machine/cpu.h>
+/*#include <machine/param.h>*/
+// #include <machine/db_machdep.h>
+
+// #include <ddb/db_interface.h>
+// #include <ddb/db_output.h>
+// #include <ddb/db_extern.h>
+// #include <ddb/db_sym.h>
+
+#define __unused __attribute__((__unused__))
+
+static char *sprintf_buffer;
+static int sprintf_buf_len;
+
+
+typedef uint32_t db_addr_t;
+static void db_printf(const char* fmt, ...);
+
+static const char * const op_name[64] = {
+/* 0 */ "spec", "bcond","j  ",    "jal",  "beq",  "bne",  "blez", "bgtz",
+/* 8 */ "addi", "addiu","slti", "sltiu","andi", "ori",  "xori", "lui",
+/*16 */ "cop0", "cop1", "cop2", "cop3", "beql", "bnel", "blezl","bgtzl",
+/*24 */ "daddi","daddiu","ldl", "ldr",  "op34", "op35", "op36", "op37",
+/*32 */ "lb ",   "lh ",   "lwl",  "lw ",   "lbu",  "lhu",  "lwr",  "lwu",
+/*40 */ "sb ",   "sh ",   "swl",  "sw ",   "sdl",  "sdr",  "swr",  "cache",
+/*48 */ "ll ",   "lwc1", "lwc2", "lwc3", "lld",  "ldc1", "ldc2", "ld ",
+/*56 */ "sc ",   "swc1", "swc2", "swc3", "scd",  "sdc1", "sdc2", "sd "
+};
+
+static const char * const spec_name[64] = {
+/* 0 */ "sll",  "spec01","srl", "sra",  "sllv", "spec05","srlv","srav",
+/* 8 */ "jr",   "jalr", "movz","movn","syscall","break","spec16","sync",
+/*16 */ "mfhi", "mthi", "mflo", "mtlo", "dsllv","spec25","dsrlv","dsrav",
+/*24 */ "mult", "multu","div",  "divu", "dmult","dmultu","ddiv","ddivu",
+/*32 */ "add",  "addu", "sub",  "subu", "and",  "or ",   "xor",  "nor",
+/*40 */ "spec50","spec51","slt","sltu", "dadd","daddu","dsub","dsubu",
+/*48 */ "tge","tgeu","tlt","tltu","teq","spec65","tne","spec67",
+/*56 */ "dsll","spec71","dsrl","dsra","dsll32","spec75","dsrl32","dsra32"
+};
+
+static const char * const spec2_name[64] = {     /* QED RM4650, R5000, etc. */
+/* 0x00 */ "madd", "maddu", "mul", "spec3", "msub", "msubu", "rsrv6", "rsrv7",
+/* 0x08 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x10 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x18 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x20 */ "clz",  "clo",  "rsrv", "rsrv", "dclz", "dclo", "rsrv", "rsrv",
+/* 0x28 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x30 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x38 */ "rsrv", "rsrv", "rsrv", "resv", "rsrv", "rsrv", "rsrv", "sdbbp"
+};
+
+static const char * const bcond_name[32] = {
+/* 0 */ "bltz", "bgez", "bltzl", "bgezl", "?", "?", "?", "?",
+/* 8 */ "tgei", "tgeiu", "tlti", "tltiu", "teqi", "?", "tnei", "?",
+/*16 */ "bltzal", "bgezal", "bltzall", "bgezall", "?", "?", "?", "?",
+/*24 */ "?", "?", "?", "?", "?", "?", "?", "?",
+};
+
+static const char * const cop1_name[64] = {
+/* 0 */ "fadd",  "fsub", "fmpy", "fdiv", "fsqrt","fabs", "fmov", "fneg",
+/* 8 */ "fop08","fop09","fop0a","fop0b","fop0c","fop0d","fop0e","fop0f",
+/*16 */ "fop10","fop11","fop12","fop13","fop14","fop15","fop16","fop17",
+/*24 */ "fop18","fop19","fop1a","fop1b","fop1c","fop1d","fop1e","fop1f",
+/*32 */ "fcvts","fcvtd","fcvte","fop23","fcvtw","fop25","fop26","fop27",
+/*40 */ "fop28","fop29","fop2a","fop2b","fop2c","fop2d","fop2e","fop2f",
+/*48 */ "fcmp.f","fcmp.un","fcmp.eq","fcmp.ueq","fcmp.olt","fcmp.ult",
+    "fcmp.ole","fcmp.ule",
+/*56 */ "fcmp.sf","fcmp.ngle","fcmp.seq","fcmp.ngl","fcmp.lt","fcmp.nge",
+    "fcmp.le","fcmp.ngt"
+};
+
+static const char * const fmt_name[16] = {
+    "s",    "d",    "e",    "fmt3",
+    "w",    "fmt5", "fmt6", "fmt7",
+    "fmt8", "fmt9", "fmta", "fmtb",
+    "fmtc", "fmtd", "fmte", "fmtf"
+};
+
+#if defined(__mips_n32) || defined(__mips_n64)
+static char * const reg_name[32] = {
+    "zero", "at",   "v0",   "v1",   "a0",   "a1",   "a2",   "a3",
+    "a4",   "a5",   "a6",   "a7",   "t0",   "t1",   "t2",   "t3",
+    "s0",   "s1",   "s2",   "s3",   "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+#else
+
+static char * alt_arm_reg_name[32] = {  // hacked names for comparison with ARM code
+    "zero", "at",   "r0",   "r1",   "r2",   "r3",   "r4",   "r5",
+    "r6",   "r7",   "r8",   "r9",   "r10",  "r11",  "r12",  "r13",
+    "r14",  "r15",  "at2",  "cmp",  "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+
+static char * mips_reg_name[32] = {
+    "zero", "at",   "v0",   "v1",   "a0",   "a1",   "a2",   "a3",
+    "t0",   "t1",   "t2",   "t3",   "t4",   "t5",   "t6",   "t7",
+    "s0",   "s1",   "s2",   "s3",   "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+
+static char ** reg_name =  &mips_reg_name[0];
+
+#endif /* __mips_n32 || __mips_n64 */
+
+static const char * const c0_opname[64] = {
+    "c0op00","tlbr",  "tlbwi", "c0op03","c0op04","c0op05","tlbwr", "c0op07",
+    "tlbp",  "c0op11","c0op12","c0op13","c0op14","c0op15","c0op16","c0op17",
+    "rfe",   "c0op21","c0op22","c0op23","c0op24","c0op25","c0op26","c0op27",
+    "eret",  "c0op31","c0op32","c0op33","c0op34","c0op35","c0op36","c0op37",
+    "c0op40","c0op41","c0op42","c0op43","c0op44","c0op45","c0op46","c0op47",
+    "c0op50","c0op51","c0op52","c0op53","c0op54","c0op55","c0op56","c0op57",
+    "c0op60","c0op61","c0op62","c0op63","c0op64","c0op65","c0op66","c0op67",
+    "c0op70","c0op71","c0op72","c0op73","c0op74","c0op75","c0op77","c0op77",
+};
+
+static const char * const c0_reg[32] = {
+    "index",    "random",   "tlblo0",  "tlblo1",
+    "context",  "pagemask", "wired",   "cp0r7",
+    "badvaddr", "count",    "tlbhi",   "compare",
+    "status",   "cause",    "epc",     "prid",
+    "config",   "lladdr",   "watchlo", "watchhi",
+    "xcontext", "cp0r21",   "cp0r22",  "debug",
+    "depc",     "perfcnt",  "ecc",     "cacheerr",
+    "taglo",    "taghi",    "errepc",  "desave"
+};
+
+static void print_addr(db_addr_t);
+db_addr_t mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format);
+
+
+/*
+ * Disassemble instruction 'insn' nominally at 'loc'.
+ * 'loc' may in fact contain a breakpoint instruction.
+ */
+static db_addr_t
+db_disasm_insn(int insn, db_addr_t loc, bool altfmt __unused)
+{
+    bool bdslot = false;
+    InstFmt i;
+
+    i.word = insn;
+
+    switch (i.JType.op) {
+    case OP_SPECIAL:
+        if (i.word == 0) {
+            db_printf("nop");
+            break;
+        }
+        if (i.word == 0x0080) {
+            db_printf("NIY");
+            break;
+        }
+        if (i.word == 0x00c0) {
+            db_printf("NOT IMPL");
+            break;
+        }
+        /* Special cases --------------------------------------------------
+         * "addu" is a "move" only in 32-bit mode.  What's the correct
+         * answer - never decode addu/daddu as "move"?
+         */
+        if ( (i.RType.func == OP_ADDU && i.RType.rt == 0)  ||
+             (i.RType.func == OP_OR   && i.RType.rt == 0) ) {
+            db_printf("move\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs]);
+            break;
+        }
+        // mips32r2, rotr & rotrv
+        if (i.RType.func == OP_SRL && (i.RType.rs & 1) == 1) {
+            db_printf("rotr\t%s,%s,%d", reg_name[i.RType.rd],
+                reg_name[i.RType.rt], i.RType.shamt);
+            break;
+        }
+        if (i.RType.func == OP_SRLV && (i.RType.shamt & 1) == 1) {
+            db_printf("rotrv\t%s,%s,%s", reg_name[i.RType.rd],
+                reg_name[i.RType.rt], reg_name[i.RType.rs]);
+            break;
+        }
+
+
+        db_printf("%s", spec_name[i.RType.func]);
+        switch (i.RType.func) {
+        case OP_SLL:
+        case OP_SRL:
+        case OP_SRA:
+        case OP_DSLL:
+
+        case OP_DSRL:
+        case OP_DSRA:
+        case OP_DSLL32:
+        case OP_DSRL32:
+        case OP_DSRA32:
+            db_printf("\t%s,%s,%d",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt],
+                i.RType.shamt);
+            break;
+
+        case OP_SLLV:
+        case OP_SRLV:
+        case OP_SRAV:
+        case OP_DSLLV:
+        case OP_DSRLV:
+        case OP_DSRAV:
+            db_printf("\t%s,%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt],
+                reg_name[i.RType.rs]);
+            break;
+
+        case OP_MFHI:
+        case OP_MFLO:
+            db_printf("\t%s", reg_name[i.RType.rd]);
+            break;
+
+        case OP_JR:
+        case OP_JALR:
+            db_printf("\t%s", reg_name[i.RType.rs]);
+            bdslot = true;
+            break;
+        case OP_MTLO:
+        case OP_MTHI:
+            db_printf("\t%s", reg_name[i.RType.rs]);
+            break;
+
+        case OP_MULT:
+        case OP_MULTU:
+        case OP_DMULT:
+        case OP_DMULTU:
+        case OP_DIV:
+        case OP_DIVU:
+        case OP_DDIV:
+        case OP_DDIVU:
+            db_printf("\t%s,%s",
+                reg_name[i.RType.rs],
+                reg_name[i.RType.rt]);
+            break;
+
+
+        case OP_SYSCALL:
+        case OP_SYNC:
+            break;
+
+        case OP_BREAK:
+            db_printf("\t%d", (i.RType.rs << 5) | i.RType.rt);
+            break;
+
+        default:
+            db_printf("\t%s,%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs],
+                reg_name[i.RType.rt]);
+        }
+        break;
+
+    case OP_SPECIAL2:
+        if (i.RType.func == OP_MUL)
+            db_printf("%s\t%s,%s,%s",
+                spec2_name[i.RType.func & 0x3f],
+                    reg_name[i.RType.rd],
+                    reg_name[i.RType.rs],
+                    reg_name[i.RType.rt]);
+        else
+            db_printf("%s\t%s,%s",
+                spec2_name[i.RType.func & 0x3f],
+                    reg_name[i.RType.rs],
+                    reg_name[i.RType.rt]);
+
+        break;
+
+    case OP_SPECIAL3:
+        if (i.RType.func == OP_EXT)
+            db_printf("ext\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd+1);
+        else if (i.RType.func == OP_INS)
+            db_printf("ins\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd-i.RType.shamt+1);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_WSBH)
+            db_printf("wsbh\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_SEB)
+            db_printf("seb\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_SEH)
+            db_printf("seh\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else
+            db_printf("Unknown");
+        break;
+
+    case OP_BCOND:
+        db_printf("%s\t%s,", bcond_name[i.IType.rt],
+            reg_name[i.IType.rs]);
+        goto pr_displ;
+
+    case OP_BLEZ:
+    case OP_BLEZL:
+    case OP_BGTZ:
+    case OP_BGTZL:
+        db_printf("%s\t%s,", op_name[i.IType.op],
+            reg_name[i.IType.rs]);
+        goto pr_displ;
+
+    case OP_BEQ:
+    case OP_BEQL:
+        if (i.IType.rs == 0 && i.IType.rt == 0) {
+            db_printf("b  \t");
+            goto pr_displ;
+        }
+        /* FALLTHROUGH */
+    case OP_BNE:
+    case OP_BNEL:
+        db_printf("%s\t%s,%s,", op_name[i.IType.op],
+            reg_name[i.IType.rs],
+            reg_name[i.IType.rt]);
+    pr_displ:
+        print_addr(loc + 4 + ((short)i.IType.imm << 2));
+        bdslot = true;
+        break;
+
+    case OP_COP0:
+        switch (i.RType.rs) {
+        case OP_BCx:
+        case OP_BCy:
+
+            db_printf("bc0%c\t",
+                "ft"[i.RType.rt & COPz_BC_TF_MASK]);
+            goto pr_displ;
+
+        case OP_MT:
+            db_printf("mtc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_DMT:
+            db_printf("dmtc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_MF:
+            db_printf("mfc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_DMF:
+            db_printf("dmfc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        default:
+            db_printf("%s", c0_opname[i.FRType.func]);
+        }
+        break;
+
+    case OP_COP1:
+        switch (i.RType.rs) {
+        case OP_BCx:
+        case OP_BCy:
+            db_printf("bc1%c\t",
+                "ft"[i.RType.rt & COPz_BC_TF_MASK]);
+            goto pr_displ;
+
+        case OP_MT:
+            db_printf("mtc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_MF:
+            db_printf("mfc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_CT:
+            db_printf("ctc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_CF:
+            db_printf("cfc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        default:
+            db_printf("%s.%s\tf%d,f%d,f%d",
+                cop1_name[i.FRType.func],
+                fmt_name[i.FRType.fmt],
+                i.FRType.fd, i.FRType.fs, i.FRType.ft);
+        }
+        break;
+
+    case OP_J:
+    case OP_JAL:
+        db_printf("%s\t", op_name[i.JType.op]);
+        print_addr((loc & 0xF0000000) | (i.JType.target << 2));
+        bdslot = true;
+        break;
+
+    case OP_LWC1:
+    case OP_SWC1:
+        db_printf("%s\tf%d,", op_name[i.IType.op],
+            i.IType.rt);
+        goto loadstore;
+
+    case OP_LB:
+    case OP_LH:
+    case OP_LW:
+    case OP_LD:
+    case OP_LBU:
+    case OP_LHU:
+    case OP_LWU:
+    case OP_SB:
+    case OP_SH:
+    case OP_SW:
+    case OP_SD:
+        db_printf("%s\t%s,", op_name[i.IType.op],
+            reg_name[i.IType.rt]);
+    loadstore:
+        db_printf("%d(%s)", (short)i.IType.imm,
+            reg_name[i.IType.rs]);
+        break;
+
+    case OP_ORI:
+    case OP_XORI:
+        if (i.IType.rs == 0) {
+            db_printf("li\t%s,0x%x",
+                reg_name[i.IType.rt],
+                i.IType.imm);
+            break;
+        }
+        /* FALLTHROUGH */
+    case OP_ANDI:
+        db_printf("%s\t%s,%s,0x%x", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            reg_name[i.IType.rs],
+            i.IType.imm);
+        break;
+
+    case OP_LUI:
+        db_printf("%s\t%s,0x%x", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            i.IType.imm);
+        break;
+
+    case OP_CACHE:
+        db_printf("%s\t0x%x,0x%x(%s)",
+            op_name[i.IType.op],
+            i.IType.rt,
+            i.IType.imm,
+            reg_name[i.IType.rs]);
+        break;
+
+    case OP_ADDI:
+    case OP_DADDI:
+    case OP_ADDIU:
+    case OP_DADDIU:
+        if (i.IType.rs == 0) {
+            db_printf("li\t%s,%d",
+                reg_name[i.IType.rt],
+                (short)i.IType.imm);
+            break;
+        }
+        /* FALLTHROUGH */
+    default:
+        db_printf("%s\t%s,%s,%d", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            reg_name[i.IType.rs],
+            (short)i.IType.imm);
+    }
+    // db_printf("\n");
+    // if (bdslot) {
+    //     db_printf("   bd: ");
+    //     mips_disassem(loc+4);
+    //     return (loc + 8);
+    // }
+    return (loc + 4);
+}
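+
+// Example (for illustration): the word 0x27BDFFC0 has op=011 ("addiu"),
+// rs=rt=29 ("sp") and imm=0xFFC0, so it falls through to the default
+// IType arm above and prints as "addiu sp,sp,-64".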
+
+static void
+print_addr(db_addr_t loc)
+{
+    db_printf("0x%08x", loc);
+}
+
+
+
+static void db_printf(const char* fmt, ...)
+{
+    int cnt;
+    va_list argp;
+    va_start(argp, fmt);
+    if (sprintf_buffer) {
+        cnt = vsnprintf(sprintf_buffer, sprintf_buf_len, fmt, argp);
+        // vsnprintf() returns the length that would have been written;
+        // clamp it so a truncated write cannot advance past the buffer.
+        if (cnt < 0 || cnt > sprintf_buf_len)
+            cnt = sprintf_buf_len;
+        sprintf_buffer += cnt;
+        sprintf_buf_len -= cnt;
+    } else {
+        vprintf(fmt, argp);
+    }
+    va_end(argp);
+}
+
+
+/*
+ * Disassemble instruction at 'loc'.
+ * Return address of start of next instruction.
+ * Since this function is used by 'examine' and by 'step',
+ * "next instruction" does NOT mean the next instruction to
+ * be executed but the 'linear' next instruction.
+ */
+db_addr_t
+mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format)
+{
+    u_int32_t instr;
+
+    if (alt_dis_format) {   // use ARM register names for disassembly
+        reg_name = &alt_arm_reg_name[0];
+    }
+
+    sprintf_buffer = di_buffer;     // quick 'n' dirty printf() vs sprintf()
+    sprintf_buf_len = 39;           // should be passed in
+
+    instr =  *(u_int32_t *)loc;
+    return (db_disasm_insn(instr, loc, false));
+}
+
diff --git a/libpixelflinger/codeflinger/mips_disassem.h b/libpixelflinger/codeflinger/mips_disassem.h
new file mode 100644
index 0000000..2d5b7f5
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips_disassem.h
@@ -0,0 +1,66 @@
+/*  $NetBSD: db_disasm.c,v 1.19 2007/02/28 04:21:53 thorpej Exp $   */
+
+/*-
+ * Copyright (c) 1991, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  from: @(#)kadb.c    8.1 (Berkeley) 6/10/93
+ */
+
+
+
+#ifndef ANDROID_MIPS_DISASSEM_H
+#define ANDROID_MIPS_DISASSEM_H
+
+#include <sys/types.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+
+// could add an interface like this, but I have not
+// typedef struct {
+//  u_int   (*di_readword)(u_int);
+//  void    (*di_printaddr)(u_int);
+//  void    (*di_printf)(const char *, ...);
+// } disasm_interface_t;
+
+/* Prototypes for callable functions */
+
+// u_int disasm(const disasm_interface_t *, u_int, int);
+
+void mips_disassem(uint32_t *location, char *di_buffer, int alt_fmt);
+
+#if __cplusplus
+}
+#endif
+
+#endif /* !ANDROID_MIPS_DISASSEM_H */
diff --git a/libpixelflinger/codeflinger/mips_opcode.h b/libpixelflinger/codeflinger/mips_opcode.h
new file mode 100644
index 0000000..45bb19e
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips_opcode.h
@@ -0,0 +1,410 @@
+/*  $NetBSD: mips_opcode.h,v 1.12 2005/12/11 12:18:09 christos Exp $    */
+
+/*-
+ * Copyright (c) 1992, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  @(#)mips_opcode.h   8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Define the instruction formats and opcode values for the
+ * MIPS instruction set.
+ */
+
+#include <endian.h>
+
+/*
+ * Define the instruction formats.
+ */
+typedef union {
+    unsigned word;
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+    struct {
+        unsigned imm: 16;
+        unsigned rt: 5;
+        unsigned rs: 5;
+        unsigned op: 6;
+    } IType;
+
+    struct {
+        unsigned target: 26;
+        unsigned op: 6;
+    } JType;
+
+    struct {
+        unsigned func: 6;
+        unsigned shamt: 5;
+        unsigned rd: 5;
+        unsigned rt: 5;
+        unsigned rs: 5;
+        unsigned op: 6;
+    } RType;
+
+    struct {
+        unsigned func: 6;
+        unsigned fd: 5;
+        unsigned fs: 5;
+        unsigned ft: 5;
+        unsigned fmt: 4;
+        unsigned : 1;       /* always '1' */
+        unsigned op: 6;     /* always '0x11' */
+    } FRType;
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+    struct {
+        unsigned op: 6;
+        unsigned rs: 5;
+        unsigned rt: 5;
+        unsigned imm: 16;
+    } IType;
+
+    struct {
+        unsigned op: 6;
+        unsigned target: 26;
+    } JType;
+
+    struct {
+        unsigned op: 6;
+        unsigned rs: 5;
+        unsigned rt: 5;
+        unsigned rd: 5;
+        unsigned shamt: 5;
+        unsigned func: 6;
+    } RType;
+
+    struct {
+        unsigned op: 6;     /* always '0x11' */
+        unsigned : 1;       /* always '1' */
+        unsigned fmt: 4;
+        unsigned ft: 5;
+        unsigned fs: 5;
+        unsigned fd: 5;
+        unsigned func: 6;
+    } FRType;
+#endif
+} InstFmt;
+
+/*
+ * Values for the 'op' field.
+ */
+#define OP_SPECIAL  000
+#define OP_BCOND    001
+#define OP_J        002
+#define OP_JAL      003
+#define OP_BEQ      004
+#define OP_BNE      005
+#define OP_BLEZ     006
+#define OP_BGTZ     007
+
+#if __mips_isa_rev < 6
+#define OP_ADDI     010
+#else
+#define OP_POP10    010
+#endif
+
+#define OP_ADDIU    011
+#define OP_SLTI     012
+#define OP_SLTIU    013
+#define OP_ANDI     014
+#define OP_ORI      015
+#define OP_XORI     016
+
+#if __mips_isa_rev < 6
+#define OP_LUI      017
+#else
+#define OP_AUI      017
+#endif
+
+#define OP_COP0     020
+#define OP_COP1     021
+#define OP_COP2     022
+
+#if __mips_isa_rev < 6
+#define OP_COP3     023
+#define OP_BEQL     024
+#define OP_BNEL     025
+#define OP_BLEZL    026
+#define OP_BGTZL    027
+#define OP_DADDI    030
+#else
+#define OP_POP26    026
+#define OP_POP27    027
+#define OP_POP30    030
+#endif
+
+#define OP_DADDIU   031
+
+#if __mips_isa_rev < 6
+#define OP_LDL      032
+#define OP_LDR      033
+#define OP_SPECIAL2 034
+#else
+#define OP_DAUI     035
+#endif
+
+#define OP_SPECIAL3 037
+
+#define OP_LB       040
+#define OP_LH       041
+
+#if __mips_isa_rev < 6
+#define OP_LWL      042
+#endif
+
+#define OP_LW       043
+#define OP_LBU      044
+#define OP_LHU      045
+
+#if __mips_isa_rev < 6
+#define OP_LWR      046
+#endif
+
+#define OP_LWU      047
+
+#define OP_SB       050
+#define OP_SH       051
+
+#if __mips_isa_rev < 6
+#define OP_SWL      052
+#endif
+
+#define OP_SW       053
+
+#if __mips_isa_rev < 6
+#define OP_SDL      054
+#define OP_SDR      055
+#define OP_SWR      056
+#define OP_CACHE    057
+#define OP_LL       060
+#define OP_LWC0     OP_LL
+#define OP_LWC1     061
+#define OP_LWC2     062
+#define OP_LWC3     063
+#define OP_LLD      064
+#else
+#define OP_LWC1     061
+#define OP_BC       062
+#endif
+
+#define OP_LDC1     065
+#define OP_LD       067
+
+#if __mips_isa_rev < 6
+#define OP_SC       070
+#define OP_SWC0     OP_SC
+#endif
+
+#define OP_SWC1     071
+
+#if __mips_isa_rev < 6
+#define OP_SWC2     072
+#define OP_SWC3     073
+#define OP_SCD      074
+#else
+#define OP_BALC     072
+#endif
+
+#define OP_SDC1     075
+#define OP_SD       077
+
+/*
+ * Values for the 'func' field when 'op' == OP_SPECIAL.
+ */
+#define OP_SLL      000
+#define OP_SRL      002
+#define OP_SRA      003
+#define OP_SLLV     004
+#define OP_SRLV     006
+#define OP_SRAV     007
+
+#if __mips_isa_rev < 6
+#define OP_JR       010
+#endif
+
+#define OP_JALR     011
+#define OP_SYSCALL  014
+#define OP_BREAK    015
+#define OP_SYNC     017
+
+#if __mips_isa_rev < 6
+#define OP_MFHI     020
+#define OP_MTHI     021
+#define OP_MFLO     022
+#define OP_MTLO     023
+#else
+#define OP_CLZ      020
+#define OP_CLO      021
+#define OP_DCLZ     022
+#define OP_DCLO     023
+#endif
+
+#define OP_DSLLV    024
+#define OP_DSRLV    026
+#define OP_DSRAV    027
+
+#if __mips_isa_rev < 6
+#define OP_MULT     030
+#define OP_MULTU    031
+#define OP_DIV      032
+#define OP_DIVU     033
+#define OP_DMULT    034
+#define OP_DMULTU   035
+#define OP_DDIV     036
+#define OP_DDIVU    037
+#else
+#define OP_SOP30    030
+#define OP_SOP31    031
+#define OP_SOP32    032
+#define OP_SOP33    033
+#define OP_SOP34    034
+#define OP_SOP35    035
+#define OP_SOP36    036
+#define OP_SOP37    037
+#endif
+
+#define OP_ADD      040
+#define OP_ADDU     041
+#define OP_SUB      042
+#define OP_SUBU     043
+#define OP_AND      044
+#define OP_OR       045
+#define OP_XOR      046
+#define OP_NOR      047
+
+#define OP_SLT      052
+#define OP_SLTU     053
+#define OP_DADD     054
+#define OP_DADDU    055
+#define OP_DSUB     056
+#define OP_DSUBU    057
+
+#define OP_TGE      060
+#define OP_TGEU     061
+#define OP_TLT      062
+#define OP_TLTU     063
+#define OP_TEQ      064
+#define OP_TNE      066
+
+#define OP_DSLL     070
+#define OP_DSRL     072
+#define OP_DSRA     073
+#define OP_DSLL32   074
+#define OP_DSRL32   076
+#define OP_DSRA32   077
+
+#if __mips_isa_rev < 6
+/*
+ * Values for the 'func' field when 'op' == OP_SPECIAL2.
+ * OP_SPECIAL2 opcodes are removed in mips32r6
+ */
+#define OP_MAD      000     /* QED */
+#define OP_MADU     001     /* QED */
+#define OP_MUL      002     /* QED */
+#endif
+
+/*
+ * Values for the 'func' field when 'op' == OP_SPECIAL3.
+ */
+#define OP_EXT      000
+#define OP_DEXTM    001
+#define OP_DEXTU    002
+#define OP_DEXT     003
+#define OP_INS      004
+#define OP_DINSM    005
+#define OP_DINSU    006
+#define OP_DINS     007
+#define OP_BSHFL    040
+#define OP_RDHWR    073
+
+/*
+ * Values for the 'shamt' field when 'op' == OP_SPECIAL3 && 'func' == OP_BSHFL.
+ */
+
+#define OP_WSBH     002
+#define OP_SEB      020
+#define OP_SEH      030
+
+#if __mips_isa_rev == 6
+/*
+ * Values for the 'shamt' field when 'func' == OP_SOP30.
+ */
+#define OP_MUL      002
+#define OP_MUH      003
+#endif
+
+/*
+ * Values for the 'func' field when 'op' == OP_BCOND.
+ */
+#define OP_BLTZ     000
+#define OP_BGEZ     001
+
+#if __mips_isa_rev < 6
+#define OP_BLTZL    002
+#define OP_BGEZL    003
+#define OP_TGEI     010
+#define OP_TGEIU    011
+#define OP_TLTI     012
+#define OP_TLTIU    013
+#define OP_TEQI     014
+#define OP_TNEI     016
+#define OP_BLTZAL   020
+#define OP_BGEZAL   021
+#define OP_BLTZALL  022
+#define OP_BGEZALL  023
+#else
+#define OP_NAL      020
+#define OP_BAL      021
+#endif
+
+/*
+ * Values for the 'rs' field when 'op' == OP_COPz.
+ */
+#define OP_MF       000
+#define OP_DMF      001
+#define OP_MT       004
+#define OP_DMT      005
+#define OP_BCx      010
+#define OP_BCy      014
+#define OP_CF       002
+#define OP_CT       006
+
+/*
+ * Values for the 'rt' field when 'op' == OP_COPz.
+ */
+#define COPz_BC_TF_MASK 0x01
+#define COPz_BC_TRUE    0x01
+#define COPz_BC_FALSE   0x00
+#define COPz_BCL_TF_MASK    0x02
+#define COPz_BCL_TRUE   0x02
+#define COPz_BCL_FALSE  0x00
diff --git a/libpixelflinger/codeflinger/texturing.cpp b/libpixelflinger/codeflinger/texturing.cpp
new file mode 100644
index 0000000..e6997bd
--- /dev/null
+++ b/libpixelflinger/codeflinger/texturing.cpp
@@ -0,0 +1,1261 @@
+/* libs/pixelflinger/codeflinger/texturing.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "pixelflinger-code"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <log/log.h>
+
+#include "GGLAssembler.h"
+
+namespace android {
+
+// ---------------------------------------------------------------------------
+
+// iterators are initialized like this:
+// (intToFixedCenter(x) * dx)>>16 + x0
+// ((x<<16 + 0x8000) * dx)>>16 + x0
+// ((x<<16)*dx + (0x8000*dx))>>16 + x0
+// ( (x*dx) + dx>>1 ) + x0
+// (x*dx) + (dx>>1 + x0)
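+// illustrative check of the rewrite: with x=3 and dx=0x20000 (2.0 in
+// 16.16 fixed point) both forms give 0x70000:
+//   ((3<<16) + 0x8000)*0x20000 >> 16  =  0x70000
+//   (3*0x20000) + (0x20000>>1)        =  0x70000
+// so hoisting (dx>>1 + x0) out of the per-pixel term is exact.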
+
+void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
+{
+    context_t const* c = mBuilderContext.c;
+
+    if (mSmooth) {
+        // NOTE: we could also take this path in the mDithering + !mSmooth
+        // case, but it would use up to 4 more registers for the color
+        // components for only a little added quality.
+        // Currently, this causes the system to run out of registers in
+        // some cases (see issue #719496)
+
+        comment("compute initial iterated color (smooth and/or dither case)");
+
+        parts.iterated_packed = 0;
+        parts.packed = 0;
+
+        // 0x1: color component
+        // 0x2: iterators
+        const int optReload = mOptLevel >> 1;
+        if (optReload >= 3)         parts.reload = 0; // reload nothing
+        else if (optReload == 2)    parts.reload = 2; // reload iterators
+        else if (optReload == 1)    parts.reload = 1; // reload colors
+        else if (optReload <= 0)    parts.reload = 3; // reload both
+
+        if (!mSmooth) {
+            // we're not smoothing (just dithering), so we never have
+            // to reload the iterators
+            parts.reload &= ~2;
+        }
+
+        Scratch scratches(registerFile());
+        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
+        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
+        for (int i=0 ; i<4 ; i++) {
+            if (!mInfo[i].iterated)
+                continue;            
+            
+            // this component exists in the destination and is not replaced
+            // by a texture unit.
+            const int c = (parts.reload & 1) ? t0 : obtainReg();              
+            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
+            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
+            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
+            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
+            parts.argb[i].reg = c;
+
+            if (mInfo[i].smooth) {
+                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
+                const int dvdx = parts.argb_dx[i].reg;
+                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
+                MLA(AL, 0, c, x.reg, dvdx, c);
+                
+                // adjust the color iterator to make sure it won't overflow
+                if (!mAA) {
+                    // this is not needed when we're using anti-aliasing
+                    // because we will (have to) clamp the components
+                    // anyway.
+                    int end = scratches.obtain();
+                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
+                    MLA(AL, 1, end, dvdx, end, c);
+                    SUB(MI, 0, c, c, end);
+                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31)); 
+                    scratches.recycle(end);
+                }
+            }
+            
+            if (parts.reload & 1) {
+                CONTEXT_STORE(c, generated_vars.argb[i].c);
+            }
+        }
+    } else {
+        // We're not smoothing, so we can
+        // just use a packed version of the color and extract the
+        // components as needed (or not at all if we don't blend)
+
+        // figure out if we need the iterated color
+        int load = 0;
+        for (int i=0 ; i<4 ; i++) {
+            component_info_t& info = mInfo[i];
+            if ((info.inDest || info.needed) && !info.replaced)
+                load |= 1;
+        }
+        
+        parts.iterated_packed = 1;
+        parts.packed = (!mTextureMachine.mask && !mBlending
+                && !mFog && !mDithering);
+        parts.reload = 0;
+        if (load || parts.packed) {
+            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
+                comment("load initial iterated color (8888 packed)");
+                parts.iterated.setTo(obtainReg(),
+                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
+                CONTEXT_LOAD(parts.iterated.reg, packed8888);
+            } else {
+                comment("load initial iterated color (dest format packed)");
+
+                parts.iterated.setTo(obtainReg(), &mCbFormat);
+
+                // compute the pre-mask for the iterated color
+                const int bits = parts.iterated.size();
+                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
+                uint32_t mask = 0;
+                if (mMasking) {
+                    for (int i=0 ; i<4 ; i++) {
+                        const int component_mask = 1<<i;
+                        const int h = parts.iterated.format.c[i].h;
+                        const int l = parts.iterated.format.c[i].l;
+                        if (h && (!(mMasking & component_mask))) {
+                            mask |= ((1<<(h-l))-1) << l;
+                        }
+                    }
+                }
+
+                if (mMasking && ((mask & size)==0)) {
+                    // none of the components are present in the mask
+                } else {
+                    CONTEXT_LOAD(parts.iterated.reg, packed);
+                    if (mCbFormat.size == 1) {
+                        AND(AL, 0, parts.iterated.reg,
+                                parts.iterated.reg, imm(0xFF));
+                    } else if (mCbFormat.size == 2) {
+                        MOV(AL, 0, parts.iterated.reg,
+                                reg_imm(parts.iterated.reg, LSR, 16));
+                    }
+                }
+
+                // pre-mask the iterated color
+                if (mMasking) {
+                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
+                            mask, bits);
+                }
+            }
+        }
+    }
+}
+
+void GGLAssembler::build_iterated_color(
+        component_t& fragment,
+        const fragment_parts_t& parts,
+        int component,
+        Scratch& regs)
+{
+    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE); 
+
+    if (!mInfo[component].iterated)
+        return;
+
+    if (parts.iterated_packed) {
+        // iterated colors are packed, extract the one we need
+        extract(fragment, parts.iterated, component);
+    } else {
+        fragment.h = GGL_COLOR_BITS;
+        fragment.l = GGL_COLOR_BITS - 8;
+        fragment.flags |= CLEAR_LO;
+        // iterated colors are held in their own register,
+        // (smooth and/or dithering case)
+        if (parts.reload==3) {
+            // this implies mSmooth
+            Scratch scratches(registerFile());
+            int dx = scratches.obtain();
+            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
+            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
+            ADD(AL, 0, dx, fragment.reg, dx);
+            CONTEXT_STORE(dx, generated_vars.argb[component].c);
+        } else if (parts.reload & 1) {
+            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
+        } else {
+            // we don't reload, so simply rename the register and mark as
+            // non CORRUPTIBLE so that the texture env or blending code
+            // won't modify this (renamed) register
+            regs.recycle(fragment.reg);
+            fragment.reg = parts.argb[component].reg;
+            fragment.flags &= ~CORRUPTIBLE;
+        }
+        if (mInfo[component].smooth && mAA) {
+            // when using smooth shading AND anti-aliasing, we need to clamp
+            // the iterators because there is always an extra pixel on the
+            // edges, which most of the time will cause an overflow
+            // (since technically it's outside of the domain).
+            BIC(AL, 0, fragment.reg, fragment.reg,
+                    reg_imm(fragment.reg, ASR, 31));
+            component_sat(fragment);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
+{
+    // gather some information about the components we need to process...
+    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+    switch(opcode) {
+    case GGL_COPY:
+        mLogicOp = 0;
+        break;
+    case GGL_CLEAR:
+    case GGL_SET:
+        mLogicOp = LOGIC_OP;
+        break;
+    case GGL_AND:
+    case GGL_AND_REVERSE:
+    case GGL_AND_INVERTED:
+    case GGL_XOR:
+    case GGL_OR:
+    case GGL_NOR:
+    case GGL_EQUIV:
+    case GGL_OR_REVERSE:
+    case GGL_OR_INVERTED:
+    case GGL_NAND:
+        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
+        break;
+    case GGL_NOOP:
+    case GGL_INVERT:
+        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
+        break;        
+    case GGL_COPY_INVERTED:
+        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
+        break;
+    };        
+}
+
+void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
+{
+    uint8_t replaced=0;
+    mTextureMachine.mask = 0;
+    mTextureMachine.activeUnits = 0;
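+    // Walk the units last-to-first: 'replaced' accumulates the components
+    // written by later units in GGL_REPLACE mode, so an earlier unit whose
+    // components are all replaced can be skipped as dead work below.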
+    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (replaced == 0xF) {
+            // all components are replaced, skip this TMU.
+            tmu.format_idx = 0;
+            tmu.mask = 0;
+            tmu.replaced = replaced;
+            continue;
+        }
+        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
+        tmu.format = c->formats[tmu.format_idx];
+        tmu.bits = tmu.format.size*8;
+        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
+        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
+        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
+        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
+        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
+                && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
+
+        // 5551 linear filtering is not supported
+        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
+            tmu.linear = 0;
+        
+        tmu.mask = 0;
+        tmu.replaced = replaced;
+
+        if (tmu.format_idx) {
+            mTextureMachine.activeUnits++;
+            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
+            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
+            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
+            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
+            if (tmu.env == GGL_REPLACE) {
+                replaced |= tmu.mask;
+            } else if (tmu.env == GGL_DECAL) {
+                if (!tmu.format.c[GGLFormat::ALPHA].h) {
+                    // if we don't have alpha, decal does nothing
+                    tmu.mask = 0;
+                } else {
+                    // decal always ignores At
+                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
+                }
+            }
+        }
+        mTextureMachine.mask |= tmu.mask;
+        //printf("%d: mask=%08lx, replaced=%08lx\n",
+        //    i, int(tmu.mask), int(tmu.replaced));
+    }
+    mTextureMachine.replaced = replaced;
+    mTextureMachine.directTexture = 0;
+    //printf("replaced=%08lx\n", mTextureMachine.replaced);
+}
+
+
+void GGLAssembler::init_textures(
+        tex_coord_t* coords,
+        const reg_t& x, const reg_t& y)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    int Rx = x.reg;
+    int Ry = y.reg;
+
+    if (mTextureMachine.mask) {
+        comment("compute texture coordinates");
+    }
+
+    // init texture coordinates for each tmu
+    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11)) 
+        {
+            // 1:1 texture
+            pointer_t& txPtr = coords[i].ptr;
+            txPtr.setTo(obtainReg(), tmu.bits);
+            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
+            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
+            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
+            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
+            // merge base & offset
+            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
+            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
+            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
+            base_offset(txPtr, txPtr, Rx);
+        } else {
+            Scratch scratches(registerFile());
+            reg_t& s = coords[i].s;
+            reg_t& t = coords[i].t;
+            // s = (x * dsdx)>>16 + ydsdy
+            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
+            // t = (x * dtdx)>>16 + ydtdy
+            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
+            s.setTo(obtainReg());
+            t.setTo(obtainReg());
+            const int need_w = GGL_READ_NEEDS(W, needs.n);
+            if (need_w) {
+                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
+                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
+            } else {
+                int ydsdy = scratches.obtain();
+                int ydtdy = scratches.obtain();
+                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
+                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
+                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
+                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
+                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
+                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
+            }
+            
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
+                recycleReg(s.reg);
+                recycleReg(t.reg);
+            }
+        }
+
+        // direct texture?
+        if (!multiTexture && !mBlending && !mDithering && !mFog &&
+            cb_format_idx == tmu.format_idx && !tmu.linear &&
+            mTextureMachine.replaced == tmu.mask)
+        {
+            mTextureMachine.directTexture = i + 1;
+        }
+    }
+}
+
+void GGLAssembler::build_textures(  fragment_parts_t& parts,
+                                    Scratch& regs)
+{
+    // We don't have a way to spill registers automatically, so we
+    // explicitly spill the depth and AA registers here, when we know
+    // we may have to. Build the spill list...
+    uint32_t spill_list = 0;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+        if (tmu.linear) {
+            // we may run out of registers if we have linear filtering
+            // at 1 or 4 bytes / pixel on any texture unit.
+            if (tmu.format.size == 1) {
+                // if depth and AA are enabled, we'll be one register short
+                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
+                    spill_list |= 1<<parts.covPtr.reg;
+            }
+            if (tmu.format.size == 4) {
+                // if depth or AA is enabled, we'll be one or two registers short
+                if (parts.z.reg > 0)
+                    spill_list |= 1<<parts.z.reg;
+                if (parts.covPtr.reg > 0)   
+                    spill_list |= 1<<parts.covPtr.reg;
+            }
+        }
+    }
+
+    Spill spill(registerFile(), *this, spill_list);
+
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+
+        pointer_t& txPtr = parts.coords[i].ptr;
+        pixel_t& texel = parts.texel[i];
+
+        // repeat...
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        { // 1:1 textures
+            comment("fetch texel");
+            texel.setTo(regs.obtain(), &tmu.format);
+            load(txPtr, texel, WRITE_BACK);
+        } else {
+            Scratch scratches(registerFile());
+            reg_t& s = parts.coords[i].s;
+            reg_t& t = parts.coords[i].t;
+            if ((mOptLevel&1)==0) {
+                comment("reload s/t (multitexture or linear filtering)");
+                s.reg = scratches.obtain();
+                t.reg = scratches.obtain();
+                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
+            }
+
+            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                return;
+
+            comment("compute repeat/clamp");
+            int u       = scratches.obtain();
+            int v       = scratches.obtain();
+            int width   = scratches.obtain();
+            int height  = scratches.obtain();
+            int U = 0;
+            int V = 0;
+
+            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                return;
+
+            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
+            CONTEXT_LOAD(height, generated_vars.texture[i].height);
+
+            int FRAC_BITS = 0;
+            if (tmu.linear) {
+                // linear interpolation
+                if (tmu.format.size == 1) {
+                    // for 8-bit textures, we can afford
+                    // 7 bits of fractional precision at no
+                    // additional cost (we can't do 8 bits
+                    // because filter8 uses signed 16-bit muls)
+                    FRAC_BITS = 7;
+                } else if (tmu.format.size == 2) {
+                    // filter16() is internally limited to 4 bits, so:
+                    // FRAC_BITS=2 generates fewer instructions,
+                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
+                    // FRAC_BITS=6+ looks good
+                    FRAC_BITS = 6;
+                } else if (tmu.format.size == 4) {
+                    // filter32() is internally limited to 8 bits, so:
+                    // FRAC_BITS=4 looks good
+                    // FRAC_BITS=5+ looks better, but generates 3 extra
+                    // instructions per pixel
+                    FRAC_BITS = 6;
+                } else {
+                    // for all other cases we use 4 bits.
+                    FRAC_BITS = 4;
+                }
+            }
+            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
+            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
+
+            if (tmu.linear) {
+                comment("compute linear filtering offsets");
+                // pixel size scale
+                const int shift = 31 - gglClz(tmu.format.size);
+                U = scratches.obtain();
+                V = scratches.obtain();
+
+                if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                    return;
+
+                // sample the texel center
+                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
+                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
+
+                // get the fractional part of U,V
+                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
+                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
+
+                // compute width-1 and height-1
+                SUB(AL, 0, width,  width,  imm(1));
+                SUB(AL, 0, height, height, imm(1));
+
+                // get the integer part of U,V and clamp/wrap
+                // and compute offset to the next texel
+                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
+                    // u has already been REPEATed
+                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(MI, 0, u, width);                    
+                    CMP(AL, u, width);
+                    MOV(LT, 0, width, imm(1 << shift));
+                    if (shift)
+                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
+                    RSB(GE, 0, width, width, imm(0));
+                } else {
+                    // u has not been CLAMPed yet
+                    // algorithm:
+                    // if ((u>>4) >= width)
+                    //      u = width<<4
+                    //      width = 0
+                    // else
+                    //      width = 1<<shift
+                    // u = u>>4; // get integer part
+                    // if (u<0)
+                    //      u = 0
+                    //      width = 0
+                    // generated_vars.rt = width
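+                    //
+                    // mapping to the condition codes below: LE after the
+                    // CMP covers the (u>>FRAC_BITS) >= width case, and MI
+                    // after the flag-setting MOV (S bit set) covers u < 0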
+                    
+                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
+                    MOV(LE, 0, width, imm(0));
+                    MOV(GT, 0, width, imm(1 << shift));
+                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(MI, 0, u, imm(0));
+                    MOV(MI, 0, width, imm(0));
+                }
+                CONTEXT_STORE(width, generated_vars.rt);
+
+                const int stride = width;
+                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
+                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
+                    // v has already been REPEATed
+                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(MI, 0, v, height);
+                    CMP(AL, v, height);
+                    MOV(LT, 0, height, imm(1 << shift));
+                    if (shift)
+                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
+                    RSB(GE, 0, height, height, imm(0));
+                    MUL(AL, 0, height, stride, height);
+                } else {
+                    // v has not been CLAMPed yet
+                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
+                    MOV(LE, 0, height, imm(0));
+                    if (shift) {
+                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
+                    } else {
+                        MOV(GT, 0, height, stride);
+                    }
+                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(MI, 0, v, imm(0));
+                    MOV(MI, 0, height, imm(0));
+                }
+                CONTEXT_STORE(height, generated_vars.lb);
+            }
+    
+            scratches.recycle(width);
+            scratches.recycle(height);
+
+            // iterate texture coordinates...
+            comment("iterate s,t");
+            int dsdx = scratches.obtain();
+            int dtdx = scratches.obtain();
+
+            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                return;
+
+            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
+            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
+            ADD(AL, 0, s.reg, s.reg, dsdx);
+            ADD(AL, 0, t.reg, t.reg, dtdx);
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
+                scratches.recycle(s.reg);
+                scratches.recycle(t.reg);
+            }
+            scratches.recycle(dsdx);
+            scratches.recycle(dtdx);
+
+            // merge base & offset...
+            comment("merge base & offset");
+            texel.setTo(regs.obtain(), &tmu.format);
+            txPtr.setTo(texel.reg, tmu.bits);
+            int stride = scratches.obtain();
+
+            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                return;
+
+            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
+            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
+            SMLABB(AL, u, v, stride, u);    // u+v*stride 
+            base_offset(txPtr, txPtr, u);
+
+            // load texel
+            if (!tmu.linear) {
+                comment("fetch texel");
+                load(txPtr, texel, 0);
+            } else {
+                // recycle registers we don't need anymore
+                scratches.recycle(u);
+                scratches.recycle(v);
+                scratches.recycle(stride);
+
+                comment("fetch texel, bilinear");
+                switch (tmu.format.size) {
+                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                }
+            }            
+        }
+    }
+}
+
+void GGLAssembler::build_iterate_texture_coordinates(
+    const fragment_parts_t& parts)
+{
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        { // 1:1 textures
+            const pointer_t& txPtr = parts.coords[i].ptr;
+            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
+        } else {
+            Scratch scratches(registerFile());
+            int s = parts.coords[i].s.reg;
+            int t = parts.coords[i].t.reg;
+            if ((mOptLevel&1)==0) {
+                s = scratches.obtain();
+                t = scratches.obtain();
+                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
+                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
+            }
+            int dsdx = scratches.obtain();
+            int dtdx = scratches.obtain();
+            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
+            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
+            ADD(AL, 0, s, s, dsdx);
+            ADD(AL, 0, t, t, dtdx);
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
+            }
+        }
+    }
+}
+
+void GGLAssembler::filter8(
+        const fragment_parts_t& /*parts*/,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    if (tmu.format.components != GGL_ALPHA &&
+        tmu.format.components != GGL_LUMINANCE)
+    {
+        // this is a packed format, and we don't support
+        // linear filtering (it's probably RGB 332).
+        // This should not happen with OpenGL|ES.
+        LDRB(AL, texel.reg, txPtr.reg);
+        return;
+    }
+
+    // ------------------------
+    // about ~22 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int d    = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+    int rt   = scratches.obtain();
+    int lb   = scratches.obtain();
+
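+    // The four bilinear weights live in 2*FRAC_BITS fixed point and must
+    // sum to exactly 1<<(FRAC_BITS*2). 'k' tracks the unspent weight: it
+    // starts at the total minus the RB weight and has the LB weight
+    // subtracted, so the RT weight falls out as the remainder (k - LT
+    // weight), saving one multiply while keeping the sum exact.
+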
+    // RB -> U * V
+
+    CONTEXT_LOAD(rt, generated_vars.rt);
+    CONTEXT_LOAD(lb, generated_vars.lb);
+
+    int offset = pixel;
+    ADD(AL, 0, offset, lb, rt);
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    SMULBB(AL, d, pixel, u);
+    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
+    
+    // LB -> (1-U) * V
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
+    SMULBB(AL, u, U, V);
+    SMLABB(AL, d, pixel, u, d);
+    SUB(AL, 0, k, k, u);
+    
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDRB(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    SMLABB(AL, d, pixel, u, d);
+
+    // RT -> U*(1-V)
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
+    SUB(AL, 0, u, k, u);
+    SMLABB(AL, texel.reg, pixel, u, d);
+    
+    for (int i=0 ; i<4 ; i++) {
+        if (!texel.format.c[i].h) continue;
+        texel.format.c[i].h = FRAC_BITS*2+8;
+        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
+    }
+    texel.format.size = 4;
+    texel.format.bitsPerPixel = 32;
+    texel.flags |= CLEAR_LO;
+}
+
+void GGLAssembler::filter16(
+        const fragment_parts_t& /*parts*/,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{    
+    // compute the mask
+    // XXX: it would be nice if the mask below could be computed
+    // automatically.
+    uint32_t mask = 0;
+    int shift = 0;
+    int prec = 0;
+    switch (tmu.format_idx) {
+        case GGL_PIXEL_FORMAT_RGB_565:
+            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
+            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
+            mask = 0x07E0F81F;
+            shift = 16;
+            prec = 5;
+            break;
+        case GGL_PIXEL_FORMAT_RGBA_4444:
+            // 0000,1111,0000,1111 | 0000,1111,0000,1111
+            mask = 0x0F0F0F0F;
+            shift = 12;
+            prec = 4;
+            break;
+        case GGL_PIXEL_FORMAT_LA_88:
+            // 0000,0000,1111,1111 | 0000,0000,1111,1111
+            // AALL -> 00AA | 00LL
+            mask = 0x00FF00FF;
+            shift = 8;
+            prec = 8;
+            break;
+        default:
+            // unsupported format, do something sensible...
+            ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
+            LDRH(AL, texel.reg, txPtr.reg);
+            return;
+    }
+
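+    // The mask/shift pair set above implements a parallel multiply:
+    // ORR'ing the pixel with itself shifted left by 'shift' and AND'ing
+    // with 'mask' spreads the components across the 32-bit word with
+    // enough zero padding that a single multiply by a weight of at most
+    // 1<<prec scales every component at once, without carries spilling
+    // from one field into the next.
+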
+    const int adjust = FRAC_BITS*2 - prec;
+    const int round  = 0;
+
+    // update the texel format
+    texel.format.size = 4;
+    texel.format.bitsPerPixel = 32;
+    texel.flags |= CLEAR_HI|CLEAR_LO;
+    for (int i=0 ; i<4 ; i++) {
+        if (!texel.format.c[i].h) continue;
+        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
+        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
+        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
+    }
+
+    // ------------------------
+    // about ~40 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int d    = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+
+    // RB -> U * V
+    int offset = pixel;
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    CONTEXT_LOAD(u, generated_vars.lb);
+    ADD(AL, 0, offset, offset, u);
+
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MUL(AL, 0, d, pixel, u);
+    RSB(AL, 0, k, u, imm(1<<prec));
+    
+    // LB -> (1-U) * V
+    CONTEXT_LOAD(offset, generated_vars.lb);
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, d, pixel, u, d);
+    SUB(AL, 0, k, k, u);
+    
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDRH(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, d, pixel, u, d);
+
+    // RT -> U*(1-V)            
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SUB(AL, 0, u, k, u);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    MLA(AL, 0, texel.reg, pixel, u, d);
+}
+
+void GGLAssembler::filter24(
+        const fragment_parts_t& /*parts*/,
+        pixel_t& texel, const texture_unit_t& /*tmu*/,
+        int /*U*/, int /*V*/, pointer_t& txPtr,
+        int /*FRAC_BITS*/)
+{
+    // not supported yet (currently disabled)
+    load(txPtr, texel, 0);
+}
+
+void GGLAssembler::filter32(
+        const fragment_parts_t& /*parts*/,
+        pixel_t& texel, const texture_unit_t& /*tmu*/,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    const int adjust = FRAC_BITS*2 - 8;
+    const int round  = 0;
+
+    // ------------------------
+    // about ~38 cycles / pixel
+    Scratch scratches(registerFile());
+    
+    int pixel= scratches.obtain();
+    int dh   = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+
+    int temp = scratches.obtain();
+    int dl   = scratches.obtain();
+    int mask = scratches.obtain();
+
+    MOV(AL, 0, mask, imm(0xFF));
+    ORR(AL, 0, mask, mask, imm(0xFF0000));
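+    // mask = 0x00FF00FF: AND'ing the pixel (and pixel>>8) with it splits
+    // the 8888 texel into its even bytes (dh accumulator) and odd bytes
+    // (dl accumulator), so each MUL/MLA scales two 8-bit components in
+    // parallel; the two halves are recombined with the complementary
+    // masks at the end.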
+
+    // RB -> U * V
+    int offset = pixel;
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    CONTEXT_LOAD(u, generated_vars.lb);
+    ADD(AL, 0, offset, offset, u);
+
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MUL(AL, 0, dh, temp, u);
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MUL(AL, 0, dl, temp, u);
+    RSB(AL, 0, k, u, imm(0x100));
+
+    // LB -> (1-U) * V
+    CONTEXT_LOAD(offset, generated_vars.lb);
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+    SUB(AL, 0, k, k, u);
+
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDR(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+
+    // RT -> U*(1-V)            
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SUB(AL, 0, u, k, u);
+    AND(AL, 0, temp, mask, pixel);
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+
+    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
+    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
+    ORR(AL, 0, texel.reg, dh, dl);
+}
+
+void GGLAssembler::build_texture_environment(
+        component_t& fragment,
+        const fragment_parts_t& parts,
+        int component,
+        Scratch& regs)
+{
+    const uint32_t component_mask = 1<<component;
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+
+        if (tmu.mask & component_mask) {
+            // replace or modulate with this texture
+            if ((tmu.replaced & component_mask) == 0) {
+                // not replaced by a later tmu...
+
+                Scratch scratches(registerFile());
+                pixel_t texel(parts.texel[i]);
+
+                if (multiTexture && 
+                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
+                    tmu.twrap == GGL_NEEDS_WRAP_11)
+                {
+                    texel.reg = scratches.obtain();
+                    texel.flags |= CORRUPTIBLE;
+                    comment("fetch texel (multitexture 1:1)");
+                    load(parts.coords[i].ptr, texel, WRITE_BACK);
+                 }
+
+                component_t incoming(fragment);
+                modify(fragment, regs);
+                
+                switch (tmu.env) {
+                case GGL_REPLACE:
+                    extract(fragment, texel, component);
+                    break;
+                case GGL_MODULATE:
+                    modulate(fragment, incoming, texel, component);
+                    break;
+                case GGL_DECAL:
+                    decal(fragment, incoming, texel, component);
+                    break;
+                case GGL_BLEND:
+                    blend(fragment, incoming, texel, component, i);
+                    break;
+                case GGL_ADD:
+                    add(fragment, incoming, texel, component);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::wrapping(
+            int d,
+            int coord, int size,
+            int tx_wrap, int tx_linear)
+{
+    // notes:
+    // if tx_linear is set, we need 4 extra bits of precision on the result
+    // SMULL/UMULL is 3 cycles
+    Scratch scratches(registerFile());
+    int c = coord;
+    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
+        // UMULL takes 4 cycles (interlocked), and we can get away with
+        // 2 cycles using SMULWB, but we're losing 16 bits of precision
+        // out of 32 (this is not a problem because the iterator keeps
+        // its full precision)
+        // UMULL(AL, 0, size, d, c, size);
+        // note: we can't use SMULTB because it's signed.
+        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
+        SMULWB(AL, d, d, size);
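+        // net effect: d = (coord * size) >> (32 - tx_linear), reading the
+        // 32-bit coordinate as a 0.32 fraction of the repeat period and
+        // keeping tx_linear extra fraction bits for the filter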
+    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
+        if (tx_linear) {
+            // 1 cycle
+            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
+        } else {
+            // 4 cycles (common case)
+            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
+            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
+            CMP(AL, d, size);
+            SUB(GE, 0, d, size, imm(1));
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::modulate(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    Scratch locals(registerFile());
+    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);            
+    extract(texel, incomingTexel, component);
+
+    const int Nt = texel.size();
+        // Nt should always be less than 10 bits because it comes
+        // from the TMU.
+
+    int Ni = incoming.size();
+        // Ni could be big because it comes from previous MODULATEs
+
+    if (Nt == 1) {
+        // texel acts as a bit-mask
+        // dest = incoming & ((texel << incoming.h)-texel)
+        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
+        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
+        dest.l = incoming.l;
+        dest.h = incoming.h;
+        dest.flags |= (incoming.flags & CLEAR_LO);
+    } else if (Ni == 1) {
+        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
+        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
+        dest.l = 0;
+        dest.h = Nt;
+    } else {
+        int inReg = incoming.reg;
+        int shift = incoming.l;
+        if ((Nt + Ni) > 32) {
+            // we will overflow, reduce the precision of Ni to 8 bits
+            // (note that Nt is at most 10 bits, which happens with
+            // 565 textures and GGL_LINEAR)
+            shift += Ni-8;
+            Ni = 8;
+        }
+
+        // modulate by the component with the lowest precision
+        if (Nt >= Ni) {
+            if (shift) {
+                // XXX: we should be able to avoid this shift
+                // when shift==16 && Nt<16 && Ni<16, in which case
+                // we could use SMULBT below.
+                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
+                inReg = dest.reg;
+                shift = 0;
+            }
+            // operation:           (Cf*Ct)/((1<<Ni)-1)
+            // approximated with:   Cf*(Ct + Ct>>(Ni-1))>>Ni
+            // this operation doesn't change texel's size
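+            // endpoint check: if the Ni-bit operand is at its maximum
+            // (1<<Ni)-1, then v + (v>>(Ni-1)) == 1<<Ni, and the product
+            // shifted right by Ni returns the other operand unchanged,
+            // i.e. modulating by full intensity is exact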
+            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
+            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
+            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
+            dest.l = Ni;
+            dest.h = Nt + Ni;            
+        } else {
+            if (shift && (shift != 16)) {
+                // if shift==16, we can use 16-bits mul instructions later
+                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
+                inReg = dest.reg;
+                shift = 0;
+            }
+            // operation:           (Cf*Ct)/((1<<Nt)-1)
+            // approximated with:   Ct*(Cf + Cf>>(Nt-1))>>Nt
+            // this operation doesn't change incoming's size
+            Scratch scratches(registerFile());
+            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
+            if (t == inReg)
+                t = scratches.obtain();
+            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
+            if (Nt<16 && Ni<16) {
+                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
+                else            SMULBB(AL, dest.reg, t, inReg);
+            } else              MUL(AL, 0, dest.reg, t, inReg);
+            dest.l = Nt;
+            dest.h = Nt + Ni;
+        }
+
+        // low bits are not valid
+        dest.flags |= CLEAR_LO;
+
+        // no need to keep more than 8 bits/component
+        if (dest.size() > 8)
+            dest.l = dest.h-8;
+    }
+}
+
+void GGLAssembler::decal(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    // RGBA:
+    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
+    // Av = Af
+    Scratch locals(registerFile());
+    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);            
+    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
+    extract(texel, incomingTexel, component);
+    extract(factor, incomingTexel, GGLFormat::ALPHA);
+
+    // no need to keep more than 8 bits for decal
+    int Ni = incoming.size();
+    int shift = incoming.l;
+    if (Ni > 8) {
+        shift += Ni-8;
+        Ni = 8;
+    }
+    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
+    if (shift) {
+        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
+        incomingNorm.reg = dest.reg;
+        incomingNorm.flags |= CORRUPTIBLE;
+    }
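+    // normalize the factor: for an s-bit alpha, At + (At>>(s-1)) maps the
+    // all-ones value to exactly 1<<s, so full alpha selects the texel
+    // exactly (the same endpoint correction used in modulate and blend)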
+    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
+    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
+}
+
+void GGLAssembler::blend(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component, int tmu)
+{
+    // RGBA:
+    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
+    // Av = At*Af
+
+    if (component == GGLFormat::ALPHA) {
+        modulate(dest, incoming, incomingTexel, component);
+        return;
+    }
+    
+    Scratch locals(registerFile());
+    integer_t color(locals.obtain(), 8, CORRUPTIBLE);            
+    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
+    LDRB(AL, color.reg, mBuilderContext.Rctx,
+            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
+    extract(factor, incomingTexel, component);
+
+    // no need to keep more than 8 bits for blend
+    int Ni = incoming.size();
+    int shift = incoming.l;
+    if (Ni > 8) {
+        shift += Ni-8;
+        Ni = 8;
+    }
+    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
+    if (shift) {
+        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
+        incomingNorm.reg = dest.reg;
+        incomingNorm.flags |= CORRUPTIBLE;
+    }
+    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
+    build_blendOneMinusFF(dest, factor, incomingNorm, color);
+}
+
+void GGLAssembler::add(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    // RGBA:
+    // Cv = Cf + Ct;
+    Scratch locals(registerFile());
+    
+    component_t incomingTemp(incoming);
+
+    // use "dest" as a temporary for extracting the texel, unless "dest"
+    // overlaps "incoming".
+    integer_t texel(dest.reg, 32, CORRUPTIBLE);
+    if (dest.reg == incomingTemp.reg)
+        texel.reg = locals.obtain();
+    extract(texel, incomingTexel, component);
+
+    if (texel.s < incomingTemp.size()) {
+        expand(texel, texel, incomingTemp.size());
+    } else if (texel.s > incomingTemp.size()) {
+        if (incomingTemp.flags & CORRUPTIBLE) {
+            expand(incomingTemp, incomingTemp, texel.s);
+        } else {
+            incomingTemp.reg = locals.obtain();
+            expand(incomingTemp, incoming, texel.s);
+        }
+    }
+
+    if (incomingTemp.l) {
+        ADD(AL, 0, dest.reg, texel.reg,
+                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
+    } else {
+        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
+    }
+    dest.l = 0;
+    dest.h = texel.size();
+    component_sat(dest);
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/tinyutils/smartpointer.h b/libpixelflinger/codeflinger/tinyutils/smartpointer.h
new file mode 100644
index 0000000..23a5f7e
--- /dev/null
+++ b/libpixelflinger/codeflinger/tinyutils/smartpointer.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2005 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_PIXELFLINGER_SMART_POINTER_H
+#define ANDROID_PIXELFLINGER_SMART_POINTER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <stdlib.h>
+
+// ---------------------------------------------------------------------------
+namespace android {
+namespace tinyutils {
+
+// ---------------------------------------------------------------------------
+
+#define COMPARE(_op_)                                           \
+inline bool operator _op_ (const sp<T>& o) const {              \
+    return m_ptr _op_ o.m_ptr;                                  \
+}                                                               \
+inline bool operator _op_ (const T* o) const {                  \
+    return m_ptr _op_ o;                                        \
+}                                                               \
+template<typename U>                                            \
+inline bool operator _op_ (const sp<U>& o) const {              \
+    return m_ptr _op_ o.m_ptr;                                  \
+}                                                               \
+template<typename U>                                            \
+inline bool operator _op_ (const U* o) const {                  \
+    return m_ptr _op_ o;                                        \
+}
+
+// ---------------------------------------------------------------------------
+
+template <typename T>
+class sp
+{
+public:
+    inline sp() : m_ptr(0) { }
+
+    sp(T* other);  // NOLINT, implicit
+    sp(const sp<T>& other);
+    template<typename U> sp(U* other);  // NOLINT, implicit
+    template<typename U> sp(const sp<U>& other);  // NOLINT, implicit
+
+    ~sp();
+    
+    // Assignment
+
+    sp& operator = (T* other);
+    sp& operator = (const sp<T>& other);
+    
+    template<typename U> sp& operator = (const sp<U>& other);
+    template<typename U> sp& operator = (U* other);
+    
+    // Reset
+    void clear();
+    
+    // Accessors
+
+    inline  T&      operator* () const  { return *m_ptr; }
+    inline  T*      operator-> () const { return m_ptr;  }
+    inline  T*      get() const         { return m_ptr; }
+
+    // Operators
+        
+    COMPARE(==)
+    COMPARE(!=)
+    COMPARE(>)
+    COMPARE(<)
+    COMPARE(<=)
+    COMPARE(>=)
+
+private:    
+    template<typename Y> friend class sp;
+
+    T*              m_ptr;
+};
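+
+// Illustrative usage (MyObject is hypothetical; any type providing
+// incStrong()/decStrong() members that accept the referrer pointer,
+// as this library's refcounted classes do, will work):
+//
+//   sp<MyObject> p(new MyObject);   // takes a strong reference
+//   sp<MyObject> q = p;             // shares ownership
+//   q.clear();                      // releases q's reference
+//   // when p goes out of scope the last reference is released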
+
+// ---------------------------------------------------------------------------
+// No user serviceable parts below here.
+
+template<typename T>
+sp<T>::sp(T* other)
+    : m_ptr(other)
+{
+    if (other) other->incStrong(this);
+}
+
+template<typename T>
+sp<T>::sp(const sp<T>& other)
+    : m_ptr(other.m_ptr)
+{
+    if (m_ptr) m_ptr->incStrong(this);
+}
+
+template<typename T> template<typename U>
+sp<T>::sp(U* other) : m_ptr(other)
+{
+    if (other) other->incStrong(this);
+}
+
+template<typename T> template<typename U>
+sp<T>::sp(const sp<U>& other)
+    : m_ptr(other.m_ptr)
+{
+    if (m_ptr) m_ptr->incStrong(this);
+}
+
+template<typename T>
+sp<T>::~sp()
+{
+    if (m_ptr) m_ptr->decStrong(this);
+}
+
+template<typename T>
+sp<T>& sp<T>::operator = (const sp<T>& other) {
+    if (other.m_ptr) other.m_ptr->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other.m_ptr;
+    return *this;
+}
+
+template<typename T>
+sp<T>& sp<T>::operator = (T* other)
+{
+    if (other) other->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other;
+    return *this;
+}
+
+template<typename T> template<typename U>
+sp<T>& sp<T>::operator = (const sp<U>& other)
+{
+    if (other.m_ptr) other.m_ptr->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other.m_ptr;
+    return *this;
+}
+
+template<typename T> template<typename U>
+sp<T>& sp<T>::operator = (U* other)
+{
+    if (other) other->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other;
+    return *this;
+}
+
+template<typename T>
+void sp<T>::clear()
+{
+    if (m_ptr) {
+        m_ptr->decStrong(this);
+        m_ptr = 0;
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+} // namespace tinyutils
+} // namespace android
+
+// ---------------------------------------------------------------------------
+
+#endif // ANDROID_PIXELFLINGER_SMART_POINTER_H