Merge tag 'android-11.0.0_r16' into android-10.0

Android 11.0.0 release 16 - TWRP bringup patch
diff --git a/libpixelflinger/Android.bp b/libpixelflinger/Android.bp
new file mode 100644
index 0000000..5721394
--- /dev/null
+++ b/libpixelflinger/Android.bp
@@ -0,0 +1,105 @@
+//bootstrap_go_package {
+//    name: "soong-libpixelflingertwrp_defaults",
+//    pkgPath: "bootable/recovery/libpixelflinger",
+//    deps: [
+//        "soong",
+//        "soong-android",
+//        "soong-cc"
+//    ],
+//    srcs: [
+//        "libpixelflingertwrp_defaults.go"
+//    ],
+//    pluginFor: ["soong_build"]
+//}
+
+//libpixelflingertwrp_defaults {
+//    name: "libpixelflingertwrp_defaults"
+//}
+
+cc_defaults {
+    name: "libpixelflingertwrp_defaults",
+
+    cflags: [
+        "-fstrict-aliasing",
+        "-fomit-frame-pointer",
+        "-Wall",
+        "-Werror",
+        "-Wno-unused-function",
+    ],
+    export_include_dirs: ["include"],
+    header_libs: ["libbase_headers"],
+    shared_libs: [
+        "libcutils",
+        "liblog",
+        "libutils",
+    ],
+
+    arch: {
+        arm: {
+            neon: {
+                cflags: ["-D__ARM_HAVE_NEON"],
+            },
+        },
+    },
+}
+
+cc_library_static {
+    name: "libpixelflinger_twrp",
+    defaults: ["libpixelflingertwrp_defaults"],
+
+    srcs: [
+        "codeflinger/ARMAssemblerInterface.cpp",
+        "codeflinger/ARMAssemblerProxy.cpp",
+        "codeflinger/CodeCache.cpp",
+        "codeflinger/GGLAssembler.cpp",
+        "codeflinger/load_store.cpp",
+        "codeflinger/blending.cpp",
+        "codeflinger/texturing.cpp",
+        "format.cpp",
+        "clear.cpp",
+        "raster.cpp",
+        "buffer.cpp",
+    ],
+    whole_static_libs: ["libpixelflinger-arm"],
+
+    arch: {
+        arm: {
+            srcs: [
+                "codeflinger/ARMAssembler.cpp",
+                "codeflinger/disassem.c",
+                "col32cb16blend.S",
+                "t32cb16blend.S",
+            ],
+
+            neon: {
+                srcs: ["col32cb16blend_neon.S"],
+            },
+        },
+        arm64: {
+            srcs: [
+                "codeflinger/Arm64Assembler.cpp",
+                "codeflinger/Arm64Disassembler.cpp",
+                "arch-arm64/col32cb16blend.S",
+                "arch-arm64/t32cb16blend.S",
+            ],
+        },
+        mips: {
+            mips32r6: {
+                srcs: [
+                    "codeflinger/MIPSAssembler.cpp",
+                    "codeflinger/mips_disassem.c",
+                    "arch-mips/t32cb16blend.S",
+                ],
+            },
+        },
+        mips64: {
+            srcs: [
+                "codeflinger/MIPSAssembler.cpp",
+                "codeflinger/MIPS64Assembler.cpp",
+                "codeflinger/mips64_disassem.c",
+                "arch-mips64/col32cb16blend.S",
+                "arch-mips64/t32cb16blend.S",
+            ],
+        },
+    },
+}
diff --git a/libpixelflinger/Android.mk b/libpixelflinger/Android.mk.orig
similarity index 94%
rename from libpixelflinger/Android.mk
rename to libpixelflinger/Android.mk.orig
index 439c835..5a0bc79 100644
--- a/libpixelflinger/Android.mk
+++ b/libpixelflinger/Android.mk.orig
@@ -76,14 +76,6 @@
 	arch-arm64/col32cb16blend.S \
 	arch-arm64/t32cb16blend.S \
 
-ifndef ARCH_MIPS_REV6
-PIXELFLINGER_SRC_FILES_mips := \
-	codeflinger/MIPSAssembler.cpp \
-	codeflinger/mips_disassem.c \
-	arch-mips/t32cb16blend.S \
-
-endif
-
 #
 # Static library version
 #
diff --git a/libpixelflinger/MODULE_LICENSE_APACHE2 b/libpixelflinger/MODULE_LICENSE_APACHE2
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/libpixelflinger/MODULE_LICENSE_APACHE2
diff --git a/libpixelflinger/NOTICE b/libpixelflinger/NOTICE
new file mode 100644
index 0000000..c5b1efa
--- /dev/null
+++ b/libpixelflinger/NOTICE
@@ -0,0 +1,190 @@
+
+   Copyright (c) 2005-2008, The Android Open Source Project
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
diff --git a/libpixelflinger/arch-arm64/col32cb16blend.S b/libpixelflinger/arch-arm64/col32cb16blend.S
new file mode 100644
index 0000000..84596f9
--- /dev/null
+++ b/libpixelflinger/arch-arm64/col32cb16blend.S
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+    .text
+    .balign 0
+
+    .global scanline_col32cb16blend_arm64
+
+//
+// This function alpha blends a fixed color into a destination scanline, using
+// the formula:
+//
+//     d = s + (((0x100 - (a + (a >> 7))) * d) >> 8)
+//
+// where d is the destination pixel,
+//       s is the source color,
+//       a is the alpha channel of the source color.
+//
+
+// x0 = destination buffer pointer
+// w1 = color value
+// w2 = count
+
+
+scanline_col32cb16blend_arm64:
+
+    lsr         w5, w1, #24                     // shift down alpha
+    mov         w9, #0xff                       // create mask
+    add         w5, w5, w5, lsr #7              // add in top bit
+    mov         w4, #256                        // create #0x100
+    sub         w5, w4, w5                      // invert alpha
+    and         w10, w1, #0xff                  // extract red
+    and         w12, w9, w1, lsr #8             // extract green
+    and         w4,  w9, w1, lsr #16            // extract blue
+    lsl         w10, w10, #5                    // prescale red
+    lsl         w12, w12, #6                    // prescale green
+    lsl         w4,  w4,  #5                    // prescale blue
+    lsr         w9,  w9,  #2                    // create dest green mask
+
+1:
+    ldrh        w8, [x0]                        // load dest pixel
+    subs        w2, w2, #1                      // decrement loop counter
+    lsr         w6, w8, #11                     // extract dest red
+    and         w7, w9, w8, lsr #5              // extract dest green
+    and         w8, w8, #0x1f                   // extract dest blue
+
+    madd        w6, w6, w5, w10                 // dest red * alpha + src red
+    madd        w7, w7, w5, w12                 // dest green * alpha + src green
+    madd        w8, w8, w5, w4                  // dest blue * alpha + src blue
+
+    lsr         w6, w6, #8                      // shift down red
+    lsr         w7, w7, #8                      // shift down green
+    lsl         w6, w6, #11                     // shift red into 565
+    orr         w6, w6, w7, lsl #5              // shift green into 565
+    orr         w6, w6, w8, lsr #8              // shift blue into 565
+
+    strh        w6, [x0], #2                    // store pixel to dest, update ptr
+    b.ne        1b                              // if count != 0, loop
+
+    ret
+
+
+
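For reference, the routine above is easier to follow as scalar C++. The sketch below is illustrative only and not part of this patch (the helper name scanline_col32cb16blend_c is made up); it mirrors the arm64 code step for step, including the prescaled source channels and the absence of per-channel clamping:

    #include <cstddef>
    #include <cstdint>

    // Blend a fixed 0xAABBGGRR color over an RGB565 scanline of 'count' pixels.
    static void scanline_col32cb16blend_c(uint16_t* dst, uint32_t color, size_t count)
    {
        const uint32_t a  = color >> 24;
        const uint32_t f  = 0x100 - (a + (a >> 7));       // inverted alpha, as in the asm
        const uint32_t sr = (color & 0xff) << 5;          // prescaled source red
        const uint32_t sg = ((color >> 8) & 0xff) << 6;   // prescaled source green
        const uint32_t sb = ((color >> 16) & 0xff) << 5;  // prescaled source blue

        while (count--) {
            const uint32_t d = *dst;
            const uint32_t r = (((d >> 11) & 0x1f) * f + sr) >> 8;  // 5-bit red
            const uint32_t g = (((d >> 5)  & 0x3f) * f + sg) >> 8;  // 6-bit green
            const uint32_t b = ((d & 0x1f) * f + sb) >> 8;          // 5-bit blue
            *dst++ = (uint16_t)((r << 11) | (g << 5) | b);
        }
    }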
diff --git a/libpixelflinger/arch-arm64/t32cb16blend.S b/libpixelflinger/arch-arm64/t32cb16blend.S
new file mode 100644
index 0000000..a9733c0
--- /dev/null
+++ b/libpixelflinger/arch-arm64/t32cb16blend.S
@@ -0,0 +1,213 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+    .text
+    .balign 0
+
+    .global scanline_t32cb16blend_arm64
+
+/*
+ * .macro pixel
+ *
+ *  This macro alpha blends an original RGB565 pixel, located in either the
+ *  top or bottom 16 bits of the DREG register, with the 32-bit SRC pixel
+ *  value, and writes the result to the FB register.
+ *
+ * \DREG is a 32-bit register containing *two* original destination RGB565
+ *       pixels, with the even one in the low-16 bits, and the odd one in the
+ *       high 16 bits.
+ *
+ * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
+ *
+ * \FB is a target register that will contain the blended pixel values.
+ *
+ * \ODD is either 0 or 1 and indicates if we're blending the lower or
+ *      upper 16-bit pixels in DREG into FB
+ *
+ *
+ * clobbered: w6, w7, w15, w16, w17
+ *
+ */
+
+.macro pixel,   DREG, SRC, FB, ODD
+
+    // SRC = 0xAABBGGRR
+    lsr     w7, \SRC, #24               // sA
+    add     w7, w7, w7, lsr #7          // sA + (sA >> 7)
+    mov     w6, #0x100
+    sub     w7, w6, w7                  // sA = 0x100 - (sA+(sA>>7))
+
+1:
+
+.if \ODD //Blending odd pixel present in top 16 bits of DREG register
+
+    // red
+    lsr     w16, \DREG, #(16 + 11)
+    mul     w16, w7, w16
+    lsr     w6, \SRC, #3
+    and     w6, w6, #0x1F
+    add     w16, w6, w16, lsr #8
+    cmp     w16, #0x1F
+    orr     w17, \FB, #(0x1F<<(16 + 11))
+    orr     w15, \FB, w16, lsl #(16 + 11)
+    csel    \FB, w17, w15, hi
+        // green
+        and     w6, \DREG, #(0x3F<<(16 + 5))
+        lsr     w17,w6,#(16+5)
+        mul     w6, w7, w17
+        lsr     w16, \SRC, #(8+2)
+        and     w16, w16, #0x3F
+        add     w6, w16, w6, lsr #8
+        cmp     w6, #0x3F
+        orr     w17, \FB, #(0x3F<<(16 + 5))
+        orr     w15, \FB, w6, lsl #(16 + 5)
+        csel    \FB, w17, w15, hi
+            // blue
+            and     w16, \DREG, #(0x1F << 16)
+            lsr     w17,w16,#16
+            mul     w16, w7, w17
+            lsr     w6, \SRC, #(8+8+3)
+            and     w6, w6, #0x1F
+            add     w16, w6, w16, lsr #8
+            cmp     w16, #0x1F
+            orr     w17, \FB, #(0x1F << 16)
+            orr     w15, \FB, w16, lsl #16
+            csel    \FB, w17, w15, hi
+
+.else //Blending even pixel present in bottom 16 bits of DREG register
+
+    // red
+    lsr     w16, \DREG, #11
+    and     w16, w16, #0x1F
+    mul     w16, w7, w16
+    lsr     w6, \SRC, #3
+    and     w6, w6, #0x1F
+    add     w16, w6, w16, lsr #8
+    cmp     w16, #0x1F
+    mov     w17, #(0x1F<<11)
+    lsl     w15, w16, #11
+    csel    \FB, w17, w15, hi
+
+
+        // green
+        and     w6, \DREG, #(0x3F<<5)
+        mul     w6, w7, w6
+        lsr     w16, \SRC, #(8+2)
+        and     w16, w16, #0x3F
+        add     w6, w16, w6, lsr #(5+8)
+        cmp     w6, #0x3F
+        orr     w17, \FB, #(0x3F<<5)
+        orr     w15, \FB, w6, lsl #5
+        csel    \FB, w17, w15, hi
+
+            // blue
+            and     w16, \DREG, #0x1F
+            mul     w16, w7, w16
+            lsr     w6, \SRC, #(8+8+3)
+            and     w6, w6, #0x1F
+            add     w16, w6, w16, lsr #8
+            cmp     w16, #0x1F
+            orr     w17, \FB, #0x1F
+            orr     w15, \FB, w16
+            csel    \FB, w17, w15, hi
+
+.endif // End of blending even pixel
+
+.endm // End of pixel macro
+
+
+// x0:  dst ptr
+// x1:  src ptr
+// w2:  count
+// w3:  d
+// w4:  s0
+// w5:  s1
+// w6:  pixel
+// w7:  pixel
+// w8:  free
+// w9:  free
+// w10: free
+// w11: free
+// w12: scratch
+// w14: pixel
+
+scanline_t32cb16blend_arm64:
+
+    // align DST to 32 bits
+    tst     x0, #0x3
+    b.eq    aligned
+    subs    w2, w2, #1
+    b.lo    return
+
+last:
+    ldr     w4, [x1], #4
+    ldrh    w3, [x0]
+    pixel   w3, w4, w12, 0
+    strh    w12, [x0], #2
+
+aligned:
+    subs    w2, w2, #2
+    b.lo    9f
+
+    // The main loop is unrolled twice and processes 4 pixels
+8:
+    ldp   w4,w5, [x1], #8
+    add     x0, x0, #4
+    // it's all zero, skip this pixel
+    orr     w3, w4, w5
+    cbz     w3, 7f
+
+    // load the destination
+    ldr     w3, [x0, #-4]
+    // stream the destination
+    pixel   w3, w4, w12, 0
+    pixel   w3, w5, w12, 1
+    str     w12, [x0, #-4]
+
+    // 2nd iteration of the loop, don't stream anything
+    subs    w2, w2, #2
+    csel    w4, w5, w4, lt
+    blt     9f
+    ldp     w4,w5, [x1], #8
+    add     x0, x0, #4
+    orr     w3, w4, w5
+    cbz     w3, 7f
+    ldr     w3, [x0, #-4]
+    pixel   w3, w4, w12, 0
+    pixel   w3, w5, w12, 1
+    str     w12, [x0, #-4]
+
+7:  subs    w2, w2, #2
+    bhs     8b
+    mov     w4, w5
+
+9:  adds    w2, w2, #1
+    b.lo    return
+    b       last
+
+return:
+    ret
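The pixel macro above reduces to a short scalar operation per pixel. A C++ sketch of one even-pixel blend follows (blend_one_t32cb16 is a hypothetical helper name, not part of this patch), with the same per-channel saturation the cmp/csel pairs implement:

    #include <cstdint>

    // Blend one 0xAABBGGRR source pixel over one RGB565 destination pixel.
    static inline uint16_t blend_one_t32cb16(uint32_t s, uint16_t d)
    {
        const uint32_t a = s >> 24;
        const uint32_t f = 0x100 - (a + (a >> 7));    // inverted alpha

        uint32_t r = ((s >> 3)  & 0x1f) + ((((d >> 11) & 0x1f) * f) >> 8);
        uint32_t g = ((s >> 10) & 0x3f) + ((((d >> 5)  & 0x3f) * f) >> 8);
        uint32_t b = ((s >> 19) & 0x1f) + (((d         & 0x1f) * f) >> 8);

        if (r > 0x1f) r = 0x1f;    // saturate, like the cmp/csel sequences
        if (g > 0x3f) g = 0x3f;
        if (b > 0x1f) b = 0x1f;
        return (uint16_t)((r << 11) | (g << 5) | b);
    }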
diff --git a/libpixelflinger/arch-mips/col32cb16blend.S b/libpixelflinger/arch-mips/col32cb16blend.S
new file mode 100644
index 0000000..810294c
--- /dev/null
+++ b/libpixelflinger/arch-mips/col32cb16blend.S
@@ -0,0 +1,134 @@
+/*
+** Copyright 2015, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+       .macro pixel dreg src f sR sG sB shift
+
+#if __mips==32 && __mips_isa_rev>=2
+       /* extract red */
+       ext $t4,\src,\shift+11,5
+       mul $t4,$t4,\f
+
+       /* extract green */
+       ext $t5,\src,\shift+5,6
+       mul $t5,$t5,\f
+
+       /* extract blue */
+       ext $t6,\src,\shift,5
+       mul $t6,$t6,\f
+#else
+       /* extract red */
+       srl $t4,\src,\shift+11
+       andi $t4, 0x1f
+       mul $t4,$t4,\f
+
+       /* extract green */
+       srl $t5,\src,\shift+5
+       andi $t5, 0x3f
+       mul $t5,$t5,\f
+
+       /* extract blue */
+       srl $t6,\src,\shift
+       andi $t6, 0x1f
+       mul $t6,$t6,\f
+#endif
+
+       srl $t4,$t4,8
+       srl $t5,$t5,8
+       srl $t6,$t6,8
+       addu $t4,$t4,\sR
+       addu $t5,$t5,\sG
+       addu \dreg,$t6,\sB
+       sll $t4,$t4,11
+       sll $t5,$t5,5
+       or \dreg,\dreg,$t4
+       or \dreg,\dreg,$t5
+       andi \dreg, 0xffff
+       .endm
+
+       .text
+       .balign 4
+
+       .global scanline_col32cb16blend_mips
+       .ent    scanline_col32cb16blend_mips
+scanline_col32cb16blend_mips:
+
+       /* check if count is zero */
+       srl     $v0,$a1,24 /* sA */
+       beqz    $a2,done
+       li      $t4, 0x100
+       srl     $v1,$v0,7
+       addu    $v0,$v1,$v0
+       subu    $v0,$t4,$v0 /* f */
+#if __mips==32 && __mips_isa_rev>=2
+       ext     $a3,$a1,3,5 /* sR */
+       ext     $t0,$a1,10,6 /* sG */
+       ext     $t1,$a1,19,5 /* sB */
+#else
+       srl     $a3, $a1, 3
+       andi    $a3, 0x1f    /* sR */
+       srl     $t0, $a1, 10
+       andi    $t0, 0x3f    /* sG */
+       srl     $t1, $a1, 19
+       andi    $t1, 0x1f    /* sB */
+#endif
+
+       /* check if cnt is at least 4 */
+       addiu   $a2,$a2,-4
+       bltz    $a2,tail
+
+loop_4pixels:
+       lw      $t7,0($a0)
+       lw      $t8,4($a0)
+       addiu   $a0,$a0,8
+       addiu   $a2,$a2,-4
+       pixel   $t2 $t7 $v0 $a3 $t0 $t1 0
+       pixel   $t3 $t7 $v0 $a3 $t0 $t1 16
+#if __mips==32 && __mips_isa_rev>=2
+       ins     $t2,$t3,16,16
+#else
+       sll $t3, 16
+       or  $t2, $t2, $t3
+#endif
+       pixel   $t7 $t8 $v0 $a3 $t0 $t1 0
+       pixel   $t3 $t8 $v0 $a3 $t0 $t1 16
+#if __mips==32 && __mips_isa_rev>=2
+       ins     $t7,$t3,16,16
+#else
+       sll $t3, 16
+       or  $t7, $t7, $t3
+#endif
+       sw      $t2,-8($a0)
+       sw      $t7,-4($a0)
+       bgez    $a2, loop_4pixels
+
+tail:
+       /* the pixel count underran, restore it now */
+       addiu   $a2,$a2,4
+
+       /* handle the last 0..3 pixels */
+       beqz    $a2,done
+
+loop_1pixel:
+       lhu     $t7,0($a0)
+       addiu   $a0,$a0,2
+       addiu   $a2,$a2,-1
+       pixel   $t2 $t7 $v0 $a3 $t0 $t1 0
+       sh      $t2, -2($a0)
+       bnez    $a2,loop_1pixel
+
+done:
+       j       $ra
+       .end    scanline_col32cb16blend_mips
diff --git a/libpixelflinger/arch-mips/t32cb16blend.S b/libpixelflinger/arch-mips/t32cb16blend.S
new file mode 100644
index 0000000..1d2fb8f
--- /dev/null
+++ b/libpixelflinger/arch-mips/t32cb16blend.S
@@ -0,0 +1,273 @@
+/* libs/pixelflinger/t32cb16blend.S
+**
+** Copyright 2010, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifdef DEBUG
+#define DBG
+#else
+#define DBG #
+#endif
+
+/*
+ * blend one of 2 16bpp RGB pixels held in dreg selected by shift
+ * with the 32bpp ABGR pixel held in src and store the result in fb
+ *
+ * Assumes that the dreg data is little endian and that
+ * the the second pixel (shift==16) will be merged into
+ * the fb result
+ *
+ * Uses $t0,$t6,$t7,$t8
+ */
+
+#if __mips==32 && __mips_isa_rev>=2
+    .macro pixel dreg src fb shift
+    /*
+     * sA = s >> 24
+     * f = 0x100 - (sA + (sA>>7))
+     */
+DBG .set    noat
+DBG rdhwr   $at,$2
+DBG .set    at
+
+    srl  $t7,\src,24
+    srl  $t6,$t7,7
+    addu $t7,$t6
+    li   $t6,0x100
+    subu $t7,$t6,$t7
+
+    /* red */
+    ext  $t8,\dreg,\shift+6+5,5         # dst[\shift:15..11]
+    mul  $t6,$t8,$t7
+    ext  $t0,\dreg,\shift+5,6           # start green extraction dst[\shift:10..5]
+    ext  $t8,\src,3,5               # src[7..3]
+    srl  $t6,8
+    addu $t8,$t6
+.if \shift!=0
+    sll  $t8,\shift+11
+    or   \fb,$t8
+.else
+    sll  \fb,$t8,11
+.endif
+
+    /* green */
+    mul  $t8,$t0,$t7
+    ext  $t0,\dreg,\shift,5         # start blue extraction dst[\shift:4..0]
+    ext  $t6,\src,2+8,6             # src[15..10]
+    srl  $t8,8
+    addu $t8,$t6
+
+    /* blue */
+    mul  $t0,$t0,$t7
+    sll  $t8, $t8, \shift+5
+    or   \fb, \fb, $t8
+    ext  $t6,\src,(3+8+8),5
+    srl  $t8,$t0,8
+    addu $t8,$t6
+    sll  $t8, $t8, \shift
+    or   \fb, \fb, $t8
+
+DBG .set    noat
+DBG rdhwr $t8,$2
+DBG subu  $t8,$at
+DBG sltu  $at,$t8,$v0
+DBG movn  $v0,$t8,$at
+DBG sgtu  $at,$t8,$v1
+DBG movn  $v1,$t8,$at
+DBG .set    at
+    .endm
+
+#else
+
+    .macro pixel dreg src fb shift
+    /*
+     * sA = s >> 24
+     * f = 0x100 - (sA + (sA>>7))
+     */
+DBG .set    push
+DBG .set    noat
+DBG .set    mips32r2
+DBG rdhwr   $at,$2
+DBG .set    pop
+
+    srl  $t7,\src,24
+    srl  $t6,$t7,7
+    addu $t7,$t6
+    li   $t6,0x100
+    subu $t7,$t6,$t7
+
+    /*
+     * red
+     * dR = (d >> (6 + 5)) & 0x1f;
+     * dR = (f*dR)>>8
+     * sR = (s >> (   3)) & 0x1f;
+     * sR += dR
+     * fb |= sR << 11
+     */
+    srl  $t8,\dreg,\shift+6+5
+.if \shift==0
+    and  $t8,0x1f
+.endif
+    mul  $t8,$t8,$t7
+    srl  $t6,\src,3
+    and  $t6,0x1f
+    srl  $t8,8
+    addu $t8,$t6
+.if \shift!=0
+    sll  $t8,\shift+11
+    or   \fb,$t8
+.else
+    sll  \fb,$t8,11
+.endif
+
+        /*
+     * green
+     * dG = (d >> 5) & 0x3f
+     * dG = (f*dG) >> 8
+     * sG = (s >> ( 8+2))&0x3F;
+     */
+    srl  $t8,\dreg,\shift+5
+    and  $t8,0x3f
+    mul  $t8,$t8,$t7
+    srl  $t6,\src,8+2
+    and  $t6,0x3f
+    srl  $t8,8
+    addu $t8,$t6
+    sll  $t8,\shift + 5
+    or   \fb,$t8
+
+    /* blue */
+.if \shift!=0
+    srl  $t8,\dreg,\shift
+    and  $t8,0x1f
+.else
+    and  $t8,\dreg,0x1f
+.endif
+    mul  $t8,$t8,$t7
+    srl  $t6,\src,(8+8+3)
+    and  $t6,0x1f
+    srl  $t8,8
+    addu $t8,$t6
+.if \shift!=0
+    sll  $t8,\shift
+.endif
+    or   \fb,$t8
+DBG .set    push
+DBG .set    noat
+DBG .set    mips32r2
+DBG rdhwr   $t8,$2
+DBG subu    $t8,$at
+DBG sltu    $at,$t8,$v0
+DBG movn    $v0,$t8,$at
+DBG sgtu    $at,$t8,$v1
+DBG movn    $v1,$t8,$at
+DBG .set    pop
+    .endm
+#endif
+
+    .text
+    .balign 4
+
+    .global scanline_t32cb16blend_mips
+    .ent    scanline_t32cb16blend_mips
+scanline_t32cb16blend_mips:
+DBG li    $v0,0xffffffff
+DBG li    $v1,0
+    /* Align the destination if necessary */
+    and   $t0,$a0,3
+    beqz  $t0,aligned
+
+    /* as long as there is at least one pixel */
+    beqz  $a2,done
+
+    lw    $t4,($a1)
+    addu  $a0,2
+    addu  $a1,4
+    beqz  $t4,1f
+    lhu   $t3,-2($a0)
+    pixel $t3,$t4,$t1,0
+    sh    $t1,-2($a0)
+1:  subu  $a2,1
+
+aligned:
+    /* Check to see if it's worth unrolling the loop */
+    subu  $a2,4
+    bltz  $a2,tail
+
+    /* Process 4 pixels at a time */
+fourpixels:
+    /* 1st pair of pixels */
+    lw    $t4,0($a1)
+    lw    $t5,4($a1)
+    addu  $a0,8
+    addu  $a1,16
+
+    /* both are zero, skip this pair */
+    or    $t3,$t4,$t5
+    beqz  $t3,1f
+
+    /* load the destination */
+    lw    $t3,-8($a0)
+
+    pixel $t3,$t4,$t1,0
+    andi  $t1, 0xFFFF
+    pixel $t3,$t5,$t1,16
+    sw    $t1,-8($a0)
+
+1:
+    /* 2nd pair of pixels */
+    lw    $t4,-8($a1)
+    lw    $t5,-4($a1)
+
+    /* both are zero, skip this pair */
+    or    $t3,$t4,$t5
+    beqz  $t3,1f
+
+    /* load the destination */
+    lw    $t3,-4($a0)
+
+    pixel $t3,$t4,$t1,0
+    andi  $t1, 0xFFFF
+    pixel $t3,$t5,$t1,16
+    sw    $t1,-4($a0)
+
+1:  subu  $a2,4
+    bgtz  $a2,fourpixels
+
+tail:
+    /* the pixel count underran, restore it now */
+    addu  $a2,4
+
+    /* handle the last 0..3 pixels */
+    beqz  $a2,done
+onepixel:
+    lw    $t4,($a1)
+    addu  $a0,2
+    addu  $a1,4
+    beqz  $t4,1f
+    lhu   $t3,-2($a0)
+    pixel $t3,$t4,$t1,0
+    sh    $t1,-2($a0)
+1:  subu  $a2,1
+    bnez  $a2,onepixel
+done:
+DBG .set    push
+DBG .set    mips32r2
+DBG rdhwr   $a0,$3
+DBG mul     $v0,$a0
+DBG mul     $v1,$a0
+DBG .set    pop
+    j     $ra
+    .end    scanline_t32cb16blend_mips
diff --git a/libpixelflinger/arch-mips64/col32cb16blend.S b/libpixelflinger/arch-mips64/col32cb16blend.S
new file mode 100644
index 0000000..5baffb1
--- /dev/null
+++ b/libpixelflinger/arch-mips64/col32cb16blend.S
@@ -0,0 +1,108 @@
+/*
+** Copyright 2015, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+    .macro pixel dreg src f sR sG sB shift
+
+    /* extract red */
+.if \shift < 32
+    dext   $t0,\src,\shift+11,5
+.else
+    dextu  $t0,\src,\shift+11,5
+.endif
+    mul    $t0,$t0,\f
+
+    /* extract green */
+.if \shift < 32
+    dext   $t1,\src,\shift+5,6
+.else
+    dextu  $t1,\src,\shift+5,6
+.endif
+    mul    $t1,$t1,\f
+
+    /* extract blue */
+.if \shift < 32
+    dext   $t2,\src,\shift,5
+.else
+    dextu  $t2,\src,\shift,5
+.endif
+    mul    $t2,$t2,\f
+
+    srl    $t0,$t0,8
+    srl    $t1,$t1,8
+    srl    $t2,$t2,8
+    addu   $t0,$t0,\sR
+    addu   $t1,$t1,\sG
+    addu   \dreg,$t2,\sB
+    sll    $t0,$t0,11
+    sll    $t1,$t1,5
+    or     \dreg,\dreg,$t0
+    or     \dreg,\dreg,$t1
+    .endm
+
+    .text
+    .balign 4
+
+    .global scanline_col32cb16blend_mips64
+    .ent    scanline_col32cb16blend_mips64
+scanline_col32cb16blend_mips64:
+
+    /* check if count is zero */
+    srl     $v0,$a1,24 /* sA */
+    beqz    $a2,done
+    li      $t0, 0x100
+    srl     $v1,$v0,7
+    addu    $v0,$v1,$v0
+    subu    $v0,$t0,$v0 /* f */
+    ext     $a3,$a1,3,5 /* sR */
+    ext     $a4,$a1,10,6 /* sG */
+    ext     $a5,$a1,19,5 /* sB */
+
+    /* check if cnt is at least 4 */
+    addiu   $a2,$a2,-4
+    bltz    $a2,tail
+
+loop_4pixels:
+    ld      $t3,0($a0)
+    daddiu  $a0,$a0,8
+    addiu   $a2,$a2,-4
+    pixel   $a6 $t3 $v0 $a3 $a4 $a5 0
+    pixel   $a7 $t3 $v0 $a3 $a4 $a5 16
+    pixel   $t8 $t3 $v0 $a3 $a4 $a5 32
+    pixel   $t9 $t3 $v0 $a3 $a4 $a5 48
+    dins    $a6,$a7,16,16
+    dinsu   $a6,$t8,32,16
+    dinsu   $a6,$t9,48,16
+    sd      $a6,-8($a0)
+    bgez    $a2, loop_4pixels
+
+tail:
+    /* the pixel count underran, restore it now */
+    addiu   $a2,$a2,4
+
+    /* handle the last 0..3 pixels */
+    beqz    $a2,done
+
+loop_1pixel:
+    lhu     $t3,0($a0)
+    daddiu  $a0,$a0,2
+    addiu   $a2,$a2,-1
+    pixel   $a6 $t3 $v0 $a3 $a4 $a5 0
+    sh      $a6, -2($a0)
+    bnez    $a2,loop_1pixel
+
+done:
+    j       $ra
+    .end    scanline_col32cb16blend_mips64
diff --git a/libpixelflinger/arch-mips64/t32cb16blend.S b/libpixelflinger/arch-mips64/t32cb16blend.S
new file mode 100644
index 0000000..3cb5f93
--- /dev/null
+++ b/libpixelflinger/arch-mips64/t32cb16blend.S
@@ -0,0 +1,172 @@
+/*
+** Copyright 2015, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifdef DEBUG
+#define DBG
+#else
+#define DBG #
+#endif
+
+/*
+ * blend one of 2 16bpp RGB pixels held in dreg selected by shift
+ * with the 32bpp ABGR pixel held in src and store the result in fb
+ *
+ * Assumes that the dreg data is little-endian and that
+ * the second pixel (shift==16) will be merged into
+ * the fb result
+ *
+ * Uses $a4,$t2,$t3,$t8
+ */
+
+    .macro pixel dreg src fb shift
+    /*
+     * sA = s >> 24
+     * f = 0x100 - (sA + (sA>>7))
+     */
+    srl     $t3,\src,24
+    srl     $t2,$t3,7
+    addu    $t3,$t2
+    li      $t2,0x100
+    subu    $t3,$t2,$t3
+
+    /* red */
+    ext     $t8,\dreg,\shift+6+5,5                  # dst[\shift:15..11]
+    mul     $t2,$t8,$t3
+    ext     $a4,\dreg,\shift+5,6                    # start green extraction dst[\shift:10..5]
+    ext     $t8,\src,3,5                            # src[7..3]
+    srl     $t2,8
+    addu    $t8,$t2
+.if \shift!=0
+    sll     $t8,\shift+11                           # dst[\shift:15..11]
+    or      \fb,$t8
+.else
+    sll     \fb,$t8,11
+.endif
+
+    /* green */
+    mul     $t8,$a4,$t3
+    ext     $a4,\dreg,\shift,5                      # start blue extraction dst[\shift:4..0]
+    ext     $t2,\src,2+8,6                          # src[15..10]
+    srl     $t8,8
+    addu    $t8,$t2
+
+    /* blue */
+    mul     $a4,$a4,$t3
+    sll     $t8, $t8, \shift+5                  # finish green insertion dst[\shift:10..5]
+    or      \fb, \fb, $t8
+    ext     $t2,\src,(3+8+8),5
+    srl     $t8,$a4,8
+    addu    $t8,$t2
+    sll     $t8, $t8, \shift
+    or      \fb, \fb, $t8
+    .endm
+
+    .text
+    .balign 4
+
+    .global scanline_t32cb16blend_mips64
+    .ent    scanline_t32cb16blend_mips64
+scanline_t32cb16blend_mips64:
+    daddiu  $sp, $sp, -40
+DBG li      $v0,0xffffffff
+DBG li      $v1,0
+    /* Align the destination if necessary */
+    and     $a4,$a0,3
+    beqz    $a4,aligned
+
+    /* as long as there is at least one pixel */
+    beqz    $a2,done
+
+    lw      $t0,($a1)
+    daddu   $a0,2
+    daddu   $a1,4
+    beqz    $t0,1f
+    lhu     $a7,-2($a0)
+    pixel   $a7,$t0,$a5,0
+    sh      $a5,-2($a0)
+1:  subu    $a2,1
+
+aligned:
+    /* Check to see if it's worth unrolling the loop */
+    subu    $a2,4
+    bltz    $a2,tail
+
+    /* Process 4 pixels at a time */
+fourpixels:
+    /* 1st pair of pixels */
+    lw      $t0,0($a1)
+    lw      $t1,4($a1)
+    daddu   $a0,8
+    daddu   $a1,16
+
+    /* both are zero, skip this pair */
+    or      $a7,$t0,$t1
+    beqz    $a7,1f
+
+    /* load the destination */
+    lw      $a7,-8($a0)
+
+    pixel   $a7,$t0,$a5,0
+    andi    $a5, 0xFFFF
+    pixel   $a7,$t1,$a5,16
+    sw      $a5,-8($a0)
+
+1:
+    /* 2nd pair of pixels */
+    lw      $t0,-8($a1)
+    lw      $t1,-4($a1)
+
+    /* both are zero, skip this pair */
+    or      $a7,$t0,$t1
+    beqz    $a7,1f
+
+    /* load the destination */
+    lw      $a7,-4($a0)
+
+    pixel   $a7,$t0,$a5,0
+    andi    $a5, 0xFFFF
+    pixel   $a7,$t1,$a5,16
+    sw      $a5,-4($a0)
+
+1:  subu    $a2,4
+    bgtz    $a2,fourpixels
+
+tail:
+    /* the pixel count underran, restore it now */
+    addu    $a2,4
+
+    /* handle the last 0..3 pixels */
+    beqz    $a2,done
+onepixel:
+    lw      $t0,($a1)
+    daddu   $a0,2
+    daddu   $a1,4
+    beqz    $t0,1f
+    lhu     $a7,-2($a0)
+    pixel   $a7,$t0,$a5,0
+    sh      $a5,-2($a0)
+1:  subu    $a2,1
+    bnez    $a2,onepixel
+done:
+DBG .set    push
+DBG .set    mips32r2
+DBG rdhwr   $a0,$3
+DBG mul     $v0,$a0
+DBG mul     $v1,$a0
+DBG .set    pop
+    daddiu  $sp, $sp, 40
+    j       $ra
+    .end    scanline_t32cb16blend_mips64
diff --git a/libpixelflinger/buffer.cpp b/libpixelflinger/buffer.cpp
new file mode 100644
index 0000000..ea9514c
--- /dev/null
+++ b/libpixelflinger/buffer.cpp
@@ -0,0 +1,389 @@
+/* libs/pixelflinger/buffer.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <assert.h>
+
+#include <android-base/macros.h>
+
+#include "buffer.h"
+
+namespace android {
+// ----------------------------------------------------------------------------
+
+static void read_pixel(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel);
+static void write_pixel(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, const pixel_t* pixel);
+static void readRGB565(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel);
+static void readABGR8888(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel);
+
+static uint32_t logic_op(int op, uint32_t s, uint32_t d);
+static uint32_t extract(uint32_t v, int h, int l, int bits);
+static uint32_t expand(uint32_t v, int sbits, int dbits);
+static uint32_t downshift_component(uint32_t in, uint32_t v,
+        int sh, int sl, int dh, int dl, int ch, int cl, int dither);
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_texture(context_t* c)
+{
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+        texture_t& t = c->state.texture[i];
+        t.s_coord = GGL_ONE_TO_ONE;
+        t.t_coord = GGL_ONE_TO_ONE;
+        t.s_wrap = GGL_REPEAT;
+        t.t_wrap = GGL_REPEAT;
+        t.min_filter = GGL_NEAREST;
+        t.mag_filter = GGL_NEAREST;
+        t.env = GGL_MODULATE;
+    }
+    c->activeTMU = &(c->state.texture[0]);
+}
+
+void ggl_set_surface(context_t* c, surface_t* dst, const GGLSurface* src)
+{
+    dst->width = src->width;
+    dst->height = src->height;
+    dst->stride = src->stride;
+    dst->data = src->data;
+    dst->format = src->format;
+    dst->dirty = 1;
+    if (__builtin_expect(dst->stride < 0, false)) {
+        const GGLFormat& pixelFormat(c->formats[dst->format]);
+        const int32_t bpr = -dst->stride * pixelFormat.size;
+        dst->data += bpr * (dst->height-1);
+    }
+}
+
+static void pick_read_write(surface_t* s)
+{
+    // Choose best reader/writers.
+    switch (s->format) {
+        case GGL_PIXEL_FORMAT_RGBA_8888:    s->read = readABGR8888;  break;
+        case GGL_PIXEL_FORMAT_RGB_565:      s->read = readRGB565;    break;
+        default:                            s->read = read_pixel;    break;
+    }
+    s->write = write_pixel;
+}
+
+void ggl_pick_texture(context_t* c)
+{
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+        surface_t& s = c->state.texture[i].surface;
+        if ((!c->state.texture[i].enable) || (!s.dirty))
+            continue;
+        s.dirty = 0;
+        pick_read_write(&s);
+        generated_tex_vars_t& gen = c->generated_vars.texture[i];
+        gen.width   = s.width;
+        gen.height  = s.height;
+        gen.stride  = s.stride;
+        gen.data    = uintptr_t(s.data);
+    }
+}
+
+void ggl_pick_cb(context_t* c)
+{
+    surface_t& s = c->state.buffers.color;
+    if (s.dirty) {
+        s.dirty = 0;
+        pick_read_write(&s);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+void read_pixel(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, pixel_t* pixel)
+{
+    assert((x < s->width) && (y < s->height));
+
+    const GGLFormat* f = &(c->formats[s->format]);
+    int32_t index = x + (s->stride * y);
+    uint8_t* const data = s->data + index * f->size;
+    uint32_t v = 0;
+    switch (f->size) {
+        case 1:		v = *data;									break;
+        case 2:		v = *(uint16_t*)data;						break;
+        case 3:		v = (data[2]<<16)|(data[1]<<8)|data[0];     break;
+        case 4:		v = GGL_RGBA_TO_HOST(*(uint32_t*)data);		break;
+    }
+    for (int i=0 ; i<4 ; i++) {
+        pixel->s[i] = f->c[i].h - f->c[i].l;
+        if (pixel->s[i])
+            pixel->c[i] = extract(v,  f->c[i].h,  f->c[i].l, f->size*8);
+    }
+}
+
+void readRGB565(const surface_t* s, context_t* /*c*/,
+        uint32_t x, uint32_t y, pixel_t* pixel)
+{
+    uint16_t v = *(reinterpret_cast<uint16_t*>(s->data) + (x + (s->stride * y)));
+    pixel->c[0] = 0;
+    pixel->c[1] = v>>11;
+    pixel->c[2] = (v>>5)&0x3F;
+    pixel->c[3] = v&0x1F;
+    pixel->s[0] = 0;
+    pixel->s[1] = 5;
+    pixel->s[2] = 6;
+    pixel->s[3] = 5;
+}
+
+void readABGR8888(const surface_t* s, context_t* /*c*/,
+        uint32_t x, uint32_t y, pixel_t* pixel)
+{
+    uint32_t v = *(reinterpret_cast<uint32_t*>(s->data) + (x + (s->stride * y)));
+    v = GGL_RGBA_TO_HOST(v);
+    pixel->c[0] = v>>24;        // A
+    pixel->c[1] = v&0xFF;       // R
+    pixel->c[2] = (v>>8)&0xFF;  // G
+    pixel->c[3] = (v>>16)&0xFF; // B
+    pixel->s[0] = 
+    pixel->s[1] = 
+    pixel->s[2] = 
+    pixel->s[3] = 8;
+}
+
+void write_pixel(const surface_t* s, context_t* c,
+        uint32_t x, uint32_t y, const pixel_t* pixel)
+{
+    assert((x < s->width) && (y < s->height));
+
+    int dither = -1;
+    if (c->state.enables & GGL_ENABLE_DITHER) {
+        dither = c->ditherMatrix[ (x & GGL_DITHER_MASK) +
+                ((y & GGL_DITHER_MASK)<<GGL_DITHER_ORDER_SHIFT) ];
+    }
+
+    const GGLFormat* f = &(c->formats[s->format]);
+    int32_t index = x + (s->stride * y);
+    uint8_t* const data = s->data + index * f->size;
+        
+    uint32_t mask = 0;
+    uint32_t v = 0;
+    for (int i=0 ; i<4 ; i++) {
+        const int component_mask = 1 << i;
+        if (f->components>=GGL_LUMINANCE &&
+                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
+            // destinations L formats don't have G or B
+            continue;
+        }
+        const int l = f->c[i].l;
+        const int h = f->c[i].h;
+        if (h && (c->state.mask.color & component_mask)) {
+            mask |= (((1<<(h-l))-1)<<l);
+            uint32_t u = pixel->c[i];
+            int32_t pixelSize = pixel->s[i];
+            if (pixelSize < (h-l)) {
+                u = expand(u, pixelSize, h-l);
+                pixelSize = h-l;
+            }
+            v = downshift_component(v, u, pixelSize, 0, h, l, 0, 0, dither);
+        }
+    }
+
+    if ((c->state.mask.color != 0xF) || 
+        (c->state.enables & GGL_ENABLE_LOGIC_OP)) {
+        uint32_t d = 0;
+        switch (f->size) {
+            case 1:	d = *data;									break;
+            case 2:	d = *(uint16_t*)data;						break;
+            case 3:	d = (data[2]<<16)|(data[1]<<8)|data[0];     break;
+            case 4:	d = GGL_RGBA_TO_HOST(*(uint32_t*)data);		break;
+        }
+        if (c->state.enables & GGL_ENABLE_LOGIC_OP) {
+            v = logic_op(c->state.logic_op.opcode, v, d);            
+            v &= mask;
+        }
+        v |= (d & ~mask);
+    }
+
+    switch (f->size) {
+        case 1:		*data = v;									break;
+        case 2:		*(uint16_t*)data = v;						break;
+        case 3:
+            data[0] = v;
+            data[1] = v>>8;
+            data[2] = v>>16;
+            break;
+        case 4:		*(uint32_t*)data = GGL_HOST_TO_RGBA(v);     break;
+    }
+}
+
+static uint32_t logic_op(int op, uint32_t s, uint32_t d)
+{
+    switch(op) {
+    case GGL_CLEAR:         return 0;
+    case GGL_AND:           return s & d;
+    case GGL_AND_REVERSE:   return s & ~d;
+    case GGL_COPY:          return s;
+    case GGL_AND_INVERTED:  return ~s & d;
+    case GGL_NOOP:          return d;
+    case GGL_XOR:           return s ^ d;
+    case GGL_OR:            return s | d;
+    case GGL_NOR:           return ~(s | d);
+    case GGL_EQUIV:         return ~(s ^ d);
+    case GGL_INVERT:        return ~d;
+    case GGL_OR_REVERSE:    return s | ~d;
+    case GGL_COPY_INVERTED: return ~s;
+    case GGL_OR_INVERTED:   return ~s | d;
+    case GGL_NAND:          return ~(s & d);
+    case GGL_SET:           return ~0;
+    };
+    return s;
+}            
+
+
+uint32_t ggl_expand(uint32_t v, int sbits, int dbits)
+{
+    return expand(v, sbits, dbits);
+}
+
+uint32_t ggl_pack_color(context_t* c, int32_t format,
+        GGLcolor r, GGLcolor g, GGLcolor b, GGLcolor a)
+{
+    const GGLFormat* f = &(c->formats[format]);
+    uint32_t p = 0;
+    const int32_t hbits = GGL_COLOR_BITS;
+    const int32_t lbits = GGL_COLOR_BITS - 8;
+    p = downshift_component(p, r,   hbits, lbits,  f->rh, f->rl, 0, 1, -1);
+    p = downshift_component(p, g,   hbits, lbits,  f->gh, f->gl, 0, 1, -1);
+    p = downshift_component(p, b,   hbits, lbits,  f->bh, f->bl, 0, 1, -1);
+    p = downshift_component(p, a,   hbits, lbits,  f->ah, f->al, 0, 1, -1);
+    switch (f->size) {
+        case 1:
+            p |= p << 8;
+            FALLTHROUGH_INTENDED;
+        case 2:
+            p |= p << 16;
+    }
+    return p;
+}
+
+// ----------------------------------------------------------------------------
+
+// extract a component from a word
+uint32_t extract(uint32_t v, int h, int l, int bits)
+{
+	assert(h);
+	if (l) {
+		v >>= l;
+	}
+	if (h != bits) {
+		v &= (1<<(h-l))-1;
+	}
+	return v;
+}
+
+// expand a component from sbits to dbits
+uint32_t expand(uint32_t v, int sbits, int dbits)
+{
+    if (dbits > sbits) {
+        assert(sbits);
+        if (sbits==1) {
+            v = (v<<dbits) - v;
+        } else {
+            if (dbits % sbits) {
+                v <<= (dbits-sbits);
+                dbits -= sbits;
+                do {
+                    v |= v>>sbits;
+                    dbits -= sbits;
+                    sbits *= 2;
+                } while (dbits>0);
+            } else {
+                dbits -= sbits;
+                do {
+                    v |= v<<sbits;
+                    dbits -= sbits;
+                    if (sbits*2 < dbits) {
+                        sbits *= 2;
+                    }
+                } while (dbits > 0);
+            }
+        }
+    }
+	return v;
+}
+
+// downsample a component from sbits to dbits
+// and shift / construct the pixel
+uint32_t downshift_component(	uint32_t in, uint32_t v,
+                                int sh, int sl,		// src
+                                int dh, int dl,		// dst
+                                int ch, int cl,		// clear
+                                int dither)
+{
+	const int sbits = sh-sl;
+	const int dbits = dh-dl;
+    
+	assert(sbits>=dbits);
+
+
+    if (sbits>dbits) {
+        if (dither>=0) {
+            v -= (v>>dbits);				// fix up
+            const int shift = (GGL_DITHER_BITS - (sbits-dbits));
+            if (shift >= 0)   v += (dither >> shift) << sl;
+            else              v += (dither << (-shift)) << sl;
+        } else {
+            // don't do that right now, so we can reproduce the same
+            // artifacts we get on ARM (where we don't do this)
+            // -> this is not really needed if we don't dither
+            //if (dBits > 1) { // result already OK if dBits==1
+            //    v -= (v>>dbits);				// fix up
+            //    v += 1 << ((sbits-dbits)-1);	// rounding
+            //}
+        }
+    }
+
+
+	// we need to clear the high bits of the source
+	if (ch) {
+		v <<= 32-sh;
+		sl += 32-sh;
+        sh = 32;
+	}
+	
+	if (dl) {
+		if (cl || (sbits>dbits)) {
+			v >>= sh-dbits;
+			sl = 0;
+			sh = dbits;
+            in |= v<<dl;
+		} else {
+			// sbits==dbits and we don't need to clean the lower bits
+			// so we just have to shift the component to the right location
+            int shift = dh-sh;
+            in |= v<<shift;
+		}
+	} else {
+		// destination starts at bit 0
+		// ie: sh-dh == sh-dbits
+		int shift = sh-dh;
+		if (shift > 0)      in |= v>>shift;
+		else if (shift < 0) in |= v<<shift;
+		else                in |= v;
+	}
+	return in;
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
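The expand() helper in buffer.cpp widens a color component by replicating its high-order bits into the new low bits, so full scale maps to full scale. A standalone sketch of the common 5-to-8-bit case (expand5to8 is a hypothetical name for illustration, not part of this patch):

    #include <cassert>
    #include <cstdint>

    // Widen a 5-bit component to 8 bits by bit replication:
    // abcde -> abcdeabc, so 0x1f maps to 0xff and 0x00 to 0x00.
    static inline uint32_t expand5to8(uint32_t v)
    {
        assert(v <= 0x1f);
        return (v << 3) | (v >> 2);
    }

    // e.g. expand5to8(0x1f) == 0xff, expand5to8(0x10) == 0x84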
diff --git a/libpixelflinger/buffer.h b/libpixelflinger/buffer.h
new file mode 100644
index 0000000..9c9e4bc
--- /dev/null
+++ b/libpixelflinger/buffer.h
@@ -0,0 +1,39 @@
+/* libs/pixelflinger/buffer.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_GGL_TEXTURE_H
+#define ANDROID_GGL_TEXTURE_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_texture(context_t* c);
+
+void ggl_set_surface(context_t* c, surface_t* dst, const GGLSurface* src);
+
+void ggl_pick_texture(context_t* c);
+void ggl_pick_cb(context_t* c);
+
+uint32_t ggl_expand(uint32_t v, int sbits, int dbits);
+uint32_t ggl_pack_color(context_t* c, int32_t format,
+            GGLcolor r, GGLcolor g, GGLcolor b, GGLcolor a);
+
+}; // namespace android
+
+#endif // ANDROID_GGL_TEXTURE_H
diff --git a/libpixelflinger/clear.cpp b/libpixelflinger/clear.cpp
new file mode 100644
index 0000000..b962456
--- /dev/null
+++ b/libpixelflinger/clear.cpp
@@ -0,0 +1,171 @@
+/* libs/pixelflinger/clear.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <cutils/memory.h>
+
+#include "clear.h"
+#include "buffer.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+static void ggl_clear(void* c, GGLbitfield mask);
+static void ggl_clearColorx(void* c,
+        GGLclampx r, GGLclampx g, GGLclampx b, GGLclampx a);        
+static void ggl_clearDepthx(void* c, GGLclampx depth);
+static void ggl_clearStencil(void* c, GGLint s);
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_clear(context_t* c)
+{
+    GGLContext& procs = *(GGLContext*)c;
+    GGL_INIT_PROC(procs, clear);
+    GGL_INIT_PROC(procs, clearColorx);
+    GGL_INIT_PROC(procs, clearDepthx);
+    GGL_INIT_PROC(procs, clearStencil);
+    c->state.clear.dirty =  GGL_STENCIL_BUFFER_BIT |
+                            GGL_COLOR_BUFFER_BIT |
+                            GGL_DEPTH_BUFFER_BIT;
+    c->state.clear.depth = FIXED_ONE;
+}
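+
+// Note: packed clear values are computed lazily. The dirty bits set above
+// make ggl_clear() (re)pack the color/depth words on first use; the cached
+// colorPacked/depthPacked values are then reused until the clear state
+// changes again.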
+
+// ----------------------------------------------------------------------------
+
+static void memset2d(context_t* c, const surface_t& s, uint32_t packed,
+        uint32_t l, uint32_t t, uint32_t w, uint32_t h)
+{
+    const uint32_t size = c->formats[s.format].size;
+    const int32_t stride = s.stride * size;
+    uint8_t* dst = (uint8_t*)s.data + (l + t*s.stride)*size;
+    w *= size;
+
+    if (ggl_likely(int32_t(w) == stride)) {
+        // clear the whole thing in one call
+        w *= h;
+        h = 1;
+    }
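+    // Example: a fully-scissored 320x240 16-bpp surface with stride 320
+    // has w*size == stride (640 bytes), so all 240 rows collapse into a
+    // single android_memset16 of 320*240 pixels below.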
+
+    switch (size) {
+    case 1:
+        do {
+            memset(dst, packed, w);
+            dst += stride;
+        } while(--h);
+        break;
+    case 2:
+        do {
+            android_memset16((uint16_t*)dst, packed, w);
+            dst += stride;
+        } while(--h);
+        break;
+    case 3: // XXX: 24-bit clear.
+        break;
+    case 4:
+        do {
+            android_memset32((uint32_t*)dst, packed, w);
+            dst += stride;
+        } while(--h);
+        break;
+    }    
+}
+
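+// fixedToZ maps 16.16 fixed-point in [0, FIXED_ONE] onto the 16-bit depth
+// range: ((z<<16) - z) >> 16 == z*(0x10000-1)/0x10000, so FIXED_ONE maps
+// to 0xFFFF instead of overflowing 16 bits. ggl_clear() then replicates
+// the result into both halves of a word ((depth<<16)|depth) so the packed
+// value is usable by both the 16-bit and 32-bit memset paths.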
+static inline GGLfixed fixedToZ(GGLfixed z) {
+    return GGLfixed(((int64_t(z) << 16) - z) >> 16);
+}
+
+static void ggl_clear(void* con, GGLbitfield mask)
+{
+    GGL_CONTEXT(c, con);
+
+    // XXX: rgba-dithering, rgba-masking
+    // XXX: handle all formats of Z and S
+
+    const uint32_t l = c->state.scissor.left;
+    const uint32_t t = c->state.scissor.top;
+    uint32_t w = c->state.scissor.right - l;
+    uint32_t h = c->state.scissor.bottom - t;
+
+    if (!w || !h)
+        return;
+
+    // nonexistent buffers have no effect...
+    if (c->state.buffers.color.format == 0)
+        mask &= ~GGL_COLOR_BUFFER_BIT;
+
+    if (c->state.buffers.depth.format == 0)
+        mask &= ~GGL_DEPTH_BUFFER_BIT;
+
+    if (c->state.buffers.stencil.format == 0)
+        mask &= ~GGL_STENCIL_BUFFER_BIT;
+
+    if (mask & GGL_COLOR_BUFFER_BIT) {
+        if (c->state.clear.dirty & GGL_COLOR_BUFFER_BIT) {
+            c->state.clear.dirty &= ~GGL_COLOR_BUFFER_BIT;
+
+            uint32_t colorPacked = ggl_pack_color(c,
+                    c->state.buffers.color.format,
+                    gglFixedToIteratedColor(c->state.clear.r),
+                    gglFixedToIteratedColor(c->state.clear.g),
+                    gglFixedToIteratedColor(c->state.clear.b),
+                    gglFixedToIteratedColor(c->state.clear.a));
+
+            c->state.clear.colorPacked = GGL_HOST_TO_RGBA(colorPacked);
+        }
+        const uint32_t packed = c->state.clear.colorPacked;
+        memset2d(c, c->state.buffers.color, packed, l, t, w, h);
+    }
+    if (mask & GGL_DEPTH_BUFFER_BIT) {
+        if (c->state.clear.dirty & GGL_DEPTH_BUFFER_BIT) {
+            c->state.clear.dirty &= ~GGL_DEPTH_BUFFER_BIT;
+            uint32_t depth = fixedToZ(c->state.clear.depth);
+            c->state.clear.depthPacked = (depth<<16)|depth;
+        }
+        const uint32_t packed = c->state.clear.depthPacked;
+        memset2d(c, c->state.buffers.depth, packed, l, t, w, h);
+    }
+
+    // XXX: do stencil buffer
+}
+
+static void ggl_clearColorx(void* con,
+        GGLclampx r, GGLclampx g, GGLclampx b, GGLclampx a)
+{
+    GGL_CONTEXT(c, con);
+    c->state.clear.r = gglClampx(r);
+    c->state.clear.g = gglClampx(g);
+    c->state.clear.b = gglClampx(b);
+    c->state.clear.a = gglClampx(a);
+    c->state.clear.dirty |= GGL_COLOR_BUFFER_BIT;
+}
+
+static void ggl_clearDepthx(void* con, GGLclampx depth)
+{
+    GGL_CONTEXT(c, con);
+    c->state.clear.depth = gglClampx(depth);
+    c->state.clear.dirty |= GGL_DEPTH_BUFFER_BIT;
+}
+
+static void ggl_clearStencil(void* con, GGLint s)
+{
+    GGL_CONTEXT(c, con);
+    c->state.clear.stencil = s;
+    c->state.clear.dirty |= GGL_STENCIL_BUFFER_BIT;
+}
+
+}; // namespace android
diff --git a/libpixelflinger/clear.h b/libpixelflinger/clear.h
new file mode 100644
index 0000000..b071df0
--- /dev/null
+++ b/libpixelflinger/clear.h
@@ -0,0 +1,30 @@
+/* libs/pixelflinger/clear.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#ifndef ANDROID_GGL_CLEAR_H
+#define ANDROID_GGL_CLEAR_H
+
+#include <pixelflinger/pixelflinger.h>
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_clear(context_t* c);
+
+}; // namespace android
+
+#endif // ANDROID_GGL_CLEAR_H
diff --git a/libpixelflinger/codeflinger/ARMAssembler.cpp b/libpixelflinger/codeflinger/ARMAssembler.cpp
new file mode 100644
index 0000000..f47b6e4
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssembler.cpp
@@ -0,0 +1,579 @@
+/* libs/pixelflinger/codeflinger/ARMAssembler.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "ARMAssembler"
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cutils/properties.h>
+#include <log/log.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "ARMAssembler.h"
+#include "CodeCache.h"
+#include "disassem.h"
+
+// ----------------------------------------------------------------------------
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark ARMAssembler...
+#endif
+
+ARMAssembler::ARMAssembler(const sp<Assembly>& assembly)
+    :   ARMAssemblerInterface(),
+        mAssembly(assembly)
+{
+    mBase = mPC = (uint32_t *)assembly->base();
+    mDuration = ggl_system_time();
+}
+
+ARMAssembler::~ARMAssembler()
+{
+}
+
+uint32_t* ARMAssembler::pc() const
+{
+    return mPC;
+}
+
+uint32_t* ARMAssembler::base() const
+{
+    return mBase;
+}
+
+void ARMAssembler::reset()
+{
+    mBase = mPC = (uint32_t *)mAssembly->base();
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+}
+
+int ARMAssembler::getCodegenArch()
+{
+    return CODEGEN_ARCH_ARM;
+}
+
+// ----------------------------------------------------------------------------
+
+void ARMAssembler::disassemble(const char* name)
+{
+    if (name) {
+        printf("%s:\n", name);
+    }
+    size_t count = pc()-base();
+    uint32_t* i = base();
+    while (count--) {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(i);
+        if (label >= 0) {
+            printf("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(i);
+        if (comment >= 0) {
+            printf("; %s\n", mComments.valueAt(comment));
+        }
+        printf("%08" PRIxPTR ":    %08x    ", uintptr_t(i), uint32_t(i[0]));
+        ::disassemble((uintptr_t)i);
+        i++;
+    }
+}
+
+void ARMAssembler::comment(const char* string)
+{
+    mComments.add(mPC, string);
+}
+
+void ARMAssembler::label(const char* theLabel)
+{
+    mLabels.add(theLabel, mPC);
+    mLabelsInverseMapping.add(mPC, theLabel);
+}
+
+void ARMAssembler::B(int cc, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (cc<<28) | (0xA<<24) | 0;
+}
+
+void ARMAssembler::BL(int cc, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (cc<<28) | (0xB<<24) | 0;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Prolog/Epilog & Generate...
+#endif
+
+
+void ARMAssembler::prolog()
+{
+    // write dummy prolog code
+    mPrologPC = mPC;
+    STM(AL, FD, SP, 1, LSAVED);
+}
+
+void ARMAssembler::epilog(uint32_t touched)
+{
+    touched &= LSAVED;
+    if (touched) {
+        // write prolog code
+        uint32_t* pc = mPC;
+        mPC = mPrologPC;
+        STM(AL, FD, SP, 1, touched | LLR);
+        mPC = pc;
+        // write epilog code
+        LDM(AL, FD, SP, 1, touched | LLR);
+        BX(AL, LR);
+    } else {   // heh, no registers to save!
+        // write prolog code
+        uint32_t* pc = mPC;
+        mPC = mPrologPC;
+        MOV(AL, 0, R0, R0); // NOP
+        mPC = pc;
+        // write epilog code
+        BX(AL, LR);
+    }
+}
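+
+// Note: prolog() reserves a single instruction slot with a worst-case STM;
+// epilog() rewinds mPC to that slot and rewrites it once the set of touched
+// callee-saved registers is known, so unused registers are never spilled
+// (a NOP is written when there is nothing to save).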
+
+int ARMAssembler::generate(const char* name)
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--) {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - (bt.pc+2));
+        *bt.pc |= offset & 0xFFFFFF;
+    }
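+    // The "+2" accounts for the ARM pipeline: when a branch executes, PC is
+    // already 8 bytes (two words) past the branch, and the 24-bit offset is
+    // counted in words. E.g. a branch at word 10 targeting word 20 encodes
+    // 20 - (10 + 2) = 8.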
+
+    mAssembly->resize( int(pc()-base())*4 );
+
+    // the instruction cache is flushed by CodeCache
+    const int64_t duration = ggl_system_time() - mDuration;
+    const char * const format = "generated %s (%d ins) at [%p:%p] in %lld ns\n";
+    ALOGI(format, name, int(pc()-base()), base(), pc(), duration);
+
+    char value[PROPERTY_VALUE_MAX];
+    property_get("debug.pf.disasm", value, "0");
+    if (atoi(value) != 0) {
+        printf(format, name, int(pc()-base()), base(), pc(), duration);
+        disassemble(name);
+    }
+
+    return OK;
+}
+
+uint32_t* ARMAssembler::pcForLabel(const char* label)
+{
+    return mLabels.valueFor(label);
+}
+
+// ----------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Data Processing...
+#endif
+
+void ARMAssembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    *mPC++ = (cc<<28) | (opcode<<21) | (s<<20) | (Rn<<16) | (Rd<<12) | Op2;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Multiply...
+#endif
+
+// multiply...
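+// Note: on ARMv5-class cores MUL/MLA require Rd != Rm; since multiplication
+// commutes, Rm and Rs are swapped when they collide, and the LOG_FATAL_IF
+// below catches the case where the swap cannot help.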
+void ARMAssembler::MLA(int cc, int s,
+        int Rd, int Rm, int Rs, int Rn) {
+    if (Rd == Rm) { int t = Rm; Rm=Rs; Rs=t; }
+    LOG_FATAL_IF(Rd==Rm, "MLA(r%u,r%u,r%u,r%u)", Rd,Rm,Rs,Rn);
+    *mPC++ =    (cc<<28) | (1<<21) | (s<<20) |
+                (Rd<<16) | (Rn<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::MUL(int cc, int s,
+        int Rd, int Rm, int Rs) {
+    if (Rd == Rm) { int t = Rm; Rm=Rs; Rs=t; }
+    LOG_FATAL_IF(Rd==Rm, "MUL(r%u,r%u,r%u)", Rd,Rm,Rs);
+    *mPC++ = (cc<<28) | (s<<20) | (Rd<<16) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::UMULL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::UMUAL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<21) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::SMULL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+void ARMAssembler::SMUAL(int cc, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) |
+                (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Branches...
+#endif
+
+// branches...
+void ARMAssembler::B(int cc, uint32_t* pc)
+{
+    int32_t offset = int32_t(pc - (mPC+2));
+    *mPC++ = (cc<<28) | (0xA<<24) | (offset & 0xFFFFFF);
+}
+
+void ARMAssembler::BL(int cc, uint32_t* pc)
+{
+    int32_t offset = int32_t(pc - (mPC+2));
+    *mPC++ = (cc<<28) | (0xB<<24) | (offset & 0xFFFFFF);
+}
+
+void ARMAssembler::BX(int cc, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x12FFF10 | Rn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Data Transfer...
+#endif
+
+// data transfer...
+void ARMAssembler::LDR(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<20) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::LDRB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<22) | (1<<20) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::STR(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (Rn<<16) | (Rd<<12) | offset;
+}
+void ARMAssembler::STRB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<26) | (1<<22) | (Rn<<16) | (Rd<<12) | offset;
+}
+
+void ARMAssembler::LDRH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xB0 | offset;
+}
+void ARMAssembler::LDRSB(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xD0 | offset;
+}
+void ARMAssembler::LDRSH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (1<<20) | (Rn<<16) | (Rd<<12) | 0xF0 | offset;
+}
+void ARMAssembler::STRH(int cc, int Rd, int Rn, uint32_t offset) {
+    *mPC++ = (cc<<28) | (Rn<<16) | (Rd<<12) | 0xB0 | offset;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Block Data Transfer...
+#endif
+
+// block data transfer...
+void ARMAssembler::LDM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{   //                    ED FD EA FA      IB IA DB DA
+    const uint8_t P[8] = { 1, 0, 1, 0,      1, 0, 1, 0 };
+    const uint8_t U[8] = { 1, 1, 0, 0,      1, 1, 0, 0 };
+    *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+            (uint32_t(U[dir])<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list;
+}
+
+void ARMAssembler::STM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{   //                    ED FD EA FA      IB IA DB DA
+    const uint8_t P[8] = { 0, 1, 0, 1,      1, 0, 1, 0 };
+    const uint8_t U[8] = { 0, 0, 1, 1,      1, 1, 0, 0 };
+    *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+            (uint32_t(U[dir])<<23) | (0<<20) | (W<<21) | (Rn<<16) | reg_list;
+}
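+
+// The P (pre-index) and U (up) tables translate the stack-oriented
+// mnemonics ED/FD/EA/FA into concrete addressing modes. E.g. STM with
+// dir=FD (full descending, the conventional push) selects P=1, U=0,
+// i.e. STMDB; the matching LDM FD selects P=0, U=1, i.e. LDMIA (pop).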
+
+#if 0
+#pragma mark -
+#pragma mark Special...
+#endif
+
+// special...
+void ARMAssembler::SWP(int cc, int Rn, int Rd, int Rm) {
+    *mPC++ = (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+}
+void ARMAssembler::SWPB(int cc, int Rn, int Rd, int Rm) {
+    *mPC++ = (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+}
+void ARMAssembler::SWI(int cc, uint32_t comment) {
+    *mPC++ = (cc<<28) | (0xF<<24) | comment;
+}
+
+#if 0
+#pragma mark -
+#pragma mark DSP instructions...
+#endif
+
+// DSP instructions...
+void ARMAssembler::PLD(int Rn, uint32_t offset) {
+    LOG_ALWAYS_FATAL_IF(!((offset&(1<<24)) && !(offset&(1<<21))),
+                        "PLD only P=1, W=0");
+    *mPC++ = 0xF550F000 | (Rn<<16) | offset;
+}
+
+void ARMAssembler::CLZ(int cc, int Rd, int Rm)
+{
+    *mPC++ = (cc<<28) | 0x16F0F10| (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QADD(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QDADD(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QSUB(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::QDSUB(int cc,  int Rd, int Rm, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm;
+}
+
+void ARMAssembler::SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs)
+{
+    *mPC++ = (cc<<28) | 0x1600080 | (Rd<<16) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMULW(int cc, int y,
+                int Rd, int Rm, int Rs)
+{
+    *mPC++ = (cc<<28) | 0x12000A0 | (Rd<<16) | (Rs<<8) | (y<<4) | Rm;
+}
+
+void ARMAssembler::SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1000080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm)
+{
+    *mPC++ = (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm;
+}
+
+void ARMAssembler::SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    *mPC++ = (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Byte/half word extract and extend (ARMv6+ only)...
+#endif
+
+void ARMAssembler::UXTB16(int cc, int Rd, int Rm, int rotate)
+{
+    *mPC++ = (cc<<28) | 0x6CF0070 | (Rd<<12) | ((rotate >> 3) << 10) | Rm;
+}
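+
+// Note: UXTB16's rotation is restricted to 0, 8, 16 or 24 bits; (rotate>>3)
+// packs it into the two-bit field at bits [11:10] (e.g. rotate=16 -> 0b10).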
+#if 0
+#pragma mark -
+#pragma mark Bit manipulation (ARMv7+ only)...
+#endif
+
+// Bit manipulation (ARMv7+ only)...
+void ARMAssembler::UBFX(int cc, int Rd, int Rn, int lsb, int width)
+{
+    *mPC++ = (cc<<28) | 0x7E00000 | ((width-1)<<16) | (Rd<<12) | (lsb<<7) | 0x50 | Rn;
+}
+
+#if 0
+#pragma mark -
+#pragma mark Addressing modes...
+#endif
+
+int ARMAssembler::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    rot = 0;
+    imm = immediate;
+    if (imm > 0x7F) { // skip the easy cases
+        while (!(imm&3)  || (imm&0xFC000000)) {
+            uint32_t newval;
+            newval = imm >> 2;
+            newval |= (imm&3) << 30;
+            imm = newval;
+            rot += 2;
+            if (rot == 32) {
+                rot = 0;
+                break;
+            }
+        }
+    }
+    rot = (16 - (rot>>1)) & 0xF;
+
+    if (imm>=0x100)
+        return -EINVAL;
+
+    if (((imm>>(rot<<1)) | (imm<<(32-(rot<<1)))) != immediate)
+        return -1;
+
+    return 0;
+}
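+
+// Illustrative example: ARM operand-2 immediates are an 8-bit value rotated
+// right by an even amount. 0x3FC encodes as imm=0xFF with rotate field 15
+// (0xFF ROR 30), whereas 0x1FE (0xFF << 1) is rejected by buildImmediate
+// because it would need an odd rotation.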
+
+// shifters...
+
+bool ARMAssembler::isValidImmediate(uint32_t immediate)
+{
+    uint32_t rot, imm;
+    return buildImmediate(immediate, rot, imm) == 0;
+}
+
+uint32_t ARMAssembler::imm(uint32_t immediate)
+{
+    uint32_t rot, imm;
+    int err = buildImmediate(immediate, rot, imm);
+
+    LOG_ALWAYS_FATAL_IF(err==-EINVAL,
+                        "immediate %08x cannot be encoded",
+                        immediate);
+
+    LOG_ALWAYS_FATAL_IF(err,
+                        "immediate (%08x) encoding bogus!",
+                        immediate);
+
+    return (1<<25) | (rot<<8) | imm;
+}
+
+uint32_t ARMAssembler::reg_imm(int Rm, int type, uint32_t shift)
+{
+    return ((shift&0x1F)<<7) | ((type&0x3)<<5) | (Rm&0xF);
+}
+
+uint32_t ARMAssembler::reg_rrx(int Rm)
+{
+    return (ROR<<5) | (Rm&0xF);
+}
+
+uint32_t ARMAssembler::reg_reg(int Rm, int type, int Rs)
+{
+    return ((Rs&0xF)<<8) | ((type&0x3)<<5) | (1<<4) | (Rm&0xF);
+}
+
+// addressing modes...
+// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicates U=0)
+uint32_t ARMAssembler::immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    return (1<<24) | (((uint32_t(immed12)>>31)^1)<<23) |
+            ((W&1)<<21) | (abs(immed12)&0x7FF);
+}
+
+uint32_t ARMAssembler::immed12_post(int32_t immed12)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+
+    return (((uint32_t(immed12)>>31)^1)<<23) | (abs(immed12)&0x7FF);
+}
+
+uint32_t ARMAssembler::reg_scale_pre(int Rm, int type,
+        uint32_t shift, int W)
+{
+    return  (1<<25) | (1<<24) |
+            (((uint32_t(Rm)>>31)^1)<<23) | ((W&1)<<21) |
+            reg_imm(abs(Rm), type, shift);
+}
+
+uint32_t ARMAssembler::reg_scale_post(int Rm, int type, uint32_t shift)
+{
+    return (1<<25) | (((uint32_t(Rm)>>31)^1)<<23) | reg_imm(abs(Rm), type, shift);
+}
+
+// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicates U=0)
+uint32_t ARMAssembler::immed8_pre(int32_t immed8, int W)
+{
+    uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    return  (1<<24) | (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) |
+            ((W&1)<<21) | (((offset&0xF0)<<4)|(offset&0xF));
+}
+
+uint32_t ARMAssembler::immed8_post(int32_t immed8)
+{
+    uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    return (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) |
+            (((offset&0xF0)<<4) | (offset&0xF));
+}
+
+uint32_t ARMAssembler::reg_pre(int Rm, int W)
+{
+    return (1<<24) | (((uint32_t(Rm)>>31)^1)<<23) | ((W&1)<<21) | (abs(Rm)&0xF);
+}
+
+uint32_t ARMAssembler::reg_post(int Rm)
+{
+    return (((uint32_t(Rm)>>31)^1)<<23) | (abs(Rm)&0xF);
+}
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/ARMAssembler.h b/libpixelflinger/codeflinger/ARMAssembler.h
new file mode 100644
index 0000000..76acf7e
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssembler.h
@@ -0,0 +1,187 @@
+/* libs/pixelflinger/codeflinger/ARMAssembler.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#ifndef ANDROID_ARMASSEMBLER_H
+#define ANDROID_ARMASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "tinyutils/smartpointer.h"
+#include "utils/Vector.h"
+#include "utils/KeyedVector.h"
+
+#include "ARMAssemblerInterface.h"
+#include "CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssembler : public ARMAssemblerInterface
+{
+public:
+    explicit    ARMAssembler(const sp<Assembly>& assembly);
+    virtual     ~ARMAssembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+
+
+    void        disassemble(const char* name);
+
+    // ------------------------------------------------------------------------
+    // ARMAssemblerInterface...
+    // ------------------------------------------------------------------------
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+
+
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+private:
+                ARMAssembler(const ARMAssembler& rhs);
+                ARMAssembler& operator = (const ARMAssembler& rhs);
+
+    sp<Assembly>    mAssembly;
+    uint32_t*       mBase;
+    uint32_t*       mPC;
+    uint32_t*       mPrologPC;
+    int64_t         mDuration;
+    
+    struct branch_target_t {
+        inline branch_target_t() : label(0), pc(0) { }
+        inline branch_target_t(const char* l, uint32_t* p)
+            : label(l), pc(p) { }
+        const char* label;
+        uint32_t*   pc;
+    };
+    
+    Vector<branch_target_t>                 mBranchTargets;
+    KeyedVector< const char*, uint32_t* >   mLabels;
+    KeyedVector< uint32_t*, const char* >   mLabelsInverseMapping;
+    KeyedVector< uint32_t*, const char* >   mComments;
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp b/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp
new file mode 100644
index 0000000..c96cf4b
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.cpp
@@ -0,0 +1,90 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+#define LOG_TAG "pixelflinger-code"
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <log/log.h>
+
+#include "ARMAssemblerInterface.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+ARMAssemblerInterface::~ARMAssemblerInterface()
+{
+}
+
+// --------------------------------------------------------------------
+
+// The following two functions are static versions used for default
+// initializers in the original ARM code. The plain versions (without the
+// leading __) are virtual and can be overridden by the MIPS code, but
+// since these are needed at initialization time, they must be static.
+// Not thrilled with this implementation, but it works...
+
+uint32_t ARMAssemblerInterface::__immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    return (1<<24) | (((uint32_t(immed12)>>31)^1)<<23) |
+            ((W&1)<<21) | (abs(immed12)&0x7FF);
+}
+
+uint32_t ARMAssemblerInterface::__immed8_pre(int32_t immed8, int W)
+{
+    uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+
+    return  (1<<24) | (1<<22) | (((uint32_t(immed8)>>31)^1)<<23) |
+            ((W&1)<<21) | (((offset&0xF0)<<4)|(offset&0xF));
+}
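+
+// Note: the halfword/signed-byte transfer encodings split the 8-bit offset
+// into two nibbles: offset 0x3A is stored as ((0x30)<<4)|0x0A, i.e.
+// imm4H=0x3 in bits [11:8] and imm4L=0xA in bits [3:0].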
+
+// The following four functions are required for address manipulation
+// These are virtual functions, which can be overridden by architectures
+// that need special handling of address values (e.g. 64-bit arch)
+
+void ARMAssemblerInterface::ADDR_LDR(int cc, int Rd,
+     int Rn, uint32_t offset)
+{
+    LDR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerInterface::ADDR_STR(int cc, int Rd,
+     int Rn, uint32_t offset)
+{
+    STR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerInterface::ADDR_ADD(int cc, int s,
+     int Rd, int Rn, uint32_t Op2)
+{
+    dataProcessing(opADD, cc, s, Rd, Rn, Op2);
+}
+void ARMAssemblerInterface::ADDR_SUB(int cc, int s,
+     int Rd, int Rn, uint32_t Op2)
+{
+    dataProcessing(opSUB, cc, s, Rd, Rn, Op2);
+}
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/ARMAssemblerInterface.h b/libpixelflinger/codeflinger/ARMAssemblerInterface.h
new file mode 100644
index 0000000..72935ac
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerInterface.h
@@ -0,0 +1,349 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerInterface.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_ARMASSEMBLER_INTERFACE_H
+#define ANDROID_ARMASSEMBLER_INTERFACE_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssemblerInterface
+{
+public:
+    virtual ~ARMAssemblerInterface();
+
+    enum {
+        EQ, NE, CS, CC, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, AL, NV,
+        HS = CS,
+        LO = CC
+    };
+    enum {
+        S = 1
+    };
+    enum {
+        LSL, LSR, ASR, ROR
+    };
+    enum {
+        ED, FD, EA, FA,
+        IB, IA, DB, DA
+    };
+    enum {
+        R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15,
+        SP = R13,
+        LR = R14,
+        PC = R15
+    };
+    enum {
+        #define LIST(rr) L##rr=1<<rr
+        LIST(R0), LIST(R1), LIST(R2), LIST(R3), LIST(R4), LIST(R5), LIST(R6),
+        LIST(R7), LIST(R8), LIST(R9), LIST(R10), LIST(R11), LIST(R12),
+        LIST(R13), LIST(R14), LIST(R15),
+        LIST(SP), LIST(LR), LIST(PC),
+        #undef LIST
+        LSAVED = LR4|LR5|LR6|LR7|LR8|LR9|LR10|LR11 | LLR
+    };
+
+    enum {
+        CODEGEN_ARCH_ARM = 1, CODEGEN_ARCH_MIPS, CODEGEN_ARCH_ARM64, CODEGEN_ARCH_MIPS64
+    };
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // these static versions are used for initializers on LDxx/STxx below
+    static uint32_t    __immed12_pre(int32_t immed12, int W=0);
+    static uint32_t    __immed8_pre(int32_t immed8, int W=0);
+
+    virtual bool        isValidImmediate(uint32_t immed) = 0;
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm) = 0;
+
+    virtual uint32_t    imm(uint32_t immediate) = 0;
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift) = 0;
+    virtual uint32_t    reg_rrx(int Rm) = 0;
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs) = 0;
+
+    // addressing modes... 
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0) = 0;
+    virtual uint32_t    immed12_post(int32_t immed12) = 0;
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0) = 0;
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0) = 0;
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0) = 0;
+    virtual uint32_t    immed8_post(int32_t immed8) = 0;
+    virtual uint32_t    reg_pre(int Rm, int W=0) = 0;
+    virtual uint32_t    reg_post(int Rm) = 0;
+
+    // -----------------------------------------------------------------------
+    // basic instructions & code generation
+    // -----------------------------------------------------------------------
+
+    // generate the code
+    virtual void reset() = 0;
+    virtual int  generate(const char* name) = 0;
+    virtual void disassemble(const char* name) = 0;
+    virtual int  getCodegenArch() = 0;
+    
+    // construct prolog and epilog
+    virtual void prolog() = 0;
+    virtual void epilog(uint32_t touched) = 0;
+    virtual void comment(const char* string) = 0;
+
+    // data processing...
+    enum {
+        opAND, opEOR, opSUB, opRSB, opADD, opADC, opSBC, opRSC, 
+        opTST, opTEQ, opCMP, opCMN, opORR, opMOV, opBIC, opMVN,
+        opADD64, opSUB64
+    };
+
+    virtual void
+            dataProcessing( int opcode, int cc, int s,
+                            int Rd, int Rn,
+                            uint32_t Op2) = 0;
+    
+    // multiply...
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs) = 0;
+
+    // branches...
+    virtual void B(int cc, uint32_t* pc) = 0;
+    virtual void BL(int cc, uint32_t* pc) = 0;
+    virtual void BX(int cc, int Rn) = 0;
+
+    virtual void label(const char* theLabel) = 0;
+    virtual void B(int cc, const char* label) = 0;
+    virtual void BL(int cc, const char* label) = 0;
+
+    // valid only after generate() has been called
+    virtual uint32_t* pcForLabel(const char* label) = 0;
+
+    // data transfer...
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0)) = 0;
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0)) = 0;
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0)) = 0;
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0)) = 0;
+
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0)) = 0;
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = __immed8_pre(0)) = 0;
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0)) = 0;
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0)) = 0;
+
+    // block data transfer...
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list) = 0;
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list) = 0;
+
+    // special...
+    virtual void SWP(int cc, int Rn, int Rd, int Rm) = 0;
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm) = 0;
+    virtual void SWI(int cc, uint32_t comment) = 0;
+
+    // DSP instructions...
+    enum {
+        // B=0, T=1
+        //     yx
+        xyBB = 0, // 0000,
+        xyTB = 2, // 0010,
+        xyBT = 4, // 0100,
+        xyTT = 6, // 0110,
+        yB   = 0, // 0000,
+        yT   = 4, // 0100
+    };
+
+    virtual void PLD(int Rn, uint32_t offset) = 0;
+
+    virtual void CLZ(int cc, int Rd, int Rm) = 0;
+    
+    virtual void QADD(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn) = 0;
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn) = 0;
+    
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs) = 0;
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm) = 0;
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn) = 0;
+
+    // byte/half word extract...
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate) = 0;
+
+    // bit manipulation...
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width) = 0;
+
+    // -----------------------------------------------------------------------
+    // convenience...
+    // -----------------------------------------------------------------------
+    inline void
+    ADC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opADC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    ADD(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opADD, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    AND(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opAND, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    BIC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opBIC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    EOR(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opEOR, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    MOV(int cc, int s, int Rd, uint32_t Op2) {
+        dataProcessing(opMOV, cc, s, Rd, 0, Op2);
+    }
+    inline void
+    MVN(int cc, int s, int Rd, uint32_t Op2) {
+        dataProcessing(opMVN, cc, s, Rd, 0, Op2);
+    }
+    inline void
+    ORR(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opORR, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    RSB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opRSB, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    RSC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opRSC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    SBC(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opSBC, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    SUB(int cc, int s, int Rd, int Rn, uint32_t Op2) {
+        dataProcessing(opSUB, cc, s, Rd, Rn, Op2);
+    }
+    inline void
+    TEQ(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opTEQ, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    TST(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opTST, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    CMP(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opCMP, cc, 1, 0, Rn, Op2);
+    }
+    inline void
+    CMN(int cc, int Rn, uint32_t Op2) {
+        dataProcessing(opCMN, cc, 1, 0, Rn, Op2);
+    }
+
+    inline void SMULBB(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyBB, Rd, Rm, Rs);    }
+    inline void SMULTB(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyTB, Rd, Rm, Rs);    }
+    inline void SMULBT(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyBT, Rd, Rm, Rs);    }
+    inline void SMULTT(int cc, int Rd, int Rm, int Rs) {
+        SMUL(cc, xyTT, Rd, Rm, Rs);    }
+
+    inline void SMULWB(int cc, int Rd, int Rm, int Rs) {
+        SMULW(cc, yB, Rd, Rm, Rs);    }
+    inline void SMULWT(int cc, int Rd, int Rm, int Rs) {
+        SMULW(cc, yT, Rd, Rm, Rs);    }
+
+    inline void
+    SMLABB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyBB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLATB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyTB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLABT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyBT, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLATT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLA(cc, xyTT, Rd, Rm, Rs, Rn);    }
+
+    inline void
+    SMLALBB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyBB, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALTB(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyTB, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALBT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyBT, RdHi, RdLo, Rs, Rm);    }
+    inline void
+    SMLALTT(int cc, int RdHi, int RdLo, int Rs, int Rm) {
+        SMLAL(cc, xyTT, RdHi, RdLo, Rs, Rm);    }
+
+    inline void
+    SMLAWB(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLAW(cc, yB, Rd, Rm, Rs, Rn);    }
+    inline void
+    SMLAWT(int cc, int Rd, int Rm, int Rs, int Rn) {
+        SMLAW(cc, yT, Rd, Rm, Rs, Rn);    }
+
+    // Address loading/storing/manipulation
+    virtual void ADDR_LDR(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_ADD(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+    virtual void ADDR_SUB(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_INTERFACE_H
diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp
new file mode 100644
index 0000000..816de48
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.cpp
@@ -0,0 +1,311 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "ARMAssemblerProxy.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+ARMAssemblerProxy::ARMAssemblerProxy()
+    : mTarget(0)
+{
+}
+
+ARMAssemblerProxy::ARMAssemblerProxy(ARMAssemblerInterface* target)
+    : mTarget(target)
+{
+}
+
+ARMAssemblerProxy::~ARMAssemblerProxy()
+{
+    delete mTarget;
+}
+
+void ARMAssemblerProxy::setTarget(ARMAssemblerInterface* target)
+{
+    delete mTarget;
+    mTarget = target;
+}
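+
+// Minimal usage sketch (assumes a concrete backend such as ARMAssembler):
+//
+//     ARMAssemblerProxy proxy;
+//     proxy.setTarget(new ARMAssembler(assembly));  // proxy now owns it
+//     proxy.prolog();
+//     // ... emit instructions through the proxy ...
+//     proxy.generate("scanline");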
+
+void ARMAssemblerProxy::reset() {
+    mTarget->reset();
+}
+int ARMAssemblerProxy::generate(const char* name) {
+    return mTarget->generate(name);
+}
+void ARMAssemblerProxy::disassemble(const char* name) {
+    return mTarget->disassemble(name);
+}
+int ARMAssemblerProxy::getCodegenArch()
+{
+    return mTarget->getCodegenArch();
+}
+void ARMAssemblerProxy::prolog() {
+    mTarget->prolog();
+}
+void ARMAssemblerProxy::epilog(uint32_t touched) {
+    mTarget->epilog(touched);
+}
+void ARMAssemblerProxy::comment(const char* string) {
+    mTarget->comment(string);
+}
+
+
+
+// addressing modes
+
+bool ARMAssemblerProxy::isValidImmediate(uint32_t immed)
+{
+    return mTarget->isValidImmediate(immed);
+}
+
+int ARMAssemblerProxy::buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm)
+{
+    return mTarget->buildImmediate(i, rot, imm);
+}
+
+
+
+uint32_t ARMAssemblerProxy::imm(uint32_t immediate)
+{
+    return mTarget->imm(immediate);
+}
+
+uint32_t ARMAssemblerProxy::reg_imm(int Rm, int type, uint32_t shift)
+{
+    return mTarget->reg_imm(Rm, type, shift);
+}
+
+uint32_t ARMAssemblerProxy::reg_rrx(int Rm)
+{
+    return mTarget->reg_rrx(Rm);
+}
+
+uint32_t ARMAssemblerProxy::reg_reg(int Rm, int type, int Rs)
+{
+    return mTarget->reg_reg(Rm, type, Rs);
+}
+
+
+// addressing modes...
+// LDR(B)/STR(B)/PLD
+// (immediate and Rm can be negative, which indicates U=0)
+uint32_t ARMAssemblerProxy::immed12_pre(int32_t immed12, int W)
+{
+    return mTarget->immed12_pre(immed12, W);
+}
+
+uint32_t ARMAssemblerProxy::immed12_post(int32_t immed12)
+{
+    return mTarget->immed12_post(immed12);
+}
+
+uint32_t ARMAssemblerProxy::reg_scale_pre(int Rm, int type, uint32_t shift, int W)
+{
+    return mTarget->reg_scale_pre(Rm, type, shift, W);
+}
+
+uint32_t ARMAssemblerProxy::reg_scale_post(int Rm, int type, uint32_t shift)
+{
+    return mTarget->reg_scale_post(Rm, type, shift);
+}
+
+
+// LDRH/LDRSB/LDRSH/STRH
+// (immediate and Rm can be negative, which indicates U=0)
+uint32_t ARMAssemblerProxy::immed8_pre(int32_t immed8, int W)
+{
+    return mTarget->immed8_pre(immed8, W);
+}
+
+uint32_t ARMAssemblerProxy::immed8_post(int32_t immed8)
+{
+    return mTarget->immed8_post(immed8);
+}
+
+uint32_t ARMAssemblerProxy::reg_pre(int Rm, int W)
+{
+    return mTarget->reg_pre(Rm, W);
+}
+
+uint32_t ARMAssemblerProxy::reg_post(int Rm)
+{
+    return mTarget->reg_post(Rm);
+}
+
+
+//------------------------------------------------------------------------
+
+
+
+void ARMAssemblerProxy::dataProcessing( int opcode, int cc, int s,
+                                        int Rd, int Rn, uint32_t Op2)
+{
+    mTarget->dataProcessing(opcode, cc, s, Rd, Rn, Op2);
+}
+
+void ARMAssemblerProxy::MLA(int cc, int s, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->MLA(cc, s, Rd, Rm, Rs, Rn);
+}
+void ARMAssemblerProxy::MUL(int cc, int s, int Rd, int Rm, int Rs) {
+    mTarget->MUL(cc, s, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::UMULL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->UMULL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::UMUAL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->UMUAL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::SMULL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->SMULL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+void ARMAssemblerProxy::SMUAL(int cc, int s,
+            int RdLo, int RdHi, int Rm, int Rs) {
+    mTarget->SMUAL(cc, s, RdLo, RdHi, Rm, Rs); 
+}
+
+void ARMAssemblerProxy::B(int cc, uint32_t* pc) {
+    mTarget->B(cc, pc); 
+}
+void ARMAssemblerProxy::BL(int cc, uint32_t* pc) {
+    mTarget->BL(cc, pc); 
+}
+void ARMAssemblerProxy::BX(int cc, int Rn) {
+    mTarget->BX(cc, Rn); 
+}
+void ARMAssemblerProxy::label(const char* theLabel) {
+    mTarget->label(theLabel);
+}
+void ARMAssemblerProxy::B(int cc, const char* label) {
+    mTarget->B(cc, label);
+}
+void ARMAssemblerProxy::BL(int cc, const char* label) {
+    mTarget->BL(cc, label);
+}
+
+uint32_t* ARMAssemblerProxy::pcForLabel(const char* label) {
+    return mTarget->pcForLabel(label);
+}
+
+void ARMAssemblerProxy::LDR(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STR(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STRB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STRB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRSB(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRSB(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDRSH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->LDRSH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::STRH(int cc, int Rd, int Rn, uint32_t offset) {
+    mTarget->STRH(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::LDM(int cc, int dir, int Rn, int W, uint32_t reg_list) {
+    mTarget->LDM(cc, dir, Rn, W, reg_list);
+}
+void ARMAssemblerProxy::STM(int cc, int dir, int Rn, int W, uint32_t reg_list) {
+    mTarget->STM(cc, dir, Rn, W, reg_list);
+}
+
+void ARMAssemblerProxy::SWP(int cc, int Rn, int Rd, int Rm) {
+    mTarget->SWP(cc, Rn, Rd, Rm);
+}
+void ARMAssemblerProxy::SWPB(int cc, int Rn, int Rd, int Rm) {
+    mTarget->SWPB(cc, Rn, Rd, Rm);
+}
+void ARMAssemblerProxy::SWI(int cc, uint32_t comment) {
+    mTarget->SWI(cc, comment);
+}
+
+
+void ARMAssemblerProxy::PLD(int Rn, uint32_t offset) {
+    mTarget->PLD(Rn, offset);
+}
+void ARMAssemblerProxy::CLZ(int cc, int Rd, int Rm) {
+    mTarget->CLZ(cc, Rd, Rm);
+}
+void ARMAssemblerProxy::QADD(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QADD(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QDADD(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QDADD(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QSUB(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QSUB(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::QDSUB(int cc, int Rd, int Rm, int Rn) {
+    mTarget->QDSUB(cc, Rd, Rm, Rn);
+}
+void ARMAssemblerProxy::SMUL(int cc, int xy, int Rd, int Rm, int Rs) {
+    mTarget->SMUL(cc, xy, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::SMULW(int cc, int y, int Rd, int Rm, int Rs) {
+    mTarget->SMULW(cc, y, Rd, Rm, Rs);
+}
+void ARMAssemblerProxy::SMLA(int cc, int xy, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->SMLA(cc, xy, Rd, Rm, Rs, Rn);
+}
+void ARMAssemblerProxy::SMLAL(  int cc, int xy,
+                                int RdHi, int RdLo, int Rs, int Rm) {
+    mTarget->SMLAL(cc, xy, RdHi, RdLo, Rs, Rm);
+}
+void ARMAssemblerProxy::SMLAW(int cc, int y, int Rd, int Rm, int Rs, int Rn) {
+    mTarget->SMLAW(cc, y, Rd, Rm, Rs, Rn);
+}
+
+void ARMAssemblerProxy::UXTB16(int cc, int Rd, int Rm, int rotate) {
+    mTarget->UXTB16(cc, Rd, Rm, rotate);
+}
+
+void ARMAssemblerProxy::UBFX(int cc, int Rd, int Rn, int lsb, int width) {
+    mTarget->UBFX(cc, Rd, Rn, lsb, width);
+}
+
+void ARMAssemblerProxy::ADDR_LDR(int cc, int Rd, int Rn, uint32_t offset) {
+     mTarget->ADDR_LDR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::ADDR_STR(int cc, int Rd, int Rn, uint32_t offset) {
+     mTarget->ADDR_STR(cc, Rd, Rn, offset);
+}
+void ARMAssemblerProxy::ADDR_ADD(int cc, int s, int Rd, int Rn, uint32_t Op2){
+     mTarget->ADDR_ADD(cc, s, Rd, Rn, Op2);
+}
+void ARMAssemblerProxy::ADDR_SUB(int cc, int s, int Rd, int Rn, uint32_t Op2){
+     mTarget->ADDR_SUB(cc, s, Rd, Rn, Op2);
+}
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/ARMAssemblerProxy.h b/libpixelflinger/codeflinger/ARMAssemblerProxy.h
new file mode 100644
index 0000000..10d0390
--- /dev/null
+++ b/libpixelflinger/codeflinger/ARMAssemblerProxy.h
@@ -0,0 +1,164 @@
+/* libs/pixelflinger/codeflinger/ARMAssemblerProxy.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_ARMASSEMBLER_PROXY_H
+#define ANDROID_ARMASSEMBLER_PROXY_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "ARMAssemblerInterface.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ARMAssemblerProxy : public ARMAssemblerInterface
+{
+public:
+    // ARMAssemblerProxy takes ownership of the target
+
+                ARMAssemblerProxy();
+    explicit    ARMAssemblerProxy(ARMAssemblerInterface* target);
+    virtual     ~ARMAssemblerProxy();
+
+    void setTarget(ARMAssemblerInterface* target);
+
+    virtual void    reset();
+    virtual int     generate(const char* name);
+    virtual void    disassemble(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDRSB(int cc, int Rd, 
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = __immed8_pre(0));
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+    virtual void ADDR_LDR(int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_STR (int cc, int Rd,
+                int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_ADD(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+    virtual void ADDR_SUB(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+
+private:
+    ARMAssemblerInterface*  mTarget;
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARMASSEMBLER_PROXY_H
diff --git a/libpixelflinger/codeflinger/Arm64Assembler.cpp b/libpixelflinger/codeflinger/Arm64Assembler.cpp
new file mode 100644
index 0000000..271a9b9
--- /dev/null
+++ b/libpixelflinger/codeflinger/Arm64Assembler.cpp
@@ -0,0 +1,1240 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#define LOG_TAG "ArmToArm64Assembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cutils/properties.h>
+#include <log/log.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "codeflinger/Arm64Assembler.h"
+#include "codeflinger/Arm64Disassembler.h"
+#include "codeflinger/CodeCache.h"
+
+/*
+** --------------------------------------------
+** Support for Arm64 in GGLAssembler JIT
+** --------------------------------------------
+**
+** Approach
+** - GGLAssembler and associated files are largely unchanged.
+** - A translator class maps each ArmAssemblerInterface call to the
+**   corresponding generated Arm64 instructions.
+**
+** ----------------------
+** ArmToArm64Assembler
+** ----------------------
+**
+** - Subclassed from ArmAssemblerInterface
+**
+** - Translates each ArmAssemblerInterface call into one or more
+**   Arm64 instructions, as necessary.
+**
+** - Does not implement the portions of ArmAssemblerInterface that are
+**   unused by GGLAssembler. For such cases it calls NOT_IMPLEMENTED(),
+**   which in turn logs a fatal message.
+**
+** - Uses the A64_.. series of functions to generate the machine code
+**   for Arm64 instructions. These functions also log each instruction
+**   if the ARM64_ASM_DEBUG define is set to 1.
+**
+** - Dumps machine code and equivalent assembly if the "debug.pf.disasm"
+**   property is set, using arm64_disassemble to perform disassembly.
+**
+** - Uses registers 13 (SP in ARM), 15 (PC in ARM), 16 and 17 for
+**   storing intermediate results. GGLAssembler does not use SP and PC,
+**   as these registers are marked reserved. The temporary registers are
+**   not saved/restored on the stack, since they are caller-saved
+**   registers in Arm64.
+**
+** - Uses CSEL instruction to support conditional execution. The result is
+**   stored in a temporary register and then copied to the target register
+**   if the condition is true.
+**
+** - In the case of conditional data transfer instructions, conditional
+**   branch is used to skip over instruction, if the condition is false
+**
+** - Wherever possible, immediate values are transferred to a temporary
+**   register prior to processing. This simplifies the overall
+**   implementation, as instructions requiring immediate values are
+**   converted into a move-immediate followed by a register-register
+**   instruction.
+**
+** --------------------------------------------
+** ArmToArm64Assembler unit test bench
+** --------------------------------------------
+**
+** - Tests ArmToArm64Assembler interface for all the possible
+**   ways in which GGLAssembler uses ArmAssemblerInterface interface.
+**
+** - Uses a test jacket (written in assembly) to set the registers and
+**   condition flags prior to calling the generated instruction. It
+**   also copies out the registers and flags at the end of execution.
+**   The caller then checks whether the generated code performed the
+**   correct operation, based on the output registers and flags.
+**
+** - Broadly contains three types of tests: (i) data operation tests,
+**   (ii) data transfer tests and (iii) LDM/STM tests.
+**
+** ----------------------
+** Arm64 disassembler
+** ----------------------
+** - This disassembler handles only the machine code that can be
+**   generated by ArmToArm64Assembler. It has a unit testbench which
+**   tests all the instructions supported by the disassembler.
+**
+** ------------------------------------------------------------------
+** ARMAssembler/ARMAssemblerInterface/ARMAssemblerProxy changes
+** ------------------------------------------------------------------
+**
+** - In the existing code, addresses were handled as 32-bit values in
+**   certain places.
+**
+** - Added a new set of functions for address load/store/manipulation.
+**   These are ADDR_LDR, ADDR_STR, ADDR_ADD, ADDR_SUB and they map to
+**   default 32 bit implementations in ARMAssemblerInterface.
+**
+** - ArmToArm64Assembler maps these functions to appropriate 64 bit
+**   functions.
+**
+** ----------------------
+** GGLAssembler changes
+** ----------------------
+** - Since ArmToArm64Assembler can generate four Arm64 instructions for
+**   each call in the worst case, the memory required is set to four
+**   times the ARM amount.
+**
+** - Address load/store/manipulation were changed to use new functions
+**   added in the ARMAssemblerInterface.
+**
+*/
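+
+/*
+** Worked example (a sketch, not part of the original design notes):
+** with the register assignments used below (w13 = zero register,
+** w15 = mTmpReg1), a conditional ARM move such as
+**
+**     MOVNE r0, r1
+**
+** is translated by dataProcessing() into an unconditional ORR into a
+** temporary followed by a conditional select:
+**
+**     ORR  w15, w13, w1      // mTmpReg1 = r1
+**     CSEL w0, w15, w0, NE   // keep the old r0 unless NE holds
+*/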
+
+
+#define NOT_IMPLEMENTED()  LOG_FATAL("Arm instruction %s not yet implemented\n", __func__)
+
+#define ARM64_ASM_DEBUG 0
+
+#if ARM64_ASM_DEBUG
+    #define LOG_INSTR(...) ALOGD("\t" __VA_ARGS__)
+    #define LOG_LABEL(...) ALOGD(__VA_ARGS__)
+#else
+    #define LOG_INSTR(...) ((void)0)
+    #define LOG_LABEL(...) ((void)0)
+#endif
+
+namespace android {
+
+static __unused const char* shift_codes[] =
+{
+    "LSL", "LSR", "ASR", "ROR"
+};
+static __unused const char *cc_codes[] =
+{
+    "EQ", "NE", "CS", "CC", "MI",
+    "PL", "VS", "VC", "HI", "LS",
+    "GE", "LT", "GT", "LE", "AL", "NV"
+};
+
+ArmToArm64Assembler::ArmToArm64Assembler(const sp<Assembly>& assembly)
+    :   ARMAssemblerInterface(),
+        mAssembly(assembly)
+{
+    mBase = mPC = (uint32_t *)assembly->base();
+    mDuration = ggl_system_time();
+    mZeroReg = 13;
+    mTmpReg1 = 15;
+    mTmpReg2 = 16;
+    mTmpReg3 = 17;
+}
+
+ArmToArm64Assembler::ArmToArm64Assembler(void *base)
+    :   ARMAssemblerInterface(), mAssembly(NULL)
+{
+    mBase = mPC = (uint32_t *)base;
+    mDuration = ggl_system_time();
+    // Regs 13, 15, 16, 17 are used as temporary registers
+    mZeroReg = 13;
+    mTmpReg1 = 15;
+    mTmpReg2 = 16;
+    mTmpReg3 = 17;
+}
+
+ArmToArm64Assembler::~ArmToArm64Assembler()
+{
+}
+
+uint32_t* ArmToArm64Assembler::pc() const
+{
+    return mPC;
+}
+
+uint32_t* ArmToArm64Assembler::base() const
+{
+    return mBase;
+}
+
+void ArmToArm64Assembler::reset()
+{
+    if(mAssembly == NULL)
+        mPC = mBase;
+    else
+        mBase = mPC = (uint32_t *)mAssembly->base();
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+#if ARM64_ASM_DEBUG
+    ALOGI("RESET\n");
+#endif
+}
+
+int ArmToArm64Assembler::getCodegenArch()
+{
+    return CODEGEN_ARCH_ARM64;
+}
+
+// ----------------------------------------------------------------------------
+
+void ArmToArm64Assembler::disassemble(const char* name)
+{
+    if(name)
+    {
+        printf("%s:\n", name);
+    }
+    size_t count = pc()-base();
+    uint32_t* i = base();
+    while (count--)
+    {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(i);
+        if (label >= 0)
+        {
+            printf("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(i);
+        if (comment >= 0)
+        {
+            printf("; %s\n", mComments.valueAt(comment));
+        }
+        printf("%p:    %08x    ", i, uint32_t(i[0]));
+        {
+            char instr[256];
+            ::arm64_disassemble(*i, instr);
+            printf("%s\n", instr);
+        }
+        i++;
+    }
+}
+
+void ArmToArm64Assembler::comment(const char* string)
+{
+    mComments.add(mPC, string);
+    LOG_INSTR("//%s\n", string);
+}
+
+void ArmToArm64Assembler::label(const char* theLabel)
+{
+    mLabels.add(theLabel, mPC);
+    mLabelsInverseMapping.add(mPC, theLabel);
+    LOG_LABEL("%s:\n", theLabel);
+}
+
+void ArmToArm64Assembler::B(int cc, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    LOG_INSTR("B%s %s\n", cc_codes[cc], label );
+    *mPC++ = (0x54 << 24) | cc;
+}
+
+void ArmToArm64Assembler::BL(int /*cc*/, const char* /*label*/)
+{
+    NOT_IMPLEMENTED(); //Not Required
+}
+
+// ----------------------------------------------------------------------------
+//Prolog/Epilog & Generate...
+// ----------------------------------------------------------------------------
+
+void ArmToArm64Assembler::prolog()
+{
+    // write prolog code
+    mPrologPC = mPC;
+    *mPC++ = A64_MOVZ_X(mZeroReg,0,0);
+}
+
+void ArmToArm64Assembler::epilog(uint32_t /*touched*/)
+{
+    // write epilog code
+    static const int XLR = 30;
+    *mPC++ = A64_RET(XLR);
+}
+
+int ArmToArm64Assembler::generate(const char* name)
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--)
+    {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - bt.pc);
+        *bt.pc |= (offset & 0x7FFFF) << 5;
+    }
+
+    if(mAssembly != NULL)
+        mAssembly->resize( int(pc()-base())*4 );
+
+    // the instruction cache is flushed by CodeCache
+    const int64_t duration = ggl_system_time() - mDuration;
+    const char * const format = "generated %s (%d ins) at [%p:%p] in %" PRId64 "ns\n";
+    ALOGI(format, name, int(pc()-base()), base(), pc(), duration);
+
+
+    char value[PROPERTY_VALUE_MAX];
+    property_get("debug.pf.disasm", value, "0");
+    if (atoi(value) != 0)
+    {
+        printf(format, name, int(pc()-base()), base(), pc(), duration);
+        disassemble(name);
+    }
+    return OK;
+}
+
+uint32_t* ArmToArm64Assembler::pcForLabel(const char* label)
+{
+    return mLabels.valueFor(label);
+}
+
+// ----------------------------------------------------------------------------
+// Data Processing...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::dataProcessingCommon(int opcode,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    if(opcode != opSUB && s == 1)
+    {
+        NOT_IMPLEMENTED(); //Not required
+        return;
+    }
+
+    if(opcode != opSUB && opcode != opADD && opcode != opAND &&
+       opcode != opORR && opcode != opMVN)
+    {
+        NOT_IMPLEMENTED(); //Not required
+        return;
+    }
+
+    if(Op2 == OPERAND_REG_IMM && mAddrMode.reg_imm_shift > 31)
+    {
+        NOT_IMPLEMENTED();
+        return;
+    }
+
+    //Store immediate in temporary register and convert
+    //immediate operation into register operation
+    if(Op2 == OPERAND_IMM)
+    {
+        int imm = mAddrMode.immediate;
+        *mPC++ = A64_MOVZ_W(mTmpReg2, imm & 0x0000FFFF, 0);
+        *mPC++ = A64_MOVK_W(mTmpReg2, (imm >> 16) & 0x0000FFFF, 16);
+        Op2 = mTmpReg2;
+    }
+
+
+    {
+        uint32_t shift;
+        uint32_t amount;
+        uint32_t Rm;
+
+        if(Op2 == OPERAND_REG_IMM)
+        {
+            shift   = mAddrMode.reg_imm_type;
+            amount  = mAddrMode.reg_imm_shift;
+            Rm      = mAddrMode.reg_imm_Rm;
+        }
+        else if(Op2 < OPERAND_REG)
+        {
+            shift   = 0;
+            amount  = 0;
+            Rm      = Op2;
+        }
+        else
+        {
+            NOT_IMPLEMENTED(); //Not required
+            return;
+        }
+
+        switch(opcode)
+        {
+            case opADD: *mPC++ = A64_ADD_W(Rd, Rn, Rm, shift, amount); break;
+            case opAND: *mPC++ = A64_AND_W(Rd, Rn, Rm, shift, amount); break;
+            case opORR: *mPC++ = A64_ORR_W(Rd, Rn, Rm, shift, amount); break;
+            case opMVN: *mPC++ = A64_ORN_W(Rd, Rn, Rm, shift, amount); break;
+            case opSUB: *mPC++ = A64_SUB_W(Rd, Rn, Rm, shift, amount, s);break;
+        }
+
+    }
+}
+
+void ArmToArm64Assembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    uint32_t Wd;
+
+    if(cc != AL)
+        Wd = mTmpReg1;
+    else
+        Wd = Rd;
+
+    if(opcode == opADD || opcode == opAND || opcode == opORR ||opcode == opSUB)
+    {
+        dataProcessingCommon(opcode, s, Wd, Rn, Op2);
+    }
+    else if(opcode == opCMP)
+    {
+        dataProcessingCommon(opSUB, 1, mTmpReg3, Rn, Op2);
+    }
+    else if(opcode == opRSB)
+    {
+        dataProcessingCommon(opSUB, s, Wd, Rn, Op2);
+        dataProcessingCommon(opSUB, s, Wd, mZeroReg, Wd);
+    }
+    else if(opcode == opMOV)
+    {
+        dataProcessingCommon(opORR, 0, Wd, mZeroReg, Op2);
+        if(s == 1)
+        {
+            dataProcessingCommon(opSUB, 1, mTmpReg3, Wd, mZeroReg);
+        }
+    }
+    else if(opcode == opMVN)
+    {
+        dataProcessingCommon(opMVN, s, Wd, mZeroReg, Op2);
+    }
+    else if(opcode == opBIC)
+    {
+        dataProcessingCommon(opMVN, s, mTmpReg3, mZeroReg, Op2);
+        dataProcessingCommon(opAND, s, Wd, Rn, mTmpReg3);
+    }
+    else
+    {
+        NOT_IMPLEMENTED();
+        return;
+    }
+
+    if(cc != AL)
+    {
+        *mPC++ = A64_CSEL_W(Rd, mTmpReg1, Rd, cc);
+    }
+}
+// ----------------------------------------------------------------------------
+// Address Processing...
+// ----------------------------------------------------------------------------
+
+void ArmToArm64Assembler::ADDR_ADD(int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+
+
+    if(Op2 == OPERAND_REG_IMM && mAddrMode.reg_imm_type == LSL)
+    {
+        int Rm = mAddrMode.reg_imm_Rm;
+        int amount = mAddrMode.reg_imm_shift;
+        *mPC++ = A64_ADD_X_Wm_SXTW(Rd, Rn, Rm, amount);
+    }
+    else if(Op2 < OPERAND_REG)
+    {
+        int Rm = Op2;
+        int amount = 0;
+        *mPC++ = A64_ADD_X_Wm_SXTW(Rd, Rn, Rm, amount);
+    }
+    else if(Op2 == OPERAND_IMM)
+    {
+        int imm = mAddrMode.immediate;
+        *mPC++ = A64_MOVZ_W(mTmpReg1, imm & 0x0000FFFF, 0);
+        *mPC++ = A64_MOVK_W(mTmpReg1, (imm >> 16) & 0x0000FFFF, 16);
+
+        int Rm = mTmpReg1;
+        int amount = 0;
+        *mPC++ = A64_ADD_X_Wm_SXTW(Rd, Rn, Rm, amount);
+    }
+    else
+    {
+        NOT_IMPLEMENTED(); //Not required
+    }
+}
+
+void ArmToArm64Assembler::ADDR_SUB(int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+
+    if(Op2 == OPERAND_REG_IMM && mAddrMode.reg_imm_type == LSR)
+    {
+        *mPC++ = A64_ADD_W(mTmpReg1, mZeroReg, mAddrMode.reg_imm_Rm,
+                           LSR, mAddrMode.reg_imm_shift);
+        *mPC++ = A64_SUB_X_Wm_SXTW(Rd, Rn, mTmpReg1, 0);
+    }
+    else
+    {
+        NOT_IMPLEMENTED(); //Not required
+    }
+}
+
+// ----------------------------------------------------------------------------
+// multiply...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::MLA(int cc, int s,int Rd, int Rm, int Rs, int Rn)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+
+    *mPC++ = A64_MADD_W(Rd, Rm, Rs, Rn);
+    if(s == 1)
+        dataProcessingCommon(opSUB, 1, mTmpReg1, Rd, mZeroReg);
+}
+void ArmToArm64Assembler::MUL(int cc, int s, int Rd, int Rm, int Rs)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+    *mPC++ = A64_MADD_W(Rd, Rm, Rs, mZeroReg);
+}
+void ArmToArm64Assembler::UMULL(int /*cc*/, int /*s*/,
+        int /*RdLo*/, int /*RdHi*/, int /*Rm*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::UMUAL(int /*cc*/, int /*s*/,
+        int /*RdLo*/, int /*RdHi*/, int /*Rm*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::SMULL(int /*cc*/, int /*s*/,
+        int /*RdLo*/, int /*RdHi*/, int /*Rm*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::SMUAL(int /*cc*/, int /*s*/,
+        int /*RdLo*/, int /*RdHi*/, int /*Rm*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+// ----------------------------------------------------------------------------
+// branches relative to PC...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::B(int /*cc*/, uint32_t* /*pc*/){
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::BL(int /*cc*/, uint32_t* /*pc*/){
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::BX(int /*cc*/, int /*Rn*/){
+    NOT_IMPLEMENTED(); //Not required
+}
+
+// ----------------------------------------------------------------------------
+// data transfer...
+// ----------------------------------------------------------------------------
+enum dataTransferOp
+{
+    opLDR,opLDRB,opLDRH,opSTR,opSTRB,opSTRH
+};
+
+void ArmToArm64Assembler::dataTransfer(int op, int cc,
+                            int Rd, int Rn, uint32_t op_type, uint32_t size)
+{
+    const int XSP = 31;
+    if(Rn == SP)
+        Rn = XSP;
+
+    if(op_type == OPERAND_IMM)
+    {
+        int addrReg;
+        int imm = mAddrMode.immediate;
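+        // Materialize the (up to 12-bit) immediate offset into mTmpReg1
+        // by adding it to, or subtracting it from, the zero register.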
+        if(imm >= 0 && imm < (1<<12))
+            *mPC++ = A64_ADD_IMM_X(mTmpReg1, mZeroReg, imm, 0);
+        else if(imm < 0 && -imm < (1<<12))
+            *mPC++ = A64_SUB_IMM_X(mTmpReg1, mZeroReg, -imm, 0);
+        else
+        {
+            NOT_IMPLEMENTED();
+            return;
+        }
+
+        addrReg = Rn;
+        if(mAddrMode.preindex == true || mAddrMode.postindex == true)
+        {
+            *mPC++ = A64_ADD_X(mTmpReg2, addrReg, mTmpReg1);
+            if(mAddrMode.preindex == true)
+                addrReg = mTmpReg2;
+        }
+
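+        // Arm64 has no conditionally executed loads/stores, so when a
+        // condition is given, branch over the single transfer
+        // instruction (8 bytes ahead); cc^1 inverts an ARM condition
+        // code, since the codes pair up as even/odd opposites.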
+        if(cc != AL)
+            *mPC++ = A64_B_COND(cc^1, 8);
+
+        *mPC++ = A64_LDRSTR_Wm_SXTW_0(op, size, Rd, addrReg, mZeroReg);
+
+        if(mAddrMode.writeback == true)
+            *mPC++ = A64_CSEL_X(Rn, mTmpReg2, Rn, cc);
+    }
+    else if(op_type == OPERAND_REG_OFFSET)
+    {
+        if(cc != AL)
+            *mPC++ = A64_B_COND(cc^1, 8);
+        *mPC++ = A64_LDRSTR_Wm_SXTW_0(op, size, Rd, Rn, mAddrMode.reg_offset);
+
+    }
+    else if(op_type > OPERAND_UNSUPPORTED)
+    {
+        if(cc != AL)
+            *mPC++ = A64_B_COND(cc^1, 8);
+        *mPC++ = A64_LDRSTR_Wm_SXTW_0(op, size, Rd, Rn, mZeroReg);
+    }
+    else
+    {
+        NOT_IMPLEMENTED(); // Not required
+    }
+    return;
+
+}
+void ArmToArm64Assembler::ADDR_LDR(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opLDR, cc, Rd, Rn, op_type, 64);
+}
+void ArmToArm64Assembler::ADDR_STR(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opSTR, cc, Rd, Rn, op_type, 64);
+}
+void ArmToArm64Assembler::LDR(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opLDR, cc, Rd, Rn, op_type);
+}
+void ArmToArm64Assembler::LDRB(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opLDRB, cc, Rd, Rn, op_type);
+}
+void ArmToArm64Assembler::STR(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opSTR, cc, Rd, Rn, op_type);
+}
+
+void ArmToArm64Assembler::STRB(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opSTRB, cc, Rd, Rn, op_type);
+}
+
+void ArmToArm64Assembler::LDRH(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opLDRH, cc, Rd, Rn, op_type);
+}
+void ArmToArm64Assembler::LDRSB(int /*cc*/, int /*Rd*/, int /*Rn*/, uint32_t /*offset*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::LDRSH(int /*cc*/, int /*Rd*/, int /*Rn*/, uint32_t /*offset*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::STRH(int cc, int Rd, int Rn, uint32_t op_type)
+{
+    return dataTransfer(opSTRH, cc, Rd, Rn, op_type);
+}
+
+// ----------------------------------------------------------------------------
+// block data transfer...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::LDM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{
+    const int XSP = 31;
+    if(cc != AL || dir != IA || W == 0 || Rn != SP)
+    {
+        NOT_IMPLEMENTED();
+        return;
+    }
+
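+    // Pop each register from its own 16-byte stack slot (post-index by
+    // 16), keeping SP 16-byte aligned as AArch64 requires; STM below
+    // mirrors this with pre-index stores.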
+    for(int i = 0; i < 32; ++i)
+    {
+        if((reg_list & (1 << i)))
+        {
+            int reg = i;
+            int size = 16;
+            *mPC++ = A64_LDR_IMM_PostIndex(reg, XSP, size);
+        }
+    }
+}
+
+void ArmToArm64Assembler::STM(int cc, int dir,
+        int Rn, int W, uint32_t reg_list)
+{
+    const int XSP = 31;
+    if(cc != AL || dir != DB || W == 0 || Rn != SP)
+    {
+        NOT_IMPLEMENTED();
+        return;
+    }
+
+    for(int i = 31; i >= 0; --i)
+    {
+        if((reg_list & (1 << i)))
+        {
+            int size = -16;
+            int reg  = i;
+            *mPC++ = A64_STR_IMM_PreIndex(reg, XSP, size);
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+// special...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::SWP(int /*cc*/, int /*Rn*/, int /*Rd*/, int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::SWPB(int /*cc*/, int /*Rn*/, int /*Rd*/, int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+void ArmToArm64Assembler::SWI(int /*cc*/, uint32_t /*comment*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+// ----------------------------------------------------------------------------
+// DSP instructions...
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::PLD(int /*Rn*/, uint32_t /*offset*/) {
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::CLZ(int /*cc*/, int /*Rd*/, int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::QADD(int /*cc*/, int /*Rd*/, int /*Rm*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::QDADD(int /*cc*/, int /*Rd*/, int /*Rm*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::QSUB(int /*cc*/, int /*Rd*/, int /*Rm*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+void ArmToArm64Assembler::QDSUB(int /*cc*/, int /*Rd*/, int /*Rm*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+}
+
+// ----------------------------------------------------------------------------
+// 16 x 16 multiplication
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+
+    if (xy & xyTB)
+        *mPC++ = A64_SBFM_W(mTmpReg1, Rm, 16, 31);
+    else
+        *mPC++ = A64_SBFM_W(mTmpReg1, Rm, 0, 15);
+
+    if (xy & xyBT)
+        *mPC++ = A64_SBFM_W(mTmpReg2, Rs, 16, 31);
+    else
+        *mPC++ = A64_SBFM_W(mTmpReg2, Rs, 0, 15);
+
+    *mPC++ = A64_MADD_W(Rd,mTmpReg1,mTmpReg2, mZeroReg);
+}
+// ----------------------------------------------------------------------------
+// 32 x 16 multiplication
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::SMULW(int cc, int y, int Rd, int Rm, int Rs)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+
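+    // ARM SMULW takes the top 32 bits of the 48-bit product of a 32-bit
+    // and a 16-bit operand: form the full 64-bit product with SMADDL,
+    // then extract bits [47:16] with UBFM.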
+    if (y & yT)
+        *mPC++ = A64_SBFM_W(mTmpReg1, Rs, 16, 31);
+    else
+        *mPC++ = A64_SBFM_W(mTmpReg1, Rs, 0, 15);
+
+    *mPC++ = A64_SBFM_W(mTmpReg2, Rm, 0, 31);
+    *mPC++ = A64_SMADDL(mTmpReg3,mTmpReg1,mTmpReg2, mZeroReg);
+    *mPC++ = A64_UBFM_X(Rd,mTmpReg3, 16, 47);
+}
+// ----------------------------------------------------------------------------
+// 16 x 16 multiplication and accumulate
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::SMLA(int cc, int xy, int Rd, int Rm, int Rs, int Rn)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    if(xy != xyBB) { NOT_IMPLEMENTED(); return;} //Not required
+
+    *mPC++ = A64_SBFM_W(mTmpReg1, Rm, 0, 15);
+    *mPC++ = A64_SBFM_W(mTmpReg2, Rs, 0, 15);
+    *mPC++ = A64_MADD_W(Rd, mTmpReg1, mTmpReg2, Rn);
+}
+
+void ArmToArm64Assembler::SMLAL(int /*cc*/, int /*xy*/,
+                int /*RdHi*/, int /*RdLo*/, int /*Rs*/, int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return;
+}
+
+void ArmToArm64Assembler::SMLAW(int /*cc*/, int /*y*/,
+                int /*Rd*/, int /*Rm*/, int /*Rs*/, int /*Rn*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return;
+}
+
+// ----------------------------------------------------------------------------
+// Byte/half word extract and extend
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::UXTB16(int cc, int Rd, int Rm, int rotate)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+
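+    // EXTR with Rn == Rm is a rotate right: rotate the source by
+    // rotate*8 bits, then mask with 0x00FF00FF to keep bytes 0 and 2,
+    // matching the ARM UXTB16 semantics.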
+    *mPC++ = A64_EXTR_W(mTmpReg1, Rm, Rm, rotate * 8);
+
+    uint32_t imm = 0x00FF00FF;
+    *mPC++ = A64_MOVZ_W(mTmpReg2, imm & 0xFFFF, 0);
+    *mPC++ = A64_MOVK_W(mTmpReg2, (imm >> 16) & 0x0000FFFF, 16);
+    *mPC++ = A64_AND_W(Rd,mTmpReg1, mTmpReg2);
+}
+
+// ----------------------------------------------------------------------------
+// Bit manipulation
+// ----------------------------------------------------------------------------
+void ArmToArm64Assembler::UBFX(int cc, int Rd, int Rn, int lsb, int width)
+{
+    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+    *mPC++ = A64_UBFM_W(Rd, Rn, lsb, lsb + width - 1);
+}
+// ----------------------------------------------------------------------------
+// Shifters...
+// ----------------------------------------------------------------------------
+int ArmToArm64Assembler::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    rot = 0;
+    imm = immediate;
+    return 0; // 0 means valid; every immediate is accepted here
+}
+
+
+bool ArmToArm64Assembler::isValidImmediate(uint32_t immediate)
+{
+    uint32_t rot, imm;
+    return buildImmediate(immediate, rot, imm) == 0;
+}
+
+uint32_t ArmToArm64Assembler::imm(uint32_t immediate)
+{
+    mAddrMode.immediate = immediate;
+    mAddrMode.writeback = false;
+    mAddrMode.preindex  = false;
+    mAddrMode.postindex = false;
+    return OPERAND_IMM;
+
+}
+
+uint32_t ArmToArm64Assembler::reg_imm(int Rm, int type, uint32_t shift)
+{
+    mAddrMode.reg_imm_Rm = Rm;
+    mAddrMode.reg_imm_type = type;
+    mAddrMode.reg_imm_shift = shift;
+    return OPERAND_REG_IMM;
+}
+
+uint32_t ArmToArm64Assembler::reg_rrx(int /*Rm*/)
+{
+    NOT_IMPLEMENTED();
+    return OPERAND_UNSUPPORTED;
+}
+
+uint32_t ArmToArm64Assembler::reg_reg(int /*Rm*/, int /*type*/, int /*Rs*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return OPERAND_UNSUPPORTED;
+}
+// ----------------------------------------------------------------------------
+// Addressing modes...
+// ----------------------------------------------------------------------------
+uint32_t ArmToArm64Assembler::immed12_pre(int32_t immed12, int W)
+{
+    mAddrMode.immediate = immed12;
+    mAddrMode.writeback = W;
+    mAddrMode.preindex  = true;
+    mAddrMode.postindex = false;
+    return OPERAND_IMM;
+}
+
+uint32_t ArmToArm64Assembler::immed12_post(int32_t immed12)
+{
+    mAddrMode.immediate = immed12;
+    mAddrMode.writeback = true;
+    mAddrMode.preindex  = false;
+    mAddrMode.postindex = true;
+    return OPERAND_IMM;
+}
+
+uint32_t ArmToArm64Assembler::reg_scale_pre(int Rm, int type,
+        uint32_t shift, int W)
+{
+    if(type != 0 || shift != 0 || W != 0)
+    {
+        NOT_IMPLEMENTED(); //Not required
+        return OPERAND_UNSUPPORTED;
+    }
+    else
+    {
+        mAddrMode.reg_offset = Rm;
+        return OPERAND_REG_OFFSET;
+    }
+}
+
+uint32_t ArmToArm64Assembler::reg_scale_post(int /*Rm*/, int /*type*/, uint32_t /*shift*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return OPERAND_UNSUPPORTED;
+}
+
+uint32_t ArmToArm64Assembler::immed8_pre(int32_t immed8, int W)
+{
+    mAddrMode.immediate = immed8;
+    mAddrMode.writeback = W;
+    mAddrMode.preindex  = true;
+    mAddrMode.postindex = false;
+    return OPERAND_IMM;
+}
+
+uint32_t ArmToArm64Assembler::immed8_post(int32_t immed8)
+{
+    mAddrMode.immediate = immed8;
+    mAddrMode.writeback = true;
+    mAddrMode.preindex  = false;
+    mAddrMode.postindex = true;
+    return OPERAND_IMM;
+}
+
+uint32_t ArmToArm64Assembler::reg_pre(int Rm, int W)
+{
+    if(W != 0)
+    {
+        NOT_IMPLEMENTED(); //Not required
+        return OPERAND_UNSUPPORTED;
+    }
+    else
+    {
+        mAddrMode.reg_offset = Rm;
+        return OPERAND_REG_OFFSET;
+    }
+}
+
+uint32_t ArmToArm64Assembler::reg_post(int /*Rm*/)
+{
+    NOT_IMPLEMENTED(); //Not required
+    return OPERAND_UNSUPPORTED;
+}
+
+// ----------------------------------------------------------------------------
+// A64 instructions
+// ----------------------------------------------------------------------------
+
+static __unused const char * dataTransferOpName[] =
+{
+    "LDR","LDRB","LDRH","STR","STRB","STRH"
+};
+
+static const uint32_t dataTransferOpCode [] =
+{
+    ((0xB8u << 24) | (0x3 << 21) | (0x6 << 13) | (0x0 << 12) |(0x1 << 11)),
+    ((0x38u << 24) | (0x3 << 21) | (0x6 << 13) | (0x1 << 12) |(0x1 << 11)),
+    ((0x78u << 24) | (0x3 << 21) | (0x6 << 13) | (0x0 << 12) |(0x1 << 11)),
+    ((0xB8u << 24) | (0x1 << 21) | (0x6 << 13) | (0x0 << 12) |(0x1 << 11)),
+    ((0x38u << 24) | (0x1 << 21) | (0x6 << 13) | (0x1 << 12) |(0x1 << 11)),
+    ((0x78u << 24) | (0x1 << 21) | (0x6 << 13) | (0x0 << 12) |(0x1 << 11))
+};
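+
+// A summary of the table above, inferred from the A64 encoding: the
+// leading byte (0x38/0x78/0xB8) selects byte/half/word size, bit 22
+// selects load vs store, and option=0b110 requests SXTW extension of
+// the Wm offset. A64_LDRSTR_Wm_SXTW_0() below ORs in Rt, Rn and Rm and
+// sets size bit 30 for the 64-bit forms.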
+uint32_t ArmToArm64Assembler::A64_LDRSTR_Wm_SXTW_0(uint32_t op,
+                            uint32_t size, uint32_t Rt,
+                            uint32_t Rn, uint32_t Rm)
+{
+    if(size == 32)
+    {
+        LOG_INSTR("%s W%d, [X%d, W%d, SXTW #0]\n",
+                   dataTransferOpName[op], Rt, Rn, Rm);
+        return(dataTransferOpCode[op] | (Rm << 16) | (Rn << 5) | Rt);
+    }
+    else
+    {
+        LOG_INSTR("%s X%d, [X%d, W%d, SXTW #0]\n",
+                  dataTransferOpName[op], Rt, Rn, Rm);
+        return(dataTransferOpCode[op] | (0x1<<30) | (Rm<<16) | (Rn<<5)|Rt);
+    }
+}
+
+uint32_t ArmToArm64Assembler::A64_STR_IMM_PreIndex(uint32_t Rt,
+                            uint32_t Rn, int32_t simm)
+{
+    if(Rn == 31)
+        LOG_INSTR("STR W%d, [SP, #%d]!\n", Rt, simm);
+    else
+        LOG_INSTR("STR W%d, [X%d, #%d]!\n", Rt, Rn, simm);
+
+    uint32_t imm9 = (unsigned)(simm) & 0x01FF;
+    return (0xB8 << 24) | (imm9 << 12) | (0x3 << 10) | (Rn << 5) | Rt;
+}
+
+uint32_t ArmToArm64Assembler::A64_LDR_IMM_PostIndex(uint32_t Rt,
+                            uint32_t Rn, int32_t simm)
+{
+    if(Rn == 31)
+        LOG_INSTR("LDR W%d, [SP], #%d\n",Rt,simm);
+    else
+        LOG_INSTR("LDR W%d, [X%d], #%d\n",Rt, Rn, simm);
+
+    uint32_t imm9 = (unsigned)(simm) & 0x01FF;
+    return (0xB8 << 24) | (0x1 << 22) |
+             (imm9 << 12) | (0x1 << 10) | (Rn << 5) | Rt;
+
+}
+uint32_t ArmToArm64Assembler::A64_ADD_X_Wm_SXTW(uint32_t Rd,
+                               uint32_t Rn,
+                               uint32_t Rm,
+                               uint32_t amount)
+{
+    LOG_INSTR("ADD X%d, X%d, W%d, SXTW #%d\n", Rd, Rn, Rm, amount);
+    return ((0x8B << 24) | (0x1 << 21) |(Rm << 16) |
+              (0x6 << 13) | (amount << 10) | (Rn << 5) | Rd);
+
+}
+
+uint32_t ArmToArm64Assembler::A64_SUB_X_Wm_SXTW(uint32_t Rd,
+                               uint32_t Rn,
+                               uint32_t Rm,
+                               uint32_t amount)
+{
+    LOG_INSTR("SUB X%d, X%d, W%d, SXTW #%d\n", Rd, Rn, Rm, amount);
+    return ((0xCB << 24) | (0x1 << 21) |(Rm << 16) |
+            (0x6 << 13) | (amount << 10) | (Rn << 5) | Rd);
+
+}
+
+uint32_t ArmToArm64Assembler::A64_B_COND(uint32_t cc, uint32_t offset)
+{
+    LOG_INSTR("B.%s #.+%d\n", cc_codes[cc], offset);
+    return (0x54 << 24) | ((offset/4) << 5) | (cc);
+
+}
+uint32_t ArmToArm64Assembler::A64_ADD_X(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("ADD X%d, X%d, X%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x8B << 24) | (shift << 22) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+uint32_t ArmToArm64Assembler::A64_ADD_IMM_X(uint32_t Rd, uint32_t Rn,
+                                          uint32_t imm, uint32_t shift)
+{
+    LOG_INSTR("ADD X%d, X%d, #%d, LSL #%d\n", Rd, Rn, imm, shift);
+    return (0x91 << 24) | ((shift/12) << 22) | (imm << 10) | (Rn << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_SUB_IMM_X(uint32_t Rd, uint32_t Rn,
+                                          uint32_t imm, uint32_t shift)
+{
+    LOG_INSTR("SUB X%d, X%d, #%d, LSL #%d\n", Rd, Rn, imm, shift);
+    return (0xD1 << 24) | ((shift/12) << 22) | (imm << 10) | (Rn << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_ADD_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("ADD W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x0B << 24) | (shift << 22) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_SUB_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount,
+                                          uint32_t setflag)
+{
+    if(setflag == 0)
+    {
+        LOG_INSTR("SUB W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+        return ((0x4B << 24) | (shift << 22) | ( Rm << 16) |
+                (amount << 10) |(Rn << 5) | Rd);
+    }
+    else
+    {
+        LOG_INSTR("SUBS W%d, W%d, W%d, %s #%d\n",
+                   Rd, Rn, Rm, shift_codes[shift], amount);
+        return ((0x6B << 24) | (shift << 22) | ( Rm << 16) |
+                (amount << 10) |(Rn << 5) | Rd);
+    }
+}
+
+uint32_t ArmToArm64Assembler::A64_AND_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("AND W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x0A << 24) | (shift << 22) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_ORR_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("ORR W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x2A << 24) | (shift << 22) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_ORN_W(uint32_t Rd, uint32_t Rn,
+                                          uint32_t Rm, uint32_t shift,
+                                          uint32_t amount)
+{
+    LOG_INSTR("ORN W%d, W%d, W%d, %s #%d\n",
+               Rd, Rn, Rm, shift_codes[shift], amount);
+    return ((0x2A << 24) | (shift << 22) | (0x1 << 21) | ( Rm << 16) |
+            (amount << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_CSEL_X(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t cond)
+{
+    LOG_INSTR("CSEL X%d, X%d, X%d, %s\n", Rd, Rn, Rm, cc_codes[cond]);
+    return ((0x9A << 24)|(0x1 << 23)|(Rm << 16) |(cond << 12)| (Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_CSEL_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t cond)
+{
+    LOG_INSTR("CSEL W%d, W%d, W%d, %s\n", Rd, Rn, Rm, cc_codes[cond]);
+    return ((0x1A << 24)|(0x1 << 23)|(Rm << 16) |(cond << 12)| (Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_RET(uint32_t Rn)
+{
+    LOG_INSTR("RET X%d\n", Rn);
+    return ((0xD6 << 24) | (0x1 << 22) | (0x1F << 16) | (Rn << 5));
+}
+
+uint32_t ArmToArm64Assembler::A64_MOVZ_X(uint32_t Rd, uint32_t imm,
+                                         uint32_t shift)
+{
+    LOG_INSTR("MOVZ X%d, #0x%x, LSL #%d\n", Rd, imm, shift);
+    return(0xD2 << 24) | (0x1 << 23) | ((shift/16) << 21) |  (imm << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_MOVK_W(uint32_t Rd, uint32_t imm,
+                                         uint32_t shift)
+{
+    LOG_INSTR("MOVK W%d, #0x%x, LSL #%d\n", Rd, imm, shift);
+    return (0x72 << 24) | (0x1 << 23) | ((shift/16) << 21) | (imm << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_MOVZ_W(uint32_t Rd, uint32_t imm,
+                                         uint32_t shift)
+{
+    LOG_INSTR("MOVZ W%d, #0x%x, LSL #%d\n", Rd, imm, shift);
+    return(0x52 << 24) | (0x1 << 23) | ((shift/16) << 21) |  (imm << 5) | Rd;
+}
+
+uint32_t ArmToArm64Assembler::A64_SMADDL(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t Ra)
+{
+    LOG_INSTR("SMADDL X%d, W%d, W%d, X%d\n",Rd, Rn, Rm, Ra);
+    return ((0x9B << 24) | (0x1 << 21) | (Rm << 16)|(Ra << 10)|(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_MADD_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t Ra)
+{
+    LOG_INSTR("MADD W%d, W%d, W%d, W%d\n",Rd, Rn, Rm, Ra);
+    return ((0x1B << 24) | (Rm << 16) | (Ra << 10) |(Rn << 5) | Rd);
+}
+
+uint32_t ArmToArm64Assembler::A64_SBFM_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t immr, uint32_t imms)
+{
+    LOG_INSTR("SBFM W%d, W%d, #%d, #%d\n", Rd, Rn, immr, imms);
+    return ((0x13 << 24) | (immr << 16) | (imms << 10) | (Rn << 5) | Rd);
+
+}
+uint32_t ArmToArm64Assembler::A64_UBFM_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t immr, uint32_t imms)
+{
+    LOG_INSTR("UBFM W%d, W%d, #%d, #%d\n", Rd, Rn, immr, imms);
+    return ((0x53 << 24) | (immr << 16) | (imms << 10) | (Rn << 5) | Rd);
+
+}
+uint32_t ArmToArm64Assembler::A64_UBFM_X(uint32_t Rd, uint32_t Rn,
+                                           uint32_t immr, uint32_t imms)
+{
+    LOG_INSTR("UBFM X%d, X%d, #%d, #%d\n", Rd, Rn, immr, imms);
+    return ((0xD3 << 24) | (0x1 << 22) |
+            (immr << 16) | (imms << 10) | (Rn << 5) | Rd);
+
+}
+uint32_t ArmToArm64Assembler::A64_EXTR_W(uint32_t Rd, uint32_t Rn,
+                                           uint32_t Rm, uint32_t lsb)
+{
+    LOG_INSTR("EXTR W%d, W%d, W%d, #%d\n", Rd, Rn, Rm, lsb);
+    return (0x13 << 24)|(0x1 << 23) | (Rm << 16) | (lsb << 10)|(Rn << 5) | Rd;
+}
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/Arm64Assembler.h b/libpixelflinger/codeflinger/Arm64Assembler.h
new file mode 100644
index 0000000..527c757
--- /dev/null
+++ b/libpixelflinger/codeflinger/Arm64Assembler.h
@@ -0,0 +1,290 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef ANDROID_ARMTOARM64ASSEMBLER_H
+#define ANDROID_ARMTOARM64ASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "tinyutils/smartpointer.h"
+#include "utils/Vector.h"
+#include "utils/KeyedVector.h"
+
+#include "tinyutils/smartpointer.h"
+#include "codeflinger/ARMAssemblerInterface.h"
+#include "codeflinger/CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+class ArmToArm64Assembler : public ARMAssemblerInterface
+{
+public:
+    explicit    ArmToArm64Assembler(const sp<Assembly>& assembly);
+    explicit    ArmToArm64Assembler(void *base);
+    virtual     ~ArmToArm64Assembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+
+
+    void        disassemble(const char* name);
+
+    // ------------------------------------------------------------------------
+    // ARMAssemblerInterface...
+    // ------------------------------------------------------------------------
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void ADDR_LDR(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void ADDR_ADD(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+    virtual void ADDR_SUB(int cc, int s, int Rd,
+                int Rn, uint32_t Op2);
+    virtual void ADDR_STR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+
+
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+private:
+    ArmToArm64Assembler(const ArmToArm64Assembler& rhs);
+    ArmToArm64Assembler& operator = (const ArmToArm64Assembler& rhs);
+
+    // -----------------------------------------------------------------------
+    // helper functions
+    // -----------------------------------------------------------------------
+
+    void dataTransfer(int operation, int cc, int Rd, int Rn,
+                      uint32_t operand_type, uint32_t size = 32);
+    void dataProcessingCommon(int opcode, int s,
+                      int Rd, int Rn, uint32_t Op2);
+
+    // -----------------------------------------------------------------------
+    // Arm64 instructions
+    // -----------------------------------------------------------------------
+    uint32_t A64_B_COND(uint32_t cc, uint32_t offset);
+    uint32_t A64_RET(uint32_t Rn);
+
+    uint32_t A64_LDRSTR_Wm_SXTW_0(uint32_t operation,
+                                uint32_t size, uint32_t Rt,
+                                uint32_t Rn, uint32_t Rm);
+
+    uint32_t A64_STR_IMM_PreIndex(uint32_t Rt, uint32_t Rn, int32_t simm);
+    uint32_t A64_LDR_IMM_PostIndex(uint32_t Rt,uint32_t Rn, int32_t simm);
+
+    uint32_t A64_ADD_X_Wm_SXTW(uint32_t Rd, uint32_t Rn, uint32_t Rm,
+                               uint32_t amount);
+    uint32_t A64_SUB_X_Wm_SXTW(uint32_t Rd, uint32_t Rn, uint32_t Rm,
+                               uint32_t amount);
+
+    uint32_t A64_ADD_IMM_X(uint32_t Rd, uint32_t Rn,
+                           uint32_t imm, uint32_t shift = 0);
+    uint32_t A64_SUB_IMM_X(uint32_t Rd, uint32_t Rn,
+                           uint32_t imm, uint32_t shift = 0);
+
+    uint32_t A64_ADD_X(uint32_t Rd, uint32_t Rn,
+                       uint32_t Rm, uint32_t shift = 0, uint32_t amount = 0);
+    uint32_t A64_ADD_W(uint32_t Rd, uint32_t Rn, uint32_t Rm,
+                       uint32_t shift = 0, uint32_t amount = 0);
+    uint32_t A64_SUB_W(uint32_t Rd, uint32_t Rn, uint32_t Rm,
+                       uint32_t shift = 0, uint32_t amount = 0,
+                       uint32_t setflag = 0);
+    uint32_t A64_AND_W(uint32_t Rd, uint32_t Rn,
+                       uint32_t Rm, uint32_t shift = 0, uint32_t amount = 0);
+    uint32_t A64_ORR_W(uint32_t Rd, uint32_t Rn,
+                       uint32_t Rm, uint32_t shift = 0, uint32_t amount = 0);
+    uint32_t A64_ORN_W(uint32_t Rd, uint32_t Rn,
+                       uint32_t Rm, uint32_t shift = 0, uint32_t amount = 0);
+
+    uint32_t A64_MOVZ_W(uint32_t Rd, uint32_t imm, uint32_t shift);
+    uint32_t A64_MOVZ_X(uint32_t Rd, uint32_t imm, uint32_t shift);
+    uint32_t A64_MOVK_W(uint32_t Rd, uint32_t imm, uint32_t shift);
+
+    uint32_t A64_SMADDL(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t Ra);
+    uint32_t A64_MADD_W(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t Ra);
+
+    uint32_t A64_SBFM_W(uint32_t Rd, uint32_t Rn,
+                        uint32_t immr, uint32_t imms);
+    uint32_t A64_UBFM_W(uint32_t Rd, uint32_t Rn,
+                        uint32_t immr, uint32_t imms);
+    uint32_t A64_UBFM_X(uint32_t Rd, uint32_t Rn,
+                        uint32_t immr, uint32_t imms);
+
+    uint32_t A64_EXTR_W(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t lsb);
+    uint32_t A64_CSEL_X(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t cond);
+    uint32_t A64_CSEL_W(uint32_t Rd, uint32_t Rn, uint32_t Rm, uint32_t cond);
+
+    uint32_t*       mBase;
+    uint32_t*       mPC;
+    uint32_t*       mPrologPC;
+    int64_t         mDuration;
+    uint32_t        mTmpReg1, mTmpReg2, mTmpReg3, mZeroReg;
+
+    struct branch_target_t {
+        inline branch_target_t() : label(0), pc(0) { }
+        inline branch_target_t(const char* l, uint32_t* p)
+            : label(l), pc(p) { }
+        const char* label;
+        uint32_t*   pc;
+    };
+
+    sp<Assembly>    mAssembly;
+    Vector<branch_target_t>                 mBranchTargets;
+    KeyedVector< const char*, uint32_t* >   mLabels;
+    KeyedVector< uint32_t*, const char* >   mLabelsInverseMapping;
+    KeyedVector< uint32_t*, const char* >   mComments;
+
+    enum operand_type_t
+    {
+        OPERAND_REG = 0x20,
+        OPERAND_IMM,
+        OPERAND_REG_IMM,
+        OPERAND_REG_OFFSET,
+        OPERAND_UNSUPPORTED
+    };
+
+    struct addr_mode_t {
+        int32_t         immediate;
+        bool            writeback;
+        bool            preindex;
+        bool            postindex;
+        int32_t         reg_imm_Rm;
+        int32_t         reg_imm_type;
+        uint32_t        reg_imm_shift;
+        int32_t         reg_offset;
+    } mAddrMode;
+
+};
+
+}; // namespace android
+
+#endif //ANDROID_ARM64ASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/Arm64Disassembler.cpp b/libpixelflinger/codeflinger/Arm64Disassembler.cpp
new file mode 100644
index 0000000..70f1ff1
--- /dev/null
+++ b/libpixelflinger/codeflinger/Arm64Disassembler.cpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <string.h>
+
+struct disasm_table_entry_t
+{
+    uint32_t       mask;
+    uint32_t       value;
+    const char*    instr_template;
+};
+
+
+static disasm_table_entry_t disasm_table[] =
+{
+    {0xff000000, 0x91000000, "add <xd|sp>, <xn|sp>, #<imm1>, <shift1>"},
+    {0xff000000, 0xd1000000, "sub <xd|sp>, <xn|sp>, #<imm1>, <shift1>"},
+    {0xff200000, 0x8b000000, "add <xd>, <xn>, <xm>, <shift2> #<amt1>"},
+    {0xff200000, 0x0b000000, "add <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x4b000000, "sub <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x6b000000, "subs <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x0a000000, "and <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x2a000000, "orr <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff200000, 0x2a200000, "orn <wd>, <wn>, <wm>, <shift2> #<amt1>"},
+    {0xff800000, 0x72800000, "movk <wd>, #<imm2>, lsl #<shift3>"},
+    {0xff800000, 0x52800000, "movz <wd>, #<imm2>, lsl #<shift3>"},
+    {0xff800000, 0xd2800000, "movz <xd>, #<imm2>, lsl #<shift3>"},
+    {0xffe00c00, 0x1a800000, "csel <wd>, <wn>, <wm>, <cond1>"},
+    {0xffe00c00, 0x9a800000, "csel <xd>, <xn>, <xm>, <cond1>"},
+    {0xffe00c00, 0x5a800000, "csinv <wd>, <wn>, <wm>, <cond1>"},
+    {0xffe08000, 0x1b000000, "madd <wd>, <wn>, <wm>, <wa>"},
+    {0xffe08000, 0x9b200000, "smaddl <xd>, <wn>, <wm>, <xa>"},
+    {0xffe04c00, 0xb8604800, "ldr <wt>, [<xn|sp>, <r1><m1>, <ext1> #<amt2>]"},
+    {0xffe04c00, 0xb8204800, "str <wt>, [<xn|sp>, <r1><m1>, <ext1> #<amt2>]"},
+    {0xffe04c00, 0xf8604800, "ldr <xt>, [<xn|sp>, <r1><m1>, <ext1> #<amt3>]"},
+    {0xffe04c00, 0xf8204800, "str <xt>, [<xn|sp>, <r1><m1>, <ext1> #<amt3>]"},
+    {0xffe04c00, 0x38604800, "ldrb <wt>, [<xn|sp>, <r1><m1>, <ext1> <amt5>]"},
+    {0xffe04c00, 0x38204800, "strb <wt>, [<xn|sp>, <r1><m1>, <ext1> <amt5>]"},
+    {0xffe04c00, 0x78604800, "ldrh <wt>, [<xn|sp>, <r1><m1>, <ext1> #<amt6>]"},
+    {0xffe04c00, 0x78204800, "strh <wt>, [<xn|sp>, <r1><m1>, <ext1> #<amt6>]"},
+    {0xffe00c00, 0xb8400400, "ldr <wt>, [<xn|sp>], #<simm1>"},
+    {0xffe00c00, 0xb8000c00, "str <wt>, [<xn|sp>, #<simm1>]!"},
+    {0xffc00000, 0x13000000, "sbfm <wd>, <wn>, #<immr1>, #<imms1>"},
+    {0xffc00000, 0x53000000, "ubfm <wd>, <wn>, #<immr1>, #<imms1>"},
+    {0xffc00000, 0xd3400000, "ubfm <xd>, <xn>, #<immr1>, #<imms1>"},
+    {0xffe00000, 0x13800000, "extr <wd>, <wn>, <wm>, #<lsb1>"},
+    {0xff000000, 0x54000000, "b.<cond2> <label1>"},
+    {0xfffffc1f, 0xd65f0000, "ret <xn>"},
+    {0xffe00000, 0x8b200000, "add <xd|sp>, <xn|sp>, <r2><m1>, <ext2> #<amt4>"},
+    {0xffe00000, 0xcb200000, "sub <xd|sp>, <xn|sp>, <r2><m1>, <ext2> #<amt4>"}
+};
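+
+// Each instruction word is matched against this table with
+// (code & mask) == value; the first hit wins. Worked example (illustrative):
+// 0xd65f03c0 & 0xfffffc1f == 0xd65f0000, so it selects the "ret <xn>" entry.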
+
+static int32_t bits_signed(uint32_t instr, uint32_t msb, uint32_t lsb)
+{
+    int32_t value;
+    value   = ((int32_t)instr) << (31 - msb);
+    value >>= (31 - msb);
+    value >>= lsb;
+    return value;
+}
+static uint32_t bits_unsigned(uint32_t instr, uint32_t msb, uint32_t lsb)
+{
+    uint32_t width = msb - lsb + 1;
+    uint32_t mask  = (1 << width) - 1;
+    return ((instr >> lsb) & mask);
+}
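+
+// Field-extraction helpers. Example (illustrative):
+//   bits_unsigned(0xd65f03c0, 9, 5) == 30   // the Rn field of "ret x30"
+// bits_signed() additionally sign-extends the field, so an all-ones field
+// decodes as -1.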
+
+static void get_token(const char *instr, uint32_t index, char *token)
+{
+    uint32_t i, j;
+    for(i = index, j = 0; i < strlen(instr); ++i)
+    {
+        if(instr[index] == '<' && instr[i] == '>')
+        {
+            token[j++] = instr[i];
+            break;
+        }
+        else if(instr[index] != '<' && instr[i] == '<')
+        {
+            break;
+        }
+        else
+        {
+            token[j++] = instr[i];
+        }
+    }
+    token[j] = '\0';
+    return;
+}
+
+
+static const char * token_cc_table[] =
+{
+    "eq", "ne", "cs", "cc", "mi",
+    "pl", "vs", "vc", "hi", "ls",
+    "ge", "lt", "gt", "le", "al", "nv"
+};
+
+static void decode_rx_zr_token(uint32_t reg, const char *prefix, char *instr_part)
+{
+    if(reg == 31)
+        sprintf(instr_part, "%s%s", prefix, "zr");
+    else
+        sprintf(instr_part, "%s%d", prefix, reg);
+}
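+
+// Register number 31 encodes the zero register (wzr/xzr) in these templates,
+// e.g. decode_rx_zr_token(31, "x", buf) yields "xzr"; the stack-pointer
+// reading of 31 is handled separately by the <xn|sp>/<xd|sp> tokens.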
+
+static void decode_token(uint32_t code, char *token, char *instr_part)
+{
+    if(strcmp(token, "<imm1>") == 0)
+        sprintf(instr_part, "0x%x", bits_unsigned(code, 21,10));
+    else if(strcmp(token, "<imm2>") == 0)
+        sprintf(instr_part, "0x%x", bits_unsigned(code, 20,5));
+    else if(strcmp(token, "<shift1>") == 0)
+        sprintf(instr_part, "lsl #%d", bits_unsigned(code, 23,22) * 12);
+    else if(strcmp(token, "<shift2>") == 0)
+    {
+        static const char * shift2_table[] = { "lsl", "lsr", "asr", "ror"};
+        sprintf(instr_part, "%s", shift2_table[bits_unsigned(code, 23,22)]);
+    }
+    else if(strcmp(token, "<shift3>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 22,21) * 16);
+    else if(strcmp(token, "<amt1>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 15,10));
+    else if(strcmp(token, "<amt2>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 12,12) * 2);
+    else if(strcmp(token, "<amt3>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 12,12) * 3);
+    else if(strcmp(token, "<amt4>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 12,10));
+    else if(strcmp(token, "<amt5>") == 0)
+    {
+        static const char * amt5_table[] = {"", "#0"};
+        sprintf(instr_part, "%s", amt5_table[bits_unsigned(code, 12,12)]);
+    }
+    else if(strcmp(token, "<amt6>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 12,12));
+    else if(strcmp(token, "<simm1>") == 0)
+        sprintf(instr_part, "%d", bits_signed(code, 20,12));
+    else if(strcmp(token, "<immr1>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 21,16));
+    else if(strcmp(token, "<imms1>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 15,10));
+    else if(strcmp(token, "<lsb1>") == 0)
+        sprintf(instr_part, "%d", bits_unsigned(code, 15,10));
+    else if(strcmp(token, "<cond1>") == 0)
+        sprintf(instr_part, "%s", token_cc_table[bits_unsigned(code, 15,12)]);
+    else if(strcmp(token, "<cond2>") == 0)
+        sprintf(instr_part, "%s", token_cc_table[bits_unsigned(code, 4,0)]);
+    else if(strcmp(token, "<r1>") == 0)
+    {
+        const char * token_r1_table[] =
+        {
+            "reserved", "reserved", "w", "x",
+            "reserved", "reserved", "w", "x"
+        };
+        sprintf(instr_part, "%s", token_r1_table[bits_unsigned(code, 15,13)]);
+    }
+    else if(strcmp(token, "<r2>") == 0)
+    {
+        static const char * token_r2_table[] =
+        {
+                "w","w","w", "x", "w", "w", "w", "x"
+        };
+        sprintf(instr_part, "%s", token_r2_table[bits_unsigned(code, 15,13)]);
+    }
+    else if(strcmp(token, "<m1>") == 0)
+    {
+        uint32_t reg = bits_unsigned(code, 20,16);
+        if(reg == 31)
+            sprintf(instr_part, "%s", "zr");
+        else
+            sprintf(instr_part, "%d", reg);
+    }
+    else if(strcmp(token, "<ext1>") == 0)
+    {
+        static const char * token_ext1_table[] =
+        {
+             "reserved","reserved","uxtw", "lsl",
+             "reserved","reserved", "sxtw", "sxtx"
+        };
+        sprintf(instr_part, "%s", token_ext1_table[bits_unsigned(code, 15,13)]);
+    }
+    else if(strcmp(token, "<ext2>") == 0)
+    {
+        static const char * token_ext2_table[] =
+        {
+                "uxtb","uxth","uxtw","uxtx",
+                "sxtb","sxth","sxtw","sxtx"
+        };
+        sprintf(instr_part, "%s", token_ext2_table[bits_unsigned(code, 15,13)]);
+    }
+    else if (strcmp(token, "<label1>") == 0)
+    {
+        int32_t offset = bits_signed(code, 23,5) * 4;
+        if(offset > 0)
+            sprintf(instr_part, "#.+%d", offset);
+        else
+            sprintf(instr_part, "#.-%d", -offset);
+    }
+    else if (strcmp(token, "<xn|sp>") == 0)
+    {
+        uint32_t reg = bits_unsigned(code, 9, 5);
+        if(reg == 31)
+            sprintf(instr_part, "%s", "sp");
+        else
+            sprintf(instr_part, "x%d", reg);
+    }
+    else if (strcmp(token, "<xd|sp>") == 0)
+    {
+        uint32_t reg = bits_unsigned(code, 4, 0);
+        if(reg == 31)
+            sprintf(instr_part, "%s", "sp");
+        else
+            sprintf(instr_part, "x%d", reg);
+    }
+    else if (strcmp(token, "<xn>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 9, 5), "x", instr_part);
+    else if (strcmp(token, "<xd>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 4, 0), "x", instr_part);
+    else if (strcmp(token, "<xm>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 20, 16), "x", instr_part);
+    else if (strcmp(token, "<xa>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 14, 10), "x", instr_part);
+    else if (strcmp(token, "<xt>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 4, 0), "x", instr_part);
+    else if (strcmp(token, "<wn>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 9, 5), "w", instr_part);
+    else if (strcmp(token, "<wd>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 4, 0), "w", instr_part);
+    else if (strcmp(token, "<wm>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 20, 16), "w", instr_part);
+    else if (strcmp(token, "<wa>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 14, 10), "w", instr_part);
+    else if (strcmp(token, "<wt>") == 0)
+        decode_rx_zr_token(bits_unsigned(code, 4, 0), "w", instr_part);
+    else
+    {
+        sprintf(instr_part, "error");
+    }
+    return;
+}
+
+int arm64_disassemble(uint32_t code, char* instr)
+{
+    uint32_t i;
+    char token[256];
+    char instr_part[256];
+
+    if(instr == NULL)
+        return -1;
+
+    bool matched = false;
+    disasm_table_entry_t *entry = NULL;
+    for(i = 0; i < sizeof(disasm_table)/sizeof(disasm_table_entry_t); ++i)
+    {
+        entry = &disasm_table[i];
+        if((code & entry->mask) == entry->value)
+        {
+            matched = true;
+            break;
+        }
+    }
+    if(matched == false)
+    {
+        strcpy(instr, "Unknown Instruction");
+        return -1;
+    }
+    else
+    {
+        uint32_t index = 0;
+        uint32_t length = strlen(entry->instr_template);
+        instr[0] = '\0';
+        do
+        {
+            get_token(entry->instr_template, index, token);
+            if(token[0] == '<')
+            {
+                decode_token(code, token, instr_part);
+                strcat(instr, instr_part);
+            }
+            else
+            {
+                strcat(instr, token);
+            }
+            index += strlen(token);
+        }while(index < length);
+        return 0;
+    }
+}
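+
+// Minimal usage sketch (illustrative only, not part of the original file):
+//
+//     char buf[256];
+//     if (arm64_disassemble(0xd65f03c0, buf) == 0)
+//         printf("%s\n", buf);    // prints "ret x30"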
diff --git a/libpixelflinger/codeflinger/Arm64Disassembler.h b/libpixelflinger/codeflinger/Arm64Disassembler.h
new file mode 100644
index 0000000..86f3aba
--- /dev/null
+++ b/libpixelflinger/codeflinger/Arm64Disassembler.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2013 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef ANDROID_ARM64DISASSEMBLER_H
+#define ANDROID_ARM64DISASSEMBLER_H
+
+#include <inttypes.h>
+int arm64_disassemble(uint32_t code, char* instr);
+
+#endif //ANDROID_ARM64DISASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/CodeCache.cpp b/libpixelflinger/codeflinger/CodeCache.cpp
new file mode 100644
index 0000000..32691a3
--- /dev/null
+++ b/libpixelflinger/codeflinger/CodeCache.cpp
@@ -0,0 +1,217 @@
+/* libs/pixelflinger/codeflinger/CodeCache.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "CodeCache"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+#include <cutils/ashmem.h>
+#include <log/log.h>
+
+#include "CodeCache.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+#if defined(__arm__) || defined(__aarch64__)
+#include <unistd.h>
+#include <errno.h>
+#endif
+
+#if defined(__mips__)
+#include <asm/cachectl.h>
+#include <errno.h>
+#endif
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// A dlmalloc mspace is used to manage the code cache over a mmaped region.
+#define HAVE_MMAP 0
+#define HAVE_MREMAP 0
+#define HAVE_MORECORE 0
+#define MALLOC_ALIGNMENT 16
+#define MSPACES 1
+#define NO_MALLINFO 1
+#define ONLY_MSPACES 1
+// Custom heap error handling.
+#define PROCEED_ON_ERROR 0
+static void heap_error(const char* msg, const char* function, void* p);
+#define CORRUPTION_ERROR_ACTION(m) \
+    heap_error("HEAP MEMORY CORRUPTION", __FUNCTION__, NULL)
+#define USAGE_ERROR_ACTION(m,p) \
+    heap_error("ARGUMENT IS INVALID HEAP ADDRESS", __FUNCTION__, p)
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wexpansion-to-defined"
+#pragma GCC diagnostic ignored "-Wnull-pointer-arithmetic"
+#include "../../../../external/dlmalloc/malloc.c"
+#pragma GCC diagnostic pop
+
+static void heap_error(const char* msg, const char* function, void* p) {
+    ALOG(LOG_FATAL, LOG_TAG, "@@@ ABORTING: CODE FLINGER: %s IN %s addr=%p",
+         msg, function, p);
+    /* So that we can get a memory dump around p */
+    *((int **) 0xdeadbaad) = (int *) p;
+}
+
+// ----------------------------------------------------------------------------
+
+static void* gExecutableStore = NULL;
+static mspace gMspace = NULL;
+const size_t kMaxCodeCacheCapacity = 1024 * 1024;
+
+static mspace getMspace()
+{
+    if (gExecutableStore == NULL) {
+        int fd = ashmem_create_region("CodeFlinger code cache",
+                                      kMaxCodeCacheCapacity);
+        LOG_ALWAYS_FATAL_IF(fd < 0,
+                            "Creating code cache, ashmem_create_region "
+                            "failed with error '%s'", strerror(errno));
+        gExecutableStore = mmap(NULL, kMaxCodeCacheCapacity,
+                                PROT_READ | PROT_WRITE | PROT_EXEC,
+                                MAP_PRIVATE, fd, 0);
+        LOG_ALWAYS_FATAL_IF(gExecutableStore == MAP_FAILED,
+                            "Creating code cache, mmap failed with error "
+                            "'%s'", strerror(errno));
+        close(fd);
+        gMspace = create_mspace_with_base(gExecutableStore, kMaxCodeCacheCapacity,
+                                          /*locked=*/ false);
+        mspace_set_footprint_limit(gMspace, kMaxCodeCacheCapacity);
+    }
+    return gMspace;
+}
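+
+// Note: all generated code is carved out of this single 1 MiB RWX mapping;
+// the Assembly objects below are just mspace_malloc()/mspace_free()
+// allocations within it.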
+
+Assembly::Assembly(size_t size)
+    : mCount(0), mSize(0)
+{
+    mBase = (uint32_t*)mspace_malloc(getMspace(), size);
+    LOG_ALWAYS_FATAL_IF(mBase == NULL,
+                        "Failed to create Assembly of size %zd in executable "
+                        "store of size %zd", size, kMaxCodeCacheCapacity);
+    mSize = size;
+}
+
+Assembly::~Assembly()
+{
+    mspace_free(getMspace(), mBase);
+}
+
+void Assembly::incStrong(const void*) const
+{
+    mCount.fetch_add(1, std::memory_order_relaxed);
+}
+
+void Assembly::decStrong(const void*) const
+{
+    if (mCount.fetch_sub(1, std::memory_order_acq_rel) == 1) {
+        delete this;
+    }
+}
+
+ssize_t Assembly::size() const
+{
+    if (!mBase) return NO_MEMORY;
+    return mSize;
+}
+
+uint32_t* Assembly::base() const
+{
+    return mBase;
+}
+
+ssize_t Assembly::resize(size_t newSize)
+{
+    mBase = (uint32_t*)mspace_realloc(getMspace(), mBase, newSize);
+    LOG_ALWAYS_FATAL_IF(mBase == NULL,
+                        "Failed to resize Assembly to %zd in code cache "
+                        "of size %zd", newSize, kMaxCodeCacheCapacity);
+    mSize = newSize;
+    return size();
+}
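+
+// Note: mspace_realloc() may relocate the block, so callers must re-fetch
+// base() after a successful resize().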
+
+// ----------------------------------------------------------------------------
+
+CodeCache::CodeCache(size_t size)
+    : mCacheSize(size), mCacheInUse(0)
+{
+    pthread_mutex_init(&mLock, 0);
+}
+
+CodeCache::~CodeCache()
+{
+    pthread_mutex_destroy(&mLock);
+}
+
+sp<Assembly> CodeCache::lookup(const AssemblyKeyBase& keyBase) const
+{
+    pthread_mutex_lock(&mLock);
+    sp<Assembly> r;
+    ssize_t index = mCacheData.indexOfKey(key_t(keyBase));
+    if (index >= 0) {
+        const cache_entry_t& e = mCacheData.valueAt(index);
+        e.when = mWhen++;
+        r = e.entry;
+    }
+    pthread_mutex_unlock(&mLock);
+    return r;
+}
+
+int CodeCache::cache(const AssemblyKeyBase& keyBase,
+                     const sp<Assembly>& assembly)
+{
+    pthread_mutex_lock(&mLock);
+
+    const ssize_t assemblySize = assembly->size();
+    while (mCacheInUse + assemblySize > mCacheSize) {
+        // evict the LRU
+        size_t lru = 0;
+        size_t count = mCacheData.size();
+        for (size_t i=0 ; i<count ; i++) {
+            const cache_entry_t& e = mCacheData.valueAt(i);
+            if (e.when < mCacheData.valueAt(lru).when) {
+                lru = i;
+            }
+        }
+        const cache_entry_t& e = mCacheData.valueAt(lru);
+        mCacheInUse -= e.entry->size();
+        mCacheData.removeItemsAt(lru);
+    }
+
+    ssize_t err = mCacheData.add(key_t(keyBase), cache_entry_t(assembly, mWhen));
+    if (err >= 0) {
+        mCacheInUse += assemblySize;
+        mWhen++;
+        // synchronize caches...
+        char* base = reinterpret_cast<char*>(assembly->base());
+        char* curr = reinterpret_cast<char*>(base + assembly->size());
+        __builtin___clear_cache(base, curr);
+    }
+
+    pthread_mutex_unlock(&mLock);
+    return err;
+}
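+
+// Typical caller pattern (an illustrative sketch; key_type stands for any
+// type usable with AssemblyKey<>):
+//
+//     sp<Assembly> assembly = codeCache.lookup(AssemblyKey<key_type>(key));
+//     if (assembly == 0) {
+//         // ... generate code into a freshly allocated Assembly ...
+//         codeCache.cache(AssemblyKey<key_type>(key), assembly);
+//     }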
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/CodeCache.h b/libpixelflinger/codeflinger/CodeCache.h
new file mode 100644
index 0000000..9326453
--- /dev/null
+++ b/libpixelflinger/codeflinger/CodeCache.h
@@ -0,0 +1,136 @@
+/* libs/pixelflinger/codeflinger/CodeCache.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_CODECACHE_H
+#define ANDROID_CODECACHE_H
+
+#include <atomic>
+#include <stdint.h>
+#include <pthread.h>
+#include <sys/types.h>
+
+#include "utils/KeyedVector.h"
+#include "tinyutils/smartpointer.h"
+
+namespace android {
+
+using namespace tinyutils;
+
+// ----------------------------------------------------------------------------
+
+class AssemblyKeyBase {
+public:
+    virtual ~AssemblyKeyBase() { }
+    virtual int compare_type(const AssemblyKeyBase& key) const = 0;
+};
+
+template <typename T>
+class AssemblyKey : public AssemblyKeyBase
+{
+public:
+    explicit AssemblyKey(const T& rhs) : mKey(rhs) { }
+    virtual int compare_type(const AssemblyKeyBase& key) const {
+        const T& rhs = static_cast<const AssemblyKey&>(key).mKey;
+        return android::compare_type(mKey, rhs);
+    }
+private:
+    T mKey;
+};
+
+// ----------------------------------------------------------------------------
+
+class Assembly
+{
+public:
+    explicit    Assembly(size_t size);
+    virtual     ~Assembly();
+
+    ssize_t     size() const;
+    uint32_t*   base() const;
+    ssize_t     resize(size_t size);
+
+    // protocol for sp<>
+            void    incStrong(const void* id) const;
+            void    decStrong(const void* id) const;
+    typedef void    weakref_type;
+
+private:
+    mutable std::atomic<int32_t>     mCount;
+            uint32_t*   mBase;
+            size_t      mSize;
+};
+
+// ----------------------------------------------------------------------------
+
+class CodeCache
+{
+public:
+// pretty simple cache API...
+    explicit            CodeCache(size_t size);
+                        ~CodeCache();
+
+    sp<Assembly>        lookup(const AssemblyKeyBase& key) const;
+
+    int                 cache(const AssemblyKeyBase& key,
+                              const sp<Assembly>& assembly);
+
+private:
+    // nothing to see here...
+    struct cache_entry_t {
+        inline cache_entry_t() { }
+        inline cache_entry_t(const sp<Assembly>& a, int64_t w)
+                : entry(a), when(w) { }
+        sp<Assembly>            entry;
+        mutable int64_t         when;
+    };
+
+    class key_t {
+        friend int compare_type(
+            const key_value_pair_t<key_t, cache_entry_t>&,
+            const key_value_pair_t<key_t, cache_entry_t>&);
+        const AssemblyKeyBase* mKey;
+    public:
+        key_t() { }
+        explicit key_t(const AssemblyKeyBase& k) : mKey(&k)  { }
+    };
+
+    mutable pthread_mutex_t             mLock;
+    mutable int64_t                     mWhen;
+    size_t                              mCacheSize;
+    size_t                              mCacheInUse;
+    KeyedVector<key_t, cache_entry_t>   mCacheData;
+
+    friend int compare_type(
+        const key_value_pair_t<key_t, cache_entry_t>&,
+        const key_value_pair_t<key_t, cache_entry_t>&);
+};
+
+// KeyedVector uses compare_type(), which is more efficient than
+// just using operator<().
+inline int compare_type(
+    const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& lhs,
+    const key_value_pair_t<CodeCache::key_t, CodeCache::cache_entry_t>& rhs)
+{
+    return lhs.key.mKey->compare_type(*(rhs.key.mKey));
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
+#endif //ANDROID_CODECACHE_H
diff --git a/libpixelflinger/codeflinger/GGLAssembler.cpp b/libpixelflinger/codeflinger/GGLAssembler.cpp
new file mode 100644
index 0000000..04e285d
--- /dev/null
+++ b/libpixelflinger/codeflinger/GGLAssembler.cpp
@@ -0,0 +1,1195 @@
+/* libs/pixelflinger/codeflinger/GGLAssembler.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "GGLAssembler"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <log/log.h>
+
+#include "GGLAssembler.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+GGLAssembler::GGLAssembler(ARMAssemblerInterface* target)
+    : ARMAssemblerProxy(target),
+      RegisterAllocator(ARMAssemblerProxy::getCodegenArch()), mOptLevel(7)
+{
+}
+
+GGLAssembler::~GGLAssembler()
+{
+}
+
+void GGLAssembler::prolog()
+{
+    ARMAssemblerProxy::prolog();
+}
+
+void GGLAssembler::epilog(uint32_t touched)
+{
+    ARMAssemblerProxy::epilog(touched);
+}
+
+void GGLAssembler::reset(int opt_level)
+{
+    ARMAssemblerProxy::reset();
+    RegisterAllocator::reset();
+    mOptLevel = opt_level;
+}
+
+// ---------------------------------------------------------------------------
+
+int GGLAssembler::scanline(const needs_t& needs, context_t const* c)
+{
+    int err = 0;
+    int opt_level = mOptLevel;
+    while (opt_level >= 0) {
+        reset(opt_level);
+        err = scanline_core(needs, c);
+        if (err == 0)
+            break;
+        opt_level--;
+    }
+    
+    // XXX: in theory, pcForLabel is not valid before generate()
+    uint32_t* fragment_start_pc = pcForLabel("fragment_loop");
+    uint32_t* fragment_end_pc = pcForLabel("epilog");
+    const int per_fragment_ops = int(fragment_end_pc - fragment_start_pc);
+    
+    // build a name for our pipeline
+    char name[64];    
+    sprintf(name,
+            "scanline__%08X:%08X_%08X_%08X [%3d ipp]",
+            needs.p, needs.n, needs.t[0], needs.t[1], per_fragment_ops);
+
+    if (err) {
+        ALOGE("Error while generating \"%s\"\n", name);
+        disassemble(name);
+        return -1;
+    }
+
+    return generate(name);
+}
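+
+// The generated pipeline name encodes the four needs words and the measured
+// instructions-per-pixel, e.g. (field values illustrative):
+//     scanline__00000077:03515104_00000000_00000000 [ 33 ipp]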
+
+int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
+{
+    mBlendFactorCached = 0;
+    mBlending = 0;
+    mMasking = 0;
+    mAA        = GGL_READ_NEEDS(P_AA, needs.p);
+    mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
+    mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
+    mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
+    mFog       = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
+    mSmooth    = GGL_READ_NEEDS(SHADE, needs.n) != 0;
+    mBuilderContext.needs = needs;
+    mBuilderContext.c = c;
+    mBuilderContext.Rctx = reserveReg(R0); // context always in R0
+    mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];
+
+    // ------------------------------------------------------------------------
+
+    decodeLogicOpNeeds(needs);
+
+    decodeTMUNeeds(needs, c);
+
+    mBlendSrc  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
+    mBlendDst  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
+    mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
+    mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));
+
+    if (!mCbFormat.c[GGLFormat::ALPHA].h) {
+        if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendSrc == GGL_DST_ALPHA)) {
+            mBlendSrc = GGL_ONE;
+        }
+        if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendSrcA == GGL_DST_ALPHA)) {
+            mBlendSrcA = GGL_ONE;
+        }
+        if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendDst == GGL_DST_ALPHA)) {
+            mBlendDst = GGL_ONE;
+        }
+        if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
+            (mBlendDstA == GGL_DST_ALPHA)) {
+            mBlendDstA = GGL_ONE;
+        }
+    }
+
+    // if we need the framebuffer, read it now
+    const int blending =    blending_codes(mBlendSrc, mBlendDst) |
+                            blending_codes(mBlendSrcA, mBlendDstA);
+
+    // XXX: handle special cases, destination not modified...
+    if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
+        (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
+        // Destination unmodified (beware of logic ops)
+    } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
+        (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
+        // Destination is zero (beware of logic ops)
+    }
+    
+    int fbComponents = 0;
+    const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
+    for (int i=0 ; i<4 ; i++) {
+        const int mask = 1<<i;
+        component_info_t& info = mInfo[i];
+        int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+        int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+        if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
+            fs = GGL_ONE;
+        info.masked =   !!(masking & mask);
+        info.inDest =   !info.masked && mCbFormat.c[i].h && 
+                        ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
+        if (mCbFormat.components >= GGL_LUMINANCE &&
+                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
+            info.inDest = false;
+        }
+        info.needed =   (i==GGLFormat::ALPHA) && 
+                        (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
+        info.replaced = !!(mTextureMachine.replaced & mask);
+        info.iterated = (!info.replaced && (info.inDest || info.needed)); 
+        info.smooth =   mSmooth && info.iterated;
+        info.fog =      mFog && info.inDest && (i != GGLFormat::ALPHA);
+        info.blend =    (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
+
+        mBlending |= (info.blend ? mask : 0);
+        mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
+        fbComponents |= mCbFormat.c[i].h ? mask : 0;
+    }
+
+    mAllMasked = (mMasking == fbComponents);
+    if (mAllMasked) {
+        mDithering = 0;
+    }
+    
+    fragment_parts_t parts;
+
+    // ------------------------------------------------------------------------
+    prolog();
+    // ------------------------------------------------------------------------
+
+    build_scanline_prolog(parts, needs);
+
+    if (registerFile().status())
+        return registerFile().status();
+
+    // ------------------------------------------------------------------------
+    label("fragment_loop");
+    // ------------------------------------------------------------------------
+    {
+        Scratch regs(registerFile());
+
+        if (mDithering) {
+            // update the dither index.
+            MOV(AL, 0, parts.count.reg,
+                    reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
+            ADD(AL, 0, parts.count.reg, parts.count.reg,
+                    imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
+            MOV(AL, 0, parts.count.reg,
+                    reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
+        }
+
+        // XXX: could we do an early alpha-test here in some cases?
+        // It would probably be used only with smooth-alpha and no texture
+        // (or no alpha component in the texture).
+
+        // Early z-test
+        if (mAlphaTest==GGL_ALWAYS) {
+            build_depth_test(parts, Z_TEST|Z_WRITE);
+        } else {
+            // we cannot do the z-write here, because
+            // it might be killed by the alpha-test later
+            build_depth_test(parts, Z_TEST);
+        }
+
+        { // texture coordinates
+            Scratch scratches(registerFile());
+
+            // texel generation
+            build_textures(parts, regs);
+            if (registerFile().status())
+                return registerFile().status();
+        }
+
+        if ((blending & (FACTOR_DST|BLEND_DST)) || 
+                (mMasking && !mAllMasked) ||
+                (mLogicOp & LOGIC_OP_DST)) 
+        {
+            // blending / logic_op / masking need the framebuffer
+            mDstPixel.setTo(regs.obtain(), &mCbFormat);
+
+            // load the framebuffer pixel
+            comment("fetch color-buffer");
+            load(parts.cbPtr, mDstPixel);
+        }
+
+        if (registerFile().status())
+            return registerFile().status();
+
+        pixel_t pixel;
+        int directTex = mTextureMachine.directTexture;
+        if (directTex | parts.packed) {
+            // note: we can't have both here
+            // iterated color or direct texture
+            pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
+            pixel.flags &= ~CORRUPTIBLE;
+        } else {
+            if (mDithering) {
+                const int ctxtReg = mBuilderContext.Rctx;
+                const int mask = GGL_DITHER_SIZE-1;
+                parts.dither = reg_t(regs.obtain());
+                AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
+                ADDR_ADD(AL, 0, parts.dither.reg, ctxtReg, parts.dither.reg);
+                LDRB(AL, parts.dither.reg, parts.dither.reg,
+                        immed12_pre(GGL_OFFSETOF(ditherMatrix)));
+            }
+        
+            // allocate a register for the resulting pixel
+            pixel.setTo(regs.obtain(), &mCbFormat, FIRST);
+
+            build_component(pixel, parts, GGLFormat::ALPHA,    regs);
+
+            if (mAlphaTest!=GGL_ALWAYS) {
+                // only handle the z-write part here. We know z-test
+                // was successful, as well as alpha-test.
+                build_depth_test(parts, Z_WRITE);
+            }
+
+            build_component(pixel, parts, GGLFormat::RED,      regs);
+            build_component(pixel, parts, GGLFormat::GREEN,    regs);
+            build_component(pixel, parts, GGLFormat::BLUE,     regs);
+
+            pixel.flags |= CORRUPTIBLE;
+        }
+
+        if (registerFile().status())
+            return registerFile().status();
+        
+        if (pixel.reg == -1) {
+            // be defensive here. if we're here it's probably
+            // that this whole fragment is a no-op.
+            pixel = mDstPixel;
+        }
+        
+        if (!mAllMasked) {
+            // logic operation
+            build_logic_op(pixel, regs);
+    
+            // masking
+            build_masking(pixel, regs); 
+    
+            comment("store");
+            store(parts.cbPtr, pixel, WRITE_BACK);
+        }
+    }
+
+    if (registerFile().status())
+        return registerFile().status();
+
+    // update the iterated color...
+    if (parts.reload != 3) {
+        build_smooth_shade(parts);
+    }
+
+    // update iterated z
+    build_iterate_z(parts);
+
+    // update iterated fog
+    build_iterate_f(parts);
+
+    SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
+    B(PL, "fragment_loop");
+    label("epilog");
+    epilog(registerFile().touched());
+
+    if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
+        if (mDepthTest!=GGL_ALWAYS) {
+            label("discard_before_textures");
+            build_iterate_texture_coordinates(parts);
+        }
+        label("discard_after_textures");
+        build_smooth_shade(parts);
+        build_iterate_z(parts);
+        build_iterate_f(parts);
+        if (!mAllMasked) {
+            ADDR_ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
+        }
+        SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
+        B(PL, "fragment_loop");
+        epilog(registerFile().touched());
+    }
+
+    return registerFile().status();
+}
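+
+// Shape of the code emitted above:
+//     prolog
+//     fragment_loop: dither update, z-test, texturing, blending,
+//                    logic op / masking, store
+//     iterator updates, count -= 1<<16, branch back while positive
+//     epilog (plus discard_* paths when the alpha- or z-test can fail)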
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_scanline_prolog(
+    fragment_parts_t& parts, const needs_t& needs)
+{
+    Scratch scratches(registerFile());
+
+    // compute count
+    comment("compute ct (# of pixels to process)");
+    parts.count.setTo(obtainReg());
+    int Rx = scratches.obtain();    
+    int Ry = scratches.obtain();
+    CONTEXT_LOAD(Rx, iterators.xl);
+    CONTEXT_LOAD(parts.count.reg, iterators.xr);
+    CONTEXT_LOAD(Ry, iterators.y);
+
+    // parts.count = iterators.xr - Rx
+    SUB(AL, 0, parts.count.reg, parts.count.reg, Rx);
+    SUB(AL, 0, parts.count.reg, parts.count.reg, imm(1));
+
+    if (mDithering) {
+        // parts.count.reg = 0xNNNNXXDD
+        // NNNN = count-1
+        // DD   = dither offset
+        // XX   = 0xxxxxxx (x = garbage)
+        Scratch scratches(registerFile());
+        int tx = scratches.obtain();
+        int ty = scratches.obtain();
+        AND(AL, 0, tx, Rx, imm(GGL_DITHER_MASK));
+        AND(AL, 0, ty, Ry, imm(GGL_DITHER_MASK));
+        ADD(AL, 0, tx, tx, reg_imm(ty, LSL, GGL_DITHER_ORDER_SHIFT));
+        ORR(AL, 0, parts.count.reg, tx, reg_imm(parts.count.reg, LSL, 16));
+    } else {
+        // parts.count.reg = 0xNNNN0000
+        // NNNN = count-1
+        MOV(AL, 0, parts.count.reg, reg_imm(parts.count.reg, LSL, 16));
+    }
+
+    if (!mAllMasked) {
+        // compute dst ptr
+        comment("compute color-buffer pointer");
+        const int cb_bits = mCbFormat.size*8;
+        int Rs = scratches.obtain();
+        parts.cbPtr.setTo(obtainReg(), cb_bits);
+        CONTEXT_LOAD(Rs, state.buffers.color.stride);
+        CONTEXT_ADDR_LOAD(parts.cbPtr.reg, state.buffers.color.data);
+        SMLABB(AL, Rs, Ry, Rs, Rx);  // Rs = Rx + Ry*Rs
+        base_offset(parts.cbPtr, parts.cbPtr, Rs);
+        scratches.recycle(Rs);
+    }
+    
+    // init fog
+    const int need_fog = GGL_READ_NEEDS(P_FOG, needs.p);
+    if (need_fog) {
+        comment("compute initial fog coordinate");
+        Scratch scratches(registerFile());
+        int dfdx = scratches.obtain();
+        int ydfdy = scratches.obtain();
+        int f = ydfdy;
+        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);
+        CONTEXT_LOAD(ydfdy, iterators.ydfdy);
+        MLA(AL, 0, f, Rx, dfdx, ydfdy);
+        CONTEXT_STORE(f, generated_vars.f);
+    }
+
+    // init Z coordinate
+    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
+        parts.z = reg_t(obtainReg());
+        comment("compute initial Z coordinate");
+        Scratch scratches(registerFile());
+        int dzdx = scratches.obtain();
+        int ydzdy = parts.z.reg;
+        CONTEXT_LOAD(dzdx,  generated_vars.dzdx);   // 1.31 fixed-point
+        CONTEXT_LOAD(ydzdy, iterators.ydzdy);       // 1.31 fixed-point
+        MLA(AL, 0, parts.z.reg, Rx, dzdx, ydzdy);
+
+        // we're going to index zbase by parts.count
+        // zbase = base + (xl + count + stride*y)*2
+        int Rs = dzdx;
+        int zbase = scratches.obtain();
+        CONTEXT_LOAD(Rs, state.buffers.depth.stride);
+        CONTEXT_ADDR_LOAD(zbase, state.buffers.depth.data);
+        SMLABB(AL, Rs, Ry, Rs, Rx);
+        ADD(AL, 0, Rs, Rs, reg_imm(parts.count.reg, LSR, 16));
+        ADDR_ADD(AL, 0, zbase, zbase, reg_imm(Rs, LSL, 1));
+        CONTEXT_ADDR_STORE(zbase, generated_vars.zbase);
+    }
+
+    // init texture coordinates
+    init_textures(parts.coords, reg_t(Rx), reg_t(Ry));
+    scratches.recycle(Ry);
+
+    // iterated color
+    init_iterated_color(parts, reg_t(Rx));
+
+    // init coverage factor application (anti-aliasing)
+    if (mAA) {
+        parts.covPtr.setTo(obtainReg(), 16);
+        CONTEXT_ADDR_LOAD(parts.covPtr.reg, state.buffers.coverage);
+        ADDR_ADD(AL, 0, parts.covPtr.reg, parts.covPtr.reg, reg_imm(Rx, LSL, 1));
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_component( pixel_t& pixel,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& regs)
+{
+    static char const * comments[] = {"alpha", "red", "green", "blue"};
+    comment(comments[component]);
+
+    // local register file
+    Scratch scratches(registerFile());
+    const int dst_component_size = pixel.component_size(component);
+
+    component_t temp(-1);
+    build_incoming_component( temp, dst_component_size,
+            parts, component, scratches, regs);
+
+    if (mInfo[component].inDest) {
+
+        // blending...
+        build_blending( temp, mDstPixel, component, scratches );
+
+        // downshift component and rebuild pixel...
+        downshift(pixel, component, temp, parts.dither);
+    }
+}
+
+void GGLAssembler::build_incoming_component(
+                                    component_t& temp,
+                                    int dst_size,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& scratches,
+                                    Scratch& global_regs)
+{
+    const uint32_t component_mask = 1<<component;
+
+    // Figure out what we need for the blending stage...
+    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) {
+        fs = GGL_ONE;
+    }
+
+    // Figure out what we need to extract and for what reason
+    const int blending = blending_codes(fs, fd);
+
+    // Are we actually going to blend?
+    const int need_blending = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
+    
+    // expand the source if the destination has more bits
+    int need_expander = false;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT-1 ; i++) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if ((tmu.format_idx) &&
+            (parts.texel[i].component_size(component) < dst_size)) {
+            need_expander = true;
+        }
+    }
+
+    // do we need to extract this component?
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    const int blend_needs_alpha_source = (component==GGLFormat::ALPHA) &&
+                                        (isAlphaSourceNeeded());
+    int need_extract = mInfo[component].needed;
+    if (mInfo[component].inDest)
+    {
+        need_extract |= ((need_blending ?
+                (blending & (BLEND_SRC|FACTOR_SRC)) : need_expander));
+        need_extract |= (mTextureMachine.mask != mTextureMachine.replaced);
+        need_extract |= mInfo[component].smooth;
+        need_extract |= mInfo[component].fog;
+        need_extract |= mDithering;
+        need_extract |= multiTexture;
+    }
+
+    if (need_extract) {
+        Scratch& regs = blend_needs_alpha_source ? global_regs : scratches;
+        component_t fragment;
+
+        // iterated color
+        build_iterated_color(fragment, parts, component, regs);
+
+        // texture environment (decal, modulate, replace)
+        build_texture_environment(fragment, parts, component, regs);
+
+        // expand the source if the destination has more bits
+        if (need_expander && (fragment.size() < dst_size)) {
+            // we're here only if we fetched a texel
+            // (so we know for sure fragment is CORRUPTIBLE)
+            expand(fragment, fragment, dst_size);
+        }
+
+        // We have a few specific things to do for the alpha-channel
+        if ((component==GGLFormat::ALPHA) &&
+            (mInfo[component].needed || fragment.size()<dst_size))
+        {
+            // convert to integer_t first and make sure
+            // we don't corrupt a needed register
+            if (fragment.l) {
+                component_t incoming(fragment);
+                modify(fragment, regs);
+                MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSR, incoming.l));
+                fragment.h -= fragment.l;
+                fragment.l = 0;
+            }
+
+            // coverage factor application
+            build_coverage_application(fragment, parts, regs);
+
+            // alpha-test
+            build_alpha_test(fragment, parts);
+
+            if (blend_needs_alpha_source) {
+                // We keep only 8 bits for the blending stage
+                const int shift = fragment.h <= 8 ? 0 : fragment.h-8;
+                if (fragment.flags & CORRUPTIBLE) {
+                    fragment.flags &= ~CORRUPTIBLE;
+                    mAlphaSource.setTo(fragment.reg,
+                            fragment.size(), fragment.flags);
+                    if (shift) {
+                        MOV(AL, 0, mAlphaSource.reg,
+                            reg_imm(mAlphaSource.reg, LSR, shift));
+                    }
+                } else {
+                    // XXX: it would be better to do this in build_blend_factor()
+                    // so we can avoid the extra MOV below.
+                    mAlphaSource.setTo(regs.obtain(),
+                            fragment.size(), CORRUPTIBLE);
+                    if (shift) {
+                        MOV(AL, 0, mAlphaSource.reg,
+                            reg_imm(fragment.reg, LSR, shift));
+                    } else {
+                        MOV(AL, 0, mAlphaSource.reg, fragment.reg);
+                    }
+                }
+                mAlphaSource.s -= shift;
+            }
+        }
+
+        // fog...
+        build_fog( fragment, component, regs );
+
+        temp = fragment;
+    } else {
+        if (mInfo[component].inDest) {
+            // extraction not needed and replace
+            // we just select the right component
+            if ((mTextureMachine.replaced & component_mask) == 0) {
+                // component wasn't replaced, so use it!
+                temp = component_t(parts.iterated, component);
+            }
+            for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+                const texture_unit_t& tmu = mTextureMachine.tmu[i];
+                if ((tmu.mask & component_mask) &&
+                    ((tmu.replaced & component_mask) == 0)) {
+                    temp = component_t(parts.texel[i], component);
+                }
+            }
+        }
+    }
+}
+
+bool GGLAssembler::isAlphaSourceNeeded() const
+{
+    // XXX: also needed for alpha-test
+    const int bs = mBlendSrc;
+    const int bd = mBlendDst;
+    return  bs==GGL_SRC_ALPHA_SATURATE ||
+            bs==GGL_SRC_ALPHA || bs==GGL_ONE_MINUS_SRC_ALPHA ||
+            bd==GGL_SRC_ALPHA || bd==GGL_ONE_MINUS_SRC_ALPHA ; 
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_smooth_shade(const fragment_parts_t& parts)
+{
+    if (mSmooth && !parts.iterated_packed) {
+        // update the iterated color in a pipelined way...
+        comment("update iterated color");
+        Scratch scratches(registerFile());
+
+        const int reload = parts.reload;
+        for (int i=0 ; i<4 ; i++) {
+            if (!mInfo[i].iterated) 
+                continue;
+                
+            int c = parts.argb[i].reg;
+            int dx = parts.argb_dx[i].reg;
+            
+            if (reload & 1) {
+                c = scratches.obtain();
+                CONTEXT_LOAD(c, generated_vars.argb[i].c);
+            }
+            if (reload & 2) {
+                dx = scratches.obtain();
+                CONTEXT_LOAD(dx, generated_vars.argb[i].dx);
+            }
+            
+            if (mSmooth) {
+                ADD(AL, 0, c, c, dx);
+            }
+            
+            if (reload & 1) {
+                CONTEXT_STORE(c, generated_vars.argb[i].c);
+                scratches.recycle(c);
+            }
+            if (reload & 2) {
+                scratches.recycle(dx);
+            }
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_coverage_application(component_t& fragment,
+        const fragment_parts_t& parts, Scratch& regs)
+{
+    // here fragment.l is guaranteed to be 0
+    if (mAA) {
+        // coverages are 1.15 fixed-point numbers
+        comment("coverage application");
+
+        component_t incoming(fragment);
+        modify(fragment, regs);
+
+        Scratch scratches(registerFile());
+        int cf = scratches.obtain();
+        LDRH(AL, cf, parts.covPtr.reg, immed8_post(2));
+        if (fragment.h > 31) {
+            fragment.h--;
+            SMULWB(AL, fragment.reg, incoming.reg, cf);
+        } else {
+            MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSL, 1));
+            SMULWB(AL, fragment.reg, fragment.reg, cf);
+        }
+    }
+}
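+
+// Note on the math above: SMULWB computes (Rm * low16(Rs)) >> 16. Since the
+// coverage cf is 1.15 fixed-point, scaling by cf/2^15 needs one extra left
+// shift, supplied either by the LSL #1 or by decrementing fragment.h.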
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_alpha_test(component_t& fragment,
+                                    const fragment_parts_t& /*parts*/)
+{
+    if (mAlphaTest != GGL_ALWAYS) {
+        comment("Alpha Test");
+        Scratch scratches(registerFile());
+        int ref = scratches.obtain();
+        const int shift = GGL_COLOR_BITS-fragment.size();
+        CONTEXT_LOAD(ref, state.alpha_test.ref);
+        if (shift) CMP(AL, fragment.reg, reg_imm(ref, LSR, shift));
+        else       CMP(AL, fragment.reg, ref);
+        int cc = NV;
+        switch (mAlphaTest) {
+        case GGL_NEVER:     cc = NV;    break;
+        case GGL_LESS:      cc = LT;    break;
+        case GGL_EQUAL:     cc = EQ;    break;
+        case GGL_LEQUAL:    cc = LS;    break;
+        case GGL_GREATER:   cc = HI;    break;
+        case GGL_NOTEQUAL:  cc = NE;    break;
+        case GGL_GEQUAL:    cc = HS;    break;
+        }
+        B(cc^1, "discard_after_textures");
+    }
+}
+
+// ---------------------------------------------------------------------------
+            
+void GGLAssembler::build_depth_test(
+        const fragment_parts_t& parts, uint32_t mask)
+{
+    mask &= Z_TEST|Z_WRITE;
+    const needs_t& needs = mBuilderContext.needs;
+    const int zmask = GGL_READ_NEEDS(P_MASK_Z, needs.p);
+    Scratch scratches(registerFile());
+
+    if (mDepthTest != GGL_ALWAYS || zmask) {
+        int cc=AL, ic=AL;
+        switch (mDepthTest) {
+        case GGL_LESS:      ic = HI;    break;
+        case GGL_EQUAL:     ic = EQ;    break;
+        case GGL_LEQUAL:    ic = HS;    break;
+        case GGL_GREATER:   ic = LT;    break;
+        case GGL_NOTEQUAL:  ic = NE;    break;
+        case GGL_GEQUAL:    ic = LS;    break;
+        case GGL_NEVER:
+            // this never happens, because it's taken care of when 
+            // computing the needs, but we keep it for completeness.
+            comment("Depth Test (NEVER)");
+            B(AL, "discard_before_textures");
+            return;
+        case GGL_ALWAYS:
+            // we're here because zmask is enabled
+            mask &= ~Z_TEST;    // test always passes.
+            break;
+        }
+        
+        // invert the condition (flipping the LSB of an ARM condition
+        // code yields its logical negation)
+        cc = ic^1;
+        
+        if ((mask & Z_WRITE) && !zmask) {
+            mask &= ~Z_WRITE;
+        }
+        
+        if (!mask)
+            return;
+
+        comment("Depth Test");
+
+        int zbase = scratches.obtain();
+        int depth = scratches.obtain();
+        int z = parts.z.reg;
+        
+        CONTEXT_ADDR_LOAD(zbase, generated_vars.zbase);  // stall
+        ADDR_SUB(AL, 0, zbase, zbase, reg_imm(parts.count.reg, LSR, 15));
+            // above does zbase = zbase - ((count >> 16) << 1)
+
+        if (mask & Z_TEST) {
+            LDRH(AL, depth, zbase);  // stall
+            CMP(AL, depth, reg_imm(z, LSR, 16));
+            B(cc, "discard_before_textures");
+        }
+        if (mask & Z_WRITE) {
+            if (mask == Z_WRITE) {
+                // only z-write asked, cc is meaningless
+                ic = AL;
+            }
+            MOV(AL, 0, depth, reg_imm(z, LSR, 16));
+            STRH(ic, depth, zbase);
+        }
+    }
+}
+
+void GGLAssembler::build_iterate_z(const fragment_parts_t& parts)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
+        Scratch scratches(registerFile());
+        int dzdx = scratches.obtain();
+        CONTEXT_LOAD(dzdx, generated_vars.dzdx);    // stall
+        ADD(AL, 0, parts.z.reg, parts.z.reg, dzdx); 
+    }
+}
+
+void GGLAssembler::build_iterate_f(const fragment_parts_t& /*parts*/)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    if (GGL_READ_NEEDS(P_FOG, needs.p)) {
+        Scratch scratches(registerFile());
+        int dfdx = scratches.obtain();
+        int f = scratches.obtain();
+        CONTEXT_LOAD(f,     generated_vars.f);
+        CONTEXT_LOAD(dfdx,  generated_vars.dfdx);   // stall
+        ADD(AL, 0, f, f, dfdx);
+        CONTEXT_STORE(f,    generated_vars.f);
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_logic_op(pixel_t& pixel, Scratch& regs)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+    if (opcode == GGL_COPY)
+        return;
+    
+    comment("logic operation");
+
+    pixel_t s(pixel);
+    if (!(pixel.flags & CORRUPTIBLE)) {
+        pixel.reg = regs.obtain();
+        pixel.flags |= CORRUPTIBLE;
+    }
+    
+    pixel_t d(mDstPixel);
+    switch(opcode) {
+    case GGL_CLEAR:         MOV(AL, 0, pixel.reg, imm(0));          break;
+    case GGL_AND:           AND(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_AND_REVERSE:   BIC(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_COPY:                                                  break;
+    case GGL_AND_INVERTED:  BIC(AL, 0, pixel.reg, d.reg, s.reg);    break;
+    case GGL_NOOP:          MOV(AL, 0, pixel.reg, d.reg);           break;
+    case GGL_XOR:           EOR(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_OR:            ORR(AL, 0, pixel.reg, s.reg, d.reg);    break;
+    case GGL_NOR:           ORR(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_EQUIV:         EOR(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_INVERT:        MVN(AL, 0, pixel.reg, d.reg);           break;
+    case GGL_OR_REVERSE:    // s | ~d == ~(~s & d)
+                            BIC(AL, 0, pixel.reg, d.reg, s.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_COPY_INVERTED: MVN(AL, 0, pixel.reg, s.reg);           break;
+    case GGL_OR_INVERTED:   // ~s | d == ~(s & ~d)
+                            BIC(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_NAND:          AND(AL, 0, pixel.reg, s.reg, d.reg);
+                            MVN(AL, 0, pixel.reg, pixel.reg);       break;
+    case GGL_SET:           MVN(AL, 0, pixel.reg, imm(0));          break;
+    };        
+}
+
+// ---------------------------------------------------------------------------
+
+static uint32_t find_bottom(uint32_t val)
+{
+    uint32_t i = 0;
+    while (!(val & (3<<i)))
+        i+= 2;
+    return i;
+}
+
+static void normalize(uint32_t& val, uint32_t& rot)
+{
+    rot = 0;
+    while (!(val&3)  || (val & 0xFC000000)) {
+        uint32_t newval;
+        newval = val >> 2;
+        newval |= (val&3) << 30;
+        val = newval;
+        rot += 2;
+        if (rot == 32) {
+            rot = 0;
+            break;
+        }
+    }
+}
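+
+// For example, normalize(0x3FC00, rot) rotates right two bits at a time and
+// yields val = 0xFF with rot = 10; build_and_immediate() below then
+// re-materializes that 8-bit chunk as the single rotated immediate
+// 0xFF << 10 == 0x3FC00, i.e. one AND instruction on Arm.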
+
+void GGLAssembler::build_and_immediate(int d, int s, uint32_t mask, int bits)
+{
+    uint32_t rot;
+    uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
+    mask &= size;
+
+    if (mask == size) {
+        if (d != s)
+            MOV( AL, 0, d, s);
+        return;
+    }
+    
+    if ((getCodegenArch() == CODEGEN_ARCH_MIPS) ||
+        (getCodegenArch() == CODEGEN_ARCH_MIPS64)) {
+        // MIPS can do a 16-bit immediate in 1 instruction and a 32-bit one in 3.
+        // The 'while (mask)' code below is buggy on MIPS: isValidImmediate()
+        // always returns true there, so we would take the positive-logic path
+        // and emit multiple AND instructions.
+        AND( AL, 0, d, s, imm(mask) );
+        return;
+    }
+    else if (getCodegenArch() == CODEGEN_ARCH_ARM64) {
+        AND( AL, 0, d, s, imm(mask) );
+        return;
+    }
+
+    int negative_logic = !isValidImmediate(mask);
+    if (negative_logic) {
+        mask = ~mask & size;
+    }
+    normalize(mask, rot);
+
+    if (mask) {
+        while (mask) {
+            uint32_t bitpos = find_bottom(mask);
+            int shift = rot + bitpos;
+            uint32_t m = mask & (0xff << bitpos);
+            mask &= ~m;
+            m >>= bitpos;
+            int32_t newMask =  (m<<shift) | (m>>(32-shift));
+            if (!negative_logic) {
+                AND( AL, 0, d, s, imm(newMask) );
+            } else {
+                BIC( AL, 0, d, s, imm(newMask) );
+            }
+            s = d;
+        }
+    } else {
+        MOV( AL, 0, d, imm(0));
+    }
+}
+
+void GGLAssembler::build_masking(pixel_t& pixel, Scratch& regs)
+{
+    if (!mMasking || mAllMasked) {
+        return;
+    }
+
+    comment("color mask");
+
+    pixel_t fb(mDstPixel);
+    pixel_t s(pixel);
+    if (!(pixel.flags & CORRUPTIBLE)) {
+        pixel.reg = regs.obtain();
+        pixel.flags |= CORRUPTIBLE;
+    }
+
+    int mask = 0;
+    for (int i=0 ; i<4 ; i++) {
+        const int component_mask = 1<<i;
+        const int h = fb.format.c[i].h;
+        const int l = fb.format.c[i].l;
+        if (h && (!(mMasking & component_mask))) {
+            mask |= ((1<<(h-l))-1) << l;
+        }
+    }
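+    // e.g. for an RGB565 destination with only the green channel masked off,
+    // the loop above collects the writable red and blue fields:
+    // mask = 0xF800 | 0x001F = 0xF81F.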
+
+    // There is no need to clear the masked components of the source
+    // (unless we applied a logic op), because they're already zeroed 
+    // by construction (masked components are not computed)
+
+    if (mLogicOp) {
+        const needs_t& needs = mBuilderContext.needs;
+        const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+        if (opcode != GGL_CLEAR) {
+            // clear masked component of source
+            build_and_immediate(pixel.reg, s.reg, mask, fb.size());
+            s = pixel;
+        }
+    }
+
+    // clear non masked components of destination
+    build_and_immediate(fb.reg, fb.reg, ~mask, fb.size()); 
+
+    // or back the channels that were masked
+    if (s.reg == fb.reg) {
+         // this is in fact a MOV
+        if (s.reg == pixel.reg) {
+            // ugh. this is in fact a nop
+        } else {
+            MOV(AL, 0, pixel.reg, fb.reg);
+        }
+    } else {
+        ORR(AL, 0, pixel.reg, s.reg, fb.reg);
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::base_offset(
+        const pointer_t& d, const pointer_t& b, const reg_t& o)
+{
+    switch (b.size) {
+    case 32:
+        ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 2));
+        break;
+    case 24:
+        if (d.reg == b.reg) {
+            ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
+            ADDR_ADD(AL, 0, d.reg, d.reg, o.reg);
+        } else {
+            ADDR_ADD(AL, 0, d.reg, o.reg, reg_imm(o.reg, LSL, 1));
+            ADDR_ADD(AL, 0, d.reg, d.reg, b.reg);
+        }
+        break;
+    case 16:
+        ADDR_ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
+        break;
+    case 8:
+        ADDR_ADD(AL, 0, d.reg, b.reg, o.reg);
+        break;
+    }
+}
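+
+// Note: the 24-bit (3 bytes per pixel) case computes d = b + 3*o with two
+// adds (o + (o << 1)) instead of a multiply; the d.reg == b.reg test merely
+// orders the adds so the base is not clobbered before it is consumed.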
+
+// ----------------------------------------------------------------------------
+// cheezy register allocator...
+// ----------------------------------------------------------------------------
+
+// Modified to support MIPS processors, in a very simple way. We retain the
+// (Arm) limit of 16 total registers, but shift the mapping of those registers
+// from 0-15 to 2-17. Register 0 on Mips cannot be used as a GP register, and
+// register 1 has a traditional use as a temp.
+
+RegisterAllocator::RegisterAllocator(int arch) : mRegs(arch)
+{
+}
+
+void RegisterAllocator::reset()
+{
+    mRegs.reset();
+}
+
+int RegisterAllocator::reserveReg(int reg)
+{
+    return mRegs.reserve(reg);
+}
+
+int RegisterAllocator::obtainReg()
+{
+    return mRegs.obtain();
+}
+
+void RegisterAllocator::recycleReg(int reg)
+{
+    mRegs.recycle(reg);
+}
+
+RegisterAllocator::RegisterFile& RegisterAllocator::registerFile()
+{
+    return mRegs;
+}
+
+// ----------------------------------------------------------------------------
+
+RegisterAllocator::RegisterFile::RegisterFile(int codegen_arch)
+    : mRegs(0), mTouched(0), mStatus(0), mArch(codegen_arch), mRegisterOffset(0)
+{
+    if ((mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) ||
+        (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS64)) {
+        mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
+    }
+    reserve(ARMAssemblerInterface::SP);
+    reserve(ARMAssemblerInterface::PC);
+}
+
+RegisterAllocator::RegisterFile::RegisterFile(const RegisterFile& rhs, int codegen_arch)
+    : mRegs(rhs.mRegs), mTouched(rhs.mTouched), mArch(codegen_arch), mRegisterOffset(0)
+{
+    if ((mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) ||
+        (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS64)) {
+        mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
+    }
+}
+
+RegisterAllocator::RegisterFile::~RegisterFile()
+{
+}
+
+bool RegisterAllocator::RegisterFile::operator == (const RegisterFile& rhs) const
+{
+    return (mRegs == rhs.mRegs);
+}
+
+void RegisterAllocator::RegisterFile::reset()
+{
+    mRegs = mTouched = mStatus = 0;
+    reserve(ARMAssemblerInterface::SP);
+    reserve(ARMAssemblerInterface::PC);
+}
+
+// RegisterFile::reserve() takes a register parameter in the
+// range 0-15 (Arm compatible), but on a Mips processor it will
+// return the actual allocated register in the range 2-17.
+int RegisterAllocator::RegisterFile::reserve(int reg)
+{
+    reg += mRegisterOffset;
+    LOG_ALWAYS_FATAL_IF(isUsed(reg),
+                        "reserving register %d, but already in use",
+                        reg);
+    mRegs |= (1<<reg);
+    mTouched |= mRegs;
+    return reg;
+}
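+// For instance, on Mips reserve(0) (Arm r0) marks physical register 2 as
+// used and returns 2, while on Arm the same call marks and returns 0.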
+
+// This interface uses regMask in range 2-17 on MIPS, no translation.
+void RegisterAllocator::RegisterFile::reserveSeveral(uint32_t regMask)
+{
+    mRegs |= regMask;
+    mTouched |= regMask;
+}
+
+int RegisterAllocator::RegisterFile::isUsed(int reg) const
+{
+    LOG_ALWAYS_FATAL_IF(reg>=16+(int)mRegisterOffset, "invalid register %d", reg);
+    return mRegs & (1<<reg);
+}
+
+int RegisterAllocator::RegisterFile::obtain()
+{
+    const char priorityList[14] = {  0,  1, 2, 3, 
+                                    12, 14, 4, 5, 
+                                     6,  7, 8, 9,
+                                    10, 11 };
+    const int nbreg = sizeof(priorityList);
+    int i, r, reg;
+    for (i=0 ; i<nbreg ; i++) {
+        r = priorityList[i];
+        if (!isUsed(r + mRegisterOffset)) {
+            break;
+        }
+    }
+    // this is not an error anymore because, we'll try again with
+    // a lower optimization level.
+    //ALOGE_IF(i >= nbreg, "pixelflinger ran out of registers\n");
+    if (i >= nbreg) {
+        mStatus |= OUT_OF_REGISTERS;
+        // we return SP so we can more easily debug things
+        // the code will never be run anyway.
+        return ARMAssemblerInterface::SP; 
+    }
+    reg = reserve(r);  // Param in Arm range 0-15, returns range 2-17 on Mips.
+    return reg;
+}
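+
+// The priority list above hands out Arm's caller-saved registers first
+// (r0-r3, then r12/ip and r14/lr), leaving the callee-saved r4-r11 for
+// last, presumably to minimize prolog/epilog save traffic.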
+
+bool RegisterAllocator::RegisterFile::hasFreeRegs() const
+{
+    uint32_t regs = mRegs >> mRegisterOffset;   // MIPS fix.
+    return ((regs & 0xFFFF) == 0xFFFF) ? false : true;
+}
+
+int RegisterAllocator::RegisterFile::countFreeRegs() const
+{
+    uint32_t regs = mRegs >> mRegisterOffset;   // MIPS fix.
+    int f = ~regs & 0xFFFF;
+    // now count the number of 1 bits
+    f = (f & 0x5555) + ((f>>1) & 0x5555);
+    f = (f & 0x3333) + ((f>>2) & 0x3333);
+    f = (f & 0x0F0F) + ((f>>4) & 0x0F0F);
+    f = (f & 0x00FF) + ((f>>8) & 0x00FF);
+    return f;
+}
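+
+// The shift-and-mask sequence above is the classic parallel bit count: each
+// line sums adjacent 1-, 2-, 4-, then 8-bit fields, so a free-register mask
+// of 0x00FF reduces to 8.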
+
+void RegisterAllocator::RegisterFile::recycle(int reg)
+{
+    // commented out, since common failure of running out of regs
+    // triggers this assertion. Since the code is not executed
+    // in that case, it does not matter. No reason to FATAL err.
+    // LOG_FATAL_IF(!isUsed(reg),
+    //         "recycling unallocated register %d",
+    //         reg);
+    mRegs &= ~(1<<reg);
+}
+
+void RegisterAllocator::RegisterFile::recycleSeveral(uint32_t regMask)
+{
+    // commented out, since common failure of running out of regs
+    // triggers this assertion. Since the code is not executed
+    // in that case, it does not matter. No reason to FATAL err.
+    // LOG_FATAL_IF((mRegs & regMask)!=regMask,
+    //         "recycling unallocated registers "
+    //         "(recycle=%08x, allocated=%08x, unallocated=%08x)",
+    //         regMask, mRegs, mRegs&regMask);
+    mRegs &= ~regMask;
+}
+
+uint32_t RegisterAllocator::RegisterFile::touched() const
+{
+    return mTouched;
+}
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/GGLAssembler.h b/libpixelflinger/codeflinger/GGLAssembler.h
new file mode 100644
index 0000000..47dbf3a
--- /dev/null
+++ b/libpixelflinger/codeflinger/GGLAssembler.h
@@ -0,0 +1,572 @@
+/* libs/pixelflinger/codeflinger/GGLAssembler.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_GGLASSEMBLER_H
+#define ANDROID_GGLASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <private/pixelflinger/ggl_context.h>
+
+#include "ARMAssemblerProxy.h"
+
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+#define CONTEXT_ADDR_LOAD(REG, FIELD) \
+    ADDR_LDR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+#define CONTEXT_ADDR_STORE(REG, FIELD) \
+    ADDR_STR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+#define CONTEXT_LOAD(REG, FIELD) \
+    LDR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
+
+#define CONTEXT_STORE(REG, FIELD) \
+    STR(AL, REG, mBuilderContext.Rctx, immed12_pre(GGL_OFFSETOF(FIELD)))
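+
+// In effect each macro is a single load/store of one context_t field at a
+// fixed offset from the context register, e.g.
+//     CONTEXT_LOAD(Rd, generated_vars.dzdx)
+// assembles to LDR Rd, [Rctx, #GGL_OFFSETOF(generated_vars.dzdx)].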
+
+
+class RegisterAllocator
+{
+public:
+    class RegisterFile;
+    
+                    RegisterAllocator(int arch);  // NOLINT, implicit
+    RegisterFile&   registerFile();
+    int             reserveReg(int reg);
+    int             obtainReg();
+    void            recycleReg(int reg);
+    void            reset();
+
+    class RegisterFile
+    {
+    public:
+                            RegisterFile(int arch);  // NOLINT, implicit
+                            RegisterFile(const RegisterFile& rhs, int arch);
+                            ~RegisterFile();
+
+                void        reset();
+
+                bool operator == (const RegisterFile& rhs) const;
+                bool operator != (const RegisterFile& rhs) const {
+                    return !operator == (rhs);
+                }
+
+                int         reserve(int reg);
+                void        reserveSeveral(uint32_t regMask);
+
+                void        recycle(int reg);
+                void        recycleSeveral(uint32_t regMask);
+
+                int         obtain();
+        inline  int         isUsed(int reg) const;
+
+                bool        hasFreeRegs() const;
+                int         countFreeRegs() const;                
+
+                uint32_t    touched() const;
+        inline  uint32_t    status() const { return mStatus; }
+        
+        enum {
+            OUT_OF_REGISTERS = 0x1
+        };
+
+    private:
+        uint32_t    mRegs;
+        uint32_t    mTouched;
+        uint32_t    mStatus;
+        int         mArch;
+        uint32_t    mRegisterOffset;    // lets reg alloc use 2..17 for mips
+                                        // while arm uses 0..15
+    };
+ 
+    class Scratch
+    {
+    public:
+            explicit Scratch(RegisterFile& regFile)
+                : mRegFile(regFile), mScratch(0) { 
+            }
+            ~Scratch() {
+                mRegFile.recycleSeveral(mScratch);
+            }
+        int obtain() { 
+            int reg = mRegFile.obtain();
+            mScratch |= 1<<reg;
+            return reg;
+        }
+        void recycle(int reg) {
+            mRegFile.recycle(reg);
+            mScratch &= ~(1<<reg);
+        }
+        bool isUsed(int reg) {
+            return (mScratch & (1<<reg));
+        }
+        int countFreeRegs() {
+            return mRegFile.countFreeRegs();
+        }
+    private:
+        RegisterFile&   mRegFile;
+        uint32_t        mScratch;
+    };
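+
+    // Typical usage is scope-based, as in GGLAssembler.cpp: obtain
+    // temporaries and let the destructor recycle whatever is still held,
+    // e.g.
+    //     Scratch scratches(registerFile());
+    //     int tmp = scratches.obtain();
+    //     ...   // tmp is recycled automatically when 'scratches' dies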
+
+    class Spill
+    {
+    public:
+        Spill(RegisterFile& regFile, ARMAssemblerInterface& gen, uint32_t reglist)
+            : mRegFile(regFile), mGen(gen), mRegList(reglist), mCount(0)
+        {
+            if (reglist) {
+                int count = 0;
+                while (reglist) {
+                    count++;
+                    reglist &= ~(1 << (31 - __builtin_clz(reglist)));
+                }
+                if (count == 1) {
+                    int reg = 31 - __builtin_clz(mRegList);
+                    mGen.STR(mGen.AL, reg, mGen.SP, mGen.immed12_pre(-4, 1));
+                } else {
+                    mGen.STM(mGen.AL, mGen.DB, mGen.SP, 1, mRegList);
+                }
+                mRegFile.recycleSeveral(mRegList);
+                mCount = count;
+            }
+        }
+        ~Spill() {
+            if (mRegList) {
+                if (mCount == 1) {
+                    int reg = 31 - __builtin_clz(mRegList);
+                    mGen.LDR(mGen.AL, reg, mGen.SP, mGen.immed12_post(4));
+                } else {
+                    mGen.LDM(mGen.AL, mGen.IA, mGen.SP, 1, mRegList);
+                }
+                mRegFile.reserveSeveral(mRegList);
+            }
+        }
+    private:
+        RegisterFile&           mRegFile;
+        ARMAssemblerInterface&  mGen;
+        uint32_t                mRegList;
+        int                     mCount;
+    };
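+
+    // Spill temporarily frees the registers in 'reglist': the constructor
+    // pushes them (STR for a single register, STM for several) and recycles
+    // them in the register file, and the destructor pops them back and
+    // re-reserves them, so the registers can be reused for the Spill's
+    // lifetime.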
+    
+private:
+    RegisterFile    mRegs;
+};
+
+// ----------------------------------------------------------------------------
+
+class GGLAssembler : public ARMAssemblerProxy, public RegisterAllocator
+{
+public:
+
+    explicit    GGLAssembler(ARMAssemblerInterface* target);
+    virtual     ~GGLAssembler();
+
+    uint32_t*   base() const { return 0; } // XXX
+    uint32_t*   pc() const { return 0; } // XXX
+
+    void        reset(int opt_level);
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+
+        // generate scanline code for given needs
+    int         scanline(const needs_t& needs, context_t const* c);
+    int         scanline_core(const needs_t& needs, context_t const* c);
+
+        enum {
+            CLEAR_LO    = 0x0001,
+            CLEAR_HI    = 0x0002,
+            CORRUPTIBLE = 0x0004,
+            FIRST       = 0x0008
+        };
+
+        enum { //load/store flags
+            WRITE_BACK  = 0x0001
+        };
+
+        struct reg_t {
+            reg_t() : reg(-1), flags(0) {
+            }
+            reg_t(int r, int f=0)  // NOLINT, implicit
+                : reg(r), flags(f) {
+            }
+            void setTo(int r, int f=0) {
+                reg=r; flags=f;
+            }
+            int         reg;
+            uint16_t    flags;
+        };
+
+        struct integer_t : public reg_t {
+            integer_t() : reg_t(), s(0) {
+            }
+            integer_t(int r, int sz=32, int f=0)  // NOLINT, implicit
+                : reg_t(r, f), s(sz) {
+            }
+            void setTo(int r, int sz=32, int f=0) {
+                reg_t::setTo(r, f); s=sz;
+            }
+            int8_t s;
+            inline int size() const { return s; }
+        };
+        
+        struct pixel_t : public reg_t {
+            pixel_t() : reg_t() {
+                memset(&format, 0, sizeof(GGLFormat));
+            }
+            pixel_t(int r, const GGLFormat* fmt, int f=0)
+                : reg_t(r, f), format(*fmt) {
+            }
+            void setTo(int r, const GGLFormat* fmt, int f=0) {
+                reg_t::setTo(r, f); format = *fmt;
+            }
+            GGLFormat format;
+            inline int hi(int c) const { return format.c[c].h; }
+            inline int low(int c) const { return format.c[c].l; }
+            inline int mask(int c) const { return ((1<<size(c))-1) << low(c); }
+            inline int size() const { return format.size*8; }
+            inline int size(int c) const { return component_size(c); }
+            inline int component_size(int c) const { return hi(c) - low(c); }
+        };
+
+        struct component_t : public reg_t {
+            component_t() : reg_t(), h(0), l(0) {
+            }
+            component_t(int r, int f=0)  // NOLINT, implicit
+                : reg_t(r, f), h(0), l(0) {
+            }
+            component_t(int r, int lo, int hi, int f=0)
+                : reg_t(r, f), h(hi), l(lo) {
+            }
+            explicit component_t(const integer_t& rhs)
+                : reg_t(rhs.reg, rhs.flags), h(rhs.s), l(0) {
+            }
+            explicit component_t(const pixel_t& rhs, int component) {
+                setTo(  rhs.reg, 
+                        rhs.format.c[component].l,
+                        rhs.format.c[component].h,
+                        rhs.flags|CLEAR_LO|CLEAR_HI);
+            }
+            void setTo(int r, int lo=0, int hi=0, int f=0) {
+                reg_t::setTo(r, f); h=hi; l=lo;
+            }
+            int8_t h;
+            int8_t l;
+            inline int size() const { return h-l; }
+        };
+
+        struct pointer_t : public reg_t {
+            pointer_t() : reg_t(), size(0) {
+            }
+            pointer_t(int r, int s, int f=0)
+                : reg_t(r, f), size(s) {
+            }
+            void setTo(int r, int s, int f=0) {
+                reg_t::setTo(r, f); size=s;
+            }
+            int8_t size;
+        };
+
+
+private:
+    // GGLAssembler hides RegisterAllocator's and ARMAssemblerProxy's reset
+    // methods by providing a reset method with a different parameter set. The
+    // intent of GGLAssembler's reset method is to wrap the inherited reset
+    // methods, so make these methods private in order to prevent direct calls
+    // to these methods from clients.
+    using RegisterAllocator::reset;
+    using ARMAssemblerProxy::reset;
+
+    struct tex_coord_t {
+        reg_t       s;
+        reg_t       t;
+        pointer_t   ptr;
+    };
+
+    struct fragment_parts_t {
+        uint32_t    packed  : 1;
+        uint32_t    reload  : 2;
+        uint32_t    iterated_packed  : 1;
+        pixel_t     iterated;
+        pointer_t   cbPtr;
+        pointer_t   covPtr;
+        reg_t       count;
+        reg_t       argb[4];
+        reg_t       argb_dx[4];
+        reg_t       z;
+        reg_t       dither;
+        pixel_t     texel[GGL_TEXTURE_UNIT_COUNT];
+        tex_coord_t coords[GGL_TEXTURE_UNIT_COUNT];
+    };
+    
+    struct texture_unit_t {
+        int         format_idx;
+        GGLFormat   format;
+        int         bits;
+        int         swrap;
+        int         twrap;
+        int         env;
+        int         pot;
+        int         linear;
+        uint8_t     mask;
+        uint8_t     replaced;
+    };
+
+    struct texture_machine_t {
+        texture_unit_t  tmu[GGL_TEXTURE_UNIT_COUNT];
+        uint8_t         mask;
+        uint8_t         replaced;
+        uint8_t         directTexture;
+        uint8_t         activeUnits;
+    };
+
+    struct component_info_t {
+        bool    masked      : 1;
+        bool    inDest      : 1;
+        bool    needed      : 1;
+        bool    replaced    : 1;
+        bool    iterated    : 1;
+        bool    smooth      : 1;
+        bool    blend       : 1;
+        bool    fog         : 1;
+    };
+
+    struct builder_context_t {
+        context_t const*    c;
+        needs_t             needs;
+        int                 Rctx;
+    };
+
+    template <typename T>
+    void modify(T& r, Scratch& regs)
+    {
+        if (!(r.flags & CORRUPTIBLE)) {
+            r.reg = regs.obtain();
+            r.flags |= CORRUPTIBLE;
+        }
+    }
+
+    // helpers
+    void    base_offset(const pointer_t& d, const pointer_t& b, const reg_t& o);
+
+    // texture environment
+    void    modulate(   component_t& dest,
+                        const component_t& incoming,
+                        const pixel_t& texel, int component);
+
+    void    decal(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component);
+
+    void    blend(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component, int tmu);
+
+    void    add(  component_t& dest,
+                    const component_t& incoming,
+                    const pixel_t& texel, int component);
+
+    // load/store stuff
+    void    store(const pointer_t& addr, const pixel_t& src, uint32_t flags=0);
+    void    load(const pointer_t& addr, const pixel_t& dest, uint32_t flags=0);
+    void    extract(integer_t& d, const pixel_t& s, int component);    
+    void    extract(component_t& d, const pixel_t& s, int component);    
+    void    extract(integer_t& d, int s, int h, int l, int bits=32);
+    void    expand(integer_t& d, const integer_t& s, int dbits);
+    void    expand(integer_t& d, const component_t& s, int dbits);
+    void    expand(component_t& d, const component_t& s, int dbits);
+    void    downshift(pixel_t& d, int component, component_t s, const reg_t& dither);
+
+
+    void    mul_factor( component_t& d,
+                        const integer_t& v,
+                        const integer_t& f);
+
+    void    mul_factor_add( component_t& d,
+                            const integer_t& v,
+                            const integer_t& f,
+                            const component_t& a);
+
+    void    component_add(  component_t& d,
+                            const integer_t& dst,
+                            const integer_t& src);
+
+    void    component_sat(  const component_t& v);
+
+
+    void    build_scanline_prolog(  fragment_parts_t& parts,
+                                    const needs_t& needs);
+
+    void    build_smooth_shade(const fragment_parts_t& parts);
+
+    void    build_component(    pixel_t& pixel,
+                                const fragment_parts_t& parts,
+                                int component,
+                                Scratch& global_scratches);
+                                
+    void    build_incoming_component(
+                                component_t& temp,
+                                int dst_size,
+                                const fragment_parts_t& parts,
+                                int component,
+                                Scratch& scratches,
+                                Scratch& global_scratches);
+
+    void    init_iterated_color(fragment_parts_t& parts, const reg_t& x);
+
+    void    build_iterated_color(   component_t& fragment,
+                                    const fragment_parts_t& parts,
+                                    int component,
+                                    Scratch& regs);
+
+    void    decodeLogicOpNeeds(const needs_t& needs);
+    
+    void    decodeTMUNeeds(const needs_t& needs, context_t const* c);
+
+    void    init_textures(  tex_coord_t* coords,
+                            const reg_t& x,
+                            const reg_t& y);
+
+    void    build_textures( fragment_parts_t& parts,
+                            Scratch& regs);
+
+    void    filter8(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter16(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter24(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    filter32(   const fragment_parts_t& parts,
+                        pixel_t& texel, const texture_unit_t& tmu,
+                        int U, int V, pointer_t& txPtr,
+                        int FRAC_BITS);
+
+    void    build_texture_environment(  component_t& fragment,
+                                        const fragment_parts_t& parts,
+                                        int component,
+                                        Scratch& regs);
+
+    void    wrapping(   int d,
+                        int coord, int size,
+                        int tx_wrap, int tx_linear);
+
+    void    build_fog(  component_t& temp,
+                        int component,
+                        Scratch& parent_scratches);
+
+    void    build_blending(     component_t& in_out,
+                                const pixel_t& pixel,
+                                int component,
+                                Scratch& parent_scratches);
+
+    void    build_blend_factor(
+                integer_t& factor, int f, int component,
+                const pixel_t& dst_pixel,
+                integer_t& fragment,
+                integer_t& fb,
+                Scratch& scratches);
+
+    void    build_blendFOneMinusF(  component_t& temp,
+                                    const integer_t& factor, 
+                                    const integer_t& fragment,
+                                    const integer_t& fb);
+
+    void    build_blendOneMinusFF(  component_t& temp,
+                                    const integer_t& factor, 
+                                    const integer_t& fragment,
+                                    const integer_t& fb);
+
+    void build_coverage_application(component_t& fragment,
+                                    const fragment_parts_t& parts,
+                                    Scratch& regs);
+
+    void build_alpha_test(component_t& fragment, const fragment_parts_t& parts);
+
+    enum { Z_TEST=1, Z_WRITE=2 }; 
+    void build_depth_test(const fragment_parts_t& parts, uint32_t mask);
+    void build_iterate_z(const fragment_parts_t& parts);
+    void build_iterate_f(const fragment_parts_t& parts);
+    void build_iterate_texture_coordinates(const fragment_parts_t& parts);
+
+    void build_logic_op(pixel_t& pixel, Scratch& regs);
+
+    void build_masking(pixel_t& pixel, Scratch& regs);
+
+    void build_and_immediate(int d, int s, uint32_t mask, int bits);
+
+    bool    isAlphaSourceNeeded() const;
+
+    enum {
+        FACTOR_SRC=1, FACTOR_DST=2, BLEND_SRC=4, BLEND_DST=8 
+    };
+    
+    enum {
+        LOGIC_OP=1, LOGIC_OP_SRC=2, LOGIC_OP_DST=4
+    };
+
+    static int blending_codes(int fs, int fd);
+
+    builder_context_t   mBuilderContext;
+    texture_machine_t   mTextureMachine;
+    component_info_t    mInfo[4];
+    int                 mBlending;
+    int                 mMasking;
+    int                 mAllMasked;
+    int                 mLogicOp;
+    int                 mAlphaTest;
+    int                 mAA;
+    int                 mDithering;
+    int                 mDepthTest;
+
+    int             mSmooth;
+    int             mFog;
+    pixel_t         mDstPixel;
+    
+    GGLFormat       mCbFormat;
+    
+    int             mBlendFactorCached;
+    integer_t       mAlphaSource;
+    
+    int             mBaseRegister;
+    
+    int             mBlendSrc;
+    int             mBlendDst;
+    int             mBlendSrcA;
+    int             mBlendDstA;
+    
+    int             mOptLevel;
+};
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
+#endif // ANDROID_GGLASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/MIPS64Assembler.cpp b/libpixelflinger/codeflinger/MIPS64Assembler.cpp
new file mode 100644
index 0000000..d6d2156
--- /dev/null
+++ b/libpixelflinger/codeflinger/MIPS64Assembler.cpp
@@ -0,0 +1,1447 @@
+/* libs/pixelflinger/codeflinger/MIPS64Assembler.cpp
+**
+** Copyright 2015, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+
+/* MIPS64 assembler and ARM->MIPS64 assembly translator
+**
+** The approach is to utilize the MIPSAssembler generator, using an inherited
+** MIPS64Assembler that overrides just the specific MIPS64r6 instructions.
+** For now ArmToMips64Assembler is copied over from the ArmToMipsAssembler
+** class, changing some MIPS64r6 related stuff.
+**
+*/
+
+#define LOG_TAG "MIPS64Assembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cutils/properties.h>
+#include <log/log.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "MIPS64Assembler.h"
+#include "CodeCache.h"
+#include "mips64_disassem.h"
+
+#define NOT_IMPLEMENTED()  LOG_ALWAYS_FATAL("Arm instruction %s not yet implemented\n", __func__)
+#define __unused __attribute__((__unused__))
+
+// ----------------------------------------------------------------------------
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark ArmToMips64Assembler...
+#endif
+
+ArmToMips64Assembler::ArmToMips64Assembler(const sp<Assembly>& assembly,
+                                           char *abuf, int linesz, int instr_count)
+    :   ARMAssemblerInterface(),
+        mArmDisassemblyBuffer(abuf),
+        mArmLineLength(linesz),
+        mArmInstrCount(instr_count),
+        mInum(0),
+        mAssembly(assembly)
+{
+    mMips = new MIPS64Assembler(assembly, this);
+    mArmPC = (uint32_t **) malloc(ARM_MAX_INSTUCTIONS * sizeof(uint32_t *));
+    init_conditional_labels();
+}
+
+ArmToMips64Assembler::ArmToMips64Assembler(void* assembly)
+    :   ARMAssemblerInterface(),
+        mArmDisassemblyBuffer(NULL),
+        mInum(0),
+        mAssembly(NULL)
+{
+    mMips = new MIPS64Assembler(assembly, this);
+    mArmPC = (uint32_t **) malloc(ARM_MAX_INSTUCTIONS * sizeof(uint32_t *));
+    init_conditional_labels();
+}
+
+ArmToMips64Assembler::~ArmToMips64Assembler()
+{
+    delete mMips;
+    free((void *) mArmPC);
+}
+
+uint32_t* ArmToMips64Assembler::pc() const
+{
+    return mMips->pc();
+}
+
+uint32_t* ArmToMips64Assembler::base() const
+{
+    return mMips->base();
+}
+
+void ArmToMips64Assembler::reset()
+{
+    cond.labelnum = 0;
+    mInum = 0;
+    mMips->reset();
+}
+
+int ArmToMips64Assembler::getCodegenArch()
+{
+    return CODEGEN_ARCH_MIPS64;
+}
+
+void ArmToMips64Assembler::comment(const char* string)
+{
+    mMips->comment(string);
+}
+
+void ArmToMips64Assembler::label(const char* theLabel)
+{
+    mMips->label(theLabel);
+}
+
+void ArmToMips64Assembler::disassemble(const char* name)
+{
+    mMips->disassemble(name);
+}
+
+void ArmToMips64Assembler::init_conditional_labels()
+{
+    int i;
+    for (i=0;i<99; ++i) {
+        sprintf(cond.label[i], "cond_%d", i);
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Prolog/Epilog & Generate...
+#endif
+
+void ArmToMips64Assembler::prolog()
+{
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->DADDIU(R_sp, R_sp, -(5 * 8));
+    mMips->SD(R_s0, R_sp, 0);
+    mMips->SD(R_s1, R_sp, 8);
+    mMips->SD(R_s2, R_sp, 16);
+    mMips->SD(R_s3, R_sp, 24);
+    mMips->SD(R_s4, R_sp, 32);
+    mMips->MOVE(R_v0, R_a0);    // move context * passed in a0 to v0 (arm r0)
+}
+
+void ArmToMips64Assembler::epilog(uint32_t touched __unused)
+{
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->LD(R_s0, R_sp, 0);
+    mMips->LD(R_s1, R_sp, 8);
+    mMips->LD(R_s2, R_sp, 16);
+    mMips->LD(R_s3, R_sp, 24);
+    mMips->LD(R_s4, R_sp, 32);
+    mMips->DADDIU(R_sp, R_sp, (5 * 8));
+    mMips->JR(R_ra);
+
+}
+
+int ArmToMips64Assembler::generate(const char* name)
+{
+    return mMips->generate(name);
+}
+
+void ArmToMips64Assembler::fix_branches()
+{
+    mMips->fix_branches();
+}
+
+uint32_t* ArmToMips64Assembler::pcForLabel(const char* label)
+{
+    return mMips->pcForLabel(label);
+}
+
+void ArmToMips64Assembler::set_condition(int mode, int R1, int R2) {
+    if (mode == 2) {
+        cond.type = SBIT_COND;
+    } else {
+        cond.type = CMP_COND;
+    }
+    cond.r1 = R1;
+    cond.r2 = R2;
+}
+
+//----------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Addressing modes & shifters...
+#endif
+
+
+// do not need this for MIPS, but it is in the Interface (virtual)
+int ArmToMips64Assembler::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    // for MIPS, any 32-bit immediate is OK
+    rot = 0;
+    imm = immediate;
+    return 0;
+}
+
+// shifters...
+
+bool ArmToMips64Assembler::isValidImmediate(uint32_t immediate __unused)
+{
+    // for MIPS, any 32-bit immediate is OK
+    return true;
+}
+
+uint32_t ArmToMips64Assembler::imm(uint32_t immediate)
+{
+    amode.value = immediate;
+    return AMODE_IMM;
+}
+
+uint32_t ArmToMips64Assembler::reg_imm(int Rm, int type, uint32_t shift)
+{
+    amode.reg = Rm;
+    amode.stype = type;
+    amode.value = shift;
+    return AMODE_REG_IMM;
+}
+
+uint32_t ArmToMips64Assembler::reg_rrx(int Rm __unused)
+{
+    // reg_rrx mode is not used in the GGLAssembler code at this time
+    return AMODE_UNSUPPORTED;
+}
+
+uint32_t ArmToMips64Assembler::reg_reg(int Rm __unused, int type __unused,
+                                       int Rs __unused)
+{
+    // reg_reg mode is not used in the GGLAssembler code at this time
+    return AMODE_UNSUPPORTED;
+}
+
+
+// addressing modes...
+// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicate U=0)
+uint32_t ArmToMips64Assembler::immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    amode.value = immed12;
+    amode.writeback = W;
+    return AMODE_IMM_12_PRE;
+}
+
+uint32_t ArmToMips64Assembler::immed12_post(int32_t immed12)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+
+    amode.value = immed12;
+    return AMODE_IMM_12_POST;
+}
+
+uint32_t ArmToMips64Assembler::reg_scale_pre(int Rm, int type,
+        uint32_t shift, int W)
+{
+    LOG_ALWAYS_FATAL_IF(W | type | shift, "reg_scale_pre adv modes not yet implemented");
+
+    amode.reg = Rm;
+    // amode.stype = type;      // more advanced modes not used in GGLAssembler yet
+    // amode.value = shift;
+    // amode.writeback = W;
+    return AMODE_REG_SCALE_PRE;
+}
+
+uint32_t ArmToMips64Assembler::reg_scale_post(int Rm __unused, int type __unused,
+                                              uint32_t shift __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode reg_scale_post not yet implemented\n");
+    return AMODE_UNSUPPORTED;
+}
+
+// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicate U=0)
+uint32_t ArmToMips64Assembler::immed8_pre(int32_t immed8, int W __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode immed8_pre not yet implemented\n");
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+    return AMODE_IMM_8_PRE;
+}
+
+uint32_t ArmToMips64Assembler::immed8_post(int32_t immed8)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+    amode.value = immed8;
+    return AMODE_IMM_8_POST;
+}
+
+uint32_t ArmToMips64Assembler::reg_pre(int Rm, int W)
+{
+    LOG_ALWAYS_FATAL_IF(W, "reg_pre writeback not yet implemented");
+    amode.reg = Rm;
+    return AMODE_REG_PRE;
+}
+
+uint32_t ArmToMips64Assembler::reg_post(int Rm __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode reg_post not yet implemented\n");
+    return AMODE_UNSUPPORTED;
+}
+
+
+
+// ----------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Data Processing...
+#endif
+
+// check if the operand registers from a previous CMP or S-bit instruction
+// would be overwritten by this instruction. If so, move the value to a
+// safe register.
+// Note that we cannot tell at _this_ instruction time if a future (conditional)
+// instruction will _also_ use this value (a defect of the simple 1-pass, one-
+// instruction-at-a-time translation). Therefore we must be conservative and
+// save the value before it is overwritten. This costs an extra MOVE instr.
+
+void ArmToMips64Assembler::protectConditionalOperands(int Rd)
+{
+    if (Rd == cond.r1) {
+        mMips->MOVE(R_cmp, cond.r1);
+        cond.r1 = R_cmp;
+    }
+    if (cond.type == CMP_COND && Rd == cond.r2) {
+        mMips->MOVE(R_cmp2, cond.r2);
+        cond.r2 = R_cmp2;
+    }
+}
+
+
+// interprets the addressing mode, and generates the common code
+// used by the majority of data-processing ops. Many MIPS instructions
+// have a register-based form and a different immediate form. See
+// opAND below for an example. (this could be inlined)
+//
+// this works with the imm(), reg_imm() methods above, which are directly
+// called by the GGLAssembler.
+// note: _signed parameter defaults to false (unsigned)
+// note: tmpReg parameter defaults to 1, MIPS register AT
+int ArmToMips64Assembler::dataProcAdrModes(int op, int& source, bool _signed, int tmpReg)
+{
+    if (op < AMODE_REG) {
+        source = op;
+        return SRC_REG;
+    } else if (op == AMODE_IMM) {
+        if ((!_signed && amode.value > 0xffff)
+                || (_signed && ((int)amode.value < -32768 || (int)amode.value > 32767) )) {
+            mMips->LUI(tmpReg, (amode.value >> 16));
+            if (amode.value & 0x0000ffff) {
+                mMips->ORI(tmpReg, tmpReg, (amode.value & 0x0000ffff));
+            }
+            source = tmpReg;
+            return SRC_REG;
+        } else {
+            source = amode.value;
+            return SRC_IMM;
+        }
+    } else if (op == AMODE_REG_IMM) {
+        switch (amode.stype) {
+            case LSL: mMips->SLL(tmpReg, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(tmpReg, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(tmpReg, amode.reg, amode.value); break;
+            case ROR: mMips->ROTR(tmpReg, amode.reg, amode.value); break;
+        }
+        source = tmpReg;
+        return SRC_REG;
+    } else {  // adr mode RRX is not used in GGL Assembler at this time
+        // we are screwed; this should be an exception, assert-fail or something
+        LOG_ALWAYS_FATAL("adr mode reg_rrx not yet implemented\n");
+        return SRC_ERROR;
+    }
+}
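+
+// For example: imm(0x00ff) returns SRC_IMM with source = 0xff (it fits in a
+// 16-bit unsigned field); imm(0x12340000) emits LUI into tmpReg and returns
+// SRC_REG; reg_imm(Rm, LSR, 16) emits SRL tmpReg, Rm, 16 and returns SRC_REG.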
+
+
+void ArmToMips64Assembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    int src;    // src is modified by dataProcAdrModes() - passed as int&
+
+    if (cc != AL) {
+        protectConditionalOperands(Rd);
+        // the branch tests register(s) set by prev CMP or instr with 'S' bit set
+        // invert the condition to jump past this conditional instruction
+        ArmToMips64Assembler::B(cc^1, cond.label[++cond.labelnum]);
+    } else {
+        mArmPC[mInum++] = pc();  // save starting PC for this instr
+    }
+
+    switch (opcode) {
+    case opAND:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->AND(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ANDI(Rd, Rn, src);
+        }
+        break;
+
+    case opADD:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->ADDU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ADDIU(Rd, Rn, src);
+        }
+        break;
+
+    case opSUB:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->SUBU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->SUBIU(Rd, Rn, src);
+        }
+        break;
+
+    case opADD64:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->DADDU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->DADDIU(Rd, Rn, src);
+        }
+        break;
+
+    case opSUB64:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->DSUBU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->DSUBIU(Rd, Rn, src);
+        }
+        break;
+
+    case opEOR:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->XOR(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->XORI(Rd, Rn, src);
+        }
+        break;
+
+    case opORR:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->OR(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ORI(Rd, Rn, src);
+        }
+        break;
+
+    case opBIC:
+        if (dataProcAdrModes(Op2, src) == SRC_IMM) {
+            // if we have a 16-bit immediate, load it into the AT reg
+            mMips->ORI(R_at, 0, src);
+            src = R_at;
+        }
+        mMips->NOT(R_at, src);
+        mMips->AND(Rd, Rn, R_at);
+        break;
+
+    case opRSB:
+        if (dataProcAdrModes(Op2, src) == SRC_IMM) {
+            // if we have a 16-bit immediate, load it into the AT reg
+            mMips->ORI(R_at, 0, src);
+            src = R_at;
+        }
+        mMips->SUBU(Rd, src, Rn);   // subu with the parameters reversed
+        break;
+
+    case opMOV:
+        if (Op2 < AMODE_REG) {  // op2 is reg # in this case
+            mMips->MOVE(Rd, Op2);
+        } else if (Op2 == AMODE_IMM) {
+            if (amode.value > 0xffff) {
+                mMips->LUI(Rd, (amode.value >> 16));
+                if (amode.value & 0x0000ffff) {
+                    mMips->ORI(Rd, Rd, (amode.value & 0x0000ffff));
+                }
+             } else {
+                mMips->ORI(Rd, 0, amode.value);
+            }
+        } else if (Op2 == AMODE_REG_IMM) {
+            switch (amode.stype) {
+            case LSL: mMips->SLL(Rd, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(Rd, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(Rd, amode.reg, amode.value); break;
+            case ROR: mMips->ROTR(Rd, amode.reg, amode.value); break;
+            }
+        }
+        else {
+            // adr mode RRX is not used in GGL Assembler at this time
+            mMips->UNIMPL();
+        }
+        break;
+
+    case opMVN:     // this is a 1's complement: NOT
+        if (Op2 < AMODE_REG) {  // op2 is reg # in this case
+            mMips->NOR(Rd, Op2, 0);     // NOT is NOR with 0
+            break;
+        } else if (Op2 == AMODE_IMM) {
+            if (amode.value > 0xffff) {
+                mMips->LUI(Rd, (amode.value >> 16));
+                if (amode.value & 0x0000ffff) {
+                    mMips->ORI(Rd, Rd, (amode.value & 0x0000ffff));
+                }
+             } else {
+                mMips->ORI(Rd, 0, amode.value);
+             }
+        } else if (Op2 == AMODE_REG_IMM) {
+            switch (amode.stype) {
+            case LSL: mMips->SLL(Rd, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(Rd, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(Rd, amode.reg, amode.value); break;
+            case ROR: mMips->ROTR(Rd, amode.reg, amode.value); break;
+            }
+        }
+        else {
+            // adr mode RRX is not used in GGL Assembler at this time
+            mMips->UNIMPL();
+        }
+        mMips->NOR(Rd, Rd, 0);     // NOT is NOR with 0
+        break;
+
+    case opCMP:
+        // Either operand of a CMP instr could get overwritten by a subsequent
+        // conditional instruction, which is ok, _UNLESS_ there is a _second_
+        // conditional instruction. Under MIPS, this requires doing the comparison
+        // again (SLT), and the original operands must be available. (and this
+        // pattern of multiple conditional instructions from same CMP _is_ used
+        // in GGL-Assembler)
+        //
+        // For now, if a conditional instr overwrites the operands, we will
+        // move them to dedicated temp regs. This is ugly, and inefficient,
+        // and should be optimized.
+        //
+        // WARNING: making an _Assumption_ that CMP operand regs will NOT be
+        // trashed by intervening NON-conditional instructions. In the general
+        // case this is legal, but it is NOT currently done in GGL-Assembler.
+
+        cond.type = CMP_COND;
+        cond.r1 = Rn;
+        if (dataProcAdrModes(Op2, src, false, R_cmp2) == SRC_REG) {
+            cond.r2 = src;
+        } else {                        // adr mode was SRC_IMM
+            mMips->ORI(R_cmp2, R_zero, src);
+            cond.r2 = R_cmp2;
+        }
+
+        break;
+
+
+    case opTST:
+    case opTEQ:
+    case opCMN:
+    case opADC:
+    case opSBC:
+    case opRSC:
+        mMips->UNIMPL(); // currently unused in GGL Assembler code
+        break;
+    }
+
+    if (cc != AL) {
+        mMips->label(cond.label[cond.labelnum]);
+    }
+    if (s && opcode != opCMP) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
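+
+// In other words, Arm conditional execution is emulated by branching around
+// the translated instruction with the inverted condition, e.g. ADDNE
+// becomes, roughly:
+//     BEQ   cond.r1, cond.r2, cond_N
+//     ADDU  Rd, Rn, src
+// cond_N: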
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Multiply...
+#endif
+
+// multiply, accumulate
+void ArmToMips64Assembler::MLA(int cc __unused, int s,
+        int Rd, int Rm, int Rs, int Rn) {
+
+    //ALOGW("MLA");
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->MUL(R_at, Rm, Rs);
+    mMips->ADDU(Rd, R_at, Rn);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+void ArmToMips64Assembler::MUL(int cc __unused, int s,
+        int Rd, int Rm, int Rs) {
+    mArmPC[mInum++] = pc();
+    mMips->MUL(Rd, Rm, Rs);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+void ArmToMips64Assembler::UMULL(int cc __unused, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    mArmPC[mInum++] = pc();
+    mMips->MUH(RdHi, Rm, Rs);
+    mMips->MUL(RdLo, Rm, Rs);
+
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on UMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMips64Assembler::UMUAL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<21) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on UMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMips64Assembler::SMULL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on SMULL must be on 64-bit result\n");
+    }
+}
+void ArmToMips64Assembler::SMUAL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on SMUAL must be on 64-bit result\n");
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Branches...
+#endif
+
+// branches...
+
+void ArmToMips64Assembler::B(int cc, const char* label)
+{
+    mArmPC[mInum++] = pc();
+    if (cond.type == SBIT_COND) { cond.r2 = R_zero; }
+
+    switch(cc) {
+        case EQ: mMips->BEQ(cond.r1, cond.r2, label); break;
+        case NE: mMips->BNE(cond.r1, cond.r2, label); break;
+        case HS: mMips->BGEU(cond.r1, cond.r2, label); break;
+        case LO: mMips->BLTU(cond.r1, cond.r2, label); break;
+        case MI: mMips->BLT(cond.r1, cond.r2, label); break;
+        case PL: mMips->BGE(cond.r1, cond.r2, label); break;
+
+        case HI: mMips->BGTU(cond.r1, cond.r2, label); break;
+        case LS: mMips->BLEU(cond.r1, cond.r2, label); break;
+        case GE: mMips->BGE(cond.r1, cond.r2, label); break;
+        case LT: mMips->BLT(cond.r1, cond.r2, label); break;
+        case GT: mMips->BGT(cond.r1, cond.r2, label); break;
+        case LE: mMips->BLE(cond.r1, cond.r2, label); break;
+        case AL: mMips->B(label); break;
+        case NV: /* B Never - no instruction */ break;
+
+        case VS:
+        case VC:
+        default:
+            LOG_ALWAYS_FATAL("Unsupported cc: %02x\n", cc);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::BL(int cc __unused, const char* label __unused)
+{
+    LOG_ALWAYS_FATAL("branch-and-link not supported yet\n");
+    mArmPC[mInum++] = pc();
+}
+
+// no use for Branches with integer PC, but they're in the Interface class ....
+void ArmToMips64Assembler::B(int cc __unused, uint32_t* to_pc __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+void ArmToMips64Assembler::BL(int cc __unused, uint32_t* to_pc __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+void ArmToMips64Assembler::BX(int cc __unused, int Rn __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Data Transfer...
+#endif
+
+// data transfer...
+void ArmToMips64Assembler::LDR(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR via Arm SP to LW via Mips SP
+            }
+            mMips->LW(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->DADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR thru Arm SP to LW thru Mips SP
+            }
+            mMips->LW(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->LW(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::LDRB(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            mMips->LBU(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->DADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->LBU(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->LBU(Rd, R_at, 0);
+            break;
+    }
+
+}
+
+void ArmToMips64Assembler::STR(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;  // convert STR thru Arm SP to SW thru Mips SP
+            }
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                // If we will writeback, then update the index reg, then store.
+                // This correctly handles stack-push case.
+                mMips->DADDIU(Rn, Rn, amode.value);
+                mMips->SW(Rd, Rn, 0);
+            } else {
+                // No writeback so store offset by value
+                mMips->SW(Rd, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SW(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);  // post index always writes back
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->SW(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::STRB(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            mMips->SB(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->DADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SB(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->SB(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::LDRH(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed8_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_8_PRE:      // no support yet for writeback
+            mMips->LHU(Rd, Rn, amode.value);
+            break;
+        case AMODE_IMM_8_POST:
+            mMips->LHU(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_PRE:
+            // we only support simple base +/- index
+            if (amode.reg >= 0) {
+                mMips->DADDU(R_at, Rn, amode.reg);
+            } else {
+                mMips->DSUBU(R_at, Rn, abs(amode.reg));
+            }
+            mMips->LHU(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::LDRSB(int cc __unused, int Rd __unused,
+                                 int Rn __unused, uint32_t offset __unused)
+{
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::LDRSH(int cc __unused, int Rd __unused,
+                                 int Rn __unused, uint32_t offset __unused)
+{
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::STRH(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed8_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_8_PRE:      // no support yet for writeback
+            mMips->SH(Rd, Rn, amode.value);
+            break;
+        case AMODE_IMM_8_POST:
+            mMips->SH(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_PRE:
+            // we only support simple base +/- index
+            if (amode.reg >= 0) {
+                mMips->DADDU(R_at, Rn, amode.reg);
+            } else {
+                mMips->DSUBU(R_at, Rn, abs(amode.reg));
+            }
+            mMips->SH(Rd, R_at, 0);
+            break;
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Block Data Transfer...
+#endif
+
+// block data transfer...
+void ArmToMips64Assembler::LDM(int cc __unused, int dir __unused,
+        int Rn __unused, int W __unused, uint32_t reg_list __unused)
+{   //                        ED FD EA FA      IB IA DB DA
+    // const uint8_t P[8] = { 1, 0, 1, 0,      1, 0, 1, 0 };
+    // const uint8_t U[8] = { 1, 1, 0, 0,      1, 1, 0, 0 };
+    // *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+    //         (uint32_t(U[dir])<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::STM(int cc __unused, int dir __unused,
+        int Rn __unused, int W __unused, uint32_t reg_list __unused)
+{   //                        FA EA FD ED      IB IA DB DA
+    // const uint8_t P[8] = { 0, 1, 0, 1,      1, 0, 1, 0 };
+    // const uint8_t U[8] = { 0, 0, 1, 1,      1, 1, 0, 0 };
+    // *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+    //         (uint32_t(U[dir])<<23) | (0<<20) | (W<<21) | (Rn<<16) | reg_list;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Special...
+#endif
+
+// special...
+void ArmToMips64Assembler::SWP(int cc __unused, int Rn __unused,
+                               int Rd __unused, int Rm __unused) {
+    // *mPC++ = (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::SWPB(int cc __unused, int Rn __unused,
+                                int Rd __unused, int Rm __unused) {
+    // *mPC++ = (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::SWI(int cc __unused, uint32_t comment __unused) {
+    // *mPC++ = (cc<<28) | (0xF<<24) | comment;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+
+#if 0
+#pragma mark -
+#pragma mark DSP instructions...
+#endif
+
+// DSP instructions...
+void ArmToMips64Assembler::PLD(int Rn __unused, uint32_t offset) {
+    LOG_ALWAYS_FATAL_IF(!((offset&(1<<24)) && !(offset&(1<<21))),
+                        "PLD only P=1, W=0");
+    // *mPC++ = 0xF550F000 | (Rn<<16) | offset;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::CLZ(int cc __unused, int Rd, int Rm)
+{
+    mArmPC[mInum++] = pc();
+    mMips->CLZ(Rd, Rm);
+}
+
+void ArmToMips64Assembler::QADD(int cc __unused, int Rd __unused,
+                                int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::QDADD(int cc __unused, int Rd __unused,
+                                 int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::QSUB(int cc __unused, int Rd __unused,
+                                int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::QDSUB(int cc __unused, int Rd __unused,
+                                 int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+// 16 x 16 signed multiply (like SMLAxx without the accumulate)
+void ArmToMips64Assembler::SMUL(int cc __unused, int xy,
+                int Rd, int Rm, int Rs)
+{
+    mArmPC[mInum++] = pc();
+
+    // the 16 bits may be in the top or bottom half of 32-bit source reg,
+    // as defined by the codes BB, BT, TB, TT (compressed param xy)
+    // where x corresponds to Rm and y to Rs
+
+    // select half-reg for Rm
+    if (xy & xyTB) {
+        // use top 16-bits
+        mMips->SRA(R_at, Rm, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        mMips->SEH(R_at, Rm);
+    }
+    // select half-reg for Rs
+    if (xy & xyBT) {
+        // use top 16-bits
+        mMips->SRA(R_at2, Rs, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        mMips->SEH(R_at2, Rs);
+    }
+    mMips->MUL(Rd, R_at, R_at2);
+}
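+
+// Example expansion (illustrative only): SMULTB, i.e. xy = xyTB (top half of
+// Rm, bottom half of Rs), would generate:
+//     SRA  R_at,  Rm, 16     // top 16 bits of Rm, sign-extended
+//     SEH  R_at2, Rs         // bottom 16 bits of Rs, sign-extended
+//     MUL  Rd, R_at, R_at2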
+
+// signed 32b x 16b multiply, save top 32-bits of 48-bit result
+void ArmToMips64Assembler::SMULW(int cc __unused, int y,
+                int Rd, int Rm, int Rs)
+{
+    mArmPC[mInum++] = pc();
+
+    // the selector yT or yB refers to reg Rs
+    if (y & yT) {
+        // zero the bottom 16 bits with 2 shifts; leftover bits would affect the result
+        mMips->SRL(R_at, Rs, 16);
+        mMips->SLL(R_at, R_at, 16);
+
+    } else {
+        // move low 16-bit half, to high half
+        mMips->SLL(R_at, Rs, 16);
+    }
+    mMips->MUH(Rd, Rm, R_at);
+}
+
+// 16 x 16 signed multiply, accumulate: Rd = Rm{16} * Rs{16} + Rn
+void ArmToMips64Assembler::SMLA(int cc __unused, int xy,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    mArmPC[mInum++] = pc();
+
+    // the 16 bits may be in the top or bottom half of 32-bit source reg,
+    // as defined by the codes BB, BT, TB, TT (compressed param xy)
+    // where x corresponds to Rm and y to Rs
+
+    // select half-reg for Rm
+    if (xy & xyTB) {
+        // use top 16-bits
+        mMips->SRA(R_at, Rm, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        mMips->SEH(R_at, Rm);
+    }
+    // select half-reg for Rs
+    if (xy & xyBT) {
+        // use top 16-bits
+        mMips->SRA(R_at2, Rs, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        mMips->SEH(R_at2, Rs);
+    }
+
+    mMips->MUL(R_at, R_at, R_at2);
+    mMips->ADDU(Rd, R_at, Rn);
+}
+
+void ArmToMips64Assembler::SMLAL(int cc __unused, int xy __unused,
+                                 int RdHi __unused, int RdLo __unused,
+                                 int Rs __unused, int Rm __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMips64Assembler::SMLAW(int cc __unused, int y __unused,
+                                 int Rd __unused, int Rm __unused,
+                                 int Rs __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+// used by ARMv6 version of GGLAssembler::filter32
+void ArmToMips64Assembler::UXTB16(int cc __unused, int Rd, int Rm, int rotate)
+{
+    mArmPC[mInum++] = pc();
+
+    //Rd[31:16] := ZeroExtend((Rm ROR (8 * sh))[23:16]),
+    //Rd[15:0] := ZeroExtend((Rm ROR (8 * sh))[7:0]). sh 0-3.
+
+    mMips->ROTR(R_at2, Rm, rotate * 8);
+    mMips->LUI(R_at, 0xFF);
+    mMips->ORI(R_at, R_at, 0xFF);
+    mMips->AND(Rd, R_at2, R_at);
+}
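+
+// Worked example (illustrative): with Rm = 0xAABBCCDD and rotate = 0, the ROTR
+// is a no-op, the LUI/ORI pair builds the mask 0x00FF00FF in R_at, and the AND
+// leaves Rd = 0x00BB00DD -- bytes 0 and 2 zero-extended to halfwords.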
+
+void ArmToMips64Assembler::UBFX(int cc __unused, int Rd __unused, int Rn __unused,
+                                int lsb __unused, int width __unused)
+{
+     /* Placeholder for UBFX */
+     mArmPC[mInum++] = pc();
+
+     mMips->NOP2();
+     NOT_IMPLEMENTED();
+}
+
+// ----------------------------------------------------------------------------
+// Address Processing...
+// ----------------------------------------------------------------------------
+
+void ArmToMips64Assembler::ADDR_ADD(int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+//    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+//    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+    dataProcessing(opADD64, cc, s, Rd, Rn, Op2);
+}
+
+void ArmToMips64Assembler::ADDR_SUB(int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+//    if(cc != AL){ NOT_IMPLEMENTED(); return;} //Not required
+//    if(s  != 0) { NOT_IMPLEMENTED(); return;} //Not required
+    dataProcessing(opSUB64, cc, s, Rd, Rn, Op2);
+}
+
+void ArmToMips64Assembler::ADDR_LDR(int cc __unused, int Rd,
+                                    int Rn, uint32_t offset) {
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR via Arm SP to LD via Mips SP
+            }
+            mMips->LD(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->DADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR via Arm SP to LD via Mips SP
+            }
+            mMips->LD(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->LD(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMips64Assembler::ADDR_STR(int cc __unused, int Rd,
+                                    int Rn, uint32_t offset) {
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;  // convert STR thru Arm SP to SD thru Mips SP
+            }
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                // If we will writeback, then update the index reg, then store.
+                // This correctly handles stack-push case.
+                mMips->DADDIU(Rn, Rn, amode.value);
+                mMips->SD(Rd, Rn, 0);
+            } else {
+                // No writeback so store offset by value
+                mMips->SD(Rd, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SD(Rd, Rn, 0);
+            mMips->DADDIU(Rn, Rn, amode.value);  // post index always writes back
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->DADDU(R_at, Rn, amode.reg);
+            mMips->SD(Rd, R_at, 0);
+            break;
+    }
+}
+
+#if 0
+#pragma mark -
+#pragma mark MIPS Assembler...
+#endif
+
+
+//**************************************************************************
+//**************************************************************************
+//**************************************************************************
+
+
+/* MIPS64 assembler
+** this is a subset of mips64r6, targeted specifically at ARM instruction
+** replacement in the pixelflinger/codeflinger code.
+**
+** This class extends the MIPSAssembler class and overrides only
+** MIPS64r6-specific stuff.
+*/
+
+MIPS64Assembler::MIPS64Assembler(const sp<Assembly>& assembly, ArmToMips64Assembler *parent)
+    : MIPSAssembler::MIPSAssembler(assembly, NULL), mParent(parent)
+{
+}
+
+MIPS64Assembler::MIPS64Assembler(void* assembly, ArmToMips64Assembler *parent)
+    : MIPSAssembler::MIPSAssembler(assembly), mParent(parent)
+{
+}
+
+MIPS64Assembler::~MIPS64Assembler()
+{
+}
+
+void MIPS64Assembler::reset()
+{
+    if (mAssembly != NULL) {
+        mBase = mPC = (uint32_t *)mAssembly->base();
+    } else {
+        mPC = mBase = base();
+    }
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+}
+
+
+void MIPS64Assembler::disassemble(const char* name __unused)
+{
+    char di_buf[140];
+
+    bool arm_disasm_fmt = (mParent->mArmDisassemblyBuffer != NULL);
+
+    typedef char dstr[40];
+    dstr *lines = (dstr *)mParent->mArmDisassemblyBuffer;
+
+    if (mParent->mArmDisassemblyBuffer != NULL) {
+        for (int i=0; i<mParent->mArmInstrCount; ++i) {
+            string_detab(lines[i]);
+        }
+    }
+
+    size_t count = pc()-base();
+    uint32_t* mipsPC = base();
+
+    while (count--) {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(mipsPC);
+        if (label >= 0) {
+            ALOGW("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(mipsPC);
+        if (comment >= 0) {
+            ALOGW("; %s\n", mComments.valueAt(comment));
+        }
+        ::mips_disassem(mipsPC, di_buf, arm_disasm_fmt);
+        string_detab(di_buf);
+        string_pad(di_buf, 30);
+        ALOGW("%08lx:    %08x    %s", uintptr_t(mipsPC), uint32_t(*mipsPC), di_buf);
+        mipsPC++;
+    }
+}
+
+void MIPS64Assembler::fix_branches()
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--) {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - (bt.pc+1));
+        *bt.pc |= offset & 0x00FFFF;
+    }
+}
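+
+// Branch fixup example (illustrative): MIPS branch offsets are in words,
+// relative to the instruction after the branch. If the branch sits at word
+// index 10 and its label resolves to index 14, the patched offset is
+// 14 - (10 + 1) = 3, stored into the low 16 bits of the instruction.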
+
+void MIPS64Assembler::DADDU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (daddu_fn<<FUNC_SHF)
+                    | (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+void MIPS64Assembler::DADDIU(int Rt, int Rs, int16_t imm)
+{
+    *mPC++ = (daddiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
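+
+// Encoding check (illustrative): with daddiu_op = 0x19, DADDIU(5, 4, -8)
+// assembles to (0x19<<26)|(4<<21)|(5<<16)|0xFFF8 = 0x6485FFF8, the standard
+// MIPS64 encoding of "daddiu $a1, $a0, -8".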
+
+void MIPS64Assembler::DSUBU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (dsubu_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+void MIPS64Assembler::DSUBIU(int Rt, int Rs, int16_t imm)   // really daddiu(d, s, -j)
+{
+    *mPC++ = (daddiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | ((-imm) & MSK_16);
+}
+
+void MIPS64Assembler::MUL(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mul_fn<<RE_SHF) | (sop30_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+void MIPS64Assembler::MUH(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (muh_fn<<RE_SHF) | (sop30_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+void MIPS64Assembler::CLO(int Rd, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (17<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (1<<RE_SHF);
+}
+
+void MIPS64Assembler::CLZ(int Rd, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (16<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (1<<RE_SHF);
+}
+
+void MIPS64Assembler::LD(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (ld_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPS64Assembler::SD(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (sd_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPS64Assembler::LUI(int Rt, int16_t offset)
+{
+    *mPC++ = (aui_op<<OP_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+
+void MIPS64Assembler::JR(int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (Rs<<RS_SHF) | (jalr_fn << FUNC_SHF);
+    MIPS64Assembler::NOP();
+}
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/MIPS64Assembler.h b/libpixelflinger/codeflinger/MIPS64Assembler.h
new file mode 100644
index 0000000..b43e5da
--- /dev/null
+++ b/libpixelflinger/codeflinger/MIPS64Assembler.h
@@ -0,0 +1,404 @@
+/* libs/pixelflinger/codeflinger/MIPS64Assembler.h
+**
+** Copyright 2015, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef ANDROID_MIPS64ASSEMBLER_H
+#define ANDROID_MIPS64ASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "utils/KeyedVector.h"
+#include "utils/Vector.h"
+#include "tinyutils/smartpointer.h"
+
+#include "ARMAssemblerInterface.h"
+#include "MIPSAssembler.h"
+#include "CodeCache.h"
+
+namespace android {
+
+class MIPS64Assembler;    // forward reference
+
+// this class mimics ARMAssembler interface
+// intent is to translate each ARM instruction to 1 or more MIPS instr
+// implementation calls MIPS64Assembler class to generate mips code
+class ArmToMips64Assembler : public ARMAssemblerInterface
+{
+public:
+                ArmToMips64Assembler(const sp<Assembly>& assembly,
+                        char *abuf = 0, int linesz = 0, int instr_count = 0);
+                ArmToMips64Assembler(void* assembly);
+    virtual     ~ArmToMips64Assembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+    void        disassemble(const char* name);
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+    // for testing purposes
+    void        fix_branches();
+    void        set_condition(int mode, int R1, int R2);
+
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+    // byte/half word extract...
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+
+    // bit manipulation...
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+    // Address loading/storing/manipulation
+    virtual void ADDR_LDR(int cc, int Rd, int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_STR(int cc, int Rd, int Rn, uint32_t offset = __immed12_pre(0));
+    virtual void ADDR_ADD(int cc, int s, int Rd, int Rn, uint32_t Op2);
+    virtual void ADDR_SUB(int cc, int s, int Rd, int Rn, uint32_t Op2);
+
+    // these fields are shared with the MIPS64Assembler class for debug output
+    char *      mArmDisassemblyBuffer;
+    int         mArmLineLength;
+    int         mArmInstrCount;
+
+    int         mInum;      // current arm instruction number (0..n)
+    uint32_t**  mArmPC;     // array: PC for 1st mips instr of
+                            //      each translated ARM instr
+
+
+private:
+    ArmToMips64Assembler(const ArmToMips64Assembler& rhs);
+    ArmToMips64Assembler& operator = (const ArmToMips64Assembler& rhs);
+
+    void init_conditional_labels(void);
+
+    void protectConditionalOperands(int Rd);
+
+    // reg_tmp defaults to MIPS AT, reg 1
+    int dataProcAdrModes(int op, int& source, bool sign = false, int reg_tmp = 1);
+
+    sp<Assembly>        mAssembly;
+    MIPS64Assembler*    mMips;
+
+
+    enum misc_constants_t {
+        ARM_MAX_INSTUCTIONS = 512  // based on ASSEMBLY_SCRATCH_SIZE
+    };
+
+    enum {
+        SRC_REG = 0,
+        SRC_IMM,
+        SRC_ERROR = -1
+    };
+
+    enum addr_modes {
+        // start above the range of legal mips reg #'s (0-31)
+        AMODE_REG = 0x20,
+        AMODE_IMM, AMODE_REG_IMM,               // for data processing
+        AMODE_IMM_12_PRE, AMODE_IMM_12_POST,    // for load/store
+        AMODE_REG_SCALE_PRE, AMODE_IMM_8_PRE,
+        AMODE_IMM_8_POST, AMODE_REG_PRE,
+        AMODE_UNSUPPORTED
+    };
+
+    struct addr_mode_t {    // address modes for current ARM instruction
+        int         reg;
+        int         stype;
+        uint32_t    value;
+        bool        writeback;  // writeback the adr reg after modification
+    } amode;
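+
+    // For reference (summarizing the .cpp implementation): immed12_pre(4, 1)
+    // fills amode with value=4, writeback=true and returns AMODE_IMM_12_PRE;
+    // LDR/STR then consume amode to emit the matching LW/SW, plus an optional
+    // DADDIU when writeback is requested.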
+
+    enum cond_types {
+        CMP_COND = 1,
+        SBIT_COND
+    };
+
+    struct cond_mode_t {    // conditional-execution info for current ARM instruction
+        cond_types  type;
+        int         r1;
+        int         r2;
+        int         labelnum;
+        char        label[100][10];
+    } cond;
+};
+
+
+
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// This is the basic MIPS64 assembler, which just creates the opcodes in memory.
+// All the more complicated work is done in ArmToMips64Assembler above.
+// Inherits from the MIPSAssembler class and overrides only MIPS64r6-specific stuff.
+
+class MIPS64Assembler : public MIPSAssembler
+{
+public:
+                MIPS64Assembler(const sp<Assembly>& assembly, ArmToMips64Assembler *parent);
+                MIPS64Assembler(void* assembly, ArmToMips64Assembler *parent);
+    virtual     ~MIPS64Assembler();
+
+    virtual void        reset();
+    virtual void        disassemble(const char* name);
+
+    void        fix_branches();
+
+    // ------------------------------------------------------------------------
+    // MIPS64AssemblerInterface...
+    // ------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Arithmetic...
+#endif
+
+    void DADDU(int Rd, int Rs, int Rt);
+    void DADDIU(int Rt, int Rs, int16_t imm);
+    void DSUBU(int Rd, int Rs, int Rt);
+    void DSUBIU(int Rt, int Rs, int16_t imm);
+    virtual void MUL(int Rd, int Rs, int Rt);
+    void MUH(int Rd, int Rs, int Rt);
+
+#if 0
+#pragma mark -
+#pragma mark Logical...
+#endif
+
+    virtual void CLO(int Rd, int Rs);
+    virtual void CLZ(int Rd, int Rs);
+
+#if 0
+#pragma mark -
+#pragma mark Load/store...
+#endif
+
+    void LD(int Rt, int Rbase, int16_t offset);
+    void SD(int Rt, int Rbase, int16_t offset);
+    virtual void LUI(int Rt, int16_t offset);
+
+#if 0
+#pragma mark -
+#pragma mark Branch...
+#endif
+
+    void JR(int Rs);
+
+
+protected:
+    ArmToMips64Assembler *mParent;
+
+    // opcode field of all instructions
+    enum opcode_field {
+        spec_op, regimm_op, j_op, jal_op,                  // 0x00 - 0x03
+        beq_op, bne_op, pop06_op, pop07_op,                // 0x04 - 0x07
+        pop10_op, addiu_op, slti_op, sltiu_op,             // 0x08 - 0x0b
+        andi_op, ori_op, xori_op, aui_op,                  // 0x0c - 0x0f
+        cop0_op, cop1_op, cop2_op, rsrv_opc_0,             // 0x10 - 0x13
+        rsrv_opc_1, rsrv_opc_2, pop26_op, pop27_op,        // 0x14 - 0x17
+        pop30_op, daddiu_op, rsrv_opc_3, rsrv_opc_4,       // 0x18 - 0x1b
+        rsrv_opc_5, daui_op, msa_op, spec3_op,             // 0x1c - 0x1f
+        lb_op, lh_op, rsrv_opc_6, lw_op,                   // 0x20 - 0x23
+        lbu_op, lhu_op, rsrv_opc_7, lwu_op,                // 0x24 - 0x27
+        sb_op, sh_op, rsrv_opc_8, sw_op,                   // 0x28 - 0x2b
+        rsrv_opc_9, rsrv_opc_10, rsrv_opc_11, rsrv_opc_12, // 0x2c - 0x2f
+        rsrv_opc_13, lwc1_op, bc_op, rsrv_opc_14,          // 0x30 - 0x33
+        rsrv_opc_15, ldc1_op, pop66_op, ld_op,             // 0x34 - 0x37
+        rsrv_opc_16, swc1_op, balc_op, pcrel_op,           // 0x38 - 0x3b
+        rsrv_opc_17, sdc1_op, pop76_op, sd_op              // 0x3c - 0x3f
+    };
+
+
+    // func field for special opcode
+    enum func_spec_op {
+        sll_fn, rsrv_spec_0, srl_fn, sra_fn,
+        sllv_fn, lsa_fn, srlv_fn, srav_fn,
+        rsrv_spec_1, jalr_fn, rsrv_spec_2, rsrv_spec_3,
+        syscall_fn, break_fn, sdbbp_fn, sync_fn,
+        clz_fn, clo_fn, dclz_fn, dclo_fn,
+        dsllv_fn, dlsa_fn, dsrlv_fn, dsrav_fn,
+        sop30_fn, sop31_fn, sop32_fn, sop33_fn,
+        sop34_fn, sop35_fn, sop36_fn, sop37_fn,
+        add_fn, addu_fn, sub_fn, subu_fn,
+        and_fn, or_fn, xor_fn, nor_fn,
+        rsrv_spec_4, rsrv_spec_5, slt_fn, sltu_fn,
+        dadd_fn, daddu_fn, dsub_fn, dsubu_fn,
+        tge_fn, tgeu_fn, tlt_fn, tltu_fn,
+        teq_fn, seleqz_fn, tne_fn, selnez_fn,
+        dsll_fn, rsrv_spec_6, dsrl_fn, dsra_fn,
+        dsll32_fn, rsrv_spec_7, dsrl32_fn, dsra32_fn
+    };
+
+    // func field for spec3 opcode
+    enum func_spec3_op {
+        ext_fn, dextm_fn, dextu_fn, dext_fn,
+        ins_fn, dinsm_fn, dinsu_fn, dins_fn,
+        cachee_fn = 0x1b, sbe_fn, she_fn, sce_fn, swe_fn,
+        bshfl_fn, prefe_fn = 0x23, dbshfl_fn, cache_fn, sc_fn, scd_fn,
+        lbue_fn, lhue_fn, lbe_fn = 0x2c, lhe_fn, lle_fn, lwe_fn,
+        pref_fn = 0x35, ll_fn, lld_fn, rdhwr_fn = 0x3b
+    };
+
+    // sa field for spec3 opcodes, with BSHFL function
+    enum func_spec3_bshfl {
+        bitswap_fn,
+        wsbh_fn = 0x02,
+        dshd_fn = 0x05,
+        seb_fn = 0x10,
+        seh_fn = 0x18
+    };
+
+    // rt field of regimm opcodes.
+    enum regimm_fn {
+        bltz_fn, bgez_fn,
+        dahi_fn = 0x6,
+        nal_fn = 0x10, bal_fn, bltzall_fn, bgezall_fn,
+        sigrie_fn = 0x17,
+        dati_fn = 0x1e, synci_fn
+    };
+
+    enum muldiv_fn {
+        mul_fn = 0x02, muh_fn
+    };
+
+    enum mips_inst_shifts {
+        OP_SHF       = 26,
+        JTARGET_SHF  = 0,
+        RS_SHF       = 21,
+        RT_SHF       = 16,
+        RD_SHF       = 11,
+        RE_SHF       = 6,
+        SA_SHF       = RE_SHF,  // synonym
+        IMM_SHF      = 0,
+        FUNC_SHF     = 0,
+
+        // mask values
+        MSK_16       = 0xffff,
+
+
+        CACHEOP_SHF  = 18,
+        CACHESEL_SHF = 16,
+    };
+};
+
+
+}; // namespace android
+
+#endif //ANDROID_MIPS64ASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/MIPSAssembler.cpp b/libpixelflinger/codeflinger/MIPSAssembler.cpp
new file mode 100644
index 0000000..7de8cc1
--- /dev/null
+++ b/libpixelflinger/codeflinger/MIPSAssembler.cpp
@@ -0,0 +1,1955 @@
+/* libs/pixelflinger/codeflinger/MIPSAssembler.cpp
+**
+** Copyright 2012, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+
+/* MIPS assembler and ARM->MIPS assembly translator
+**
+** The approach is to leave the GGLAssembler and associated files largely
+** un-changed, still utilizing all Arm instruction generation. Via the
+** ArmToMipsAssembler (subclassed from ArmAssemblerInterface) each Arm
+** instruction is translated to one or more Mips instructions as necessary. This
+** is clearly less efficient than a direct implementation within the
+** GGLAssembler, but is far cleaner, more maintainable, and has yielded very
+** significant performance gains on Mips compared to the generic pixel pipeline.
+**
+**
+** GGLAssembler changes
+**
+** - The register allocator has been modified to re-map Arm registers 0-15 to mips
+** registers 2-17. Mips register 0 cannot be used as a general-purpose register,
+** and register 1 has traditional uses as a short-term temporary.
+**
+** - Added some early bailouts for OUT_OF_REGISTERS in texturing.cpp and
+** GGLAssembler.cpp, since this is not fatal, and can be retried at lower
+** optimization level.
+**
+**
+** ARMAssembler and ARMAssemblerInterface changes
+**
+** Refactored ARM address-mode static functions (imm(), reg_imm(), imm12_pre(), etc.)
+** to virtual, so they can be overridden in MIPSAssembler. The implementation of these
+** functions on ARM is moved from ARMAssemblerInterface.cpp to ARMAssembler.cpp, and
+** is unchanged from the original. (This required duplicating 2 of these as static
+** functions in ARMAssemblerInterface.cpp so they could be used as static initializers).
+*/
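+
+/* Illustrative example (not from the original source):
+** with the r0-15 -> mips 2-17 register remap, a generated ARM "ADD r0, r1, #4"
+** reaches dataProcessing() as opADD with an AMODE_IMM operand and is emitted
+** as the single MIPS instruction "addiu $2, $3, 4". Immediates that do not
+** fit in 16 bits are first built in the AT register with a LUI/ORI pair and
+** then use the register form (addu).
+*/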
+
+#define LOG_TAG "MIPSAssembler"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <cutils/properties.h>
+#include <log/log.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "CodeCache.h"
+#include "MIPSAssembler.h"
+#include "mips_disassem.h"
+
+#define __unused __attribute__((__unused__))
+
+// Choose MIPS arch variant following gcc flags
+#if defined(__mips__) && __mips==32 && __mips_isa_rev>=2
+#define mips32r2 1
+#else
+#define mips32r2 0
+#endif
+
+
+#define NOT_IMPLEMENTED()  LOG_ALWAYS_FATAL("Arm instruction %s not yet implemented\n", __func__)
+
+
+
+// ----------------------------------------------------------------------------
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark ArmToMipsAssembler...
+#endif
+
+ArmToMipsAssembler::ArmToMipsAssembler(const sp<Assembly>& assembly,
+                                       char *abuf, int linesz, int instr_count)
+    :   ARMAssemblerInterface(),
+        mArmDisassemblyBuffer(abuf),
+        mArmLineLength(linesz),
+        mArmInstrCount(instr_count),
+        mInum(0),
+        mAssembly(assembly)
+{
+    mMips = new MIPSAssembler(assembly, this);
+    mArmPC = (uint32_t **) malloc(ARM_MAX_INSTUCTIONS * sizeof(uint32_t *));
+    init_conditional_labels();
+}
+
+ArmToMipsAssembler::~ArmToMipsAssembler()
+{
+    delete mMips;
+    free((void *) mArmPC);
+}
+
+uint32_t* ArmToMipsAssembler::pc() const
+{
+    return mMips->pc();
+}
+
+uint32_t* ArmToMipsAssembler::base() const
+{
+    return mMips->base();
+}
+
+void ArmToMipsAssembler::reset()
+{
+    cond.labelnum = 0;
+    mInum = 0;
+    mMips->reset();
+}
+
+int ArmToMipsAssembler::getCodegenArch()
+{
+    return CODEGEN_ARCH_MIPS;
+}
+
+void ArmToMipsAssembler::comment(const char* string)
+{
+    mMips->comment(string);
+}
+
+void ArmToMipsAssembler::label(const char* theLabel)
+{
+    mMips->label(theLabel);
+}
+
+void ArmToMipsAssembler::disassemble(const char* name)
+{
+    mMips->disassemble(name);
+}
+
+void ArmToMipsAssembler::init_conditional_labels()
+{
+    int i;
+    for (i = 0; i < 99; ++i) {
+        sprintf(cond.label[i], "cond_%d", i);
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Prolog/Epilog & Generate...
+#endif
+
+void ArmToMipsAssembler::prolog()
+{
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->ADDIU(R_sp, R_sp, -(5 * 4));
+    mMips->SW(R_s0, R_sp, 0);
+    mMips->SW(R_s1, R_sp, 4);
+    mMips->SW(R_s2, R_sp, 8);
+    mMips->SW(R_s3, R_sp, 12);
+    mMips->SW(R_s4, R_sp, 16);
+    mMips->MOVE(R_v0, R_a0);    // move context * passed in a0 to v0 (arm r0)
+}
+
+void ArmToMipsAssembler::epilog(uint32_t touched __unused)
+{
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->LW(R_s0, R_sp, 0);
+    mMips->LW(R_s1, R_sp, 4);
+    mMips->LW(R_s2, R_sp, 8);
+    mMips->LW(R_s3, R_sp, 12);
+    mMips->LW(R_s4, R_sp, 16);
+    mMips->ADDIU(R_sp, R_sp, (5 * 4));
+    mMips->JR(R_ra);
+}
+
+int ArmToMipsAssembler::generate(const char* name)
+{
+    return mMips->generate(name);
+}
+
+uint32_t* ArmToMipsAssembler::pcForLabel(const char* label)
+{
+    return mMips->pcForLabel(label);
+}
+
+
+
+//----------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Addressing modes & shifters...
+#endif
+
+
+// do not need this for MIPS, but it is in the Interface (virtual)
+int ArmToMipsAssembler::buildImmediate(
+        uint32_t immediate, uint32_t& rot, uint32_t& imm)
+{
+    // for MIPS, any 32-bit immediate is OK
+    rot = 0;
+    imm = immediate;
+    return 0;
+}
+
+// shifters...
+
+bool ArmToMipsAssembler::isValidImmediate(uint32_t immediate __unused)
+{
+    // for MIPS, any 32-bit immediate is OK
+    return true;
+}
+
+uint32_t ArmToMipsAssembler::imm(uint32_t immediate)
+{
+    // ALOGW("immediate value %08x at pc %08x\n", immediate, (int)pc());
+    amode.value = immediate;
+    return AMODE_IMM;
+}
+
+uint32_t ArmToMipsAssembler::reg_imm(int Rm, int type, uint32_t shift)
+{
+    amode.reg = Rm;
+    amode.stype = type;
+    amode.value = shift;
+    return AMODE_REG_IMM;
+}
+
+uint32_t ArmToMipsAssembler::reg_rrx(int Rm __unused)
+{
+    // reg_rrx mode is not used in the GGLAssembler code at this time
+    return AMODE_UNSUPPORTED;
+}
+
+uint32_t ArmToMipsAssembler::reg_reg(int Rm __unused, int type __unused,
+                                     int Rs __unused)
+{
+    // reg_reg mode is not used in the GGLAssembler code at this time
+    return AMODE_UNSUPPORTED;
+}
+
+
+// addressing modes...
+// LDR(B)/STR(B)/PLD (immediate and Rm can be negative, which indicate U=0)
+uint32_t ArmToMipsAssembler::immed12_pre(int32_t immed12, int W)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+    amode.value = immed12;
+    amode.writeback = W;
+    return AMODE_IMM_12_PRE;
+}
+
+uint32_t ArmToMipsAssembler::immed12_post(int32_t immed12)
+{
+    LOG_ALWAYS_FATAL_IF(abs(immed12) >= 0x800,
+                        "LDR(B)/STR(B)/PLD immediate too big (%08x)",
+                        immed12);
+
+    amode.value = immed12;
+    return AMODE_IMM_12_POST;
+}
+
+uint32_t ArmToMipsAssembler::reg_scale_pre(int Rm, int type,
+        uint32_t shift, int W)
+{
+    LOG_ALWAYS_FATAL_IF(W | type | shift, "reg_scale_pre adv modes not yet implemented");
+
+    amode.reg = Rm;
+    // amode.stype = type;      // more advanced modes not used in GGLAssembler yet
+    // amode.value = shift;
+    // amode.writeback = W;
+    return AMODE_REG_SCALE_PRE;
+}
+
+uint32_t ArmToMipsAssembler::reg_scale_post(int Rm __unused, int type __unused,
+                                            uint32_t shift __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode reg_scale_post not yet implemented\n");
+    return AMODE_UNSUPPORTED;
+}
+
+// LDRH/LDRSB/LDRSH/STRH (immediate and Rm can be negative, which indicate U=0)
+uint32_t ArmToMipsAssembler::immed8_pre(int32_t immed8, int W __unused)
+{
+    // uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL("adr mode immed8_pre not yet implemented\n");
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+    return AMODE_IMM_8_PRE;
+}
+
+uint32_t ArmToMipsAssembler::immed8_post(int32_t immed8)
+{
+    // uint32_t offset = abs(immed8);
+
+    LOG_ALWAYS_FATAL_IF(abs(immed8) >= 0x100,
+                        "LDRH/LDRSB/LDRSH/STRH immediate too big (%08x)",
+                        immed8);
+    amode.value = immed8;
+    return AMODE_IMM_8_POST;
+}
+
+uint32_t ArmToMipsAssembler::reg_pre(int Rm, int W)
+{
+    LOG_ALWAYS_FATAL_IF(W, "reg_pre writeback not yet implemented");
+    amode.reg = Rm;
+    return AMODE_REG_PRE;
+}
+
+uint32_t ArmToMipsAssembler::reg_post(int Rm __unused)
+{
+    LOG_ALWAYS_FATAL("adr mode reg_post not yet implemented\n");
+    return AMODE_UNSUPPORTED;
+}
+
+
+
+// ----------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Data Processing...
+#endif
+
+// check if the operand registers from a previous CMP or S-bit instruction
+// would be overwritten by this instruction. If so, move the value to a
+// safe register.
+// Note that we cannot tell at _this_ instruction time if a future (conditional)
+// instruction will _also_ use this value (a defect of the simple 1-pass, one-
+// instruction-at-a-time translation). Therefore we must be conservative and
+// save the value before it is overwritten. This costs an extra MOVE instr.
+
+void ArmToMipsAssembler::protectConditionalOperands(int Rd)
+{
+    if (Rd == cond.r1) {
+        mMips->MOVE(R_cmp, cond.r1);
+        cond.r1 = R_cmp;
+    }
+    if (cond.type == CMP_COND && Rd == cond.r2) {
+        mMips->MOVE(R_cmp2, cond.r2);
+        cond.r2 = R_cmp2;
+    }
+}
+
+
+// interprets the addressing mode, and generates the common code
+// used by the majority of data-processing ops. Many MIPS instructions
+// have a register-based form and a different immediate form. See
+// opAND below for an example. (this could be inlined)
+//
+// this works with the imm(), reg_imm() methods above, which are directly
+// called by the GLLAssembler.
+// note: _signed parameter defaults to false (unsigned)
+// note: tmpReg parameter defaults to 1, MIPS register AT
+int ArmToMipsAssembler::dataProcAdrModes(int op, int& source, bool _signed, int tmpReg)
+{
+    if (op < AMODE_REG) {
+        source = op;
+        return SRC_REG;
+    } else if (op == AMODE_IMM) {
+        if ((!_signed && amode.value > 0xffff)
+                || (_signed && ((int)amode.value < -32768 || (int)amode.value > 32767) )) {
+            mMips->LUI(tmpReg, (amode.value >> 16));
+            if (amode.value & 0x0000ffff) {
+                mMips->ORI(tmpReg, tmpReg, (amode.value & 0x0000ffff));
+            }
+            source = tmpReg;
+            return SRC_REG;
+        } else {
+            source = amode.value;
+            return SRC_IMM;
+        }
+    } else if (op == AMODE_REG_IMM) {
+        switch (amode.stype) {
+            case LSL: mMips->SLL(tmpReg, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(tmpReg, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(tmpReg, amode.reg, amode.value); break;
+            case ROR: if (mips32r2) {
+                          mMips->ROTR(tmpReg, amode.reg, amode.value);
+                      } else {
+                          mMips->RORIsyn(tmpReg, amode.reg, amode.value);
+                      }
+                      break;
+        }
+        source = tmpReg;
+        return SRC_REG;
+    } else {  // adr mode RRX is not used in GGL Assembler at this time
+        // unrecoverable here; this should be an exception or assert-fail
+        LOG_ALWAYS_FATAL("adr mode reg_rrx not yet implemented\n");
+        return SRC_ERROR;
+    }
+}
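+
+// Example (illustrative): imm(0x12345678) followed by an opAND lands here as
+// AMODE_IMM; since the value exceeds 16 bits, this emits
+//     LUI  $at, 0x1234
+//     ORI  $at, $at, 0x5678
+// and returns SRC_REG with source = AT, so the caller uses the register form.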
+
+
+void ArmToMipsAssembler::dataProcessing(int opcode, int cc,
+        int s, int Rd, int Rn, uint32_t Op2)
+{
+    int src;    // src is modified by dataProcAdrModes() - passed as int&
+
+
+    if (cc != AL) {
+        protectConditionalOperands(Rd);
+        // the branch tests register(s) set by prev CMP or instr with 'S' bit set
+        // invert the condition to jump past this conditional instruction
+        ArmToMipsAssembler::B(cc^1, cond.label[++cond.labelnum]);
+    } else {
+        mArmPC[mInum++] = pc();  // save starting PC for this instr
+    }
+
+    switch (opcode) {
+    case opAND:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->AND(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ANDI(Rd, Rn, src);
+        }
+        break;
+
+    case opADD:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->ADDU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ADDIU(Rd, Rn, src);
+        }
+        break;
+
+    case opSUB:
+        // set "signed" to true for adr modes
+        if (dataProcAdrModes(Op2, src, true) == SRC_REG) {
+            mMips->SUBU(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->SUBIU(Rd, Rn, src);
+        }
+        break;
+
+    case opEOR:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->XOR(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->XORI(Rd, Rn, src);
+        }
+        break;
+
+    case opORR:
+        if (dataProcAdrModes(Op2, src) == SRC_REG) {
+            mMips->OR(Rd, Rn, src);
+        } else {                        // adr mode was SRC_IMM
+            mMips->ORI(Rd, Rn, src);
+        }
+        break;
+
+    case opBIC:
+        if (dataProcAdrModes(Op2, src) == SRC_IMM) {
+            // if op2 is a 16-bit immediate, load it into the AT reg
+            mMips->ORI(R_at, 0, src);
+            src = R_at;
+        }
+        mMips->NOT(R_at, src);
+        mMips->AND(Rd, Rn, R_at);
+        break;
+
+    case opRSB:
+        if (dataProcAdrModes(Op2, src) == SRC_IMM) {
+            // if op2 is a 16-bit immediate, load it into the AT reg
+            mMips->ORI(R_at, 0, src);
+            src = R_at;
+        }
+        mMips->SUBU(Rd, src, Rn);   // subu with the parameters reversed
+        break;
+
+    case opMOV:
+        if (Op2 < AMODE_REG) {  // op2 is reg # in this case
+            mMips->MOVE(Rd, Op2);
+        } else if (Op2 == AMODE_IMM) {
+            if (amode.value > 0xffff) {
+                mMips->LUI(Rd, (amode.value >> 16));
+                if (amode.value & 0x0000ffff) {
+                    mMips->ORI(Rd, Rd, (amode.value & 0x0000ffff));
+                }
+             } else {
+                mMips->ORI(Rd, 0, amode.value);
+            }
+        } else if (Op2 == AMODE_REG_IMM) {
+            switch (amode.stype) {
+            case LSL: mMips->SLL(Rd, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(Rd, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(Rd, amode.reg, amode.value); break;
+            case ROR: if (mips32r2) {
+                          mMips->ROTR(Rd, amode.reg, amode.value);
+                      } else {
+                          mMips->RORIsyn(Rd, amode.reg, amode.value);
+                      }
+                      break;
+            }
+        }
+        else {
+            // adr mode RRX is not used in GGL Assembler at this time
+            mMips->UNIMPL();
+        }
+        break;
+
+    case opMVN:     // this is a 1's complement: NOT
+        if (Op2 < AMODE_REG) {  // op2 is reg # in this case
+            mMips->NOR(Rd, Op2, 0);     // NOT is NOR with 0
+            break;
+        } else if (Op2 == AMODE_IMM) {
+            if (amode.value > 0xffff) {
+                mMips->LUI(Rd, (amode.value >> 16));
+                if (amode.value & 0x0000ffff) {
+                    mMips->ORI(Rd, Rd, (amode.value & 0x0000ffff));
+                }
+             } else {
+                mMips->ORI(Rd, 0, amode.value);
+             }
+        } else if (Op2 == AMODE_REG_IMM) {
+            switch (amode.stype) {
+            case LSL: mMips->SLL(Rd, amode.reg, amode.value); break;
+            case LSR: mMips->SRL(Rd, amode.reg, amode.value); break;
+            case ASR: mMips->SRA(Rd, amode.reg, amode.value); break;
+            case ROR: if (mips32r2) {
+                          mMips->ROTR(Rd, amode.reg, amode.value);
+                      } else {
+                          mMips->RORIsyn(Rd, amode.reg, amode.value);
+                      }
+                      break;
+            }
+        }
+        else {
+            // adr mode RRX is not used in GGL Assembler at this time
+            mMips->UNIMPL();
+        }
+        mMips->NOR(Rd, Rd, 0);     // NOT is NOR with 0
+        break;
+
+    case opCMP:
+        // Either operand of a CMP instr could get overwritten by a subsequent
+        // conditional instruction, which is ok, _UNLESS_ there is a _second_
+        // conditional instruction. Under MIPS, this requires doing the comparison
+        // again (SLT), and the original operands must be available. (and this
+        // pattern of multiple conditional instructions from same CMP _is_ used
+        // in GGL-Assembler)
+        //
+        // For now, if a conditional instr overwrites the operands, we will
+        // move them to dedicated temp regs. This is ugly, and inefficient,
+        // and should be optimized.
+        //
+        // WARNING: making an _Assumption_ that CMP operand regs will NOT be
+        // trashed by intervening NON-conditional instructions. In the general
+        // case this is legal, but it is NOT currently done in GGL-Assembler.
+
+        cond.type = CMP_COND;
+        cond.r1 = Rn;
+        if (dataProcAdrModes(Op2, src, false, R_cmp2) == SRC_REG) {
+            cond.r2 = src;
+        } else {                        // adr mode was SRC_IMM
+            mMips->ORI(R_cmp2, R_zero, src);
+            cond.r2 = R_cmp2;
+        }
+
+        break;
+
+
+    case opTST:
+    case opTEQ:
+    case opCMN:
+    case opADC:
+    case opSBC:
+    case opRSC:
+        mMips->UNIMPL(); // currently unused in GGL Assembler code
+        break;
+    }
+
+    if (cc != AL) {
+        mMips->label(cond.label[cond.labelnum]);
+    }
+    if (s && opcode != opCMP) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
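+
+// Conditional-execution sketch (illustrative): after a CMP has recorded its
+// operands in cond.r1/cond.r2, a conditional "ADDNE r0, r0, #1" is translated
+// by inverting the condition and branching around the instruction:
+//     BEQ   cond.r1, cond.r2, cond_N   // NE inverted to EQ
+//     ADDIU $2, $2, 1
+// cond_N: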
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Multiply...
+#endif
+
+// multiply, accumulate
+void ArmToMipsAssembler::MLA(int cc __unused, int s,
+        int Rd, int Rm, int Rs, int Rn) {
+
+    mArmPC[mInum++] = pc();  // save starting PC for this instr
+
+    mMips->MUL(R_at, Rm, Rs);
+    mMips->ADDU(Rd, R_at, Rn);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+void ArmToMipsAssembler::MUL(int cc __unused, int s,
+        int Rd, int Rm, int Rs) {
+    mArmPC[mInum++] = pc();
+    mMips->MUL(Rd, Rm, Rs);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = Rd;
+    }
+}
+
+void ArmToMipsAssembler::UMULL(int cc __unused, int s,
+        int RdLo, int RdHi, int Rm, int Rs) {
+    mArmPC[mInum++] = pc();
+    mMips->MULT(Rm, Rs);
+    mMips->MFHI(RdHi);
+    mMips->MFLO(RdLo);
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on UMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMipsAssembler::UMUAL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "UMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<21) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on UMULL must be on 64-bit result\n");
+    }
+}
+
+void ArmToMipsAssembler::SMULL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMULL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on SMULL must be on 64-bit result\n");
+    }
+}
+void ArmToMipsAssembler::SMUAL(int cc __unused, int s,
+        int RdLo __unused, int RdHi, int Rm __unused, int Rs __unused) {
+    LOG_FATAL_IF(RdLo==Rm || RdHi==Rm || RdLo==RdHi,
+                        "SMUAL(r%u,r%u,r%u,r%u)", RdLo,RdHi,Rm,Rs);
+    // *mPC++ =    (cc<<28) | (1<<23) | (1<<22) | (1<<21) | (s<<20) |
+    //             (RdHi<<16) | (RdLo<<12) | (Rs<<8) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+    if (s) {
+        cond.type = SBIT_COND;
+        cond.r1 = RdHi;     // BUG...
+        LOG_ALWAYS_FATAL("Condition on SMUAL must be on 64-bit result\n");
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Branches...
+#endif
+
+// branches...
+
+void ArmToMipsAssembler::B(int cc, const char* label)
+{
+    mArmPC[mInum++] = pc();
+    if (cond.type == SBIT_COND) { cond.r2 = R_zero; }
+
+    switch(cc) {
+        case EQ: mMips->BEQ(cond.r1, cond.r2, label); break;
+        case NE: mMips->BNE(cond.r1, cond.r2, label); break;
+        case HS: mMips->BGEU(cond.r1, cond.r2, label); break;
+        case LO: mMips->BLTU(cond.r1, cond.r2, label); break;
+        case MI: mMips->BLT(cond.r1, cond.r2, label); break;
+        case PL: mMips->BGE(cond.r1, cond.r2, label); break;
+
+        case HI: mMips->BGTU(cond.r1, cond.r2, label); break;
+        case LS: mMips->BLEU(cond.r1, cond.r2, label); break;
+        case GE: mMips->BGE(cond.r1, cond.r2, label); break;
+        case LT: mMips->BLT(cond.r1, cond.r2, label); break;
+        case GT: mMips->BGT(cond.r1, cond.r2, label); break;
+        case LE: mMips->BLE(cond.r1, cond.r2, label); break;
+        case AL: mMips->B(label); break;
+        case NV: /* B Never - no instruction */ break;
+
+        case VS:
+        case VC:
+        default:
+            LOG_ALWAYS_FATAL("Unsupported cc: %02x\n", cc);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::BL(int cc __unused, const char* label __unused)
+{
+    LOG_ALWAYS_FATAL("branch-and-link not supported yet\n");
+    mArmPC[mInum++] = pc();
+}
+
+// no use for Branches with integer PC, but they're in the Interface class ....
+void ArmToMipsAssembler::B(int cc __unused, uint32_t* to_pc __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+void ArmToMipsAssembler::BL(int cc __unused, uint32_t* to_pc __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+void ArmToMipsAssembler::BX(int cc __unused, int Rn __unused)
+{
+    LOG_ALWAYS_FATAL("branch to absolute PC not supported, use Label\n");
+    mArmPC[mInum++] = pc();
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Data Transfer...
+#endif
+
+// data transfer...
+void ArmToMipsAssembler::LDR(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
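+    // 'offset' is really an AMODE_* tag returned by the addressing-mode
+    // shims (immed12_pre() etc.), which stash the actual operand in 'amode'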
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR via Arm SP to LW via Mips SP
+            }
+            mMips->LW(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->ADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;      // convert LDR thru Arm SP to LW thru Mips SP
+            }
+            mMips->LW(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->ADDU(R_at, Rn, amode.reg);
+            mMips->LW(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::LDRB(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            mMips->LBU(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->ADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->LBU(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->ADDU(R_at, Rn, amode.reg);
+            mMips->LBU(Rd, R_at, 0);
+            break;
+    }
+
+}
+
+void ArmToMipsAssembler::STR(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            if (Rn == ARMAssemblerInterface::SP) {
+                Rn = R_sp;  // convert STR thru Arm SP to SW thru Mips SP
+            }
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                // If we will writeback, then update the index reg, then store.
+                // This correctly handles stack-push case.
+                mMips->ADDIU(Rn, Rn, amode.value);
+                mMips->SW(Rd, Rn, 0);
+            } else {
+                // No writeback so store offset by value
+                mMips->SW(Rd, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SW(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);  // post index always writes back
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->ADDU(R_at, Rn, amode.reg);
+            mMips->SW(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::STRB(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed12_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            amode.writeback = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_12_PRE:
+            mMips->SB(Rd, Rn, amode.value);
+            if (amode.writeback) {      // OPTIONAL writeback on pre-index mode
+                mMips->ADDIU(Rn, Rn, amode.value);
+            }
+            break;
+        case AMODE_IMM_12_POST:
+            mMips->SB(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_SCALE_PRE:
+            // we only support simple base + index, no advanced modes for this one yet
+            mMips->ADDU(R_at, Rn, amode.reg);
+            mMips->SB(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::LDRH(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed8_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_8_PRE:      // no support yet for writeback
+            mMips->LHU(Rd, Rn, amode.value);
+            break;
+        case AMODE_IMM_8_POST:
+            mMips->LHU(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_PRE:
+            // we only support simple base +/- index
+            if (amode.reg >= 0) {
+                mMips->ADDU(R_at, Rn, amode.reg);
+            } else {
+                mMips->SUBU(R_at, Rn, abs(amode.reg));
+            }
+            mMips->LHU(Rd, R_at, 0);
+            break;
+    }
+}
+
+void ArmToMipsAssembler::LDRSB(int cc __unused, int Rd __unused,
+                               int Rn __unused, uint32_t offset __unused)
+{
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::LDRSH(int cc __unused, int Rd __unused,
+                               int Rn __unused, uint32_t offset __unused)
+{
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::STRH(int cc __unused, int Rd, int Rn, uint32_t offset)
+{
+    mArmPC[mInum++] = pc();
+    // work-around for ARM default address mode of immed8_pre(0)
+    if (offset > AMODE_UNSUPPORTED) offset = 0;
+    switch (offset) {
+        case 0:
+            amode.value = 0;
+            // fall thru to next case ....
+        case AMODE_IMM_8_PRE:      // no support yet for writeback
+            mMips->SH(Rd, Rn, amode.value);
+            break;
+        case AMODE_IMM_8_POST:
+            mMips->SH(Rd, Rn, 0);
+            mMips->ADDIU(Rn, Rn, amode.value);
+            break;
+        case AMODE_REG_PRE:
+            // we only support simple base +/- index
+            if (amode.reg >= 0) {
+                mMips->ADDU(R_at, Rn, amode.reg);
+            } else {
+                mMips->SUBU(R_at, Rn, abs(amode.reg));
+            }
+            mMips->SH(Rd, R_at, 0);
+            break;
+    }
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Block Data Transfer...
+#endif
+
+// block data transfer...
+void ArmToMipsAssembler::LDM(int cc __unused, int dir __unused,
+        int Rn __unused, int W __unused, uint32_t reg_list __unused)
+{   //                        ED FD EA FA      IB IA DB DA
+    // const uint8_t P[8] = { 1, 0, 1, 0,      1, 0, 1, 0 };
+    // const uint8_t U[8] = { 1, 1, 0, 0,      1, 1, 0, 0 };
+    // *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+    //         (uint32_t(U[dir])<<23) | (1<<20) | (W<<21) | (Rn<<16) | reg_list;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::STM(int cc __unused, int dir __unused,
+        int Rn __unused, int W __unused, uint32_t reg_list __unused)
+{   //                        FA EA FD ED      IB IA DB DA
+    // const uint8_t P[8] = { 0, 1, 0, 1,      1, 0, 1, 0 };
+    // const uint8_t U[8] = { 0, 0, 1, 1,      1, 1, 0, 0 };
+    // *mPC++ = (cc<<28) | (4<<25) | (uint32_t(P[dir])<<24) |
+    //         (uint32_t(U[dir])<<23) | (0<<20) | (W<<21) | (Rn<<16) | reg_list;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Special...
+#endif
+
+// special...
+void ArmToMipsAssembler::SWP(int cc __unused, int Rn __unused,
+                             int Rd __unused, int Rm __unused) {
+    // *mPC++ = (cc<<28) | (2<<23) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::SWPB(int cc __unused, int Rn __unused,
+                              int Rd __unused, int Rm __unused) {
+    // *mPC++ = (cc<<28) | (2<<23) | (1<<22) | (Rn<<16) | (Rd << 12) | 0x90 | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::SWI(int cc __unused, uint32_t comment __unused) {
+    // *mPC++ = (cc<<28) | (0xF<<24) | comment;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+
+#if 0
+#pragma mark -
+#pragma mark DSP instructions...
+#endif
+
+// DSP instructions...
+void ArmToMipsAssembler::PLD(int Rn __unused, uint32_t offset) {
+    LOG_ALWAYS_FATAL_IF(!((offset&(1<<24)) && !(offset&(1<<21))),
+                        "PLD only P=1, W=0");
+    // *mPC++ = 0xF550F000 | (Rn<<16) | offset;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::CLZ(int cc __unused, int Rd, int Rm)
+{
+    mArmPC[mInum++] = pc();
+    mMips->CLZ(Rd, Rm);
+}
+
+void ArmToMipsAssembler::QADD(int cc __unused,  int Rd __unused,
+                              int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1000050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::QDADD(int cc __unused,  int Rd __unused,
+                               int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1400050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::QSUB(int cc __unused,  int Rd __unused,
+                              int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1200050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::QDSUB(int cc __unused,  int Rd __unused,
+                               int Rm __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1600050 | (Rn<<16) | (Rd<<12) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+// 16 x 16 signed multiply (like SMLAxx without the accumulate)
+void ArmToMipsAssembler::SMUL(int cc __unused, int xy,
+                int Rd, int Rm, int Rs)
+{
+    mArmPC[mInum++] = pc();
+
+    // the 16 bits may be in the top or bottom half of 32-bit source reg,
+    // as defined by the codes BB, BT, TB, TT (compressed param xy)
+    // where x corresponds to Rm and y to Rs
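+    // e.g. xy == TB takes Rm[31:16] and Rs[15:0]; bottom halves are
+    // sign-extended with SEH on mips32r2, or an SLL/SRA pair otherwise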
+
+    // select half-reg for Rm
+    if (xy & xyTB) {
+        // use top 16-bits
+        mMips->SRA(R_at, Rm, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        if (mips32r2) {
+            mMips->SEH(R_at, Rm);
+        } else {
+            mMips->SLL(R_at, Rm, 16);
+            mMips->SRA(R_at, R_at, 16);
+        }
+    }
+    // select half-reg for Rs
+    if (xy & xyBT) {
+        // use top 16-bits
+        mMips->SRA(R_at2, Rs, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        if (mips32r2) {
+            mMips->SEH(R_at2, Rs);
+        } else {
+            mMips->SLL(R_at2, Rs, 16);
+            mMips->SRA(R_at2, R_at2, 16);
+        }
+    }
+    mMips->MUL(Rd, R_at, R_at2);
+}
+
+// signed 32b x 16b multiply, save top 32 bits of the 48-bit result
+void ArmToMipsAssembler::SMULW(int cc __unused, int y,
+                int Rd, int Rm, int Rs)
+{
+    mArmPC[mInum++] = pc();
+
+    // the selector yT or yB refers to reg Rs
+    if (y & yT) {
+        // zero the bottom 16 bits with two shifts; stray low bits would affect the product
+        mMips->SRL(R_at, Rs, 16);
+        mMips->SLL(R_at, R_at, 16);
+
+    } else {
+        // move low 16-bit half, to high half
+        mMips->SLL(R_at, Rs, 16);
+    }
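+    // with the 16-bit operand shifted into the high half, MFHI returns
+    // (Rm * halfword) >> 16: the top 32 bits of the 48-bit product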
+    mMips->MULT(Rm, R_at);
+    mMips->MFHI(Rd);
+}
+
+// 16 x 16 signed multiply, accumulate: Rd = Rm{16} * Rs{16} + Rn
+void ArmToMipsAssembler::SMLA(int cc __unused, int xy,
+                int Rd, int Rm, int Rs, int Rn)
+{
+    mArmPC[mInum++] = pc();
+
+    // the 16 bits may be in the top or bottom half of 32-bit source reg,
+    // as defined by the codes BB, BT, TB, TT (compressed param xy)
+    // where x corresponds to Rm and y to Rs
+
+    // select half-reg for Rm
+    if (xy & xyTB) {
+        // use top 16-bits
+        mMips->SRA(R_at, Rm, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        if (mips32r2) {
+            mMips->SEH(R_at, Rm);
+        } else {
+            mMips->SLL(R_at, Rm, 16);
+            mMips->SRA(R_at, R_at, 16);
+        }
+    }
+    // select half-reg for Rs
+    if (xy & xyBT) {
+        // use top 16-bits
+        mMips->SRA(R_at2, Rs, 16);
+    } else {
+        // use bottom 16, but sign-extend to 32
+        if (mips32r2) {
+            mMips->SEH(R_at2, Rs);
+        } else {
+            mMips->SLL(R_at2, Rs, 16);
+            mMips->SRA(R_at2, R_at2, 16);
+        }
+    }
+
+    mMips->MUL(R_at, R_at, R_at2);
+    mMips->ADDU(Rd, R_at, Rn);
+}
+
+void ArmToMipsAssembler::SMLAL(int cc __unused, int xy __unused,
+                               int RdHi __unused, int RdLo __unused,
+                               int Rs __unused, int Rm __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1400080 | (RdHi<<16) | (RdLo<<12) | (Rs<<8) | (xy<<4) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+void ArmToMipsAssembler::SMLAW(int cc __unused, int y __unused,
+                               int Rd __unused, int Rm __unused,
+                               int Rs __unused, int Rn __unused)
+{
+    // *mPC++ = (cc<<28) | 0x1200080 | (Rd<<16) | (Rn<<12) | (Rs<<8) | (y<<4) | Rm;
+    mArmPC[mInum++] = pc();
+    mMips->NOP2();
+    NOT_IMPLEMENTED();
+}
+
+// used by ARMv6 version of GGLAssembler::filter32
+void ArmToMipsAssembler::UXTB16(int cc __unused, int Rd, int Rm, int rotate)
+{
+    mArmPC[mInum++] = pc();
+
+    //Rd[31:16] := ZeroExtend((Rm ROR (8 * sh))[23:16]),
+    //Rd[15:0] := ZeroExtend((Rm ROR (8 * sh))[7:0]). sh 0-3.
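+    // e.g. rotate==0 packs bytes 0 and 2 of Rm into the two half-words of Rd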
+
+    mMips->ROTR(Rm, Rm, rotate * 8);
+    // no 32-bit immediates on MIPS: build the 0x00FF00FF byte mask in the
+    // scratch reg, then AND register-to-register
+    mMips->LUI(R_at, 0x00FF);
+    mMips->ORI(R_at, R_at, 0x00FF);
+    mMips->AND(Rd, Rm, R_at);
+}
+
+void ArmToMipsAssembler::UBFX(int cc __unused, int Rd __unused,
+                              int Rn __unused, int lsb __unused,
+                              int width __unused)
+{
+     /* Placeholder for UBFX */
+     mArmPC[mInum++] = pc();
+
+     mMips->NOP2();
+     NOT_IMPLEMENTED();
+}
+
+
+
+
+
+#if 0
+#pragma mark -
+#pragma mark MIPS Assembler...
+#endif
+
+
+//**************************************************************************
+//**************************************************************************
+//**************************************************************************
+
+
+/* mips assembler
+** this is a subset of mips32r2, targeted specifically at ARM instruction
+** replacement in the pixelflinger/codeflinger code.
+**
+** To that end, there is no need for floating point or privileged
+** instructions. This all runs in user space, no float.
+**
+** The syntax makes no attempt to be as complete as a full assembler,
+** with synthetic instructions and automatic recognition of immediate
+** operands (here you must use the immediate form of the instruction
+** explicitly), etc.
+**
+** We start with mips32r1, and may add r2 and dsp extensions if the cpu
+** supports them. The decision is made at compile time, based on gcc
+** options (which makes sense, since android is built for a specific
+** device).
+*/
+
+MIPSAssembler::MIPSAssembler(const sp<Assembly>& assembly, ArmToMipsAssembler *parent)
+    : mParent(parent),
+    mAssembly(assembly)
+{
+    mBase = mPC = (uint32_t *)assembly->base();
+    mDuration = ggl_system_time();
+}
+
+MIPSAssembler::MIPSAssembler(void* assembly)
+    : mParent(NULL), mAssembly(NULL)
+{
+    mBase = mPC = (uint32_t *)assembly;
+}
+
+MIPSAssembler::~MIPSAssembler()
+{
+}
+
+
+uint32_t* MIPSAssembler::pc() const
+{
+    return mPC;
+}
+
+uint32_t* MIPSAssembler::base() const
+{
+    return mBase;
+}
+
+void MIPSAssembler::reset()
+{
+    mBase = mPC = (uint32_t *)mAssembly->base();
+    mBranchTargets.clear();
+    mLabels.clear();
+    mLabelsInverseMapping.clear();
+    mComments.clear();
+}
+
+
+// convert tabs to spaces, and remove any newline
+// works with strings of limited size (makes a temp copy)
+#define TABSTOP 8
+void MIPSAssembler::string_detab(char *s)
+{
+    char *os = s;
+    char temp[100];
+    char *t = temp;
+    int len = 99;
+    int i = TABSTOP;
+
+    while (*s && len-- > 0) {
+        if (*s == '\n') { s++; continue; }
+        if (*s == '\t') {
+            s++;
+            for ( ; i>0; i--) {*t++ = ' '; len--; }
+        } else {
+            *t++ = *s++;
+        }
+        if (i <= 0) i = TABSTOP;
+        i--;
+    }
+    *t = '\0';
+    strcpy(os, temp);
+}
+
+void MIPSAssembler::string_pad(char *s, int padded_len)
+{
+    int len = strlen(s);
+    s += len;
+    for (int i = padded_len - len; i > 0; --i) {
+        *s++ = ' ';
+    }
+    *s = '\0';
+}
+
+// ----------------------------------------------------------------------------
+
+void MIPSAssembler::disassemble(const char* name)
+{
+    char di_buf[140];
+
+    if (name) {
+        ALOGW("%s:\n", name);
+    }
+
+    bool arm_disasm_fmt = (mParent->mArmDisassemblyBuffer != NULL);
+
+    typedef char dstr[40];
+    dstr *lines = (dstr *)mParent->mArmDisassemblyBuffer;
+
+    if (mParent->mArmDisassemblyBuffer != NULL) {
+        for (int i=0; i<mParent->mArmInstrCount; ++i) {
+            string_detab(lines[i]);
+        }
+    }
+
+    size_t count = pc()-base();
+    uint32_t* mipsPC = base();
+    while (count--) {
+        ssize_t label = mLabelsInverseMapping.indexOfKey(mipsPC);
+        if (label >= 0) {
+            ALOGW("%s:\n", mLabelsInverseMapping.valueAt(label));
+        }
+        ssize_t comment = mComments.indexOfKey(mipsPC);
+        if (comment >= 0) {
+            ALOGW("; %s\n", mComments.valueAt(comment));
+        }
+        // ALOGW("%08x:    %08x    ", int(i), int(i[0]));
+        ::mips_disassem(mipsPC, di_buf, arm_disasm_fmt);
+        string_detab(di_buf);
+        string_pad(di_buf, 30);
+        ALOGW("0x%p:    %08x    %s", mipsPC, uint32_t(*mipsPC), di_buf);
+        mipsPC++;
+    }
+}
+
+void MIPSAssembler::comment(const char* string)
+{
+    mComments.add(pc(), string);
+}
+
+void MIPSAssembler::label(const char* theLabel)
+{
+    mLabels.add(theLabel, pc());
+    mLabelsInverseMapping.add(pc(), theLabel);
+}
+
+
+void MIPSAssembler::prolog()
+{
+    // empty - done in ArmToMipsAssembler
+}
+
+void MIPSAssembler::epilog(uint32_t touched __unused)
+{
+    // empty - done in ArmToMipsAssembler
+}
+
+int MIPSAssembler::generate(const char* name)
+{
+    // fixup all the branches
+    size_t count = mBranchTargets.size();
+    while (count--) {
+        const branch_target_t& bt = mBranchTargets[count];
+        uint32_t* target_pc = mLabels.valueFor(bt.label);
+        LOG_ALWAYS_FATAL_IF(!target_pc,
+                "error resolving branch targets, target_pc is null");
+        int32_t offset = int32_t(target_pc - (bt.pc+1));
+        *bt.pc |= offset & 0x00FFFF;
+    }
+
+    mAssembly->resize( int(pc()-base())*4 );
+
+    // the instruction & data caches are flushed by CodeCache
+    const int64_t duration = ggl_system_time() - mDuration;
+    const char * const format = "generated %s (%d ins) at [%p:%p] in %" PRId64 " ns\n";
+    ALOGI(format, name, int(pc()-base()), base(), pc(), duration);
+
+    char value[PROPERTY_VALUE_MAX];
+    value[0] = '\0';
+
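+    // set the "debug.pf.disasm" property to 1 to dump the generated code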
+    property_get("debug.pf.disasm", value, "0");
+
+    if (atoi(value) != 0) {
+        disassemble(name);
+    }
+
+    return OK;
+}
+
+uint32_t* MIPSAssembler::pcForLabel(const char* label)
+{
+    return mLabels.valueFor(label);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Arithmetic...
+#endif
+
+void MIPSAssembler::ADDU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (addu_fn<<FUNC_SHF)
+                    | (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+// MD00086 pdf says this is: ADDIU rt, rs, imm -- they do not use Rd
+void MIPSAssembler::ADDIU(int Rt, int Rs, int16_t imm)
+{
+    *mPC++ = (addiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+
+void MIPSAssembler::SUBU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (subu_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+
+void MIPSAssembler::SUBIU(int Rt, int Rs, int16_t imm)   // really addiu(d, s, -j)
+{
+    *mPC++ = (addiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | ((-imm) & MSK_16);
+}
+
+
+void MIPSAssembler::NEGU(int Rd, int Rs)    // really subu(d, zero, s)
+{
+    MIPSAssembler::SUBU(Rd, 0, Rs);
+}
+
+void MIPSAssembler::MUL(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (mul_fn<<FUNC_SHF) |
+                        (Rs<<RS_SHF) | (Rt<<RT_SHF) | (Rd<<RD_SHF) ;
+}
+
+void MIPSAssembler::MULT(int Rs, int Rt)    // dest is hi,lo
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mult_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MULTU(int Rs, int Rt)    // dest is hi,lo
+{
+    *mPC++ = (spec_op<<OP_SHF) | (multu_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MADD(int Rs, int Rt)    // hi,lo = hi,lo + Rs * Rt
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (madd_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MADDU(int Rs, int Rt)    // hi,lo = hi,lo + Rs * Rt
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (maddu_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+
+void MIPSAssembler::MSUB(int Rs, int Rt)    // hi,lo = hi,lo - Rs * Rt
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (msub_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MSUBU(int Rs, int Rt)    // hi,lo = hi,lo - Rs * Rt
+{
+    *mPC++ = (spec2_op<<OP_SHF) | (msubu_fn<<FUNC_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF);
+}
+
+
+void MIPSAssembler::SEB(int Rd, int Rt)    // sign-extend byte (mips32r2)
+{
+    *mPC++ = (spec3_op<<OP_SHF) | (bshfl_fn<<FUNC_SHF) | (seb_fn << SA_SHF) |
+                    (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+void MIPSAssembler::SEH(int Rd, int Rt)    // sign-extend half-word (mips32r2)
+{
+    *mPC++ = (spec3_op<<OP_SHF) | (bshfl_fn<<FUNC_SHF) | (seh_fn << SA_SHF) |
+                    (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Comparisons...
+#endif
+
+void MIPSAssembler::SLT(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (slt_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::SLTI(int Rt, int Rs, int16_t imm)
+{
+    *mPC++ = (slti_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+
+void MIPSAssembler::SLTU(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (sltu_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::SLTIU(int Rt, int Rs, int16_t imm)
+{
+    *mPC++ = (sltiu_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Logical...
+#endif
+
+void MIPSAssembler::AND(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (and_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::ANDI(int Rt, int Rs, uint16_t imm)      // todo: support larger immediate
+{
+    *mPC++ = (andi_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+
+void MIPSAssembler::OR(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (or_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::ORI(int Rt, int Rs, uint16_t imm)
+{
+    *mPC++ = (ori_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+void MIPSAssembler::NOR(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (nor_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::NOT(int Rd, int Rs)
+{
+    MIPSAssembler::NOR(Rd, Rs, 0);  // NOT(d,s) = NOR(d,s,zero)
+}
+
+void MIPSAssembler::XOR(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (xor_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::XORI(int Rt, int Rs, uint16_t imm)  // todo: support larger immediate
+{
+    *mPC++ = (xori_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | (imm & MSK_16);
+}
+
+void MIPSAssembler::SLL(int Rd, int Rt, int shft)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (sll_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rt<<RT_SHF) | (shft<<RE_SHF);
+}
+
+void MIPSAssembler::SLLV(int Rd, int Rt, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (sllv_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::SRL(int Rd, int Rt, int shft)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (srl_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rt<<RT_SHF) | (shft<<RE_SHF);
+}
+
+void MIPSAssembler::SRLV(int Rd, int Rt, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (srlv_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::SRA(int Rd, int Rt, int shft)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (sra_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rt<<RT_SHF) | (shft<<RE_SHF);
+}
+
+void MIPSAssembler::SRAV(int Rd, int Rt, int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (srav_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::ROTR(int Rd, int Rt, int shft)      // mips32r2
+{
+    // note odd encoding: ROTR is SRL with bit 21 (the RS field) set to 1
+    *mPC++ = (spec_op<<OP_SHF) | (srl_fn<<FUNC_SHF) |
+                        (1<<RS_SHF) | (Rd<<RD_SHF) | (Rt<<RT_SHF) | (shft<<RE_SHF);
+}
+
+void MIPSAssembler::ROTRV(int Rd, int Rt, int Rs)       // mips32r2
+{
+    // note odd encoding: ROTRV is SRLV with bit 6 (the SA field) set to 1
+    *mPC++ = (spec_op<<OP_SHF) | (srlv_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF) | (1<<RE_SHF);
+}
+
+// uses at2 register (mapped to some appropriate mips reg)
+void MIPSAssembler::RORsyn(int Rd, int Rt, int Rs)
+{
+    // synthetic: d = t rotated by s
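+    // i.e. d = (t >> s) | (t << (32-s)); the NEGU below yields 32-s because
+    // variable shifts only use the low 5 bits of the shift amount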
+    MIPSAssembler::NEGU(R_at2, Rs);
+    MIPSAssembler::SLLV(R_at2, Rt, R_at2);
+    MIPSAssembler::SRLV(Rd, Rt, Rs);
+    MIPSAssembler::OR(Rd, Rd, R_at2);
+}
+
+// immediate version - uses at2 register (mapped to some appropriate mips reg)
+void MIPSAssembler::RORIsyn(int Rd, int Rt, int rot)
+{
+    // synthetic: d = t rotated by immed rot
+    // d = s >> rot | s << (32-rot)
+    MIPSAssembler::SLL(R_at2, Rt, 32-rot);
+    MIPSAssembler::SRL(Rd, Rt, rot);
+    MIPSAssembler::OR(Rd, Rd, R_at2);
+}
+
+void MIPSAssembler::CLO(int Rd, int Rs)
+{
+    // Rt field must have same gpr # as Rd
+    *mPC++ = (spec2_op<<OP_SHF) | (clo_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rd<<RT_SHF);
+}
+
+void MIPSAssembler::CLZ(int Rd, int Rs)
+{
+    // Rt field must have same gpr # as Rd
+    *mPC++ = (spec2_op<<OP_SHF) | (clz_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rd<<RT_SHF);
+}
+
+void MIPSAssembler::WSBH(int Rd, int Rt)      // mips32r2
+{
+    *mPC++ = (spec3_op<<OP_SHF) | (bshfl_fn<<FUNC_SHF) | (wsbh_fn << SA_SHF) |
+                        (Rt<<RT_SHF) | (Rd<<RD_SHF);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Load/store...
+#endif
+
+void MIPSAssembler::LW(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lw_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::SW(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (sw_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+// lb is sign-extended
+void MIPSAssembler::LB(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lb_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::LBU(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lbu_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::SB(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (sb_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+// lh is sign-extended
+void MIPSAssembler::LH(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lh_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::LHU(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (lhu_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::SH(int Rt, int Rbase, int16_t offset)
+{
+    *mPC++ = (sh_op<<OP_SHF) | (Rbase<<RS_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+void MIPSAssembler::LUI(int Rt, int16_t offset)
+{
+    *mPC++ = (lui_op<<OP_SHF) | (Rt<<RT_SHF) | (offset & MSK_16);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Register move...
+#endif
+
+void MIPSAssembler::MOVE(int Rd, int Rs)
+{
+    // encoded as "or rd, rs, zero"
+    *mPC++ = (spec_op<<OP_SHF) | (or_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (0<<RT_SHF);
+}
+
+void MIPSAssembler::MOVN(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (movn_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::MOVZ(int Rd, int Rs, int Rt)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (movz_fn<<FUNC_SHF) |
+                        (Rd<<RD_SHF) | (Rs<<RS_SHF) | (Rt<<RT_SHF);
+}
+
+void MIPSAssembler::MFHI(int Rd)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mfhi_fn<<FUNC_SHF) | (Rd<<RD_SHF);
+}
+
+void MIPSAssembler::MFLO(int Rd)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mflo_fn<<FUNC_SHF) | (Rd<<RD_SHF);
+}
+
+void MIPSAssembler::MTHI(int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mthi_fn<<FUNC_SHF) | (Rs<<RS_SHF);
+}
+
+void MIPSAssembler::MTLO(int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (mtlo_fn<<FUNC_SHF) | (Rs<<RS_SHF);
+}
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Branch...
+#endif
+
+// temporarily forcing a NOP into branch-delay slot, just to be safe
+// todo: remove NOP, optimize use of delay slots
+void MIPSAssembler::B(const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+
+    // encoded as BEQ zero, zero, offset
+    *mPC++ = (beq_op<<OP_SHF) | (0<<RT_SHF)
+                        | (0<<RS_SHF) | 0;  // offset filled in later
+
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BEQ(int Rs, int Rt, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (beq_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BNE(int Rs, int Rt, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (bne_op<<OP_SHF) | (Rt<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BLEZ(int Rs, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (blez_op<<OP_SHF) | (0<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BLTZ(int Rs, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (regimm_op<<OP_SHF) | (bltz_fn<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::BGTZ(int Rs, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (bgtz_op<<OP_SHF) | (0<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+
+void MIPSAssembler::BGEZ(int Rs, const char* label)
+{
+    mBranchTargets.add(branch_target_t(label, mPC));
+    *mPC++ = (regimm_op<<OP_SHF) | (bgez_fn<<RT_SHF) | (Rs<<RS_SHF) | 0;
+    MIPSAssembler::NOP();
+}
+
+void MIPSAssembler::JR(int Rs)
+{
+    *mPC++ = (spec_op<<OP_SHF) | (Rs<<RS_SHF) | (jr_fn << FUNC_SHF);
+    MIPSAssembler::NOP();
+}
+
+
+#if 0
+#pragma mark -
+#pragma mark Synthesized Branch...
+#endif
+
+// synthetic variants of branches (using slt & friends)
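+// e.g. BGE(s, t, label) expands to:  slt at, s, t ; beq at, zero, label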
+void MIPSAssembler::BEQZ(int Rs, const char* label)
+{
+    BEQ(Rs, R_zero, label);
+}
+
+void MIPSAssembler::BNEZ(int Rs __unused, const char* label)
+{
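+    // note: branches on scratch R_at rather than Rs (marked unused); callers
+    // appear to stage the tested value in R_at first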
+    BNE(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BGE(int Rs, int Rt, const char* label)
+{
+    SLT(R_at, Rs, Rt);
+    BEQ(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BGEU(int Rs, int Rt, const char* label)
+{
+    SLTU(R_at, Rs, Rt);
+    BEQ(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BGT(int Rs, int Rt, const char* label)
+{
+    SLT(R_at, Rt, Rs);   // rev
+    BNE(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BGTU(int Rs, int Rt, const char* label)
+{
+    SLTU(R_at, Rt, Rs);   // rev
+    BNE(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BLE(int Rs, int Rt, const char* label)
+{
+    SLT(R_at, Rt, Rs);   // rev
+    BEQ(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BLEU(int Rs, int Rt, const char* label)
+{
+    SLTU(R_at, Rt, Rs);  // rev
+    BEQ(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BLT(int Rs, int Rt, const char* label)
+{
+    SLT(R_at, Rs, Rt);
+    BNE(R_at, R_zero, label);
+}
+
+void MIPSAssembler::BLTU(int Rs, int Rt, const char* label)
+{
+    SLTU(R_at, Rs, Rt);
+    BNE(R_at, R_zero, label);
+}
+
+
+
+
+#if 0
+#pragma mark -
+#pragma mark Misc...
+#endif
+
+void MIPSAssembler::NOP(void)
+{
+    // encoded as "sll zero, zero, 0", which is all zero
+    *mPC++ = (spec_op<<OP_SHF) | (sll_fn<<FUNC_SHF);
+}
+
+// using this as special opcode for not-yet-implemented ARM instruction
+void MIPSAssembler::NOP2(void)
+{
+    // encoded as "sll zero, zero, 2", still a nop, but a unique code
+    *mPC++ = (spec_op<<OP_SHF) | (sll_fn<<FUNC_SHF) | (2 << RE_SHF);
+}
+
+// using this as special opcode for purposefully NOT implemented ARM instruction
+void MIPSAssembler::UNIMPL(void)
+{
+    // encoded as "sll zero, zero, 3", still a nop, but a unique code
+    *mPC++ = (spec_op<<OP_SHF) | (sll_fn<<FUNC_SHF) | (3 << RE_SHF);
+}
+
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/MIPSAssembler.h b/libpixelflinger/codeflinger/MIPSAssembler.h
new file mode 100644
index 0000000..c1178b6
--- /dev/null
+++ b/libpixelflinger/codeflinger/MIPSAssembler.h
@@ -0,0 +1,557 @@
+/* libs/pixelflinger/codeflinger/MIPSAssembler.h
+**
+** Copyright 2012, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#ifndef ANDROID_MIPSASSEMBLER_H
+#define ANDROID_MIPSASSEMBLER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include "tinyutils/smartpointer.h"
+#include "utils/KeyedVector.h"
+#include "utils/Vector.h"
+
+#include "ARMAssemblerInterface.h"
+#include "CodeCache.h"
+
+namespace android {
+
+class MIPSAssembler;    // forward reference
+
+// this class mimics ARMAssembler interface
+//  intent is to translate each ARM instruction to 1 or more MIPS instr
+//  implementation calls MIPSAssembler class to generate mips code
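+//  e.g. a simple ARM add with a small immediate may map to one MIPS ADDIU,
+//  while conditional execution expands to compare/branch sequences (a rough
+//  sketch of the scheme; see cond_mode_t below)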
+class ArmToMipsAssembler : public ARMAssemblerInterface
+{
+public:
+                ArmToMipsAssembler(const sp<Assembly>& assembly,
+                        char *abuf = 0, int linesz = 0, int instr_count = 0);
+    virtual     ~ArmToMipsAssembler();
+
+    uint32_t*   base() const;
+    uint32_t*   pc() const;
+    void        disassemble(const char* name);
+
+    virtual void    reset();
+
+    virtual int     generate(const char* name);
+    virtual int     getCodegenArch();
+
+    virtual void    prolog();
+    virtual void    epilog(uint32_t touched);
+    virtual void    comment(const char* string);
+
+
+    // -----------------------------------------------------------------------
+    // shifters and addressing modes
+    // -----------------------------------------------------------------------
+
+    // shifters...
+    virtual bool        isValidImmediate(uint32_t immed);
+    virtual int         buildImmediate(uint32_t i, uint32_t& rot, uint32_t& imm);
+
+    virtual uint32_t    imm(uint32_t immediate);
+    virtual uint32_t    reg_imm(int Rm, int type, uint32_t shift);
+    virtual uint32_t    reg_rrx(int Rm);
+    virtual uint32_t    reg_reg(int Rm, int type, int Rs);
+
+    // addressing modes...
+    // LDR(B)/STR(B)/PLD
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed12_pre(int32_t immed12, int W=0);
+    virtual uint32_t    immed12_post(int32_t immed12);
+    virtual uint32_t    reg_scale_pre(int Rm, int type=0, uint32_t shift=0, int W=0);
+    virtual uint32_t    reg_scale_post(int Rm, int type=0, uint32_t shift=0);
+
+    // LDRH/LDRSB/LDRSH/STRH
+    // (immediate and Rm can be negative, which indicates U=0)
+    virtual uint32_t    immed8_pre(int32_t immed8, int W=0);
+    virtual uint32_t    immed8_post(int32_t immed8);
+    virtual uint32_t    reg_pre(int Rm, int W=0);
+    virtual uint32_t    reg_post(int Rm);
+
+
+
+
+    virtual void    dataProcessing(int opcode, int cc, int s,
+                                int Rd, int Rn,
+                                uint32_t Op2);
+    virtual void MLA(int cc, int s,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void MUL(int cc, int s,
+                int Rd, int Rm, int Rs);
+    virtual void UMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void UMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMULL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+    virtual void SMUAL(int cc, int s,
+                int RdLo, int RdHi, int Rm, int Rs);
+
+    virtual void B(int cc, uint32_t* pc);
+    virtual void BL(int cc, uint32_t* pc);
+    virtual void BX(int cc, int Rn);
+    virtual void label(const char* theLabel);
+    virtual void B(int cc, const char* label);
+    virtual void BL(int cc, const char* label);
+
+    virtual uint32_t* pcForLabel(const char* label);
+
+    virtual void LDR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STR (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSB(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void LDRSH(int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+    virtual void STRH (int cc, int Rd,
+                int Rn, uint32_t offset = 0);
+
+    virtual void LDM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+    virtual void STM(int cc, int dir,
+                int Rn, int W, uint32_t reg_list);
+
+    virtual void SWP(int cc, int Rn, int Rd, int Rm);
+    virtual void SWPB(int cc, int Rn, int Rd, int Rm);
+    virtual void SWI(int cc, uint32_t comment);
+
+    virtual void PLD(int Rn, uint32_t offset);
+    virtual void CLZ(int cc, int Rd, int Rm);
+    virtual void QADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QDADD(int cc, int Rd, int Rm, int Rn);
+    virtual void QSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void QDSUB(int cc, int Rd, int Rm, int Rn);
+    virtual void SMUL(int cc, int xy,
+                int Rd, int Rm, int Rs);
+    virtual void SMULW(int cc, int y,
+                int Rd, int Rm, int Rs);
+    virtual void SMLA(int cc, int xy,
+                int Rd, int Rm, int Rs, int Rn);
+    virtual void SMLAL(int cc, int xy,
+                int RdHi, int RdLo, int Rs, int Rm);
+    virtual void SMLAW(int cc, int y,
+                int Rd, int Rm, int Rs, int Rn);
+
+    // byte/half word extract...
+    virtual void UXTB16(int cc, int Rd, int Rm, int rotate);
+
+    // bit manipulation...
+    virtual void UBFX(int cc, int Rd, int Rn, int lsb, int width);
+
+    // shared with the MIPSAssembler class, for debug output
+    char *      mArmDisassemblyBuffer;
+    int         mArmLineLength;
+    int         mArmInstrCount;
+
+    int         mInum;      // current arm instruction number (0..n)
+    uint32_t**  mArmPC;     // array: PC for 1st mips instr of
+                            //      each translated ARM instr
+
+
+private:
+    ArmToMipsAssembler(const ArmToMipsAssembler& rhs);
+    ArmToMipsAssembler& operator = (const ArmToMipsAssembler& rhs);
+
+    void init_conditional_labels(void);
+
+    void protectConditionalOperands(int Rd);
+
+    // reg_tmp defaults to MIPS AT, reg 1
+    int dataProcAdrModes(int op, int& source, bool sign = false, int reg_tmp = 1);
+
+    sp<Assembly>        mAssembly;
+    MIPSAssembler*      mMips;
+
+
+    enum misc_constants_t {
+        ARM_MAX_INSTUCTIONS = 512  // based on ASSEMBLY_SCRATCH_SIZE
+    };
+
+    enum {
+        SRC_REG = 0,
+        SRC_IMM,
+        SRC_ERROR = -1
+    };
+
+    enum addr_modes {
+        // start above the range of legal mips reg #'s (0-31)
+        AMODE_REG = 0x20,
+        AMODE_IMM, AMODE_REG_IMM,               // for data processing
+        AMODE_IMM_12_PRE, AMODE_IMM_12_POST,    // for load/store
+        AMODE_REG_SCALE_PRE, AMODE_IMM_8_PRE,
+        AMODE_IMM_8_POST, AMODE_REG_PRE,
+        AMODE_UNSUPPORTED
+    };
+
+    struct addr_mode_t {    // address modes for current ARM instruction
+        int         reg;
+        int         stype;
+        uint32_t    value;
+        bool        writeback;  // writeback the adr reg after modification
+    } amode;
+
+    enum cond_types {
+        CMP_COND = 1,
+        SBIT_COND
+    };
+
+    struct cond_mode_t {    // conditional-execution info for current ARM instruction
+        cond_types  type;
+        int         r1;
+        int         r2;
+        int         labelnum;
+        char        label[100][10];
+    } cond;
+
+};
+
+
+
+
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+// ----------------------------------------------------------------------------
+
+// This is the basic MIPS assembler, which just creates the opcodes in memory.
+// All the more complicated work is done in ArmToMipsAssembler above.
+
+class MIPSAssembler
+{
+public:
+                MIPSAssembler(const sp<Assembly>& assembly, ArmToMipsAssembler *parent);
+                MIPSAssembler(void* assembly);
+    virtual     ~MIPSAssembler();
+
+    virtual uint32_t*   base() const;
+    virtual uint32_t*   pc() const;
+    virtual void        reset();
+
+    virtual void        disassemble(const char* name);
+
+    virtual void        prolog();
+    virtual void        epilog(uint32_t touched);
+    virtual int         generate(const char* name);
+    virtual void        comment(const char* string);
+    virtual void        label(const char* string);
+
+    // valid only after generate() has been called
+    virtual uint32_t*   pcForLabel(const char* label);
+
+
+    // ------------------------------------------------------------------------
+    // MIPSAssemblerInterface...
+    // ------------------------------------------------------------------------
+
+#if 0
+#pragma mark -
+#pragma mark Arithmetic...
+#endif
+
+    void ADDU(int Rd, int Rs, int Rt);
+    void ADDIU(int Rt, int Rs, int16_t imm);
+    void SUBU(int Rd, int Rs, int Rt);
+    void SUBIU(int Rt, int Rs, int16_t imm);
+    void NEGU(int Rd, int Rs);
+    void MUL(int Rd, int Rs, int Rt);
+    void MULT(int Rs, int Rt);      // dest is hi,lo
+    void MULTU(int Rs, int Rt);     // dest is hi,lo
+    void MADD(int Rs, int Rt);      // hi,lo = hi,lo + Rs * Rt
+    void MADDU(int Rs, int Rt);     // hi,lo = hi,lo + Rs * Rt
+    void MSUB(int Rs, int Rt);      // hi,lo = hi,lo - Rs * Rt
+    void MSUBU(int Rs, int Rt);     // hi,lo = hi,lo - Rs * Rt
+    void SEB(int Rd, int Rt);       // sign-extend byte (mips32r2)
+    void SEH(int Rd, int Rt);       // sign-extend half-word (mips32r2)
+
+
+#if 0
+#pragma mark -
+#pragma mark Comparisons...
+#endif
+
+    void SLT(int Rd, int Rs, int Rt);
+    void SLTI(int Rt, int Rs, int16_t imm);
+    void SLTU(int Rd, int Rs, int Rt);
+    void SLTIU(int Rt, int Rs, int16_t imm);
+
+
+#if 0
+#pragma mark -
+#pragma mark Logical...
+#endif
+
+    void AND(int Rd, int Rs, int Rt);
+    void ANDI(int Rt, int Rs, uint16_t imm);
+    void OR(int Rd, int Rs, int Rt);
+    void ORI(int Rt, int Rs, uint16_t imm);
+    void NOR(int Rd, int Rs, int Rt);
+    void NOT(int Rd, int Rs);
+    void XOR(int Rd, int Rs, int Rt);
+    void XORI(int Rt, int Rs, uint16_t imm);
+
+    void SLL(int Rd, int Rt, int shft);
+    void SLLV(int Rd, int Rt, int Rs);
+    void SRL(int Rd, int Rt, int shft);
+    void SRLV(int Rd, int Rt, int Rs);
+    void SRA(int Rd, int Rt, int shft);
+    void SRAV(int Rd, int Rt, int Rs);
+    void ROTR(int Rd, int Rt, int shft);    // mips32r2
+    void ROTRV(int Rd, int Rt, int Rs);     // mips32r2
+    void RORsyn(int Rd, int Rt, int Rs);    // synthetic: d = t rotated by s
+    void RORIsyn(int Rd, int Rt, int rot);  // synthetic: d = t rotated by immed rot
+
+    void CLO(int Rd, int Rs);
+    void CLZ(int Rd, int Rs);
+    void WSBH(int Rd, int Rt);
+
+
+#if 0
+#pragma mark -
+#pragma mark Load/store...
+#endif
+
+    void LW(int Rt, int Rbase, int16_t offset);
+    void SW(int Rt, int Rbase, int16_t offset);
+    void LB(int Rt, int Rbase, int16_t offset);
+    void LBU(int Rt, int Rbase, int16_t offset);
+    void SB(int Rt, int Rbase, int16_t offset);
+    void LH(int Rt, int Rbase, int16_t offset);
+    void LHU(int Rt, int Rbase, int16_t offset);
+    void SH(int Rt, int Rbase, int16_t offset);
+    void LUI(int Rt, int16_t offset);
+
+#if 0
+#pragma mark -
+#pragma mark Register moves...
+#endif
+
+    void MOVE(int Rd, int Rs);
+    void MOVN(int Rd, int Rs, int Rt);
+    void MOVZ(int Rd, int Rs, int Rt);
+    void MFHI(int Rd);
+    void MFLO(int Rd);
+    void MTHI(int Rs);
+    void MTLO(int Rs);
+
+#if 0
+#pragma mark -
+#pragma mark Branch...
+#endif
+
+    void B(const char* label);
+    void BEQ(int Rs, int Rt, const char* label);
+    void BNE(int Rs, int Rt, const char* label);
+    void BGEZ(int Rs, const char* label);
+    void BGTZ(int Rs, const char* label);
+    void BLEZ(int Rs, const char* label);
+    void BLTZ(int Rs, const char* label);
+    void JR(int Rs);
+
+
+#if 0
+#pragma mark -
+#pragma mark Synthesized Branch...
+#endif
+
+    // synthetic variants of above (using slt & friends)
+    void BEQZ(int Rs, const char* label);
+    void BNEZ(int Rs, const char* label);
+    void BGE(int Rs, int Rt, const char* label);
+    void BGEU(int Rs, int Rt, const char* label);
+    void BGT(int Rs, int Rt, const char* label);
+    void BGTU(int Rs, int Rt, const char* label);
+    void BLE(int Rs, int Rt, const char* label);
+    void BLEU(int Rs, int Rt, const char* label);
+    void BLT(int Rs, int Rt, const char* label);
+    void BLTU(int Rs, int Rt, const char* label);
+
+#if 0
+#pragma mark -
+#pragma mark Misc...
+#endif
+
+    void NOP(void);
+    void NOP2(void);
+    void UNIMPL(void);
+
+
+
+
+
+protected:
+    virtual void string_detab(char *s);
+    virtual void string_pad(char *s, int padded_len);
+
+    ArmToMipsAssembler *mParent;
+    sp<Assembly>    mAssembly;
+    uint32_t*       mBase;
+    uint32_t*       mPC;
+    uint32_t*       mPrologPC;
+    int64_t         mDuration;
+
+    struct branch_target_t {
+        inline branch_target_t() : label(0), pc(0) { }
+        inline branch_target_t(const char* l, uint32_t* p)
+            : label(l), pc(p) { }
+        const char* label;
+        uint32_t*   pc;
+    };
+
+    Vector<branch_target_t>                 mBranchTargets;
+    KeyedVector< const char*, uint32_t* >   mLabels;
+    KeyedVector< uint32_t*, const char* >   mLabelsInverseMapping;
+    KeyedVector< uint32_t*, const char* >   mComments;
+
+
+
+
+    // opcode field of all instructions
+    enum opcode_field {
+        spec_op, regimm_op, j_op, jal_op,           // 00
+        beq_op, bne_op, blez_op, bgtz_op,
+        addi_op, addiu_op, slti_op, sltiu_op,       // 08
+        andi_op, ori_op, xori_op, lui_op,
+        cop0_op, cop1_op, cop2_op, cop1x_op,        // 10
+        beql_op, bnel_op, blezl_op, bgtzl_op,
+        daddi_op, daddiu_op, ldl_op, ldr_op,        // 18
+        spec2_op, jalx_op, mdmx_op, spec3_op,
+        lb_op, lh_op, lwl_op, lw_op,                // 20
+        lbu_op, lhu_op, lwr_op, lwu_op,
+        sb_op, sh_op, swl_op, sw_op,                // 28
+        sdl_op, sdr_op, swr_op, cache_op,
+        ll_op, lwc1_op, lwc2_op, pref_op,           // 30
+        lld_op, ldc1_op, ldc2_op, ld_op,
+        sc_op, swc1_op, swc2_op, rsrv_3b_op,        // 38
+        scd_op, sdc1_op, sdc2_op, sd_op
+    };
+
+
+    // func field for special opcode
+    enum func_spec_op {
+        sll_fn, movc_fn, srl_fn, sra_fn,            // 00
+        sllv_fn, pmon_fn, srlv_fn, srav_fn,
+        jr_fn, jalr_fn, movz_fn, movn_fn,           // 08
+        syscall_fn, break_fn, spim_fn, sync_fn,
+        mfhi_fn, mthi_fn, mflo_fn, mtlo_fn,         // 10
+        dsllv_fn, rsrv_spec_2, dsrlv_fn, dsrav_fn,
+        mult_fn, multu_fn, div_fn, divu_fn,         // 18
+        dmult_fn, dmultu_fn, ddiv_fn, ddivu_fn,
+        add_fn, addu_fn, sub_fn, subu_fn,           // 20
+        and_fn, or_fn, xor_fn, nor_fn,
+        rsrv_spec_3, rsrv_spec_4, slt_fn, sltu_fn,  // 28
+        dadd_fn, daddu_fn, dsub_fn, dsubu_fn,
+        tge_fn, tgeu_fn, tlt_fn, tltu_fn,           // 30
+        teq_fn, rsrv_spec_5, tne_fn, rsrv_spec_6,
+        dsll_fn, rsrv_spec_7, dsrl_fn, dsra_fn,     // 38
+        dsll32_fn, rsrv_spec_8, dsrl32_fn, dsra32_fn
+    };
+
+    // func field for spec2 opcode
+    enum func_spec2_op {
+        madd_fn, maddu_fn, mul_fn, rsrv_spec2_3,
+        msub_fn, msubu_fn,
+        clz_fn = 0x20, clo_fn,
+        dclz_fn = 0x24, dclo_fn,
+        sdbbp_fn = 0x3f
+    };
+
+    // func field for spec3 opcode
+    enum func_spec3_op {
+        ext_fn, dextm_fn, dextu_fn, dext_fn,
+        ins_fn, dinsm_fn, dinsu_fn, dins_fn,
+        bshfl_fn = 0x20,
+        dbshfl_fn = 0x24,
+        rdhwr_fn = 0x3b
+    };
+
+    // sa field for spec3 opcodes, with BSHFL function
+    enum func_spec3_bshfl {
+        wsbh_fn = 0x02,
+        seb_fn = 0x10,
+        seh_fn = 0x18
+    };
+
+    // rt field of regimm opcodes.
+    enum regimm_fn {
+        bltz_fn, bgez_fn, bltzl_fn, bgezl_fn,
+        rsrv_ri_fn4, rsrv_ri_fn5, rsrv_ri_fn6, rsrv_ri_fn7,
+        tgei_fn, tgeiu_fn, tlti_fn, tltiu_fn,
+        teqi_fn, rsrv_ri_fn_0d, tnei_fn, rsrv_ri_fn0f,
+        bltzal_fn, bgezal_fn, bltzall_fn, bgezall_fn,
+        bposge32_fn= 0x1c,
+        synci_fn = 0x1f
+    };
+
+
+    // func field for mad opcodes (MIPS IV).
+    enum mad_func {
+        madd_fp_op      = 0x08, msub_fp_op      = 0x0a,
+        nmadd_fp_op     = 0x0c, nmsub_fp_op     = 0x0e
+    };
+
+
+    enum mips_inst_shifts {
+        OP_SHF       = 26,
+        JTARGET_SHF  = 0,
+        RS_SHF       = 21,
+        RT_SHF       = 16,
+        RD_SHF       = 11,
+        RE_SHF       = 6,
+        SA_SHF       = RE_SHF,  // synonym
+        IMM_SHF      = 0,
+        FUNC_SHF     = 0,
+
+        // mask values
+        MSK_16       = 0xffff,
+
+
+        CACHEOP_SHF  = 18,
+        CACHESEL_SHF = 16,
+    };
+};
+
+enum mips_regnames {
+    R_zero = 0,
+            R_at,   R_v0,   R_v1,   R_a0,   R_a1,   R_a2,   R_a3,
+#if __mips_isa_rev < 6
+    R_t0,   R_t1,   R_t2,   R_t3,   R_t4,   R_t5,   R_t6,   R_t7,
+#else
+    R_a4,   R_a5,   R_a6,   R_a7,   R_t0,   R_t1,   R_t2,   R_t3,
+#endif
+    R_s0,   R_s1,   R_s2,   R_s3,   R_s4,   R_s5,   R_s6,   R_s7,
+    R_t8,   R_t9,   R_k0,   R_k1,   R_gp,   R_sp,   R_s8,   R_ra,
+    R_lr = R_s8,
+
+    // arm regs 0-15 are mips regs 2-17 (meaning s0 & s1 are used)
+    R_at2  = R_s2,    // R_at2 = 18 = s2
+    R_cmp  = R_s3,    // R_cmp = 19 = s3
+    R_cmp2 = R_s4     // R_cmp2 = 20 = s4
+};
+
+
+
+}; // namespace android
+
+#endif //ANDROID_MIPSASSEMBLER_H
diff --git a/libpixelflinger/codeflinger/armreg.h b/libpixelflinger/codeflinger/armreg.h
new file mode 100644
index 0000000..fde81ba
--- /dev/null
+++ b/libpixelflinger/codeflinger/armreg.h
@@ -0,0 +1,300 @@
+/*	$NetBSD: armreg.h,v 1.28 2003/10/31 16:30:15 scw Exp $	*/
+
+/*-
+ * Copyright (c) 1998, 2001 Ben Harris
+ * Copyright (c) 1994-1996 Mark Brinicombe.
+ * Copyright (c) 1994 Brini.
+ * All rights reserved.
+ *
+ * This code is derived from software written for Brini by Mark Brinicombe
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Brini.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/armreg.h,v 1.3 2005/11/21 19:06:25 cognet Exp $
+ */
+
+#ifndef MACHINE_ARMREG_H
+#define MACHINE_ARMREG_H
+#define INSN_SIZE	4
+#define INSN_COND_MASK	0xf0000000	/* Condition mask */
+#define PSR_MODE        0x0000001f      /* mode mask */
+#define PSR_USR26_MODE  0x00000000
+#define PSR_FIQ26_MODE  0x00000001
+#define PSR_IRQ26_MODE  0x00000002
+#define PSR_SVC26_MODE  0x00000003
+#define PSR_USR32_MODE  0x00000010
+#define PSR_FIQ32_MODE  0x00000011
+#define PSR_IRQ32_MODE  0x00000012
+#define PSR_SVC32_MODE  0x00000013
+#define PSR_ABT32_MODE  0x00000017
+#define PSR_UND32_MODE  0x0000001b
+#define PSR_SYS32_MODE  0x0000001f
+#define PSR_32_MODE     0x00000010
+#define PSR_FLAGS	0xf0000000    /* flags */
+
+#define PSR_C_bit (1 << 29)       /* carry */
+
+/* The high-order byte is always the implementor */
+#define CPU_ID_IMPLEMENTOR_MASK	0xff000000
+#define CPU_ID_ARM_LTD		0x41000000 /* 'A' */
+#define CPU_ID_DEC		0x44000000 /* 'D' */
+#define CPU_ID_INTEL		0x69000000 /* 'i' */
+#define	CPU_ID_TI		0x54000000 /* 'T' */
+
+/* How to decide what format the CPUID is in. */
+#define CPU_ID_ISOLD(x)		(((x) & 0x0000f000) == 0x00000000)
+#define CPU_ID_IS7(x)		(((x) & 0x0000f000) == 0x00007000)
+#define CPU_ID_ISNEW(x)		(!CPU_ID_ISOLD(x) && !CPU_ID_IS7(x))
+
+/* On ARM3 and ARM6, this byte holds the foundry ID. */
+#define CPU_ID_FOUNDRY_MASK	0x00ff0000
+#define CPU_ID_FOUNDRY_VLSI	0x00560000
+
+/* On ARM7 it holds the architecture and variant (sub-model) */
+#define CPU_ID_7ARCH_MASK	0x00800000
+#define CPU_ID_7ARCH_V3		0x00000000
+#define CPU_ID_7ARCH_V4T	0x00800000
+#define CPU_ID_7VARIANT_MASK	0x007f0000
+
+/* On more recent ARMs, it does the same, but in a different format */
+#define CPU_ID_ARCH_MASK	0x000f0000
+#define CPU_ID_ARCH_V3		0x00000000
+#define CPU_ID_ARCH_V4		0x00010000
+#define CPU_ID_ARCH_V4T		0x00020000
+#define CPU_ID_ARCH_V5		0x00030000
+#define CPU_ID_ARCH_V5T		0x00040000
+#define CPU_ID_ARCH_V5TE	0x00050000
+#define CPU_ID_VARIANT_MASK	0x00f00000
+
+/* Next three nybbles are part number */
+#define CPU_ID_PARTNO_MASK	0x0000fff0
+
+/* Intel XScale has sub fields in part number */
+#define CPU_ID_XSCALE_COREGEN_MASK	0x0000e000 /* core generation */
+#define CPU_ID_XSCALE_COREREV_MASK	0x00001c00 /* core revision */
+#define CPU_ID_XSCALE_PRODUCT_MASK	0x000003f0 /* product number */
+
+/* And finally, the revision number. */
+#define CPU_ID_REVISION_MASK	0x0000000f
+
+/* Individual CPUs are probably best IDed by everything but the revision. */
+#define CPU_ID_CPU_MASK		0xfffffff0
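+
+/*
+ * Worked example (informative): CPU_ID_ARM920T = 0x41129200 decodes with
+ * the masks above as implementor 0x41 ('A', ARM Ltd), variant 0x1,
+ * architecture 0x2 (v4T), part number 0x920, revision 0.
+ */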
+
+/* Fake CPU IDs for ARMs without CP15 */
+#define CPU_ID_ARM2		0x41560200
+#define CPU_ID_ARM250		0x41560250
+
+/* Pre-ARM7 CPUs -- [15:12] == 0 */
+#define CPU_ID_ARM3		0x41560300
+#define CPU_ID_ARM600		0x41560600
+#define CPU_ID_ARM610		0x41560610
+#define CPU_ID_ARM620		0x41560620
+
+/* ARM7 CPUs -- [15:12] == 7 */
+#define CPU_ID_ARM700		0x41007000 /* XXX This is a guess. */
+#define CPU_ID_ARM710		0x41007100
+#define CPU_ID_ARM7500		0x41027100 /* XXX This is a guess. */
+#define CPU_ID_ARM710A		0x41047100 /* inc ARM7100 */
+#define CPU_ID_ARM7500FE	0x41077100
+#define CPU_ID_ARM710T		0x41807100
+#define CPU_ID_ARM720T		0x41807200
+#define CPU_ID_ARM740T8K	0x41807400 /* XXX no MMU, 8KB cache */
+#define CPU_ID_ARM740T4K	0x41817400 /* XXX no MMU, 4KB cache */
+
+/* Post-ARM7 CPUs */
+#define CPU_ID_ARM810		0x41018100
+#define CPU_ID_ARM920T		0x41129200
+#define CPU_ID_ARM920T_ALT	0x41009200
+#define CPU_ID_ARM922T		0x41029220
+#define CPU_ID_ARM940T		0x41029400 /* XXX no MMU */
+#define CPU_ID_ARM946ES		0x41049460 /* XXX no MMU */
+#define	CPU_ID_ARM966ES		0x41049660 /* XXX no MMU */
+#define	CPU_ID_ARM966ESR1	0x41059660 /* XXX no MMU */
+#define CPU_ID_ARM1020E		0x4115a200 /* (AKA arm10 rev 1) */
+#define CPU_ID_ARM1022ES	0x4105a220
+#define CPU_ID_SA110		0x4401a100
+#define CPU_ID_SA1100		0x4401a110
+#define	CPU_ID_TI925T		0x54029250
+#define CPU_ID_SA1110		0x6901b110
+#define CPU_ID_IXP1200		0x6901c120
+#define CPU_ID_80200		0x69052000
+#define CPU_ID_PXA250    	0x69052100 /* sans core revision */
+#define CPU_ID_PXA210    	0x69052120
+#define CPU_ID_PXA250A		0x69052100 /* 1st version Core */
+#define CPU_ID_PXA210A		0x69052120 /* 1st version Core */
+#define CPU_ID_PXA250B		0x69052900 /* 3rd version Core */
+#define CPU_ID_PXA210B		0x69052920 /* 3rd version Core */
+#define CPU_ID_PXA250C		0x69052d00 /* 4th version Core */
+#define CPU_ID_PXA210C		0x69052d20 /* 4th version Core */
+#define	CPU_ID_80321_400	0x69052420
+#define	CPU_ID_80321_600	0x69052430
+#define	CPU_ID_80321_400_B0	0x69052c20
+#define	CPU_ID_80321_600_B0	0x69052c30
+#define	CPU_ID_IXP425_533	0x690541c0
+#define	CPU_ID_IXP425_400	0x690541d0
+#define	CPU_ID_IXP425_266	0x690541f0
+
+/* ARM3-specific coprocessor 15 registers */
+#define ARM3_CP15_FLUSH		1
+#define ARM3_CP15_CONTROL	2
+#define ARM3_CP15_CACHEABLE	3
+#define ARM3_CP15_UPDATEABLE	4
+#define ARM3_CP15_DISRUPTIVE	5	
+
+/* ARM3 Control register bits */
+#define ARM3_CTL_CACHE_ON	0x00000001
+#define ARM3_CTL_SHARED		0x00000002
+#define ARM3_CTL_MONITOR	0x00000004
+
+/*
+ * Post-ARM3 CP15 registers:
+ *
+ *	1	Control register
+ *
+ *	2	Translation Table Base
+ *
+ *	3	Domain Access Control
+ *
+ *	4	Reserved
+ *
+ *	5	Fault Status
+ *
+ *	6	Fault Address
+ *
+ *	7	Cache/write-buffer Control
+ *
+ *	8	TLB Control
+ *
+ *	9	Cache Lockdown
+ *
+ *	10	TLB Lockdown
+ *
+ *	11	Reserved
+ *
+ *	12	Reserved
+ *
+ *	13	Process ID (for FCSE)
+ *
+ *	14	Reserved
+ *
+ *	15	Implementation Dependent
+ */
+
+/* Some of the definitions below need cleaning up for V3/V4 architectures */
+
+/* CPU control register (CP15 register 1) */
+#define CPU_CONTROL_MMU_ENABLE	0x00000001 /* M: MMU/Protection unit enable */
+#define CPU_CONTROL_AFLT_ENABLE	0x00000002 /* A: Alignment fault enable */
+#define CPU_CONTROL_DC_ENABLE	0x00000004 /* C: IDC/DC enable */
+#define CPU_CONTROL_WBUF_ENABLE 0x00000008 /* W: Write buffer enable */
+#define CPU_CONTROL_32BP_ENABLE 0x00000010 /* P: 32-bit exception handlers */
+#define CPU_CONTROL_32BD_ENABLE 0x00000020 /* D: 32-bit addressing */
+#define CPU_CONTROL_LABT_ENABLE 0x00000040 /* L: Late abort enable */
+#define CPU_CONTROL_BEND_ENABLE 0x00000080 /* B: Big-endian mode */
+#define CPU_CONTROL_SYST_ENABLE 0x00000100 /* S: System protection bit */
+#define CPU_CONTROL_ROM_ENABLE	0x00000200 /* R: ROM protection bit */
+#define CPU_CONTROL_CPCLK	0x00000400 /* F: Implementation defined */
+#define CPU_CONTROL_BPRD_ENABLE 0x00000800 /* Z: Branch prediction enable */
+#define CPU_CONTROL_IC_ENABLE   0x00001000 /* I: IC enable */
+#define CPU_CONTROL_VECRELOC	0x00002000 /* V: Vector relocation */
+#define CPU_CONTROL_ROUNDROBIN	0x00004000 /* RR: Predictable replacement */
+#define CPU_CONTROL_V4COMPAT	0x00008000 /* L4: ARMv4 compat LDR R15 etc */
+
+#define CPU_CONTROL_IDC_ENABLE	CPU_CONTROL_DC_ENABLE
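+
+/*
+ * Informative example: a typical "MMU plus both caches on" value ORs the
+ * bits above, e.g. CPU_CONTROL_MMU_ENABLE | CPU_CONTROL_DC_ENABLE |
+ * CPU_CONTROL_IC_ENABLE == 0x00001005.
+ */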
+
+/* XScale Auxiliary Control Register (CP15 register 1, opcode2 1) */
+#define	XSCALE_AUXCTL_K		0x00000001 /* dis. write buffer coalescing */
+#define	XSCALE_AUXCTL_P		0x00000002 /* ECC protect page table access */
+#define	XSCALE_AUXCTL_MD_WB_RA	0x00000000 /* mini-D$ wb, read-allocate */
+#define	XSCALE_AUXCTL_MD_WB_RWA	0x00000010 /* mini-D$ wb, read/write-allocate */
+#define	XSCALE_AUXCTL_MD_WT	0x00000020 /* mini-D$ wt, read-allocate */
+#define	XSCALE_AUXCTL_MD_MASK	0x00000030
+
+/* Cache type register definitions */
+#define	CPU_CT_ISIZE(x)		((x) & 0xfff)		/* I$ info */
+#define	CPU_CT_DSIZE(x)		(((x) >> 12) & 0xfff)	/* D$ info */
+#define	CPU_CT_S		(1U << 24)		/* split cache */
+#define	CPU_CT_CTYPE(x)		(((x) >> 25) & 0xf)	/* cache type */
+
+#define	CPU_CT_CTYPE_WT		0	/* write-through */
+#define	CPU_CT_CTYPE_WB1	1	/* write-back, clean w/ read */
+#define	CPU_CT_CTYPE_WB2	2	/* w/b, clean w/ cp15,7 */
+#define	CPU_CT_CTYPE_WB6	6	/* w/b, cp15,7, lockdown fmt A */
+#define	CPU_CT_CTYPE_WB7	7	/* w/b, cp15,7, lockdown fmt B */
+
+#define	CPU_CT_xSIZE_LEN(x)	((x) & 0x3)		/* line size */
+#define	CPU_CT_xSIZE_M		(1U << 2)		/* multiplier */
+#define	CPU_CT_xSIZE_ASSOC(x)	(((x) >> 3) & 0x7)	/* associativity */
+#define	CPU_CT_xSIZE_SIZE(x)	(((x) >> 6) & 0x7)	/* size */
+
+/* Fault status register definitions */
+
+#define FAULT_TYPE_MASK 0x0f
+#define FAULT_USER      0x10
+
+#define FAULT_WRTBUF_0  0x00 /* Vector Exception */
+#define FAULT_WRTBUF_1  0x02 /* Terminal Exception */
+#define FAULT_BUSERR_0  0x04 /* External Abort on Linefetch -- Section */
+#define FAULT_BUSERR_1  0x06 /* External Abort on Linefetch -- Page */
+#define FAULT_BUSERR_2  0x08 /* External Abort on Non-linefetch -- Section */
+#define FAULT_BUSERR_3  0x0a /* External Abort on Non-linefetch -- Page */
+#define FAULT_BUSTRNL1  0x0c /* External abort on Translation -- Level 1 */
+#define FAULT_BUSTRNL2  0x0e /* External abort on Translation -- Level 2 */
+#define FAULT_ALIGN_0   0x01 /* Alignment */
+#define FAULT_ALIGN_1   0x03 /* Alignment */
+#define FAULT_TRANS_S   0x05 /* Translation -- Section */
+#define FAULT_TRANS_P   0x07 /* Translation -- Page */
+#define FAULT_DOMAIN_S  0x09 /* Domain -- Section */
+#define FAULT_DOMAIN_P  0x0b /* Domain -- Page */
+#define FAULT_PERM_S    0x0d /* Permission -- Section */
+#define FAULT_PERM_P    0x0f /* Permission -- Page */
+
+#define	FAULT_IMPRECISE	0x400	/* Imprecise exception (XSCALE) */
+
+/*
+ * Address of the vector page, low and high versions.
+ */
+#define	ARM_VECTORS_LOW		0x00000000U
+#define	ARM_VECTORS_HIGH	0xffff0000U
+
+/*
+ * ARM Instructions
+ *
+ *       3 3 2 2 2                              
+ *       1 0 9 8 7                                                     0
+ *      +-------+-------------------------------------------------------+
+ *      | cond  |              instruction dependent                    |
+ *      |c c c c|                                                       |
+ *      +-------+-------------------------------------------------------+
+ */
+
+#define INSN_SIZE		4		/* Always 4 bytes */
+#define INSN_COND_MASK		0xf0000000	/* Condition mask */
+#define INSN_COND_AL		0xe0000000	/* Always condition */
+
+#endif /* !MACHINE_ARMREG_H */
diff --git a/libpixelflinger/codeflinger/blending.cpp b/libpixelflinger/codeflinger/blending.cpp
new file mode 100644
index 0000000..2cbb00f
--- /dev/null
+++ b/libpixelflinger/codeflinger/blending.cpp
@@ -0,0 +1,674 @@
+/* libs/pixelflinger/codeflinger/blending.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "pixelflinger-code"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <android-base/macros.h>
+#include <log/log.h>
+
+#include "GGLAssembler.h"
+
+namespace android {
+
+void GGLAssembler::build_fog(
+                        component_t& temp,      // incoming fragment / output
+                        int component,
+                        Scratch& regs)
+{
+   if (mInfo[component].fog) {
+        Scratch scratches(registerFile());
+        comment("fog");
+
+        integer_t fragment(temp.reg, temp.h, temp.flags);
+        if (!(temp.flags & CORRUPTIBLE)) {
+            temp.reg = regs.obtain();
+            temp.flags |= CORRUPTIBLE;
+        }
+
+        integer_t fogColor(scratches.obtain(), 8, CORRUPTIBLE); 
+        LDRB(AL, fogColor.reg, mBuilderContext.Rctx,
+                immed12_pre(GGL_OFFSETOF(state.fog.color[component])));
+
+        integer_t factor(scratches.obtain(), 16, CORRUPTIBLE);
+        CONTEXT_LOAD(factor.reg, generated_vars.f);
+
+        // clamp fog factor (TODO: see if there is a way to guarantee
+        // we won't overflow when setting the iterators)
+        BIC(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, ASR, 31));
+        CMP(AL, factor.reg, imm( 0x10000 ));
+        MOV(HS, 0, factor.reg, imm( 0x10000 ));
+
+        build_blendFOneMinusF(temp, factor, fragment, fogColor);
+    }
+}
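+
+// Informative note (not in the original source): build_fog above computes
+// C = Cf*f + Cfog*(1-f) via build_blendFOneMinusF, with the fog factor f
+// clamped to the range [0, 0x10000] (a 16-bit fraction).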
+
+void GGLAssembler::build_blending(
+                        component_t& temp,      // incoming fragment / output
+                        const pixel_t& pixel,   // framebuffer
+                        int component,
+                        Scratch& regs)
+{
+   if (!mInfo[component].blend)
+        return;
+        
+    int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
+    int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
+    if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA)
+        fs = GGL_ONE;
+    const int blending = blending_codes(fs, fd);
+    if (!temp.size()) {
+        // here, blending will produce something which doesn't depend on
+        // that component (eg: GL_ZERO:GL_*), so the register has not been
+        // allocated yet. It will never be used as a source.
+        temp = component_t(regs.obtain(), CORRUPTIBLE);
+    }
+
+    // we are doing real blending...
+    // fb:          extracted dst
+    // fragment:    extracted src
+    // temp:        component_t(fragment) and result
+
+    // scoped register allocator
+    Scratch scratches(registerFile());
+    comment("blending");
+
+    // we can optimize these cases a bit...
+    // (1) saturation is not needed
+    // (2) we can use only one multiply instead of 2
+    // (3) we can reduce the register pressure
+    //      R = S*f + D*(1-f) = (S-D)*f + D
+    //      R = S*(1-f) + D*f = (D-S)*f + S
+
+    const bool same_factor_opt1 =
+        (fs==GGL_DST_COLOR && fd==GGL_ONE_MINUS_DST_COLOR) ||
+        (fs==GGL_SRC_COLOR && fd==GGL_ONE_MINUS_SRC_COLOR) ||
+        (fs==GGL_DST_ALPHA && fd==GGL_ONE_MINUS_DST_ALPHA) ||
+        (fs==GGL_SRC_ALPHA && fd==GGL_ONE_MINUS_SRC_ALPHA);
+
+    const bool same_factor_opt2 =
+        (fs==GGL_ONE_MINUS_DST_COLOR && fd==GGL_DST_COLOR) ||
+        (fs==GGL_ONE_MINUS_SRC_COLOR && fd==GGL_SRC_COLOR) || 
+        (fs==GGL_ONE_MINUS_DST_ALPHA && fd==GGL_DST_ALPHA) ||
+        (fs==GGL_ONE_MINUS_SRC_ALPHA && fd==GGL_SRC_ALPHA);
+
+
+    // XXX: we could also optimize these cases:
+    // R = S*f + D*f = (S+D)*f
+    // R = S*(1-f) + D*(1-f) = (S+D)*(1-f)
+    // R = S*D + D*S = 2*S*D
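+    //
+    // Numeric sanity check of the rewrite above (informative): with
+    // f = 0.25, S = 0.8, D = 0.4,
+    //   S*f + D*(1-f) = 0.2 + 0.3 = 0.5
+    //   (S-D)*f + D   = 0.4*0.25 + 0.4 = 0.5
+    // i.e. one multiply and one subtract instead of two multiplies.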
+
+
+    // see if we need to extract 'component' from the destination (fb)
+    integer_t fb;
+    if (blending & (BLEND_DST|FACTOR_DST)) { 
+        fb.setTo(scratches.obtain(), 32); 
+        extract(fb, pixel, component);
+        if (mDithering) {
+            // XXX: maybe what we should do instead, is simply
+            // expand fb -or- fragment to the larger of the two
+            if (fb.size() < temp.size()) {
+                // for now we expand 'fb' to min(fragment, 8)
+                int new_size = temp.size() < 8 ? temp.size() : 8;
+                expand(fb, fb, new_size);
+            }
+        }
+    }
+
+
+    // convert input fragment to integer_t
+    if (temp.l && (temp.flags & CORRUPTIBLE)) {
+        MOV(AL, 0, temp.reg, reg_imm(temp.reg, LSR, temp.l));
+        temp.h -= temp.l;
+        temp.l = 0;
+    }
+    integer_t fragment(temp.reg, temp.size(), temp.flags);
+
+    // if not done yet, convert input fragment to integer_t
+    if (temp.l) {
+        // here we know temp is not CORRUPTIBLE
+        fragment.reg = scratches.obtain();
+        MOV(AL, 0, fragment.reg, reg_imm(temp.reg, LSR, temp.l));
+        fragment.flags |= CORRUPTIBLE;
+    }
+
+    if (!(temp.flags & CORRUPTIBLE)) {
+        // temp is not corruptible, but since it's the destination it
+        // will be modified, so we need to allocate a new register.
+        temp.reg = regs.obtain();
+        temp.flags &= ~CORRUPTIBLE;
+        fragment.flags &= ~CORRUPTIBLE;
+    }
+
+    if ((blending & BLEND_SRC) && !same_factor_opt1) {
+        // source (fragment) is needed for the blending stage
+        // so it's not CORRUPTIBLE (unless we're doing same_factor_opt1)
+        fragment.flags &= ~CORRUPTIBLE;
+    }
+
+
+    if (same_factor_opt1) {
+        //  R = S*f + D*(1-f) = (S-D)*f + D
+        integer_t factor;
+        build_blend_factor(factor, fs, 
+                component, pixel, fragment, fb, scratches);
+        // fb is always corruptible from this point
+        fb.flags |= CORRUPTIBLE;
+        build_blendFOneMinusF(temp, factor, fragment, fb);
+    } else if (same_factor_opt2) {
+        //  R = S*(1-f) + D*f = (D-S)*f + S
+        integer_t factor;
+        // fb is always corruptible here
+        fb.flags |= CORRUPTIBLE;
+        build_blend_factor(factor, fd,
+                component, pixel, fragment, fb, scratches);
+        build_blendOneMinusFF(temp, factor, fragment, fb);
+    } else {
+        integer_t src_factor;
+        integer_t dst_factor;
+
+        // if destination (fb) is not needed for the blending stage, 
+        // then it can be marked as CORRUPTIBLE
+        if (!(blending & BLEND_DST)) {
+            fb.flags |= CORRUPTIBLE;
+        }
+
+        // XXX: try to mark some registers as CORRUPTIBLE
+        // in most case we could make those corruptible
+        // when we're processing the last component
+        // but not always, for instance
+        //    when fragment is constant and not reloaded
+        //    when fb is needed for logic-ops or masking
+        //    when a register is aliased (for instance with mAlphaSource)
+
+        // blend away...
+        if (fs==GGL_ZERO) {
+            if (fd==GGL_ZERO) {         // R = 0
+                // already taken care of
+            } else if (fd==GGL_ONE) {   // R = D
+                // already taken care of
+            } else {                    // R = D*fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor(temp, fb, dst_factor);
+            }
+        } else if (fs==GGL_ONE) {
+            if (fd==GGL_ZERO) {         // R = S
+                // NOP, taken care of
+            } else if (fd==GGL_ONE) {   // R = S + D
+                component_add(temp, fb, fragment); // args order matters
+                component_sat(temp);
+            } else {                    // R = S + D*fd
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor_add(temp, fb, dst_factor, component_t(fragment));
+                component_sat(temp);
+            }
+        } else {
+            // compute fs
+            build_blend_factor(src_factor, fs, 
+                    component, pixel, fragment, fb, scratches);
+            if (fd==GGL_ZERO) {         // R = S*fs
+                mul_factor(temp, fragment, src_factor);
+            } else if (fd==GGL_ONE) {   // R = S*fs + D
+                mul_factor_add(temp, fragment, src_factor, component_t(fb));
+                component_sat(temp);
+            } else {                    // R = S*fs + D*fd
+                mul_factor(temp, fragment, src_factor);
+                if (scratches.isUsed(src_factor.reg))
+                    scratches.recycle(src_factor.reg);
+                // compute fd
+                build_blend_factor(dst_factor, fd,
+                        component, pixel, fragment, fb, scratches);
+                mul_factor_add(temp, fb, dst_factor, temp);
+                if (!same_factor_opt1 && !same_factor_opt2) {
+                    component_sat(temp);
+                }
+            }
+        }
+    }
+
+    // now we can be corrupted (it's the dest)
+    temp.flags |= CORRUPTIBLE;
+}
+
+void GGLAssembler::build_blend_factor(
+        integer_t& factor, int f, int component,
+        const pixel_t& dst_pixel,
+        integer_t& fragment,
+        integer_t& fb,
+        Scratch& scratches)
+{
+    integer_t src_alpha(fragment);
+
+    // src_factor/dst_factor won't be used after blending,
+    // so it's fine to mark them as CORRUPTIBLE (if not aliased)
+    factor.flags |= CORRUPTIBLE;
+
+    switch(f) {
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        if (component==GGLFormat::ALPHA && !isAlphaSourceNeeded()) {
+            // we're processing alpha, so we already have
+            // src-alpha in fragment, and we need src-alpha just this time.
+        } else {
+            // alpha-src will be needed for other components
+            if (!mBlendFactorCached || mBlendFactorCached==f) {
+                src_alpha = mAlphaSource;
+                factor = mAlphaSource;
+                factor.flags &= ~CORRUPTIBLE;           
+                // we already computed the blend factor before, nothing to do.
+                if (mBlendFactorCached)
+                    return;
+                // this is the first time, make sure to compute the blend
+                // factor properly.
+                mBlendFactorCached = f;
+                break;
+            } else {
+                // we have a cached alpha blend factor, but we want another one,
+                // this should really not happen because by construction,
+                // we cannot have BOTH source and destination
+                // blend factors use ALPHA *and* ONE_MINUS_ALPHA (because
+                // the blending stage uses the f/(1-f) optimization).
+
+                // for completeness, we handle this case though. Since there
+                // are only 2 choices, this means we want "the other one"
+                // (1-factor)
+                factor = mAlphaSource;
+                factor.flags &= ~CORRUPTIBLE;           
+                RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s)));
+                mBlendFactorCached = f;
+                return;
+            }                
+        }
+        FALLTHROUGH_INTENDED;
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+    case GGL_SRC_ALPHA_SATURATE:
+        // help us find out what register we can use for the blend-factor
+        // CORRUPTIBLE registers are chosen first, or a new one is allocated.
+        if (fragment.flags & CORRUPTIBLE) {
+            factor.setTo(fragment.reg, 32, CORRUPTIBLE);
+            fragment.flags &= ~CORRUPTIBLE;
+        } else if (fb.flags & CORRUPTIBLE) {
+            factor.setTo(fb.reg, 32, CORRUPTIBLE);
+            fb.flags &= ~CORRUPTIBLE;
+        } else {
+            factor.setTo(scratches.obtain(), 32, CORRUPTIBLE);
+        } 
+        break;
+    }
+
+    // XXX: doesn't work if size==1
+
+    switch(f) {
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        factor.s = fb.s;
+        ADD(AL, 0, factor.reg, fb.reg, reg_imm(fb.reg, LSR, fb.s-1));
+        break;
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:
+        factor.s = fragment.s;
+        ADD(AL, 0, factor.reg, fragment.reg,
+            reg_imm(fragment.reg, LSR, fragment.s-1));
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        factor.s = src_alpha.s;
+        ADD(AL, 0, factor.reg, src_alpha.reg,
+                reg_imm(src_alpha.reg, LSR, src_alpha.s-1));
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        // XXX: should be precomputed
+        extract(factor, dst_pixel, GGLFormat::ALPHA);
+        ADD(AL, 0, factor.reg, factor.reg,
+                reg_imm(factor.reg, LSR, factor.s-1));
+        break;
+    case GGL_SRC_ALPHA_SATURATE:
+        // XXX: should be precomputed
+        // XXX: f = min(As, 1-Ad)
+        // btw, we're guaranteed that Ad's size is <= 8, because
+        // it's extracted from the framebuffer
+        break;
+    }
+
+    switch(f) {
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_ONE_MINUS_SRC_ALPHA:
+        RSB(AL, 0, factor.reg, factor.reg, imm((1<<factor.s)));
+    }
+    
+    // don't need more than 8-bits for the blend factor
+    // and this will prevent overflows in the multiplies later
+    if (factor.s > 8) {
+        MOV(AL, 0, factor.reg, reg_imm(factor.reg, LSR, factor.s-8));
+        factor.s = 8;
+    }
+}
+
+int GGLAssembler::blending_codes(int fs, int fd)
+{
+    int blending = 0;
+    switch(fs) {
+    case GGL_ONE:
+        blending |= BLEND_SRC;
+        break;
+
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        blending |= FACTOR_DST|BLEND_SRC;
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        // no need to extract 'component' from the destination
+        // for the blend factor, because we need ALPHA only.
+        blending |= BLEND_SRC;
+        break;
+
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:    
+        blending |= FACTOR_SRC|BLEND_SRC;
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+    case GGL_SRC_ALPHA_SATURATE:
+        blending |= FACTOR_SRC|BLEND_SRC;
+        break;
+    }
+    switch(fd) {
+    case GGL_ONE:
+        blending |= BLEND_DST;
+        break;
+
+    case GGL_ONE_MINUS_DST_COLOR:
+    case GGL_DST_COLOR:
+        blending |= FACTOR_DST|BLEND_DST;
+        break;
+    case GGL_ONE_MINUS_DST_ALPHA:
+    case GGL_DST_ALPHA:
+        blending |= FACTOR_DST|BLEND_DST;
+        break;
+
+    case GGL_ONE_MINUS_SRC_COLOR:
+    case GGL_SRC_COLOR:    
+        blending |= FACTOR_SRC|BLEND_DST;
+        break;
+    case GGL_ONE_MINUS_SRC_ALPHA:
+    case GGL_SRC_ALPHA:
+        // no need to extract 'component' from the source
+        // for the blend factor, because we need ALPHA only.
+        blending |= BLEND_DST;
+        break;
+    }
+    return blending;
+}
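+
+// Informative example (not in the original source): the common GL pair
+// fs = GGL_SRC_ALPHA, fd = GGL_ONE_MINUS_SRC_ALPHA yields
+// FACTOR_SRC|BLEND_SRC|BLEND_DST: both pixels feed the blend, but only
+// the source alpha needs to be turned into a factor.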
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::build_blendFOneMinusF(
+        component_t& temp,
+        const integer_t& factor, 
+        const integer_t& fragment,
+        const integer_t& fb)
+{
+    //  R = S*f + D*(1-f) = (S-D)*f + D
+    Scratch scratches(registerFile());
+    // compute S-D
+    integer_t diff(fragment.flags & CORRUPTIBLE ?
+            fragment.reg : scratches.obtain(), fb.size(), CORRUPTIBLE);
+    const int shift = fragment.size() - fb.size();
+    if (shift>0)        RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, shift));
+    else if (shift<0)   RSB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL,-shift));
+    else                RSB(AL, 0, diff.reg, fb.reg, fragment.reg);
+    mul_factor_add(temp, diff, factor, component_t(fb));
+}
+
+void GGLAssembler::build_blendOneMinusFF(
+        component_t& temp,
+        const integer_t& factor, 
+        const integer_t& fragment,
+        const integer_t& fb)
+{
+    //  R = S*(1-f) + D*f = (D-S)*f + S
+    Scratch scratches(registerFile());
+    // compute D-S
+    integer_t diff(fb.flags & CORRUPTIBLE ?
+            fb.reg : scratches.obtain(), fb.size(), CORRUPTIBLE);
+    const int shift = fragment.size() - fb.size();
+    if (shift>0)        SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSR, shift));
+    else if (shift<0)   SUB(AL, 0, diff.reg, fb.reg, reg_imm(fragment.reg, LSL,-shift));
+    else                SUB(AL, 0, diff.reg, fb.reg, fragment.reg);
+    mul_factor_add(temp, diff, factor, component_t(fragment));
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::mul_factor(  component_t& d,
+                                const integer_t& v,
+                                const integer_t& f)
+{
+    int vs = v.size();
+    int fs = f.size();
+    int ms = vs+fs;
+
+    // XXX: we could have special cases for 1 bit mul
+
+    // all this code below is to pick the best multiply instruction
+    // w.r.t. the operand sizes. We take advantage of the fact
+    // that the 16-bit multiplies allow a 16-bit shift.
+    // The trick is that we just make sure that we have at least 8 bits
+    // per component (which is enough for an 8-bit display).
+
+    int xy;
+    int vshift = 0;
+    int fshift = 0;
+    int smulw = 0;
+
+    if (vs<16) {
+        if (fs<16) {
+            xy = xyBB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            ms -= 16;
+            xy = xyTB;
+        } else {
+            // eg: 15 * 18  ->  15 * 15
+            fshift = fs - 15;
+            ms -= fshift;
+            xy = xyBB;
+        }
+    } else if (GGL_BETWEEN(vs, 24, 31)) {
+        if (fs<16) {
+            ms -= 16;
+            xy = xyTB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            ms -= 32;
+            xy = xyTT;
+        } else {
+            // eg: 24 * 18  ->  8 * 18
+            fshift = fs - 15;
+            ms -= 16 + fshift;
+            xy = xyTB;
+        }
+    } else {
+        if (fs<16) {
+            // eg: 18 * 15  ->  15 * 15
+            vshift = vs - 15;
+            ms -= vshift;
+            xy = xyBB;
+        } else if (GGL_BETWEEN(fs, 24, 31)) {
+            // eg: 18 * 24  ->  15 * 8
+            vshift = vs - 15;
+            ms -= 16 + vshift;
+            xy = xyBT;
+        } else {
+            // eg: 18 * 18  ->  (15 * 18)>>16
+            fshift = fs - 15;
+            ms -= 16 + fshift;
+            xy = yB;    //XXX SMULWB
+            smulw = 1;
+        }
+    }
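+
+    // Worked example (informative): vs = 18, fs = 18 takes the last branch:
+    // fshift = 3 reduces f to 15 bits and SMULWB keeps the top 32 bits of
+    // the 48-bit product (an implicit >>16), so ms = 18 + 15 - 16 = 17.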
+
+    ALOGE_IF(ms>=32, "mul_factor overflow vs=%d, fs=%d", vs, fs);
+
+    int vreg = v.reg;
+    int freg = f.reg;
+    if (vshift) {
+        MOV(AL, 0, d.reg, reg_imm(vreg, LSR, vshift));
+        vreg = d.reg;
+    }
+    if (fshift) {
+        MOV(AL, 0, d.reg, reg_imm(freg, LSR, fshift));
+        freg = d.reg;
+    }
+    if (smulw)  SMULW(AL, xy, d.reg, vreg, freg);
+    else        SMUL(AL, xy, d.reg, vreg, freg);
+
+
+    d.h = ms;
+    if (mDithering) {
+        d.l = 0; 
+    } else {
+        d.l = fs; 
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::mul_factor_add(  component_t& d,
+                                    const integer_t& v,
+                                    const integer_t& f,
+                                    const component_t& a)
+{
+    // XXX: we could have special cases for 1 bit mul
+    Scratch scratches(registerFile());
+
+    int vs = v.size();
+    int fs = f.size();
+    int as = a.h;
+    int ms = vs+fs;
+
+    ALOGE_IF(ms>=32, "mul_factor_add overflow vs=%d, fs=%d, as=%d", vs, fs, as);
+
+    integer_t add(a.reg, a.h, a.flags);
+
+    // 'a' is a component_t but it is guaranteed to have
+    // its high bits set to 0. However in the dithering case,
+    // we can't get away with truncating the potentially bad bits,
+    // so extraction is needed.
+
+   if ((mDithering) && (a.size() < ms)) {
+        // we need to expand a
+        if (!(a.flags & CORRUPTIBLE)) {
+            // ... but it's not corruptible, so we need to pick a
+            // temporary register.
+            // Try to use the destination register first (it's likely
+            // to be usable, unless it aliases an input).
+            if (d.reg!=a.reg && d.reg!=v.reg && d.reg!=f.reg) {
+                add.reg = d.reg;
+            } else {
+                add.reg = scratches.obtain();
+            }
+        }
+        expand(add, a, ms); // extracts and expands
+        as = ms;
+    }
+
+    if (ms == as) {
+        if (vs<16 && fs<16) SMLABB(AL, d.reg, v.reg, f.reg, add.reg);
+        else                MLA(AL, 0, d.reg, v.reg, f.reg, add.reg);
+    } else {
+        int temp = d.reg;
+        if (temp == add.reg) {
+            // the mul will modify add.reg, we need an intermediary reg
+            if (v.flags & CORRUPTIBLE)      temp = v.reg;
+            else if (f.flags & CORRUPTIBLE) temp = f.reg;
+            else                            temp = scratches.obtain();
+        }
+
+        if (vs<16 && fs<16) SMULBB(AL, temp, v.reg, f.reg);
+        else                MUL(AL, 0, temp, v.reg, f.reg);
+
+        if (ms>as) {
+            ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSL, ms-as));
+        } else if (ms<as) {
+            // not sure if we should expand the mul instead?
+            ADD(AL, 0, d.reg, temp, reg_imm(add.reg, LSR, as-ms));
+        }
+    }
+
+    d.h = ms;
+    if (mDithering) {
+        d.l = a.l; 
+    } else {
+        d.l = fs>a.l ? fs : a.l;
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::component_add(component_t& d,
+        const integer_t& dst, const integer_t& src)
+{
+    // here we're guaranteed that fragment.size() >= fb.size()
+    const int shift = src.size() - dst.size();
+    if (!shift) {
+        ADD(AL, 0, d.reg, src.reg, dst.reg);
+    } else {
+        ADD(AL, 0, d.reg, src.reg, reg_imm(dst.reg, LSL, shift));
+    }
+
+    d.h = src.size();
+    if (mDithering) {
+        d.l = 0;
+    } else {
+        d.l = shift;
+        d.flags |= CLEAR_LO;
+    }
+}
+
+void GGLAssembler::component_sat(const component_t& v)
+{
+    const int one = ((1<<v.size())-1)<<v.l;
+    CMP(AL, v.reg, imm( 1<<v.h ));
+    if (isValidImmediate(one)) {
+        MOV(HS, 0, v.reg, imm( one ));
+    } else if (isValidImmediate(~one)) {
+        MVN(HS, 0, v.reg, imm( ~one ));
+    } else {
+        MOV(HS, 0, v.reg, imm( 1<<v.h ));
+        SUB(HS, 0, v.reg, v.reg, imm( 1<<v.l ));
+    }
+}
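+
+// Informative example: for v.h = 8, v.l = 0 the code above compares the
+// register against 0x100 and, when HS (overflow), loads the saturated
+// value one = 0xff with a single conditional MOV.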
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/disassem.c b/libpixelflinger/codeflinger/disassem.c
new file mode 100644
index 0000000..5cbd63d
--- /dev/null
+++ b/libpixelflinger/codeflinger/disassem.c
@@ -0,0 +1,714 @@
+/*	$NetBSD: disassem.c,v 1.14 2003/03/27 16:58:36 mycroft Exp $	*/
+
+/*-
+ * Copyright (c) 1996 Mark Brinicombe.
+ * Copyright (c) 1996 Brini.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Brini.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY BRINI ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL BRINI OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * RiscBSD kernel project
+ *
+ * db_disasm.c
+ *
+ * Kernel disassembler
+ *
+ * Created      : 10/02/96
+ *
+ * Structured after the sparc/sparc/db_disasm.c by David S. Miller &
+ * Paul Kranenburg
+ *
+ * This code is not complete. Not all instructions are disassembled.
+ */
+
+#include <sys/cdefs.h>
+//__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/arm/arm/disassem.c,v 1.2 2005/01/05 21:58:47 imp Exp $");
+#include <sys/param.h>
+#include <stdio.h>
+
+#include "disassem.h"
+#include "armreg.h"
+//#include <ddb/ddb.h>
+
+/*
+ * General instruction format
+ *
+ *	insn[cc][mod]	[operands]
+ *
+ * Those fields with an uppercase format code indicate that the field
+ * follows directly after the instruction before the separator i.e.
+ * they modify the instruction rather than just being an operand to
+ * the instruction. The only exception is the writeback flag which
+ * follows an operand.
+ *
+ *
+ * 2 - print Operand 2 of a data processing instruction
+ * d - destination register (bits 12-15)
+ * n - n register (bits 16-19)
+ * s - s register (bits 8-11)
+ * o - indirect register rn (bits 16-19) (used by swap)
+ * m - m register (bits 0-3)
+ * a - address operand of ldr/str instruction
+ * e - address operand of ldrh/strh instruction
+ * l - register list for ldm/stm instruction
+ * f - 1st fp operand (register) (bits 12-14)
+ * g - 2nd fp operand (register) (bits 16-18)
+ * h - 3rd fp operand (register/immediate) (bits 0-4)
+ * j - xtb rotate literal (bits 10-11)
+ * i - bfx lsb literal (bits 7-11)
+ * w - bfx width literal (bits 16-20)
+ * b - branch address
+ * t - thumb branch address (bits 24, 0-23)
+ * k - breakpoint comment (bits 0-3, 8-19)
+ * X - block transfer type
+ * Y - block transfer type (r13 base)
+ * c - comment field bits(0-23)
+ * p - saved or current status register
+ * F - PSR transfer fields
+ * D - destination-is-r15 (P) flag on TST, TEQ, CMP, CMN
+ * L - co-processor transfer size
+ * S - set status flag
+ * P - fp precision
+ * Q - fp precision (for ldf/stf)
+ * R - fp rounding
+ * v - co-processor data transfer registers + addressing mode
+ * W - writeback flag
+ * x - instruction in hex
+ * # - co-processor number
+ * y - co-processor data processing registers
+ * z - co-processor register transfer registers
+ */
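+
+/*
+ * Informative example: the "add" entry in the table below uses format
+ * "Sdn2", so the word 0xe0810002 (cond = AL, Rn = r1, Rd = r0, Rm = r2,
+ * S clear) disassembles as "add	r0, r1, r2".
+ */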
+
+struct arm32_insn {
+	u_int mask;
+	u_int pattern;
+	char* name;
+	char* format;
+};
+
+static const struct arm32_insn arm32_i[] = {
+    { 0x0fffffff, 0x0ff00000, "imb",	"c" },		/* Before swi */
+    { 0x0fffffff, 0x0ff00001, "imbrange",	"c" },	/* Before swi */
+    { 0x0f000000, 0x0f000000, "swi",	"c" },
+    { 0xfe000000, 0xfa000000, "blx",	"t" },		/* Before b and bl */
+    { 0x0f000000, 0x0a000000, "b",	"b" },
+    { 0x0f000000, 0x0b000000, "bl",	"b" },
+    { 0x0fe000f0, 0x00000090, "mul",	"Snms" },
+    { 0x0fe000f0, 0x00200090, "mla",	"Snmsd" },
+    { 0x0fe000f0, 0x00800090, "umull",	"Sdnms" },
+    { 0x0fe000f0, 0x00c00090, "smull",	"Sdnms" },
+    { 0x0fe000f0, 0x00a00090, "umlal",	"Sdnms" },
+    { 0x0fe000f0, 0x00e00090, "smlal",	"Sdnms" },
+    { 0x0fff03f0, 0x06cf0070, "uxtb16", "dmj" },
+    { 0x0fe00070, 0x07e00050, "ubfx",   "dmiw" },
+    { 0x0d700000, 0x04200000, "strt",	"daW" },
+    { 0x0d700000, 0x04300000, "ldrt",	"daW" },
+    { 0x0d700000, 0x04600000, "strbt",	"daW" },
+    { 0x0d700000, 0x04700000, "ldrbt",	"daW" },
+    { 0x0c500000, 0x04000000, "str",	"daW" },
+    { 0x0c500000, 0x04100000, "ldr",	"daW" },
+    { 0x0c500000, 0x04400000, "strb",	"daW" },
+    { 0x0c500000, 0x04500000, "ldrb",	"daW" },
+    { 0x0e1f0000, 0x080d0000, "stm",	"YnWl" },/* separate out r13 base */
+    { 0x0e1f0000, 0x081d0000, "ldm",	"YnWl" },/* separate out r13 base */    
+    { 0x0e100000, 0x08000000, "stm",	"XnWl" },
+    { 0x0e100000, 0x08100000, "ldm",	"XnWl" },    
+    { 0x0e1000f0, 0x00100090, "ldrb",	"deW" },
+    { 0x0e1000f0, 0x00000090, "strb",	"deW" },
+    { 0x0e1000f0, 0x001000d0, "ldrsb",	"deW" },
+    { 0x0e1000f0, 0x001000b0, "ldrh",	"deW" },
+    { 0x0e1000f0, 0x000000b0, "strh",	"deW" },
+    { 0x0e1000f0, 0x001000f0, "ldrsh",	"deW" },
+    { 0x0f200090, 0x00200090, "und",	"x" },	/* Before data processing */
+    { 0x0e1000d0, 0x000000d0, "und",	"x" },	/* Before data processing */
+    { 0x0ff00ff0, 0x01000090, "swp",	"dmo" },
+    { 0x0ff00ff0, 0x01400090, "swpb",	"dmo" },
+    { 0x0fbf0fff, 0x010f0000, "mrs",	"dp" },	/* Before data processing */
+    { 0x0fb0fff0, 0x0120f000, "msr",	"pFm" },/* Before data processing */
+    { 0x0fb0f000, 0x0320f000, "msr",	"pF2" },/* Before data processing */
+    { 0x0ffffff0, 0x012fff10, "bx",     "m" },
+    { 0x0fff0ff0, 0x016f0f10, "clz",	"dm" },
+    { 0x0ffffff0, 0x012fff30, "blx",	"m" },
+    { 0xfff000f0, 0xe1200070, "bkpt",	"k" },
+    { 0x0de00000, 0x00000000, "and",	"Sdn2" },
+    { 0x0de00000, 0x00200000, "eor",	"Sdn2" },
+    { 0x0de00000, 0x00400000, "sub",	"Sdn2" },
+    { 0x0de00000, 0x00600000, "rsb",	"Sdn2" },
+    { 0x0de00000, 0x00800000, "add",	"Sdn2" },
+    { 0x0de00000, 0x00a00000, "adc",	"Sdn2" },
+    { 0x0de00000, 0x00c00000, "sbc",	"Sdn2" },
+    { 0x0de00000, 0x00e00000, "rsc",	"Sdn2" },
+    { 0x0df00000, 0x01100000, "tst",	"Dn2" },
+    { 0x0df00000, 0x01300000, "teq",	"Dn2" },
+    { 0x0df00000, 0x01500000, "cmp",	"Dn2" },
+    { 0x0df00000, 0x01700000, "cmn",	"Dn2" },
+    { 0x0de00000, 0x01800000, "orr",	"Sdn2" },
+    { 0x0de00000, 0x01a00000, "mov",	"Sd2" },
+    { 0x0de00000, 0x01c00000, "bic",	"Sdn2" },
+    { 0x0de00000, 0x01e00000, "mvn",	"Sd2" },
+    { 0x0ff08f10, 0x0e000100, "adf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e100100, "muf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e200100, "suf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e300100, "rsf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e400100, "dvf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e500100, "rdf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e600100, "pow",	"PRfgh" },
+    { 0x0ff08f10, 0x0e700100, "rpw",	"PRfgh" },
+    { 0x0ff08f10, 0x0e800100, "rmf",	"PRfgh" },
+    { 0x0ff08f10, 0x0e900100, "fml",	"PRfgh" },
+    { 0x0ff08f10, 0x0ea00100, "fdv",	"PRfgh" },
+    { 0x0ff08f10, 0x0eb00100, "frd",	"PRfgh" },
+    { 0x0ff08f10, 0x0ec00100, "pol",	"PRfgh" },
+    { 0x0f008f10, 0x0e000100, "fpbop",	"PRfgh" },
+    { 0x0ff08f10, 0x0e008100, "mvf",	"PRfh" },
+    { 0x0ff08f10, 0x0e108100, "mnf",	"PRfh" },
+    { 0x0ff08f10, 0x0e208100, "abs",	"PRfh" },
+    { 0x0ff08f10, 0x0e308100, "rnd",	"PRfh" },
+    { 0x0ff08f10, 0x0e408100, "sqt",	"PRfh" },
+    { 0x0ff08f10, 0x0e508100, "log",	"PRfh" },
+    { 0x0ff08f10, 0x0e608100, "lgn",	"PRfh" },
+    { 0x0ff08f10, 0x0e708100, "exp",	"PRfh" },
+    { 0x0ff08f10, 0x0e808100, "sin",	"PRfh" },
+    { 0x0ff08f10, 0x0e908100, "cos",	"PRfh" },
+    { 0x0ff08f10, 0x0ea08100, "tan",	"PRfh" },
+    { 0x0ff08f10, 0x0eb08100, "asn",	"PRfh" },
+    { 0x0ff08f10, 0x0ec08100, "acs",	"PRfh" },
+    { 0x0ff08f10, 0x0ed08100, "atn",	"PRfh" },
+    { 0x0f008f10, 0x0e008100, "fpuop",	"PRfh" },
+    { 0x0e100f00, 0x0c000100, "stf",	"QLv" },
+    { 0x0e100f00, 0x0c100100, "ldf",	"QLv" },
+    { 0x0ff00f10, 0x0e000110, "flt",	"PRgd" },
+    { 0x0ff00f10, 0x0e100110, "fix",	"PRdh" },
+    { 0x0ff00f10, 0x0e200110, "wfs",	"d" },
+    { 0x0ff00f10, 0x0e300110, "rfs",	"d" },
+    { 0x0ff00f10, 0x0e400110, "wfc",	"d" },
+    { 0x0ff00f10, 0x0e500110, "rfc",	"d" },
+    { 0x0ff0ff10, 0x0e90f110, "cmf",	"PRgh" },
+    { 0x0ff0ff10, 0x0eb0f110, "cnf",	"PRgh" },
+    { 0x0ff0ff10, 0x0ed0f110, "cmfe",	"PRgh" },
+    { 0x0ff0ff10, 0x0ef0f110, "cnfe",	"PRgh" },
+    { 0xff100010, 0xfe000010, "mcr2",	"#z" },
+    { 0x0f100010, 0x0e000010, "mcr",	"#z" },
+    { 0xff100010, 0xfe100010, "mrc2",	"#z" },
+    { 0x0f100010, 0x0e100010, "mrc",	"#z" },
+    { 0xff000010, 0xfe000000, "cdp2",	"#y" },
+    { 0x0f000010, 0x0e000000, "cdp",	"#y" },
+    { 0xfe100090, 0xfc100000, "ldc2",	"L#v" },
+    { 0x0e100090, 0x0c100000, "ldc",	"L#v" },
+    { 0xfe100090, 0xfc000000, "stc2",	"L#v" },
+    { 0x0e100090, 0x0c000000, "stc",	"L#v" },
+    { 0xf550f000, 0xf550f000, "pld",	"ne" },
+    { 0x0ff00ff0, 0x01000050, "qadd",	"dmn" },
+    { 0x0ff00ff0, 0x01400050, "qdadd",	"dmn" },
+    { 0x0ff00ff0, 0x01600050, "qdsub",	"dmn" },
+    { 0x0ff00ff0, 0x01200050, "qsub",	"dmn" },
+    { 0x0ff000f0, 0x01000080, "smlabb",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000a0, "smlatb",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000c0, "smlabt",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x010000e0, "smlatt",	"nmsd" },   // d & n inverted!!
+    { 0x0ff000f0, 0x01400080, "smlalbb","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000a0, "smlaltb","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000c0, "smlalbt","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x014000e0, "smlaltt","ndms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x01200080, "smlawb", "nmsd" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x012000a0, "smulwb","nms" },   // d & n inverted!!
+    { 0x0ff000f0, 0x012000c0, "smlawt", "nmsd" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x012000e0, "smulwt","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x01600080, "smulbb","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000a0, "smultb","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000c0, "smulbt","nms" },   // d & n inverted!!
+    { 0x0ff0f0f0, 0x016000e0, "smultt","nms" },   // d & n inverted!!
+    { 0x00000000, 0x00000000, NULL,	NULL }
+};
+
+static char const arm32_insn_conditions[][4] = {
+	"eq", "ne", "cs", "cc",
+	"mi", "pl", "vs", "vc",
+	"hi", "ls", "ge", "lt",
+	"gt", "le", "",   "nv"
+};
+
+static char const insn_block_transfers[][4] = {
+	"da", "ia", "db", "ib"
+};
+
+static char const insn_stack_block_transfers[][4] = {
+	"ed", "ea", "fd", "fa"
+};
+
+static char const op_shifts[][4] = {
+	"lsl", "lsr", "asr", "ror"
+};
+
+static char const insn_fpa_rounding[][2] = {
+	"", "p", "m", "z"
+};
+
+static char const insn_fpa_precision[][2] = {
+	"s", "d", "e", "p"
+};
+
+static char const insn_fpaconstants[][8] = {
+	"0.0", "1.0", "2.0", "3.0",
+	"4.0", "5.0", "0.5", "10.0"
+};
+
+#define insn_condition(x)	arm32_insn_conditions[((x) >> 28) & 0x0f]
+#define insn_blktrans(x)	insn_block_transfers[((x) >> 23) & 3]
+#define insn_stkblktrans(x)	insn_stack_block_transfers[(3*(((x) >> 20)&1))^(((x) >> 23)&3)]
+#define op2_shift(x)		op_shifts[((x) >> 5) & 3]
+#define insn_fparnd(x)		insn_fpa_rounding[((x) >> 5) & 0x03]
+#define insn_fpaprec(x)		insn_fpa_precision[(((x) >> 18) & 2)|(((x) >> 7) & 1)]
+#define insn_fpaprect(x)	insn_fpa_precision[(((x) >> 21) & 2)|(((x) >> 15) & 1)]
+#define insn_fpaimm(x)		insn_fpaconstants[(x) & 0x07]
+
+/* Local prototypes */
+static void disasm_register_shift(const disasm_interface_t *di, u_int insn);
+static void disasm_print_reglist(const disasm_interface_t *di, u_int insn);
+static void disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static void disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static void disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn,
+    u_int loc);
+static u_int disassemble_readword(u_int address);
+static void disassemble_printaddr(u_int address);
+
+u_int
+disasm(const disasm_interface_t *di, u_int loc, int __unused altfmt)
+{
+	const struct arm32_insn *i_ptr = &arm32_i[0];
+	u_int insn = di->di_readword(loc);
+	int matchp = 0;
+	int branch;
+	char* f_ptr;
+	int fmt = 0;
+
+/*	di->di_printf("loc=%08x insn=%08x : ", loc, insn);*/
+
+	while (i_ptr->name) {
+		if ((insn & i_ptr->mask) ==  i_ptr->pattern) {
+			matchp = 1;
+			break;
+		}
+		i_ptr++;
+	}
+
+	if (!matchp) {
+		di->di_printf("und%s\t%08x\n", insn_condition(insn), insn);
+		return(loc + INSN_SIZE);
+	}
+
+	/* If instruction forces condition code, don't print it. */
+	if ((i_ptr->mask & 0xf0000000) == 0xf0000000)
+		di->di_printf("%s", i_ptr->name);
+	else
+		di->di_printf("%s%s", i_ptr->name, insn_condition(insn));
+
+	f_ptr = i_ptr->format;
+
+	/* Insert tab if there are no instruction modifiers */
+
+	if (*(f_ptr) < 'A' || *(f_ptr) > 'Z') {
+		++fmt;
+		di->di_printf("\t");
+	}
+
+	while (*f_ptr) {
+		switch (*f_ptr) {
+		/* 2 - print Operand 2 of a data processing instruction */
+		case '2':
+			if (insn & 0x02000000) {
+				int rotate= ((insn >> 7) & 0x1e);
+
+				di->di_printf("#0x%08x",
+					      (insn & 0xff) << (32 - rotate) |
+					      (insn & 0xff) >> rotate);
+			} else {  
+				disasm_register_shift(di, insn);
+			}
+			break;
+		/* d - destination register (bits 12-15) */
+		case 'd':
+			di->di_printf("r%d", ((insn >> 12) & 0x0f));
+			break;
+		/* D - insert 'p' if Rd is R15 */
+		case 'D':
+			if (((insn >> 12) & 0x0f) == 15)
+				di->di_printf("p");
+			break;
+		/* n - n register (bits 16-19) */
+		case 'n':
+			di->di_printf("r%d", ((insn >> 16) & 0x0f));
+			break;
+		/* s - s register (bits 8-11) */
+		case 's':
+			di->di_printf("r%d", ((insn >> 8) & 0x0f));
+			break;
+		/* o - indirect register rn (bits 16-19) (used by swap) */
+		case 'o':
+			di->di_printf("[r%d]", ((insn >> 16) & 0x0f));
+			break;
+		/* m - m register (bits 0-3) */
+		case 'm':
+			di->di_printf("r%d", ((insn >> 0) & 0x0f));
+			break;
+		/* a - address operand of ldr/str instruction */
+		case 'a':
+			disasm_insn_ldrstr(di, insn, loc);
+			break;
+		/* e - address operand of ldrh/strh instruction */
+		case 'e':
+			disasm_insn_ldrhstrh(di, insn, loc);
+			break;
+		/* l - register list for ldm/stm instruction */
+		case 'l':
+			disasm_print_reglist(di, insn);
+			break;
+		/* f - 1st fp operand (register) (bits 12-14) */
+		case 'f':
+			di->di_printf("f%d", (insn >> 12) & 7);
+			break;
+		/* g - 2nd fp operand (register) (bits 16-18) */
+		case 'g':
+			di->di_printf("f%d", (insn >> 16) & 7);
+			break;
+		/* h - 3rd fp operand (register/immediate) (bits 0-4) */
+		case 'h':
+			if (insn & (1 << 3))
+				di->di_printf("#%s", insn_fpaimm(insn));
+			else
+				di->di_printf("f%d", insn & 7);
+			break;
+		/* j - xtb rotate literal (bits 10-11) */
+		case 'j':
+			di->di_printf("ror #%d", ((insn >> 10) & 3) << 3);
+			break;
+        /* i - bfx lsb literal (bits 7-11) */
+        case 'i':
+            di->di_printf("#%d", (insn >> 7) & 31);
+            break;
+        /* w - bfx width literal (bits 16-20) */
+        case 'w':
+            di->di_printf("#%d", 1 + ((insn >> 16) & 31));
+            break;
+		/* b - branch address */
+		case 'b':
+			branch = ((insn << 2) & 0x03ffffff);
+			if (branch & 0x02000000)
+				branch |= 0xfc000000;
+			di->di_printaddr(loc + 8 + branch);
+			break;
+		/* t - blx address */
+		case 't':
+			branch = ((insn << 2) & 0x03ffffff) |
+			    (insn >> 23 & 0x00000002);
+			if (branch & 0x02000000)
+				branch |= 0xfc000000;
+			di->di_printaddr(loc + 8 + branch);
+			break;
+		/* X - block transfer type */
+		case 'X':
+			di->di_printf("%s", insn_blktrans(insn));
+			break;
+		/* Y - block transfer type (r13 base) */
+		case 'Y':
+			di->di_printf("%s", insn_stkblktrans(insn));
+			break;
+		/* c - comment field bits(0-23) */
+		case 'c':
+			di->di_printf("0x%08x", (insn & 0x00ffffff));
+			break;
+		/* k - breakpoint comment (bits 0-3, 8-19) */
+		case 'k':
+			di->di_printf("0x%04x",
+			    (insn & 0x000fff00) >> 4 | (insn & 0x0000000f));
+			break;
+		/* p - saved or current status register */
+		case 'p':
+			if (insn & 0x00400000)
+				di->di_printf("spsr");
+			else
+				di->di_printf("cpsr");
+			break;
+		/* F - PSR transfer fields */
+		case 'F':
+			di->di_printf("_");
+			if (insn & (1 << 16))
+				di->di_printf("c");
+			if (insn & (1 << 17))
+				di->di_printf("x");
+			if (insn & (1 << 18))
+				di->di_printf("s");
+			if (insn & (1 << 19))
+				di->di_printf("f");
+			break;
+		/* B - byte transfer flag */
+		case 'B':
+			if (insn & 0x00400000)
+				di->di_printf("b");
+			break;
+		/* L - co-processor transfer size */
+		case 'L':
+			if (insn & (1 << 22))
+				di->di_printf("l");
+			break;
+		/* S - set status flag */
+		case 'S':
+			if (insn & 0x00100000)
+				di->di_printf("s");
+			break;
+		/* P - fp precision */
+		case 'P':
+			di->di_printf("%s", insn_fpaprec(insn));
+			break;
+		/* Q - fp precision (for ldf/stf) */
+		case 'Q':
+			break;
+		/* R - fp rounding */
+		case 'R':
+			di->di_printf("%s", insn_fparnd(insn));
+			break;
+		/* W - writeback flag */
+		case 'W':
+			if (insn & (1 << 21))
+				di->di_printf("!");
+			break;
+		/* # - co-processor number */
+		case '#':
+			di->di_printf("p%d", (insn >> 8) & 0x0f);
+			break;
+		/* v - co-processor data transfer registers+addressing mode */
+		case 'v':
+			disasm_insn_ldcstc(di, insn, loc);
+			break;
+		/* x - instruction in hex */
+		case 'x':
+			di->di_printf("0x%08x", insn);
+			break;
+		/* y - co-processor data processing registers */
+		case 'y':
+			di->di_printf("%d, ", (insn >> 20) & 0x0f);
+
+			di->di_printf("c%d, c%d, c%d", (insn >> 12) & 0x0f,
+			    (insn >> 16) & 0x0f, insn & 0x0f);
+
+			di->di_printf(", %d", (insn >> 5) & 0x07);
+			break;
+		/* z - co-processor register transfer registers */
+		case 'z':
+			di->di_printf("%d, ", (insn >> 21) & 0x07);
+			di->di_printf("r%d, c%d, c%d, %d",
+			    (insn >> 12) & 0x0f, (insn >> 16) & 0x0f,
+			    insn & 0x0f, (insn >> 5) & 0x07);
+
+/*			if (((insn >> 5) & 0x07) != 0)
+				di->di_printf(", %d", (insn >> 5) & 0x07);*/
+			break;
+		default:
+			di->di_printf("[%c - unknown]", *f_ptr);
+			break;
+		}
+		if (*(f_ptr+1) >= 'A' && *(f_ptr+1) <= 'Z')
+			++f_ptr;
+		else if (*(++f_ptr)) {
+			++fmt;
+			if (fmt == 1)
+				di->di_printf("\t");
+			else
+				di->di_printf(", ");
+		}
+	};
+
+	di->di_printf("\n");
+
+	return(loc + INSN_SIZE);
+}
+
+
+static void
+disasm_register_shift(const disasm_interface_t *di, u_int insn)
+{
+	di->di_printf("r%d", (insn & 0x0f));
+	if ((insn & 0x00000ff0) == 0)
+		;
+	else if ((insn & 0x00000ff0) == 0x00000060)
+		di->di_printf(", rrx");
+	else {
+		if (insn & 0x10)
+			di->di_printf(", %s r%d", op2_shift(insn),
+			    (insn >> 8) & 0x0f);
+		else
+			di->di_printf(", %s #%d", op2_shift(insn),
+			    (insn >> 7) & 0x1f);
+	}
+}
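+
+/*
+ * Informative example: operand-2 bits 0x102 (Rm = r2, immediate shift
+ * amount 2, shift type LSL) print as "r2, lsl #2" via the function above.
+ */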
+
+
+static void
+disasm_print_reglist(const disasm_interface_t *di, u_int insn)
+{
+	int loop;
+	int start;
+	int comma;
+
+	di->di_printf("{");
+	start = -1;
+	comma = 0;
+
+	for (loop = 0; loop < 17; ++loop) {
+		if (start != -1) {
+			if (loop == 16 || !(insn & (1 << loop))) {
+				if (comma)
+					di->di_printf(", ");
+				else
+					comma = 1;
+        			if (start == loop - 1)
+        				di->di_printf("r%d", start);
+        			else
+        				di->di_printf("r%d-r%d", start, loop - 1);
+        			start = -1;
+        		}
+        	} else {
+        		if (insn & (1 << loop))
+        			start = loop;
+        	}
+        }
+	di->di_printf("}");
+
+	if (insn & (1 << 22))
+		di->di_printf("^");
+}
+
+static void
+disasm_insn_ldrstr(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	int offset;
+
+	offset = insn & 0xfff;
+	if ((insn & 0x032f0000) == 0x010f0000) {
+		/* rA = pc, immediate index */
+		if (insn & 0x00800000)
+			loc += offset;
+		else
+			loc -= offset;
+		di->di_printaddr(loc + 8);
+ 	} else {
+		di->di_printf("[r%d", (insn >> 16) & 0x0f);
+		if ((insn & 0x03000fff) != 0x01000000) {
+			di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+			if (!(insn & 0x00800000))
+				di->di_printf("-");
+			if (insn & (1 << 25))
+				disasm_register_shift(di, insn);
+			else
+				di->di_printf("#0x%03x", offset);
+		}
+		if (insn & (1 << 24))
+			di->di_printf("]");
+	}
+}
+
+static void
+disasm_insn_ldrhstrh(const disasm_interface_t *di, u_int insn, u_int loc)
+{
+	int offset;
+
+	offset = ((insn & 0xf00) >> 4) | (insn & 0xf);
+	if ((insn & 0x004f0000) == 0x004f0000) {
+		/* rA = pc, immediate index */
+		if (insn & 0x00800000)
+			loc += offset;
+		else
+			loc -= offset;
+		di->di_printaddr(loc + 8);
+ 	} else {
+		di->di_printf("[r%d", (insn >> 16) & 0x0f);
+		if ((insn & 0x01400f0f) != 0x01400000) {
+			di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+			if (!(insn & 0x00800000))
+				di->di_printf("-");
+			if (insn & (1 << 22))
+				di->di_printf("#0x%02x", offset);
+			else
+				di->di_printf("r%d", (insn & 0x0f));
+		}
+		if (insn & (1 << 24))
+			di->di_printf("]");
+	}
+}
+
+static void
+disasm_insn_ldcstc(const disasm_interface_t *di, u_int insn, u_int __unused loc)
+{
+	if (((insn >> 8) & 0xf) == 1)
+		di->di_printf("f%d, ", (insn >> 12) & 0x07);
+	else
+		di->di_printf("c%d, ", (insn >> 12) & 0x0f);
+
+	di->di_printf("[r%d", (insn >> 16) & 0x0f);
+
+	di->di_printf("%s, ", (insn & (1 << 24)) ? "" : "]");
+
+	if (!(insn & (1 << 23)))
+		di->di_printf("-");
+
+	di->di_printf("#0x%03x", (insn & 0xff) << 2);
+
+	if (insn & (1 << 24))
+		di->di_printf("]");
+
+	if (insn & (1 << 21))
+		di->di_printf("!");
+}
+
+static u_int
+disassemble_readword(u_int address)
+{
+	return(*((u_int *)address));
+}
+
+static void
+disassemble_printaddr(u_int address)
+{
+	printf("0x%08x", address);
+}
+
+static const disasm_interface_t disassemble_di = {
+	disassemble_readword, disassemble_printaddr, printf
+};
+
+void
+disassemble(u_int address)
+{
+
+	(void)disasm(&disassemble_di, address, 0);
+}
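+
+/*
+ * The disasm() core can be retargeted by supplying different callbacks.
+ * A sketch (hypothetical names, for illustration only):
+ *
+ *	static const disasm_interface_t my_di = {
+ *		disassemble_readword, disassemble_printaddr, my_printf
+ *	};
+ *	next_pc = disasm(&my_di, pc, 0);
+ *
+ * where my_printf matches int (*)(const char *, ...) and could, say,
+ * append to a text buffer instead of writing to stdout.
+ */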
+
+/* End of disassem.c */
diff --git a/libpixelflinger/codeflinger/disassem.h b/libpixelflinger/codeflinger/disassem.h
new file mode 100644
index 0000000..c7c60b6
--- /dev/null
+++ b/libpixelflinger/codeflinger/disassem.h
@@ -0,0 +1,65 @@
+/*	$NetBSD: disassem.h,v 1.4 2001/03/04 04:15:58 matt Exp $	*/
+
+/*-
+ * Copyright (c) 1997 Mark Brinicombe.
+ * Copyright (c) 1997 Causality Limited.
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by Mark Brinicombe.
+ * 4. The name of the company nor the name of the author may be used to
+ *    endorse or promote products derived from this software without specific
+ *    prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * Define the interface structure required by the disassembler.
+ *
+ * $FreeBSD: /repoman/r/ncvs/src/sys/arm/include/disassem.h,v 1.2 2005/01/05 21:58:48 imp Exp $
+ */
+
+#ifndef ANDROID_MACHINE_DISASSEM_H
+#define ANDROID_MACHINE_DISASSEM_H
+
+#include <sys/types.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+	u_int	(*di_readword)(u_int);
+	void	(*di_printaddr)(u_int);
+	int	(*di_printf)(const char *, ...);
+} disasm_interface_t;
+
+/* Prototypes for callable functions */
+
+u_int disasm(const disasm_interface_t *, u_int, int);
+void disassemble(u_int);
+
+#if __cplusplus
+}
+#endif
+
+#endif /* !ANDROID_MACHINE_DISASSEM_H */
diff --git a/libpixelflinger/codeflinger/load_store.cpp b/libpixelflinger/codeflinger/load_store.cpp
new file mode 100644
index 0000000..4db0a49
--- /dev/null
+++ b/libpixelflinger/codeflinger/load_store.cpp
@@ -0,0 +1,384 @@
+/* libs/pixelflinger/codeflinger/load_store.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "pixelflinger-code"
+
+#include <assert.h>
+#include <stdio.h>
+
+#include <log/log.h>
+
+#include "GGLAssembler.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+void GGLAssembler::store(const pointer_t& addr, const pixel_t& s, uint32_t flags)
+{
+    const int bits = addr.size;
+    const int inc = (flags & WRITE_BACK)?1:0;
+    switch (bits) {
+    case 32:
+        if (inc)    STR(AL, s.reg, addr.reg, immed12_post(4));
+        else        STR(AL, s.reg, addr.reg);
+        break;
+    case 24:
+        // 24-bit formats are a little special, used only for RGB:
+        // the packed 0x00BBGGRR value is written out as the bytes R,G,B,
+        // rotating the source register 8 bits between byte stores
+        STRB(AL, s.reg, addr.reg, immed12_pre(0));
+        MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8));
+        STRB(AL, s.reg, addr.reg, immed12_pre(1));
+        MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 8));
+        STRB(AL, s.reg, addr.reg, immed12_pre(2));
+        if (!(s.flags & CORRUPTIBLE)) {
+            MOV(AL, 0, s.reg, reg_imm(s.reg, ROR, 16));
+        }
+        if (inc)
+            ADD(AL, 0, addr.reg, addr.reg, imm(3));
+        break;
+    case 16:
+        if (inc)    STRH(AL, s.reg, addr.reg, immed8_post(2));
+        else        STRH(AL, s.reg, addr.reg);
+        break;
+    case  8:
+        if (inc)    STRB(AL, s.reg, addr.reg, immed12_post(1));
+        else        STRB(AL, s.reg, addr.reg);
+        break;
+    }
+}
+
+void GGLAssembler::load(const pointer_t& addr, const pixel_t& s, uint32_t flags)
+{
+    Scratch scratches(registerFile());
+    int s0;
+
+    const int bits = addr.size;
+    const int inc = (flags & WRITE_BACK)?1:0;
+    switch (bits) {
+    case 32:
+        if (inc)    LDR(AL, s.reg, addr.reg, immed12_post(4));
+        else        LDR(AL, s.reg, addr.reg);
+        break;
+    case 24:
+        // 24-bit formats are a little special, used only for RGB:
+        // the bytes R,G,B are read and packed as 0x00BBGGRR
+        s0 = scratches.obtain();
+        if (s.reg != addr.reg) {
+            LDRB(AL, s.reg, addr.reg, immed12_pre(0));      // R
+            LDRB(AL, s0, addr.reg, immed12_pre(1));         // G
+            ORR(AL, 0, s.reg, s.reg, reg_imm(s0, LSL, 8));
+            LDRB(AL, s0, addr.reg, immed12_pre(2));         // B
+            ORR(AL, 0, s.reg, s.reg, reg_imm(s0, LSL, 16));
+        } else {
+            int s1 = scratches.obtain();
+            LDRB(AL, s1, addr.reg, immed12_pre(0));         // R
+            LDRB(AL, s0, addr.reg, immed12_pre(1));         // G
+            ORR(AL, 0, s1, s1, reg_imm(s0, LSL, 8));
+            LDRB(AL, s0, addr.reg, immed12_pre(2));         // B
+            ORR(AL, 0, s.reg, s1, reg_imm(s0, LSL, 16));
+        }
+        if (inc)
+            ADD(AL, 0, addr.reg, addr.reg, imm(3));
+        break;
+    case 16:
+        if (inc)    LDRH(AL, s.reg, addr.reg, immed8_post(2));
+        else        LDRH(AL, s.reg, addr.reg);
+        break;
+    case  8:
+        if (inc)    LDRB(AL, s.reg, addr.reg, immed12_post(1));
+        else        LDRB(AL, s.reg, addr.reg);
+        break;
+    }
+}
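+
+// Plain-C sketch (illustrative only, never called) of what the emitted
+// instruction sequences above compute for the 24-bit RGB case: three
+// bytes R,G,B in memory <-> the packed 0x00BBGGRR register form.
+static inline uint32_t example_load_rgb24(const uint8_t* p) {
+    return p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16);
+}
+static inline void example_store_rgb24(uint8_t* p, uint32_t s) {
+    p[0] = (uint8_t)s;          // R
+    p[1] = (uint8_t)(s >> 8);   // G
+    p[2] = (uint8_t)(s >> 16);  // B
+}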
+
+void GGLAssembler::extract(integer_t& d, int s, int h, int l, int bits)
+{
+    const int maskLen = h-l;
+
+#ifdef __mips__
+    assert(maskLen<=11);
+#else
+    assert(maskLen<=8);
+#endif
+    assert(h);
+
+    if (h != bits) {
+        const int mask = ((1<<maskLen)-1) << l;
+        if (isValidImmediate(mask)) {
+            AND(AL, 0, d.reg, s, imm(mask));    // component = packed & mask;
+        } else if (isValidImmediate(~mask)) {
+            BIC(AL, 0, d.reg, s, imm(~mask));   // component = packed & mask;
+        } else {
+            MOV(AL, 0, d.reg, reg_imm(s, LSL, 32-h));
+            l += 32-h;
+            h = 32;
+        }
+        s = d.reg;
+    }
+
+    if (l) {
+        MOV(AL, 0, d.reg, reg_imm(s, LSR, l));  // component = packed >> l;
+        s = d.reg;
+    }
+
+    if (s != d.reg) {
+        MOV(AL, 0, d.reg, s);
+    }
+
+    d.s = maskLen;
+}
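+
+// Worked example (illustrative): extracting the green field of an RGB565
+// pixel is extract(d, s, 11, 5, 16); the code above emits an AND (or BIC)
+// with the field mask followed by a logical shift right, computing:
+static inline uint32_t example_extract_g565(uint32_t packed) {
+    return (packed & 0x07e0) >> 5;  // 6-bit green component
+}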
+
+void GGLAssembler::extract(integer_t& d, const pixel_t& s, int component)
+{
+    extract(d,  s.reg,
+                s.format.c[component].h,
+                s.format.c[component].l,
+                s.size());
+}
+
+void GGLAssembler::extract(component_t& d, const pixel_t& s, int component)
+{
+    integer_t r(d.reg, 32, d.flags);
+    extract(r,  s.reg,
+                s.format.c[component].h,
+                s.format.c[component].l,
+                s.size());
+    d = component_t(r);
+}
+
+
+void GGLAssembler::expand(integer_t& d, const component_t& s, int dbits)
+{
+    if (s.l || (s.flags & CLEAR_HI)) {
+        extract(d, s.reg, s.h, s.l, 32);
+        expand(d, d, dbits);
+    } else {
+        expand(d, integer_t(s.reg, s.size(), s.flags), dbits);
+    }
+}
+
+void GGLAssembler::expand(component_t& d, const component_t& s, int dbits)
+{
+    integer_t r(d.reg, 32, d.flags);
+    expand(r, s, dbits);
+    d = component_t(r);
+}
+
+void GGLAssembler::expand(integer_t& dst, const integer_t& src, int dbits)
+{
+    assert(src.size());
+
+    int sbits = src.size();
+    int s = src.reg;
+    int d = dst.reg;
+
+    // be sure to set 'dst' after we read 'src' as they may be identical
+    dst.s = dbits;
+    dst.flags = 0;
+
+    if (dbits<=sbits) {
+        if (s != d) {
+            MOV(AL, 0, d, s);
+        }
+        return;
+    }
+
+    if (sbits == 1) {
+        RSB(AL, 0, d, s, reg_imm(s, LSL, dbits));
+            // d = (s<<dbits) - s;
+        return;
+    }
+
+    if (dbits % sbits) {
+        MOV(AL, 0, d, reg_imm(s, LSL, dbits-sbits));
+            // d = s << (dbits-sbits);
+        dbits -= sbits;
+        do {
+            ORR(AL, 0, d, d, reg_imm(d, LSR, sbits));
+                // d |= d >> sbits;
+            dbits -= sbits;
+            sbits *= 2;
+        } while(dbits>0);
+        return;
+    }
+
+    dbits -= sbits;
+    do {
+        ORR(AL, 0, d, s, reg_imm(s, LSL, sbits));
+            // d |= d<<sbits;
+        s = d;
+        dbits -= sbits;
+        if (sbits*2 < dbits) {
+            sbits *= 2;
+        }
+    } while(dbits>0);
+}
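+
+// Worked example (illustrative): expanding a 5-bit component to 8 bits
+// replicates the top source bits into the new low bits, so full scale 31
+// maps exactly to 255 (matching v * 255 / 31 to within rounding).  This
+// is what the MOV/ORR sequence above emits for sbits=5, dbits=8:
+static inline uint32_t example_expand5to8(uint32_t v5) {
+    return (v5 << 3) | (v5 >> 2);
+}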
+
+void GGLAssembler::downshift(
+        pixel_t& d, int component, component_t s, const reg_t& dither)
+{
+    Scratch scratches(registerFile());
+
+    int sh = s.h;
+    int sl = s.l;
+    int maskHiBits = (sh!=32) ? ((s.flags & CLEAR_HI)?1:0) : 0;
+    int maskLoBits = (sl!=0)  ? ((s.flags & CLEAR_LO)?1:0) : 0;
+    int sbits = sh - sl;
+
+    int dh = d.format.c[component].h;
+    int dl = d.format.c[component].l;
+    int dbits = dh - dl;
+    int dithering = 0;
+
+    ALOGE_IF(sbits<dbits, "sbits (%d) < dbits (%d) in downshift", sbits, dbits);
+
+    if (sbits>dbits) {
+        // see if we need to dither
+        dithering = mDithering;
+    }
+
+    int ireg = d.reg;
+    if (!(d.flags & FIRST)) {
+        if (s.flags & CORRUPTIBLE)  {
+            ireg = s.reg;
+        } else {
+            ireg = scratches.obtain();
+        }
+    }
+    d.flags &= ~FIRST;
+
+    if (maskHiBits) {
+        // we need to mask the high bits (and possibly the lowbits too)
+        // and we might be able to use immediate mask.
+        if (!dithering) {
+            // we don't do this if we only have maskLoBits because we can
+            // do it more efficiently below (in the case where dl=0)
+            const int offset = sh - dbits;
+            if (dbits<=8 && offset >= 0) {
+                const uint32_t mask = ((1<<dbits)-1) << offset;
+                if (isValidImmediate(mask) || isValidImmediate(~mask)) {
+                    build_and_immediate(ireg, s.reg, mask, 32);
+                    sl = offset;
+                    s.reg = ireg;
+                    sbits = dbits;
+                    maskLoBits = maskHiBits = 0;
+                }
+            }
+        } else {
+            // in the dithering case though, we need to preserve the lower bits
+            const uint32_t mask = ((1<<sbits)-1) << sl;
+            if (isValidImmediate(mask) || isValidImmediate(~mask)) {
+                build_and_immediate(ireg, s.reg, mask, 32);
+                s.reg = ireg;
+                maskLoBits = maskHiBits = 0;
+            }
+        }
+    }
+
+    // XXX: we could special case (maskHiBits & !maskLoBits)
+    // like we do for maskLoBits below, but it happens very rarely
+    // that we have maskHiBits only and the conditions necessary to lead
+    // to better code (like doing d |= s << 24)
+
+    if (maskHiBits) {
+        MOV(AL, 0, ireg, reg_imm(s.reg, LSL, 32-sh));
+        sl += 32-sh;
+        sh = 32;
+        s.reg = ireg;
+        maskHiBits = 0;
+    }
+
+    //  Downsampling should be performed as follows:
+    //
+    //      V * ((1<<dbits)-1) / ((1<<sbits)-1)
+    //    = V * (1<<dbits) / ((1<<sbits)-1)  -  V / ((1<<sbits)-1)
+    //
+    //  and since 1/((1<<sbits)-1) is very nearly (1>>sbits), approximating
+    //  both (1>>dbits) and (1>>sbits) by 0 reduces this to:
+    //
+    //      V>>(sbits-dbits)  -  V>>sbits
+    //
+    //  So V>>(sbits-dbits) is a good approximation, but a better one
+    //  (needed for dithering) keeps the correction term:
+    //
+    //      ((V>>(sbits-dbits))<<sbits  -  V) >> sbits
+    //    = (V<<dbits  -  V) >> sbits
+    //    = (V  -  V>>dbits) >> (sbits-dbits)
+
+    // Dithering is done here
+    if (dithering) {
+        comment("dithering");
+        if (sl) {
+            MOV(AL, 0, ireg, reg_imm(s.reg, LSR, sl));
+            sh -= sl;
+            sl = 0;
+            s.reg = ireg;
+        }
+        // scaling (V-V>>dbits)
+        SUB(AL, 0, ireg, s.reg, reg_imm(s.reg, LSR, dbits));
+        const int shift = (GGL_DITHER_BITS - (sbits-dbits));
+        if (shift>0)        ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSR, shift));
+        else if (shift<0)   ADD(AL, 0, ireg, ireg, reg_imm(dither.reg, LSL, -shift));
+        else                ADD(AL, 0, ireg, ireg, dither.reg);
+        s.reg = ireg;
+    }
+
+    if ((maskLoBits|dithering) && (sh > dbits)) {
+        int shift = sh-dbits;
+        if (dl) {
+            MOV(AL, 0, ireg, reg_imm(s.reg, LSR, shift));
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(ireg, LSL, dl));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(ireg, LSL, dl));
+            }
+        } else {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift));
+            }
+        }
+    } else {
+        int shift = sh-dh;
+        if (shift>0) {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSR, shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSR, shift));
+            }
+        } else if (shift<0) {
+            if (ireg == d.reg) {
+                MOV(AL, 0, d.reg, reg_imm(s.reg, LSL, -shift));
+            } else {
+                ORR(AL, 0, d.reg, d.reg, reg_imm(s.reg, LSL, -shift));
+            }
+        } else {
+            if (ireg == d.reg) {
+                if (s.reg != d.reg) {
+                    MOV(AL, 0, d.reg, s.reg);
+                }
+            } else {
+                ORR(AL, 0, d.reg, d.reg, s.reg);
+            }
+        }
+    }
+}
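+
+// Worked example (illustrative): reducing an 8-bit component to 5 bits.
+// The cheap form is v8 >> 3; the dithered form built above computes
+//     v5 = (v8 - (v8 >> 5) + d) >> 3
+// where d is the dither threshold scaled to sbits-dbits (= 3 here) bits,
+// so v8 = 255 still maps to 31 while the quantization error is spread
+// spatially across pixels.
+static inline uint32_t example_downshift8to5(uint32_t v8, uint32_t d) {
+    return (v8 - (v8 >> 5) + d) >> 3;
+}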
+
+}; // namespace android
diff --git a/libpixelflinger/codeflinger/mips64_disassem.c b/libpixelflinger/codeflinger/mips64_disassem.c
new file mode 100644
index 0000000..8528299
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips64_disassem.c
@@ -0,0 +1,584 @@
+/*  $NetBSD: db_disasm.c,v 1.19 2007/02/28 04:21:53 thorpej Exp $   */
+
+/*-
+ * Copyright (c) 1991, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  from: @(#)kadb.c    8.1 (Berkeley) 6/10/93
+ */
+
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <sys/cdefs.h>
+#include <sys/types.h>
+
+#include <android/log.h>
+
+#include "mips_opcode.h"
+
+#define __unused __attribute__((__unused__))
+
+static char *sprintf_buffer;
+static int sprintf_buf_len;
+
+typedef uint64_t db_addr_t;
+static void db_printf(const char* fmt, ...);
+
+static const char * const op_name[64] = {
+/* 0 */ "spec", "bcond", "j", "jal", "beq", "bne", "blez", "bgtz",
+/* 8 */ "pop10", "addiu", "slti", "sltiu", "andi", "ori", "xori", "aui",
+/*16 */ "cop0", "cop1", "cop2", "?", "?", "?", "pop26", "pop27",
+/*24 */ "pop30", "daddiu", "?", "?", "?", "daui", "msa", "op37",
+/*32 */ "lb", "lh", "?",  "lw", "lbu", "lhu", "?", "lwu",
+/*40 */ "sb", "sh", "?", "sw", "?", "?", "?", "?",
+/*48 */ "?", "lwc1", "bc", "?", "?",  "ldc1", "pop66", "ld",
+/*56 */ "?", "swc1", "balc", "pcrel", "?", "sdc1", "pop76", "sd"
+};
+
+static const char * const spec_name[64] = {
+/* 0 */ "sll", "?", "srl", "sra", "sllv", "?", "srlv", "srav",
+/* 8 */ "?", "jalr", "?", "?", "syscall", "break", "sdbbp", "sync",
+/*16 */ "clz", "clo", "dclz", "dclo", "dsllv", "dlsa", "dsrlv", "dsrav",
+/*24 */ "sop30", "sop31", "sop32", "sop33", "sop34", "sop35", "sop36", "sop37",
+/*32 */ "add", "addu", "sub", "subu", "and", "or", "xor", "nor",
+/*40 */ "?", "?", "slt", "sltu", "dadd", "daddu", "dsub", "dsubu",
+/*48 */ "tge", "tgeu", "tlt", "tltu", "teq", "seleqz", "tne", "selnez",
+/*56 */ "dsll", "?", "dsrl", "dsra", "dsll32", "?", "dsrl32", "dsra32"
+};
+
+static const char * const bcond_name[32] = {
+/* 0 */ "bltz", "bgez", "?", "?", "?", "?", "dahi", "?",
+/* 8 */ "?", "?", "?", "?", "?", "?", "?", "?",
+/*16 */ "nal", "bal", "?", "?", "?", "?", "?", "sigrie",
+/*24 */ "?", "?", "?", "?", "?", "?", "dati", "synci",
+};
+
+static const char * const cop1_name[64] = {
+/* 0 */ "fadd",  "fsub", "fmpy", "fdiv", "fsqrt","fabs", "fmov", "fneg",
+/* 8 */ "fop08","fop09","fop0a","fop0b","fop0c","fop0d","fop0e","fop0f",
+/*16 */ "fop10","fop11","fop12","fop13","fop14","fop15","fop16","fop17",
+/*24 */ "fop18","fop19","fop1a","fop1b","fop1c","fop1d","fop1e","fop1f",
+/*32 */ "fcvts","fcvtd","fcvte","fop23","fcvtw","fop25","fop26","fop27",
+/*40 */ "fop28","fop29","fop2a","fop2b","fop2c","fop2d","fop2e","fop2f",
+/*48 */ "fcmp.f","fcmp.un","fcmp.eq","fcmp.ueq","fcmp.olt","fcmp.ult",
+    "fcmp.ole","fcmp.ule",
+/*56 */ "fcmp.sf","fcmp.ngle","fcmp.seq","fcmp.ngl","fcmp.lt","fcmp.nge",
+    "fcmp.le","fcmp.ngt"
+};
+
+static const char * const fmt_name[16] = {
+    "s",    "d",    "e",    "fmt3",
+    "w",    "fmt5", "fmt6", "fmt7",
+    "fmt8", "fmt9", "fmta", "fmtb",
+    "fmtc", "fmtd", "fmte", "fmtf"
+};
+
+static char * const mips_reg_name[32] = {
+    "zero", "at",   "v0",   "v1",   "a0",   "a1",   "a2",   "a3",
+    "a4",   "a5",   "a6",   "a7",   "t0",   "t1",   "t2",   "t3",
+    "s0",   "s1",   "s2",   "s3",   "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+
+static char * alt_arm_reg_name[32] = {  // hacked names for comparison with ARM code
+    "zero", "at",   "r0",   "r1",   "r2",   "r3",   "r4",   "r5",
+    "r6",   "r7",   "r8",   "r9",   "r10",  "r11",  "r12",  "r13",
+    "r14",  "r15",  "at2",  "cmp",  "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+
+static char * const * reg_name =  &mips_reg_name[0];
+
+static const char * const c0_opname[64] = {
+    "c0op00","tlbr",  "tlbwi", "c0op03","c0op04","c0op05","tlbwr", "c0op07",
+    "tlbp",  "c0op11","c0op12","c0op13","c0op14","c0op15","c0op16","c0op17",
+    "rfe",   "c0op21","c0op22","c0op23","c0op24","c0op25","c0op26","c0op27",
+    "eret",  "c0op31","c0op32","c0op33","c0op34","c0op35","c0op36","c0op37",
+    "c0op40","c0op41","c0op42","c0op43","c0op44","c0op45","c0op46","c0op47",
+    "c0op50","c0op51","c0op52","c0op53","c0op54","c0op55","c0op56","c0op57",
+    "c0op60","c0op61","c0op62","c0op63","c0op64","c0op65","c0op66","c0op67",
+    "c0op70","c0op71","c0op72","c0op73","c0op74","c0op75","c0op76","c0op77",
+};
+
+static const char * const c0_reg[32] = {
+    "index",    "random",   "tlblo0",  "tlblo1",
+    "context",  "pagemask", "wired",   "cp0r7",
+    "badvaddr", "count",    "tlbhi",   "compare",
+    "status",   "cause",    "epc",     "prid",
+    "config",   "lladdr",   "watchlo", "watchhi",
+    "xcontext", "cp0r21",   "cp0r22",  "debug",
+    "depc",     "perfcnt",  "ecc",     "cacheerr",
+    "taglo",    "taghi",    "errepc",  "desave"
+};
+
+static void print_addr(db_addr_t);
+db_addr_t mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format);
+
+
+/*
+ * Disassemble instruction 'insn' nominally at 'loc'.
+ * 'loc' may in fact contain a breakpoint instruction.
+ */
+static db_addr_t
+db_disasm_insn(int insn, db_addr_t loc, bool altfmt __unused)
+{
+    bool bdslot = false;
+    InstFmt i;
+
+    i.word = insn;
+
+    switch (i.JType.op) {
+    case OP_SPECIAL:
+        if (i.word == 0) {
+            db_printf("nop");
+            break;
+        }
+        if (i.word == 0x0080) {
+            db_printf("NIY");
+            break;
+        }
+        if (i.word == 0x00c0) {
+            db_printf("NOT IMPL");
+            break;
+        }
+        /* Special cases --------------------------------------------------
+         * "addu" is a "move" only in 32-bit mode.  What's the correct
+         * answer - never decode addu/daddu as "move"?
+         */
+        if ( (i.RType.func == OP_ADDU && i.RType.rt == 0)  ||
+             (i.RType.func == OP_OR   && i.RType.rt == 0) ) {
+            db_printf("move\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs]);
+            break;
+        }
+
+        if (i.RType.func == OP_SRL && (i.RType.rs & 1) == 1) {
+            db_printf("rotr\t%s,%s,%d", reg_name[i.RType.rd],
+                reg_name[i.RType.rt], i.RType.shamt);
+            break;
+        }
+        if (i.RType.func == OP_SRLV && (i.RType.shamt & 1) == 1) {
+            db_printf("rotrv\t%s,%s,%s", reg_name[i.RType.rd],
+                reg_name[i.RType.rt], reg_name[i.RType.rs]);
+            break;
+        }
+
+        if (i.RType.func == OP_SOP30) {
+            if (i.RType.shamt == OP_MUL) {
+                db_printf("mul");
+            } else if (i.RType.shamt == OP_MUH) {
+                db_printf("muh");
+            }
+            db_printf("\t%s,%s,%s", reg_name[i.RType.rd],
+                reg_name[i.RType.rs], reg_name[i.RType.rt]);
+            break;
+        }
+        if (i.RType.func == OP_SOP31) {
+            if (i.RType.shamt == OP_MUL) {
+                db_printf("mulu");
+            } else if (i.RType.shamt == OP_MUH) {
+                db_printf("muhu");
+            }
+            db_printf("\t%s,%s,%s", reg_name[i.RType.rd],
+                reg_name[i.RType.rs], reg_name[i.RType.rt]);
+            break;
+        }
+
+        if (i.RType.func == OP_JALR && i.RType.rd == 0) {
+            db_printf("jr\t%s", reg_name[i.RType.rs]);
+            bdslot = true;
+            break;
+        }
+
+        db_printf("%s", spec_name[i.RType.func]);
+        switch (i.RType.func) {
+        case OP_SLL:
+        case OP_SRL:
+        case OP_SRA:
+        case OP_DSLL:
+
+        case OP_DSRL:
+        case OP_DSRA:
+        case OP_DSLL32:
+        case OP_DSRL32:
+        case OP_DSRA32:
+            db_printf("\t%s,%s,%d",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt],
+                i.RType.shamt);
+            break;
+
+        case OP_SLLV:
+        case OP_SRLV:
+        case OP_SRAV:
+        case OP_DSLLV:
+        case OP_DSRLV:
+        case OP_DSRAV:
+            db_printf("\t%s,%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt],
+                reg_name[i.RType.rs]);
+            break;
+
+        case OP_CLZ:
+        case OP_CLO:
+        case OP_DCLZ:
+        case OP_DCLO:
+            db_printf("\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs]);
+            break;
+
+        case OP_JALR:
+            db_printf("\t");
+            if (i.RType.rd != 31) {
+                db_printf("%s,", reg_name[i.RType.rd]);
+            }
+            db_printf("%s", reg_name[i.RType.rs]);
+            bdslot = true;
+            break;
+
+        case OP_SYSCALL:
+        case OP_SYNC:
+            break;
+
+        case OP_BREAK:
+            db_printf("\t%d", (i.RType.rs << 5) | i.RType.rt);
+            break;
+
+        default:
+            db_printf("\t%s,%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs],
+                reg_name[i.RType.rt]);
+        }
+        break;
+
+    case OP_SPECIAL3:
+        if (i.RType.func == OP_EXT)
+            db_printf("ext\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd+1);
+        else if (i.RType.func == OP_DEXT)
+            db_printf("dext\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd+1);
+        else if (i.RType.func == OP_DEXTM)
+            db_printf("dextm\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd+33);
+        else if (i.RType.func == OP_DEXTU)
+            db_printf("dextu\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt+32,
+                    i.RType.rd+1);
+        else if (i.RType.func == OP_INS)
+            db_printf("ins\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd-i.RType.shamt+1);
+        else if (i.RType.func == OP_DINS)
+            db_printf("dins\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd-i.RType.shamt+1);
+        else if (i.RType.func == OP_DINSM)
+            db_printf("dinsm\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd-i.RType.shamt+33);
+        else if (i.RType.func == OP_DINSU)
+            db_printf("dinsu\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt+32,
+                    i.RType.rd-i.RType.shamt+1);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_WSBH)
+            db_printf("wsbh\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_SEB)
+            db_printf("seb\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_SEH)
+            db_printf("seh\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_RDHWR)
+            db_printf("rdhwr\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else
+            db_printf("Unknown");
+        break;
+
+    case OP_BCOND:
+        db_printf("%s\t%s,", bcond_name[i.IType.rt],
+            reg_name[i.IType.rs]);
+        goto pr_displ;
+
+    case OP_BLEZ:
+    case OP_BGTZ:
+        db_printf("%s\t%s,", op_name[i.IType.op],
+            reg_name[i.IType.rs]);
+        goto pr_displ;
+
+    case OP_BEQ:
+        if (i.IType.rs == 0 && i.IType.rt == 0) {
+            db_printf("b\t");
+            goto pr_displ;
+        }
+        /* FALLTHROUGH */
+    case OP_BNE:
+        db_printf("%s\t%s,%s,", op_name[i.IType.op],
+            reg_name[i.IType.rs],
+            reg_name[i.IType.rt]);
+    pr_displ:
+        print_addr(loc + 4 + ((short)i.IType.imm << 2));
+        bdslot = true;
+        break;
+
+    case OP_COP0:
+        switch (i.RType.rs) {
+        case OP_BCx:
+        case OP_BCy:
+
+            db_printf("bc0%c\t",
+                "ft"[i.RType.rt & COPz_BC_TF_MASK]);
+            goto pr_displ;
+
+        case OP_MT:
+            db_printf("mtc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_DMT:
+            db_printf("dmtc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_MF:
+            db_printf("mfc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_DMF:
+            db_printf("dmfc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        default:
+            db_printf("%s", c0_opname[i.FRType.func]);
+        }
+        break;
+
+    case OP_COP1:
+        switch (i.RType.rs) {
+        case OP_BCx:
+        case OP_BCy:
+            db_printf("bc1%c\t",
+                "ft"[i.RType.rt & COPz_BC_TF_MASK]);
+            goto pr_displ;
+
+        case OP_MT:
+            db_printf("mtc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_MF:
+            db_printf("mfc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_CT:
+            db_printf("ctc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_CF:
+            db_printf("cfc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        default:
+            db_printf("%s.%s\tf%d,f%d,f%d",
+                cop1_name[i.FRType.func],
+                fmt_name[i.FRType.fmt],
+                i.FRType.fd, i.FRType.fs, i.FRType.ft);
+        }
+        break;
+
+    case OP_J:
+    case OP_JAL:
+        db_printf("%s\t", op_name[i.JType.op]);
+        print_addr((loc & 0xFFFFFFFFF0000000) | (i.JType.target << 2));
+        bdslot = true;
+        break;
+
+    case OP_LWC1:
+    case OP_SWC1:
+        db_printf("%s\tf%d,", op_name[i.IType.op],
+            i.IType.rt);
+        goto loadstore;
+
+    case OP_LB:
+    case OP_LH:
+    case OP_LW:
+    case OP_LD:
+    case OP_LBU:
+    case OP_LHU:
+    case OP_LWU:
+    case OP_SB:
+    case OP_SH:
+    case OP_SW:
+    case OP_SD:
+        db_printf("%s\t%s,", op_name[i.IType.op],
+            reg_name[i.IType.rt]);
+    loadstore:
+        db_printf("%d(%s)", (short)i.IType.imm,
+            reg_name[i.IType.rs]);
+        break;
+
+    case OP_ORI:
+    case OP_XORI:
+        if (i.IType.rs == 0) {
+            db_printf("li\t%s,0x%x",
+                reg_name[i.IType.rt],
+                i.IType.imm);
+            break;
+        }
+        /* FALLTHROUGH */
+    case OP_ANDI:
+        db_printf("%s\t%s,%s,0x%x", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            reg_name[i.IType.rs],
+            i.IType.imm);
+        break;
+
+    case OP_AUI:
+        if (i.IType.rs == 0) {
+            db_printf("lui\t%s,0x%x", reg_name[i.IType.rt],
+                i.IType.imm);
+        } else {
+            db_printf("%s\t%s,%s,%d", op_name[i.IType.op],
+            reg_name[i.IType.rt], reg_name[i.IType.rs],
+            (short)i.IType.imm);
+        }
+        break;
+
+    case OP_ADDIU:
+    case OP_DADDIU:
+        if (i.IType.rs == 0) {
+            db_printf("li\t%s,%d",
+                reg_name[i.IType.rt],
+                (short)i.IType.imm);
+            break;
+        }
+        /* FALLTHROUGH */
+    default:
+        db_printf("%s\t%s,%s,%d", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            reg_name[i.IType.rs],
+            (short)i.IType.imm);
+    }
+    // db_printf("\n");
+    // if (bdslot) {
+    //     db_printf("   bd: ");
+    //     mips_disassem(loc+4);
+    //     return (loc + 8);
+    // }
+    return (loc + 4);
+}
+
+static void
+print_addr(db_addr_t loc)
+{
+    db_printf("0x%08lx", loc);
+}
+
+static void db_printf(const char* fmt, ...)
+{
+    int cnt;
+    va_list argp;
+    va_start(argp, fmt);
+    if (sprintf_buffer) {
+        cnt = vsnprintf(sprintf_buffer, sprintf_buf_len, fmt, argp);
+        /* vsnprintf returns the untruncated length; clamp it so the
+         * buffer pointer cannot run past the end on truncation */
+        if (cnt >= sprintf_buf_len)
+            cnt = (sprintf_buf_len > 0) ? sprintf_buf_len - 1 : 0;
+        sprintf_buffer += cnt;
+        sprintf_buf_len -= cnt;
+    } else {
+        vprintf(fmt, argp);
+    }
+    va_end(argp);
+}
+
+/*
+ * Disassemble instruction at 'loc'.
+ * Return address of start of next instruction.
+ * Since this function is used by 'examine' and by 'step'
+ * "next instruction" does NOT mean the next instruction to
+ * be executed but the 'linear' next instruction.
+ */
+db_addr_t
+mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format)
+{
+    u_int32_t instr;
+
+    if (alt_dis_format) {   // use ARM register names for disassembly
+        reg_name = &alt_arm_reg_name[0];
+    }
+
+    sprintf_buffer = di_buffer;     // quick 'n' dirty printf() vs sprintf()
+    sprintf_buf_len = 39;           // should be passed in
+
+    instr =  *(u_int32_t *)loc;
+    return (db_disasm_insn(instr, loc, false));
+}
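+
+/*
+ * Usage sketch (illustrative): disassemble one instruction into a
+ * caller-supplied text buffer, with the ARM-style register aliases:
+ *
+ *	char buf[40];
+ *	db_addr_t next = mips_disassem((db_addr_t)code_ptr, buf, 1);
+ *
+ * Passing a NULL di_buffer makes db_printf() fall back to stdout.
+ */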
diff --git a/libpixelflinger/codeflinger/mips64_disassem.h b/libpixelflinger/codeflinger/mips64_disassem.h
new file mode 100644
index 0000000..c94f04f
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips64_disassem.h
@@ -0,0 +1,56 @@
+/*  $NetBSD: db_disasm.c,v 1.19 2007/02/28 04:21:53 thorpej Exp $   */
+
+/*-
+ * Copyright (c) 1991, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  from: @(#)kadb.c    8.1 (Berkeley) 6/10/93
+ */
+
+
+
+#ifndef ANDROID_MIPS_DISASSEM_H
+#define ANDROID_MIPS_DISASSEM_H
+
+#include <sys/types.h>
+
+#if __cplusplus
+extern "C" {
+#endif
+
+/* Prototypes for callable functions */
+
+/* NOTE: the definition in mips64_disassem.c is
+ *   db_addr_t mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format)
+ * with db_addr_t == uint64_t; the pointer prototype below relies on
+ * addresses and pointers sharing a register-sized representation. */
+void mips_disassem(uint32_t *location, char *di_buffer, int alt_fmt);
+
+#if __cplusplus
+}
+#endif
+
+#endif /* !ANDROID_MIPS_DISASSEM_H */
diff --git a/libpixelflinger/codeflinger/mips_disassem.c b/libpixelflinger/codeflinger/mips_disassem.c
new file mode 100644
index 0000000..1fe6806
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips_disassem.c
@@ -0,0 +1,592 @@
+/*  $NetBSD: db_disasm.c,v 1.19 2007/02/28 04:21:53 thorpej Exp $   */
+
+/*-
+ * Copyright (c) 1991, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  from: @(#)kadb.c    8.1 (Berkeley) 6/10/93
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <sys/cdefs.h>
+
+#include <sys/types.h>
+#include "mips_opcode.h"
+
+
+// #include <sys/systm.h>
+// #include <sys/param.h>
+
+// #include <machine/reg.h>
+// #include <machine/cpu.h>
+/*#include <machine/param.h>*/
+// #include <machine/db_machdep.h>
+
+// #include <ddb/db_interface.h>
+// #include <ddb/db_output.h>
+// #include <ddb/db_extern.h>
+// #include <ddb/db_sym.h>
+
+#define __unused __attribute__((__unused__))
+
+static char *sprintf_buffer;
+static int sprintf_buf_len;
+
+
+typedef uint32_t db_addr_t;
+static void db_printf(const char* fmt, ...);
+
+static const char * const op_name[64] = {
+/* 0 */ "spec", "bcond","j  ",    "jal",  "beq",  "bne",  "blez", "bgtz",
+/* 8 */ "addi", "addiu","slti", "sltiu","andi", "ori",  "xori", "lui",
+/*16 */ "cop0", "cop1", "cop2", "cop3", "beql", "bnel", "blezl","bgtzl",
+/*24 */ "daddi","daddiu","ldl", "ldr",  "op34", "op35", "op36", "op37",
+/*32 */ "lb ",   "lh ",   "lwl",  "lw ",   "lbu",  "lhu",  "lwr",  "lwu",
+/*40 */ "sb ",   "sh ",   "swl",  "sw ",   "sdl",  "sdr",  "swr",  "cache",
+/*48 */ "ll ",   "lwc1", "lwc2", "lwc3", "lld",  "ldc1", "ldc2", "ld ",
+/*56 */ "sc ",   "swc1", "swc2", "swc3", "scd",  "sdc1", "sdc2", "sd "
+};
+
+static const char * const spec_name[64] = {
+/* 0 */ "sll",  "spec01","srl", "sra",  "sllv", "spec05","srlv","srav",
+/* 8 */ "jr",   "jalr", "movz","movn","syscall","break","spec16","sync",
+/*16 */ "mfhi", "mthi", "mflo", "mtlo", "dsllv","spec25","dsrlv","dsrav",
+/*24 */ "mult", "multu","div",  "divu", "dmult","dmultu","ddiv","ddivu",
+/*32 */ "add",  "addu", "sub",  "subu", "and",  "or ",   "xor",  "nor",
+/*40 */ "spec50","spec51","slt","sltu", "dadd","daddu","dsub","dsubu",
+/*48 */ "tge","tgeu","tlt","tltu","teq","spec65","tne","spec67",
+/*56 */ "dsll","spec71","dsrl","dsra","dsll32","spec75","dsrl32","dsra32"
+};
+
+static const char * const spec2_name[64] = {     /* QED RM4650, R5000, etc. */
+/* 0x00 */ "madd", "maddu", "mul", "spec3", "msub", "msubu", "rsrv6", "rsrv7",
+/* 0x08 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x10 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x18 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x20 */ "clz",  "clo",  "rsrv", "rsrv", "dclz", "dclo", "rsrv", "rsrv",
+/* 0x28 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x30 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv",
+/* 0x38 */ "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "rsrv", "sdbbp"
+};
+
+static const char * const bcond_name[32] = {
+/* 0 */ "bltz", "bgez", "bltzl", "bgezl", "?", "?", "?", "?",
+/* 8 */ "tgei", "tgeiu", "tlti", "tltiu", "teqi", "?", "tnei", "?",
+/*16 */ "bltzal", "bgezal", "bltzall", "bgezall", "?", "?", "?", "?",
+/*24 */ "?", "?", "?", "?", "?", "?", "?", "?",
+};
+
+static const char * const cop1_name[64] = {
+/* 0 */ "fadd",  "fsub", "fmpy", "fdiv", "fsqrt","fabs", "fmov", "fneg",
+/* 8 */ "fop08","fop09","fop0a","fop0b","fop0c","fop0d","fop0e","fop0f",
+/*16 */ "fop10","fop11","fop12","fop13","fop14","fop15","fop16","fop17",
+/*24 */ "fop18","fop19","fop1a","fop1b","fop1c","fop1d","fop1e","fop1f",
+/*32 */ "fcvts","fcvtd","fcvte","fop23","fcvtw","fop25","fop26","fop27",
+/*40 */ "fop28","fop29","fop2a","fop2b","fop2c","fop2d","fop2e","fop2f",
+/*48 */ "fcmp.f","fcmp.un","fcmp.eq","fcmp.ueq","fcmp.olt","fcmp.ult",
+    "fcmp.ole","fcmp.ule",
+/*56 */ "fcmp.sf","fcmp.ngle","fcmp.seq","fcmp.ngl","fcmp.lt","fcmp.nge",
+    "fcmp.le","fcmp.ngt"
+};
+
+static const char * const fmt_name[16] = {
+    "s",    "d",    "e",    "fmt3",
+    "w",    "fmt5", "fmt6", "fmt7",
+    "fmt8", "fmt9", "fmta", "fmtb",
+    "fmtc", "fmtd", "fmte", "fmtf"
+};
+
+#if defined(__mips_n32) || defined(__mips_n64)
+static char * const reg_name[32] = {
+    "zero", "at",   "v0",   "v1",   "a0",   "a1",   "a2",   "a3",
+    "a4",   "a5",   "a6",   "a7",   "t0",   "t1",   "t2",   "t3",
+    "s0",   "s1",   "s2",   "s3",   "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+#else
+
+static char * alt_arm_reg_name[32] = {  // hacked names for comparison with ARM code
+    "zero", "at",   "r0",   "r1",   "r2",   "r3",   "r4",   "r5",
+    "r6",   "r7",   "r8",   "r9",   "r10",  "r11",  "r12",  "r13",
+    "r14",  "r15",  "at2",  "cmp",  "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+
+static char * mips_reg_name[32] = {
+    "zero", "at",   "v0",   "v1",   "a0",   "a1",   "a2",   "a3",
+    "t0",   "t1",   "t2",   "t3",   "t4",   "t5",   "t6",   "t7",
+    "s0",   "s1",   "s2",   "s3",   "s4",   "s5",   "s6",   "s7",
+    "t8",   "t9",   "k0",   "k1",   "gp",   "sp",   "s8",   "ra"
+};
+
+static char ** reg_name =  &mips_reg_name[0];
+
+#endif /* __mips_n32 || __mips_n64 */
+
+static const char * const c0_opname[64] = {
+    "c0op00","tlbr",  "tlbwi", "c0op03","c0op04","c0op05","tlbwr", "c0op07",
+    "tlbp",  "c0op11","c0op12","c0op13","c0op14","c0op15","c0op16","c0op17",
+    "rfe",   "c0op21","c0op22","c0op23","c0op24","c0op25","c0op26","c0op27",
+    "eret",  "c0op31","c0op32","c0op33","c0op34","c0op35","c0op36","c0op37",
+    "c0op40","c0op41","c0op42","c0op43","c0op44","c0op45","c0op46","c0op47",
+    "c0op50","c0op51","c0op52","c0op53","c0op54","c0op55","c0op56","c0op57",
+    "c0op60","c0op61","c0op62","c0op63","c0op64","c0op65","c0op66","c0op67",
+    "c0op70","c0op71","c0op72","c0op73","c0op74","c0op75","c0op76","c0op77",
+};
+
+static const char * const c0_reg[32] = {
+    "index",    "random",   "tlblo0",  "tlblo1",
+    "context",  "pagemask", "wired",   "cp0r7",
+    "badvaddr", "count",    "tlbhi",   "compare",
+    "status",   "cause",    "epc",     "prid",
+    "config",   "lladdr",   "watchlo", "watchhi",
+    "xcontext", "cp0r21",   "cp0r22",  "debug",
+    "depc",     "perfcnt",  "ecc",     "cacheerr",
+    "taglo",    "taghi",    "errepc",  "desave"
+};
+
+static void print_addr(db_addr_t);
+db_addr_t mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format);
+
+
+/*
+ * Disassemble instruction 'insn' nominally at 'loc'.
+ * 'loc' may in fact contain a breakpoint instruction.
+ */
+static db_addr_t
+db_disasm_insn(int insn, db_addr_t loc, bool altfmt __unused)
+{
+    bool bdslot = false;
+    InstFmt i;
+
+    i.word = insn;
+
+    switch (i.JType.op) {
+    case OP_SPECIAL:
+        if (i.word == 0) {
+            db_printf("nop");
+            break;
+        }
+        if (i.word == 0x0080) {
+            db_printf("NIY");
+            break;
+        }
+        if (i.word == 0x00c0) {
+            db_printf("NOT IMPL");
+            break;
+        }
+        /* Special cases --------------------------------------------------
+         * "addu" is a "move" only in 32-bit mode.  What's the correct
+         * answer - never decode addu/daddu as "move"?
+         */
+        if ( (i.RType.func == OP_ADDU && i.RType.rt == 0)  ||
+             (i.RType.func == OP_OR   && i.RType.rt == 0) ) {
+            db_printf("move\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs]);
+            break;
+        }
+        // mips32r2, rotr & rotrv
+        if (i.RType.func == OP_SRL && (i.RType.rs & 1) == 1) {
+            db_printf("rotr\t%s,%s,%d", reg_name[i.RType.rd],
+                reg_name[i.RType.rt], i.RType.shamt);
+            break;
+        }
+        if (i.RType.func == OP_SRLV && (i.RType.shamt & 1) == 1) {
+            db_printf("rotrv\t%s,%s,%s", reg_name[i.RType.rd],
+                reg_name[i.RType.rt], reg_name[i.RType.rs]);
+            break;
+        }
+
+
+        db_printf("%s", spec_name[i.RType.func]);
+        switch (i.RType.func) {
+        case OP_SLL:
+        case OP_SRL:
+        case OP_SRA:
+        case OP_DSLL:
+
+        case OP_DSRL:
+        case OP_DSRA:
+        case OP_DSLL32:
+        case OP_DSRL32:
+        case OP_DSRA32:
+            db_printf("\t%s,%s,%d",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt],
+                i.RType.shamt);
+            break;
+
+        case OP_SLLV:
+        case OP_SRLV:
+        case OP_SRAV:
+        case OP_DSLLV:
+        case OP_DSRLV:
+        case OP_DSRAV:
+            db_printf("\t%s,%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt],
+                reg_name[i.RType.rs]);
+            break;
+
+        case OP_MFHI:
+        case OP_MFLO:
+            db_printf("\t%s", reg_name[i.RType.rd]);
+            break;
+
+        case OP_JR:
+        case OP_JALR:
+            db_printf("\t%s", reg_name[i.RType.rs]);
+            bdslot = true;
+            break;
+        case OP_MTLO:
+        case OP_MTHI:
+            db_printf("\t%s", reg_name[i.RType.rs]);
+            break;
+
+        case OP_MULT:
+        case OP_MULTU:
+        case OP_DMULT:
+        case OP_DMULTU:
+        case OP_DIV:
+        case OP_DIVU:
+        case OP_DDIV:
+        case OP_DDIVU:
+            db_printf("\t%s,%s",
+                reg_name[i.RType.rs],
+                reg_name[i.RType.rt]);
+            break;
+
+
+        case OP_SYSCALL:
+        case OP_SYNC:
+            break;
+
+        case OP_BREAK:
+            db_printf("\t%d", (i.RType.rs << 5) | i.RType.rt);
+            break;
+
+        default:
+            db_printf("\t%s,%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rs],
+                reg_name[i.RType.rt]);
+        }
+        break;
+
+    case OP_SPECIAL2:
+        if (i.RType.func == OP_MUL)
+            db_printf("%s\t%s,%s,%s",
+                spec2_name[i.RType.func & 0x3f],
+                    reg_name[i.RType.rd],
+                    reg_name[i.RType.rs],
+                    reg_name[i.RType.rt]);
+        else
+            db_printf("%s\t%s,%s",
+                spec2_name[i.RType.func & 0x3f],
+                    reg_name[i.RType.rs],
+                    reg_name[i.RType.rt]);
+
+        break;
+
+    case OP_SPECIAL3:
+        if (i.RType.func == OP_EXT)
+            db_printf("ext\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd+1);
+        else if (i.RType.func == OP_INS)
+            db_printf("ins\t%s,%s,%d,%d",
+                    reg_name[i.RType.rt],
+                    reg_name[i.RType.rs],
+                    i.RType.shamt,
+                    i.RType.rd-i.RType.shamt+1);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_WSBH)
+            db_printf("wsbh\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_SEB)
+            db_printf("seb\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else if (i.RType.func == OP_BSHFL && i.RType.shamt == OP_SEH)
+            db_printf("seh\t%s,%s",
+                reg_name[i.RType.rd],
+                reg_name[i.RType.rt]);
+        else
+            db_printf("Unknown");
+        break;
+
+    case OP_BCOND:
+        db_printf("%s\t%s,", bcond_name[i.IType.rt],
+            reg_name[i.IType.rs]);
+        goto pr_displ;
+
+    case OP_BLEZ:
+    case OP_BLEZL:
+    case OP_BGTZ:
+    case OP_BGTZL:
+        db_printf("%s\t%s,", op_name[i.IType.op],
+            reg_name[i.IType.rs]);
+        goto pr_displ;
+
+    case OP_BEQ:
+    case OP_BEQL:
+        if (i.IType.rs == 0 && i.IType.rt == 0) {
+            db_printf("b  \t");
+            goto pr_displ;
+        }
+        /* FALLTHROUGH */
+    case OP_BNE:
+    case OP_BNEL:
+        db_printf("%s\t%s,%s,", op_name[i.IType.op],
+            reg_name[i.IType.rs],
+            reg_name[i.IType.rt]);
+    pr_displ:
+        print_addr(loc + 4 + ((short)i.IType.imm << 2));
+        bdslot = true;
+        break;
+
+    case OP_COP0:
+        switch (i.RType.rs) {
+        case OP_BCx:
+        case OP_BCy:
+
+            db_printf("bc0%c\t",
+                "ft"[i.RType.rt & COPz_BC_TF_MASK]);
+            goto pr_displ;
+
+        case OP_MT:
+            db_printf("mtc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_DMT:
+            db_printf("dmtc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_MF:
+            db_printf("mfc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        case OP_DMF:
+            db_printf("dmfc0\t%s,%s",
+                reg_name[i.RType.rt],
+                c0_reg[i.RType.rd]);
+            break;
+
+        default:
+            db_printf("%s", c0_opname[i.FRType.func]);
+        }
+        break;
+
+    case OP_COP1:
+        switch (i.RType.rs) {
+        case OP_BCx:
+        case OP_BCy:
+            db_printf("bc1%c\t",
+                "ft"[i.RType.rt & COPz_BC_TF_MASK]);
+            goto pr_displ;
+
+        case OP_MT:
+            db_printf("mtc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_MF:
+            db_printf("mfc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_CT:
+            db_printf("ctc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        case OP_CF:
+            db_printf("cfc1\t%s,f%d",
+                reg_name[i.RType.rt],
+                i.RType.rd);
+            break;
+
+        default:
+            db_printf("%s.%s\tf%d,f%d,f%d",
+                cop1_name[i.FRType.func],
+                fmt_name[i.FRType.fmt],
+                i.FRType.fd, i.FRType.fs, i.FRType.ft);
+        }
+        break;
+
+    case OP_J:
+    case OP_JAL:
+        db_printf("%s\t", op_name[i.JType.op]);
+        print_addr((loc & 0xF0000000) | (i.JType.target << 2));
+        bdslot = true;
+        break;
+
+    case OP_LWC1:
+    case OP_SWC1:
+        db_printf("%s\tf%d,", op_name[i.IType.op],
+            i.IType.rt);
+        goto loadstore;
+
+    case OP_LB:
+    case OP_LH:
+    case OP_LW:
+    case OP_LD:
+    case OP_LBU:
+    case OP_LHU:
+    case OP_LWU:
+    case OP_SB:
+    case OP_SH:
+    case OP_SW:
+    case OP_SD:
+        db_printf("%s\t%s,", op_name[i.IType.op],
+            reg_name[i.IType.rt]);
+    loadstore:
+        db_printf("%d(%s)", (short)i.IType.imm,
+            reg_name[i.IType.rs]);
+        break;
+
+    case OP_ORI:
+    case OP_XORI:
+        if (i.IType.rs == 0) {
+            db_printf("li\t%s,0x%x",
+                reg_name[i.IType.rt],
+                i.IType.imm);
+            break;
+        }
+        /* FALLTHROUGH */
+    case OP_ANDI:
+        db_printf("%s\t%s,%s,0x%x", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            reg_name[i.IType.rs],
+            i.IType.imm);
+        break;
+
+    case OP_LUI:
+        db_printf("%s\t%s,0x%x", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            i.IType.imm);
+        break;
+
+    case OP_CACHE:
+        db_printf("%s\t0x%x,0x%x(%s)",
+            op_name[i.IType.op],
+            i.IType.rt,
+            i.IType.imm,
+            reg_name[i.IType.rs]);
+        break;
+
+    case OP_ADDI:
+    case OP_DADDI:
+    case OP_ADDIU:
+    case OP_DADDIU:
+        if (i.IType.rs == 0) {
+            db_printf("li\t%s,%d",
+                reg_name[i.IType.rt],
+                (short)i.IType.imm);
+            break;
+        }
+        /* FALLTHROUGH */
+    default:
+        db_printf("%s\t%s,%s,%d", op_name[i.IType.op],
+            reg_name[i.IType.rt],
+            reg_name[i.IType.rs],
+            (short)i.IType.imm);
+    }
+    // db_printf("\n");
+    // if (bdslot) {
+    //     db_printf("   bd: ");
+    //     mips_disassem(loc+4);
+    //     return (loc + 8);
+    // }
+    return (loc + 4);
+}
+
+static void
+print_addr(db_addr_t loc)
+{
+    db_printf("0x%08x", loc);
+}
+
+
+
+static void db_printf(const char* fmt, ...)
+{
+    int cnt;
+    va_list argp;
+    va_start(argp, fmt);
+    if (sprintf_buffer) {
+        cnt = vsnprintf(sprintf_buffer, sprintf_buf_len, fmt, argp);
+        /* vsnprintf returns the untruncated length; clamp it so the
+         * buffer pointer cannot run past the end on truncation */
+        if (cnt >= sprintf_buf_len)
+            cnt = (sprintf_buf_len > 0) ? sprintf_buf_len - 1 : 0;
+        sprintf_buffer += cnt;
+        sprintf_buf_len -= cnt;
+    } else {
+        vprintf(fmt, argp);
+    }
+    va_end(argp);
+}
+
+
+/*
+ * Disassemble instruction at 'loc'.
+ * Return address of start of next instruction.
+ * Since this function is used by 'examine' and by 'step'
+ * "next instruction" does NOT mean the next instruction to
+ * be executed but the 'linear' next instruction.
+ */
+db_addr_t
+mips_disassem(db_addr_t loc, char *di_buffer, int alt_dis_format)
+{
+    u_int32_t instr;
+
+    if (alt_dis_format) {   // use ARM register names for disassembly
+        reg_name = &alt_arm_reg_name[0];
+    }
+
+    sprintf_buffer = di_buffer;     // quick 'n' dirty printf() vs sprintf()
+    sprintf_buf_len = 39;           // should be passed in
+
+    instr =  *(u_int32_t *)loc;
+    return (db_disasm_insn(instr, loc, false));
+}
+
diff --git a/libpixelflinger/codeflinger/mips_disassem.h b/libpixelflinger/codeflinger/mips_disassem.h
new file mode 100644
index 0000000..2d5b7f5
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips_disassem.h
@@ -0,0 +1,66 @@
+/*  $NetBSD: db_disasm.c,v 1.19 2007/02/28 04:21:53 thorpej Exp $   */
+
+/*-
+ * Copyright (c) 1991, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  from: @(#)kadb.c    8.1 (Berkeley) 6/10/93
+ */
+
+
+
+#ifndef ANDROID_MIPS_DISASSEM_H
+#define ANDROID_MIPS_DISASSEM_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+// could add an interface like this, but I have not
+// typedef struct {
+//  u_int   (*di_readword)(u_int);
+//  void    (*di_printaddr)(u_int);
+//  void    (*di_printf)(const char *, ...);
+// } disasm_interface_t;
+
+/* Prototypes for callable functions */
+
+// u_int disasm(const disasm_interface_t *, u_int, int);
+
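+// NOTE: the C definition in mips_disassem.c takes the instruction address
+// as an integer (db_addr_t) and returns the address of the next
+// instruction; this prototype only matches it through C linkage, so keep
+// the two in sync.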
+void mips_disassem(uint32_t *location, char *di_buffer, int alt_fmt);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !ANDROID_MIPS_DISASSEM_H */
diff --git a/libpixelflinger/codeflinger/mips_opcode.h b/libpixelflinger/codeflinger/mips_opcode.h
new file mode 100644
index 0000000..45bb19e
--- /dev/null
+++ b/libpixelflinger/codeflinger/mips_opcode.h
@@ -0,0 +1,410 @@
+/*  $NetBSD: mips_opcode.h,v 1.12 2005/12/11 12:18:09 christos Exp $    */
+
+/*-
+ * Copyright (c) 1992, 1993
+ *  The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Ralph Campbell.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *  @(#)mips_opcode.h   8.1 (Berkeley) 6/10/93
+ */
+
+/*
+ * Define the instruction formats and opcode values for the
+ * MIPS instruction set.
+ */
+
+#include <endian.h>
+
+/*
+ * Define the instruction formats.
+ */
+typedef union {
+    unsigned word;
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+    struct {
+        unsigned imm: 16;
+        unsigned rt: 5;
+        unsigned rs: 5;
+        unsigned op: 6;
+    } IType;
+
+    struct {
+        unsigned target: 26;
+        unsigned op: 6;
+    } JType;
+
+    struct {
+        unsigned func: 6;
+        unsigned shamt: 5;
+        unsigned rd: 5;
+        unsigned rt: 5;
+        unsigned rs: 5;
+        unsigned op: 6;
+    } RType;
+
+    struct {
+        unsigned func: 6;
+        unsigned fd: 5;
+        unsigned fs: 5;
+        unsigned ft: 5;
+        unsigned fmt: 4;
+        unsigned : 1;       /* always '1' */
+        unsigned op: 6;     /* always '0x11' */
+    } FRType;
+#endif
+#if BYTE_ORDER == BIG_ENDIAN
+    struct {
+        unsigned op: 6;
+        unsigned rs: 5;
+        unsigned rt: 5;
+        unsigned imm: 16;
+    } IType;
+
+    struct {
+        unsigned op: 6;
+        unsigned target: 26;
+    } JType;
+
+    struct {
+        unsigned op: 6;
+        unsigned rs: 5;
+        unsigned rt: 5;
+        unsigned rd: 5;
+        unsigned shamt: 5;
+        unsigned func: 6;
+    } RType;
+
+    struct {
+        unsigned op: 6;     /* always '0x11' */
+        unsigned : 1;       /* always '1' */
+        unsigned fmt: 4;
+        unsigned ft: 5;
+        unsigned fs: 5;
+        unsigned fd: 5;
+        unsigned func: 6;
+    } FRType;
+#endif
+} InstFmt;
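+
+/*
+ * Decoding example (illustrative): the word 0x8C430004 encodes
+ * "lw v1,4(v0)"; read through IType it yields op=043 (OP_LW), rs=2 (v0),
+ * rt=3 (v1), imm=4, identically under either byte order above.
+ */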
+
+/*
+ * Values for the 'op' field.
+ */
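+/* (all opcode values below are octal constants: a leading 0 means base 8) */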
+#define OP_SPECIAL  000
+#define OP_BCOND    001
+#define OP_J        002
+#define OP_JAL      003
+#define OP_BEQ      004
+#define OP_BNE      005
+#define OP_BLEZ     006
+#define OP_BGTZ     007
+
+#if __mips_isa_rev < 6
+#define OP_ADDI     010
+#else
+#define OP_POP10    010
+#endif
+
+#define OP_ADDIU    011
+#define OP_SLTI     012
+#define OP_SLTIU    013
+#define OP_ANDI     014
+#define OP_ORI      015
+#define OP_XORI     016
+
+#if __mips_isa_rev < 6
+#define OP_LUI      017
+#else
+#define OP_AUI      017
+#endif
+
+#define OP_COP0     020
+#define OP_COP1     021
+#define OP_COP2     022
+
+#if __mips_isa_rev < 6
+#define OP_COP3     023
+#define OP_BEQL     024
+#define OP_BNEL     025
+#define OP_BLEZL    026
+#define OP_BGTZL    027
+#define OP_DADDI    030
+#else
+#define OP_POP26    026
+#define OP_POP27    027
+#define OP_POP30    030
+#endif
+
+#define OP_DADDIU   031
+
+#if __mips_isa_rev < 6
+#define OP_LDL      032
+#define OP_LDR      033
+#define OP_SPECIAL2 034
+#else
+#define OP_DAUI     035
+#endif
+
+#define OP_SPECIAL3 037
+
+#define OP_LB       040
+#define OP_LH       041
+
+#if __mips_isa_rev < 6
+#define OP_LWL      042
+#endif
+
+#define OP_LW       043
+#define OP_LBU      044
+#define OP_LHU      045
+
+#if __mips_isa_rev < 6
+#define OP_LWR      046
+#endif
+
+#define OP_LWU      047
+
+#define OP_SB       050
+#define OP_SH       051
+
+#if __mips_isa_rev < 6
+#define OP_SWL      052
+#endif
+
+#define OP_SW       053
+
+#if __mips_isa_rev < 6
+#define OP_SDL      054
+#define OP_SDR      055
+#define OP_SWR      056
+#define OP_CACHE    057
+#define OP_LL       060
+#define OP_LWC0     OP_LL
+#define OP_LWC1     061
+#define OP_LWC2     062
+#define OP_LWC3     063
+#define OP_LLD      064
+#else
+#define OP_LWC1     061
+#define OP_BC       062
+#endif
+
+#define OP_LDC1     065
+#define OP_LD       067
+
+#if __mips_isa_rev < 6
+#define OP_SC       070
+#define OP_SWC0     OP_SC
+#endif
+
+#define OP_SWC1     071
+
+#if __mips_isa_rev < 6
+#define OP_SWC2     072
+#define OP_SWC3     073
+#define OP_SCD      074
+#else
+#define OP_BALC     072
+#endif
+
+#define OP_SDC1     075
+#define OP_SD       077
+
+/*
+ * Values for the 'func' field when 'op' == OP_SPECIAL.
+ */
+#define OP_SLL      000
+#define OP_SRL      002
+#define OP_SRA      003
+#define OP_SLLV     004
+#define OP_SRLV     006
+#define OP_SRAV     007
+
+#if __mips_isa_rev < 6
+#define OP_JR       010
+#endif
+
+#define OP_JALR     011
+#define OP_SYSCALL  014
+#define OP_BREAK    015
+#define OP_SYNC     017
+
+#if __mips_isa_rev < 6
+#define OP_MFHI     020
+#define OP_MTHI     021
+#define OP_MFLO     022
+#define OP_MTLO     023
+#else
+#define OP_CLZ      020
+#define OP_CLO      021
+#define OP_DCLZ     022
+#define OP_DCLO     023
+#endif
+
+#define OP_DSLLV    024
+#define OP_DSRLV    026
+#define OP_DSRAV    027
+
+#if __mips_isa_rev < 6
+#define OP_MULT     030
+#define OP_MULTU    031
+#define OP_DIV      032
+#define OP_DIVU     033
+#define OP_DMULT    034
+#define OP_DMULTU   035
+#define OP_DDIV     036
+#define OP_DDIVU    037
+#else
+#define OP_SOP30    030
+#define OP_SOP31    031
+#define OP_SOP32    032
+#define OP_SOP33    033
+#define OP_SOP34    034
+#define OP_SOP35    035
+#define OP_SOP36    036
+#define OP_SOP37    037
+#endif
+
+#define OP_ADD      040
+#define OP_ADDU     041
+#define OP_SUB      042
+#define OP_SUBU     043
+#define OP_AND      044
+#define OP_OR       045
+#define OP_XOR      046
+#define OP_NOR      047
+
+#define OP_SLT      052
+#define OP_SLTU     053
+#define OP_DADD     054
+#define OP_DADDU    055
+#define OP_DSUB     056
+#define OP_DSUBU    057
+
+#define OP_TGE      060
+#define OP_TGEU     061
+#define OP_TLT      062
+#define OP_TLTU     063
+#define OP_TEQ      064
+#define OP_TNE      066
+
+#define OP_DSLL     070
+#define OP_DSRL     072
+#define OP_DSRA     073
+#define OP_DSLL32   074
+#define OP_DSRL32   076
+#define OP_DSRA32   077
+
+#if __mips_isa_rev < 6
+/*
+ * Values for the 'func' field when 'op' == OP_SPECIAL2.
+ * OP_SPECIAL2 opcodes are removed in mips32r6
+ */
+#define OP_MAD      000     /* QED */
+#define OP_MADU     001     /* QED */
+#define OP_MUL      002     /* QED */
+#endif
+
+/*
+ * Values for the 'func' field when 'op' == OP_SPECIAL3.
+ */
+#define OP_EXT      000
+#define OP_DEXTM    001
+#define OP_DEXTU    002
+#define OP_DEXT     003
+#define OP_INS      004
+#define OP_DINSM    005
+#define OP_DINSU    006
+#define OP_DINS     007
+#define OP_BSHFL    040
+#define OP_RDHWR    073
+
+/*
+ * Values for the 'shamt' field when OP_SPECIAL3 && func OP_BSHFL.
+ */
+
+#define OP_WSBH     002
+#define OP_SEB      020
+#define OP_SEH      030
+
+#if __mips_isa_rev == 6
+/*
+ * Values for the 'shamt' field when OP_SOP30.
+ */
+#define OP_MUL      002
+#define OP_MUH      003
+#endif
+
+/*
+ * Values for the 'func' field when 'op' == OP_BCOND.
+ */
+#define OP_BLTZ     000
+#define OP_BGEZ     001
+
+#if __mips_isa_rev < 6
+#define OP_BLTZL    002
+#define OP_BGEZL    003
+#define OP_TGEI     010
+#define OP_TGEIU    011
+#define OP_TLTI     012
+#define OP_TLTIU    013
+#define OP_TEQI     014
+#define OP_TNEI     016
+#define OP_BLTZAL   020
+#define OP_BGEZAL   021
+#define OP_BLTZALL  022
+#define OP_BGEZALL  023
+#else
+#define OP_NAL      020
+#define OP_BAL      021
+#endif
+
+/*
+ * Values for the 'rs' field when 'op' == OP_COPz.
+ */
+#define OP_MF       000
+#define OP_DMF      001
+#define OP_MT       004
+#define OP_DMT      005
+#define OP_BCx      010
+#define OP_BCy      014
+#define OP_CF       002
+#define OP_CT       006
+
+/*
+ * Values for the 'rt' field when 'op' == OP_COPz.
+ */
+#define COPz_BC_TF_MASK 0x01
+#define COPz_BC_TRUE    0x01
+#define COPz_BC_FALSE   0x00
+#define COPz_BCL_TF_MASK    0x02
+#define COPz_BCL_TRUE   0x02
+#define COPz_BCL_FALSE  0x00
diff --git a/libpixelflinger/codeflinger/texturing.cpp b/libpixelflinger/codeflinger/texturing.cpp
new file mode 100644
index 0000000..e6997bd
--- /dev/null
+++ b/libpixelflinger/codeflinger/texturing.cpp
@@ -0,0 +1,1261 @@
+/* libs/pixelflinger/codeflinger/texturing.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "pixelflinger-code"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+
+#include <log/log.h>
+
+#include "GGLAssembler.h"
+
+namespace android {
+
+// ---------------------------------------------------------------------------
+
+// iterators are initialized like this:
+// (intToFixedCenter(x) * dx)>>16 + x0
+// ((x<<16 + 0x8000) * dx)>>16 + x0
+// ((x<<16)*dx + (0x8000*dx))>>16 + x0
+// ( (x*dx) + dx>>1 ) + x0
+// (x*dx) + (dx>>1 + x0)
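+// e.g. x == 0 yields (dx>>1) + x0: the iterator starts at the first
+// pixel's center rather than at its left edge.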
+
+void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x)
+{
+    context_t const* c = mBuilderContext.c;
+
+    if (mSmooth) {
+        // NOTE: we could take this path in the mDithering + !mSmooth case
+        // too, but it would use up to 4 more registers for the color
+        // components for only a little added quality.
+        // Currently, this causes the system to run out of registers in
+        // some cases (see issue #719496)
+
+        comment("compute initial iterated color (smooth and/or dither case)");
+
+        parts.iterated_packed = 0;
+        parts.packed = 0;
+
+        // 0x1: color component
+        // 0x2: iterators
+        const int optReload = mOptLevel >> 1;
+        if (optReload >= 3)         parts.reload = 0; // reload nothing
+        else if (optReload == 2)    parts.reload = 2; // reload iterators
+        else if (optReload == 1)    parts.reload = 1; // reload colors
+        else if (optReload <= 0)    parts.reload = 3; // reload both
+
+        if (!mSmooth) {
+            // we're not smoothing (just dithering), so we never have to
+            // reload the iterators
+            parts.reload &= ~2;
+        }
+
+        Scratch scratches(registerFile());
+        const int t0 = (parts.reload & 1) ? scratches.obtain() : 0;
+        const int t1 = (parts.reload & 2) ? scratches.obtain() : 0;
+        for (int i=0 ; i<4 ; i++) {
+            if (!mInfo[i].iterated)
+                continue;            
+            
+            // this component exists in the destination and is not replaced
+            // by a texture unit.
+            const int c = (parts.reload & 1) ? t0 : obtainReg();              
+            if (i==0) CONTEXT_LOAD(c, iterators.ydady);
+            if (i==1) CONTEXT_LOAD(c, iterators.ydrdy);
+            if (i==2) CONTEXT_LOAD(c, iterators.ydgdy);
+            if (i==3) CONTEXT_LOAD(c, iterators.ydbdy);
+            parts.argb[i].reg = c;
+
+            if (mInfo[i].smooth) {
+                parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg();
+                const int dvdx = parts.argb_dx[i].reg;
+                CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx);
+                MLA(AL, 0, c, x.reg, dvdx, c);
+                
+                // adjust the color iterator to make sure it won't overflow
+                if (!mAA) {
+                    // this is not needed when we're using anti-aliasing
+                    // because we will (have to) clamp the components
+                    // anyway.
+                    int end = scratches.obtain();
+                    MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16));
+                    MLA(AL, 1, end, dvdx, end, c);
+                    SUB(MI, 0, c, c, end);
+                    BIC(AL, 0, c, c, reg_imm(c, ASR, 31)); 
+                    scratches.recycle(end);
+                }
+            }
+            
+            if (parts.reload & 1) {
+                CONTEXT_STORE(c, generated_vars.argb[i].c);
+            }
+        }
+    } else {
+        // We're not smoothing, so we can just use a packed version
+        // of the color and extract the components as needed
+        // (or not at all if we don't blend)
+
+        // figure out if we need the iterated color
+        int load = 0;
+        for (int i=0 ; i<4 ; i++) {
+            component_info_t& info = mInfo[i];
+            if ((info.inDest || info.needed) && !info.replaced)
+                load |= 1;
+        }
+        
+        parts.iterated_packed = 1;
+        parts.packed = (!mTextureMachine.mask && !mBlending
+                && !mFog && !mDithering);
+        parts.reload = 0;
+        if (load || parts.packed) {
+            if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) {
+                comment("load initial iterated color (8888 packed)");
+                parts.iterated.setTo(obtainReg(),
+                        &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888]));
+                CONTEXT_LOAD(parts.iterated.reg, packed8888);
+            } else {
+                comment("load initial iterated color (dest format packed)");
+
+                parts.iterated.setTo(obtainReg(), &mCbFormat);
+
+                // pre-mask the iterated color
+                const int bits = parts.iterated.size();
+                const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
+                uint32_t mask = 0;
+                if (mMasking) {
+                    for (int i=0 ; i<4 ; i++) {
+                        const int component_mask = 1<<i;
+                        const int h = parts.iterated.format.c[i].h;
+                        const int l = parts.iterated.format.c[i].l;
+                        if (h && (!(mMasking & component_mask))) {
+                            mask |= ((1<<(h-l))-1) << l;
+                        }
+                    }
+                }
+
+                if (mMasking && ((mask & size)==0)) {
+                    // none of the components are present in the mask
+                } else {
+                    CONTEXT_LOAD(parts.iterated.reg, packed);
+                    if (mCbFormat.size == 1) {
+                        AND(AL, 0, parts.iterated.reg,
+                                parts.iterated.reg, imm(0xFF));
+                    } else if (mCbFormat.size == 2) {
+                        MOV(AL, 0, parts.iterated.reg,
+                                reg_imm(parts.iterated.reg, LSR, 16));
+                    }
+                }
+
+                // pre-mask the iterated color
+                if (mMasking) {
+                    build_and_immediate(parts.iterated.reg, parts.iterated.reg,
+                            mask, bits);
+                }
+            }
+        }
+    }
+}
+
+void GGLAssembler::build_iterated_color(
+        component_t& fragment,
+        const fragment_parts_t& parts,
+        int component,
+        Scratch& regs)
+{
+    fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE); 
+
+    if (!mInfo[component].iterated)
+        return;
+
+    if (parts.iterated_packed) {
+        // iterated colors are packed, extract the one we need
+        extract(fragment, parts.iterated, component);
+    } else {
+        fragment.h = GGL_COLOR_BITS;
+        fragment.l = GGL_COLOR_BITS - 8;
+        fragment.flags |= CLEAR_LO;
+        // iterated colors are held in their own register,
+        // (smooth and/or dithering case)
+        if (parts.reload==3) {
+            // this implies mSmooth
+            Scratch scratches(registerFile());
+            int dx = scratches.obtain();
+            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
+            CONTEXT_LOAD(dx, generated_vars.argb[component].dx);
+            ADD(AL, 0, dx, fragment.reg, dx);
+            CONTEXT_STORE(dx, generated_vars.argb[component].c);
+        } else if (parts.reload & 1) {
+            CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c);
+        } else {
+            // we don't reload, so simply rename the register and mark as
+            // non CORRUPTIBLE so that the texture env or blending code
+            // won't modify this (renamed) register
+            regs.recycle(fragment.reg);
+            fragment.reg = parts.argb[component].reg;
+            fragment.flags &= ~CORRUPTIBLE;
+        }
+        if (mInfo[component].smooth && mAA) {
+            // when using smooth shading AND anti-aliasing, we need to clamp
+            // the iterators because there is always an extra pixel on the
+            // edges, which most of the time will cause an overflow
+            // (since technically it's outside of the domain).
+            BIC(AL, 0, fragment.reg, fragment.reg,
+                    reg_imm(fragment.reg, ASR, 31));
+            component_sat(fragment);
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs)
+{
+    // gather some information about the components we need to process...
+    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
+    switch(opcode) {
+    case GGL_COPY:
+        mLogicOp = 0;
+        break;
+    case GGL_CLEAR:
+    case GGL_SET:
+        mLogicOp = LOGIC_OP;
+        break;
+    case GGL_AND:
+    case GGL_AND_REVERSE:
+    case GGL_AND_INVERTED:
+    case GGL_XOR:
+    case GGL_OR:
+    case GGL_NOR:
+    case GGL_EQUIV:
+    case GGL_OR_REVERSE:
+    case GGL_OR_INVERTED:
+    case GGL_NAND:
+        mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST;
+        break;
+    case GGL_NOOP:
+    case GGL_INVERT:
+        mLogicOp = LOGIC_OP|LOGIC_OP_DST;
+        break;        
+    case GGL_COPY_INVERTED:
+        mLogicOp = LOGIC_OP|LOGIC_OP_SRC;
+        break;
+    }
+}
+
+void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c)
+{
+    uint8_t replaced=0;
+    mTextureMachine.mask = 0;
+    mTextureMachine.activeUnits = 0;
+    for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (replaced == 0xF) {
+            // all components are replaced, skip this TMU.
+            tmu.format_idx = 0;
+            tmu.mask = 0;
+            tmu.replaced = replaced;
+            continue;
+        }
+        tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]);
+        tmu.format = c->formats[tmu.format_idx];
+        tmu.bits = tmu.format.size*8;
+        tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]);
+        tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]);
+        tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i]));
+        tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]);
+        tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i])
+                && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now
+
+        // 5551 linear filtering is not supported
+        if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551)
+            tmu.linear = 0;
+        
+        tmu.mask = 0;
+        tmu.replaced = replaced;
+
+        if (tmu.format_idx) {
+            mTextureMachine.activeUnits++;
+            if (tmu.format.c[0].h)    tmu.mask |= 0x1;
+            if (tmu.format.c[1].h)    tmu.mask |= 0x2;
+            if (tmu.format.c[2].h)    tmu.mask |= 0x4;
+            if (tmu.format.c[3].h)    tmu.mask |= 0x8;
+            if (tmu.env == GGL_REPLACE) {
+                replaced |= tmu.mask;
+            } else if (tmu.env == GGL_DECAL) {
+                if (!tmu.format.c[GGLFormat::ALPHA].h) {
+                    // if we don't have alpha, decal does nothing
+                    tmu.mask = 0;
+                } else {
+                    // decal always ignores At
+                    tmu.mask &= ~(1<<GGLFormat::ALPHA);
+                }
+            }
+        }
+        mTextureMachine.mask |= tmu.mask;
+        //printf("%d: mask=%08lx, replaced=%08lx\n",
+        //    i, int(tmu.mask), int(tmu.replaced));
+    }
+    mTextureMachine.replaced = replaced;
+    mTextureMachine.directTexture = 0;
+    //printf("replaced=%08lx\n", mTextureMachine.replaced);
+}
+
+
+void GGLAssembler::init_textures(
+        tex_coord_t* coords,
+        const reg_t& x, const reg_t& y)
+{
+    const needs_t& needs = mBuilderContext.needs;
+    int Rx = x.reg;
+    int Ry = y.reg;
+
+    if (mTextureMachine.mask) {
+        comment("compute texture coordinates");
+    }
+
+    // init texture coordinates for each tmu
+    const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n);
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11)) 
+        {
+            // 1:1 texture
+            pointer_t& txPtr = coords[i].ptr;
+            txPtr.setTo(obtainReg(), tmu.bits);
+            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy);
+            ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16));    // x += (s>>16)
+            CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy);
+            ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16));    // y += (t>>16)
+            // merge base & offset
+            CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride);
+            SMLABB(AL, Rx, Ry, txPtr.reg, Rx);               // x+y*stride
+            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
+            base_offset(txPtr, txPtr, Rx);
+        } else {
+            Scratch scratches(registerFile());
+            reg_t& s = coords[i].s;
+            reg_t& t = coords[i].t;
+            // s = (x * dsdx)>>16 + ydsdy
+            // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0
+            // t = (x * dtdx)>>16 + ydtdy
+            // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0
+            s.setTo(obtainReg());
+            t.setTo(obtainReg());
+            const int need_w = GGL_READ_NEEDS(W, needs.n);
+            if (need_w) {
+                CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy);
+                CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy);
+            } else {
+                int ydsdy = scratches.obtain();
+                int ydtdy = scratches.obtain();
+                CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx);
+                CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy);
+                CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx);
+                CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy);
+                MLA(AL, 0, s.reg, Rx, s.reg, ydsdy);
+                MLA(AL, 0, t.reg, Rx, t.reg, ydtdy);
+            }
+            
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
+                recycleReg(s.reg);
+                recycleReg(t.reg);
+            }
+        }
+
+        // direct texture?
+        if (!multiTexture && !mBlending && !mDithering && !mFog && 
+            cb_format_idx == tmu.format_idx && !tmu.linear &&
+            mTextureMachine.replaced == tmu.mask) 
+        {
+            mTextureMachine.directTexture = i + 1;
+        }
+    }
+}
+
+void GGLAssembler::build_textures(  fragment_parts_t& parts,
+                                    Scratch& regs)
+{
+    // We don't have a way to spill registers automatically, so we
+    // spill the depth and AA regs ourselves when we know we may have to.
+    // Build the spill list...
+    uint32_t spill_list = 0;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+        if (tmu.linear) {
+            // we may run out of registers if we have linear filtering
+            // at 1 or 4 bytes / pixel on any texture unit.
+            if (tmu.format.size == 1) {
+                // if depth and AA are enabled, we'll be one register short
+                if (parts.z.reg > 0 && parts.covPtr.reg > 0)
+                    spill_list |= 1<<parts.covPtr.reg;
+            }
+            if (tmu.format.size == 4) {
+                // if depth or AA is enabled, we'll be one or two registers short
+                if (parts.z.reg > 0)
+                    spill_list |= 1<<parts.z.reg;
+                if (parts.covPtr.reg > 0)   
+                    spill_list |= 1<<parts.covPtr.reg;
+            }
+        }
+    }
+
+    Spill spill(registerFile(), *this, spill_list);
+
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+
+        pointer_t& txPtr = parts.coords[i].ptr;
+        pixel_t& texel = parts.texel[i];
+
+        // repeat...
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        { // 1:1 textures
+            comment("fetch texel");
+            texel.setTo(regs.obtain(), &tmu.format);
+            load(txPtr, texel, WRITE_BACK);
+        } else {
+            Scratch scratches(registerFile());
+            reg_t& s = parts.coords[i].s;
+            reg_t& t = parts.coords[i].t;
+            if ((mOptLevel&1)==0) {
+                comment("reload s/t (multitexture or linear filtering)");
+                s.reg = scratches.obtain();
+                t.reg = scratches.obtain();
+                CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]);
+            }
+
+            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                return;
+
+            comment("compute repeat/clamp");
+            int u       = scratches.obtain();
+            int v       = scratches.obtain();
+            int width   = scratches.obtain();
+            int height  = scratches.obtain();
+            int U = 0;
+            int V = 0;
+
+            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                return;
+
+            CONTEXT_LOAD(width,  generated_vars.texture[i].width);
+            CONTEXT_LOAD(height, generated_vars.texture[i].height);
+
+            int FRAC_BITS = 0;
+            if (tmu.linear) {
+                // linear interpolation
+                if (tmu.format.size == 1) {
+                    // for 8-bit textures, we can afford
+                    // 7 bits of fractional precision at no
+                    // additional cost (we can't do 8 bits
+                    // because filter8 uses signed 16-bit muls)
+                    FRAC_BITS = 7;
+                } else if (tmu.format.size == 2) {
+                    // filter16() is internally limited to 4 bits, so:
+                    // FRAC_BITS=2 generates fewer instructions,
+                    // FRAC_BITS=3,4,5 creates unpleasant artifacts,
+                    // FRAC_BITS=6+ looks good
+                    FRAC_BITS = 6;
+                } else if (tmu.format.size == 4) {
+                    // filter32() is internally limited to 8 bits, so:
+                    // FRAC_BITS=4 looks good
+                    // FRAC_BITS=5+ looks better, but generates 3 extra instructions per pixel
+                    FRAC_BITS = 6;
+                } else {
+                    // for all other cases we use 4 bits.
+                    FRAC_BITS = 4;
+                }
+            }
+            wrapping(u, s.reg, width,  tmu.swrap, FRAC_BITS);
+            wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS);
+
+            if (tmu.linear) {
+                comment("compute linear filtering offsets");
+                // pixel size scale
+                const int shift = 31 - gglClz(tmu.format.size);
+                U = scratches.obtain();
+                V = scratches.obtain();
+
+                if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                    return;
+
+                // sample the texel center
+                SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1)));
+                SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1)));
+
+                // get the fractional part of U,V
+                AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1));
+                AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1));
+
+                // compute width-1 and height-1
+                SUB(AL, 0, width,  width,  imm(1));
+                SUB(AL, 0, height, height, imm(1));
+
+                // get the integer part of U,V and clamp/wrap
+                // and compute offset to the next texel
+                if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) {
+                    // u has already been REPEATed
+                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(MI, 0, u, width);                    
+                    CMP(AL, u, width);
+                    MOV(LT, 0, width, imm(1 << shift));
+                    if (shift)
+                        MOV(GE, 0, width, reg_imm(width, LSL, shift));
+                    RSB(GE, 0, width, width, imm(0));
+                } else {
+                    // u has not been CLAMPed yet
+                    // algorithm:
+                    // if ((u>>4) >= width)
+                    //      u = width<<4
+                    //      width = 0
+                    // else
+                    //      width = 1<<shift
+                    // u = u>>4; // get integer part
+                    // if (u<0)
+                    //      u = 0
+                    //      width = 0
+                    // generated_vars.rt = width
+                    
+                    CMP(AL, width, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS));
+                    MOV(LE, 0, width, imm(0));
+                    MOV(GT, 0, width, imm(1 << shift));
+                    MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS));
+                    MOV(MI, 0, u, imm(0));
+                    MOV(MI, 0, width, imm(0));
+                }
+                CONTEXT_STORE(width, generated_vars.rt);
+
+                const int stride = width;
+                CONTEXT_LOAD(stride, generated_vars.texture[i].stride);
+                if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) {
+                    // v has already been REPEATed
+                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(MI, 0, v, height);
+                    CMP(AL, v, height);
+                    MOV(LT, 0, height, imm(1 << shift));
+                    if (shift)
+                        MOV(GE, 0, height, reg_imm(height, LSL, shift));
+                    RSB(GE, 0, height, height, imm(0));
+                    MUL(AL, 0, height, stride, height);
+                } else {
+                    // v has not been CLAMPed yet
+                    CMP(AL, height, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS));
+                    MOV(LE, 0, height, imm(0));
+                    if (shift) {
+                        MOV(GT, 0, height, reg_imm(stride, LSL, shift));
+                    } else {
+                        MOV(GT, 0, height, stride);
+                    }
+                    MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS));
+                    MOV(MI, 0, v, imm(0));
+                    MOV(MI, 0, height, imm(0));
+                }
+                CONTEXT_STORE(height, generated_vars.lb);
+            }
+    
+            scratches.recycle(width);
+            scratches.recycle(height);
+
+            // iterate texture coordinates...
+            comment("iterate s,t");
+            int dsdx = scratches.obtain();
+            int dtdx = scratches.obtain();
+
+            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                return;
+
+            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
+            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
+            ADD(AL, 0, s.reg, s.reg, dsdx);
+            ADD(AL, 0, t.reg, t.reg, dtdx);
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]);
+                scratches.recycle(s.reg);
+                scratches.recycle(t.reg);
+            }
+            scratches.recycle(dsdx);
+            scratches.recycle(dtdx);
+
+            // merge base & offset...
+            comment("merge base & offset");
+            texel.setTo(regs.obtain(), &tmu.format);
+            txPtr.setTo(texel.reg, tmu.bits);
+            int stride = scratches.obtain();
+
+            if (registerFile().status() & RegisterFile::OUT_OF_REGISTERS)
+                return;
+
+            CONTEXT_LOAD(stride,    generated_vars.texture[i].stride);
+            CONTEXT_ADDR_LOAD(txPtr.reg, generated_vars.texture[i].data);
+            SMLABB(AL, u, v, stride, u);    // u+v*stride 
+            base_offset(txPtr, txPtr, u);
+
+            // load texel
+            if (!tmu.linear) {
+                comment("fetch texel");
+                load(txPtr, texel, 0);
+            } else {
+                // recycle registers we don't need anymore
+                scratches.recycle(u);
+                scratches.recycle(v);
+                scratches.recycle(stride);
+
+                comment("fetch texel, bilinear");
+                switch (tmu.format.size) {
+                case 1:  filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break;
+                }
+            }            
+        }
+    }
+}
+
+void GGLAssembler::build_iterate_texture_coordinates(
+    const fragment_parts_t& parts)
+{
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) {
+        const texture_unit_t& tmu = mTextureMachine.tmu[i];
+        if (tmu.format_idx == 0)
+            continue;
+
+        if ((tmu.swrap == GGL_NEEDS_WRAP_11) &&
+            (tmu.twrap == GGL_NEEDS_WRAP_11))
+        { // 1:1 textures
+            const pointer_t& txPtr = parts.coords[i].ptr;
+            ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3));
+        } else {
+            Scratch scratches(registerFile());
+            int s = parts.coords[i].s.reg;
+            int t = parts.coords[i].t.reg;
+            if ((mOptLevel&1)==0) {
+                s = scratches.obtain();
+                t = scratches.obtain();
+                CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]);
+                CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]);
+            }
+            int dsdx = scratches.obtain();
+            int dtdx = scratches.obtain();
+            CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx);
+            CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx);
+            ADD(AL, 0, s, s, dsdx);
+            ADD(AL, 0, t, t, dtdx);
+            if ((mOptLevel&1)==0) {
+                CONTEXT_STORE(s, generated_vars.texture[i].spill[0]);
+                CONTEXT_STORE(t, generated_vars.texture[i].spill[1]);
+            }
+        }
+    }
+}
+
+void GGLAssembler::filter8(
+        const fragment_parts_t& /*parts*/,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    if (tmu.format.components != GGL_ALPHA &&
+        tmu.format.components != GGL_LUMINANCE)
+    {
+        // this is a packed format, and we don't support
+        // linear filtering (it's probably RGB 332)
+        // Should not happen with OpenGL|ES
+        LDRB(AL, texel.reg, txPtr.reg);
+        return;
+    }
+
+    // ------------------------
+    // about 22 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int d    = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+    int rt   = scratches.obtain();
+    int lb   = scratches.obtain();
+
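+    // Bilinear weights: with F = 1<<FRAC_BITS, the four sample weights
+    // U*V, (F-U)*V, (F-U)*(F-V) and U*(F-V) sum to F*F; 'k' tracks the
+    // weight left for the final (RT) sample, and the result carries
+    // FRAC_BITS*2 fractional bits, recorded in texel.format below.
+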
+    // RB -> U * V
+
+    CONTEXT_LOAD(rt, generated_vars.rt);
+    CONTEXT_LOAD(lb, generated_vars.lb);
+
+    int offset = pixel;
+    ADD(AL, 0, offset, lb, rt);
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    SMULBB(AL, d, pixel, u);
+    RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2)));
+    
+    // LB -> (1-U) * V
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb));
+    SMULBB(AL, u, U, V);
+    SMLABB(AL, d, pixel, u, d);
+    SUB(AL, 0, k, k, u);
+    
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDRB(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    SMLABB(AL, d, pixel, u, d);
+
+    // RT -> U*(1-V)
+    LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt));
+    SUB(AL, 0, u, k, u);
+    SMLABB(AL, texel.reg, pixel, u, d);
+    
+    for (int i=0 ; i<4 ; i++) {
+        if (!texel.format.c[i].h) continue;
+        texel.format.c[i].h = FRAC_BITS*2+8;
+        texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits is enough
+    }
+    texel.format.size = 4;
+    texel.format.bitsPerPixel = 32;
+    texel.flags |= CLEAR_LO;
+}
+
+void GGLAssembler::filter16(
+        const fragment_parts_t& /*parts*/,
+        pixel_t& texel, const texture_unit_t& tmu,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{    
+    // compute the mask
+    // XXX: it would be nice if the mask below could be computed
+    // automatically.
+    uint32_t mask = 0;
+    int shift = 0;
+    int prec = 0;
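+
+    // The trick below: OR the 16-bit pixel with itself shifted left by
+    // 'shift', then AND with 'mask', so each component lands in its own
+    // zero-padded field of a single 32-bit word; one MUL by a prec-bit
+    // weight then scales all components at once without carries between
+    // components.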
+    switch (tmu.format_idx) {
+        case GGL_PIXEL_FORMAT_RGB_565:
+            // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb
+            // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb
+            mask = 0x07E0F81F;
+            shift = 16;
+            prec = 5;
+            break;
+        case GGL_PIXEL_FORMAT_RGBA_4444:
+            // 0000,1111,0000,1111 | 0000,1111,0000,1111
+            mask = 0x0F0F0F0F;
+            shift = 12;
+            prec = 4;
+            break;
+        case GGL_PIXEL_FORMAT_LA_88:
+            // 0000,0000,1111,1111 | 0000,0000,1111,1111
+            // AALL -> 00AA | 00LL
+            mask = 0x00FF00FF;
+            shift = 8;
+            prec = 8;
+            break;
+        default:
+            // unsupported format, do something sensible...
+            ALOGE("Unsupported 16-bit texture format (%d)", tmu.format_idx);
+            LDRH(AL, texel.reg, txPtr.reg);
+            return;
+    }
+
+    const int adjust = FRAC_BITS*2 - prec;
+    const int round  = 0;
+
+    // update the texel format
+    texel.format.size = 4;
+    texel.format.bitsPerPixel = 32;
+    texel.flags |= CLEAR_HI|CLEAR_LO;
+    for (int i=0 ; i<4 ; i++) {
+        if (!texel.format.c[i].h) continue;
+        const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift;
+        texel.format.c[i].h = tmu.format.c[i].h + offset + prec;
+        texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec);
+    }
+
+    // ------------------------
+    // about 40 cycles / pixel
+    Scratch scratches(registerFile());
+
+    int pixel= scratches.obtain();
+    int d    = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+
+    // RB -> U * V
+    int offset = pixel;
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    CONTEXT_LOAD(u, generated_vars.lb);
+    ADD(AL, 0, offset, offset, u);
+
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MUL(AL, 0, d, pixel, u);
+    RSB(AL, 0, k, u, imm(1<<prec));
+    
+    // LB -> (1-U) * V
+    CONTEXT_LOAD(offset, generated_vars.lb);
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, d, pixel, u, d);
+    SUB(AL, 0, k, k, u);
+    
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDRH(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, d, pixel, u, d);
+
+    // RT -> U*(1-V)            
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    LDRH(AL, pixel, txPtr.reg, reg_pre(offset));
+    SUB(AL, 0, u, k, u);
+    ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift));
+    build_and_immediate(pixel, pixel, mask, 32);
+    MLA(AL, 0, texel.reg, pixel, u, d);
+}
+
+void GGLAssembler::filter24(
+        const fragment_parts_t& /*parts*/,
+        pixel_t& texel, const texture_unit_t& /*tmu*/,
+        int /*U*/, int /*V*/, pointer_t& txPtr,
+        int /*FRAC_BITS*/)
+{
+    // not supported yet (currently disabled)
+    load(txPtr, texel, 0);
+}
+
+void GGLAssembler::filter32(
+        const fragment_parts_t& /*parts*/,
+        pixel_t& texel, const texture_unit_t& /*tmu*/,
+        int U, int V, pointer_t& txPtr,
+        int FRAC_BITS)
+{
+    const int adjust = FRAC_BITS*2 - 8;
+    const int round  = 0;
+
+    // ------------------------
+    // about 38 cycles / pixel
+    Scratch scratches(registerFile());
+    
+    int pixel= scratches.obtain();
+    int dh   = scratches.obtain();
+    int u    = scratches.obtain();
+    int k    = scratches.obtain();
+
+    int temp = scratches.obtain();
+    int dl   = scratches.obtain();
+    int mask = scratches.obtain();
+
+    MOV(AL, 0, mask, imm(0xFF));
+    ORR(AL, 0, mask, mask, imm(0xFF0000));
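+    // mask = 0x00FF00FF: the 8888 pixel is split into two words of
+    // alternating components (pixel & mask and (pixel>>8) & mask) so each
+    // MUL scales two components at once without carries between them.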
+
+    // RB -> U * V
+    int offset = pixel;
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    CONTEXT_LOAD(u, generated_vars.lb);
+    ADD(AL, 0, offset, offset, u);
+
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MUL(AL, 0, dh, temp, u);
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MUL(AL, 0, dl, temp, u);
+    RSB(AL, 0, k, u, imm(0x100));
+
+    // LB -> (1-U) * V
+    CONTEXT_LOAD(offset, generated_vars.lb);
+    RSB(AL, 0, U, U, imm(1<<FRAC_BITS));
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+    SUB(AL, 0, k, k, u);
+
+    // LT -> (1-U)*(1-V)
+    RSB(AL, 0, V, V, imm(1<<FRAC_BITS));
+    LDR(AL, pixel, txPtr.reg);
+    SMULBB(AL, u, U, V);
+    AND(AL, 0, temp, mask, pixel);
+    if (adjust) {
+        if (round)
+            ADD(AL, 0, u, u, imm(1<<(adjust-1)));
+        MOV(AL, 0, u, reg_imm(u, LSR, adjust));
+    }
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+
+    // RT -> U*(1-V)            
+    CONTEXT_LOAD(offset, generated_vars.rt);
+    LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset));
+    SUB(AL, 0, u, k, u);
+    AND(AL, 0, temp, mask, pixel);
+    MLA(AL, 0, dh, temp, u, dh);    
+    AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8));
+    MLA(AL, 0, dl, temp, u, dl);
+
+    AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8));
+    AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8));
+    ORR(AL, 0, texel.reg, dh, dl);
+}
+
+void GGLAssembler::build_texture_environment(
+        component_t& fragment,
+        const fragment_parts_t& parts,
+        int component,
+        Scratch& regs)
+{
+    const uint32_t component_mask = 1<<component;
+    const bool multiTexture = mTextureMachine.activeUnits > 1;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
+        texture_unit_t& tmu = mTextureMachine.tmu[i];
+
+        if (tmu.mask & component_mask) {
+            // replace or modulate with this texture
+            if ((tmu.replaced & component_mask) == 0) {
+                // not replaced by a later tmu...
+
+                Scratch scratches(registerFile());
+                pixel_t texel(parts.texel[i]);
+
+                if (multiTexture && 
+                    tmu.swrap == GGL_NEEDS_WRAP_11 &&
+                    tmu.twrap == GGL_NEEDS_WRAP_11)
+                {
+                    texel.reg = scratches.obtain();
+                    texel.flags |= CORRUPTIBLE;
+                    comment("fetch texel (multitexture 1:1)");
+                    load(parts.coords[i].ptr, texel, WRITE_BACK);
+                }
+
+                component_t incoming(fragment);
+                modify(fragment, regs);
+                
+                switch (tmu.env) {
+                case GGL_REPLACE:
+                    extract(fragment, texel, component);
+                    break;
+                case GGL_MODULATE:
+                    modulate(fragment, incoming, texel, component);
+                    break;
+                case GGL_DECAL:
+                    decal(fragment, incoming, texel, component);
+                    break;
+                case GGL_BLEND:
+                    blend(fragment, incoming, texel, component, i);
+                    break;
+                case GGL_ADD:
+                    add(fragment, incoming, texel, component);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::wrapping(
+            int d,
+            int coord, int size,
+            int tx_wrap, int tx_linear)
+{
+    // notes:
+    // if tx_linear is set, we need 4 extra bits of precision on the result
+    // SMULL/UMULL is 3 cycles
+    Scratch scratches(registerFile());
+    int c = coord;
+    if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) {
+        // UMULL takes 4 cycles (interlocked), and we can get away with
+        // 2 cycles using SMULWB, but we're losing 16 bits of precision
+        // out of 32 (this is not a problem because the iterator keeps
+        // its full precision)
+        // UMULL(AL, 0, size, d, c, size);
+        // note: we can't use SMULTB because it's signed.
+        MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear));
+        SMULWB(AL, d, d, size);
+    } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) {
+        if (tx_linear) {
+            // 1 cycle
+            MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear));
+        } else {
+            // 4 cycles (common case)
+            MOV(AL, 0, d, reg_imm(coord, ASR, 16));
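+            // BIC with (d ASR 31) computes d & ~(d>>31): negative values clamp to 0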
+            BIC(AL, 0, d, d, reg_imm(d, ASR, 31));
+            CMP(AL, d, size);
+            SUB(GE, 0, d, size, imm(1));
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+
+void GGLAssembler::modulate(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    Scratch locals(registerFile());
+    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);            
+    extract(texel, incomingTexel, component);
+
+    const int Nt = texel.size();
+        // Nt should always be less than 10 bits because it comes
+        // from the TMU.
+
+    int Ni = incoming.size();
+        // Ni could be big because it comes from previous MODULATEs
+
+    if (Nt == 1) {
+        // texel acts as a bit-mask
+        // dest = incoming & ((texel << incoming.h)-texel)
+        RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h));
+        AND(AL, 0, dest.reg, dest.reg, incoming.reg);
+        dest.l = incoming.l;
+        dest.h = incoming.h;
+        dest.flags |= (incoming.flags & CLEAR_LO);
+    } else if (Ni == 1) {
+        MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h));
+        AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31));
+        dest.l = 0;
+        dest.h = Nt;
+    } else {
+        int inReg = incoming.reg;
+        int shift = incoming.l;
+        if ((Nt + Ni) > 32) {
+            // we will overflow, reduce the precision of Ni to 8 bits
+            // (Note: Nt cannot be more than 10 bits, which happens with
+            // 565 textures and GGL_LINEAR)
+            shift += Ni-8;
+            Ni = 8;
+        }
+
+        // modulate by the component with the lowest precision
+        if (Nt >= Ni) {
+            if (shift) {
+                // XXX: we should be able to avoid this shift
+                // when shift==16 && Nt<16 && Ni<16, in which case
+                // we could use SMULBT below.
+                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
+                inReg = dest.reg;
+                shift = 0;
+            }
+            // operation:           (Cf*Ct)/((1<<Ni)-1)
+            // approximated with:   Ct*(Cf + Cf>>(Ni-1))>>Ni
+            // this operation doesn't change texel's size
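+            // sanity check: a full-scale incoming value ((1<<Ni)-1) makes
+            // the adjusted factor exactly 1<<Ni, so the texel passes
+            // through unchanged (dest.l = Ni absorbs the >>Ni).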
+            ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1));
+            if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg);
+            else                MUL(AL, 0, dest.reg, texel.reg, dest.reg);
+            dest.l = Ni;
+            dest.h = Nt + Ni;            
+        } else {
+            if (shift && (shift != 16)) {
+                // if shift==16, we can use 16-bit mul instructions later
+                MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift));
+                inReg = dest.reg;
+                shift = 0;
+            }
+            // operation:           (Cf*Ct)/((1<<Nt)-1)
+            // approximated with:   Cf*(Ct + Ct>>(Nt-1))>>Nt
+            // this operation doesn't change incoming's size
+            Scratch scratches(registerFile());
+            int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg;
+            if (t == inReg)
+                t = scratches.obtain();
+            ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1));
+            if (Nt<16 && Ni<16) {
+                if (shift==16)  SMULBT(AL, dest.reg, t, inReg);
+                else            SMULBB(AL, dest.reg, t, inReg);
+            } else              MUL(AL, 0, dest.reg, t, inReg);
+            dest.l = Nt;
+            dest.h = Nt + Ni;
+        }
+
+        // low bits are not valid
+        dest.flags |= CLEAR_LO;
+
+        // no need to keep more than 8 bits/component
+        if (dest.size() > 8)
+            dest.l = dest.h-8;
+    }
+}
+
+void GGLAssembler::decal(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    // RGBA:
+    // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At
+    // Av = Af
+    Scratch locals(registerFile());
+    integer_t texel(locals.obtain(), 32, CORRUPTIBLE);            
+    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
+    extract(texel, incomingTexel, component);
+    extract(factor, incomingTexel, GGLFormat::ALPHA);
+
+    // no need to keep more than 8 bits for decal
+    int Ni = incoming.size();
+    int shift = incoming.l;
+    if (Ni > 8) {
+        shift += Ni-8;
+        Ni = 8;
+    }
+    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
+    if (shift) {
+        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
+        incomingNorm.reg = dest.reg;
+        incomingNorm.flags |= CORRUPTIBLE;
+    }
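+    // normalize the blend factor: adding factor>>(s-1) maps the
+    // full-scale value (1<<s)-1 to exactly 1<<s, i.e. 1.0 in fixed point.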
+    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
+    build_blendOneMinusFF(dest, factor, incomingNorm, texel);
+}
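+
+// Scalar sketch of the decal equation (illustrative only), with 8-bit
+// components and the same 255 -> 256 alpha normalization used above:
+//
+//   static inline uint32_t decal8(uint32_t cf, uint32_t ct, uint32_t at) {
+//       int32_t a = (int32_t)(at + (at >> 7));            // 0..256
+//       return cf + ((((int32_t)ct - (int32_t)cf) * a) >> 8);
+//   }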
+
+void GGLAssembler::blend(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component, int tmu)
+{
+    // RGBA:
+    // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct
+    // Av = At*Af
+
+    if (component == GGLFormat::ALPHA) {
+        modulate(dest, incoming, incomingTexel, component);
+        return;
+    }
+    
+    Scratch locals(registerFile());
+    integer_t color(locals.obtain(), 8, CORRUPTIBLE);            
+    integer_t factor(locals.obtain(), 32, CORRUPTIBLE);
+    LDRB(AL, color.reg, mBuilderContext.Rctx,
+            immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component])));
+    extract(factor, incomingTexel, component);
+
+    // no need to keep more than 8 bits for blend
+    int Ni = incoming.size();
+    int shift = incoming.l;
+    if (Ni > 8) {
+        shift += Ni-8;
+        Ni = 8;
+    }
+    integer_t incomingNorm(incoming.reg, Ni, incoming.flags);
+    if (shift) {
+        MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift));
+        incomingNorm.reg = dest.reg;
+        incomingNorm.flags |= CORRUPTIBLE;
+    }
+    ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1));
+    build_blendOneMinusFF(dest, factor, incomingNorm, color);
+}
+
+void GGLAssembler::add(
+        component_t& dest, 
+        const component_t& incoming,
+        const pixel_t& incomingTexel, int component)
+{
+    // RGBA:
+    // Cv = Cf + Ct;
+    Scratch locals(registerFile());
+    
+    component_t incomingTemp(incoming);
+
+    // use "dest" as a temporary for extracting the texel, unless "dest"
+    // overlaps "incoming".
+    integer_t texel(dest.reg, 32, CORRUPTIBLE);
+    if (dest.reg == incomingTemp.reg)
+        texel.reg = locals.obtain();
+    extract(texel, incomingTexel, component);
+
+    if (texel.s < incomingTemp.size()) {
+        expand(texel, texel, incomingTemp.size());
+    } else if (texel.s > incomingTemp.size()) {
+        if (incomingTemp.flags & CORRUPTIBLE) {
+            expand(incomingTemp, incomingTemp, texel.s);
+        } else {
+            incomingTemp.reg = locals.obtain();
+            expand(incomingTemp, incoming, texel.s);
+        }
+    }
+
+    if (incomingTemp.l) {
+        ADD(AL, 0, dest.reg, texel.reg,
+                reg_imm(incomingTemp.reg, LSR, incomingTemp.l));
+    } else {
+        ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg);
+    }
+    dest.l = 0;
+    dest.h = texel.size();
+    component_sat(dest);
+}
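+
+// Scalar sketch (illustrative only): GGL_ADD is a plain per-component sum
+// followed by the clamp that component_sat() performs:
+//
+//   static inline uint32_t add8_sat(uint32_t cf, uint32_t ct) {
+//       uint32_t v = cf + ct;
+//       return v > 255 ? 255 : v;
+//   }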
+
+// ----------------------------------------------------------------------------
+
+}; // namespace android
+
diff --git a/libpixelflinger/codeflinger/tinyutils/smartpointer.h b/libpixelflinger/codeflinger/tinyutils/smartpointer.h
new file mode 100644
index 0000000..23a5f7e
--- /dev/null
+++ b/libpixelflinger/codeflinger/tinyutils/smartpointer.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright 2005 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_PIXELFLINGER_SMART_POINTER_H
+#define ANDROID_PIXELFLINGER_SMART_POINTER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+#include <stdlib.h>
+
+// ---------------------------------------------------------------------------
+namespace android {
+namespace tinyutils {
+
+// ---------------------------------------------------------------------------
+
+#define COMPARE(_op_)                                           \
+inline bool operator _op_ (const sp<T>& o) const {              \
+    return m_ptr _op_ o.m_ptr;                                  \
+}                                                               \
+inline bool operator _op_ (const T* o) const {                  \
+    return m_ptr _op_ o;                                        \
+}                                                               \
+template<typename U>                                            \
+inline bool operator _op_ (const sp<U>& o) const {              \
+    return m_ptr _op_ o.m_ptr;                                  \
+}                                                               \
+template<typename U>                                            \
+inline bool operator _op_ (const U* o) const {                  \
+    return m_ptr _op_ o;                                        \
+}
+
+// ---------------------------------------------------------------------------
+
+template <typename T>
+class sp
+{
+public:
+    inline sp() : m_ptr(0) { }
+
+    sp(T* other);  // NOLINT, implicit
+    sp(const sp<T>& other);
+    template<typename U> sp(U* other);  // NOLINT, implicit
+    template<typename U> sp(const sp<U>& other);  // NOLINT, implicit
+
+    ~sp();
+    
+    // Assignment
+
+    sp& operator = (T* other);
+    sp& operator = (const sp<T>& other);
+    
+    template<typename U> sp& operator = (const sp<U>& other);
+    template<typename U> sp& operator = (U* other);
+    
+    // Reset
+    void clear();
+    
+    // Accessors
+
+    inline  T&      operator* () const  { return *m_ptr; }
+    inline  T*      operator-> () const { return m_ptr;  }
+    inline  T*      get() const         { return m_ptr; }
+
+    // Operators
+        
+    COMPARE(==)
+    COMPARE(!=)
+    COMPARE(>)
+    COMPARE(<)
+    COMPARE(<=)
+    COMPARE(>=)
+
+private:    
+    template<typename Y> friend class sp;
+
+    T*              m_ptr;
+};
+
+// ---------------------------------------------------------------------------
+// No user serviceable parts below here.
+
+template<typename T>
+sp<T>::sp(T* other)
+    : m_ptr(other)
+{
+    if (other) other->incStrong(this);
+}
+
+template<typename T>
+sp<T>::sp(const sp<T>& other)
+    : m_ptr(other.m_ptr)
+{
+    if (m_ptr) m_ptr->incStrong(this);
+}
+
+template<typename T> template<typename U>
+sp<T>::sp(U* other) : m_ptr(other)
+{
+    if (other) other->incStrong(this);
+}
+
+template<typename T> template<typename U>
+sp<T>::sp(const sp<U>& other)
+    : m_ptr(other.m_ptr)
+{
+    if (m_ptr) m_ptr->incStrong(this);
+}
+
+template<typename T>
+sp<T>::~sp()
+{
+    if (m_ptr) m_ptr->decStrong(this);
+}
+
+template<typename T>
+sp<T>& sp<T>::operator = (const sp<T>& other) {
+    if (other.m_ptr) other.m_ptr->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other.m_ptr;
+    return *this;
+}
+
+template<typename T>
+sp<T>& sp<T>::operator = (T* other)
+{
+    if (other) other->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other;
+    return *this;
+}
+
+template<typename T> template<typename U>
+sp<T>& sp<T>::operator = (const sp<U>& other)
+{
+    if (other.m_ptr) other.m_ptr->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other.m_ptr;
+    return *this;
+}
+
+template<typename T> template<typename U>
+sp<T>& sp<T>::operator = (U* other)
+{
+    if (other) other->incStrong(this);
+    if (m_ptr) m_ptr->decStrong(this);
+    m_ptr = other;
+    return *this;
+}
+
+template<typename T>
+void sp<T>::clear()
+{
+    if (m_ptr) {
+        m_ptr->decStrong(this);
+        m_ptr = 0;
+    }
+}
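+
+// Usage sketch: sp<T> works with any T providing incStrong()/decStrong()
+// with these signatures (the Foo class below is hypothetical):
+//
+//   class Foo {
+//   public:
+//       void incStrong(const void*) { mCount++; }
+//       void decStrong(const void*) { if (--mCount == 0) delete this; }
+//   private:
+//       int mCount = 0;
+//   };
+//
+//   sp<Foo> p = new Foo;   // refcount 1
+//   sp<Foo> q = p;         // refcount 2
+//   p.clear();             // refcount 1; q still keeps the object alive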
+
+// ---------------------------------------------------------------------------
+
+} // namespace tinyutils
+} // namespace android
+
+// ---------------------------------------------------------------------------
+
+#endif // ANDROID_PIXELFLINGER_SMART_POINTER_H
diff --git a/libpixelflinger/col32cb16blend.S b/libpixelflinger/col32cb16blend.S
new file mode 100644
index 0000000..39f94e1
--- /dev/null
+++ b/libpixelflinger/col32cb16blend.S
@@ -0,0 +1,77 @@
+/* libs/pixelflinger/col32cb16blend.S
+ *
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+    .text
+    .balign 4
+
+    .global scanline_col32cb16blend_arm
+
+//
+// This function alpha blends a fixed color into a destination scanline, using
+// the formula:
+//
+//     d = s + (((a + (a >> 7)) * d) >> 8)
+//
+// where d is the destination pixel,
+//       s is the source color,
+//       a is the alpha channel of the source color.
+//
+
+// r0 = destination buffer pointer
+// r1 = color value
+// r2 = count
+
+
+scanline_col32cb16blend_arm:
+    push        {r4-r10, lr}                    // stack ARM regs
+
+    mov         r5, r1, lsr #24                 // shift down alpha
+    mov         r9, #0xff                       // create mask
+    add         r5, r5, r5, lsr #7              // add in top bit
+    rsb         r5, r5, #256                    // invert alpha
+    and         r10, r1, #0xff                  // extract red
+    and         r12, r9, r1, lsr #8             // extract green
+    and         r4, r9, r1, lsr #16             // extract blue
+    mov         r10, r10, lsl #5                // prescale red
+    mov         r12, r12, lsl #6                // prescale green
+    mov         r4, r4, lsl #5                  // prescale blue
+    mov         r9, r9, lsr #2                  // create dest green mask
+
+1:
+    ldrh        r8, [r0]                        // load dest pixel
+    subs        r2, r2, #1                      // decrement loop counter
+    mov         r6, r8, lsr #11                 // extract dest red
+    and         r7, r9, r8, lsr #5              // extract dest green
+    and         r8, r8, #0x1f                   // extract dest blue
+
+    smlabb      r6, r6, r5, r10                 // dest red * alpha + src red
+    smlabb      r7, r7, r5, r12                 // dest green * alpha + src green
+    smlabb      r8, r8, r5, r4                  // dest blue * alpha + src blue
+
+    mov         r6, r6, lsr #8                  // shift down red
+    mov         r7, r7, lsr #8                  // shift down green
+    mov         r6, r6, lsl #11                 // shift red into 565
+    orr         r6, r7, lsl #5                  // shift green into 565
+    orr         r6, r8, lsr #8                  // shift blue into 565
+
+    strh        r6, [r0], #2                    // store pixel to dest, update ptr
+    bne         1b                              // if count != 0, loop
+
+    pop         {r4-r10, pc}                    // return
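+
+// C reference for the loop above (a sketch, mirroring the assembly; 'color'
+// has red in the low byte and alpha in the high byte, as loaded into r1):
+//
+//   void scanline_col32cb16blend_c(uint16_t* dst, uint32_t color, size_t count) {
+//       uint32_t a  = color >> 24;
+//       uint32_t ia = 256 - (a + (a >> 7));          // inverted alpha, 0..256
+//       uint32_t sr = (color & 0xff) << 5;           // prescaled source red
+//       uint32_t sg = ((color >> 8) & 0xff) << 6;    // prescaled source green
+//       uint32_t sb = ((color >> 16) & 0xff) << 5;   // prescaled source blue
+//       while (count--) {
+//           uint32_t d = *dst;
+//           uint32_t r = ((d >> 11) * ia + sr) >> 8;
+//           uint32_t g = (((d >> 5) & 0x3f) * ia + sg) >> 8;
+//           uint32_t b = ((d & 0x1f) * ia + sb) >> 8;
+//           *dst++ = (uint16_t)((r << 11) | (g << 5) | b);
+//       }
+//   }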
+
+
+
diff --git a/libpixelflinger/col32cb16blend_neon.S b/libpixelflinger/col32cb16blend_neon.S
new file mode 100644
index 0000000..7ad34b0
--- /dev/null
+++ b/libpixelflinger/col32cb16blend_neon.S
@@ -0,0 +1,153 @@
+/* libs/pixelflinger/col32cb16blend_neon.S
+ *
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+    .text
+    .balign 4
+
+    .global scanline_col32cb16blend_neon
+
+//
+// This function alpha blends a fixed color into a destination scanline, using
+// the formula:
+//
+//     d = s + (((a + (a >> 7)) * d) >> 8)
+//
+// where d is the destination pixel,
+//       s is the source color,
+//       a is the alpha channel of the source color.
+//
+// The NEON implementation processes 16 pixels per iteration; the remaining
+// 0 to 15 pixels are handled by scalar ARM code.
+//
+
+// r0 = destination buffer pointer
+// r1 = color pointer
+// r2 = count
+
+
+scanline_col32cb16blend_neon:
+    push        {r4-r11, lr}                    // stack ARM regs
+
+    vmov.u16    q15, #256                       // create alpha constant
+    movs        r3, r2, lsr #4                  // calc. sixteens iterations
+    vmov.u16    q14, #0x1f                      // create blue mask
+
+    beq         2f                              // if r3 == 0, branch to singles
+
+    vld4.8      {d0[], d2[], d4[], d6[]}, [r1]  // load color into four registers
+                                                //  split and duplicate them, such that
+                                                //  d0 = 8 equal red values
+                                                //  d2 = 8 equal green values
+                                                //  d4 = 8 equal blue values
+                                                //  d6 = 8 equal alpha values
+    vshll.u8    q0, d0, #5                      // shift up red and widen
+    vshll.u8    q1, d2, #6                      // shift up green and widen
+    vshll.u8    q2, d4, #5                      // shift up blue and widen
+
+    vshr.u8     d7, d6, #7                      // extract top bit of alpha
+    vaddl.u8    q3, d6, d7                      // add top bit into alpha
+    vsub.u16    q3, q15, q3                     // invert alpha
+
+1:
+    // This loop processes 16 pixels per iteration. In the comments, references to
+    // the first eight pixels are suffixed with "0" (red0, green0, blue0), 
+    // the second eight are suffixed "1".
+                                                // q8  = dst red0
+                                                // q9  = dst green0
+                                                // q10 = dst blue0
+                                                // q13 = dst red1
+                                                // q12 = dst green1
+                                                // q11 = dst blue1
+
+    vld1.16     {d20, d21, d22, d23}, [r0]      // load 16 dest pixels
+    vshr.u16    q8, q10, #11                    // shift dst red0 to low 5 bits
+    pld         [r0, #63]                       // preload next dest pixels
+    vshl.u16    q9, q10, #5                     // shift dst green0 to top 6 bits
+    vand        q10, q10, q14                   // extract dst blue0
+    vshr.u16    q9, q9, #10                     // shift dst green0 to low 6 bits
+    vmul.u16    q8, q8, q3                      // multiply dst red0 by src alpha
+    vshl.u16    q12, q11, #5                    // shift dst green1 to top 6 bits
+    vmul.u16    q9, q9, q3                      // multiply dst green0 by src alpha
+    vshr.u16    q13, q11, #11                   // shift dst red1 to low 5 bits
+    vmul.u16    q10, q10, q3                    // multiply dst blue0 by src alpha
+    vshr.u16    q12, q12, #10                   // shift dst green1 to low 6 bits
+    vand        q11, q11, q14                   // extract dst blue1
+    vadd.u16    q8, q8, q0                      // add src red to dst red0
+    vmul.u16    q13, q13, q3                    // multiply dst red1 by src alpha
+    vadd.u16    q9, q9, q1                      // add src green to dst green0 
+    vmul.u16    q12, q12, q3                    // multiply dst green1 by src alpha
+    vadd.u16    q10, q10, q2                    // add src blue to dst blue0
+    vmul.u16    q11, q11, q3                    // multiply dst blue1 by src alpha
+    vshr.u16    q8, q8, #8                      // shift down red0
+    vadd.u16    q13, q13, q0                    // add src red to dst red1
+    vshr.u16    q9, q9, #8                      // shift down green0
+    vadd.u16    q12, q12, q1                    // add src green to dst green1
+    vshr.u16    q10, q10, #8                    // shift down blue0
+    vadd.u16    q11, q11, q2                    // add src blue to dst blue1
+    vsli.u16    q10, q9, #5                     // shift & insert green0 into blue0
+    vshr.u16    q13, q13, #8                    // shift down red1
+    vsli.u16    q10, q8, #11                    // shift & insert red0 into blue0    
+    vshr.u16    q12, q12, #8                    // shift down green1
+    vshr.u16    q11, q11, #8                    // shift down blue1
+    subs        r3, r3, #1                      // decrement loop counter
+    vsli.u16    q11, q12, #5                    // shift & insert green1 into blue1
+    vsli.u16    q11, q13, #11                   // shift & insert red1 into blue1
+
+    vst1.16     {d20, d21, d22, d23}, [r0]!     // write 16 pixels back to dst
+    bne         1b                              // if count != 0, loop
+
+2:
+    ands        r3, r2, #15                     // calc. single iterations 
+    beq         4f                              // if r3 == 0, exit
+
+    ldr         r4, [r1]                        // load source color
+    mov         r5, r4, lsr #24                 // shift down alpha
+    add         r5, r5, r5, lsr #7              // add in top bit
+    rsb         r5, r5, #256                    // invert alpha
+    and         r11, r4, #0xff                  // extract red
+    ubfx        r12, r4, #8, #8                 // extract green
+    ubfx        r4, r4, #16, #8                 // extract blue
+    mov         r11, r11, lsl #5                // prescale red
+    mov         r12, r12, lsl #6                // prescale green
+    mov         r4, r4, lsl #5                  // prescale blue
+
+3:
+    ldrh        r8, [r0]                        // load dest pixel
+    subs        r3, r3, #1                      // decrement loop counter
+    mov         r6, r8, lsr #11                 // extract dest red
+    ubfx        r7, r8, #5, #6                  // extract dest green
+    and         r8, r8, #0x1f                   // extract dest blue
+
+    smlabb      r6, r6, r5, r11                 // dest red * alpha + src red
+    smlabb      r7, r7, r5, r12                 // dest green * alpha + src green
+    smlabb      r8, r8, r5, r4                  // dest blue * alpha + src blue
+
+    mov         r6, r6, lsr #8                  // shift down red
+    mov         r7, r7, lsr #8                  // shift down green
+    mov         r6, r6, lsl #11                 // shift red into 565
+    orr         r6, r7, lsl #5                  // shift green into 565
+    orr         r6, r8, lsr #8                  // shift blue into 565
+
+    strh        r6, [r0], #2                    // store pixel to dest, update ptr
+    bne         3b                              // if count != 0, loop
+4:
+
+    pop         {r4-r11, pc}                    // return
+
+
+
diff --git a/libpixelflinger/fixed.cpp b/libpixelflinger/fixed.cpp
new file mode 100644
index 0000000..de6b479
--- /dev/null
+++ b/libpixelflinger/fixed.cpp
@@ -0,0 +1,329 @@
+/* libs/pixelflinger/fixed.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <stdio.h>
+
+#include <private/pixelflinger/ggl_context.h>
+#include <private/pixelflinger/ggl_fixed.h>
+
+
+// ------------------------------------------------------------------------
+
+int32_t gglRecipQNormalized(int32_t x, int* exponent)
+{
+    const int32_t s = x>>31;
+    uint32_t a = s ? -x : x;
+
+    // the result will overflow, so just set it to the biggest/inf value
+    if (ggl_unlikely(a <= 2LU)) {
+        *exponent = 0;
+        return s ? FIXED_MIN : FIXED_MAX;
+    }
+
+    // Newton-Raphson iteration:
+    // x = r*(2 - a*r)
+
+    const int32_t lz = gglClz(a);
+    a <<= lz;  // 0.32
+    uint32_t r = a;
+    // note: if a == 0x80000000, then x was a power-of-2; in this case
+    // we don't need to compute anything, and we get the reciprocal
+    // for (almost) free.
+    if (a != 0x80000000) {
+        r = (0x2E800 << (30-16)) - (r>>(2-1)); // 2.30, r = 2.90625 - 2*a
+        // 0.32 + 2.30 = 2.62 -> 2.30
+        // 2.30 + 2.30 = 4.60 -> 2.30
+        r = (((2LU<<30) - uint32_t((uint64_t(a)*r) >> 32)) * uint64_t(r)) >> 30;
+        r = (((2LU<<30) - uint32_t((uint64_t(a)*r) >> 32)) * uint64_t(r)) >> 30;
+    }
+
+    // shift right 1-bit to make room for the sign bit
+    *exponent = 30-lz-1;
+    r >>= 1;
+    return s ? -r : r;
+}
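+
+// The recurrence above is Newton-Raphson for f(r) = 1/r - a, i.e.
+// r' = r*(2 - a*r); each step roughly doubles the number of correct
+// bits, so two iterations from the linear seed reach ~30 bits. A
+// floating-point sketch of the same scheme (illustrative only):
+//
+//   float recip_nr(float a) {            // assumes 0.5 <= a < 1
+//       float r = 2.90625f - 2.0f * a;   // linear seed, as above
+//       r = r * (2.0f - a * r);          // first iteration
+//       r = r * (2.0f - a * r);          // second iteration
+//       return r;
+//   }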
+
+int32_t gglRecipQ(GGLfixed x, int q)
+{
+    int shift;
+    x = gglRecipQNormalized(x, &shift);
+    shift += 16-q;
+    if (shift > 0)
+        x += 1L << (shift-1);   // rounding
+    x >>= shift;
+    return x;
+}    
+
+// ------------------------------------------------------------------------
+
+static const GGLfixed ggl_sqrt_reciproc_approx_tab[8] = {
+    // 1/sqrt(x) with x = 1-N/16, N=[8...1]
+    0x16A09, 0x15555, 0x143D1, 0x134BF, 0x1279A, 0x11C01, 0x111AC, 0x10865
+};
+
+GGLfixed gglSqrtRecipx(GGLfixed x)
+{
+    if (x == 0)         return FIXED_MAX;
+    if (x == FIXED_ONE) return x;
+    const GGLfixed a = x;
+    const int32_t lz = gglClz(x);
+    x = ggl_sqrt_reciproc_approx_tab[(a>>(28-lz))&0x7];
+    const int32_t exp = lz - 16;
+    if (exp <= 0)   x >>= -exp>>1;
+    else            x <<= (exp>>1) + (exp & 1);        
+    if (exp & 1) {
+        x = gglMulx(x, ggl_sqrt_reciproc_approx_tab[0])>>1;
+    }
+    // 2 Newton-Raphson iterations: x = x/2*(3-(a*x)*x)
+    x = gglMulx((x>>1),(0x30000 - gglMulx(gglMulx(a,x),x)));
+    x = gglMulx((x>>1),(0x30000 - gglMulx(gglMulx(a,x),x)));
+    return x;
+}
+
+GGLfixed gglSqrtx(GGLfixed a)
+{
+    // Compute a full precision square-root (24 bits accuracy)
+    GGLfixed r = 0;
+    GGLfixed bit = 0x800000;
+    int32_t bshift = 15;
+    do {
+        GGLfixed temp = bit + (r<<1);
+        if (bshift >= 8)    temp <<= (bshift-8);
+        else                temp >>= (8-bshift);
+        if (a >= temp) {
+            r += bit;
+            a -= temp;
+        }
+        bshift--;
+    } while (bit>>=1);
+    return r;
+}
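+
+// Quick sanity check: gglSqrtx() takes and returns 16.16 fixed-point, so
+//
+//   gglSqrtx(4 << 16);   // ~0x20000, i.e. 2.0
+//   gglSqrtx(2 << 16);   // ~0x16A0A, i.e. ~1.41421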
+
+// ------------------------------------------------------------------------
+
+static const GGLfixed ggl_log_approx_tab[] = {
+    // -ln(x)/ln(2) with x = N/16, N=[8...16]
+    0xFFFF, 0xd47f, 0xad96, 0x8a62, 0x6a3f, 0x4caf, 0x3151, 0x17d6, 0x0000
+};
+
+static const GGLfixed ggl_alog_approx_tab[] = { // domain [0 - 1.0]
+    0xffff, 0xeac0, 0xd744, 0xc567, 0xb504, 0xa5fe, 0x9837, 0x8b95, 0x8000
+};
+
+GGLfixed gglPowx(GGLfixed x, GGLfixed y)
+{
+    // prerequisite: 0 <= x <= 1, and y >=0
+
+    // pow(x,y) = 2^(y*log2(x))
+    // =  2^(y*log2(x*(2^exp)*(2^-exp))))
+    // =  2^(y*(log2(X)-exp))
+    // =  2^(log2(X)*y - y*exp)
+    // =  2^( - (-log2(X)*y + y*exp) )
+    
+    int32_t exp = gglClz(x) - 16;
+    GGLfixed f = x << exp;
+    x = (f & 0x0FFF)<<4;
+    f = (f >> 12) & 0x7;
+    GGLfixed p = gglMulAddx(
+            ggl_log_approx_tab[f+1] - ggl_log_approx_tab[f], x,
+            ggl_log_approx_tab[f]);
+    p = gglMulAddx(p, y, y*exp);
+    exp = gglFixedToIntFloor(p);
+    if (exp < 31) {
+        p = gglFracx(p);
+        x = (p & 0x1FFF)<<3;
+        p >>= 13;    
+        p = gglMulAddx(
+                ggl_alog_approx_tab[p+1] - ggl_alog_approx_tab[p], x,
+                ggl_alog_approx_tab[p]);
+        p >>= exp;
+    } else {
+        p = 0;
+    }
+    // reference: ( powf((a*65536.0f), (b*65536.0f)) ) * 65536.0f;
+    return p;
+}
+
+// ------------------------------------------------------------------------
+
+int32_t gglDivQ(GGLfixed n, GGLfixed d, int32_t i)
+{
+    //int32_t r =int32_t((int64_t(n)<<i)/d);
+    const int32_t ds = n^d;
+    if (n<0) n = -n;
+    if (d<0) d = -d;
+    int nd = gglClz(d) - gglClz(n);
+    i += nd + 1;
+    if (nd > 0) d <<= nd;
+    else        n <<= -nd;
+    uint32_t q = 0;
+
+    int j = i & 7;
+    i >>= 3;
+
+    // gcc deals with the code below pretty well.
+    // we get 3.75 cycles per bit in the main loop
+    // and 8 cycles per bit in the termination loop
+    if (ggl_likely(i)) {
+        n -= d;
+        do {
+            q <<= 8;
+            if (n>=0)   q |= 128;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 64;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 32;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 16;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 8;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 4;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 2;
+            else        n += d;
+            n = n*2 - d;
+            if (n>=0)   q |= 1;
+            else        n += d;
+            
+            if (--i == 0)
+                goto finish;
+
+            n = n*2 - d;
+        } while(true);
+        do {
+            q <<= 1;
+            n = n*2 - d;
+            if (n>=0)   q |= 1;
+            else        n += d;
+        finish: ;
+        } while (j--);
+        return (ds<0) ? -q : q;
+    }
+
+    n -= d;
+    if (n>=0)   q |= 1;
+    else        n += d;
+    j--;
+    goto finish;
+}
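+
+// Usage sketch: gglDivQ(n, d, i) approximates (n << i) / d (see the
+// commented-out reference above). With i = 16 it divides two 16.16
+// fixed-point numbers, e.g.:
+//
+//   GGLfixed half = gglDivQ(1 << 16, 2 << 16, 16);   // 0x8000, i.e. 0.5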
+
+// ------------------------------------------------------------------------
+
+// assumes that the int32_t values of a, b, and c are all positive
+// use when both a and b are larger than c
+
+template <typename T>
+static inline void swap(T& a, T& b) {
+    T t(a);
+    a = b;
+    b = t;
+}
+
+static __attribute__((noinline))
+int32_t slow_muldiv(uint32_t a, uint32_t b, uint32_t c)
+{
+    // first we compute a*b as a 64-bit integer
+    // (GCC generates umull with the code below)
+    uint64_t ab = uint64_t(a)*b;
+    uint32_t hi = ab>>32;
+    uint32_t lo = ab;
+    uint32_t result;
+
+    // now perform the division
+    if (hi >= c) {
+    overflow:
+        result = 0x7fffffff;  // basic overflow
+    } else if (hi == 0) {
+        result = lo/c;  // note: c can't be 0
+        if ((result >> 31) != 0)  // result must fit in 31 bits
+            goto overflow;
+    } else {
+        uint32_t r = hi;
+        int bits = 31;
+        result = 0;
+        do {
+            r = (r << 1) | (lo >> 31);
+            lo <<= 1;
+            result <<= 1;
+            if (r >= c) {
+                r -= c;
+                result |= 1;
+            }
+        } while (bits--);
+    }
+    return int32_t(result);
+}
+
+// assumes a >= 0 and c >= b >= 0
+static inline
+int32_t quick_muldiv(int32_t a, int32_t b, int32_t c)
+{
+    int32_t r = 0, q = 0, i;
+    int leading = gglClz(a);
+    i = 32 - leading;
+    a <<= leading;
+    do {
+        r <<= 1;
+        if (a < 0)
+            r += b;
+        a <<= 1;
+        q <<= 1;
+        if (r >= c) {
+            r -= c;
+            q++;
+        }
+        asm(""::); // gcc generates better code this way
+        if (r >= c) {
+            r -= c;
+            q++;
+        }
+    }
+    while (--i);
+    return q;
+}
+
+// this function computes a*b/c with 64-bit intermediate accuracy;
+// overflows (e.g. division by 0) are handled by returning INT_MAX
+
+int32_t gglMulDivi(int32_t a, int32_t b, int32_t c)
+{
+    int32_t result;
+    int32_t sign = a^b^c;
+
+    if (a < 0) a = -a;
+    if (b < 0) b = -b;
+    if (c < 0) c = -c;
+
+    if (a < b) {
+        swap(a, b);
+    }
+
+    if (b <= c) result = quick_muldiv(a, b, c);
+    else        result = slow_muldiv((uint32_t)a, (uint32_t)b, (uint32_t)c);
+
+    if (sign < 0)
+        result = -result;
+
+    return result;
+}
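+
+// Worked example: gglMulDivi(300000, 200000, 100000) = 600000 exactly,
+// even though the intermediate product (6e10) does not fit in 32 bits;
+// here b > c after the sign strip, so the slow_muldiv() path is taken.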
diff --git a/libpixelflinger/format.cpp b/libpixelflinger/format.cpp
new file mode 100644
index 0000000..6546e8c
--- /dev/null
+++ b/libpixelflinger/format.cpp
@@ -0,0 +1,76 @@
+/* libs/pixelflinger/format.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+#include <stdio.h>
+#include <pixelflinger/format.h>
+
+namespace android {
+
+static GGLFormat const gPixelFormatInfos[] =
+{   //          Alpha    Red     Green   Blue
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  4, 32, {{32,24,   8, 0,  16, 8,  24,16 }}, GGL_RGBA },   // PIXEL_FORMAT_RGBA_8888
+    {  4, 24, {{ 0, 0,   8, 0,  16, 8,  24,16 }}, GGL_RGB  },   // PIXEL_FORMAT_RGBX_8888
+    {  3, 24, {{ 0, 0,   8, 0,  16, 8,  24,16 }}, GGL_RGB  },   // PIXEL_FORMAT_RGB_888
+    {  2, 16, {{ 0, 0,  16,11,  11, 5,   5, 0 }}, GGL_RGB  },   // PIXEL_FORMAT_RGB_565
+    {  4, 32, {{32,24,  24,16,  16, 8,   8, 0 }}, GGL_RGBA },   // PIXEL_FORMAT_BGRA_8888
+    {  2, 16, {{ 1, 0,  16,11,  11, 6,   6, 1 }}, GGL_RGBA },   // PIXEL_FORMAT_RGBA_5551
+    {  2, 16, {{ 4, 0,  16,12,  12, 8,   8, 4 }}, GGL_RGBA },   // PIXEL_FORMAT_RGBA_4444
+    {  1,  8, {{ 8, 0,   0, 0,   0, 0,   0, 0 }}, GGL_ALPHA },           // PIXEL_FORMAT_A8
+    {  1,  8, {{ 0, 0,   8, 0,   8, 0,   8, 0 }}, GGL_LUMINANCE },       // PIXEL_FORMAT_L8
+    {  2, 16, {{16, 8,   8, 0,   8, 0,   8, 0 }}, GGL_LUMINANCE_ALPHA }, // PIXEL_FORMAT_LA_88
+    {  1,  8, {{ 0, 0,   8, 5,   5, 2,   2, 0 }}, GGL_RGB  },            // PIXEL_FORMAT_RGB_332
+    
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    
+    {  2, 16, {{  0, 0, 16, 0,   0, 0,   0, 0 }}, GGL_DEPTH_COMPONENT},
+    {  1,  8, {{  8, 0,  0, 0,   0, 0,   0, 0 }}, GGL_STENCIL_INDEX  },
+    {  4, 24, {{  0, 0, 24, 0,   0, 0,   0, 0 }}, GGL_DEPTH_COMPONENT},
+    {  4,  8, {{ 32,24,  0, 0,   0, 0,   0, 0 }}, GGL_STENCIL_INDEX  },
+
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+    {  0,  0, {{ 0, 0,   0, 0,   0, 0,   0, 0 }},        0 },   // PIXEL_FORMAT_NONE
+
+};
+
+}; // namespace android
+
+
+const GGLFormat* gglGetPixelFormatTable(size_t* numEntries)
+{
+    if (numEntries) {
+        *numEntries = sizeof(android::gPixelFormatInfos)/sizeof(GGLFormat);
+    }
+    return android::gPixelFormatInfos;
+}
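+
+// Usage sketch: callers index the table with a GGLPixelFormat value, e.g.
+//
+//   size_t count;
+//   const GGLFormat* formats = gglGetPixelFormatTable(&count);
+//   const GGLFormat& f565 = formats[GGL_PIXEL_FORMAT_RGB_565];
+//   // f565.size == 2 (bytes per pixel), f565.bitsPerPixel == 16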
diff --git a/libpixelflinger/include/pixelflinger/format.h b/libpixelflinger/include/pixelflinger/format.h
new file mode 100644
index 0000000..82eeca4
--- /dev/null
+++ b/libpixelflinger/include/pixelflinger/format.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (C) 2005 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_PIXELFLINGER_FORMAT_H
+#define ANDROID_PIXELFLINGER_FORMAT_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+enum GGLPixelFormat {
+    // these constants need to match those
+    // in graphics/PixelFormat.java, ui/PixelFormat.h, BlitHardware.h
+    GGL_PIXEL_FORMAT_UNKNOWN    =   0,
+    GGL_PIXEL_FORMAT_NONE       =   0,
+
+    GGL_PIXEL_FORMAT_RGBA_8888   =   1,  // 4x8-bit RGBA
+    GGL_PIXEL_FORMAT_RGBX_8888   =   2,  // 3x8-bit RGB stored in 32-bit chunks
+    GGL_PIXEL_FORMAT_RGB_888     =   3,  // 3x8-bit RGB
+    GGL_PIXEL_FORMAT_RGB_565     =   4,  // 16-bit RGB
+    GGL_PIXEL_FORMAT_BGRA_8888   =   5,  // 4x8-bit BGRA
+    GGL_PIXEL_FORMAT_RGBA_5551   =   6,  // 16-bit RGBA
+    GGL_PIXEL_FORMAT_RGBA_4444   =   7,  // 16-bit RGBA
+
+    GGL_PIXEL_FORMAT_A_8         =   8,  // 8-bit A
+    GGL_PIXEL_FORMAT_L_8         =   9,  // 8-bit L (R=G=B = L)
+    GGL_PIXEL_FORMAT_LA_88       = 0xA,  // 16-bit LA
+    GGL_PIXEL_FORMAT_RGB_332     = 0xB,  // 8-bit RGB (non paletted)
+
+    // reserved range. don't use.
+    GGL_PIXEL_FORMAT_RESERVED_10 = 0x10,
+    GGL_PIXEL_FORMAT_RESERVED_11 = 0x11,
+    GGL_PIXEL_FORMAT_RESERVED_12 = 0x12,
+    GGL_PIXEL_FORMAT_RESERVED_13 = 0x13,
+    GGL_PIXEL_FORMAT_RESERVED_14 = 0x14,
+    GGL_PIXEL_FORMAT_RESERVED_15 = 0x15,
+    GGL_PIXEL_FORMAT_RESERVED_16 = 0x16,
+    GGL_PIXEL_FORMAT_RESERVED_17 = 0x17,
+
+    // reserved/special formats
+    GGL_PIXEL_FORMAT_Z_16       =  0x18,
+    GGL_PIXEL_FORMAT_S_8        =  0x19,
+    GGL_PIXEL_FORMAT_SZ_24      =  0x1A,
+    GGL_PIXEL_FORMAT_SZ_8       =  0x1B,
+
+    // reserved range. don't use.
+    GGL_PIXEL_FORMAT_RESERVED_20 = 0x20,
+    GGL_PIXEL_FORMAT_RESERVED_21 = 0x21,
+};
+
+enum GGLFormatComponents {
+    GGL_STENCIL_INDEX       = 0x1901,
+    GGL_DEPTH_COMPONENT     = 0x1902,
+    GGL_ALPHA               = 0x1906,
+    GGL_RGB                 = 0x1907,
+    GGL_RGBA                = 0x1908,
+    GGL_LUMINANCE           = 0x1909,
+    GGL_LUMINANCE_ALPHA     = 0x190A,
+};
+
+enum GGLFormatComponentIndex {
+    GGL_INDEX_ALPHA   = 0,
+    GGL_INDEX_RED     = 1,
+    GGL_INDEX_GREEN   = 2,
+    GGL_INDEX_BLUE    = 3,
+    GGL_INDEX_STENCIL = 0,
+    GGL_INDEX_DEPTH   = 1,
+    GGL_INDEX_Y       = 0,
+    GGL_INDEX_CB      = 1,
+    GGL_INDEX_CR      = 2,
+};
+
+typedef struct {
+#ifdef __cplusplus
+    enum {
+        ALPHA   = GGL_INDEX_ALPHA,
+        RED     = GGL_INDEX_RED,
+        GREEN   = GGL_INDEX_GREEN,
+        BLUE    = GGL_INDEX_BLUE,
+        STENCIL = GGL_INDEX_STENCIL,
+        DEPTH   = GGL_INDEX_DEPTH,
+        LUMA    = GGL_INDEX_Y,
+        CHROMAB = GGL_INDEX_CB,
+        CHROMAR = GGL_INDEX_CR,
+    };
+    inline uint32_t mask(int i) const {
+            return ((1<<(c[i].h-c[i].l))-1)<<c[i].l;
+    }
+    inline uint32_t bits(int i) const {
+            return c[i].h - c[i].l;
+    }
+#endif
+    uint8_t     size;           // bytes per pixel
+    uint8_t     bitsPerPixel;
+    union {
+        struct {
+            uint8_t     ah;     // alpha high bit position + 1
+            uint8_t     al;     // alpha low bit position
+            uint8_t     rh;     // red high bit position + 1
+            uint8_t     rl;     // red low bit position
+            uint8_t     gh;     // green high bit position + 1
+            uint8_t     gl;     // green low bit position
+            uint8_t     bh;     // blue high bit position + 1
+            uint8_t     bl;     // blue low bit position
+        };
+        struct {
+            uint8_t h;
+            uint8_t l;
+        } __attribute__((__packed__)) c[4];
+    } __attribute__((__packed__));
+    uint16_t    components;     // GGLFormatComponents
+} GGLFormat;
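+
+// Worked example: for PIXEL_FORMAT_RGB_565, red occupies rh=16, rl=11, so
+// bits(RED) = 16 - 11 = 5 and mask(RED) = ((1 << 5) - 1) << 11 = 0xF800.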
+
+
+#ifdef __cplusplus
+extern "C" const GGLFormat* gglGetPixelFormatTable(size_t* numEntries = 0);
+#else
+const GGLFormat* gglGetPixelFormatTable(size_t* numEntries);
+#endif
+
+
+// ----------------------------------------------------------------------------
+
+#endif // ANDROID_PIXELFLINGER_FORMAT_H
diff --git a/libpixelflinger/include/pixelflinger/pixelflinger.h b/libpixelflinger/include/pixelflinger/pixelflinger.h
new file mode 100644
index 0000000..8a2b442
--- /dev/null
+++ b/libpixelflinger/include/pixelflinger/pixelflinger.h
@@ -0,0 +1,330 @@
+/*
+ * Copyright (C) 2007 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_PIXELFLINGER_H
+#define ANDROID_PIXELFLINGER_H
+
+#include <stdint.h>
+#include <sys/types.h>
+
+#include <pixelflinger/format.h>
+
+// GGL types
+
+typedef int8_t			GGLbyte;		// b
+typedef int16_t			GGLshort;		// s
+typedef int32_t			GGLint;			// i
+typedef ssize_t			GGLsizei;		// i
+typedef int32_t			GGLfixed;		// x
+typedef int32_t			GGLclampx;		// x
+typedef float			GGLfloat;		// f
+typedef float			GGLclampf;		// f
+typedef double			GGLdouble;		// d
+typedef double			GGLclampd;		// d
+typedef uint8_t			GGLubyte;		// ub
+typedef uint8_t			GGLboolean;		// ub
+typedef uint16_t		GGLushort;		// us
+typedef uint32_t		GGLuint;		// ui
+typedef unsigned int	GGLenum;		// ui
+typedef unsigned int	GGLbitfield;	// ui
+typedef void			GGLvoid;
+typedef int32_t         GGLfixed32;
+typedef	int32_t         GGLcolor;
+typedef int32_t         GGLcoord;
+
+// ----------------------------------------------------------------------------
+
+#define GGL_MAX_VIEWPORT_DIMS           4096
+#define GGL_MAX_TEXTURE_SIZE            4096
+#define GGL_MAX_ALIASED_POINT_SIZE      0x7FFFFFF
+#define GGL_MAX_SMOOTH_POINT_SIZE       2048
+#define GGL_MAX_SMOOTH_LINE_WIDTH       2048
+
+// ----------------------------------------------------------------------------
+
+// All these names are compatible with their OpenGL equivalents;
+// some of them are listed only for completeness.
+enum GGLNames {
+	GGL_FALSE						= 0,
+	GGL_TRUE						= 1,
+
+	// enable/disable
+    GGL_SCISSOR_TEST                = 0x0C11,
+	GGL_TEXTURE_2D					= 0x0DE1,
+	GGL_ALPHA_TEST					= 0x0BC0,
+	GGL_BLEND						= 0x0BE2,
+	GGL_COLOR_LOGIC_OP				= 0x0BF2,
+	GGL_DITHER						= 0x0BD0,
+	GGL_STENCIL_TEST				= 0x0B90,
+	GGL_DEPTH_TEST					= 0x0B71,
+    GGL_AA                          = 0x80000001,
+    GGL_W_LERP                      = 0x80000004,
+    GGL_POINT_SMOOTH_NICE           = 0x80000005,
+
+    // buffers, pixel drawing/reading
+    GGL_COLOR                       = 0x1800,
+    
+    // fog
+    GGL_FOG                         = 0x0B60,
+    
+	// shade model
+	GGL_FLAT						= 0x1D00,
+	GGL_SMOOTH						= 0x1D01,
+
+	// Texture parameter name
+	GGL_TEXTURE_MIN_FILTER			= 0x2801,
+	GGL_TEXTURE_MAG_FILTER			= 0x2800,
+	GGL_TEXTURE_WRAP_S				= 0x2802,
+	GGL_TEXTURE_WRAP_T				= 0x2803,
+	GGL_TEXTURE_WRAP_R				= 0x2804,
+
+	// Texture Filter	
+	GGL_NEAREST						= 0x2600,
+	GGL_LINEAR						= 0x2601,
+	GGL_NEAREST_MIPMAP_NEAREST		= 0x2700,
+	GGL_LINEAR_MIPMAP_NEAREST		= 0x2701,
+	GGL_NEAREST_MIPMAP_LINEAR		= 0x2702,
+	GGL_LINEAR_MIPMAP_LINEAR		= 0x2703,
+
+	// Texture Wrap Mode
+	GGL_CLAMP						= 0x2900,
+	GGL_REPEAT						= 0x2901,
+    GGL_CLAMP_TO_EDGE               = 0x812F,
+
+	// Texture Env Mode
+	GGL_REPLACE						= 0x1E01,
+	GGL_MODULATE					= 0x2100,
+	GGL_DECAL						= 0x2101,
+	GGL_ADD							= 0x0104,
+
+	// Texture Env Parameter
+	GGL_TEXTURE_ENV_MODE			= 0x2200,
+	GGL_TEXTURE_ENV_COLOR			= 0x2201,
+
+	// Texture Env Target
+	GGL_TEXTURE_ENV					= 0x2300,
+
+    // Texture coord generation
+    GGL_TEXTURE_GEN_MODE            = 0x2500,
+    GGL_S                           = 0x2000,
+    GGL_T                           = 0x2001,
+    GGL_R                           = 0x2002,
+    GGL_Q                           = 0x2003,
+    GGL_ONE_TO_ONE                  = 0x80000002,
+    GGL_AUTOMATIC                   = 0x80000003,
+
+    // AlphaFunction
+    GGL_NEVER                       = 0x0200,
+    GGL_LESS                        = 0x0201,
+    GGL_EQUAL                       = 0x0202,
+    GGL_LEQUAL                      = 0x0203,
+    GGL_GREATER                     = 0x0204,
+    GGL_NOTEQUAL                    = 0x0205,
+    GGL_GEQUAL                      = 0x0206,
+    GGL_ALWAYS                      = 0x0207,
+
+    // LogicOp
+    GGL_CLEAR                       = 0x1500,   // 0
+    GGL_AND                         = 0x1501,   // s & d
+    GGL_AND_REVERSE                 = 0x1502,   // s & ~d
+    GGL_COPY                        = 0x1503,   // s
+    GGL_AND_INVERTED                = 0x1504,   // ~s & d
+    GGL_NOOP                        = 0x1505,   // d
+    GGL_XOR                         = 0x1506,   // s ^ d
+    GGL_OR                          = 0x1507,   // s | d
+    GGL_NOR                         = 0x1508,   // ~(s | d)
+    GGL_EQUIV                       = 0x1509,   // ~(s ^ d)
+    GGL_INVERT                      = 0x150A,   // ~d
+    GGL_OR_REVERSE                  = 0x150B,   // s | ~d
+    GGL_COPY_INVERTED               = 0x150C,   // ~s 
+    GGL_OR_INVERTED                 = 0x150D,   // ~s | d
+    GGL_NAND                        = 0x150E,   // ~(s & d)
+    GGL_SET                         = 0x150F,   // 1
+
+	// blending equation & function
+	GGL_ZERO                        = 0,		// SD
+	GGL_ONE                         = 1,		// SD
+	GGL_SRC_COLOR                   = 0x0300,	//  D
+	GGL_ONE_MINUS_SRC_COLOR         = 0x0301,	//	D
+	GGL_SRC_ALPHA                   = 0x0302,	// SD
+	GGL_ONE_MINUS_SRC_ALPHA			= 0x0303,	// SD
+	GGL_DST_ALPHA					= 0x0304,	// SD
+	GGL_ONE_MINUS_DST_ALPHA			= 0x0305,	// SD
+	GGL_DST_COLOR					= 0x0306,	// S
+	GGL_ONE_MINUS_DST_COLOR			= 0x0307,	// S
+	GGL_SRC_ALPHA_SATURATE			= 0x0308,	// S
+    
+    // clear bits
+    GGL_DEPTH_BUFFER_BIT            = 0x00000100,
+    GGL_STENCIL_BUFFER_BIT          = 0x00000400,
+    GGL_COLOR_BUFFER_BIT            = 0x00004000,
+
+    // errors
+    GGL_NO_ERROR                    = 0,
+    GGL_INVALID_ENUM                = 0x0500,
+    GGL_INVALID_VALUE               = 0x0501,
+    GGL_INVALID_OPERATION           = 0x0502,
+    GGL_STACK_OVERFLOW              = 0x0503,
+    GGL_STACK_UNDERFLOW             = 0x0504,
+    GGL_OUT_OF_MEMORY               = 0x0505
+};
+
+// ----------------------------------------------------------------------------
+
+typedef struct {
+    GGLsizei    version;    // always set to sizeof(GGLSurface)
+    GGLuint     width;      // width in pixels
+    GGLuint     height;     // height in pixels
+    GGLint      stride;     // stride in pixels
+    GGLubyte*   data;       // pointer to the bits
+    GGLubyte    format;     // pixel format
+    GGLubyte    rfu[3];     // must be zero
+    // these values depend on the format being used
+    union {
+        GGLint  compressedFormat;
+        GGLint  vstride;
+    };
+    void*       reserved;
+} GGLSurface;
+
+
+typedef struct {
+    // immediate rendering
+    void (*pointx)(void *con, const GGLcoord* v, GGLcoord r);
+    void (*linex)(void *con, 
+            const GGLcoord* v0, const GGLcoord* v1, GGLcoord width);
+    void (*recti)(void* c, GGLint l, GGLint t, GGLint r, GGLint b); 
+    void (*trianglex)(void* c,
+            GGLcoord const* v0, GGLcoord const* v1, GGLcoord const* v2);
+
+    // scissor
+    void (*scissor)(void* c, GGLint x, GGLint y, GGLsizei width, GGLsizei height);
+
+    // Set the textures and color buffers
+    void (*activeTexture)(void* c, GGLuint tmu);
+    void (*bindTexture)(void* c, const GGLSurface* surface);
+    void (*colorBuffer)(void* c, const GGLSurface* surface);
+    void (*readBuffer)(void* c, const GGLSurface* surface);
+    void (*depthBuffer)(void* c, const GGLSurface* surface);
+    void (*bindTextureLod)(void* c, GGLuint tmu, const GGLSurface* surface);
+
+    // enable/disable features
+    void (*enable)(void* c, GGLenum name);
+    void (*disable)(void* c, GGLenum name);
+    void (*enableDisable)(void* c, GGLenum name, GGLboolean en);
+
+    // specify the fragment's color
+    void (*shadeModel)(void* c, GGLenum mode);
+    void (*color4xv)(void* c, const GGLclampx* color);
+    // specify color iterators (16.16)
+    void (*colorGrad12xv)(void* c, const GGLcolor* grad);
+
+    // specify Z coordinate iterators (0.32)
+    void (*zGrad3xv)(void* c, const GGLfixed32* grad);
+
+    // specify W coordinate iterators (16.16)
+    void (*wGrad3xv)(void* c, const GGLfixed* grad);
+
+    // specify fog iterator & color (16.16)
+    void (*fogGrad3xv)(void* c, const GGLfixed* grad);
+    void (*fogColor3xv)(void* c, const GGLclampx* color);
+
+    // specify blending parameters
+    void (*blendFunc)(void* c, GGLenum src, GGLenum dst);
+    void (*blendFuncSeparate)(void* c,  GGLenum src, GGLenum dst,
+                                        GGLenum srcAlpha, GGLenum dstAlpha);
+
+    // texture environment (REPLACE / MODULATE / DECAL / BLEND)
+    void (*texEnvi)(void* c,    GGLenum target,
+                                GGLenum pname,
+                                GGLint param);
+
+    void (*texEnvxv)(void* c, GGLenum target,
+            GGLenum pname, const GGLfixed* params);
+
+    // texture parameters (Wrapping, filter)
+    void (*texParameteri)(void* c,  GGLenum target,
+                                    GGLenum pname,
+                                    GGLint param);
+
+    // texture iterators (16.16)
+    void (*texCoord2i)(void* c, GGLint s, GGLint t);
+    void (*texCoord2x)(void* c, GGLfixed s, GGLfixed t);
+    
+    // s, dsdx, dsdy, scale, t, dtdx, dtdy, tscale
+    // This API uses block floating-point for S and T texture coordinates.
+    // All values are given in 16.16, scaled by 'scale'; in other words,
+    // set scale to 0 for plain 16.16 values.
+    void (*texCoordGradScale8xv)(void* c, GGLint tmu, const int32_t* grad8);
+    
+    void (*texGeni)(void* c, GGLenum coord, GGLenum pname, GGLint param);
+
+    // masking
+    void (*colorMask)(void* c,  GGLboolean red,
+                                GGLboolean green,
+                                GGLboolean blue,
+                                GGLboolean alpha);
+
+    void (*depthMask)(void* c, GGLboolean flag);
+
+    void (*stencilMask)(void* c, GGLuint mask);
+
+    // alpha func
+    void (*alphaFuncx)(void* c, GGLenum func, GGLclampx ref);
+
+    // depth func
+    void (*depthFunc)(void* c, GGLenum func);
+
+    // logic op
+    void (*logicOp)(void* c, GGLenum opcode); 
+
+    // clear
+    void (*clear)(void* c, GGLbitfield mask);
+    void (*clearColorx)(void* c,
+            GGLclampx r, GGLclampx g, GGLclampx b, GGLclampx a);
+    void (*clearDepthx)(void* c, GGLclampx depth);
+    void (*clearStencil)(void* c, GGLint s);
+
+    // framebuffer operations
+    void (*copyPixels)(void* c, GGLint x, GGLint y,
+            GGLsizei width, GGLsizei height, GGLenum type);
+    void (*rasterPos2x)(void* c, GGLfixed x, GGLfixed y);
+    void (*rasterPos2i)(void* c, GGLint x, GGLint y);
+} GGLContext;
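+
+// Usage sketch (hypothetical 565 target; 'pixels' is caller-provided and
+// gglInit() is assumed to return 0 on success):
+//
+//   GGLContext* gl;
+//   if (gglInit(&gl) == 0) {
+//       GGLSurface fb;
+//       memset(&fb, 0, sizeof(fb));
+//       fb.version = sizeof(GGLSurface);
+//       fb.width   = 320;
+//       fb.height  = 240;
+//       fb.stride  = 320;
+//       fb.data    = (GGLubyte*)pixels;
+//       fb.format  = GGL_PIXEL_FORMAT_RGB_565;
+//       gl->colorBuffer(gl, &fb);
+//       gl->clearColorx(gl, 0, 0, 0, 1 << 16);    // opaque black (16.16)
+//       gl->clear(gl, GGL_COLOR_BUFFER_BIT);
+//       gglUninit(gl);
+//   }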
+
+// ----------------------------------------------------------------------------
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// construct / destroy the context
+ssize_t gglInit(GGLContext** context);
+ssize_t gglUninit(GGLContext* context);
+
+GGLint gglBitBlit(
+        GGLContext* c,
+        int tmu,
+        GGLint crop[4],
+        GGLint where[4]);
+
+#ifdef __cplusplus
+};
+#endif
+
+// ----------------------------------------------------------------------------
+
+#endif // ANDROID_PIXELFLINGER_H
diff --git a/libpixelflinger/include/private/pixelflinger/ggl_context.h b/libpixelflinger/include/private/pixelflinger/ggl_context.h
new file mode 100644
index 0000000..563b0f1
--- /dev/null
+++ b/libpixelflinger/include/private/pixelflinger/ggl_context.h
@@ -0,0 +1,565 @@
+/*
+ * Copyright (C) 2006 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_GGL_CONTEXT_H
+#define ANDROID_GGL_CONTEXT_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <endian.h>
+
+#include <pixelflinger/pixelflinger.h>
+#include <private/pixelflinger/ggl_fixed.h>
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+
+inline uint32_t GGL_RGBA_TO_HOST(uint32_t v) {
+    return v;
+}
+inline uint32_t GGL_HOST_TO_RGBA(uint32_t v) {
+    return v;
+}
+
+#else
+
+inline uint32_t GGL_RGBA_TO_HOST(uint32_t v) {
+#if defined(__mips__) && __mips_isa_rev>=2
+    uint32_t r;
+    __asm__("wsbh %0, %1;"
+        "rotr %0, %0, 16"
+        : "=r" (r)
+        : "r" (v)
+        );
+    return r;
+#else
+    return (v<<24) | (v>>24) | ((v<<8)&0xff0000) | ((v>>8)&0xff00);
+#endif
+}
+inline uint32_t GGL_HOST_TO_RGBA(uint32_t v) {
+#if defined(__mips__) && __mips_isa_rev>=2
+    uint32_t r;
+    __asm__("wsbh %0, %1;"
+        "rotr %0, %0, 16"
+        : "=r" (r)
+        : "r" (v)
+        );
+    return r;
+#else
+    return (v<<24) | (v>>24) | ((v<<8)&0xff0000) | ((v>>8)&0xff00);
+#endif
+}
+
+#endif
+
+// ----------------------------------------------------------------------------
+
+const int GGL_DITHER_BITS = 6;  // dither weights stored in 6 bits
+const int GGL_DITHER_ORDER_SHIFT= 3;
+const int GGL_DITHER_ORDER      = (1<<GGL_DITHER_ORDER_SHIFT);
+const int GGL_DITHER_SIZE       = GGL_DITHER_ORDER * GGL_DITHER_ORDER;
+const int GGL_DITHER_MASK       = GGL_DITHER_ORDER-1;
+
+// ----------------------------------------------------------------------------
+
+const int GGL_SUBPIXEL_BITS = 4;
+
+// TRI_FRACTION_BITS defines the number of bits we want to use
+// for the sub-pixel coordinates during edge stepping. The value
+// shouldn't be more than 7, or bad things are going to happen
+// when drawing large triangles (8 doesn't work because 32-bit
+// multiplies would lose the sign bit).
+
+#define  TRI_FRACTION_BITS  (GGL_SUBPIXEL_BITS)
+#define  TRI_ONE            (1 << TRI_FRACTION_BITS)
+#define  TRI_HALF           (1 << (TRI_FRACTION_BITS-1))
+#define  TRI_FROM_INT(x)    ((x) << TRI_FRACTION_BITS)
+#define  TRI_FRAC(x)        ((x)                 &  (TRI_ONE-1))
+#define  TRI_FLOOR(x)       ((x)                 & ~(TRI_ONE-1))
+#define  TRI_CEIL(x)        (((x) + (TRI_ONE-1)) & ~(TRI_ONE-1))
+#define  TRI_ROUND(x)       (((x) +  TRI_HALF  ) & ~(TRI_ONE-1))
+
+#define  TRI_ROUNDING       (1 << (16 - TRI_FRACTION_BITS - 1))
+#define  TRI_FROM_FIXED(x)  (((x)+TRI_ROUNDING) >> (16-TRI_FRACTION_BITS))
+
+#define  TRI_SNAP_NEXT_HALF(x)   (TRI_CEIL((x)+TRI_HALF) - TRI_HALF)
+#define  TRI_SNAP_PREV_HALF(x)   (TRI_CEIL((x)-TRI_HALF) - TRI_HALF)
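+
+// Worked example with TRI_FRACTION_BITS = 4 (TRI_ONE = 16, TRI_HALF = 8):
+// 3.4375 is TRI_FROM_INT(3) + 7 = 55; TRI_ROUND(55) = (55+8) & ~15 = 48
+// (i.e. 3.0), TRI_CEIL(55) = 64 (4.0) and TRI_FRAC(55) = 7.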
+
+// ----------------------------------------------------------------------------
+
+const int GGL_COLOR_BITS = 24;
+
+// To maintain 8-bit color channels, with a maximum GGLSurface
+// size of 4096 and GGL_SUBPIXEL_BITS=4, we need 8 + 12 + 4 = 24 bits
+// to encode the color iterators
+
+inline GGLcolor gglFixedToIteratedColor(GGLfixed c) {
+    return (c << 8) - c;
+}
+
+// ----------------------------------------------------------------------------
+
+template<bool> struct CTA;
+template<> struct CTA<true> { };
+
+#define GGL_CONTEXT(con, c)         context_t *(con) = static_cast<context_t *>(c) /* NOLINT */
+#define GGL_OFFSETOF(field)         uintptr_t(&(((context_t*)0)->field))
+#define GGL_INIT_PROC(p, f)         p.f = ggl_ ## f;
+#define GGL_BETWEEN(x, L, H)        (uint32_t((x)-(L)) <= ((H)-(L)))
+
+#define ggl_likely(x)	__builtin_expect(!!(x), 1)
+#define ggl_unlikely(x)	__builtin_expect(!!(x), 0)
+
+const int GGL_TEXTURE_UNIT_COUNT    = 2;
+const int GGL_TMU_STATE             = 0x00000001;
+const int GGL_CB_STATE              = 0x00000002;
+const int GGL_PIXEL_PIPELINE_STATE  = 0x00000004;
+
+// ----------------------------------------------------------------------------
+
+#define GGL_RESERVE_NEEDS(name, l, s)                               \
+    const uint32_t  GGL_NEEDS_##name##_MASK = (((1LU<<(s))-1)<<(l));  \
+    const uint32_t  GGL_NEEDS_##name##_SHIFT = (l);
+
+#define GGL_BUILD_NEEDS(val, name)                                  \
+    (((val)<<(GGL_NEEDS_##name##_SHIFT)) & GGL_NEEDS_##name##_MASK)
+
+#define GGL_READ_NEEDS(name, n)                                     \
+    (uint32_t((n) & GGL_NEEDS_##name##_MASK) >> GGL_NEEDS_##name##_SHIFT)
+
+#define GGL_NEED_MASK(name)     (uint32_t(GGL_NEEDS_##name##_MASK))
+#define GGL_NEED(name, val)     GGL_BUILD_NEEDS(val, name)
+
+GGL_RESERVE_NEEDS( CB_FORMAT,       0, 6 )
+GGL_RESERVE_NEEDS( SHADE,           6, 1 )
+GGL_RESERVE_NEEDS( W,               7, 1 )
+GGL_RESERVE_NEEDS( BLEND_SRC,       8, 4 )
+GGL_RESERVE_NEEDS( BLEND_DST,      12, 4 )
+GGL_RESERVE_NEEDS( BLEND_SRCA,     16, 4 )
+GGL_RESERVE_NEEDS( BLEND_DSTA,     20, 4 )
+GGL_RESERVE_NEEDS( LOGIC_OP,       24, 4 )
+GGL_RESERVE_NEEDS( MASK_ARGB,      28, 4 )
+
+GGL_RESERVE_NEEDS( P_ALPHA_TEST,    0, 3 )
+GGL_RESERVE_NEEDS( P_AA,            3, 1 )
+GGL_RESERVE_NEEDS( P_DEPTH_TEST,    4, 3 )
+GGL_RESERVE_NEEDS( P_MASK_Z,        7, 1 )
+GGL_RESERVE_NEEDS( P_DITHER,        8, 1 )
+GGL_RESERVE_NEEDS( P_FOG,           9, 1 )
+GGL_RESERVE_NEEDS( P_RESERVED1,    10,22 )
+
+GGL_RESERVE_NEEDS( T_FORMAT,        0, 6 )
+GGL_RESERVE_NEEDS( T_RESERVED0,     6, 1 )
+GGL_RESERVE_NEEDS( T_POT,           7, 1 )
+GGL_RESERVE_NEEDS( T_S_WRAP,        8, 2 )
+GGL_RESERVE_NEEDS( T_T_WRAP,       10, 2 )
+GGL_RESERVE_NEEDS( T_ENV,          12, 3 )
+GGL_RESERVE_NEEDS( T_LINEAR,       15, 1 )
+
+const int GGL_NEEDS_WRAP_CLAMP_TO_EDGE  = 0;
+const int GGL_NEEDS_WRAP_REPEAT         = 1;
+const int GGL_NEEDS_WRAP_11             = 2;
+
+inline uint32_t ggl_wrap_to_needs(uint32_t e) {
+    switch (e) {
+    case GGL_CLAMP:         return GGL_NEEDS_WRAP_CLAMP_TO_EDGE;
+    case GGL_REPEAT:        return GGL_NEEDS_WRAP_REPEAT;
+    }
+    return 0;
+}
+
+inline uint32_t ggl_blendfactor_to_needs(uint32_t b) {
+    if (b <= 1) return b;
+    return (b & 0xF)+2;
+}
+
+inline uint32_t ggl_needs_to_blendfactor(uint32_t n) {
+    if (n <= 1) return n;
+    return (n - 2) + 0x300;
+}
+
+inline uint32_t ggl_env_to_needs(uint32_t e) {
+    switch (e) {
+    case GGL_REPLACE:   return 0;
+    case GGL_MODULATE:  return 1;
+    case GGL_DECAL:     return 2;
+    case GGL_BLEND:     return 3;
+    case GGL_ADD:       return 4;
+    }
+    return 0;
+}
+
+inline uint32_t ggl_needs_to_env(uint32_t n) {
+    const uint32_t envs[] = { GGL_REPLACE, GGL_MODULATE,
+            GGL_DECAL, GGL_BLEND, GGL_ADD };
+    return envs[n];
+}
+
+// ----------------------------------------------------------------------------
+
+enum {
+    GGL_ENABLE_BLENDING     = 0x00000001,
+    GGL_ENABLE_SMOOTH       = 0x00000002,
+    GGL_ENABLE_AA           = 0x00000004,
+    GGL_ENABLE_LOGIC_OP     = 0x00000008,
+    GGL_ENABLE_ALPHA_TEST   = 0x00000010,
+    GGL_ENABLE_SCISSOR_TEST = 0x00000020,
+    GGL_ENABLE_TMUS         = 0x00000040,
+    GGL_ENABLE_DEPTH_TEST   = 0x00000080,
+    GGL_ENABLE_STENCIL_TEST = 0x00000100,
+    GGL_ENABLE_W            = 0x00000200,
+    GGL_ENABLE_DITHER       = 0x00000400,
+    GGL_ENABLE_FOG          = 0x00000800,
+    GGL_ENABLE_POINT_AA_NICE= 0x00001000
+};
+
+// ----------------------------------------------------------------------------
+
+struct needs_filter_t;
+struct needs_t {
+    inline int match(const needs_filter_t& filter);
+    inline bool operator == (const needs_t& rhs) const {
+        return  (n==rhs.n) &&
+                (p==rhs.p) &&
+                (t[0]==rhs.t[0]) &&
+                (t[1]==rhs.t[1]);
+    }
+    inline bool operator != (const needs_t& rhs) const {
+        return !operator == (rhs);
+    }
+    uint32_t    n;
+    uint32_t    p;
+    uint32_t    t[GGL_TEXTURE_UNIT_COUNT];
+};
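+// n, p and t[] hold the three needs words reserved above: the main
+// raster/blend word, the pixel-pipeline word, and one word per TMU.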
+
+inline int compare_type(const needs_t& lhs, const needs_t& rhs) {
+    return memcmp(&lhs, &rhs, sizeof(needs_t));
+}
+
+struct needs_filter_t {
+    needs_t     value;
+    needs_t     mask;
+};
+
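+// match() returns 1 only when every bit selected by filter.mask agrees
+// between this needs_t and filter.value; unmasked bits are don't-cares.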
+int needs_t::match(const needs_filter_t& filter) {
+    uint32_t result = 
+        ((filter.value.n ^ n)       & filter.mask.n)    |
+        ((filter.value.p ^ p)       & filter.mask.p)    |
+        ((filter.value.t[0] ^ t[0]) & filter.mask.t[0]) |
+        ((filter.value.t[1] ^ t[1]) & filter.mask.t[1]);
+    return (result == 0);
+}
+
+// ----------------------------------------------------------------------------
+
+struct context_t;
+class Assembly;
+
+struct blend_state_t {
+    uint32_t            src;
+    uint32_t            dst;
+    uint32_t            src_alpha;
+    uint32_t            dst_alpha;
+    uint8_t             reserved;
+    uint8_t             alpha_separate;
+    uint8_t             operation;
+    uint8_t             equation;
+};
+
+struct mask_state_t {
+    uint8_t             color;
+    uint8_t             depth;
+    uint32_t            stencil;
+};
+
+struct clear_state_t {
+    GGLclampx           r;
+    GGLclampx           g;
+    GGLclampx           b;
+    GGLclampx           a;
+    GGLclampx           depth;
+    GGLint              stencil;
+    uint32_t            colorPacked;
+    uint32_t            depthPacked;
+    uint32_t            stencilPacked;
+    uint32_t            dirty;
+};
+
+struct fog_state_t {
+    uint8_t     color[4];
+};
+
+struct logic_op_state_t {
+    uint16_t            opcode;
+};
+
+struct alpha_test_state_t {
+    uint16_t            func;
+    GGLcolor            ref;
+};
+
+struct depth_test_state_t {
+    uint16_t            func;
+    GGLclampx           clearValue;
+};
+
+struct scissor_t {
+    uint32_t            user_left;
+    uint32_t            user_right;
+    uint32_t            user_top;
+    uint32_t            user_bottom;
+    uint32_t            left;
+    uint32_t            right;
+    uint32_t            top;
+    uint32_t            bottom;
+};
+
+struct pixel_t {
+    uint32_t    c[4];
+    uint8_t     s[4];
+};
+
+struct surface_t {
+    union {
+        GGLSurface          s;
+        // Keep the following struct field types in line with the corresponding
+        // GGLSurface fields to avoid mismatches leading to errors.
+        struct {
+            GGLsizei        reserved;
+            GGLuint         width;
+            GGLuint         height;
+            GGLint          stride;
+            GGLubyte*       data;
+            GGLubyte        format;
+            GGLubyte        dirty;
+            GGLubyte        pad[2];
+        };
+    };
+    void                (*read) (const surface_t* s, context_t* c,
+                                uint32_t x, uint32_t y, pixel_t* pixel);
+    void                (*write)(const surface_t* s, context_t* c,
+                                uint32_t x, uint32_t y, const pixel_t* pixel);
+};
+
+// ----------------------------------------------------------------------------
+
+struct texture_shade_t {
+    union {
+        struct {
+            int32_t             is0;
+            int32_t             idsdx;
+            int32_t             idsdy;
+            int                 sscale;
+            int32_t             it0;
+            int32_t             idtdx;
+            int32_t             idtdy;
+            int                 tscale;
+        };
+        struct {
+            int32_t             v;
+            int32_t             dx;
+            int32_t             dy;
+            int                 scale;
+        } st[2];
+    };
+};
+
+struct texture_iterators_t {
+    // these are not encoded in the same way as in the
+    // texture_shade_t structure
+    union {
+        struct {
+            GGLfixed            ydsdy;
+            GGLfixed            dsdx;
+            GGLfixed            dsdy;
+            int                 sscale;
+            GGLfixed            ydtdy;
+            GGLfixed            dtdx;
+            GGLfixed            dtdy;
+            int                 tscale;
+        };
+        struct {
+            GGLfixed            ydvdy;
+            GGLfixed            dvdx;
+            GGLfixed            dvdy;
+            int                 scale;
+        } st[2];
+    };
+};
+
+struct texture_t {
+    surface_t           surface;
+    texture_iterators_t iterators;
+    texture_shade_t     shade;
+    uint32_t            s_coord;
+    uint32_t            t_coord;
+    uint16_t            s_wrap;
+    uint16_t            t_wrap;
+    uint16_t            min_filter;
+    uint16_t            mag_filter;
+    uint16_t            env;
+    uint8_t             env_color[4];
+    uint8_t             enable;
+    uint8_t             dirty;
+};
+
+struct raster_t {
+    GGLfixed            x;
+    GGLfixed            y;
+};
+
+struct framebuffer_t {
+    surface_t           color;
+    surface_t           read;
+    surface_t           depth;
+    surface_t           stencil;
+    int16_t             *coverage;
+    size_t              coverageBufferSize;
+};
+
+// ----------------------------------------------------------------------------
+
+struct iterators_t {
+    int32_t             xl;
+    int32_t             xr;
+    int32_t             y;
+    GGLcolor            ydady;
+    GGLcolor            ydrdy;
+    GGLcolor            ydgdy;
+    GGLcolor            ydbdy;
+    GGLfixed            ydzdy;
+    GGLfixed            ydwdy;
+    GGLfixed            ydfdy;
+};
+
+struct shade_t {
+    GGLcolor            a0;
+    GGLcolor            dadx;
+    GGLcolor            dady;
+    GGLcolor            r0;
+    GGLcolor            drdx;
+    GGLcolor            drdy;
+    GGLcolor            g0;
+    GGLcolor            dgdx;
+    GGLcolor            dgdy;
+    GGLcolor            b0;
+    GGLcolor            dbdx;
+    GGLcolor            dbdy;
+    uint32_t            z0;
+    GGLfixed32          dzdx;
+    GGLfixed32          dzdy;
+    GGLfixed            w0;
+    GGLfixed            dwdx;
+    GGLfixed            dwdy;
+    uint32_t            f0;
+    GGLfixed            dfdx;
+    GGLfixed            dfdy;
+};
+
+// these are used in the generated code
+// we use this mirror structure to improve
+// data locality in the pixel pipeline
+struct generated_tex_vars_t {
+    uint32_t    width;
+    uint32_t    height;
+    uint32_t    stride;
+    uintptr_t   data;
+    int32_t     dsdx;
+    int32_t     dtdx;
+    int32_t     spill[2];
+};
+
+struct generated_vars_t {
+    struct {
+        int32_t c;
+        int32_t dx;
+    } argb[4];
+    int32_t     aref;
+    int32_t     dzdx;
+    int32_t     zbase;
+    int32_t     f;
+    int32_t     dfdx;
+    int32_t     spill[3];
+    generated_tex_vars_t    texture[GGL_TEXTURE_UNIT_COUNT];
+    int32_t     rt;
+    int32_t     lb;
+};
+
+// ----------------------------------------------------------------------------
+
+struct state_t {
+    framebuffer_t       buffers;
+    texture_t           texture[GGL_TEXTURE_UNIT_COUNT];
+    scissor_t           scissor;
+    raster_t            raster;
+    blend_state_t       blend;
+    alpha_test_state_t  alpha_test;
+    depth_test_state_t  depth_test;
+    mask_state_t        mask;
+    clear_state_t       clear;
+    fog_state_t         fog;
+    logic_op_state_t    logic_op;
+    uint32_t            enables;
+    uint32_t            enabled_tmu;
+    needs_t             needs;
+};
+
+// ----------------------------------------------------------------------------
+
+struct context_t {
+    GGLContext          procs;
+    state_t             state;
+    shade_t             shade;
+    iterators_t         iterators;
+    generated_vars_t    generated_vars                __attribute__((aligned(32)));
+    uint8_t             ditherMatrix[GGL_DITHER_SIZE] __attribute__((aligned(32)));
+    uint32_t            packed;
+    uint32_t            packed8888;
+    const GGLFormat*    formats;
+    uint32_t            dirty;
+    texture_t*          activeTMU;
+    uint32_t            activeTMUIndex;
+
+    void                (*init_y)(context_t* c, int32_t y);
+    void                (*step_y)(context_t* c);
+    void                (*scanline)(context_t* c);
+    void                (*span)(context_t* c);
+    void                (*rect)(context_t* c, size_t yc);
+
+    void*               base;
+    Assembly*           scanline_as;
+    GGLenum             error;
+};
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_context(context_t* context);
+void ggl_uninit_context(context_t* context);
+void ggl_error(context_t* c, GGLenum error);
+int64_t ggl_system_time();
+
+// ----------------------------------------------------------------------------
+
+};
+
+#endif // ANDROID_GGL_CONTEXT_H
+
diff --git a/libpixelflinger/include/private/pixelflinger/ggl_fixed.h b/libpixelflinger/include/private/pixelflinger/ggl_fixed.h
new file mode 100644
index 0000000..4217a89
--- /dev/null
+++ b/libpixelflinger/include/private/pixelflinger/ggl_fixed.h
@@ -0,0 +1,878 @@
+/*
+ * Copyright (C) 2005 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ANDROID_GGL_FIXED_H
+#define ANDROID_GGL_FIXED_H
+
+#include <math.h>
+#include <pixelflinger/pixelflinger.h>
+
+// ----------------------------------------------------------------------------
+
+#define CONST           __attribute__((const))
+#define ALWAYS_INLINE   __attribute__((always_inline))
+
+const GGLfixed FIXED_BITS = 16;
+const GGLfixed FIXED_EPSILON  = 1;
+const GGLfixed FIXED_ONE  = 1L<<FIXED_BITS;
+const GGLfixed FIXED_HALF = 1L<<(FIXED_BITS-1);
+const GGLfixed FIXED_MIN  = 0x80000000L;
+const GGLfixed FIXED_MAX  = 0x7FFFFFFFL;
+
+inline GGLfixed gglIntToFixed(GGLfixed i)       ALWAYS_INLINE ;
+inline GGLfixed gglFixedToIntRound(GGLfixed f)  ALWAYS_INLINE ;
+inline GGLfixed gglFixedToIntFloor(GGLfixed f)  ALWAYS_INLINE ;
+inline GGLfixed gglFixedToIntCeil(GGLfixed f)   ALWAYS_INLINE ;
+inline GGLfixed gglFracx(GGLfixed v)            ALWAYS_INLINE ;
+inline GGLfixed gglFloorx(GGLfixed v)           ALWAYS_INLINE ;
+inline GGLfixed gglCeilx(GGLfixed v)            ALWAYS_INLINE ;
+inline GGLfixed gglCenterx(GGLfixed v)          ALWAYS_INLINE ;
+inline GGLfixed gglRoundx(GGLfixed v)           ALWAYS_INLINE ;
+
+GGLfixed gglIntToFixed(GGLfixed i) {
+    return i<<FIXED_BITS;
+}
+GGLfixed gglFixedToIntRound(GGLfixed f) {
+    return (f + FIXED_HALF)>>FIXED_BITS;
+}
+GGLfixed gglFixedToIntFloor(GGLfixed f) {
+    return f>>FIXED_BITS;
+}
+GGLfixed gglFixedToIntCeil(GGLfixed f) {
+    return (f + ((1<<FIXED_BITS) - 1))>>FIXED_BITS;
+}
+
+GGLfixed gglFracx(GGLfixed v) {
+    return v & ((1<<FIXED_BITS)-1);
+}
+GGLfixed gglFloorx(GGLfixed v) {
+    return gglFixedToIntFloor(v)<<FIXED_BITS;
+}
+GGLfixed gglCeilx(GGLfixed v) {
+    return gglFixedToIntCeil(v)<<FIXED_BITS;
+}
+GGLfixed gglCenterx(GGLfixed v) {
+    return gglFloorx(v + FIXED_HALF) | FIXED_HALF;
+}
+GGLfixed gglRoundx(GGLfixed v) {
+    return gglFixedToIntRound(v)<<FIXED_BITS;
+}
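+// e.g. with FIXED_ONE == 0x10000, 1.5 is 0x18000:
+//   gglFixedToIntRound(0x18000) == 2, gglFixedToIntFloor(0x18000) == 1,
+//   gglFixedToIntCeil(0x18000) == 2, gglFracx(0x18000) == FIXED_HALF.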
+
+// conversion from (unsigned) int, short, byte to fixed...
+#define GGL_B_TO_X(_x)      GGLfixed( ((int32_t(_x)+1)>>1)<<10 )
+#define GGL_S_TO_X(_x)      GGLfixed( ((int32_t(_x)+1)>>1)<<2 )
+#define GGL_I_TO_X(_x)      GGLfixed( ((int32_t(_x)>>1)+1)>>14 )
+#define GGL_UB_TO_X(_x)     GGLfixed(   uint32_t(_x) +      \
+                                        (uint32_t(_x)<<8) + \
+                                        (uint32_t(_x)>>7) )
+#define GGL_US_TO_X(_x)     GGLfixed( (_x) + ((_x)>>15) )
+#define GGL_UI_TO_X(_x)     GGLfixed( (((_x)>>1)+1)>>15 )
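+// GGL_UB_TO_X spreads the byte across the fraction so full scale is exact:
+//   GGL_UB_TO_X(0xFF) == 0xFF + 0xFF00 + 0x1 == 0x10000 == FIXED_ONE.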
+
+// ----------------------------------------------------------------------------
+
+GGLfixed gglPowx(GGLfixed x, GGLfixed y) CONST;
+GGLfixed gglSqrtx(GGLfixed a) CONST;
+GGLfixed gglSqrtRecipx(GGLfixed x) CONST;
+int32_t gglMulDivi(int32_t a, int32_t b, int32_t c);
+
+int32_t gglRecipQNormalized(int32_t x, int* exponent);
+int32_t gglRecipQ(GGLfixed x, int q) CONST;
+
+inline GGLfixed gglRecip(GGLfixed x) CONST;
+inline GGLfixed gglRecip(GGLfixed x) {
+    return gglRecipQ(x, 16);
+}
+
+inline GGLfixed gglRecip28(GGLfixed x) CONST;
+inline GGLfixed gglRecip28(GGLfixed x) {
+    return gglRecipQ(x, 28);
+}
+
+// ----------------------------------------------------------------------------
+
+#if defined(__arm__) && !defined(__thumb__)
+
+// inline ARM implementations
+inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift) CONST;
+__attribute__((always_inline)) inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift) {
+    GGLfixed result, t;
+    if (__builtin_constant_p(shift)) {
+    asm("smull  %[lo], %[hi], %[x], %[y]            \n"
+        "movs   %[lo], %[lo], lsr %[rshift]         \n"
+        "adc    %[lo], %[lo], %[hi], lsl %[lshift]  \n"
+        : [lo]"=r"(result), [hi]"=r"(t), [x]"=r"(x)
+        : "%[x]"(x), [y]"r"(y), [lshift] "I"(32-shift), [rshift] "I"(shift)
+        : "cc"
+        );
+    } else {
+    asm("smull  %[lo], %[hi], %[x], %[y]            \n"
+        "movs   %[lo], %[lo], lsr %[rshift]         \n"
+        "adc    %[lo], %[lo], %[hi], lsl %[lshift]  \n"
+        : [lo]"=&r"(result), [hi]"=&r"(t), [x]"=&r"(x)
+        : "%[x]"(x), [y]"r"(y), [lshift] "r"(32-shift), [rshift] "r"(shift)
+        : "cc"
+        );
+    }
+    return result;
+}
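+// The smull/movs/adc sequence above rounds to nearest: movs puts the last
+// bit shifted out of the low word into the carry flag, and adc folds the
+// high word back in while adding that carry.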
+
+inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST;
+__attribute__((always_inline)) inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a,
+                                                          int shift) {
+    GGLfixed result, t;
+    if (__builtin_constant_p(shift)) {
+    asm("smull  %[lo], %[hi], %[x], %[y]            \n"
+        "add    %[lo], %[a],  %[lo], lsr %[rshift]  \n"
+        "add    %[lo], %[lo], %[hi], lsl %[lshift]  \n"
+        : [lo]"=&r"(result), [hi]"=&r"(t), [x]"=&r"(x)
+        : "%[x]"(x), [y]"r"(y), [a]"r"(a), [lshift] "I"(32-shift), [rshift] "I"(shift)
+        );
+    } else {
+    asm("smull  %[lo], %[hi], %[x], %[y]            \n"
+        "add    %[lo], %[a],  %[lo], lsr %[rshift]  \n"
+        "add    %[lo], %[lo], %[hi], lsl %[lshift]  \n"
+        : [lo]"=&r"(result), [hi]"=&r"(t), [x]"=&r"(x)
+        : "%[x]"(x), [y]"r"(y), [a]"r"(a), [lshift] "r"(32-shift), [rshift] "r"(shift)
+        );
+    }
+    return result;
+}
+
+inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST;
+inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) {
+    GGLfixed result, t;
+    if (__builtin_constant_p(shift)) {
+    asm("smull  %[lo], %[hi], %[x], %[y]            \n"
+        "rsb    %[lo], %[a],  %[lo], lsr %[rshift]  \n"
+        "add    %[lo], %[lo], %[hi], lsl %[lshift]  \n"
+        : [lo]"=&r"(result), [hi]"=&r"(t), [x]"=&r"(x)
+        : "%[x]"(x), [y]"r"(y), [a]"r"(a), [lshift] "I"(32-shift), [rshift] "I"(shift)
+        );
+    } else {
+    asm("smull  %[lo], %[hi], %[x], %[y]            \n"
+        "rsb    %[lo], %[a],  %[lo], lsr %[rshift]  \n"
+        "add    %[lo], %[lo], %[hi], lsl %[lshift]  \n"
+        : [lo]"=&r"(result), [hi]"=&r"(t), [x]"=&r"(x)
+        : "%[x]"(x), [y]"r"(y), [a]"r"(a), [lshift] "r"(32-shift), [rshift] "r"(shift)
+        );
+    }
+    return result;
+}
+
+inline int64_t gglMulii(int32_t x, int32_t y) CONST;
+inline int64_t gglMulii(int32_t x, int32_t y)
+{
+    // 64-bits result: r0=low, r1=high
+    union {
+        struct {
+            int32_t lo;
+            int32_t hi;
+        } s;
+        int64_t res;
+    };
+    asm("smull %0, %1, %2, %3   \n"
+        : "=r"(s.lo), "=&r"(s.hi)
+        : "%r"(x), "r"(y)
+        :
+        );
+    return res;
+}
+#elif defined(__mips__) && __mips_isa_rev < 6
+
+/*inline MIPS implementations*/
+inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) CONST;
+inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) {
+    GGLfixed result,tmp,tmp1,tmp2;
+
+    if (__builtin_constant_p(shift)) {
+        if (shift == 0) {
+            asm ("mult %[a], %[b] \t\n"
+              "mflo  %[res]   \t\n"
+            : [res]"=&r"(result),[tmp]"=&r"(tmp)
+            : [a]"r"(a),[b]"r"(b)
+            : "%hi","%lo"
+            );
+        } else if (shift == 32)
+        {
+            asm ("mult %[a], %[b] \t\n"
+            "li  %[tmp],1\t\n"
+            "sll  %[tmp],%[tmp],0x1f\t\n"
+            "mflo %[res]   \t\n"
+            "addu %[tmp1],%[tmp],%[res] \t\n"
+            "sltu %[tmp1],%[tmp1],%[tmp]\t\n"   /*obit*/
+            "sra %[tmp],%[tmp],0x1f \t\n"
+            "mfhi  %[res]   \t\n"
+            "addu %[res],%[res],%[tmp]\t\n"
+            "addu %[res],%[res],%[tmp1]\t\n"
+            : [res]"=&r"(result),[tmp]"=&r"(tmp),[tmp1]"=&r"(tmp1)
+            : [a]"r"(a),[b]"r"(b),[shift]"I"(shift)
+            : "%hi","%lo"
+            );
+        } else if ((shift >0) && (shift < 32))
+        {
+            asm ("mult %[a], %[b] \t\n"
+            "li  %[tmp],1 \t\n"
+            "sll  %[tmp],%[tmp],%[shiftm1] \t\n"
+            "mflo  %[res]   \t\n"
+            "addu %[tmp1],%[tmp],%[res] \t\n"
+            "sltu %[tmp1],%[tmp1],%[tmp] \t\n"  /*obit?*/
+            "addu  %[res],%[res],%[tmp] \t\n"
+            "mfhi  %[tmp]   \t\n"
+            "addu  %[tmp],%[tmp],%[tmp1] \t\n"
+            "sll   %[tmp],%[tmp],%[lshift] \t\n"
+            "srl   %[res],%[res],%[rshift]    \t\n"
+            "or    %[res],%[res],%[tmp] \t\n"
+            : [res]"=&r"(result),[tmp]"=&r"(tmp),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+            : [a]"r"(a),[b]"r"(b),[lshift]"I"(32-shift),[rshift]"I"(shift),[shiftm1]"I"(shift-1)
+            : "%hi","%lo"
+            );
+        } else {
+            asm ("mult %[a], %[b] \t\n"
+            "li  %[tmp],1 \t\n"
+            "sll  %[tmp],%[tmp],%[shiftm1] \t\n"
+            "mflo  %[res]   \t\n"
+            "addu %[tmp1],%[tmp],%[res] \t\n"
+            "sltu %[tmp1],%[tmp1],%[tmp] \t\n"  /*obit?*/
+            "sra  %[tmp2],%[tmp],0x1f \t\n"
+            "addu  %[res],%[res],%[tmp] \t\n"
+            "mfhi  %[tmp]   \t\n"
+            "addu  %[tmp],%[tmp],%[tmp2] \t\n"
+            "addu  %[tmp],%[tmp],%[tmp1] \t\n"            /*tmp=hi*/
+            "srl   %[tmp2],%[res],%[rshift]    \t\n"
+            "srav  %[res], %[tmp],%[rshift]\t\n"
+            "sll   %[tmp],%[tmp],1 \t\n"
+            "sll   %[tmp],%[tmp],%[norbits] \t\n"
+            "or    %[tmp],%[tmp],%[tmp2] \t\n"
+            "movz  %[res],%[tmp],%[bit5] \t\n"
+            : [res]"=&r"(result),[tmp]"=&r"(tmp),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+            : [a]"r"(a),[b]"r"(b),[norbits]"I"(~(shift)),[rshift]"I"(shift),[shiftm1] "I"(shift-1),[bit5]"I"(shift & 0x20)
+            : "%hi","%lo"
+            );
+        }
+    } else {
+        asm ("mult %[a], %[b] \t\n"
+        "li  %[tmp],1 \t\n"
+        "sll  %[tmp],%[tmp],%[shiftm1] \t\n"
+        "mflo  %[res]   \t\n"
+        "addu %[tmp1],%[tmp],%[res] \t\n"
+        "sltu %[tmp1],%[tmp1],%[tmp] \t\n"  /*obit?*/
+        "sra  %[tmp2],%[tmp],0x1f \t\n"
+        "addu  %[res],%[res],%[tmp] \t\n"
+        "mfhi  %[tmp]   \t\n"
+        "addu  %[tmp],%[tmp],%[tmp2] \t\n"
+        "addu  %[tmp],%[tmp],%[tmp1] \t\n"            /*tmp=hi*/
+        "srl   %[tmp2],%[res],%[rshift]    \t\n"
+        "srav  %[res], %[tmp],%[rshift]\t\n"
+        "sll   %[tmp],%[tmp],1 \t\n"
+        "sll   %[tmp],%[tmp],%[norbits] \t\n"
+        "or    %[tmp],%[tmp],%[tmp2] \t\n"
+        "movz  %[res],%[tmp],%[bit5] \t\n"
+         : [res]"=&r"(result),[tmp]"=&r"(tmp),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+         : [a]"r"(a),[b]"r"(b),[norbits]"r"(~(shift)),[rshift] "r"(shift),[shiftm1]"r"(shift-1),[bit5] "r"(shift & 0x20)
+         : "%hi","%lo"
+         );
+        }
+
+        return result;
+}
+
+inline GGLfixed gglMulAddx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) CONST;
+inline GGLfixed gglMulAddx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) {
+    GGLfixed result,t,tmp1,tmp2;
+
+    if (__builtin_constant_p(shift)) {
+        if (shift == 0) {
+                 asm ("mult %[a], %[b] \t\n"
+                 "mflo  %[lo]   \t\n"
+                 "addu  %[lo],%[lo],%[c]    \t\n"
+                 : [lo]"=&r"(result)
+                 : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+                 : "%hi","%lo"
+                 );
+                } else if (shift == 32) {
+                    asm ("mult %[a], %[b] \t\n"
+                    "mfhi  %[lo]   \t\n"
+                    "addu  %[lo],%[lo],%[c]    \t\n"
+                    : [lo]"=&r"(result)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+                    : "%hi","%lo"
+                    );
+                } else if ((shift>0) && (shift<32)) {
+                    asm ("mult %[a], %[b] \t\n"
+                    "mflo  %[res]   \t\n"
+                    "mfhi  %[t]   \t\n"
+                    "srl   %[res],%[res],%[rshift]    \t\n"
+                    "sll   %[t],%[t],%[lshift]     \t\n"
+                    "or  %[res],%[res],%[t]    \t\n"
+                    "addu  %[res],%[res],%[c]    \t\n"
+                    : [res]"=&r"(result),[t]"=&r"(t)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c),[lshift]"I"(32-shift),[rshift]"I"(shift)
+                    : "%hi","%lo"
+                    );
+                } else {
+                    asm ("mult %[a], %[b] \t\n"
+                    "nor %[tmp1],$zero,%[shift]\t\n"
+                    "mflo  %[res]   \t\n"
+                    "mfhi  %[t]   \t\n"
+                    "srl   %[res],%[res],%[shift]    \t\n"
+                    "sll   %[tmp2],%[t],1     \t\n"
+                    "sllv  %[tmp2],%[tmp2],%[tmp1]     \t\n"
+                    "or  %[tmp1],%[tmp2],%[res]    \t\n"
+                    "srav  %[res],%[t],%[shift]     \t\n"
+                    "andi %[tmp2],%[shift],0x20\t\n"
+                    "movz %[res],%[tmp1],%[tmp2]\t\n"
+                    "addu  %[res],%[res],%[c]    \t\n"
+                    : [res]"=&r"(result),[t]"=&r"(t),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c),[shift]"I"(shift)
+                    : "%hi","%lo"
+                    );
+                }
+            } else {
+                asm ("mult %[a], %[b] \t\n"
+                "nor %[tmp1],$zero,%[shift]\t\n"
+                "mflo  %[res]   \t\n"
+                "mfhi  %[t]   \t\n"
+                "srl   %[res],%[res],%[shift]    \t\n"
+                "sll   %[tmp2],%[t],1     \t\n"
+                "sllv  %[tmp2],%[tmp2],%[tmp1]     \t\n"
+                "or  %[tmp1],%[tmp2],%[res]    \t\n"
+                "srav  %[res],%[t],%[shift]     \t\n"
+                "andi %[tmp2],%[shift],0x20\t\n"
+                "movz %[res],%[tmp1],%[tmp2]\t\n"
+                "addu  %[res],%[res],%[c]    \t\n"
+                : [res]"=&r"(result),[t]"=&r"(t),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+                : [a]"r"(a),[b]"r"(b),[c]"r"(c),[shift]"r"(shift)
+                : "%hi","%lo"
+                );
+            }
+            return result;
+}
+
+inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) CONST;
+inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) {
+    GGLfixed result,t,tmp1,tmp2;
+
+    if (__builtin_constant_p(shift)) {
+        if (shift == 0) {
+                 asm ("mult %[a], %[b] \t\n"
+                 "mflo  %[lo]   \t\n"
+                 "subu  %[lo],%[lo],%[c]    \t\n"
+                 : [lo]"=&r"(result)
+                 : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+                 : "%hi","%lo"
+                 );
+                } else if (shift == 32) {
+                    asm ("mult %[a], %[b] \t\n"
+                    "mfhi  %[lo]   \t\n"
+                    "subu  %[lo],%[lo],%[c]    \t\n"
+                    : [lo]"=&r"(result)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+                    : "%hi","%lo"
+                    );
+                } else if ((shift>0) && (shift<32)) {
+                    asm ("mult %[a], %[b] \t\n"
+                    "mflo  %[res]   \t\n"
+                    "mfhi  %[t]   \t\n"
+                    "srl   %[res],%[res],%[rshift]    \t\n"
+                    "sll   %[t],%[t],%[lshift]     \t\n"
+                    "or  %[res],%[res],%[t]    \t\n"
+                    "subu  %[res],%[res],%[c]    \t\n"
+                    : [res]"=&r"(result),[t]"=&r"(t)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c),[lshift]"I"(32-shift),[rshift]"I"(shift)
+                    : "%hi","%lo"
+                    );
+                } else {
+                    asm ("mult %[a], %[b] \t\n"
+                    "nor %[tmp1],$zero,%[shift]\t\n"
+                     "mflo  %[res]   \t\n"
+                     "mfhi  %[t]   \t\n"
+                     "srl   %[res],%[res],%[shift]    \t\n"
+                     "sll   %[tmp2],%[t],1     \t\n"
+                     "sllv  %[tmp2],%[tmp2],%[tmp1]     \t\n"
+                     "or  %[tmp1],%[tmp2],%[res]    \t\n"
+                     "srav  %[res],%[t],%[shift]     \t\n"
+                     "andi %[tmp2],%[shift],0x20\t\n"
+                     "movz %[res],%[tmp1],%[tmp2]\t\n"
+                     "subu  %[res],%[res],%[c]    \t\n"
+                     : [res]"=&r"(result),[t]"=&r"(t),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+                     : [a]"r"(a),[b]"r"(b),[c]"r"(c),[shift]"I"(shift)
+                     : "%hi","%lo"
+                     );
+                    }
+                } else {
+                asm ("mult %[a], %[b] \t\n"
+                "nor %[tmp1],$zero,%[shift]\t\n"
+                "mflo  %[res]   \t\n"
+                "mfhi  %[t]   \t\n"
+                "srl   %[res],%[res],%[shift]    \t\n"
+                "sll   %[tmp2],%[t],1     \t\n"
+                "sllv  %[tmp2],%[tmp2],%[tmp1]     \t\n"
+                "or  %[tmp1],%[tmp2],%[res]    \t\n"
+                "srav  %[res],%[t],%[shift]     \t\n"
+                "andi %[tmp2],%[shift],0x20\t\n"
+                "movz %[res],%[tmp1],%[tmp2]\t\n"
+                "subu  %[res],%[res],%[c]    \t\n"
+                : [res]"=&r"(result),[t]"=&r"(t),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+                : [a]"r"(a),[b]"r"(b),[c]"r"(c),[shift]"r"(shift)
+                : "%hi","%lo"
+                );
+            }
+    return result;
+}
+
+inline int64_t gglMulii(int32_t x, int32_t y) CONST;
+inline int64_t gglMulii(int32_t x, int32_t y) {
+    union {
+        struct {
+#if defined(__MIPSEL__)
+            int32_t lo;
+            int32_t hi;
+#elif defined(__MIPSEB__)
+            int32_t hi;
+            int32_t lo;
+#endif
+        } s;
+        int64_t res;
+    }u;
+    asm("mult %2, %3 \t\n"
+        "mfhi %1   \t\n"
+        "mflo %0   \t\n"
+        : "=r"(u.s.lo), "=&r"(u.s.hi)
+        : "%r"(x), "r"(y)
+	: "%hi","%lo"
+        );
+    return u.res;
+}
+
+#elif defined(__aarch64__)
+
+// inline AArch64 implementations
+
+inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift) CONST;
+inline GGLfixed gglMulx(GGLfixed x, GGLfixed y, int shift)
+{
+    GGLfixed result;
+    GGLfixed round;
+
+    asm("mov    %x[round], #1                        \n"
+        "lsl    %x[round], %x[round], %x[shift]      \n"
+        "lsr    %x[round], %x[round], #1             \n"
+        "smaddl %x[result], %w[x], %w[y],%x[round]   \n"
+        "lsr    %x[result], %x[result], %x[shift]    \n"
+        : [round]"=&r"(round), [result]"=&r"(result) \
+        : [x]"r"(x), [y]"r"(y), [shift] "r"(shift)   \
+        :
+       );
+    return result;
+}
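+// round == 1 << (shift - 1) (0 when shift == 0), so smaddl adds half an
+// output ULP to the 64-bit product before the final shift: round-to-nearest.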
+inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST;
+inline GGLfixed gglMulAddx(GGLfixed x, GGLfixed y, GGLfixed a, int shift)
+{
+    GGLfixed result;
+    asm("smull  %x[result], %w[x], %w[y]                     \n"
+        "lsr    %x[result], %x[result], %x[shift]            \n"
+        "add    %w[result], %w[result], %w[a]                \n"
+        : [result]"=&r"(result)                               \
+        : [x]"r"(x), [y]"r"(y), [a]"r"(a), [shift] "r"(shift) \
+        :
+        );
+    return result;
+}
+
+inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift) CONST;
+inline GGLfixed gglMulSubx(GGLfixed x, GGLfixed y, GGLfixed a, int shift)
+{
+
+    GGLfixed result;
+
+    asm("smull  %x[result], %w[x], %w[y]                     \n"
+        "lsr    %x[result], %x[result], %x[shift]            \n"
+        "sub    %w[result], %w[result], %w[a]                \n"
+        : [result]"=&r"(result)                               \
+        : [x]"r"(x), [y]"r"(y), [a]"r"(a), [shift] "r"(shift) \
+        :
+        );
+    return result;
+}
+inline int64_t gglMulii(int32_t x, int32_t y) CONST;
+inline int64_t gglMulii(int32_t x, int32_t y)
+{
+    int64_t res;
+    asm("smull  %x0, %w1, %w2 \n"
+        : "=r"(res)
+        : "%r"(x), "r"(y)
+        :
+        );
+    return res;
+}
+
+#elif defined(__mips__) && __mips_isa_rev == 6
+
+/*inline MIPS implementations*/
+inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) CONST;
+inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) {
+    GGLfixed result,tmp,tmp1,tmp2;
+
+    if (__builtin_constant_p(shift)) {
+        if (shift == 0) {
+            asm ("mul %[res], %[a], %[b] \t\n"
+            : [res]"=&r"(result)
+            : [a]"r"(a),[b]"r"(b)
+            );
+        } else if (shift == 32)
+        {
+            asm ("mul %[res], %[a], %[b] \t\n"
+            "li  %[tmp],1\t\n"
+            "sll  %[tmp],%[tmp],0x1f\t\n"
+            "addu %[tmp1],%[tmp],%[res] \t\n"
+            "muh %[res], %[a], %[b] \t\n"
+            "sltu %[tmp1],%[tmp1],%[tmp]\t\n"   /*obit*/
+            "sra %[tmp],%[tmp],0x1f \t\n"
+            "addu %[res],%[res],%[tmp]\t\n"
+            "addu %[res],%[res],%[tmp1]\t\n"
+            : [res]"=&r"(result),[tmp]"=&r"(tmp),[tmp1]"=&r"(tmp1)
+            : [a]"r"(a),[b]"r"(b),[shift]"I"(shift)
+            );
+        } else if ((shift >0) && (shift < 32))
+        {
+            asm ("mul %[res], %[a], %[b] \t\n"
+            "li  %[tmp],1 \t\n"
+            "sll  %[tmp],%[tmp],%[shiftm1] \t\n"
+            "addu %[tmp1],%[tmp],%[res] \t\n"
+            "sltu %[tmp1],%[tmp1],%[tmp] \t\n"  /*obit?*/
+            "addu  %[res],%[res],%[tmp] \t\n"
+            "muh %[tmp], %[a], %[b] \t\n"
+            "addu  %[tmp],%[tmp],%[tmp1] \t\n"
+            "sll   %[tmp],%[tmp],%[lshift] \t\n"
+            "srl   %[res],%[res],%[rshift]    \t\n"
+            "or    %[res],%[res],%[tmp] \t\n"
+            : [res]"=&r"(result),[tmp]"=&r"(tmp),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+            : [a]"r"(a),[b]"r"(b),[lshift]"I"(32-shift),[rshift]"I"(shift),[shiftm1]"I"(shift-1)
+            );
+        } else {
+            asm ("mul %[res], %[a], %[b] \t\n"
+            "li  %[tmp],1 \t\n"
+            "sll  %[tmp],%[tmp],%[shiftm1] \t\n"
+            "addu %[tmp1],%[tmp],%[res] \t\n"
+            "sltu %[tmp1],%[tmp1],%[tmp] \t\n"  /*obit?*/
+            "sra  %[tmp2],%[tmp],0x1f \t\n"
+            "addu  %[res],%[res],%[tmp] \t\n"
+            "muh  %[tmp], %[a], %[b]   \t\n"
+            "addu  %[tmp],%[tmp],%[tmp2] \t\n"
+            "addu  %[tmp],%[tmp],%[tmp1] \t\n"            /*tmp=hi*/
+            "srl   %[tmp2],%[res],%[rshift]    \t\n"
+            "srav  %[res], %[tmp],%[rshift]\t\n"
+            "sll   %[tmp],%[tmp],1 \t\n"
+            "sll   %[tmp],%[tmp],%[norbits] \t\n"
+            "or    %[tmp],%[tmp],%[tmp2] \t\n"
+            "seleqz  %[tmp],%[tmp],%[bit5] \t\n"
+            "selnez  %[res],%[res],%[bit5] \t\n"
+            "or    %[res],%[res],%[tmp] \t\n"
+            : [res]"=&r"(result),[tmp]"=&r"(tmp),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+            : [a]"r"(a),[b]"r"(b),[norbits]"I"(~(shift)),[rshift]"I"(shift),[shiftm1] "I"(shift-1),[bit5]"I"(shift & 0x20)
+            );
+        }
+    } else {
+        asm ("mul %[res], %[a], %[b] \t\n"
+        "li  %[tmp],1 \t\n"
+        "sll  %[tmp],%[tmp],%[shiftm1] \t\n"
+        "addu %[tmp1],%[tmp],%[res] \t\n"
+        "sltu %[tmp1],%[tmp1],%[tmp] \t\n"  /*obit?*/
+        "sra  %[tmp2],%[tmp],0x1f \t\n"
+        "addu  %[res],%[res],%[tmp] \t\n"
+        "muh  %[tmp], %[a], %[b] \t\n"
+        "addu  %[tmp],%[tmp],%[tmp2] \t\n"
+        "addu  %[tmp],%[tmp],%[tmp1] \t\n"            /*tmp=hi*/
+        "srl   %[tmp2],%[res],%[rshift]    \t\n"
+        "srav  %[res], %[tmp],%[rshift]\t\n"
+        "sll   %[tmp],%[tmp],1 \t\n"
+        "sll   %[tmp],%[tmp],%[norbits] \t\n"
+        "or    %[tmp],%[tmp],%[tmp2] \t\n"
+        "seleqz  %[tmp],%[tmp],%[bit5] \t\n"
+        "selnez  %[res],%[res],%[bit5] \t\n"
+        "or    %[res],%[res],%[tmp] \t\n"
+         : [res]"=&r"(result),[tmp]"=&r"(tmp),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+         : [a]"r"(a),[b]"r"(b),[norbits]"r"(~(shift)),[rshift] "r"(shift),[shiftm1]"r"(shift-1),[bit5] "r"(shift & 0x20)
+         );
+        }
+        return result;
+}
+
+inline GGLfixed gglMulAddx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) CONST;
+inline GGLfixed gglMulAddx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) {
+    GGLfixed result,t,tmp1,tmp2;
+
+    if (__builtin_constant_p(shift)) {
+        if (shift == 0) {
+                 asm ("mul %[lo], %[a], %[b] \t\n"
+                 "addu  %[lo],%[lo],%[c]    \t\n"
+                 : [lo]"=&r"(result)
+                 : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+                 );
+                } else if (shift == 32) {
+                    asm ("muh %[lo], %[a], %[b] \t\n"
+                    "addu  %[lo],%[lo],%[c]    \t\n"
+                    : [lo]"=&r"(result)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+                    );
+                } else if ((shift>0) && (shift<32)) {
+                    asm ("mul %[res], %[a], %[b] \t\n"
+                    "muh  %[t], %[a], %[b] \t\n"
+                    "srl   %[res],%[res],%[rshift]    \t\n"
+                    "sll   %[t],%[t],%[lshift]     \t\n"
+                    "or  %[res],%[res],%[t]    \t\n"
+                    "addu  %[res],%[res],%[c]    \t\n"
+                    : [res]"=&r"(result),[t]"=&r"(t)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c),[lshift]"I"(32-shift),[rshift]"I"(shift)
+                    );
+                } else {
+                    asm ("mul %[res], %[a], %[b] \t\n"
+                    "muh %[t], %[a], %[b] \t\n"
+                    "nor %[tmp1],$zero,%[shift]\t\n"
+                    "srl   %[res],%[res],%[shift]    \t\n"
+                    "sll   %[tmp2],%[t],1     \t\n"
+                    "sllv  %[tmp2],%[tmp2],%[tmp1]     \t\n"
+                    "or  %[tmp1],%[tmp2],%[res]    \t\n"
+                    "srav  %[res],%[t],%[shift]     \t\n"
+                    "andi %[tmp2],%[shift],0x20\t\n"
+                    "seleqz %[tmp1],%[tmp1],%[tmp2]\t\n"
+                    "selnez %[res],%[res],%[tmp2]\t\n"
+                    "or %[res],%[res],%[tmp1]\t\n"
+                    "addu  %[res],%[res],%[c]    \t\n"
+                    : [res]"=&r"(result),[t]"=&r"(t),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c),[shift]"I"(shift)
+                    );
+                }
+            } else {
+                asm ("mul %[res], %[a], %[b] \t\n"
+                "muh %[t], %[a], %[b] \t\n"
+                "nor %[tmp1],$zero,%[shift]\t\n"
+                "srl   %[res],%[res],%[shift]    \t\n"
+                "sll   %[tmp2],%[t],1     \t\n"
+                "sllv  %[tmp2],%[tmp2],%[tmp1]     \t\n"
+                "or  %[tmp1],%[tmp2],%[res]    \t\n"
+                "srav  %[res],%[t],%[shift]     \t\n"
+                "andi %[tmp2],%[shift],0x20\t\n"
+                "seleqz %[tmp1],%[tmp1],%[tmp2]\t\n"
+                "selnez %[res],%[res],%[tmp2]\t\n"
+                "or %[res],%[res],%[tmp1]\t\n"
+                "addu  %[res],%[res],%[c]    \t\n"
+                : [res]"=&r"(result),[t]"=&r"(t),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+                : [a]"r"(a),[b]"r"(b),[c]"r"(c),[shift]"r"(shift)
+                );
+            }
+            return result;
+}
+
+inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) CONST;
+inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) {
+    GGLfixed result,t,tmp1,tmp2;
+
+    if (__builtin_constant_p(shift)) {
+        if (shift == 0) {
+                 asm ("mul %[lo], %[a], %[b] \t\n"
+                 "subu  %[lo],%[lo],%[c]    \t\n"
+                 : [lo]"=&r"(result)
+                 : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+                 );
+                } else if (shift == 32) {
+                    asm ("muh %[lo], %[a], %[b] \t\n"
+                    "subu  %[lo],%[lo],%[c]    \t\n"
+                    : [lo]"=&r"(result)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+                    );
+                } else if ((shift>0) && (shift<32)) {
+                    asm ("mul %[res], %[a], %[b] \t\n"
+                    "muh %[t], %[a], %[b] \t\n"
+                    "srl   %[res],%[res],%[rshift]    \t\n"
+                    "sll   %[t],%[t],%[lshift]     \t\n"
+                    "or  %[res],%[res],%[t]    \t\n"
+                    "subu  %[res],%[res],%[c]    \t\n"
+                    : [res]"=&r"(result),[t]"=&r"(t)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c),[lshift]"I"(32-shift),[rshift]"I"(shift)
+                    );
+                } else {
+                    asm ("mul %[res], %[a], %[b] \t\n"
+                    "muh %[t], %[a], %[b] \t\n"
+                    "nor %[tmp1],$zero,%[shift]\t\n"
+                    "srl   %[res],%[res],%[shift]    \t\n"
+                    "sll   %[tmp2],%[t],1     \t\n"
+                    "sllv  %[tmp2],%[tmp2],%[tmp1]     \t\n"
+                    "or  %[tmp1],%[tmp2],%[res]    \t\n"
+                    "srav  %[res],%[t],%[shift]     \t\n"
+                    "andi %[tmp2],%[shift],0x20\t\n"
+                    "seleqz %[tmp1],%[tmp1],%[tmp2]\t\n"
+                    "selnez %[res],%[res],%[tmp2]\t\n"
+                    "or %[res],%[res],%[tmp1]\t\n"
+                    "subu  %[res],%[res],%[c]    \t\n"
+                    : [res]"=&r"(result),[t]"=&r"(t),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+                    : [a]"r"(a),[b]"r"(b),[c]"r"(c),[shift]"I"(shift)
+                     );
+                    }
+                } else {
+                asm ("mul %[res], %[a], %[b] \t\n"
+                "muh %[t], %[a], %[b] \t\n"
+                "nor %[tmp1],$zero,%[shift]\t\n"
+                "srl   %[res],%[res],%[shift]    \t\n"
+                "sll   %[tmp2],%[t],1     \t\n"
+                "sllv  %[tmp2],%[tmp2],%[tmp1]     \t\n"
+                "or  %[tmp1],%[tmp2],%[res]    \t\n"
+                "srav  %[res],%[t],%[shift]     \t\n"
+                "andi %[tmp2],%[shift],0x20\t\n"
+                "seleqz %[tmp1],%[tmp1],%[tmp2]\t\n"
+                "selnez %[res],%[res],%[tmp2]\t\n"
+                "or %[res],%[res],%[tmp1]\t\n"
+                "subu  %[res],%[res],%[c]    \t\n"
+                : [res]"=&r"(result),[t]"=&r"(t),[tmp1]"=&r"(tmp1),[tmp2]"=&r"(tmp2)
+                : [a]"r"(a),[b]"r"(b),[c]"r"(c),[shift]"r"(shift)
+                );
+            }
+    return result;
+}
+
+inline int64_t gglMulii(int32_t x, int32_t y) CONST;
+inline int64_t gglMulii(int32_t x, int32_t y) {
+    union {
+        struct {
+#if defined(__MIPSEL__)
+            int32_t lo;
+            int32_t hi;
+#elif defined(__MIPSEB__)
+            int32_t hi;
+            int32_t lo;
+#endif
+        } s;
+        int64_t res;
+    }u;
+    asm("mul %0, %2, %3 \t\n"
+        "muh %1, %2, %3 \t\n"
+        : "=r"(u.s.lo), "=&r"(u.s.hi)
+        : "%r"(x), "r"(y)
+        );
+    return u.res;
+}
+
+#else // ----------------------------------------------------------------------
+
+inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) CONST;
+inline GGLfixed gglMulx(GGLfixed a, GGLfixed b, int shift) {
+    return GGLfixed((int64_t(a)*b + (1<<(shift-1)))>>shift);
+}
+inline GGLfixed gglMulAddx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) CONST;
+inline GGLfixed gglMulAddx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) {
+    return GGLfixed((int64_t(a)*b)>>shift) + c;
+}
+inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) CONST;
+inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c, int shift) {
+    return GGLfixed((int64_t(a)*b)>>shift) - c;
+}
+inline int64_t gglMulii(int32_t a, int32_t b) CONST;
+inline int64_t gglMulii(int32_t a, int32_t b) {
+    return int64_t(a)*b;
+}
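+// Portable C fallbacks. gglMulx rounds to nearest like the asm variants,
+// e.g. gglMulx(FIXED_ONE, FIXED_ONE, 16) == FIXED_ONE; the (1<<(shift-1))
+// rounding term assumes shift >= 1. Note the add/sub forms truncate
+// (no rounding term) before applying c.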
+
+#endif
+
+// ------------------------------------------------------------------------
+
+inline GGLfixed gglMulx(GGLfixed a, GGLfixed b) CONST;
+inline GGLfixed gglMulx(GGLfixed a, GGLfixed b) {
+    return gglMulx(a, b, 16);
+}
+inline GGLfixed gglMulAddx(GGLfixed a, GGLfixed b, GGLfixed c) CONST;
+inline GGLfixed gglMulAddx(GGLfixed a, GGLfixed b, GGLfixed c) {
+    return gglMulAddx(a, b, c, 16);
+}
+inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c) CONST;
+inline GGLfixed gglMulSubx(GGLfixed a, GGLfixed b, GGLfixed c) {
+    return gglMulSubx(a, b, c, 16);
+}
+
+// ------------------------------------------------------------------------
+
+inline int32_t gglClz(int32_t x) CONST;
+inline int32_t gglClz(int32_t x)
+{
+#if (defined(__arm__) && !defined(__thumb__)) || defined(__mips__) || defined(__aarch64__)
+    return __builtin_clz(x);
+#else
+    if (!x) return 32;
+    int32_t exp = 31;
+    if (x & 0xFFFF0000) { exp -=16; x >>= 16; }
+    if (x & 0x0000ff00) { exp -= 8; x >>= 8; }
+    if (x & 0x000000f0) { exp -= 4; x >>= 4; }
+    if (x & 0x0000000c) { exp -= 2; x >>= 2; }
+    if (x & 0x00000002) { exp -= 1; }
+    return exp;
+#endif
+}
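+// The fallback returns 32 for x == 0, whereas __builtin_clz(0) is
+// undefined, so callers must not rely on gglClz(0) on the builtin paths.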
+
+// ------------------------------------------------------------------------
+
+int32_t gglDivQ(GGLfixed n, GGLfixed d, int32_t i) CONST;
+
+inline int32_t gglDivQ16(GGLfixed n, GGLfixed d) CONST;
+inline int32_t gglDivQ16(GGLfixed n, GGLfixed d) {
+    return gglDivQ(n, d, 16);
+}
+
+inline int32_t gglDivx(GGLfixed n, GGLfixed d) CONST;
+inline int32_t gglDivx(GGLfixed n, GGLfixed d) {
+    return gglDivQ(n, d, 16);
+}
+
+// ------------------------------------------------------------------------
+
+inline GGLfixed gglRecipFast(GGLfixed x) CONST;
+inline GGLfixed gglRecipFast(GGLfixed x)
+{
+    // This is a really bad approximation of 1/x, but it's also
+    // very fast. x must be strictly positive.
+    // For x in [0.5, 1[, 1/x is approximated by 3 - 2*x
+    // (we use 2.30 fixed-point).
+    const int32_t lz = gglClz(x);
+    return (0xC0000000 - (x << (lz - 1))) >> (30-lz);
+}
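+// e.g. gglRecipFast(FIXED_ONE) == FIXED_ONE. The 0xC0000000 literal is
+// unsigned, which keeps the subtract and the final shift unsigned (logical).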
+
+// ------------------------------------------------------------------------
+
+inline GGLfixed gglClampx(GGLfixed c) CONST;
+inline GGLfixed gglClampx(GGLfixed c)
+{
+#if defined(__thumb__)
+    // clamp without branches
+    c &= ~(c>>31);  c = FIXED_ONE - c;
+    c &= ~(c>>31);  c = FIXED_ONE - c;
+#else
+#if defined(__arm__)
+    // I don't know why gcc thinks it's smarter than me! The code below
+    // clamps to zero in one instruction, but gcc won't generate it and
+    // replaces it with a cmp + movlt (it's quite amazing actually).
+    asm("bic %0, %1, %1, asr #31\n" : "=r"(c) : "r"(c));
+#elif defined(__aarch64__)
+    asm("bic %w0, %w1, %w1, asr #31\n" : "=r"(c) : "r"(c));
+#else
+    c &= ~(c>>31);
+#endif
+    if (c>FIXED_ONE)
+        c = FIXED_ONE;
+#endif
+    return c;
+}
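+// Clamps to [0, FIXED_ONE]: gglClampx(-1) == 0 and
+// gglClampx(FIXED_ONE + 1) == FIXED_ONE on every path above.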
+
+// ------------------------------------------------------------------------
+
+#endif // ANDROID_GGL_FIXED_H
diff --git a/libpixelflinger/libpixelflingertwrp_defaults.go.del b/libpixelflinger/libpixelflingertwrp_defaults.go.del
new file mode 100644
index 0000000..e7c1410
--- /dev/null
+++ b/libpixelflinger/libpixelflingertwrp_defaults.go.del
@@ -0,0 +1,117 @@
+package libtwrp_defaults
+
+import (
+	"android/soong/android"
+	"android/soong/cc"
+	"fmt"
+	"path/filepath"
+)
+
+func globalFlags(ctx android.BaseContext) []string {
+	var cflags []string
+
+	if ctx.AConfig().Getenv("ARCH_ARM_HAVE_NEON") == "true" {
+		cflags = append(cflags, "-D__ARM_HAVE_NEON")
+	}
+	return cflags
+}
+
+func globalSrcs(ctx android.BaseContext) []string {
+	var srcs []string
+
+	// filepath.Glob only errors on a malformed pattern, never on "no match",
+	// so test the match list itself: build the local tinyutils copies only
+	// when system/core does not already provide them.
+	matches, err := filepath.Glob("system/core/libpixelflinger/codeflinger/tinyutils/VectorImpl.cpp")
+	if err == nil && len(matches) == 0 {
+		srcs = append(srcs, "codeflinger/tinyutils/SharedBuffer.cpp")
+		srcs = append(srcs, "codeflinger/tinyutils/VectorImpl.cpp")
+	}
+
+	var arch = ctx.DeviceConfig().DeviceArch()
+	fmt.Println("arch: " + arch)
+
+	if arch != "x86" {
+		srcs = append(srcs, "codeflinger/ARMAssemblerInterface.cpp")
+		srcs = append(srcs, "codeflinger/ARMAssemblerProxy.cpp")
+		srcs = append(srcs, "codeflinger/GGLAssembler.cpp")
+		srcs = append(srcs, "codeflinger/load_store.cpp")
+		srcs = append(srcs, "codeflinger/blending.cpp")
+		srcs = append(srcs, "codeflinger/texturing.cpp")
+		srcs = append(srcs, "fixed.cpp")
+		srcs = append(srcs, "picker.cpp")
+		srcs = append(srcs, "pixelflinger.cpp")
+		srcs = append(srcs, "trap.cpp")
+		srcs = append(srcs, "scanline.cpp")
+	} else {
+		srcs = append(srcs, "codeflinger/x86/X86Assembler.cpp")
+		srcs = append(srcs, "codeflinger/x86/GGLX86Assembler.cpp")
+		srcs = append(srcs, "codeflinger/x86/load_store.cpp")
+		srcs = append(srcs, "codeflinger/x86/blending.cpp")
+		srcs = append(srcs, "codeflinger/x86/texturing.cpp")
+		srcs = append(srcs, "fixed.cpp")
+		srcs = append(srcs, "picker.cpp")
+		srcs = append(srcs, "pixelflinger.cpp")
+		srcs = append(srcs, "trap.cpp")
+		srcs = append(srcs, "scanline.cpp")
+	}
+
+	switch arch {
+	case "arm":
+		if ctx.AConfig().Getenv("ARCH_ARM_HAVE_NEON") == "true" {
+			srcs = append(srcs, "col32cb16blend_neon.S")
+		}
+		srcs = append(srcs, "codeflinger/ARMAssembler.cpp")
+		srcs = append(srcs, "codeflinger/disassem.c")
+		srcs = append(srcs, "col32cb16blend.S")
+		srcs = append(srcs, "t32cb16blend.S")
+	case "arm64":
+		srcs = append(srcs, "codeflinger/Arm64Assembler.cpp")
+		srcs = append(srcs, "codeflinger/Arm64Disassembler.cpp")
+		srcs = append(srcs, "arch-arm64/col32cb16blend.S")
+		srcs = append(srcs, "arch-arm64/t32cb16blend.S")
+	}
+	return srcs
+}
+
+func globalIncludes(ctx android.BaseContext) []string {
+	var includes []string
+
+	if ctx.AConfig().Getenv("TARGET_ARCH") == "arm" {
+		includes = append(includes, "external/libenc")
+	}
+
+	return includes
+}
+
+func libPixelflingerTwrpDefaults(ctx android.LoadHookContext) {
+	type props struct {
+		Target struct {
+			Android struct {
+				Cflags  []string
+				Enabled *bool
+			}
+		}
+		Cflags       []string
+		Srcs         []string
+		Include_dirs []string
+	}
+
+	p := &props{}
+	p.Cflags = globalFlags(ctx)
+	s := globalSrcs(ctx)
+	p.Srcs = s
+	i := globalIncludes(ctx)
+	p.Include_dirs = i
+	ctx.AppendProperties(p)
+}
+
+func init() {
+	android.RegisterModuleType("libpixelflingertwrp_defaults", libPixelflingerTwrpDefaultsFactory)
+}
+
+func libPixelflingerTwrpDefaultsFactory() android.Module {
+	module := cc.DefaultsFactory()
+	android.AddLoadHook(module, libPixelflingerTwrpDefaults)
+
+	return module
+}
diff --git a/libpixelflinger/picker.cpp b/libpixelflinger/picker.cpp
new file mode 100644
index 0000000..aa55229
--- /dev/null
+++ b/libpixelflinger/picker.cpp
@@ -0,0 +1,173 @@
+/* libs/pixelflinger/picker.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <stdio.h>
+
+#include "buffer.h"
+#include "scanline.h"
+#include "picker.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_picker(context_t* /*c*/)
+{
+}
+
+void ggl_pick(context_t* c)
+{
+    if (ggl_likely(!c->dirty))
+        return;
+
+    // compute needs, see if they changed...
+    const uint32_t enables = c->state.enables;
+    needs_t new_needs(c->state.needs);
+
+    if (c->dirty & GGL_CB_STATE) {
+        new_needs.n &= ~GGL_NEEDS_CB_FORMAT_MASK;
+        new_needs.n |= GGL_BUILD_NEEDS(c->state.buffers.color.format, CB_FORMAT);
+        if (enables & GGL_ENABLE_BLENDING)
+            c->dirty |= GGL_PIXEL_PIPELINE_STATE;
+    }
+
+    if (c->dirty & GGL_PIXEL_PIPELINE_STATE) {
+        uint32_t n = GGL_BUILD_NEEDS(c->state.buffers.color.format, CB_FORMAT);
+        uint32_t p = 0;
+        if (enables & GGL_ENABLE_BLENDING) {
+            uint32_t src = c->state.blend.src;
+            uint32_t dst = c->state.blend.dst;
+            uint32_t src_alpha = c->state.blend.src_alpha;
+            uint32_t dst_alpha = c->state.blend.dst_alpha;
+            const GGLFormat& cbf = c->formats[ c->state.buffers.color.format ];
+            if (!cbf.c[GGLFormat::ALPHA].h) {
+                if ((src == GGL_ONE_MINUS_DST_ALPHA) ||
+                    (src == GGL_DST_ALPHA)) {
+                    src = GGL_ONE;
+                }
+                if ((src_alpha == GGL_ONE_MINUS_DST_ALPHA) ||
+                    (src_alpha == GGL_DST_ALPHA)) {
+                    src_alpha = GGL_ONE;
+                }
+                if ((dst == GGL_ONE_MINUS_DST_ALPHA) ||
+                    (dst == GGL_DST_ALPHA)) {
+                    dst = GGL_ONE;
+                }
+                if ((dst_alpha == GGL_ONE_MINUS_DST_ALPHA) ||
+                    (dst_alpha == GGL_DST_ALPHA)) {
+                    dst_alpha = GGL_ONE;
+                }
+            }
+
+            src       = ggl_blendfactor_to_needs(src);
+            dst       = ggl_blendfactor_to_needs(dst);
+            src_alpha = ggl_blendfactor_to_needs(src_alpha);
+            dst_alpha = ggl_blendfactor_to_needs(dst_alpha);
+
+            n |= GGL_BUILD_NEEDS( src, BLEND_SRC );
+            n |= GGL_BUILD_NEEDS( dst, BLEND_DST );
+            if (c->state.blend.alpha_separate) {
+                n |= GGL_BUILD_NEEDS( src_alpha, BLEND_SRCA );
+                n |= GGL_BUILD_NEEDS( dst_alpha, BLEND_DSTA );
+            } else {
+                n |= GGL_BUILD_NEEDS( src, BLEND_SRCA );
+                n |= GGL_BUILD_NEEDS( dst, BLEND_DSTA );
+            }
+        } else {
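+            // GGL_ZERO and GGL_ONE encode as 0 and 1, the same values
+            // ggl_blendfactor_to_needs would produce, so the enums are
+            // or'ed in without conversion here.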
+            n |= GGL_BUILD_NEEDS( GGL_ONE,  BLEND_SRC );
+            n |= GGL_BUILD_NEEDS( GGL_ZERO, BLEND_DST );
+            n |= GGL_BUILD_NEEDS( GGL_ONE,  BLEND_SRCA );
+            n |= GGL_BUILD_NEEDS( GGL_ZERO, BLEND_DSTA );
+        }
+
+        n |= GGL_BUILD_NEEDS(c->state.mask.color^0xF,               MASK_ARGB);
+        n |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_SMOOTH)  ?1:0,   SHADE);
+        if (enables & GGL_ENABLE_TMUS) {
+            n |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_W)       ?1:0,   W);
+        }
+        p |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_DITHER)  ?1:0,   P_DITHER);
+        p |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_AA)      ?1:0,   P_AA);
+        p |= GGL_BUILD_NEEDS((enables & GGL_ENABLE_FOG)     ?1:0,   P_FOG);
+
+        if (enables & GGL_ENABLE_LOGIC_OP) {
+            n |= GGL_BUILD_NEEDS(c->state.logic_op.opcode, LOGIC_OP);
+        } else {
+            n |= GGL_BUILD_NEEDS(GGL_COPY, LOGIC_OP);
+        }
+
+        if (enables & GGL_ENABLE_ALPHA_TEST) {
+            p |= GGL_BUILD_NEEDS(c->state.alpha_test.func, P_ALPHA_TEST);
+        } else {
+            p |= GGL_BUILD_NEEDS(GGL_ALWAYS, P_ALPHA_TEST);
+        }
+
+        if (enables & GGL_ENABLE_DEPTH_TEST) {
+            p |= GGL_BUILD_NEEDS(c->state.depth_test.func, P_DEPTH_TEST);
+            p |= GGL_BUILD_NEEDS(c->state.mask.depth&1, P_MASK_Z);
+        } else {
+            p |= GGL_BUILD_NEEDS(GGL_ALWAYS, P_DEPTH_TEST);
+            // writing to the z-buffer is always disabled if depth-test
+            // is disabled.
+        }
+        new_needs.n = n;
+        new_needs.p = p;
+    }
+
+    if (c->dirty & GGL_TMU_STATE) {
+        int idx = 0;
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            const texture_t& tx = c->state.texture[i];
+            if (tx.enable) {
+                uint32_t t = 0;
+                t |= GGL_BUILD_NEEDS(tx.surface.format, T_FORMAT);
+                t |= GGL_BUILD_NEEDS(ggl_env_to_needs(tx.env), T_ENV);
+                t |= GGL_BUILD_NEEDS(0, T_POT);       // XXX: not used yet
+                if (tx.s_coord==GGL_ONE_TO_ONE && tx.t_coord==GGL_ONE_TO_ONE) {
+                    // we encode 1-to-1 into the wrap mode
+                    t |= GGL_BUILD_NEEDS(GGL_NEEDS_WRAP_11, T_S_WRAP);
+                    t |= GGL_BUILD_NEEDS(GGL_NEEDS_WRAP_11, T_T_WRAP);
+                } else {
+                    t |= GGL_BUILD_NEEDS(ggl_wrap_to_needs(tx.s_wrap), T_S_WRAP);
+                    t |= GGL_BUILD_NEEDS(ggl_wrap_to_needs(tx.t_wrap), T_T_WRAP);
+                }
+                if (tx.mag_filter == GGL_LINEAR) {
+                    t |= GGL_BUILD_NEEDS(1, T_LINEAR);
+                }
+                if (tx.min_filter == GGL_LINEAR) {
+                    t |= GGL_BUILD_NEEDS(1, T_LINEAR);
+                }
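+                // note: enabled units are packed into t[] at idx while
+                // disabled units clear their own slot at i.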
+                new_needs.t[idx++] = t;
+            } else {
+                new_needs.t[i] = 0;
+            }
+        }
+    }
+
+    if (new_needs != c->state.needs) {
+        c->state.needs = new_needs;
+        ggl_pick_texture(c);
+        ggl_pick_cb(c);
+        ggl_pick_scanline(c);
+    }
+    c->dirty = 0;
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
+
diff --git a/libpixelflinger/picker.h b/libpixelflinger/picker.h
new file mode 100644
index 0000000..9cdbc3c
--- /dev/null
+++ b/libpixelflinger/picker.h
@@ -0,0 +1,31 @@
+/* libs/pixelflinger/picker.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_PICKER_H
+#define ANDROID_PICKER_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_picker(context_t* c);
+void ggl_pick(context_t* c);
+
+}; // namespace android
+
+#endif
diff --git a/libpixelflinger/pixelflinger.cpp b/libpixelflinger/pixelflinger.cpp
new file mode 100644
index 0000000..fd449b2
--- /dev/null
+++ b/libpixelflinger/pixelflinger.cpp
@@ -0,0 +1,836 @@
+/* libs/pixelflinger/pixelflinger.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#include <sys/time.h>
+
+#include <pixelflinger/pixelflinger.h>
+#include <private/pixelflinger/ggl_context.h>
+
+#include "buffer.h"
+#include "clear.h"
+#include "picker.h"
+#include "raster.h"
+#include "scanline.h"
+#include "trap.h"
+
+#include "codeflinger/GGLAssembler.h"
+#include "codeflinger/CodeCache.h"
+
+#include <stdio.h> 
+
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// 8x8 Bayer dither matrix
+static const uint8_t gDitherMatrix[GGL_DITHER_SIZE] = {
+     0, 32,  8, 40,  2, 34, 10, 42,
+    48, 16, 56, 24, 50, 18, 58, 26,
+    12, 44,  4, 36, 14, 46,  6, 38,
+    60, 28, 52, 20, 62, 30, 54, 22,
+     3, 35, 11, 43,  1, 33,  9, 41,
+    51, 19, 59, 27, 49, 17, 57, 25,
+    15, 47,  7, 39, 13, 45,  5, 37,
+    63, 31, 55, 23, 61, 29, 53, 21
+};
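+// (Added note) A sketch of how an ordered-dither matrix like this is used
+// when truncating 8-bit channels to 5 bits: the per-pixel threshold biases
+// the value before the low bits are dropped, e.g.
+//
+//     int t  = gDitherMatrix[(y & 7) * 8 + (x & 7)];   // threshold, 0..63
+//     int r5 = min(255, r8 + (t >> 3)) >> 3;           // dithered 8 -> 5 bits
+//
+// The code-generated pipelines apply the equivalent bias per channel when
+// dithering is enabled.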
+
+static void ggl_init_procs(context_t* c);
+static void ggl_set_scissor(context_t* c);
+
+static void ggl_enable_blending(context_t* c, int enable);
+static void ggl_enable_scissor_test(context_t* c, int enable);
+static void ggl_enable_alpha_test(context_t* c, int enable);
+static void ggl_enable_logic_op(context_t* c, int enable);
+static void ggl_enable_dither(context_t* c, int enable);
+static void ggl_enable_stencil_test(context_t* c, int enable);
+static void ggl_enable_depth_test(context_t* c, int enable);
+static void ggl_enable_aa(context_t* c, int enable);
+static void ggl_enable_point_aa_nice(context_t* c, int enable);
+static void ggl_enable_texture2d(context_t* c, int enable);
+static void ggl_enable_w_lerp(context_t* c, int enable);
+static void ggl_enable_fog(context_t* c, int enable);
+
+static inline int min(int a, int b) CONST;
+static inline int min(int a, int b) {
+    return a < b ? a : b;
+}
+
+static inline int max(int a, int b) CONST;
+static inline int max(int a, int b) {
+    return a < b ? b : a;
+}
+
+// ----------------------------------------------------------------------------
+
+void ggl_error(context_t* c, GGLenum error)
+{
+    if (c->error == GGL_NO_ERROR)
+        c->error = error;
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_bindTexture(void* con, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    if (surface->format != c->activeTMU->surface.format)
+        ggl_state_changed(c, GGL_TMU_STATE);    
+    ggl_set_surface(c, &(c->activeTMU->surface), surface);
+}
+
+
+static void ggl_bindTextureLod(void* con, GGLuint tmu, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    // All LODs must have the same format
+    ggl_set_surface(c, &c->state.texture[tmu].surface, surface);
+}
+
+static void ggl_colorBuffer(void* con, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    if (surface->format != c->state.buffers.color.format)
+        ggl_state_changed(c, GGL_CB_STATE);
+
+    if (surface->width > c->state.buffers.coverageBufferSize) {
+        // allocate the coverage factor buffer
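+        // (Added note) one int16_t coverage value is kept per pixel of the
+        // widest scanline seen so far, hence width * 2 bytes.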
+        free(c->state.buffers.coverage);
+        c->state.buffers.coverage = (int16_t*)malloc(surface->width * 2);
+        c->state.buffers.coverageBufferSize =
+                c->state.buffers.coverage ? surface->width : 0;
+    }
+    ggl_set_surface(c, &(c->state.buffers.color), surface);
+    if (c->state.buffers.read.format == 0) {
+        ggl_set_surface(c, &(c->state.buffers.read), surface);
+    }
+    ggl_set_scissor(c);
+}
+
+static void ggl_readBuffer(void* con, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    ggl_set_surface(c, &(c->state.buffers.read), surface);
+}
+
+static void ggl_depthBuffer(void* con, const GGLSurface* surface)
+{
+    GGL_CONTEXT(c, con);
+    if (surface->format == GGL_PIXEL_FORMAT_Z_16) {
+        ggl_set_surface(c, &(c->state.buffers.depth), surface);
+    } else {
+        c->state.buffers.depth.format = GGL_PIXEL_FORMAT_NONE;
+        ggl_enable_depth_test(c, 0);
+    }
+}
+
+static void ggl_scissor(void* con, GGLint x, GGLint y,
+        GGLsizei width, GGLsizei height)
+{
+    GGL_CONTEXT(c, con);
+    c->state.scissor.user_left = x;
+    c->state.scissor.user_top = y;
+    c->state.scissor.user_right = x + width;
+    c->state.scissor.user_bottom = y + height;
+    ggl_set_scissor(c);
+}
+
+// ----------------------------------------------------------------------------
+
+static void enable_disable(context_t* c, GGLenum name, int en)
+{
+    switch (name) {
+    case GGL_BLEND:             ggl_enable_blending(c, en);      break;
+    case GGL_SCISSOR_TEST:      ggl_enable_scissor_test(c, en);  break;
+    case GGL_ALPHA_TEST:        ggl_enable_alpha_test(c, en);    break;
+    case GGL_COLOR_LOGIC_OP:    ggl_enable_logic_op(c, en);      break;
+    case GGL_DITHER:            ggl_enable_dither(c, en);        break;
+    case GGL_STENCIL_TEST:      ggl_enable_stencil_test(c, en);  break;
+    case GGL_DEPTH_TEST:        ggl_enable_depth_test(c, en);    break;
+    case GGL_AA:                ggl_enable_aa(c, en);            break;
+    case GGL_TEXTURE_2D:        ggl_enable_texture2d(c, en);     break;
+    case GGL_W_LERP:            ggl_enable_w_lerp(c, en);        break;
+    case GGL_FOG:               ggl_enable_fog(c, en);           break;
+    case GGL_POINT_SMOOTH_NICE: ggl_enable_point_aa_nice(c, en); break;
+    }
+}
+
+static void ggl_enable(void* con, GGLenum name)
+{
+    GGL_CONTEXT(c, con);
+    enable_disable(c, name, 1);
+}
+
+static void ggl_disable(void* con, GGLenum name)
+{
+    GGL_CONTEXT(c, con);
+    enable_disable(c, name, 0);
+}
+
+static void ggl_enableDisable(void* con, GGLenum name, GGLboolean en)
+{
+    GGL_CONTEXT(c, con);
+    enable_disable(c, name, en ? 1 : 0);
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_shadeModel(void* con, GGLenum mode)
+{
+    GGL_CONTEXT(c, con);
+    switch (mode) {
+    case GGL_FLAT:
+        if (c->state.enables & GGL_ENABLE_SMOOTH) {
+            c->state.enables &= ~GGL_ENABLE_SMOOTH;
+            ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+        }
+        break;
+    case GGL_SMOOTH:
+        if (!(c->state.enables & GGL_ENABLE_SMOOTH)) {
+            c->state.enables |= GGL_ENABLE_SMOOTH;
+            ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+        }
+        break;
+    default:
+        ggl_error(c, GGL_INVALID_ENUM);
+    }
+}
+
+static void ggl_color4xv(void* con, const GGLclampx* color)
+{
+    GGL_CONTEXT(c, con);
+    c->shade.r0 = gglFixedToIteratedColor(color[0]);
+    c->shade.g0 = gglFixedToIteratedColor(color[1]);
+    c->shade.b0 = gglFixedToIteratedColor(color[2]);
+    c->shade.a0 = gglFixedToIteratedColor(color[3]);
+}
+
+static void ggl_colorGrad12xv(void* con, const GGLcolor* grad)
+{
+    GGL_CONTEXT(c, con);
+    // It is very important to round the iterated values here, because the
+    // rasterizer doesn't clamp them: the iterated values must be exactly
+    // correct.
+    // GGLcolor is encoded as an 8.16 fixed-point value.
+    const int32_t round = 0x8000;
+    c->shade.r0   = grad[ 0] + round;
+    c->shade.drdx = grad[ 1];
+    c->shade.drdy = grad[ 2];
+    c->shade.g0   = grad[ 3] + round;
+    c->shade.dgdx = grad[ 4];
+    c->shade.dgdy = grad[ 5];
+    c->shade.b0   = grad[ 6] + round;
+    c->shade.dbdx = grad[ 7];
+    c->shade.dbdy = grad[ 8];
+    c->shade.a0   = grad[ 9] + round;
+    c->shade.dadx = grad[10];
+    c->shade.dady = grad[11];
+}
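+// (Added note) With these gradients the rasterizer iterates each channel as,
+// e.g. for red:  r(x, y) = r0 + x*drdx + y*drdy  in 8.16 fixed point, so the
+// +0x8000 above pre-adds half an LSB of the final 8-bit value, making the
+// eventual truncation to 8 bits round to nearest.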
+
+static void ggl_zGrad3xv(void* con, const GGLfixed32* grad)
+{
+    GGL_CONTEXT(c, con);
+    // z iterators are encoded as 0.32 fixed point; since the z-buffer
+    // holds 16 bits, the rounding value is 0x8000.
+    const uint32_t round = 0x8000;
+    c->shade.z0   = grad[0] + round;
+    c->shade.dzdx = grad[1];
+    c->shade.dzdy = grad[2];
+}
+
+static void ggl_wGrad3xv(void* con, const GGLfixed* grad)
+{
+    GGL_CONTEXT(c, con);
+    c->shade.w0   = grad[0];
+    c->shade.dwdx = grad[1];
+    c->shade.dwdy = grad[2];
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_fogGrad3xv(void* con, const GGLfixed* grad)
+{
+    GGL_CONTEXT(c, con);
+    c->shade.f0     = grad[0];
+    c->shade.dfdx   = grad[1];
+    c->shade.dfdy   = grad[2];
+}
+
+static void ggl_fogColor3xv(void* con, const GGLclampx* color)
+{
+    GGL_CONTEXT(c, con);
+    const int32_t r = gglClampx(color[0]);
+    const int32_t g = gglClampx(color[1]);
+    const int32_t b = gglClampx(color[2]);
+    c->state.fog.color[GGLFormat::ALPHA] = 0xFF; // unused
+    c->state.fog.color[GGLFormat::RED]   = (r - (r>>8))>>8;
+    c->state.fog.color[GGLFormat::GREEN] = (g - (g>>8))>>8;
+    c->state.fog.color[GGLFormat::BLUE]  = (b - (b>>8))>>8;
+}
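+// (Added note) (x - (x>>8)) >> 8 converts a 1.16 fixed-point value in
+// [0, 0x10000] to a byte: x = 0x10000 (1.0) gives (0x10000 - 0x100) >> 8
+// = 0xFF, and x = 0 gives 0.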
+
+static void ggl_enable_fog(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_FOG)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_FOG;
+        else        c->state.enables &= ~GGL_ENABLE_FOG;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_blendFunc(void* con, GGLenum src, GGLenum dst)
+{
+    GGL_CONTEXT(c, con);
+    c->state.blend.src = src;
+    c->state.blend.src_alpha = src;
+    c->state.blend.dst = dst;
+    c->state.blend.dst_alpha = dst;
+    c->state.blend.alpha_separate = 0;
+    if (c->state.enables & GGL_ENABLE_BLENDING) {
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+static void ggl_blendFuncSeparate(void* con,
+        GGLenum src, GGLenum dst,
+        GGLenum srcAlpha, GGLenum dstAlpha)
+{
+    GGL_CONTEXT(c, con);
+    c->state.blend.src = src;
+    c->state.blend.src_alpha = srcAlpha;
+    c->state.blend.dst = dst;
+    c->state.blend.dst_alpha = dstAlpha;
+    c->state.blend.alpha_separate = 1;
+    if (c->state.enables & GGL_ENABLE_BLENDING) {
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_texEnvi(void* con, GGLenum target,
+        GGLenum pname, GGLint param)
+{
+    GGL_CONTEXT(c, con);
+    if (target != GGL_TEXTURE_ENV || pname != GGL_TEXTURE_ENV_MODE) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    switch (param) {
+    case GGL_REPLACE:
+    case GGL_MODULATE:
+    case GGL_DECAL:
+    case GGL_BLEND:
+    case GGL_ADD:
+        if (c->activeTMU->env != param) {
+            c->activeTMU->env = param;
+            ggl_state_changed(c, GGL_TMU_STATE);
+        }
+        break;
+    default:
+        ggl_error(c, GGL_INVALID_ENUM);
+    }
+}
+
+static void ggl_texEnvxv(void* con, GGLenum target,
+        GGLenum pname, const GGLfixed* params)
+{
+    GGL_CONTEXT(c, con);
+    if (target != GGL_TEXTURE_ENV) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    switch (pname) {
+    case GGL_TEXTURE_ENV_MODE:
+        ggl_texEnvi(con, target, pname, params[0]);
+        break;
+    case GGL_TEXTURE_ENV_COLOR: {
+        uint8_t* const color = c->activeTMU->env_color;
+        const GGLclampx r = gglClampx(params[0]);
+        const GGLclampx g = gglClampx(params[1]);
+        const GGLclampx b = gglClampx(params[2]);
+        const GGLclampx a = gglClampx(params[3]);
+        color[0] = (a-(a>>8))>>8;
+        color[1] = (r-(r>>8))>>8;
+        color[2] = (g-(g>>8))>>8;
+        color[3] = (b-(b>>8))>>8;
+        break;
+    }
+    default:
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+}
+
+
+static void ggl_texParameteri(void* con,
+        GGLenum target,
+        GGLenum pname,
+        GGLint param)
+{
+    GGL_CONTEXT(c, con);
+    if (target != GGL_TEXTURE_2D) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+
+    if (param == GGL_CLAMP_TO_EDGE)
+        param = GGL_CLAMP;
+
+    uint16_t* what = 0;
+    switch (pname) {
+    case GGL_TEXTURE_WRAP_S:
+        if ((param == GGL_CLAMP) ||
+            (param == GGL_REPEAT)) {
+            what = &c->activeTMU->s_wrap;
+        }
+        break;
+    case GGL_TEXTURE_WRAP_T:
+        if ((param == GGL_CLAMP) ||
+            (param == GGL_REPEAT)) {
+            what = &c->activeTMU->t_wrap;
+        }
+        break;
+    case GGL_TEXTURE_MIN_FILTER:
+        if ((param == GGL_NEAREST) ||
+            (param == GGL_NEAREST_MIPMAP_NEAREST) ||
+            (param == GGL_NEAREST_MIPMAP_LINEAR)) {
+            what = &c->activeTMU->min_filter;
+            param = GGL_NEAREST;
+        }
+        if ((param == GGL_LINEAR) ||
+            (param == GGL_LINEAR_MIPMAP_NEAREST) ||
+            (param == GGL_LINEAR_MIPMAP_LINEAR)) {
+            what = &c->activeTMU->min_filter;
+            param = GGL_LINEAR;
+        }
+        break;
+    case GGL_TEXTURE_MAG_FILTER:
+        if ((param == GGL_NEAREST) ||
+            (param == GGL_LINEAR)) {
+            what = &c->activeTMU->mag_filter;
+        }
+        break;
+    }
+    
+    if (!what) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    
+    if (*what != param) {
+        *what = param;
+        ggl_state_changed(c, GGL_TMU_STATE);
+    }
+}
+
+static void ggl_texCoordGradScale8xv(void* con, GGLint tmu, const int32_t* grad)
+{
+    GGL_CONTEXT(c, con);
+    texture_t& u = c->state.texture[tmu];
+    u.shade.is0    = grad[0];
+    u.shade.idsdx  = grad[1];
+    u.shade.idsdy  = grad[2];
+    u.shade.it0    = grad[3];
+    u.shade.idtdx  = grad[4];
+    u.shade.idtdy  = grad[5];
+    u.shade.sscale = grad[6];
+    u.shade.tscale = grad[7];
+}
+
+static void ggl_texCoord2x(void* con, GGLfixed s, GGLfixed t)
+{
+    GGL_CONTEXT(c, con);
+    c->activeTMU->shade.is0 = s;
+    c->activeTMU->shade.it0 = t;
+    c->activeTMU->shade.sscale = 0;
+    c->activeTMU->shade.tscale = 0;
+}
+
+static void ggl_texCoord2i(void* con, GGLint s, GGLint t)
+{
+    ggl_texCoord2x(con, s<<16, t<<16);
+}
+
+static void ggl_texGeni(void* con, GGLenum coord, GGLenum pname, GGLint param)
+{
+    GGL_CONTEXT(c, con);
+    if (pname != GGL_TEXTURE_GEN_MODE) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+
+    uint32_t* coord_ptr = 0;
+    if (coord == GGL_S)         coord_ptr = &(c->activeTMU->s_coord);
+    else if (coord == GGL_T)    coord_ptr = &(c->activeTMU->t_coord);
+
+    if (coord_ptr) {
+        if (*coord_ptr != uint32_t(param)) {
+            *coord_ptr = uint32_t(param);
+            ggl_state_changed(c, GGL_TMU_STATE);
+        }
+    } else {
+        ggl_error(c, GGL_INVALID_ENUM);
+    }
+}
+
+static void ggl_activeTexture(void* con, GGLuint tmu)
+{
+    GGL_CONTEXT(c, con);
+    if (tmu >= GGLuint(GGL_TEXTURE_UNIT_COUNT)) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    c->activeTMUIndex = tmu;
+    c->activeTMU = &(c->state.texture[tmu]);
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_colorMask(void* con, GGLboolean r,
+                                     GGLboolean g,
+                                     GGLboolean b,
+                                     GGLboolean a)
+{
+    GGL_CONTEXT(c, con);
+    int mask = 0;
+    if (a)  mask |= 1 << GGLFormat::ALPHA;
+    if (r)  mask |= 1 << GGLFormat::RED;
+    if (g)  mask |= 1 << GGLFormat::GREEN;
+    if (b)  mask |= 1 << GGLFormat::BLUE;
+    if (c->state.mask.color != mask) {
+        c->state.mask.color = mask;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+static void ggl_depthMask(void* con, GGLboolean flag)
+{
+    GGL_CONTEXT(c, con);
+    if (c->state.mask.depth != (flag ? 1 : 0)) {
+        c->state.mask.depth = flag ? 1 : 0;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+static void ggl_stencilMask(void* con, GGLuint mask)
+{
+    GGL_CONTEXT(c, con);
+    if (c->state.mask.stencil != mask) {
+        c->state.mask.stencil = mask;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_alphaFuncx(void* con, GGLenum func, GGLclampx ref)
+{
+    GGL_CONTEXT(c, con);
+    if ((func < GGL_NEVER) || (func > GGL_ALWAYS)) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    c->state.alpha_test.ref = gglFixedToIteratedColor(gglClampx(ref));
+    if (c->state.alpha_test.func != func) {
+        c->state.alpha_test.func = func;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_depthFunc(void* con, GGLenum func)
+{
+    GGL_CONTEXT(c, con);
+    if ((func < GGL_NEVER) || (func > GGL_ALWAYS)) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    if (c->state.depth_test.func != func) {
+        c->state.depth_test.func = func;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void ggl_logicOp(void* con, GGLenum opcode)
+{
+    GGL_CONTEXT(c, con);
+    if ((opcode < GGL_CLEAR) || (opcode > GGL_SET)) {
+        ggl_error(c, GGL_INVALID_ENUM);
+        return;
+    }
+    if (c->state.logic_op.opcode != opcode) {
+        c->state.logic_op.opcode = opcode;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+
+// ----------------------------------------------------------------------------
+
+void ggl_set_scissor(context_t* c)
+{
+    if (c->state.enables & GGL_ENABLE_SCISSOR_TEST) {
+        const int32_t l = c->state.scissor.user_left;
+        const int32_t t = c->state.scissor.user_top;
+        const int32_t r = c->state.scissor.user_right;
+        const int32_t b = c->state.scissor.user_bottom;
+        c->state.scissor.left   = max(0, l);
+        c->state.scissor.right  = min(c->state.buffers.color.width, r);
+        c->state.scissor.top    = max(0, t);
+        c->state.scissor.bottom = min(c->state.buffers.color.height, b);
+    } else {
+        c->state.scissor.left   = 0;
+        c->state.scissor.top    = 0;
+        c->state.scissor.right  = c->state.buffers.color.width;
+        c->state.scissor.bottom = c->state.buffers.color.height;
+    }
+}
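+// (Added example) With a 320-pixel-wide color buffer, a user scissor of
+// (-10, 0, 400, h) yields user_left = -10 and user_right = 390, which the
+// code above clamps to left = 0 and right = 320.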
+
+void ggl_enable_blending(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_BLENDING)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_BLENDING;
+        else        c->state.enables &= ~GGL_ENABLE_BLENDING;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_scissor_test(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_SCISSOR_TEST)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_SCISSOR_TEST;
+        else        c->state.enables &= ~GGL_ENABLE_SCISSOR_TEST;
+        ggl_set_scissor(c);
+    }
+}
+
+void ggl_enable_alpha_test(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_ALPHA_TEST)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_ALPHA_TEST;
+        else        c->state.enables &= ~GGL_ENABLE_ALPHA_TEST;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_logic_op(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_LOGIC_OP)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_LOGIC_OP;
+        else        c->state.enables &= ~GGL_ENABLE_LOGIC_OP;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_dither(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_DITHER)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_DITHER;
+        else        c->state.enables &= ~GGL_ENABLE_DITHER;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_stencil_test(context_t* /*c*/, int /*enable*/)
+{
+}
+
+void ggl_enable_depth_test(context_t* c, int enable)
+{
+    if (c->state.buffers.depth.format == 0)
+        enable = 0;
+    const int e = (c->state.enables & GGL_ENABLE_DEPTH_TEST)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_DEPTH_TEST;
+        else        c->state.enables &= ~GGL_ENABLE_DEPTH_TEST;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_aa(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_AA)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_AA;
+        else        c->state.enables &= ~GGL_ENABLE_AA;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_point_aa_nice(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_POINT_AA_NICE)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_POINT_AA_NICE;
+        else        c->state.enables &= ~GGL_ENABLE_POINT_AA_NICE;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_w_lerp(context_t* c, int enable)
+{
+    const int e = (c->state.enables & GGL_ENABLE_W)?1:0;
+    if (e != enable) {
+        if (enable) c->state.enables |= GGL_ENABLE_W;
+        else        c->state.enables &= ~GGL_ENABLE_W;
+        ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE);
+    }
+}
+
+void ggl_enable_texture2d(context_t* c, int enable)
+{
+    if (c->activeTMU->enable != enable) {
+        const uint32_t tmu = c->activeTMUIndex;
+        c->activeTMU->enable = enable;
+        const uint32_t mask = 1UL << tmu;
+        if (enable)                 c->state.enabled_tmu |= mask;
+        else                        c->state.enabled_tmu &= ~mask;
+        if (c->state.enabled_tmu)   c->state.enables |= GGL_ENABLE_TMUS;
+        else                        c->state.enables &= ~GGL_ENABLE_TMUS;
+        ggl_state_changed(c, GGL_TMU_STATE);
+    }
+}
+
+        
+// ----------------------------------------------------------------------------
+
+int64_t ggl_system_time()
+{
+    struct timespec t;
+    t.tv_sec = t.tv_nsec = 0;
+    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t);
+    return int64_t(t.tv_sec)*1000000000LL + t.tv_nsec;
+}
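+// (Added note) CLOCK_THREAD_CPUTIME_ID measures per-thread CPU time rather
+// than wall-clock time, so this is presumably intended for profiling the
+// rasterizer, not for animation timing.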
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_procs(context_t* c)
+{
+    GGLContext& procs = *(GGLContext*)c;
+    GGL_INIT_PROC(procs, scissor);
+    GGL_INIT_PROC(procs, activeTexture);
+    GGL_INIT_PROC(procs, bindTexture);
+    GGL_INIT_PROC(procs, bindTextureLod);
+    GGL_INIT_PROC(procs, colorBuffer);
+    GGL_INIT_PROC(procs, readBuffer);
+    GGL_INIT_PROC(procs, depthBuffer);
+    GGL_INIT_PROC(procs, enable);
+    GGL_INIT_PROC(procs, disable);
+    GGL_INIT_PROC(procs, enableDisable);
+    GGL_INIT_PROC(procs, shadeModel);
+    GGL_INIT_PROC(procs, color4xv);
+    GGL_INIT_PROC(procs, colorGrad12xv);
+    GGL_INIT_PROC(procs, zGrad3xv);
+    GGL_INIT_PROC(procs, wGrad3xv);
+    GGL_INIT_PROC(procs, fogGrad3xv);
+    GGL_INIT_PROC(procs, fogColor3xv);
+    GGL_INIT_PROC(procs, blendFunc);
+    GGL_INIT_PROC(procs, blendFuncSeparate);
+    GGL_INIT_PROC(procs, texEnvi);
+    GGL_INIT_PROC(procs, texEnvxv);
+    GGL_INIT_PROC(procs, texParameteri);
+    GGL_INIT_PROC(procs, texCoord2i);
+    GGL_INIT_PROC(procs, texCoord2x);
+    GGL_INIT_PROC(procs, texCoordGradScale8xv);
+    GGL_INIT_PROC(procs, texGeni);
+    GGL_INIT_PROC(procs, colorMask);
+    GGL_INIT_PROC(procs, depthMask);
+    GGL_INIT_PROC(procs, stencilMask);
+    GGL_INIT_PROC(procs, alphaFuncx);
+    GGL_INIT_PROC(procs, depthFunc);
+    GGL_INIT_PROC(procs, logicOp);
+    ggl_init_clear(c);
+}
+
+void ggl_init_context(context_t* c)
+{
+    memset(c, 0, sizeof(context_t));
+    ggl_init_procs(c);
+    ggl_init_trap(c);
+    ggl_init_scanline(c);
+    ggl_init_texture(c);
+    ggl_init_picker(c);
+    ggl_init_raster(c);
+    c->formats = gglGetPixelFormatTable();
+    c->state.blend.src = GGL_ONE;
+    c->state.blend.dst = GGL_ZERO;
+    c->state.blend.src_alpha = GGL_ONE;
+    c->state.blend.dst_alpha = GGL_ZERO;
+    c->state.mask.color = 0xF;
+    c->state.mask.depth = 0;
+    c->state.mask.stencil = 0xFFFFFFFF;
+    c->state.logic_op.opcode = GGL_COPY;
+    c->state.alpha_test.func = GGL_ALWAYS;
+    c->state.depth_test.func = GGL_LESS;
+    c->state.depth_test.clearValue = FIXED_ONE;
+    c->shade.w0 = FIXED_ONE;
+    memcpy(c->ditherMatrix, gDitherMatrix, sizeof(gDitherMatrix));
+}
+
+void ggl_uninit_context(context_t* c)
+{
+    ggl_uninit_scanline(c);
+}
+
+// ----------------------------------------------------------------------------
+}; // namespace android
+// ----------------------------------------------------------------------------
+
+
+
+using namespace android;
+
+ssize_t gglInit(GGLContext** context)
+{
+    void* const base = malloc(sizeof(context_t) + 32);
+    if (base) {
+        // always align the context on cache lines
+        context_t *c = (context_t *)((ptrdiff_t(base)+31) & ~0x1FL);
+        ggl_init_context(c);
+        c->base = base;
+        *context = (GGLContext*)c;
+    } else {
+        return -1;
+    }
+    return 0;
+}
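+// Typical usage (added sketch; 'fb' is a caller-provided GGLSurface and
+// 'color' a GGLclampx[4], both hypothetical here):
+//
+//     GGLContext* gl = 0;
+//     if (gglInit(&gl) == 0) {
+//         gl->colorBuffer(gl, &fb);      // bind the destination surface
+//         gl->color4xv(gl, color);       // set the current color
+//         gl->recti(gl, 0, 0, 16, 16);   // fill a 16x16 rectangle
+//         gglUninit(gl);
+//     }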
+
+ssize_t gglUninit(GGLContext* con)
+{
+    GGL_CONTEXT(c, (void*)con);
+    ggl_uninit_context(c);
+    free(c->base);
+    return 0;
+}
+
diff --git a/libpixelflinger/raster.cpp b/libpixelflinger/raster.cpp
new file mode 100644
index 0000000..e95c2c8
--- /dev/null
+++ b/libpixelflinger/raster.cpp
@@ -0,0 +1,216 @@
+/* libs/pixelflinger/raster.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+
+#include <string.h>
+
+#include "raster.h"
+#include "trap.h"
+
+namespace android {
+
+static void ggl_rasterPos2x(void* con, GGLfixed x, GGLfixed y);
+static void ggl_rasterPos2i(void* con, GGLint x, GGLint y);
+static void ggl_copyPixels(void* con, GGLint xs, GGLint ys,
+        GGLsizei width, GGLsizei height, GGLenum type);
+
+void ggl_init_raster(context_t* c)
+{
+    GGLContext& procs = *(GGLContext*)c;
+    GGL_INIT_PROC(procs, copyPixels);
+    GGL_INIT_PROC(procs, rasterPos2x);
+    GGL_INIT_PROC(procs, rasterPos2i);
+}
+
+void ggl_rasterPos2x(void* con, GGLfixed x, GGLfixed y)
+{
+    GGL_CONTEXT(c, con);
+    // raster pos should be processed just like glVertex
+    c->state.raster.x = x;
+    c->state.raster.y = y;
+}
+
+void ggl_rasterPos2i(void* con, GGLint x, GGLint y)
+{
+    ggl_rasterPos2x(con, gglIntToFixed(x), gglIntToFixed(y));
+}
+
+void ggl_copyPixels(void* con, GGLint xs, GGLint ys,
+        GGLsizei width, GGLsizei height, GGLenum /*type*/)
+{
+    GGL_CONTEXT(c, con);
+
+    // color-buffer
+    surface_t* cb = &(c->state.buffers.color);
+
+    // bail out early: copying from outside the surface would be undefined behaviour
+    if (uint32_t(xs) > cb->width)
+        return;
+    if (uint32_t(ys) > cb->height)
+        return;
+    if (uint32_t(xs + width) > cb->width)
+        return;
+    if (uint32_t(ys + height) > cb->height)
+        return;
+
+    // copy to current raster position
+    GGLint xd = gglFixedToIntRound(c->state.raster.x);
+    GGLint yd = gglFixedToIntRound(c->state.raster.y);
+
+    // clip to scissor
+    if (xd < GGLint(c->state.scissor.left)) {
+        GGLint offset = GGLint(c->state.scissor.left) - xd;
+        xd = GGLint(c->state.scissor.left);
+        xs += offset;
+        width -= offset;
+    }
+    if (yd < GGLint(c->state.scissor.top)) {
+        GGLint offset = GGLint(c->state.scissor.top) - yd;
+        yd = GGLint(c->state.scissor.top);
+        ys += offset;
+        height -= offset;
+    }
+    if ((xd + width) > GGLint(c->state.scissor.right)) {
+        width = GGLint(c->state.scissor.right) - xd;
+    }
+    if ((yd + height) > GGLint(c->state.scissor.bottom)) {
+        height = GGLint(c->state.scissor.bottom) - yd;
+    }
+
+    if (width<=0 || height<=0) {
+        return; // nothing to copy
+    }
+
+    if (xs==xd && ys==yd) {
+        // nothing to do, but be careful, this might not be true when we support
+        // gglPixelTransfer, gglPixelMap and gglPixelZoom
+        return;
+    }
+
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* src = reinterpret_cast<uint8_t*>(cb->data)
+            + (xs + (cb->stride * ys)) * fp->size;
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data)
+            + (xd + (cb->stride * yd)) * fp->size;
+    const size_t bpr = cb->stride * fp->size;
+    const size_t rowsize = width * fp->size;
+    size_t yc = height;
+
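+    // (Added note) the three loops below differ only in copy direction so
+    // that overlapping rectangles are handled correctly: bottom-up when the
+    // copy moves toward larger y, memmove within a single row, top-down
+    // otherwise.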
+    if (ys < yd) {
+        // bottom to top
+        src += height * bpr;
+        dst += height * bpr;
+        do {
+            dst -= bpr;
+            src -= bpr;
+            memcpy(dst, src, rowsize);
+        } while (--yc);
+    } else {
+        if (ys == yd) {
+            // might be right to left
+            do {
+                memmove(dst, src, rowsize);
+                dst += bpr;
+                src += bpr;
+            } while (--yc);
+        } else {
+            // top to bottom
+            do {
+                memcpy(dst, src, rowsize);
+                dst += bpr;
+                src += bpr;
+            } while (--yc);
+        }
+    }
+}
+
+}; // namespace android
+
+using namespace android;
+
+GGLint gglBitBlit(GGLContext* con, int tmu, GGLint crop[4], GGLint where[4])
+{
+    GGL_CONTEXT(c, (void*)con);
+
+    GGLint x = where[0];
+    GGLint y = where[1];
+    GGLint w = where[2];
+    GGLint h = where[3];
+
+    // exclusively enable this tmu
+    c->procs.activeTexture(c, tmu);
+    c->procs.disable(c, GGL_W_LERP);
+
+    uint32_t tmus = 1UL<<tmu;
+    if (c->state.enabled_tmu != tmus) {
+        c->activeTMU->enable = 1;
+        c->state.enabled_tmu = tmus;
+        c->state.enables |= GGL_ENABLE_TMUS;
+        ggl_state_changed(c, GGL_TMU_STATE);
+    }
+
+    const GGLint Wcr = crop[2];
+    const GGLint Hcr = crop[3];
+    if ((w == Wcr) && (h == Hcr)) {
+        c->procs.texGeni(c, GGL_S, GGL_TEXTURE_GEN_MODE, GGL_ONE_TO_ONE);
+        c->procs.texGeni(c, GGL_T, GGL_TEXTURE_GEN_MODE, GGL_ONE_TO_ONE);
+        const GGLint Ucr = crop[0];
+        const GGLint Vcr = crop[1];
+        const GGLint s0  = Ucr - x;
+        const GGLint t0  = Vcr - y;
+        c->procs.texCoord2i(c, s0, t0);
+        c->procs.recti(c, x, y, x+w, y+h);
+    } else {
+        int32_t texcoords[8];
+        x = gglIntToFixed(x);
+        y = gglIntToFixed(y); 
+    
+        // we CLAMP here, which works with premultiplied (s,t)
+        c->procs.texParameteri(c, GGL_TEXTURE_2D, GGL_TEXTURE_WRAP_S, GGL_CLAMP);
+        c->procs.texParameteri(c, GGL_TEXTURE_2D, GGL_TEXTURE_WRAP_T, GGL_CLAMP);
+        c->procs.texGeni(c, GGL_S, GGL_TEXTURE_GEN_MODE, GGL_AUTOMATIC);
+        c->procs.texGeni(c, GGL_T, GGL_TEXTURE_GEN_MODE, GGL_AUTOMATIC);
+    
+        const GGLint Ucr = crop[0] << 16;
+        const GGLint Vcr = crop[1] << 16;
+        const GGLint Wcr = crop[2] << 16;
+        const GGLint Hcr = crop[3] << 16;
+    
+        // computes texture coordinates (pre-multiplied)
+        int32_t dsdx = Wcr / w;   // dsdx = ((Wcr/w)/Wt)*Wt
+        int32_t dtdy = Hcr / h;   // dtdy = ((Hcr/h)/Ht)*Ht
+        int32_t s0   = Ucr - gglMulx(dsdx, x); // s0 = Ucr - x * dsdx
+        int32_t t0   = Vcr - gglMulx(dtdy, y); // t0 = Vcr - y * dtdy
+        texcoords[0] = s0;
+        texcoords[1] = dsdx;
+        texcoords[2] = 0;
+        texcoords[3] = t0;
+        texcoords[4] = 0;
+        texcoords[5] = dtdy;
+        texcoords[6] = 0;
+        texcoords[7] = 0;
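+        // (Added note) slots follow ggl_texCoordGradScale8xv's layout:
+        // { is0, idsdx, idsdy, it0, idtdx, idtdy, sscale, tscale }.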
+        c->procs.texCoordGradScale8xv(c, tmu, texcoords);
+        c->procs.recti(c, 
+                gglFixedToIntRound(x),
+                gglFixedToIntRound(y),
+                gglFixedToIntRound(x)+w,
+                gglFixedToIntRound(y)+h);
+    }
+    return 0;
+}
+
diff --git a/libpixelflinger/raster.h b/libpixelflinger/raster.h
new file mode 100644
index 0000000..9f0f240
--- /dev/null
+++ b/libpixelflinger/raster.h
@@ -0,0 +1,33 @@
+/* libs/pixelflinger/raster.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_GGL_RASTER_H
+#define ANDROID_GGL_RASTER_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_raster(context_t* c);
+
+void gglCopyPixels(void* c, GGLint x, GGLint y, GGLsizei width, GGLsizei height, GGLenum type);
+void gglRasterPos2d(void* c, GGLint x, GGLint y);
+
+}; // namespace android
+
+#endif // ANDROID_GGL_RASTER_H
diff --git a/libpixelflinger/scanline.cpp b/libpixelflinger/scanline.cpp
new file mode 100644
index 0000000..4cc23c7
--- /dev/null
+++ b/libpixelflinger/scanline.cpp
@@ -0,0 +1,2373 @@
+/* libs/pixelflinger/scanline.cpp
+**
+** Copyright 2006-2011, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "pixelflinger"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <cutils/memory.h>
+#include <log/log.h>
+
+#include "buffer.h"
+#include "scanline.h"
+
+#include "codeflinger/CodeCache.h"
+#include "codeflinger/GGLAssembler.h"
+#if defined(__arm__)
+#include "codeflinger/ARMAssembler.h"
+#elif defined(__aarch64__)
+#include "codeflinger/Arm64Assembler.h"
+#elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
+#include "codeflinger/MIPSAssembler.h"
+#elif defined(__mips__) && defined(__LP64__)
+#include "codeflinger/MIPS64Assembler.h"
+#endif
+//#include "codeflinger/ARMAssemblerOptimizer.h"
+
+// ----------------------------------------------------------------------------
+
+#define ANDROID_CODEGEN_GENERIC     0   // force generic pixel pipeline
+#define ANDROID_CODEGEN_C           1   // hand-written C, fallback generic 
+#define ANDROID_CODEGEN_ASM         2   // hand-written asm, fallback generic
+#define ANDROID_CODEGEN_GENERATED   3   // hand-written asm, fallback codegen
+
+#ifdef NDEBUG
+#   define ANDROID_RELEASE
+#   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
+#else
+#   define ANDROID_DEBUG
+#   define ANDROID_CODEGEN      ANDROID_CODEGEN_GENERATED
+#endif
+
+#if defined(__arm__) || (defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))) || defined(__aarch64__)
+#   define ANDROID_ARM_CODEGEN  1
+#else
+#   define ANDROID_ARM_CODEGEN  0
+#endif
+
+#define DEBUG__CODEGEN_ONLY     0
+
+/* Set to 1 to dump to the log the states that need a new
+ * code-generated scanline callback, i.e. those that don't
+ * have a corresponding shortcut function.
+ */
+#define DEBUG_NEEDS  0
+
+#if defined( __mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))
+#define ASSEMBLY_SCRATCH_SIZE   4096
+#elif defined(__aarch64__)
+#define ASSEMBLY_SCRATCH_SIZE   8192
+#else
+#define ASSEMBLY_SCRATCH_SIZE   2048
+#endif
+
+// ----------------------------------------------------------------------------
+namespace android {
+// ----------------------------------------------------------------------------
+
+static void init_y(context_t*, int32_t);
+static void init_y_noop(context_t*, int32_t);
+static void init_y_packed(context_t*, int32_t);
+static void init_y_error(context_t*, int32_t);
+
+static void step_y__generic(context_t* c);
+static void step_y__nop(context_t*);
+static void step_y__smooth(context_t* c);
+static void step_y__tmu(context_t* c);
+static void step_y__w(context_t* c);
+
+static void scanline(context_t* c);
+static void scanline_perspective(context_t* c);
+static void scanline_perspective_single(context_t* c);
+static void scanline_t32cb16blend(context_t* c);
+static void scanline_t32cb16blend_dither(context_t* c);
+static void scanline_t32cb16blend_srca(context_t* c);
+static void scanline_t32cb16blend_clamp(context_t* c);
+static void scanline_t32cb16blend_clamp_dither(context_t* c);
+static void scanline_t32cb16blend_clamp_mod(context_t* c);
+static void scanline_x32cb16blend_clamp_mod(context_t* c);
+static void scanline_t32cb16blend_clamp_mod_dither(context_t* c);
+static void scanline_x32cb16blend_clamp_mod_dither(context_t* c);
+static void scanline_t32cb16(context_t* c);
+static void scanline_t32cb16_dither(context_t* c);
+static void scanline_t32cb16_clamp(context_t* c);
+static void scanline_t32cb16_clamp_dither(context_t* c);
+static void scanline_col32cb16blend(context_t* c);
+static void scanline_t16cb16_clamp(context_t* c);
+static void scanline_t16cb16blend_clamp_mod(context_t* c);
+static void scanline_memcpy(context_t* c);
+static void scanline_memset8(context_t* c);
+static void scanline_memset16(context_t* c);
+static void scanline_memset32(context_t* c);
+static void scanline_noop(context_t* c);
+static void scanline_set(context_t* c);
+static void scanline_clear(context_t* c);
+
+static void rect_generic(context_t* c, size_t yc);
+static void rect_memcpy(context_t* c, size_t yc);
+
+#if defined( __arm__)
+extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
+extern "C" void scanline_t32cb16_arm(uint16_t *dst, uint32_t *src, size_t ct);
+extern "C" void scanline_col32cb16blend_neon(uint16_t *dst, uint32_t *col, size_t ct);
+extern "C" void scanline_col32cb16blend_arm(uint16_t *dst, uint32_t col, size_t ct);
+#elif defined(__aarch64__)
+extern "C" void scanline_t32cb16blend_arm64(uint16_t*, uint32_t*, size_t);
+extern "C" void scanline_col32cb16blend_arm64(uint16_t *dst, uint32_t col, size_t ct);
+#elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
+extern "C" void scanline_t32cb16blend_mips(uint16_t*, uint32_t*, size_t);
+#elif defined(__mips__) && defined(__LP64__)
+extern "C" void scanline_t32cb16blend_mips64(uint16_t*, uint32_t*, size_t);
+extern "C" void scanline_col32cb16blend_mips64(uint16_t *dst, uint32_t col, size_t ct);
+#endif
+
+// ----------------------------------------------------------------------------
+
+static inline uint16_t  convertAbgr8888ToRgb565(uint32_t  pix)
+{
+    return uint16_t( ((pix << 8) & 0xf800) |
+                      ((pix >> 5) & 0x07e0) |
+                      ((pix >> 19) & 0x001f) );
+}
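+// (Added note) with ABGR8888 laid out as A[31:24] B[23:16] G[15:8] R[7:0],
+// (pix << 8) & 0xf800 places R's top 5 bits at [15:11], (pix >> 5) & 0x07e0
+// keeps G's top 6 bits at [10:5], and (pix >> 19) & 0x001f keeps B's top
+// 5 bits at [4:0], which is the 565 layout.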
+
+struct shortcut_t {
+    needs_filter_t  filter;
+    const char*     desc;
+    void            (*scanline)(context_t*);
+    void            (*init_y)(context_t*, int32_t);
+};
+
+// Keep in sync with needs
+
+/* To understand the values here, have a look at:
+ *     system/core/include/private/pixelflinger/ggl_context.h
+ *
+ * Especially the lines defining and using GGL_RESERVE_NEEDS
+ *
+ * Quick reminders:
+ *   - the last nibble of the first value is the destination buffer format.
+ *   - the last nibble of the third value is the source texture format
+ *   - formats: 4=rgb565 1=abgr8888 2=xbgr8888
+ *
+ * In the descriptions below:
+ *
+ *   SRC      means we copy the source pixels to the destination
+ *
+ *   SRC_OVER means we blend the source pixels to the destination
+ *            with dstFactor = 1-srcA, srcFactor=1  (premultiplied source).
+ *            This mode is otherwise called 'blend'.
+ *
+ *   SRCA_OVER means we blend the source pixels to the destination
+ *             with dstFactor=1-srcA, srcFactor=srcA (non-premultiplied source).
+ *             This mode is otherwise called 'blend_srca'
+ *
+ *   clamp    means we fetch source pixels from a texture with u/v clamping
+ *
+ *   mod      means the source pixels are modulated (multiplied) by the
+ *            a/r/g/b of the current context's color. Typically used for
+ *            fade-in / fade-out.
+ *
+ *   dither   means we dither 32 bit values to 16 bits
+ */
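+/* Worked example (added): in the first entry below, n = 0x03515104 ends in
+ * nibble 4 (an rgb565 destination) and t[0] = 0x00000A01 ends in nibble 1
+ * (an abgr8888 source texture), hence the description "565 fb, 8888 tx".
+ */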
+static shortcut_t shortcuts[] = {
+    { { { 0x03515104, 0x00000077, { 0x00000A01, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, blend SRC_OVER", scanline_t32cb16blend, init_y_noop },
+    { { { 0x03010104, 0x00000077, { 0x00000A01, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, SRC", scanline_t32cb16, init_y_noop  },
+    /* same as first entry, but with dithering */
+    { { { 0x03515104, 0x00000177, { 0x00000A01, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, blend SRC_OVER dither", scanline_t32cb16blend_dither, init_y_noop },
+    /* same as second entry, but with dithering */
+    { { { 0x03010104, 0x00000177, { 0x00000A01, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, SRC dither", scanline_t32cb16_dither, init_y_noop  },
+    /* this is used during the boot animation - CHEAT: ignore dithering */
+    { { { 0x03545404, 0x00000077, { 0x00000A01, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFEFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, blend dst:ONE_MINUS_SRCA src:SRCA", scanline_t32cb16blend_srca, init_y_noop },
+    /* special case for arbitrary texture coordinates (think scaling) */
+    { { { 0x03515104, 0x00000077, { 0x00000001, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, SRC_OVER clamp", scanline_t32cb16blend_clamp, init_y },
+    { { { 0x03515104, 0x00000177, { 0x00000001, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, SRC_OVER clamp dither", scanline_t32cb16blend_clamp_dither, init_y },
+    /* another case used during emulation */
+    { { { 0x03515104, 0x00000077, { 0x00001001, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, SRC_OVER clamp modulate", scanline_t32cb16blend_clamp_mod, init_y },
+    /* and this */
+    { { { 0x03515104, 0x00000077, { 0x00001002, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, x888 tx, SRC_OVER clamp modulate", scanline_x32cb16blend_clamp_mod, init_y },
+    { { { 0x03515104, 0x00000177, { 0x00001001, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, SRC_OVER clamp modulate dither", scanline_t32cb16blend_clamp_mod_dither, init_y },
+    { { { 0x03515104, 0x00000177, { 0x00001002, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, x888 tx, SRC_OVER clamp modulate dither", scanline_x32cb16blend_clamp_mod_dither, init_y },
+    { { { 0x03010104, 0x00000077, { 0x00000001, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
+    { { { 0x03010104, 0x00000077, { 0x00000002, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, x888 tx, SRC clamp", scanline_t32cb16_clamp, init_y  },
+    { { { 0x03010104, 0x00000177, { 0x00000001, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 8888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
+    { { { 0x03010104, 0x00000177, { 0x00000002, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, x888 tx, SRC clamp dither", scanline_t32cb16_clamp_dither, init_y  },
+    { { { 0x03010104, 0x00000077, { 0x00000004, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 565 tx, SRC clamp", scanline_t16cb16_clamp, init_y  },
+    { { { 0x03515104, 0x00000077, { 0x00001004, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0x0000003F } } },
+        "565 fb, 565 tx, SRC_OVER clamp", scanline_t16cb16blend_clamp_mod, init_y  },
+    { { { 0x03515104, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFFFFFFFF, 0xFFFFFFFF, { 0xFFFFFFFF, 0xFFFFFFFF } } },
+        "565 fb, 8888 fixed color", scanline_col32cb16blend, init_y_packed  },  
+    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0x00000000, 0x00000007, { 0x00000000, 0x00000000 } } },
+        "(nop) alpha test", scanline_noop, init_y_noop },
+    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0x00000000, 0x00000070, { 0x00000000, 0x00000000 } } },
+        "(nop) depth test", scanline_noop, init_y_noop },
+    { { { 0x05000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0x0F000000, 0x00000080, { 0x00000000, 0x00000000 } } },
+        "(nop) logic_op", scanline_noop, init_y_noop },
+    { { { 0xF0000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0xF0000000, 0x00000080, { 0x00000000, 0x00000000 } } },
+        "(nop) color mask", scanline_noop, init_y_noop },
+    { { { 0x0F000000, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
+        "(set) logic_op", scanline_set, init_y_noop },
+    { { { 0x00000000, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFF000000, 0x000000F7, { 0x00000000, 0x00000000 } } },
+        "(clear) logic_op", scanline_clear, init_y_noop },
+    { { { 0x03000000, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFFFFFF00, 0x000000F7, { 0x00000000, 0x00000000 } } },
+        "(clear) blending 0/0", scanline_clear, init_y_noop },
+    { { { 0x00000000, 0x00000000, { 0x00000000, 0x00000000 } },
+        { 0x0000003F, 0x00000000, { 0x00000000, 0x00000000 } } },
+        "(error) invalid color-buffer format", scanline_noop, init_y_error },
+};
+static const needs_filter_t noblend1to1 = {
+        // (disregard dithering, see below)
+        { 0x03010100, 0x00000077, { 0x00000A00, 0x00000000 } },
+        { 0xFFFFFFC0, 0xFFFFFEFF, { 0xFFFFFFC0, 0x0000003F } }
+};
+static const needs_filter_t fill16noblend = {
+        { 0x03010100, 0x00000077, { 0x00000000, 0x00000000 } },
+        { 0xFFFFFFC0, 0xFFFFFFFF, { 0x0000003F, 0x0000003F } }
+};
+
+// ----------------------------------------------------------------------------
+
+#if ANDROID_ARM_CODEGEN
+
+#if defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__))
+static CodeCache gCodeCache(32 * 1024);
+#elif defined(__aarch64__)
+static CodeCache gCodeCache(48 * 1024);
+#else
+static CodeCache gCodeCache(12 * 1024);
+#endif
+
+class ScanlineAssembly : public Assembly {
+    AssemblyKey<needs_t> mKey;
+public:
+    ScanlineAssembly(needs_t needs, size_t size)
+        : Assembly(size), mKey(needs) { }
+    const AssemblyKey<needs_t>& key() const { return mKey; }
+};
+#endif
+
+// ----------------------------------------------------------------------------
+
+void ggl_init_scanline(context_t* c)
+{
+    c->init_y = init_y;
+    c->step_y = step_y__generic;
+    c->scanline = scanline;
+}
+
+void ggl_uninit_scanline(context_t* c)
+{
+    if (c->state.buffers.coverage)
+        free(c->state.buffers.coverage);
+#if ANDROID_ARM_CODEGEN
+    if (c->scanline_as)
+        c->scanline_as->decStrong(c);
+#endif
+}
+
+// ----------------------------------------------------------------------------
+
+static void pick_scanline(context_t* c)
+{
+#if (!defined(DEBUG__CODEGEN_ONLY) || (DEBUG__CODEGEN_ONLY == 0))
+
+#if ANDROID_CODEGEN == ANDROID_CODEGEN_GENERIC
+    c->init_y = init_y;
+    c->step_y = step_y__generic;
+    c->scanline = scanline;
+    return;
+#endif
+
+    //printf("*** needs [%08lx:%08lx:%08lx:%08lx]\n",
+    //    c->state.needs.n, c->state.needs.p,
+    //    c->state.needs.t[0], c->state.needs.t[1]);
+
+    // first handle the special case that we cannot test with a filter
+    const uint32_t cb_format = GGL_READ_NEEDS(CB_FORMAT, c->state.needs.n);
+    if (GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0]) == cb_format) {
+        if (c->state.needs.match(noblend1to1)) {
+            // this will match regardless of dithering state, since both
+            // src and dest have the same format anyway, there is no dithering
+            // to be done.
+            const GGLFormat* f =
+                &(c->formats[GGL_READ_NEEDS(T_FORMAT, c->state.needs.t[0])]);
+            if ((f->components == GGL_RGB) ||
+                (f->components == GGL_RGBA) ||
+                (f->components == GGL_LUMINANCE) ||
+                (f->components == GGL_LUMINANCE_ALPHA))
+            {
+                // format must have all of RGB components
+                // (so the current color doesn't show through)
+                c->scanline = scanline_memcpy;
+                c->init_y = init_y_noop;
+                return;
+            }
+        }
+    }
+
+    if (c->state.needs.match(fill16noblend)) {
+        c->init_y = init_y_packed;
+        switch (c->formats[cb_format].size) {
+        case 1: c->scanline = scanline_memset8;  return;
+        case 2: c->scanline = scanline_memset16; return;
+        case 4: c->scanline = scanline_memset32; return;
+        }
+    }
+
+    const int numFilters = sizeof(shortcuts)/sizeof(shortcut_t);
+    for (int i=0 ; i<numFilters ; i++) {
+        if (c->state.needs.match(shortcuts[i].filter)) {
+            c->scanline = shortcuts[i].scanline;
+            c->init_y = shortcuts[i].init_y;
+            return;
+        }
+    }
+
+#if DEBUG_NEEDS
+    ALOGI("Needs: n=0x%08x p=0x%08x t0=0x%08x t1=0x%08x",
+         c->state.needs.n, c->state.needs.p,
+         c->state.needs.t[0], c->state.needs.t[1]);
+#endif
+
+#endif // DEBUG__CODEGEN_ONLY
+
+    c->init_y = init_y;
+    c->step_y = step_y__generic;
+
+#if ANDROID_ARM_CODEGEN
+    // we're going to have to generate some code...
+    // here, generate code for our pixel pipeline
+    const AssemblyKey<needs_t> key(c->state.needs);
+    sp<Assembly> assembly = gCodeCache.lookup(key);
+    if (assembly == 0) {
+        // create a new assembly region
+        sp<ScanlineAssembly> a = new ScanlineAssembly(c->state.needs, 
+                ASSEMBLY_SCRATCH_SIZE);
+        // initialize our assembler
+#if defined(__arm__)
+        GGLAssembler assembler( new ARMAssembler(a) );
+        //GGLAssembler assembler(
+        //        new ARMAssemblerOptimizer(new ARMAssembler(a)) );
+#endif
+#if defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
+        GGLAssembler assembler( new ArmToMipsAssembler(a) );
+#elif defined(__mips__) && defined(__LP64__)
+        GGLAssembler assembler( new ArmToMips64Assembler(a) );
+#elif defined(__aarch64__)
+        GGLAssembler assembler( new ArmToArm64Assembler(a) );
+#endif
+        // generate the scanline code for the given needs
+        bool err = assembler.scanline(c->state.needs, c) != 0;
+        if (ggl_likely(!err)) {
+            // finally, cache this assembly
+            err = gCodeCache.cache(a->key(), a) < 0;
+        }
+        if (ggl_unlikely(err)) {
+            ALOGE("error generating or caching assembly. Reverting to NOP.");
+            c->scanline = scanline_noop;
+            c->init_y = init_y_noop;
+            c->step_y = step_y__nop;
+            return;
+        }
+        assembly = a;
+    }
+
+    // release the previous assembly
+    if (c->scanline_as) {
+        c->scanline_as->decStrong(c);
+    }
+
+    //ALOGI("using generated pixel-pipeline");
+    c->scanline_as = assembly.get();
+    c->scanline_as->incStrong(c); //  hold on to assembly
+    c->scanline = (void(*)(context_t* c))assembly->base();
+#else
+//    ALOGW("using generic (slow) pixel-pipeline");
+    c->scanline = scanline;
+#endif
+}
+
+void ggl_pick_scanline(context_t* c)
+{
+    pick_scanline(c);
+    if ((c->state.enables & GGL_ENABLE_W) &&
+        (c->state.enables & GGL_ENABLE_TMUS))
+    {
+        c->span = c->scanline;
+        c->scanline = scanline_perspective;
+        if (!(c->state.enabled_tmu & (c->state.enabled_tmu - 1))) {
+            // only one TMU enabled
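+            // (Added note) enabled_tmu & (enabled_tmu - 1) clears the lowest
+            // set bit, so it is zero exactly when a single TMU bit is set.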
+            c->scanline = scanline_perspective_single;
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+static void blending(context_t* c, pixel_t* fragment, pixel_t* fb);
+static void blend_factor(context_t* c, pixel_t* r, uint32_t factor,
+        const pixel_t* src, const pixel_t* dst);
+static void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv);
+
+#if ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
+
+// no need to compile the generic-pipeline, it can't be reached
+void scanline(context_t*)
+{
+}
+
+#else
+
+void rescale(uint32_t& u, uint8_t& su, uint32_t& v, uint8_t& sv)
+{
+    if (su && sv) {
+        if (su > sv) {
+            v = ggl_expand(v, sv, su);
+            sv = su;
+        } else if (su < sv) {
+            u = ggl_expand(u, su, sv);
+            su = sv;
+        }
+    }
+}
+
+void blending(context_t* c, pixel_t* fragment, pixel_t* fb)
+{
+    rescale(fragment->c[0], fragment->s[0], fb->c[0], fb->s[0]);
+    rescale(fragment->c[1], fragment->s[1], fb->c[1], fb->s[1]);
+    rescale(fragment->c[2], fragment->s[2], fb->c[2], fb->s[2]);
+    rescale(fragment->c[3], fragment->s[3], fb->c[3], fb->s[3]);
+
+    pixel_t sf, df;
+    blend_factor(c, &sf, c->state.blend.src, fragment, fb);
+    blend_factor(c, &df, c->state.blend.dst, fragment, fb);
+
+    fragment->c[1] =
+            gglMulAddx(fragment->c[1], sf.c[1], gglMulx(fb->c[1], df.c[1]));
+    fragment->c[2] =
+            gglMulAddx(fragment->c[2], sf.c[2], gglMulx(fb->c[2], df.c[2]));
+    fragment->c[3] =
+            gglMulAddx(fragment->c[3], sf.c[3], gglMulx(fb->c[3], df.c[3]));
+
+    if (c->state.blend.alpha_separate) {
+        blend_factor(c, &sf, c->state.blend.src_alpha, fragment, fb);
+        blend_factor(c, &df, c->state.blend.dst_alpha, fragment, fb);
+    }
+
+    fragment->c[0] =
+            gglMulAddx(fragment->c[0], sf.c[0], gglMulx(fb->c[0], df.c[0]));
+
+    // clamp to 1.0
+    if (fragment->c[0] >= (1LU<<fragment->s[0]))
+        fragment->c[0] = (1<<fragment->s[0])-1;
+    if (fragment->c[1] >= (1LU<<fragment->s[1]))
+        fragment->c[1] = (1<<fragment->s[1])-1;
+    if (fragment->c[2] >= (1LU<<fragment->s[2]))
+        fragment->c[2] = (1<<fragment->s[2])-1;
+    if (fragment->c[3] >= (1LU<<fragment->s[3]))
+        fragment->c[3] = (1<<fragment->s[3])-1;
+}
+
+static inline int blendfactor(uint32_t x, uint32_t size, uint32_t def = 0)
+{
+    if (!size)
+        return def;
+
+    // scale to 16 bits
+    if (size > 16) {
+        x >>= (size - 16);
+    } else if (size < 16) {
+        x = ggl_expand(x, size, 16);
+    }
+    x += x >> 15;
+    return x;
+}
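+// (Added note) after scaling, x += x >> 15 maps the maximum 16-bit value to
+// FIXED_ONE: e.g. x = 0xFFFF gives 0xFFFF + 1 = 0x10000, so a fully
+// saturated channel yields a blend factor of exactly 1.0.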
+
+void blend_factor(context_t* /*c*/, pixel_t* r, 
+        uint32_t factor, const pixel_t* src, const pixel_t* dst)
+{
+    switch (factor) {
+        case GGL_ZERO:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = 0;
+            break;
+        case GGL_ONE:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = FIXED_ONE;
+            break;
+        case GGL_DST_COLOR:
+            r->c[1] = blendfactor(dst->c[1], dst->s[1]);
+            r->c[2] = blendfactor(dst->c[2], dst->s[2]);
+            r->c[3] = blendfactor(dst->c[3], dst->s[3]);
+            r->c[0] = blendfactor(dst->c[0], dst->s[0]);
+            break;
+        case GGL_SRC_COLOR:
+            r->c[1] = blendfactor(src->c[1], src->s[1]);
+            r->c[2] = blendfactor(src->c[2], src->s[2]);
+            r->c[3] = blendfactor(src->c[3], src->s[3]);
+            r->c[0] = blendfactor(src->c[0], src->s[0]);
+            break;
+        case GGL_ONE_MINUS_DST_COLOR:
+            r->c[1] = FIXED_ONE - blendfactor(dst->c[1], dst->s[1]);
+            r->c[2] = FIXED_ONE - blendfactor(dst->c[2], dst->s[2]);
+            r->c[3] = FIXED_ONE - blendfactor(dst->c[3], dst->s[3]);
+            r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0]);
+            break;
+        case GGL_ONE_MINUS_SRC_COLOR:
+            r->c[1] = FIXED_ONE - blendfactor(src->c[1], src->s[1]);
+            r->c[2] = FIXED_ONE - blendfactor(src->c[2], src->s[2]);
+            r->c[3] = FIXED_ONE - blendfactor(src->c[3], src->s[3]);
+            r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0]);
+            break;
+        case GGL_SRC_ALPHA:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = blendfactor(src->c[0], src->s[0], FIXED_ONE);
+            break;
+        case GGL_ONE_MINUS_SRC_ALPHA:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = FIXED_ONE - blendfactor(src->c[0], src->s[0], FIXED_ONE);
+            break;
+        case GGL_DST_ALPHA:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
+            break;
+        case GGL_ONE_MINUS_DST_ALPHA:
+            r->c[1] = 
+            r->c[2] = 
+            r->c[3] = 
+            r->c[0] = FIXED_ONE - blendfactor(dst->c[0], dst->s[0], FIXED_ONE);
+            break;
+        case GGL_SRC_ALPHA_SATURATE:
+            // XXX: GGL_SRC_ALPHA_SATURATE
+            break;
+    }
+}
+
+static GGLfixed wrapping(int32_t coord, uint32_t size, int tx_wrap)
+{
+    GGLfixed d;
+    if (tx_wrap == GGL_REPEAT) {
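+        // for wrapped axes, init_y() adds 16 to the coordinate scale, so
+        // coord is 0.32 fixed-point here and wrapping is the natural uint32
+        // overflow; the top 16 bits give the position within the texture.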
+        d = (uint32_t(coord)>>16) * size;
+    } else if (tx_wrap == GGL_CLAMP) { // CLAMP_TO_EDGE semantics
+        const GGLfixed clamp_min = FIXED_HALF;
+        const GGLfixed clamp_max = (size << 16) - FIXED_HALF;
+        if (coord < clamp_min)     coord = clamp_min;
+        if (coord > clamp_max)     coord = clamp_max;
+        d = coord;
+    } else { // 1:1
+        const GGLfixed clamp_min = 0;
+        const GGLfixed clamp_max = (size << 16);
+        if (coord < clamp_min)     coord = clamp_min;
+        if (coord > clamp_max)     coord = clamp_max;
+        d = coord;
+    }
+    return d;
+}
+
+static inline
+GGLcolor ADJUST_COLOR_ITERATOR(GGLcolor v, GGLcolor dvdx, int len)
+{
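+    // if the last sample (v + dvdx*(len-1)) would be negative, shift the
+    // whole ramp so it ends at zero; v &= ~(v>>31) then clamps a negative
+    // start to zero (v>>31 is all-ones for negative v).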
+    const int32_t end = dvdx * (len-1) + v;
+    if (end < 0)
+        v -= end;
+    v &= ~(v>>31);
+    return v;
+}
+
+void scanline(context_t* c)
+{
+    const uint32_t enables = c->state.enables;
+    const int xs = c->iterators.xl;
+    const int x1 = c->iterators.xr;
+    int xc = x1 - xs;
+    const int16_t* covPtr = c->state.buffers.coverage + xs;
+
+    // All iterated values are sampled at the pixel center
+
+    // reset iterators for that scanline...
+    GGLcolor r, g, b, a;
+    iterators_t& ci = c->iterators;
+    if (enables & GGL_ENABLE_SMOOTH) {
+        r = (xs * c->shade.drdx) + ci.ydrdy;
+        g = (xs * c->shade.dgdx) + ci.ydgdy;
+        b = (xs * c->shade.dbdx) + ci.ydbdy;
+        a = (xs * c->shade.dadx) + ci.ydady;
+        r = ADJUST_COLOR_ITERATOR(r, c->shade.drdx, xc);
+        g = ADJUST_COLOR_ITERATOR(g, c->shade.dgdx, xc);
+        b = ADJUST_COLOR_ITERATOR(b, c->shade.dbdx, xc);
+        a = ADJUST_COLOR_ITERATOR(a, c->shade.dadx, xc);
+    } else {
+        r = ci.ydrdy;
+        g = ci.ydgdy;
+        b = ci.ydbdy;
+        a = ci.ydady;
+    }
+
+    // z iterators are 1.31
+    GGLfixed z = (xs * c->shade.dzdx) + ci.ydzdy;
+    GGLfixed f = (xs * c->shade.dfdx) + ci.ydfdy;
+
+    struct {
+        GGLfixed s, t;
+    } tc[GGL_TEXTURE_UNIT_COUNT];
+    if (enables & GGL_ENABLE_TMUS) {
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            if (c->state.texture[i].enable) {
+                texture_iterators_t& ti = c->state.texture[i].iterators;
+                if (enables & GGL_ENABLE_W) {
+                    tc[i].s = ti.ydsdy;
+                    tc[i].t = ti.ydtdy;
+                } else {
+                    tc[i].s = (xs * ti.dsdx) + ti.ydsdy;
+                    tc[i].t = (xs * ti.dtdx) + ti.ydtdy;
+                }
+            }
+        }
+    }
+
+    pixel_t fragment;
+    pixel_t texel;
+    pixel_t fb;
+
+    uint32_t x = xs;
+    uint32_t y = c->iterators.y;
+
+    while (xc--) {
+    
+        { // just a scope
+
+        // read color (convert to 8 bits by keeping only the integer part)
+        fragment.s[1] = fragment.s[2] =
+        fragment.s[3] = fragment.s[0] = 8;
+        fragment.c[1] = r >> (GGL_COLOR_BITS-8);
+        fragment.c[2] = g >> (GGL_COLOR_BITS-8);
+        fragment.c[3] = b >> (GGL_COLOR_BITS-8);
+        fragment.c[0] = a >> (GGL_COLOR_BITS-8);
+
+        // texturing
+        if (enables & GGL_ENABLE_TMUS) {
+            for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+                texture_t& tx = c->state.texture[i];
+                if (!tx.enable)
+                    continue;
+                texture_iterators_t& ti = tx.iterators;
+                int32_t u, v;
+
+                // s-coordinate
+                if (tx.s_coord != GGL_ONE_TO_ONE) {
+                    const int w = tx.surface.width;
+                    u = wrapping(tc[i].s, w, tx.s_wrap);
+                    tc[i].s += ti.dsdx;
+                } else {
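+                    // 1:1 mapping: texel index = texture offset + screen x,
+                    // sampled at the texel center (+ FIXED_HALF).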
+                    u = (((tx.shade.is0>>16) + x)<<16) + FIXED_HALF;
+                }
+
+                // t-coordinate
+                if (tx.t_coord != GGL_ONE_TO_ONE) {
+                    const int h = tx.surface.height;
+                    v = wrapping(tc[i].t, h, tx.t_wrap);
+                    tc[i].t += ti.dtdx;
+                } else {
+                    v = (((tx.shade.it0>>16) + y)<<16) + FIXED_HALF;
+                }
+
+                // read texture
+                if (tx.mag_filter == GGL_NEAREST &&
+                    tx.min_filter == GGL_NEAREST)
+                {
+                    u >>= 16;
+                    v >>= 16;
+                    tx.surface.read(&tx.surface, c, u, v, &texel);
+                } else {
+                    const int w = tx.surface.width;
+                    const int h = tx.surface.height;
+                    u -= FIXED_HALF;
+                    v -= FIXED_HALF;
+                    int u0 = u >> 16;
+                    int v0 = v >> 16;
+                    int u1 = u0 + 1;
+                    int v1 = v0 + 1;
+                    if (tx.s_wrap == GGL_REPEAT) {
+                        if (u0<0)  u0 += w;
+                        if (u1<0)  u1 += w;
+                        if (u0>=w) u0 -= w;
+                        if (u1>=w) u1 -= w;
+                    } else {
+                        if (u0<0)  u0 = 0;
+                        if (u1<0)  u1 = 0;
+                        if (u0>=w) u0 = w-1;
+                        if (u1>=w) u1 = w-1;
+                    }
+                    if (tx.t_wrap == GGL_REPEAT) {
+                        if (v0<0)  v0 += h;
+                        if (v1<0)  v1 += h;
+                        if (v0>=h) v0 -= h;
+                        if (v1>=h) v1 -= h;
+                    } else {
+                        if (v0<0)  v0 = 0;
+                        if (v1<0)  v1 = 0;
+                        if (v0>=h) v0 = h-1;
+                        if (v1>=h) v1 = h-1;
+                    }
+                    pixel_t texels[4];
+                    uint32_t mm[4];
+                    tx.surface.read(&tx.surface, c, u0, v0, &texels[0]);
+                    tx.surface.read(&tx.surface, c, u0, v1, &texels[1]);
+                    tx.surface.read(&tx.surface, c, u1, v0, &texels[2]);
+                    tx.surface.read(&tx.surface, c, u1, v1, &texels[3]);
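+                    // compute bilinear weights from the top 4 fraction bits;
+                    // u += u>>3 stretches 0..15 to 0..16 so the four weights
+                    // can sum to exactly 0x100.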
+                    u = (u >> 12) & 0xF; 
+                    v = (v >> 12) & 0xF;
+                    u += u>>3;
+                    v += v>>3;
+                    mm[0] = (0x10 - u) * (0x10 - v);
+                    mm[1] = (0x10 - u) * v;
+                    mm[2] = u * (0x10 - v);
+                    mm[3] = 0x100 - (mm[0] + mm[1] + mm[2]);
+                    for (int j=0 ; j<4 ; j++) {
+                        texel.s[j] = texels[0].s[j];
+                        if (!texel.s[j]) continue;
+                        texel.s[j] += 8;
+                        texel.c[j] =    texels[0].c[j]*mm[0] +
+                                        texels[1].c[j]*mm[1] +
+                                        texels[2].c[j]*mm[2] +
+                                        texels[3].c[j]*mm[3] ;
+                    }
+                }
+
+                // Texture environment...
+                for (int j=0 ; j<4 ; j++) {
+                    uint32_t& Cf = fragment.c[j];
+                    uint32_t& Ct = texel.c[j];
+                    uint8_t& sf  = fragment.s[j];
+                    uint8_t& st  = texel.s[j];
+                    uint32_t At = texel.c[0];
+                    uint8_t sat = texel.s[0];
+                    switch (tx.env) {
+                    case GGL_REPLACE:
+                        if (st) {
+                            Cf = Ct;
+                            sf = st;
+                        }
+                        break;
+                    case GGL_MODULATE:
+                        if (st) {
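+                            // Ct + (Ct >> (st-1)) maps the max texel value
+                            // ((1<<st)-1) to exactly 1<<st, so the multiply
+                            // keeps Cf at its original scale sf.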
+                            uint32_t factor = Ct + (Ct>>(st-1));
+                            Cf = (Cf * factor) >> st;
+                        }
+                        break;
+                    case GGL_DECAL:
+                        if (sat) {
+                            rescale(Cf, sf, Ct, st);
+                            Cf += ((Ct - Cf) * (At + (At>>(sat-1)))) >> sat;
+                        }
+                        break;
+                    case GGL_BLEND:
+                        if (st) {
+                            uint32_t Cc = tx.env_color[i];
+                            if (sf>8)       Cc = (Cc * ((1<<sf)-1))>>8;
+                            else if (sf<8)  Cc = (Cc - (Cc>>(8-sf)))>>(8-sf);
+                            uint32_t factor = Ct + (Ct>>(st-1));
+                            Cf = ((((1<<st) - factor) * Cf) + Ct*Cc)>>st;
+                        }
+                        break;
+                    case GGL_ADD:
+                        if (st) {
+                            rescale(Cf, sf, Ct, st);
+                            Cf += Ct;
+                        }
+                        break;
+                    }
+                }
+            }
+        }
+    
+        // coverage application
+        if (enables & GGL_ENABLE_AA) {
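+            // cf is the AA coverage for this pixel in 1.15 fixed-point;
+            // scale the fragment alpha by it.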
+            int16_t cf = *covPtr++;
+            fragment.c[0] = (int64_t(fragment.c[0]) * cf) >> 15;
+        }
+        
+        // alpha-test
+        if (enables & GGL_ENABLE_ALPHA_TEST) {
+            GGLcolor ref = c->state.alpha_test.ref;
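+            // rescale the fragment alpha to GGL_COLOR_BITS so it is directly
+            // comparable with the iterated reference value.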
+            GGLcolor alpha = (uint64_t(fragment.c[0]) *
+                    ((1<<GGL_COLOR_BITS)-1)) / ((1<<fragment.s[0])-1);
+            switch (c->state.alpha_test.func) {
+            case GGL_NEVER:     goto discard;
+            case GGL_LESS:      if (alpha<ref)  break; goto discard;
+            case GGL_EQUAL:     if (alpha==ref) break; goto discard;
+            case GGL_LEQUAL:    if (alpha<=ref) break; goto discard;
+            case GGL_GREATER:   if (alpha>ref)  break; goto discard;
+            case GGL_NOTEQUAL:  if (alpha!=ref) break; goto discard;
+            case GGL_GEQUAL:    if (alpha>=ref) break; goto discard;
+            }
+        }
+        
+        // depth test
+        if (c->state.buffers.depth.format) {
+            if (enables & GGL_ENABLE_DEPTH_TEST) {
+                surface_t* cb = &(c->state.buffers.depth);
+                uint16_t* p = (uint16_t*)(cb->data)+(x+(cb->stride*y));
+                uint16_t zz = uint32_t(z)>>(16);
+                uint16_t depth = *p;
+                switch (c->state.depth_test.func) {
+                case GGL_NEVER:     goto discard;
+                case GGL_LESS:      if (zz<depth)    break; goto discard;
+                case GGL_EQUAL:     if (zz==depth)   break; goto discard;
+                case GGL_LEQUAL:    if (zz<=depth)   break; goto discard;
+                case GGL_GREATER:   if (zz>depth)    break; goto discard;
+                case GGL_NOTEQUAL:  if (zz!=depth)   break; goto discard;
+                case GGL_GEQUAL:    if (zz>=depth)   break; goto discard;
+                }
+                // note: the depth buffer is only written when depth-test is enabled
+/*
+        fragment.s[1] = fragment.s[2] =
+        fragment.s[3] = fragment.s[0] = 8;
+        fragment.c[1] = 
+        fragment.c[2] = 
+        fragment.c[3] = 
+        fragment.c[0] = 255 - (zz>>8);
+*/
+                if (c->state.mask.depth) {
+                    *p = zz;
+                }
+            }
+        }
+
+        // fog
+        if (enables & GGL_ENABLE_FOG) {
+            for (int i=1 ; i<=3 ; i++) {
+                GGLfixed fc = (c->state.fog.color[i] * 0x10000) / 0xFF;
+                uint32_t& c = fragment.c[i];
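+                // note: this reference shadows the context pointer 'c' for
+                // the rest of this loop body.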
+                uint8_t& s  = fragment.s[i];
+                c = (c * 0x10000) / ((1<<s)-1);
+                c = gglMulAddx(c, f, gglMulx(fc, 0x10000 - f));
+                s = 16;
+            }
+        }
+
+        // blending
+        if (enables & GGL_ENABLE_BLENDING) {
+            fb.c[1] = fb.c[2] = fb.c[3] = fb.c[0] = 0; // placate valgrind
+            fb.s[1] = fb.s[2] = fb.s[3] = fb.s[0] = 0;
+            c->state.buffers.color.read(
+                    &(c->state.buffers.color), c, x, y, &fb);
+            blending( c, &fragment, &fb );
+        }
+
+        // write
+        c->state.buffers.color.write(
+                &(c->state.buffers.color), c, x, y, &fragment);
+        }
+
+discard:
+        // iterate...
+        x += 1;
+        if (enables & GGL_ENABLE_SMOOTH) {
+            r += c->shade.drdx;
+            g += c->shade.dgdx;
+            b += c->shade.dbdx;
+            a += c->shade.dadx;
+        }
+        z += c->shade.dzdx;
+        f += c->shade.dfdx;
+    }
+}
+
+#endif // ANDROID_ARM_CODEGEN && (ANDROID_CODEGEN == ANDROID_CODEGEN_GENERATED)
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Scanline
+#endif
+
+/* Used to parse a 32-bit source texture linearly. Usage is:
+ *
+ * horz_iterator32  hi(context);
+ * while (...) {
+ *    uint32_t  src_pixel = hi.get_pixel32();
+ *    ...
+ * }
+ *
+ * Use only for one-to-one texture mapping.
+ */
+struct horz_iterator32 {
+    explicit horz_iterator32(context_t* c) {
+        const int x = c->iterators.xl;
+        const int y = c->iterators.y;
+        texture_t& tx = c->state.texture[0];
+        const int32_t u = (tx.shade.is0>>16) + x;
+        const int32_t v = (tx.shade.it0>>16) + y;
+        m_src = reinterpret_cast<uint32_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
+    }
+    uint32_t  get_pixel32() {
+        return *m_src++;
+    }
+protected:
+    uint32_t* m_src;
+};
+
+/* A variant for 16-bit source textures. */
+struct horz_iterator16 {
+    explicit horz_iterator16(context_t* c) {
+        const int x = c->iterators.xl;
+        const int y = c->iterators.y;
+        texture_t& tx = c->state.texture[0];
+        const int32_t u = (tx.shade.is0>>16) + x;
+        const int32_t v = (tx.shade.it0>>16) + y;
+        m_src = reinterpret_cast<uint16_t*>(tx.surface.data)+(u+(tx.surface.stride*v));
+    }
+    uint16_t  get_pixel16() {
+        return *m_src++;
+    }
+protected:
+    uint16_t* m_src;
+};
+
+/* A clamp iterator is used to iterate inside a texture with GGL_CLAMP.
+ * After initialization, call get_src16() or get_src32() to get the current
+ * texture pixel value.
+ */
+struct clamp_iterator {
+    explicit clamp_iterator(context_t* c) {
+        const int xs = c->iterators.xl;
+        texture_t& tx = c->state.texture[0];
+        texture_iterators_t& ti = tx.iterators;
+        m_s = (xs * ti.dsdx) + ti.ydsdy;
+        m_t = (xs * ti.dtdx) + ti.ydtdy;
+        m_ds = ti.dsdx;
+        m_dt = ti.dtdx;
+        m_width_m1 = tx.surface.width - 1;
+        m_height_m1 = tx.surface.height - 1;
+        m_data = tx.surface.data;
+        m_stride = tx.surface.stride;
+    }
+    uint16_t get_pixel16() {
+        int  u, v;
+        get_uv(u, v);
+        uint16_t* src = reinterpret_cast<uint16_t*>(m_data) + (u + (m_stride*v));
+        return src[0];
+    }
+    uint32_t get_pixel32() {
+        int  u, v;
+        get_uv(u, v);
+        uint32_t* src = reinterpret_cast<uint32_t*>(m_data) + (u + (m_stride*v));
+        return src[0];
+    }
+private:
+    void   get_uv(int& u, int& v) {
+        int  uu = m_s >> 16;
+        int  vv = m_t >> 16;
+        if (uu < 0)
+            uu = 0;
+        if (uu > m_width_m1)
+            uu = m_width_m1;
+        if (vv < 0)
+            vv = 0;
+        if (vv > m_height_m1)
+            vv = m_height_m1;
+        u = uu;
+        v = vv;
+        m_s += m_ds;
+        m_t += m_dt;
+    }
+
+    GGLfixed  m_s, m_t;
+    GGLfixed  m_ds, m_dt;
+    int       m_width_m1, m_height_m1;
+    uint8_t*  m_data;
+    int       m_stride;
+};
+
+/*
+ * The 'horizontal clamp iterator' variant corresponds to the case where
+ * the 'v' coordinate doesn't change. This avoids one multiply and a few
+ * extra adds/checks per pixel, which matters when the blending/processing
+ * operation that follows is very fast.
+ */
+static int is_context_horizontal(const context_t* c) {
+    return (c->state.texture[0].iterators.dtdx == 0);
+}
+
+struct horz_clamp_iterator {
+    uint16_t  get_pixel16() {
+        int  u = m_s >> 16;
+        m_s += m_ds;
+        if (u < 0)
+            u = 0;
+        if (u > m_width_m1)
+            u = m_width_m1;
+        const uint16_t* src = reinterpret_cast<const uint16_t*>(m_data);
+        return src[u];
+    }
+    uint32_t  get_pixel32() {
+        int  u = m_s >> 16;
+        m_s += m_ds;
+        if (u < 0)
+            u = 0;
+        if (u > m_width_m1)
+            u = m_width_m1;
+        const uint32_t* src = reinterpret_cast<const uint32_t*>(m_data);
+        return src[u];
+    }
+protected:
+    void init(const context_t* c, int shift);
+    GGLfixed       m_s;
+    GGLfixed       m_ds;
+    int            m_width_m1;
+    const uint8_t* m_data;
+};
+
+void horz_clamp_iterator::init(const context_t* c, int shift)
+{
+    const int xs = c->iterators.xl;
+    const texture_t& tx = c->state.texture[0];
+    const texture_iterators_t& ti = tx.iterators;
+    m_s = (xs * ti.dsdx) + ti.ydsdy;
+    m_ds = ti.dsdx;
+    m_width_m1 = tx.surface.width-1;
+    m_data = tx.surface.data;
+
+    GGLfixed t = (xs * ti.dtdx) + ti.ydtdy;
+    int      v = t >> 16;
+    if (v < 0)
+        v = 0;
+    else if (v >= (int)tx.surface.height)
+        v = (int)tx.surface.height-1;
+
+    m_data += (tx.surface.stride*v) << shift;
+}
+
+struct horz_clamp_iterator16 : horz_clamp_iterator {
+    explicit horz_clamp_iterator16(const context_t* c) {
+        init(c,1);
+    };
+};
+
+struct horz_clamp_iterator32 : horz_clamp_iterator {
+    explicit horz_clamp_iterator32(context_t* c) {
+        init(c,2);
+    };
+};
+
+/* This is used to perform dithering operations.
+ */
+struct ditherer {
+    explicit ditherer(const context_t* c) {
+        const int x = c->iterators.xl;
+        const int y = c->iterators.y;
+        m_line = &c->ditherMatrix[ ((y & GGL_DITHER_MASK)<<GGL_DITHER_ORDER_SHIFT) ];
+        m_index = x & GGL_DITHER_MASK;
+    }
+    void step(void) {
+        m_index++;
+    }
+    int  get_value(void) {
+        int ret = m_line[m_index & GGL_DITHER_MASK];
+        m_index++;
+        return ret;
+    }
+    uint16_t abgr8888ToRgb565(uint32_t s) {
+        uint32_t r = s & 0xff;
+        uint32_t g = (s >> 8) & 0xff;
+        uint32_t b = (s >> 16) & 0xff;
+        return rgb888ToRgb565(r,g,b);
+    }
+    /* The following assumes that r/g/b are in the 0..255 range each */
+    uint16_t rgb888ToRgb565(uint32_t& r, uint32_t& g, uint32_t &b) {
+        int threshold = get_value();
+        /* dither in on GGL_DITHER_BITS, and each of r, g, b is on 8 bits */
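+        /* r and b keep 5 bits and g keeps 6, so the threshold is shifted down
+         * to the magnitude of the 3 (resp. 2) low bits that the final
+         * truncation discards. */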
+        r += (threshold >> (GGL_DITHER_BITS-8 +5));
+        g += (threshold >> (GGL_DITHER_BITS-8 +6));
+        b += (threshold >> (GGL_DITHER_BITS-8 +5));
+        if (r > 0xff)
+            r = 0xff;
+        if (g > 0xff)
+            g = 0xff;
+        if (b > 0xff)
+            b = 0xff;
+        return uint16_t(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
+    }
+protected:
+    const uint8_t* m_line;
+    int            m_index;
+};
+
+/* This structure is used to blend (SRC_OVER) 32-bit source pixels
+ * onto 16-bit destination ones. Usage is simply:
+ *
+ *   blender.write(<32-bit-src-pixel-value>, <ptr-to-16-bit-dest-pixel>)
+ */
+struct blender_32to16 {
+    explicit blender_32to16(context_t* /*c*/) { }
+    void write(uint32_t s, uint16_t* dst) {
+        if (s == 0)
+            return;
+        s = GGL_RGBA_TO_HOST(s);
+        int sA = (s>>24);
+        if (sA == 0xff) {
+            *dst = convertAbgr8888ToRgb565(s);
+        } else {
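+            // sA + (sA>>7) maps 255 to 256, so f is 256*(1 - srcA), i.e. the
+            // one-minus-src-alpha factor in 0.8 fixed-point.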
+            int f = 0x100 - (sA + (sA>>7));
+            int sR = (s >> (   3))&0x1F;
+            int sG = (s >> ( 8+2))&0x3F;
+            int sB = (s >> (16+3))&0x1F;
+            uint16_t d = *dst;
+            int dR = (d>>11)&0x1f;
+            int dG = (d>>5)&0x3f;
+            int dB = (d)&0x1f;
+            sR += (f*dR)>>8;
+            sG += (f*dG)>>8;
+            sB += (f*dB)>>8;
+            *dst = uint16_t((sR<<11)|(sG<<5)|sB);
+        }
+    }
+    void write(uint32_t s, uint16_t* dst, ditherer& di) {
+        if (s == 0) {
+            di.step();
+            return;
+        }
+        s = GGL_RGBA_TO_HOST(s);
+        int sA = (s>>24);
+        if (sA == 0xff) {
+            *dst = di.abgr8888ToRgb565(s);
+        } else {
+            int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
+            int f = 0x100 - (sA + (sA>>7));
+            int sR = (s >> (   3))&0x1F;
+            int sG = (s >> ( 8+2))&0x3F;
+            int sB = (s >> (16+3))&0x1F;
+            uint16_t d = *dst;
+            int dR = (d>>11)&0x1f;
+            int dG = (d>>5)&0x3f;
+            int dB = (d)&0x1f;
+            sR = ((sR << 8) + f*dR + threshold)>>8;
+            sG = ((sG << 8) + f*dG + threshold)>>8;
+            sB = ((sB << 8) + f*dB + threshold)>>8;
+            if (sR > 0x1f) sR = 0x1f;
+            if (sG > 0x3f) sG = 0x3f;
+            if (sB > 0x1f) sB = 0x1f;
+            *dst = uint16_t((sR<<11)|(sG<<5)|sB);
+        }
+    }
+};
+
+/* This blender does the same for the 'blend_srca' operation,
+ * where srcFactor=srcA and dstFactor=(1-srcA) (see f1/f2 below).
+ */
+struct blender_32to16_srcA {
+    explicit blender_32to16_srcA(const context_t* /*c*/) { }
+    void write(uint32_t s, uint16_t* dst) {
+        if (!s) {
+            return;
+        }
+        uint16_t d = *dst;
+        s = GGL_RGBA_TO_HOST(s);
+        int sR = (s >> (   3))&0x1F;
+        int sG = (s >> ( 8+2))&0x3F;
+        int sB = (s >> (16+3))&0x1F;
+        int sA = (s>>24);
+        int f1 = (sA + (sA>>7));
+        int f2 = 0x100-f1;
+        int dR = (d>>11)&0x1f;
+        int dG = (d>>5)&0x3f;
+        int dB = (d)&0x1f;
+        sR = (f1*sR + f2*dR)>>8;
+        sG = (f1*sG + f2*dG)>>8;
+        sB = (f1*sB + f2*dB)>>8;
+        *dst = uint16_t((sR<<11)|(sG<<5)|sB);
+    }
+};
+
+/* Common init code for the modulating blenders */
+struct blender_modulate {
+    void init(const context_t* c) {
+        const int r = c->iterators.ydrdy >> (GGL_COLOR_BITS-8);
+        const int g = c->iterators.ydgdy >> (GGL_COLOR_BITS-8);
+        const int b = c->iterators.ydbdy >> (GGL_COLOR_BITS-8);
+        const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
+        m_r = r + (r >> 7);
+        m_g = g + (g >> 7);
+        m_b = b + (b >> 7);
+        m_a = a + (a >> 7);
+    }
+protected:
+    int m_r, m_g, m_b, m_a;
+};
+
+/* This blender does a normal blend after modulation.
+ */
+struct blender_32to16_modulate : blender_modulate {
+    explicit blender_32to16_modulate(const context_t* c) {
+        init(c);
+    }
+    void write(uint32_t s, uint16_t* dst) {
+        // blend source and destination
+        if (!s) {
+            return;
+        }
+        s = GGL_RGBA_TO_HOST(s);
+
+        /* We need to modulate s */
+        uint32_t  sA = (s >> 24);
+        uint32_t  sB = (s >> 16) & 0xff;
+        uint32_t  sG = (s >> 8) & 0xff;
+        uint32_t  sR = s & 0xff;
+
+        sA = (sA*m_a) >> 8;
+        /* Keep R/G/B scaled to 5.8 or 6.8 fixed-point format */
+        sR = (sR*m_r) >> (8 - 5);
+        sG = (sG*m_g) >> (8 - 6);
+        sB = (sB*m_b) >> (8 - 5);
+
+        /* Now do a normal blend */
+        int f = 0x100 - (sA + (sA>>7));
+        uint16_t d = *dst;
+        int dR = (d>>11)&0x1f;
+        int dG = (d>>5)&0x3f;
+        int dB = (d)&0x1f;
+        sR = (sR + f*dR)>>8;
+        sG = (sG + f*dG)>>8;
+        sB = (sB + f*dB)>>8;
+        *dst = uint16_t((sR<<11)|(sG<<5)|sB);
+    }
+    void write(uint32_t s, uint16_t* dst, ditherer& di) {
+        // blend source and destination
+        if (!s) {
+            di.step();
+            return;
+        }
+        s = GGL_RGBA_TO_HOST(s);
+
+        /* We need to modulate s */
+        uint32_t  sA = (s >> 24);
+        uint32_t  sB = (s >> 16) & 0xff;
+        uint32_t  sG = (s >> 8) & 0xff;
+        uint32_t  sR = s & 0xff;
+
+        sA = (sA*m_a) >> 8;
+        /* keep R/G/B scaled to 5.8 or 6.8 fixed-point format */
+        sR = (sR*m_r) >> (8 - 5);
+        sG = (sG*m_g) >> (8 - 6);
+        sB = (sB*m_b) >> (8 - 5);
+
+        /* Scale threshold to 0.8 fixed-point format */
+        int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
+        int f = 0x100 - (sA + (sA>>7));
+        uint16_t d = *dst;
+        int dR = (d>>11)&0x1f;
+        int dG = (d>>5)&0x3f;
+        int dB = (d)&0x1f;
+        sR = (sR + f*dR + threshold)>>8;
+        sG = (sG + f*dG + threshold)>>8;
+        sB = (sB + f*dB + threshold)>>8;
+        if (sR > 0x1f) sR = 0x1f;
+        if (sG > 0x3f) sG = 0x3f;
+        if (sB > 0x1f) sB = 0x1f;
+        *dst = uint16_t((sR<<11)|(sG<<5)|sB);
+    }
+};
+
+/* same as 32to16_modulate, except that the input is xRGB, instead of ARGB */
+struct blender_x32to16_modulate : blender_modulate {
+    explicit blender_x32to16_modulate(const context_t* c) {
+        init(c);
+    }
+    void write(uint32_t s, uint16_t* dst) {
+        s = GGL_RGBA_TO_HOST(s);
+
+        uint32_t  sB = (s >> 16) & 0xff;
+        uint32_t  sG = (s >> 8) & 0xff;
+        uint32_t  sR = s & 0xff;
+
+        /* Keep R/G/B in 5.8 or 6.8 format */
+        sR = (sR*m_r) >> (8 - 5);
+        sG = (sG*m_g) >> (8 - 6);
+        sB = (sB*m_b) >> (8 - 5);
+
+        int f = 0x100 - m_a;
+        uint16_t d = *dst;
+        int dR = (d>>11)&0x1f;
+        int dG = (d>>5)&0x3f;
+        int dB = (d)&0x1f;
+        sR = (sR + f*dR)>>8;
+        sG = (sG + f*dG)>>8;
+        sB = (sB + f*dB)>>8;
+        *dst = uint16_t((sR<<11)|(sG<<5)|sB);
+    }
+    void write(uint32_t s, uint16_t* dst, ditherer& di) {
+        s = GGL_RGBA_TO_HOST(s);
+
+        uint32_t  sB = (s >> 16) & 0xff;
+        uint32_t  sG = (s >> 8) & 0xff;
+        uint32_t  sR = s & 0xff;
+
+        sR = (sR*m_r) >> (8 - 5);
+        sG = (sG*m_g) >> (8 - 6);
+        sB = (sB*m_b) >> (8 - 5);
+
+        /* Now do a normal blend */
+        int threshold = di.get_value() << (8 - GGL_DITHER_BITS);
+        int f = 0x100 - m_a;
+        uint16_t d = *dst;
+        int dR = (d>>11)&0x1f;
+        int dG = (d>>5)&0x3f;
+        int dB = (d)&0x1f;
+        sR = (sR + f*dR + threshold)>>8;
+        sG = (sG + f*dG + threshold)>>8;
+        sB = (sB + f*dB + threshold)>>8;
+        if (sR > 0x1f) sR = 0x1f;
+        if (sG > 0x3f) sG = 0x3f;
+        if (sB > 0x1f) sB = 0x1f;
+        *dst = uint16_t((sR<<11)|(sG<<5)|sB);
+    }
+};
+
+/* Same as above, but source is 16bit rgb565 */
+struct blender_16to16_modulate : blender_modulate {
+    explicit blender_16to16_modulate(const context_t* c) {
+        init(c);
+    }
+    void write(uint16_t s16, uint16_t* dst) {
+        uint32_t  s = s16;
+
+        uint32_t  sR = s >> 11;
+        uint32_t  sG = (s >> 5) & 0x3f;
+        uint32_t  sB = s & 0x1f;
+
+        sR = (sR*m_r);
+        sG = (sG*m_g);
+        sB = (sB*m_b);
+
+        int f = 0x100 - m_a;
+        uint16_t d = *dst;
+        int dR = (d>>11)&0x1f;
+        int dG = (d>>5)&0x3f;
+        int dB = (d)&0x1f;
+        sR = (sR + f*dR)>>8;
+        sG = (sG + f*dG)>>8;
+        sB = (sB + f*dB)>>8;
+        *dst = uint16_t((sR<<11)|(sG<<5)|sB);
+    }
+};
+
+/* This is used to iterate over a 16-bit destination color buffer.
+ * Usage is:
+ *
+ *   dst_iterator16  di(context);
+ *   while (di.count--) {
+ *       <do stuff with dest pixel at di.dst>
+ *       di.dst++;
+ *   }
+ */
+struct dst_iterator16 {
+    explicit dst_iterator16(const context_t* c) {
+        const int x = c->iterators.xl;
+        const int width = c->iterators.xr - x;
+        const int32_t y = c->iterators.y;
+        const surface_t* cb = &(c->state.buffers.color);
+        count = width;
+        dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
+    }
+    int        count;
+    uint16_t*  dst;
+};
+
+
+static void scanline_t32cb16_clamp(context_t* c)
+{
+    dst_iterator16  di(c);
+
+    if (is_context_horizontal(c)) {
+        /* Special case for simple horizontal scaling */
+        horz_clamp_iterator32 ci(c);
+        while (di.count--) {
+            uint32_t s = ci.get_pixel32();
+            *di.dst++ = convertAbgr8888ToRgb565(s);
+        }
+    } else {
+        /* General case */
+        clamp_iterator ci(c);
+        while (di.count--) {
+            uint32_t s = ci.get_pixel32();
+            *di.dst++ = convertAbgr8888ToRgb565(s);
+        }
+    }
+}
+
+static void scanline_t32cb16_dither(context_t* c)
+{
+    horz_iterator32 si(c);
+    dst_iterator16  di(c);
+    ditherer        dither(c);
+
+    while (di.count--) {
+        uint32_t s = si.get_pixel32();
+        *di.dst++ = dither.abgr8888ToRgb565(s);
+    }
+}
+
+static void scanline_t32cb16_clamp_dither(context_t* c)
+{
+    dst_iterator16  di(c);
+    ditherer        dither(c);
+
+    if (is_context_horizontal(c)) {
+        /* Special case for simple horizontal scaling */
+        horz_clamp_iterator32 ci(c);
+        while (di.count--) {
+            uint32_t s = ci.get_pixel32();
+            *di.dst++ = dither.abgr8888ToRgb565(s);
+        }
+    } else {
+        /* General case */
+        clamp_iterator ci(c);
+        while (di.count--) {
+            uint32_t s = ci.get_pixel32();
+            *di.dst++ = dither.abgr8888ToRgb565(s);
+        }
+    }
+}
+
+static void scanline_t32cb16blend_dither(context_t* c)
+{
+    dst_iterator16 di(c);
+    ditherer       dither(c);
+    blender_32to16 bl(c);
+    horz_iterator32  hi(c);
+    while (di.count--) {
+        uint32_t s = hi.get_pixel32();
+        bl.write(s, di.dst, dither);
+        di.dst++;
+    }
+}
+
+static void scanline_t32cb16blend_clamp(context_t* c)
+{
+    dst_iterator16  di(c);
+    blender_32to16  bl(c);
+
+    if (is_context_horizontal(c)) {
+        horz_clamp_iterator32 ci(c);
+        while (di.count--) {
+            uint32_t s = ci.get_pixel32();
+            bl.write(s, di.dst);
+            di.dst++;
+        }
+    } else {
+        clamp_iterator ci(c);
+        while (di.count--) {
+            uint32_t s = ci.get_pixel32();
+            bl.write(s, di.dst);
+            di.dst++;
+        }
+    }
+}
+
+static void scanline_t32cb16blend_clamp_dither(context_t* c)
+{
+    dst_iterator16 di(c);
+    ditherer       dither(c);
+    blender_32to16 bl(c);
+
+    clamp_iterator ci(c);
+    while (di.count--) {
+        uint32_t s = ci.get_pixel32();
+        bl.write(s, di.dst, dither);
+        di.dst++;
+    }
+}
+
+void scanline_t32cb16blend_clamp_mod(context_t* c)
+{
+    dst_iterator16 di(c);
+    blender_32to16_modulate bl(c);
+
+    clamp_iterator ci(c);
+    while (di.count--) {
+        uint32_t s = ci.get_pixel32();
+        bl.write(s, di.dst);
+        di.dst++;
+    }
+}
+
+void scanline_t32cb16blend_clamp_mod_dither(context_t* c)
+{
+    dst_iterator16 di(c);
+    blender_32to16_modulate bl(c);
+    ditherer dither(c);
+
+    clamp_iterator ci(c);
+    while (di.count--) {
+        uint32_t s = ci.get_pixel32();
+        bl.write(s, di.dst, dither);
+        di.dst++;
+    }
+}
+
+/* Variant of scanline_t32cb16blend_clamp_mod with a xRGB texture */
+void scanline_x32cb16blend_clamp_mod(context_t* c)
+{
+    dst_iterator16 di(c);
+    blender_x32to16_modulate  bl(c);
+
+    clamp_iterator ci(c);
+    while (di.count--) {
+        uint32_t s = ci.get_pixel32();
+        bl.write(s, di.dst);
+        di.dst++;
+    }
+}
+
+void scanline_x32cb16blend_clamp_mod_dither(context_t* c)
+{
+    dst_iterator16 di(c);
+    blender_x32to16_modulate  bl(c);
+    ditherer dither(c);
+
+    clamp_iterator ci(c);
+    while (di.count--) {
+        uint32_t s = ci.get_pixel32();
+        bl.write(s, di.dst, dither);
+        di.dst++;
+    }
+}
+
+void scanline_t16cb16_clamp(context_t* c)
+{
+    dst_iterator16  di(c);
+
+    /* Special case for simple horizontal scaling */
+    if (is_context_horizontal(c)) {
+        horz_clamp_iterator16 ci(c);
+        while (di.count--) {
+            *di.dst++ = ci.get_pixel16();
+        }
+    } else {
+        clamp_iterator ci(c);
+        while (di.count--) {
+            *di.dst++ = ci.get_pixel16();
+        }
+    }
+}
+
+
+
+template <typename T, typename U>
+static inline __attribute__((const))
+T interpolate(int y, T v0, U dvdx, U dvdy) {
+    // interpolates in pixel's centers
+    // v = v0 + (y + 0.5) * dvdy + (0.5 * dvdx)
+    return (y * dvdy) + (v0 + ((dvdy + dvdx) >> 1));
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+void init_y(context_t* c, int32_t ys)
+{
+    const uint32_t enables = c->state.enables;
+
+    // compute iterators...
+    iterators_t& ci = c->iterators;
+    
+    // sample in the center
+    ci.y = ys;
+
+    if (enables & (GGL_ENABLE_DEPTH_TEST|GGL_ENABLE_W|GGL_ENABLE_FOG)) {
+        ci.ydzdy = interpolate(ys, c->shade.z0, c->shade.dzdx, c->shade.dzdy);
+        ci.ydwdy = interpolate(ys, c->shade.w0, c->shade.dwdx, c->shade.dwdy);
+        ci.ydfdy = interpolate(ys, c->shade.f0, c->shade.dfdx, c->shade.dfdy);
+    }
+
+    if (ggl_unlikely(enables & GGL_ENABLE_SMOOTH)) {
+        ci.ydrdy = interpolate(ys, c->shade.r0, c->shade.drdx, c->shade.drdy);
+        ci.ydgdy = interpolate(ys, c->shade.g0, c->shade.dgdx, c->shade.dgdy);
+        ci.ydbdy = interpolate(ys, c->shade.b0, c->shade.dbdx, c->shade.dbdy);
+        ci.ydady = interpolate(ys, c->shade.a0, c->shade.dadx, c->shade.dady);
+        c->step_y = step_y__smooth;
+    } else {
+        ci.ydrdy = c->shade.r0;
+        ci.ydgdy = c->shade.g0;
+        ci.ydbdy = c->shade.b0;
+        ci.ydady = c->shade.a0;
+        // XXX: do only if needed, or make sure this is fast
+        c->packed = ggl_pack_color(c, c->state.buffers.color.format,
+                ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
+        c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888, 
+                ci.ydrdy, ci.ydgdy, ci.ydbdy, ci.ydady);
+    }
+
+    // initialize the variables we need in the shader
+    generated_vars_t& gen = c->generated_vars;
+    gen.argb[GGLFormat::ALPHA].c  = ci.ydady;
+    gen.argb[GGLFormat::ALPHA].dx = c->shade.dadx;
+    gen.argb[GGLFormat::RED  ].c  = ci.ydrdy;
+    gen.argb[GGLFormat::RED  ].dx = c->shade.drdx;
+    gen.argb[GGLFormat::GREEN].c  = ci.ydgdy;
+    gen.argb[GGLFormat::GREEN].dx = c->shade.dgdx;
+    gen.argb[GGLFormat::BLUE ].c  = ci.ydbdy;
+    gen.argb[GGLFormat::BLUE ].dx = c->shade.dbdx;
+    gen.dzdx = c->shade.dzdx;
+    gen.f    = ci.ydfdy;
+    gen.dfdx = c->shade.dfdx;
+
+    if (enables & GGL_ENABLE_TMUS) {
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            texture_t& t = c->state.texture[i];
+            if (!t.enable) continue;
+
+            texture_iterators_t& ti = t.iterators;
+            if (t.s_coord == GGL_ONE_TO_ONE && t.t_coord == GGL_ONE_TO_ONE) {
+                // we need to set all of these to 0 because in some cases
+                // step_y__generic() or step_y__tmu() will be used and
+                // therefore will update dtdy; however, in 1:1 mode
+                // this is always done by the scanline rasterizer.
+                ti.dsdx = ti.dsdy = ti.dtdx = ti.dtdy = 0;
+                ti.ydsdy = t.shade.is0;
+                ti.ydtdy = t.shade.it0;
+            } else {
+                const int adjustSWrap = ((t.s_wrap==GGL_CLAMP)?0:16);
+                const int adjustTWrap = ((t.t_wrap==GGL_CLAMP)?0:16);
+                ti.sscale = t.shade.sscale + adjustSWrap;
+                ti.tscale = t.shade.tscale + adjustTWrap;
+                if (!(enables & GGL_ENABLE_W)) {
+                    // S coordinate
+                    const int32_t sscale = ti.sscale;
+                    const int32_t sy = interpolate(ys,
+                            t.shade.is0, t.shade.idsdx, t.shade.idsdy);
+                    if (sscale>=0) {
+                        ti.ydsdy= sy            << sscale;
+                        ti.dsdx = t.shade.idsdx << sscale; 
+                        ti.dsdy = t.shade.idsdy << sscale;
+                    } else {
+                        ti.ydsdy= sy            >> -sscale;
+                        ti.dsdx = t.shade.idsdx >> -sscale; 
+                        ti.dsdy = t.shade.idsdy >> -sscale;
+                    }
+                    // T coordinate
+                    const int32_t tscale = ti.tscale;
+                    const int32_t ty = interpolate(ys,
+                            t.shade.it0, t.shade.idtdx, t.shade.idtdy);
+                    if (tscale>=0) {
+                        ti.ydtdy= ty            << tscale;
+                        ti.dtdx = t.shade.idtdx << tscale; 
+                        ti.dtdy = t.shade.idtdy << tscale;
+                    } else {
+                        ti.ydtdy= ty            >> -tscale;
+                        ti.dtdx = t.shade.idtdx >> -tscale; 
+                        ti.dtdy = t.shade.idtdy >> -tscale;
+                    }
+                }
+            }
+            // mirror for generated code...
+            generated_tex_vars_t& gen = c->generated_vars.texture[i];
+            gen.width   = t.surface.width;
+            gen.height  = t.surface.height;
+            gen.stride  = t.surface.stride;
+            gen.data    = uintptr_t(t.surface.data);
+            gen.dsdx = ti.dsdx;
+            gen.dtdx = ti.dtdx;
+        }
+    }
+
+    // choose the y-stepper
+    c->step_y = step_y__nop;
+    if (enables & GGL_ENABLE_FOG) {
+        c->step_y = step_y__generic;
+    } else if (enables & GGL_ENABLE_TMUS) {
+        if (enables & GGL_ENABLE_SMOOTH) {
+            c->step_y = step_y__generic;
+        } else if (enables & GGL_ENABLE_W) {
+            c->step_y = step_y__w;
+        } else {
+            c->step_y = step_y__tmu;
+        }
+    } else {
+        if (enables & GGL_ENABLE_SMOOTH) {
+            c->step_y = step_y__smooth;
+        }
+    }
+    
+    // choose the rectangle blitter
+    c->rect = rect_generic;
+    if ((c->step_y == step_y__nop) &&
+        (c->scanline == scanline_memcpy))
+    {
+        c->rect = rect_memcpy;
+    }
+}
+
+void init_y_packed(context_t* c, int32_t y0)
+{
+    uint8_t f = c->state.buffers.color.format;
+    c->packed = ggl_pack_color(c, f,
+            c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
+    c->packed8888 = ggl_pack_color(c, GGL_PIXEL_FORMAT_RGBA_8888,
+            c->shade.r0, c->shade.g0, c->shade.b0, c->shade.a0);
+    c->iterators.y = y0;
+    c->step_y = step_y__nop;
+    // choose the rectangle blitter
+    c->rect = rect_generic;
+    if (c->scanline == scanline_memcpy) {
+        c->rect = rect_memcpy;
+    }
+}
+
+void init_y_noop(context_t* c, int32_t y0)
+{
+    c->iterators.y = y0;
+    c->step_y = step_y__nop;
+    // choose the rectangle blitter
+    c->rect = rect_generic;
+    if (c->scanline == scanline_memcpy) {
+        c->rect = rect_memcpy;
+    }
+}
+
+void init_y_error(context_t* c, int32_t y0)
+{
+    // whoops, should never happen;
+    // fail gracefully (don't display anything)
+    init_y_noop(c, y0);
+    ALOGE("color-buffer has an invalid format!");
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+void step_y__generic(context_t* c)
+{
+    const uint32_t enables = c->state.enables;
+
+    // iterate...
+    iterators_t& ci = c->iterators;
+    ci.y += 1;
+                
+    if (enables & GGL_ENABLE_SMOOTH) {
+        ci.ydrdy += c->shade.drdy;
+        ci.ydgdy += c->shade.dgdy;
+        ci.ydbdy += c->shade.dbdy;
+        ci.ydady += c->shade.dady;
+    }
+
+    const uint32_t mask =
+            GGL_ENABLE_DEPTH_TEST |
+            GGL_ENABLE_W |
+            GGL_ENABLE_FOG;
+    if (enables & mask) {
+        ci.ydzdy += c->shade.dzdy;
+        ci.ydwdy += c->shade.dwdy;
+        ci.ydfdy += c->shade.dfdy;
+    }
+
+    if ((enables & GGL_ENABLE_TMUS) && (!(enables & GGL_ENABLE_W))) {
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            if (c->state.texture[i].enable) {
+                texture_iterators_t& ti = c->state.texture[i].iterators;
+                ti.ydsdy += ti.dsdy;
+                ti.ydtdy += ti.dtdy;
+            }
+        }
+    }
+}
+
+void step_y__nop(context_t* c)
+{
+    c->iterators.y += 1;
+    c->iterators.ydzdy += c->shade.dzdy;
+}
+
+void step_y__smooth(context_t* c)
+{
+    iterators_t& ci = c->iterators;
+    ci.y += 1;
+    ci.ydrdy += c->shade.drdy;
+    ci.ydgdy += c->shade.dgdy;
+    ci.ydbdy += c->shade.dbdy;
+    ci.ydady += c->shade.dady;
+    ci.ydzdy += c->shade.dzdy;
+}
+
+void step_y__w(context_t* c)
+{
+    iterators_t& ci = c->iterators;
+    ci.y += 1;
+    ci.ydzdy += c->shade.dzdy;
+    ci.ydwdy += c->shade.dwdy;
+}
+
+void step_y__tmu(context_t* c)
+{
+    iterators_t& ci = c->iterators;
+    ci.y += 1;
+    ci.ydzdy += c->shade.dzdy;
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+        if (c->state.texture[i].enable) {
+            texture_iterators_t& ti = c->state.texture[i].iterators;
+            ti.ydsdy += ti.dsdy;
+            ti.ydtdy += ti.dtdy;
+        }
+    }
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+void scanline_perspective(context_t* c)
+{
+    struct {
+        union {
+            struct {
+                int32_t s, sq;
+                int32_t t, tq;
+            } sqtq;
+            struct {
+                int32_t v, q;
+            } st[2];
+        };
+    } tc[GGL_TEXTURE_UNIT_COUNT] __attribute__((aligned(16)));
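+    // the two views of the union alias: 'sqtq' names the fields while
+    // 'st[2]' lets the coordinate loops below treat s and t uniformly.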
+
+    // XXX: we should have a special case when dwdx = 0
+
+    // 32-pixel spans work okay. 16 is a lot better,
+    // but hey, it's a software renderer...
+    const uint32_t SPAN_BITS = 5; 
+    const uint32_t ys = c->iterators.y;
+    const uint32_t xs = c->iterators.xl;
+    const uint32_t x1 = c->iterators.xr;
+    const uint32_t xc = x1 - xs;
+    uint32_t remainder = xc & ((1<<SPAN_BITS)-1);
+    uint32_t numSpans = xc >> SPAN_BITS;
+
+    const iterators_t& ci = c->iterators;
+    int32_t w0 = (xs * c->shade.dwdx) + ci.ydwdy;
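+    // q0 approximates 1/w0 (gglRecipQ); iwscale is the number of significant
+    // bits in that reciprocal, used later to rescale gglMulx() products.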
+    int32_t q0 = gglRecipQ(w0, 30);
+    const int iwscale = 32 - gglClz(q0);
+
+    const int32_t dwdx = c->shade.dwdx << SPAN_BITS;
+    int32_t xl = c->iterators.xl;
+
+    // We process s & t with a loop to reduce the code size
+    // (and i-cache pressure).
+
+    for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+        const texture_t& tmu = c->state.texture[i];
+        if (!tmu.enable) continue;
+        int32_t s =   tmu.shade.is0 +
+                     (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
+                     ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
+        int32_t t =   tmu.shade.it0 +
+                     (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
+                     ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
+        tc[i].sqtq.s  = s;
+        tc[i].sqtq.t  = t;
+        tc[i].sqtq.sq = gglMulx(s, q0, iwscale);
+        tc[i].sqtq.tq = gglMulx(t, q0, iwscale);
+    }
+
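+    // perspective-correct only at span boundaries: 1/w is recomputed at each
+    // span end and s/w, t/w are interpolated linearly within the span.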
+    int32_t span = 0;
+    do {
+        int32_t w1;
+        if (ggl_likely(numSpans)) {
+            w1 = w0 + dwdx;
+        } else {
+            if (remainder) {
+                // finish off the scanline...
+                span = remainder;
+                w1 = (c->shade.dwdx * span) + w0;
+            } else {
+                break;
+            }
+        }
+        int32_t q1 = gglRecipQ(w1, 30);
+        for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; ++i) {
+            texture_t& tmu = c->state.texture[i];
+            if (!tmu.enable) continue;
+            texture_iterators_t& ti = tmu.iterators;
+
+            for (int j=0 ; j<2 ; j++) {
+                int32_t v = tc[i].st[j].v;
+                if (span)   v += (tmu.shade.st[j].dx)*span;
+                else        v += (tmu.shade.st[j].dx)<<SPAN_BITS;
+                const int32_t v0 = tc[i].st[j].q;
+                const int32_t v1 = gglMulx(v, q1, iwscale);
+                int32_t dvdx = v1 - v0;
+                if (span)   dvdx /= span;
+                else        dvdx >>= SPAN_BITS;
+                tc[i].st[j].v = v;
+                tc[i].st[j].q = v1;
+
+                const int scale = ti.st[j].scale + (iwscale - 30);
+                if (scale >= 0) {
+                    ti.st[j].ydvdy = v0   << scale;
+                    ti.st[j].dvdx  = dvdx << scale;
+                } else {
+                    ti.st[j].ydvdy = v0   >> -scale;
+                    ti.st[j].dvdx  = dvdx >> -scale;
+                }
+            }
+            generated_tex_vars_t& gen = c->generated_vars.texture[i];
+            gen.dsdx = ti.st[0].dvdx;
+            gen.dtdx = ti.st[1].dvdx;
+        }
+        c->iterators.xl = xl;
+        c->iterators.xr = xl = xl + (span ? span : (1<<SPAN_BITS));
+        w0 = w1;
+        q0 = q1;
+        c->span(c);
+    } while(numSpans--);
+}
+
+void scanline_perspective_single(context_t* c)
+{
+    // 32-pixel spans work okay. 16 is a lot better,
+    // but hey, it's a software renderer...
+    const uint32_t SPAN_BITS = 5; 
+    const uint32_t ys = c->iterators.y;
+    const uint32_t xs = c->iterators.xl;
+    const uint32_t x1 = c->iterators.xr;
+    const uint32_t xc = x1 - xs;
+
+    const iterators_t& ci = c->iterators;
+    int32_t w = (xs * c->shade.dwdx) + ci.ydwdy;
+    int32_t iw = gglRecipQ(w, 30);
+    const int iwscale = 32 - gglClz(iw);
+
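+    // 31 - clz(x) is the index of the highest set bit; since exactly one TMU
+    // is enabled here, this is that unit's index.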
+    const int i = 31 - gglClz(c->state.enabled_tmu);
+    generated_tex_vars_t& gen = c->generated_vars.texture[i];
+    texture_t& tmu = c->state.texture[i];
+    texture_iterators_t& ti = tmu.iterators;
+    const int sscale = ti.sscale + (iwscale - 30);
+    const int tscale = ti.tscale + (iwscale - 30);
+    int32_t s =   tmu.shade.is0 +
+                 (tmu.shade.idsdy * ys) + (tmu.shade.idsdx * xs) +
+                 ((tmu.shade.idsdx + tmu.shade.idsdy)>>1);
+    int32_t t =   tmu.shade.it0 +
+                 (tmu.shade.idtdy * ys) + (tmu.shade.idtdx * xs) +
+                 ((tmu.shade.idtdx + tmu.shade.idtdy)>>1);
+    int32_t s0 = gglMulx(s, iw, iwscale);
+    int32_t t0 = gglMulx(t, iw, iwscale);
+    int32_t xl = c->iterators.xl;
+
+    int32_t sq, tq, dsdx, dtdx;
+    int32_t premainder = xc & ((1<<SPAN_BITS)-1);
+    uint32_t numSpans = xc >> SPAN_BITS;
+    if (c->shade.dwdx == 0) {
+        // XXX: we could choose to do this if the error is small enough
+        numSpans = 0;
+        premainder = xc;
+        goto no_perspective;
+    }
+
+    if (premainder) {
+        w += c->shade.dwdx   * premainder;
+        iw = gglRecipQ(w, 30);
+no_perspective:        
+        s += tmu.shade.idsdx * premainder;
+        t += tmu.shade.idtdx * premainder;
+        sq = gglMulx(s, iw, iwscale);
+        tq = gglMulx(t, iw, iwscale);
+        dsdx = (sq - s0) / premainder;
+        dtdx = (tq - t0) / premainder;
+        c->iterators.xl = xl;
+        c->iterators.xr = xl = xl + premainder;
+        goto finish;
+    }
+
+    while (numSpans--) {
+        w += c->shade.dwdx   << SPAN_BITS;
+        s += tmu.shade.idsdx << SPAN_BITS;
+        t += tmu.shade.idtdx << SPAN_BITS;
+        iw = gglRecipQ(w, 30);
+        sq = gglMulx(s, iw, iwscale);
+        tq = gglMulx(t, iw, iwscale);
+        dsdx = (sq - s0) >> SPAN_BITS;
+        dtdx = (tq - t0) >> SPAN_BITS;
+        c->iterators.xl = xl;
+        c->iterators.xr = xl = xl + (1<<SPAN_BITS);
+finish:
+        if (sscale >= 0) {
+            ti.ydsdy = s0   << sscale;
+            ti.dsdx  = dsdx << sscale;
+        } else {
+            ti.ydsdy = s0   >>-sscale;
+            ti.dsdx  = dsdx >>-sscale;
+        }
+        if (tscale >= 0) {
+            ti.ydtdy = t0   << tscale;
+            ti.dtdx  = dtdx << tscale;
+        } else {
+            ti.ydtdy = t0   >>-tscale;
+            ti.dtdx  = dtdx >>-tscale;
+        }
+        s0 = sq;
+        t0 = tq;
+        gen.dsdx = ti.dsdx;
+        gen.dtdx = ti.dtdx;
+        c->span(c);
+    }
+}
+
+// ----------------------------------------------------------------------------
+
+void scanline_col32cb16blend(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    union {
+        uint16_t* dst;
+        uint32_t* dst32;
+    };
+    dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
+
+#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__arm__))
+#if defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
+    scanline_col32cb16blend_neon(dst, &(c->packed8888), ct);
+#else  // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
+    scanline_col32cb16blend_arm(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
+#endif // defined(__ARM_HAVE_NEON) && BYTE_ORDER == LITTLE_ENDIAN
+#elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && defined(__aarch64__))
+    scanline_col32cb16blend_arm64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
+#elif ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__mips__) && defined(__LP64__)))
+    scanline_col32cb16blend_mips64(dst, GGL_RGBA_TO_HOST(c->packed8888), ct);
+#else
+    uint32_t s = GGL_RGBA_TO_HOST(c->packed8888);
+    int sA = (s>>24);
+    int f = 0x100 - (sA + (sA>>7));
+    while (ct--) {
+        uint16_t d = *dst;
+        int dR = (d>>11)&0x1f;
+        int dG = (d>>5)&0x3f;
+        int dB = (d)&0x1f;
+        int sR = (s >> (   3))&0x1F;
+        int sG = (s >> ( 8+2))&0x3F;
+        int sB = (s >> (16+3))&0x1F;
+        sR += (f*dR)>>8;
+        sG += (f*dG)>>8;
+        sB += (f*dB)>>8;
+        *dst++ = uint16_t((sR<<11)|(sG<<5)|sB);
+    }
+#endif
+
+}
+
+void scanline_t32cb16(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;    
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    union {
+        uint16_t* dst;
+        uint32_t* dst32;
+    };
+    dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
+
+    surface_t* tex = &(c->state.texture[0].surface);
+    const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
+    const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
+    uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
+    uint32_t s, d;
+
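+    // if dst is not 32-bit aligned (or only one pixel remains), convert one
+    // pixel first; the main loop then packs two RGB565 pixels per 32-bit store.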
+    if (ct==1 || uintptr_t(dst)&2) {
+last_one:
+        s = GGL_RGBA_TO_HOST( *src++ );
+        *dst++ = convertAbgr8888ToRgb565(s);
+        ct--;
+    }
+
+    while (ct >= 2) {
+#if BYTE_ORDER == BIG_ENDIAN
+        s = GGL_RGBA_TO_HOST( *src++ );
+        d = convertAbgr8888ToRgb565_hi16(s);
+
+        s = GGL_RGBA_TO_HOST( *src++ );
+        d |= convertAbgr8888ToRgb565(s);
+#else
+        s = GGL_RGBA_TO_HOST( *src++ );
+        d = convertAbgr8888ToRgb565(s);
+
+        s = GGL_RGBA_TO_HOST( *src++ );
+        d |= convertAbgr8888ToRgb565(s) << 16;
+#endif
+        *dst32++ = d;
+        ct -= 2;
+    }
+    
+    if (ct > 0) {
+        goto last_one;
+    }
+}
+
+void scanline_t32cb16blend(context_t* c)
+{
+#if ((ANDROID_CODEGEN >= ANDROID_CODEGEN_ASM) && (defined(__arm__) || defined(__aarch64__) || \
+    (defined(__mips__) && ((!defined(__LP64__) && __mips_isa_rev < 6) || defined(__LP64__)))))
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
+
+    surface_t* tex = &(c->state.texture[0].surface);
+    const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
+    const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
+    uint32_t *src = reinterpret_cast<uint32_t*>(tex->data)+(u+(tex->stride*v));
+
+#ifdef __arm__
+    scanline_t32cb16blend_arm(dst, src, ct);
+#elif defined(__aarch64__)
+    scanline_t32cb16blend_arm64(dst, src, ct);
+#elif defined(__mips__) && !defined(__LP64__) && __mips_isa_rev < 6
+    scanline_t32cb16blend_mips(dst, src, ct);
+#elif defined(__mips__) && defined(__LP64__)
+    scanline_t32cb16blend_mips64(dst, src, ct);
+#endif
+#else
+    dst_iterator16  di(c);
+    horz_iterator32  hi(c);
+    blender_32to16  bl(c);
+    while (di.count--) {
+        uint32_t s = hi.get_pixel32();
+        bl.write(s, di.dst);
+        di.dst++;
+    }
+#endif
+}
+
+void scanline_t32cb16blend_srca(context_t* c)
+{
+    dst_iterator16  di(c);
+    horz_iterator32  hi(c);
+    blender_32to16_srcA  blender(c);
+
+    while (di.count--) {
+        uint32_t s = hi.get_pixel32();
+        blender.write(s,di.dst);
+        di.dst++;
+    }
+}
+
+void scanline_t16cb16blend_clamp_mod(context_t* c)
+{
+    const int a = c->iterators.ydady >> (GGL_COLOR_BITS-8);
+    if (a == 0) {
+        return;
+    }
+
+    if (a == 255) {
+        scanline_t16cb16_clamp(c);
+        return;
+    }
+
+    dst_iterator16  di(c);
+    blender_16to16_modulate  blender(c);
+    clamp_iterator  ci(c);
+
+    while (di.count--) {
+        uint16_t s = ci.get_pixel16();
+        blender.write(s, di.dst);
+        di.dst++;
+    }
+}
+
+void scanline_memcpy(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+
+    surface_t* tex = &(c->state.texture[0].surface);
+    const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
+    const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
+    uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
+                            (u + (tex->stride * v)) * fp->size;
+
+    const size_t size = ct * fp->size;
+    memcpy(dst, src, size);
+}
+
+void scanline_memset8(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) + (x+(cb->stride*y));
+    uint32_t packed = c->packed;
+    memset(dst, packed, ct);
+}
+
+void scanline_memset16(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    uint16_t* dst = reinterpret_cast<uint16_t*>(cb->data) + (x+(cb->stride*y));
+    uint32_t packed = c->packed;
+    android_memset16(dst, packed, ct*2);
+}
+
+void scanline_memset32(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    uint32_t* dst = reinterpret_cast<uint32_t*>(cb->data) + (x+(cb->stride*y));
+    uint32_t packed = GGL_HOST_TO_RGBA(c->packed);
+    android_memset32(dst, packed, ct*4);
+}
+
+void scanline_clear(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+    const size_t size = ct * fp->size;
+    memset(dst, 0, size);
+}
+
+void scanline_set(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+    const size_t size = ct * fp->size;
+    memset(dst, 0xFF, size);
+}
+
+void scanline_noop(context_t* /*c*/)
+{
+}
+
+void rect_generic(context_t* c, size_t yc)
+{
+    do {
+        c->scanline(c);
+        c->step_y(c);
+    } while (--yc);
+}
+
+void rect_memcpy(context_t* c, size_t yc)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+
+    surface_t* tex = &(c->state.texture[0].surface);
+    const int32_t u = (c->state.texture[0].shade.is0>>16) + x;
+    const int32_t v = (c->state.texture[0].shade.it0>>16) + y;
+    uint8_t *src = reinterpret_cast<uint8_t*>(tex->data) +
+                            (u + (tex->stride * v)) * fp->size;
+
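+    // fast path: when the span covers a whole row and both surfaces share
+    // the same stride, the rect is one contiguous block of memory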
+    if (cb->stride == tex->stride && ct == size_t(cb->stride)) {
+        memcpy(dst, src, ct * fp->size * yc);
+    } else {
+        const size_t size = ct * fp->size;
+        const size_t dbpr = cb->stride  * fp->size;
+        const size_t sbpr = tex->stride * fp->size;
+        do {
+            memcpy(dst, src, size);
+            dst += dbpr;
+            src += sbpr;        
+        } while (--yc);
+    }
+}
+// ----------------------------------------------------------------------------
+}; // namespace android
+
diff --git a/libpixelflinger/scanline.h b/libpixelflinger/scanline.h
new file mode 100644
index 0000000..b6f4d37
--- /dev/null
+++ b/libpixelflinger/scanline.h
@@ -0,0 +1,32 @@
+/* libs/pixelflinger/scanline.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_SCANLINE_H
+#define ANDROID_SCANLINE_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_scanline(context_t* c);
+void ggl_uninit_scanline(context_t* c);
+void ggl_pick_scanline(context_t* c);
+
+}; // namespace android
+
+#endif
diff --git a/libpixelflinger/t32cb16blend.S b/libpixelflinger/t32cb16blend.S
new file mode 100644
index 0000000..5e4995a
--- /dev/null
+++ b/libpixelflinger/t32cb16blend.S
@@ -0,0 +1,203 @@
+/* libs/pixelflinger/t32cb16blend.S
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+	.text
+	.syntax unified
+	.balign 4
+	
+	.global scanline_t32cb16blend_arm
+
+
+/*
+ * .macro pixel
+ *
+ * \DREG is a 32-bit register containing *two* original destination RGB565 
+ *       pixels, with the even one in the low-16 bits, and the odd one in the
+ *       high 16 bits.
+ *
+ * \SRC is a 32-bit 0xAABBGGRR pixel value, with pre-multiplied colors.
+ *
+ * \FB is a target register that will contain the blended pixel values.
+ *
+ * \ODD is either 0 or 1 and indicates if we're blending the lower or 
+ *      upper 16-bit pixels in DREG into FB
+ *
+ *
+ * clobbered: r6, r7, lr
+ *
+ */
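+/*
+ * Rough C equivalent for one channel (red, even half), as a reading aid
+ * only -- the variable names below are illustrative, not from this file:
+ *
+ *     uint32_t sA = src >> 24;                 // source alpha
+ *     uint32_t f  = 0x100 - (sA + (sA >> 7));  // ~(1 - alpha), scaled to 0x100
+ *     uint32_t sr = (src >> 3)  & 0x1F;        // premultiplied 5-bit src red
+ *     uint32_t dr = (dst >> 11) & 0x1F;        // 5-bit dst red
+ *     uint32_t r  = sr + ((f * dr) >> 8);      // src + dst*(1 - alpha)
+ *     if (r > 0x1F) r = 0x1F;                  // saturate
+ */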
+
+.macro pixel,   DREG, SRC, FB, ODD
+
+    // SRC = 0xAABBGGRR
+    mov     r7, \SRC, lsr #24           // sA
+    add     r7, r7, r7, lsr #7          // sA + (sA >> 7)
+    rsb     r7, r7, #0x100              // sA = 0x100 - (sA+(sA>>7))
+
+1:
+
+.if \ODD
+
+    // red
+    mov     lr, \DREG, lsr #(16 + 11)
+    smulbb  lr, r7, lr
+    mov     r6, \SRC, lsr #3
+    and     r6, r6, #0x1F
+    add     lr, r6, lr, lsr #8
+    cmp     lr, #0x1F
+    orrhs   \FB, \FB, #(0x1F<<(16 + 11))
+    orrlo   \FB, \FB, lr, lsl #(16 + 11)
+
+        // green
+        and     r6, \DREG, #(0x3F<<(16 + 5))
+        smulbt  r6, r7, r6
+        mov     lr, \SRC, lsr #(8+2)
+        and     lr, lr, #0x3F
+        add     r6, lr, r6, lsr #(5+8)
+        cmp     r6, #0x3F
+        orrhs   \FB, \FB, #(0x3F<<(16 + 5))
+        orrlo   \FB, \FB, r6, lsl #(16 + 5)
+
+            // blue
+            and     lr, \DREG, #(0x1F << 16)
+            smulbt  lr, r7, lr
+            mov     r6, \SRC, lsr #(8+8+3)
+            and     r6, r6, #0x1F
+            add     lr, r6, lr, lsr #8
+            cmp     lr, #0x1F
+            orrhs   \FB, \FB, #(0x1F << 16)
+            orrlo   \FB, \FB, lr, lsl #16
+
+.else
+
+    // red
+    mov     lr, \DREG, lsr #11
+    and     lr, lr, #0x1F
+    smulbb  lr, r7, lr
+    mov     r6, \SRC, lsr #3
+    and     r6, r6, #0x1F
+    add     lr, r6, lr, lsr #8
+    cmp     lr, #0x1F
+    movhs   \FB, #(0x1F<<11)
+    movlo   \FB, lr, lsl #11
+
+
+        // green
+        and     r6, \DREG, #(0x3F<<5)
+        smulbb  r6, r7, r6
+        mov     lr, \SRC, lsr #(8+2)
+        and     lr, lr, #0x3F
+        add     r6, lr, r6, lsr #(5+8)
+        cmp     r6, #0x3F
+        orrhs   \FB, \FB, #(0x3F<<5)
+        orrlo   \FB, \FB, r6, lsl #5
+
+            // blue
+            and     lr, \DREG, #0x1F
+            smulbb  lr, r7, lr
+            mov     r6, \SRC, lsr #(8+8+3)
+            and     r6, r6, #0x1F
+            add     lr, r6, lr, lsr #8
+            cmp     lr, #0x1F
+            orrhs   \FB, \FB, #0x1F
+            orrlo   \FB, \FB, lr
+
+.endif
+
+    .endm
+    
+
+// r0:  dst ptr
+// r1:  src ptr
+// r2:  count
+// r3:  d
+// r4:  s0
+// r5:  s1
+// r6:  pixel
+// r7:  pixel
+// r8:  free
+// r9:  free
+// r10: free
+// r11: free
+// r12: scratch
+// r14: pixel
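+//
+// C-level signature, as suggested by the call site in scanline.cpp above:
+//   void scanline_t32cb16blend_arm(uint16_t* dst, uint32_t* src, size_t count);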
+
+scanline_t32cb16blend_arm:
+    stmfd	sp!, {r4-r7, lr}
+
+    pld     [r0]
+    pld     [r1]
+
+    // align DST to 32 bits
+    tst     r0, #0x3
+    beq     aligned
+    subs    r2, r2, #1
+    ldmfdlo sp!, {r4-r7, lr}        // return
+    bxlo    lr
+
+last:
+    ldr     r4, [r1], #4
+    ldrh    r3, [r0]
+    pixel   r3, r4, r12, 0
+    strh    r12, [r0], #2
+
+aligned:
+    subs    r2, r2, #2
+    blo     9f
+
+    // The main loop is unrolled twice and processes 4 pixels
+8:  ldmia   r1!, {r4, r5}
+    // stream the source
+    pld     [r1, #32]
+    add     r0, r0, #4
+    // both source pixels are zero, skip them
+    orrs    r3, r4, r5
+    beq     7f
+    
+    // load the destination
+    ldr     r3, [r0, #-4]
+    // stream the destination
+    pld     [r0, #32]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 1
+    // effectively, we're getting write-combining by virtue of the
+    // cpu's write-back cache.
+    str     r12, [r0, #-4]
+
+    // 2nd iteration of the loop, don't stream anything
+    subs    r2, r2, #2
+    movlt   r4, r5
+    blt     9f
+    ldmia   r1!, {r4, r5}
+    add     r0, r0, #4
+    orrs    r3, r4, r5
+    beq     7f
+    ldr     r3, [r0, #-4]
+    pixel   r3, r4, r12, 0
+    pixel   r3, r5, r12, 1
+    str     r12, [r0, #-4]
+
+    
+7:  subs    r2, r2, #2
+    bhs     8b
+    mov     r4, r5
+
+9:  adds    r2, r2, #1
+    ldmfdlo sp!, {r4-r7, lr}        // return
+    bxlo    lr
+    b       last
diff --git a/libpixelflinger/trap.cpp b/libpixelflinger/trap.cpp
new file mode 100644
index 0000000..06ad237
--- /dev/null
+++ b/libpixelflinger/trap.cpp
@@ -0,0 +1,1173 @@
+/* libs/pixelflinger/trap.cpp
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License");
+** you may not use this file except in compliance with the License.
+** You may obtain a copy of the License at
+**
+**     http://www.apache.org/licenses/LICENSE-2.0
+**
+** Unless required by applicable law or agreed to in writing, software
+** distributed under the License is distributed on an "AS IS" BASIS,
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+** See the License for the specific language governing permissions and
+** limitations under the License.
+*/
+
+#define LOG_TAG "pixelflinger-trap"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <cutils/memory.h>
+#include <log/log.h>
+
+#include "trap.h"
+#include "picker.h"
+
+namespace android {
+
+// ----------------------------------------------------------------------------
+
+// enable to see triangle edges
+#define DEBUG_TRIANGLES  0
+
+// ----------------------------------------------------------------------------
+
+static void pointx_validate(void *con, const GGLcoord* c, GGLcoord r);
+static void pointx(void *con, const GGLcoord* c, GGLcoord r);
+static void aa_pointx(void *con, const GGLcoord* c, GGLcoord r);
+static void aa_nice_pointx(void *con, const GGLcoord* c, GGLcoord r);
+
+static void linex_validate(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord w);
+static void linex(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord w);
+static void aa_linex(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord w);
+
+static void recti_validate(void* c, GGLint l, GGLint t, GGLint r, GGLint b); 
+static void recti(void* c, GGLint l, GGLint t, GGLint r, GGLint b); 
+
+static void trianglex_validate(void*,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+static void trianglex_small(void*,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+static void trianglex_big(void*,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+static void aa_trianglex(void*,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+static void trianglex_debug(void* con,
+        const GGLcoord*, const GGLcoord*, const GGLcoord*);
+
+static void aapolyx(void* con,
+        const GGLcoord* pts, int count);
+
+static inline int min(int a, int b) CONST;
+static inline int max(int a, int b) CONST;
+static inline int min(int a, int b, int c) CONST;
+static inline int max(int a, int b, int c) CONST;
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Tools
+#endif
+
+inline int min(int a, int b) {
+    return a<b ? a : b;
+}
+inline int max(int a, int b) {
+    return a<b ? b : a;
+}
+inline int min(int a, int b, int c) {
+    return min(a,min(b,c));
+}
+inline int max(int a, int b, int c) {
+    return max(a,max(b,c));
+}
+
+template <typename T>
+static inline void swap(T& a, T& b) {
+    T t(a);
+    a = b;
+    b = t;
+}
+
+static void
+triangle_dump_points( const GGLcoord*  v0,
+                      const GGLcoord*  v1,
+                      const GGLcoord*  v2 )
+{
+    float tri = 1.0f / TRI_ONE;
+    ALOGD("  P0=(%.3f, %.3f)  [%08x, %08x]\n"
+          "  P1=(%.3f, %.3f)  [%08x, %08x]\n"
+          "  P2=(%.3f, %.3f)  [%08x, %08x]\n",
+          v0[0]*tri, v0[1]*tri, v0[0], v0[1],
+          v1[0]*tri, v1[1]*tri, v1[0], v1[1],
+          v2[0]*tri, v2[1]*tri, v2[0], v2[1] );
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Misc
+#endif
+
+void ggl_init_trap(context_t* c)
+{
+    ggl_state_changed(c, GGL_PIXEL_PIPELINE_STATE|GGL_TMU_STATE|GGL_CB_STATE);
+}
+
+void ggl_state_changed(context_t* c, int flags)
+{
+    if (ggl_likely(!c->dirty)) {
+        c->procs.pointx     = pointx_validate;
+        c->procs.linex      = linex_validate;
+        c->procs.recti      = recti_validate;
+        c->procs.trianglex  = trianglex_validate;
+    }
+    c->dirty |= uint32_t(flags);
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Point
+#endif
+
+void pointx_validate(void *con, const GGLcoord* v, GGLcoord rad)
+{
+    GGL_CONTEXT(c, con);
+    ggl_pick(c);
+    if (c->state.needs.p & GGL_NEED_MASK(P_AA)) {
+        if (c->state.enables & GGL_ENABLE_POINT_AA_NICE) {
+            c->procs.pointx = aa_nice_pointx;
+        } else {
+            c->procs.pointx = aa_pointx;
+        }
+    } else {
+        c->procs.pointx = pointx;
+    }
+    c->procs.pointx(con, v, rad);
+}
+
+void pointx(void *con, const GGLcoord* v, GGLcoord rad)
+{
+    GGL_CONTEXT(c, con);
+    GGLcoord halfSize = TRI_ROUND(rad) >> 1;
+    if (halfSize == 0)
+        halfSize = TRI_HALF;
+    GGLcoord xc = v[0]; 
+    GGLcoord yc = v[1];
+    if (halfSize & TRI_HALF) { // size odd
+        xc = TRI_FLOOR(xc) + TRI_HALF;
+        yc = TRI_FLOOR(yc) + TRI_HALF;
+    } else { // size even
+        xc = TRI_ROUND(xc);
+        yc = TRI_ROUND(yc);
+    }
+    GGLint l = (xc - halfSize) >> TRI_FRACTION_BITS;
+    GGLint t = (yc - halfSize) >> TRI_FRACTION_BITS;
+    GGLint r = (xc + halfSize) >> TRI_FRACTION_BITS;
+    GGLint b = (yc + halfSize) >> TRI_FRACTION_BITS;
+    recti(c, l, t, r, b);
+}
+
+// This way of computing the coverage factor is more accurate and gives
+// better results for small circles, but it is also a lot slower.
+// Here we use super-sampling.
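+// With 28.4 coordinates (TRI_ONE == 16), the constants below place a 4x4
+// grid of samples on the pixel, at offsets -0.375, -0.125, +0.125, +0.375
+// from its center; a fully covered pixel scores 16 hits, i.e.
+// 16 << (15-4) == 0x8000, which the final min() clamps to 0x7FFF.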
+static int32_t coverageNice(GGLcoord x, GGLcoord y, 
+        GGLcoord rmin, GGLcoord rmax, GGLcoord rr)
+{
+    const GGLcoord d2 = x*x + y*y;
+    if (d2 >= rmax) return 0;
+    if (d2 < rmin)  return 0x7FFF;
+
+    const int kSamples              =  4;
+    const int kInc                  =  4;    // 1/4 = 0.25
+    const int kCoverageUnit         =  1;    // 1/(4^2) = 0.0625
+    const GGLcoord kCoordOffset     = -6;    // -0.375
+
+    int hits = 0;
+    int x_sample = x + kCoordOffset;
+    for (int i=0 ; i<kSamples ; i++, x_sample += kInc) {
+        const int xval = rr - (x_sample * x_sample);
+        int y_sample = y + kCoordOffset;
+        for (int j=0 ; j<kSamples ; j++, y_sample += kInc) {
+            if (xval - (y_sample * y_sample) > 0)
+                hits += kCoverageUnit;
+        }
+    }
+    return min(0x7FFF, hits << (15 - kSamples));
+}
+
+
+void aa_nice_pointx(void *con, const GGLcoord* v, GGLcoord size)
+{
+    GGL_CONTEXT(c, con);
+
+    GGLcoord rad = ((size + 1)>>1);
+    GGLint l = (v[0] - rad) >> TRI_FRACTION_BITS;
+    GGLint t = (v[1] - rad) >> TRI_FRACTION_BITS;
+    GGLint r = (v[0] + rad + (TRI_ONE-1)) >> TRI_FRACTION_BITS;
+    GGLint b = (v[1] + rad + (TRI_ONE-1)) >> TRI_FRACTION_BITS;
+    GGLcoord xstart = TRI_FROM_INT(l) - v[0] + TRI_HALF; 
+    GGLcoord ystart = TRI_FROM_INT(t) - v[1] + TRI_HALF; 
+
+    // scissor...
+    if (l < GGLint(c->state.scissor.left)) {
+        xstart += TRI_FROM_INT(c->state.scissor.left-l);
+        l = GGLint(c->state.scissor.left);
+    }
+    if (t < GGLint(c->state.scissor.top)) {
+        ystart += TRI_FROM_INT(c->state.scissor.top-t);
+        t = GGLint(c->state.scissor.top);
+    }
+    if (r > GGLint(c->state.scissor.right)) {
+        r = GGLint(c->state.scissor.right);
+    }
+    if (b > GGLint(c->state.scissor.bottom)) {
+        b = GGLint(c->state.scissor.bottom);
+    }
+
+    int xc = r - l;
+    int yc = b - t;
+    if (xc>0 && yc>0) {
+        int16_t* covPtr = c->state.buffers.coverage;
+        const int32_t sqr2Over2 = 0xC; // rounded up
+        GGLcoord rr = rad*rad;
+        GGLcoord rmin = (rad - sqr2Over2)*(rad - sqr2Over2);
+        GGLcoord rmax = (rad + sqr2Over2)*(rad + sqr2Over2);
+        GGLcoord y = ystart;
+        c->iterators.xl = l;
+        c->iterators.xr = r;
+        c->init_y(c, t);
+        do {
+            // compute coverage factors for each pixel
+            GGLcoord x = xstart;
+            for (int i=l ; i<r ; i++) {
+                covPtr[i] = coverageNice(x, y, rmin, rmax, rr);
+                x += TRI_ONE;
+            }
+            y += TRI_ONE;
+            c->scanline(c);
+            c->step_y(c);
+        } while (--yc);
+    }
+}
+
+// This is a cheap way of computing the coverage factor for a circle.
+// We just lerp between the circles of radii r-sqrt(2)/2 and r+sqrt(2)/2
+static inline int32_t coverageFast(GGLcoord x, GGLcoord y,
+        GGLcoord rmin, GGLcoord rmax, GGLcoord scale)
+{
+    const GGLcoord d2 = x*x + y*y;
+    if (d2 >= rmax) return 0;
+    if (d2 < rmin)  return 0x7FFF;
+    return 0x7FFF - (d2-rmin)*scale;
+}
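+
+// (aa_pointx below picks 'scale' so that (d2-rmin)*scale sweeps 0..0x8000
+// between the inner and outer radii, i.e. coverage fades linearly from
+// full to zero across the annulus)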
+
+void aa_pointx(void *con, const GGLcoord* v, GGLcoord size)
+{
+    GGL_CONTEXT(c, con);
+
+    GGLcoord rad = ((size + 1)>>1);
+    GGLint l = (v[0] - rad) >> TRI_FRACTION_BITS;
+    GGLint t = (v[1] - rad) >> TRI_FRACTION_BITS;
+    GGLint r = (v[0] + rad + (TRI_ONE-1)) >> TRI_FRACTION_BITS;
+    GGLint b = (v[1] + rad + (TRI_ONE-1)) >> TRI_FRACTION_BITS;
+    GGLcoord xstart = TRI_FROM_INT(l) - v[0] + TRI_HALF; 
+    GGLcoord ystart = TRI_FROM_INT(t) - v[1] + TRI_HALF; 
+
+    // scissor...
+    if (l < GGLint(c->state.scissor.left)) {
+        xstart += TRI_FROM_INT(c->state.scissor.left-l);
+        l = GGLint(c->state.scissor.left);
+    }
+    if (t < GGLint(c->state.scissor.top)) {
+        ystart += TRI_FROM_INT(c->state.scissor.top-t);
+        t = GGLint(c->state.scissor.top);
+    }
+    if (r > GGLint(c->state.scissor.right)) {
+        r = GGLint(c->state.scissor.right);
+    }
+    if (b > GGLint(c->state.scissor.bottom)) {
+        b = GGLint(c->state.scissor.bottom);
+    }
+
+    int xc = r - l;
+    int yc = b - t;
+    if (xc>0 && yc>0) {
+        int16_t* covPtr = c->state.buffers.coverage;
+        rad <<= 4;
+        const int32_t sqr2Over2 = 0xB5;    // fixed-point 24.8
+        GGLcoord rmin = rad - sqr2Over2;
+        GGLcoord rmax = rad + sqr2Over2;
+        GGLcoord scale;
+        rmin *= rmin;
+        rmax *= rmax;
+        scale = 0x800000 / (rmax - rmin);
+        rmin >>= 8;
+        rmax >>= 8;
+
+        GGLcoord y = ystart;
+        c->iterators.xl = l;
+        c->iterators.xr = r;
+        c->init_y(c, t);
+
+        do {
+            // compute coverage factors for each pixel
+            GGLcoord x = xstart;
+            for (int i=l ; i<r ; i++) {
+                covPtr[i] = coverageFast(x, y, rmin, rmax, scale);
+                x += TRI_ONE;
+            }
+            y += TRI_ONE;
+            c->scanline(c);
+            c->step_y(c);
+        } while (--yc);
+    }
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Line
+#endif
+
+void linex_validate(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord w)
+{
+    GGL_CONTEXT(c, con);
+    ggl_pick(c);
+    if (c->state.needs.p & GGL_NEED_MASK(P_AA)) {
+        c->procs.linex = aa_linex;
+    } else {
+        c->procs.linex = linex;
+    }
+    c->procs.linex(con, v0, v1, w);
+}
+
+static void linex(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord width)
+{
+    GGLcoord v[4][2];
+    v[0][0] = v0[0];    v[0][1] = v0[1];
+    v[1][0] = v1[0];    v[1][1] = v1[1];
+    v0 = v[0];
+    v1 = v[1];
+    const GGLcoord dx = abs(v0[0] - v1[0]);
+    const GGLcoord dy = abs(v0[1] - v1[1]);
+    GGLcoord nx, ny;
+    nx = ny = 0;
+
+    GGLcoord halfWidth = TRI_ROUND(width) >> 1;
+    if (halfWidth == 0)
+        halfWidth = TRI_HALF;
+
+    ((dx > dy) ? ny : nx) = halfWidth;
+    v[2][0] = v1[0];    v[2][1] = v1[1];
+    v[3][0] = v0[0];    v[3][1] = v0[1];
+    v[0][0] += nx;      v[0][1] += ny;
+    v[1][0] += nx;      v[1][1] += ny;
+    v[2][0] -= nx;      v[2][1] -= ny;
+    v[3][0] -= nx;      v[3][1] -= ny;
+    trianglex_big(con, v[0], v[1], v[2]);
+    trianglex_big(con, v[0], v[2], v[3]);
+}
+
+static void aa_linex(void *con, const GGLcoord* v0, const GGLcoord* v1, GGLcoord width)
+{
+    GGLcoord v[4][2];
+    v[0][0] = v0[0];    v[0][1] = v0[1];
+    v[1][0] = v1[0];    v[1][1] = v1[1];
+    v0 = v[0];
+    v1 = v[1];
+    
+    const GGLcoord dx = v0[0] - v1[0];
+    const GGLcoord dy = v0[1] - v1[1];
+    GGLcoord nx = -dy;
+    GGLcoord ny =  dx;
+
+    // generally, this will be well below 1.0
+    const GGLfixed norm = gglMulx(width, gglSqrtRecipx(nx*nx+ny*ny), 4);
+    nx = gglMulx(nx, norm, 21);
+    ny = gglMulx(ny, norm, 21);
+    
+    v[2][0] = v1[0];    v[2][1] = v1[1];
+    v[3][0] = v0[0];    v[3][1] = v0[1];
+    v[0][0] += nx;      v[0][1] += ny;
+    v[1][0] += nx;      v[1][1] += ny;
+    v[2][0] -= nx;      v[2][1] -= ny;
+    v[3][0] -= nx;      v[3][1] -= ny;
+    aapolyx(con, v[0], 4);        
+}
+
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Rect
+#endif
+
+void recti_validate(void *con, GGLint l, GGLint t, GGLint r, GGLint b)
+{
+    GGL_CONTEXT(c, con);
+    ggl_pick(c);
+    c->procs.recti = recti;
+    c->procs.recti(con, l, t, r, b);
+}
+
+void recti(void* con, GGLint l, GGLint t, GGLint r, GGLint b)
+{
+    GGL_CONTEXT(c, con);
+
+    // scissor...
+    if (l < GGLint(c->state.scissor.left))
+        l = GGLint(c->state.scissor.left);
+    if (t < GGLint(c->state.scissor.top))
+        t = GGLint(c->state.scissor.top);
+    if (r > GGLint(c->state.scissor.right))
+        r = GGLint(c->state.scissor.right);
+    if (b > GGLint(c->state.scissor.bottom))
+        b = GGLint(c->state.scissor.bottom);
+
+    int xc = r - l;
+    int yc = b - t;
+    if (xc>0 && yc>0) {
+        c->iterators.xl = l;
+        c->iterators.xr = r;
+        c->init_y(c, t);
+        c->rect(c, yc);
+    }
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Triangle / Debugging
+#endif
+
+static void scanline_set(context_t* c)
+{
+    int32_t x = c->iterators.xl;
+    size_t ct = c->iterators.xr - x;
+    int32_t y = c->iterators.y;
+    surface_t* cb = &(c->state.buffers.color);
+    const GGLFormat* fp = &(c->formats[cb->format]);
+    uint8_t* dst = reinterpret_cast<uint8_t*>(cb->data) +
+                            (x + (cb->stride * y)) * fp->size;
+    const size_t size = ct * fp->size;
+    memset(dst, 0xFF, size);
+}
+
+static void trianglex_debug(void* con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    GGL_CONTEXT(c, con);
+    if (c->state.needs.p & GGL_NEED_MASK(P_AA)) {
+        aa_trianglex(con,v0,v1,v2);
+    } else {
+        trianglex_big(con,v0,v1,v2);
+    }
+    void (*save_scanline)(context_t*) = c->scanline;
+    c->scanline = scanline_set;
+    linex(con, v0, v1, TRI_ONE);
+    linex(con, v1, v2, TRI_ONE);
+    linex(con, v2, v0, TRI_ONE);
+    c->scanline = save_scanline;
+}
+
+static void trianglex_xor(void* con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    trianglex_big(con,v0,v1,v2);
+    trianglex_small(con,v0,v1,v2);
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#pragma mark Triangle
+#endif
+
+void trianglex_validate(void *con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    GGL_CONTEXT(c, con);
+    ggl_pick(c);
+    if (c->state.needs.p & GGL_NEED_MASK(P_AA)) {
+        c->procs.trianglex = DEBUG_TRIANGLES ? trianglex_debug : aa_trianglex;
+    } else {
+        c->procs.trianglex = DEBUG_TRIANGLES ? trianglex_debug : trianglex_big;
+    }
+    c->procs.trianglex(con, v0, v1, v2);
+}
+
+// ----------------------------------------------------------------------------
+
+void trianglex_small(void* con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    GGL_CONTEXT(c, con);
+
+    // vertices are in 28.4 fixed point, which allows
+    // us to use 32-bit multiplies below.
+    int32_t x0 = v0[0];
+    int32_t y0 = v0[1];
+    int32_t x1 = v1[0];
+    int32_t y1 = v1[1];
+    int32_t x2 = v2[0];
+    int32_t y2 = v2[1];
+
+    int32_t dx01 = x0 - x1;
+    int32_t dy20 = y2 - y0;
+    int32_t dy01 = y0 - y1;
+    int32_t dx20 = x2 - x0;
+
+    // The code below works only with CCW triangles
+    // so if we get a CW triangle, we need to swap two of its vertices
+    if (dx01*dy20 < dy01*dx20) {
+        swap(x0, x1);
+        swap(y0, y1);
+        dx01 = x0 - x1;
+        dy01 = y0 - y1;
+        dx20 = x2 - x0;
+        dy20 = y2 - y0;
+    }
+    int32_t dx12 = x1 - x2;
+    int32_t dy12 = y1 - y2;
+
+    // bounding box & scissor
+    const int32_t bminx = TRI_FLOOR(min(x0, x1, x2)) >> TRI_FRACTION_BITS;
+    const int32_t bminy = TRI_FLOOR(min(y0, y1, y2)) >> TRI_FRACTION_BITS;
+    const int32_t bmaxx = TRI_CEIL( max(x0, x1, x2)) >> TRI_FRACTION_BITS;
+    const int32_t bmaxy = TRI_CEIL( max(y0, y1, y2)) >> TRI_FRACTION_BITS;
+    const int32_t minx = max(bminx, c->state.scissor.left);
+    const int32_t miny = max(bminy, c->state.scissor.top);
+    const int32_t maxx = min(bmaxx, c->state.scissor.right);
+    const int32_t maxy = min(bmaxy, c->state.scissor.bottom);
+    if ((minx >= maxx) || (miny >= maxy))
+        return; // too small or clipped out...
+
+    // step equations to the bounding box and snap to pixel center
+    const int32_t my = (miny << TRI_FRACTION_BITS) + TRI_HALF;
+    const int32_t mx = (minx << TRI_FRACTION_BITS) + TRI_HALF;
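+    // ey0/ey1/ey2 are edge functions: each one is positive when the pixel
+    // center lies on the interior side of its (CCW) edge, so a pixel is
+    // inside the triangle when all three are strictly positive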
+    int32_t ey0 = dy01 * (x0 - mx) - dx01 * (y0 - my);
+    int32_t ey1 = dy12 * (x1 - mx) - dx12 * (y1 - my);
+    int32_t ey2 = dy20 * (x2 - mx) - dx20 * (y2 - my);
+
+    // right-exclusive fill rule, to avoid rare cases
+    // of overdrawing
+    if (dy01<0 || (dy01 == 0 && dx01>0)) ey0++;
+    if (dy12<0 || (dy12 == 0 && dx12>0)) ey1++;
+    if (dy20<0 || (dy20 == 0 && dx20>0)) ey2++;
+    
+    c->init_y(c, miny);
+    for (int32_t y = miny; y < maxy; y++) {
+        int32_t ex0 = ey0;
+        int32_t ex1 = ey1;
+        int32_t ex2 = ey2;    
+        int32_t xl, xr;
+        for (xl=minx ; xl<maxx ; xl++) {
+            if (ex0>0 && ex1>0 && ex2>0)
+                break; // all strictly positive
+            ex0 -= dy01 << TRI_FRACTION_BITS;
+            ex1 -= dy12 << TRI_FRACTION_BITS;
+            ex2 -= dy20 << TRI_FRACTION_BITS;
+        }
+        xr = xl;
+        for ( ; xr<maxx ; xr++) {
+            if (!(ex0>0 && ex1>0 && ex2>0))
+                break; // not all strictly positive
+            ex0 -= dy01 << TRI_FRACTION_BITS;
+            ex1 -= dy12 << TRI_FRACTION_BITS;
+            ex2 -= dy20 << TRI_FRACTION_BITS;
+        }
+
+        if (xl < xr) {
+            c->iterators.xl = xl;
+            c->iterators.xr = xr;
+            c->scanline(c);
+        }
+        c->step_y(c);
+
+        ey0 += dx01 << TRI_FRACTION_BITS;
+        ey1 += dx12 << TRI_FRACTION_BITS;
+        ey2 += dx20 << TRI_FRACTION_BITS;
+    }
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+// the following routine fills a triangle via edge stepping, which
+// unfortunately requires divisions in the setup phase to get right;
+// it should probably only be used for relatively large triangles
+
+
+// x = y*DX/DY    (where DX and DY are constants, DY > 0, and y >= 0)
+// 
+// for an equation of the type:
+//      x' = y*K/2^p     (with K and p constants "carefully chosen")
+// 
+// We can now do a DDA without precision loss. We define 'e' by:
+//      x' - x = y*(DX/DY - K/2^p) = y*e
+// 
+// If we choose K = round(DX*2^p/DY) then,
+//      abs(e) <= 1/2^(p+1) by construction
+// 
+// therefore abs(x'-x) = y*abs(e) <= y/2^(p+1) <= DY/2^(p+1) <= DMAX/2^(p+1)
+// 
+// which means that if DMAX <= 2^p, then abs(x-x') <= 1/2, including
+// at the last line. In fact, it's even a strict inequality except in one
+// extreme case (DY == DMAX and e = +/- 1/2)
+// 
+// Applying that to our coordinates, we need 2^p >= 4096*16 = 65536
+// so p = 16 is enough, we're so lucky!
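+//
+// As a concrete instance: for a 45-degree edge (DX == DY) we get
+// K = round(DX*2^16/DY) = 2^16 exactly, so the DDA steps x by exactly
+// one pixel per scanline with e == 0 -- no accumulated error at all.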
+
+const int TRI_ITERATORS_BITS = 16;
+
+struct Edge
+{
+  int32_t  x;      // edge position in 16.16 coordinates
+  int32_t  x_incr; // on each step, increment x by that amount
+  int32_t  y_top;  // starting scanline, 16.4 format
+  int32_t  y_bot;  // ending scanline, 16.4 format
+};
+
+static void
+edge_dump( Edge* edge )
+{
+    ALOGI( "  top=%d (%.3f)  bot=%d (%.3f)  x=%d (%.3f)  ix=%d (%.3f)",
+           edge->y_top, edge->y_top/float(TRI_ONE),
+           edge->y_bot, edge->y_bot/float(TRI_ONE),
+           edge->x, edge->x/float(FIXED_ONE),
+           edge->x_incr, edge->x_incr/float(FIXED_ONE) );
+}
+
+static void
+triangle_dump_edges( Edge* edges, int count )
+{
+    ALOGI( "%d edge%s:\n", count, count == 1 ? "" : "s" );
+    for ( ; count > 0; count--, edges++ )
+        edge_dump( edges );
+}
+
+// the following function sets up an edge; it assumes
+// that ymin and ymax are already in the 'reduced'
+// format
+static __attribute__((noinline))
+void edge_setup(
+        Edge*           edges,
+        int*            pcount,
+        const GGLcoord* p1,
+        const GGLcoord* p2,
+        int32_t         ymin,
+        int32_t         ymax )
+{
+    const GGLfixed* top = p1;
+    const GGLfixed* bot = p2;
+    Edge* edge = edges + *pcount;
+
+    if (top[1] > bot[1]) {
+        swap(top, bot);
+    }
+
+    int y1 = top[1] | 1;
+    int y2 = bot[1] | 1;
+    int dy = y2 - y1;
+
+    if ( dy == 0 || y1 > ymax || y2 < ymin )
+        return;
+
+    if ( y1 > ymin )
+        ymin = TRI_SNAP_NEXT_HALF(y1);
+
+    if ( y2 < ymax )
+        ymax = TRI_SNAP_PREV_HALF(y2);
+
+    if ( ymin > ymax )  // when the edge doesn't cross any scanline
+        return;
+
+    const int x1 = top[0];
+    const int dx = bot[0] - x1;
+    const int shift = TRI_ITERATORS_BITS - TRI_FRACTION_BITS;
+
+    // setup edge fields
+    // We add 0.5 to edge->x here because it simplifies the rounding
+    // in triangle_sweep_edges() -- this doesn't change the ordering of 'x'
+    edge->x      = (x1 << shift) + (1LU << (TRI_ITERATORS_BITS-1));
+    edge->x_incr = 0;
+    edge->y_top  = ymin;
+    edge->y_bot  = ymax;
+
+    if (ggl_likely(ymin <= ymax && dx)) {
+        edge->x_incr = gglDivQ16(dx, dy);
+    }
+    if (ggl_likely(y1 < ymin)) {
+        int32_t xadjust = (edge->x_incr * (ymin-y1)) >> TRI_FRACTION_BITS;
+        edge->x += xadjust;
+    }
+
+    ++*pcount;
+}
+
+
+static void
+triangle_sweep_edges( Edge*       left,
+                      Edge*       right,
+                      int         ytop,
+                      int         ybot,
+                      context_t*  c )
+{
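+    // ytop and ybot are scanline centers (snapped to .5 by the edge setup);
+    // both ends are swept inclusively, hence the +1 below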
+    int count = ((ybot - ytop)>>TRI_FRACTION_BITS) + 1;
+    if (count<=0) return;
+
+    // sort the edges horizontally
+    if ((left->x > right->x) || 
+        ((left->x == right->x) && (left->x_incr > right->x_incr))) {
+        swap(left, right);
+    }
+
+    int left_x = left->x;
+    int right_x = right->x;
+    const int left_xi = left->x_incr;
+    const int right_xi  = right->x_incr;
+    left->x  += left_xi * count;
+    right->x += right_xi * count;
+
+    const int xmin = c->state.scissor.left;
+    const int xmax = c->state.scissor.right;
+    do {
+        // horizontal scissoring
+        const int32_t xl = max(left_x  >> TRI_ITERATORS_BITS, xmin);
+        const int32_t xr = min(right_x >> TRI_ITERATORS_BITS, xmax);
+        left_x  += left_xi;
+        right_x += right_xi;
+        // invoke the scanline rasterizer
+        if (ggl_likely(xl < xr)) {
+            c->iterators.xl = xl;
+            c->iterators.xr = xr;
+            c->scanline(c);
+        }
+        c->step_y(c);
+    } while (--count);
+}
+
+
+void trianglex_big(void* con,
+        const GGLcoord* v0, const GGLcoord* v1, const GGLcoord* v2)
+{
+    GGL_CONTEXT(c, con);
+
+    Edge edges[3];
+    int num_edges = 0;
+    int32_t ymin = TRI_FROM_INT(c->state.scissor.top)    + TRI_HALF;
+    int32_t ymax = TRI_FROM_INT(c->state.scissor.bottom) - TRI_HALF;
+
+    edge_setup( edges, &num_edges, v0, v1, ymin, ymax );
+    edge_setup( edges, &num_edges, v0, v2, ymin, ymax );
+    edge_setup( edges, &num_edges, v1, v2, ymin, ymax );
+
+    if (ggl_unlikely(num_edges<2))  // for really tiny triangles that don't
+        return;                     // cross any scanline centers
+
+    Edge* left  = &edges[0];
+    Edge* right = &edges[1];
+    Edge* other = &edges[2];
+    int32_t y_top = min(left->y_top, right->y_top);
+    int32_t y_bot = max(left->y_bot, right->y_bot);
+
+    if (ggl_likely(num_edges==3)) {
+        y_top = min(y_top, edges[2].y_top);
+        y_bot = max(y_bot, edges[2].y_bot);
+        if (edges[0].y_top > y_top) {
+            other = &edges[0];
+            left  = &edges[2];
+        } else if (edges[1].y_top > y_top) {
+            other = &edges[1];
+            right = &edges[2];
+        }
+    }
+
+    c->init_y(c, y_top >> TRI_FRACTION_BITS);
+
+    int32_t y_mid = min(left->y_bot, right->y_bot);
+    triangle_sweep_edges( left, right, y_top, y_mid, c );
+
+    // second scanline sweep loop, if necessary
+    y_mid += TRI_ONE;
+    if (y_mid <= y_bot) {
+        ((left->y_bot == y_bot) ? right : left) = other;
+        if (other->y_top < y_mid) {
+            other->x += other->x_incr;
+        }
+        triangle_sweep_edges( left, right, y_mid, y_bot, c );
+    }
+}
+
+void aa_trianglex(void* con,
+        const GGLcoord* a, const GGLcoord* b, const GGLcoord* c)
+{
+    GGLcoord pts[6] = { a[0], a[1], b[0], b[1], c[0], c[1] };
+    aapolyx(con, pts, 3);
+}
+
+// ----------------------------------------------------------------------------
+#if 0
+#pragma mark -
+#endif
+
+struct AAEdge
+{
+    GGLfixed x;         // edge position in 12.16 coordinates
+    GGLfixed x_incr;    // on each y step, increment x by that amount
+    GGLfixed y_incr;    // on each x step, increment y by that amount
+    int16_t y_top;      // starting scanline, 12.4 format
+    int16_t y_bot;      // ending scanline, 12.4 format
+    void dump();
+};
+
+void AAEdge::dump()
+{
+    float tri  = 1.0f / TRI_ONE;
+    float iter = 1.0f / (1<<TRI_ITERATORS_BITS);
+    float fix  = 1.0f / FIXED_ONE;
+    ALOGD(   "x=%08x (%.3f), "
+            "x_incr=%08x (%.3f), y_incr=%08x (%.3f), "
+            "y_top=%08x (%.3f), y_bot=%08x (%.3f) ",
+        x, x*fix,
+        x_incr, x_incr*iter,
+        y_incr, y_incr*iter,
+        y_top, y_top*tri,
+        y_bot, y_bot*tri );
+}
+
+// the following function sets up an edge; it assumes
+// that ymin and ymax are already in the 'reduced'
+// format
+static __attribute__((noinline))
+void aa_edge_setup(
+        AAEdge*         edges,
+        int*            pcount,
+        const GGLcoord* p1,
+        const GGLcoord* p2,
+        int32_t         ymin,
+        int32_t         ymax )
+{
+    const GGLfixed*  top = p1;
+    const GGLfixed*  bot = p2;
+    AAEdge* edge = edges + *pcount;
+
+    if (top[1] > bot[1])
+        swap(top, bot);
+
+    int  y1 = top[1];
+    int  y2 = bot[1];
+    int  dy = y2 - y1;
+
+    if (dy==0 || y1>ymax || y2<ymin)
+        return;
+
+    if (y1 > ymin)
+        ymin = y1;
+    
+    if (y2 < ymax)
+        ymax = y2;
+
+    const int x1 = top[0];
+    const int dx = bot[0] - x1;
+    const int shift = FIXED_BITS - TRI_FRACTION_BITS;
+
+    // setup edge fields
+    edge->x      = x1 << shift;
+    edge->x_incr = 0;
+    edge->y_top  = ymin;
+    edge->y_bot  = ymax;
+    edge->y_incr = 0x7FFFFFFF;
+
+    if (ggl_likely(ymin <= ymax && dx)) {
+        edge->x_incr = gglDivQ16(dx, dy);
+        if (dx != 0) {
+            edge->y_incr = abs(gglDivQ16(dy, dx));
+        }
+    }
+    if (ggl_likely(y1 < ymin)) {
+        int32_t xadjust = (edge->x_incr * (ymin-y1))
+                >> (TRI_FRACTION_BITS + TRI_ITERATORS_BITS - FIXED_BITS);
+        edge->x += xadjust;
+    }
+  
+    ++*pcount;
+}
+
+
+typedef int (*compar_t)(const void*, const void*);
+static int compare_edges(const AAEdge *e0, const AAEdge *e1) {
+    if (e0->y_top > e1->y_top)      return 1;
+    if (e0->y_top < e1->y_top)      return -1;
+    if (e0->x > e1->x)              return 1;
+    if (e0->x < e1->x)              return -1;
+    if (e0->x_incr > e1->x_incr)    return 1;
+    if (e0->x_incr < e1->x_incr)    return -1;
+    return 0; // same edges, should never happen
+}
+
+static inline 
+void SET_COVERAGE(int16_t*& p, int32_t value, ssize_t n)
+{
+    android_memset16((uint16_t*)p, value, n*2);
+    p += n;
+}
+
+static inline 
+void ADD_COVERAGE(int16_t*& p, int32_t value)
+{
+    value = *p + value;
+    if (value >= 0x8000)
+        value = 0x7FFF;
+    *p++ = value;
+}
+
+static inline
+void SUB_COVERAGE(int16_t*& p, int32_t value)
+{
+    value = *p - value;
+    value &= ~(value>>31);  // branchless clamp: negative results become zero
+    *p++ = value;
+}
+
+void aapolyx(void* con,
+        const GGLcoord* pts, int count)
+{
+    /*
+     * NOTE: This routine assumes that the polygon has been clipped to the
+     * viewport already, that is, no vertex lies outside of the framebuffer.
+     * If this happens, the code below won't corrupt memory but the 
+     * coverage values may not be correct.
+     */
+    
+    GGL_CONTEXT(c, con);
+
+    // we do only quads for now (it's used for thick lines)
+    if ((count>4) || (count<2)) return;
+
+    // take scissor into account
+    const int xmin = c->state.scissor.left;
+    const int xmax = c->state.scissor.right;
+    if (xmin >= xmax) return;
+
+    // generate edges from the vertices
+    int32_t ymin = TRI_FROM_INT(c->state.scissor.top);
+    int32_t ymax = TRI_FROM_INT(c->state.scissor.bottom);
+    if (ymin >= ymax) return;
+
+    AAEdge edges[4];
+    int num_edges = 0;
+    GGLcoord const * p = pts;
+    for (int i=0 ; i<count-1 ; i++, p+=2) {
+        aa_edge_setup(edges, &num_edges, p, p+2, ymin, ymax);
+    }
+    aa_edge_setup(edges, &num_edges, p, pts, ymin, ymax );
+    if (ggl_unlikely(num_edges<2))
+        return;
+
+    // sort the edge list top to bottom, left to right.
+    qsort(edges, num_edges, sizeof(AAEdge), (compar_t)compare_edges);
+
+    int16_t* const covPtr = c->state.buffers.coverage;
+    memset(covPtr+xmin, 0, (xmax-xmin)*sizeof(*covPtr));
+
+    // now, sweep all edges in order
+    // start with the 2 first edges. We know that they share their top
+    // vertex, by construction.
+    int i = 2;
+    AAEdge* left  = &edges[0];
+    AAEdge* right = &edges[1];
+    int32_t yt = left->y_top;
+    GGLfixed l = left->x;
+    GGLfixed r = right->x;
+    int retire = 0;
+    int16_t* coverage;
+
+    // at this point we can initialize the rasterizer    
+    c->init_y(c, yt>>TRI_FRACTION_BITS);
+    c->iterators.xl = xmax;
+    c->iterators.xr = xmin;
+
+    do {
+        int32_t y = min(min(left->y_bot, right->y_bot), TRI_FLOOR(yt + TRI_ONE));
+        const int32_t shift = TRI_FRACTION_BITS + TRI_ITERATORS_BITS - FIXED_BITS;
+        const int cf_shift = (1 + TRI_FRACTION_BITS*2 + TRI_ITERATORS_BITS - 15);
+
+        // compute xmin and xmax for the left edge
+        GGLfixed l_min = gglMulAddx(left->x_incr, y - left->y_top, left->x, shift);
+        GGLfixed l_max = l;
+        l = l_min;
+        if (l_min > l_max)
+            swap(l_min, l_max);
+
+        // compute xmin and xmax for the right edge
+        GGLfixed r_min = gglMulAddx(right->x_incr, y - right->y_top, right->x, shift);
+        GGLfixed r_max = r;
+        r = r_min;
+        if (r_min > r_max)
+            swap(r_min, r_max);
+
+        // make sure we're not touching coverage values outside of the
+        // framebuffer
+        l_min &= ~(l_min>>31);
+        r_min &= ~(r_min>>31);
+        l_max &= ~(l_max>>31);
+        r_max &= ~(r_max>>31);
+        if (gglFixedToIntFloor(l_min) >= xmax) l_min = gglIntToFixed(xmax)-1;
+        if (gglFixedToIntFloor(r_min) >= xmax) r_min = gglIntToFixed(xmax)-1;
+        if (gglFixedToIntCeil(l_max) >= xmax)  l_max = gglIntToFixed(xmax)-1;
+        if (gglFixedToIntCeil(r_max) >= xmax)  r_max = gglIntToFixed(xmax)-1;
+
+        // compute the integer versions of the above
+        const GGLfixed l_min_i = gglFloorx(l_min);
+        const GGLfixed l_max_i = gglCeilx (l_max);
+        const GGLfixed r_min_i = gglFloorx(r_min);
+        const GGLfixed r_max_i = gglCeilx (r_max);
+
+        // clip horizontally using the scissor
+        const int xml = max(xmin, gglFixedToIntFloor(l_min_i));
+        const int xmr = min(xmax, gglFixedToIntFloor(r_max_i));
+
+        // if we just stepped to a new scanline, render the previous one.
+        // and clear the coverage buffer
+        if (retire) {
+            if (c->iterators.xl < c->iterators.xr)
+                c->scanline(c);
+            c->step_y(c);
+            memset(covPtr+xmin, 0, (xmax-xmin)*sizeof(*covPtr));
+            c->iterators.xl = xml;
+            c->iterators.xr = xmr;
+        } else {
+            // update the horizontal range of this scanline
+            c->iterators.xl = min(c->iterators.xl, xml);
+            c->iterators.xr = max(c->iterators.xr, xmr);
+        }
+
+        coverage = covPtr + gglFixedToIntFloor(l_min_i);
+        if (l_min_i == gglFloorx(l_max)) {
+            
+            /*
+             *  fully traverse this pixel vertically
+             *       l_max
+             *  +-----/--+  yt
+             *  |    /   |  
+             *  |   /    |
+             *  |  /     |
+             *  +-/------+  y
+             *   l_min  (l_min_i + TRI_ONE)
+             */
+              
+            GGLfixed dx = l_max - l_min;
+            int32_t dy = y - yt;
+            int cf = gglMulx((dx >> 1) + (l_min_i + FIXED_ONE - l_max), dy,
+                FIXED_BITS + TRI_FRACTION_BITS - 15);
+            ADD_COVERAGE(coverage, cf);
+            // all pixels on the right have cf = 1.0
+        } else {
+            /*
+             *  spans several pixels in one scanline
+             *            l_max
+             *  +--------+--/-----+  yt
+             *  |        |/       |
+             *  |       /|        |
+             *  |     /  |        |
+             *  +---/----+--------+  y
+             *   l_min (l_min_i + TRI_ONE)
+             */
+
+            // handle the first pixel separately...
+            const int32_t y_incr = left->y_incr;
+            int32_t dx = TRI_FROM_FIXED(l_min_i - l_min) + TRI_ONE;
+            int32_t cf = (dx * dx * y_incr) >> cf_shift;
+            ADD_COVERAGE(coverage, cf);
+
+            // following pixels get covered by y_incr, but we need
+            // to fix-up the cf to account for previous partial pixel
+            dx = TRI_FROM_FIXED(l_min - l_min_i);
+            cf -= (dx * dx * y_incr) >> cf_shift;
+            for (int x = l_min_i+FIXED_ONE ; x < l_max_i-FIXED_ONE ; x += FIXED_ONE) {
+                cf += y_incr >> (TRI_ITERATORS_BITS-15);
+                ADD_COVERAGE(coverage, cf);
+            }
+            
+            // and the last pixel
+            dx = TRI_FROM_FIXED(l_max - l_max_i) - TRI_ONE;
+            cf += (dx * dx * y_incr) >> cf_shift;
+            ADD_COVERAGE(coverage, cf);
+        }
+        
+        // now, fill up all fully covered pixels
+        coverage = covPtr + gglFixedToIntFloor(l_max_i);
+        int cf = ((y - yt) << (15 - TRI_FRACTION_BITS));
+        if (ggl_likely(cf >= 0x8000)) {
+            SET_COVERAGE(coverage, 0x7FFF, ((r_max - l_max_i)>>FIXED_BITS)+1);
+        } else {
+            for (int x=l_max_i ; x<r_max ; x+=FIXED_ONE) {
+                ADD_COVERAGE(coverage, cf);
+            }
+        }
+        
+        // subtract the coverage of the right edge
+        coverage = covPtr + gglFixedToIntFloor(r_min_i); 
+        if (r_min_i == gglFloorx(r_max)) {
+            GGLfixed dx = r_max - r_min;
+            int32_t dy = y - yt;
+            int cf = gglMulx((dx >> 1) + (r_min_i + FIXED_ONE - r_max), dy,
+                FIXED_BITS + TRI_FRACTION_BITS - 15);
+            SUB_COVERAGE(coverage, cf);
+            // all pixels on the right have cf = 1.0
+        } else {
+            // handle the first pixel separately...
+            const int32_t y_incr = right->y_incr;
+            int32_t dx = TRI_FROM_FIXED(r_min_i - r_min) + TRI_ONE;
+            int32_t cf = (dx * dx * y_incr) >> cf_shift;
+            SUB_COVERAGE(coverage, cf);
+            
+            // following pixels get covered by y_incr, but we need
+            // to fix-up the cf to account for previous partial pixel
+            dx = TRI_FROM_FIXED(r_min - r_min_i);
+            cf -= (dx * dx * y_incr) >> cf_shift;
+            for (int x = r_min_i+FIXED_ONE ; x < r_max_i-FIXED_ONE ; x += FIXED_ONE) {
+                cf += y_incr >> (TRI_ITERATORS_BITS-15);
+                SUB_COVERAGE(coverage, cf);
+            }
+            
+            // and the last pixel
+            dx = TRI_FROM_FIXED(r_max - r_max_i) - TRI_ONE;
+            cf += (dx * dx * y_incr) >> cf_shift;
+            SUB_COVERAGE(coverage, cf);
+        }
+
+        // did we reach the end of an edge? if so, get a new one.
+        if (y == left->y_bot || y == right->y_bot) {
+            // bail out if we're done
+            if (i>=num_edges)
+                break;
+            if (y == left->y_bot)
+                left = &edges[i++];
+            if (y == right->y_bot)
+                right = &edges[i++];
+        }
+
+        // next scanline
+        yt = y;
+        
+        // did we just finish a scanline?        
+        retire = (y << (32-TRI_FRACTION_BITS)) == 0;
+    } while (true);
+
+    // render the last scanline
+    if (c->iterators.xl < c->iterators.xr)
+        c->scanline(c);
+}
+
+}; // namespace android
diff --git a/libpixelflinger/trap.h b/libpixelflinger/trap.h
new file mode 100644
index 0000000..7cce7b3
--- /dev/null
+++ b/libpixelflinger/trap.h
@@ -0,0 +1,31 @@
+/* libs/pixelflinger/trap.h
+**
+** Copyright 2006, The Android Open Source Project
+**
+** Licensed under the Apache License, Version 2.0 (the "License"); 
+** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at 
+**
+**     http://www.apache.org/licenses/LICENSE-2.0 
+**
+** Unless required by applicable law or agreed to in writing, software 
+** distributed under the License is distributed on an "AS IS" BASIS, 
+** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+** See the License for the specific language governing permissions and 
+** limitations under the License.
+*/
+
+
+#ifndef ANDROID_TRAP_H
+#define ANDROID_TRAP_H
+
+#include <private/pixelflinger/ggl_context.h>
+
+namespace android {
+
+void ggl_init_trap(context_t* c);
+void ggl_state_changed(context_t* c, int flags);
+
+}; // namespace android
+
+#endif