diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S
index d7ecc7e..c5191b1 100644
--- a/arch/arm/boot/compressed/head.S
+++ b/arch/arm/boot/compressed/head.S
 #ifdef DEBUG
 
 #if defined(CONFIG_DEBUG_ICEDCC)
-               .macro  loadsp, rb
+
+#ifdef CONFIG_CPU_V6
+               .macro  loadsp, rb, tmp
+               .endm
+               .macro  writeb, ch, rb
+               mcr     p14, 0, \ch, c0, c5, 0
+               .endm
+#elif defined(CONFIG_CPU_V7)
+               .macro  loadsp, rb, tmp
+               .endm
+               .macro  writeb, ch, rb
+wait:          mrc     p14, 0, pc, c0, c1, 0
+               bcs     wait
+               mcr     p14, 0, \ch, c0, c5, 0
+               .endm
+#elif defined(CONFIG_CPU_XSCALE)
+               .macro  loadsp, rb, tmp
+               .endm
+               .macro  writeb, ch, rb
+               mcr     p14, 0, \ch, c8, c0, 0
+               .endm
+#else
+               .macro  loadsp, rb, tmp
                .endm
                .macro  writeb, ch, rb
-               mcr     p14, 0, \ch, c0, c1, 0
+               mcr     p14, 0, \ch, c1, c0, 0
                .endm
+#endif
+
 #else
 
-#include <asm/arch/debug-macro.S>
+#include <mach/debug-macro.S>
 
                .macro  writeb, ch, rb
                senduart \ch, \rb
                .endm
 
 #if defined(CONFIG_ARCH_SA1100)
-               .macro  loadsp, rb
+               .macro  loadsp, rb, tmp
                mov     \rb, #0x80000000        @ physical base address
 #ifdef CONFIG_DEBUG_LL_SER3
                add     \rb, \rb, #0x00050000   @ Ser3
 #else
                add     \rb, \rb, #0x00010000   @ Ser1
 #endif
                .endm
-#elif defined(CONFIG_ARCH_IOP33X)
-               .macro loadsp, rb
-                mov    \rb, #0xff000000
-                orr     \rb, \rb, #0x00ff0000
-                orr     \rb, \rb, #0x0000f700   @ location of the UART
-               .endm
 #elif defined(CONFIG_ARCH_S3C2410)
-               .macro loadsp, rb
+               .macro loadsp, rb, tmp
                mov     \rb, #0x50000000
-               add     \rb, \rb, #0x4000 * CONFIG_S3C2410_LOWLEVEL_UART_PORT
+               add     \rb, \rb, #0x4000 * CONFIG_S3C_LOWLEVEL_UART_PORT
                .endm
 #else
-               .macro  loadsp, rb
-               addruart \rb
+               .macro  loadsp, rb, tmp
+               addruart \rb, \tmp
                .endm
 #endif
 #endif
                kphex   r6, 8           /* processor id */
                kputc   #':'
                kphex   r7, 8           /* architecture id */
+#ifdef CONFIG_CPU_CP15
                kputc   #':'
                mrc     p15, 0, r0, c1, c0
                kphex   r0, 8           /* control reg */
+#endif
                kputc   #'\n'
                kphex   r5, 8           /* decompressed kernel start */
                kputc   #'-'
@@ -128,7 +148,8 @@ start:
                tst     r2, #3                  @ not user?
                bne     not_angel
                mov     r0, #0x17               @ angel_SWIreason_EnterSVC
-               swi     0x123456                @ angel_SWI_ARM
+ ARM(          swi     0x123456        )       @ angel_SWI_ARM
+ THUMB(                svc     0xab            )       @ angel_SWI_THUMB
 not_angel:
                mrs     r2, cpsr                @ turn off interrupts to
                orr     r2, r2, #0xc0           @ prevent angel from running
@@ -149,7 +170,9 @@ not_angel:
 
                .text
                adr     r0, LC0
-               ldmia   r0, {r1, r2, r3, r4, r5, r6, ip, sp}
+ ARM(          ldmia   r0, {r1, r2, r3, r4, r5, r6, r11, ip, sp})
+ THUMB(                ldmia   r0, {r1, r2, r3, r4, r5, r6, r11, ip}   )
+ THUMB(                ldr     sp, [r0, #32]                           )
                subs    r0, r0, r1              @ calculate the delta offset
 
                                                @ if delta is zero, we are
@@ -159,12 +182,13 @@ not_angel:
                /*
                 * We're running at a different address.  We need to fix
                 * up various pointers:
-                *   r5 - zImage base address
-                *   r6 - GOT start
+                *   r5 - zImage base address (_start)
+                *   r6 - size of decompressed image
+                *   r11 - GOT start
                 *   ip - GOT end
                 */
                add     r5, r5, r0
-               add     r6, r6, r0
+               add     r11, r11, r0
                add     ip, ip, r0
 
 #ifndef CONFIG_ZBOOT_ROM
@@ -182,10 +206,10 @@ not_angel:
                /*
                 * Relocate all entries in the GOT table.
                 */
-1:             ldr     r1, [r6, #0]            @ relocate entries in the GOT
+1:             ldr     r1, [r11, #0]           @ relocate entries in the GOT
                add     r1, r1, r0              @ table.  This fixes up the
-               str     r1, [r6], #4            @ C references.
-               cmp     r6, ip
+               str     r1, [r11], #4           @ C references.
+               cmp     r11, ip
                blo     1b
 #else
 
@@ -193,12 +217,12 @@ not_angel:
                 * Relocate entries in the GOT table.  We only relocate
                 * the entries that are outside the (relocated) BSS region.
                 */
-1:             ldr     r1, [r6, #0]            @ relocate entries in the GOT
+1:             ldr     r1, [r11, #0]           @ relocate entries in the GOT
                cmp     r1, r2                  @ entry < bss_start ||
                cmphs   r3, r1                  @ _end < entry
                addlo   r1, r1, r0              @ table.  This fixes up the
-               str     r1, [r6], #4            @ C references.
-               cmp     r6, ip
+               str     r1, [r11], #4           @ C references.
+               cmp     r11, ip
                blo     1b
 #endif
 
@@ -224,6 +248,7 @@ not_relocated:      mov     r0, #0
  * Check to see if we will overwrite ourselves.
  *   r4 = final kernel address
  *   r5 = start of this image
+ *   r6 = size of decompressed image
  *   r2 = end of malloc space (and therefore this image)
  * We basically want:
  *   r4 >= r2 -> OK
@@ -231,7 +256,7 @@ not_relocated:      mov     r0, #0
  */
                cmp     r4, r2
                bhs     wont_overwrite
-               add     r0, r4, #4096*1024      @ 4MB largest kernel size
+               add     r0, r4, r6
                cmp     r0, r5
                bls     wont_overwrite
 
@@ -240,31 +265,34 @@ not_relocated:    mov     r0, #0
                mov     r3, r7
                bl      decompress_kernel
 
-               add     r0, r0, #127
+               add     r0, r0, #127 + 128      @ alignment + stack
                bic     r0, r0, #127            @ align the kernel length
 /*
  * r0     = decompressed kernel length
  * r1-r3  = unused
  * r4     = kernel execution address
  * r5     = decompressed kernel start
- * r6     = processor ID
  * r7     = architecture ID
  * r8     = atags pointer
- * r9-r14 = corrupted
+ * r9-r12,r14 = corrupted
  */
                add     r1, r5, r0              @ end of decompressed kernel
                adr     r2, reloc_start
                ldr     r3, LC1
                add     r3, r2, r3
-1:             ldmia   r2!, {r9 - r14}         @ copy relocation code
-               stmia   r1!, {r9 - r14}
-               ldmia   r2!, {r9 - r14}
-               stmia   r1!, {r9 - r14}
+1:             ldmia   r2!, {r9 - r12, r14}    @ copy relocation code
+               stmia   r1!, {r9 - r12, r14}
+               ldmia   r2!, {r9 - r12, r14}
+               stmia   r1!, {r9 - r12, r14}
                cmp     r2, r3
                blo     1b
+               mov     sp, r1
+               add     sp, sp, #128            @ relocate the stack
 
                bl      cache_clean_flush
-               add     pc, r5, r0              @ call relocation code
+ ARM(          add     pc, r5, r0              ) @ call relocation code
+ THUMB(                add     r12, r5, r0             )
+ THUMB(                mov     pc, r12                 ) @ call relocation code
 
 /*
  * We're not in danger of overwriting ourselves.  Do this the simple way.
@@ -277,13 +305,15 @@ wont_overwrite:   mov     r0, r4
                bl      decompress_kernel
                b       call_kernel
 
+               .align  2
                .type   LC0, #object
 LC0:           .word   LC0                     @ r1
                .word   __bss_start             @ r2
                .word   _end                    @ r3
                .word   zreladdr                @ r4
                .word   _start                  @ r5
-               .word   _got_start              @ r6
+               .word   _image_size             @ r6
+               .word   _got_start              @ r11
                .word   _got_end                @ ip
                .word   user_stack+4096         @ sp
 LC1:           .word   reloc_end - reloc_start
@@ -307,7 +337,6 @@ params:             ldr     r0, =params_phys
  *
  * On entry,
  *  r4 = kernel execution address
- *  r6 = processor ID
  *  r7 = architecture number
  *  r8 = atags pointer
  *  r9 = run-time address of "start"  (???)
@@ -413,9 +442,11 @@ __setup_mmu:       sub     r3, r4, #16384          @ Page directory size
                add     r1, r1, #1048576
                str     r1, [r0]
                mov     pc, lr
+ENDPROC(__setup_mmu)
 
 __armv4_mmu_cache_on:
                mov     r12, lr
+#ifdef CONFIG_MMU
                bl      __setup_mmu
                mov     r0, #0
                mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
@@ -423,9 +454,56 @@ __armv4_mmu_cache_on:
                mrc     p15, 0, r0, c1, c0, 0   @ read control reg
                orr     r0, r0, #0x5000         @ I-cache enable, RR cache replacement
                orr     r0, r0, #0x0030
+#ifdef CONFIG_CPU_ENDIAN_BE8
+               orr     r0, r0, #1 << 25        @ big-endian page tables
+#endif
                bl      __common_mmu_cache_on
                mov     r0, #0
                mcr     p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
+#endif
+               mov     pc, r12
+
+__armv7_mmu_cache_on:
+               mov     r12, lr
+#ifdef CONFIG_MMU
+               mrc     p15, 0, r11, c0, c1, 4  @ read ID_MMFR0
+               tst     r11, #0xf               @ VMSA
+               blne    __setup_mmu
+               mov     r0, #0
+               mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
+               tst     r11, #0xf               @ VMSA
+               mcrne   p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
+#endif
+               mrc     p15, 0, r0, c1, c0, 0   @ read control reg
+               orr     r0, r0, #0x5000         @ I-cache enable, RR cache replacement
+               orr     r0, r0, #0x003c         @ write buffer
+#ifdef CONFIG_MMU
+#ifdef CONFIG_CPU_ENDIAN_BE8
+               orr     r0, r0, #1 << 25        @ big-endian page tables
+#endif
+               orrne   r0, r0, #1              @ MMU enabled
+               movne   r1, #-1
+               mcrne   p15, 0, r3, c2, c0, 0   @ load page table pointer
+               mcrne   p15, 0, r1, c3, c0, 0   @ load domain access control
+#endif
+               mcr     p15, 0, r0, c1, c0, 0   @ load control register
+               mrc     p15, 0, r0, c1, c0, 0   @ and read it back
+               mov     r0, #0
+               mcr     p15, 0, r0, c7, c5, 4   @ ISB
+               mov     pc, r12
+
+__fa526_cache_on:
+               mov     r12, lr
+               bl      __setup_mmu
+               mov     r0, #0
+               mcr     p15, 0, r0, c7, c7, 0   @ Invalidate whole cache
+               mcr     p15, 0, r0, c7, c10, 4  @ drain write buffer
+               mcr     p15, 0, r0, c8, c7, 0   @ flush UTLB
+               mrc     p15, 0, r0, c1, c0, 0   @ read control reg
+               orr     r0, r0, #0x1000         @ I-cache enable
+               bl      __common_mmu_cache_on
+               mov     r0, #0
+               mcr     p15, 0, r0, c8, c7, 0   @ flush UTLB
                mov     pc, r12
 
 __arm6_mmu_cache_on:
@@ -441,6 +519,7 @@ __arm6_mmu_cache_on:
                mov     pc, r12
 
 __common_mmu_cache_on:
+#ifndef CONFIG_THUMB2_KERNEL
 #ifndef DEBUG
                orr     r0, r0, #0x000d         @ Write buffer, mmu
 #endif
@@ -452,6 +531,7 @@ __common_mmu_cache_on:
 1:             mcr     p15, 0, r0, c1, c0, 0   @ load control register
                mrc     p15, 0, r0, c1, c0, 0   @ and read it back to
                sub     pc, lr, r0, lsr #32     @ properly flush pipeline
+#endif
 
 /*
  * All code following this line is relocatable.  It is relocated by
@@ -462,23 +542,25 @@ __common_mmu_cache_on:
  * r1-r3  = unused
  * r4     = kernel execution address
  * r5     = decompressed kernel start
- * r6     = processor ID
  * r7     = architecture ID
  * r8     = atags pointer
- * r9-r14 = corrupted
+ * r9-r12,r14 = corrupted
  */
                .align  5
 reloc_start:   add     r9, r5, r0
+               sub     r9, r9, #128            @ do not copy the stack
                debug_reloc_start
                mov     r1, r4
 1:
                .rept   4
-               ldmia   r5!, {r0, r2, r3, r10 - r14}    @ relocate kernel
-               stmia   r1!, {r0, r2, r3, r10 - r14}
+               ldmia   r5!, {r0, r2, r3, r10 - r12, r14}       @ relocate kernel
+               stmia   r1!, {r0, r2, r3, r10 - r12, r14}
                .endr
 
                cmp     r5, r9
                blo     1b
+               mov     sp, r1
+               add     sp, sp, #128            @ relocate the stack
                debug_reloc_end
 
 call_kernel:   bl      cache_clean_flush
@@ -498,17 +580,23 @@ call_kernel:      bl      cache_clean_flush
  *  r1  = corrupted
  *  r2  = corrupted
  *  r3  = block offset
- *  r6  = corrupted
+ *  r9  = corrupted
  *  r12 = corrupted
  */
 
 call_cache_fn: adr     r12, proc_types
-               mrc     p15, 0, r6, c0, c0      @ get processor ID
+#ifdef CONFIG_CPU_CP15
+               mrc     p15, 0, r9, c0, c0      @ get processor ID
+#else
+               ldr     r9, =CONFIG_PROCESSOR_ID
+#endif
 1:             ldr     r1, [r12, #0]           @ get value
                ldr     r2, [r12, #4]           @ get mask
-               eor     r1, r1, r6              @ (real ^ match)
+               eor     r1, r1, r9              @ (real ^ match)
                tst     r1, r2                  @       & mask
-               addeq   pc, r12, r3             @ call cache function
+ ARM(          addeq   pc, r12, r3             ) @ call cache function
+ THUMB(                addeq   r12, r3                 )
+ THUMB(                moveq   pc, r12                 ) @ call cache function
                add     r12, r12, #4*5
                b       1b
 
@@ -526,13 +614,15 @@ call_cache_fn:    adr     r12, proc_types
  * methods.  Writeback caches _must_ have the flush method
  * defined.
  */
+               .align  2
                .type   proc_types,#object
 proc_types:
                .word   0x41560600              @ ARM6/610
                .word   0xffffffe0
-               b       __arm6_mmu_cache_off    @ works, but slow
-               b       __arm6_mmu_cache_off
+               W(b)    __arm6_mmu_cache_off    @ works, but slow
+               W(b)    __arm6_mmu_cache_off
                mov     pc, lr
+ THUMB(                nop                             )
 @              b       __arm6_mmu_cache_on             @ untested
 @              b       __arm6_mmu_cache_off
 @              b       __armv3_mmu_cache_flush
@@ -540,84 +630,140 @@ proc_types:
                .word   0x00000000              @ old ARM ID
                .word   0x0000f000
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
 
                .word   0x41007000              @ ARM7/710
                .word   0xfff8fe00
-               b       __arm7_mmu_cache_off
-               b       __arm7_mmu_cache_off
+               W(b)    __arm7_mmu_cache_off
+               W(b)    __arm7_mmu_cache_off
                mov     pc, lr
+ THUMB(                nop                             )
 
                .word   0x41807200              @ ARM720T (writethrough)
                .word   0xffffff00
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
                mov     pc, lr
+ THUMB(                nop                             )
 
                .word   0x41007400              @ ARM74x
                .word   0xff00ff00
-               b       __armv3_mpu_cache_on
-               b       __armv3_mpu_cache_off
-               b       __armv3_mpu_cache_flush
+               W(b)    __armv3_mpu_cache_on
+               W(b)    __armv3_mpu_cache_off
+               W(b)    __armv3_mpu_cache_flush
                
                .word   0x41009400              @ ARM94x
                .word   0xff00ff00
-               b       __armv4_mpu_cache_on
-               b       __armv4_mpu_cache_off
-               b       __armv4_mpu_cache_flush
+               W(b)    __armv4_mpu_cache_on
+               W(b)    __armv4_mpu_cache_off
+               W(b)    __armv4_mpu_cache_flush
 
                .word   0x00007000              @ ARM7 IDs
                .word   0x0000f000
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
 
                @ Everything from here on will be the new ID system.
 
                .word   0x4401a100              @ sa110 / sa1100
                .word   0xffffffe0
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
 
                .word   0x6901b110              @ sa1110
                .word   0xfffffff0
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
+
+               .word   0x56056900
+               .word   0xffffff00              @ PXA9xx
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
+
+               .word   0x56158000              @ PXA168
+               .word   0xfffff000
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv5tej_mmu_cache_flush
+
+               .word   0x56050000              @ Feroceon
+               .word   0xff0f0000
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv5tej_mmu_cache_flush
+
+#ifdef CONFIG_CPU_FEROCEON_OLD_ID
+               /* this conflicts with the standard ARMv5TE entry */
+               .long   0x41009260              @ Old Feroceon
+               .long   0xff00fff0
                b       __armv4_mmu_cache_on
                b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               b       __armv5tej_mmu_cache_flush
+#endif
+
+               .word   0x66015261              @ FA526
+               .word   0xff01fff1
+               W(b)    __fa526_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __fa526_cache_flush
 
                @ These match on the architecture ID
 
                .word   0x00020000              @ ARMv4T
                .word   0x000f0000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
 
                .word   0x00050000              @ ARMv5TE
                .word   0x000f0000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv4_mmu_cache_flush
 
                .word   0x00060000              @ ARMv5TEJ
                .word   0x000f0000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv4_mmu_cache_flush
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv5tej_mmu_cache_flush
 
                .word   0x0007b000              @ ARMv6
-               .word   0x0007f000
-               b       __armv4_mmu_cache_on
-               b       __armv4_mmu_cache_off
-               b       __armv6_mmu_cache_flush
+               .word   0x000ff000
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv6_mmu_cache_flush
+
+               .word   0x560f5810              @ Marvell PJ4 ARMv6
+               .word   0xff0ffff0
+               W(b)    __armv4_mmu_cache_on
+               W(b)    __armv4_mmu_cache_off
+               W(b)    __armv6_mmu_cache_flush
+
+               .word   0x000f0000              @ new CPU Id
+               .word   0x000f0000
+               W(b)    __armv7_mmu_cache_on
+               W(b)    __armv7_mmu_cache_off
+               W(b)    __armv7_mmu_cache_flush
 
                .word   0                       @ unrecognised type
                .word   0
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
                mov     pc, lr
+ THUMB(                nop                             )
 
                .size   proc_types, . - proc_types
 
@@ -625,8 +771,7 @@ proc_types:
  * Turn off the Cache and MMU.  ARMv3 does not support
  * reading the control register, but ARMv4 does.
  *
- * On entry,  r6 = processor ID
- * On exit,   r0, r1, r2, r3, r12 corrupted
+ * On exit, r0, r1, r2, r3, r9, r12 corrupted
  * This routine must preserve: r4, r6, r7
  */
                .align  5
@@ -652,14 +797,35 @@ __armv3_mpu_cache_off:
                mov     pc, lr
 
 __armv4_mmu_cache_off:
+#ifdef CONFIG_MMU
                mrc     p15, 0, r0, c1, c0
                bic     r0, r0, #0x000d
                mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
                mov     r0, #0
                mcr     p15, 0, r0, c7, c7      @ invalidate whole cache v4
                mcr     p15, 0, r0, c8, c7      @ invalidate whole TLB v4
+#endif
                mov     pc, lr
 
+__armv7_mmu_cache_off:
+               mrc     p15, 0, r0, c1, c0
+#ifdef CONFIG_MMU
+               bic     r0, r0, #0x000d
+#else
+               bic     r0, r0, #0x000c
+#endif
+               mcr     p15, 0, r0, c1, c0      @ turn MMU and cache off
+               mov     r12, lr
+               bl      __armv7_mmu_cache_flush
+               mov     r0, #0
+#ifdef CONFIG_MMU
+               mcr     p15, 0, r0, c8, c7, 0   @ invalidate whole TLB
+#endif
+               mcr     p15, 0, r0, c7, c5, 6   @ invalidate BTC
+               mcr     p15, 0, r0, c7, c10, 4  @ DSB
+               mcr     p15, 0, r0, c7, c5, 4   @ ISB
+               mov     pc, r12
+
 __arm6_mmu_cache_off:
                mov     r0, #0x00000030         @ ARM6 control reg.
                b       __armv3_mmu_cache_off
@@ -678,10 +844,8 @@ __armv3_mmu_cache_off:
 /*
  * Clean and flush the cache to maintain consistency.
  *
- * On entry,
- *  r6 = processor ID
  * On exit,
- *  r1, r2, r3, r11, r12 corrupted
+ *  r1, r2, r3, r9, r11, r12 corrupted
  * This routine must preserve:
  *  r0, r4, r5, r6, r7
  */
@@ -707,6 +871,12 @@ __armv4_mpu_cache_flush:
                mcr     p15, 0, ip, c7, c10, 4  @ drain WB
                mov     pc, lr
                
+__fa526_cache_flush:
+               mov     r1, #0
+               mcr     p15, 0, r1, c7, c14, 0  @ clean and invalidate D cache
+               mcr     p15, 0, r1, c7, c5, 0   @ flush I cache
+               mcr     p15, 0, r1, c7, c10, 4  @ drain WB
+               mov     pc, lr
 
 __armv6_mmu_cache_flush:
                mov     r1, #0
@@ -716,11 +886,78 @@ __armv6_mmu_cache_flush:
                mcr     p15, 0, r1, c7, c10, 4  @ drain WB
                mov     pc, lr
 
+__armv7_mmu_cache_flush:
+               mrc     p15, 0, r10, c0, c1, 5  @ read ID_MMFR1
+               tst     r10, #0xf << 16         @ hierarchical cache (ARMv7)
+               mov     r10, #0
+               beq     hierarchical
+               mcr     p15, 0, r10, c7, c14, 0 @ clean+invalidate D
+               b       iflush
+hierarchical:
+               mcr     p15, 0, r10, c7, c10, 5 @ DMB
+               stmfd   sp!, {r0-r7, r9-r11}
+               mrc     p15, 1, r0, c0, c0, 1   @ read clidr
+               ands    r3, r0, #0x7000000      @ extract loc from clidr
+               mov     r3, r3, lsr #23         @ left align loc bit field
+               beq     finished                @ if loc is 0, then no need to clean
+               mov     r10, #0                 @ start clean at cache level 0
+loop1:
+               add     r2, r10, r10, lsr #1    @ work out 3x current cache level
+               mov     r1, r0, lsr r2          @ extract cache type bits from clidr
+               and     r1, r1, #7              @ mask of the bits for current cache only
+               cmp     r1, #2                  @ see what cache we have at this level
+               blt     skip                    @ skip if no cache, or just i-cache
+               mcr     p15, 2, r10, c0, c0, 0  @ select current cache level in cssr
+               mcr     p15, 0, r10, c7, c5, 4  @ isb to sync the new cssr&csidr
+               mrc     p15, 1, r1, c0, c0, 0   @ read the new csidr
+               and     r2, r1, #7              @ extract the length of the cache lines
+               add     r2, r2, #4              @ add 4 (line length offset)
+               ldr     r4, =0x3ff
+               ands    r4, r4, r1, lsr #3      @ find maximum number on the way size
+               clz     r5, r4                  @ find bit position of way size increment
+               ldr     r7, =0x7fff
+               ands    r7, r7, r1, lsr #13     @ extract max number of the index size
+loop2:
+               mov     r9, r4                  @ create working copy of max way size
+loop3:
+ ARM(          orr     r11, r10, r9, lsl r5    ) @ factor way and cache number into r11
+ ARM(          orr     r11, r11, r7, lsl r2    ) @ factor index number into r11
+ THUMB(                lsl     r6, r9, r5              )
+ THUMB(                orr     r11, r10, r6            ) @ factor way and cache number into r11
+ THUMB(                lsl     r6, r7, r2              )
+ THUMB(                orr     r11, r11, r6            ) @ factor index number into r11
+               mcr     p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way
+               subs    r9, r9, #1              @ decrement the way
+               bge     loop3
+               subs    r7, r7, #1              @ decrement the index
+               bge     loop2
+skip:
+               add     r10, r10, #2            @ increment cache number
+               cmp     r3, r10
+               bgt     loop1
+finished:
+               ldmfd   sp!, {r0-r7, r9-r11}
+               mov     r10, #0                 @ switch back to cache level 0
+               mcr     p15, 2, r10, c0, c0, 0  @ select current cache level in cssr
+iflush:
+               mcr     p15, 0, r10, c7, c10, 4 @ DSB
+               mcr     p15, 0, r10, c7, c5, 0  @ invalidate I+BTB
+               mcr     p15, 0, r10, c7, c10, 4 @ DSB
+               mcr     p15, 0, r10, c7, c5, 4  @ ISB
+               mov     pc, lr
+
+__armv5tej_mmu_cache_flush:
+1:             mrc     p15, 0, r15, c7, c14, 3 @ test,clean,invalidate D cache
+               bne     1b
+               mcr     p15, 0, r0, c7, c5, 0   @ flush I cache
+               mcr     p15, 0, r0, c7, c10, 4  @ drain WB
+               mov     pc, lr
+
 __armv4_mmu_cache_flush:
                mov     r2, #64*1024            @ default: 32K dcache size (*2)
                mov     r11, #32                @ default: 32 byte line size
                mrc     p15, 0, r3, c0, c0, 1   @ read cache type
-               teq     r3, r6                  @ cache ID register present?
+               teq     r3, r9                  @ cache ID register present?
                beq     no_cache_id
                mov     r1, r3, lsr #18
                and     r1, r1, #7
@@ -733,9 +970,13 @@ __armv4_mmu_cache_flush:
                mov     r11, #8
                mov     r11, r11, lsl r3        @ cache line size in bytes
 no_cache_id:
-               bic     r1, pc, #63             @ align to longest cache line
+               mov     r1, pc
+               bic     r1, r1, #63             @ align to longest cache line
                add     r2, r1, r2
-1:             ldr     r3, [r1], r11           @ s/w flush D cache
+1:
+ ARM(          ldr     r3, [r1], r11           ) @ s/w flush D cache
+ THUMB(                ldr     r3, [r1]                ) @ s/w flush D cache
+ THUMB(                add     r1, r1, r11             )
                teq     r1, r2
                bne     1b
 
@@ -755,6 +996,7 @@ __armv3_mpu_cache_flush:
  * memory, which again must be relocatable.
  */
 #ifdef DEBUG
+               .align  2
                .type   phexbuf,#object
 phexbuf:       .space  12
                .size   phexbuf, . - phexbuf
@@ -773,7 +1015,7 @@ phex:              adr     r3, phexbuf
                strb    r2, [r3, r1]
                b       1b
 
-puts:          loadsp  r3
+puts:          loadsp  r3, r1
 1:             ldrb    r2, [r0], #1
                teq     r2, #0
                moveq   pc, lr
@@ -790,7 +1032,7 @@ puts:              loadsp  r3
 putc:
                mov     r2, r0
                mov     r0, #0
-               loadsp  r3
+               loadsp  r3, r1
                b       2b
 
 memdump:       mov     r12, r0
@@ -822,6 +1064,7 @@ memdump:   mov     r12, r0
                mov     pc, r10
 #endif
 
+               .ltorg
 reloc_end:
 
                .align