summary | refs | log | tree | commit | diff | stats
path: root/target/linux/etrax-2.6/patches/cris/006-gcc-4.patch
diff options
context:
space:
mode:
author: blogic <blogic@3c298f89-4303-0410-b956-a3cf2f4a3e73> 2007-06-02 00:46:02 +0000
committer: blogic <blogic@3c298f89-4303-0410-b956-a3cf2f4a3e73> 2007-06-02 00:46:02 +0000
commit: 9b11307b07431bac96f8c8e4367a3747942d5751 (patch)
tree: d4161d76c676ff352e44294ba8819194d66356c4 /target/linux/etrax-2.6/patches/cris/006-gcc-4.patch
parent: 5bae61fcd1f9040dcc145d8d122e55430d29da12 (diff)
add initial support for the cris architecture used on foxboards to openwrt
git-svn-id: svn://svn.openwrt.org/openwrt/trunk@7439 3c298f89-4303-0410-b956-a3cf2f4a3e73
Diffstat (limited to 'target/linux/etrax-2.6/patches/cris/006-gcc-4.patch')
-rw-r--r-- target/linux/etrax-2.6/patches/cris/006-gcc-4.patch | 752
1 files changed, 752 insertions, 0 deletions
diff --git a/target/linux/etrax-2.6/patches/cris/006-gcc-4.patch b/target/linux/etrax-2.6/patches/cris/006-gcc-4.patch
new file mode 100644
index 000000000..31a410770
--- /dev/null
+++ b/target/linux/etrax-2.6/patches/cris/006-gcc-4.patch
@@ -0,0 +1,752 @@
+diff -urN linux-2.6.19.2.orig/arch/cris/arch-v10/lib/memset.c linux-2.6.19.2/arch/cris/arch-v10/lib/memset.c
+--- linux-2.6.19.2.orig/arch/cris/arch-v10/lib/memset.c 2007-05-20 01:46:35.000000000 +0200
++++ linux-2.6.19.2/arch/cris/arch-v10/lib/memset.c 2007-05-20 01:51:47.000000000 +0200
+@@ -29,224 +29,21 @@
+
+ #include <linux/types.h>
+
+-/* No, there's no macro saying 12*4, since it is "hard" to get it into
+- the asm in a good way. Thus better to expose the problem everywhere.
+- */
+
+-/* Assuming 1 cycle per dword written or read (ok, not really true), and
+- one per instruction, then 43+3*(n/48-1) <= 24+24*(n/48-1)
+- so n >= 45.7; n >= 0.9; we win on the first full 48-byte block to set. */
+-
+-#define ZERO_BLOCK_SIZE (1*12*4)
+-
+-void *memset(void *pdst,
+- int c,
+- size_t plen)
++/**
++ * memset - Fill a region of memory with the given value
++ * @s: Pointer to the start of the area.
++ * @c: The byte to fill the area with
++ * @count: The size of the area.
++ *
++ * Do not use memset() to access IO space, use memset_io() instead.
++ */
++void *memset(void *s, int c, size_t count)
+ {
+- /* Ok. Now we want the parameters put in special registers.
+- Make sure the compiler is able to make something useful of this. */
+-
+- register char *return_dst __asm__ ("r10") = pdst;
+- register int n __asm__ ("r12") = plen;
+- register int lc __asm__ ("r11") = c;
+-
+- /* Most apps use memset sanely. Only those memsetting about 3..4
+- bytes or less get penalized compared to the generic implementation
+- - and that's not really sane use. */
+-
+- /* Ugh. This is fragile at best. Check with newer GCC releases, if
+- they compile cascaded "x |= x << 8" sanely! */
+- __asm__("movu.b %0,$r13\n\t"
+- "lslq 8,$r13\n\t"
+- "move.b %0,$r13\n\t"
+- "move.d $r13,%0\n\t"
+- "lslq 16,$r13\n\t"
+- "or.d $r13,%0"
+- : "=r" (lc) : "0" (lc) : "r13");
+-
+- {
+- register char *dst __asm__ ("r13") = pdst;
+-
+- /* This is NONPORTABLE, but since this whole routine is */
+- /* grossly nonportable that doesn't matter. */
+-
+- if (((unsigned long) pdst & 3) != 0
+- /* Oops! n=0 must be a legal call, regardless of alignment. */
+- && n >= 3)
+- {
+- if ((unsigned long)dst & 1)
+- {
+- *dst = (char) lc;
+- n--;
+- dst++;
+- }
+-
+- if ((unsigned long)dst & 2)
+- {
+- *(short *)dst = lc;
+- n -= 2;
+- dst += 2;
+- }
+- }
+-
+- /* Now the fun part. For the threshold value of this, check the equation
+- above. */
+- /* Decide which copying method to use. */
+- if (n >= ZERO_BLOCK_SIZE)
+- {
+- /* For large copies we use 'movem' */
+-
+- /* It is not optimal to tell the compiler about clobbering any
+- registers; that will move the saving/restoring of those registers
+- to the function prologue/epilogue, and make non-movem sizes
+- suboptimal.
+-
+- This method is not foolproof; it assumes that the "asm reg"
+- declarations at the beginning of the function really are used
+- here (beware: they may be moved to temporary registers).
+- This way, we do not have to save/move the registers around into
+- temporaries; we can safely use them straight away.
+-
+- If you want to check that the allocation was right; then
+- check the equalities in the first comment. It should say
+- "r13=r13, r12=r12, r11=r11" */
+- __asm__ volatile ("
+- ;; Check that the following is true (same register names on
+- ;; both sides of equal sign, as in r8=r8):
+- ;; %0=r13, %1=r12, %4=r11
+- ;;
+- ;; Save the registers we'll clobber in the movem process
+- ;; on the stack. Don't mention them to gcc, it will only be
+- ;; upset.
+- subq 11*4,$sp
+- movem $r10,[$sp]
+-
+- move.d $r11,$r0
+- move.d $r11,$r1
+- move.d $r11,$r2
+- move.d $r11,$r3
+- move.d $r11,$r4
+- move.d $r11,$r5
+- move.d $r11,$r6
+- move.d $r11,$r7
+- move.d $r11,$r8
+- move.d $r11,$r9
+- move.d $r11,$r10
+-
+- ;; Now we've got this:
+- ;; r13 - dst
+- ;; r12 - n
+-
+- ;; Update n for the first loop
+- subq 12*4,$r12
+-0:
+- subq 12*4,$r12
+- bge 0b
+- movem $r11,[$r13+]
+-
+- addq 12*4,$r12 ;; compensate for last loop underflowing n
+-
+- ;; Restore registers from stack
+- movem [$sp+],$r10"
+-
+- /* Outputs */ : "=r" (dst), "=r" (n)
+- /* Inputs */ : "0" (dst), "1" (n), "r" (lc));
+-
+- }
+-
+- /* Either we directly starts copying, using dword copying
+- in a loop, or we copy as much as possible with 'movem'
+- and then the last block (<44 bytes) is copied here.
+- This will work since 'movem' will have updated src,dst,n. */
+-
+- while ( n >= 16 )
+- {
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- n -= 16;
+- }
++ char *xs = s;
+
+- /* A switch() is definitely the fastest although it takes a LOT of code.
+- * Particularly if you inline code this.
+- */
+- switch (n)
+- {
+- case 0:
+- break;
+- case 1:
+- *(char*)dst = (char) lc;
+- break;
+- case 2:
+- *(short*)dst = (short) lc;
+- break;
+- case 3:
+- *((short*)dst)++ = (short) lc;
+- *(char*)dst = (char) lc;
+- break;
+- case 4:
+- *((long*)dst)++ = lc;
+- break;
+- case 5:
+- *((long*)dst)++ = lc;
+- *(char*)dst = (char) lc;
+- break;
+- case 6:
+- *((long*)dst)++ = lc;
+- *(short*)dst = (short) lc;
+- break;
+- case 7:
+- *((long*)dst)++ = lc;
+- *((short*)dst)++ = (short) lc;
+- *(char*)dst = (char) lc;
+- break;
+- case 8:
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- break;
+- case 9:
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *(char*)dst = (char) lc;
+- break;
+- case 10:
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *(short*)dst = (short) lc;
+- break;
+- case 11:
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *((short*)dst)++ = (short) lc;
+- *(char*)dst = (char) lc;
+- break;
+- case 12:
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- break;
+- case 13:
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *(char*)dst = (char) lc;
+- break;
+- case 14:
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *(short*)dst = (short) lc;
+- break;
+- case 15:
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *((long*)dst)++ = lc;
+- *((short*)dst)++ = (short) lc;
+- *(char*)dst = (char) lc;
+- break;
+- }
+- }
++ while (count--)
++ *xs++ = c;
++ return s;
++}
+
+- return return_dst; /* destination pointer. */
+-} /* memset() */
+diff -urN linux-2.6.19.2.orig/arch/cris/arch-v10/lib/string.c linux-2.6.19.2/arch/cris/arch-v10/lib/string.c
+--- linux-2.6.19.2.orig/arch/cris/arch-v10/lib/string.c 2007-05-20 01:46:35.000000000 +0200
++++ linux-2.6.19.2/arch/cris/arch-v10/lib/string.c 2007-05-20 01:51:19.000000000 +0200
+@@ -33,193 +33,21 @@
+
+ #include <linux/types.h>
+
+-void *memcpy(void *pdst,
+- const void *psrc,
+- size_t pn)
++/**
++ * memcpy - Copy one area of memory to another
++ * @dest: Where to copy to
++ * @src: Where to copy from
++ * @count: The size of the area.
++ *
++ * You should not use this function to access IO space, use memcpy_toio()
++ * or memcpy_fromio() instead.
++ */
++void *memcpy(void *dest, const void *src, size_t count)
+ {
+- /* Ok. Now we want the parameters put in special registers.
+- Make sure the compiler is able to make something useful of this.
+- As it is now: r10 -> r13; r11 -> r11 (nop); r12 -> r12 (nop).
++ char *tmp = dest;
++ const char *s = src;
+
+- If gcc was allright, it really would need no temporaries, and no
+- stack space to save stuff on. */
+-
+- register void *return_dst __asm__ ("r10") = pdst;
+- register char *dst __asm__ ("r13") = pdst;
+- register const char *src __asm__ ("r11") = psrc;
+- register int n __asm__ ("r12") = pn;
+-
+-
+- /* When src is aligned but not dst, this makes a few extra needless
+- cycles. I believe it would take as many to check that the
+- re-alignment was unnecessary. */
+- if (((unsigned long) dst & 3) != 0
+- /* Don't align if we wouldn't copy more than a few bytes; so we
+- don't have to check further for overflows. */
+- && n >= 3)
+- {
+- if ((unsigned long) dst & 1)
+- {
+- n--;
+- *(char*)dst = *(char*)src;
+- src++;
+- dst++;
+- }
+-
+- if ((unsigned long) dst & 2)
+- {
+- n -= 2;
+- *(short*)dst = *(short*)src;
+- src += 2;
+- dst += 2;
+- }
+- }
+-
+- /* Decide which copying method to use. */
+- if (n >= 44*2) /* Break even between movem and
+- move16 is at 38.7*2, but modulo 44. */
+- {
+- /* For large copies we use 'movem' */
+-
+- /* It is not optimal to tell the compiler about clobbering any
+- registers; that will move the saving/restoring of those registers
+- to the function prologue/epilogue, and make non-movem sizes
+- suboptimal.
+-
+- This method is not foolproof; it assumes that the "asm reg"
+- declarations at the beginning of the function really are used
+- here (beware: they may be moved to temporary registers).
+- This way, we do not have to save/move the registers around into
+- temporaries; we can safely use them straight away.
+-
+- If you want to check that the allocation was right; then
+- check the equalities in the first comment. It should say
+- "r13=r13, r11=r11, r12=r12" */
+- __asm__ volatile ("
+- ;; Check that the following is true (same register names on
+- ;; both sides of equal sign, as in r8=r8):
+- ;; %0=r13, %1=r11, %2=r12
+- ;;
+- ;; Save the registers we'll use in the movem process
+- ;; on the stack.
+- subq 11*4,$sp
+- movem $r10,[$sp]
+-
+- ;; Now we've got this:
+- ;; r11 - src
+- ;; r13 - dst
+- ;; r12 - n
+-
+- ;; Update n for the first loop
+- subq 44,$r12
+-0:
+- movem [$r11+],$r10
+- subq 44,$r12
+- bge 0b
+- movem $r10,[$r13+]
+-
+- addq 44,$r12 ;; compensate for last loop underflowing n
+-
+- ;; Restore registers from stack
+- movem [$sp+],$r10"
+-
+- /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n)
+- /* Inputs */ : "0" (dst), "1" (src), "2" (n));
+-
+- }
+-
+- /* Either we directly starts copying, using dword copying
+- in a loop, or we copy as much as possible with 'movem'
+- and then the last block (<44 bytes) is copied here.
+- This will work since 'movem' will have updated src,dst,n. */
+-
+- while ( n >= 16 )
+- {
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- n -= 16;
+- }
+-
+- /* A switch() is definitely the fastest although it takes a LOT of code.
+- * Particularly if you inline code this.
+- */
+- switch (n)
+- {
+- case 0:
+- break;
+- case 1:
+- *(char*)dst = *(char*)src;
+- break;
+- case 2:
+- *(short*)dst = *(short*)src;
+- break;
+- case 3:
+- *((short*)dst)++ = *((short*)src)++;
+- *(char*)dst = *(char*)src;
+- break;
+- case 4:
+- *((long*)dst)++ = *((long*)src)++;
+- break;
+- case 5:
+- *((long*)dst)++ = *((long*)src)++;
+- *(char*)dst = *(char*)src;
+- break;
+- case 6:
+- *((long*)dst)++ = *((long*)src)++;
+- *(short*)dst = *(short*)src;
+- break;
+- case 7:
+- *((long*)dst)++ = *((long*)src)++;
+- *((short*)dst)++ = *((short*)src)++;
+- *(char*)dst = *(char*)src;
+- break;
+- case 8:
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- break;
+- case 9:
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *(char*)dst = *(char*)src;
+- break;
+- case 10:
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *(short*)dst = *(short*)src;
+- break;
+- case 11:
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *((short*)dst)++ = *((short*)src)++;
+- *(char*)dst = *(char*)src;
+- break;
+- case 12:
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- break;
+- case 13:
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *(char*)dst = *(char*)src;
+- break;
+- case 14:
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *(short*)dst = *(short*)src;
+- break;
+- case 15:
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *((long*)dst)++ = *((long*)src)++;
+- *((short*)dst)++ = *((short*)src)++;
+- *(char*)dst = *(char*)src;
+- break;
+- }
+-
+- return return_dst; /* destination pointer. */
+-} /* memcpy() */
++ while (count--)
++ *tmp++ = *s++;
++ return dest;
++}
+diff -urN linux-2.6.19.2.orig/arch/cris/arch-v10/lib/usercopy.c linux-2.6.19.2/arch/cris/arch-v10/lib/usercopy.c
+--- linux-2.6.19.2.orig/arch/cris/arch-v10/lib/usercopy.c 2007-05-16 22:11:26.000000000 +0200
++++ linux-2.6.19.2/arch/cris/arch-v10/lib/usercopy.c 2007-05-16 23:17:41.000000000 +0200
+@@ -88,63 +88,38 @@
+ If you want to check that the allocation was right; then
+ check the equalities in the first comment. It should say
+ "r13=r13, r11=r11, r12=r12". */
+- __asm__ volatile ("\
+- .ifnc %0%1%2%3,$r13$r11$r12$r10 \n\
+- .err \n\
+- .endif \n\
+-
+- ;; Save the registers we'll use in the movem process
+- ;; on the stack.
+- subq 11*4,$sp
+- movem $r10,[$sp]
+-
+- ;; Now we've got this:
+- ;; r11 - src
+- ;; r13 - dst
+- ;; r12 - n
+-
+- ;; Update n for the first loop
+- subq 44,$r12
+-
+-; Since the noted PC of a faulting instruction in a delay-slot of a taken
+-; branch, is that of the branch target, we actually point at the from-movem
+-; for this case. There is no ambiguity here; if there was a fault in that
+-; instruction (meaning a kernel oops), the faulted PC would be the address
+-; after *that* movem.
+-
+-0:
+- movem [$r11+],$r10
+- subq 44,$r12
+- bge 0b
+- movem $r10,[$r13+]
+-1:
+- addq 44,$r12 ;; compensate for last loop underflowing n
+-
+- ;; Restore registers from stack
+- movem [$sp+],$r10
+-2:
+- .section .fixup,\"ax\"
+-
+-; To provide a correct count in r10 of bytes that failed to be copied,
+-; we jump back into the loop if the loop-branch was taken. There is no
+-; performance penalty for sany use; the program will segfault soon enough.
+-
+-3:
+- move.d [$sp],$r10
+- addq 44,$r10
+- move.d $r10,[$sp]
+- jump 0b
+-4:
+- movem [$sp+],$r10
+- addq 44,$r10
+- addq 44,$r12
+- jump 2b
+-
+- .previous
+- .section __ex_table,\"a\"
+- .dword 0b,3b
+- .dword 1b,4b
+- .previous"
++ __asm__ volatile (
++ ".ifnc %0%1%2%3,$r13$r11$r12$r10 \n\t"
++ ".err \n\t"
++ ".endif \n\t"
++ "subq 11*4,$sp\n\t"
++ "movem $r10,[$sp]\n\t"
++ "subq 44,$r12\n\t"
++ "0:\n\t"
++ "movem [$r11+],$r10\n\t"
++ "subq 44,$r12\n\t"
++ "bge 0b\n\t"
++ "movem $r10,[$r13+]\n\t"
++ "1:\n\t"
++ "addq 44,$r12 \n\t"
++ "movem [$sp+],$r10\n\t"
++ "2:\n\t"
++ ".section .fixup,\"ax\"\n\t"
++ "3:\n\t"
++ "move.d [$sp],$r10\n\t"
++ "addq 44,$r10\n\t"
++ "move.d $r10,[$sp]\n\t"
++ "jump 0b\n\t"
++ "4:\n\t"
++ "movem [$sp+],$r10\n\t"
++ "addq 44,$r10\n\t"
++ "addq 44,$r12\n\t"
++ "jump 2b\n\t"
++ ".previous\n\t"
++ ".section __ex_table,\"a\"\n\t"
++ ".dword 0b,3b\n\t"
++ ".dword 1b,4b\n\t"
++ ".previous\n\t"
+
+ /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn)
+ /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn));
+@@ -253,60 +228,32 @@
+ If you want to check that the allocation was right; then
+ check the equalities in the first comment. It should say
+ "r13=r13, r11=r11, r12=r12" */
+- __asm__ volatile ("
+- .ifnc %0%1%2%3,$r13$r11$r12$r10 \n\
+- .err \n\
+- .endif \n\
+-
+- ;; Save the registers we'll use in the movem process
+- ;; on the stack.
+- subq 11*4,$sp
+- movem $r10,[$sp]
+-
+- ;; Now we've got this:
+- ;; r11 - src
+- ;; r13 - dst
+- ;; r12 - n
+-
+- ;; Update n for the first loop
+- subq 44,$r12
+-0:
+- movem [$r11+],$r10
+-1:
+- subq 44,$r12
+- bge 0b
+- movem $r10,[$r13+]
+-
+- addq 44,$r12 ;; compensate for last loop underflowing n
+-
+- ;; Restore registers from stack
+- movem [$sp+],$r10
+-4:
+- .section .fixup,\"ax\"
+-
+-;; Do not jump back into the loop if we fail. For some uses, we get a
+-;; page fault somewhere on the line. Without checking for page limits,
+-;; we don't know where, but we need to copy accurately and keep an
+-;; accurate count; not just clear the whole line. To do that, we fall
+-;; down in the code below, proceeding with smaller amounts. It should
+-;; be kept in mind that we have to cater to code like what at one time
+-;; was in fs/super.c:
+-;; i = size - copy_from_user((void *)page, data, size);
+-;; which would cause repeated faults while clearing the remainder of
+-;; the SIZE bytes at PAGE after the first fault.
+-;; A caveat here is that we must not fall through from a failing page
+-;; to a valid page.
+-
+-3:
+- movem [$sp+],$r10
+- addq 44,$r12 ;; Get back count before faulting point.
+- subq 44,$r11 ;; Get back pointer to faulting movem-line.
+- jump 4b ;; Fall through, pretending the fault didn't happen.
+-
+- .previous
+- .section __ex_table,\"a\"
+- .dword 1b,3b
+- .previous"
++ __asm__ volatile (
++ ".ifnc %0%1%2%3,$r13$r11$r12$r10 \n\t"
++ ".err \n\t"
++ ".endif \n\t"
++ "subq 11*4,$sp\n\t"
++ "movem $r10,[$sp]\n\t"
++ "subq 44,$r12\n\t"
++ "0:\n\t"
++ "movem [$r11+],$r10\n\t"
++ "1:\n\t"
++ "subq 44,$r12\n\t"
++ "bge 0b\n\t"
++ "movem $r10,[$r13+]\n\t"
++ "addq 44,$r12 \n\t"
++ "movem [$sp+],$r10\n\t"
++ "4:\n\t"
++ ".section .fixup,\"ax\"\n\t"
++ "3:\n\t"
++ "movem [$sp+],$r10\n\t"
++ "addq 44,$r12\n\t"
++ "subq 44,$r11\n\t"
++ "jump 4b \n\t"
++ ".previous\n\t"
++ ".section __ex_table,\"a\"\n\t"
++ ".dword 1b,3b\n\t"
++ ".previous\n\t"
+
+ /* Outputs */ : "=r" (dst), "=r" (src), "=r" (n), "=r" (retn)
+ /* Inputs */ : "0" (dst), "1" (src), "2" (n), "3" (retn));
+@@ -425,66 +372,50 @@
+ If you want to check that the allocation was right; then
+ check the equalities in the first comment. It should say
+ something like "r13=r13, r11=r11, r12=r12". */
+- __asm__ volatile ("
+- .ifnc %0%1%2,$r13$r12$r10 \n\
+- .err \n\
+- .endif \n\
+-
+- ;; Save the registers we'll clobber in the movem process
+- ;; on the stack. Don't mention them to gcc, it will only be
+- ;; upset.
+- subq 11*4,$sp
+- movem $r10,[$sp]
+-
+- clear.d $r0
+- clear.d $r1
+- clear.d $r2
+- clear.d $r3
+- clear.d $r4
+- clear.d $r5
+- clear.d $r6
+- clear.d $r7
+- clear.d $r8
+- clear.d $r9
+- clear.d $r10
+- clear.d $r11
+-
+- ;; Now we've got this:
+- ;; r13 - dst
+- ;; r12 - n
+-
+- ;; Update n for the first loop
+- subq 12*4,$r12
+-0:
+- subq 12*4,$r12
+- bge 0b
+- movem $r11,[$r13+]
+-1:
+- addq 12*4,$r12 ;; compensate for last loop underflowing n
+-
+- ;; Restore registers from stack
+- movem [$sp+],$r10
+-2:
+- .section .fixup,\"ax\"
+-3:
+- move.d [$sp],$r10
+- addq 12*4,$r10
+- move.d $r10,[$sp]
+- clear.d $r10
+- jump 0b
+-
+-4:
+- movem [$sp+],$r10
+- addq 12*4,$r10
+- addq 12*4,$r12
+- jump 2b
+-
+- .previous
+- .section __ex_table,\"a\"
+- .dword 0b,3b
+- .dword 1b,4b
+- .previous"
+-
++ __asm__ volatile (
++ ".ifnc %0%1%2,$r13$r12$r10\n\t"
++ ".err \n\t"
++ ".endif\n\t"
++ "subq 11*4,$sp\n\t"
++ "movem $r10,[$sp]\n\t"
++ "clear.d $r0\n\t"
++ "clear.d $r1\n\t"
++ "clear.d $r2\n\t"
++ "clear.d $r3\n\t"
++ "clear.d $r4\n\t"
++ "clear.d $r5\n\t"
++ "clear.d $r6\n\t"
++ "clear.d $r7\n\t"
++ "clear.d $r8\n\t"
++ "clear.d $r9\n\t"
++ "clear.d $r10\n\t"
++ "clear.d $r11\n\t"
++ "subq 12*4,$r12\n\t"
++ "0:\n\t"
++ "subq 12*4,$r12\n\t"
++ "bge 0b\n\t"
++ "movem $r11,[$r13+]\n\t"
++ "1: \n\t"
++ "addq 12*4,$r12 \n\t"
++ "movem [$sp+],$r10\n\t"
++ "2:\n\t"
++ ".section .fixup,\"ax\"\n\t"
++ "3:\n\t"
++ "move.d [$sp],$r10\n\t"
++ "addq 12*4,$r10\n\t"
++ "move.d $r10,[$sp]\n\t"
++ "clear.d $r10\n\t"
++ "jump 0b\n\t"
++ "4:\n\t"
++ "movem [$sp+],$r10\n\t"
++ "addq 12*4,$r10\n\t"
++ "addq 12*4,$r12\n\t"
++ "jump 2b\n\t"
++ ".previous\n\t"
++ ".section __ex_table,\"a\"\n\t"
++ ".dword 0b,3b\n\t"
++ ".dword 1b,4b\n\t"
++ ".previous\n\t"
+ /* Outputs */ : "=r" (dst), "=r" (n), "=r" (retn)
+ /* Inputs */ : "0" (dst), "1" (n), "2" (retn)
+ /* Clobber */ : "r11");