Subject: [PATCH] AVR32-optimized string operations

Add hand-optimized AVR32-specific string operations. Some of them
need a bit more testing, though.

---

 libc/string/avr32/Makefile      |   40 +++++++++++
 libc/string/avr32/bcopy.S       |   15 ++++
 libc/string/avr32/bzero.S       |   12 +++
 libc/string/avr32/memchr.S      |   62 +++++++++++++++++
 libc/string/avr32/memcmp.S      |   50 +++++++++++++
 libc/string/avr32/memcpy.S      |  110 ++++++++++++++++++++++++++++++
 libc/string/avr32/memmove.S     |  114 +++++++++++++++++++++++++++++++
 libc/string/avr32/memset.S      |   60 ++++++++++++++++
 libc/string/avr32/strcat.S      |   95 ++++++++++++++++++++++++++
 libc/string/avr32/strcmp.S      |   80 ++++++++++++++++++++++
 libc/string/avr32/strcpy.S      |   63 +++++++++++++++++
 libc/string/avr32/stringtest.c  |  144 ++++++++++++++++++++++++++++++++++++++++
 libc/string/avr32/strlen.S      |   52 ++++++++++++++
 libc/string/avr32/strncpy.S     |   77 +++++++++++++++++++++
 libc/string/avr32/test_memcpy.c |   66 ++++++++++++++++++
 15 files changed, 1040 insertions(+)

Index: uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/bcopy.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,15 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+	.text
+	.global bcopy
+	.type	bcopy, @function
+	.align	1
+bcopy:
+	/* Swap the first two arguments */
+	eor	r11, r12
+	eor	r12, r11
+	eor	r11, r12
+	rjmp	__memmove
+	.size	bcopy, . - bcopy
Index: uClibc-0.9.28-avr32/libc/string/avr32/bzero.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/bzero.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,12 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+	.text
+	.global bzero
+	.type	bzero, @function
+	.align	1
+bzero:
+	mov	r10, r11
+	mov	r11, 0
+	rjmp	__memset
Index: uClibc-0.9.28-avr32/libc/string/avr32/Makefile
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/Makefile	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,40 @@
+# Makefile for uClibc
+#
+# Copyright (C) 2000-2003 Erik Andersen <andersen@uclibc.org>
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU Library General Public License as published by the Free
+# Software Foundation; either version 2 of the License, or (at your option) any
+# later version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more
+# details.
+#
+# You should have received a copy of the GNU Library General Public License
+# along with this program; if not, write to the Free Software Foundation, Inc.,
+# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+TOPDIR=../../../
+include $(TOPDIR)Rules.mak
+
+SSRC	:= bcopy.S bzero.S memcmp.S memcpy.S memmove.S
+SSRC	+= memset.S strcmp.S strlen.S
+# memchr.S, strcat.S, strcpy.S, strncpy.S is broken
+SOBJS	:= $(patsubst %.S,%.o, $(SSRC))
+OBJS	:= $(SOBJS)
+
+OBJ_LIST:= ../../obj.string.$(TARGET_ARCH)
+
+all: $(OBJ_LIST)
+
+$(OBJ_LIST): $(OBJS)
+	echo $(addprefix string/$(TARGET_ARCH)/, $(OBJS)) > $@
+
+$(SOBJS): %.o: %.S
+	$(CC) $(ASFLAGS) -c $< -o $@
+	$(STRIPTOOL) -x -R .note -R .comment $@
+
+clean:
+	$(RM) *.[oa] *~ core
Index: uClibc-0.9.28-avr32/libc/string/avr32/memchr.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memchr.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define str r12
+#define chr r11
+#define len r10
+
+	.text
+	.global memchr
+	.type	memchr, @function
+memchr:
+	or	chr, chr, chr << 8
+	or	chr, chr, chr << 16
+
+	mov	r9, str
+	andl	r9, 3, COH
+	brne	.Lunaligned_str
+
+1:	sub	len, 4
+	brlt	2f
+	ld.w	r8, str++
+	psub.b	r9, r8, r11
+	tnbz	r9
+	brne	1b
+
+	sub	str, 4
+	bfextu	r9, r8, 24, 8
+	cp.b	r9, r11
+	reteq	str
+	sub	str, -1
+	bfextu	r9, r8, 16, 8
+	cp.b	r9, r11
+	reteq	str
+	sub	str, -1
+	bfextu	r9, r8, 8, 8
+	cp.b	r9, r11
+	reteq	str
+	sub	str, -1
+	retal	str
+
+2:	sub	len, -4
+	reteq	0
+
+3:	ld.ub	r8, str++
+	cp.w	r8, 0
+	reteq	str
+	sub	len, 1
+	brne	3b
+
+	retal	0
+
+.Lunaligned_str:
+1:	sub	len, 1
+	retlt	0
+	ld.ub	r8, str++
+	cp.b	r8, r11
+	reteq	str
+	sub	r9, 1
+	brge	1b
+
+	rjmp	.Laligned_search
Index: uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memcmp.S	2006-10-20 10:42:09.000000000 +0200
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2004 Atmel Norway.
+ */
+
+#define s1 r12
+#define s2 r11
+#define len r10
+
+	.text
+	.global memcmp
+	.type	memcmp, @function
+	.align	1
+memcmp:
+	sub	len, 4
+	brlt	.Lless_than_4
+
+1:	ld.w	r8, s1++
+	ld.w	r9, s2++
+	cp.w	r8, r9
+	brne	.Lfound_word
+	sub	len, 4
+	brge	1b
+
+.Lless_than_4:
+	sub	len, -4
+	reteq	0
+
+1:	ld.ub	r8, s1++
+	ld.ub	r9, s2++
+	sub	r8, r9
+	retne	r8
+	sub	len, 1
+	brgt	1b
+
+	retal	0
+
+.Lfound_word:
+	psub.b	r9, r8, r9
+	bfextu	r8, r9, 24, 8
+	retne	r8
+	bfextu	r8, r9, 16, 8
+	retne	r8
+	bfextu	r8, r9, 8, 8
+	retne	r8
+	retal	r9
+
+	.size	memcmp, . - memcmp
+
+	.weak	bcmp
+	bcmp = memcmp
Index: uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memcpy.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+/* Don't use r12 as dst since we must return it unmodified */
+#define dst r9
+#define src r11
+#define len r10
+
+	.text
+	.global	memcpy
+	.type	memcpy, @function
+
+	.global	__memcpy
+	.hidden	__memcpy
+	.type	__memcpy, @function
+memcpy:
+__memcpy:
+	pref	src[0]
+	mov	dst, r12
+
+	/* If we have less than 32 bytes, don't do anything fancy */
+	cp.w	len, 32
+	brge	.Lmore_than_31
+
+	sub	len, 1
+	retlt	r12
+1:	ld.ub	r8, src++
+	st.b	dst++, r8
+	sub	len, 1
+	brge	1b
+	retal	r12
+
+.Lmore_than_31:
+	pushm	r0-r7, lr
+
+	/* Check alignment */
+	mov	r8, src
+	andl	r8, 31, COH
+	brne	.Lunaligned_src
+	mov	r8, dst
+	andl	r8, 3, COH
+	brne	.Lunaligned_dst
+
+.Laligned_copy:
+	sub	len, 32
+	brlt	.Lless_than_32
+
+1:	/* Copy 32 bytes at a time */
+	ldm	src, r0-r7
+	sub	src, -32
+	stm	dst, r0-r7
+	sub	dst, -32
+	sub	len, 32
+	brge	1b
+
+.Lless_than_32:
+	/* Copy 16 more bytes if possible */
+	sub	len, -16
+	brlt	.Lless_than_16
+	ldm	src, r0-r3
+	sub	src, -16
+	sub	len, 16
+	stm	dst, r0-r3
+	sub	dst, -16
+
+.Lless_than_16:
+	/* Do the remaining as byte copies */
+	neg	len
+	add	pc, pc, len << 2
+	.rept	15
+	ld.ub	r0, src++
+	st.b	dst++, r0
+	.endr
+
+	popm	r0-r7, pc
+
+.Lunaligned_src:
+	/* Make src cacheline-aligned. r8 = (src & 31) */
+	rsub	r8, r8, 32
+	sub	len, r8
+1:	ld.ub	r0, src++
+	st.b	dst++, r0
+	sub	r8, 1
+	brne	1b
+
+	/* If dst is word-aligned, we're ready to go */
+	pref	src[0]
+	mov	r8, 3
+	tst	dst, r8
+	breq	.Laligned_copy
+
+.Lunaligned_dst:
+	/* src is aligned, but dst is not. Expect bad performance */
+	sub	len, 4
+	brlt	2f
+1:	ld.w	r0, src++
+	st.w	dst++, r0
+	sub	len, 4
+	brge	1b
+
+2:	neg	len
+	add	pc, pc, len << 2
+	.rept	3
+	ld.ub	r0, src++
+	st.b	dst++, r0
+	.endr
+
+	popm	r0-r7, pc
+	.size	memcpy, . - memcpy
Index: uClibc-0.9.28-avr32/libc/string/avr32/memmove.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memmove.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define dst r12
+#define src r11
+#define len r10
+
+	.text
+	.global memmove
+	.type	memmove, @function
+
+	.global	__memmove
+	.hidden	__memmove
+	.type	__memmove, @function
+memmove:
+__memmove:
+	cp.w	src, dst
+	brge	__memcpy
+
+	add	dst, len
+	add	src, len
+	pref	src[-1]
+
+	/*
+	 * The rest is basically the same as in memcpy.S except that
+	 * the direction is reversed.
+	 */
+	cp.w	len, 32
+	brge	.Lmore_than_31
+
+	sub	len, 1
+	retlt	r12
+1:	ld.ub	r8, --src
+	st.b	--dst, r8
+	sub	len, 1
+	brge	1b
+	retal	r12
+
+.Lmore_than_31:
+	pushm	r0-r7, lr
+
+	/* Check alignment */
+	mov	r8, src
+	andl	r8, 31, COH
+	brne	.Lunaligned_src
+	mov	r8, r12
+	andl	r8, 3, COH
+	brne	.Lunaligned_dst
+
+.Laligned_copy:
+	sub	len, 32
+	brlt	.Lless_than_32
+
+1:	/* Copy 32 bytes at a time */
+	sub	src, 32
+	ldm	src, r0-r7
+	sub	dst, 32
+	sub	len, 32
+	stm	dst, r0-r7
+	brge	1b
+
+.Lless_than_32:
+	/* Copy 16 more bytes if possible */
+	sub	len, -16
+	brlt	.Lless_than_16
+	sub	src, 16
+	ldm	src, r0-r3
+	sub	dst, 16
+	sub	len, 16
+	stm	dst, r0-r3
+
+.Lless_than_16:
+	/* Do the remaining as byte copies */
+	sub	len, -16
+	breq	2f
+1:	ld.ub	r0, --src
+	st.b	--dst, r0
+	sub	len, 1
+	brne	1b
+
+2:	popm	r0-r7, pc
+
+.Lunaligned_src:
+	/* Make src cacheline-aligned. r8 = (src & 31) */
+	sub	len, r8
+1:	ld.ub	r0, --src
+	st.b	--dst, r0
+	sub	r8, 1
+	brne	1b
+
+	/* If dst is word-aligned, we're ready to go */
+	pref	src[-4]
+	mov	r8, 3
+	tst	dst, r8
+	breq	.Laligned_copy
+
+.Lunaligned_dst:
+	/* src is aligned, but dst is not. Expect bad performance */
+	sub	len, 4
+	brlt	2f
+1:	ld.w	r0, --src
+	st.w	--dst, r0
+	sub	len, 4
+	brge	1b
+
+2:	neg	len
+	add	pc, pc, len << 2
+	.rept	3
+	ld.ub	r0, --src
+	st.b	--dst, r0
+	.endr
+
+	popm	r0-r7, pc
Index: uClibc-0.9.28-avr32/libc/string/avr32/memset.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/memset.S	2006-10-20 10:42:15.000000000 +0200
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2004 Atmel Norway.
+ */
+
+#define s r12
+#define c r11
+#define n r10
+
+	.text
+	.global memset
+	.type	memset, @function
+
+	.global	__memset
+	.hidden	__memset
+	.type	__memset, @function
+
+	.align	1
+memset:
+__memset:
+	cp.w	n, 32
+	mov	r9, s
+	brge	.Llarge_memset
+
+	sub	n, 1
+	retlt	s
+1:	st.b	s++, c
+	sub	n, 1
+	brge	1b
+
+	retal	r9
+
+.Llarge_memset:
+	mov	r8, r11
+	mov	r11, 3
+	bfins	r8, r8, 8, 8
+	bfins	r8, r8, 16, 16
+	tst	s, r11
+	breq	2f
+
+1:	st.b	s++, r8
+	sub	n, 1
+	tst	s, r11
+	brne	1b
+
+2:	mov	r11, r9
+	mov	r9, r8
+	sub	n, 8
+
+3:	st.d	s++, r8
+	sub	n, 8
+	brge	3b
+
+	/* If we are done, n == -8 and we'll skip all st.b insns below */
+	neg	n
+	lsl	n, 1
+	add	pc, n
+	.rept	7
+	st.b	s++, r8
+	.endr
+	retal	r11
Index: uClibc-0.9.28-avr32/libc/string/avr32/strcat.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strcat.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define s1 r9
+#define s2 r11
+
+	.text
+	.global strcat
+	.type	strcat, @function
+	.align	1
+strcat:
+	mov	s1, r12
+
+	/* Make sure s1 is word-aligned */
+	mov	r10, s1
+	andl	r10, 3, COH
+	breq	2f
+
+	add	pc, pc, r10 << 3
+	sub	r0, r0, 0	/* 4-byte nop */
+	ld.ub	r8, s1++
+	sub	r8, r8, 0
+	breq	2f
+	ld.ub	r8, s1++
+	sub	r8, r8, 0
+	breq	3f
+	ld.ub	r8, s1++
+	sub	r8, r8, 0
+	breq	4f
+
+	/* Find the end of the first string */
+5:	ld.w	r8, s1++
+	tnbz	r8
+	brne	5b
+
+	sub	s1, 4
+
+	bfextu	r10, r8, 24, 8
+	cp.w	r10, 0
+	breq	1f
+	sub	s1, -1
+	bfextu	r10, r8, 16, 8
+	cp.w	r10, 0
+	breq	2f
+	sub	s1, -1
+	bfextu	r10, r8, 8, 8
+	cp.w	r10, 0
+	breq	3f
+	sub	s1, -1
+	rjmp	4f
+
+	/* Now, append s2 */
+1:	ld.ub	r8, s2++
+	st.b	s1++, r8
+	cp.w	r8, 0
+	reteq	r12
+2:	ld.ub	r8, s2++
+	st.b	s1++, r8
+	cp.w	r8, 0
+	reteq	r12
+3:	ld.ub	r8, s2++
+	st.b	s1++, r8
+	cp.w	r8, 0
+	reteq	r12
+4:	ld.ub	r8, s2++
+	st.b	s1++, r8
+	cp.w	r8, 0
+	reteq	r12
+
+	/* Copy one word at a time */
+	ld.w	r8, s2++
+	tnbz	r8
+	breq	2f
+1:	st.w	r8, s2++
+	ld.w	r8, s2++
+	tnbz	r8
+	brne	1b
+
+	/* Copy the remaining bytes */
+	bfextu	r10, r8, 24, 8
+	st.b	s1++, r10
+	cp.w	r10, 0
+	reteq	r12
+	bfextu	r10, r8, 16, 8
+	st.b	s1++, r10
+	cp.w	r10, 0
+	reteq	r12
+	bfextu	r10, r8, 8, 8
+	st.b	s1++, r10
+	cp.w	r10, 0
+	reteq	r12
+	st.b	s1++, r8
+	retal	r12
+	.size	strcat, . - strcat
Index: uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strcmp.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2004 Atmel Norway.
+ */
+
+#define s1 r12
+#define s2 r11
+#define len r10
+
+	.text
+	.global strcmp
+	.type	strcmp, @function
+	.align	1
+strcmp:
+	mov	r8, 3
+	tst	s1, r8
+	brne	.Lunaligned_s1
+	tst	s2, r8
+	brne	.Lunaligned_s2
+
+1:	ld.w	r8, s1++
+	ld.w	r9, s2++
+	cp.w	r8, r9
+	brne	2f
+	tnbz	r8
+	brne	1b
+	retal	0
+
+2:	bfextu	r12, r8, 24, 8
+	bfextu	r11, r9, 24, 8
+	sub	r12, r11
+	retne	r12
+	cp.w	r11, 0
+	reteq	0
+	bfextu	r12, r8, 16, 8
+	bfextu	r11, r9, 16, 8
+	sub	r12, r11
+	retne	r12
+	cp.w	r11, 0
+	reteq	0
+	bfextu	r12, r8, 8, 8
+	bfextu	r11, r9, 8, 8
+	sub	r12, r11
+	retne	r12
+	cp.w	r11, 0
+	reteq	0
+	bfextu	r12, r8, 0, 8
+	bfextu	r11, r9, 0, 8
+	sub	r12, r11
+	retal	r12
+
+.Lunaligned_s1:
+3:	tst	s1, r8
+	breq	4f
+	ld.ub	r10, s1++
+	ld.ub	r9, s2++
+	sub	r10, r9
+	retne	r10
+	cp.w	r9, 0
+	brne	3b
+	retal	r10
+
+4:	tst	s2, r8
+	breq	1b
+
+.Lunaligned_s2:
+	/*
+	 * s1 and s2 can't both be aligned, and unaligned word loads
+	 * can trigger spurious exceptions if we cross a page boundary.
+	 * Do it the slow way...
+	 */
+1:	ld.ub	r8, s1++
+	ld.ub	r9, s2++
+	sub	r8, r9
+	retne	r8
+	cp.w	r9, 0
+	brne	1b
+	retal	0
+
+	.weak	strcoll
+	strcoll	= strcmp
Index: uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strcpy.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ *
+ * To reduce the size, this one might simply call strncpy with len = -1.
+ */
+
+#define dst r9
+#define src r11
+
+	.text
+	.global strcpy
+	.type	strcpy, @function
+strcpy:
+	mov	dst, r12
+
+	pref	src[0]
+
+	/*
+	 * Check alignment. If src is aligned but dst isn't, we can't
+	 * do much about it...
+	 */
+	mov	r8, src
+	andl	r8, 3 COH
+	brne	.Lunaligned_src
+
+.Laligned_copy:
+1:	ld.w	r8, src++
+	tnbz	r8
+	breq	2f
+	st.w	dst++, r8
+	rjmp	1b
+
+2:	/*
+	 * Ok, r8 now contains the terminating '\0'. Copy the
+	 * remaining bytes individually.
+	 */
+	bfextu	r10, r8, 24, 8
+	st.b	dst++, r10
+	cp.w	r10, 0
+	reteq	r12
+	bfextu	r10, r8, 16, 8
+	st.b	dst++, r10
+	cp.w	r10, 0
+	reteq	r12
+	bfextu	r10, r8, 8, 8
+	st.b	dst++, r10
+	cp.w	r10, 0
+	reteq	r12
+	st.b	dst++, r8
+	retal	r12
+
+.Lunaligned_src:
+	/* Copy bytes until we're aligned */
+	rsub	r8, r8, 4
+	add	pc, pc, r8 << 3
+	nop
+	nop
+	ld.ub	r10, src++
+	st.b	dst++, r10
+	cp.w	r10, 0
+	reteq	r12
+
+	rjmp	.Laligned_copy
Index: uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/stringtest.c	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,144 @@
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <sys/mman.h>
+
+#define BUF_SIZE (8 * 1024)
+
+static char *buf1;
+static char *buf1_ref;
+static char *buf2;
+
+extern void *optimized_memcpy(void *dest, void *src, size_t len);
+extern void *optimized_memmove(void *dest, void *src, size_t len);
+extern char *optimized_strcpy(char *dest, char *src);
+extern char *optimized_strncpy(char *dest, char *src, size_t len);
+
+void dump_mismatch(char *buf, char *ref, size_t len)
+{
+	int i, j;
+
+	for (i = 0; i < len; i += 16) {
+		if (memcmp(buf + i, ref + i, 16) == 0)
+			continue;
+
+		printf("%4x buf:", i);
+		for (j = i; j < (i + 16); j++)
+			printf(" %02x", buf[j]);
+		printf("\n     ref:");
+		for (j = i; j < (i + 16); j++)
+			printf(" %02x", ref[j]);
+		printf("\n");
+	}
+}
+
+static void test_memcpy(int src_offset, int dst_offset, int len)
+{
+	clock_t start, old, new;
+	int i;
+
+	memset(buf1, 0x55, BUF_SIZE);
+	memset(buf1_ref, 0x55, BUF_SIZE);
+	memset(buf2, 0xaa, BUF_SIZE);
+
+	printf("Testing memcpy with offsets %d => %d and len %d...",
+	       src_offset, dst_offset, len);
+
+	start = clock();
+	for (i = 0; i < 8192; i++)
+		optimized_memcpy(buf1 + dst_offset, buf2 + src_offset, len);
+	new = clock() - start;
+	start = clock();
+	for ( i = 0; i < 8192; i++)
+		memcpy(buf1_ref + dst_offset, buf2 + src_offset, len);
+	old = clock() - start;
+
+	if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0)
+		printf("OK\n");
+	else {
+		printf("FAILED\n");
+		dump_mismatch(buf1, buf1_ref, BUF_SIZE);
+	}
+	printf("CPU time used: %d vs. %d\n", new, old);
+}
+
+static void test_memmove(int src_offset, int dst_offset, int len)
+{
+	clock_t start, old, new;
+
+	memset(buf1, 0x55, BUF_SIZE);
+	memset(buf1_ref, 0x55, BUF_SIZE);
+	memset(buf2, 0xaa, BUF_SIZE);
+
+	printf("Testing memmove with offsets %d => %d and len %d...",
+	       src_offset, dst_offset, len);
+
+	start = clock();
+	optimized_memmove(buf1 + dst_offset, buf2 + src_offset, len);
+	new = clock() - start;
+	start = clock();
+	memmove(buf1_ref + dst_offset, buf2 + src_offset, len);
+	old = clock() - start;
+
+	if (memcmp(buf1, buf1_ref, BUF_SIZE) == 0)
+		printf("OK\n");
+	else {
+		printf("FAILED\n");
+		dump_mismatch(buf1, buf1_ref, BUF_SIZE);
+	}
+	printf("CPU time used: %d vs. %d\n", new, old);
+}
+
+int main(int argc, char *argv[])
+{
+	buf2 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	if (buf2 == MAP_FAILED) {
+		perror("Failed to allocate memory for buf2");
+		return 1;
+	}
+	buf1 = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
+		    MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	if (buf1 == MAP_FAILED) {
+		perror("Failed to allocate memory for buf1");
+		return 1;
+	}
+	buf1_ref = mmap(NULL, BUF_SIZE, PROT_READ | PROT_WRITE,
+			MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
+	if (buf1_ref == MAP_FAILED) {
+		perror("Failed to allocate memory for buf1_ref");
+		return 1;
+	}
+	printf("\n === MEMCPY ===\n\n");
+
+	test_memcpy(0, 0, BUF_SIZE - 32);
+	test_memcpy(0, 0, 1);
+	test_memcpy(0, 0, 31);
+	test_memcpy(0, 0, 32);
+	test_memcpy(0, 0, 127);
+	test_memcpy(0, 0, 128);
+	test_memcpy(4, 4, BUF_SIZE - 32 - 4);
+	test_memcpy(1, 1, BUF_SIZE - 32 - 1);
+	test_memcpy(1, 1, 126);
+	test_memcpy(0, 3, 128);
+	test_memcpy(1, 4, 128);
+	test_memcpy(0, 0, 0);
+
+	printf("\n === MEMMOVE ===\n\n");
+
+	test_memmove(0, 0, BUF_SIZE - 32);
+	test_memmove(0, 0, 1);
+	test_memmove(0, 0, 31);
+	test_memmove(0, 0, 32);
+	test_memmove(0, 0, BUF_SIZE - 33);
+	test_memmove(0, 0, 128);
+	test_memmove(4, 4, BUF_SIZE - 32 - 4);
+	test_memmove(1, 1, BUF_SIZE - 32 - 1);
+	test_memmove(1, 1, BUF_SIZE - 130);
+	test_memmove(0, 3, BUF_SIZE - 128);
+	test_memmove(1, 4, BUF_SIZE - 128);
+	test_memmove(0, 0, 0);
+
+	return 0;
+}
Index: uClibc-0.9.28-avr32/libc/string/avr32/strlen.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strlen.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,52 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define str r12
+
+	.text
+	.global strlen
+	.type	strlen, @function
+strlen:
+	mov	r11, r12
+
+	mov	r9, str
+	andl	r9, 3, COH
+	brne	.Lunaligned_str
+
+1:	ld.w	r8, str++
+	tnbz	r8
+	brne	1b
+
+	sub	r12, r11
+	bfextu	r9, r8, 24, 8
+	cp.w	r9, 0
+	subeq	r12, 4
+	reteq	r12
+	bfextu	r9, r8, 16, 8
+	cp.w	r9, 0
+	subeq	r12, 3
+	reteq	r12
+	bfextu	r9, r8, 8, 8
+	cp.w	r9, 0
+	subeq	r12, 2
+	reteq	r12
+	sub	r12, 1
+	retal	r12
+
+.Lunaligned_str:
+	add	pc, pc, r9 << 3
+	sub	r0, r0, 0	/* 4-byte nop */
+	ld.ub	r8, str++
+	sub	r8, r8, 0
+	breq	1f
+	ld.ub	r8, str++
+	sub	r8, r8, 0
+	breq	1f
+	ld.ub	r8, str++
+	sub	r8, r8, 0
+	brne	1b
+
+1:	sub	r12, 1
+	sub	r12, r11
+	retal	r12
Index: uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/strncpy.S	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2004 Atmel Norway
+ */
+
+#define dst r9
+#define src r11
+
+	.text
+	.global strcpy
+	.type	strncpy, @function
+strncpy:
+	mov	dst, r12
+
+	pref	src[0]
+	mov	dst, r12
+
+	/*
+	 * Check alignment. If src is aligned but dst isn't, we can't
+	 * do much about it...
+	 */
+	mov	r8, src
+	andl	r8, 3 COH
+	brne	.Lunaligned_src
+
+.Laligned_copy:
+	sub	r10, 4
+	brlt	3f
+1:	ld.w	r8, src++
+	tnbz	r8
+	breq	2f
+	st.w	dst++, r8
+	sub	r10, 4
+	brne	1b
+
+3:	sub	r10, -4
+	reteq	r12
+
+	/* This is safe as long as src is word-aligned and r10 > 0 */
+	ld.w	r8, src++
+
+2:	/*
+	 * Ok, r8 now contains the terminating '\0'. Copy the
+	 * remaining bytes individually.
+	 */
+	bfextu	r11, r8, 24, 8
+	st.b	dst++, r11
+	cp.w	r11, 0
+	reteq	r12
+	sub	r10, 1
+	reteq	r12
+	bfextu	r11, r8, 16, 8
+	st.b	dst++, r11
+	cp.w	r11, 0
+	reteq	r12
+	sub	r10, 1
+	reteq	r12
+	bfextu	r11, r8, 8, 8
+	st.b	dst++, r11
+	cp.w	r11, 0
+	reteq	r12
+	sub	r10, 1
+	reteq	r12
+	st.b	dst++, r8
+	retal	r12
+
+.Lunaligned_src:
+	/* Copy bytes until we're aligned */
+	min	r8, r8, r10
+	sub	r10, r8
+	sub	r8, 1
+	retlt	r12
+1:	ld.ub	r10, src++
+	st.b	dst++, r10
+	sub	r8, 1
+	brge	1b
+
+	rjmp	.Laligned_copy
Index: uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ uClibc-0.9.28-avr32/libc/string/avr32/test_memcpy.c	2006-10-19 15:05:52.000000000 +0200
@@ -0,0 +1,66 @@
+
+#include <stdio.h>
+#include <string.h>
+
+#define BUF_SIZE 32768
+
+static char buf1[BUF_SIZE] __attribute__((aligned(32)));
+static char buf1_ref[BUF_SIZE] __attribute__((aligned(32)));
+static char buf2[BUF_SIZE] __attribute__((aligned(32)));
+
+extern void *new_memcpy(void *dest, void *src, size_t len);
+
+void dump_mismatch(char *buf, char *ref, size_t len)
+{
+	int i, j;
+
+	for (i = 0; i < len; i += 16) {
+		if (memcmp(buf + i, ref + i, 16) == 0)
+			continue;
+
+		printf("% 4x buf:", i);
+		for (j = i; j < (i + 16); j++)
+			printf(" %02x", buf[j]);
+		printf("\n     ref:");
+		for (j = i; j < (i + 16); j++)
+			printf(" %02x", ref[j]);
+		printf("\n");
+	}
+}
+
+void test(int src_offset, int dst_offset, int len)
+{
+	memset(buf1, 0x55, sizeof(buf1));
+	memset(buf1_ref, 0x55, sizeof(buf1_ref));
+	memset(buf2, 0xaa, sizeof(buf2));
+
+	printf("Testing with offsets %d => %d and len %d...",
+	       src_offset, dst_offset, len);
+
+	new_memcpy(buf1 + dst_offset, buf2 + src_offset, len);
+	memcpy(buf1_ref + dst_offset, buf2 + src_offset, len);
+
+	if (memcmp(buf1, buf1_ref, sizeof(buf1)) == 0)
+		printf("OK\n");
+	else {
+		printf("FAILED\n");
+		dump_mismatch(buf1, buf1_ref, sizeof(buf1));
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	test(0, 0, BUF_SIZE);
+	test(0, 0, 1);
+	test(0, 0, 31);
+	test(0, 0, 32);
+	test(0, 0, 127);
+	test(0, 0, 128);
+	test(4, 4, BUF_SIZE - 4);
+	test(1, 1, BUF_SIZE - 1);
+	test(1, 1, 126);
+	test(0, 3, 128);
+	test(1, 4, 128);
+
+	return 0;
+}