/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
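
/*
 * For reference, the loop and folding sequence above compute roughly
 * the following C (an illustrative sketch only; the helper name and
 * types are made up, not taken from the kernel sources):
 *
 *	static unsigned short ip_fast_csum_ref(const void *iph,
 *					       unsigned int ihl)
 *	{
 *		const unsigned int *p = iph;
 *		unsigned long long sum = 0;
 *
 *		while (ihl--)				// ihl words, >= 5
 *			sum += *p++;
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 64 -> 33
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 33 -> 32
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 32 -> 17
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 17 -> 16
 *		return ~sum & 0xffff;			// 1's complement
 *	}
 */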

/*
 * Compute the checksum of a TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 *
 * There is no real gain in handling 64 bit specially here, but the
 * 32-bit additions may spill into the upper bits of the doubleword,
 * so we still must fold the result down from 64 bits.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
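
/*
 * For reference, the pseudo-header sum above is equivalent to this C
 * sketch (illustrative only; the helper name and argument types are
 * assumptions, not taken from this file):
 *
 *	static unsigned short csum_tcpudp_magic_ref(unsigned int saddr,
 *			unsigned int daddr, unsigned short len,
 *			unsigned char proto, unsigned int sum)
 *	{
 *		unsigned long long s = sum;
 *
 *		s += saddr;
 *		s += daddr;
 *		s += ((unsigned int)proto << 16) | len;	// the rlwimi
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 33
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 33 -> 32
 *		s = (s & 0xffff) + (s >> 16);		// fold 32 -> 17
 *		s = (s & 0xffff) + (s >> 16);		// fold 17 -> 16
 *		return ~s & 0xffff;			// 1's complement
 *	}
 */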

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)

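/*
 * (Assuming the ppc64 ELFv1 ABI: 112 bytes is the minimum stack frame,
 * a 48 byte header plus the 64 byte parameter save area, so STK_REG()
 * places the r14-r16 save slots just above that minimal frame.)
 */
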
/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd-
	 * aligned addresses should be rare and would require more work
	 * to calculate the correct checksum, we ignore that case and
	 * take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
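	/*
	 * (Eight adde instructions at 2 cycles each account for the 16
	 * cycle figure above: 64 bytes per 16 cycles, or 4 bytes/cycle.)
	 */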
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
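	/*
	 * (The pad above puts the lone byte in bits 8-15, matching how a
	 *  big-endian lhz would have loaded it as the high byte of a
	 *  halfword; this code targets big-endian.)
	 */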

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
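
/*
 * For reference, csum_partial above computes roughly the following C
 * (an illustrative sketch only; the names are made up, and the
 * doubleword alignment and 64-byte unrolling of the assembly are
 * omitted; assumes a big-endian host like this code does):
 *
 *	static unsigned long long add1c(unsigned long long a,
 *					unsigned long long b)
 *	{
 *		a += b;
 *		return a + (a < b);	// end-around carry, like adde
 *	}
 *
 *	static unsigned int csum_partial_ref(const void *buff, int len,
 *					     unsigned int sum)
 *	{
 *		const unsigned char *p = buff;
 *		unsigned long long s = sum, v;
 *		unsigned int w;
 *		unsigned short h;
 *
 *		for (; len >= 8; p += 8, len -= 8) {	// doublewords
 *			memcpy(&v, p, 8);
 *			s = add1c(s, v);
 *		}
 *		if (len & 4) {				// word tail
 *			memcpy(&w, p, 4);
 *			s = add1c(s, w);
 *			p += 4;
 *		}
 *		if (len & 2) {				// halfword tail
 *			memcpy(&h, p, 2);
 *			s = add1c(s, h);
 *			p += 2;
 *		}
 *		if (len & 1)	// byte tail, padded into the high half
 *			s = add1c(s, (unsigned long long)*p << 8);
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 33
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 33 -> 32
 *		return s;
 *	}
 */
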
.macro source
100:
	.section __ex_table,"a"
	.align	3
	.llong	100b,.Lsrc_error
	.previous
.endm

.macro dest
200:
	.section __ex_table,"a"
	.align	3
	.llong	200b,.Ldest_error
	.previous
.endm

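/*
 * Each source/dest marker used below places a local label on the
 * following load or store and records an (address, fixup) pair in the
 * __ex_table section. If the access faults, the exception handler
 * looks the faulting address up in __ex_table and resumes execution at
 * .Lsrc_error or .Ldest_error instead. For example,
 *
 *	source;	ld	r6,0(r3)
 *
 * expands to roughly:
 *
 *	100:	ld	r6,0(r3)
 *		.section __ex_table,"a"
 *		.align	3
 *		.llong	100b,.Lsrc_error
 *		.previous
 */
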
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any
 * action required in that case (zeroing memory, recalculating a
 * partial checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd-
	 * aligned addresses should be rare and would require more work
	 * to calculate the correct checksum, we ignore that case and
	 * take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are misaligned relative to each
	 * other, we align only the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr