Merge tag 'disintegrate-fbdev-20121220' of git://git.infradead.org/users/dhowells...
[GitHub/mt8127/android_kernel_alcatel_ttab.git] / arch / powerpc / lib / copyuser_power7.S
CommitLineData
a66086b8
AB
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2011
17 *
18 * Author: Anton Blanchard <anton@au.ibm.com>
19 */
20#include <asm/ppc_asm.h>
21
a66086b8
AB
/*
 * err1: place local label 100 at the point of use (it tags the instruction
 * written after "err1" on the same source line) and emit an __ex_table
 * entry that pairs the address of that label with .Ldo_err1.
 */
22 .macro err1
23100:
24 .section __ex_table,"a"
25 .align 3
26 .llong 100b,.Ldo_err1
27 .previous
28 .endm
29
/*
 * err2: place local label 200 at the point of use (it tags the instruction
 * written after "err2" on the same source line) and emit an __ex_table
 * entry that pairs the address of that label with .Ldo_err2.
 */
30 .macro err2
31200:
32 .section __ex_table,"a"
33 .align 3
34 .llong 200b,.Ldo_err2
35 .previous
36 .endm
37
38#ifdef CONFIG_ALTIVEC
/*
 * err3: place local label 300 at the point of use (it tags the instruction
 * written after "err3" on the same source line) and emit an __ex_table
 * entry that pairs the address of that label with .Ldo_err3.
 */
39 .macro err3
40300:
41 .section __ex_table,"a"
42 .align 3
43 .llong 300b,.Ldo_err3
44 .previous
45 .endm
46
/*
 * err4: place local label 400 at the point of use (it tags the instruction
 * written after "err4" on the same source line) and emit an __ex_table
 * entry that pairs the address of that label with .Ldo_err4.
 */
47 .macro err4
48400:
49 .section __ex_table,"a"
50 .align 3
51 .llong 400b,.Ldo_err4
52 .previous
53 .endm
54
55
/*
 * Fixup target for err4-tagged instructions (the VMX cacheline loops).
 * Those loops save r14-r16 in the stack frame, so reload them here and
 * then fall through into .Ldo_err3.
 */
56.Ldo_err4:
c75df6f9
MN
57 ld r16,STK_REG(R16)(r1)
58 ld r15,STK_REG(R15)(r1)
59 ld r14,STK_REG(R14)(r1)
/*
 * Fixup target for err3-tagged instructions. Call exit_vmx_usercopy,
 * restore LR from the slot at STACKFRAMESIZE+16(r1) and continue at
 * .Lexit, which pops the stack frame.
 */
a66086b8 60.Ldo_err3:
6f7839e5 61 bl .exit_vmx_usercopy
a66086b8
AB
62 ld r0,STACKFRAMESIZE+16(r1)
63 mtlr r0
64 b .Lexit
65#endif /* CONFIG_ALTIVEC */
66
/*
 * Fixup target for err2-tagged instructions (the non-VMX 128B loop, which
 * runs with a stack frame and r14-r22 saved in it). Reload r14-r22, then
 * fall through to .Lexit.
 */
67.Ldo_err2:
c75df6f9
MN
68 ld r22,STK_REG(R22)(r1)
69 ld r21,STK_REG(R21)(r1)
70 ld r20,STK_REG(R20)(r1)
71 ld r19,STK_REG(R19)(r1)
72 ld r18,STK_REG(R18)(r1)
73 ld r17,STK_REG(R17)(r1)
74 ld r16,STK_REG(R16)(r1)
75 ld r15,STK_REG(R15)(r1)
76 ld r14,STK_REG(R14)(r1)
a66086b8
AB
/* Pop the stack frame, then fall through to .Ldo_err1. */
77.Lexit:
78 addi r1,r1,STACKFRAMESIZE
/*
 * Fixup target for err1-tagged instructions (no stack frame active).
 * Reload the original r3, r4 and r5 that the entry code stored at 48, 56
 * and 64(r1) and branch to __copy_tofrom_user_base with them.
 * NOTE(review): presumably the base routine redoes the copy and produces
 * the final return value - confirm against its definition.
 */
79.Ldo_err1:
80 ld r3,48(r1)
81 ld r4,56(r1)
82 ld r5,64(r1)
83 b __copy_tofrom_user_base
84
85
/*
 * __copy_tofrom_user_power7
 *
 * In:  r3 = destination address (all stores below go through r3)
 *      r4 = source address (all loads below go through r4)
 *      r5 = byte count
 *
 * The incoming r3, r4 and r5 are stored at 48, 56 and 64(r1) so that the
 * .Ldo_err1 fixup path can reload them. Counts below 16 go straight to
 * .Lshort_copy. With CONFIG_ALTIVEC, counts above 4096 go to .Lvmx_copy.
 * Everything else falls through to .Lnonvmx_copy.
 */
86_GLOBAL(__copy_tofrom_user_power7)
87#ifdef CONFIG_ALTIVEC
88 cmpldi r5,16
89 cmpldi cr1,r5,4096
90
91 std r3,48(r1)
92 std r4,56(r1)
93 std r5,64(r1)
94
95 blt .Lshort_copy
96 bgt cr1,.Lvmx_copy
97#else
98 cmpldi r5,16
99
100 std r3,48(r1)
101 std r4,56(r1)
102 std r5,64(r1)
103
104 blt .Lshort_copy
105#endif
106
/*
 * Non-VMX copy path. Steps:
 *   1. copy 1, 2 and 4 bytes as needed to bring the source to 8B alignment
 *   2. if at least 128 bytes remain, copy 128B blocks with GPR loads/stores
 *   3. copy the remaining 64, 32 and 16 byte pieces
 *   4. fall through to .Lshort_copy for the last 0-15 bytes
 * Accesses made without a stack frame are tagged err1, the ones inside the
 * 128B loop (frame active, r14-r22 saved) are tagged err2.
 */
107.Lnonvmx_copy:
108 /* Get the source 8B aligned */
109 neg r6,r4
/* Low 4 bits of -source go to cr7 for the bf tests below */
110 mtocrf 0x01,r6
/* r6 = number of bytes needed to reach 8B alignment (0-7) */
111 clrldi r6,r6,(64-3)
112
113 bf cr7*4+3,1f
114err1; lbz r0,0(r4)
115 addi r4,r4,1
116err1; stb r0,0(r3)
117 addi r3,r3,1
118
1191: bf cr7*4+2,2f
120err1; lhz r0,0(r4)
121 addi r4,r4,2
122err1; sth r0,0(r3)
123 addi r3,r3,2
124
1252: bf cr7*4+1,3f
126err1; lwz r0,0(r4)
127 addi r4,r4,4
128err1; stw r0,0(r3)
129 addi r3,r3,4
130
/* r5 = bytes left after alignment. Skip the 128B loop if fewer than 128. */
1313: sub r5,r5,r6
132 cmpldi r5,128
133 blt 5f
134
/*
 * Create a stack frame and save r14-r22 in it. The loop below uses
 * r14-r21 as additional data registers (r22 is saved and restored but
 * is not otherwise referenced in this block). LR is stored at
 * STACKFRAMESIZE+16(r1).
 */
135 mflr r0
136 stdu r1,-STACKFRAMESIZE(r1)
c75df6f9
MN
137 std r14,STK_REG(R14)(r1)
138 std r15,STK_REG(R15)(r1)
139 std r16,STK_REG(R16)(r1)
140 std r17,STK_REG(R17)(r1)
141 std r18,STK_REG(R18)(r1)
142 std r19,STK_REG(R19)(r1)
143 std r20,STK_REG(R20)(r1)
144 std r21,STK_REG(R21)(r1)
145 std r22,STK_REG(R22)(r1)
a66086b8
AB
146 std r0,STACKFRAMESIZE+16(r1)
147
/* ctr = number of whole 128B blocks */
148 srdi r6,r5,7
149 mtctr r6
150
151 /* Now do cacheline (128B) sized loads and stores. */
152 .align 5
1534:
154err2; ld r0,0(r4)
155err2; ld r6,8(r4)
156err2; ld r7,16(r4)
157err2; ld r8,24(r4)
158err2; ld r9,32(r4)
159err2; ld r10,40(r4)
160err2; ld r11,48(r4)
161err2; ld r12,56(r4)
162err2; ld r14,64(r4)
163err2; ld r15,72(r4)
164err2; ld r16,80(r4)
165err2; ld r17,88(r4)
166err2; ld r18,96(r4)
167err2; ld r19,104(r4)
168err2; ld r20,112(r4)
169err2; ld r21,120(r4)
170 addi r4,r4,128
171err2; std r0,0(r3)
172err2; std r6,8(r3)
173err2; std r7,16(r3)
174err2; std r8,24(r3)
175err2; std r9,32(r3)
176err2; std r10,40(r3)
177err2; std r11,48(r3)
178err2; std r12,56(r3)
179err2; std r14,64(r3)
180err2; std r15,72(r3)
181err2; std r16,80(r3)
182err2; std r17,88(r3)
183err2; std r18,96(r3)
184err2; std r19,104(r3)
185err2; std r20,112(r3)
186err2; std r21,120(r3)
187 addi r3,r3,128
188 bdnz 4b
189
/* r5 = bytes left after the 128B blocks (0-127) */
190 clrldi r5,r5,(64-7)
191
/* Restore r14-r22 and pop the frame. From here on faults use err1 again. */
c75df6f9
MN
192 ld r14,STK_REG(R14)(r1)
193 ld r15,STK_REG(R15)(r1)
194 ld r16,STK_REG(R16)(r1)
195 ld r17,STK_REG(R17)(r1)
196 ld r18,STK_REG(R18)(r1)
197 ld r19,STK_REG(R19)(r1)
198 ld r20,STK_REG(R20)(r1)
199 ld r21,STK_REG(R21)(r1)
200 ld r22,STK_REG(R22)(r1)
a66086b8
AB
201 addi r1,r1,STACKFRAMESIZE
202
203 /* Up to 127B to go */
/* cr7 = low bits of (r5 >> 4), selecting the 64B, 32B and 16B steps */
2045: srdi r6,r5,4
205 mtocrf 0x01,r6
206
2076: bf cr7*4+1,7f
208err1; ld r0,0(r4)
209err1; ld r6,8(r4)
210err1; ld r7,16(r4)
211err1; ld r8,24(r4)
212err1; ld r9,32(r4)
213err1; ld r10,40(r4)
214err1; ld r11,48(r4)
215err1; ld r12,56(r4)
216 addi r4,r4,64
217err1; std r0,0(r3)
218err1; std r6,8(r3)
219err1; std r7,16(r3)
220err1; std r8,24(r3)
221err1; std r9,32(r3)
222err1; std r10,40(r3)
223err1; std r11,48(r3)
224err1; std r12,56(r3)
225 addi r3,r3,64
226
227 /* Up to 63B to go */
2287: bf cr7*4+2,8f
229err1; ld r0,0(r4)
230err1; ld r6,8(r4)
231err1; ld r7,16(r4)
232err1; ld r8,24(r4)
233 addi r4,r4,32
234err1; std r0,0(r3)
235err1; std r6,8(r3)
236err1; std r7,16(r3)
237err1; std r8,24(r3)
238 addi r3,r3,32
239
240 /* Up to 31B to go */
2418: bf cr7*4+3,9f
242err1; ld r0,0(r4)
243err1; ld r6,8(r4)
244 addi r4,r4,16
245err1; std r0,0(r3)
246err1; std r6,8(r3)
247 addi r3,r3,16
248
/* r5 = bytes left (0-15), handled by .Lshort_copy below */
2499: clrldi r5,r5,(64-4)
250
251 /* Up to 15B to go */
/*
 * Copy the final 0-15 bytes. The low four bits of r5 are moved into cr7
 * and select, in turn, an 8, 4, 2 and 1 byte copy. On completion r3 is
 * set to 0 and the routine returns.
 */
252.Lshort_copy:
253 mtocrf 0x01,r5
254 bf cr7*4+0,12f
255err1; lwz r0,0(r4) /* Less chance of a reject with word ops */
256err1; lwz r6,4(r4)
257 addi r4,r4,8
258err1; stw r0,0(r3)
259err1; stw r6,4(r3)
260 addi r3,r3,8
261
26212: bf cr7*4+1,13f
263err1; lwz r0,0(r4)
264 addi r4,r4,4
265err1; stw r0,0(r3)
266 addi r3,r3,4
267
26813: bf cr7*4+2,14f
269err1; lhz r0,0(r4)
270 addi r4,r4,2
271err1; sth r0,0(r3)
272 addi r3,r3,2
273
27414: bf cr7*4+3,15f
275err1; lbz r0,0(r4)
276err1; stb r0,0(r3)
277
/* Return 0 in r3 */
27815: li r3,0
279 blr
280
/*
 * Reached from .Lvmx_copy when enter_vmx_usercopy returned 0. Pop the
 * stack frame that .Lvmx_copy created and use the non-VMX copy instead.
 */
281.Lunwind_stack_nonvmx_copy:
282 addi r1,r1,STACKFRAMESIZE
283 b .Lnonvmx_copy
284
285#ifdef CONFIG_ALTIVEC
/*
 * VMX copy path, taken from the entry code for byte counts above 4096.
 *
 * Saves LR, creates a stack frame and calls enter_vmx_usercopy. Its
 * result is compared with 0 into cr1 and acted on after the prefetch
 * streams have been started: a 0 result unwinds the frame and falls back
 * to .Lnonvmx_copy. r3, r4 and r5 are reloaded from the slots the entry
 * code stored them in.
 *
 * If source and destination differ in their low 4 address bits the copy
 * continues at .Lvmx_unaligned_copy. Otherwise the destination is aligned
 * to 16B, then to 128B, whole 128B blocks are copied with lvx/stvx, and
 * the tail is copied in 64/32/16/8/4/2/1 byte steps.
 *
 * Accesses tagged err3 fault to .Ldo_err3. Accesses in the 128B loop,
 * where r14-r16 are in use as index registers, are tagged err4.
 *
 * For dcbt, dcbtst, lvx and stvx an RA operand of 0 means "no base
 * register". It is written as the literal 0 throughout, not as r0.
 */
286.Lvmx_copy:
287 mflr r0
288 std r0,16(r1)
289 stdu r1,-STACKFRAMESIZE(r1)
6f7839e5 290 bl .enter_vmx_usercopy
/* Keep the result in cr1. It is tested after the prefetch setup below. */
2fae7cdb 291 cmpwi cr1,r3,0
a66086b8
AB
292 ld r0,STACKFRAMESIZE+16(r1)
293 ld r3,STACKFRAMESIZE+48(r1)
294 ld r4,STACKFRAMESIZE+56(r1)
295 ld r5,STACKFRAMESIZE+64(r1)
296 mtlr r0
297
a9514dc6
AB
298 /*
299 * We prefetch both the source and destination using enhanced touch
300 * instructions. We use a stream ID of 0 for the load side and
301 * 1 for the store side.
302 */
303 clrrdi r6,r4,7
304 clrrdi r9,r3,7
305 ori r9,r9,1 /* stream=1 */
306
307 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */
308 cmpldi r7,0x3FF
309 ble 1f
310 li r7,0x3FF
3111: lis r0,0x0E00 /* depth=7 */
312 sldi r7,r7,7
313 or r7,r7,r0
314 ori r10,r7,1 /* stream=1 */
315
316 lis r8,0x8000 /* GO=1 */
317 clrldi r8,r8,32
318
/*
 * RA is the literal 0 in the touch instructions below. r0 still holds
 * the depth constant loaded above and is not an address operand here.
 */
319.machine push
320.machine "power4"
321 dcbt 0,r6,0b01000
322 dcbt 0,r7,0b01010
323 dcbtst 0,r9,0b01000
324 dcbtst 0,r10,0b01010
325 eieio
326 dcbt 0,r8,0b01010 /* GO */
327.machine pop
328
/* enter_vmx_usercopy returned 0: drop the frame and copy without VMX */
2fae7cdb 329 beq cr1,.Lunwind_stack_nonvmx_copy
a66086b8
AB
330
331 /*
332 * If source and destination are not relatively aligned we use a
333 * slower permute loop.
334 */
335 xor r6,r4,r3
336 rldicl. r6,r6,0,(64-4)
337 bne .Lvmx_unaligned_copy
338
339 /* Get the destination 16B aligned */
340 neg r6,r3
341 mtocrf 0x01,r6
342 clrldi r6,r6,(64-4)
343
344 bf cr7*4+3,1f
345err3; lbz r0,0(r4)
346 addi r4,r4,1
347err3; stb r0,0(r3)
348 addi r3,r3,1
349
3501: bf cr7*4+2,2f
351err3; lhz r0,0(r4)
352 addi r4,r4,2
353err3; sth r0,0(r3)
354 addi r3,r3,2
355
3562: bf cr7*4+1,3f
357err3; lwz r0,0(r4)
358 addi r4,r4,4
359err3; stw r0,0(r3)
360 addi r3,r3,4
361
3623: bf cr7*4+0,4f
363err3; ld r0,0(r4)
364 addi r4,r4,8
365err3; std r0,0(r3)
366 addi r3,r3,8
367
/* r5 = bytes left after the 16B alignment step */
3684: sub r5,r5,r6
369
370 /* Get the destination 128B aligned */
371 neg r6,r3
372 srdi r7,r6,4
373 mtocrf 0x01,r7
374 clrldi r6,r6,(64-7)
375
/* Index registers for the 16, 32 and 48 byte offsets */
376 li r9,16
377 li r10,32
378 li r11,48
379
380 bf cr7*4+3,5f
381err3; lvx vr1,0,r4
382 addi r4,r4,16
383err3; stvx vr1,0,r3
384 addi r3,r3,16
385
3865: bf cr7*4+2,6f
387err3; lvx vr1,0,r4
388err3; lvx vr0,r4,r9
389 addi r4,r4,32
390err3; stvx vr1,0,r3
391err3; stvx vr0,r3,r9
392 addi r3,r3,32
393
3946: bf cr7*4+1,7f
395err3; lvx vr3,0,r4
396err3; lvx vr2,r4,r9
397err3; lvx vr1,r4,r10
398err3; lvx vr0,r4,r11
399 addi r4,r4,64
400err3; stvx vr3,0,r3
401err3; stvx vr2,r3,r9
402err3; stvx vr1,r3,r10
403err3; stvx vr0,r3,r11
404 addi r3,r3,64
405
/* r5 = bytes left after the 128B alignment step, r6 = 128B block count */
4067: sub r5,r5,r6
407 srdi r6,r5,7
408
/* r14-r16 are needed as index registers. Faults from here use err4. */
c75df6f9
MN
409 std r14,STK_REG(R14)(r1)
410 std r15,STK_REG(R15)(r1)
411 std r16,STK_REG(R16)(r1)
a66086b8
AB
412
413 li r12,64
414 li r14,80
415 li r15,96
416 li r16,112
417
418 mtctr r6
419
420 /*
421 * Now do cacheline sized loads and stores. By this stage the
422 * cacheline stores are also cacheline aligned.
423 */
424 .align 5
4258:
426err4; lvx vr7,0,r4
427err4; lvx vr6,r4,r9
428err4; lvx vr5,r4,r10
429err4; lvx vr4,r4,r11
430err4; lvx vr3,r4,r12
431err4; lvx vr2,r4,r14
432err4; lvx vr1,r4,r15
433err4; lvx vr0,r4,r16
434 addi r4,r4,128
435err4; stvx vr7,0,r3
436err4; stvx vr6,r3,r9
437err4; stvx vr5,r3,r10
438err4; stvx vr4,r3,r11
439err4; stvx vr3,r3,r12
440err4; stvx vr2,r3,r14
441err4; stvx vr1,r3,r15
442err4; stvx vr0,r3,r16
443 addi r3,r3,128
444 bdnz 8b
445
c75df6f9
MN
446 ld r14,STK_REG(R14)(r1)
447 ld r15,STK_REG(R15)(r1)
448 ld r16,STK_REG(R16)(r1)
a66086b8
AB
449
450 /* Up to 127B to go */
451 clrldi r5,r5,(64-7)
452 srdi r6,r5,4
453 mtocrf 0x01,r6
454
455 bf cr7*4+1,9f
456err3; lvx vr3,0,r4
457err3; lvx vr2,r4,r9
458err3; lvx vr1,r4,r10
459err3; lvx vr0,r4,r11
460 addi r4,r4,64
461err3; stvx vr3,0,r3
462err3; stvx vr2,r3,r9
463err3; stvx vr1,r3,r10
464err3; stvx vr0,r3,r11
465 addi r3,r3,64
466
4679: bf cr7*4+2,10f
468err3; lvx vr1,0,r4
469err3; lvx vr0,r4,r9
470 addi r4,r4,32
471err3; stvx vr1,0,r3
472err3; stvx vr0,r3,r9
473 addi r3,r3,32
474
47510: bf cr7*4+3,11f
476err3; lvx vr1,0,r4
477 addi r4,r4,16
478err3; stvx vr1,0,r3
479 addi r3,r3,16
480
481 /* Up to 15B to go */
48211: clrldi r5,r5,(64-4)
483 mtocrf 0x01,r5
484 bf cr7*4+0,12f
485err3; ld r0,0(r4)
486 addi r4,r4,8
487err3; std r0,0(r3)
488 addi r3,r3,8
489
49012: bf cr7*4+1,13f
491err3; lwz r0,0(r4)
492 addi r4,r4,4
493err3; stw r0,0(r3)
494 addi r3,r3,4
495
49613: bf cr7*4+2,14f
497err3; lhz r0,0(r4)
498 addi r4,r4,2
499err3; sth r0,0(r3)
500 addi r3,r3,2
501
50214: bf cr7*4+3,15f
503err3; lbz r0,0(r4)
504err3; stb r0,0(r3)
505
/* Pop the frame. exit_vmx_usercopy returns directly to our caller. */
50615: addi r1,r1,STACKFRAMESIZE
6f7839e5 507 b .exit_vmx_usercopy /* tail call optimise */
a66086b8
AB
508
/*
 * VMX copy path for the case where source and destination differ in
 * their low 4 address bits (branched to from .Lvmx_copy, which has
 * already created the stack frame and called enter_vmx_usercopy).
 *
 * The destination is aligned to 16B and then 128B. Source data is read
 * with lvx, and each output quadword is built by vperm from two
 * neighbouring source quadwords using the control vector that lvsl
 * derives from the source address (vr16). vr0 always carries the most
 * recently loaded source quadword into the next step, so r4 runs 16
 * bytes ahead of the data consumed. That offset is undone at label 11
 * before the scalar tail copy.
 *
 * Accesses tagged err3 fault to .Ldo_err3. Accesses in the 128B loop,
 * where r14-r16 are in use as index registers, are tagged err4.
 *
 * For lvx and stvx an RA operand of 0 means "no base register". It is
 * written as the literal 0 throughout, not as r0.
 */
509.Lvmx_unaligned_copy:
510 /* Get the destination 16B aligned */
511 neg r6,r3
512 mtocrf 0x01,r6
513 clrldi r6,r6,(64-4)
514
515 bf cr7*4+3,1f
516err3; lbz r0,0(r4)
517 addi r4,r4,1
518err3; stb r0,0(r3)
519 addi r3,r3,1
520
5211: bf cr7*4+2,2f
522err3; lhz r0,0(r4)
523 addi r4,r4,2
524err3; sth r0,0(r3)
525 addi r3,r3,2
526
5272: bf cr7*4+1,3f
528err3; lwz r0,0(r4)
529 addi r4,r4,4
530err3; stw r0,0(r3)
531 addi r3,r3,4
532
5333: bf cr7*4+0,4f
534err3; lwz r0,0(r4) /* Less chance of a reject with word ops */
535err3; lwz r7,4(r4)
536 addi r4,r4,8
537err3; stw r0,0(r3)
538err3; stw r7,4(r3)
539 addi r3,r3,8
540
/* r5 = bytes left after the 16B alignment step */
5414: sub r5,r5,r6
542
543 /* Get the destination 128B aligned */
544 neg r6,r3
545 srdi r7,r6,4
546 mtocrf 0x01,r7
547 clrldi r6,r6,(64-7)
548
/* Index registers for the 16, 32 and 48 byte offsets */
549 li r9,16
550 li r10,32
551 li r11,48
552
553 lvsl vr16,0,r4 /* Setup permute control vector */
/* Preload the first source quadword. r4 now runs 16 bytes ahead. */
554err3; lvx vr0,0,r4
555 addi r4,r4,16
556
557 bf cr7*4+3,5f
558err3; lvx vr1,0,r4
559 vperm vr8,vr0,vr1,vr16
560 addi r4,r4,16
561err3; stvx vr8,0,r3
562 addi r3,r3,16
/* Carry the last loaded quadword over into vr0 for the next step */
563 vor vr0,vr1,vr1
564
5655: bf cr7*4+2,6f
566err3; lvx vr1,0,r4
567 vperm vr8,vr0,vr1,vr16
568err3; lvx vr0,r4,r9
569 vperm vr9,vr1,vr0,vr16
570 addi r4,r4,32
571err3; stvx vr8,0,r3
572err3; stvx vr9,r3,r9
573 addi r3,r3,32
574
5756: bf cr7*4+1,7f
576err3; lvx vr3,0,r4
577 vperm vr8,vr0,vr3,vr16
578err3; lvx vr2,r4,r9
579 vperm vr9,vr3,vr2,vr16
580err3; lvx vr1,r4,r10
581 vperm vr10,vr2,vr1,vr16
582err3; lvx vr0,r4,r11
583 vperm vr11,vr1,vr0,vr16
584 addi r4,r4,64
585err3; stvx vr8,0,r3
586err3; stvx vr9,r3,r9
587err3; stvx vr10,r3,r10
588err3; stvx vr11,r3,r11
589 addi r3,r3,64
590
/* r5 = bytes left after the 128B alignment step, r6 = 128B block count */
5917: sub r5,r5,r6
592 srdi r6,r5,7
593
/* r14-r16 are needed as index registers. Faults from here use err4. */
c75df6f9
MN
594 std r14,STK_REG(R14)(r1)
595 std r15,STK_REG(R15)(r1)
596 std r16,STK_REG(R16)(r1)
a66086b8
AB
597
598 li r12,64
599 li r14,80
600 li r15,96
601 li r16,112
602
603 mtctr r6
604
605 /*
606 * Now do cacheline sized loads and stores. By this stage the
607 * cacheline stores are also cacheline aligned.
608 */
609 .align 5
6108:
611err4; lvx vr7,0,r4
612 vperm vr8,vr0,vr7,vr16
613err4; lvx vr6,r4,r9
614 vperm vr9,vr7,vr6,vr16
615err4; lvx vr5,r4,r10
616 vperm vr10,vr6,vr5,vr16
617err4; lvx vr4,r4,r11
618 vperm vr11,vr5,vr4,vr16
619err4; lvx vr3,r4,r12
620 vperm vr12,vr4,vr3,vr16
621err4; lvx vr2,r4,r14
622 vperm vr13,vr3,vr2,vr16
623err4; lvx vr1,r4,r15
624 vperm vr14,vr2,vr1,vr16
625err4; lvx vr0,r4,r16
626 vperm vr15,vr1,vr0,vr16
627 addi r4,r4,128
628err4; stvx vr8,0,r3
629err4; stvx vr9,r3,r9
630err4; stvx vr10,r3,r10
631err4; stvx vr11,r3,r11
632err4; stvx vr12,r3,r12
633err4; stvx vr13,r3,r14
634err4; stvx vr14,r3,r15
635err4; stvx vr15,r3,r16
636 addi r3,r3,128
637 bdnz 8b
638
c75df6f9
MN
639 ld r14,STK_REG(R14)(r1)
640 ld r15,STK_REG(R15)(r1)
641 ld r16,STK_REG(R16)(r1)
a66086b8
AB
642
643 /* Up to 127B to go */
644 clrldi r5,r5,(64-7)
645 srdi r6,r5,4
646 mtocrf 0x01,r6
647
648 bf cr7*4+1,9f
649err3; lvx vr3,0,r4
650 vperm vr8,vr0,vr3,vr16
651err3; lvx vr2,r4,r9
652 vperm vr9,vr3,vr2,vr16
653err3; lvx vr1,r4,r10
654 vperm vr10,vr2,vr1,vr16
655err3; lvx vr0,r4,r11
656 vperm vr11,vr1,vr0,vr16
657 addi r4,r4,64
658err3; stvx vr8,0,r3
659err3; stvx vr9,r3,r9
660err3; stvx vr10,r3,r10
661err3; stvx vr11,r3,r11
662 addi r3,r3,64
663
6649: bf cr7*4+2,10f
665err3; lvx vr1,0,r4
666 vperm vr8,vr0,vr1,vr16
667err3; lvx vr0,r4,r9
668 vperm vr9,vr1,vr0,vr16
669 addi r4,r4,32
670err3; stvx vr8,0,r3
671err3; stvx vr9,r3,r9
672 addi r3,r3,32
673
67410: bf cr7*4+3,11f
675err3; lvx vr1,0,r4
676 vperm vr8,vr0,vr1,vr16
677 addi r4,r4,16
678err3; stvx vr8,0,r3
679 addi r3,r3,16
680
681 /* Up to 15B to go */
68211: clrldi r5,r5,(64-4)
683 addi r4,r4,-16 /* Unwind the +16 load offset */
684 mtocrf 0x01,r5
685 bf cr7*4+0,12f
686err3; lwz r0,0(r4) /* Less chance of a reject with word ops */
687err3; lwz r6,4(r4)
688 addi r4,r4,8
689err3; stw r0,0(r3)
690err3; stw r6,4(r3)
691 addi r3,r3,8
692
69312: bf cr7*4+1,13f
694err3; lwz r0,0(r4)
695 addi r4,r4,4
696err3; stw r0,0(r3)
697 addi r3,r3,4
698
69913: bf cr7*4+2,14f
700err3; lhz r0,0(r4)
701 addi r4,r4,2
702err3; sth r0,0(r3)
703 addi r3,r3,2
704
70514: bf cr7*4+3,15f
706err3; lbz r0,0(r4)
707err3; stb r0,0(r3)
708
/* Pop the frame. exit_vmx_usercopy returns directly to our caller. */
70915: addi r1,r1,STACKFRAMESIZE
6f7839e5 710 b .exit_vmx_usercopy /* tail call optimise */
a66086b8 711#endif /* CONFIG_ALTIVEC */