/*
 * arch/powerpc/lib/copyuser_power7.S
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

/*
 * err1: tag the user-access instruction that follows (on the same
 * line, after the statement separator ';') with an __ex_table entry
 * so that a fault in it branches to .Ldo_err1.  Used before any
 * non-volatile GPRs are saved on the stack.
 */
	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm
/*
 * err2: as err1, but the fault handler is .Ldo_err2.  Used inside the
 * scalar cacheline loop, after r14-r22 have been saved in a stack
 * frame that must be unwound before retrying the copy.
 */
	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm
#ifdef CONFIG_ALTIVEC
/*
 * err3/err4: as err1/err2, but for faults taken while VMX (Altivec)
 * is in use.  err3 covers code before the VMX cacheline loop saves
 * r14-r16 (handler .Ldo_err3); err4 covers the loop itself
 * (handler .Ldo_err4, which must restore those GPRs first).
 */
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm


/* Fault in the VMX cacheline loop: restore the saved non-volatiles. */
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
/* Common VMX fault path: drop VMX state, then unwind and retry. */
.Ldo_err3:
	bl	.exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */
/*
 * Fault in the scalar cacheline loop: restore the saved non-volatile
 * GPRs, pop the frame (.Lexit), reload the original (r3,r4,r5)
 * arguments stashed in the caller's frame, and retry with the
 * byte-accurate base routine, which computes the exact number of
 * bytes not copied for the return value.
 */
.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,48(r1)
	ld	r4,56(r1)
	ld	r5,64(r1)
	b	__copy_tofrom_user_base


/*
 * unsigned long __copy_tofrom_user_power7(void *to, const void *from,
 *					    unsigned long n)
 *
 * POWER7-optimised user copy.  In: r3 = dest, r4 = src, r5 = count.
 * Returns 0 on success; on a fault the copy is retried via
 * __copy_tofrom_user_base, which returns bytes-not-copied.  The
 * arguments are stashed in the caller's frame so the fault paths can
 * reload them.
 */
_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy		/* < 16B: trivial tail copy */
	bgt	cr1,.Lvmx_copy		/* > 4kB: worth the VMX setup cost */
#else
	cmpldi	r5,16

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
#endif

/*
 * Scalar (GPR-only) copy path: align the source to 8B, then move
 * whole 128B cachelines through r0,r6-r12,r14-r21, leaving the
 * sub-128B tail to the code at label 5 below.
 */
.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	/* >= 128B left: save non-volatiles for the cacheline loop */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7		/* CTR = number of 128B cachelines */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)	/* r5 = remaining bytes (mod 128) */

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

203 /* Up to 127B to go */
204 5: srdi r6,r5,4
205 mtocrf 0x01,r6
206
207 6: bf cr7*4+1,7f
208 err1; ld r0,0(r4)
209 err1; ld r6,8(r4)
210 err1; ld r7,16(r4)
211 err1; ld r8,24(r4)
212 err1; ld r9,32(r4)
213 err1; ld r10,40(r4)
214 err1; ld r11,48(r4)
215 err1; ld r12,56(r4)
216 addi r4,r4,64
217 err1; std r0,0(r3)
218 err1; std r6,8(r3)
219 err1; std r7,16(r3)
220 err1; std r8,24(r3)
221 err1; std r9,32(r3)
222 err1; std r10,40(r3)
223 err1; std r11,48(r3)
224 err1; std r12,56(r3)
225 addi r3,r3,64
226
227 /* Up to 63B to go */
228 7: bf cr7*4+2,8f
229 err1; ld r0,0(r4)
230 err1; ld r6,8(r4)
231 err1; ld r7,16(r4)
232 err1; ld r8,24(r4)
233 addi r4,r4,32
234 err1; std r0,0(r3)
235 err1; std r6,8(r3)
236 err1; std r7,16(r3)
237 err1; std r8,24(r3)
238 addi r3,r3,32
239
240 /* Up to 31B to go */
241 8: bf cr7*4+3,9f
242 err1; ld r0,0(r4)
243 err1; ld r6,8(r4)
244 addi r4,r4,16
245 err1; std r0,0(r3)
246 err1; std r6,8(r3)
247 addi r3,r3,16
248
249 9: clrldi r5,r5,(64-4)
250
251 /* Up to 15B to go */
252 .Lshort_copy:
253 mtocrf 0x01,r5
254 bf cr7*4+0,12f
255 err1; lwz r0,0(r4) /* Less chance of a reject with word ops */
256 err1; lwz r6,4(r4)
257 addi r4,r4,8
258 err1; stw r0,0(r3)
259 err1; stw r6,4(r3)
260 addi r3,r3,8
261
262 12: bf cr7*4+1,13f
263 err1; lwz r0,0(r4)
264 addi r4,r4,4
265 err1; stw r0,0(r3)
266 addi r3,r3,4
267
268 13: bf cr7*4+2,14f
269 err1; lhz r0,0(r4)
270 addi r4,r4,2
271 err1; sth r0,0(r3)
272 addi r3,r3,2
273
274 14: bf cr7*4+3,15f
275 err1; lbz r0,0(r4)
276 err1; stb r0,0(r3)
277
278 15: li r3,0
279 blr
280
/* VMX was unavailable: pop the VMX frame and take the scalar path. */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
/*
 * VMX copy path for source and destination that are relatively 16B
 * aligned: enable VMX, start hardware stream prefetch on both source
 * and destination, align the destination up to 128B, then move
 * cachelines with lvx/stvx.
 */
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_usercopy
	cmpwi	cr1,r3,0	/* cr1 = "VMX unavailable" (checked below) */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	.machine push
	.machine "power4"
	dcbt	r0,r6,0b01000
	dcbt	r0,r7,0b01010
	dcbtst	r0,r9,0b01000
	dcbtst	r0,r10,0b01010
	eieio
	dcbt	r0,r8,0b01010	/* GO */
	.machine pop

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7		/* CTR = number of 128B cachelines */

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */

/*
 * VMX copy path for source and destination with different 16B
 * alignments: read one vector ahead of the store stream and use
 * lvsl/vperm to realign each pair of loads into an aligned store.
 * vr0 always carries the previous (look-ahead) load; vr16 holds the
 * permute control vector.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1	/* carry look-ahead vector forward */

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7		/* CTR = number of 128B cachelines */

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */