arch/mips/cavium-octeon/octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License. See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *	- src and dst don't overlap
 *	- src is readable
 *	- dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *	copy_to_user
 *	  - src is readable (no exceptions when reading src)
 *	copy_from_user
 *	  - dst is writable (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
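
/*
 * Illustrative only, not part of the build: a rough C sketch of the
 * contract described above.  memcpy returns dst (in v0); __copy_user
 * instead reports, in len (a2), an upper bound on the bytes it failed
 * to copy.
 *
 *	void *memcpy(void *dst, const void *src, size_t len)
 *	{
 *		unsigned char *d = dst;
 *		const unsigned char *s = src;
 *
 *		while (len--)
 *			*d++ = *s++;
 *		return dst;
 *	}
 *
 *	// 0 on success, else an upper bound on the uncopied bytes
 *	size_t __copy_user(void *dst, const void *src, size_t len);
 */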

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contain the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
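
/*
 * Illustrative only, not part of the build: given the invariants above,
 * a load fault at fault_addr (read back from THREAD_BUADDR) lets the
 * handler recover everything it needs, roughly:
 *
 *	uncopied  = AT - fault_addr;            // from (1) and (2)
 *	first_dst = dst + (fault_addr - src);   // from (3)
 *	if (!inatomic)
 *		memset(first_dst, 0, uncopied); // don't leak stale data
 *	len = uncopied;
 *
 * This mirrors what the l_exc fixup code below does, with the zeroing
 * open-coded instead of calling memset.
 */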

#define EXC(inst_reg,addr,handler)		\
9:	inst_reg, addr;				\
	.section __ex_table,"a";		\
	PTR	9b, handler;			\
	.previous
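
/*
 * For example, the illustrative expansion of
 *
 *	EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
 *
 * is
 *
 * 9:	LOAD	t0, UNIT(0)(src);
 *	.section __ex_table,"a";
 *	PTR	9b, l_exc;
 *	.previous
 *
 * i.e. the access itself plus an exception-table entry that routes a
 * fault on that instruction to the l_exc fixup code.
 */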

/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD	ld
#define LOADL	ldl
#define LOADR	ldr
#define STOREL	sdl
#define STORER	sdr
#define STORE	sd
#define ADD	daddu
#define SUB	dsubu
#define SRL	dsrl
#define SRA	dsra
#define SLL	dsll
#define SLLV	dsllv
#define SRLV	dsrlv
#define NBYTES	8
#define LOG_NBYTES 3

/*
 * As we are sharing code with the mips32 tree (which uses the o32 ABI
 * register definitions), we need to redefine the registers from the
 * n64 ABI naming to the o32 ABI naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD	lw
#define LOADL	lwl
#define LOADR	lwr
#define STOREL	swl
#define STORER	swr
#define STORE	sw
#define ADD	addu
#define SUB	subu
#define SRL	srl
#define SLL	sll
#define SRA	sra
#define SLLV	sllv
#define SRLV	srlv
#define NBYTES	4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)
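
/*
 * Worked example, assuming the 64-bit build (NBYTES == 8):
 *	FIRST(1) == 8, REST(1) == 15, UNIT(1) == 8, ADDRMASK == 7
 * FIRST(n) and REST(n) address the first and last byte of word n, which
 * is what the LDFIRST/LDREST pairs below need for an unaligned source,
 * and "and t0, src, ADDRMASK" tests whether src is word aligned.
 */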

	.text
	.set	noreorder
	.set	noat

/*
 * t7 is used as a flag to note inatomic mode.
 */
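/*
 * When t7 is non-zero (the inatomic case set up just below), the load
 * fault fixup in l_exc skips zeroing out the rest of the destination
 * buffer and only reports the number of uncopied bytes.
 */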
LEAF(__copy_user_inatomic)
	b	__copy_user_common
	li	t7, 1
	END(__copy_user_inatomic)

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	li	t7, 0				/* not inatomic */
__copy_user_common:
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
	#
	# Octeon doesn't care if the destination is unaligned. The hardware
	# can fix it faster than we can special case the assembly.
	#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	pref	0, 128(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if there are more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
	#
	# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-7)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-5)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-8)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(-3)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(-2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(-1)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(-4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst),	s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	sltu	t0, len, 128		# See if we can loop one more time
	beqz	t0, 1b
	nop
	#
	# Jump here if there are less than 16*NBYTES left.
	#
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src),	l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src),	l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst),	s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	ADD	dst, dst, 8*NBYTES
	#
	# Jump here if there are less than 8*NBYTES left.
	#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	nop
EXC(	LOAD	t0, UNIT(0)(src),	l_exc)
EXC(	LOAD	t1, UNIT(1)(src),	l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src),	l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
	#
	# Jump here if there are less than 4*NBYTES left. This means
	# we may need to copy up to 3 NBYTES words.
	#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	nop
	#
	# 1) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 2) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
	#
	# 3) Copy NBYTES, then check length again
	#
EXC(	LOAD	t0, 0(src),		l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b	copy_bytes_checklen
EXC(	STORE	t0, -8(dst),		s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some mips
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src),	l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
EXC(	LDREST	t1, REST(1)(src),	l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src),	l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src),	l_exc_copy)
EXC(	LDREST	t2, REST(2)(src),	l_exc_copy)
EXC(	LDREST	t3, REST(3)(src),	l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst),	s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst),	s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst),	s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst),	s_exc_p1u)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src),	l_exc)
EXC(	LDREST	t0, REST(0)(src),	l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst),		s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	nop
	END(memcpy)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
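	/*
	 * Illustrative only, not compiled: a rough C sketch of this fixup
	 * path and the l_exc tail it falls into (fault_addr is THREAD_BUADDR,
	 * AT is the end of the source, per the invariants at the top of the
	 * file):
	 *
	 *	while (src != fault_addr)	// salvage what was readable
	 *		*dst++ = *src++;	// byte loads may fault -> l_exc
	 *	len = AT - fault_addr;		// bytes we could not copy
	 *	if (!inatomic)			// t7 == 0
	 *		memset(dst + (fault_addr - src), 0, len);
	 *	return;
	 */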
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src),	l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	SUB	len, AT, t0		# len = number of uncopied bytes
	bnez	t7, 2f		/* Skip the zeroing out part if inatomic */
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 *   See (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in dst
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst. Can't call __bzero because it
	 * might modify len. An inefficient loop for these rare times...
	 */
	beqz	len, done
	SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	SUB	src, src, 1
2:	jr	ra
	nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	ADD	len, len, 1
s_exc:
	jr	ra
	nop

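/*
 * Illustrative only: the overlap test memmove performs below, in rough C:
 *
 *	if (!(src < dst + len && dst < src + len))
 *		return memcpy(dst, src, len);	// regions don't overlap
 *	// otherwise fall through to __rmemcpy, which copies byte by byte,
 *	// backwards when src < dst so the source isn't clobbered before
 *	// it has been read.
 */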
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0			# dst + len <= src -> memcpy
	sltu	t1, a0, t1			# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	move	v0, a0				/* return value */
	beqz	a2, r_out
	END(memmove)

	/* fall through to __rmemcpy */
LEAF(__rmemcpy)					/* a0=dst a1=src a2=len */
	sltu	t0, a1, a0
	beqz	t0, r_end_bytes_up		# src >= dst
	nop
	ADD	a0, a2				# dst = dst + len
	ADD	a1, a2				# src = src + len

r_end_bytes:
	lb	t0, -1(a1)
	SUB	a2, a2, 0x1
	sb	t0, -1(a0)
	SUB	a1, a1, 0x1
	bnez	a2, r_end_bytes
	SUB	a0, a0, 0x1

r_out:
	jr	ra
	move	a2, zero

r_end_bytes_up:
	lb	t0, (a1)
	SUB	a2, a2, 0x1
	sb	t0, (a0)
	ADD	a1, a1, 0x1
	bnez	a2, r_end_bytes_up
	ADD	a0, a0, 0x1

	jr	ra
	move	a2, zero
	END(__rmemcpy)