arch/mips/cavium-octeon/octeon-memcpy.S
/*
 * This file is subject to the terms and conditions of the GNU General Public
 * License.  See the file "COPYING" in the main directory of this archive
 * for more details.
 *
 * Unified implementation of memcpy, memmove and the __copy_user backend.
 *
 * Copyright (C) 1998, 99, 2000, 01, 2002 Ralf Baechle (ralf@gnu.org)
 * Copyright (C) 1999, 2000, 01, 2002 Silicon Graphics, Inc.
 * Copyright (C) 2002 Broadcom, Inc.
 *   memcpy/copy_user author: Mark Vandevoorde
 *
 * Mnemonic names for arguments to memcpy/__copy_user
 */

#include <asm/asm.h>
#include <asm/asm-offsets.h>
#include <asm/regdef.h>

#define dst a0
#define src a1
#define len a2

/*
 * Spec
 *
 * memcpy copies len bytes from src to dst and sets v0 to dst.
 * It assumes that
 *   - src and dst don't overlap
 *   - src is readable
 *   - dst is writable
 * memcpy uses the standard calling convention
 *
 * __copy_user copies up to len bytes from src to dst and sets a2 (len) to
 * the number of uncopied bytes due to an exception caused by a read or write.
 * __copy_user assumes that src and dst don't overlap, and that the call is
 * implementing one of the following:
 *   copy_to_user
 *     - src is readable  (no exceptions when reading src)
 *   copy_from_user
 *     - dst is writable  (no exceptions when writing dst)
 * __copy_user uses a non-standard calling convention; see
 * arch/mips/include/asm/uaccess.h
 *
 * When an exception happens on a load, the handler must
 * ensure that all of the destination buffer is overwritten to prevent
 * leaking information to user mode programs.
 */
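
/*
 * Illustrative call sketch (an editor-added example, not code from this
 * file): __copy_user is normally reached through the uaccess.h macros,
 * which pass dst/src/len in a0/a1/a2 and set AT to src + len in the
 * call's delay slot for the load-fault handler.  Roughly:
 *
 *	move	a0, dst			# destination buffer
 *	move	a1, src			# source buffer
 *	li	a2, 128			# len (illustrative byte count)
 *	jal	__copy_user
 *	ADDU	AT, a1, a2		# delay slot: AT = src + len,
 *					# as uaccess.h arranges
 *	# on return: a2 == 0 on success, else an upper bound on the
 *	# number of bytes that were not copied
 */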

/*
 * Implementation
 */

/*
 * The exception handler for loads requires that:
 *  1- AT contains the address of the byte just past the end of the source
 *     of the copy,
 *  2- src_entry <= src < AT, and
 *  3- (dst - src) == (dst_entry - src_entry),
 * The _entry suffix denotes values when __copy_user was called.
 *
 * (1) is set up by uaccess.h and maintained by not writing AT in copy_user
 * (2) is met by incrementing src by the number of bytes copied
 * (3) is met by not doing loads between a pair of increments of dst and src
 *
 * The exception handlers for stores adjust len (if necessary) and return.
 * These handlers do not need to overwrite any data.
 *
 * For __rmemcpy and memmove an exception is always a kernel bug, therefore
 * they're not protected.
 */
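
/*
 * Worked example of the invariants above (illustrative values, not from
 * this file): entering with src_entry = 0x1000, dst_entry = 0x2000 and
 * len = 0x40 gives AT = 0x1040.  If a load then faults at address
 * 0x1010 after 0x10 bytes were copied (src = 0x1010, dst = 0x2010),
 * the load handler computes
 *
 *	len = AT - 0x1010 = 0x30	# uncopied bytes
 *	dst += 0x1010 - src		# dst unchanged here: it already
 *					# points at the first byte that
 *					# must be zero-filled
 */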

#define EXC(inst_reg,addr,handler)			\
9:	inst_reg, addr;					\
	.section __ex_table,"a";			\
	PTR	9b, handler;				\
	.previous

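/*
 * For illustration only (a sketch of the macro's effect; the exact
 * expansion depends on PTR from <asm/asm.h>):
 *
 *	EXC(	LOAD	t0, UNIT(0)(src), l_exc)
 *
 * becomes, on a 64-bit kernel, roughly
 *
 * 9:	ld	t0, 0(src)
 *	.section __ex_table,"a"
 *	PTR	9b, l_exc
 *	.previous
 *
 * i.e. the load itself plus an exception-table entry pairing that
 * instruction's address with the l_exc fixup handler.
 */
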
/*
 * Only on the 64-bit kernel can we make use of 64-bit registers.
 */
#ifdef CONFIG_64BIT
#define USE_DOUBLE
#endif

#ifdef USE_DOUBLE

#define LOAD	ld
#define LOADL	ldl
#define LOADR	ldr
#define STOREL	sdl
#define STORER	sdr
#define STORE	sd
#define ADD	daddu
#define SUB	dsubu
#define SRL	dsrl
#define SRA	dsra
#define SLL	dsll
#define SLLV	dsllv
#define SRLV	dsrlv
#define NBYTES	8
#define LOG_NBYTES 3

/*
 * As we are sharing the code base with the mips32 tree (which uses the
 * o32 ABI register definitions), we need to redefine the register
 * definitions from the n64 ABI register naming to the o32 ABI register
 * naming.
 */
#undef t0
#undef t1
#undef t2
#undef t3
#define t0	$8
#define t1	$9
#define t2	$10
#define t3	$11
#define t4	$12
#define t5	$13
#define t6	$14
#define t7	$15

#else

#define LOAD	lw
#define LOADL	lwl
#define LOADR	lwr
#define STOREL	swl
#define STORER	swr
#define STORE	sw
#define ADD	addu
#define SUB	subu
#define SRL	srl
#define SLL	sll
#define SRA	sra
#define SLLV	sllv
#define SRLV	srlv
#define NBYTES	4
#define LOG_NBYTES 2

#endif /* USE_DOUBLE */

#ifdef CONFIG_CPU_LITTLE_ENDIAN
#define LDFIRST	LOADR
#define LDREST	LOADL
#define STFIRST	STORER
#define STREST	STOREL
#define SHIFT_DISCARD SLLV
#else
#define LDFIRST	LOADL
#define LDREST	LOADR
#define STFIRST	STOREL
#define STREST	STORER
#define SHIFT_DISCARD SRLV
#endif

#define FIRST(unit) ((unit)*NBYTES)
#define REST(unit)  (FIRST(unit)+NBYTES-1)
#define UNIT(unit)  FIRST(unit)

#define ADDRMASK (NBYTES-1)

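/*
 * Illustrative sketch of the unaligned-load idiom built from the macros
 * above (assuming a 64-bit kernel, so NBYTES == 8, FIRST(0) == 0,
 * REST(0) == 7, ADDRMASK == 7):
 *
 *	LDFIRST	t0, FIRST(0)(src)	# ldl/ldr pair: one picks up the
 *	LDREST	t0, REST(0)(src)	# bytes up to the doubleword
 *					# boundary, the other the rest
 *
 * After both, t0 holds the eight bytes at src..src+7 regardless of
 * alignment; the endianness #ifdef above only decides which of ldl/ldr
 * grabs which end of the doubleword.  ADDRMASK is what the
 * `and t0, src, ADDRMASK' test below uses to detect an unaligned src.
 */
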
	.text
	.set	noreorder
	.set	noat

/*
 * A combined memcpy/__copy_user
 * __copy_user sets len to 0 for success; else to an upper bound of
 * the number of uncopied bytes.
 * memcpy sets v0 to dst.
 */
	.align	5
LEAF(memcpy)					/* a0=dst a1=src a2=len */
	move	v0, dst				/* return value */
__memcpy:
FEXPORT(__copy_user)
	/*
	 * Note: dst & src may be unaligned, len may be 0
	 * Temps
	 */
#
# Octeon doesn't care if the destination is unaligned.  The hardware
# can fix it faster than we can special case the assembly.
#
	pref	0, 0(src)
	sltu	t0, len, NBYTES		# Check if < 1 word
	bnez	t0, copy_bytes_checklen
	and	t0, src, ADDRMASK	# Check if src unaligned
	bnez	t0, src_unaligned
	sltu	t0, len, 4*NBYTES	# Check if < 4 words
	bnez	t0, less_than_4units
	sltu	t0, len, 8*NBYTES	# Check if < 8 words
	bnez	t0, less_than_8units
	sltu	t0, len, 16*NBYTES	# Check if < 16 words
	bnez	t0, cleanup_both_aligned
	sltu	t0, len, 128+1		# Check if len < 129
	bnez	t0, 1f			# Skip prefetch if len is too short
	sltu	t0, len, 256+1		# Check if len < 257
	bnez	t0, 1f			# Skip prefetch if len is too short
	pref	0, 128(src)		# We must not prefetch invalid addresses
#
# This is where we loop if there are more than 128 bytes left
2:	pref	0, 256(src)		# We must not prefetch invalid addresses
#
# This is where we loop if we can't prefetch anymore
1:
EXC(	LOAD	t0, UNIT(0)(src), l_exc)
EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
	SUB	len, len, 16*NBYTES
EXC(	STORE	t0, UNIT(0)(dst), s_exc_p16u)
EXC(	STORE	t1, UNIT(1)(dst), s_exc_p15u)
EXC(	STORE	t2, UNIT(2)(dst), s_exc_p14u)
EXC(	STORE	t3, UNIT(3)(dst), s_exc_p13u)
EXC(	LOAD	t0, UNIT(4)(src), l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src), l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst), s_exc_p12u)
EXC(	STORE	t1, UNIT(5)(dst), s_exc_p11u)
EXC(	STORE	t2, UNIT(6)(dst), s_exc_p10u)
	ADD	src, src, 16*NBYTES
EXC(	STORE	t3, UNIT(7)(dst), s_exc_p9u)
	ADD	dst, dst, 16*NBYTES
EXC(	LOAD	t0, UNIT(-8)(src), l_exc_copy)
EXC(	LOAD	t1, UNIT(-7)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(-6)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(-5)(src), l_exc_copy)
EXC(	STORE	t0, UNIT(-8)(dst), s_exc_p8u)
EXC(	STORE	t1, UNIT(-7)(dst), s_exc_p7u)
EXC(	STORE	t2, UNIT(-6)(dst), s_exc_p6u)
EXC(	STORE	t3, UNIT(-5)(dst), s_exc_p5u)
EXC(	LOAD	t0, UNIT(-4)(src), l_exc_copy)
EXC(	LOAD	t1, UNIT(-3)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(-2)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(-1)(src), l_exc_copy)
EXC(	STORE	t0, UNIT(-4)(dst), s_exc_p4u)
EXC(	STORE	t1, UNIT(-3)(dst), s_exc_p3u)
EXC(	STORE	t2, UNIT(-2)(dst), s_exc_p2u)
EXC(	STORE	t3, UNIT(-1)(dst), s_exc_p1u)
	sltu	t0, len, 256+1		# See if we can prefetch more
	beqz	t0, 2b
	sltu	t0, len, 128		# See if we can loop once more
	beqz	t0, 1b
	nop
#
# Jump here if there are less than 16*NBYTES left.
#
cleanup_both_aligned:
	beqz	len, done
	sltu	t0, len, 8*NBYTES
	bnez	t0, less_than_8units
	nop
EXC(	LOAD	t0, UNIT(0)(src), l_exc)
EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
	SUB	len, len, 8*NBYTES
EXC(	STORE	t0, UNIT(0)(dst), s_exc_p8u)
EXC(	STORE	t1, UNIT(1)(dst), s_exc_p7u)
EXC(	STORE	t2, UNIT(2)(dst), s_exc_p6u)
EXC(	STORE	t3, UNIT(3)(dst), s_exc_p5u)
EXC(	LOAD	t0, UNIT(4)(src), l_exc_copy)
EXC(	LOAD	t1, UNIT(5)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(6)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(7)(src), l_exc_copy)
EXC(	STORE	t0, UNIT(4)(dst), s_exc_p4u)
EXC(	STORE	t1, UNIT(5)(dst), s_exc_p3u)
EXC(	STORE	t2, UNIT(6)(dst), s_exc_p2u)
EXC(	STORE	t3, UNIT(7)(dst), s_exc_p1u)
	ADD	src, src, 8*NBYTES
	beqz	len, done
	ADD	dst, dst, 8*NBYTES
#
# Jump here if there are less than 8*NBYTES left.
#
less_than_8units:
	sltu	t0, len, 4*NBYTES
	bnez	t0, less_than_4units
	nop
EXC(	LOAD	t0, UNIT(0)(src), l_exc)
EXC(	LOAD	t1, UNIT(1)(src), l_exc_copy)
EXC(	LOAD	t2, UNIT(2)(src), l_exc_copy)
EXC(	LOAD	t3, UNIT(3)(src), l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst), s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst), s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst), s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst), s_exc_p1u)
	ADD	src, src, 4*NBYTES
	beqz	len, done
	ADD	dst, dst, 4*NBYTES
#
# Jump here if there are less than 4*NBYTES left.  This means
# we may need to copy up to 3 NBYTES words.
#
less_than_4units:
	sltu	t0, len, 1*NBYTES
	bnez	t0, copy_bytes_checklen
	nop
#
# 1) Copy NBYTES, then check length again
#
EXC(	LOAD	t0, 0(src), l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst), s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
#
# 2) Copy NBYTES, then check length again
#
EXC(	LOAD	t0, 0(src), l_exc)
	SUB	len, len, NBYTES
	sltu	t1, len, 8
EXC(	STORE	t0, 0(dst), s_exc_p1u)
	ADD	src, src, NBYTES
	bnez	t1, copy_bytes_checklen
	ADD	dst, dst, NBYTES
#
# 3) Copy NBYTES, then check length again
#
EXC(	LOAD	t0, 0(src), l_exc)
	SUB	len, len, NBYTES
	ADD	src, src, NBYTES
	ADD	dst, dst, NBYTES
	b	copy_bytes_checklen
EXC(	STORE	t0, -8(dst), s_exc_p1u)

src_unaligned:
#define rem t8
	SRL	t0, len, LOG_NBYTES+2	# +2 for 4 units/iter
	beqz	t0, cleanup_src_unaligned
	and	rem, len, (4*NBYTES-1)	# rem = len % 4*NBYTES
1:
/*
 * Avoid consecutive LD*'s to the same register since some MIPS
 * implementations can't issue them in the same cycle.
 * It's OK to load FIRST(N+1) before REST(N) because the two addresses
 * are to the same unit (unless src is aligned, but it's not).
 */
EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
EXC(	LDFIRST	t1, FIRST(1)(src), l_exc_copy)
	SUB	len, len, 4*NBYTES
EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
EXC(	LDREST	t1, REST(1)(src), l_exc_copy)
EXC(	LDFIRST	t2, FIRST(2)(src), l_exc_copy)
EXC(	LDFIRST	t3, FIRST(3)(src), l_exc_copy)
EXC(	LDREST	t2, REST(2)(src), l_exc_copy)
EXC(	LDREST	t3, REST(3)(src), l_exc_copy)
	ADD	src, src, 4*NBYTES
EXC(	STORE	t0, UNIT(0)(dst), s_exc_p4u)
EXC(	STORE	t1, UNIT(1)(dst), s_exc_p3u)
EXC(	STORE	t2, UNIT(2)(dst), s_exc_p2u)
EXC(	STORE	t3, UNIT(3)(dst), s_exc_p1u)
	bne	len, rem, 1b
	ADD	dst, dst, 4*NBYTES

cleanup_src_unaligned:
	beqz	len, done
	and	rem, len, NBYTES-1	# rem = len % NBYTES
	beq	rem, len, copy_bytes
	nop
1:
EXC(	LDFIRST	t0, FIRST(0)(src), l_exc)
EXC(	LDREST	t0, REST(0)(src), l_exc_copy)
	SUB	len, len, NBYTES
EXC(	STORE	t0, 0(dst), s_exc_p1u)
	ADD	src, src, NBYTES
	bne	len, rem, 1b
	ADD	dst, dst, NBYTES

copy_bytes_checklen:
	beqz	len, done
	nop
copy_bytes:
	/* 0 < len < NBYTES */
#define COPY_BYTE(N)			\
EXC(	lb	t0, N(src), l_exc);	\
	SUB	len, len, 1;		\
	beqz	len, done;		\
EXC(	sb	t0, N(dst), s_exc_p1)

	COPY_BYTE(0)
	COPY_BYTE(1)
#ifdef USE_DOUBLE
	COPY_BYTE(2)
	COPY_BYTE(3)
	COPY_BYTE(4)
	COPY_BYTE(5)
#endif
EXC(	lb	t0, NBYTES-2(src), l_exc)
	SUB	len, len, 1
	jr	ra
EXC(	sb	t0, NBYTES-2(dst), s_exc_p1)
done:
	jr	ra
	nop
	END(memcpy)

l_exc_copy:
	/*
	 * Copy bytes from src until faulting load address (or until a
	 * lb faults)
	 *
	 * When reached by a faulting LDFIRST/LDREST, THREAD_BUADDR($28)
	 * may be more than a byte beyond the last address.
	 * Hence, the lb below may get an exception.
	 *
	 * Assumes src < THREAD_BUADDR($28)
	 */
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)
1:
EXC(	lb	t1, 0(src), l_exc)
	ADD	src, src, 1
	sb	t1, 0(dst)	# can't fault -- we're copy_from_user
	bne	src, t0, 1b
	ADD	dst, dst, 1
l_exc:
	LOAD	t0, TI_TASK($28)
	nop
	LOAD	t0, THREAD_BUADDR(t0)	# t0 is just past last good address
	nop
	SUB	len, AT, t0		# len = number of uncopied bytes
	/*
	 * Here's where we rely on src and dst being incremented in tandem,
	 * see (3) above.
	 * dst += (fault addr - src) to put dst at first byte to clear
	 */
	ADD	dst, t0			# compute start address in a1
	SUB	dst, src
	/*
	 * Clear len bytes starting at dst.  Can't call __bzero because it
	 * might modify len.  An inefficient loop for these rare times...
	 */
	beqz	len, done
	SUB	src, len, 1
1:	sb	zero, 0(dst)
	ADD	dst, dst, 1
	bnez	src, 1b
	SUB	src, src, 1
	jr	ra
	nop


#define SEXC(n)				\
s_exc_p ## n ## u:			\
	jr	ra;			\
	ADD	len, len, n*NBYTES

SEXC(16)
SEXC(15)
SEXC(14)
SEXC(13)
SEXC(12)
SEXC(11)
SEXC(10)
SEXC(9)
SEXC(8)
SEXC(7)
SEXC(6)
SEXC(5)
SEXC(4)
SEXC(3)
SEXC(2)
SEXC(1)

s_exc_p1:
	jr	ra
	ADD	len, len, 1
s_exc:
	jr	ra
	nop

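/*
 * Overlap-test sketch (an illustrative restatement of the code below,
 * with dst = a0, src = a1, len = a2):
 *
 *	t0 = (src < dst + len)		# sltu	t0, a1, t0
 *	t1 = (dst < src + len)		# sltu	t1, a0, t1
 *	overlap = t0 && t1		# and	t0, t1
 *
 * If either test fails the regions are disjoint and __memcpy is safe;
 * otherwise __rmemcpy copies backwards whenever src < dst, so the
 * overlapping tail is read before it is overwritten.
 */
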
	.align	5
LEAF(memmove)
	ADD	t0, a0, a2
	ADD	t1, a1, a2
	sltu	t0, a1, t0		# dst + len <= src -> memcpy
	sltu	t1, a0, t1		# dst >= src + len -> memcpy
	and	t0, t1
	beqz	t0, __memcpy
	move	v0, a0			/* return value */
	beqz	a2, r_out
	END(memmove)

490
491 /* fall through to __rmemcpy */
492 LEAF(__rmemcpy) /* a0=dst a1=src a2=len */
493 sltu t0, a1, a0
494 beqz t0, r_end_bytes_up # src >= dst
495 nop
496 ADD a0, a2 # dst = dst + len
497 ADD a1, a2 # src = src + len
498
499 r_end_bytes:
500 lb t0, -1(a1)
501 SUB a2, a2, 0x1
502 sb t0, -1(a0)
503 SUB a1, a1, 0x1
504 bnez a2, r_end_bytes
505 SUB a0, a0, 0x1
506
507 r_out:
508 jr ra
509 move a2, zero
510
511 r_end_bytes_up:
512 lb t0, (a1)
513 SUB a2, a2, 0x1
514 sb t0, (a0)
515 ADD a1, a1, 0x1
516 bnez a2, r_end_bytes_up
517 ADD a0, a0, 0x1
518
519 jr ra
520 move a2, zero
521 END(__rmemcpy)