/*
 * This file contains assembly-language implementations
 * of IP-style 1's complement checksum routines.
 *
 * Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Severely hacked about by Paul Mackerras (paulus@cs.anu.edu.au).
 */

#include <linux/sys.h>
#include <asm/processor.h>
#include <asm/errno.h>
#include <asm/ppc_asm.h>

/*
 * ip_fast_csum(r3=buf, r4=len) -- Optimized for IP header
 * len is in words and is always >= 5.
 *
 * In practice len == 5, but this is not guaranteed. So this code does not
 * attempt to use doubleword instructions.
 */
_GLOBAL(ip_fast_csum)
	lwz	r0,0(r3)
	lwzu	r5,4(r3)
	addic.	r4,r4,-2
	addc	r0,r0,r5
	mtctr	r4
	blelr-
1:	lwzu	r4,4(r3)
	adde	r0,r0,r4
	bdnz	1b
	addze	r0,r0		/* add in final carry */
	rldicl	r4,r0,32,0	/* fold two 32-bit halves together */
	add	r0,r0,r4
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
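
/*
 * For reference, the loop and folding sequence above compute roughly
 * the following C (an illustrative sketch only; the helper name and
 * types are made up, not taken from the kernel sources):
 *
 *	static unsigned short ip_fast_csum_ref(const void *iph,
 *					       unsigned int ihl)
 *	{
 *		const unsigned int *p = iph;
 *		unsigned long long sum = 0;
 *
 *		while (ihl--)				// ihl words, >= 5
 *			sum += *p++;
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 64 -> 33
 *		sum = (sum & 0xffffffff) + (sum >> 32);	// fold 33 -> 32
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 32 -> 17
 *		sum = (sum & 0xffff) + (sum >> 16);	// fold 17 -> 16
 *		return ~sum & 0xffff;			// 1's complement
 *	}
 */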

/*
 * Compute the checksum of a TCP or UDP pseudo-header:
 * csum_tcpudp_magic(r3=saddr, r4=daddr, r5=len, r6=proto, r7=sum)
 *
 * There is no real gain in handling 64 bit specially here, but the
 * 32-bit additions may spill into the upper bits of the doubleword,
 * so we still must fold the result down from 64 bits.
 */
_GLOBAL(csum_tcpudp_magic)
	rlwimi	r5,r6,16,0,15	/* put proto in upper half of len */
	addc	r0,r3,r4	/* add 4 32-bit words together */
	adde	r0,r0,r5
	adde	r0,r0,r7
	rldicl	r4,r0,32,0	/* fold 64 bit value */
	add	r0,r4,r0
	srdi	r0,r0,32
	rlwinm	r3,r0,16,0,31	/* fold two halves together */
	add	r3,r0,r3
	not	r3,r3
	srwi	r3,r3,16
	blr
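
/*
 * For reference, the pseudo-header sum above is equivalent to this C
 * sketch (illustrative only; the helper name and argument types are
 * assumptions, not taken from this file):
 *
 *	static unsigned short csum_tcpudp_magic_ref(unsigned int saddr,
 *			unsigned int daddr, unsigned short len,
 *			unsigned char proto, unsigned int sum)
 *	{
 *		unsigned long long s = sum;
 *
 *		s += saddr;
 *		s += daddr;
 *		s += ((unsigned int)proto << 16) | len;	// the rlwimi
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 33
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 33 -> 32
 *		s = (s & 0xffff) + (s >> 16);		// fold 32 -> 17
 *		s = (s & 0xffff) + (s >> 16);		// fold 17 -> 16
 *		return ~s & 0xffff;			// 1's complement
 *	}
 */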

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)

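/*
 * (Assuming the ppc64 ELFv1 ABI: 112 bytes is the minimum stack frame,
 * a 48 byte header plus the 64 byte parameter save area, so STK_REG()
 * places the r14-r16 save slots just above that minimal frame.)
 */
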
/*
 * Computes the checksum of a memory block at buff, length len,
 * and adds in "sum" (32-bit).
 *
 * csum_partial(r3=buff, r4=len, r5=sum)
 */
_GLOBAL(csum_partial)
	addic	r0,r5,0			/* clear carry */

	srdi.	r6,r4,3			/* less than 8 bytes? */
	beq	.Lcsum_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd-
	 * aligned addresses should be rare and would require more work
	 * to calculate the correct checksum, we ignore that case and
	 * take the potential slowdown of unaligned loads.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcsum_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
	lhz	r6,0(r3)		/* align to doubleword */
	subi	r4,r4,2
	addi	r3,r3,2
	adde	r0,r0,r6
	bdnz	1b

.Lcsum_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r4,7
	beq	.Lcsum_tail_doublewords	/* len < 128 */

	srdi	r6,r4,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	ld	r6,0(r3)
	ld	r9,8(r3)

	ld	r10,16(r3)
	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
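	/*
	 * (Eight adde instructions at 2 cycles each account for the 16
	 * cycle figure above: 64 bytes per 16 cycles, or 4 bytes/cycle.)
	 */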
	.align	5
2:
	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10

	adde	r0,r0,r11

	adde	r0,r0,r12

	adde	r0,r0,r14

	adde	r0,r0,r15
	ld	r6,0(r3)
	ld	r9,8(r3)

	adde	r0,r0,r16
	ld	r10,16(r3)
	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
	ld	r12,32(r3)
	ld	r14,40(r3)

	adde	r0,r0,r9
	ld	r15,48(r3)
	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
	adde	r0,r0,r11
	adde	r0,r0,r12
	adde	r0,r0,r14
	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r4,r4,63

.Lcsum_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r4,3
	beq	.Lcsum_tail_word

	mtctr	r6
3:
	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
	bdnz	3b

	andi.	r4,r4,7

.Lcsum_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r4,2
	beq	.Lcsum_tail_halfword

	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
	subi	r4,r4,4

.Lcsum_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r4,1
	beq	.Lcsum_tail_byte

	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
	subi	r4,r4,2

.Lcsum_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r4,1
	beq	.Lcsum_finish

	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
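	/*
	 * (The pad above puts the lone byte in bits 8-15, matching how a
	 *  big-endian lhz would have loaded it as the high byte of a
	 *  halfword; this code targets big-endian.)
	 */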

.Lcsum_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr
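
/*
 * For reference, csum_partial above computes roughly the following C
 * (an illustrative sketch only; the names are made up, and the
 * doubleword alignment and 64-byte unrolling of the assembly are
 * omitted; assumes a big-endian host like this code does):
 *
 *	static unsigned long long add1c(unsigned long long a,
 *					unsigned long long b)
 *	{
 *		a += b;
 *		return a + (a < b);	// end-around carry, like adde
 *	}
 *
 *	static unsigned int csum_partial_ref(const void *buff, int len,
 *					     unsigned int sum)
 *	{
 *		const unsigned char *p = buff;
 *		unsigned long long s = sum, v;
 *		unsigned int w;
 *		unsigned short h;
 *
 *		for (; len >= 8; p += 8, len -= 8) {	// doublewords
 *			memcpy(&v, p, 8);
 *			s = add1c(s, v);
 *		}
 *		if (len & 4) {				// word tail
 *			memcpy(&w, p, 4);
 *			s = add1c(s, w);
 *			p += 4;
 *		}
 *		if (len & 2) {				// halfword tail
 *			memcpy(&h, p, 2);
 *			s = add1c(s, h);
 *			p += 2;
 *		}
 *		if (len & 1)	// byte tail, padded into the high half
 *			s = add1c(s, (unsigned long long)*p << 8);
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 64 -> 33
 *		s = (s & 0xffffffff) + (s >> 32);	// fold 33 -> 32
 *		return s;
 *	}
 */
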
.macro source
100:
	.section __ex_table,"a"
	.align	3
	.llong	100b,.Lsrc_error
	.previous
.endm

.macro dest
200:
	.section __ex_table,"a"
	.align	3
	.llong	200b,.Ldest_error
	.previous
.endm

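/*
 * Each source/dest marker used below places a local label on the
 * following load or store and records an (address, fixup) pair in the
 * __ex_table section. If the access faults, the exception handler
 * looks the faulting address up in __ex_table and resumes execution at
 * .Lsrc_error or .Ldest_error instead. For example,
 *
 *	source;	ld	r6,0(r3)
 *
 * expands to roughly:
 *
 *	100:	ld	r6,0(r3)
 *		.section __ex_table,"a"
 *		.align	3
 *		.llong	100b,.Lsrc_error
 *		.previous
 */
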
/*
 * Computes the checksum of a memory block at src, length len,
 * and adds in "sum" (32-bit), while copying the block to dst.
 * If an access exception occurs on src or dst, it stores -EFAULT
 * to *src_err or *dst_err respectively. The caller must take any
 * action required in that case (zeroing memory, recalculating a
 * partial checksum, etc.).
 *
 * csum_partial_copy_generic(r3=src, r4=dst, r5=len, r6=sum, r7=src_err, r8=dst_err)
 */
_GLOBAL(csum_partial_copy_generic)
	addic	r0,r6,0			/* clear carry */

	srdi.	r6,r5,3			/* less than 8 bytes? */
	beq	.Lcopy_tail_word

	/*
	 * If only halfword aligned, align to a doubleword. Since odd-
	 * aligned addresses should be rare and would require more work
	 * to calculate the correct checksum, we ignore that case and
	 * take the potential slowdown of unaligned loads.
	 *
	 * If the source and destination are misaligned relative to each
	 * other, we align only the source. This keeps things simple.
	 */
	rldicl.	r6,r3,64-1,64-2		/* r6 = (r3 >> 1) & 0x3 */
	beq	.Lcopy_aligned

	li	r7,4
	sub	r6,r7,r6
	mtctr	r6

1:
source;	lhz	r6,0(r3)		/* align to doubleword */
	subi	r5,r5,2
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	bdnz	1b

.Lcopy_aligned:
	/*
	 * We unroll the loop such that each iteration is 64 bytes with an
	 * entry and exit limb of 64 bytes, meaning a minimum size of
	 * 128 bytes.
	 */
	srdi.	r6,r5,7
	beq	.Lcopy_tail_doublewords	/* len < 128 */

	srdi	r6,r5,6
	subi	r6,r6,1
	mtctr	r6

	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

source;	ld	r10,16(r3)
source;	ld	r11,24(r3)

	/*
	 * On POWER6 and POWER7 back to back addes take 2 cycles because of
	 * the XER dependency. This means the fastest this loop can go is
	 * 16 cycles per iteration. The scheduling of the loop below has
	 * been shown to hit this on both POWER6 and POWER7.
	 */
	.align	5
2:
	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
source;	ld	r6,0(r3)
source;	ld	r9,8(r3)

	adde	r0,r0,r16
source;	ld	r10,16(r3)
source;	ld	r11,24(r3)
	bdnz	2b


	adde	r0,r0,r6
source;	ld	r12,32(r3)
source;	ld	r14,40(r3)

	adde	r0,r0,r9
source;	ld	r15,48(r3)
source;	ld	r16,56(r3)
	addi	r3,r3,64

	adde	r0,r0,r10
dest;	std	r6,0(r4)
dest;	std	r9,8(r4)

	adde	r0,r0,r11
dest;	std	r10,16(r4)
dest;	std	r11,24(r4)

	adde	r0,r0,r12
dest;	std	r12,32(r4)
dest;	std	r14,40(r4)

	adde	r0,r0,r14
dest;	std	r15,48(r4)
dest;	std	r16,56(r4)
	addi	r4,r4,64

	adde	r0,r0,r15
	adde	r0,r0,r16

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	addi	r1,r1,STACKFRAMESIZE

	andi.	r5,r5,63

.Lcopy_tail_doublewords:		/* Up to 127 bytes to go */
	srdi.	r6,r5,3
	beq	.Lcopy_tail_word

	mtctr	r6
3:
source;	ld	r6,0(r3)
	addi	r3,r3,8
	adde	r0,r0,r6
dest;	std	r6,0(r4)
	addi	r4,r4,8
	bdnz	3b

	andi.	r5,r5,7

.Lcopy_tail_word:			/* Up to 7 bytes to go */
	srdi.	r6,r5,2
	beq	.Lcopy_tail_halfword

source;	lwz	r6,0(r3)
	addi	r3,r3,4
	adde	r0,r0,r6
dest;	stw	r6,0(r4)
	addi	r4,r4,4
	subi	r5,r5,4

.Lcopy_tail_halfword:			/* Up to 3 bytes to go */
	srdi.	r6,r5,1
	beq	.Lcopy_tail_byte

source;	lhz	r6,0(r3)
	addi	r3,r3,2
	adde	r0,r0,r6
dest;	sth	r6,0(r4)
	addi	r4,r4,2
	subi	r5,r5,2

.Lcopy_tail_byte:			/* Up to 1 byte to go */
	andi.	r6,r5,1
	beq	.Lcopy_finish

source;	lbz	r6,0(r3)
	sldi	r9,r6,8			/* Pad the byte out to 16 bits */
	adde	r0,r0,r9
dest;	stb	r6,0(r4)

.Lcopy_finish:
	addze	r0,r0			/* add in final carry */
	rldicl	r4,r0,32,0		/* fold two 32 bit halves together */
	add	r3,r4,r0
	srdi	r3,r3,32
	blr

.Lsrc_error:
	cmpdi	0,r7,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r7)
	blr

.Ldest_error:
	cmpdi	0,r8,0
	beqlr
	li	r6,-EFAULT
	stw	r6,0(r8)
	blr