LOAD _t1, (offset + UNIT(1))(src); \
LOAD _t2, (offset + UNIT(2))(src); \
LOAD _t3, (offset + UNIT(3))(src); \
+ ADDC(_t0, _t1); \
+ ADDC(_t2, _t3); \
ADDC(sum, _t0); \
- ADDC(sum, _t1); \
- ADDC(sum, _t2); \
- ADDC(sum, _t3)
+ ADDC(sum, _t2)
#ifdef USE_DOUBLE
#define CSUM_BIGCHUNK(src, offset, sum, _t0, _t1, _t2, _t3) \
SUB len, len, 8*NBYTES
ADD src, src, 8*NBYTES
STORE(t0, UNIT(0)(dst), .Ls_exc\@)
- ADDC(sum, t0)
+ ADDC(t0, t1)
STORE(t1, UNIT(1)(dst), .Ls_exc\@)
- ADDC(sum, t1)
+ ADDC(sum, t0)
STORE(t2, UNIT(2)(dst), .Ls_exc\@)
- ADDC(sum, t2)
+ ADDC(t2, t3)
STORE(t3, UNIT(3)(dst), .Ls_exc\@)
- ADDC(sum, t3)
+ ADDC(sum, t2)
STORE(t4, UNIT(4)(dst), .Ls_exc\@)
- ADDC(sum, t4)
+ ADDC(t4, t5)
STORE(t5, UNIT(5)(dst), .Ls_exc\@)
- ADDC(sum, t5)
+ ADDC(sum, t4)
STORE(t6, UNIT(6)(dst), .Ls_exc\@)
- ADDC(sum, t6)
+ ADDC(t6, t7)
STORE(t7, UNIT(7)(dst), .Ls_exc\@)
- ADDC(sum, t7)
+ ADDC(sum, t6)
.set reorder /* DADDI_WAR */
ADD dst, dst, 8*NBYTES
bgez len, 1b
SUB len, len, 4*NBYTES
ADD src, src, 4*NBYTES
STORE(t0, UNIT(0)(dst), .Ls_exc\@)
- ADDC(sum, t0)
+ ADDC(t0, t1)
STORE(t1, UNIT(1)(dst), .Ls_exc\@)
- ADDC(sum, t1)
+ ADDC(sum, t0)
STORE(t2, UNIT(2)(dst), .Ls_exc\@)
- ADDC(sum, t2)
+ ADDC(t2, t3)
STORE(t3, UNIT(3)(dst), .Ls_exc\@)
- ADDC(sum, t3)
+ ADDC(sum, t2)
.set reorder /* DADDI_WAR */
ADD dst, dst, 4*NBYTES
beqz len, .Ldone\@
nop # improves slotting
#endif
STORE(t0, UNIT(0)(dst), .Ls_exc\@)
- ADDC(sum, t0)
+ ADDC(t0, t1)
STORE(t1, UNIT(1)(dst), .Ls_exc\@)
- ADDC(sum, t1)
+ ADDC(sum, t0)
STORE(t2, UNIT(2)(dst), .Ls_exc\@)
- ADDC(sum, t2)
+ ADDC(t2, t3)
STORE(t3, UNIT(3)(dst), .Ls_exc\@)
- ADDC(sum, t3)
+ ADDC(sum, t2)
.set reorder /* DADDI_WAR */
ADD dst, dst, 4*NBYTES
bne len, rem, 1b