Below is a faster implementation of the memcpy function (the previous version is here). Here is a comparison of the old version, the new version, FreeBSD's version written in C, and the standard implementation on the test:
test0: FreeBSD memcpy in C 2.7686 test1: <new dps_memcpy> 0.43485 test2: <old dps_memcpy> 2.50218 test3: <standard memcpy> 0.456584 ratio(1/2): 0.17 ratio(1/0): 0.16 ratio(2/0): 0.90 ratio(1/3): 0.95
These results are for unaligned data under FreeBSD 7.1 running on an Intel Core 2 Duo E8400 at 3 GHz. So the new implementation is about 5 times faster than the old version, and slightly (about 5%) faster than FreeBSD's memcpy (which is implemented in assembly language but works on integer alignment).
The results for aligned data are as follows:
test0: FreeBSD memcpy in C 0.0575361 test1: <new dps_memcpy> 0.022099 test2: <old dps_memcpy> 0.290257 test3: <standard memcpy> 0.00528717 ratio(1/2): 0.08 ratio(1/0): 0.38 ratio(2/0): 5.04 ratio(1/3): 4.18
So the new version is about 10 times faster than the old one, but about 4 times slower than the FreeBSD memcpy function written in assembly language (though the new version is about 2.5 times faster than FreeBSD's implementation in C).
/*
 * Unit of the word-at-a-time part of the copy.
 */
typedef long long word; // the byte-wise helpers in this file copy at most 31 bytes, so a word may be up to 32 bytes long
/* Size of one word in bytes. */
#define wsize sizeof(word)
/* Byte offset of an address within a word; works as a bit mask only when wsize is a power of two. */
#define wmask (wsize - 1)
/*
 * Copy the leading bytes of src into dst one byte at a time, in ascending
 * index order (index 0 first).
 *
 * dst - destination buffer
 * src - source buffer
 * t   - number of bytes requested; at most 31 bytes are copied, so any
 *       larger value behaves exactly like 31
 *
 * Because bytes are moved lowest index first, the copy is safe for
 * overlapping buffers when dst lies below src.
 */
void dps_minimove_forward(char *dst, const char *src, size_t t) {
    const size_t cap = 31; /* highest index ever written is 30 */
    const size_t count = (t < cap) ? t : cap;

    for (size_t i = 0; i < count; i++) {
        dst[i] = src[i];
    }
}
/*
 * Copy t bytes from src into dst one byte at a time, in descending index
 * order (index t - 1 first, index 0 last).
 *
 * dst - destination buffer
 * src - source buffer; only read, hence const-qualified like the source
 *       parameter of dps_minimove_forward
 * t   - number of bytes to copy; only 1..31 are handled, any other value
 *       (0 or more than 31) copies nothing
 *
 * Because bytes are moved highest index first, the copy is safe for
 * overlapping buffers when dst lies above src.
 */
void dps_minimove_backward(char *dst, const char *src, size_t t) {
    /* Enter at the case matching the byte count and fall through down to 1. */
    switch (t) {
    case 31: dst[30] = src[30]; /* fallthrough */
    case 30: dst[29] = src[29]; /* fallthrough */
    case 29: dst[28] = src[28]; /* fallthrough */
    case 28: dst[27] = src[27]; /* fallthrough */
    case 27: dst[26] = src[26]; /* fallthrough */
    case 26: dst[25] = src[25]; /* fallthrough */
    case 25: dst[24] = src[24]; /* fallthrough */
    case 24: dst[23] = src[23]; /* fallthrough */
    case 23: dst[22] = src[22]; /* fallthrough */
    case 22: dst[21] = src[21]; /* fallthrough */
    case 21: dst[20] = src[20]; /* fallthrough */
    case 20: dst[19] = src[19]; /* fallthrough */
    case 19: dst[18] = src[18]; /* fallthrough */
    case 18: dst[17] = src[17]; /* fallthrough */
    case 17: dst[16] = src[16]; /* fallthrough */
    case 16: dst[15] = src[15]; /* fallthrough */
    case 15: dst[14] = src[14]; /* fallthrough */
    case 14: dst[13] = src[13]; /* fallthrough */
    case 13: dst[12] = src[12]; /* fallthrough */
    case 12: dst[11] = src[11]; /* fallthrough */
    case 11: dst[10] = src[10]; /* fallthrough */
    case 10: dst[9] = src[9];   /* fallthrough */
    case 9: dst[8] = src[8];    /* fallthrough */
    case 8: dst[7] = src[7];    /* fallthrough */
    case 7: dst[6] = src[6];    /* fallthrough */
    case 6: dst[5] = src[5];    /* fallthrough */
    case 5: dst[4] = src[4];    /* fallthrough */
    case 4: dst[3] = src[3];    /* fallthrough */
    case 3: dst[2] = src[2];    /* fallthrough */
    case 2: dst[1] = src[1];    /* fallthrough */
    case 1: dst[0] = src[0];
        break;
    default: /* 0 or more than 31 bytes: nothing is copied */
        break;
    }
}
/*
 * Copy length bytes from src0 to dst0 and return dst0.
 *
 * The copy direction is chosen from the relative order of the two
 * addresses: ascending when dst0 is below src0, descending otherwise.
 * Overlapping regions are therefore copied correctly, as with memmove.
 *
 * Each direction works in three phases:
 *   1. a byte-wise copy that brings the source pointer to a word boundary
 *      (or copies everything when fewer than wsize bytes were requested),
 *   2. a word-wise copy of the bulk: first a leading group of 1..8 words,
 *      then fully unrolled groups of eight words,
 *   3. a byte-wise copy of the remaining (length % wsize) bytes.
 *
 * NOTE(review): only the source pointer is brought to a word boundary.
 * The destination is accessed through a word pointer without an alignment
 * check, so the word stores are misaligned whenever dst0 and src0 differ
 * in their offset within a word. This relies on the target tolerating
 * unaligned word access - confirm before using on strict-alignment CPUs.
 */
void * dps_memcpy_new(char *dst0, char *src0, size_t length) {
    size_t t;

    if (length == 0 || dst0 == src0) /* nothing to do */
        return dst0;

    if ((unsigned long long)dst0 < (unsigned long long)src0) { /* copy forward */
        char *dst = dst0, *src = src0;

        /*
         * Offset of src within a word. Only the low bits matter; size_t is
         * used instead of unsigned int so the pointer is not truncated on
         * 64-bit targets.
         */
        t = (size_t)src & wmask;
        if (t) {
            if (length < wsize) {
                t = length;        /* short copy: do it all byte-wise */
            } else {
                t = wsize - t;     /* bytes up to the next word boundary */
            }
            dps_minimove_forward(dst, src, t);
            length -= t;
            src += t;
            dst += t;
        }

        t = length / wsize;        /* whole words left to copy */
        if (t) {
            size_t n = (t + 7) / 8; /* number of groups, the first may be short */
            size_t r = t % 8;       /* words in the first group */
            size_t i;
            word *wdst = (word *)dst, *wsrc = (word *)src;

            if (r == 0)
                r = 8;
            /* Leading group of r words, lowest address first. */
            for (i = 0; i < r; i++)
                wdst[i] = wsrc[i];
            wsrc += r;
            wdst += r;
            /* Remaining groups are always exactly eight words. */
            while (--n > 0) {
                wdst[0] = wsrc[0];
                wdst[1] = wsrc[1];
                wdst[2] = wsrc[2];
                wdst[3] = wsrc[3];
                wdst[4] = wsrc[4];
                wdst[5] = wsrc[5];
                wdst[6] = wsrc[6];
                wdst[7] = wsrc[7];
                wsrc += 8;
                wdst += 8;
            }
            dst = (char *)wdst;
            src = (char *)wsrc;
        }

        /* Trailing bytes that do not fill a whole word. */
        t = length & wmask;
        if (t)
            dps_minimove_forward(dst, src, t);
    } else { /* copy backward */
        /* Both pointers start one past the end and move downwards. */
        char *dst = dst0 + length, *src = src0 + length;

        /* Bytes by which the end of the source extends past a word boundary. */
        t = (size_t)src & wmask;
        if (t) {
            if (length < wsize) {
                t = length;        /* short copy: do it all byte-wise */
            }
            dst -= t;
            src -= t;
            length -= t;
            dps_minimove_backward(dst, src, t);
        }

        t = length / wsize;        /* whole words left to copy */
        if (t) {
            size_t n = (t + 7) / 8; /* number of groups, the first may be short */
            size_t r = t % 8;       /* words in the first group */
            size_t i;
            word *wdst = (word *)dst, *wsrc = (word *)src;

            if (r == 0)
                r = 8;
            /* Leading group of r words, highest address first. */
            wsrc -= r;
            wdst -= r;
            for (i = r; i > 0; i--)
                wdst[i - 1] = wsrc[i - 1];
            /* Remaining groups are always exactly eight words. */
            while (--n > 0) {
                wsrc -= 8;
                wdst -= 8;
                wdst[7] = wsrc[7];
                wdst[6] = wsrc[6];
                wdst[5] = wsrc[5];
                wdst[4] = wsrc[4];
                wdst[3] = wsrc[3];
                wdst[2] = wsrc[2];
                wdst[1] = wsrc[1];
                wdst[0] = wsrc[0];
            }
            dst = (char *)wdst;
            src = (char *)wsrc;
        }

        /* Leading bytes (lowest addresses) that do not fill a whole word. */
        t = length & wmask;
        if (t)
            dps_minimove_backward(dst - t, src - t, t);
    }
    return dst0;
}