/*
 * dps_memcpy - copy `length` bytes from `src0` to `dst0`.
 *
 * The copy direction is picked from the relative addresses of the two
 * buffers: forward when the destination starts below the source, backward
 * otherwise.  Because of that, overlapping regions are copied correctly.
 *
 * The inner loop is unrolled eight times; the switch jumps into the middle
 * of the first pass so that the `length % 8` leftover bytes are handled by
 * that first, partial pass and every following pass copies a full 8 bytes.
 *
 * Returns `dst0`.  Nothing is touched when `length` is 0 or when both
 * pointers are equal.
 */
void * dps_memcpy(void *dst0, const void *src0, size_t length) {
    if (length == 0 || dst0 == src0) /* nothing to do */
        return dst0;

    /*
     * Number of passes through the unrolled loop, counting the partial
     * first one.  Written without `length + 7` so it cannot wrap around
     * for lengths close to the maximum of size_t.
     */
    size_t n = length / 8 + (length % 8 != 0);

    /*
     * Addresses are compared as size_t rather than unsigned long: the
     * latter is narrower than a pointer on LLP64 targets and would drop
     * the upper address bits.
     */
    if ((size_t)dst0 < (size_t)src0) { /* copy forward */
        unsigned char *dst = dst0;
        const unsigned char *src = src0;

        switch (length % 8) {
        case 0: do { *dst++ = *src++; /* fallthrough */
        case 7:      *dst++ = *src++; /* fallthrough */
        case 6:      *dst++ = *src++; /* fallthrough */
        case 5:      *dst++ = *src++; /* fallthrough */
        case 4:      *dst++ = *src++; /* fallthrough */
        case 3:      *dst++ = *src++; /* fallthrough */
        case 2:      *dst++ = *src++; /* fallthrough */
        case 1:      *dst++ = *src++;
                } while (--n > 0);
        }
    } else { /* copy backward */
        /* Byte pointers are needed here: arithmetic on void * is not standard C. */
        unsigned char *dst = (unsigned char *)dst0 + length;
        const unsigned char *src = (const unsigned char *)src0 + length;

        switch (length % 8) {
        case 0: do { *--dst = *--src; /* fallthrough */
        case 7:      *--dst = *--src; /* fallthrough */
        case 6:      *--dst = *--src; /* fallthrough */
        case 5:      *--dst = *--src; /* fallthrough */
        case 4:      *--dst = *--src; /* fallthrough */
        case 3:      *--dst = *--src; /* fallthrough */
        case 2:      *--dst = *--src; /* fallthrough */
        case 1:      *--dst = *--src;
                } while (--n > 0);
        }
    }
    return dst0;
}
N.B.: Code is under GPL.
Addendum: A faster version of memcpy
That's not fast. In fact, it's more than three times slower than my implementations (plain C). Almost anything other than the naive `char *s, *d; while (n--) *d++ = *s++;` loop can probably already beat this.
But what is your implementation?
Could you tell me what your implementation looks like? And when you say it's faster, is it faster than what's natively available in gcc, or just faster than the routine given here?
Jan, perhaps you're looking only at aligned data, but on unaligned data the results are different.
Anyway, there is a new implementation, which is about 10 times faster on aligned data.
Pingback: Even faster memcpy | Founds