The even faster strncpy has a caveat: it does not pad the destination with zero bytes when the source string is shorter than the length passed as the function parameter. The version below fixes this issue, yet it is still faster than the standard strncpy when the code is compiled with optimization for a modern processor.
/* Unit used for word-at-a-time zero filling; intended to be up to 32 bytes long. */
typedef long long word;
/* Size of one word, in bytes. */
#define wsize (sizeof(word))
/* Mask that extracts a byte offset within a word (wsize minus one). */
#define wmask (wsize - 1)
/*
 * dps_minibzero - set the first t bytes at dst to zero.
 *
 * dst: start of the region to clear; must be writable for t bytes.
 * t:   number of bytes to clear; 0 leaves the buffer untouched.
 *
 * Meant for short runs (less than one machine word), but every requested
 * byte is cleared whatever the value of t.
 *
 * Declared `static inline`: a plain C99 `inline` definition emits no
 * external symbol, so any call the compiler decides not to inline (for
 * example in an unoptimized build) would fail at link time.
 */
static inline void dps_minibzero(char *dst, size_t t) {
    size_t i;

    for (i = 0; i < t; i++) {
        dst[i] = '\0';
    }
}
/*
 * dps_strncpy - strncpy() work-alike with an unrolled copy loop and
 * word-wise zero padding.
 *
 * dst0:   destination buffer; must be writable for `length` bytes.
 * src0:   source string; it is read only up to and including its first
 *         NUL byte, or for `length` bytes, whichever comes first.
 * length: exact number of bytes written to dst0.
 *
 * Bytes are copied until `length` bytes have been written or a NUL byte has
 * been copied.  When the source ends early, the remainder of the
 * destination is filled with zero bytes.  As with strncpy(), the result is
 * not NUL-terminated if the source holds `length` or more non-NUL bytes.
 *
 * Returns dst0.  A `length` of 0 touches neither buffer.
 */
void * dps_strncpy(char *dst0, char *src0, size_t length) {
    if (length) {
        register size_t n = length / 8;
        register size_t r = (length % 8);
        register char *dst = dst0, *src = src0;

        /*
         * First pass: copy.  The leading chunk holds the 1..8 bytes left over
         * after dividing the length into groups of eight; n counts that chunk
         * plus the full groups that follow.  On a NUL byte, dst is advanced
         * just past it and control jumps to the padding pass.
         */
        if (r == 0) r = 8; else n++;
        if (!(dst[0] = src[0])) { dst += 1; src += 1; goto dps_strncpy_second_pas; }
        if (r > 1 && !(dst[1] = src[1])) { dst += 2; src += 2; goto dps_strncpy_second_pas; }
        if (r > 2 && !(dst[2] = src[2])) { dst += 3; src += 3; goto dps_strncpy_second_pas; }
        if (r > 3 && !(dst[3] = src[3])) { dst += 4; src += 4; goto dps_strncpy_second_pas; }
        if (r > 4 && !(dst[4] = src[4])) { dst += 5; src += 5; goto dps_strncpy_second_pas; }
        if (r > 5 && !(dst[5] = src[5])) { dst += 6; src += 6; goto dps_strncpy_second_pas; }
        if (r > 6 && !(dst[6] = src[6])) { dst += 7; src += 7; goto dps_strncpy_second_pas; }
        if (r > 7 && !(dst[7] = src[7])) { dst += 8; src += 8; goto dps_strncpy_second_pas; }
        src += r;
        dst += r;

        /* Remaining full groups of eight bytes. */
        while (--n > 0) {
            if (!(dst[0] = src[0])) { dst += 1; src += 1; goto dps_strncpy_second_pas; }
            if (!(dst[1] = src[1])) { dst += 2; src += 2; goto dps_strncpy_second_pas; }
            if (!(dst[2] = src[2])) { dst += 3; src += 3; goto dps_strncpy_second_pas; }
            if (!(dst[3] = src[3])) { dst += 4; src += 4; goto dps_strncpy_second_pas; }
            if (!(dst[4] = src[4])) { dst += 5; src += 5; goto dps_strncpy_second_pas; }
            if (!(dst[5] = src[5])) { dst += 6; src += 6; goto dps_strncpy_second_pas; }
            if (!(dst[6] = src[6])) { dst += 7; src += 7; goto dps_strncpy_second_pas; }
            if (!(dst[7] = src[7])) { dst += 8; src += 8; goto dps_strncpy_second_pas; }
            src += 8;
            dst += 8;
        }

        /*
         * Second pass: zero padding.  Nothing is left to do when the copy
         * pass filled the whole destination (dst == dst0 + length).
         */
dps_strncpy_second_pas:
        if (dst < dst0 + length) {
            size_t t, restlen = length - (size_t)(dst - dst0);

            /*
             * Byte offset of dst inside a word.  The cast goes through
             * size_t so the pointer value is not narrowed on platforms
             * where pointers are wider than unsigned int.
             */
            t = (size_t)dst & wmask;
            if (t) {
                /* Clear bytes one by one up to the next word boundary, or
                   the whole remainder if it is shorter than one word. */
                if (restlen < wsize) {
                    t = restlen;
                } else {
                    t = wsize - t;
                }
                dps_minibzero(dst, t);
                restlen -= t;
                dst += t;
            }

            /* Whole words, unrolled the same way as the copy pass. */
            t = restlen / wsize;
            if (t) {
                /* NOTE(review): storing through a word pointer into a char
                   buffer assumes the compiler tolerates this aliasing. */
                register word *wdst = (word *)dst;

                n = t / 8;
                r = (t % 8);
                if (r == 0) r = 8; else n++;
                wdst[0] = (word)0;
                if (r > 1) wdst[1] = (word)0;
                if (r > 2) wdst[2] = (word)0;
                if (r > 3) wdst[3] = (word)0;
                if (r > 4) wdst[4] = (word)0;
                if (r > 5) wdst[5] = (word)0;
                if (r > 6) wdst[6] = (word)0;
                if (r > 7) wdst[7] = (word)0;
                wdst += r;
                while (--n > 0) {
                    wdst[0] = (word)0;
                    wdst[1] = (word)0;
                    wdst[2] = (word)0;
                    wdst[3] = (word)0;
                    wdst[4] = (word)0;
                    wdst[5] = (word)0;
                    wdst[6] = (word)0;
                    wdst[7] = (word)0;
                    wdst += 8;
                }
                dst = (char *)wdst;
            }

            /* Trailing bytes that do not fill a whole word. */
            if ( (t = (restlen & wmask)) ) dps_minibzero(dst, t);
        }
    }
    return dst0;
}