在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
/*
 * From: Will DeWitt Jr.
 * Subject: Fast strlen routine?
 * NewsGroup: borland.public.delphi.language.basm
 * Date Posted: 28-May-2003 at 13:50:4 PST
 * Download from Google
 *
 * I've been tinkering with re-writing some of the standard C run-time
 * library routines and haven't really played much with MMX instructions, or
 * SSE for that matter. But I thought what I came up with was interesting and
 * maybe worth sharing--
 *
 * function strlenmmx(s: PAnsiChar): longword; register;
 * asm
 *   TEST EAX, EAX
 *   JZ @@Error
 *   PXOR MM1, MM1
 *   MOV ECX, EAX       // save original pointer
 * @@1:
 *   MOVQ MM0, [EAX]    // grab 8 chars
 *   PCMPEQB MM0, MM1   // check all 8 for null/0 (00 = null, FF = not null - for each char in MM0)
 *   PMOVMSKB EDX, MM0  // move 1-bit mask of each char to DL
 *   ADD EAX, 8         // move pointer forward 8 chars
 *   TEST EDX, EDX      // check for any null/0 chars
 *   JNZ @@2
 *   MOVQ MM0, [EAX]    // unroll twice (#1)
 *   PCMPEQB MM0, MM1
 *   PMOVMSKB EDX, MM0
 *   ADD EAX, 8
 *   TEST EDX, EDX
 *   JNZ @@2
 *   MOVQ MM0, [EAX]    // (#2)
 *   PCMPEQB MM0, MM1
 *   PMOVMSKB EDX, MM0
 *   ADD EAX, 8
 *   TEST EDX, EDX
 *   JZ @@1
 * @@2:
 *   EMMS
 *   BSF EDX, EDX
 *   SUB EAX, DWORD PTR [@@SubTable+EDX*4]
 *   SUB EAX, ECX
 *   RET
 * @@SubTable:
 *   DD 8 DD 7 DD 6 DD 5 DD 4 DD 3 DD 2 DD 1 DD 0
 * @@Error:
 * end;
 *
 * function _PCharLen(P: _PAnsiChr): Longint;
 * {$IFNDEF LEGACY_PCHARLEN}
 * begin
 *   Result := 0;
 *   if P <> nil then
 *     while P[Result] <> #0 do
 *       Inc(Result);
 * end;
 * {$ELSE !LEGACY_PCHARLEN}
 * {$IFDEF CPUX86}
 * asm
 *   TEST EAX,EAX
 *   JE @@5
 *   PUSH EAX
 *   XOR ECX,ECX
 * @@0:
 *   CMP CL,[EAX+0]
 *   JE @@4
 *   CMP CL,[EAX+1]
 *   JE @@3
 *   CMP CL,[EAX+2]
 *   JE @@2
 *   CMP CL,[EAX+3]
 *   JE @@1
 *   ADD EAX,4
 *   JMP @@0
 * @@1: INC EAX
 * @@2: INC EAX
 * @@3: INC EAX
 * @@4: POP ECX
 *   SUB EAX,ECX
 * @@5:
 * end;
 * {$ENDIF CPUX86}
 * {$ENDIF !LEGACY_PCHARLEN}
 *
 * http://www.verydemo.com/demo_c230_i66795.html
 */

/* Below is the C library's implementation of strlen; it is more involved
   than one might expect. */

/*
 * strlen - return the number of bytes in STR before its terminating NUL.
 *
 * str: pointer to a NUL-terminated string (must not be NULL).
 *
 * The scan proceeds byte-by-byte until the pointer is aligned to
 * sizeof (unsigned long), then one whole longword per iteration.
 * Only 4-byte and 8-byte `unsigned long` are supported.
 */
size_t
strlen (const char *str)
{
  const char *char_ptr;
  const unsigned long int *longword_ptr;
  unsigned long int longword, himagic, lomagic;

  /* Handle the leading bytes one at a time until CHAR_PTR sits on a
     longword boundary.  */
  for (char_ptr = str;
       ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
       ++char_ptr)
    if (*char_ptr == '\0')
      return char_ptr - str;

  longword_ptr = (const unsigned long int *) char_ptr;

  /* HIMAGIC has the top bit of every byte set, LOMAGIC the bottom bit.  */
  himagic = 0x80808080L;
  lomagic = 0x01010101L;
  if (sizeof (longword) > 4)
    {
      /* 64-bit longword: replicate the pattern into the upper four bytes,
         otherwise a NUL in bytes 4..7 would never be detected.  The shift
         is done in two steps so that it stays well-defined (and free of
         warnings) when long is only 32 bits wide.  */
      himagic = ((himagic << 16) << 16) | himagic;
      lomagic = ((lomagic << 16) << 16) | lomagic;
    }

  /* Instead of the traditional loop which tests each character, we will
     test a longword at a time.  The tricky part is testing if *any of the
     four* (or eight) bytes in the longword in question are zero.  */
  for (;;)
    {
      longword = *longword_ptr++;

      if (((longword - lomagic) & ~longword & himagic) != 0)
        {
          /* Every zero byte must be caught by the test above; a false
             positive would be harmless because the per-byte checks below
             would simply fall through to the next longword.  */
          /* The word was loaded from an aligned address, so the bytes read
             past the terminator lie in the same aligned longword.  */

          /* Minus one because LONGWORD_PTR was already post-incremented.  */
          const char *cp = (const char *) (longword_ptr - 1);

          if (cp[0] == 0)
            return cp - str;
          if (cp[1] == 0)
            return cp - str + 1;
          if (cp[2] == 0)
            return cp - str + 2;
          if (cp[3] == 0)
            return cp - str + 3;
          if (sizeof (longword) > 4)
            {
              if (cp[4] == 0)
                return cp - str + 4;
              if (cp[5] == 0)
                return cp - str + 5;
              if (cp[6] == 0)
                return cp - str + 6;
              if (cp[7] == 0)
                return cp - str + 7;
            }
        }
    }
}
/* Naive byte-at-a-time length count (statement fragment: `str` is the
   string pointer supplied by the enclosing function). */
int i = 0; /* must start at zero: an uninitialized automatic has an indeterminate value */
while (*str++ != '\0')
    ++i;
return i;
/* Further reading:
   http://www.strchr.com/optimized_strlen_function
   http://www.strchr.com/sse2_optimised_strlen */
/*
 * Two alternative textbook listings of strlen.  Both define the same
 * symbol, so only one of them can be compiled into a program.
 */

/* Variant 1: advance a cursor to the terminator and return its distance
   from the start of the string. */
size_t strlen(const char * str)
{
    const char *cursor = str;

    while (*cursor) {
        ++cursor;
    }
    return (cursor - str);
}

/* Variant 2: remember where the string began, walk the argument itself to
   the terminator, and return the distance travelled. */
size_t strlen(const char *s)
{
    const char *origin;

    for (origin = s; *s; s++) {
    }
    return s - origin;
}
/*
 * my_strlen - length of a NUL-terminated string, scanned one `unsigned`
 * word at a time.
 *
 * s: pointer to a NUL-terminated string (must not be NULL).
 * Returns the number of bytes before the terminating NUL.
 *
 * The leading bytes are examined individually until the pointer is aligned
 * to sizeof(unsigned); only then are whole words loaded.  An aligned word
 * load cannot straddle a page boundary, so bytes read beyond the terminator
 * always belong to the same aligned word as the terminator itself.  The
 * position of the NUL inside a word is found by indexing bytes, so the
 * result does not depend on byte order.
 *
 * NOTE: like every word-at-a-time strlen, this reads the string through an
 * `unsigned` lvalue and may touch bytes after the terminator; it relies on
 * the platform tolerating that, as the original listing did.
 */
size_t my_strlen(const char *s)
{
    const unsigned lo = ~0u / 0xFFu; /* 0x01 in every byte */
    const unsigned hi = lo << 7;     /* 0x80 in every byte */
    const char *p = s;

    /* Byte-wise prologue: stop at the first word-aligned address. */
    while (((size_t)p & (sizeof(unsigned) - 1)) != 0) {
        if (*p == '\0') {
            return (size_t)(p - s);
        }
        ++p;
    }

    for (;;) {
        unsigned x = *(const unsigned *)(const void *)p;

        /* Non-zero exactly when at least one byte of x is zero. */
        if (((x - lo) & ~x & hi) != 0) {
            for (size_t k = 0; k < sizeof x; ++k) {
                if (p[k] == '\0') {
                    return (size_t)(p - s) + k;
                }
            }
        }
        p += sizeof x;
    }
}
/*
 * count_bits_to_0 - portable fallback used by the SSE2 strlen below to find
 * the position of the first set bit in a 16-bit mask.
 *
 * Several alternative implementations are kept side by side; the
 * preprocessor selects exactly one.  Without WORDS_BIGENDIAN the count
 * starts at bit 0 and moves upward; with WORDS_BIGENDIAN it starts at
 * bit 15 and moves downward.
 */
#ifndef WORDS_BIGENDIAN
#if 0
/* Disabled variant: linear scan from bit 0 upward.  Returns the number of
   consecutive clear bits starting at bit 0 (16 when bits 0..15 are all clear). */
static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
{
   register int i = 0;
   if (!(x & (1 << 0))) i ++;
   else return i;
   if (!(x & (1 << 1))) i ++;
   else return i;
   if (!(x & (1 << 2))) i ++;
   else return i;
   if (!(x & (1 << 3))) i ++;
   else return i;
   if (!(x & (1 << 4))) i ++;
   else return i;
   if (!(x & (1 << 5))) i ++;
   else return i;
   if (!(x & (1 << 6))) i ++;
   else return i;
   if (!(x & (1 << 7))) i ++;
   else return i;
   if (!(x & (1 << 8))) i ++;
   else return i;
   if (!(x & (1 << 9))) i ++;
   else return i;
   if (!(x & (1 << 10))) i ++;
   else return i;
   if (!(x & (1 << 11))) i ++;
   else return i;
   if (!(x & (1 << 12))) i ++;
   else return i;
   if (!(x & (1 << 13))) i ++;
   else return i;
   if (!(x & (1 << 14))) i ++;
   else return i;
   if (!(x & (1 << 15))) i ++;
   return i;
}
#elif 0
/* Disabled variant: branch-halving search (8, then 4, then 2, then 1 bits). */
static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
{
   // http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski
   register int n = 1;
   if ((x & 0x000000FFU) == 0) {n += 8; x >>= 8;}
   if ((x & 0x0000000FU) == 0) {n += 4; x >>= 4;}
   if ((x & 0x00000003U) == 0) {n += 2; x >>= 2;}
   return n - (x & 1);
}
#else
/* Active little-endian variant: 256-entry lookup on the low byte; when the
   low byte is zero, look up the next byte and add 8.
   NOTE(review): `x >> 8` is used as an index without masking, so this
   assumes x fits in 16 bits - confirm for any caller other than strlen. */
static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40
{  // this is current winner for speed
   static const unsigned char table[256] =
   {
      7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
      4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
   };
   if ((unsigned char)x)
       return table[(unsigned char)x];
   return table[x >> 8] + 8; // t[x / 256] + 8
}
#endif
#else
#if 0
/* Disabled big-endian variant: linear scan from bit 15 downward.  Returns
   the number of consecutive clear bits starting at bit 15 (16 when bits
   15..0 are all clear). */
static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
{
   register int i = 0;
   if (!(x & (1 << 15))) i ++;
   else return i;
   if (!(x & (1 << 14))) i ++;
   else return i;
   if (!(x & (1 << 13))) i ++;
   else return i;
   if (!(x & (1 << 12))) i ++;
   else return i;
   if (!(x & (1 << 11))) i ++;
   else return i;
   if (!(x & (1 << 10))) i ++;
   else return i;
   if (!(x & (1 << 9))) i ++;
   else return i;
   if (!(x & (1 << 8))) i ++;
   else return i;
   if (!(x & (1 << 7))) i ++;
   else return i;
   if (!(x & (1 << 6))) i ++;
   else return i;
   if (!(x & (1 << 5))) i ++;
   else return i;
   if (!(x & (1 << 4))) i ++;
   else return i;
   if (!(x & (1 << 3))) i ++;
   else return i;
   if (!(x & (1 << 2))) i ++;
   else return i;
   if (!(x & (1 << 1))) i ++;
   else return i;
   if (!(x & (1 << 0))) i ++;
   return i;
}
#else
/* Active big-endian variant: threshold comparisons that shift x left by
   8, 4 and 2 bits, accumulating the count in n. */
static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
{
   // http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask
   register int n = 0;
   if (x <= 0x000000FFU) {n = n + 8; x = x << 8;}
   if (x <= 0x00000FFFU) {n = n + 4; x = x << 4;}
   if (x <= 0x00003FFFU) {n = n + 2; x = x << 2;}
   if (x <= 0x00007FFFU) {n = n + 1;}
   return n;
}
#endif
#endif

/*
 * strlen - SSE2 string length: count the bytes before the terminating NUL.
 *
 * str: pointer to a NUL-terminated string.
 * Returns the number of bytes preceding the first zero byte.
 *
 * Bytes are checked one at a time until `str` is 16-byte aligned; after that
 * 16 bytes are compared against zero per iteration using aligned loads.
 * Bytes that follow the terminator inside the same 16-byte block are loaded
 * but do not affect the result, because only the lowest set mask bit is used.
 *
 * NOTE(review): __m128i, the _mm_* intrinsics, intptr_t and (for MSVC)
 * _BitScanForward must be declared by includes that are not visible in this
 * listing - confirm <emmintrin.h>, <stdint.h> and <intrin.h> are present.
 */
size_t strlen(const char *str)
{
   register size_t len = 0;
   // align to 16 bytes
   while ((((intptr_t)str) & (sizeof(__m128i)-1)) != 0)
   {
       if (*str++ == 0)
           return len;
       ++ len;
   }
   // search for 0
   __m128i xmm0 = _mm_setzero_si128();
   __m128i xmm1;
   int mask = 0;
   for (;;)
   {
       xmm1 = _mm_load_si128((__m128i *)str);
       // each byte equal to zero becomes 0xFF, every other byte becomes 0x00
       xmm1 = _mm_cmpeq_epi8(xmm1, xmm0);
       // collapse the per-byte results into one bit per byte
       if ((mask = _mm_movemask_epi8(xmm1)) != 0)
       {
           // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
           // find index of first set bit
#ifndef _DISABLE_ASM_BSF // define it to disable ASM
#if (_MSC_VER >= 1300)   // make sure <intrin.h> is included
           unsigned long pos;
           _BitScanForward(&pos, mask);
           len += (size_t)pos;
#elif defined(_MSC_VER)  // earlier MSVC's do not have _BitScanForward, use inline asm
           __asm bsf edx, mask ; edx = bsf(mask)
           __asm add edx, len  ; edx += len
           __asm mov len, edx  ; len = edx
#elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
           len += __builtin_ctz(mask);
#elif defined(__GNUC__) // older GCC shall use inline asm
           unsigned int pos;
           asm("bsf %1, %0" : "=r" (pos) : "rm" (mask));
           len += (size_t)pos;
#else                    // none of choices exist, use local BSF implementation
           len += count_bits_to_0(mask);
#endif
#else
           len += count_bits_to_0(mask);
#endif
           break;
       }
       // no terminator in this block: advance to the next 16 bytes
       str += sizeof(__m128i);
       len += sizeof(__m128i);
   }
   return len;
}
This implementation would gain a further performance boost if 'count_bits_to_0' were optimised to use fewer conditional branches. We could use _mm_loadu_si128 to load unaligned data and thus skip our own alignment loop, but performance would still be worse because of the additional CPU cycles that _mm_loadu_si128 costs. SSE2 SIMD instructions are present on all modern CPUs, so this implementation may bring real benefits to intensive database/text-processing applications. License: Public Domain.
http://stackoverflow.com/questions/2372315/how-to-implement-strlen-as-fast-as-possible also do two micro-optimizations:
uint32_t gatopeich_strlen32(const char* str) { uint32_t *u32 = (uint32_t*)str, u, abcd, i=0; while(1) { u = u32[i++]; abcd = (u-0x01010101) & 0x80808080; if (abcd && // If abcd is not 0, we have NUL or a non-ASCII char > 127... (abcd &= ~u)) // ... Discard non-ASCII chars { #if BYTE_ORDER == BIG_ENDIAN return 4*i - (abcd&0xffff0000 ? (abcd&0xff000000?4:3) : abcd&0xff00? 全部评论
专题导读
热门推荐
热门话题
阅读排行榜
|
请发表评论