• 设为首页
  • 点击收藏
  • 手机版
    手机扫一扫访问
    迪恩网络手机版
  • 关注官方公众号
    微信扫一扫关注
    公众号

glibcstrlendelphipascal

原作者: [db:作者] 来自: [db:来源] 收藏 邀请
From: Will DeWitt Jr.     
Subject: Fast strlen routine?
NewsGroup: borland.public.delphi.language.basm
Date Posted: 28-May-2003 at 13:50:4 PST

  
Download from Google
I've been tinkering with re-writing some of the standard C run-time
library routines and haven't really played much with MMX instructions,
or SSE for that matter.  But I thought what I came up with was
interesting and maybe worth sharing--

function  strlenmmx(s: PAnsiChar): longword; register;
asm
          TEST      EAX, EAX
          JZ        @@Error

          PXOR      MM1, MM1
          MOV       ECX, EAX    // save original pointer
@@1:
          MOVQ      MM0, [EAX]  // grab 8 chars
          PCMPEQB   MM0, MM1    // check all 8 for null/0 (00 = null, FF = not null - for each char in MM0)
          PMOVMSKB  EDX, MM0    // move 1-bit mask of each char to DL
          ADD       EAX, 8      // move pointer forward 8 chars
          TEST      EDX, EDX    // check for any null/0 chars
          JNZ       @@2

          MOVQ      MM0, [EAX]  // unroll twice (#1)
          PCMPEQB   MM0, MM1
          PMOVMSKB  EDX, MM0
          ADD       EAX, 8
          TEST      EDX, EDX
          JNZ       @@2

          MOVQ      MM0, [EAX]  // (#2)
          PCMPEQB   MM0, MM1
          PMOVMSKB  EDX, MM0
          ADD       EAX, 8
          TEST      EDX, EDX
          JZ        @@1

@@2:
          EMMS
          BSF       EDX, EDX
          SUB       EAX, DWORD PTR [@@SubTable+EDX*4]
          SUB       EAX, ECX
          RET
@@SubTable:
          DD        8
          DD        7
          DD        6
          DD        5
          DD        4
          DD        3
          DD        2
          DD        1
          DD        0
@@Error:
end;
function _PCharLen(P: _PAnsiChr): Longint;
{$IFNDEF LEGACY_PCHARLEN}
begin
  Result := 0;
  if P <> nil then
    while P[Result] <> #0 do
      Inc(Result);
end;
{$ELSE !LEGACY_PCHARLEN}
{$IFDEF CPUX86}
asm
        TEST    EAX,EAX
        JE      @@5
        PUSH    EAX
        XOR     ECX,ECX
@@0:    CMP     CL,[EAX+0]
        JE      @@4
        CMP     CL,[EAX+1]
        JE      @@3
        CMP     CL,[EAX+2]
        JE      @@2
        CMP     CL,[EAX+3]
        JE      @@1
        ADD     EAX,4
        JMP     @@0
@@1:    INC     EAX
@@2:    INC     EAX
@@3:    INC     EAX
@@4:    POP     ECX
        SUB     EAX,ECX
@@5:
end;
{$ENDIF CPUX86}
{$ENDIF !LEGACY_PCHARLEN}

http://www.verydemo.com/demo_c230_i66795.html

/* 下面是库函数中strlen的实现,比想像的要复杂  */
size_t strlen (str)
     const char *str;
{
     const char *char_ptr;
     const unsigned long int *longword_ptr;
     unsigned long int longword, himagic, lomagic;

     for (char_ptr = str; ((unsigned long int) char_ptr & (sizeof (longword) - 1)) != 0;
       ++char_ptr)
        if (*char_ptr == '\0')
          return char_ptr - str;

    longword_ptr = (unsigned long int *) char_ptr;

    himagic = 0x80808080L;
    lomagic = 0x01010101L;

  /* Instead of the traditional loop which tests each character,
     we will test a longword at a time.  The tricky part(棘手的部分) is testing
     if *any of the four* bytes in the longword in question are zero.  */
    for (;;)
    {
      longword = *longword_ptr++;

      if (((longword - lomagic) & ~longword & himagic) != 0)
      {
        /* 关键在于如果有0,就一定要测试出来,误判没关系 */
        /* 只是读,并没有写,不会出现段错误 */
        const char *cp = (const char *) (longword_ptr - 1);  /* 减一是因为前面已经加了1 */
        if (cp[0] == 0)
          return cp - str;
        if (cp[1] == 0)
          return cp - str + 1;
        if (cp[2] == 0)
          return cp - str + 2;
        if (cp[3] == 0)
          return cp - str + 3;
        if (sizeof (longword) > 4)
        {
            if (cp[4] == 0)
                return cp - str + 4;
            if (cp[5] == 0)
                return cp - str + 5;
            if (cp[6] == 0)
                return cp - str + 6;
            if (cp[7] == 0)
                return cp - str + 7;
        }
      }
    }
}

 

 

int i;
 
while (*str++ != '\0') ++i;
 
return i;

 http://www.strchr.com/optimized_strlen_function

http://www.strchr.com/sse2_optimised_strlen

 

size_t strlen(const char * str)
{
    const char *s;
    for (s = str; *s; ++s) {}
    return(s - str);
}
size_t strlen(const char *s) {
    const char *start = s;
    while(*s)
        s++;
    return s - start;
}

 

 

// for x86 only
size_t my_strlen(const char *s) {
    size_t len = 0;
    for(;;) {
        unsigned x = *(unsigned*)s;
        if((x & 0xFF) == 0) return len;
        if((x & 0xFF00) == 0) return len + 1;
        if((x & 0xFF0000) == 0) return len + 2;
        if((x & 0xFF000000) == 0) return len + 3;
        s += 4, len += 4;
    }
}

 

#ifndef WORDS_BIGENDIAN
    #if 0
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
        {
            register int i = 0;
            if (!(x & (1 << 0))) i ++;
            else return i;
            if (!(x & (1 << 1))) i ++;
            else return i;
            if (!(x & (1 << 2))) i ++;
            else return i;
            if (!(x & (1 << 3))) i ++;
            else return i;
            if (!(x & (1 << 4))) i ++;
            else return i;
            if (!(x & (1 << 5))) i ++;
            else return i;
            if (!(x & (1 << 6))) i ++;
            else return i;
            if (!(x & (1 << 7))) i ++;
            else return i;
            if (!(x & (1 << 8))) i ++;
            else return i;
            if (!(x & (1 << 9))) i ++;
            else return i;
            if (!(x & (1 << 10))) i ++;
            else return i;
            if (!(x & (1 << 11))) i ++;
            else return i;
            if (!(x & (1 << 12))) i ++;
            else return i;
            if (!(x & (1 << 13))) i ++;
            else return i;
            if (!(x & (1 << 14))) i ++;
            else return i;
            if (!(x & (1 << 15))) i ++;
            return i;
        }
    #elif 0
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes
        {
            // http://www.hackersdelight.org/: ntz3() shortened for 16-bit mask by Peter Kankowski
            register int n = 1;
            if ((x & 0x000000FFU) == 0) {n += 8; x >>= 8;}
            if ((x & 0x0000000FU) == 0) {n += 4; x >>= 4;}
            if ((x & 0x00000003U) == 0) {n += 2; x >>= 2;}
            return n - (x & 1);
        }
    #else
        static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40
        {                                                 // this is current winner for speed
            static const unsigned char table[256] = 
            {
                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
                4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
            };
            if ((unsigned char)x)
                return table[(unsigned char)x];
            return table[x >> 8] + 8; // t[x / 256] + 8
        }
    #endif
#else
    #if 0
        static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
        {
            register int i = 0;
            if (!(x & (1 << 15))) i ++;
            else return i;
            if (!(x & (1 << 14))) i ++;
            else return i;
            if (!(x & (1 << 13))) i ++;
            else return i;
            if (!(x & (1 << 12))) i ++;
            else return i;
            if (!(x & (1 << 11))) i ++;
            else return i;
            if (!(x & (1 << 10))) i ++;
            else return i;
            if (!(x & (1 << 9))) i ++;
            else return i;
            if (!(x & (1 << 8))) i ++;
            else return i;
            if (!(x & (1 << 7))) i ++;
            else return i;
            if (!(x & (1 << 6))) i ++;
            else return i;
            if (!(x & (1 << 5))) i ++;
            else return i;
            if (!(x & (1 << 4))) i ++;
            else return i;
            if (!(x & (1 << 3))) i ++;
            else return i;
            if (!(x & (1 << 2))) i ++;
            else return i;
            if (!(x & (1 << 1))) i ++;
            else return i;
            if (!(x & (1 << 0))) i ++;
            return i;
        }
    #else
        static inline int count_bits_to_0(unsigned int x)  // counting trailing zeroes
        {
            // http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask
            register int n = 0;
            if (x <= 0x000000FFU) {n = n + 8; x = x << 8;}
            if (x <= 0x00000FFFU) {n = n + 4; x = x << 4;}
            if (x <= 0x00003FFFU) {n = n + 2; x = x << 2;}
            if (x <= 0x00007FFFU) {n = n + 1;}
            return n;
        }
    #endif
#endif
size_t strlen(const char *str)
{
    register size_t len = 0;
    // align to 16 bytes
    while ((((intptr_t)str) & (sizeof(__m128i)-1)) != 0)
    {
        if (*str++ == 0)
            return len;
        ++ len;
    }
    // search for 0
    __m128i xmm0 = _mm_setzero_si128();
    __m128i xmm1;
    int mask = 0;
    for (;;)
    {
        xmm1 = _mm_load_si128((__m128i *)str);
        xmm1 = _mm_cmpeq_epi8(xmm1, xmm0);
        if ((mask = _mm_movemask_epi8(xmm1)) != 0)
        {
            // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask
            // find index of first set bit

        #ifndef _DISABLE_ASM_BSF // define it to disable ASM
            #if (_MSC_VER >= 1300)   // make sure <intrin.h> is included
                unsigned long pos;
                _BitScanForward(&pos, mask);
                len += (size_t)pos;
            #elif defined(_MSC_VER)  // earlier MSVC's do not have _BitScanForward, use inline asm
                __asm bsf edx, mask ; edx = bsf(mask)
                __asm add edx, len  ; edx += len
                __asm mov len, edx  ; len = edx
            #elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz
                len += __builtin_ctz(mask);
            #elif defined(__GNUC__) // older GCC shall use inline asm
                unsigned int pos;
                asm("bsf %1, %0" : "=r" (pos) : "rm" (mask));
                len += (size_t)pos;
            #else                    // none of choices exist, use local BSF implementation
                len += count_bits_to_0(mask);
            #endif
        #else
            len += count_bits_to_0(mask);
        #endif

            break;
        }
        str += sizeof(__m128i);
        len += sizeof(__m128i);
    }
    return len;
}

 

This implementation would win more performance boost if 'count_bits_to_0' is optimised in less conditions.

We could use _mm_loadu_si128 to load unaligned data and thus skip own aligning loop but the performance

will still be worse due to additional CPU cycles if _mm_loadu_si128 is used.

SSE2 SIMD instructions are present on all modern CPUs and thus this implementation

may bring real benefits to intensive database/text processing applications.

License: Public Domain.

 

http://stackoverflow.com/questions/2372315/how-to-implement-strlen-as-fast-as-possible

 also do two micro-optimizations:

  • Since most strings we use scan consist of ASCII chars in the range 0~127, the 
    high bit is (almost) never set, so only check for it in a second test.

  • Increment an index rather than a pointer,
    which is cheaper on some architectures (notably x86) and give you the length for 'free'... 

uint32_t gatopeich_strlen32(const char* str)
{
    uint32_t *u32 = (uint32_t*)str, u, abcd, i=0;
    while(1)
    {
        u = u32[i++];
        abcd = (u-0x01010101) & 0x80808080;
        if (abcd && // If abcd is not 0, we have NUL or a non-ASCII char > 127...
             (abcd &= ~u)) // ... Discard non-ASCII chars
        {
        #if BYTE_ORDER == BIG_ENDIAN
            return 4*i - (abcd&0xffff0000 ? (abcd&0xff000000?4:3) : abcd&0xff00? 
                       
                    
                    

鲜花

握手

雷人

路过

鸡蛋
该文章已有0人参与评论

请发表评论

全部评论

专题导读
上一篇:
K-means聚类算法MATLAB发布时间:2022-07-18
下一篇:
matlab中nargin函数输入参数数目发布时间:2022-07-18
热门推荐
阅读排行榜

扫描微信二维码

查看手机版网站

随时了解更新最新资讯

139-2527-9053

在线客服(服务时间 9:00~18:00)

在线QQ客服
地址:深圳市南山区西丽大学城创智工业园
电邮:jeky_zhao#qq.com
移动电话:139-2527-9053

Powered by 互联科技 X3.4© 2001-2213 极客世界.|Sitemap