Boyer-Moore算法的C++实现

OStack程序员社区-中国程序员成长平台 › 门户 › 编程› C++›C++教程

原作者: [db:作者] 来自: [db:来源] 收藏邀请

以上给出了通俗易懂的算法讲解，下面给出代码实现，使用的宽字符，这样就不限于英文字母了:(stdafx.h编译不过去就屏蔽掉)

// StringSearch_BoyerMoore.cpp : 定义控制台应用程序的入口点。
//
/*
	@Date: 2016/03/21 18:31;
	@Where: Tongzhou District of Beijing;
	@Author: Alex Hong;
	@Mail: [email protected];
	@Reference: 
		http://www.ruanyifeng.com/blog/2013/05/boyer-moore_string_search_algorithm.html BM算法-阮一峰的网络日志;
*/

#include "stdafx.h"
#include <sstream>
#include <iostream> 
#include <string>
#include <vector>
#include <map>
using namespace std;

typedef std::vector<int> TArrayInt;
TArrayInt g_arSearchResult;//store the index of searched result;
typedef std::map<std::wstring, int> TMapWszToInt; //E.g. "我,4":7;
typedef std::map<int, int> TMapIntToInt;


void StringSearch_BM(wchar_t* wszStrMain, int iLenMain, wchar_t* wszSearch, int iLenSearch, bool bOnlyFirst)
{
	//reset variables;
	g_arSearchResult.clear();

	//handle exceptions..
	if( NULL == wszStrMain || NULL == wszSearch)
		return;
	if(iLenSearch > iLenMain)
		return;
	/*
		未知的字符有很多, 但是已知的搜索字符是有限的. 我们只初始化已知的字符的跳跃值(Skip), 数组中不存在的就是-1;
		Shift数组的初始化同样基于这种想法.
		初始化这两个数组的值后, 对于长主串/耗时的任务是有利的;
	*/

	int i = 0, j = 0;

	//Initialize the Skip array data for Bad-Character-Rules..
	TMapWszToInt mapSkip;
	mapSkip.clear();
	TMapWszToInt::iterator it1;
	std::wostringstream oStream;
	for(i = iLenSearch - 1; i > 0; --i)//0-->1;
	{
		for(j = i - 1; j >= 0; --j)
		{
			if(wszSearch[j] != wszSearch[i]) //Skip is for bad character which is different with wszSearch;
			{
				oStream.clear();//clear bad bits;
				oStream.str(L"");//clear data;
				oStream<<wszSearch[j]<<','<<i;//format to key string;
				//wcout<<oStream.str().c_str()<<endl;//test..
				it1 = mapSkip.find(oStream.str());
				if(mapSkip.end() == it1) //only set the pos of nearest same bad character;
					mapSkip.insert(TMapWszToInt::value_type(oStream.str(), i-j));
			}
		}
	}

	//好后缀的前提是: 至少有一个字符是相同的!!!!!!
	//Initialize the Shift array data for Good-Suffix-Rules..
	//Shift数组的键值是好后缀的最左字符的下标;
	//1. 首先判断模式串中是否含有完全一致的好后缀;有就直接计算位移, EXIT;
	//  其实这本身就是一个字符串搜索的过程,为了简化计算, 这里只使用Bad-Character-Rule;
	//2. 如果没有, 就在串首寻找好后缀的子串;
	TMapIntToInt mapShift;	
	mapShift.clear();
	mapShift.insert(TMapIntToInt::value_type(0, iLenSearch));//0==i的话就匹配了, Shift[]=iLenSearch;
	//1. try to find the same suffix..
	for(int iSuffixPosL = iLenSearch - 1; iSuffixPosL > 0; --iSuffixPosL)//iSuffixPosL: left position of good suffix;
	{
		int iSuffixLen = iLenSearch - iSuffixPosL;
		wchar_t* wszSuffix = wszSearch + iSuffixPosL;
		
		for(int iPosMainL = iSuffixPosL - 1; iPosMainL >= 0;)//不断向左移动suffix串比较;
		{
			//compare..
			for(i = 0; i < iSuffixLen; ++i)
			{
				if(wszSearch[iPosMainL+i] != wszSuffix[i])//bad character
					break;
			}
			//check result..
			if(iSuffixLen == i) //find it!
			{				
				mapShift[iLenSearch-i] = iSuffixPosL - iPosMainL; 
				break; //结束为当前suffix找Shift值;
			}
			else //calc the skip distance;
			{
				for(j = i + 1; j < iSuffixLen; ++j)//在suffix中寻找最近的坏字符
				{
					if(wszSuffix[j] == wszSearch[iPosMainL+i])
						break;
				}
				iPosMainL = iPosMainL - (j - i);//iSuffixLen==j的情况也适用;	
			}
		}
	}
	//2. try to find the child suffix at head..
	TMapIntToInt::iterator it2;
	for(int iSuffixPosL = iLenSearch - 1; iSuffixPosL > 0; --iSuffixPosL)//iSuffixPosL: left position of good suffix;
	{
		it2 = mapShift.find(iSuffixPosL);
		if(mapShift.end() == it2)
		{
			wchar_t* wszSuffix = NULL;
			int iSuffixLen  = 0;
			for(i = 0; i < iLenSearch - iSuffixPosL; ++i)//form sub suffix big to small.
			{
				wszSuffix = wszSearch + iSuffixPosL + i;
				iSuffixLen = iLenSearch - iSuffixPosL - i;
				//compare if match..				
				for(j = 0; j < iSuffixLen; ++j)
				{
					if(wszSearch[j] != wszSuffix[j])
						break;
				}
				if(iSuffixLen == j) //find it!
				{
					mapShift[iSuffixPosL] = iLenSearch - iSuffixLen; //以好后缀的最后一个字符为准;
					break;
				}
			}

			//if no sub suffix at all..
			it2 = mapShift.find(iSuffixPosL);
			if(mapShift.end() == it2)
				mapShift[iSuffixPosL] = iLenSearch;
		}
	}

	//================
	//start to search!
	//================
	int iPosMainL = 0;//left iterator of main string;
	//int iPosSearch = 0;//iterator of search string; no use!
	while( iPosMainL <= iLenMain - iLenSearch + 1)
	{
		//compare with aligned sub-string..
		int i = iLenSearch - 1;
		for(; i >= 0; --i)//right to left comparison;
		{
			if(wszSearch[i] != wszStrMain[iPosMainL+i])//bad character!
				break;
		}
		//----calc how many steps to move rightwards..----
		int iSkip = 0;
		if(i < 0)
		{
			g_arSearchResult.push_back(iPosMainL);
			if(bOnlyFirst)
			{
				mapSkip.clear();
				mapShift.clear();
				return;
			}
			iSkip = iLenSearch;
		}
		else
		{
			//1. calc max skip steps for bad characters..
			oStream.clear();//clear bits;
			oStream.str(L"");//clear data;
			oStream<<wszStrMain[iPosMainL+i]<<','<<i;
			it1 = mapSkip.find((wchar_t*)oStream.str().c_str());
			if(mapSkip.end() == it1)
				iSkip = i + 1;
			else
				iSkip = it1->second;
		}
		
		//--2. calc max shift steps for good suffix..--
		int iShift = 0;
		if(i < iLenSearch - 1)//至少有一个相同的字符才可以使用"好后缀"规则!
		{
			it2 = mapShift.find(i-1);
			if(mapShift.end() != it2)
				iShift = it2->second;
		}
		iPosMainL += ((iSkip>iShift) ? iSkip : iShift);
	}

	//clear..
	mapSkip.clear();
	mapShift.clear();
}

int _tmain(int argc, _TCHAR* argv[])
{
	wchar_t wszMainStr[] = L"THIS IS A 我你她我不是的LE EXA我们我们MPLEEXAMPLE我们";
	wchar_t wszSearchStr[] = L"我们";

	StringSearch_BM(wszMainStr, wcslen(wszMainStr), wszSearchStr, wcslen(wszSearchStr), false);

	for(TArrayInt::iterator it = g_arSearchResult.begin(); it != g_arSearchResult.end(); ++it)
		cout<<*it<<endl;

	//clear..
	g_arSearchResult.clear();

	getchar();
	return 0;
}

一些例子分析过程:

主字符串:       THIS IS A SIMPLE EXAMPLE (长度:24)
搜索字符串:    EXAMPLE                  (长度:7)


mapSkip的初始化值:

主串中的坏字符的下面字符是:'E'

mapSkip['L,6'] = 1
mapSkip['P,6'] = 2
mapSkip['M,6'] = 3
mapSkip['A,6'] = 4
mapSkip['X,6'] = 5
                             mapSkip['E,6'] = 不是坏字符, 舍去
其他: 7
-----------------------------------------------------------
搜索字符串:    EXAMPLE                   (长度:7)
主串中的坏字符的下面字符是:
         'L'
mapSkip['P,5'] = 1
mapSkip['M,5'] = 2
mapSkip['A,5'] = 3
mapSkip['X,5'] = 4
mapSkip['E,5'] = 5
其他:6
---------------------------------------------------------------
...
主字符串:     THIS IS A SIMPLE EXAMPLE (长度:24)
搜索字符串:    EXAMPLE                     (长度:7)
主串中的坏字符的下面字符是:
         'X'
mapSkip['E,1'] = 1
其他:2
---------------------------------------------------------------
mapSkip['[任何],0'] = 1

============好后缀数组初始化的流程============================

搜索字符串:    SIMPLE EXAMPLE (长度:14)

0 1 2 3 4 5 6 7 8 9 10 11 12 13
S I M P L   E     E X A   M   P    L    E

好后缀的第一个字符为:'E' --> Shift[13] = 6; //以最后一个字符的下标为基准计算要移动的距离;
好后缀的第一个字符为:'L' --> Shift[12] = 8; //"SIMPLE"中有全匹配的, 取全匹配的;
好后缀的第一个字符为:'P' --> Shift[11] = 8;
好后缀的第一个字符为:'M' --> Shift[10] = 8;
好后缀的第一个字符为:'A' --> Shift[9] = 14; //没有全匹配的, 在串头部找子串匹配的, 这里没有, 认为在-1位置(不妨虚拟出来);
...
好后缀的第一个字符为:'I' --> Shift[1] = 14;
好后缀的第一个字符为:'S' --> Shift[0] = 14;//可能需要特殊处理;

E.g. 2

0 1 2 3   4   5   6   7
L E X A   M   P   L   E

好后缀的第一个字符为:'E' --> Shift[7] = 6;
好后缀的第一个字符为:'L' --> Shift[6] = 6;
好后缀的第一个字符为:'P' --> Shift[6] = 6;

鲜花

握手

雷人

路过

鸡蛋

该文章已有0人参与评论

请发表评论

全部评论

专题导读

More+

10-27 六六分期app的软件客服如何联系？(六六分期

11-06 可心卡盟:win10系统火狐flash插件崩溃怎么

11-06 亲亲特价:怎么删除回收站图标

11-06 济南大学虚拟社区:鲁大师节能降温的具体办

11-06 xlueops.exe:无线网络安装向导

11-06 女斗合众国:win7系统cf与主机连接不稳定怎

11-06 0xc000022-[cf烟雾头]cf怎么调烟雾头

11-06 qizideyouhuo:应用程序无法正常启动0xc0000

11-06 ipz-185:win7系统vcf文件怎么打开

11-06 傻哥蹦迪:win10系统s4怎么打开usb调试

11-06 八神浩树gtaste:回收站清空了怎么恢复

11-06 妖尾之黑色守护:win10系统电脑没有1440x900

11-06 校园至尊魔王小说:win7系统浏览网页时字体

11-06 女斗合众国:win10系统访问共享文件夹提示请

11-06 tokyo hot n0654:恢复win7系统默认字体一招

11-06 雨酷仙境:设置win7系统转移临时文件夹腾出

11-06 阿穆纳伊之杖:win7系统开始菜单在右边还原

11-06 tunespotting:win10系统火狐flash插件总是

11-06 甘尔葛分析师：计谋网站seo关键词暴涨有什

11-06 蔡贵霖: 计谋网站seo关键词暴涨有什么秘密

11-06 博益网首页:ao3网页版进入不了解决方法

11-06 漏斗子专栏: 网站数据分析小白易懂精华篇

11-06 见证双虹怎么做:win7系统开启telnet命令的

11-06 颾狐蝶蜋:系统资源不足无法完成请求的服务

11-06 国光中学校歌:提交网站到alexa查询详细步骤

11-06 西安有情天:静态网页和动态网页的区别

11-06 红木雅尚斋:外部链接构造对网站的好处

11-06 前官礼遇：防止域名劫持–增强域安全性的10

11-06 密传二转答案: 中文分词算法有哪些

11-06 金泉家园邮编:百度快照劫持的表现及应对方

C#中使用Newtonsoft.Json处理JSON数据绝对能用发布时间：2022-07-14

刚用c#完成的一个交通仿真程序发布时间：2022-07-14

剪的笔顺,诠释剪的笔画,认识剪的部首

1 六六分期app的软件客服如何联系？(六六分期

六六分期app的软件客服如何联系？不知道吗？加qq群【895510560】即可！标题：六六分期

阅读：19144|2023-10-27

2 可心卡盟:win10系统火狐flash插件崩溃怎么

今天小编告诉大家如何处理win10系统火狐flash插件总是崩溃的问题，可能很多用户都不知

阅读：9973|2022-11-06

3 亲亲特价:怎么删除回收站图标

今天小编告诉大家如何对win10系统删除桌面回收站图标进行设置，可能很多用户都不知道

阅读：8317|2022-11-06

4 济南大学虚拟社区:鲁大师节能降温的具体办

今天小编告诉大家如何对win10系统电脑设置节能降温的设置方法，想必大家都遇到过需要

阅读：8686|2022-11-06

5 xlueops.exe:无线网络安装向导

我们在使用xp系统的过程中,经常需要对xp系统无线网络安装向导设置进行设置，可能很多

阅读：8627|2022-11-06

6 女斗合众国:win7系统cf与主机连接不稳定怎

今天小编告诉大家如何处理win7系统玩cf老是与主机连接不稳定的问题，可能很多用户都不

阅读：9643|2022-11-06

7 0xc000022-[cf烟雾头]cf怎么调烟雾头

电脑对日常生活的重要性小编就不多说了，可是一旦碰到win7系统设置cf烟雾头的问题，很

阅读：8611|2022-11-06

8 qizideyouhuo:应用程序无法正常启动0xc0000

我们在日常使用电脑的时候，有的小伙伴们可能在打开应用的时候会遇见提示应用程序无法

阅读：7991|2022-11-06

9 ipz-185:win7系统vcf文件怎么打开

今天小编告诉大家如何对win7系统打开vcf文件进行设置，可能很多用户都不知道怎么对win

阅读：8642|2022-11-06

10 傻哥蹦迪:win10系统s4怎么打开usb调试

今天小编告诉大家如何对win10系统s4开启USB调试模式进行设置，可能很多用户都不知道怎

阅读：7527|2022-11-06

客服电话

电子邮件

Boyer-Moore算法的C++实现

请发表评论

全部评论

上一篇：

下一篇：

jcmsalves/kotlin-playground: Kotlin Andr

bradtraversy/iweather: Ionic 3 mobile we

avehtari/BDA_m_demos: Bayesian Data Anal

断牙刷新位置时间（断牙属性及刷新位置介绍

delphi进度条

剪的笔顺,诠释剪的笔画,认识剪的部首

六六分期app的软件客服如何联系？(六六分期

florent37/ViewAnimator: A fluent Android

florent37/Shrine-MaterialDesign2: implem

CVE-2020-36276

SimpleSoftwareIO/simple-sms: Send and re

关于我们

产品与服务

解决方案

139-2527-9053