在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
突然想写个爬虫,然后发现,如果有正则表达式,会方便些。 C++11 的 <regex> 标准库提供了正则表达式支持,可以用来完成: 1.Match:将整个输入拿来比对(匹配)某个正则表达式。 2.Search:查找“与正则表达式吻合”的子序列。 3.Tokenize:把正则表达式作为分隔符,得到被分隔符隔开的各段字符串。 4.Replace:将与正则表达式吻合的子序列替换掉。
主要函数有: regex_match(),regex_search(),regex_replace(); 主要对象:sregex_iterator,sregex_token_iterator,regex,smatch
例子: [_[:alpha:]][_[:alnum:]]* 表示,以_或字母开头,后面接着任意个_、字母或数字的组合 [123]?[0-9]\.1?[0-9]\.20[0-9]{2} 表示德国日期格式(german format),如 24.12.2010
C++11 的 regex 默认使用 ECMAScript 文法,该文法规定了正则表达式的构造方式。
// Test example for the article: a walkthrough of the C++11 <regex> facilities
// (regex_match, regex_search, regex_replace, sregex_iterator,
// sregex_token_iterator, smatch).
#include <regex>
#include <iostream>
#include <string>
#include <iomanip>
#include <algorithm>
#include <cstddef>   // std::size_t
#include <cstdlib>   // std::system
#include <iterator>  // std::back_inserter
using namespace std;

/*
 * Prints "found" when b is true and "not found" otherwise, followed by a
 * newline.
 */
void out(bool b){
    cout << (b ? "found" : "not found") << endl;
}

void regex1();
void regex2();
void regex3();
void regex4();
void regex5();
void regex6();

/*
 * Entry point. The individual demos are left commented out; enable the ones
 * you want to run. The code below iterates over every date-like match in
 * `data` and prints the whole match followed by its three capture groups.
 */
int main(){
    //regex1();
    //regex2();
    //regex3();
    //regex4();
    //regex5();
    //regex6();
    const string data = "1994-06-25\n"
                        "2015-09-13\n"
                        "2015 09 13\n";

    // The regex is a named object on purpose: the iterator keeps referring to
    // it, and constructing an sregex_iterator from a temporary regex is a
    // deleted overload since C++14.
    const regex reg("(\\d{4})[- ](\\d{2})[- ](\\d{2})");

    const sregex_iterator end;  // a default-constructed iterator marks the end
    for(sregex_iterator pos(data.cbegin(), data.cend(), reg); pos != end; ++pos){
        cout << pos->str() << " ";
        cout << pos->str(1) << " " << pos->str(2) << " " << pos->str(3) << endl;
    }

#ifdef _WIN32
    // "pause" is a Windows shell built-in; keep the console window open there
    // and skip the call on every other platform.
    system("pause");
#endif
    return 0;
}

/*
 * regex_replace(text, regex, format)
 * Replaces every subsequence matched by the regex with the expansion of the
 * format string ($1, $2, ... refer to the capture groups).
 * The iterator overload writes to an output iterator instead; with
 * format_no_copy the non-matching text is dropped and with format_first_only
 * only the first match is replaced.
 */
void regex6(){
    const string data = "<person>\n"
                        "<first>Nico</first>\n"
                        "<last>Josuttis</last>\n"
                        "</person>\n";
    const regex reg("<(.*)>(.*)</(\\1)>");

    cout << regex_replace(data, reg, "<$1 value=\"$2\"/>") << endl;

    string res2;
    regex_replace(back_inserter(res2),
                  data.cbegin(), data.cend(),
                  reg,
                  "<$1 value=\"$2\"/>",
                  regex_constants::format_no_copy | regex_constants::format_first_only);
    cout << res2 << endl;
}

/*
 * sregex_token_iterator: tokenizer.
 * With sub-match index 0 it yields each whole match; with index -1 it yields
 * the text between matches, which is how the names below are split on the
 * separator expression.
 */
void regex5(){
    const string data = "<person>\n"
                        "<first>Nico</first>\n"
                        "<last>Josuttis</last>\n"
                        "</person>\n";
    const regex reg("<(.*)>(.*)</(\\1)>");

    const sregex_token_iterator end;
    for(sregex_token_iterator pos(data.cbegin(), data.cend(), reg, 0); pos != end; ++pos){
        cout << "match: " << pos->str() << endl;
    }
    cout << endl;

    const string names = "nico,jim,helmut,paul,tim,john paul,rita";
    const regex sep("[ \t\n]*[,;.][ \t\n]*");
    const sregex_token_iterator e;
    for(sregex_token_iterator p(names.cbegin(), names.cend(), sep, -1); p != e; ++p){
        cout << "name: " << *p << endl;
    }
}

/*
 * sregex_iterator: iterates over all matching subsequences.
 * sregex_iterator is regex_iterator<string::const_iterator>, hence
 * cbegin()/cend(). The same traversal is shown twice: once with a hand-written
 * loop and once with for_each and a lambda.
 */
void regex4(){
    const string data = "<person>\n"
                        "<first>Nico</first>\n"
                        "<last>Josuttis</last>\n"
                        "</person>\n";
    const regex reg("<(.*)>(.*)</(\\1)>");

    const sregex_iterator end;
    for(sregex_iterator pos(data.cbegin(), data.cend(), reg); pos != end; ++pos){
        cout << "match: " << pos->str(0) << endl;
        cout << "tag: " << pos->str(1) << endl;
        cout << "value " << pos->str(2) << endl;
    }

    sregex_iterator beg(data.cbegin(), data.cend(), reg);
    for_each(beg, end, [](const smatch& m){
        cout << "match: " << m.str() << endl;
        cout << "tag: " << m.str(1) << endl;
        cout << "value " << m.str(2) << endl;
    });
}

/*
 * bool regex_search(begin, end, smatch, regex)
 * Finds the first subsequence in [begin, end) that matches the regex.
 * m.suffix() is the sub_match covering everything after the match; its
 * `first` iterator points just past the match, so assigning it to `pos`
 * resumes the search there and the loop visits every match in turn.
 */
void regex3(){
    const string data = "<person>\n"
                        "<first>Nico</first>\n"
                        "<last>Josuttis</last>\n"
                        "</person>\n";
    const regex reg("<(.*)>(.*)</(\\1)>");

    auto pos = data.cbegin();
    const auto end = data.cend();
    smatch m;
    for(; regex_search(pos, end, m, reg); pos = m.suffix().first){
        cout << "match: " << m.str() << endl;
        cout << "tag: " << m.str(1) << endl;
        cout << "value: " << m.str(2) << endl;
        cout << "m.prefix(): " << m.prefix().str() << endl;
        cout << "m.suffix(): " << m.suffix().str() << endl;
    }
}

/*
 * bool regex_search(string, smatch, regex)
 * Finds the first subsequence of the string that matches the regex and shows
 * the ways to read the result out of the smatch: index 0 is the whole match,
 * index i > 0 is the i-th capture group.
 */
void regex2(){
    const string data = "XML tag: <tag-name>the value</tag-name>.";
    cout << "data: " << data << "\n\n";

    smatch m;
    const bool found = regex_search(data, m, regex("<(.*)>(.*)</(\\1)>"));

    cout << "m.empty(): " << boolalpha << m.empty() << endl;
    cout << "m.size(): " << m.size() << endl;
    if(!found){
        return;
    }

    cout << "m.str(): " << m.str() << endl;
    cout << "m.length(): " << m.length() << endl;
    cout << "m.position(): " << m.position() << endl;
    cout << "m.prefix().str(): " << m.prefix().str() << endl;
    cout << "m.suffix().str(): " << m.suffix().str() << endl;
    cout << endl;

    // size_t index: m.size() is unsigned, so an int counter would mix signs.
    for(size_t i = 0; i < m.size(); ++i){
        cout << "m[" << i << "].str(): " << m[i].str() << endl;
        cout << "m.str(" << i << "): " << m.str(i) << endl;
        cout << "m.position(" << i << "): " << m.position(i) << endl;
    }
    cout << endl;

    cout << "matches:" << endl;
    for(const auto& sub : m){
        cout << " " << sub << " ";
        cout << "(length: " << sub.length() << ")" << endl;
    }
}

/*
 * bool regex_match(string, regex)
 * Succeeds only when the regex matches the WHOLE character sequence, whereas
 * regex_search succeeds when any subsequence matches. The second group of
 * calls contrasts the two on an input with leading text.
 */
void regex1(){
    // Plain ECMAScript pattern.
    regex reg1("<.*>.*</.*>");
    bool found = regex_match("<tag>value</tag>", reg1);
    out(found);

    // \1 is a back-reference to the first capture group (the tag name).
    regex reg2("<(.*)>.*</\\1>");
    found = regex_match("<tag>value</tag>", reg2);
    out(found);

    // Same pattern in grep grammar, where groups are written \( ... \).
    regex reg3("<\\(.*\\)>.*</\\1>", regex_constants::grep);
    found = regex_match("<tag>value</tag>", reg3);
    out(found);

    // A temporary regex can be passed directly.
    found = regex_match("<tag>value</tag>", regex("<(.*)>.*</\\1>"));
    out(found);
    cout << endl;

    // regex_match versus regex_search on input that has a leading "XML tag: ".
    found = regex_match("XML tag: <tag>value</tag>", regex("<(.*)>.*</\\1>"));
    out(found);
    found = regex_match("XML tag: <tag>value</tag>", regex(".*<(.*)>.*</\\1>"));
    out(found);
    found = regex_search("XML tag: <tag>value</tag>", regex("<(.*)>.*</\\1>"));
    out(found);
    found = regex_search("XML tag: <tag>value</tag>", regex(".*<(.*)>.*</\\1>"));
    out(found);
}
|
2023-10-27
2022-08-15
2022-08-17
2022-09-23
2022-08-13
请发表评论