在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
递归枚举IHTMLDocument2的所有元素 http://blog.csdn.net/fishmai/article/details/52388843 void EnumHTMLDocument( MSHTML::IHTMLDocument2* pDoc ) { if( pDoc == NULL )return; //遍历搜索子框架,递归处理子框架的文档 CComPtr<MSHTML::IHTMLFramesCollection2> spFramesCollection; pDoc->get_frames( &spFramesCollection ); long lCount = 0; HRESULT hr = spFramesCollection->get_length( &lCount ); if( FAILED( hr ) )return; for ( long lIndex = 0; lIndex < lCount; lIndex++ ) { CComVariant vDispWin; vDispWin = spFramesCollection->item( &CComVariant( lIndex ) ); CComQIPtr<MSHTML::IHTMLWindow2> spWin = vDispWin.pdispVal; if( spWin == NULL )continue; CComPtr<MSHTML::IHTMLDocument2> spSubDoc; spWin->get_document( &spSubDoc ); EnumHTMLDocument( spSubDoc ); } CComQIPtr<MSHTML::IHTMLElementCollection> spElementCollection; hr = pDoc->get_forms( &spElementCollection ); if( FAILED( hr ) )return; long lFormCount = 0; hr = spElementCollection->get_length( &lFormCount ); if( FAILED( hr ) )return; for ( long lIndex = 0; lIndex < lFormCount; lIndex++ ) { CComQIPtr<MSHTML::IHTMLFormElement> spFormElement = spElementCollection->item( &CComVariant( lIndex ) ); if( spFormElement == NULL )continue; long lElemCount = 0; hr = spFormElement->get_length( &lElemCount ); if( FAILED( hr ) )continue; for ( long lElemIndex = 0; lElemIndex < lElemCount; lElemIndex++ ) { CComDispatchDriver spInputElement; spInputElement = spFormElement->item( &CComVariant( lElemIndex ) ); if( spInputElement == NULL )continue; CComVariant varName, varValue, varType; hr = spInputElement.GetPropertyByName( L"name", &varName ); if( SUCCEEDED( hr ) ) { LPCTSTR lpszName = varName.bstrVal ? COLE2CT( varName.bstrVal ) : _T("NULL"); AtlMessageBox( NULL, lpszName ); } hr = spInputElement.GetPropertyByName( L"value", &varValue ); if( SUCCEEDED( hr ) ) { LPCTSTR lpszValue = varValue.bstrVal ? COLE2CT( varValue.bstrVal ) : _T("NULL"); AtlMessageBox( NULL, lpszValue ); } hr = spInputElement.GetPropertyByName( L"type", &varType ); if( SUCCEEDED( hr ) ) { LPCTSTR lpszType = varType.bstrVal ? COLE2CT( varType.bstrVal ) : _T("NULL"); AtlMessageBox( NULL, lpszType ); } } } } 解析html程序(C#版)——遍历各个节点(mshtml) . http://blog.csdn.net/hanjieson/article/details/8576150 /* 在项目里引用了mshtml.dll,并且引用命名空间:using mshtml; 首先,参数html就是html文本内容(里面有markup标记和显示文本等等) 其次,getHtmlDisplayContent这个函数就是获取html里浏览器上可看到的内容,即从源码中取出显示文本。 最后,traverseNodes是个人写的一个遍历各个节点的一个小小递归程序,没考虑效率什么的,只是想知道怎么使用IHtmlDocument2和IHtmlDocument3接口 Note:当html文档不规范时,比如在<!Document....之前还有别的标记或者符号时,加载工作受到严重影响,此时估计是解析不出来了,我开始还不知道为什么解析有些html时卡住了,原来是因为这些html文档在html标记前有\n\n\n....等。。。 */ private static string getHtmlDisplayContent(string html) { string cont = ""; mshtml.HTMLDocumentClass oc = new mshtml.HTMLDocumentClass(); mshtml.IHTMLDocument2 doc2 = oc; doc2.write(html); mshtml.IHTMLDocument3 HTMLDocument = (mshtml.IHTMLDocument3)doc2; traverseNodes(HTMLDocument.documentElement, ref cont); //mshtml.IHTMLTitleElement title = (mshtml.IHTMLTitleElement)doc2.title; /* cont += doc2.title.ToString(); mshtml.IHTMLBodyElement body = (mshtml.IHTMLBodyElement)doc2.body; if (body.text!=null) cont += body.text.ToString(); * */ doc2.close(); return cont; } private static void traverseNodes(mshtml.IHTMLElement parentNode,ref string cont) { if (parentNode.innerText!=null) cont += parentNode.innerText; mshtml.IHTMLElementCollection nodes = (IHTMLElementCollection)parentNode.children; IEnumerator ienum= nodes.GetEnumerator(); while (ienum.MoveNext()) { IHTMLElement node = (IHTMLElement)ienum.Current; traverseNodes(node,ref cont); } }
|
请发表评论