(******************************************************) (* **工作室 *) (* HTML解析单元库 *) (* *) (* DxHtmlParser Unit *) (* *) (* email:[email protected] QQ:316454904 *) (******************************************************) unit MyHtmlParser;
interface uses Windows,MSHTML,ActiveX,Forms,Variants, SysUtils, Classes;
type TMyHtmlParser = class private Doc: IHTMLDocument2; FHTML, FURL: string; procedure SetHTML(const Value: string); procedure SetURL(s: string); public Doc2:IHTMLDocument2; FParserOK:boolean; FTimeOut:integer; constructor Create; destructor Destroy;override; property HTML: string read FHTML write SetHTML; property URL: string read FURL write SetURL; property TimeOut:integer read FTimeOut write FTimeOut default 20000; property ParserOK:boolean read FParserOK default false; end; implementation
{ TDxHtmlParser }
procedure TMyHtmlParser.SetURL(s: string); var doc4:ihtmldocument4; tick:integer; begin FURL:=s ; if FURL<>'' then begin tick:=gettickcount; doc.QueryInterface(IID_ihtmldocument4,doc4); if assigned(doc4) then begin doc2:=doc4.createDocumentFromUrl(s,'null'); while (doc2.readyState<>'complete') and (gettickcount-tick<FTimeOut) do begin application.ProcessMessages; sleep(10); end; if doc2.readyState='complete' then FParserOK:=true; end; end; end;
constructor TMyHtmlParser.Create; begin CoInitialize(nil); //创建IHTMLDocument2接口 FTimeOut:=20000; CoCreateInstance(CLASS_HTMLDocument, nil, CLSCTX_INPROC_SERVER, IID_IHTMLDocument2, Doc); Assert(Doc<>nil,'构建HTMLDocument接口失败'); Doc.Set_designMode('On'); //设置为设计模式,不执行脚本 while not (Doc.readyState = 'complete') do begin sleep(1); Application.ProcessMessages; end; HTML:='<html></html>'; end;
destructor TMyHtmlParser.Destroy; begin CoUninitialize; inherited; end;
procedure TMyHtmlParser.SetHTML(const Value: string); var V: OLEVariant; vDocument: OLEVariant; vMimeType: OLEVariant; vHtml: OLEVariant; tick:integer; begin if FHTML <> Value then begin tick:=gettickcount; FHTML := Value; V := Doc; vDocument := V.script.Document; vMimeType := 'text/Html'; vHtml := FHtml; vDocument.Open(vMimeType); vDocument.Clear; vDocument.Write(vHtml); vDocument.Close; while (doc.readyState<>'complete') and (gettickcount-tick<FTimeOut) do begin application.ProcessMessages; sleep(10); end; if doc.readyState='complete' then begin FParserOK:=true; doc2:=doc; end; end; end;
end.
受到得闲老师的htmlparser启发,完善了一下,去掉的自认为没必要的东西(有了IhtmlDocument2,神马都是浮云),当然不是完全抄自得闲老师的解析器,本单元中的精华是SetHTML(const Value: string);和SetURL(s: string);这两个函数,其它的没什么意思。
SetHTML(const Value: string)是抄自TEmbeddedwb的IEParser。
SetURL(s: string);是根据MSDN上ihtmlDocument4.createDocumentFromUrl创建出新的ihtmlDocument2接口。
不解释了,代码就这点。
不足的地方:doc2会自动去下载图片,如有朋友修改后还请发我一份,谢谢!!
|
请发表评论