在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
功能描述:
原代码奉上: #!/usr/bin/perl use HTML::Element; use HTML::TreeBuilder; use HTML::Parser; open DATAFH,">>data" || die "open file failed:$!"; select DATAFH; foreach my $file_name (@ARGV) { unless(-e $file_name){ print "$file_name文件不存在.\n"; next; } ##首先判断文件编码是不是UTF-8,如果不是要转换成UTF-8,因为我发现perl的HTML模块好像只能处理UTF-8编码的文件,处理GBK文件时会出现中文乱码。而我们从网上下载的页面几乎都是GBK编码。 system "enca","-L","zh_CN","-x","UTF-8",$file_name; ##用enca比用iconv的好处在于不需要管原文件是什么编码(即使本来就UFTF-8编码),直接转换成UTF-8编码就是了。 my $root = HTML::TreeBuilder->new; $root->parse_file($file_name); $title=$root->find_by_tag_name('title'); $str_title=$title->as_text(); if($str_title=~qr/图书 - 当当网$/){ print "网页标题:$str_title\n"; $div_h1=$root->look_down("_tag","div","class","h1_title book_head"); $h1=$div_h1->look_down('_tag','h1'); print "图书名称:",$h1->as_text(),"\n"; $div_info=$root->look_down("_tag","div","class","info book_r"); $price_d=$div_info->look_down("_tag","p","class","price_d"); print $price_d->as_text(),"\n"; $price_m=$div_info->look_down("_tag","p", "class","price_m"); foreach my $node_r ($price_m->content_refs_list){ next if ref $$node_r; print $$node_r,"\n"; } $div_detail=$div_info->look_down("_tag","div","class","book_detailed","name","__Property_pub"); @elements=$div_detail->find_by_tag_name('p'); foreach(@elements){ print $_->as_text(),"\n"; } $clear=$div_detail->look_down("_tag","ul","class","clearfix"); @lis=$clear->content_list(); @spans=$lis[0]->content_list(); print $spans[1]->as_text(),"\n"; @spans=$lis[2]->content_list(); print $spans[1]->as_text(),"\n"; print $spans[2]->as_text(),"\n"; print "********************************************\n"; $|=1; } else{ print STDOUT "$file_name不是当当网图书信息页面。\n"; next; } $tree = $root->delete; } close DATAFH; 输出文件里的内容: |
请发表评论