在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
# Scrape the Dianping listing pages for one shop category and, for every page,
# extract shop names, category/address tags, rating labels, review counts,
# average prices and the links to each shop's detail page.
#
# Only xml2 (parsing and XPath queries) and magrittr (the %>% pipe) are used.

library(xml2)
library(magrittr)

# Dianping site root.
dianpingweb <- "http://www.dianping.com"

# Listing path for the "comprehensive shopping mall" category. The page number
# is appended directly after the trailing "p", e.g.
# "http://www.dianping.com/search/category/1/20/g119p1".
comshop <- "/search/category/1/20/g119p"

# Page numbers to fetch.
webii <- seq_len(22)

# Initialised here; nothing in this script adds rows to it.
fulldata <- data.frame()

# Build a relative XPath that matches descendant `tag` elements whose class
# attribute contains `class` as a whole whitespace-separated token, i.e. the
# XPath counterpart of the CSS selector `tag.class`.
class_xpath <- function(tag, class) {
  sprintf(
    ".//%s[contains(concat(' ', normalize-space(@class), ' '), ' %s ')]",
    tag, class
  )
}

# Line up the prices that are present on a page with the full list of shops.
#
# price_text:   text of every average-price link, one element per shop.
# price_values: the price strings that were found, in page order, covering
#               only the shops whose link actually shows a price.
#
# Returns a character vector as long as `price_text`: "0" where the link text
# contains no currency sign, otherwise the next unused element of
# `price_values` (NA if `price_values` runs out).
align_mean_price <- function(price_text, price_values) {
  has_price <- grepl("¥", price_text, fixed = TRUE)
  aligned <- rep("0", length(price_text))
  aligned[has_price] <- price_values[seq_len(sum(has_price))]
  aligned
}

for (i in webii) {
  web <- read_html(paste0(dianpingweb, comshop, i), encoding = "UTF-8")

  # Shop names: <h4> inside <div class="tit">.
  businame <- web %>%
    xml_find_all(class_xpath("div", "tit")) %>%
    xml_find_all(".//h4") %>%
    xml_text()

  # Tag texts: <span> inside <a> inside <div class="tag-addr">. The texts are
  # consumed two at a time, so each matrix row is one shop.
  busitag <- web %>%
    xml_find_all(class_xpath("div", "tag-addr")) %>%
    xml_find_all(".//a") %>%
    xml_find_all(".//span") %>%
    xml_text()
  temptag <- matrix(busitag, ncol = 2, byrow = TRUE)

  # Shop category (first tag) and shop address (second tag).
  busicatog <- temptag[, 1]
  busiaddr <- temptag[, 2]

  # Rating label: the title attribute of each <span> in <div class="comment">.
  # Spans without a title attribute yield NA.
  busideg <- web %>%
    xml_find_all(class_xpath("div", "comment")) %>%
    xml_find_all(".//span") %>%
    xml_attr("title")

  # Review count: the <b> inside <a class="review-num">.
  reviewnum <- web %>%
    xml_find_all(class_xpath("div", "comment")) %>%
    xml_find_all(class_xpath("a", "review-num")) %>%
    xml_find_all(".//b") %>%
    xml_text()

  # Average price. Only shops that show a price have a <b> inside
  # <a class="mean-price">, so the <b> texts are shorter than the shop list
  # and have to be re-aligned against the full link texts.
  nullprice <- web %>%
    xml_find_all(class_xpath("a", "mean-price")) %>%
    xml_find_all(".//b") %>%
    xml_text()
  nullprice <- gsub("¥", "", nullprice, fixed = TRUE)

  pricea <- web %>%
    xml_find_all(class_xpath("a", "mean-price")) %>%
    xml_text()

  meanprice <- align_mean_price(pricea, nullprice)

  # Links to the next level (each shop's detail page).
  childhtml <- web %>%
    xml_find_all(class_xpath("div", "pic")) %>%
    xml_find_all(".//a") %>%
    xml_attr("href")

  # Progress indicator: the page number just processed.
  print(i)
}

# Note from the original post: the accompanying image shows a sample of the
# scraped data.
请发表评论