在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
#! /usr/bin/env python
# -*- coding: utf-8 -*-
"""Scrape Douban movie listings into ``info.txt``.

Note carried over from the source article: content that a page loads
asynchronously has to be requested while imitating a browser (a browser
``User-Agent`` header is sent), and the JSON reply is then parsed with
``json.loads``.  The article followed this listing with a result
screenshot, which is not reproduced here.

The script is written to run on both Python 2.7 and Python 3.3+.
"""
import io
import json
import re
import sys  # no longer used by this script; import kept for compatibility

import requests

# Browser-like header sent with every request so both entry points behave
# the same way.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0',
}
# Seconds to wait for a reply; without a timeout a stalled connection would
# block the script forever.
REQUEST_TIMEOUT = 10

classinfo = []  # unused by this script; kept so the module's names are unchanged

# An explicit encoding replaces the old ``sys.setdefaultencoding`` hack:
# the file object itself encodes the unicode text written to it.
f = io.open('info.txt', 'w', encoding='utf-8')

num = 0  # running record number shared by write() and write1()

# Each movie on the home page sits between these two HTML attributes.
_MOVIE_BLOCK = re.compile('data-tit(.*?)data-enough', re.S)
# (key, pattern) pairs; every pattern captures one attribute value.
_MOVIE_FIELDS = (
    ('title', re.compile('le="(.*?)"', re.S)),
    ('year', re.compile('data-release="(.*?)" data', re.S)),
    ('Rating', re.compile('data-rate="(.*?)" data-star', re.S)),
    ('time', re.compile('data-duration="(.*?)" data-re', re.S)),
    ('reg', re.compile('data-region="(.*?)" data-dir', re.S)),
    ('act', re.compile('data-actors="(.*?)" data-in', re.S)),
)


def _parse_movie(block):
    """Return the attribute dict for one movie block, or None if any field is missing."""
    info = {}
    for key, pattern in _MOVIE_FIELDS:
        match = pattern.search(block)
        if match is None:
            return None
        info[key] = match.group(1)
    return info


def _write_entry(fields):
    """Write one numbered record: the counter line, one line per field, a blank line."""
    global num
    num = num + 1
    f.write(u'%d\n' % num)
    for label, value in fields:
        f.write(label + value + u'\n')
    f.write(u'\n')


def write(htm):
    """Extract every movie from a home-page response and append it to the output file.

    ``htm`` is a response object whose ``text`` attribute holds the page HTML.
    Blocks that lack one of the expected attributes are skipped instead of
    aborting the whole run.
    """
    for block in _MOVIE_BLOCK.findall(htm.text):
        info = _parse_movie(block)
        if info is None:
            continue
        _write_entry([
            (u'电影名:', info['title']),
            (u'主演:', info['act']),
            (u'电影地区:', info['reg']),
            (u'上映年份:', info['year']),
            (u'电影时长:', info['time']),
            (u'评分:', info['Rating']),
        ])


def write1(info):
    """Append one record with the ``title``, ``Rating`` and ``url`` entries of ``info``."""
    _write_entry([
        (u'电影名:', info['title']),
        (u'评分:', info['Rating']),
        (u'链接:', info['url']),
    ])


def getry():
    """Fetch the Douban movie home page and record the movies found in it."""
    html = requests.get('http://movie.douban.com/', headers=HEADERS,
                        timeout=REQUEST_TIMEOUT)
    html.encoding = 'utf-8'
    write(html)


def getrm(pages=16, page_size=20):
    """Fetch the JSON "hot movies" listing page by page and record each subject.

    ``pages`` is the maximum number of pages requested and ``page_size`` the
    number of subjects asked for per page; the defaults match the original
    hard-coded values.  Fetching stops early when a page comes back empty.
    """
    url = 'http://movie.douban.com/j/search_subjects'
    for page in range(pages):
        # Let requests build the query string; the previous regex
        # substitution passed ``re.S`` where ``re.sub`` expects ``count``.
        params = [
            ('type', 'movie'),
            ('tag', '热门'),
            ('sort', 'recommend'),
            ('page_limit', page_size),
            ('page_start', page * page_size),
        ]
        reply = requests.get(url, params=params, headers=HEADERS,
                             timeout=REQUEST_TIMEOUT)
        # Decode explicitly so json.loads receives text on every Python version.
        subjects = json.loads(reply.content.decode('utf-8'))['subjects']
        if not subjects:
            break
        # Iterate over what was actually returned; a short page no longer
        # raises IndexError.
        for subject in subjects:
            write1({
                'title': subject['title'],
                'Rating': subject['rate'],
                'url': subject['url'],
            })


if __name__ == "__main__":
    try:
        getry()
        getrm()
    finally:
        # Flush and release the output file even if a request fails.
        f.close()
请发表评论