代码如下,不知道为什么一直不能成功登录
># -*- coding: utf-8 -*-
import scrapy
import re
import requests
#import urllib
from bs4 import BeautifulSoup
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, Join
from scrapy.http import Request,FormRequest
from getweibo.items import InformationItem,TweetsItem
loginURL = "https://login.weibo.cn/login/"
#获得验证码等信息
def get_captchainfo(loginURL):
html = requests.get(loginURL).content
bs = BeautifulSoup(html,"lxml")
#print bs
#注意通过bs.select元素寻找对象,返回的是列表对象
password_name = (bs.select('input[type="password"]'))[0].get('name')
vk = (bs.select('input[name="vk"]'))[0].get('value')
capId = (bs.select('input[name="capId"]'))[0].get('value')
#print password_name,vk,capId
captcha_img = bs.find("img", src=re.compile('http://weibo.cn/interface/f/ttt/captcha/')).get('src')
print captcha_img
#captchaid可以从验证码图片地址中直接截取获得
#urllib.urlretrieve(captcha_img, 'weibo_spider/image/captcha.jpg')
#print "captcha download success!"
captcha_input = raw_input("please input the captcha
>")
return (captcha_input,password_name,vk,capId)
class WeiboSpider(CrawlSpider):
name = 'weibo'
allowed_domains = ['weibo.cn']
start_urls = ['http://weibo.cn/dafendi']#先暂时确定精分君的微博,之后start_urls可以从文件提取
rules = (
Rule(LinkExtractor(restrict_xpaths='//*[@id="pagelist"]/form/div/a')),
Rule(LinkExtractor(restrict_xpaths='//*[contains(@href,"repost")]'),callback='parse_item')
)
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "zh-CN,zh;q=0.8",
"Connection": "keep-alive",
"Content-Type":" application/x-www-form-urlencoded",
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36",
"Referer": "https://login.weibo.cn/login/"
}
# Start on the welcome page
def start_requests(self):
return [
Request(
loginURL,
meta = {'cookiejar': 1},
headers=self.headers,
callback=self.parse_login)
]
# Post welcome page's first form with the given user/pass
def parse_login(self, response):
print 'Preparing login'
captcha=get_captchainfo(loginURL)
print captcha
return FormRequest.from_response(
response,#from loginURL
method="POST",
meta = {'cookiejar' : response.meta['cookiejar']},#获取cookies
headers = self.headers,
formdata = {
"mobile": "帐号",
captcha[1]: "密码",
"code": captcha[0],
"remember":"on",
"backurl": "http%3A%2F%2Fweibo.cn",
"backtitle":u'手机新浪网',
"tryCount":"",
"vk": captcha[2],
"capId": captcha[3],
"submit": u'登录'},
callback = self.after_login,
dont_filter = True
)
def after_login(self, response) :
for url in self.start_urls :
yield self.make_requests_from_url(url)
def parse_start_url(self, response):#用来处理初始response
html = response.xpath('/html').extract()
print html
# Create the loader using the response
l = ItemLoader(item=InformationItem(), response=response)
# Load fields using XPath expressions
l.add_xpath('id_', '//title/text()', MapCompose(lambda i:i[0:len(i)-3])),
l.add_xpath('Info','//span[contains(@class,"ctt")][2]/text()'),
l.add_xpath('Num_Tweets','//span[contains(@class,"tc")]/text()',MapCompose(lambda i: i[(i.index("[")+1):(i.index("]"))])),
l.add_xpath('Num_Follows','//a[contains(@href,"follow")]/text()',MapCompose(lambda i: i[(i.index("[")+1):(i.index("]"))])),
l.add_xpath('Num_Fans','//a[contains(@href,"fans")]/text()',MapCompose(lambda i: i[(i.index("[")+1):(i.index("]"))])),
return l.load_item()
def parse_item(self, response):
l = ItemLoader(item=TweetsItem(), response=response)
l.add_xpath('Content','//span[contains(@class,"ctt")]/text()')
#l.add_xpath('')
return l.load_item()
下边settins.py的内容
ROBOTSTXT_OBEY = False
HTTPERROR_ALLOWED_CODES = [302,]#返回400时按正常的返回对待
REDIRECT_ENABLED = False #关掉重定向,不会重定向到新的地址
DOWNLOAD_DELAY = 3
COOKIES_ENABLED = True
COOKIES_DEBUG = True
下边是输出
2017-04-09 15:53:17 [scrapy] DEBUG: Sending cookies to: <POST https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>
Cookie: _T_WM=6348fb8a523fe1bc486f14d1304cf0d2
2017-04-09 15:53:19 [scrapy] DEBUG: Received cookies from: <302 https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn
Set-Cookie: SUB=_2A2517Zg9DeRhGeVG61ER8yrEwzyIHXVXETh1rDV6PUJbkdAKLRXgkW0wSZc8S6dp1d-NlyAraSqa-1-_0Q..; expires=Tue, 09-May-2017 07:53:17 GMT; path=/; domain=.weibo.cn; httponly
Set-Cookie: gsid_CTandWM=4uuCcdef1lRXUEnMtsgL1fXlgec; expires=Tue, 09-May-2017 07:53:19 GMT; path=/; domain=.weibo.cn; httponly
2017-04-09 15:53:19 [scrapy] DEBUG: Crawled (302) <POST https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4> (referer: https://login.weibo.cn/login/)
2017-04-09 15:53:20 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/dafendi>
Set-Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0; expires=Tue, 09-May-2017 07:53:19 GMT; path=/; domain=.weibo.cn; httponly
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn
2017-04-09 15:53:20 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/dafendi> (referer: https://login.weibo.cn/login/?rand=201282002&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4)
2017-04-09 15:53:20 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/dafendi>
{'Info': [u'u8ba4u8bc1uff1au77e5u540du5e7du9ed8u535au4e3b u5faeu535au7b7eu7ea6u81eau5a92u4f53'],
'Num_Fans': [u'2055326'],
'Num_Follows': [u'891'],
'Num_Tweets': [u'1958'],
'id_': [u'u7cbeu5206u541b']}
2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>
Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0
2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>
Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0
2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>
Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0
2017-04-09 15:53:20 [scrapy] DEBUG: Sending cookies to: <GET http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>
Cookie: _T_WM=80e15f38a0dfb65ea7bbcd00ebcaf1c0
2017-04-09 15:53:24 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn
2017-04-09 15:53:24 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)
2017-04-09 15:53:24 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDsDTFqfJ?rl=0&uid=2626948743>
{'Content': [u':',
u' u5047u5982u4efbu4f55u4e8bu90fdu80fdu6210u4e3au804cu4e1auff0cu4f60u4f1au9009u62e9u4ec0u4e48u4f5cu4e3au804cu4e1auff1f u200bu200bu200b']}
2017-04-09 15:53:28 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn
2017-04-09 15:53:28 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)
2017-04-09 15:53:28 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDxAwrBrG?rl=0&uid=2626948743>
{'Content': [u'u7279u522bu7684u751fu65e5u793cu7269u3002 u200bu200bu200b']}
2017-04-09 15:53:32 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn
2017-04-09 15:53:32 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)
2017-04-09 15:53:32 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/EDBmajRBl?rl=0&uid=2626948743>
{'Content': [u'u7231u7b11u7684u5973u5b69u5b50uff0cu8fd0u6c14u4e00u5b9au4e0du4f1au592au597du2026u2026',
u' u200bu200bu200b']}
2017-04-09 15:53:36 [scrapy] DEBUG: Received cookies from: <200 http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>
Set-Cookie: WEIBOCN_FROM=deleted; expires=Thu, 01-Jan-1970 00:00:01 GMT; path=/; domain=.weibo.cn
2017-04-09 15:53:36 [scrapy] DEBUG: Crawled (200) <GET http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743> (referer: http://weibo.cn/dafendi)
2017-04-09 15:53:36 [scrapy] DEBUG: Scraped from <200 http://weibo.cn/repost/CsN9LnQiG?rl=0&uid=2626948743>
{'Content': [u':u4e00u4e2au957fu5faeu535au5408u96c6uff0cu5927u5bb6u65e0u804au53c8u6ca1u770bu8fc7u7684u8bddu53efu4ee5u770bu770b[u7f9eu55d2u55d2] u200bu200bu200b']}
2017-04-09 15:53:36 [scrapy] INFO: Closing spider (finished)
2017-04-09 15:53:36 [scrapy] INFO: Stored json feed (5 items) in: wanghongmingdan.json
2017-04-09 15:53:36 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 3029,
'downloader/request_count': 7,
'downloader/request_method_count/GET': 6,
'downloader/request_method_count/POST': 1,
'downloader/response_bytes': 22746,
'downloader/response_count': 7,
'downloader/response_status_count/200': 6,
'downloader/response_status_count/302': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2017, 4, 9, 7, 53, 36, 596076),
'item_scraped_count': 5,
'log_count/DEBUG': 27,
'log_count/INFO': 8,
'log_count/WARNING': 2,
'request_depth_max': 3,
'response_received_count': 7,
'scheduler/dequeued': 7,
'scheduler/dequeued/memory': 7,
'scheduler/enqueued': 7,
'scheduler/enqueued/memory': 7,
'start_time': datetime.datetime(2017, 4, 9, 7, 53, 2, 180831)}
2017-04-09 15:53:36 [scrapy] INFO: Spider closed (finished)
2017-04-09 20:11:50 [scrapy] DEBUG: Redirecting (302) to <GET http://weibo.cn/crossDomain/?g=4uegcdef1d93rkj4S3ZomfXlgec&t=1491739909&m=9144&r=&u=http%3A%2F%2Fweibo.cn%3Fgsid%3D4uegcdef1d93rkj4S3ZomfXlgec%26PHPSESSID%3D%26vt%3D4&cross=1&st=ST-MzgwMzAzNDg4MA==-1491739909-tc-27ED8C8D7528C9185E75F7986B8050B7-1,ST-MzgwMzAzNDg4MA==-1491739909-tc-BED83CC16AC311D2BBA234E8F08BBD39-1> from <POST https://login.weibo.cn/login/?rand=842328789&backURL=http%3A%2F%2Fweibo.cn&backTitle=%E6%89%8B%E6%9C%BA%E6%96%B0%E6%B5%AA%E7%BD%91&vt=4>
2017-04-09 20:11:50 [scrapy] DEBUG: Redirecting (meta refresh) to <GET http://weibo.cn/> from <GET http://weibo.cn/crossDomain/?g=4uegcdef1d93rkj4S3Zo