# 通过 Scrapy 框架模拟登录豆瓣并进入个人信息页面
# (Simulate a Douban login with the Scrapy framework and visit the personal profile page.)
# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request,FormRequest
import urllib.request
class DbSpider(scrapy.Spider):
    """Log in to douban.com with Scrapy (handling an optional captcha)
    and scrape the personal profile page reached after login.

    Flow: start_requests -> parse (solve captcha if shown, POST the
    login form) -> next (scrape the profile page title).
    """

    name = 'db'
    allowed_domains = ['douban.com']
    # Browser-like User-Agent so the login POST is not rejected.
    # (Renamed from the original misspelling "hearder".)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/68.0.3440.106 Safari/537.36"
    }

    def start_requests(self):
        """Kick off the crawl at the login page.

        meta={"cookiejar": 1} enables Scrapy's cookie-jar support so the
        session cookie survives into the login POST.
        """
        print("进入开始爬取")
        return [Request("https://www.douban.com/login",
                        meta={"cookiejar": 1},
                        callback=self.parse)]

    def parse(self, response):
        """Inspect the login page and POST the login form.

        If a captcha image is present, download it locally, ask the user
        to type the solution on stdin, and include it in the form data.
        """
        print("进入parse方法")
        # Credentials plus "redir": the page douban redirects to after a
        # successful login (here: the personal profile page).
        data = {
            "form_email": "1623016349@qq.com",
            "form_password": "lxc5201314",
            "redir": "https://www.douban.com/people/183292056/",
        }
        # Captcha detection: the login page shows <img id="captcha_image">
        # only when douban demands a captcha.
        captcha = response.xpath('//*[@id="captcha_image"]/@src').extract()
        print(captcha)
        print("验证码长度", len(captcha))
        if captcha:
            print("此时有验证码")
            # Save the captcha image locally so the user can look at it.
            localpath = "D:/学习目录/代码/最新python数据分析与爬虫实战 数据挖掘/result/captcha.png"
            urllib.request.urlretrieve(captcha[0], filename=localpath)
            print("请查看本地验证码图片并输入图片")
            # Block until the user types the captcha solution.
            captchavalue = input()
            # BUG FIX: douban's form field is "captcha-solution"; the
            # original used "captcha - solution" (spaces), so the solved
            # captcha was never submitted under the right key.
            data["captcha-solution"] = captchavalue
        else:
            print("此时没有验证码")
        print("登陆中.....")
        # POST the form back to the server, carrying the session cookies.
        return [FormRequest.from_response(
            response,
            meta={"cookiejar": response.meta["cookiejar"]},
            headers=self.headers,
            formdata=data,
            callback=self.next,
        )]

    def next(self, response):
        """Callback after login: scrape the profile page <title>."""
        print("此时已经登陆成功并爬取个人中心的数据")
        title = response.xpath("/html/head/title").extract()
        print(title)
# 注意：需要修改 settings.py 配置文件：
#   1. 添加 HTTPERROR_ALLOWED_CODES = [403]
#   2. 修改 ROBOTSTXT_OBEY = False
#   3. 取消注释并修改 USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'