# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy import FormRequest
from bioon import settings  # the project is named bioon; settings.py has been modified
from bioon.items import BioonItem


class BioonspiderSpider(scrapy.Spider):
    name = "bioonspider"
    allowed_domains = ["bioon.com"]
    start_urls = ['http://login.bioon.com/login']
    def parse(self, response):
        # Extract the session cookie from response.headers.
        r_headers = response.headers['Set-Cookie']
        cookies_v = r_headers.split(';')[0].split('=')
        cookies = {cookies_v[0]: cookies_v[1]}

        # Headers that mimic the browser's AJAX login request.
        headers = {
            'Host': 'login.bioon.com',
            'Referer': 'http://login.bioon.com/login',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'X-Requested-With': 'XMLHttpRequest',
        }

        # Fetch the CSRF token embedded in the login form.
        csrf_token = response.xpath(
            '//input[@id="csrf_token"]/@value').extract()[0]

        # Get the URL the form posts to.
        login_url = response.xpath(
            '//form[@id="login_form"]/@action').extract()[0]
        end_login = response.urljoin(login_url)
        # Build the POST data.
        formdata = {
            # use your own registered account name here
            'account': '********',
            'client_id': 'usercenter',
            'csrf_token': csrf_token,
            'grant_type': 'grant_type',
            'redirect_uri': 'http://login.bioon.com/userinfo',
            # use your own registered username here
            'username': '********',
            # use your own password here
            'password': 'xxxxxxx',
        }

        # Simulate the login request.
        return FormRequest(
            end_login,
            formdata=formdata,
            headers=headers,
            cookies=cookies,
            callback=self.after_login
        )
    def after_login(self, response):
        self.log('Now handling bioon login page.')
        aim_url = 'http://news.bioon.com/Cfda/'
        obj = json.loads(response.body)
        print "Login state: ", obj['message']
        if "success" in obj['message']:
            self.logger.info("=========Login success.==========")
            return scrapy.Request(aim_url, callback=self.parse_list)
    def parse_list(self, response):
        lis_news = response.xpath(
            '//ul[@id="cms_list"]/li/div/h4/a/@href').extract()
        for li in lis_news:
            end_url = response.urljoin(li)
            yield scrapy.Request(end_url, callback=self.parse_content)
    def parse_content(self, response):
        head = response.xpath(
            '//div[@class="list_left"]/div[@class="title5"]')[0]
        item = BioonItem()
        item['title'] = head.xpath('h1/text()').extract()[0]
        # The <p> element holds "来源:<source> <datetime>"; run the regex once and reuse it.
        source_date = head.xpath('p/text()').re(ur'来源:(.*?)\s(.*?)$')
        item['source'] = source_date[0]
        item['date_time'] = source_date[1]
        item['body'] = response.xpath(
            '//div[@class="list_left"]/div[@class="text3"]').extract()[0]
        return item
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Item, Field


class BioonItem(Item):
    # define the fields for your item here like:
    title = Field()
    source = Field()
    date_time = Field()
    body = Field()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for bioon project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Name of the bot implemented by this Scrapy project (also the project name).
BOT_NAME = 'bioon'

SPIDER_MODULES = ['bioon.spiders']
NEWSPIDER_MODULE = 'bioon.spiders'

# Dict of downloader middlewares enabled in this project and their orders. Default: {}
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'

# Dict of item pipelines enabled in this project and their orders. Empty by default;
# the values are arbitrary, but by convention they are kept in the 0-1000 range.
ITEM_PIPELINES = {
    #'bioon.pipelines.BioonPipeline': 500
}

# Time the downloader waits between page downloads. Useful for throttling the
# crawl to ease the load on the server; fractional seconds are supported:
DOWNLOAD_DELAY = 0.25  # 250 ms of delay

# Maximum crawl depth allowed. 0 means no limit.
DEPTH_LIMIT = 0

# Whether to enable the DNS in-memory cache. Default: True
DNSCACHE_ENABLED = True

# File name for logging output. If None, standard error is used. Default: None
LOG_FILE = 'scrapy.log'

# Minimum log level. Available levels: CRITICAL, ERROR, WARNING, INFO, DEBUG. Default: 'DEBUG'
LOG_LEVEL = 'DEBUG'

# If True, all standard output (and error) of the process is redirected to the log.
# For example, print 'hello' will show up in the Scrapy log.
# Default: False
LOG_STDOUT = False

# Maximum number of concurrent requests to a single domain. Default: 8
CONCURRENT_REQUESTS_PER_DOMAIN = 8

# Default: True. Whether to enable the cookies middleware. If disabled,
# no cookies will be sent to web servers.
COOKIES_ENABLED = True

# feed settings
FEED_URI = 'file:///C:/Users/stwan/Desktop/bioon/a.txt'
FEED_FORMAT = 'jsonlines'
Note in particular the last two lines, which save the scraped items locally in jsonlines format.
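After running scrapy crawl bioonspider, each item is appended to a.txt as one JSON object per line. As a minimal sketch (the path matches the FEED_URI above; the field names come from BioonItem), the feed can be read back like this:

import json

with open('C:/Users/stwan/Desktop/bioon/a.txt') as f:
    for line in f:
        record = json.loads(line)  # one scraped item per line
        print record['title'], record['date_time']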
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Only needed when actually storing items in a database; uncomment together
# with the adb_insert_data call below (requires DBAPI/DBKWARGS in settings.py).
#from bioon.handledb import adb_insert_data, exec_sql
#from bioon.settings import DBAPI, DBKWARGS


class BioonPipeline(object):
    def process_item(self, item, spider):
        print "Now in pipeline:"
        print item['title']
        print item['source']
        print "End of pipeline."
        # store data
        #adb_insert_data(item, "tablename", DBAPI, **DBKWARGS)
        return item
The code above is not original; it comes from a tutorial video and is backed up here for the next time a similar problem comes up.
Of course, you can also fake the login by reusing the cookies produced by a session that is already logged in.
For that approach, see my other blog post:
http://blog.csdn.net/homewm/article/details/77302616
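As a minimal sketch of that alternative (the cookie name PHPSESSID and its value here are made-up placeholders; copy the real session cookie from your browser after logging in):

import scrapy

class BioonCookieSpider(scrapy.Spider):
    name = "bioon_cookie"
    allowed_domains = ["bioon.com"]

    def start_requests(self):
        # Placeholder cookie: replace with the real session cookie copied from your browser.
        cookies = {'PHPSESSID': 'xxxxxxxx'}
        yield scrapy.Request('http://news.bioon.com/Cfda/',
                             cookies=cookies,
                             callback=self.parse_list)

    def parse_list(self, response):
        # Reuse the same list/content parsing as in BioonspiderSpider.
        pass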