# -*- coding: utf-8 -*-
import json
import scrapy
from scrapy import FormRequest
from bioon import settings  # the project is named bioon; settings.py has been modified
from bioon.items import BioonItem


class BioonspiderSpider(scrapy.Spider):
    name = "bioonspider"
    allowed_domains = ["bioon.com"]
    start_urls = ['http://login.bioon.com/login']
    def parse(self, response):
        # Extract the session cookie from response.headers.
        r_headers = response.headers['Set-Cookie']
        cookies_v = r_headers.split(';')[0].split('=')
        cookies = {cookies_v[0]: cookies_v[1]}

        # Headers that mimic the browser's AJAX login request.
        headers = {
            'Host': 'login.bioon.com',
            'Referer': 'http://login.bioon.com/login',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0',
            'X-Requested-With': 'XMLHttpRequest',
        }

        # Fetch the CSRF token embedded in the login form.
        csrf_token = response.xpath(
            '//input[@id="csrf_token"]/@value').extract()[0]

        # Get the URL the form posts to.
        login_url = response.xpath(
            '//form[@id="login_form"]/@action').extract()[0]
        end_login = response.urljoin(login_url)
        # Build the POST data.
        formdata = {
            # use your own registered account name here
            'account': '********',
            'client_id': 'usercenter',
            'csrf_token': csrf_token,
            'grant_type': 'grant_type',
            'redirect_uri': 'http://login.bioon.com/userinfo',
            # use your own registered username here
            'username': '********',
            # use your own password here
            'password': 'xxxxxxx',
        }

        # Simulate the login request.
        return FormRequest(
            end_login,
            formdata=formdata,
            headers=headers,
            cookies=cookies,
            callback=self.after_login
        )
    def after_login(self, response):
        self.log('Now handling bioon login page.')
        aim_url = 'http://news.bioon.com/Cfda/'
        obj = json.loads(response.body)
        print "Login state: ", obj['message']
        if "success" in obj['message']:
            self.logger.info("=========Login success.==========")
            return scrapy.Request(aim_url, callback=self.parse_list)
    def parse_list(self, response):
        lis_news = response.xpath(
            '//ul[@id="cms_list"]/li/div/h4/a/@href').extract()
        for li in lis_news:
            end_url = response.urljoin(li)
            yield scrapy.Request(end_url, callback=self.parse_content)
    def parse_content(self, response):
        head = response.xpath(
            '//div[@class="list_left"]/div[@class="title5"]')[0]
        item = BioonItem()
        item['title'] = head.xpath('h1/text()').extract()[0]
        # The <p> element holds "来源:<source> <datetime>"; run the regex once and reuse it.
        source_date = head.xpath('p/text()').re(ur'来源:(.*?)\s(.*?)$')
        item['source'] = source_date[0]
        item['date_time'] = source_date[1]
        item['body'] = response.xpath(
            '//div[@class="list_left"]/div[@class="text3"]').extract()[0]
        return item
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
from scrapy import Item, Field


class BioonItem(Item):
    # define the fields for your item here like:
    title = Field()
    source = Field()
    date_time = Field()
    body = Field()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for bioon project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
#
# Name of the bot implemented by this Scrapy project (also the project name).
BOT_NAME = 'bioon'

SPIDER_MODULES = ['bioon.spiders']
NEWSPIDER_MODULE = 'bioon.spiders'

# Dict of downloader middlewares enabled in this project and their orders. Default: {}
DOWNLOADER_MIDDLEWARES = {
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 110,
}

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0'

# Dict of item pipelines enabled in this project and their orders. Empty by default;
# the values are arbitrary, but by convention they are kept in the 0-1000 range.
ITEM_PIPELINES = {
    #'bioon.pipelines.BioonPipeline': 500
}

# Time the downloader waits between page downloads. Useful for throttling the
# crawl to ease the load on the server; fractional seconds are supported:
DOWNLOAD_DELAY = 0.25  # 250 ms of delay

# Maximum crawl depth allowed. 0 means no limit.
DEPTH_LIMIT = 0

# Whether to enable the DNS in-memory cache. Default: True
DNSCACHE_ENABLED = True

# File name for logging output. If None, standard error is used. Default: None
LOG_FILE = 'scrapy.log'

# Minimum log level. Available levels: CRITICAL, ERROR, WARNING, INFO, DEBUG. Default: 'DEBUG'
LOG_LEVEL = 'DEBUG'

# If True, all standard output (and error) of the process is redirected to the log.
# For example, print 'hello' will show up in the Scrapy log.
# Default: False
LOG_STDOUT = False

# Maximum number of concurrent requests to a single domain. Default: 8
CONCURRENT_REQUESTS_PER_DOMAIN = 8

# Default: True. Whether to enable the cookies middleware. If disabled,
# no cookies will be sent to web servers.
COOKIES_ENABLED = True

# feed settings
FEED_URI = 'file:///C:/Users/stwan/Desktop/bioon/a.txt'
FEED_FORMAT = 'jsonlines'
Note in particular the last two lines, which save the scraped items locally in jsonlines format.
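After running scrapy crawl bioonspider, each item is appended to a.txt as one JSON object per line. As a minimal sketch (the path matches the FEED_URI above; the field names come from BioonItem), the feed can be read back like this:

import json

with open('C:/Users/stwan/Desktop/bioon/a.txt') as f:
    for line in f:
        record = json.loads(line)  # one scraped item per line
        print record['title'], record['date_time']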
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

# Only needed when actually storing items in a database; uncomment together
# with the adb_insert_data call below (requires DBAPI/DBKWARGS in settings.py).
#from bioon.handledb import adb_insert_data, exec_sql
#from bioon.settings import DBAPI, DBKWARGS


class BioonPipeline(object):
    def process_item(self, item, spider):
        print "Now in pipeline:"
        print item['title']
        print item['source']
        print "End of pipeline."
        # store data
        #adb_insert_data(item, "tablename", DBAPI, **DBKWARGS)
        return item
The code above is not original; it comes from a tutorial video and is backed up here for the next time a similar problem comes up.
Of course, you can also fake the login by reusing the cookies produced by a session that is already logged in.
For that approach, see my other blog post:
http://blog.csdn.net/homewm/article/details/77302616
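As a minimal sketch of that alternative (the cookie name PHPSESSID and its value here are made-up placeholders; copy the real session cookie from your browser after logging in):

import scrapy

class BioonCookieSpider(scrapy.Spider):
    name = "bioon_cookie"
    allowed_domains = ["bioon.com"]

    def start_requests(self):
        # Placeholder cookie: replace with the real session cookie copied from your browser.
        cookies = {'PHPSESSID': 'xxxxxxxx'}
        yield scrapy.Request('http://news.bioon.com/Cfda/',
                             cookies=cookies,
                             callback=self.parse_list)

    def parse_list(self, response):
        # Reuse the same list/content parsing as in BioonspiderSpider.
        pass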