POST requests
import scrapy

class FanyiSpider(scrapy.Spider):
    name = 'fanyi'
    allowed_domains = ['baidu.com']
    # start_urls = ['https://fanyi.baidu.com/sug']
    # Once the engine is running, it normally pulls the start URL out of start_urls and
    # issues a GET request. With this attribute commented out, the engine cannot find a
    # start URL, so it sends no default GET request.
    # To issue a POST request instead, override the spider's lifecycle method here.
    def start_requests(self):
        # This lifecycle method fires when the downloader is about to start downloading
        print("Downloader is starting to request network data...")
        post_url = "https://fanyi.baidu.com/sug"
        # Build the form data
        data = {
            "kw": 'a'
        }
        # Issue the POST request. We must not build the download by hand; we yield the
        # request back to the scheduler so that it can schedule everything uniformly.
        yield scrapy.FormRequest(url=post_url, formdata=data, callback=self.parse_post)

    # Callback for the POST response
    def parse_post(self, response):
        print(111111111111111111111111111)
        print(response.text)
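The sug endpoint answers with JSON rather than HTML. A minimal sketch of a parse_post that decodes it, assuming the response has the shape {"errno": 0, "data": [{"k": ..., "v": ...}, ...]} (an assumption about Baidu's API, not something shown above):

import json

def parse_post(self, response):
    # Assumed shape: {"errno": 0, "data": [{"k": "a", "v": "..."}, ...]}
    result = json.loads(response.text)
    for entry in result.get("data", []):
        print(entry.get("k"), "->", entry.get("v"))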
GET plus POST requests
# -*- coding: utf-8 -*-
import scrapy
import pytesseract
from PIL import Image

class GushiwenSpider(scrapy.Spider):
    name = 'gushiwen'
    allowed_domains = ['gushiwen.org']
    start_urls = ['https://so.gushiwen.org/user/login.aspx?from=http://so.gushiwen.org/user/collect.aspx']

    def parse(self, response):
        # Pull the two tokens and the captcha image URL out of the response
        # Extract the two tokens
        self.token1 = response.css("#__VIEWSTATE::attr(value)").extract()[0]
        self.token2 = response.css("#__VIEWSTATEGENERATOR::attr(value)").extract()[0]
        img_src = "https://so.gushiwen.org" + response.css("#imgCode::attr(src)").extract()[0]
        # [Note] Scrapy's built-in css() method selects page elements with a CSS selector.
        # To extract an attribute value, append "::attr(attribute)" to the selector;
        # to extract the text content, append "::text".
        # Download the captcha:
        # yield a GET request so the downloader fetches the captcha image
        yield scrapy.Request(url=img_src, callback=self.parse_code)

    # Callback that handles the captcha response
    def parse_code(self, response):
        # response holds the captcha image data
        with open("./code.png", 'wb') as fp:
            fp.write(response.body)  # response.body is the binary payload
        img = Image.open("./code.png")
        img = img.convert("L")  # convert to grayscale for OCR
        code = pytesseract.image_to_string(img)
        # Login endpoint
        login_url = "https://so.gushiwen.org/user/login.aspx?from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx"
        # Form data submitted on login
        data = {
            "__VIEWSTATE": self.token1,
            "__VIEWSTATEGENERATOR": self.token2,
            "from": "http://so.gushiwen.org/user/collect.aspx",
            "email": "fanjianbo666@163.com",
            "pwd": "12345678",
            "code": code,
            "denglu": "登录"
        }
        # Yield a POST request back to the scheduler
        yield scrapy.FormRequest(url=login_url, formdata=data, callback=self.parse_login)

    # Callback that handles the login response
    def parse_login(self, response):
        print(response.text)
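parse_login only prints the raw response. A rough way to check whether the login actually worked; the heuristic below (a successful login redirects away from login.aspx) is an assumption about the site, so verify it against a real response:

def parse_login(self, response):
    # Heuristic: assume a successful login redirects to the collect page
    if "login.aspx" not in response.url:
        print("Login looks successful:", response.url)
    else:
        # pytesseract often misreads the captcha; inspect response.text to see why
        print("Login may have failed; check the captcha/OCR result")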
Two interrelated spiders
Case study: crawl all the travel cities of "China" on Qyer (qyer.com). The content to extract:
1) City name, number of people who have been there, common scenic spots, and the city overview
2) Enter each city's second-level "travel destinations" page and extract, for every destination: place name, rating, review count, recommended-guide count, and ranking
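Both spiders import QiongyouItem from Qiongyou.items. That file is not shown here; a minimal sketch that matches the fields the code below assigns would be:

import scrapy

class QiongyouItem(scrapy.Item):
    cityName = scrapy.Field()    # city name
    visitorNum = scrapy.Field()  # how many people have been there
    scenicSpot = scrapy.Field()  # common scenic spots
    cityNum = scrapy.Field()     # city slug parsed from the URL
    cityPid = scrapy.Field()     # pid used by the POI list API
    cityInfo = scrapy.Field()    # city overview from the profile page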
The city spider
# -*- coding: utf-8 -*-
import scrapy
from Qiongyou.items import QiongyouItem

class CitySpider(scrapy.Spider):
    name = 'city'
    allowed_domains = ['qyer.com']
    start_urls = ['https://place.qyer.com/china/citylist-0-0-%d/' % i for i in range(1, 9)]

    def parse(self, response):
        # print(response.text)
        city_list = response.xpath("//ul[@class='plcCitylist']/li")
        for city in city_list:
            item = QiongyouItem()
            item["cityName"] = " ".join(city.xpath(".//h3//a//text()").extract())
            item["visitorNum"] = city.xpath(".//p[@class='beento']/text()").extract()[0]
            item["scenicSpot"] = "".join(city.xpath(".//p[@class='pois']//text()").extract())
            item["cityNum"] = city.xpath(".//h3//a/@href").extract()[0].split("/")[-2]
            # Extract the city's pid
            item["cityPid"] = city.xpath(".//p[@class='addPlanBtn']/@data-pid").extract()[0]
            # The city overview lives on the next-level page, so build that page's URL here
            next_url = "https://place.qyer.com/" + item["cityNum"] + "/profile/"
            print(next_url)
            # Request the second-level page to extract the remaining data
            yield scrapy.Request(url=next_url, callback=self.parse_next, meta={"item": item})
            # Request.meta is a dict that gets carried over onto the response. By putting our
            # item into meta when issuing the Request, the half-filled item travels along to
            # the next-level page's callback.

    def parse_next(self, response):
        # print(response.meta)
        item = response.meta["item"]  # take the item passed down from the parent page
        # Fill in the field the parent page could not provide
        item["cityInfo"] = "\n".join(response.xpath("//div[@class='entry_main']//text()").extract())
        yield item
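Note the run order: the travel spider below reads the city list out of Redis, so run "scrapy crawl city" first and let the pipeline at the bottom push every item into the Qiuyou:cityList list before starting "scrapy crawl travel".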
The travel spider
# -*- coding: utf-8 -*-
import scrapy
import redis
import json

class TravelSpider(scrapy.Spider):
    name = 'travel'
    allowed_domains = ['qyer.com']
    # start_urls = ['http://qyer.com/']

    def start_requests(self):
        print("Starting requests...")
        # 1) Pull the city pids and city slugs out of Redis
        rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=8)
        lens = rds.llen("Qiuyou:cityList")
        print(lens)
        city_list = rds.lrange("Qiuyou:cityList", 0, -1)  # -1 means through the last element
        # print(city_list)
        # Two lists holding the pids and the city slugs
        pid_list = []
        cityNum_list = []
        for c in city_list:
            city = json.loads(c)
            pid_list.append(city["cityPid"])
            cityNum_list.append(city["cityNum"])
        # 2) Build each city's POI list page from its slug, then extract the total page count
        for i in range(len(cityNum_list)):
            city_url = "https://place.qyer.com/" + cityNum_list[i] + "/alltravel/"
            print("Requesting: %s" % city_url)
            yield scrapy.Request(url=city_url, callback=self.parse_city, meta={"pid": pid_list[i]})

    # Callback that handles each city's POI list page
    def parse_city(self, response):
        pid = response.meta["pid"]
        # Parse the total number of pages in this city's POI list
        total_pages = response.xpath("//a[@class='ui_page_item']/@data-page").extract()
        if len(total_pages) != 0:
            # Fetch the data page by page through the JSON API
            for page in range(1, int(total_pages[-1]) + 1):
                post_url = "https://place.qyer.com/poi.php?action=list_json"
                data = {
                    'page': str(page),
                    "type": "city",
                    "pid": str(pid),
                    "sort": '0',
                    "subsort": "all",
                    "isnominate": '-1',
                    "haslastm": "false",
                    "rank": '0'
                }
                yield scrapy.FormRequest(url=post_url, formdata=data, callback=self.parse_post)

    def parse_post(self, response):
        # print(response.text)
        # Exercise: parse the JSON and store the POI data
        pass
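The exercise left in parse_post could be sketched as below. The key names ("data", "list", "cnname", "grade", "commentCount") are hypothetical placeholders for the poi.php JSON, so print a real response first and adjust them:

import json

def parse_post(self, response):
    result = json.loads(response.text)
    # "data"/"list"/"cnname"/"grade"/"commentCount" are assumed key names
    for poi in result.get("data", {}).get("list", []):
        print(poi.get("cnname"), poi.get("grade"), poi.get("commentCount"))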
- The pipeline file
import redis
import json

class QiongyouPipeline(object):
    def open_spider(self, spider):
        # Only the city spider needs the Redis connection
        if spider.name == "city":
            self.rds = redis.StrictRedis(host="www.fanjianbo.com", port=6379, db=8)

    def process_item(self, item, spider):
        if spider.name == "city":
            self.rds.lpush("Qiuyou:cityList", json.dumps(dict(item)))
        return item

    def close_spider(self, spider):
        pass
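For the pipeline to run, it has to be registered in the project's settings.py. Assuming the project is named Qiongyou (as the import Qiongyou.items above suggests) and the class lives in Qiongyou/pipelines.py:

ITEM_PIPELINES = {
    'Qiongyou.pipelines.QiongyouPipeline': 300,
}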