Topic
- Notes based on the 创智播客 video tutorial
- http://www.iqiyi.com/v_19rr8ahgw4.html#curid=764882600_53a7f394e1341b7d5c21922442855ca4
- Selectors: a Selector has four basic methods, of which XPath is the most commonly used (a short sketch follows this list)
- xpath(): takes an XPath expression and returns a SelectorList of all nodes matching that expression
- extract(): serializes the node(s) to unicode strings and returns them as a list
- css(): takes a CSS expression and returns a SelectorList of all matching nodes; the syntax is in the same spirit as BeautifulSoup4
- re(): extracts data with the given regular expression and returns a list of unicode strings
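- A minimal, self-contained sketch of the four methods (the HTML snippet and job paths below are made up for illustration):
from scrapy.selector import Selector

html = '<div><a href="/job/1">Engineer</a><a href="/job/2">Designer</a></div>'
sel = Selector(text=html)

# xpath(): a SelectorList of matching nodes
links = sel.xpath('//a')
# extract(): serialize the nodes to unicode strings, returned as a list
print(links.xpath('./text()').extract())         # ['Engineer', 'Designer']
# css(): the same idea with a CSS expression
print(sel.css('a::attr(href)').extract())        # ['/job/1', '/job/2']
# re(): extract with a regular expression; returns a list of strings
print(sel.xpath('//a/@href').re(r'/job/(\d+)'))  # ['1', '2']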
- Scrapy has two main spider classes: Spider and CrawlSpider
- Case study: scraping Tencent recruitment postings
Tencent social recruitment site: https://hr.tencent.com/position.php?&start=#a0
Position name: positionName
Position link: positionLink
Position type: positionType
Number of openings: peopleNumber
Work location: workLocation
Publish time: publishTime
- Note: XPath positions count from 1, and querying with xpath() yields a list of Selector objects, so you must call extract() to get the actual values (a tiny sketch follows)
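- A tiny sketch of both rules (the HTML snippet is made up for illustration):
from scrapy.selector import Selector

sel = Selector(text='<ul><li>first</li><li>second</li></ul>')
nodes = sel.xpath('//li[1]/text()')  # [1] selects the FIRST <li>, not the second
print(nodes)                         # a SelectorList of Selector objects
print(nodes.extract()[0])            # 'first' -- extract() yields plain strings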
- Workflow
- scrapy startproject xxx
- scrapy genspider xxx "http://xxx.com"
- Write items.py to declare the data to extract
- Write spiders/xxx.py: the spider handles requests and responses and extracts the data (yield item, or yield scrapy.Request(url, callback=self.parse))
- One caveat: the tutorial calls encode() on the extracted strings and this practice run does not; if encoding problems appear, resolve them as your situation requires
- Write pipelines.py: the pipeline processes the items returned by the spider
- Edit settings.py to enable the pipeline component and make other related settings
- Run the spider: scrapy crawl xxx
- Why yield matters: after handing an item off, execution resumes where it left off, whereas return would end the method immediately (see the generator sketch below)
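- A plain-Python illustration of the difference (not Scrapy-specific):
def with_yield():
    for i in range(3):
        yield i  # hands i to the caller, then resumes here next time

def with_return():
    for i in range(3):
        return i  # ends the function on the first iteration

print(list(with_yield()))  # [0, 1, 2]
print(with_return())       # 0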
- Write items.py to define the data to collect
import scrapy

class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # position name
    positionName = scrapy.Field()
    # position link
    positionLink = scrapy.Field()
    # position type
    positionType = scrapy.Field()
    # number of openings
    peopleNumber = scrapy.Field()
    # work location
    workLocation = scrapy.Field()
    # publish time
    publishTime = scrapy.Field()
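- A quick sanity check (hypothetical values, not from the tutorial): an Item instance behaves like a dict restricted to its declared fields:
item = TencentItem()
item['positionName'] = 'Backend Engineer'  # hypothetical value
print(dict(item))                          # {'positionName': 'Backend Engineer'}
# item['salary'] = '20k'                   # would raise KeyError: field not declared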
- Write tencent.py to extract the data, following the next-page link to keep extracting
import scrapy
from Tencent.items import TencentItem

class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['hr.tencent.com']
    # start_urls = ['http://tencent.com/']
    baseURL = 'https://hr.tencent.com/position.php?&start='
    # pagination offset
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        node_list = response.xpath("//tr[@class='even'] | //tr[@class='odd']")
        for node in node_list:
            item = TencentItem()
            # extract each position's fields (the tutorial additionally
            # called .encode('utf-8') on each value; see the caveat above)
            item['positionName'] = node.xpath("./td[1]/a/text()").extract()[0]
            item['positionLink'] = node.xpath("./td[1]/a/@href").extract()[0]
            if len(node.xpath("./td[2]/text()")):
                item['positionType'] = node.xpath("./td[2]/text()").extract()[0]
            else:
                item['positionType'] = ''
            item['peopleNumber'] = node.xpath("./td[3]/text()").extract()[0]
            item['workLocation'] = node.xpath("./td[4]/text()").extract()[0]
            item['publishTime'] = node.xpath("./td[5]/text()").extract()[0]
            yield item
        # alternative: paginate by incrementing the offset
        # if self.offset < 3900:
        #     self.offset += 10
        #     url = self.baseURL + str(self.offset)
        #     yield scrapy.Request(url, callback=self.parse)
        # paginate by following the "next" link until it is disabled
        if len(response.xpath("//a[@class='noactive' and @id='next']")) == 0:
            url = response.xpath("//a[@id='next']/@href").extract()[0]
            yield scrapy.Request("https://hr.tencent.com/" + url, callback=self.parse)
- Write pipelines.py, the pipeline file, to persist the data
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import json

class TencentPipeline(object):
    def __init__(self):
        self.f = open('tencent.json', 'w')

    def process_item(self, item, spider):
        # proper JSON would be: json.dumps(dict(item), ensure_ascii=False) + ',\n'
        content = str(dict(item)) + ",\n"
        self.f.write(content)
        return item

    def close_spider(self, spider):
        self.f.close()
- Edit settings.py
ROBOTSTXT_OBEY = False
and enable the pipeline:
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,
}
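- A hedged alternative to the str(dict(item)) trick above: under Python 3, json.dumps with ensure_ascii=False writes readable Chinese and valid JSON Lines. The class name TencentJsonPipeline is hypothetical, not from the tutorial:
import json

class TencentJsonPipeline(object):  # hypothetical name, not in the tutorial
    def open_spider(self, spider):
        self.f = open('tencent.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()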
Downloading streamer images from the Douyu 颜值 (beauty) section
items.py
import scrapy

class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # streamer nickname
    nickname = scrapy.Field()
    # image link
    imagelink = scrapy.Field()
douyu.py
import json
import scrapy
from Douyu.items import DouyuItem

class DouyuSpider(scrapy.Spider):
    name = 'douyu'
    allowed_domains = ['douyucdn.com']
    # start_urls = ['http://douyucdn.com/']
    # http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0
    baseURL = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    offset = 0
    start_urls = [baseURL + str(offset)]

    def parse(self, response):
        data_list = json.loads(response.body)['data']
        print(len(data_list))
        # stop when the API returns an empty page
        if len(data_list) == 0:
            return
        for data in data_list:
            item = DouyuItem()
            item['nickname'] = data['nickname']
            item['imagelink'] = data['vertical_src']
            yield item
        # request the next page; dont_filter=True keeps Scrapy from dropping it
        self.offset += 20
        yield scrapy.Request(self.baseURL + str(self.offset), callback=self.parse, dont_filter=True)
pipelines.py
import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline  # scrapy.contrib.pipeline.images in old Scrapy releases
from Douyu.settings import IMAGES_STORE as images_store

class DouyuPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # schedule a download request for each image link
        image_link = item['imagelink']
        yield scrapy.Request(image_link)

    # results looks like:
    # [(True,
    #   {'url': 'https://rpic.douyucdn.cn/amrpic-180515/1977639_1126.jpg',
    #    'path': 'full/ea1e27dfee48e49c029b5bd5aa1a1fd6652ce9cd.jpg',
    #    'checksum': '91a00aa61465331c42d55c5d96260427'})]
    def item_completed(self, results, item, info):
        # collect the stored paths of the successfully downloaded images
        image_path = [x['path'] for ok, x in results if ok]
        # rename the downloaded file to the streamer's nickname
        os.rename(images_store + image_path[0], images_store + item['nickname'] + '.jpg')
        return item
settings.py
Add the image storage path:
IMAGES_STORE = "/Users/zingfront/Documents/newdir/learn_scrapy/Douyu/Images/"
Add a user agent:
USER_AGENT = 'okhttp/3.8.1'
Enable the pipeline:
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 1,
    'Douyu.pipelines.DouyuPipeline': 300,
}
- Note: the problem hit here was that pagination stopped fetching streamer data; the fix is to pass dont_filter=True to scrapy.Request(), which tells Scrapy not to filter the request as a duplicate (and, since dont_filter also bypasses the offsite middleware, it likely matters here too, because allowed_domains is 'douyucdn.com' while the API host is capi.douyucdn.cn). A minimal sketch follows.
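- A minimal sketch of the fix, assuming the same DouyuSpider attributes as above:
import scrapy

def parse(self, response):
    # dont_filter=True keeps the scheduler's dupefilter (and the offsite
    # middleware) from silently dropping the request
    next_url = self.baseURL + str(self.offset + 20)
    yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)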