Linux: pip3 install scrapy
Windows:
a: pip3 install wheel
b: download the Twisted high-performance async module from https://www.lfd.uci.edu/~gohlke/pythonlibs/#twisted
c: cd into the download directory and run pip3 install Twisted-17.1---.whl
d: pip3 install pywin32
e: pip3 install scrapy
Create a project: scrapy startproject <project name>
Create a spider file: cd into the project directory, then:
scrapy genspider <spider name> www.baidu.com
Run the spider: scrapy crawl <spider name> --nolog
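For example, the full sequence for case 1 below might look like this (project name boospro and spider name boos are borrowed from that case):
scrapy startproject boospro
cd boospro
scrapy genspider boos www.zhipin.com
scrapy crawl boos --nolog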
A quick tour of the framework: the spiders directory holds the spider files
the items file defines the fields for persistent storage and works together with the pipelines
middlewares holds the downloader middlewares and the spider middlewares
the pipelines handle persistence; you can write multiple pipeline classes
the settings file holds the configuration
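For reference, scrapy startproject generates a layout like this (shown for a hypothetical project named boospro):
boospro/
    scrapy.cfg          # deployment configuration
    boospro/
        __init__.py
        items.py        # field definitions for persisted items
        middlewares.py  # downloader and spider middlewares
        pipelines.py    # persistence pipelines
        settings.py     # project configuration
        spiders/        # spider files go here
            __init__.py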
settings.py configuration:
Add a User-Agent: USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
Ignore robots.txt: ROBOTSTXT_OBEY = False
For persistent storage, uncomment ITEM_PIPELINES; with multiple pipeline classes, register each one and assign its priority (see the registration sketch after case 1's pipelines)
To use middlewares, uncomment the corresponding middleware settings, as in the sketch below
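A minimal sketch of the corresponding settings.py entries (the middleware class name is the one scrapy startproject would generate for the hypothetical boospro project):
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
ROBOTSTXT_OBEY = False
# uncomment to enable the downloader middleware
# DOWNLOADER_MIDDLEWARES = {
#     'boospro.middlewares.BoosproDownloaderMiddleware': 543,
# }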
Case 1: scraping job listings from zhipin.com (BOSS直聘). Key point: configuring pipelines for several persistence backends.
import scrapy
from boospro.items import BoosproItem  # assuming the project is named boospro

class BoosSpider(scrapy.Spider):
    name = 'boos'
    # allowed_domains = ['www.baidu.com']
    start_urls = [
        'https://www.zhipin.com/job_detail/?query=python%E7%88%AC%E8%99%AB&scity=101010100&industry=&position=']

    def parse(self, response):
        # each <li> under the job list is one job posting
        li_list = response.xpath('//div[@class="job-list"]/ul/li')
        for li in li_list:
            title = li.xpath('.//div[@class="info-primary"]/h3/a/div[@class="job-title"]/text()').extract_first()
            price = li.xpath('.//div[@class="info-primary"]/h3/a/span/text()').extract_first()
            company = li.xpath('.//div[@class="info-company"]/div/h3/a/text()').extract_first()
            item = BoosproItem()
            item['title'] = title
            item['price'] = price
            item['company'] = company
            yield item  # hand the item to the pipelines
Item definition (items.py):
import scrapy

class BoosproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    price = scrapy.Field()
    company = scrapy.Field()
Pipeline configuration (pipelines.py):
import json
import pymysql
from redis import Redis

class BoosproPipeline(object):
    # writes each item to a local text file
    fp = None

    def open_spider(self, spider):
        print('spider started')
        self.fp = open('./job.txt', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        self.fp.write(item['title'] + '\t' + item['price'] + '\t' + item['company'] + '\n')
        return item  # pass the item on to the next pipeline

    def close_spider(self, spider):
        print('spider finished')
        self.fp.close()

class MysqlPipeline(object):
    # writes each item to a MySQL table
    conn = None
    cursor = None

    def open_spider(self, spider):
        print('spider started')
        self.conn = pymysql.Connect(host='127.0.0.1', port=3306, user='root', password='321', db='pa')

    def process_item(self, item, spider):
        self.cursor = self.conn.cursor()
        # parameterized query, so quotes in the scraped text cannot break the SQL
        sql = 'insert into job values (%s, %s, %s)'
        try:
            self.cursor.execute(sql, (item['title'], item['price'], item['company']))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        print('spider finished')
        self.cursor.close()
        self.conn.close()

class RedisPipeline(object):
    # pushes each item onto a Redis list as a JSON string
    conn = None

    def open_spider(self, spider):
        print('spider started')
        self.conn = Redis(host='127.0.0.1', port=6379, db=14)

    def process_item(self, item, spider):
        dic = {
            'title': item['title'],
            'price': item['price'],
            'company': item['company'],
        }
        self.conn.lpush('jobinfo', json.dumps(dic, ensure_ascii=False))
        return item
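For all three pipelines to run, each must be registered in ITEM_PIPELINES with a priority (lower numbers run first, and every registered pipeline sees each item); a sketch assuming the project is named boospro:
ITEM_PIPELINES = {
    'boospro.pipelines.BoosproPipeline': 300,  # text file
    'boospro.pipelines.MysqlPipeline': 301,    # MySQL
    'boospro.pipelines.RedisPipeline': 302,    # Redis
}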
Case 2: crawling a site across multiple pages
Spider file:
import scrapy
from chouti.items import ChoutiItem

class CtSpider(scrapy.Spider):
    name = 'ct'
    # allowed_domains = ['www.baidu.com']
    url = 'https://dig.chouti.com/r/scoff/hot/%d'  # page-number template
    page_num = 1
    start_urls = ['https://dig.chouti.com/r/scoff/hot/1']

    def parse(self, response):
        div_list = response.xpath('//div[@id="content-list"]/div')
        for div in div_list:
            head = div.xpath('./div[3]/div[1]/a/text()').extract_first()
            author = div.xpath('./div[3]/div[2]/a[4]/b/text()').extract_first()
            item = ChoutiItem()
            item['head'] = head
            item['author'] = author
            yield item
        # manually request the next page, reusing parse as the callback
        if self.page_num < 5:
            self.page_num += 1
            new_url = self.url % self.page_num
            yield scrapy.Request(url=new_url, callback=self.parse)
Item definition (items.py):
import scrapy

class ChoutiItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    head = scrapy.Field()
    author = scrapy.Field()
Pipeline configuration (pipelines.py):
class ChoutiproPipeline(object):
    def process_item(self, item, spider):
        print(item['head'], item['author'])
        return item
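As in case 1, this pipeline only runs once registered in settings.py; a sketch assuming the project is named chouti:
ITEM_PIPELINES = {
    'chouti.pipelines.ChoutiproPipeline': 300,
}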