1. Create a Scrapy project
scrapy startproject caigou
2. Go into the project directory and create the Spider with the genspider command
scrapy genspider ccgp ccgp-shaanxi.gov.cn
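After these two commands the project directory should look roughly like this (ccgp2.py and any extra files below are added by hand later):
caigou/
    scrapy.cfg
    caigou/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            ccgp.py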
3. Define the data to scrape (items.py)
import scrapy


class CaigouItem(scrapy.Item):
    # notice id
    notice_id = scrapy.Field()
    # province name
    province = scrapy.Field()
    # district name
    area = scrapy.Field()
    # notice type
    notice_type_title = scrapy.Field()
    # notice title
    notice_title = scrapy.Field()
    # notice publication date
    notice_date = scrapy.Field()
    # notice url
    notice_url = scrapy.Field()
    # publishing organization
    pub_company = scrapy.Field()
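For reference, a scrapy.Item behaves like a dict, so the spider fills and reads these fields as below (a quick interactive sketch, not part of the project files):
from caigou.items import CaigouItem

item = CaigouItem()
item['province'] = '陕西省本级'        # assign a declared field
item['notice_type_title'] = '采购公告'
print(item['province'])                # read it back like a dict
# assigning a field that is not declared in CaigouItem raises KeyError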
4. Write the Spider that extracts the item data (spiders/ccgp.py)
# -*- coding: utf-8 -*-
# The tricky part of this case: the data we want is loaded by AJAX POST requests, so the correct
# form data has to be submitted before the server returns the right results.
# For one notice type the AJAX URL is the same across regions; only the submitted form parameters
# differ, so duplicate filtering has to be disabled with dont_filter=True.
# When using this spider, remove the SeleniumMiddleWare middleware from settings.
# Note the XPath rule for the page count: from page 5 onwards the total is displayed as "...",
# so the number has to be taken from the "last page" button instead.
import re
import scrapy
from ..items import CaigouItem


class CcgpSpider(scrapy.Spider):
    name = 'ccgp'
    allowed_domains = ['ccgp-shaanxi.gov.cn']
    start_urls = ['http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province']

    def parse(self, response):
        items = []
        # notice type codes, e.g. totype('3')
        notice_type_nums = response.xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li/@onclick').extract()
        # notice type names
        notice_type_titles = response.xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li/a/text()').extract()
        patt_notice = r"\d+"
        # province names and codes, e.g. regionCity('610001','陕西省本级')
        regionCitys = response.xpath('//ul[@id="toRegion"]/li/@onclick').extract()
        patt = r"regionCity\('(\d+)','(.+)'\)"
        # URL the form is posted to
        form_url = "http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype="
        length = len(notice_type_titles)
        # for testing: limit the loop over notice types (use range(length) for all of them)
        for i in range(6, 8):
        # for i in range(length):
            notice_num = re.findall(patt_notice, notice_type_nums[i])
            notice_num = notice_num[0]
            notice_type_title = notice_type_titles[i]
            # for testing: limit the loop over provinces
            for regionCity in regionCitys[4:6]:
            # for regionCity in regionCitys:
                notice_dict = {}
                result = re.findall(patt, regionCity)
                # province name
                notice_dict['province'] = result[0][1]
                # province code
                notice_dict['province_code'] = result[0][0]
                # notice type
                notice_dict['notice_type_title'] = notice_type_title
                # URL the form is submitted to
                notice_dict['form_url'] = form_url + notice_num
                items.append(notice_dict)
        # items: [{"province": "陕西省本级", "province_code": "610001", "notice_type_title": "采购公告",
        #          "form_url": "http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=3"},
        #         {"province": "陕西省本级", "province_code": "610001", "notice_type_title": "结果公告",
        #          "form_url": "http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=5"}]
        for item in items:
            # the page only shows the data (and the page count) after the form has been submitted
            yield scrapy.FormRequest(url=item['form_url'], meta={"meta_1": item},
                                     formdata={"page.pageNum": "1", "parameters['regionguid']": item['province_code']},
                                     callback=self.parse_pages)
    # get the number of pages
    def parse_pages(self, response):
        meta_1 = response.meta['meta_1']
        province = meta_1['province']
        notice_type_title = meta_1['notice_type_title']
        item = {}
        item['province'] = province
        item['notice_type_title'] = notice_type_title
        # total page count, taken from e.g. javascript:toPage('',1525);
        re_total_pages = response.xpath('//ul[@class="pagination"]/li[last()-2]/a/@href').get()
        # re_total_pages may be None when there is no pagination at all
        total_pages = re.findall(r"\d+", re_total_pages or "")
        if not total_pages:
            total_pages = "1"
        else:
            total_pages = total_pages[0]
        # for testing: cap the total page count at 3
        if total_pages != '1':
            total_pages = '3'
        # print("[%s][%s] total pages: %s" % (province, notice_type_title, total_pages))
        for page in range(1, int(total_pages) + 1):
            yield scrapy.FormRequest(url=meta_1['form_url'], meta={"meta_2": item},
                                     formdata={"page.pageNum": str(page), "parameters['regionguid']": meta_1['province_code']},
                                     callback=self.parse_detail, dont_filter=True)
    def parse_detail(self, response):
        meta_2 = response.meta['meta_2']
        # district names (as a list)
        areas = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[2]/text()').extract()
        # notice titles
        notice_titles = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[3]/@title').extract()
        # notice urls
        notice_urls = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[3]/a/@href').extract()
        length = len(notice_titles)
        province = meta_2['province']
        notice_type_title = meta_2['notice_type_title']
        if length > 0:
            for i in range(length):
                item = {}
                item['notice_id'] = notice_urls[i].split("=")[-1]
                # province
                item['province'] = province
                # notice type
                item['notice_type_title'] = notice_type_title
                # notice title
                item['notice_title'] = notice_titles[i]
                # notice url
                item['notice_url'] = notice_urls[i]
                # district
                item['area'] = areas[i]
                yield scrapy.Request(url=item['notice_url'], meta={"meta_3": item}, callback=self.parse_info)
        else:
            item = CaigouItem()
            no_data = "无数据"
            item['notice_id'] = province + notice_type_title + no_data
            # province
            item['province'] = province
            # notice type
            item['notice_type_title'] = notice_type_title
            # notice title
            item['notice_title'] = no_data
            # notice url
            item['notice_url'] = no_data
            # district
            item['area'] = no_data
            # publishing organization
            item['pub_company'] = no_data
            # publication date
            item['notice_date'] = no_data
            yield item
    def parse_info(self, response):
        meta_3 = response.meta['meta_3']
        item = CaigouItem()
        # notice id
        item['notice_id'] = meta_3['notice_id']
        # province
        item['province'] = meta_3['province']
        # district
        item['area'] = meta_3['area']
        # notice type
        item['notice_type_title'] = meta_3['notice_type_title']
        # publishing organization, e.g. 发布单位:陕西炬荣招标代理有限公司
        pub_company = response.xpath('//div[@class="content_about"]/span[1]/text()').get()
        item['pub_company'] = (pub_company or "").strip().split(":")[-1]
        # notice title
        item['notice_title'] = meta_3['notice_title']
        # publication date
        item['notice_date'] = response.xpath('//div[@class="content_about"]/span[2]/em/text()').get()
        # notice url
        item['notice_url'] = meta_3['notice_url']
        yield item
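If you want to check the AJAX endpoint outside Scrapy first, something like the following requests sketch can be used (assuming the site still accepts the same form fields as above; the province code 610001 comes from the example in the comments):
import requests

url = "http://www.ccgp-shaanxi.gov.cn/notice/noticeaframe.do?isgovertment=&noticetype=3"
data = {"page.pageNum": "1", "parameters['regionguid']": "610001"}
resp = requests.post(url, data=data, timeout=10)
print(resp.status_code)
# if the parameters are accepted, the returned HTML fragment should contain the list-box table
print(len(resp.text))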
5. Optional: a second Spider that extracts the item data (spiders/ccgp2.py). It adds the SeleniumMiddleWare middleware and uses Selenium to simulate clicking through the next-page button for the first province and the first notice type. See also the custom_settings sketch after this code block.
# -*- coding: utf-8 -*-
# Built on top of ccgp.py: this spider adds the SeleniumMiddleWare middleware and uses Selenium
# to simulate clicking through the pages for the first province and the first notice type.
import re
import time
import scrapy
from scrapy.utils.project import get_project_settings


class CcgpSpider(scrapy.Spider):
    name = 'ccgp2'
    allowed_domains = ['ccgp-shaanxi.gov.cn']
    start_urls = ['http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province']
    next_first_datas = []
    now_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    # clear the log file first; rewriting it like this has the same effect as deleting it
    with open(get_project_settings().get('LOG_FILE'), 'w+', encoding='utf-8') as f:
        f.write('清空日志完成,当前时间为:%s\n' % now_time)

    def parse(self, response):
        # total page count, taken from e.g. javascript:toPage('',1525);
        re_total_pages = response.xpath('//ul[@class="pagination"]/li[last()-2]/a/@href').get()
        # re_total_pages may be None when there is no pagination at all
        total_pages = re.findall(r"\d+", re_total_pages or "")
        if not total_pages:
            total_pages = "无数据"
        else:
            total_pages = total_pages[0]
        if total_pages != "无数据":
            # for testing: only take the last 3 pages, can be removed
            page = int(total_pages) - 2
            notice_nums = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[1]/text()').extract()
            notice_type_title_num = 1
            regionCity_num = 1
            # if the first entry of the next page equals the first entry of the previous page, stop;
            # to keep the list from growing too large, it is cleared whenever a new first entry shows up
            # the url id of the first entry may repeat, so the row number (first column) is used instead
            notice_num_first = notice_nums[0]
            if notice_num_first not in self.next_first_datas:
                self.next_first_datas.clear()
                self.next_first_datas.append(notice_num_first)
                yield scrapy.Request(url=response.request.url,
                                     meta={"next_page": True,
                                           "notice_type_title_num": notice_type_title_num,
                                           "regionCity_num": regionCity_num, "page": page},
                                     dont_filter=True)
                # district names (as a list)
                areas = response.xpath('//div[@class="list-box"]/table//tbody/tr/td[2]/text()').extract()
                # notice titles
                notice_titles = response.xpath(
                    '//div[@class="list-box"]/table//tbody/tr/td[3]/@title').extract()
                # notice urls
                notice_urls = response.xpath(
                    '//div[@class="list-box"]/table//tbody/tr/td[3]/a/@href').extract()
                length = len(notice_titles)
                for i in range(length):
                    item = {}
                    item['notice_id'] = notice_urls[i].split("=")[-1]
                    item['area'] = areas[i].strip()
                    item['notice_title'] = notice_titles[i].strip()
                    item['notice_url'] = notice_urls[i]
                    yield item
            else:
                # current page number
                current_page = response.xpath('//ul[@class="pagination"]/li[@class="active"]/a/text()').get()
                print("已经获取到最后一页:%s" % current_page)
        else:
            no_data = "无数据"
            item = {}
            item['area'] = no_data
            item['notice_title'] = no_data
            item['notice_url'] = no_data
            yield item
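Because ccgp2 needs SeleniumMiddleWare while ccgp has to run without it, an alternative to editing settings.py each time is a per-spider override via custom_settings (a sketch of the ccgp2 class head only, not the original code):
import scrapy

class CcgpSpider(scrapy.Spider):
    name = 'ccgp2'
    # custom_settings overrides the project settings for this spider only
    custom_settings = {
        'DOWNLOADER_MIDDLEWARES': {
            'caigou.middlewares.RandomUserAgentMiddleWare': 543,
            'caigou.middlewares.SeleniumMiddleWare': 544,
        },
    }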
6. Save the scraped data in the pipeline file; the results can be written to an Excel file or to MySQL/MongoDB/Redis (pipelines.py). A note on RedisPipeline ordering follows the code block.
# -*- coding: utf-8 -*-
import time
from openpyxl import Workbook
import pymysql
import pymongo
import redis
from scrapy.exceptions import DropItem
from scrapy.utils.project import get_project_settings
from scrapy import Item
class XlsxPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        self.ws.title = "采购网数据表"
        self.ws.append(['notice_id', '区域', '地区', '公告类型',
                        '发布单位', '公告标题',
                        '公告发布时间', '公告url'])

    def process_item(self, item, spider):
        text = [item['notice_id'], item['province'], item['area'], item['notice_type_title'],
                item['pub_company'], item['notice_title'],
                item['notice_date'], item['notice_url']]
        self.ws.append(text)
        return item

    def close_spider(self, spider):
        file_end_name = time.strftime("%Y-%m-%d", time.localtime())
        self.wb.save(spider.name + file_end_name + ".xlsx")
        print("表格处理完毕!")
class MysqlPipeline():
    @classmethod
    def from_crawler(cls, crawler):
        cls.MYSQL_HOST = crawler.settings.get('MYSQL_HOST')
        cls.MYSQL_PORT = crawler.settings.get('MYSQL_PORT')
        cls.MYSQL_USER = crawler.settings.get('MYSQL_USER')
        cls.MYSQL_PASSWD = crawler.settings.get('MYSQL_PASSWD')
        cls.MYSQL_DBNAME = crawler.settings.get('MYSQL_DBNAME')
        cls.MYSQL_CHARSET = crawler.settings.get('MYSQL_CHARSET')
        return cls()

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.MYSQL_HOST, port=self.MYSQL_PORT, user=self.MYSQL_USER,
                                  passwd=self.MYSQL_PASSWD, db=self.MYSQL_DBNAME, charset=self.MYSQL_CHARSET)
        self.cursor = self.db.cursor()
        # count how many new rows get inserted
        self.num = 0

    def process_item(self, item, spider):
        try:
            sql = """CREATE TABLE IF NOT EXISTS caigou(notice_id VARCHAR(40) PRIMARY KEY NOT NULL,
                  province VARCHAR(10), area VARCHAR(17), notice_type_title VARCHAR(8), pub_company VARCHAR(60),
                  notice_title VARCHAR(150), notice_date VARCHAR(20), notice_url VARCHAR(150)) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;"""
            self.cursor.execute(sql)
        except Exception:
            pass
        try:
            self.cursor.execute("select notice_id from caigou WHERE notice_id=%s", (item['notice_id'],))
            switch = self.cursor.fetchone()
            keys, values = zip(*item.items())
            if switch:
                # the row already exists: refresh it via INSERT ... ON DUPLICATE KEY UPDATE
                sql = "insert into caigou({}) VALUES ({}) on duplicate key update {};".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values)),
                    ','.join(["{}=%s".format(k) for k in keys])
                )
                self.cursor.execute(sql, values * 2)
            else:
                self.cursor.execute("insert into caigou({}) VALUES ({});".format(
                    ','.join(keys),
                    ','.join(['%s'] * len(values))
                ), values)
                self.num += 1
            self.db.commit()
        except Exception as e:
            print("MYSQL出错:", e)
            self.db.rollback()
        return item

    def close_spider(self, spider):
        print("MYSQL处理完毕,本次共计增加%s条数据!" % self.num)
        self.cursor.close()
        self.db.close()
class MongoPipeline():
    MONGO_HOST = get_project_settings().get('MONGO_HOST')
    MONGO_PORT = get_project_settings().get('MONGO_PORT')
    MONGO_DB = get_project_settings().get('MONGO_DB')

    def open_spider(self, spider):
        self.cli = pymongo.MongoClient(self.MONGO_HOST, self.MONGO_PORT)
        # count how many new documents get inserted
        self.num = 0

    def process_item(self, item, spider):
        try:
            self.db = self.cli[self.MONGO_DB]
            ccgp = self.db[spider.name]
            data = dict(item) if isinstance(item, Item) else item
            notice_title = item['notice_title']
            notice_id = item['notice_id']
            # count_documents()/insert_one() are the current pymongo APIs
            # (cursor.count() and collection.insert() were removed in pymongo 4)
            count = ccgp.count_documents({'notice_id': notice_id})
            if count == 0:
                print("%s添加mongo数据库中..." % notice_title)
                ccgp.insert_one(data)
                self.num += 1
            else:
                print("%s:该数据已存在无需添加!" % notice_title)
        except Exception as e:
            print("mongodb出错:", e)
        return item

    def close_spider(self, spider):
        print("MongoDB数据库处理完毕,共计增加%s条数据!" % self.num)
        self.cli.close()
class RedisPipeline():
    @classmethod
    def from_crawler(cls, crawler):
        cls.REDIS_HOST = crawler.settings.get('REDIS_HOST')
        cls.REDIS_PORT = crawler.settings.get('REDIS_PORT')
        cls.REDIS_DBNAME = crawler.settings.get('REDIS_DBNAME')
        cls.REDIS_decode_responses = crawler.settings.get('REDIS_decode_responses')
        return cls()

    def open_spider(self, spider):
        try:
            # use keyword arguments: the positional order of StrictRedis is (host, port, db, password, ...)
            self.redis_client = redis.StrictRedis(host=self.REDIS_HOST, port=self.REDIS_PORT,
                                                  db=self.REDIS_DBNAME,
                                                  decode_responses=self.REDIS_decode_responses)
        except Exception as e:
            print("redis数据库出错:", e)

    def process_item(self, item, spider):
        # sadd returns 1 only if the notice_id was not in the set yet; otherwise drop the duplicate
        if self.redis_client.sadd("ccgp:items", item['notice_id']):
            return item
        raise DropItem("duplicate notice_id: %s" % item['notice_id'])

    def close_spider(self, spider):
        print("redis处理完毕!")
7. Configure the settings file (settings.py)
# optionally write the log to a local file
LOG_FILE = "ccgp.log"
LOG_LEVEL = "DEBUG"
# also redirect print output into the log
LOG_STDOUT = True

MYSQL_HOST = "localhost"
MYSQL_PORT = 3306
MYSQL_USER = "root"
MYSQL_PASSWD = "123456"
MYSQL_DBNAME = "python5"
MYSQL_CHARSET = "utf8mb4"

MONGO_HOST = "localhost"
MONGO_PORT = 27017
MONGO_DB = "py4"

REDIS_HOST = "localhost"
REDIS_PORT = 6379
REDIS_DBNAME = 4
REDIS_decode_responses = True  # store values as str instead of bytes

# pool of User-Agent strings for the random request header middleware
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)",
    "Opera/9.80 (Windows NT 6.1; U; zh-cn) Presto/2.9.168 Version/11.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0"
]

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3

# downloader middlewares; only enable SeleniumMiddleWare when running the ccgp2 spider
DOWNLOADER_MIDDLEWARES = {
    'caigou.middlewares.RandomUserAgentMiddleWare': 543,
    # 'caigou.middlewares.SeleniumMiddleWare': 544,
    # 'caigou.middlewares.CaigouDownloaderMiddleware': 543,
}

# enable the pipelines you need
ITEM_PIPELINES = {
    'caigou.pipelines.XlsxPipeline': 300,
    # 'caigou.pipelines.MysqlPipeline': 301,
    # 'caigou.pipelines.MongoPipeline': 302,
    # 'caigou.pipelines.RedisPipeline': 303,
}
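To confirm the project actually picks these values up, you can query any setting from the project directory with the scrapy settings command, for example:
scrapy settings --get MYSQL_HOST
scrapy settings --get LOG_FILE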
8. Optional: add the random User-Agent middleware and the Selenium middleware (middlewares.py). A note on the Selenium 4 locator API follows the code block.
import time
import random
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver


class RandomUserAgentMiddleWare():
    @classmethod
    def from_crawler(cls, crawler):
        cls.USER_AGENT_LIST = crawler.settings.get('USER_AGENT_LIST')
        return cls()

    def process_request(self, request, spider):
        user_agent = random.choice(self.USER_AGENT_LIST)
        print("当前选用的请求头:%s" % user_agent)
        request.headers['User-Agent'] = user_agent

class SeleniumMiddleWare():
    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        # crawler.signals.connect(s.engine_started, signal=signals.engine_started)
        # cls.LOG_FILE = crawler.settings.get('LOG_FILE')
        return s

    # clearing the log file here does work, but some of the start-up log lines get wiped as well,
    # so it is done directly in the spider file instead
    # def engine_started(self):
    #     with open(self.LOG_FILE, 'w+') as f:
    #         f.write('')
    #     print("清空日志文件内容!")

    def spider_opened(self, spider):
        self.chrome = webdriver.Chrome()
        self.chrome.maximize_window()

    def process_request(self, request, spider):
        # the very first request carries no next_page meta key
        # request.meta on the first request: {'download_timeout': 180.0}
        if not request.meta.get("next_page", False):
            self.chrome.get(request.url)
        # request.meta on the second request: {'next_page': True, 'depth': 1, 'download_timeout': 180.0}
        else:
            if request.meta['depth'] == 1 or request.meta.get("current_page", False):
                notice_type_title_num = request.meta['notice_type_title_num']
                regionCity_num = request.meta['regionCity_num']
                time.sleep(2)
                print("点击区域第%s个" % notice_type_title_num)
                self.chrome.find_element_by_xpath(
                    '//ul[@id="toRegion"]/li[' + str(notice_type_title_num) + ']').click()
                time.sleep(1)
                print("点击类型第%s个" % notice_type_title_num)
                self.chrome.find_element_by_xpath('//div[@class="list-group"]/div/ul[@class="type-list"]/li[' + str(
                    regionCity_num) + ']').click()
                # for testing: there are too many pages, so jump straight to the requested page
                # to see how the last page is handled
                time.sleep(2)
                self.chrome.find_element_by_id('infoNoticeInputPage').clear()
                time.sleep(1)
                self.chrome.find_element_by_id('infoNoticeInputPage').send_keys(request.meta['page'])
                time.sleep(1)
                self.chrome.find_element_by_xpath('//ul[@class="pagination"]/li[last()]/button').click()
                time.sleep(1)
            else:
                # click "next page"
                self.chrome.find_element_by_xpath('//ul[@class="pagination"]/li[last()-3]/a').click()
                time.sleep(2)
        html = self.chrome.page_source
        return HtmlResponse(url=request.url, body=html.encode('utf-8'), encoding='utf-8', request=request)

    def spider_closed(self, spider):
        time.sleep(3)
        print("准备关闭浏览器...")
        self.chrome.quit()
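Note that find_element_by_xpath / find_element_by_id were removed in Selenium 4; if you run a newer Selenium, the calls in SeleniumMiddleWare need the By locator form instead, roughly like this standalone sketch (assuming chromedriver is available, as in the middleware above):
from selenium import webdriver
from selenium.webdriver.common.by import By

chrome = webdriver.Chrome()
chrome.get('http://www.ccgp-shaanxi.gov.cn/notice/list.do?noticetype=3&province=province')
# Selenium 4 form of the locator calls used in SeleniumMiddleWare
chrome.find_element(By.XPATH, '//ul[@id="toRegion"]/li[1]').click()
chrome.find_element(By.ID, 'infoNoticeInputPage').clear()
chrome.quit()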
9. Remember to start MySQL/Redis/MongoDB in advance and create the corresponding database and table:
CREATE TABLE IF NOT EXISTS caigou(
    notice_id VARCHAR(40) PRIMARY KEY NOT NULL,
    province VARCHAR(10), area VARCHAR(17), notice_type_title VARCHAR(8),
    pub_company VARCHAR(60), notice_title VARCHAR(150),
    notice_date VARCHAR(20), notice_url VARCHAR(150)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
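The MySQL database itself (MYSQL_DBNAME = "python5" in settings.py) also has to exist before the table can be created; MongoDB and Redis create their databases/keys on first write, so only MySQL needs this extra step, roughly:
CREATE DATABASE IF NOT EXISTS python5 DEFAULT CHARACTER SET utf8mb4;
USE python5;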
10. With everything configured, start the crawl: run the project command crawl to launch the Spider:
scrapy crawl ccgp
or
scrapy crawl ccgp2
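Besides the command line, the spiders can also be launched from a small script using Scrapy's CrawlerProcess (a sketch; run it from the project root so the project settings are found):
# run.py, placed next to scrapy.cfg
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('ccgp')      # or 'ccgp2'
process.start()            # blocks until the crawl finishes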