一、master主机配置
1、开启redis服务器
2、city.py文件
# City codes. The seeding script substitutes each entry as the sub-domain in
# "https://{}.taoche.com/{}/", so every string must stay exactly as written.
CITY_CODE = ['shijiazhuang', 'tangshan', 'qinhuangdao', 'handan', 'xingtai', 'baoding', 'zhangjiakou',
'chengde', 'cangzhou', 'langfang', 'hengshui', 'taiyuan', 'datong', 'yangquan', 'changzhi', 'jincheng',
'shuozhou', 'jinzhong', 'yuncheng', 'xinzhou', 'linfen', 'lvliang', 'huhehaote', 'baotou', 'wuhai',
'chifeng', 'tongliao', 'eerduosi', 'hulunbeier', 'bayannaoer', 'wulanchabu', 'xinganmeng',
'xilinguolemeng', 'alashanmeng', 'changchun', 'jilin', 'hangzhou', 'ningbo', 'wenzhou', 'jiaxing',
'huzhou', 'shaoxing', 'jinhua', 'quzhou', 'zhoushan', 'tz', 'lishui', 'bozhou', 'chizhou', 'xuancheng',
'nanchang', 'jingdezhen', 'pingxiang', 'jiujiang', 'xinyu', 'yingtan', 'ganzhou', 'jian', 'yichun', 'jxfz',
'shangrao', 'xian', 'tongchuan', 'baoji', 'xianyang', 'weinan', 'yanan', 'hanzhong', 'yl', 'ankang',
'shangluo', 'lanzhou', 'jiayuguan', 'jinchang', 'baiyin', 'tianshui', 'wuwei', 'zhangye', 'pingliang',
'jiuquan', 'qingyang', 'dingxi', 'longnan', 'linxia', 'gannan', 'xining', 'haidongdiqu', 'haibei',
'huangnan', 'hainanzangzuzizhizho', 'guoluo', 'yushu', 'haixi', 'yinchuan', 'shizuishan', 'wuzhong',
'guyuan', 'zhongwei', 'wulumuqi', 'kelamayi', 'shihezi', 'tulufandiqu', 'hamidiqu', 'changji', 'boertala',
'bazhou', 'akesudiqu', 'xinjiangkezhou', 'kashidiqu', 'hetiandiqu', 'yili', 'tachengdiqu', 'aletaidiqu',
'xinjiangzhixiaxian', 'changsha', 'zhuzhou', 'xiangtan', 'hengyang', 'shaoyang', 'yueyang', 'changde',
'zhangjiajie', 'yiyang', 'chenzhou', 'yongzhou', 'huaihua', 'loudi', 'xiangxi', 'guangzhou', 'shaoguan',
'shenzhen', 'zhuhai', 'shantou', 'foshan', 'jiangmen', 'zhanjiang', 'maoming', 'zhaoqing', 'huizhou',
'meizhou', 'shanwei', 'heyuan', 'yangjiang', 'qingyuan', 'dongguan', 'zhongshan', 'chaozhou', 'jieyang',
'yunfu', 'nanning', 'liuzhou', 'guilin', 'wuzhou', 'beihai', 'fangchenggang', 'qinzhou', 'guigang',
'yulin', 'baise', 'hezhou', 'hechi', 'laibin', 'chongzuo', 'haikou', 'sanya', 'sanshashi', 'qiongbeidiqu',
'qiongnandiqu', 'hainanzhixiaxian', 'chengdu', 'zigong', 'panzhihua', 'luzhou', 'deyang', 'mianyang',
'guangyuan', 'suining', 'neijiang', 'leshan', 'nanchong', 'meishan', 'yibin', 'guangan', 'dazhou', 'yaan',
'bazhong', 'ziyang', 'aba', 'ganzi', 'liangshan', 'guiyang', 'liupanshui', 'zunyi', 'anshun',
'tongrendiqu', 'qianxinan', 'bijiediqu', 'qiandongnan', 'qiannan', 'kunming', 'qujing', 'yuxi', 'baoshan',
'zhaotong', 'lijiang', 'puer', 'lincang', 'chuxiong', 'honghe', 'wenshan', 'xishuangbanna', 'dali',
'dehong', 'nujiang', 'diqing', 'siping', 'liaoyuan', 'tonghua', 'baishan', 'songyuan', 'baicheng',
'yanbian', 'haerbin', 'qiqihaer', 'jixi', 'hegang', 'shuangyashan', 'daqing', 'yc', 'jiamusi', 'qitaihe',
'mudanjiang', 'heihe', 'suihua', 'daxinganlingdiqu', 'shanghai', 'tianjin', 'chongqing', 'nanjing', 'wuxi',
'xuzhou', 'changzhou', 'suzhou', 'nantong', 'lianyungang', 'huaian', 'yancheng', 'yangzhou', 'zhenjiang',
'taizhou', 'suqian', 'lasa', 'changdudiqu', 'shannan', 'rikazediqu', 'naqudiqu', 'alidiqu', 'linzhidiqu',
'hefei', 'wuhu', 'bengbu', 'huainan', 'maanshan', 'huaibei', 'tongling', 'anqing', 'huangshan', 'chuzhou',
'fuyang', 'sz', 'chaohu', 'luan', 'fuzhou', 'xiamen', 'putian', 'sanming', 'quanzhou', 'zhangzhou',
'nanping', 'longyan', 'ningde', 'jinan', 'qingdao', 'zibo', 'zaozhuang', 'dongying', 'yantai', 'weifang',
'jining', 'taian', 'weihai', 'rizhao', 'laiwu', 'linyi', 'dezhou', 'liaocheng', 'binzhou', 'heze',
'zhengzhou', 'kaifeng', 'luoyang', 'pingdingshan', 'jiyuan', 'anyang', 'hebi', 'xinxiang', 'jiaozuo',
'puyang', 'xuchang', 'luohe', 'sanmenxia', 'nanyang', 'shangqiu', 'xinyang', 'zhoukou', 'zhumadian',
'henanzhixiaxian', 'wuhan', 'huangshi', 'shiyan', 'yichang', 'xiangfan', 'ezhou', 'jingmen', 'xiaogan',
'jingzhou', 'huanggang', 'xianning', 'qianjiang', 'suizhou', 'xiantao', 'tianmen', 'enshi',
'hubeizhixiaxian', 'beijing', 'shenyang', 'dalian', 'anshan', 'fushun', 'benxi', 'dandong', 'jinzhou',
'yingkou', 'fuxin', 'liaoyang', 'panjin', 'tieling', 'chaoyang', 'huludao', 'anhui', 'fujian', 'gansu',
'guangdong', 'guangxi', 'guizhou', 'hainan', 'hebei', 'henan', 'heilongjiang', 'hubei', 'hunan', 'jl',
'jiangsu', 'jiangxi', 'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shandong', 'shanxi', 'shaanxi',
'sichuan', 'xizang', 'xinjiang', 'yunnan', 'zhejiang', 'jjj', 'jzh', 'zsj', 'csj', 'ygc']
# Car brand codes. The seeding script substitutes each entry as the path
# segment in "https://{}.taoche.com/{}/"; unusual spellings are kept verbatim
# because they are used as-is in the URL.
CAR_CODE_LIST = ['southeastautomobile', 'sma', 'audi', 'hummer', 'tianqimeiya', 'seat', 'lamborghini', 'weltmeister',
'changanqingxingche-281', 'chevrolet', 'fiat', 'foday', 'eurise', 'dongfengfengdu', 'lotus-146', 'jac',
'enranger', 'bjqc', 'luxgen', 'jinbei', 'sgautomotive', 'jonwayautomobile', 'beijingjeep', 'linktour',
'landrover', 'denza', 'jeep', 'rely', 'gacne', 'porsche', 'wey', 'shenbao', 'bisuqiche-263',
'beiqihuansu', 'sinogold', 'roewe', 'maybach', 'greatwall', 'chenggongqiche', 'zotyeauto', 'kaersen',
'gonow', 'dodge', 'siwei', 'ora', 'lifanmotors', 'cajc', 'hafeiautomobile', 'sol', 'beiqixinnengyuan',
'dorcen', 'lexus', 'mercedesbenz', 'ford', 'huataiautomobile', 'jmc', 'peugeot', 'kinglongmotor',
'oushang', 'dongfengxiaokang-205', 'chautotechnology', 'faw-hongqi', 'mclaren', 'dearcc',
'fengxingauto', 'singulato', 'nissan', 'saleen', 'ruichixinnengyuan', 'yulu', 'isuzu', 'zhinuo',
'alpina', 'renult', 'kawei', 'cadillac', 'hanteng', 'defu', 'subaru', 'huasong', 'casyc', 'geely',
'xpeng', 'jlkc', 'sj', 'nanqixinyatu1', 'horki', 'venucia', 'xinkaiauto', 'traum',
'shanghaihuizhong-45', 'zhidou', 'ww', 'riich', 'brillianceauto', 'galue', 'bugatti',
'guagnzhouyunbao', 'borgward', 'qzbd1', 'bj', 'changheauto', 'faw', 'saab', 'fuqiautomobile', 'skoda',
'citroen', 'mitsubishi', 'opel', 'qorosauto', 'zxauto', 'infiniti', 'mazda', 'arcfox-289',
'jinchengautomobile', 'kia', 'mini', 'tesla', 'gmc-109', 'chery', 'daoda-282', 'joylongautomobile',
'hafu-196', 'sgmw', 'wiesmann', 'acura', 'yunqueqiche', 'volvo', 'lynkco', 'karry', 'chtc', 'gq',
'redstar', 'everus', 'kangdi', 'chrysler', 'cf', 'maxus', 'smart', 'maserati', 'dayu', 'besturn',
'dadiqiche', 'ym', 'huakai', 'buick', 'faradayfuture', 'leapmotor', 'koenigsegg', 'bentley',
'rolls-royce', 'iveco', 'dongfeng-27', 'haige1', 'ds', 'landwind', 'volkswagen', 'sitech', 'toyota',
'polarsunautomobile', 'zhejiangkaersen', 'ladaa', 'lincoln', 'weilaiqiche', 'li', 'ferrari', 'jetour',
'honda', 'barbus', 'morgancars', 'ol', 'sceo', 'hama', 'dongfengfengguang', 'mg-79', 'ktm',
'changankuayue-283', 'suzuki', 'yudo', 'yusheng-258', 'fs', 'bydauto', 'jauger', 'foton', 'pagani',
'shangqisaibao', 'guangqihinomotors', 'polestar', 'fujianxinlongmaqichegufenyouxiangongsi',
'alfaromeo', 'shanqitongjia1', 'xingchi', 'lotus', 'hyundai', 'kaiyi', 'isuzu-132', 'bmw', 'ssangyong',
'astonmartin']
3、redis_url文件
from taoche.taoche.spiders.city import CITY_CODE, CAR_CODE_LIST
from redis import Redis
class Redis_url():
    """Small wrapper around the Redis list that feeds start URLs to the spiders."""

    def __init__(self):
        # Connect to the Redis server running on this (master) machine.
        self.re = Redis("localhost", 6379)

    def add(self, url, *more_urls):
        """Push one or more URLs onto the "taoche:start_urls" list.

        Args:
            url: First URL to push.
            *more_urls: Optional further URLs, sent in the same LPUSH command
                so a whole batch costs a single network round trip.
        """
        self.re.lpush("taoche:start_urls", url, *more_urls)

    def flushdb(self):
        """Delete every key in the currently selected Redis database."""
        self.re.flushdb()


rd = Redis_url()  # client used by the seeding loop below

# Clear what is left in Redis from a previous run before seeding; otherwise
# every run of this script appends the full URL set to the list again.
rd.flushdb()

# One start URL per (city, brand) pair, pushed one city at a time.
for city in CITY_CODE:
    urls = ["https://{}.taoche.com/{}/".format(city, car_code) for car_code in CAR_CODE_LIST]
    if urls:
        rd.add(*urls)
二、爬取数据的电脑配置
1、settings.py文件配置
添加如下代码
# Scheduler class: taken from scrapy_redis instead of Scrapy's own.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'

# Duplicate-request filter class, also from scrapy_redis.
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Redis server to connect to: address of the master host and its port.
REDIS_HOST = "10.10.21.13"
REDIS_PORT = 6379

# Queue class used by the scheduler.
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"
完整settings.py代码
# -*- coding: utf-8 -*-
# Scrapy settings for taoche project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = "taoche"

SPIDER_MODULES = ["taoche.spiders"]
NEWSPIDER_MODULE = "taoche.spiders"

# Scheduler class: taken from scrapy_redis instead of Scrapy's own.
SCHEDULER = 'scrapy_redis.scheduler.Scheduler'

# Duplicate-request filter class, also from scrapy_redis.
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'

# Redis server to connect to: address of the master host and its port.
REDIS_HOST = "10.10.21.13"
REDIS_PORT = 6379

# Queue class used by the scheduler.
SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.PriorityQueue"

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
# NOTE: this value identifies the bot; it is not a browser User-Agent string.
USER_AGENT = "taoche (+http://www.yourdomain.com)"

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Spider middlewares that are switched on, mapped to their order value.
SPIDER_MIDDLEWARES = {
    "taoche.middlewares.TaocheSpiderMiddleware": 543,
}
# 中间件
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'taoche.middlewares.TaocheDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Item pipelines that are switched on, mapped to their order value.
ITEM_PIPELINES = {
    "taoche.pipelines.TaochePipeline": 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
2、spiders文件夹下的taochec.py文件
# -*- coding: utf-8 -*-
import scrapy
from .city import *#从同级目录的city.py文件导入所有内容
from lxml import etree
from ..items import TaocheItem
from scrapy_redis.spiders import RedisSpider
import re
class TaochecSpider(RedisSpider):
    """Distributed taoche.com spider whose start URLs come from Redis.

    The spider defines no ``start_urls``; it is driven by the Redis key named
    in ``redis_key``. ``parse`` handles a listing page and ``parse_detail``
    completes each item from the car's detail page.
    """

    name = 'taochec'
    redis_key = "taoche:start_urls"

    @staticmethod
    def _first(nodes):
        """Return the first XPath match, or ``None`` when nothing matched."""
        return nodes[0] if nodes else None

    @staticmethod
    def _absolute(response, href):
        """Turn an href found on ``response`` into an absolute URL.

        Protocol-relative links (``//host/path``) are pinned to https so that
        listing and detail requests share one scheme; any other form is
        resolved against the URL of the page it was found on.
        """
        if href.startswith('//'):
            return 'https:' + href
        return response.urljoin(href)

    def parse(self, response):
        """Parse one listing page.

        Yields:
            A detail-page request for every complete listing entry (the
            partially filled item travels in ``meta['item']``), followed by a
            request for the next listing page when the page links to one.
        """
        tree = etree.HTML(response.body.decode('utf-8'))
        li_list = tree.xpath('//ul[@class="gongge_ul"]//li')
        self.logger.debug('%d listing entries on %s', len(li_list), response.url)
        if not li_list:
            return

        # The city name is looked up on the page, not on the entry, so it is
        # the same for every entry: fetch it once.
        city_name = self._first(tree.xpath('//div[@class="nav_statusMain"]//a[2]/text()'))

        for li_data in li_list:
            # Prefer the "onepaynor" node; fall back to "original" when absent.
            all_price = self._first(
                li_data.xpath('.//div[@class="price"]//i[@class="onepaynor"]/text()'))
            if all_price is None:
                all_price = self._first(
                    li_data.xpath('.//div[@class="price"]//i[@class="original"]/text()'))

            fields = {
                'title': self._first(
                    li_data.xpath('./div[@class="gongge_main"]//span/text()')),
                'reg_date': self._first(
                    li_data.xpath('./div[@class="gongge_main"]/p/i[1]/text()')),
                'mile': self._first(
                    li_data.xpath('./div[@class="gongge_main"]/p/i[2]/text()')),
                'city_name': city_name,
                'price': self._first(
                    li_data.xpath('.//div[@class="price"]//i[@class="Total brand_col"]/text()')),
                'all_price': all_price,
                'detail_url': self._first(
                    li_data.xpath('.//div[@class="item_img"]/a/@href')),
            }

            # Skip an incomplete entry instead of raising: an exception here
            # would also lose the remaining entries and the next-page request.
            missing = sorted(key for key, value in fields.items() if value is None)
            if missing:
                self.logger.warning('skipping entry on %s, missing fields: %s',
                                    response.url, ', '.join(missing))
                continue

            fields['detail_url'] = self._absolute(response, fields['detail_url'])
            item = TaocheItem(**fields)
            # NOTE(review): dont_filter=True makes these requests skip the
            # duplicate filter configured in settings - confirm this is intended.
            yield scrapy.Request(url=fields['detail_url'], callback=self.parse_detail,
                                 meta={'item': item}, dont_filter=True)

        next_href = self._first(tree.xpath('//a[@class="pages-next"]/@href'))
        if next_href:
            yield scrapy.Request(self._absolute(response, next_href), callback=self.parse,
                                 encoding='utf-8', dont_filter=True)

    def parse_detail(self, response):
        """Complete the item carried in ``response.meta['item']`` from a detail page.

        Yields:
            The finished item. Nothing is yielded (and a warning is logged)
            when the picture, displacement or listing-number node is missing.
        """
        item = response.meta["item"]
        self.logger.debug('detail page %s', response.url)
        tree = etree.HTML(response.body.decode('utf-8'))

        # Picture URL
        pic = self._first(tree.xpath(
            '//div[@class="taoche-details-xs-picbox"]//ul[@id="taoche-details-xs-pic"]//li[1]/img/@data-src'))
        # Displacement
        displace = self._first(tree.xpath('//div[@class="summary-attrs"]//dl[3]/dd/text()'))
        # Listing number text; only the part after the last ':' is kept
        source_id = self._first(tree.xpath('//span[@class="car-number"]/text()'))

        if pic is None or displace is None or source_id is None:
            self.logger.warning('incomplete detail page, item dropped: %s', response.url)
            return

        item["pic"] = pic
        item["displace"] = displace
        item["source_id"] = source_id.split(':')[-1]
        item["name"] = '天主极乐大帝'
        yield item
3、items.py文件
import scrapy
class TaocheItem(scrapy.Item):
    """One used-car record; fields are filled from the listing page and then the detail page."""

    # Fields taken from the listing page
    title = scrapy.Field()       # title
    reg_date = scrapy.Field()    # registration date
    mile = scrapy.Field()        # mileage
    city_name = scrapy.Field()   # city name
    price = scrapy.Field()       # discounted price
    all_price = scrapy.Field()   # full-payment price

    # Fields taken from the detail page
    detail_url = scrapy.Field()  # detail page URL
    pic = scrapy.Field()         # picture URL
    displace = scrapy.Field()    # displacement
    source_id = scrapy.Field()   # listing number
    name = scrapy.Field()        # fixed tag string the spider writes on every item
4、pipelines.py文件
import pymongo
class TaochePipeline(object):
    """Item pipeline that stores every scraped item as a MongoDB document."""

    def __init__(self, host='10.10.21.13', port=27017):
        """Connect to MongoDB and select the ``taoche`` database and collection.

        Args:
            host: MongoDB host; defaults to the address used by this setup.
            port: MongoDB port.
        """
        self.client = pymongo.MongoClient(host, port=port)
        self.db = self.client['taoche']
        self.collection = self.db['taoche']

    def process_item(self, item, spider):
        """Insert ``item`` into the collection and hand it on.

        Returns:
            The same item, so that any later pipeline still receives it.
        """
        # insert_one replaces Collection.insert, which PyMongo 4 removed.
        self.collection.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        """Close the MongoDB connection when the spider finishes."""
        self.client.close()
三、mongodb数据库主机配置
1.修改mongo.config文件
在该文件中添加如下代码
logappend=true
journal=true
quiet=true
port=27017