scrapy分布式爬虫爬取淘车网

最新推荐文章于 2024-05-27 08:39:47 发布

天主极乐大帝

最新推荐文章于 2024-05-27 08:39:47 发布

阅读量1.9k

点赞数

分类专栏：爬虫文章标签： scrapy分布式爬虫爬取淘车网

本文链接：https://blog.csdn.net/weixin_42218868/article/details/100578753

版权

爬虫专栏收录该内容

24 篇文章 0 订阅

订阅专栏

一、master主机配置
1、开启redis服务器
2、city.py#文件

# 城市编码
CITY_CODE = ['shijiazhuang', 'tangshan', 'qinhuangdao', 'handan', 'xingtai', 'baoding', 'zhangjiakou',
             'chengde', 'cangzhou', 'langfang', 'hengshui', 'taiyuan', 'datong', 'yangquan', 'changzhi', 'jincheng',
             'shuozhou', 'jinzhong', 'yuncheng', 'xinzhou', 'linfen', 'lvliang', 'huhehaote', 'baotou', 'wuhai',
             'chifeng', 'tongliao', 'eerduosi', 'hulunbeier', 'bayannaoer', 'wulanchabu', 'xinganmeng',
             'xilinguolemeng', 'alashanmeng', 'changchun', 'jilin', 'hangzhou', 'ningbo', 'wenzhou', 'jiaxing',
             'huzhou', 'shaoxing', 'jinhua', 'quzhou', 'zhoushan', 'tz', 'lishui', 'bozhou', 'chizhou', 'xuancheng',
             'nanchang', 'jingdezhen', 'pingxiang', 'jiujiang', 'xinyu', 'yingtan', 'ganzhou', 'jian', 'yichun', 'jxfz',
             'shangrao', 'xian', 'tongchuan', 'baoji', 'xianyang', 'weinan', 'yanan', 'hanzhong', 'yl', 'ankang',
             'shangluo', 'lanzhou', 'jiayuguan', 'jinchang', 'baiyin', 'tianshui', 'wuwei', 'zhangye', 'pingliang',
             'jiuquan', 'qingyang', 'dingxi', 'longnan', 'linxia', 'gannan', 'xining', 'haidongdiqu', 'haibei',
             'huangnan', 'hainanzangzuzizhizho', 'guoluo', 'yushu', 'haixi', 'yinchuan', 'shizuishan', 'wuzhong',
             'guyuan', 'zhongwei', 'wulumuqi', 'kelamayi', 'shihezi', 'tulufandiqu', 'hamidiqu', 'changji', 'boertala',
             'bazhou', 'akesudiqu', 'xinjiangkezhou', 'kashidiqu', 'hetiandiqu', 'yili', 'tachengdiqu', 'aletaidiqu',
             'xinjiangzhixiaxian', 'changsha', 'zhuzhou', 'xiangtan', 'hengyang', 'shaoyang', 'yueyang', 'changde',
             'zhangjiajie', 'yiyang', 'chenzhou', 'yongzhou', 'huaihua', 'loudi', 'xiangxi', 'guangzhou', 'shaoguan',
             'shenzhen', 'zhuhai', 'shantou', 'foshan', 'jiangmen', 'zhanjiang', 'maoming', 'zhaoqing', 'huizhou',
             'meizhou', 'shanwei', 'heyuan', 'yangjiang', 'qingyuan', 'dongguan', 'zhongshan', 'chaozhou', 'jieyang',
             'yunfu', 'nanning', 'liuzhou', 'guilin', 'wuzhou', 'beihai', 'fangchenggang', 'qinzhou', 'guigang',
             'yulin', 'baise', 'hezhou', 'hechi', 'laibin', 'chongzuo', 'haikou', 'sanya', 'sanshashi', 'qiongbeidiqu',
             'qiongnandiqu', 'hainanzhixiaxian', 'chengdu', 'zigong', 'panzhihua', 'luzhou', 'deyang', 'mianyang',
             'guangyuan', 'suining', 'neijiang', 'leshan', 'nanchong', 'meishan', 'yibin', 'guangan', 'dazhou', 'yaan',
             'bazhong', 'ziyang', 'aba', 'ganzi', 'liangshan', 'guiyang', 'liupanshui', 'zunyi', 'anshun',
             'tongrendiqu', 'qianxinan', 'bijiediqu', 'qiandongnan', 'qiannan', 'kunming', 'qujing', 'yuxi', 'baoshan',
             'zhaotong', 'lijiang', 'puer', 'lincang', 'chuxiong', 'honghe', 'wenshan', 'xishuangbanna', 'dali',
             'dehong', 'nujiang', 'diqing', 'siping', 'liaoyuan', 'tonghua', 'baishan', 'songyuan', 'baicheng',
             'yanbian', 'haerbin', 'qiqihaer', 'jixi', 'hegang', 'shuangyashan', 'daqing', 'yc', 'jiamusi', 'qitaihe',
             'mudanjiang', 'heihe', 'suihua', 'daxinganlingdiqu', 'shanghai', 'tianjin', 'chongqing', 'nanjing', 'wuxi',
             'xuzhou', 'changzhou', 'suzhou', 'nantong', 'lianyungang', 'huaian', 'yancheng', 'yangzhou', 'zhenjiang',
             'taizhou', 'suqian', 'lasa', 'changdudiqu', 'shannan', 'rikazediqu', 'naqudiqu', 'alidiqu', 'linzhidiqu',
             'hefei', 'wuhu', 'bengbu', 'huainan', 'maanshan', 'huaibei', 'tongling', 'anqing', 'huangshan', 'chuzhou',
             'fuyang', 'sz', 'chaohu', 'luan', 'fuzhou', 'xiamen', 'putian', 'sanming', 'quanzhou', 'zhangzhou',
             'nanping', 'longyan', 'ningde', 'jinan', 'qingdao', 'zibo', 'zaozhuang', 'dongying', 'yantai', 'weifang',
             'jining', 'taian', 'weihai', 'rizhao', 'laiwu', 'linyi', 'dezhou', 'liaocheng', 'binzhou', 'heze',
             'zhengzhou', 'kaifeng', 'luoyang', 'pingdingshan', 'jiyuan', 'anyang', 'hebi', 'xinxiang', 'jiaozuo',
             'puyang', 'xuchang', 'luohe', 'sanmenxia', 'nanyang', 'shangqiu', 'xinyang', 'zhoukou', 'zhumadian',
             'henanzhixiaxian', 'wuhan', 'huangshi', 'shiyan', 'yichang', 'xiangfan', 'ezhou', 'jingmen', 'xiaogan',
             'jingzhou', 'huanggang', 'xianning', 'qianjiang', 'suizhou', 'xiantao', 'tianmen', 'enshi',
             'hubeizhixiaxian', 'beijing', 'shenyang', 'dalian', 'anshan', 'fushun', 'benxi', 'dandong', 'jinzhou',
             'yingkou', 'fuxin', 'liaoyang', 'panjin', 'tieling', 'chaoyang', 'huludao', 'anhui', 'fujian', 'gansu',
             'guangdong', 'guangxi', 'guizhou', 'hainan', 'hebei', 'henan', 'heilongjiang', 'hubei', 'hunan', 'jl',
             'jiangsu', 'jiangxi', 'liaoning', 'neimenggu', 'ningxia', 'qinghai', 'shandong', 'shanxi', 'shaanxi',
             'sichuan', 'xizang', 'xinjiang', 'yunnan', 'zhejiang', 'jjj', 'jzh', 'zsj', 'csj', 'ygc']

# 品牌类型列表
CAR_CODE_LIST = ['southeastautomobile', 'sma', 'audi', 'hummer', 'tianqimeiya', 'seat', 'lamborghini', 'weltmeister',
                 'changanqingxingche-281', 'chevrolet', 'fiat', 'foday', 'eurise', 'dongfengfengdu', 'lotus-146', 'jac',
                 'enranger', 'bjqc', 'luxgen', 'jinbei', 'sgautomotive', 'jonwayautomobile', 'beijingjeep', 'linktour',
                 'landrover', 'denza', 'jeep', 'rely', 'gacne', 'porsche', 'wey', 'shenbao', 'bisuqiche-263',
                 'beiqihuansu', 'sinogold', 'roewe', 'maybach', 'greatwall', 'chenggongqiche', 'zotyeauto', 'kaersen',
                 'gonow', 'dodge', 'siwei', 'ora', 'lifanmotors', 'cajc', 'hafeiautomobile', 'sol', 'beiqixinnengyuan',
                 'dorcen', 'lexus', 'mercedesbenz', 'ford', 'huataiautomobile', 'jmc', 'peugeot', 'kinglongmotor',
                 'oushang', 'dongfengxiaokang-205', 'chautotechnology', 'faw-hongqi', 'mclaren', 'dearcc',
                 'fengxingauto', 'singulato', 'nissan', 'saleen', 'ruichixinnengyuan', 'yulu', 'isuzu', 'zhinuo',
                 'alpina', 'renult', 'kawei', 'cadillac', 'hanteng', 'defu', 'subaru', 'huasong', 'casyc', 'geely',
                 'xpeng', 'jlkc', 'sj', 'nanqixinyatu1', 'horki', 'venucia', 'xinkaiauto', 'traum',
                 'shanghaihuizhong-45', 'zhidou', 'ww', 'riich', 'brillianceauto', 'galue', 'bugatti',
                 'guagnzhouyunbao', 'borgward', 'qzbd1', 'bj', 'changheauto', 'faw', 'saab', 'fuqiautomobile', 'skoda',
                 'citroen', 'mitsubishi', 'opel', 'qorosauto', 'zxauto', 'infiniti', 'mazda', 'arcfox-289',
                 'jinchengautomobile', 'kia', 'mini', 'tesla', 'gmc-109', 'chery', 'daoda-282', 'joylongautomobile',
                 'hafu-196', 'sgmw', 'wiesmann', 'acura', 'yunqueqiche', 'volvo', 'lynkco', 'karry', 'chtc', 'gq',
                 'redstar', 'everus', 'kangdi', 'chrysler', 'cf', 'maxus', 'smart', 'maserati', 'dayu', 'besturn',
                 'dadiqiche', 'ym', 'huakai', 'buick', 'faradayfuture', 'leapmotor', 'koenigsegg', 'bentley',
                 'rolls-royce', 'iveco', 'dongfeng-27', 'haige1', 'ds', 'landwind', 'volkswagen', 'sitech', 'toyota',
                 'polarsunautomobile', 'zhejiangkaersen', 'ladaa', 'lincoln', 'weilaiqiche', 'li', 'ferrari', 'jetour',
                 'honda', 'barbus', 'morgancars', 'ol', 'sceo', 'hama', 'dongfengfengguang', 'mg-79', 'ktm',
                 'changankuayue-283', 'suzuki', 'yudo', 'yusheng-258', 'fs', 'bydauto', 'jauger', 'foton', 'pagani',
                 'shangqisaibao', 'guangqihinomotors', 'polestar', 'fujianxinlongmaqichegufenyouxiangongsi',
                 'alfaromeo', 'shanqitongjia1', 'xingchi', 'lotus', 'hyundai', 'kaiyi', 'isuzu-132', 'bmw', 'ssangyong',
                 'astonmartin']

3、redis_url文件

from taoche.taoche.spiders.city import CITY_CODE, CAR_CODE_LIST
from  redis  import  Redis
class  Redis_url():
    def  __init__(self):
        #连接客户端
        self.re=Redis("localhost",6379)
    def  add(self,url):
        #将url,利用lpush方法，添加到"taoche:start_urls"中
        self.re.lpush("taoche:start_urls",url)
    def  flushdb(self):
        pass
rd=Redis_url()#实例化对象
#先将redis中的request全部清空
for  city  in CITY_CODE:
    for car_code in CAR_CODE_LIST:
        rd.add("https://{}.taoche.com/{}/".format(city,car_code))

二、爬取数据的电脑配置
1、settings.py文件配置
添加如下代码

#调度器
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#去重
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#redis服务器地址
REDIS_HOST = '10.10.21.13'#要连接的master主机地址
#redis端口号
REDIS_PORT = 6379
##开启队列
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'

完整settings.py代码

# -*- coding: utf-8 -*-

# Scrapy settings for taoche project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'taoche'

SPIDER_MODULES = ['taoche.spiders']
NEWSPIDER_MODULE = 'taoche.spiders'
#调度器
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#去重
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#redis服务器地址
REDIS_HOST = '10.10.21.13'#要连接的master主机地址
#redis端口号
REDIS_PORT = 6379
##开启队列
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.PriorityQueue'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'taoche (+http://www.yourdomain.com)'#伪装浏览器

# Obey robots.txt rules
ROBOTSTXT_OBEY = False#非测试模式

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
   'taoche.middlewares.TaocheSpiderMiddleware': 543,
}
# 中间件
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'taoche.middlewares.TaocheDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
   'taoche.pipelines.TaochePipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

2、spiders文件夹下的taochec.py文件

# -*- coding: utf-8 -*-
import scrapy
from  .city import  *#从同级目录的city.py文件导入所有内容
from  lxml  import  etree
from   ..items import TaocheItem
from scrapy_redis.spiders   import RedisSpider
import  re

class TaochecSpider(RedisSpider):#redis分布式爬虫
    name = 'taochec'
    redis_key = "taoche:start_urls"

# class TaochecSpider(scrapy.Spider):
#     name = 'taochec'
#     allowed_domains = ['taoche.com']
#     start_urls = []
#     for  city  in  CITY_CODE[:3]:
#         for pinpai  in  CAR_CODE_LIST[:3]:
#             url=f'https://{city}.taoche.com/{pinpai}/'
#             start_urls.append(url)
#             print(url)
    def parse(self, response):
        tree = etree.HTML(response.body.decode('utf-8'))
        # 获取li列表页信息
        li_list = tree.xpath('//ul[@class="gongge_ul"]//li')
        print(len(li_list))
        if len(li_list) == 0:
            pass
        else:
            for li_data in li_list:
                item = TaocheItem()
                # 获取标题
                title = li_data.xpath('./div[@class="gongge_main"]//span/text()')[0]
                reg_date = li_data.xpath('./div[@class="gongge_main"]/p/i[1]/text()')[0]
                mile = li_data.xpath('./div[@class="gongge_main"]/p/i[2]/text()')[0]
                city_name = tree.xpath('//div[@class="nav_statusMain"]//a[2]/text()')[0]
                price = li_data.xpath('.//div[@class="price"]//i[@class="Total brand_col"]/text()')[0]
                try:
                    all_price = li_data.xpath('.//div[@class="price"]//i[@class="onepaynor"]/text()')[0]
                except:
                    all_price = li_data.xpath('.//div[@class="price"]//i[@class="original"]/text()')[0]

                # 获取详情页的url
                base_url = li_data.xpath('.//div[@class="item_img"]/a/@href')[0]
                # 拼接url
                detail_url = 'https:' + base_url
                item['title'] = title
                item['reg_date'] = reg_date
                item['mile'] = mile
                item['city_name'] = city_name
                item['price'] = price
                item['all_price'] = all_price
                item['detail_url'] = detail_url
                yield scrapy.Request(url=detail_url, callback=self.parse_detail, meta={'item': item}, dont_filter=True)
            page_next = tree.xpath('//a[@class="pages-next"]')
            if page_next:
                next_url = tree.xpath('//a[@class="pages-next"]/@href')[0]
                next_url = 'http:' + next_url
                yield scrapy.Request(next_url, callback=self.parse, encoding='utf-8', dont_filter=True)
    def  parse_detail(self,response):
        item = response.meta["item"]
        print(response.url)
        response = response.body.decode('utf-8')
        tree = etree.HTML(response)
        # 图片
        pic =tree.xpath('//div[@class="taoche-details-xs-picbox"]//ul[@id="taoche-details-xs-pic"]//li[1]/img/@data-src')[0]
        # 排量
        displace = tree.xpath('//div[@class="summary-attrs"]//dl[3]/dd/text()')[0]
        # 车源号
        source_id = tree.xpath('//span[@class="car-number"]/text()')[0]
        source_id = source_id.split('：')[-1]
        item["pic"] = pic
        item["displace"] = displace
        item["source_id"] = source_id
        item["name"] = '天主极乐大帝'
        yield item

3、items.py文件

import scrapy


class TaocheItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()  # 标题
    reg_date = scrapy.Field()  # 上牌日期
    mile = scrapy.Field()  # 公里数
    city_name = scrapy.Field()  # 城市名称
    price = scrapy.Field()  # 优惠价格
    all_price = scrapy.Field()  # 全款价格
    # 详情页
    detail_url =scrapy.Field()  # 详情url
    pic = scrapy.Field()  # 图片
    displace = scrapy.Field()  # 排量
    source_id = scrapy.Field()  # 车源号
    name= scrapy.Field()  # 车源号

4、pipelines.py文件

import pymongo
class  TaochePipeline(object):
    def  __init__(self):
        self.client=pymongo.MongoClient('10.10.21.13',port=27017)#连接mongo数据库，建立客户端对象
        self.db=self.client['taoche']#连接数据库
        self.collection=self.db['taoche']
    def  process_item(self,item,spider):
        self.collection.insert(dict(item))

三、mongodb数据库主机配置
1.修改mongo.config文件
在这里插入图片描述

在该文件中添加如下代码

logappend=true
journal=true
quiet=true
port=27017

天主极乐大帝

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
scrapy分布式爬虫爬取淘车网

1、master主机city.py#文件redis_url文件from taoche.taoche.spiders.city import CITY_CODE, CAR_CODE_LISTfrom redis import Redisclass Redis_url(): def __init__(self): #连接客户端 self.re...
复制链接

扫一扫

专栏目录