scrapy爬取双色球数据

用scrapy 爬取双色球数据

本案例使用 scrapy + selenium,爬取这个网站表格中各期号的双色球数据。

用到的包

  • scrapy
  • time
  • selenium

开始

可以直接在pycharm中打开终端
在这里插入图片描述
然后在终端中按顺序逐条输入下面的命令,每输入一条就按一次回车

scrapy startproject caipiao  
cd .\caipiao\
scrapy genspider ssq  https://datachart.500.com/ssq/?expect=100 

如果你按部就班,那么你的文件栏目会出现一个新的文件夹caipiao,它的详细文件结构是这样的
在这里插入图片描述

之后你只需复制粘贴下面的 spider 等文件就 OK。注意:需要把代码中的路径改成你自己的 chromedriver.exe 的文件路径。

ssq.py

import time
import scrapy
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import Chrome
from scrapy.http import HtmlResponse 

service = Service('E:/converse_spider/converse_pyspider/06scrapy/chromedriver.exe') # Replace this with the path to your own chromedriver.exe






class SsqSpider(scrapy.Spider):
    """Scrape per-issue Shuangseqiu rows from the 500.com trend-chart page.

    The page is rendered once with a Selenium-driven Chrome while this class
    body executes. The rendered HTML is wrapped in an ``HtmlResponse`` stored
    on the class as ``myresponse`` so that ``parse`` can query it with Scrapy
    selectors. The response Scrapy itself downloads for ``start_urls`` only
    serves to trigger ``parse``.
    """

    # Create the browser object.
    browser = Chrome(service=service)
    name = "ssq"
    # allowed_domains = ["www.xxx.com"]
    start_urls = ["https://datachart.500.com/ssq/?expect=100"]
    model_urls = ["https://datachart.500.com/ssq/?expect=100"]
    print(start_urls[0])
    try:
        browser.get(url=start_urls[0])
        # Fixed wait; presumably gives the page's dynamically loaded data
        # time to render before the source is captured.
        time.sleep(8)
        page_text = browser.page_source
    finally:
        # Always shut the browser down: once the page source is captured it
        # is no longer needed, and without quit() the Chrome and chromedriver
        # processes stay alive after the crawl (or after a failed page load).
        browser.quit()
    myresponse = HtmlResponse(url=model_urls[0], encoding='utf-8', body=page_text)

    print(myresponse)

    def parse(self, response):
        """Yield one dict per data row of the pre-rendered table.

        Args:
            response: The response Scrapy downloaded for ``start_urls``. It is
                ignored; the Selenium-rendered ``myresponse`` is parsed instead.

        Yields:
            dict: ``qihao`` is the text of the row's first cell, ``red_ball``
            and ``blue_ball`` are lists of the matching cells' texts.
        """
        rows = self.myresponse.xpath('//*[@id="tdata"]/tr')
        for tr in rows:
            # Rows whose class is exactly "tdbck" are skipped
            # (presumably separator rows without draw data - TODO confirm).
            if tr.xpath("./@class").extract_first() == "tdbck":
                continue
            date = tr.xpath('./td[1]/text()').extract_first()
            # NOTE(review): @class="..." is an exact string match, so cells
            # carrying additional classes are not selected by the red-ball
            # query - confirm both selectors against the live page markup.
            red_ball = tr.xpath('./td[@class="chartBall01"]/text()').extract()
            blue_ball = tr.xpath('./td[@class="chartBall01 chartBall07"]/text()').extract()

            yield {
                'qihao': date,
                'red_ball': red_ball,
                'blue_ball': blue_ball,
            }

pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class CaipiaoPipeline:
    """Item pipeline that echoes every scraped item to standard output."""

    def process_item(self, item, spider):
        """Print ``item`` and hand the same object back unchanged.

        Args:
            item: The scraped item passed to this pipeline.
            spider: The spider that produced the item (unused here).

        Returns:
            The very same ``item`` object that was passed in.
        """
        print(item)
        return item

settings.py

# Scrapy settings for caipiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project/bot.
BOT_NAME = "caipiao"

# Package Scrapy searches for spider classes, and the package in which
# `scrapy genspider` creates new spiders.
SPIDER_MODULES = ["caipiao.spiders"]
NEWSPIDER_MODULE = "caipiao.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "caipiao (+http://www.yourdomain.com)"

# Obey robots.txt rules
# NOTE: set to False in this project, i.e. robots.txt is NOT honoured.
ROBOTSTXT_OBEY = False
# Log only WARNING and above (presumably so the items printed by the
# pipeline are not buried in INFO/DEBUG log lines).
LOG_LEVEL = 'WARNING'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
#    "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    "caipiao.middlewares.CaipiaoSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#    "caipiao.middlewares.CaipiaoDownloaderMiddleware": 543,
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# Maps each enabled pipeline's import path to its order value; only
# CaipiaoPipeline is enabled, with order 300.
ITEM_PIPELINES = {
   "caipiao.pipelines.CaipiaoPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
# Selects the asyncio-based Twisted reactor and UTF-8 encoding for feed exports.
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"

输入命令运行

在这里插入图片描述

看不清就看这个
scrapy crawl ssq

等待其运行完毕后输出结果

{'qihao': '24074 ', 'red_ball': ['7', '8', '10', '24', '32'], 'blue_ball': ['22']}
{'qihao': '24075 ', 'red_ball': ['3', '5', '8', '18', '28'], 'blue_ball': ['22']}
{'qihao': '24076 ', 'red_ball': ['24', '27', '29', '32'], 'blue_ball': ['3', '22']}
{'qihao': '24077 ', 'red_ball': ['1', '6', '14', '17'], 'blue_ball': ['4', '22']}
{'qihao': '24078 ', 'red_ball': ['9', '14', '21', '26'], 'blue_ball': ['5', '22']}
{'qihao': '24079 ', 'red_ball': ['2', '3', '7', '16', '26'], 'blue_ball': []}
{'qihao': '24080 ', 'red_ball': ['12', '27', '29', '30'], 'blue_ball': ['6', '11']}
{'qihao': '24081 ', 'red_ball': ['17', '23', '25'], 'blue_ball': ['1', '6', '12']}
{'qihao': '24082 ', 'red_ball': ['1', '16', '17'], 'blue_ball': ['2', '13', '29']}
{'qihao': '24083 ', 'red_ball': ['9', '32', '33'], 'blue_ball': ['3', '14', '29']}
{'qihao': '24084 ', 'red_ball': ['1', '8', '10', '13', '19'], 'blue_ball': ['29']}
{'qihao': '24085 ', 'red_ball': ['1', '5', '15', '23', '27'], 'blue_ball': ['21']}
{'qihao': '24086 ', 'red_ball': ['17', '23', '25', '31'], 'blue_ball': ['19', '20']}
{'qihao': '24087 ', 'red_ball': ['9', '26'], 'blue_ball': ['3', '20']}
{'qihao': '24088 ', 'red_ball': ['8', '10', '32'], 'blue_ball': ['3', '17', '30']}
{'qihao': '24089 ', 'red_ball': ['6', '9', '15'], 'blue_ball': ['3', '18', '31']}
{'qihao': '24090 ', 'red_ball': ['7', '11', '24', '31'], 'blue_ball': ['17', '32']}
{'qihao': '24091 ', 'red_ball': ['15', '19', '20', '24'], 'blue_ball': ['13', '16']}
{'qihao': '24092 ', 'red_ball': ['3', '18', '29'], 'blue_ball': ['8', '11', '12']}
{'qihao': '24093 ', 'red_ball': ['1', '5', '15'], 'blue_ball': ['7', '11', '12']}
{'qihao': '24094 ', 'red_ball': ['17', '22', '25', '27'], 'blue_ball': ['6', '13']}
{'qihao': '24095 ', 'red_ball': ['6', '9', '16', '21'], 'blue_ball': ['4', '14']}
{'qihao': '24096 ', 'red_ball': ['2', '14', '18', '23', '27'], 'blue_ball': ['4']}
{'qihao': '24097 ', 'red_ball': ['10', '12', '13', '18', '30'], 'blue_ball': ['4']}
{'qihao': '24098 ', 'red_ball': ['3', '8', '10', '20', '30', '31'], 'blue_ball': []}
{'qihao': '24099 ', 'red_ball': ['4', '12', '17', '24', '26', '27'], 'blue_ball': []}
{'qihao': '24100 ', 'red_ball': ['13', '14', '19', '27', '30'], 'blue_ball': ['20']}
{'qihao': '24101 ', 'red_ball': ['8', '12', '15', '17', '30'], 'blue_ball': ['19']}
{'qihao': '24102 ', 'red_ball': ['9', '15', '21', '25'], 'blue_ball': ['18', '22']}
{'qihao': '24103 ', 'red_ball': ['12', '21', '27', '32', '33'], 'blue_ball': ['23']}
{'qihao': '24104 ', 'red_ball': ['5', '16', '23', '26', '29'], 'blue_ball': ['24']}
{'qihao': '24105 ', 'red_ball': ['2', '5', '17', '19', '29', '33'], 'blue_ball': []}
{'qihao': '24106 ', 'red_ball': ['3', '11', '22', '31', '33'], 'blue_ball': ['8']}
{'qihao': '24107 ', 'red_ball': ['1', '6', '13', '17', '19'], 'blue_ball': ['8']}
{'qihao': '24108 ', 'red_ball': ['1', '9', '24', '30'], 'blue_ball': ['8', '23']}
{'qihao': '24109 ', 'red_ball': ['3', '28', '29', '33'], 'blue_ball': ['4']}
{'qihao': '24110 ', 'red_ball': ['17', '33'], 'blue_ball': ['4', '13', '23', '25']}
{'qihao': '24111 ', 'red_ball': ['1', '11', '30'], 'blue_ball': ['4', '12', '22']}
{'qihao': '24112 ', 'red_ball': ['8', '16', '29', '32'], 'blue_ball': ['11', '25']}
{'qihao': '24113 ', 'red_ball': ['4', '5', '23', '24', '31'], 'blue_ball': ['26']}
{'qihao': '24114 ', 'red_ball': ['7', '11', '24', '32'], 'blue_ball': ['18', '27']}
{'qihao': '24115 ', 'red_ball': ['3', '10', '11', '27'], 'blue_ball': ['19', '28']}
{'qihao': '24116 ', 'red_ball': ['1', '15', '22', '31', '32'], 'blue_ball': ['20']}
{'qihao': '24117 ', 'red_ball': ['3', '12', '14', '16', '29', '32'], 'blue_ball': []}
{'qihao': '24118 ', 'red_ball': ['6', '11', '15'], 'blue_ball': ['2', '4']}
{'qihao': '24119 ', 'red_ball': ['9', '26', '27', '31', '32'], 'blue_ball': ['2']}
{'qihao': '24120 ', 'red_ball': ['5', '18', '19'], 'blue_ball': ['1', '7', '11']}
{'qihao': '24121 ', 'red_ball': ['1', '13', '27', '33'], 'blue_ball': ['7', '10']}
{'qihao': '24122 ', 'red_ball': ['5', '17', '29'], 'blue_ball': ['7', '9', '16']}
{'qihao': '24123 ', 'red_ball': ['2', '22', '26', '33'], 'blue_ball': ['15', '30']}
{'qihao': '24124 ', 'red_ball': ['2', '15', '17', '25'], 'blue_ball': ['14', '30']}
{'qihao': '24125 ', 'red_ball': ['1', '4', '18', '26'], 'blue_ball': ['13', '30']}
{'qihao': '24126 ', 'red_ball': ['14', '18', '23', '24', '26', '33'], 'blue_ball': []}
{'qihao': '24127 ', 'red_ball': ['2', '5', '20', '27', '32'], 'blue_ball': ['13']}
{'qihao': '24128 ', 'red_ball': ['1', '8', '18', '20', '26'], 'blue_ball': ['13']}
{'qihao': '24129 ', 'red_ball': ['9', '10', '19', '24', '32'], 'blue_ball': []}
{'qihao': '24130 ', 'red_ball': ['1', '8', '17', '19', '24'], 'blue_ball': ['12']}
{'qihao': '24131 ', 'red_ball': ['4', '5', '15', '20', '32'], 'blue_ball': ['11']}
{'qihao': '24132 ', 'red_ball': ['1', '4', '25', '27', '28', '33'], 'blue_ball': []}
{'qihao': '24133 ', 'red_ball': ['1', '11', '27', '30', '33'], 'blue_ball': ['15']}
{'qihao': '24134 ', 'red_ball': ['2', '4', '13', '18', '20'], 'blue_ball': ['16']}
{'qihao': '24135 ', 'red_ball': ['5', '11', '18', '30', '31'], 'blue_ball': ['17']}
{'qihao': '24136 ', 'red_ball': ['3', '11', '15', '21'], 'blue_ball': ['25', '26']}
{'qihao': '24137 ', 'red_ball': ['4', '9', '10', '19'], 'blue_ball': ['26', '27']}
{'qihao': '24138 ', 'red_ball': ['2', '7', '11', '21'], 'blue_ball': ['27', '28']}
{'qihao': '24139 ', 'red_ball': ['15', '16', '20', '22', '23'], 'blue_ball': ['29']}
{'qihao': '24140 ', 'red_ball': ['4', '7', '17', '22', '26'], 'blue_ball': ['8']}
{'qihao': '24141 ', 'red_ball': ['1', '2', '15', '24', '29'], 'blue_ball': ['7']}
{'qihao': '24142 ', 'red_ball': ['4', '13', '21', '22', '25'], 'blue_ball': ['6']}
{'qihao': '24143 ', 'red_ball': ['2', '11', '22', '33'], 'blue_ball': ['5', '30']}
{'qihao': '24144 ', 'red_ball': ['9', '11', '17', '20'], 'blue_ball': ['2', '30']}
{'qihao': '24145 ', 'red_ball': ['1', '16', '22'], 'blue_ball': ['3', '23', '30']}
{'qihao': '24146 ', 'red_ball': ['2', '11', '27', '32'], 'blue_ball': ['4', '22']}
{'qihao': '24147 ', 'red_ball': ['11', '13', '30'], 'blue_ball': ['5', '16', '21']}
{'qihao': '24148 ', 'red_ball': ['2', '15', '21', '23'], 'blue_ball': ['6', '16']}
{'qihao': '24149 ', 'red_ball': ['3', '9', '22'], 'blue_ball': ['17', '18']}
{'qihao': '24150 ', 'red_ball': ['13', '14', '22', '26', '32'], 'blue_ball': ['20']}
{'qihao': '24151 ', 'red_ball': ['5', '10', '16', '29', '32'], 'blue_ball': ['19']}
{'qihao': '25001 ', 'red_ball': ['2', '3', '17', '22', '33'], 'blue_ball': ['18']}
{'qihao': '25002 ', 'red_ball': ['9', '12', '13', '15', '22', '26'], 'blue_ball': []}
{'qihao': '25003 ', 'red_ball': ['10', '19', '20', '29'], 'blue_ball': ['26', '28']}
{'qihao': '25004 ', 'red_ball': ['3', '7', '17', '32'], 'blue_ball': ['27', '29']}
{'qihao': '25005 ', 'red_ball': ['10', '27'], 'blue_ball': ['16', '19', '28', '30']}
{'qihao': '25006 ', 'red_ball': ['1', '8', '22'], 'blue_ball': ['7', '17', '20']}
{'qihao': '25007 ', 'red_ball': ['7', '14', '27'], 'blue_ball': ['8', '18', '21']}
{'qihao': '25008 ', 'red_ball': ['14', '16', '17', '25', '33'], 'blue_ball': ['9']}
{'qihao': '25009 ', 'red_ball': ['2', '4', '11', '12', '23', '25'], 'blue_ball': []}
{'qihao': '25010 ', 'red_ball': ['4', '6', '7', '17', '21'], 'blue_ball': ['16']}
{'qihao': '25011 ', 'red_ball': ['6', '13', '22', '24', '29'], 'blue_ball': ['17']}
{'qihao': '25012 ', 'red_ball': ['7', '11', '13', '27', '31'], 'blue_ball': ['18']}
{'qihao': '25013 ', 'red_ball': ['4', '14', '16', '23', '24', '30'], 'blue_ball': []}
{'qihao': '25014 ', 'red_ball': ['6', '7', '9', '13', '21', '27'], 'blue_ball': []}
{'qihao': '25015 ', 'red_ball': ['4', '11', '15', '24', '25', '33'], 'blue_ball': []}
{'qihao': '25016 ', 'red_ball': ['2', '3', '12', '16', '20'], 'blue_ball': ['29']}
{'qihao': '25017 ', 'red_ball': ['4', '12', '15', '28'], 'blue_ball': ['18', '29']}
{'qihao': '25018 ', 'red_ball': ['3', '9', '21', '28'], 'blue_ball': ['29']}
{'qihao': '25019 ', 'red_ball': ['5', '13', '22', '23'], 'blue_ball': ['17', '18']}
{'qihao': '25020 ', 'red_ball': ['4', '11', '13', '26', '30'], 'blue_ball': ['16']}
{'qihao': '25021 ', 'red_ball': ['7', '8', '16', '18', '25', '32'], 'blue_ball': []}
{'qihao': '25022 ', 'red_ball': ['11', '17', '18', '20', '28', '30'], 'blue_ball': []}

写得不算完美。因为这个网站网页中双色球蓝色球的数据是动态加载的,所以用到了 selenium 来获取网页数据,并用 HtmlResponse 对数据进行打包,将它转换为 scrapy 中能够使用的 response 类型,而不是字符串类型——这是重点!!!

希望本文对你学习scrapy有所帮助

请添加图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值