用scrapy 爬取双色球数据
本案例用 scrapy + selenium 对 https://datachart.500.com/ssq/?expect=100 这个网站表格中各期号的双色球数据进行爬取
用到的包
- scrapy
- time
- selenium
开始
可以直接在pycharm中打开终端
然后在终端中按顺序逐条输入下面的命令,每输入一条就按一次回车
scrapy startproject caipiao
cd .\caipiao\
scrapy genspider ssq https://datachart.500.com/ssq/?expect=100
如果你按部就班,那么你的文件栏目会出现一个新的文件夹caipiao,它的详细文件结构是这样的
之后你只需复制粘贴下面的 spider 等文件就 OK 了。注意:需要把 ssq.py 中的路径改成你自己的 chromedriver.exe 的文件路径
ssq.py
import time
import scrapy
from selenium.webdriver.chrome.service import Service
from selenium.webdriver import Chrome
from scrapy.http import HtmlResponse
# chromedriver service object handed to Chrome(service=...) by the spider below.
service = Service('E:/converse_spider/converse_pyspider/06scrapy/chromedriver.exe') # Use the path to your own chromedriver.exe here
class SsqSpider(scrapy.Spider):
    """Scrape Shuangseqiu (double-colour ball) draw rows from datachart.500.com.

    Scrapy requests ``start_urls`` as usual; ``parse`` then loads the same URL
    in Selenium-driven Chrome and reads the rows of the ``#tdata`` table from
    the browser's page source.  Every data row is yielded as a dict with the
    keys ``qihao`` (issue number), ``red_ball`` and ``blue_ball``.

    Chrome is started lazily inside ``parse`` and always shut down afterwards,
    so merely importing this module no longer opens a browser or sleeps.
    """

    name = "ssq"
    # allowed_domains = ["www.xxx.com"]
    start_urls = ["https://datachart.500.com/ssq/?expect=100"]
    model_urls = ["https://datachart.500.com/ssq/?expect=100"]

    # Seconds to pause after navigation before reading the page source.
    render_wait = 8

    # Class tokens that mark a drawn number's cell.  Cells may carry extra
    # tokens (e.g. "chartBall01 chartBall07"), so tokens are matched
    # individually rather than comparing the whole attribute.
    red_ball_class = "chartBall01"
    # NOTE(review): "chartBall02" is assumed to be the blue-ball class; the
    # previous "chartBall01 chartBall07" selector returned numbers above 16,
    # i.e. red balls.  Confirm against the live page markup.
    blue_ball_class = "chartBall02"

    @staticmethod
    def _ball_xpath(css_class):
        """Build a row-relative XPath for the text of cells having ``css_class``."""
        return (
            './td[contains(concat(" ", normalize-space(@class), " "), '
            f'" {css_class} ")]/text()'
        )

    def _render(self, url):
        """Load ``url`` in Chrome and return the page source as an HtmlResponse."""
        self.logger.debug("Rendering %s with Chrome", url)
        browser = Chrome(service=service)
        try:
            browser.get(url=url)
            # Fixed pause kept from the original script; it blocks the crawl
            # for ``render_wait`` seconds per rendered page.
            time.sleep(self.render_wait)
            page_text = browser.page_source
        finally:
            # Always release Chrome and chromedriver, even when loading fails.
            browser.quit()
        return HtmlResponse(url=url, encoding='utf-8', body=page_text)

    def parse(self, response):
        """Yield one dict per draw row found in the browser-rendered page.

        Rows whose ``class`` attribute is exactly ``tdbck`` are skipped.
        ``qihao`` is whitespace-stripped, or ``None`` when the first cell has
        no text; ``red_ball`` and ``blue_ball`` are lists of strings.
        """
        rendered = self._render(response.url)
        red_xpath = self._ball_xpath(self.red_ball_class)
        blue_xpath = self._ball_xpath(self.blue_ball_class)

        for tr in rendered.xpath('//*[@id="tdata"]/tr'):
            if tr.xpath("./@class").extract_first() == "tdbck":
                continue
            qihao = tr.xpath('./td[1]/text()').extract_first()
            yield {
                'qihao': qihao.strip() if qihao is not None else None,
                'red_ball': tr.xpath(red_xpath).extract(),
                'blue_ball': tr.xpath(blue_xpath).extract(),
            }
pipelines.py
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
class CaipiaoPipeline:
    """Item pipeline that echoes every scraped item to standard output."""

    def process_item(self, item, spider):
        """Print ``item`` and pass the same object on unchanged.

        ``spider`` belongs to the pipeline call signature but is not used here.
        """
        print(item)
        return item
settings.py
# Scrapy settings for caipiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
# Project name, and the package that holds this project's spiders (also used
# as the target package for newly generated spiders).
BOT_NAME = "caipiao"
NEWSPIDER_MODULE = "caipiao.spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "caipiao (+http://www.yourdomain.com)"
# robots.txt rules are NOT obeyed in this project (the flag is False), and
# only log records of level WARNING or higher are emitted.
LOG_LEVEL = "WARNING"
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "caipiao.middlewares.CaipiaoSpiderMiddleware": 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# "caipiao.middlewares.CaipiaoDownloaderMiddleware": 543,
# }
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}
# Enabled item pipelines: import path -> order value.
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {"caipiao.pipelines.CaipiaoPipeline": 300}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
# Pin explicit, future-proof values for settings whose default value is
# deprecated: the feed export encoding and the Twisted reactor class.
FEED_EXPORT_ENCODING = "utf-8"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
输入命令运行
如果上面的内容看不清,可以直接复制下面这条命令
scrapy crawl ssq
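等待其运行完毕后输出结果。下面是用精确匹配 class 为 `chartBall01` 和 `chartBall01 chartBall07` 的两个 XPath 抓取时得到的示例输出。注意:双色球每期应有 6 个红球(1–33)和 1 个蓝球(1–16),而示例的 blue_ball 中出现了 22、29、30 等大于 16 的号码,且每行合计都不足 7 个号码,说明 `chartBall01 chartBall07` 匹配到的很可能是带附加样式的红球而不是蓝球,使用这些数据前请先核对选择器。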
{'qihao': '24074 ', 'red_ball': ['7', '8', '10', '24', '32'], 'blue_ball': ['22']}
{'qihao': '24075 ', 'red_ball': ['3', '5', '8', '18', '28'], 'blue_ball': ['22']}
{'qihao': '24076 ', 'red_ball': ['24', '27', '29', '32'], 'blue_ball': ['3', '22']}
{'qihao': '24077 ', 'red_ball': ['1', '6', '14', '17'], 'blue_ball': ['4', '22']}
{'qihao': '24078 ', 'red_ball': ['9', '14', '21', '26'], 'blue_ball': ['5', '22']}
{'qihao': '24079 ', 'red_ball': ['2', '3', '7', '16', '26'], 'blue_ball': []}
{'qihao': '24080 ', 'red_ball': ['12', '27', '29', '30'], 'blue_ball': ['6', '11']}
{'qihao': '24081 ', 'red_ball': ['17', '23', '25'], 'blue_ball': ['1', '6', '12']}
{'qihao': '24082 ', 'red_ball': ['1', '16', '17'], 'blue_ball': ['2', '13', '29']}
{'qihao': '24083 ', 'red_ball': ['9', '32', '33'], 'blue_ball': ['3', '14', '29']}
{'qihao': '24084 ', 'red_ball': ['1', '8', '10', '13', '19'], 'blue_ball': ['29']}
{'qihao': '24085 ', 'red_ball': ['1', '5', '15', '23', '27'], 'blue_ball': ['21']}
{'qihao': '24086 ', 'red_ball': ['17', '23', '25', '31'], 'blue_ball': ['19', '20']}
{'qihao': '24087 ', 'red_ball': ['9', '26'], 'blue_ball': ['3', '20']}
{'qihao': '24088 ', 'red_ball': ['8', '10', '32'], 'blue_ball': ['3', '17', '30']}
{'qihao': '24089 ', 'red_ball': ['6', '9', '15'], 'blue_ball': ['3', '18', '31']}
{'qihao': '24090 ', 'red_ball': ['7', '11', '24', '31'], 'blue_ball': ['17', '32']}
{'qihao': '24091 ', 'red_ball': ['15', '19', '20', '24'], 'blue_ball': ['13', '16']}
{'qihao': '24092 ', 'red_ball': ['3', '18', '29'], 'blue_ball': ['8', '11', '12']}
{'qihao': '24093 ', 'red_ball': ['1', '5', '15'], 'blue_ball': ['7', '11', '12']}
{'qihao': '24094 ', 'red_ball': ['17', '22', '25', '27'], 'blue_ball': ['6', '13']}
{'qihao': '24095 ', 'red_ball': ['6', '9', '16', '21'], 'blue_ball': ['4', '14']}
{'qihao': '24096 ', 'red_ball': ['2', '14', '18', '23', '27'], 'blue_ball': ['4']}
{'qihao': '24097 ', 'red_ball': ['10', '12', '13', '18', '30'], 'blue_ball': ['4']}
{'qihao': '24098 ', 'red_ball': ['3', '8', '10', '20', '30', '31'], 'blue_ball': []}
{'qihao': '24099 ', 'red_ball': ['4', '12', '17', '24', '26', '27'], 'blue_ball': []}
{'qihao': '24100 ', 'red_ball': ['13', '14', '19', '27', '30'], 'blue_ball': ['20']}
{'qihao': '24101 ', 'red_ball': ['8', '12', '15', '17', '30'], 'blue_ball': ['19']}
{'qihao': '24102 ', 'red_ball': ['9', '15', '21', '25'], 'blue_ball': ['18', '22']}
{'qihao': '24103 ', 'red_ball': ['12', '21', '27', '32', '33'], 'blue_ball': ['23']}
{'qihao': '24104 ', 'red_ball': ['5', '16', '23', '26', '29'], 'blue_ball': ['24']}
{'qihao': '24105 ', 'red_ball': ['2', '5', '17', '19', '29', '33'], 'blue_ball': []}
{'qihao': '24106 ', 'red_ball': ['3', '11', '22', '31', '33'], 'blue_ball': ['8']}
{'qihao': '24107 ', 'red_ball': ['1', '6', '13', '17', '19'], 'blue_ball': ['8']}
{'qihao': '24108 ', 'red_ball': ['1', '9', '24', '30'], 'blue_ball': ['8', '23']}
{'qihao': '24109 ', 'red_ball': ['3', '28', '29', '33'], 'blue_ball': ['4']}
{'qihao': '24110 ', 'red_ball': ['17', '33'], 'blue_ball': ['4', '13', '23', '25']}
{'qihao': '24111 ', 'red_ball': ['1', '11', '30'], 'blue_ball': ['4', '12', '22']}
{'qihao': '24112 ', 'red_ball': ['8', '16', '29', '32'], 'blue_ball': ['11', '25']}
{'qihao': '24113 ', 'red_ball': ['4', '5', '23', '24', '31'], 'blue_ball': ['26']}
{'qihao': '24114 ', 'red_ball': ['7', '11', '24', '32'], 'blue_ball': ['18', '27']}
{'qihao': '24115 ', 'red_ball': ['3', '10', '11', '27'], 'blue_ball': ['19', '28']}
{'qihao': '24116 ', 'red_ball': ['1', '15', '22', '31', '32'], 'blue_ball': ['20']}
{'qihao': '24117 ', 'red_ball': ['3', '12', '14', '16', '29', '32'], 'blue_ball': []}
{'qihao': '24118 ', 'red_ball': ['6', '11', '15'], 'blue_ball': ['2', '4']}
{'qihao': '24119 ', 'red_ball': ['9', '26', '27', '31', '32'], 'blue_ball': ['2']}
{'qihao': '24120 ', 'red_ball': ['5', '18', '19'], 'blue_ball': ['1', '7', '11']}
{'qihao': '24121 ', 'red_ball': ['1', '13', '27', '33'], 'blue_ball': ['7', '10']}
{'qihao': '24122 ', 'red_ball': ['5', '17', '29'], 'blue_ball': ['7', '9', '16']}
{'qihao': '24123 ', 'red_ball': ['2', '22', '26', '33'], 'blue_ball': ['15', '30']}
{'qihao': '24124 ', 'red_ball': ['2', '15', '17', '25'], 'blue_ball': ['14', '30']}
{'qihao': '24125 ', 'red_ball': ['1', '4', '18', '26'], 'blue_ball': ['13', '30']}
{'qihao': '24126 ', 'red_ball': ['14', '18', '23', '24', '26', '33'], 'blue_ball': []}
{'qihao': '24127 ', 'red_ball': ['2', '5', '20', '27', '32'], 'blue_ball': ['13']}
{'qihao': '24128 ', 'red_ball': ['1', '8', '18', '20', '26'], 'blue_ball': ['13']}
{'qihao': '24129 ', 'red_ball': ['9', '10', '19', '24', '32'], 'blue_ball': []}
{'qihao': '24130 ', 'red_ball': ['1', '8', '17', '19', '24'], 'blue_ball': ['12']}
{'qihao': '24131 ', 'red_ball': ['4', '5', '15', '20', '32'], 'blue_ball': ['11']}
{'qihao': '24132 ', 'red_ball': ['1', '4', '25', '27', '28', '33'], 'blue_ball': []}
{'qihao': '24133 ', 'red_ball': ['1', '11', '27', '30', '33'], 'blue_ball': ['15']}
{'qihao': '24134 ', 'red_ball': ['2', '4', '13', '18', '20'], 'blue_ball': ['16']}
{'qihao': '24135 ', 'red_ball': ['5', '11', '18', '30', '31'], 'blue_ball': ['17']}
{'qihao': '24136 ', 'red_ball': ['3', '11', '15', '21'], 'blue_ball': ['25', '26']}
{'qihao': '24137 ', 'red_ball': ['4', '9', '10', '19'], 'blue_ball': ['26', '27']}
{'qihao': '24138 ', 'red_ball': ['2', '7', '11', '21'], 'blue_ball': ['27', '28']}
{'qihao': '24139 ', 'red_ball': ['15', '16', '20', '22', '23'], 'blue_ball': ['29']}
{'qihao': '24140 ', 'red_ball': ['4', '7', '17', '22', '26'], 'blue_ball': ['8']}
{'qihao': '24141 ', 'red_ball': ['1', '2', '15', '24', '29'], 'blue_ball': ['7']}
{'qihao': '24142 ', 'red_ball': ['4', '13', '21', '22', '25'], 'blue_ball': ['6']}
{'qihao': '24143 ', 'red_ball': ['2', '11', '22', '33'], 'blue_ball': ['5', '30']}
{'qihao': '24144 ', 'red_ball': ['9', '11', '17', '20'], 'blue_ball': ['2', '30']}
{'qihao': '24145 ', 'red_ball': ['1', '16', '22'], 'blue_ball': ['3', '23', '30']}
{'qihao': '24146 ', 'red_ball': ['2', '11', '27', '32'], 'blue_ball': ['4', '22']}
{'qihao': '24147 ', 'red_ball': ['11', '13', '30'], 'blue_ball': ['5', '16', '21']}
{'qihao': '24148 ', 'red_ball': ['2', '15', '21', '23'], 'blue_ball': ['6', '16']}
{'qihao': '24149 ', 'red_ball': ['3', '9', '22'], 'blue_ball': ['17', '18']}
{'qihao': '24150 ', 'red_ball': ['13', '14', '22', '26', '32'], 'blue_ball': ['20']}
{'qihao': '24151 ', 'red_ball': ['5', '10', '16', '29', '32'], 'blue_ball': ['19']}
{'qihao': '25001 ', 'red_ball': ['2', '3', '17', '22', '33'], 'blue_ball': ['18']}
{'qihao': '25002 ', 'red_ball': ['9', '12', '13', '15', '22', '26'], 'blue_ball': []}
{'qihao': '25003 ', 'red_ball': ['10', '19', '20', '29'], 'blue_ball': ['26', '28']}
{'qihao': '25004 ', 'red_ball': ['3', '7', '17', '32'], 'blue_ball': ['27', '29']}
{'qihao': '25005 ', 'red_ball': ['10', '27'], 'blue_ball': ['16', '19', '28', '30']}
{'qihao': '25006 ', 'red_ball': ['1', '8', '22'], 'blue_ball': ['7', '17', '20']}
{'qihao': '25007 ', 'red_ball': ['7', '14', '27'], 'blue_ball': ['8', '18', '21']}
{'qihao': '25008 ', 'red_ball': ['14', '16', '17', '25', '33'], 'blue_ball': ['9']}
{'qihao': '25009 ', 'red_ball': ['2', '4', '11', '12', '23', '25'], 'blue_ball': []}
{'qihao': '25010 ', 'red_ball': ['4', '6', '7', '17', '21'], 'blue_ball': ['16']}
{'qihao': '25011 ', 'red_ball': ['6', '13', '22', '24', '29'], 'blue_ball': ['17']}
{'qihao': '25012 ', 'red_ball': ['7', '11', '13', '27', '31'], 'blue_ball': ['18']}
{'qihao': '25013 ', 'red_ball': ['4', '14', '16', '23', '24', '30'], 'blue_ball': []}
{'qihao': '25014 ', 'red_ball': ['6', '7', '9', '13', '21', '27'], 'blue_ball': []}
{'qihao': '25015 ', 'red_ball': ['4', '11', '15', '24', '25', '33'], 'blue_ball': []}
{'qihao': '25016 ', 'red_ball': ['2', '3', '12', '16', '20'], 'blue_ball': ['29']}
{'qihao': '25017 ', 'red_ball': ['4', '12', '15', '28'], 'blue_ball': ['18', '29']}
{'qihao': '25018 ', 'red_ball': ['3', '9', '21', '28'], 'blue_ball': ['29']}
{'qihao': '25019 ', 'red_ball': ['5', '13', '22', '23'], 'blue_ball': ['17', '18']}
{'qihao': '25020 ', 'red_ball': ['4', '11', '13', '26', '30'], 'blue_ball': ['16']}
{'qihao': '25021 ', 'red_ball': ['7', '8', '16', '18', '25', '32'], 'blue_ball': []}
{'qihao': '25022 ', 'red_ball': ['11', '17', '18', '20', '28', '30'], 'blue_ball': []}