思路
1. 确定数据结构 item 2. 写爬虫程序 spider ① 每一页的每一个详情页 url ② 翻页 ③ 详情页匹配目标数据 3. 管道处理数据 pipelines ① 保存到 excel ② 下载图片 4. 配置设置 settings
①items.py
import scrapy
class Ftb2Item(scrapy.Item):
    """Container for the data scraped from one match detail page."""

    game_title = scrapy.Field()  # title of the match detail page
    name_1 = scrapy.Field()  # home team name
    logo_src_1 = scrapy.Field()  # home team logo (image source URL)
    name_2 = scrapy.Field()  # away team name
    logo_src_2 = scrapy.Field()  # away team logo (image source URL)
② ftb.py (spiders)
import scrapy
from ..items import Ftb2Item
class FtbSpider(scrapy.Spider):
    """Crawl the mynba.tv video listing and scrape each match detail page.

    ``parse`` handles a listing page: it schedules one request per match
    link and then the next listing page, up to ``max_page``.
    ``parse_detail_info`` turns a match detail page into an ``Ftb2Item``.
    """

    name = 'ftb'
    allowed_domains = ['mynba.tv']
    domain = 'http://www.mynba.tv'
    base_page_url = 'http://www.mynba.tv/video/?page='
    # Number of the listing page most recently requested.
    page = 1
    # Last listing page to request (the crawl covers pages 1..max_page).
    max_page = 10

    def start_requests(self):
        """Yield the request for the first listing page."""
        url = self.base_page_url + str(self.page)
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Yield a request per match detail page, then the next listing page.

        Args:
            response: the response of a listing page.
        """
        detail_urls = response.xpath(
            '/html/body/div[1]/div[2]/div[2]/div[1]/div/a/@href').extract()
        # Follow every match detail link found on this listing page.
        for detail_url in detail_urls:
            # urljoin copes with both relative and absolute hrefs, unlike
            # plain concatenation with the domain.
            detail_url = response.urljoin(detail_url)
            print("请求 " + detail_url + '详情页...')
            yield scrapy.Request(url=detail_url,
                                 callback=self.parse_detail_info)

        # Paginate once per listing page. The bound is checked BEFORE the
        # increment so the last page requested is max_page, not max_page + 1.
        if self.page < self.max_page:
            self.page += 1
            page_url = self.base_page_url + str(self.page)
            print('切换页面至:', page_url)
            yield scrapy.Request(url=page_url, callback=self.parse)

    def parse_detail_info(self, response):
        """Extract the title, team names and logo URLs from a detail page.

        Each value is the first XPath match, or ``None`` when nothing
        matches.

        Args:
            response: the response of a match detail page.

        Yields:
            One ``Ftb2Item`` that is handed to the item pipelines.
        """
        game_title = response.xpath(
            '//*[@id="app"]/div/div[4]/div/div[2]/p[1]/text()').extract_first()
        name_1 = response.xpath(
            '//*[@id="app"]/div/div[4]/div/div[1]/p/a/text()').extract_first()
        logo_src_1 = response.xpath(
            '//*[@id="app"]/div/div[4]/div/div[1]/img/@src').extract_first()
        name_2 = response.xpath(
            '//*[@id="app"]/div/div[4]/div/div[3]/a/text()').extract_first()
        logo_src_2 = response.xpath(
            '//*[@id="app"]/div/div[4]/div/div[3]/img/@src').extract_first()
        yield Ftb2Item(
            game_title=game_title,
            name_1=name_1,
            logo_src_1=logo_src_1,
            name_2=name_2,
            logo_src_2=logo_src_2,
        )
③ pipelines.py
import xlwt
class Ftb2Pipeline:
    """Item pipeline that stores every item as a row of football_data.xls."""

    def open_spider(self, spider):
        """Create the workbook and write the header row.

        Args:
            spider: the spider being opened (unused).
        """
        self.workbook = xlwt.Workbook()
        self.worksheet = self.workbook.add_sheet('sheet1')
        # Index of the next free row in the sheet.
        self.line_cnt = 0
        # Column order of the sheet; each name is also the item key to read.
        self.col_name = ['game_title', 'name_1', 'logo_src_1', 'name_2', 'logo_src_2']
        # Header row: iterate over ALL columns instead of a hard-coded count.
        for col, title in enumerate(self.col_name):
            self.worksheet.write(self.line_cnt, col, title)
        self.line_cnt += 1

    def process_item(self, item, spider):
        """Append ``item`` to the sheet and save the workbook.

        Args:
            item: mapping that provides every key listed in ``col_name``.
            spider: the spider that produced the item (unused).

        Returns:
            ``item`` itself, also when writing failed, so that later
            pipelines still receive it.
        """
        try:
            # Collect the values first: an item with a missing key is then
            # rejected before any cell of the row has been written.
            row = [item[key] for key in self.col_name]
            # Reserve the row before writing, so a failure part-way through
            # cannot make the next item write into the same cells.
            row_idx = self.line_cnt
            self.line_cnt += 1
            for col, value in enumerate(row):
                self.worksheet.write(row_idx, col, value)
            # Saved after every row, so the file on disk always contains
            # the rows written so far.
            self.workbook.save('football_data.xls')
        except Exception as e:
            # Best effort: report the problem and keep the crawl running.
            print(e)
            print('写入失败!有残缺数据!已自动跳过!')
        return item
import urllib.request # 用于下载图片
class Ftb2Pipeline_2:
    """Item pipeline that downloads both team logos into ``img/``."""

    def process_item(self, item, spider):
        """Download the logos referenced by ``item``.

        Each logo is handled independently, so one missing or failing logo
        does not prevent the other from being downloaded.

        Args:
            item: mapping that may provide ``logo_src_1`` / ``logo_src_2``.
            spider: the spider that produced the item (unused).

        Returns:
            ``item`` itself, also when a download failed, so that later
            pipelines still receive it.
        """
        import os  # local import: nothing else in this module needs it

        all_ok = True
        for key in ('logo_src_1', 'logo_src_2'):
            img_src = item.get(key)
            if not img_src:
                # The detail page had no logo for this team.
                all_ok = False
                print('下载失败!队徽或队名内容不存在!已自动跳过!')
                continue
            try:
                # urlretrieve does not create the target directory itself.
                os.makedirs('img', exist_ok=True)
                file_name = img_src.split('/')[-1]
                # NOTE(review): urlretrieve blocks until the download is
                # done; consider Scrapy's ImagesPipeline instead.
                urllib.request.urlretrieve(img_src, filename=f'img/{file_name}')
            except Exception as e:
                # Best effort: report the problem and keep the crawl running.
                all_ok = False
                print(e)
                print('下载失败!队徽或队名内容不存在!已自动跳过!')
        if all_ok:
            print('over!!')
        return item
④ settings.py
# Project name and the packages in which Scrapy looks for / creates spiders.
BOT_NAME = 'ftb_2'
SPIDER_MODULES = ['ftb_2.spiders']
NEWSPIDER_MODULE = 'ftb_2.spiders'

# robots.txt is not consulted by this crawl.
ROBOTSTXT_OBEY = False

# Base delay in seconds between requests; with RANDOMIZE_DOWNLOAD_DELAY the
# actual wait varies between 0.5x and 1.5x of this value.
DOWNLOAD_DELAY = 3
RANDOMIZE_DOWNLOAD_DELAY = True

# Pipelines run in ascending order of their number. Distinct values make the
# order explicit: write the spreadsheet row first, then download the logos.
ITEM_PIPELINES = {
    'ftb_2.pipelines.Ftb2Pipeline': 300,
    'ftb_2.pipelines.Ftb2Pipeline_2': 400,
}
⑤ start.py (启动文件)
from scrapy import cmdline
# Run the 'ftb' spider, passing the same argv a shell `scrapy crawl ftb` would.
cmdline.execute(['scrapy', 'crawl', 'ftb'])
Scrapy 基础链接: Python爬虫|Scrapy 基础用法