我们来看一下流程
目标: 全书网
地址: http://www.quanshuwang.com
内容: 抓取网站所有小说分类,小说,章节,内容 分四张表分别存储
将封面下载到本地,将图片url替换成本地的
技术:
python scrapy框架 (为什么选这个?因为 scrapy 自带高并发、多线程;缺点是程序启动后刹不住,必须强行关闭)
mysql 作为我们本次存储
接下来看看代码吧
spider
spider 也就是你自己创建的爬虫。我这里的爬虫名是 book,项目名是 books。
import scrapy
from ..items import BooksItem, BooksClassifyItem, BooksChapterItem, BooksContentItem
class BookSpider(scrapy.Spider):
name = 'book'
allowed_domains = ['quanshuwang.com']
start_urls = ['http://www.quanshuwang.com']
def __init__(self, *args, **kwargs):
    """Initialize the spider.

    Forwards all arguments to ``scrapy.Spider.__init__`` — the original
    override swallowed them, which breaks Scrapy's handling of the spider
    ``name`` and any ``-a`` command-line arguments.
    """
    super().__init__(*args, **kwargs)
    # Crawl counter; incremented elsewhere (not visible in this chunk).
    self.count = 0
def parse(self, response):
    """Parse the site front page.

    Yields one ``BooksItem`` per top-level book category (title + the
    numeric category id parsed from its URL), then schedules a request
    to each category's listing page, handled by :meth:`parse2`.
    """
    # Only the first 11 nav entries are real book categories.
    category_nodes = response.xpath("//nav[@class='channel-nav']//li")[:11]
    for node in category_nodes:
        # Hoist the href: the original evaluated this XPath twice.
        href = node.xpath('./a/@href').extract_first()

        # Create a FRESH item each iteration. The original reused one
        # mutable item across yields, which can corrupt data once items
        # are processed asynchronously downstream.
        item = BooksItem()
        item['classify_title'] = node.xpath('./a/text()').extract_first()
        # Category id is embedded in the URL,
        # e.g. .../list/1_1.html -> "1"; used to join books to categories.
        item['books_id'] = href.split('/')[-1].split('_')[-2]
        yield item

        yield scrapy.Request(
            url=href,
            callback=self.parse2,
            dont_filter=True,
            meta={'each_url': href},
        )
def parse2(self, response):
    """Parse one category listing page.

    Follows every book detail link on the page (handled by
    :meth:`parse3`) and then the "next page" link, if any, back into
    this method — walking the whole paginated listing.
    """
    detail_urls = response.xpath(
        '//ul[@class="seeWell cf"]/li/a/@href'
    ).extract()
    for detail_url in detail_urls:
        yield scrapy.Request(
            url=detail_url,
            callback=self.parse3,
            dont_filter=True,
            meta={'each_url': detail_url},
        )

    # Pagination: keep crawling until there is no "next" link.
    next_href = response.xpath('//a[@class="next"]/@href').extract_first()
    if next_href is not None:
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse2)
def parse3(self, response):
item = BooksClassifyItem()
# 获取自己所需要的数据
books_title = response.xpath("//div[@class='b-info']/h1/text()").extract_first() # 名称
books_author = response.xpath("//dl[@class='bookso']/dd/text()").extract_first() # 作者
books_status = response.xpath("//dl/dd/text()").extract_first() # 状态
books_introduce = response.xpath("//div[@id='waa']/text()").extract_first().split("介绍:")[-1].split(",")[0] # 介绍
front_image_path = response.xpath("//a[@class='l mr11']/img/@src").extract_first() # 图片地址
books_classify_id = response.xpath("//div[@class='main-index']/a[2]/@href").extract_first().split("/")[-1].split("_")[-2] #分类与小说关联id
books_chapter_id = response.xpath("//div[@c