Crawling 笔趣阁 (Biquge) novels with Scrapy
First create the project with scrapy startproject Novel.
Then generate the spider with scrapy genspider Downnovel www.qu.la.
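For reference, the setup sequence run from a terminal looks roughly like this (the cd step assumes the default project layout that startproject creates):
scrapy startproject Novel
cd Novel
scrapy genspider Downnovel www.qu.la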
After analyzing the 笔趣阁 pages with XPath, Downnovel.py looks like this.
start_urls can be swapped for the URL of whichever book you want to crawl.
import scrapy
from Novel.items import NovelItem
from copy import deepcopy
import urllib.parse
class DownnovelSpider(scrapy.Spider):
    name = 'DownNovel'
    allowed_domains = ['www.qu.la']
    start_urls = ['https://www.qu.la/book/1230/']
    current_page = 1  # current page of the chapter list

    def parse(self, response):
        item = NovelItem()
        # grab every chapter listed on the current page
        chapter_list = response.xpath("//div[@class='section-box']/ul[@class='section-list fix']")[1]
        chapters = chapter_list.xpath('./li')
        item['current_page'] = self.current_page
        # walk through the chapters on this page
        for chapter_id, chapter in enumerate(chapters):
            # chapter_id is kept so the chapters can be sorted back into order later
            item['chapter_url'] = chapter.xpath("./a/@href").extract_first()
            item['chapter_name'] = chapter.xpath("./a/text()").extract_first()
            item['chapter_url'] = 'http://www.qu.la' + item['chapter_url']
            yield scrapy.Request(
                url=item['chapter_url'],
                callback=self.parse_chapter,
                meta={'item': deepcopy(item)},
                cb_kwargs={'num': chapter_id + 1}
            )
        # next page of the chapter list
        next_url = response.xpath("//div[@class='listpage']/span[@class='right']/a/@href").extract_first()
        next_url = urllib.parse.urljoin(response.url, next_url)
        if next_url != response.url:
            self.current_page += 1
            yield scrapy.Request(
                next_url,
                callback=self.parse
            )
    def parse_chapter(self, response, num):
        item = response.meta['item']
        item['num'] = num
        item['chapter_title'] = response.xpath("//div[@class='reader-main']/h1/text()").extract_first()
        # chapter body: strip whitespace and drop empty text nodes
        content = response.xpath("//div[@class='content']/text()").extract()
        item['chapter_content'] = ''.join(i.strip() for i in content if i.strip())
        yield item
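Before wiring up the MySQL pipeline, a quick sanity check is to dump the parsed items to a feed file with scrapy crawl DownNovel -o chapters.json and confirm that chapter_title, chapter_content, current_page and num all look right.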
items.py is as follows:
import scrapy


class NovelItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    chapter_url = scrapy.Field()
    chapter_name = scrapy.Field()
    chapter_title = scrapy.Field()
    chapter_content = scrapy.Field()
    num = scrapy.Field()
    current_page = scrapy.Field()
pipelines.py is as follows; pymysql is imported here to write the chapters into MySQL.
import pymysql


class NovelPipeline(object):
    def open_spider(self, spider):
        """Create a list to collect every item and open the MySQL connection."""
        self.items = []
        self.connection = pymysql.connect(
            host='localhost',   # local MySQL server
            user='root',        # your MySQL user name
            passwd='123456',    # your MySQL password
            db='novel',         # name of the database
            charset='utf8mb4',  # character set
            cursorclass=pymysql.cursors.DictCursor
        )

    def process_item(self, item, spider):
        """Append each parsed item to the list; at this point they arrive out of order."""
        self.items.append(item)
        return item

    def close_spider(self, spider):
        # sort the collected items by current_page and num
        items = sorted(self.items, key=lambda keys: (keys['current_page'], keys['num']))
        try:
            with self.connection.cursor() as cursor:
                # SQL for creating the table
                sql1 = 'Create Table If Not Exists jiushen2(id int auto_increment primary key,zjm text,body text)'
                cursor.execute(sql1)
                # write the chapters one by one (parameterized so quotes in the text are escaped)
                for item in items:
                    sql = 'Insert into jiushen2(zjm,body) values (%s,%s)'
                    cursor.execute(sql, (item['chapter_title'], item['chapter_content']))
                # commit the inserted rows
                self.connection.commit()
        finally:
            # close the connection
            self.connection.close()
settings.py is as follows:
BOT_NAME = 'Novel'
LOG_LEVEL = 'WARNING'

SPIDER_MODULES = ['Novel.spiders']
NEWSPIDER_MODULE = 'Novel.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = True
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Novel.middlewares.NovelSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'Novel.middlewares.NovelDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'Novel.pipelines.NovelPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'