scrapy
I. Scrapy setup
1. Install Scrapy
pip install scrapy
2. Create a Scrapy project
scrapy startproject novel
3. Generate a spider
cd novel
scrapy genspider spider_novel www.biquge5200.cc
4. Run the crawl
scrapy crawl spider_novel
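During development it is often handier to start the crawl from a small Python script so it can be run and debugged inside an IDE. A minimal sketch using Scrapy's cmdline helper (the filename run.py is an assumption; run it from the project root, where scrapy.cfg lives):

# run.py -- start the crawl from Python instead of the shell
from scrapy.cmdline import execute

execute("scrapy crawl spider_novel".split())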
II. Scrapy configuration
1. settings.py
# Ignore robots.txt, which would otherwise block the crawl
ROBOTSTXT_OBEY = False
# Keep the console quiet; only print errors
LOG_LEVEL = 'ERROR'
# Enable the random User-Agent middleware defined in middlewares.py
DOWNLOADER_MIDDLEWARES = {
    "novel.middlewares.NovelDownloaderMiddleware": 543,
}
# Register the storage pipeline; without this, NovelPipeline in pipelines.py never runs
ITEM_PIPELINES = {
    "novel.pipelines.NovelPipeline": 300,
}
2. middlewares.py
from fake_useragent import UserAgent  # random User-Agent strings

class NovelDownloaderMiddleware:
    def __init__(self):
        self.agent = UserAgent()

    def process_request(self, request, spider):
        # Attach a fresh random User-Agent to every outgoing request
        request.headers["User-Agent"] = self.agent.random
        return None  # let the request continue through the downloader
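To see the kind of header the middleware will send, fake_useragent can be exercised on its own, independent of Scrapy (a throwaway sketch):

from fake_useragent import UserAgent

ua = UserAgent()
for _ in range(3):
    print(ua.random)  # a different browser User-Agent string each call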
3. items.py
import scrapy

class NovelItem(scrapy.Item):
    # Book title
    book_name = scrapy.Field()
    # Author
    author_name = scrapy.Field()
    # Last update time
    update_time = scrapy.Field()
    # Status
    state = scrapy.Field()
    # Synopsis
    intro = scrapy.Field()
    # Cover image path
    image_url = scrapy.Field()
    # Chapter title
    chapter = scrapy.Field()
    # Chapter text
    chapter_content = scrapy.Field()
    # Chapter number
    number = scrapy.Field()
    # Novel genre
    novel_type = scrapy.Field()
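NovelItem behaves like a dict, which is how the spider and pipeline below fill and read it. A quick standalone check (the field values are made up; itemadapter ships with Scrapy):

from itemadapter import ItemAdapter
from novel.items import NovelItem

item = NovelItem()
item['book_name'] = 'example book'
item['author_name'] = 'example author'
print(ItemAdapter(item).asdict())  # {'book_name': 'example book', 'author_name': 'example author'}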
III. The crawler
1. pipelines.py
from itemadapter import ItemAdapter
import pymysql

class NovelPipeline:
    def __init__(self):
        self.conn = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            db='novel_db',
        )
        self.cursor = self.conn.cursor()

    def book_exists(self, item):
        # Look the book up by title; parameterized query avoids SQL injection
        sql = "select book_name from index_novel where book_name = %s"
        self.cursor.execute(sql, (item['book_name'],))
        return self.cursor.fetchone() is not None

    def chapter_exists(self, item):
        # Look the chapter up by title
        sql = "select chapter from index_chapter where chapter = %s"
        self.cursor.execute(sql, (item['chapter'],))
        return self.cursor.fetchone() is not None

    def process_item(self, item, spider):
        book_name = item['book_name']
        # Insert the book's metadata once, the first time any of its chapters arrives
        if not self.book_exists(item):
            try:
                sql_novel = (
                    "insert into index_novel(book_name, author_name, update_time, intro, image_url, state, novel_type_id) "
                    "values (%s, %s, %s, %s, %s, %s, %s)")
                params_novel = (item['book_name'], item['author_name'], item['update_time'], item['intro'],
                                item['image_url'], item['state'], item['novel_type'])
                self.cursor.execute(sql_novel, params_novel)
                self.conn.commit()
                print('Novel metadata saved!')
            except Exception as e:
                print(e)
                self.conn.rollback()
        # Insert the chapter if it has not been stored yet
        if not self.chapter_exists(item):
            try:
                # Fetch the book's primary key to use as the chapter's foreign key
                self.cursor.execute("select id from index_novel where book_name = %s", (book_name,))
                book_id = self.cursor.fetchone()[0]
                sql_chapter = ("insert into index_chapter(chapter_number, chapter, chapter_content, book_name_id) "
                               "values (%s, %s, %s, %s)")
                params_chapter = (item['number'], item['chapter'], item['chapter_content'], book_id)
                self.cursor.execute(sql_chapter, params_chapter)
                self.conn.commit()
                print(f'Book {item["book_name"]}, {item["chapter"]}, saved!')
            except Exception as e:
                print(e)
                self.conn.rollback()
        return item
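The pipeline assumes the two tables already exist in novel_db. Their definitions are not part of this project, so the sketch below infers a plausible schema from the insert statements above; every column type here is an assumption:

# schema_sketch.py -- create the tables the pipeline writes to
# (column types are guesses inferred from the inserts above)
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='123456', db='novel_db')
with conn.cursor() as cursor:
    cursor.execute("""
        create table if not exists index_novel (
            id int primary key auto_increment,
            book_name varchar(255) unique,
            author_name varchar(255),
            update_time varchar(64),
            intro text,
            image_url varchar(255),
            state tinyint,
            novel_type_id int
        )""")
    cursor.execute("""
        create table if not exists index_chapter (
            id int primary key auto_increment,
            chapter_number int,
            chapter varchar(255),
            chapter_content longtext,
            book_name_id int,
            foreign key (book_name_id) references index_novel(id)
        )""")
conn.commit()
conn.close()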
2. spider_novel.py
import scrapy
from novel.items import NovelItem
from fake_useragent import UserAgent
from requests import get
from copy import deepcopy
import cn2an  # converts Chinese numerals to Arabic digits

class SpiderNovelSpider(scrapy.Spider):
    name = "spider_novel"
    allowed_domains = ["www.biquge5200.cc"]
    start_urls = ["http://www.biquge5200.cc/"]
    download_connections = set()  # cover-image URLs already downloaded
    agent = UserAgent()
    num = 2  # genre id written to novel_type

    def start_requests(self):
        # Only crawl the cultivation-novel category listing
        part_url = "http://www.biquge5200.cc/xiuzhenxiaoshuo/"
        yield scrapy.Request(url=part_url, callback=self.parse)

    def parse(self, response, **kwargs):
        # Each <li> in the left-hand column is one novel
        novel_list = response.xpath('//*[@id="newscontent"]/div[@class="l"]//ul//li')
        for novel in novel_list:
            novel_url = response.urljoin(novel.xpath('./span[@class="s2"]/a/@href').extract_first())
            yield scrapy.Request(url=novel_url, callback=self.parse_novel_chapter)

    def parse_novel_chapter(self, response):
        item = NovelItem()
        item['book_name'] = response.xpath('//*[@id="info"]/h1/text()').extract_first()
        item['author_name'] = response.xpath('//*[@id="info"]/p[1]/text()').extract_first().split(":")[1]
        item['update_time'] = response.xpath('//*[@id="info"]/p[3]/text()').extract_first().split(":")[1]
        item['novel_type'] = self.num
        # Crude heuristic: books updated in 2024 get state 2, the rest state 1
        if '2024' in item['update_time']:
            item['state'] = 2
        else:
            item['state'] = 1
        item['intro'] = response.xpath('//*[@id="intro"]/p/text()').extract_first()
        image_url = response.xpath('//*[@id="fmimg"]/img/@src').extract_first()
        if image_url not in self.download_connections:
            self.download_connections.add(image_url)
            # Note: requests is blocking; fine for a few covers, but Scrapy's
            # ImagesPipeline would be the non-blocking way to do this
            img_res = get(url=image_url, headers={'User-Agent': self.agent.random})
            img_name = f"{item['book_name']}.jpg"
            item['image_url'] = f"novel_img/{img_name}"
            with open('./image/' + img_name, 'wb') as file:  # ./image/ must exist beforehand
                file.write(img_res.content)
        chapter_list = response.xpath('//*[@id="list"]/dl//dd')
        # The first nine <dd> entries are "latest chapter" duplicates; skip them
        for chapter in chapter_list[9:]:
            chapter_name = chapter.xpath('./a/text()').extract_first()
            try:
                # Chapter titles look like "第12章 ..." or "【12】..."
                if '第' in chapter_name:
                    number = chapter_name.split('第')[1].split('章')[0]
                else:
                    number = chapter_name.split('【')[1].split('】')[0]
                item['chapter'] = chapter_name
                chapter_url = response.urljoin(chapter.xpath('./a/@href').extract_first())
                if number.isdigit():
                    item['number'] = int(number)
                else:
                    # e.g. "一百二十三" -> 123
                    item['number'] = int(cn2an.cn2an(number))
                # deepcopy so each concurrent request carries its own item snapshot
                yield scrapy.Request(url=chapter_url, callback=self.parse_chapter, meta={"item": deepcopy(item)})
            except Exception as e:
                # Skip chapters whose titles match neither pattern
                self.logger.warning(f"skipping {chapter_name!r}: {e}")

    def parse_chapter(self, response):
        item = response.meta["item"]
        # Keep the <p> tags so paragraph breaks survive; strip full-width spaces
        item['chapter_content'] = "".join(response.xpath('//*[@id="content"]//p').extract()).replace('\u3000', '')
        yield item
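The chapter-number extraction above hinges on cn2an. The same logic, pulled out into a standalone function for a quick check (the sample titles are hypothetical):

import cn2an

def chapter_number(title):
    # Same extraction rule the spider uses
    if '第' in title:
        raw = title.split('第')[1].split('章')[0]
    else:
        raw = title.split('【')[1].split('】')[0]
    return int(raw) if raw.isdigit() else int(cn2an.cn2an(raw))

print(chapter_number('第12章 初入宗门'))      # 12
print(chapter_number('第一百二十三章 突破'))  # 123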