Source code is at the end
// _ooOoo_
// o8888888o
// 88" . "88
// (| -_- |)
// O\ = /O
// ____/`---'\____
// . ' \| |// `.
// / \||| : |||// \
// / _||||| -:- |||||- \
// | | \\\ - /// | |
// | \_| ''\---/'' | |
// \ .-\__ `-` ___/-. /
// ___`. .' /--.--\ `. . __
// ."" '< `.___\_<|>_/___.' >'"".
// | | : `- \`.;`\ _ /`;.`/ - ` : | |
// \ \ `-. \_ __\ /__ _/ .-` / /
// ======`-.____`-.___\_____/___.-`____.-'======
// `=---='
//
// .............................................
// Buddha guards this code, may bugs stay away
// The Buddha says:
// In the office tower there is an office room; in the office room sits a programmer;
// The programmer writes programs, and trades the programs for wine money.
// Sober, he only sits online; drunk, he comes offline to sleep;
// Drunk or sober, day after day; online or offline, year after year.
// He would rather grow old at the keyboard than bow before the boss;
// Benz and BMW amuse the wealthy; bus and bicycle carry the programmer.
// Others laugh that I am far too mad; I laugh that my lot is far too cheap;
// Of all the pretty girls in the street, which one ends up with a programmer?
1. Project setup
scrapy startproject xinnuan
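After running startproject and adding the files from the steps below, the project tree ends up roughly like this. Note that the pipelines package (with its __init__.py), the final folder and run.py are not generated by Scrapy; they are assumptions inferred from the import paths in settings.py and the notes at the end:
xinnuan/
├── scrapy.cfg
├── run.py                        # runner script (step 5)
├── final/
│   └── merge.py                  # sort-and-merge script (step 6)
└── xinnuan/
    ├── items.py
    ├── settings.py
    ├── pipelines/
    │   ├── __init__.py
    │   ├── pipelines.py          # XinnuanPipeline (local files)
    │   └── mysqlPipelines.py     # MysqldbPipeline (MySQL)
    └── spiders/
        └── xinnuan2.py           # Xncwxw2Spider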
2. Implementation
1. Define the data to scrape in items.py
import scrapy

class XinnuanItem(scrapy.Item):
    name = scrapy.Field()      # book title
    chapter = scrapy.Field()   # chapter title
    url = scrapy.Field()       # full chapter URL
    uri = scrapy.Field()       # URL path with base_url and .html stripped
    content = scrapy.Field()   # chapter text
2. Create xinnuan2.py under spiders and write the crawl logic
# -*- coding: utf-8 -*-
import scrapy
import unicodedata
from scrapy.http import Request
from ..items import XinnuanItem


class Xncwxw2Spider(scrapy.Spider):
    name = 'xncwxw2'
    allowed_domains = ['m.xncwxw2.com']
    start_urls = ['http://m.xncwxw2.com/xclass/0/1.html']
    base_url = 'http://m.xncwxw2.com'

    def parse(self, response):
        # num = response.xpath("//p[@class='page']/input/@value").extract()[0].split("/")[-1]
        # print("%s pages in total" % num)
        # next page of the book list
        next_link = response.xpath("//p[@class='page']/a[text()='[下页]']/@href").extract()
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse)
        page_links = response.xpath('//div[@id="main"]/div/a/@href').extract()
        for link in page_links:
            link = self.base_url + link
            yield Request(url=link, callback=self.parse_chapter_list)

    # enter the chapter list of a book
    def parse_chapter_list(self, response):
        next_link = response.xpath('//span[@class="right"]/a/@href').extract()
        # next page of the chapter list
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse_chapter_list)
        chapter_links = response.xpath('//div[@class="info_chapters"]/ul[2]//a/@href').extract()
        if chapter_links:
            for link in chapter_links:
                yield Request(url=self.base_url + link, callback=self.parse_chapter)

    # enter the chapter content
    def parse_chapter(self, response):
        # next page of the chapter (the original dropped next_link[0] here, which re-requested base_url forever)
        next_link = response.xpath('//a[text()="下一页"]/@href').extract()
        if next_link:
            yield Request(url=self.base_url + next_link[0], callback=self.parse_chapter)
        item = XinnuanItem()
        item['url'] = response.url
        item['uri'] = response.url.replace(self.base_url, "").replace(".html", "")
        item['name'] = response.xpath('//div[@class="pagetitle"]/h1/text()').extract()[0]
        item['chapter'] = unicodedata.normalize('NFKC', response.xpath('//div[@id="nr_title"]/text()').extract()[0])
        contents = response.xpath('//div[@id="nr1"]/text()').extract()
        content = "".join(contents[6:])
        item['content'] = unicodedata.normalize('NFKC', content)
        yield item
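Before launching the full crawl, the XPath selectors can be sanity-checked interactively with scrapy shell (run from the project directory so the USER_AGENT and ROBOTSTXT_OBEY settings from step 4 are picked up); the expressions below are copied verbatim from parse():
scrapy shell 'http://m.xncwxw2.com/xclass/0/1.html'
>>> response.xpath('//div[@id="main"]/div/a/@href').extract()[:5]            # book links on the list page
>>> response.xpath("//p[@class='page']/a[text()='[下页]']/@href").extract()   # next-page link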
3. Write pipelines to save the content asynchronously
- Save to local files
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import os


class XinnuanPipeline(object):
    def __init__(self, file_dir=False):
        # path separator used when building file paths
        self.separate = '/'
        if file_dir:
            self.file_dir = file_dir

    # read the save directory from settings when the pipeline is created
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        return cls(settings['FILE_DIR'])

    def process_item(self, item, spider):
        path = self.file_dir + item['name'] + self.separate
        if not os.path.exists(path):
            os.makedirs(path)
        filePath = path + item['uri'][9:] + '.hj'
        if os.path.exists(filePath):
            os.remove(filePath)
        with open(filePath, mode='a', encoding='gbk') as f:
            f.write(item['chapter'])
            f.write(item['content'])
        return item
- Save to MySQL
import pymysql
from twisted.enterprise import adbapi


class MysqldbPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):  # fixed method name, called by scrapy, gives direct access to settings
        """
        Open the database connection pool
        :param settings: project settings
        :return: pipeline instance
        """
        adbparams = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            cursorclass=pymysql.cursors.DictCursor  # cursor type
        )
        # connection pool (ConnectionPool) backed by pymysql (MySQLdb would also work)
        dbpool = adbapi.ConnectionPool('pymysql', **adbparams)
        # return the pipeline instance
        return cls(dbpool)

    def process_item(self, item, spider):
        """
        Use twisted to make the MySQL insert asynchronous: run the SQL through the
        connection pool and get back a Deferred.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)  # method and data to run
        # attach error handling (addErrback, not addCallback, so failures reach the handler)
        query.addErrback(self.handle_error)
        # return the item so other pipelines still receive it
        return item

    def do_insert(self, cursor, item):
        # run the insert; no explicit commit is needed, twisted commits the interaction
        insert_sql = """
            INSERT INTO xinnuan2(url,uri,name,chapter,content) VALUES(%s,%s,%s,%s,%s)
        """
        cursor.execute(insert_sql, (item['url'], item['uri'], item['name'], item['chapter'],
                                    item['content']))

    def handle_error(self, failure):
        if failure:
            # print the error
            print(failure)
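The xinnuan2 table itself is not shown in the original. Below is a minimal sketch of a schema that matches the INSERT above, to be run once before crawling; the column types and the utf8mb4 charset are assumptions, while the host, user, password and database name come from the settings in step 4:
import pymysql

ddl = """
CREATE TABLE IF NOT EXISTS xinnuan2 (
    id      INT AUTO_INCREMENT PRIMARY KEY,
    url     VARCHAR(255),
    uri     VARCHAR(255),
    name    VARCHAR(255),
    chapter VARCHAR(255),
    content TEXT
) DEFAULT CHARSET=utf8mb4
"""

conn = pymysql.connect(host='127.0.0.1', user='root', password='root', db='python')
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()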
4. Edit the project configuration in settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for xinnuan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'xinnuan'
SPIDER_MODULES = ['xinnuan.spiders']
NEWSPIDER_MODULE = 'xinnuan.spiders'
# log level and log file path
LOG_LEVEL = "DEBUG"
LOG_FILE = "../logs/xiaoshuo.log"
# directory where chapter files are saved
FILE_DIR = "F:/txt/xinnuan2/"
# custom mysql settings
MYSQL_HOST = "127.0.0.1"
MYSQL_DBNAME = "python"
MYSQL_USER = "root"
MYSQL_PASSWORD = "root"
# Commented out by default, but important: without it the crawler is easily identified as a bot.
# Something as simple as Mozilla/5.0 is enough.
USER_AGENT = 'Mozilla/5.0'
# Obey robots.txt? Defaults to True; set it to False, otherwise many pages cannot be crawled.
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 0
# The download delay setting will honor only one of:
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'xinnuan.middlewares.XinnuanSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'xinnuan.middlewares.XinnuanDownloaderMiddleware': 543,
# }
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'xinnuan.pipelines.pipelines.XinnuanPipeline': 300,
    'xinnuan.pipelines.mysqlPipelines.MysqldbPipeline': 301,  # distinct priority so the pipeline order is explicit
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
5. Write a runner script for the project
from scrapy import cmdline
cmdline.execute('scrapy crawl xncwxw2'.split())
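This file is usually placed next to scrapy.cfg so the project settings are picked up; it just makes launching from an IDE convenient, and running scrapy crawl xncwxw2 from the same directory on the command line is equivalent.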
6. Sort and merge the locally saved files
import os
import shutil

file_dir = 'F:/txt/xinnuan2'


# walk the save directory and merge every book folder
def merge(dir):
    if not os.path.exists(dir):
        print("Directory does not exist!")
        return
    listdir = os.listdir(dir)
    for name in listdir:
        path = getpath(dir, name)
        if ismerge(path):  # check whether the folder qualifies for merging (optional)
            merging(path, name, dir)


# merge one book folder into a single txt file
def merging(dir, fileName, savePath):
    if not os.path.exists(dir):
        return
    if not os.path.exists(savePath):
        return
    if not os.path.isdir(dir):
        return
    print("Merging folder: %s ..." % dir)
    print("Generating file: %s.txt ..." % fileName)
    filePath = getpath(savePath, fileName) + '.txt'
    if os.path.exists(filePath):
        os.remove(filePath)
    with open(filePath, mode='a', encoding='gbk') as f:
        fileList = os.listdir(dir)
        fileList.sort()
        print(fileList)
        for name in fileList:
            if not name.endswith('.hj'):
                continue
            path = getpath(dir, name)
            if not os.path.isfile(path):
                continue
            with open(path, mode='r', encoding='gbk') as r:
                line = r.readline()
                while line:
                    f.write(line)
                    line = r.readline()
    print("File <%s.txt> generated successfully!" % fileName)
    # delete the folder that has just been merged
    shutil.rmtree(dir)


def getpath(dir, name):
    return dir + '/' + name


# check whether a folder is ready to merge
def ismerge(dir):
    if not os.path.exists(dir):
        return False
    if not os.path.isdir(dir):
        return False
    listdir = os.listdir(dir)
    for name in listdir:
        path = getpath(dir, name)
        if not os.path.isfile(path):
            return False
        if not path.endswith(".hj"):
            return False
    return True


merge(file_dir)
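Note that fileList.sort() orders names lexicographically, so '10.hj' would sort before '2.hj'. If the chapter file names turn out to be purely numeric (they come from item['uri'][9:]), a numeric sort key keeps the chapters in reading order; a minimal sketch, assuming the stem before .hj is a number:
def chapter_key(name):
    # '12.hj' -> sorts numerically; non-numeric names fall back to lexicographic order
    stem = name[:-3] if name.endswith('.hj') else name
    return (0, int(stem)) if stem.isdigit() else (1, stem)

# fileList.sort(key=chapter_key)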
3. Notes
- Running the project as-is crawls every novel on the target site, which takes a long time
- You can edit the spider under spiders and drop the next-page requests to reduce the crawl volume; a sketch of capping a run via settings follows this list
- The script under final is run on its own after the crawl has finished
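Besides editing the spider, Scrapy's built-in CloseSpider extension can cap a test run from settings.py alone; these are standard Scrapy settings, and the numbers below are just example values:
# stop after roughly 100 downloaded pages or 200 scraped items, whichever comes first
CLOSESPIDER_PAGECOUNT = 100
CLOSESPIDER_ITEMCOUNT = 200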
4. Finally
Support it however you can: with your presence if you have time, with your wallet if you have money.
Source code: xinnuan