我们以奇书网为例进行爬取
一,先新建一个新的文件夹,名字自取,用于存放py文件和爬取的数据
二,找到要爬取的网站的url和你自己浏览器的请求头(因为我是以奇书网为例,浏览器为火狐浏览器)
url = 'https://www.qisuu.la/soft/sort01/'
请求头:'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'
( 找不到请求头的可以直接拿去用)
三,开始爬取
1.创建一个类,并定义初始化函数,在初始化函数中定义好url和请求头,由于下面要用到的数据较多,所以我定义的比较多,代码如下:
class NovelSpider(object):
    """Spider for qisuu.la: downloads novel listings and stores them in an Excel file."""

    def __init__(self):
        # Entry URL of the category we start crawling from.
        self.url = 'https://www.qisuu.la/soft/sort01/'
        # Raw HTML of the most recently fetched page.
        self.html = ''
        # BUGFIX: the header name must be 'User-Agent' — the original used
        # 'User_Anger', which is not a valid HTTP header, so the server
        # would not see any User-Agent at all.
        self.herders = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'
        }
        self.total = 0        # total number of listing pages
        self.count = 0        # next Excel row to write (row 0 is the header row)
        self.retry_count = 0  # consecutive failures for the current request
        # Excel workbook/sheet that will hold the scraped data.
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('novel_data')
        self.create_excel()
2,创建Excel表,用于存储爬取的数据
def create_excel(self):
    """Write the column captions into row 0 of the spreadsheet."""
    captions = ('小说名称', '点击次数', '文本大小', '书籍类型', '更新日期',
                '连载状态', '书籍作者', '运行环境', '小说简介', '下载地址')
    for col, caption in enumerate(captions):
        self.sheet.write(0, col, caption)
3.模拟浏览器发送请求,并接收返回的网页源码,代码如下:
def get_html(self, url):
    """Fetch *url* and store the decoded page body in self.html.

    Retries the same address on failure; gives up after more than 3
    consecutive failures.
    """
    # 1. Build the request with a randomly chosen User-Agent.
    #    NOTE(review): uses self.ua_list, but this snippet's __init__
    #    defines self.herders instead — confirm ua_list is set elsewhere.
    req = request.Request(url=url, headers={
        'User-Agent': choice(self.ua_list)
    })
    try:
        self.retry_count += 1
        # 2. Send the request.
        response = request.urlopen(req)
        # 3. Read and decode the response body.
        self.html = response.read().decode('utf-8')
    except Exception as e:
        # More than 3 consecutive failures: abandon this address.
        if self.retry_count > 3:
            print('请求失败,地址:{}'.format(url))
            return
        # Otherwise retry the same URL recursively.
        print('请求数据失败,正在尝试重新连接...')
        self.get_html(url)
    else:
        # Success: reset the failure counter.
        self.retry_count = 0
4.用正则表达式来解析网页源码,并获取小说详情页的链接,代码如下:
def get_story_link(self):
    """Extract every novel detail-page link from the listing page and crawl it.

    Bug fixed: the original concatenated ``self.url2``, which is never
    defined anywhere in the class.  The links on the listing page are
    site-relative, so they must be joined with the site root (the same
    base used by ``parse_index`` in the full listing).
    """
    # Match the href of each listing entry (<li> ... <div class="s"> ... <a href=...>).
    pattern = re.compile(r'<li>.*?<div class="s.*?<a href="(.*?)">.*?"', re.S)
    res = re.findall(pattern, self.html)
    if res:
        for link in res:
            # Relative link -> absolute URL on the site root.
            url = 'https://www.qisuu.la' + link
            self.get_html(url)
            # Parse the detail page that get_html just stored in self.html.
            self.parse_story()
5.解析小说网页的数据,拿到自己想要的数据,代码如下:
(我们在这里找了小说信息的部分数据)
def parse_story(self):
"""解析小说页面的数据"""
#运用正则来匹配自己想要的数据
pattern = re.compile(r'.*?detail_right".*?h1>(.*?)</h1.*?ul>.*?<li.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?showInfo".*?p>(.*?)</p.*?', re.S)
res = re.findall(pattern, self.html)
# for x in res:
# print(res)
#提取数据
title = res[0][0]
click_num = res[0][1]
size = res[0][2]
novel_type =res[0][3]
datetime =res[0][4]
status =res[0][5]
author =res[0][6]
run_sys =res[0][7]
content =res[0][8]
#保存数据
self.save_date(title,click_num,size,novel_type,datetime,status,author,run_sys,content)
6,保存数据,保存到Excel表中,代码如下:
def write_to_excel(self, idx, data):
    """Write *data* into column *idx* of the current row, echoing it first."""
    print(idx, data)
    row = self.count
    self.sheet.write(row, idx, data)
def save_data(self, *args):
    """Append one novel record as a new row and persist the workbook.

    ``args`` holds the field values in column order (title, clicks, size,
    type, date, status, author, platform, description, download URL).
    The original wrote ten hard-coded ``args[0]``..``args[9]`` cells,
    which crashed with IndexError when fewer values were passed; the
    enumerate loop writes exactly what it receives.
    """
    self.count += 1
    print('正在保存第{}本小说:{}'.format(self.count, args[0]))
    # One cell per field, in the order the fields were passed.
    for idx, data in enumerate(args):
        self.sheet.write(self.count, idx, data)
    # Persist after every row so a crash loses at most one record.
    self.workbook.save('小说数据.xls')
7.运行函数:
def run(self):
    """Crawl listing pages 1..10 of self.url and persist the workbook.

    Bugs fixed: the original referenced ``t_info``, which is never defined
    in this snippet (it was copied from a version that iterates category
    tuples), and a stray ``break`` stopped the loop after the first page,
    defeating the range(1, 11) pagination.  To fetch more or fewer pages,
    change the upper bound of range().
    """
    for x in range(1, 11):
        print(''.center(50, '*'))
        print('正在获取第%s页数据,请稍后....' % (x))
        # Build the full URL of listing page *x*.
        url = self.url + 'index_{}.html'.format(x)
        # Download the page source...
        self.get_html(url)
        # ...and extract every novel on it.
        self.parse_index()
    self.workbook.save('小说数据.xls')
以上就是爬取小说的全部步骤,全部代码如下:
# -*- coding: utf-8 -*-
__author__ = 'wj'
__date__ = '2018/8/10 9:08'
import re
from random import choice
from urllib import request
import xlwt
class NovelSpider(object):
    """Spider for qisuu.la that scrapes novel metadata into '小说数据.xls'.

    Workflow (see run()): read the total listing-page count, enumerate the
    categories found in the nav bar, walk each category's listing pages,
    open every novel's detail page and append one Excel row per novel.
    """

    def __init__(self):
        # Entry page: used to discover the page count and the category links.
        self.url = 'https://www.qisuu.la/soft/sort01/'
        # HTML source of the most recently fetched page.
        self.html = ''
        # User-Agent pool; one entry is chosen at random for each request.
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36'
        ]
        self.total = 0        # listing pages per category (set by get_total)
        self.count = 0        # next Excel row to fill (row 0 holds captions)
        self.retry_count = 0  # consecutive failures for the current request
        # Workbook and sheet that accumulate the scraped rows.
        self.workbook = xlwt.Workbook(encoding='utf-8')
        self.sheet = self.workbook.add_sheet('novel_data')
        self.create_excel()

    def create_excel(self):
        """Write the column captions into row 0."""
        self.sheet.write(0, 0, '小说名称')
        self.sheet.write(0, 1, '点击次数')
        self.sheet.write(0, 2, '文本大小')
        self.sheet.write(0, 3, '书籍类型')
        self.sheet.write(0, 4, '更新日期')
        self.sheet.write(0, 5, '连载状态')
        self.sheet.write(0, 6, '书籍作者')
        self.sheet.write(0, 7, '运行环境')
        self.sheet.write(0, 8, '小说简介')
        self.sheet.write(0, 9, '下载地址')

    def get_html(self, url):
        """Fetch *url* into self.html; give up after more than 3 failures."""
        # Random User-Agent per request to look less like a bot.
        req = request.Request(url=url, headers={
            'User-Agent': choice(self.ua_list)
        })
        try:
            self.retry_count += 1
            response = request.urlopen(req)
            self.html = response.read().decode('utf-8')
        except Exception:
            # More than 3 consecutive failures: abandon this address.
            if self.retry_count > 3:
                print('请求失败,地址:{}'.format(url))
                return
            # Otherwise retry the same URL recursively.
            print('请求数据失败,正在尝试重新连接...')
            self.get_html(url)
        else:
            # Success: reset the failure counter.
            self.retry_count = 0

    def get_total(self):
        """Read the total number of listing pages from the pagination bar."""
        self.get_html(self.url)
        pattern = re.compile(r'<div class="tspage.*?/(.*?) ', re.S)
        rs = re.search(pattern, self.html)
        if rs:
            self.total = int(rs.group(1))
            print(self.total)

    def parse_index(self):
        """Find every detail-page link on the current listing page and crawl it."""
        pattern = re.compile(r'<li.*?<div.*?class="s".*?<a href="(.*?)"', re.S)
        results = re.findall(pattern, self.html)
        for link in results:
            # Links are site-relative; prepend the site root.
            url = 'https://www.qisuu.la' + link
            self.get_html(url)
            self.parse_detail()

    def parse_detail(self):
        """Parse one novel detail page from self.html and save a 10-column row."""
        pattern = re.compile(r"""<div class="detail_right.*?<h1>(.*?)</h1.*?<li.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?:(.*?)<.*?<div class="showInfo".*?<p.*?>(.*?)</p.*?get_down_url.*?,'(.*?)'""", re.S)
        results = re.findall(pattern, self.html)
        # Unpack the single match: ten captured fields in column order.
        (title, click_num, file_size, novel_type, datetime, status,
         author, run_sys, description, download) = results[0]
        # NOTE(review): the original replaced one space-like character with a
        # plain space here — presumably a non-breaking space; preserved as-is.
        description = description.replace(' ',' ')
        self.save_data(title, click_num, file_size, novel_type, datetime,
                       status, author, run_sys, description, download)

    def write_to_excel(self, idx, data):
        """Write one value into column *idx* of the current row, echoing it."""
        print(idx, data)
        self.sheet.write(self.count, idx, data)

    def save_data(self, *args):
        """Append *args* (10 values in column order) as a new row and save."""
        self.count += 1
        print('正在保存第{}本小说:{}'.format(self.count, args[0]))
        # One cell per field, in the order the fields were passed.
        for idx, data in enumerate(args):
            self.sheet.write(self.count, idx, data)
        # Persist after every row so a crash loses at most one record.
        self.workbook.save('小说数据.xls')

    def parse_type(self):
        """Return (absolute_url, name) pairs for every category in the nav bar.

        Bug fixed: the original implicitly returned None when the nav
        <div> was not found, which made run() crash with TypeError when
        iterating the result; an empty list is returned instead.
        """
        pattern = re.compile(r'<div class="nav">(.*?)</div>', re.S)
        res = re.search(pattern, self.html)
        if not res:
            return []
        nav_html = res.group(1)
        results = re.findall(re.compile(r'<a.*? href="(.*?)".*?>(.*?)</a>', re.S), nav_html)
        # Drop the first link (the site home); make the rest absolute.
        return map(lambda x: ('https://www.qisuu.la' + x[0], x[1]), results[1:])

    def run(self):
        """Top-level crawl loop: every category, every listing page."""
        # Discover how many listing pages each category has.
        self.get_total()
        # Enumerate category (url, name) pairs.
        types = self.parse_type()
        for t_info in types:
            print('正在爬取{}下的小说.....'.format(t_info[1]))
            for x in range(1, self.total + 1):
                print(''.center(50, '*'))
                print('正在获取%s下的第%s页数据,请稍后....' % (t_info[1], x))
                # Build the full URL of listing page *x* of this category.
                url = t_info[0] + 'index_{}.html'.format(x)
                self.get_html(url)
                self.parse_index()
                # NOTE(review): this break stops after the first page of each
                # category — it looks like a debugging limiter; kept to
                # preserve the original behaviour.
                break
        self.workbook.save('小说数据.xls')
# Script entry point: build the spider and start the crawl.
if __name__ == '__main__':
    novel = NovelSpider()
    novel.run()