import time, re, requests, xlwt, urllib.request, os
from requests.exceptions import ConnectionError
from fake_useragent import UserAgent
def get_proxy():
    """Fetch a proxy from the local proxy-pool service.

    Returns:
        The raw response body of ``GET http://127.0.0.1:5010/get/`` as text.
    """
    pool_response = requests.get('http://127.0.0.1:5010/get/')
    return pool_response.text
def delete_proxy(proxy):
    """Ask the local proxy-pool service to drop an unusable proxy.

    Args:
        proxy: The proxy string to remove; it is interpolated into the
            ``proxy`` query parameter of the delete endpoint.
    """
    removal_url = 'http://127.0.0.1:5010/delete/?proxy={}'.format(proxy)
    requests.get(removal_url)
class QiShu(object):
    """Scraper for www.qisuu.la: list pages, book detail pages, covers, XLS export.

    Typical flow: ``open_file()`` creates the workbook, ``get_list_page`` /
    ``parse_list_page`` walk a category, ``get_detail_page`` /
    ``parse_detail_page`` extract one book (and download its cover), and
    ``write_data`` appends the book as a spreadsheet row.
    """

    # Shared generator used to pick a random User-Agent for each instance.
    ua = UserAgent()

    # Site root prepended to the relative hrefs found in the pages.
    BASE_URL = 'https://www.qisuu.la'
    # Directory (relative to the working directory) that receives cover images.
    IMAGE_DIR = '奇书网图片'
    # File name the workbook is saved under.
    WORKBOOK_NAME = '奇书网小说.xls'
    # Seconds to wait for a page before giving up, so a dead proxy or host
    # cannot hang the whole run.
    REQUEST_TIMEOUT = 10

    # Column titles of the header row, in column order.
    _HEADERS = ('书名', '点击次数', '文件大小', '书籍类型', '更新日期',
                '连载状态', '书籍作者', '最新章节', '小说介绍', '连接')

    # Patterns are compiled once here instead of on every call.
    _BOOK_HREF_RE = re.compile(r'<div class="s">.*?<a href="(.*?)">', re.S)
    _NEXT_PAGE_RE = re.compile(r"上一页.*?<a href='(.*?)'>", re.S)
    _COVER_RE = re.compile(
        r'<div class="detail">.*?<img src="(.*?)" onerror="(.*?)">', re.S)
    # Eight groups: the <h1> title, six <li class="small"> items, and the
    # link text inside the seventh <li class="small">.
    _FIELDS_RE = re.compile(
        r'<div class="detail">.*?<h1>(.*?)</h1>'
        + r'.*?<li class="small">(.*?)</li>' * 6
        + r'.*?<li class="small">.*?<a.*?>(.*?)</a>',
        re.S)
    # Two groups. The first lazy group is immediately followed by another
    # lazy wildcard, so it always captures ''; it is kept because callers
    # index the result tuple positionally (introduction at 9, link at 10).
    _INTRO_RE = re.compile(
        r'<div class="showBox mt20">.*?<h1(.*?).*?<div class="showInfo">.*?<p>(.*?)</p>',
        re.S)
    _LINK_RE = re.compile(r"get_down_url.*?,'(.*?)'", re.S)

    def __init__(self):
        self.headers = {
            'Host': 'www.qisuu.la',
            'User-Agent': self.ua.random
        }
        # Next spreadsheet row to write; row 0 holds the column titles.
        self.row = 1
        # Workbook created by open_file(); lets write_data() save without
        # relying on a module-level global.
        self.book = None

    def _fetch(self, url, encoding=None):
        """Download ``url`` through a pooled proxy.

        Args:
            url: Page to request.
            encoding: If given, forced onto the response before decoding.

        Returns:
            The response text on HTTP 200, otherwise ``None`` (non-200
            status, connection failure, or timeout).
        """
        proxy_url = 'http://' + get_proxy()
        # requests selects a proxy by URL scheme; the site is served over
        # https, so the proxy has to be registered for both schemes or it
        # is silently bypassed.
        proxies = {'http': proxy_url, 'https': proxy_url}
        try:
            response = requests.get(url, headers=self.headers,
                                    proxies=proxies,
                                    timeout=self.REQUEST_TIMEOUT)
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            print('{}连接主机异常'.format(url))
            return None
        if response.status_code != 200:
            print('{}请求异常'.format(url))
            return None
        if encoding:
            response.encoding = encoding
        return response.text

    def get_list_page(self, url):
        """Return the HTML of a category list page, or ``None`` on failure."""
        return self._fetch(url)

    def parse_list_page(self, list_html):
        """Yield absolute URLs found on a list page.

        Yields every book link first and then, when one is present, the
        link that follows the "上一页" marker (the next list page).

        Args:
            list_html: HTML of a list page; a falsy value yields nothing.
        """
        if not list_html:
            print('获取了没有数据的网页')
            return
        hrefs = self._BOOK_HREF_RE.findall(list_html)
        # A page without a following link (e.g. the last one) must not
        # crash the crawl.
        next_match = self._NEXT_PAGE_RE.search(list_html)
        if next_match:
            hrefs.append(next_match.group(1))
        for href in hrefs:
            yield self.BASE_URL + href

    def get_detail_page(self, detail_url):
        """Return the UTF-8 decoded HTML of a book page, or ``None`` on failure."""
        return self._fetch(detail_url, encoding='utf-8')

    def parse_detail_page(self, detail_html):
        """Extract one book's details and download its cover image.

        Args:
            detail_html: HTML of a book detail page.

        Returns:
            An 11-item tuple: indexes 0-7 are the title and the seven info
            items, 8 is always an empty string, 9 is the introduction and
            10 the download link. ``None`` if the HTML is empty or does not
            match the expected structure.
        """
        if not detail_html:
            print('获取了没有数据的网页')
            return None
        cover = self._COVER_RE.search(detail_html)
        fields = self._FIELDS_RE.search(detail_html)
        intro = self._INTRO_RE.search(detail_html)
        link = self._LINK_RE.search(detail_html)
        if not (cover and fields and intro and link):
            # Unexpected layout: skip this book instead of raising.
            print('详情页解析失败')
            return None
        data = fields.groups() + intro.groups() + link.groups()
        self.save_img(cover.groups())
        return data

    def save_img(self, href):
        """Download a book cover into ``IMAGE_DIR``.

        Args:
            href: Pair ``(src, onerror)`` taken from the cover ``<img>`` tag;
                the fallback image path is the first single-quoted string
                inside ``onerror``.
        """
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        primary = self.BASE_URL + href[0]
        fallback = self.BASE_URL + href[1].split("'")[1]
        # Use the fallback only when the primary cover is reported missing.
        if requests.get(primary).status_code == 404:
            chosen = fallback
        else:
            chosen = primary
        # Write to an explicit path instead of changing the working
        # directory, so a failed download cannot strand the process in the
        # image folder.
        target = os.path.join(self.IMAGE_DIR, chosen.split('/')[-1])
        urllib.request.urlretrieve(chosen, target)

    def open_file(self):
        """Create the workbook and its sheet, and write the header row.

        Returns:
            ``(book, sheet)``; the workbook is also kept on ``self.book``.
        """
        book = xlwt.Workbook(encoding='utf-8')
        sheet = book.add_sheet('奇书网')
        # sheet.write takes (row, column, value).
        for column, title in enumerate(self._HEADERS):
            sheet.write(0, column, title)
        self.book = book
        return book, sheet

    def write_data(self, data, sheet):
        """Append one book to ``sheet`` and advance the row counter.

        Args:
            data: Tuple as returned by ``parse_detail_page``.
            sheet: Worksheet returned by ``open_file``.
        """
        title = data[0]
        # Items 1-7 are "label:value" strings; keep what follows the last
        # ASCII colon.
        values = [item.split(':')[-1] for item in data[1:8]]
        # Index 8 is the always-empty placeholder group and is skipped.
        cells = [title] + values + [data[9], data[10]]
        for column, cell in enumerate(cells):
            sheet.write(self.row, column, cell)
        # Save after every row so progress survives a crash; only possible
        # when the workbook came from open_file() on this instance.
        if self.book is not None:
            self.book.save(self.WORKBOOK_NAME)
        self.row += 1

    def close_file(self, book):
        """Save ``book`` under ``WORKBOOK_NAME`` in the working directory."""
        book.save(self.WORKBOOK_NAME)
def main(url, book, sheet):
    """Crawl one category: follow its list pages and export every book.

    List pages are processed from a work-list instead of by recursion, so a
    category with many pages cannot exhaust the call stack, and a page is
    never fetched twice even if a "next" link points back to it.

    Uses the module-level ``qishu`` scraper instance.

    Args:
        url: URL of the first list page of the category.
        book: Workbook that is saved after each book is written.
        sheet: Worksheet that receives one row per book.
    """
    pending = [url]
    visited = set()
    while pending:
        page_url = pending.pop(0)
        if page_url in visited:
            continue
        visited.add(page_url)
        list_html = qishu.get_list_page(page_url)
        if not list_html:
            continue
        for detail_url in qishu.parse_list_page(list_html):
            # URLs containing 'index' are further list pages, not books.
            if 'index' in detail_url:
                pending.append(detail_url)
                continue
            detail_html = qishu.get_detail_page(detail_url)
            if not detail_html:
                continue
            data = qishu.parse_detail_page(detail_html)
            if data:
                qishu.write_data(data, sheet)
                qishu.close_file(book)
                print('{}写入'.format(data[0]))
if __name__ == '__main__':
    # Script entry point: crawl categories 1-10 and export them to one workbook.
    qishu = QiShu()
    proxy = ""
    book, sheet = qishu.open_file()
    # exist_ok lets the script be re-run without failing on the image
    # directory left behind by a previous run.
    os.makedirs('奇书网图片', exist_ok=True)
    for x in range(1, 11):
        url = 'https://www.qisuu.la/soft/sort0{}/'.format(x)  # category URL
        main(url, book, sheet)