import requests
import re
import os,threading
from bs4 import BeautifulSoup
from urllib import request
# Caps concurrent workers at 10 — NOTE(review): the semaphore is never acquired
# anywhere in this file; presumably left over from a planned threaded version.
thread_lock = threading.BoundedSemaphore(value=10)
# Desktop Chrome User-Agent so the site serves its normal HTML pages.
headers = {"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36"}
# HTTPS proxy used by the requests-based fetches — TODO confirm endpoint is still alive.
proxies = {"https":"https://125.118.151.214:6666"}
# Parse the home page.
def get_title_url(url):
    """Fetch *url* and return the category links from its nav menu.

    Parameters
    ----------
    url : str
        Page to fetch (the site root in this script).

    Returns
    -------
    list
        The ``<a>`` Tag objects found under ``nav > ul > li``; an empty
        list when the request fails (the original returned ``None``,
        which crashed the caller's ``for`` loop).
    """
    try:
        r = requests.get(url, headers=headers, proxies=proxies)
        soup = BeautifulSoup(r.content, 'lxml')
        return soup.select("nav > ul > li > a")
    except Exception:
        # Best-effort: report the failure and hand back an empty result.
        print("请求错误")
        return []
# Get the list of category URLs.
def get_chapter():
    """Return the ``href`` of every category link on the home page.

    Reads the module-level ``url`` constant. Returns an empty list when
    the home page could not be fetched.
    """
    # The original loop variable shadowed the builtin ``list``; the
    # ``or []`` guard also survives a legacy ``None`` from get_title_url.
    links = get_title_url(url) or []
    return [link.get('href') for link in links]
# Get the URL list for each book.
def get_book_url():
    """Return ``(title, url)`` tuples for every book in every category.

    Category pages are GBK-encoded HTML; each is fetched through
    ``urllib`` with the shared browser headers.
    """
    # Loop-invariant: compile the pattern once, not per category page.
    pattern = re.compile(
        r'<a target="_blank" title="(.*?)" href="(.*?)" class="clearfix stitle">',
        re.S)
    book_url_list = []
    for book_url in get_chapter():
        # BUG FIX: the original built this Request but then passed the bare
        # URL to urlopen(), silently dropping the headers. Pass the Request.
        req = request.Request(book_url, headers=headers)
        response = request.urlopen(req).read().decode("gbk")
        book_url_list.extend(pattern.findall(response))
    return book_url_list
# Get all book titles and their links.
def get_chapter_read_url():
    """Download every chapter of every book into ``<title>.txt`` files.

    Works by side effect (files written, progress printed); returns None.
    Chapter pages are GBK-encoded; output files are UTF-8.
    """
    books = get_book_url()  # [(title, detail_page_url), ...]

    # Pass 1: resolve each book's detail page to its chapter-index URL.
    chapter_url = []
    for title, detail_url in books:
        r = requests.get(detail_url, headers=headers, proxies=proxies)
        r.encoding = "gbk"
        soup = BeautifulSoup(r.text, 'lxml')
        reader_link = soup.find('a', class_='reader')
        chapter_url.append(reader_link.get("href"))

    # Pass 2: walk each book's chapter index and save the chapter text.
    # BUG FIX: the original reused the leaked loop variable ``items`` here,
    # so every chapter was labeled and saved under the LAST book's title.
    # zip() pairs each index URL with its own book instead.
    for (title, _), url in zip(books, chapter_url):
        r = requests.get(url)
        r.encoding = "gbk"
        soup = BeautifulSoup(r.text, 'lxml')
        links = soup.find('div', class_="clearfix dirconone").find_all('a')
        n = 1
        for anchor in links:
            reg = requests.get(anchor.get("href"))
            reg.encoding = "gbk"
            page = BeautifulSoup(reg.text, 'lxml')
            tag = page.find('div', class_="mainContenr")
            try:
                # get_text() raises AttributeError when the content div is
                # missing (tag is None); keep the original best-effort report.
                tag = tag.get_text()
                print("正在下载{}第{}章".format(title, n))
            except Exception:
                print("转码错误")
            n = n + 1
            try:
                # Append mode: all chapters of one book share a single file.
                with open(title + '.txt', "a", encoding="utf-8") as f:
                    f.write(tag)
            except Exception:
                print("存储错误")
# Site root; get_chapter() reads this module-level global.
url = 'http://www.quanshuwang.com'

# Guard the crawl so importing this module does not start downloading.
if __name__ == "__main__":
    print(get_chapter_read_url())