Had some free time and just learned web scraping, so I practiced on a certain "biquge" novel site. Beginner level.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@author:Evolve Hsu
@file:thread_book.py
@time:2021/03/26
"""
import urllib.request  # build requests and fetch page data
import urllib.error
import threading
from bs4 import BeautifulSoup  # parse HTML and extract data
import sqlite3  # SQLite database access
import time
from book import Book
class MyThread(threading.Thread):
    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.args = args
        self.func = func

    def run(self):
        self.func(self.args)
# Fetch the novel's index page
def get_index_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54'}
    while True:
        req = urllib.request.Request(url=url, headers=headers)
        try:
            resp = urllib.request.urlopen(req)
            html = resp.read().decode("utf-8")
            break
        except urllib.error.URLError as e:
            print(e)
            time.sleep(5)
    return html
# Collect the link of every chapter from the index page
def getElementList(url):
    link_list = []
    # Fetch the index page
    html = get_index_html(url)
    # Parse the index page
    bs = BeautifulSoup(html, "lxml")
    for data in bs.find('div', id="list").find_all('a'):
        # Read the href attribute
        link = data.get('href')
        # Keep only this novel's chapter URLs
        if "/320227/" in link:
            link = 'http://www.xbiquge.me' + link
            if link in link_list:
                print(link + " already collected")
            else:
                link_list.append(link)
    return link_list
# Fetch and parse one chapter page
def resolve_element(url):
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57",
    }
    while True:
        print("about to parse html: " + url)
        req = urllib.request.Request(url=url, headers=headers)
        try:
            resp = urllib.request.urlopen(req)
            html = resp.read().decode("utf-8")
            bs = BeautifulSoup(html, "lxml")
            # Extract the chapter title
            title = bs.select('body > div.content_read > div > div.bookname > h1 > a')[0].text
            # Extract the chapter body; reset the list on every retry
            # so a failed attempt doesn't leave duplicated paragraphs
            text = []
            p = bs.find('div', id="content").find_all('p')
            for item in p:
                text.append(item.text)
            db_path = "test/book1.db"
            number = url.replace('http://www.xbiquge.me/320227/read_', '').replace('.html', '')
            book = Book(title, url, int(number), ''.join(text).replace('xbiquge/最快更新!无广告!', ''))
            save_book(db_path, book)
            break
        except Exception as e:
            print(e)
            time.sleep(5)
            print('exception raised, sleeping 5 seconds: ' + url)
    print("thread task finished: " + url)
# Persist one chapter to the database
def save_book(db_path, data):
    # init_db(db_path)  # only needed once, to create the table
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    # Use a parameterized query instead of string concatenation,
    # so quotes in the chapter text can't break the SQL
    sql = "insert into book1 (number, title, link, data) values (?, ?, ?, ?)"
    c.execute(sql, (data.number, data.title, data.link, data.text))
    conn.commit()
    c.close()
    conn.close()
# Initialize the database: create the table
# (named book1 to match the insert in save_book)
def init_db(savePath):
    sql = '''
        create table book1
        (
            id integer primary key autoincrement,
            number integer,
            title varchar,
            link varchar,
            data text
        );
    '''
    conn = sqlite3.connect(savePath)
    c = conn.cursor()
    c.execute(sql)
    conn.commit()
    conn.close()
    print("init_db success")
if __name__ == '__main__':
    baseUrl = "http://www.xbiquge.me/320227/"  # index page of the target novel
    elementList = getElementList(baseUrl)
    # One thread per chapter
    threadList = [MyThread(resolve_element, element) for element in elementList]
    startTotal = 0
    joinTotal = 0
    for t in threadList:
        startTotal = startTotal + 1
        t.daemon = True  # setDaemon() is deprecated since Python 3.10
        t.start()
        if startTotal == 10:
            print('10 threads started, sleeping 10 seconds')
            time.sleep(10)
            startTotal = 0
    for i in threadList:
        joinTotal = joinTotal + 1
        i.join()
    # url = "http://www.xbiquge.me/320227/read_267.html"
    # resolve_element(url)
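
The script imports Book from book.py, which isn't shown here. For reference, a minimal sketch of what that class presumably looks like, assuming it is just a plain data holder; the field names are inferred from how save_book() accesses them:

# book.py -- a minimal sketch; the field names are assumptions based on
# save_book() reading data.number, data.title, data.link and data.text
class Book:
    def __init__(self, title, link, number, text):
        self.title = title
        self.link = link
        self.number = number
        self.text = text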
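
Also note that test/book1.db and the book1 table must exist before the first run: init_db() is defined but only called from a comment, so run it once by hand, e.g.:

# one-time setup before the first crawl
# (creates test/book1.db and the book1 table)
init_db("test/book1.db")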
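
A side note on the threading: starting one thread per chapter and sleeping after every 10 starts works, but the standard library's concurrent.futures gives the same throttling with less code. A sketch of the equivalent, reusing the getElementList and resolve_element functions above; max_workers=10 mirrors the "10 threads at a time" limit of the original loop:

# alternative to the manual MyThread/sleep throttling: a fixed-size pool
from concurrent.futures import ThreadPoolExecutor

def crawl_all(base_url):
    links = getElementList(base_url)
    with ThreadPoolExecutor(max_workers=10) as pool:
        pool.map(resolve_element, links)  # blocks until every chapter is done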