As far as crawling speed goes, the data-parsing step is genuinely time-consuming.
Multi-threaded crawling is therefore a reasonable choice: it is not the optimal solution, but it does give a clear speedup (how much faster depends on the number of threads and on how finely the program splits up its data processing).
Prerequisites:
requests library
An HTTP client library for Python.
requests.get()
Fetches the response for a given URL.
code
def getHtml(url):
    header = {'User-Agent': 'Chrome/84.0.4147.89'}
    html = get(url, headers=header, timeout=7)
    time.sleep(.2)  # pause 0.2 s between requests
    html.encoding = 'utf-8'
    return html.text

def extractChapters(url):
    try:
        html = getHtml(url)
        return __luoxiaChapters(html)
    except Exception as e:
        raise ValueError(f"{__file__}:\n {url}: {e}")

def extractContent(url):
    try:
        html = getHtml(url)
        return __luoxiaContent(html)
    except Exception as e:
        raise ValueError(f"{__file__}:\n{url}: {e}")
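One detail worth noting: getHtml above never checks the HTTP status code, so an error page would be parsed like any other. A hedged variant (getHtmlChecked is not part of the original script) could use requests' raise_for_status():

def getHtmlChecked(url):
    """Variant of getHtml that raises on 4xx/5xx responses."""
    header = {'User-Agent': 'Chrome/84.0.4147.89'}
    resp = get(url, headers=header, timeout=7)
    resp.raise_for_status()  # raises requests.exceptions.HTTPError on a bad status
    resp.encoding = 'utf-8'
    return resp.text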
lxml library
A Python library for processing XML and HTML quickly and flexibly.
lxml.etree.HTML()
Parses a string of markup into an HTML document (element tree).
lxml.etree.HTML().xpath()
Selects nodes or text from the tree with an XPath expression.
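As a minimal, self-contained illustration of these two calls (the HTML snippet here is invented for the example; the real extraction functions follow below):

from lxml import etree

doc = etree.HTML('<ul class="book-list"><li><a href="/ch1">Chapter 1</a></li></ul>')
links = doc.xpath('//ul[@class="book-list"]/li/a/@href')    # -> ['/ch1']
titles = doc.xpath('//ul[@class="book-list"]/li/a/text()')  # -> ['Chapter 1']
print(links, titles)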
code
def __luoxiaChapters(html):
    html = etree.HTML(html)
    book_div = html.xpath('/html/body/div[2]/div')
    for book_list in book_div:
        if "book-list clearfix" == book_list.xpath('./@class')[0]:
            book_a = book_list.xpath('./ul/li//*')
            chapter_list = []
            for chapter in book_a:
                try:
                    chapter_link = chapter.xpath('./@href')[0]
                except Exception:
                    chapter_link = chapter.xpath('./@onclick')[0]
                    chapter_link = findall(r'.*"(.*?)"', chapter_link, S)[0]
                chapter_name = chapter.text
                chapter_list.append((chapter_link, chapter_name))
            return chapter_list

def __luoxiaContent(html):
    html = etree.HTML(html)
    page_content = html.xpath('.//div[@id="nr1"]/p')
    for each_content in page_content:
        if each_content.text is not None:
            yield " " + each_content.text + "\n"
        else:
            continue
*queue library
Implements multi-producer, multi-consumer queues. It is especially useful in threaded programming, where messages must be exchanged safely between multiple threads. The Queue class in this module implements all the required locking semantics.
We can put every crawled link into a queue (in_queue).
That makes it easy to see how links enter and leave the queue,
and it also prevents the same link from being fetched twice when several threads crawl at once.
The library functions I use are listed below for reference,
and a sentinel object is placed at the very end of the queue to mark completion.
queue.Queue()
Creates and initializes a FIFO queue object.
queue.Queue().put()
Puts an item (an object or element) into the queue.
queue.Queue().get()
Removes and returns the next item from the queue.
queue.Queue().qsize()
Returns the number of items in the queue.
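Here is a standalone sketch of the sentinel pattern described above (the toy links are made up; in the script the real links come from extractChapters):

import queue

class EndMark:              # stands in for endQ below
    pass

q = queue.Queue()
for link in ['/ch1', '/ch2', '/ch3']:
    q.put(link)
q.put(EndMark())            # sentinel marks the end of the work

while True:
    item = q.get()
    if isinstance(item, EndMark):
        break               # stop once the sentinel appears
    print('processing', item, '| remaining:', q.qsize())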
code
import queue

def intoQueue(in_queue, links):
    for link in links:
        in_queue.put(link)
    in_queue.put(endQ())  # sentinel marks the end of the link list

def exitQueue(out_queue, in_queue):
    while True:
        in_link = in_queue.get()
        if isinstance(in_link, endQ):
            in_queue.put(endQ())  # re-queue the sentinel so the other workers stop too
            return 'Stop!'
        print(in_link)
        download(chapter_link=in_link)
        out_queue.put(in_link)
in_queue = queue.Queue()
out_queue = queue.Queue()
luoxia = 'https://www.luoxia.com/guichui'
*threading library
This module builds a higher-level threading interface on top of the lower-level _thread module.
I used to pass my target function as Thread's group parameter (i.e. as the first positional argument), so the main process never actually spawned a thread: the function call was evaluated on the spot, ran to completion, and only then did execution continue, which means there was nothing left for the thread to run by the time start() was called.
I later realized I should use Thread's target parameter: passing the function to target is what lets run() invoke it once the thread is started, so the workers can all run at the same time.
As for thread contention (several threads touching the same object can race, but queue.Queue does its own locking, so the queue itself should be safe), the risk here seems very small.
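A small illustration of the difference described above (fetch and its sleep are made up for the example):

import threading
import time

def fetch(name):
    time.sleep(0.1)          # stand-in for a blocking download
    print(name, 'done')

# Mistake: fetch('bad') is called right here, in the main thread; its return
# value (None) is what gets passed to Thread, so start() has nothing to run.
t_bad = threading.Thread(fetch('bad'))
t_bad.start()                # a no-op thread

# Correct: hand the callable to target= and its arguments to args=;
# run() will call fetch('good') in the new thread once start() is called.
t_good = threading.Thread(target=fetch, args=('good',))
t_good.start()
t_good.join()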
threading.Thread()
Creates a thread object.
target
The callable object to be invoked by the run() method.
name
The thread's name.
args
The argument tuple for the target invocation.
daemon
Explicitly sets whether the thread runs as a daemon.
threading.Thread().start()
Starts the thread's activity.
It arranges for the object's run() method to be invoked in a separate thread of control.
code
work1 = threading.Thread(target=exitQueue, name='work1',
                         daemon=True, args=(out_queue, in_queue))
work2 = threading.Thread(target=exitQueue, name='work2',
                         daemon=True, args=(out_queue, in_queue))
work3 = threading.Thread(target=exitQueue, name='work3',
                         daemon=True, args=(out_queue, in_queue))
work4 = threading.Thread(target=exitQueue, name='work4',
                         daemon=True, args=(out_queue, in_queue))
work5 = threading.Thread(target=updateCanva, name='work5',
                         daemon=True, args=(root, out_queue))
works = [work1, work2, work3, work4, work5]
for work in works:
    work.start()
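Because every worker is created with daemon=True, none of them keeps the interpreter alive on its own; in this script the Tk mainloop() running in the main thread does that. If you instead wanted the main thread to wait explicitly for the download workers, a join-based sketch (reusing the works list above; not what the complete script below does) would be:

for work in works:
    work.start()
for work in works[:4]:   # the four exitQueue workers
    work.join()          # block until each download worker returns

One design caveat: tkinter widgets are generally meant to be driven from the thread that created them, so updating the Canvas from work5 is somewhat fragile even if it works here.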
Complete code
# -*- coding: utf-8 -*-
import threading
import tkinter as tk
import queue
import time
from requests import get
from lxml import etree
from re import findall, S
import os


class endQ(object):
    """End-of-queue sentinel mark."""
    pass


def getHtml(url):
    """Fetch the page at the given url."""
    header = {'User-Agent': 'Chrome/84.0.4147.89'}
    html = get(url, headers=header, timeout=7)
    time.sleep(.2)  # pause 0.2 s between requests
    html.encoding = 'utf-8'
    return html.text
def __luoxiaChapters(html):
    """Extract each chapter's name and link from the e-book page."""
    html = etree.HTML(html)
    book_div = html.xpath('/html/body/div[2]/div')
    # print(len(book_div))
    for book_list in book_div:
        if "book-list clearfix" == book_list.xpath('./@class')[0]:
            book_a = book_list.xpath('./ul/li//*')
            chapter_list = []
            for chapter in book_a:
                try:
                    chapter_link = chapter.xpath('./@href')[0]
                except Exception:
                    chapter_link = chapter.xpath('./@onclick')[0]
                    chapter_link = findall(r'.*"(.*?)"', chapter_link, S)[0]
                chapter_name = chapter.text
                chapter_list.append((chapter_link, chapter_name))
            return chapter_list


def __luoxiaContent(html):
    """Extract a www.luoxia.com article.

    :param html: the chapter page's HTML text
    :return: each paragraph's content --> generator
    """
    html = etree.HTML(html)
    page_content = html.xpath('.//div[@id="nr1"]/p')
    for each_content in page_content:
        if each_content.text is not None:
            yield " " + each_content.text + "\n"
        else:
            continue


def extractChapters(url):
    """Extract the chapters listed at the e-book url."""
    try:
        html = getHtml(url)
        return __luoxiaChapters(html)
    except Exception as e:
        raise ValueError(f"{__file__}:\n {url}: {e}")


def extractContent(url):
    """Extract the content of a chapter."""
    try:
        html = getHtml(url)
        return __luoxiaContent(html)
    except Exception as e:
        raise ValueError(f"{__file__}:\n{url}: {e}")
def save(content, book_name=None, chapter_name=None):
    """Save content to disk."""
    if chapter_name:
        book_name = os.path.join(".\\book\\", book_name)
        if not os.path.exists(book_name):
            os.makedirs(book_name)  # create .\book\<book_name>, parents included
        if os.path.exists(os.path.join(book_name, chapter_name)):
            raise ValueError("The chapter already exists.")
        chapter_name = os.path.join(book_name, chapter_name + '.txt')
        with open(chapter_name, 'w', encoding='utf-8') as fd:
            for co in content:
                if co:
                    co = str(co + '\n')
                    fd.write(co)
                else:
                    continue
    else:
        file_name = os.path.join(".\\temp", content[:10] + '.txt')
        with open(file_name, 'w', encoding='utf-8') as fd:
            fd.write(content)


def download(chapter_link):
    """Download one chapter by url and save it."""
    while True:
        try:
            save(extractContent(chapter_link), book_name='gcd',
                 chapter_name=chapter_link[-8:])
        except Exception as e:
            raise ValueError(f"{__file__}:\n{e}")
        return True
def intoQueue(in_queue, links):
    """Put every chapter link into the queue, followed by a sentinel."""
    for link in links:
        in_queue.put(link)
    in_queue.put(endQ())


def exitQueue(out_queue, in_queue):
    """Worker: take links out of the queue and download them."""
    while True:
        in_link = in_queue.get()
        if isinstance(in_link, endQ):
            in_queue.put(endQ())  # re-queue the sentinel so the other workers stop too
            return 'Stop!'
        print(in_link)
        download(chapter_link=in_link)
        out_queue.put(in_link)
def updateCanva(widget, out_queue):
    """Update the canvas progress-bar display."""
    canvas = tk.Canvas(widget, width=280, height=10, bg="white")
    fill_line = canvas.create_rectangle(2, 2, 0, 10, fill="green")
    canvas.pack()
    # pixels per finished link; in_queue is read as a module-level global here,
    # so its size at this moment (links plus the sentinel) sets the scale
    count = 280 / in_queue.qsize()
    while True:
        canvas.coords(fill_line, (0, 0, out_queue.qsize() * count, 10))
        if out_queue.qsize() * count >= 280:
            return 'Run Stop!'
        else:
            widget.update()
root = tk.Tk()
root.title('download')  # title is a method; assigning to it would not set the window title
in_queue = queue.Queue()
out_queue = queue.Queue()
luoxia = 'https://www.luoxia.com/guichui'
intoQueue(in_queue, [link[0] for link in extractChapters(luoxia)])
work1 = threading.Thread(target=exitQueue, name='work1',
                         daemon=True, args=(out_queue, in_queue))
work2 = threading.Thread(target=exitQueue, name='work2',
                         daemon=True, args=(out_queue, in_queue))
work3 = threading.Thread(target=exitQueue, name='work3',
                         daemon=True, args=(out_queue, in_queue))
work4 = threading.Thread(target=exitQueue, name='work4',
                         daemon=True, args=(out_queue, in_queue))
work5 = threading.Thread(target=updateCanva, name='work5',
                         daemon=True, args=(root, out_queue))
works = [work1, work2, work3, work4, work5]
for work in works:
    work.start()
root.update()
root.mainloop()