As far as crawling speed goes, the data-parsing step is genuinely time-consuming.
Multi-threaded crawling is therefore a reasonable choice: it is not the optimal solution, but it does give a clear speedup (how much faster depends on the number of threads and on how finely the program splits up its data processing).
Prerequisites:
requests library
An HTTP client library for Python.
requests.get()
Fetches the response for a given URL.
code
def getHtml(url):
    header = {'User-Agent': 'Chrome/84.0.4147.89'}
    html = get(url, headers=header, timeout=7)
    time.sleep(.2)  # pause 0.2 s between requests
    html.encoding = 'utf-8'
    return html.text

def extractChapters(url):
    try:
        html = getHtml(url)
        return __luoxiaChapters(html)
    except Exception as e:
        raise ValueError(f"{__file__}:\n {url}: {e}")

def extractContent(url):
    try:
        html = getHtml(url)
        return __luoxiaContent(html)
    except Exception as e:
        raise ValueError(f"{__file__}:\n{url}: {e}")
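One detail worth noting: getHtml above never checks the HTTP status code, so an error page would be parsed like any other. A hedged variant (getHtmlChecked is not part of the original script) could use requests' raise_for_status():

def getHtmlChecked(url):
    """Variant of getHtml that raises on 4xx/5xx responses."""
    header = {'User-Agent': 'Chrome/84.0.4147.89'}
    resp = get(url, headers=header, timeout=7)
    resp.raise_for_status()  # raises requests.exceptions.HTTPError on a bad status
    resp.encoding = 'utf-8'
    return resp.text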
lxml library
A Python library for processing XML and HTML quickly and flexibly.
lxml.etree.HTML()
Parses a string of markup into an HTML document (element tree).
lxml.etree.HTML().xpath()
Selects nodes or text from the tree with an XPath expression.
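As a minimal, self-contained illustration of these two calls (the HTML snippet here is invented for the example; the real extraction functions follow below):

from lxml import etree

doc = etree.HTML('<ul class="book-list"><li><a href="/ch1">Chapter 1</a></li></ul>')
links = doc.xpath('//ul[@class="book-list"]/li/a/@href')    # -> ['/ch1']
titles = doc.xpath('//ul[@class="book-list"]/li/a/text()')  # -> ['Chapter 1']
print(links, titles)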
code
def __luoxiaChapters(html):
    html = etree.HTML(html)
    book_div = html.xpath('/html/body/div[2]/div')
    for book_list in book_div:
        if "book-list clearfix" == book_list.xpath('./@class')[0]:
            book_a = book_list.xpath('./ul/li//*')
            chapter_list = []
            for chapter in book_a:
                try:
                    chapter_link = chapter.xpath('./@href')[0]
                except Exception:
                    chapter_link = chapter.xpath('./@onclick')[0]
                    chapter_link = findall(r'.*"(.*?)"', chapter_link, S)[0]
                chapter_name = chapter.text
                chapter_list.append((chapter_link, chapter_name))
            return chapter_list

def __luoxiaContent(html):
    html = etree.HTML(html)
    page_content = html.xpath('.//div[@id="nr1"]/p')
    for each_content in page_content:
        if each_content.text is not None:
            yield " " + each_content.text + "\n"
        else:
            continue
*queue library
Implements multi-producer, multi-consumer queues. It is especially useful in threaded programming, where messages must be exchanged safely between multiple threads. The Queue class in this module implements all the required locking semantics.
We can put every crawled link into a queue (in_queue).
That makes it easy to see how links enter and leave the queue,
and it also prevents the same link from being fetched twice when several threads crawl at once.
The library functions I use are listed below for reference,
and a sentinel object is placed at the very end of the queue to mark completion.
queue.Queue()
Creates and initializes a FIFO queue object.
queue.Queue().put()
Puts an item (an object or element) into the queue.
queue.Queue().get()
Removes and returns the next item from the queue.
queue.Queue().qsize()
Returns the number of items in the queue.
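Here is a standalone sketch of the sentinel pattern described above (the toy links are made up; in the script the real links come from extractChapters):

import queue

class EndMark:              # stands in for endQ below
    pass

q = queue.Queue()
for link in ['/ch1', '/ch2', '/ch3']:
    q.put(link)
q.put(EndMark())            # sentinel marks the end of the work

while True:
    item = q.get()
    if isinstance(item, EndMark):
        break               # stop once the sentinel appears
    print('processing', item, '| remaining:', q.qsize())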
code
import queue

def intoQueue(in_queue, links):
    for link in links:
        in_queue.put(link)
    in_queue.put(endQ())  # sentinel marks the end of the link list

def exitQueue(out_queue, in_queue):
    while True:
        in_link = in_queue.get()
        if isinstance(in_link, endQ):
            in_queue.put(endQ())  # re-queue the sentinel so the other workers stop too
            return 'Stop!'
        print(in_link)
        download(chapter_link=in_link)
        out_queue.put(in_link)
in_queue = queue.Queue()
out_queue = queue.Queue()
luoxia = 'https://www.luoxia.com/guichui'
*threading library
This module builds a higher-level threading interface on top of the lower-level _thread module.
I used to pass my target function as Thread's group parameter (i.e. as the first positional argument), so the main process never actually spawned a thread: the function call was evaluated on the spot, ran to completion, and only then did execution continue, which means there was nothing left for the thread to run by the time start() was called.
I later realized I should use Thread's target parameter: passing the function to target is what lets run() invoke it once the thread is started, so the workers can all run at the same time.
As for thread contention (several threads touching the same object can race, but queue.Queue does its own locking, so the queue itself should be safe), the risk here seems very small.
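A small illustration of the difference described above (fetch and its sleep are made up for the example):

import threading
import time

def fetch(name):
    time.sleep(0.1)          # stand-in for a blocking download
    print(name, 'done')

# Mistake: fetch('bad') is called right here, in the main thread; its return
# value (None) is what gets passed to Thread, so start() has nothing to run.
t_bad = threading.Thread(fetch('bad'))
t_bad.start()                # a no-op thread

# Correct: hand the callable to target= and its arguments to args=;
# run() will call fetch('good') in the new thread once start() is called.
t_good = threading.Thread(target=fetch, args=('good',))
t_good.start()
t_good.join()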
threading.Thread()
Creates a thread object.
target
The callable object to be invoked by the run() method.
name
The thread's name.
args
The argument tuple for the target invocation.
daemon
Explicitly sets whether the thread runs as a daemon.
threading.Thread().start()
Starts the thread's activity.
It arranges for the object's run() method to be invoked in a separate thread of control.
code
work1 = threading.Thread(target=exitQueue, name='work1',
                         daemon=True, args=(out_queue, in_queue))
work2 = threading.Thread(target=exitQueue, name='work2',
                         daemon=True, args=(out_queue, in_queue))
work3 = threading.Thread(target=exitQueue, name='work3',
                         daemon=True, args=(out_queue, in_queue))
work4 = threading.Thread(target=exitQueue, name='work4',
                         daemon=True, args=(out_queue, in_queue))
work5 = threading.Thread(target=updateCanva, name='work5',
                         daemon=True, args=(root, out_queue))
works = [work1, work2, work3, work4, work5]
for work in works:
    work.start()
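Because every worker is created with daemon=True, none of them keeps the interpreter alive on its own; in this script the Tk mainloop() running in the main thread does that. If you instead wanted the main thread to wait explicitly for the download workers, a join-based sketch (reusing the works list above; not what the complete script below does) would be:

for work in works:
    work.start()
for work in works[:4]:   # the four exitQueue workers
    work.join()          # block until each download worker returns

One design caveat: tkinter widgets are generally meant to be driven from the thread that created them, so updating the Canvas from work5 is somewhat fragile even if it works here.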
Complete code
# -*- coding: utf-8 -*-
import threading
import tkinter as tk
import queue
import time
from requests import get
from lxml import etree
from re import findall, S
import os


class endQ(object):
    """End-of-queue sentinel mark."""
    pass


def getHtml(url):
    """Fetch the page at the given url."""
    header = {'User-Agent': 'Chrome/84.0.4147.89'}
    html = get(url, headers=header, timeout=7)
    time.sleep(.2)  # pause 0.2 s between requests
    html.encoding = 'utf-8'
    return html.text
def __luoxiaChapters(html):
    """Extract each chapter's name and link from the e-book page."""
    html = etree.HTML(html)
    book_div = html.xpath('/html/body/div[2]/div')
    # print(len(book_div))
    for book_list in book_div:
        if "book-list clearfix" == book_list.xpath('./@class')[0]:
            book_a = book_list.xpath('./ul/li//*')
            chapter_list = []
            for chapter in book_a:
                try:
                    chapter_link = chapter.xpath('./@href')[0]
                except Exception:
                    chapter_link = chapter.xpath('./@onclick')[0]
                    chapter_link = findall(r'.*"(.*?)"', chapter_link, S)[0]
                chapter_name = chapter.text
                chapter_list.append((chapter_link, chapter_name))
            return chapter_list


def __luoxiaContent(html):
    """Extract a www.luoxia.com article.

    :param html: the chapter page's HTML text
    :return: each paragraph's content --> generator
    """
    html = etree.HTML(html)
    page_content = html.xpath('.//div[@id="nr1"]/p')
    for each_content in page_content:
        if each_content.text is not None:
            yield " " + each_content.text + "\n"
        else:
            continue


def extractChapters(url):
    """Extract the chapters listed at the e-book url."""
    try:
        html = getHtml(url)
        return __luoxiaChapters(html)
    except Exception as e:
        raise ValueError(f"{__file__}:\n {url}: {e}")


def extractContent(url):
    """Extract the content of a chapter."""
    try:
        html = getHtml(url)
        return __luoxiaContent(html)
    except Exception as e:
        raise ValueError(f"{__file__}:\n{url}: {e}")
def save(content, book_name=None, chapter_name=None):
    """Save content to disk."""
    if chapter_name:
        book_name = os.path.join(".\\book\\", book_name)
        if not os.path.exists(book_name):
            os.makedirs(book_name)  # create .\book\<book_name>, parents included
        if os.path.exists(os.path.join(book_name, chapter_name)):
            raise ValueError("The chapter already exists.")
        chapter_name = os.path.join(book_name, chapter_name + '.txt')
        with open(chapter_name, 'w', encoding='utf-8') as fd:
            for co in content:
                if co:
                    co = str(co + '\n')
                    fd.write(co)
                else:
                    continue
    else:
        file_name = os.path.join(".\\temp", content[:10] + '.txt')
        with open(file_name, 'w', encoding='utf-8') as fd:
            fd.write(content)


def download(chapter_link):
    """Download one chapter by url and save it."""
    while True:
        try:
            save(extractContent(chapter_link), book_name='gcd',
                 chapter_name=chapter_link[-8:])
        except Exception as e:
            raise ValueError(f"{__file__}:\n{e}")
        return True
def intoQueue(in_queue, links):
    """Put every chapter link into the queue, followed by a sentinel."""
    for link in links:
        in_queue.put(link)
    in_queue.put(endQ())


def exitQueue(out_queue, in_queue):
    """Worker: take links out of the queue and download them."""
    while True:
        in_link = in_queue.get()
        if isinstance(in_link, endQ):
            in_queue.put(endQ())  # re-queue the sentinel so the other workers stop too
            return 'Stop!'
        print(in_link)
        download(chapter_link=in_link)
        out_queue.put(in_link)
def updateCanva(widget, out_queue):
    """Update the canvas progress-bar display."""
    canvas = tk.Canvas(widget, width=280, height=10, bg="white")
    fill_line = canvas.create_rectangle(2, 2, 0, 10, fill="green")
    canvas.pack()
    # pixels per finished link; in_queue is read as a module-level global here,
    # so its size at this moment (links plus the sentinel) sets the scale
    count = 280 / in_queue.qsize()
    while True:
        canvas.coords(fill_line, (0, 0, out_queue.qsize() * count, 10))
        if out_queue.qsize() * count >= 280:
            return 'Run Stop!'
        else:
            widget.update()
root = tk.Tk()
root.title('download')  # title is a method; assigning to it would not set the window title
in_queue = queue.Queue()
out_queue = queue.Queue()
luoxia = 'https://www.luoxia.com/guichui'
intoQueue(in_queue, [link[0] for link in extractChapters(luoxia)])
work1 = threading.Thread(target=exitQueue, name='work1',
                         daemon=True, args=(out_queue, in_queue))
work2 = threading.Thread(target=exitQueue, name='work2',
                         daemon=True, args=(out_queue, in_queue))
work3 = threading.Thread(target=exitQueue, name='work3',
                         daemon=True, args=(out_queue, in_queue))
work4 = threading.Thread(target=exitQueue, name='work4',
                         daemon=True, args=(out_queue, in_queue))
work5 = threading.Thread(target=updateCanva, name='work5',
                         daemon=True, args=(root, out_queue))
works = [work1, work2, work3, work4, work5]
for work in works:
    work.start()
root.update()
root.mainloop()