I've recently been scraping some materials I need for my studies. With so much to fetch, the crawler was getting noticeably slow, so I tried to speed it up. Having just learned multithreading, I had a flash of inspiration and built a multithreaded crawler.
There isn't much else to say this time; if anything is unclear, just read the comments (●ˇ∀ˇ●)
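The script below follows the classic producer-consumer pattern: crawler threads pull URLs from one queue and push the fetched HTML onto a second queue, which parser threads drain. Stripped of the scraping details, the skeleton looks like this (a minimal sketch; the producer/consumer names and queue names are placeholders, not part of the real script):

import queue
import threading
import time

def producer(task_q: queue.Queue, result_q: queue.Queue):
    while True:
        task = task_q.get()              # blocks until a task is available
        result_q.put(f"done:{task}")     # stand-in for real work (an HTTP request below)

def consumer(result_q: queue.Queue):
    while True:
        print(result_q.get())            # stand-in for parsing/saving the result

task_q, result_q = queue.Queue(), queue.Queue()
for i in range(3):
    task_q.put(i)
threading.Thread(target=producer, args=(task_q, result_q), daemon=True).start()
threading.Thread(target=consumer, args=(result_q,), daemon=True).start()
time.sleep(1)                            # daemon threads die when the main thread exits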
import queue
import random
import threading
import requests
import time
import re
from fake_useragent import FakeUserAgent
from lxml import etree
# Get a random User-Agent string
ua = FakeUserAgent().random
print(ua)
# Build the list of page URLs to crawl (each Maoyan page shows 30 films)
urls = [
    f'https://www.maoyan.com/films?showType=3&offset={i*30}'
    for i in range(3)
]
# Request headers: disguise the script as a normal browser.
# Note: the Cookie below is tied to one browser session and will expire;
# if requests start failing, replace it with a fresh one from your own browser.
headers_ = {
    'User-Agent': ua,
    'Cookie': '__mta=142548493.1645077067393.1645078303210.1645078683163.21; uuid_n_v=v1; uuid=985E1D408FB511ECBAB6077BF2B540D7B6A7CAD4BAD6440B955714BDAE5F898A; _lxsdk_cuid=17f063bf5a7c8-0e48596ef8327f-230346c-1fa400-17f063bf5a7c8; _lxsdk=985E1D408FB511ECBAB6077BF2B540D7B6A7CAD4BAD6440B955714BDAE5F898A; _csrf=74e76c98c371318c396d8124a05b944c21e4d6606e4bc9f91ec8ed3dce8058f5; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1645077067,1645077074,1645161070,1645162206; __mta=142548493.1645077067393.1645162217688.1645162700788.23; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1645162713; _lxsdk_s=17f0b3dbede-f59-1a3-81a%7C%7C30',
    'Host': 'www.maoyan.com',
    'Referer': 'https://verify.maoyan.com/'
}
# Fetch the response body for a single URL
def craw(url):
    response = requests.get(url, headers=headers_)
    # Return the response body as text
    return response.text
# Parse the HTML of one fetched page
def parse(response):
    html_data = etree.HTML(response)
    # Movie titles
    movie_name_list = html_data.xpath('//dd/div[2]/a/text()')
    # Movie poster URLs (taken from the img tags' data-src attribute)
    movie_img_list = re.findall('<img data-src="(.*?)" alt=".*?" />', response)
    # Pair each title with its poster URL
    return list(zip(movie_name_list, movie_img_list))
# Producer: take a URL off url_queue, fetch it, and push the HTML onto html_queue
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:
        # Get the next URL to fetch (blocks while the queue is empty)
        url = url_queue.get()
        html = craw(url)
        # Add the fetched page to html_queue
        html_queue.put(html)
        # Print the current thread's name and the remaining size of url_queue
        print(threading.current_thread().name, f"craw {url}", "url_queue.size", url_queue.qsize())
        time.sleep(random.randint(1, 2))
# Consumer: take HTML off html_queue, parse it, and write the results to a file
def do_parse(html_queue: queue.Queue, file):
    while True:
        # Get the next fetched page from html_queue (blocks while it is empty)
        html = html_queue.get()
        # Parse out the (title, poster) pairs
        results = parse(html)
        for result in results:
            print(result)
            file.write(str(result) + '\n')
        # Flush so results are not lost when the script is killed with Ctrl+C
        file.flush()
        print(threading.current_thread().name, "result.size", len(results), "html_queue", html_queue.qsize())
        time.sleep(random.randint(1, 2))
if __name__ == '__main__':
    # Producer queue: URLs waiting to be fetched
    url_queue = queue.Queue()
    # Consumer queue: fetched pages waiting to be parsed and saved
    html_queue = queue.Queue()
    # Seed the producer queue with every URL
    for url in urls:
        url_queue.put(url)
    # Start the producer (crawler) threads
    for idx in range(4):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}")
        t.start()
    file = open("shuju.txt", "w", encoding="utf-8")
    # Start the consumer (parser) threads
    for index in range(3):
        t = threading.Thread(target=do_parse, args=(html_queue, file), name=f"parse{index}")
        t.start()
    # The worker loops never exit, so stop the script with Ctrl+C once the
    # output file stops growing (see the clean-shutdown sketch below).
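One caveat with the version above: the worker loops never terminate, so the script runs until you kill it, and the output file is never closed. Below is a minimal sketch of a clean-shutdown variant using sentinel values ("poison pills"); it reuses the craw() and parse() functions and the urls list from above, and the thread counts (4 crawlers, 3 parsers) mirror the original. The do_craw_v2/do_parse_v2 names are placeholders for illustration.

# Clean-shutdown sketch: each thread exits when it pulls a sentinel,
# so the whole program ends once every page has been fetched and parsed.
SENTINEL = None

def do_craw_v2(url_queue, html_queue):
    while True:
        url = url_queue.get()
        if url is SENTINEL:
            break                              # no more URLs: this crawler is done
        html_queue.put(craw(url))

def do_parse_v2(html_queue, file):
    while True:
        html = html_queue.get()
        if html is SENTINEL:
            break                              # no more pages: this parser is done
        for result in parse(html):
            file.write(str(result) + '\n')

url_queue, html_queue = queue.Queue(), queue.Queue()
for url in urls:
    url_queue.put(url)
for _ in range(4):                             # one sentinel per crawler thread
    url_queue.put(SENTINEL)
with open("shuju.txt", "w", encoding="utf-8") as file:
    craw_threads = [threading.Thread(target=do_craw_v2, args=(url_queue, html_queue)) for _ in range(4)]
    parse_threads = [threading.Thread(target=do_parse_v2, args=(html_queue, file)) for _ in range(3)]
    for t in craw_threads + parse_threads:
        t.start()
    for t in craw_threads:
        t.join()                               # wait until every page is fetched
    for _ in range(3):                         # then tell every parser to stop
        html_queue.put(SENTINEL)
    for t in parse_threads:
        t.join()                               # file closes once all parsers finish

Compared with simply marking the workers as daemon threads, sentinels guarantee that every queued page is fully parsed and written out before the process exits.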