from gevent import monkey
monkey.patch_all()  # patch blocking IO (sockets, sleep, ...) to run cooperatively; do this before the other imports

import gevent
from gevent.queue import Queue, Empty
import time
import json
import sys

import requests
from lxml import etree

sys.setrecursionlimit(1000000000)  # raise the recursion limit (the crawler itself does not recurse)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
url_list = []
with open('./lyys8.com.txt', 'r') as file:
    file_list = file.readlines()
for eachone in file_list:
    link = json.loads(eachone)
    url_list.append(link["move_url"])
    print(link["move_url"])
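# Note: each line of lyys8.com.txt is assumed to be a JSON object carrying a
# "move_url" key, e.g. a hypothetical sample line:
#   {"move_url": "https://www.lyys8.com/movie/12345.html"}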
start = time.time()
def crawler(index):
    process_id = 'Process-' + str(index)
    while True:
        try:
            # wait up to 100 s for the next URL; Empty means the queue has drained
            url = workQueue.get(timeout=100)
        except Empty:
            break
        try:
            r = requests.get(url, headers=headers, timeout=10).text
            html_obj = etree.HTML(r)
            movie_name = html_obj.xpath('//div[@class="main-ui-meta"]/h1/text()')[0]
            movie_type = html_obj.xpath('//div[@class="tags-body"]/a[1]/text()')[0]
            print(movie_name)
            print(movie_type)
            if "/tv/" in url:
                movie_type = "电视剧-" + movie_type  # TV series
            else:
                movie_type = "电影-" + movie_type  # movie
            dicts = {
                "domain_url": "https://www.lyys8.com/",
                "move_url": url,
                "movie_name": str(movie_name),
                "type": movie_type
            }
            print(url)
            print(dicts)
            with open("./log/www.lyys8.com" + process_id + ".txt", "a", encoding="utf-8") as f:
                # ensure_ascii=False keeps the Chinese fields readable in the log
                f.write(json.dumps(dicts, ensure_ascii=False) + "\n")
        except Exception as e:
            print(process_id, workQueue.qsize(), url, 'Error:', e)
def boss():
    for url in url_list:
        workQueue.put_nowait(url)
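# An alternative shutdown, closer to the "" sentinel check hinted at in the
# original crawler loop, is to have boss() also enqueue one marker per worker
# (a sketch, assuming the 10 workers spawned below) and have each crawler
# break when it pops "":
#
#     for _ in range(10):
#         workQueue.put_nowait("")
#
# Catching Empty in crawler() makes the sentinels unnecessary here.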
if __name__ == '__main__':
    workQueue = Queue(100000)
    gevent.spawn(boss).join()  # fill the queue completely before any worker starts
    jobs = []
    for i in range(10):
        jobs.append(gevent.spawn(crawler, i))
    gevent.joinall(jobs)
    end = time.time()
    print('Total time for the gevent + Queue coroutine crawler:', end - start)
    print('Main Ended')
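# Practical note: the crawler appends results under ./log/ but never creates
# that directory; if it is missing, every write fails and is swallowed by the
# except clause. A minimal guard (an addition, not in the original script) to
# run before spawning the workers:
#
#     import os
#     os.makedirs("./log", exist_ok=True)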