Python multi-coroutine crawler (gevent)

The script below reads movie-page URLs from a JSON-lines file, crawls them with ten gevent greenlets sharing a work queue, and appends the parsed results to per-worker log files.

from gevent import monkey

monkey.patch_all()  # patch the standard library so blocking IO yields to other greenlets

import json
import os
import time

import gevent
import requests
from gevent.queue import Queue, Empty
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

url_list = []
with open('./lyys8.com.txt', 'r') as file:
    file_list = file.readlines()
    for eachone in file_list:
        link = json.loads(eachone)
        url_list.append(link["move_url"])
        print(link["move_url"])

start = time.time()  # timing starts after the URL list is loaded


def crawler(index):
    process_id = 'Process-' + str(index)
    while True:
        try:
            url = workQueue.get(timeout=100)
        except Empty:
            break  # no task arrived within the timeout: the queue is drained, so this worker exits
        try:
            r = requests.get(url, headers=headers, timeout=10).text
            html_obj = etree.HTML(r)
            movie_name = html_obj.xpath('//div[@class="main-ui-meta"]/h1/text()')[0]
            movie_type = html_obj.xpath('//div[@class="tags-body"]/a[1]/text()')[0]
            print(movie_name)
            print(movie_type)
            # URLs containing "/tv/" point at TV series; everything else is a movie
            if "/tv/" in url:
                movie_type = "电视剧-" + movie_type
            else:
                movie_type = "电影-" + movie_type
            dicts = {
                "domain_url": "https://www.lyys8.com/",
                "move_url": url,
                "movie_name": str(movie_name),
                "type": movie_type,
            }
            print(url)
            print(dicts)
            # each worker appends to its own file, so no locking is needed
            with open("./log/www.lyys8.com" + process_id + ".txt", "a", encoding="utf-8") as f:
                f.write(json.dumps(dicts, ensure_ascii=False) + "\n")
        except Exception as e:
            print(process_id, workQueue.qsize(), url, 'Error:', e)


def boss():
    # producer: preload every URL into the work queue before the workers start
    for url in url_list:
        workQueue.put_nowait(url)


if __name__ == '__main__':
    os.makedirs("./log", exist_ok=True)  # make sure the output directory exists
    workQueue = Queue(100000)
    gevent.spawn(boss).join()  # fill the queue completely before spawning the workers
    jobs = []
    for i in range(10):  # ten crawler greenlets consume the queue concurrently
        jobs.append(gevent.spawn(crawler, i))
    gevent.joinall(jobs)

    end = time.time()
    print('Total time for the gevent + Queue coroutine crawler:', end - start)
    print('Main Ended')
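
For reference, each line of lyys8.com.txt is expected to be a standalone JSON object containing at least a move_url field, which is what the loader at the top parses. A hypothetical sample line (the URL path is made up for illustration):

{"move_url": "https://www.lyys8.com/movie/12345/"}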



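Since every worker appends to its own file under ./log/, the results can be merged afterwards. A minimal sketch, assuming the per-worker filenames follow the pattern built in crawler above:

import glob
import json

records = []
for path in glob.glob("./log/www.lyys8.comProcess-*.txt"):
    with open(path, encoding="utf-8") as f:
        for line in f:  # one JSON record per line, mirroring the crawler's output format
            records.append(json.loads(line))
print(len(records), "pages crawled")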

