from gevent import monkey
monkey.patch_all()  # patch blocking IO (sockets, sleep, ...) to run cooperatively; do this before the other imports

import gevent
from gevent.queue import Queue, Empty
import time
import json
import sys

import requests
from lxml import etree

sys.setrecursionlimit(1000000000)  # raise the recursion limit (the crawler itself does not recurse)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}
url_list = []
with open('./lyys8.com.txt', 'r') as file:
    file_list = file.readlines()
for eachone in file_list:
    link = json.loads(eachone)
    url_list.append(link["move_url"])
    print(link["move_url"])
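# Note: each line of lyys8.com.txt is assumed to be a JSON object carrying a
# "move_url" key, e.g. a hypothetical sample line:
#   {"move_url": "https://www.lyys8.com/movie/12345.html"}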
start = time.time()
def crawler(index):
    process_id = 'Process-' + str(index)
    while True:
        try:
            # wait up to 100 s for the next URL; Empty means the queue has drained
            url = workQueue.get(timeout=100)
        except Empty:
            break
        try:
            r = requests.get(url, headers=headers, timeout=10).text
            html_obj = etree.HTML(r)
            movie_name = html_obj.xpath('//div[@class="main-ui-meta"]/h1/text()')[0]
            movie_type = html_obj.xpath('//div[@class="tags-body"]/a[1]/text()')[0]
            print(movie_name)
            print(movie_type)
            if "/tv/" in url:
                movie_type = "电视剧-" + movie_type  # TV series
            else:
                movie_type = "电影-" + movie_type  # movie
            dicts = {
                "domain_url": "https://www.lyys8.com/",
                "move_url": url,
                "movie_name": str(movie_name),
                "type": movie_type
            }
            print(url)
            print(dicts)
            with open("./log/www.lyys8.com" + process_id + ".txt", "a", encoding="utf-8") as f:
                # ensure_ascii=False keeps the Chinese fields readable in the log
                f.write(json.dumps(dicts, ensure_ascii=False) + "\n")
        except Exception as e:
            print(process_id, workQueue.qsize(), url, 'Error:', e)
def boss():
    for url in url_list:
        workQueue.put_nowait(url)
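# An alternative shutdown, closer to the "" sentinel check hinted at in the
# original crawler loop, is to have boss() also enqueue one marker per worker
# (a sketch, assuming the 10 workers spawned below) and have each crawler
# break when it pops "":
#
#     for _ in range(10):
#         workQueue.put_nowait("")
#
# Catching Empty in crawler() makes the sentinels unnecessary here.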
if __name__ == '__main__':
    workQueue = Queue(100000)
    gevent.spawn(boss).join()  # fill the queue completely before any worker starts
    jobs = []
    for i in range(10):
        jobs.append(gevent.spawn(crawler, i))
    gevent.joinall(jobs)
    end = time.time()
    print('Total time for the gevent + Queue coroutine crawler:', end - start)
    print('Main Ended')
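# Practical note: the crawler appends results under ./log/ but never creates
# that directory; if it is missing, every write fails and is swallowed by the
# except clause. A minimal guard (an addition, not in the original script) to
# run before spawning the workers:
#
#     import os
#     os.makedirs("./log", exist_ok=True)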