I've recently been scraping some materials I need for my studies. With so much to fetch, the crawler was getting noticeably slow, so I tried to speed it up. Having just learned multithreading, I had a flash of inspiration and built a multithreaded crawler.
There isn't much else to say this time; if anything is unclear, just read the comments (●ˇ∀ˇ●)
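The script below follows the classic producer-consumer pattern: crawler threads pull URLs from one queue and push the fetched HTML onto a second queue, which parser threads drain. Stripped of the scraping details, the skeleton looks like this (a minimal sketch; the producer/consumer names and queue names are placeholders, not part of the real script):

import queue
import threading
import time

def producer(task_q: queue.Queue, result_q: queue.Queue):
    while True:
        task = task_q.get()              # blocks until a task is available
        result_q.put(f"done:{task}")     # stand-in for real work (an HTTP request below)

def consumer(result_q: queue.Queue):
    while True:
        print(result_q.get())            # stand-in for parsing/saving the result

task_q, result_q = queue.Queue(), queue.Queue()
for i in range(3):
    task_q.put(i)
threading.Thread(target=producer, args=(task_q, result_q), daemon=True).start()
threading.Thread(target=consumer, args=(result_q,), daemon=True).start()
time.sleep(1)                            # daemon threads die when the main thread exits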
import queue
import random
import threading
import requests
import time
import re
from fake_useragent import FakeUserAgent
from lxml import etree
# Get a random User-Agent string
ua = FakeUserAgent().random
print(ua)
# Build the list of page URLs to crawl (each Maoyan page shows 30 films)
urls = [
    f'https://www.maoyan.com/films?showType=3&offset={i*30}'
    for i in range(3)
]
# Request headers: disguise the script as a normal browser.
# Note: the Cookie below is tied to one browser session and will expire;
# if requests start failing, replace it with a fresh one from your own browser.
headers_ = {
    'User-Agent': ua,
    'Cookie': '__mta=142548493.1645077067393.1645078303210.1645078683163.21; uuid_n_v=v1; uuid=985E1D408FB511ECBAB6077BF2B540D7B6A7CAD4BAD6440B955714BDAE5F898A; _lxsdk_cuid=17f063bf5a7c8-0e48596ef8327f-230346c-1fa400-17f063bf5a7c8; _lxsdk=985E1D408FB511ECBAB6077BF2B540D7B6A7CAD4BAD6440B955714BDAE5F898A; _csrf=74e76c98c371318c396d8124a05b944c21e4d6606e4bc9f91ec8ed3dce8058f5; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1645077067,1645077074,1645161070,1645162206; __mta=142548493.1645077067393.1645162217688.1645162700788.23; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1645162713; _lxsdk_s=17f0b3dbede-f59-1a3-81a%7C%7C30',
    'Host': 'www.maoyan.com',
    'Referer': 'https://verify.maoyan.com/'
}
# Fetch the response body for a single URL
def craw(url):
    response = requests.get(url, headers=headers_)
    # Return the response body as text
    return response.text
# Parse the HTML of one fetched page
def parse(response):
    html_data = etree.HTML(response)
    # Movie titles
    movie_name_list = html_data.xpath('//dd/div[2]/a/text()')
    # Movie poster URLs (taken from the img tags' data-src attribute)
    movie_img_list = re.findall('<img data-src="(.*?)" alt=".*?" />', response)
    # Pair each title with its poster URL
    return list(zip(movie_name_list, movie_img_list))
# Producer: take a URL off url_queue, fetch it, and push the HTML onto html_queue
def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:
        # Get the next URL to fetch (blocks while the queue is empty)
        url = url_queue.get()
        html = craw(url)
        # Add the fetched page to html_queue
        html_queue.put(html)
        # Print the current thread's name and the remaining size of url_queue
        print(threading.current_thread().name, f"craw {url}", "url_queue.size", url_queue.qsize())
        time.sleep(random.randint(1, 2))
# Consumer: take HTML off html_queue, parse it, and write the results to a file
def do_parse(html_queue: queue.Queue, file):
    while True:
        # Get the next fetched page from html_queue (blocks while it is empty)
        html = html_queue.get()
        # Parse out the (title, poster) pairs
        results = parse(html)
        for result in results:
            print(result)
            file.write(str(result) + '\n')
        # Flush so results are not lost when the script is killed with Ctrl+C
        file.flush()
        print(threading.current_thread().name, "result.size", len(results), "html_queue", html_queue.qsize())
        time.sleep(random.randint(1, 2))
if __name__ == '__main__':
    # Producer queue: URLs waiting to be fetched
    url_queue = queue.Queue()
    # Consumer queue: fetched pages waiting to be parsed and saved
    html_queue = queue.Queue()
    # Seed the producer queue with every URL
    for url in urls:
        url_queue.put(url)
    # Start the producer (crawler) threads
    for idx in range(4):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}")
        t.start()
    file = open("shuju.txt", "w", encoding="utf-8")
    # Start the consumer (parser) threads
    for index in range(3):
        t = threading.Thread(target=do_parse, args=(html_queue, file), name=f"parse{index}")
        t.start()
    # The worker loops never exit, so stop the script with Ctrl+C once the
    # output file stops growing (see the clean-shutdown sketch below).
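One caveat with the version above: the worker loops never terminate, so the script runs until you kill it, and the output file is never closed. Below is a minimal sketch of a clean-shutdown variant using sentinel values ("poison pills"); it reuses the craw() and parse() functions and the urls list from above, and the thread counts (4 crawlers, 3 parsers) mirror the original. The do_craw_v2/do_parse_v2 names are placeholders for illustration.

# Clean-shutdown sketch: each thread exits when it pulls a sentinel,
# so the whole program ends once every page has been fetched and parsed.
SENTINEL = None

def do_craw_v2(url_queue, html_queue):
    while True:
        url = url_queue.get()
        if url is SENTINEL:
            break                              # no more URLs: this crawler is done
        html_queue.put(craw(url))

def do_parse_v2(html_queue, file):
    while True:
        html = html_queue.get()
        if html is SENTINEL:
            break                              # no more pages: this parser is done
        for result in parse(html):
            file.write(str(result) + '\n')

url_queue, html_queue = queue.Queue(), queue.Queue()
for url in urls:
    url_queue.put(url)
for _ in range(4):                             # one sentinel per crawler thread
    url_queue.put(SENTINEL)
with open("shuju.txt", "w", encoding="utf-8") as file:
    craw_threads = [threading.Thread(target=do_craw_v2, args=(url_queue, html_queue)) for _ in range(4)]
    parse_threads = [threading.Thread(target=do_parse_v2, args=(html_queue, file)) for _ in range(3)]
    for t in craw_threads + parse_threads:
        t.start()
    for t in craw_threads:
        t.join()                               # wait until every page is fetched
    for _ in range(3):                         # then tell every parser to stop
        html_queue.put(SENTINEL)
    for t in parse_threads:
        t.join()                               # file closes once all parsers finish

Compared with simply marking the workers as daemon threads, sentinels guarantee that every queued page is fully parsed and written out before the process exits.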