Using a thread pool in a web crawler is a good choice, and here is a comparison example.
This assumes you already know the basics of web crawling (you can look them up online); for the finer details, work them out yourself or send me a private message.
1. Regular approach
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import os
import re

article_numbers = input('Enter the article number: ')
url = 'https://zhuanlan.zhihu.com/p/' + article_numbers

def get_one_page(url):
    try:
        # The User-Agent header must be added, otherwise the request fails
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'}
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Return the HTML text of the page
            return response.text
        return None
    except RequestException:
        return None

def get_image_url(html):
    soup = BeautifulSoup(html, 'lxml')
    # Select all <figure> nodes; each one wraps a single image
    items = soup.select('figure')
    return items

def save_image(item, i):
    try:
        # Switch to the directory where the images are saved
        os.chdir('D:\\Desktop\\math and Program\\发明tools')
        # Build the regex that extracts the real image URL from the <noscript> block
        pattern = re.compile('<noscript>.*?data-original="(.*?)".*?</noscript>', re.S)
        # item is a bs4 Tag object, so convert it to a string before matching
        image_url = re.search(pattern, str(item))
        r = requests.get(image_url.group(1))
        if r.status_code == 200:
            file_path = str(i) + '.jpg'
            with open(file_path, 'wb') as f:
                f.write(r.content)
    except requests.ConnectionError:
        print('Failed to save image')

def main():
    html = get_one_page(url)
    image_urls = get_image_url(html)
    i = 0
    for image_url in image_urls:
        i += 1
        save_image(image_url, i)
        print('Saved wallpaper number', i)

if __name__ == "__main__":
    main()
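To make the comparison with the thread-pool version below fair, the single-threaded run can be timed the same way. A minimal sketch: replace the entry point above with the following (only time from the standard library is added, nothing else is assumed):

import time

if __name__ == "__main__":
    t1 = time.time()
    main()  # single-threaded download
    print('Single-threaded elapsed time:', time.time() - t1)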
2. Using a thread pool
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import os
import re
from multiprocessing.dummy import Pool  # thread-based Pool with the multiprocessing API
import time

url1 = 'https://zhuanlan.zhihu.com/p/133296596'

def init():
    path = r'D:\Desktop\project\python\content'
    # os.mkdir(path)  # create the folder on the first run
    os.chdir(path)    # switch to the target directory

def get_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'}
        response = requests.get(url, headers=headers)  # send the GET request
        if response.status_code == 200:
            return response.text  # return the HTML text of the page
        return None
    except RequestException:
        return None

def split_list(lst):
    # Split lst into 5 chunks of (almost) equal size, one chunk per thread
    n = len(lst)
    base_size = n // 5
    remainder = n % 5
    result = []
    start = 0
    for i in range(5):
        size = base_size
        if remainder > 0:
            size += 1
            remainder -= 1
        end = start + size
        result.append(lst[start:end])
        start = end
    return result

def page_analysis(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
        items = soup.select('figure')
        # Regex that extracts the real image URL from the <noscript> block
        pattern = re.compile('<noscript>.*?data-original="(.*?)".*?</noscript>', re.S)
        urls = []
        for item in items:
            url = re.search(pattern, str(item))
            urls.append(url)
        return split_list(urls)
    except RequestException:
        return None

def download(urls, pool_name):
    i = 0
    for url in urls:
        i += 1
        r = requests.get(url.group(1))
        file_path = pool_name + str(i) + '.jpg'
        if r.status_code == 200:
            with open(file_path, 'wb') as f:
                f.write(r.content)
        print('Thread', pool_name, 'saved image', i)

def main(url):
    init()
    html = get_page(url)
    if html:
        image_urls = page_analysis(html)
        if image_urls:
            pool_names = ['a', 'b', 'c', 'd', 'e']
            tasks = zip(image_urls, pool_names)
            pool = Pool(5)                 # create a pool of 5 threads
            pool.starmap(download, tasks)  # run the download tasks in parallel
            pool.close()                   # close the pool
            pool.join()                    # wait for all threads to finish
        else:
            print('Failed to extract image links')
    else:
        print('Failed to fetch the page')

if __name__ == "__main__":
    # url = input('Enter the URL: ')
    t1 = time.time()
    main(url1)
    cost_time = time.time() - t1
    print('Download finished, elapsed time: ' + str(cost_time))
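The same idea can also be written with concurrent.futures.ThreadPoolExecutor from the standard library: the executor hands each URL to an idle thread as it becomes free, so no manual split_list chunking is needed. A minimal sketch under that assumption; download_one and download_all are hypothetical helpers (not part of the code above), and image_urls is assumed to be a flat list of image URL strings:

from concurrent.futures import ThreadPoolExecutor
import requests

def download_one(task):
    # task is (index, image URL); hypothetical helper that saves one image
    i, image_url = task
    r = requests.get(image_url)
    if r.status_code == 200:
        with open(str(i) + '.jpg', 'wb') as f:
            f.write(r.content)

def download_all(image_urls):
    # 5 worker threads, mirroring Pool(5) above; list() forces iteration so the
    # call blocks until every download has finished and any exception surfaces
    with ThreadPoolExecutor(max_workers=5) as executor:
        list(executor.map(download_one, enumerate(image_urls, start=1)))

Because the executor schedules work dynamically, a thread that finishes a small image immediately picks up the next URL instead of idling until its fixed chunk is done.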
