import requests
import re
from bs4 import BeautifulSoup
import os
import urllib
import threading #导入多线程库
def getTEXT(url):
    """Fetch *url* and return its decoded HTML text, or the string '错误' on failure.

    Sends a minimal browser-like User-Agent, raises on HTTP error status,
    and lets `apparent_encoding` fix mis-declared charsets (common on
    Chinese sites). Any request-level failure is reported via the
    sentinel return value rather than an exception, matching the callers.
    """
    try:
        kv = {"user-agent": "mozilla/5.0"}
        # timeout prevents a dead server from hanging a worker thread forever
        r = requests.get(url, headers=kv, timeout=10)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrow catch: only network/HTTP problems, not programming errors
        return '错误'
def get_erery_site_url(text):
    """Parse a listing page and return parallel lists (urls, names).

    For every ``<h2><a href=...>`` on the page, collects the anchor's
    ``href`` and its ``.contents`` list (used later as a title).
    Headings without a linked anchor are skipped instead of crashing
    (the original raised AttributeError on ``None.attrs``).
    """
    every_urls = []
    every_names = []
    soup = BeautifulSoup(text, "html.parser")
    for heading in soup.find_all('h2'):
        anchor = heading.find('a')
        if anchor is None or 'href' not in anchor.attrs:
            continue  # e.g. a plain section heading — nothing to scrape
        every_urls.append(anchor.attrs['href'])
        every_names.append(anchor.contents)
    return every_urls, every_names
def congwangzhi_natudewangzhi(wangzhi):
    """Fetch a post page *wangzhi* and return the image URLs in its #post_content div.

    Each ``<p>`` contributes either the ``href`` of its first ``<a>`` (the
    site wraps full-size images in links), or, when no such link exists,
    the ``src`` of every ``<img>`` it contains.
    """
    shuju = getTEXT(wangzhi)
    chuli = BeautifulSoup(shuju, "html.parser")
    content = chuli.find(id="post_content")
    tupianwangzhis = []
    for paragraph in content.find_all('p'):
        try:
            # prefer the wrapping link's target (usually the full-size image)
            tupianwangzhis.append(paragraph.a.attrs['href'])
        except (AttributeError, KeyError):
            # no <a> (or it lacks href) — fall back to the raw <img> sources
            for img in paragraph.find_all('img'):
                tupianwangzhis.append(img.attrs['src'])
    return tupianwangzhis
def xiazaitupian(tupianwangzhis, path, biaoti):
    """Download every URL in *tupianwangzhis* into directory *path*.

    File names are ``biaoti[0]`` (the post title fragment) plus the last
    six characters of the image URL — a crude extension/uniqueness
    heuristic kept from the original design. A single failed download is
    skipped so it no longer aborts the whole post.
    """
    print(path + '图片开始下载,注意查看文件夹')
    os.makedirs(path, exist_ok=True)  # create the target folder when missing
    for tupianwangzhi in tupianwangzhis:
        try:
            tu = requests.get(tupianwangzhi,
                              headers={"user-agent": "mozilla/5.0"},
                              timeout=10)
            tu.raise_for_status()
        except requests.RequestException:
            continue  # dead image link — move on to the next one
        # os.path.join instead of hard-coded '\\' so the path is portable
        filename = biaoti[0] + tupianwangzhi[-6:]
        with open(os.path.join(path, filename), mode='wb') as obj:
            obj.write(tu.content)
def pachong(i):
    """Scrape listing page *i* of vooc.net and download every post's images.

    Worker body for one thread: fetches the listing page, extracts each
    post's URL and title, then downloads that post's images into *path*.
    """
    url = 'https://www.vooc.net/page/' + str(i)
    a = getTEXT(url)
    wangzhis, biaotis = get_erery_site_url(a)
    for biaoti, wangzhi in zip(biaotis, wangzhis):
        tupianwangzhis = congwangzhi_natudewangzhi(wangzhi)
        path = 'E:\\tupian'  # TODO: make the download directory configurable
        xiazaitupian(tupianwangzhis, path, biaoti)
if __name__ == "__main__":
    # Guard the entry point so importing this module no longer prompts
    # the user and spawns threads as a side effect.
    page_count = int(input('你想爬取的页数:'))
    for x in range(1, page_count + 1):
        # one thread per listing page: the work is I/O-bound, so the
        # network waits overlap even under the GIL
        threading.Thread(target=pachong, args=(x,)).start()
# Learning note: a multithreaded crawler that downloads images.
# (Blog-footer residue from the source page: "最新推荐文章于 2024-03-14 22:21:44 发布")