import os
import requests
from bs4 import BeautifulSoup
def get_htmls(url, html_numbers):  # Build the list of thread URLs
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Grab up to html_numbers thread links; their href suffixes are used to build the URLs
    out_list = soup.find_all('a', class_="j_th_tit", limit=html_numbers)
    base_url = 'https://tieba.baidu.com/p/'  # common prefix of Tieba thread URLs
    html_list = []
    for each in out_list:
        # href looks like '/p/1234567890'; keep only the numeric thread id
        html_list.append(base_url + each.get('href').split('/')[2])
    print('get_htmls finished')
    return html_list
def get_pictures(html_list):  # Collect the <img> tags from each thread page
    pictures_list = []
    for each in html_list:
        response = requests.get(each)
        soup = BeautifulSoup(response.text, 'html.parser')
        pictures_list += soup.find_all('img', class_='BDE_Image')
    print('get_pictures finished')
    return pictures_list
def download_picture(download_file, html_numbers):  # Main routine: download images from the threads
    os.makedirs(download_file, exist_ok=True)  # don't crash if the folder already exists
    os.chdir(download_file)
    url = 'https://tieba.baidu.com/f?ie=utf-8&kw=%E6%B5%B7%E8%B4%BC%E7%8E%8B'  # One Piece (海贼王) Tieba front page
    html_list = get_htmls(url, html_numbers)
    pictures_list = get_pictures(html_list)
    for i, each in enumerate(pictures_list, start=1):
        img_html = requests.get(each['src'])
        path = str(i) + '.jpg'  # the tags carry no name attribute, so images are simply numbered
        with open(path, 'wb') as f:
            f.write(img_html.content)
if __name__ == '__main__':
    download_picture('Picture_test', 10)  # set the download folder name and how many threads to scrape
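
# Note (assumption, not part of the original script): tieba.baidu.com may serve a
# stripped-down page to clients without a browser-like User-Agent header, in which
# case the 'j_th_tit' / 'BDE_Image' selectors above match nothing. A minimal sketch
# of passing such a header, using the standard requests keyword argument:
#
#     HEADERS = {'User-Agent': 'Mozilla/5.0'}
#     response = requests.get(url, headers=HEADERS)
#
# The same headers argument would be added to every requests.get call above.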