百度贴吧下载图片【可搜索】

最新推荐文章于 2021-05-30 04:49:01 发布

黑白神

最新推荐文章于 2021-05-30 04:49:01 发布

阅读量1.7k

点赞数

分类专栏：爬虫文章标签：百度贴吧

本文链接：https://blog.csdn.net/Brilliantstars/article/details/96177271

版权

爬虫专栏收录该内容

3 篇文章 0 订阅

订阅专栏

这个脚本的用法是：输入贴吧的名字，就可以下载该贴吧当前页的图片，并支持上一页、下一页翻页；也可以进入其中一条帖子，下载该帖子中的图片，同样支持翻页。
注：代码中的 http_list 是我从 https://www.xicidaili.com/ 这个网站上获取的代理IP，需要的话可以去看一看。这里加代理是为了不让自己电脑的IP被封：如果一直使用同一个IP去获取数据，容易被识别为爬虫，所以要加代理。


import requests,re,os
import urllib.request
import random

# Proxy endpoints (host:port) collected by the author from a public proxy list.
_PROXY_ENDPOINTS = (
    '218.59.193.14:43669',
    '115.211.45.41:9000',
    '60.13.42.110:9999',
    '117.90.4.137:9999',
    '120.83.100.237:9999',
    '122.243.8.73:9000',
    '163.204.241.106:9999',
    '112.85.168.5:9999',
    '60.13.42.31:9999',
    '59.62.26.145:9000',
    '119.180.128.167:8060',
    '117.63.1.137:9999',
    '118.180.166.195:8060',
    '60.13.42.82:9999',
    '180.121.132.168:3128',
)

# One ``proxies`` mapping per endpoint, in the shape expected by requests.
# NOTE(review): every mapping only carries an 'http' key, while the requests
# issued by this script target https:// URLs -- confirm the proxy is really
# being applied (requests selects the proxy by URL scheme).
http_list = [{'http': f'http://{endpoint}'} for endpoint in _PROXY_ENDPOINTS]

# A single proxy is drawn once at import time and reused for every request.
http = random.choice(http_list)

# Browser-like request headers so the requests do not advertise a script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}
def get_data(name,page):
    """Fetch one listing page of a Tieba forum and extract images and threads.

    Args:
        name: Forum name, sent as the ``kw`` query parameter.
        page: Value of the ``pn`` query parameter (the callers in this file
            move it in steps of 50 per listing page).

    Returns:
        A tuple ``(img_list, lists)``. ``img_list`` holds every ``bpic="..."``
        attribute value found in the page. ``lists`` holds one
        ``[number, href, title, reply_count]`` entry per thread, numbered
        from 1; ``reply_count`` is the matched digit string.
    """
    # Let requests build the query string so that characters such as '&' or
    # '#' in the forum name cannot corrupt the URL.
    response = requests.get(
        url='https://tieba.baidu.com/f',
        params={'kw': name, 'ie': 'utf-8', 'pn': page},
        headers=headers,
        proxies=http,
    ).text
    title_list = re.findall(r'<a rel="noreferrer" href="(.*?)" title="(.*?)"', response)
    img_list = re.findall(r'bpic="(.*?)"', response)
    reply_list = re.findall(r'title="回复">(\d*?)</span>', response)
    # Titles and reply counts are paired purely by position; zip() stops at
    # the shorter of the two lists.
    lists = [
        [number, href, title, replies]
        for number, ((href, title), replies)
        in enumerate(zip(title_list, reply_list), start=1)
    ]
    return img_list, lists
def page_change(page,caozuo):
    """Return the listing offset that results from a paging command.

    Args:
        page: Current offset (a multiple of 50; offset 0 is the first page).
        caozuo: Menu choice -- 2 goes back one page, 3 goes forward one page,
            4 prompts on stdin for a 1-based page number to jump to. Any
            other value leaves the offset unchanged.

    Returns:
        The new offset, or ``page`` unchanged when the move is not possible
        or the typed page number is invalid.
    """
    if caozuo == 2:
        if page - 50 >= 0:
            page -= 50
        else:
            print('已经是第一页了!')
    elif caozuo == 3:
        page += 50
    elif caozuo == 4:
        try:
            new_page = int(input('请输入要跳转的页数:')) - 1
        except ValueError:
            # Non-numeric input used to crash the whole session.
            print('操作错误')
            return page
        if new_page < 0:
            # Page 0 or a negative page would produce a negative offset.
            print('操作错误')
        elif page == new_page * 50:
            print('你已经在该页数的页面了')
        else:
            page = new_page * 50
    return page
def download(list,name,page):
    """Download every image URL of a forum listing page to local disk.

    Files are written as ``0.jpg``, ``1.jpg``, ... into
    ``D:\\贴吧爬虫图片\\<name>\\第<n>页`` where ``n`` is ``page // 50 + 1``.
    The directory is created when missing. A failed download is reported
    and skipped; its number is not reused.

    Args:
        list: Iterable of image URLs.
        name: Forum name, used as a directory component.
        page: Listing offset the URLs were taken from.
    """
    # Escaped backslashes: the original '\贴' / '\第' relied on invalid
    # escape sequences, which newer Python versions warn about.
    path = f'D:\\贴吧爬虫图片\\{name}\\第{page // 50 + 1}页'
    os.makedirs(path, exist_ok=True)
    for shu, i in enumerate(list):
        print(i)
        try:
            urllib.request.urlretrieve(i, f'{path}\\{shu}.jpg')
        except Exception:
            # Best effort: keep going on any download error, but let
            # KeyboardInterrupt / SystemExit through so Ctrl+C still works.
            print('下载失败')
        else:
            print(f'{shu}.jpg  下载成功')
def chaxun(list):
    """Print one summary line per thread entry: number, reply count, title.

    Args:
        list: Iterable of indexable rows where index 0 is the display
            number, index 2 the title and index 3 the reply count.
    """
    for row in list:
        number, replies, title = row[0], row[3], row[2]
        print(f'[{number}]  回帖数:{replies} {title}')
def into_detail(list,id,page):
    """Fetch one page of a thread and extract its page count and image URLs.

    Args:
        list: Thread entries whose index 1 is the thread href (path part).
        id: Index of the chosen thread inside ``list``.
        page: Value of the thread's ``pn`` query parameter.

    Returns:
        A tuple ``(max_page, photo_list)``. ``max_page`` is the first
        non-empty digit string found in a ``<span class="red">`` element,
        or ``'1'`` when the page contains none. ``photo_list`` holds the
        ``BDE_Image`` source URLs without duplicates, in first-seen order.
    """
    href = list[id][1]
    url = f'https://tieba.baidu.com{href}?pn={page}'
    response = requests.get(url=url, headers=headers, proxies=http).text

    # A page without the counter used to raise IndexError here; fall back to
    # a single page so the callers' int(max_page) keeps working.
    page_counts = re.findall(r'<span class="red">(\d*?)</span>', response)
    max_page = next((count for count in page_counts if count), '1')

    sized = re.findall(r'<img class="BDE_Image" src="(.*?)" size=".*?".*?>', response)
    loose = re.findall(r'<img class="BDE_Image".*?src="(.*?)" >', response)
    # The second pattern can capture trailing attributes, so keep only the
    # text before the first quote.
    urls = (match.split('"')[0] for match in sized + loose)
    # Both patterns can match the same <img> tag; drop the duplicates while
    # preserving order. (The builtin list() is shadowed by the parameter.)
    photo_list = [*dict.fromkeys(urls)]
    return max_page, photo_list
def detail_dl(list,name,title,detail_page):
    """Download every image URL of one thread page to local disk.

    Files are written as ``0.jpg``, ``1.jpg``, ... into
    ``D:\\贴吧爬虫图片\\<name>\\<title>\\第<detail_page>页``. The directory is
    created when missing. A failed download is reported and skipped; its
    number is not reused.

    Args:
        list: Sequence of image URLs.
        name: Forum name, used as a directory component.
        title: Thread title, used as a directory component after characters
            that are not allowed in Windows file names are replaced by ``_``.
        detail_page: Thread page number the URLs were taken from.
    """
    print(f'共有{len(list)}张图片')
    # Thread titles are free text; characters such as ? : * " < > | \ / and a
    # trailing dot or space cannot be part of a Windows directory name.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).rstrip(' .') or '_'
    # Escaped backslashes: the original '\贴' / '\第' relied on invalid
    # escape sequences, which newer Python versions warn about.
    path = f'D:\\贴吧爬虫图片\\{name}\\{safe_title}\\第{detail_page}页'
    os.makedirs(path, exist_ok=True)
    for shu, i in enumerate(list):
        print(i)
        try:
            urllib.request.urlretrieve(i, f'{path}\\{shu}.jpg')
        except Exception:
            # Best effort: keep going on any download error, but let
            # KeyboardInterrupt / SystemExit through so Ctrl+C still works.
            print('下载失败')
        else:
            print(f'{shu}.jpg  下载成功')
# Interactive entry point.
# Outer loop: ask for a forum name (typing 退出 ends the program).
# Middle loop: browse that forum's listing pages and download their images.
# Inner loop: browse a single thread's pages and download their images.
while True:
    page = 0
    name = input('--退出请输入 退出 --\n请输入你要搜索的贴吧名:')
    if name == '退出':
        break
    # Drop one trailing "吧" so both "xx" and "xx吧" work. The previous regex
    # cut the name at the FIRST "吧", which mangled names containing that
    # character elsewhere.
    if len(name) > 1 and name.endswith('吧'):
        name = name[:-1]
    # Both menus accept the choices '1' .. '6'.
    fanwei = [f'{i}' for i in range(1, 7)]
    while True:
        # The listing is fetched again after every command.
        img_list, lists = get_data(name, page)
        caozuo = input(f'当前为  {name}吧  第{page//50+1}页  共{len(img_list)}张图片\n'
                       f'[1]下载图片   [2]上一页   [3]下一页   [4]跳转指定页数   [5]退出 \n'
                       f'[6]进入某一条数据的详情页\n'
                       f'请输入操作:')
        if caozuo not in fanwei:
            print('操作错误')
            continue
        caozuo = int(caozuo)
        if caozuo == 1:
            download(img_list, name, page)
            continue
        if caozuo in (2, 3, 4):
            page = page_change(page, caozuo)
            continue
        if caozuo == 5:
            break

        # caozuo == 6: list the threads and open one of them.
        chaxun(lists)
        try:
            thread_index = int(input('请输入id:')) - 1
        except ValueError:
            print('操作错误')
            continue
        # Reject ids outside the printed list; 0 used to wrap around to the
        # last thread and larger ids raised IndexError.
        if not 0 <= thread_index < len(lists):
            print('操作错误')
            continue
        detail_page = 1
        title = lists[thread_index][2]
        while True:
            max_page, photo_list = into_detail(lists, thread_index, detail_page)
            operation = input(f'当前为  {title} 贴的第{detail_page}页  共有{max_page}页 有{len(photo_list)}张图片\n'
                              f'[1]下载本页图片  [2]下载每一页图片  [3]上一页  [4]下一页  [5]跳转掉指定页数  [6]退出\n'
                              f'请输入操作:')
            if operation not in fanwei:
                print('操作错误')
                continue
            operation = int(operation)
            if operation == 1:
                detail_dl(photo_list, name, title, detail_page)
            elif operation == 2:
                # Walk from the current page to the last one with a separate
                # counter, so detail_page does not end up past the last page.
                bulk_page = detail_page
                while bulk_page <= int(max_page):
                    max_page, photo_list = into_detail(lists, thread_index, bulk_page)
                    print(f'现在在第{bulk_page}页')
                    detail_dl(photo_list, name, title, bulk_page)
                    bulk_page += 1
            elif operation == 3:
                if detail_page > 1:
                    detail_page -= 1
            elif operation == 4:
                if detail_page < int(max_page):
                    detail_page += 1
            elif operation == 5:
                try:
                    target_page = int(input('请输入要跳转的页数:'))
                except ValueError:
                    print('操作错误')
                    continue
                # Only jump to pages that exist in this thread.
                if 1 <= target_page <= int(max_page):
                    detail_page = target_page
                else:
                    print('操作错误')
            elif operation == 6:
                break

黑白神

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
百度贴吧下载图片【可搜索】

我的这个是输入贴吧的名字，然后可以下载那个贴吧页的图片，可上下页，也可以进入其中一条帖子，并下载那个贴子中的图片，同样可以翻页import requests,re,osimport urllib.requestfrom dailichi import http# 伪装头headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW6...
复制链接

扫一扫