这个脚本的用法是:输入贴吧的名字,即可下载该贴吧列表页中的图片,并支持上下翻页;也可以进入其中一条帖子,下载该帖子中的图片,同样支持翻页。
注:代码中的http_list是我从代理网站 https://www.xicidaili.com/ 上获取的代理IP,需要的话可以去看一看。这里加代理是为了不让自己电脑的IP被封:如果一直使用同一个IP去爬取数据,容易被发现是爬虫,所以要加代理。
import requests,re,os
import urllib.request
import random
# Proxy endpoints (host:port). According to the note at the top of this file
# they were copied from a public proxy listing site.
_PROXY_ADDRESSES = (
    '218.59.193.14:43669',
    '115.211.45.41:9000',
    '60.13.42.110:9999',
    '117.90.4.137:9999',
    '120.83.100.237:9999',
    '122.243.8.73:9000',
    '163.204.241.106:9999',
    '112.85.168.5:9999',
    '60.13.42.31:9999',
    '59.62.26.145:9000',
    '119.180.128.167:8060',
    '117.63.1.137:9999',
    '118.180.166.195:8060',
    '60.13.42.82:9999',
    '180.121.132.168:3128',
)

# Each entry has the mapping shape accepted by the ``proxies=`` argument of
# ``requests.get``.
http_list = [{'http': f'http://{address}'} for address in _PROXY_ADDRESSES]

# A single proxy is chosen once, when the module loads, and passed to every
# page request made below.
# NOTE(review): only the 'http' scheme is mapped here, while the pages fetched
# in this file are https:// URLs -- confirm the proxy is actually applied.
http = random.choice(http_list)

# Browser-like User-Agent header sent with every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}
def get_data(name, page, timeout=10):
    """Fetch one listing page of a Tieba forum and scrape it with regexes.

    Args:
        name: Forum name, sent as the ``kw`` query parameter.
        page: Listing offset, sent as the ``pn`` query parameter (the paging
            helper in this file moves it in steps of 50).
        timeout: Seconds to wait on the server before requests gives up, so
            an unresponsive endpoint cannot hang the script forever.

    Returns:
        A ``(img_list, lists)`` tuple. ``img_list`` holds every ``bpic``
        attribute value found in the page. ``lists`` holds one
        ``[id, href, title, reply_count]`` row per thread, where ``id``
        counts from 1 and ``reply_count`` is the matched digit string.
    """
    response = requests.get(
        url='https://tieba.baidu.com/f',
        # Passing the query as params lets requests escape characters such
        # as '&', '#' or spaces in the forum name instead of corrupting the
        # URL.
        params={'kw': name, 'ie': 'utf-8', 'pn': page},
        headers=headers,
        proxies=http,
        timeout=timeout,
    ).text

    title_list = re.findall(r'<a rel="noreferrer" href="(.*?)" title="(.*?)"', response)
    img_list = re.findall(r'bpic="(.*?)"', response)
    reply_list = re.findall(r'title="回复">(\d*?)</span>', response)

    # Titles and reply counts are matched independently and paired purely by
    # position; zip() stops at the shorter of the two lists.
    lists = [
        [row_id, href, title, replies]
        for row_id, ((href, title), replies)
        in enumerate(zip(title_list, reply_list), start=1)
    ]
    return img_list, lists
def page_change(page, caozuo, step=50):
    """Return the listing offset after applying a paging command.

    Args:
        page: Current listing offset (0 is the first page).
        caozuo: Menu command: 2 moves one page back, 3 moves one page
            forward, 4 prompts for a 1-based page number and jumps to it.
            Any other value leaves the offset unchanged.
        step: Offset distance between two consecutive pages.

    Returns:
        The new offset, or the unchanged ``page`` when the request cannot be
        honoured (already on the first page, invalid or out-of-range page
        number, already on the requested page).
    """
    if caozuo == 2:
        if page - step >= 0:
            page -= step
        else:
            print('已经是第一页了!')
    elif caozuo == 3:
        page += step
    elif caozuo == 4:
        raw = input('请输入要跳转的页数:')
        try:
            new_page = int(raw) - 1
        except ValueError:
            # Non-numeric input used to crash the whole script.
            print('页数必须是数字')
            return page
        if new_page < 0:
            # Page numbers are 1-based; 0 or negatives would otherwise turn
            # into a negative offset.
            print('页数必须大于等于1')
        elif page == new_page * step:
            print('你已经在该页数的页面了')
        else:
            page = new_page * step
    return page
def download(list, name, page, base_dir='D:\\贴吧爬虫图片'):
    """Save every image URL of one forum listing page to disk.

    Files go to ``<base_dir>/<name>/第N页`` where ``N = page // 50 + 1`` and
    are named ``0.jpg``, ``1.jpg``, ... by their position in ``list``. A
    failed download is reported and skipped; its number is not reused.

    Args:
        list: Iterable of image URLs. (The name shadows the builtin; it is
            kept because it is part of the existing signature.)
        name: Forum name, used as a sub-directory.
        page: Listing offset, converted to a 1-based page number.
        base_dir: Root output directory. The default is the location this
            script has always written to.
    """
    path = os.path.join(base_dir, name, f'第{page // 50 + 1}页')
    os.makedirs(path, exist_ok=True)

    for shu, url in enumerate(list):
        print(url)
        try:
            urllib.request.urlretrieve(url, os.path.join(path, f'{shu}.jpg'))
        except Exception:
            # Best effort: one bad URL must not abort the batch. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            print('下载失败')
        else:
            print(f'{shu}.jpg 下载成功')
def chaxun(list):
    """Print one summary line per thread row.

    Each row is indexed positionally: element 0 is shown in brackets as the
    row id, element 3 as the reply count and element 2 as the title.
    """
    for entry in list:
        row_id, title, replies = entry[0], entry[2], entry[3]
        print(f'[{row_id}] 回帖数:{replies} {title}')
def into_detail(list, id, page, timeout=10):
    """Fetch one page of a thread and scrape its page count and image URLs.

    Args:
        list: Thread rows as produced by the listing scraper; element 1 of a
            row is the thread's relative href. (``list`` and ``id`` shadow
            builtins; the names are kept because they are part of the
            existing signature.)
        id: Index into ``list`` of the thread to open.
        page: Thread page number, sent as the ``pn`` query parameter.
        timeout: Seconds to wait on the server before requests gives up.

    Returns:
        A ``(max_page, photo_list)`` tuple. ``max_page`` is the digit string
        of the first ``<span class="red">`` match, which callers treat as the
        thread's page count; it falls back to ``'1'`` when the page has no
        such match. ``photo_list`` is the list of ``BDE_Image`` source URLs
        in first-seen order without repeats.
    """
    href = list[id][1]
    url = f'https://tieba.baidu.com{href}?pn={page}'
    response = requests.get(url=url, headers=headers, proxies=http, timeout=timeout).text

    # A removed thread or an error page has no page-count span; indexing the
    # empty result used to raise IndexError, and an empty match would later
    # break int() in the caller.
    page_counts = re.findall(r'<span class="red">(\d*?)</span>', response)
    max_page = page_counts[0] if page_counts and page_counts[0] else '1'

    matches = re.findall(r'<img class="BDE_Image" src="(.*?)" size=".*?".*?>', response)
    matches += re.findall(r'<img class="BDE_Image".*?src="(.*?)" >', response)

    # The second pattern can run past the src value into later attributes,
    # so keep only the text before the first double quote.
    urls = (match.split('"')[0] for match in matches)

    # Both patterns can hit the same tag; dict.fromkeys drops the repeats
    # while keeping order. (list() is unusable here: the parameter shadows it.)
    photo_list = [*dict.fromkeys(urls)]
    return max_page, photo_list
def detail_dl(list, name, title, detail_page, base_dir='D:\\贴吧爬虫图片'):
    """Save every image URL of one thread page to disk.

    Files go to ``<base_dir>/<name>/<title>/第N页`` with ``N = detail_page``
    and are named ``0.jpg``, ``1.jpg``, ... by their position in ``list``. A
    failed download is reported and skipped; its number is not reused.

    Args:
        list: Sized collection of image URLs. (The name shadows the builtin;
            it is kept because it is part of the existing signature.)
        name: Forum name, used as the first sub-directory.
        title: Thread title, used as the second sub-directory after the
            characters that Windows forbids in file names are replaced.
        detail_page: 1-based thread page number.
        base_dir: Root output directory. The default is the location this
            script has always written to.
    """
    print(f'共有{len(list)}张图片')

    # Thread titles are free text; characters such as ? / : * would make the
    # directory creation fail or split the title into nested directories.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).rstrip(' .')
    path = os.path.join(base_dir, name, safe_title, f'第{detail_page}页')
    os.makedirs(path, exist_ok=True)

    for shu, url in enumerate(list):
        print(url)
        try:
            urllib.request.urlretrieve(url, os.path.join(path, f'{shu}.jpg'))
        except Exception:
            # Best effort: one bad URL must not abort the batch. Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            print('下载失败')
        else:
            print(f'{shu}.jpg 下载成功')
def _ask_int(prompt):
    """Prompt the user and return the integer typed, or None if it is not one."""
    try:
        return int(input(prompt))
    except ValueError:
        return None


def _page_count(max_page):
    """Convert the scraped page-count string to an int, treating junk as 1."""
    try:
        return max(1, int(max_page))
    except (TypeError, ValueError):
        return 1


def _browse_thread(lists, index, name):
    """Run the thread menu for ``lists[index]`` until the user picks exit."""
    detail_page = 1
    title = lists[index][2]
    valid_choices = {str(n) for n in range(1, 7)}

    while True:
        # The thread page is re-fetched on every pass so the prompt always
        # reflects the page currently selected.
        max_page, photo_list = into_detail(lists, index, detail_page)
        operation = input(f'当前为 {title} 贴的第{detail_page}页 共有{max_page}页 有{len(photo_list)}张图片\n'
                          f'[1]下载本页图片 [2]下载每一页图片 [3]上一页 [4]下一页 [5]跳转掉指定页数 [6]退出\n'
                          f'请输入操作:')
        if operation not in valid_choices:
            print('操作错误')
            continue

        operation = int(operation)
        total_pages = _page_count(max_page)

        if operation == 1:
            detail_dl(photo_list, name, title, detail_page)
        elif operation == 2:
            # Downloads from the current page through the last one.
            while detail_page <= total_pages:
                max_page, photo_list = into_detail(lists, index, detail_page)
                total_pages = _page_count(max_page)
                print(f'现在在第{detail_page}页')
                detail_dl(photo_list, name, title, detail_page)
                detail_page += 1
            # The loop leaves the counter one past the end; step back so the
            # next refresh asks for a page that exists.
            detail_page = min(detail_page, total_pages)
        elif operation == 3:
            if detail_page > 1:
                detail_page -= 1
        elif operation == 4:
            if detail_page < total_pages:
                detail_page += 1
        elif operation == 5:
            target = _ask_int('请输入要跳转的页数:')
            if target is None or not 1 <= target <= total_pages:
                print('页数输入无效')
            else:
                detail_page = target
        else:
            # operation == 6: leave the thread menu.
            break


def main():
    """Interactive entry point: pick a forum, page through it, download images.

    The outer loop asks for a forum name until the user types the exit word.
    The inner loop shows the forum menu for the current listing page and
    dispatches to the download, paging and thread-browsing helpers.
    """
    valid_choices = {str(n) for n in range(1, 7)}

    while True:
        page = 0
        name = input('--退出请输入 退出 --\n请输入你要搜索的贴吧名:')
        if name == '退出':
            break

        # If the input contains '吧', keep only the text before its first
        # occurrence; otherwise use the input unchanged.
        before_ba = re.findall('(.*?)吧', name)
        if before_ba:
            name = before_ba[0]

        while True:
            img_list, lists = get_data(name, page)
            caozuo = input(f'当前为 {name}吧 第{page//50+1}页 共{len(img_list)}张图片\n'
                           f'[1]下载图片 [2]上一页 [3]下一页 [4]跳转指定页数 [5]退出 \n'
                           f'[6]进入某一条数据的详情页\n'
                           f'请输入操作:')
            if caozuo not in valid_choices:
                print('操作错误')
                continue

            caozuo = int(caozuo)
            if caozuo == 1:
                download(img_list, name, page)
            elif caozuo in (2, 3, 4):
                page = page_change(page, caozuo)
            elif caozuo == 5:
                break
            else:
                # caozuo == 6: list the threads and open the chosen one.
                chaxun(lists)
                number = _ask_int('请输入id:')
                # Ids are 1-based. Rejecting 0 matters: it used to become
                # index -1 and silently open the last thread.
                if number is None or not 1 <= number <= len(lists):
                    print('id输入无效')
                    continue
                _browse_thread(lists, number - 1, name)


if __name__ == '__main__':
    main()