import requests,os
import urllib.request
import re
from lxml import etree
from proxies import http
from fake_useragent import UserAgent
def page_chang(page, caozuo):
    """Return the new ``pn`` offset after a paging action.

    The forum list is addressed by an offset that advances 50 per page
    (page 1 -> 0, page 2 -> 50, ...).

    Args:
        page: Current offset.
        caozuo: Action code. ``2`` goes to the previous page, ``3`` to the
            next page; any other value prompts for an absolute page number.

    Returns:
        The new offset, never negative. If the typed page number is not an
        integer, a message is printed and the current offset is returned
        unchanged.
    """
    if caozuo == 2:
        # Clamp so that a partial or already-zero offset cannot go negative.
        return max(page - 50, 0)
    if caozuo == 3:
        return page + 50

    p = input("请输入你要前往的页数:")
    try:
        target = int(p)
    except ValueError:
        # Stay on the current page instead of crashing the browse loop.
        print("指定错误,请重新输入")
        return page
    # Page numbers are 1-based; anything below 1 is treated as page 1.
    return max(target - 1, 0) * 50
def index(goal, page, timeout=10):
    """Fetch one forum list page and extract media URLs and topic entries.

    Reads the module-level ``headers`` dict and the ``http`` proxies mapping
    imported at the top of the file.

    Args:
        goal: Forum name typed by the user (sent as the ``kw`` parameter).
        page: ``pn`` offset of the list page (50 per page).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A 4-tuple ``(pic_list, video_list, lists, all_list)``:
        ``pic_list`` holds the ``bpic`` attribute values, ``video_list`` the
        ``data-video`` attribute values, ``lists`` is ``video_list`` followed
        by ``pic_list``, and ``all_list`` holds ``[href, topic, reply]``
        entries where ``topic`` is prefixed with a 1-based ``【n】`` number.

    Raises:
        requests.RequestException: On network failure or timeout.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    page_no = int(page) // 50 + 1
    if "吧" in goal:
        print(f"您当前在{goal}的第{page_no}页")
    else:
        print(f"您当前在{goal}吧的第{page_no}页")

    # Pass the query via ``params`` so that characters such as '&', '#' or
    # spaces in the user-supplied forum name are percent-encoded correctly.
    response = requests.get(
        url='https://tieba.baidu.com/f',
        params={'kw': goal, 'pn': page},
        headers=headers,
        proxies=http,
        timeout=timeout,
    ).content.decode("utf-8")

    video_list = re.findall(r'data-video="(.*?)"', response)
    pic_list = re.findall(r'bpic="(.*?)"', response)
    detail_list = re.findall(r'<a rel="noreferrer" href="(.*?)" title="(.*?)" target="_blank"', response)
    # Reply counts; entries with zero replies could be filtered out here.
    reply_list = re.findall(r'title="回复">(\d+)</span>', response)

    # Pair each (href, title) with a reply count by position. zip() stops at
    # the shorter list, so unmatched trailing entries are dropped.
    all_list = [
        [href, f'【{number}】{title}', reply]
        for number, ((href, title), reply)
        in enumerate(zip(detail_list, reply_list), 1)
    ]

    lists = video_list + pic_list
    return pic_list, video_list, lists, all_list
def detail(all_list, timeout=10):
    """Let the user pick a topic and collect the image URLs of all its pages.

    Prints every topic with its reply count, prompts for a 1-based topic
    number, then downloads each page of that topic and extracts the ``src``
    of its ``BDE_Image`` tags. Reads the module-level ``headers`` dict and
    the ``http`` proxies mapping imported at the top of the file.

    Args:
        all_list: ``[href, topic, reply]`` entries as produced by ``index``.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        ``(dpic_lists, topic)`` - the collected image URLs and the title of
        the topic the user selected.

    Raises:
        IndexError: If ``all_list`` is empty, the chosen number is out of
            range, or the page-count marker is missing from the topic page.
        ValueError: If the typed topic number is not an integer.
        requests.RequestException: On network failure or timeout.
    """
    if not all_list:
        raise IndexError("no topics available to choose from")

    # Show number, title and reply count so the user can choose by number.
    for entry in all_list:
        print(entry[1] + '\t\t' + '回复数:' + entry[2])

    choice = int(input("请输入你要进入的话题的序号:"))
    # Reject 0 and negatives explicitly; a plain list index would silently
    # wrap around to the end of the list.
    if not 1 <= choice <= len(all_list):
        raise IndexError(f"topic number {choice} is out of range")

    selected = all_list[choice - 1]
    topic = selected[1]
    detail_url = f'https://tieba.baidu.com{selected[0]}'

    response = requests.get(
        url=detail_url, headers=headers, proxies=http, timeout=timeout
    ).content.decode("utf-8")
    # The first matching span is used as the number of pages in the topic.
    pag = re.findall(r'<span class="red">(\d+)</span>', response)[0]

    dpic_lists = []
    for p in range(1, int(pag) + 1):
        url1 = f'{detail_url}?pn={p}'
        response1 = requests.get(
            url=url1, headers=headers, proxies=http, timeout=timeout
        ).content.decode("utf-8")
        found = (
            re.findall(r'class="BDE_Image".*?src="(.*?)" size.*?>', response1, re.S)
            + re.findall(r'<img class="BDE_Image".*?src="(.*?)" >', response1)
        )
        # Keep only the text before the first quote in case a match ran past
        # the end of the src attribute.
        dpic_lists.extend(src.split('"')[0] for src in found)
    return dpic_lists, topic
def downdpic(dpic_lists, page, pic_list, topic):
    """Download the images of a selected topic into the current page folder.

    Files are saved as ``img<N>.jpg`` where numbering continues after the
    images of the list page (starts at ``len(pic_list) + 1``). The target
    folder is built from the module-level ``goal`` and the page number.

    Args:
        dpic_lists: Image URLs to download.
        page: ``pn`` offset of the current list page (50 per page).
        pic_list: Images of the list page; only its length is used.
        topic: Title of the selected topic (currently unused).
    """
    page_no = int(page) // 50 + 1
    if dpic_lists:
        # Backslashes are escaped explicitly; the path itself is unchanged.
        folder = f'E:\\贴吧图片\\{goal}\\第{page_no}页'
        os.makedirs(folder, exist_ok=True)
        for number, img_url in enumerate(dpic_lists, len(pic_list) + 1):
            urllib.request.urlretrieve(img_url, f'{folder}\\img{number}.jpg')
    print(f'第{page_no}页所选话题中的图片已经下载完成')
def downpic(page, pic_list, video_list, lists):
    """Prompt for a media type and download it from the current list page.

    The user chooses ``0`` (everything), ``1`` (images) or ``2`` (videos).
    Files go into a folder built from the module-level ``goal`` and the page
    number. Images are saved as ``img<N>.jpg``, videos as ``video<N>.mp4``;
    in "everything" mode files are saved as ``<N>.mp4`` or ``<N>.jpg``
    depending on whether the URL ends in ``.mp4``.

    Args:
        page: ``pn`` offset of the current list page (50 per page).
        pic_list: Image URLs of the page.
        video_list: Video URLs of the page.
        lists: Combined URLs used for the "everything" choice.
    """
    choice = input("请输入你要下载的文件类型【0】我全都要【1】图片【2】视频:")
    if choice not in ('0', '1', '2'):
        print("指定错误,请重新输入")
        return

    page_no = int(page) // 50 + 1

    # Build (url, file name) pairs plus the completion message per choice.
    if choice == '1':
        print('正在下载 请稍等')
        jobs = [(url, f'img{n}.jpg') for n, url in enumerate(pic_list, 1)]
        done = f'第{page_no}页图片已经下载完成'
    elif choice == '2':
        print('正在下载 请稍等')
        jobs = [(url, f'video{n}.mp4') for n, url in enumerate(video_list, 1)]
        done = f'第{page_no}页视频已经下载完成'
    else:
        print('正在下载 请稍等……')
        jobs = [
            (url, f'{n}.mp4' if url.split('.')[-1] == "mp4" else f'{n}.jpg')
            for n, url in enumerate(lists, 1)
        ]
        # This branch downloads both kinds, so the message names both.
        done = f'第{page_no}页图片和视频已经下载完成'

    if jobs:
        # Backslashes are escaped explicitly; the path itself is unchanged.
        folder = f'E:\\贴吧图片\\{goal}\\第{page_no}页'
        os.makedirs(folder, exist_ok=True)
        for url, filename in jobs:
            urllib.request.urlretrieve(url, f'{folder}\\{filename}')
    print(done)
if __name__ == '__main__':
    # Interactive entry point: search a forum, browse its pages, download
    # media or enter a topic.
    # `headers` is a module-level global read by index() and detail().
    ua = UserAgent()
    user = ua.random
    headers = {
        "User-Agent": user
    }
    while True:
        page = 0  # every new search starts at the first page
        panduan = input('[1]搜索贴吧\t[2]退出搜索\n请输入操作:')
        if panduan not in ('1', '2'):
            print("指定错误,请重新输入")
            continue
        if panduan == '2':
            print("您已退出")
            break

        # `goal` is a module-level global as well: the download helpers
        # read it to build the target folder.
        goal = input("请输入你要搜索的贴吧:")
        try:
            while True:
                pic_list, video_list, lists, all_list = index(goal, page)
                caozuo = input('[1]下载图片或视频\t[2]上一页\t[3]下一页\t[4]跳转指定页数\t[5]退出当前贴吧\t[6]进入话题\n请输入操作:')
                if caozuo not in ('1', '2', '3', '4', '5', '6'):
                    print("指定错误,请重新输入")
                    continue
                caozuo = int(caozuo)
                if caozuo == 5:
                    break
                if caozuo == 1:
                    downpic(page, pic_list, video_list, lists)
                elif caozuo in (2, 3, 4):
                    page = page_chang(page, caozuo)
                else:
                    # caozuo == 6: enter a topic and optionally download it.
                    dpic_lists, topic = detail(all_list)
                    pd = input('请输入你的操作【1】下载话题中图片 【2】退出')
                    if pd == '1':
                        downdpic(dpic_lists, page, pic_list, topic)
                    elif pd != '2':
                        print("指定错误,请重新输入")
        except Exception as exc:
            # Catch Exception rather than everything so that Ctrl-C and
            # sys.exit() still terminate the program; show the real cause
            # after the user-facing message.
            print('贴吧中并无内容哦,请搜索其他试试')
            print(f'({type(exc).__name__}: {exc})')
# Tags: web crawler, Tieba search, advanced
# Latest recommended article published on 2023-10-28 13:38:56