import re
from urllib import request,parse
import os
# 突破下载函数
def download(img_html, save_dir='tieba'):
    """Extract every post image URL from a Tieba detail-page HTML string
    and save each image into *save_dir*.

    Args:
        img_html: Raw HTML of a Tieba post detail page.
        save_dir: Directory the images are written to (created if missing).
                  Defaults to 'tieba' for backward compatibility.
    """
    # Post images on Tieba carry class "BDE_Image"; capture their src URLs.
    img_urls = re.findall('<img class="BDE_Image" src="(.*?)"', img_html, re.S)
    # makedirs(exist_ok=True) avoids the exists()-then-mkdir() race and
    # also creates intermediate directories if save_dir is nested.
    os.makedirs(save_dir, exist_ok=True)
    for url in img_urls:
        # Use the last path segment of the URL as the local file name.
        img_name = url.split('/')[-1]
        print(url)
        try:
            request.urlretrieve(url, os.path.join(save_dir, img_name))
        except OSError as e:
            # One broken/unreachable image should not abort the rest.
            print('failed to download', url, e)
def pn():
    """Interactively crawl a Tieba forum and download its post images.

    Prompts for the forum name and a 1-based start/end page range, walks
    each list page, follows every thread link found on it, and hands each
    thread's HTML to download().
    """
    kw = input('输入贴吧名字')
    # URL-encode the forum name (usually non-ASCII Chinese).
    kw = parse.quote(kw)
    base_url = 'https://tieba.baidu.com/f?kw=' + kw
    response = request.urlopen(base_url)
    html = response.read().decode('utf-8')
    # Tieba paginates with pn=0,50,100,...; the largest pn value on the
    # first page tells us how many pages exist.
    pns = re.findall(r'pn=(\d+)', html)
    if pns:
        # Integer division: 50 threads per page, pn is a 0-based offset.
        p = int(pns[-1]) // 50 + 1
    else:
        # No pagination links found — assume a single page instead of
        # crashing with IndexError on pns[-1].
        p = 1
    print('总页数为' + str(p))
    start = input('输入开始页:')
    end = input('输入结束页:')
    # User pages are 1-based and inclusive; pn offsets are 0-based * 50.
    for page in range(int(start) - 1, int(end)):
        pn_url = base_url + '&pn=' + str(page * 50)
        print(pn_url)
        response = request.urlopen(pn_url)
        html = response.read().decode('utf-8')
        # Thread links on a list page look like href="/p/1234567890".
        thread_paths = re.findall(r'href="(/p/\d*)', html)
        print(thread_paths)
        for path in thread_paths:
            detail_url = 'http://tieba.baidu.com' + path
            print(detail_url)
            try:
                response = request.urlopen(detail_url)
                # 'ignore' tolerates occasional non-UTF-8 bytes in posts.
                img_html = response.read().decode('utf-8', 'ignore')
            except OSError as e:
                # Skip threads that fail to load; keep crawling the rest.
                print('failed to fetch', detail_url, e)
                continue
            download(img_html)
# Entry point: start the interactive crawler only when run as a script,
# not when this module is imported.
if __name__ == '__main__':
    pn()
# Source note: urllib/request crawler for Baidu Tieba images
# (original article last published 2024-02-02 14:05:29)