Code:
# coding:utf-8
# HTTP client used to fetch page HTML
import requests
import urllib
import urllib.request
# Given a url, return the page's HTML source
def get_datasource(url):
    """Fetch *url* and return the response body as text.

    Returns "" on any request failure AND on non-200 responses, so the
    caller can always treat the result as a string.  (The original fell
    through and returned None for non-200 status codes, which would
    crash callers that do ``html.find(...)``.)
    """
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        # Non-200: report "no source" instead of an implicit None.
        return ""
    except requests.RequestException:
        # Network/HTTP-level failure only; don't swallow unrelated bugs.
        return ""
# Extract the total number of pages from the post's HTML
def get_totalpage(html):
    """Parse the total page count out of a Tieba post's HTML.

    Looks for the markup ``共<span class="red">N</span>页`` and returns
    N as an int.  Returns 0 when either marker is missing or the text
    between them is not a number (the original relied on a bare
    ``except:`` plus accidental -1 index arithmetic for these cases).
    """
    page_start = '共<span class="red">'
    start = html.find(page_start)
    if start == -1:
        # Marker absent — e.g. empty or unexpected HTML.
        return 0
    page_end = '</span>页'
    end = html.find(page_end, start)
    if end == -1:
        return 0
    try:
        return int(html[start + len(page_start):end])
    except ValueError:
        # Text between the markers was not a plain integer.
        return 0
# Loop over every page of the post, parse each image url, and download it
def parse_href(total_page, numbers):
    """Download every BDE_Image jpg from each page of a Tieba post.

    total_page: number of pages in the post (as returned by get_totalpage).
    numbers:    the post id used to build each page's URL.

    Images are saved into the current directory under their original
    file names.  Failures are per-item: a failed page fetch skips that
    page, a failed image download skips that image (the original's
    blanket ``except: continue`` abandoned the whole page when any
    single download failed).
    """
    # 4. Opening/closing markers that bracket each image URL.
    start_str = '<img class="BDE_Image" src="'
    end_str = '.jpg'
    for x in range(1, int(total_page) + 1):
        url = 'https://tieba.baidu.com/p/%s?pn=%s' % (numbers, x)
        # Fetch this page's HTML; skip the page on any HTTP failure.
        try:
            response = requests.get(url)
        except requests.RequestException:
            continue
        if response.status_code != 200:
            continue
        html = response.text
        # Debug dump of the most recently fetched page.
        with open('html.txt', 'w') as f:
            f.write(html)
        # Number of image tags present on this page.
        total_count = html.count(start_str)
        print('正在解析第%s页,该页共搜索%s张图片' % (x, total_count))
        # 5. Position of the first image tag; -1 means none left.
        start = html.find(start_str)
        count = 0
        while start != -1:
            # 7. Find the '.jpg' that ends this URL, then 8. slice it out.
            end = html.find(end_str, start, len(html))
            href = html[start + len(start_str):end + len(end_str)]
            # 9. Advance to the next image tag (or -1 to end the loop).
            start = html.find(start_str, end + len(end_str), len(html))
            # File name = last path component of the URL.
            pic_name = href.split('/')[-1]
            count += 1
            print('正在下载第%s张。。。。' % count)
            print(href)
            # 11. Download the image.  Python 3 moved urlretrieve into
            # urllib.request (the original's urllib.urlretrieve is Py2-only).
            try:
                urllib.request.urlretrieve(href, pic_name)
            except OSError:
                # One bad image shouldn't abort the rest of the page.
                pass
# Entry-point function
def main():
    """Entry point: prompt for a post id, then crawl all of its images."""
    # 1. Ask the user which post to download and build its URL.
    post_id = input('请输入要下载图片帖子编号:')
    # 2. Fetch the post's first page.
    source = get_datasource('https://tieba.baidu.com/p/%s' % post_id)
    # 3. Work out how many pages the post spans.
    pages = get_totalpage(source)
    print(pages)
    # 4. Walk every page and download the images it contains.
    parse_href(pages, post_id)
# Running main() starts the crawler
# Run the crawler only when executed as a script; importing this module
# no longer triggers the interactive prompt and network I/O.
if __name__ == '__main__':
    main()