# Simple implementation; there is still plenty of room for improvement.
"""
输入任意贴吧名,爬取贴吧内任意页数中的图片
"""
import os
from urllib.parse import urlsplit

import requests  # fetch web pages
from lxml import etree  # parse the fetched HTML
LIST_URL = 'https://tieba.baidu.com/f?'
THREAD_BASE_URL = 'https://tieba.baidu.com'
IMAGE_DIR = 'img'
# Seconds to wait for a server before giving up; without a timeout a single
# stalled connection would block the crawl forever.
REQUEST_TIMEOUT = 10


def image_filename(img_url):
    """Return a local file name for the image at *img_url*.

    The name is the last component of the URL path, so query strings are
    ignored and the result never contains a ``/``. If the path has no last
    component, the final ten characters of the URL are used instead (with
    ``/`` replaced), and ``'unnamed'`` is returned for an empty URL.
    """
    name = os.path.basename(urlsplit(img_url).path)
    if not name:
        name = img_url[-10:].replace('/', '_')
    return name or 'unnamed'


def fetch(target, **kwargs):
    """GET *target* and return the response, or ``None`` if the request failed.

    Extra keyword arguments are forwarded to ``requests.get``. Network
    errors, timeouts and HTTP error statuses are reported on stdout and
    turned into ``None`` so that one bad URL does not abort the whole crawl.
    """
    try:
        response = requests.get(target, timeout=REQUEST_TIMEOUT, **kwargs)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('skipped {}: {}'.format(target, exc))
        return None
    return response


def select_all(html_text, query):
    """Parse *html_text* and return the list of XPath matches for *query*.

    Returns an empty list when there is nothing to parse.
    """
    if not html_text.strip():
        return []
    tree = etree.HTML(html_text)
    # NOTE(review): etree.HTML is believed to return None for some
    # unparsable documents -- guard so the caller can simply iterate.
    if tree is None:
        return []
    return tree.xpath(query)


def main():
    """Prompt for a tieba name and page range, then download every image.

    For each listing page in the requested range, every thread link is
    followed and each ``BDE_Image`` image found in the thread is saved
    under ``img/``; the saved file name is printed after each download.
    """
    tieba_name = input("请输入想爬取的贴吧名: ")
    startPage = int(input("请输入爬取开始页: "))
    endPage = int(input("请输入爬取结束页: "))

    # The output directory must exist before the first file is opened.
    os.makedirs(IMAGE_DIR, exist_ok=True)

    for page in range(startPage, endPage + 1):
        # 'pn' advances by 50 per listing page (page 1 -> 0, page 2 -> 50).
        pn_name = (page - 1) * 50
        listing = fetch(LIST_URL, params={'kw': tieba_name, 'pn': pn_name})
        if listing is None:
            continue

        for link in select_all(listing.text, '//a[@class="j_th_tit "]/@href'):
            thread = fetch(THREAD_BASE_URL + link)
            if thread is None:
                continue

            for my_img in select_all(thread.text,
                                     '//img[@class="BDE_Image"]/@src'):
                image = fetch(my_img)
                if image is None:
                    continue
                img_name = image_filename(my_img)
                with open(os.path.join(IMAGE_DIR, img_name), 'wb') as f:
                    f.write(image.content)
                print(img_name)


if __name__ == '__main__':
    main()