import requests
from lxml import etree
import os
# Crawl the thread-list pages of one Baidu Tieba forum, open every thread
# found on them, and save each image embedded in the thread posts to disk.

# Base URL of the forum thread-list page; the forum name and the page offset
# are added to the query string below.
url = "https://tieba.baidu.com/f?ie=utf-8"
# Ask the user for the name of the forum to crawl.
kw = input("请输入您要爬取贴吧的名称:")
params = {"kw": kw}
# First and last list page to crawl (1-based, both inclusive).
start = int(input("请输入您要爬取起始页(从1开始):"))
end = int(input("请输入爬取的截止页:"))
# Running counter used as the file name of the next image.
name = 1
# Directory that receives the downloaded images.
path = './imagesll3/'
# Seconds to wait for a server before giving up; without a timeout a stalled
# connection would block the crawl forever.
REQUEST_TIMEOUT = 10

for n in range(start, end + 1):
    # The list page is addressed by an offset that advances by 50 per page
    # (presumably 50 threads per page -- confirm against the site).
    pn = (n - 1) * 50
    full_url = url + '&pn=' + str(pn)
    response = requests.get(full_url, params=params, timeout=REQUEST_TIMEOUT)
    print("获取第", n, "页的帖子链接。。。")
    print(response.url)
    content = response.content
    html = etree.HTML(content)
    # Relative links to the individual threads listed on this page.
    tieba_urls = html.xpath('//div[@class="threadlist_lz clearfix"]/div/a/@href')
    print("tieba_urls===", )
    print(tieba_urls)
    # --------------------------
    for tieba_url in tieba_urls:
        # The hrefs are site-relative, so prefix the host.
        tieba_url = 'https://tieba.baidu.com' + tieba_url
        print('具體貼吧tieba_url',tieba_url)
        response = requests.get(tieba_url, timeout=REQUEST_TIMEOUT)
        content = response.content
        html = etree.HTML(content)
        # Source URLs of the images embedded in the thread's posts.
        img_urls = html.xpath('//div[@class="d_post_content j_d_post_content "]/img[@class="BDE_Image"]/@src')
        for img_url in img_urls:
            print(img_url)
            # Create the output directory on first use; exist_ok avoids the
            # check-then-create race of os.path.exists + os.mkdir.
            os.makedirs(path, exist_ok=True)
            print("正在下载图片:", img_url)
            # stream=True lets iter_content write the body chunk by chunk
            # instead of buffering the whole image in memory first; the
            # context manager releases the connection afterwards.
            with requests.get(img_url, stream=True, timeout=REQUEST_TIMEOUT) as response:
                if response.status_code == 200:
                    with open(os.path.join(path, str(name) + '.jpg'), 'wb') as f:
                        for block in response.iter_content(1024):
                            if not block:
                                break
                            f.write(block)
                    print('222')
            # The counter advances for every image URL, including ones that
            # did not return HTTP 200, so file numbers may have gaps.
            name += 1
            print('+1')
# 爬取百度贴吧(搞笑吧)的图片 -- Scrape images from Baidu Tieba (e.g. the "搞笑" forum)
# 最新推荐文章于 2020-04-22 23:23:35 发布 -- (web-page residue: "latest recommended article published 2020-04-22 23:23:35")