爬取的是美女贴吧中的图片
代码如下:
import os
import urllib

import requests
from lxml import etree
class Tieba:
    """Crawl a Baidu Tieba forum and download the images posted in its threads.

    Listing pages ``start_page`` through ``end_page`` (1-based, inclusive) are
    fetched; every thread link found on a page is followed and each
    ``BDE_Image`` picture inside the thread is saved into ``save_dir``.
    """

    BASE_URL = "https://tieba.baidu.com"
    # Seconds to wait on the server; without a timeout one stalled connection
    # would block the whole crawl indefinitely.
    TIMEOUT = 10

    def __init__(self, name, start_page, end_page, save_dir="H:\\图片\\"):
        """Store the crawl settings.

        Args:
            name: Forum name, sent as the ``kw`` query parameter.
            start_page: First listing page to crawl (1-based).
            end_page: Last listing page to crawl (inclusive).
            save_dir: Directory the downloaded images are written to.
        """
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.name = name
        self.start_page = start_page
        self.end_page = end_page
        self.save_dir = save_dir

    def _fetch_html(self, url, params=None):
        """Fetch ``url`` with the crawler's headers and return the parsed HTML tree."""
        response = requests.get(
            url, params=params, headers=self.headers, timeout=self.TIMEOUT
        )
        return etree.HTML(response.text)

    @staticmethod
    def _filename_from_url(img):
        """Return a filesystem-safe file name derived from an image URL."""
        # Drop fragment and query string so they never end up in the file name.
        path = img.split("#", 1)[0].split("?", 1)[0]
        # Use the whole last path segment rather than a fixed-length tail, so
        # two images whose URLs merely share a suffix do not overwrite each other.
        candidate = path.rstrip("/").rsplit("/", 1)[-1] or img[-10:]
        return "".join(
            ch if ch.isalnum() or ch in "._-" else "_" for ch in candidate
        )

    def gettieba_ulr(self):
        """Collect the thread URLs of every listing page and crawl each thread.

        Network errors raised while fetching a listing page propagate to the
        caller.
        """
        for i in range(self.start_page - 1, self.end_page):
            print(self.name)
            # The listing offset ``pn`` advances by 50 per page; passing the
            # values as params lets requests percent-encode the forum name.
            page = self._fetch_html(
                self.BASE_URL + "/f",
                params={"kw": self.name, "ie": "utf-8", "pn": i * 50},
            )
            tiezi_list = page.xpath(
                '//div[@class="t_con cleafix"]//a[@class="j_th_tit "]/@href'
            )
            print("*" * 25 + str(i + 1) + "*" * 25)
            for tiezi_url in tiezi_list:
                self.getImage(tiezi_url)

    def getImage(self, tiezi_url):
        """Download every image embedded in one thread.

        Args:
            tiezi_url: Thread path relative to the site root, as taken from
                the ``href`` attributes of a listing page.
        """
        page = self._fetch_html(self.BASE_URL + tiezi_url)
        img_list = page.xpath(
            '//div[@class="p_content "]//img[@class="BDE_Image"]/@src'
        )
        for img in img_list:
            print(img)
            self.downloadImage(img)

    def downloadImage(self, img):
        """Save one image into ``save_dir``.

        The download is best-effort: network and file-system failures are
        reported on stdout and not raised, so one bad image does not abort
        the crawl.

        Args:
            img: URL of the image to download.
        """
        target = os.path.join(self.save_dir, self._filename_from_url(img))
        try:
            if self.save_dir:
                os.makedirs(self.save_dir, exist_ok=True)
            with requests.get(
                img, headers=self.headers, stream=True, timeout=self.TIMEOUT
            ) as response:
                # Do not store an HTTP error page under an image file name.
                response.raise_for_status()
                with open(target, "wb") as f:
                    for chunk in response.iter_content(chunk_size=1024):
                        f.write(chunk)
        except (requests.RequestException, OSError) as reason:
            print("{0},下载失败.".format(img))
            print(reason.args)
        else:
            print("{0},下载完成.".format(img))
if __name__ == "__main__":
    # Script entry point: ask for the forum name and the page range on the
    # console, then crawl that range.
    name = input("请输入想要爬取得贴吧:")
    # The two prompts are consumed lazily and in order, so the end-page
    # prompt is only shown once the start page has parsed as an integer.
    start_page, end_page = (
        int(input(prompt))
        for prompt in ("请输入想要爬取得贴的起始页:", "请输入想要爬取得贴的末尾页:")
    )
    crawler = Tieba(name, start_page, end_page)
    crawler.gettieba_ulr()