import os
from urllib.parse import urljoin

import requests
from lxml import etree
class TieBa(object):
    """Crawl a Baidu Tieba listing page and save the images found in its threads.

    The crawl is three levels deep: the listing page yields thread links,
    each thread page yields image URLs, and each image is downloaded into
    the ``images/`` directory under the current working directory.
    """

    # Seconds to wait on the server before giving up on a request. Without a
    # timeout a single stalled connection would block the whole crawl.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # NOTE(review): the forum keyword is embedded in the URL path here and
        # is also sent again as the ``kw`` query parameter in ``run``; confirm
        # that this is the intended listing endpoint.
        self.url = 'https://tieba.baidu.com/kw=%E7%BE%8E%E5%A5%B3'
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}

    def send_response(self, url, params=None):
        """Send a GET request and return the raw response body.

        Args:
            url: Address to fetch.
            params: Optional mapping of query-string parameters. ``None``
                (the default) sends no extra parameters.

        Returns:
            The response body as ``bytes``.
        """
        response = requests.get(
            url,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.content

    def write_file(self, data, image_name):
        """Write ``data`` to ``images/<image_name>`` and print the file name.

        Args:
            data: Bytes to store.
            image_name: File name to use inside the ``images`` directory.
        """
        print(image_name)
        # Create the target directory on first use so that a fresh checkout
        # does not fail with FileNotFoundError.
        os.makedirs('images', exist_ok=True)
        image_path = 'images/' + image_name
        with open(image_path, 'wb') as f:
            f.write(data)

    def parse_data(self, data, rule):
        """Parse ``data`` as HTML and evaluate the XPath expression ``rule``.

        Args:
            data: HTML document (as returned by ``send_response``).
            rule: XPath expression to evaluate against the parsed document.

        Returns:
            The XPath result list, or an empty list when the parser did not
            produce a document root.
        """
        html_data = etree.HTML(data)
        # Guard against a missing root element so callers can simply iterate
        # over the result instead of hitting an AttributeError on None.
        if html_data is None:
            return []
        return html_data.xpath(rule)

    def run(self):
        """Crawl the listing page, visit every thread and save its images."""
        params = {
            "kw": "美女",
            "pn": 0,
        }
        # 1. Fetch the listing page and collect the thread links.
        data = self.send_response(self.url, params)
        rule = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        url_list = self.parse_data(data, rule)

        # The image rule is the same for every thread, so build it once.
        image_rule = '//img[@class="BDE_Image"]/@src'

        for detail in url_list:
            # 2. Resolve the thread link against the site root. urljoin copes
            #    with both relative and already-absolute hrefs.
            detail_url = urljoin('https://tieba.baidu.com', detail)
            data = self.send_response(detail_url)

            # 2.1 Collect the URLs of all images in the thread.
            image_url_list = self.parse_data(data, image_rule)

            # 3. Download and store every image.
            for img_url in image_url_list:
                img_data = self.send_response(img_url)
                # The last 15 characters of the URL serve as the file name.
                image_name = img_url[-15:]
                self.write_file(img_data, image_name)
# Script entry point: start the crawl only when executed directly, not on import.
if __name__ == "__main__":
    spider = TieBa()
    spider.run()
# Article title: "爬取贴吧图片" (scraping images from Baidu Tieba).
# Page residue: latest recommended article published 2021-02-22 13:32:12.