爬取煎蛋随手拍图
使用 requests + selenium 来进行图片的爬取
爬取结果
爬取思路
- 使用 selenium 发起请求
- 对页面进行数据的提取
- 取到页面上每个图片的 url
- 使用 requests 发起请求
- 将图片进行保存
实现代码
import os
import time
import requests
from selenium import webdriver
class JandanPic:
    """Crawl the pictures listed on jandan.net/ooxx and save them to disk.

    Selenium drives a Chrome browser to open each listing page, the image
    links found on the page are collected, and every image is then fetched
    with ``requests`` and written into ``save_dir``.
    """

    # Seconds to sleep after every navigation so the page can finish loading.
    page_wait_seconds = 6
    # Upper bound (seconds) for one image download; without it a stalled
    # server would block the crawl forever.
    request_timeout = 30
    # Directory the downloaded images are written to.
    save_dir = "./image/jandan/"

    def __init__(self):
        """Start a Chrome session and prepare the download request headers."""
        self.start_url = "http://jandan.net/ooxx"
        self.driver = webdriver.Chrome()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3608.4 Safari/537.36"
        }

    def parse_get_url(self, url):
        """Download ``url`` with the configured headers.

        Args:
            url: Address of the resource to fetch.

        Returns:
            The raw response body as bytes.

        Raises:
            requests.RequestException: On connection errors or when the
                request exceeds ``request_timeout``.
        """
        resp = requests.get(url, headers=self.headers, timeout=self.request_timeout)
        return resp.content

    def get_content_list(self):
        """Collect the images of the current page, save them, find the next page.

        Returns:
            The first element matching ``a.previous-comment-page`` (the link
            used to move to the next page to crawl), or ``None`` when the
            page has no such link.
        """
        # Local import: ``find_element(s)_by_xpath`` was removed in
        # Selenium 4.3, while ``find_elements(By.XPATH, ...)`` works on both
        # old and new releases.
        from selenium.webdriver.common.by import By

        li_list = self.driver.find_elements(By.XPATH, "//ol[@class='commentlist']/li")
        print(li_list)

        content_list = []
        for li in li_list:
            # Query each XPath once; an empty result means the entry lacks
            # that piece of data and the field is recorded as None.
            id_links = li.find_elements(By.XPATH, ".//span[@class='righttext']/a")
            img_links = li.find_elements(By.XPATH, ".//a[@class='view_img_link']")
            items = {
                "img_id": id_links[0].text if id_links else None,
                "img_url": img_links[0].get_attribute("href") if img_links else None,
            }
            print(items)
            content_list.append(items)

        # Download the pictures found on this page.
        self.save_pic(content_list)

        next_links = self.driver.find_elements(By.XPATH, "//a[@class='previous-comment-page']")
        return next_links[0] if next_links else None

    def save_pic(self, content_list):
        """Download and store every complete entry of ``content_list``.

        Args:
            content_list: Iterable of dicts with the keys ``"img_id"`` and
                ``"img_url"``. Entries where either value is ``None`` are
                skipped.

        The target directory is created on demand, including any missing
        parent directories.
        """
        folder_path = self.save_dir
        # os.mkdir cannot create the intermediate "./image" directory;
        # makedirs can, and exist_ok makes repeated calls harmless.
        os.makedirs(folder_path, exist_ok=True)

        for content in content_list:
            pic_url = content["img_url"]
            img_id = content["img_id"]
            if pic_url is None or img_id is None:
                continue

            # File name = picture id + the last four characters of the URL
            # (e.g. ".jpg").
            pic_name = img_id + pic_url[-4:]
            # Send the request and fetch the image data.
            img_bytes = self.parse_get_url(pic_url)
            print("开始下载:{}".format(pic_url))
            with open(os.path.join(folder_path, pic_name), 'wb') as f:
                f.write(img_bytes)

    def run(self):
        """Crawl from ``start_url`` page by page until no next link is left.

        The browser is always shut down at the end, even when a page or a
        download raises.
        """
        try:
            # Open the start page in the browser.
            self.driver.get(self.start_url)
            time.sleep(self.page_wait_seconds)
            # Extract and save the data of the first page.
            next_url = self.get_content_list()
            # Keep following the next-page link while one exists.
            while next_url is not None:
                next_url.click()
                print("下一页")
                time.sleep(self.page_wait_seconds)
                next_url = self.get_content_list()
        finally:
            self.driver.quit()
# Script entry point: build the crawler and start it right away.
if __name__ == '__main__':
    JandanPic().run()