看了小甲鱼爬取煎蛋网的视频,但是当时无论怎么写都写不出来,后来又看了一点其他视频,心血来潮把那段代码重写了一次,总算成功了,也算是解决一个心病吧,唯一美中不足的是还是不太会使用find函数,慢慢来吧,我相信总会搞懂的
import requests
from lxml import etree
import os
class picture:
    """Crawler that downloads images from the jandan.net "ooxx" board.

    Usage: ``picture().run()`` — creates an ``img`` folder, walks the
    listing pages and saves every image found into that folder.
    """

    def __init__(self):
        # Listing-page URL template; ``{}`` is the page number.
        self.url = 'http://jandan.net/ooxx/page-{}#comments'
        # Pretend to be a desktop browser so the site serves normal pages.
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

    def get_url(self, url, pages=26):
        """Return the list of listing-page URLs.

        ``url`` is accepted for backward compatibility but ignored (the
        original implementation shadowed it with the generated list);
        ``pages`` controls how many pages are generated (default 26,
        matching the original hard-coded range).
        """
        return [self.url.format(i) for i in range(pages)]

    def open_url(self, url):
        """GET ``url`` with the browser User-Agent and return the response."""
        response = requests.get(url, headers=self.header)
        return response

    def find_pic(self, html):
        """Extract the image ``src`` attributes from a listing page's HTML."""
        html_str = etree.HTML(html)
        picture_list = html_str.xpath('//div[@class="text"]//p/img/@src')
        return picture_list

    def picture_all(self, picture_list):
        """Prefix each protocol-relative src (``//host/...``) with ``http:``."""
        return ['http:' + src for src in picture_list]

    def save_img(self, picture_all):
        """Download every URL in ``picture_all`` into the current directory."""
        for picture_every in picture_all:
            # Use the last path component of the URL as the local file name.
            filename = picture_every.split('/')[-1]
            img = self.open_url(picture_every).content
            with open(filename, 'wb') as f:
                f.write(img)
            print('保存成功')

    def run(self):
        """Create the output folder and crawl all listing pages.

        Side effects: creates/enters an ``img`` directory and writes one
        file per downloaded image.
        """
        folder = 'img'
        # exist_ok=True so a rerun does not crash with FileExistsError
        # (the original os.mkdir raised if ``img`` already existed).
        os.makedirs(folder, exist_ok=True)
        os.chdir(folder)
        url_list = self.get_url(self.url)
        for url in url_list:
            html = self.open_url(url).content.decode()
            picture_list = self.find_pic(html)
            picture_all = self.picture_all(picture_list)
            self.save_img(picture_all)
# Script entry point: build the crawler and start downloading.
if __name__ == '__main__':
    picture().run()
在输出 picture_ai 时,因为一开始没在图片地址前加上 http: 前缀,导致无法找到网址,后来观察了半天才发现这个问题
总的来说我感觉还可以