网址:煎蛋随手拍
函数库:requests bs4 lxml(这几个是需要你用 pip 安装的)
声明:爬取的图片仅供学习 不做其他用途!!
一 :找到图片的链接
进入网站以后你会看到有80页的图片,然后我们检查网页源代码,通过查找我们很容易就能找到链接的位置
通过下面的代码把图片的链接保存到list里面。
def get_content_page(html):
    """Parse one comment page and collect image URLs into the global totle_list.

    Args:
        html: HTML text of a jandan.net "ooxx" page, or None when the fetch failed.

    Returns:
        The module-level ``totle_list`` with any newly found image URLs appended.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
        div = soup.find('div', attrs={'id': 'comments'})
        ol_list = div.find(name='ol')
        for ol in ol_list.find_all(name='li'):
            try:
                text_list = ol.find('div', attrs={'class': 'text'})
                # The href is protocol-relative ("//..."), so prefix the scheme.
                img_url = 'http:' + text_list.p.a['href']
                if img_url not in totle_list:
                    print(img_url)
                    totle_list.append(img_url)
            except (AttributeError, KeyError, TypeError):
                # <li> entries without the expected div/p/a structure are skipped.
                pass
    except (AttributeError, TypeError):
        # html was None or had no comments section: the page does not exist.
        print('没有该网址')
    return totle_list
二:获取所有页面的链接
该网站有80页的图片,通过点击上一页和下一页发现,该网站的链接有一个特性:等号前面的两个字母不一样,其他的都是一样的。但每一页它变换的字母都是随机的,并没有什么规律,我也懒得再去找规律,直接就做了一个 list,里面有二十六个大写和小写字母,让它遍历循环。如果有该网址就爬取上面的信息,如果没有就跳过。虽然有点耗时但简单暴力。
# Letters used to brute-force the two varying characters in each page URL.
# Bug fix: the original list contained the typo 'C,' instead of 'C', so pages
# whose id contains 'C' were never tried while an invalid two-char entry was.
ur = ['a','s','d','f','g','h','j','k','l','z','x','c','v','b','n','m','q','w','e','r','t','y','u','i','o','p',
      'A','S','D','F','G','H','J','K','L','Z','X','C','V','B','N','M','Q','W','E','R','T','Y','U','I','O','P']
for i in tqdm(ur):
    for n in tqdm(ur):
        # Build one candidate page URL per two-letter combination.
        url = "http://jandan.net/ooxx/MjAyMDEwMTUtN{l1}{l2}=#comments".format(l1=i, l2=n)
def get_totle_page(url):
    """Fetch one listing page and return its HTML.

    Args:
        url: The page URL to request.

    Returns:
        The page HTML text on HTTP 200, otherwise None.
    """
    # Kept global deliberately: save_img_list() reuses these headers.
    global headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    try:
        # timeout prevents a dead page from hanging the whole crawl;
        # RequestException also covers timeouts and bad URLs, not just
        # connection failures.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        print("请求数据失败!!!")
    return None
三:保存到本地文件夹
这没什么好说的直接上代码
def save_img_list(message):
    """Download every image URL in *message* into ./picture_Libs.

    Bug fixes: the original signature named its parameter ``list`` (shadowing
    the builtin) while the body iterated an undefined name ``message``; and a
    failed request still wrote a stale/unbound ``img_data`` to disk.

    Args:
        message: Iterable of image URLs. Reuses the global ``headers`` set by
            get_totle_page().
    """
    if not os.path.exists('./picture_Libs'):
        os.mkdir('./picture_Libs')
    for totle in message:
        try:
            response = requests.get(url=totle, headers=headers)
        except TimeoutError:
            print('请求超时!!!')
            continue  # skip this URL instead of writing stale data
        if response.status_code == 200:
            # .content is raw bytes; no text-encoding handling is needed.
            img_path = './picture_Libs/' + totle.split('/')[-1]
            with open(img_path, 'wb') as fp:
                fp.write(response.content)
四: 最后的结果
全部源码:
import requests
from bs4 import BeautifulSoup
import lxml
import os
from tqdm import tqdm
# Accumulates every unique image URL discovered across all pages.
totle_list = []
# Letters used to brute-force the two varying characters in each page URL.
# Bug fix: the original list contained the typo 'C,' instead of 'C', so pages
# whose id contains 'C' were never tried while an invalid two-char entry was.
ur = ['a','s','d','f','g','h','j','k','l','z','x','c','v','b','n','m','q','w','e','r','t','y','u','i','o','p',
      'A','S','D','F','G','H','J','K','L','Z','X','C','V','B','N','M','Q','W','E','R','T','Y','U','I','O','P']
def get_totle_page(url):
    """Fetch one listing page and return its HTML.

    Args:
        url: The page URL to request.

    Returns:
        The page HTML text on HTTP 200, otherwise None.
    """
    # Kept global deliberately: save_img_list() reuses these headers.
    global headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
    }
    try:
        # timeout prevents a dead page from hanging the whole crawl;
        # RequestException also covers timeouts and bad URLs, not just
        # connection failures.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        print("请求数据失败!!!")
    return None
def get_content_page(html):
    """Parse one comment page and collect image URLs into the global totle_list.

    Args:
        html: HTML text of a jandan.net "ooxx" page, or None when the fetch failed.

    Returns:
        The module-level ``totle_list`` with any newly found image URLs appended.
    """
    try:
        soup = BeautifulSoup(html, "lxml")
        div = soup.find('div', attrs={'id': 'comments'})
        ol_list = div.find(name='ol')
        for ol in ol_list.find_all(name='li'):
            try:
                text_list = ol.find('div', attrs={'class': 'text'})
                # The href is protocol-relative ("//..."), so prefix the scheme.
                img_url = 'http:' + text_list.p.a['href']
                if img_url not in totle_list:
                    print(img_url)
                    totle_list.append(img_url)
            except (AttributeError, KeyError, TypeError):
                # <li> entries without the expected div/p/a structure are skipped.
                pass
    except (AttributeError, TypeError):
        # html was None or had no comments section: the page does not exist.
        print('没有该网址')
    return totle_list
def save_img_list(message):
    """Download every image URL in *message* into ./picture_Libs.

    Bug fixes: the original created ``./grils`` but wrote files into
    ``./picture_Libs`` (crashing when that directory was absent), and a
    failed request still wrote a stale/unbound ``img_data`` to disk.

    Args:
        message: Iterable of image URLs. Reuses the global ``headers`` set by
            get_totle_page().
    """
    if not os.path.exists('./picture_Libs'):
        os.mkdir('./picture_Libs')
    for totle in message:
        try:
            response = requests.get(url=totle, headers=headers)
        except TimeoutError:
            print('请求超时!!!')
            continue  # skip this URL instead of writing stale data
        if response.status_code == 200:
            # .content is raw bytes; no text-encoding handling is needed.
            img_path = './picture_Libs/' + totle.split('/')[-1]
            with open(img_path, 'wb') as fp:
                fp.write(response.content)
def main():
    """Crawl every candidate page URL, collecting image links into totle_list."""
    for i in tqdm(ur):
        for n in tqdm(ur):
            url = "http://jandan.net/ooxx/MjAyMDEwMTUtN{l1}{l2}=#comments".format(l1=i, l2=n)
            print(url)
            html = get_totle_page(url)
            # get_content_page appends into the global totle_list and returns
            # that same list, so no local binding is needed here.
            get_content_page(html)


if __name__ == '__main__':
    main()
    save_img_list(totle_list)