目的:根据输入的导航条名,获取对应的图片并分类保存到本地。
首先导入所需要的模块;如果运行时报错提示未安装,到 cmd 中使用 pip 安装即可。
import requests,os,shutil
import re
import time
from fake_useragent import UserAgent #随机一个User-Agent
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
最近在练习 class 的使用方法,所以这次就使用了类来编写。先上代码,解析在后文。
"""根据输入的类别,获取www.169we.com中的图片"""
# coding:utf-8
import requests,os,shutil
import re
import time
from fake_useragent import UserAgent #随机一个User-Agent
from bs4 import BeautifulSoup
from urllib.request import urlretrieve
# base_url = "http://www.169we.com/diannaobizhi/list_7_1.html"
class Pic169Spider(object):
    """Scrape image galleries from www.169we.com for one category.

    The category name (e.g. "diannaobizhi") selects both the listing URL
    and the local directory the images are saved into.
    """

    def __init__(self, category):
        self.category = category
        self.base_url = "http://www.169we.com/{}/".format(self.category)
        self.headers = {
            "User-Agent": UserAgent().random,
            "Host": "www.169we.com",
        }
        # The image files are hosted on a different domain than the HTML
        # pages, so they need their own Host header.
        self.img_headers = {
            "User-Agent": UserAgent().random,
            "Host": "724.169pp.net",
        }
        # Maps category name to the numeric list id used in listing URLs
        # (list_<id>_<page>.html).
        self.dic = {
            "diannaobizhi": "7",
            "shoujibizhi": "6",
            "wangyouzipai": "2",
            "gaogensiwa": "3",
            "xiyangmeinv": "4",
            "guoneimeinv": "5",
            "xingganmeinv": "1",
        }

    def getIndexPage(self, url):
        """Return the total number of listing pages for the category (as text)."""
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, "lxml")
        page_all = soup.select('body div.page ul li a')
        # The third-from-last pager link carries the last page number.
        return page_all[-3].get_text()

    def getPage(self, url):
        """Return the number of pages inside a single gallery (as text)."""
        response = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(response.text, "lxml")
        page_all = soup.select('body div ul.pagelist li a')
        # The second-from-last pager link carries the last page number.
        return page_all[-2].get_text()

    def getIndexImgHref(self, url):
        """Return (gallery_url, title_html) pairs scraped from one listing page."""
        response = requests.get(url, headers=self.headers)
        response.encoding = "gbk"  # the site serves GBK-encoded pages
        data = re.findall(r'<li><a href="(.*?)" class="pic".*?>.*?<p>(.*?)</p>', response.text, re.S)
        return data

    def getImgUrl(self, url, dirname, index):
        """Download every image found on one gallery page.

        Returns the next unused file index so the caller can keep the
        numbering continuous across pages.  (The previous version restarted
        the index at the page number on every page, which silently
        overwrote files saved from earlier pages.)
        """
        target_dir = os.path.join(self.category, dirname)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        response = requests.get(url, headers=self.headers)
        img_urls = re.findall(r'<p align="center">.*?<img src="(.*?)".*?</p>', response.text, re.S)
        for img_url in img_urls:
            self.saveImg(img_url, dirname, index)
            index += 1
        return index

    def saveImg(self, img_url, dirname, index):
        """Save one image as <category>/<dirname>/<index>.jpg."""
        result = requests.get(img_url, headers=self.img_headers)
        path = os.path.join(self.category, dirname, str(index) + ".jpg")
        with open(path, "wb") as f:
            f.write(result.content)

    def main(self):
        """Walk every listing page of the category and download every gallery."""
        if not os.path.exists(self.category):
            os.makedirs(self.category)
        pages = int(self.getIndexPage(self.base_url))
        for page in range(1, pages + 1):
            print(page)
            num = self.dic[self.category]
            url = "http://www.169we.com/{}/list_{}_{}.html".format(self.category, num, page)
            for data in self.getIndexImgHref(url):
                old_url = data[0]
                # Strip any HTML tags embedded in the gallery title before
                # using it as a directory name.
                pattern = re.compile(r'<.*?>', re.S)
                dirname = re.sub(pattern, '', data[1])
                # Bug fix: keep one running file index per gallery instead
                # of restarting at the page number, which overwrote files.
                file_index = 1
                for i in range(1, int(self.getPage(old_url)) + 1):
                    if i == 1:
                        new_url = old_url
                    else:
                        # Page N of a gallery lives at <name>_N.html.
                        lista = old_url.split(".")
                        lista[-2] = lista[-2] + "_{}".format(i)
                        new_url = ".".join(lista)
                    file_index = self.getImgUrl(new_url, dirname, file_index)
class SingleUrl(object):
    """Download all images of a single gallery URL into ./<gallery-id>/."""

    def __init__(self):
        self.headers = {
            "User-Agent": UserAgent().random
        }
        # Overwritten by getPage() before the download loop runs.
        self.total_page = 1

    def spider(self, url):
        """Fetch every page of the gallery at *url* and save its images."""
        self.getPage(url)
        # Directory name is the last path segment without its extension.
        dirname = url.split('/')[-1].split('.')[0]
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
        os.mkdir(dirname)
        cwd = os.getcwd()
        os.chdir(dirname)
        try:
            self.getImgUrl(url)
            for i in range(2, self.total_page + 1):
                # Page N of the gallery lives at <name>_N.html.
                lista = url.split(".")
                lista[-2] = lista[-2] + "_{}".format(i)
                new_url = ".".join(lista)
                self.getImgUrl(new_url)
        finally:
            # Bug fix: the original never restored the working directory,
            # so a second spider() call nested its folder inside the first
            # and broke all later relative paths in the process.
            os.chdir(cwd)

    def getPage(self, url):
        """Parse the gallery's "共N页" pager text into self.total_page."""
        response = requests.get(url, headers=self.headers)
        response.encoding = "gbk"  # the site serves GBK-encoded pages
        self.total_page = int(re.findall(r'.*?共(.*?)页', response.text, re.S)[0])

    def getImgUrl(self, url):
        """Download every image on one gallery page into the current dir."""
        response = requests.get(url, headers=self.headers)
        img_urls = re.findall(r'<p align="center">.*?<img src="(.*?)".*?</p>', response.text, re.S)
        for img_url in img_urls:
            urlretrieve(img_url, img_url.split('/')[-1])
def all_img():
    """Interactive entry point: ask for a category name and crawl all of it."""
    msg = """
提示信息:
diannaobizhi--——>电脑壁纸
shoujibizhi--——>手机壁纸
wangyouzipai--——>网友自拍
gaogensiwa--——>高跟丝袜
xiyangmeinv--——>西洋美女
guoneimeinv--——>国内美女
xingganmeinv--——>性感美女
"""
    print(msg)
    chosen = input("请输入你要获取的类别名:")
    crawler = Pic169Spider(chosen)
    crawler.main()
def single_url_img():
    """Interactive entry point: ask for one gallery URL and download it."""
    target = input("请将图片链接粘贴此处:")
    SingleUrl().spider(target)
if __name__ == "__main__":
all_img()
#single_url_img()
先介绍 Pic169Spider 类中的方法。
__init__ 方法定义了 Pic169Spider 类中的对象属性。
该方法获取总页数如下图所示
该方法获取图片链接中的总页数
获取分类的链接
比如:从http://www.169we.com/diannaobizhi/list_7_2.html获取到http://www.169we.com/diannaobizhi/2017/0709/39113.html
获取单个图片的链接,用此链接保存到本地
根据图片链接保存图片
运行程序会执行该方法,根据提示输入就可以爬取了
比如想爬取电脑壁纸 在控制台输入 diannaobizhi 按下回车键就可以了
这个类是用来获取单个url的,比如我只想获取http://www.169we.com/diannaobizhi/2017/0630/38997.html
的图片,那么只需要调用single_url_img()函数,将该url传进去就可以了
个人文采不行,语言表达能力较弱,该代码不完善,待改进