A very simple little scraper for downloading photo galleries of pretty girls; straight to the code:
# -*- coding: utf-8 -*-
# Runs on Python 2.
import os
import time

import requests
from bs4 import BeautifulSoup

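# One shared Session reuses the underlying TCP connection (and any cookies)
# across all of the requests below.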
request = requests.Session()

# The gallery pages and the image downloads send the same headers,
# so define them once here.
HEADERS = {
    'Referer': 'http://www.mmjpg.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5383.400 QQBrowser/10.0.1313.400',
}

# Return the HTML content fetched from url.
def getHtml(url):
    rs = request.get(url, headers=HEADERS)
    return rs.content

# Download the image at url into path, saved as <index>.jpg.
def savePic(url, path, index):
    rs = request.get(url, headers=HEADERS)
    with open(os.path.join(path, str(index) + '.jpg'), 'wb') as f:
        f.write(rs.content)  # the with statement closes the file automatically
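
# Gallery pages live at <base>/mm/<id>; instead of parsing the index page,
# just try every id from 0 to 2000 (ids with no gallery are skipped below).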
url = "http://www.mmjpg.com"
# index_soup = BeautifulSoup((getHtml(url)),'html.parser')
# pages_list = index_soup.find_all('li')
page_url_list=[]
# for curr in pages_list:
# page_url_list.append(curr.a['href'])
count = 0
while count <= 2000:
page_url = url +'/mm/' +str(count)
page_url_list.append(page_url)
count = count + 1
# print page_url_list

# Request each gallery page in turn.
dir_index = 1
for curr_url in page_url_list:
    try:
        soup = BeautifulSoup(getHtml(curr_url), 'html.parser')
    except Exception:
        # First request failed: retry once per second, and give up on this
        # URL after 100 failed attempts.
        soup = None
        time_sleep_count = 0
        while soup is None:
            time.sleep(1)
            try:
                soup = BeautifulSoup(getHtml(curr_url), 'html.parser')
            except Exception:
                time_sleep_count = time_sleep_count + 1
                if time_sleep_count == 100:
                    print "Your network may be down, please check it"
                    break
        if soup is None:
            continue
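    # Skip ids that do not resolve to a gallery (no .article block on the page).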
    if len(soup.select('.article')) == 0:
        continue
    title = soup.select('.article')[0].h2.get_text().encode('utf-8')
    print "About to crawl: " + title
    # The largest number in the pagination bar is the total page count.
    # Page-number links are at most 3 bytes long, which filters out longer
    # labels such as "下一页" (next page).
    total = 1
    page = soup.select('.page')[0]
    for children in page.children:
        if len(children.contents) > 0 and 0 < len(children.contents[0].encode('utf-8')) < 4:
            curr = int(str(children.contents[0].encode('utf-8')))
            if curr > total:
                total = curr
    # Each gallery shows one image per page: page 1 is curr_url itself,
    # page N is curr_url/N.
    pages = []
    pages.append(curr_url)
    for i in range(2, total + 1):
        pages.append(curr_url + '/' + str(i))
    # Save this gallery into its own numbered directory.
    index = 1
    print "Current working directory:", os.getcwd()
    path = os.path.join(os.getcwd(), str(dir_index))
    print path
    if not os.path.exists(path):
        os.makedirs(path)
        print path + " created"
    # Pull the real image address out of every page and save it.
    for pic_page_url in pages:
        soup = BeautifulSoup(getHtml(pic_page_url), 'html.parser')
        pic_url = soup.select('.content')[0].img['src']
        savePic(pic_url, path, index)
        index = index + 1
    dir_index = dir_index + 1
The idea:
The key is to find the site's patterns yourself; once you spot them, the rest really is simple. Pillars of the nation, do look after your health, sigh.
What I do is go through the galleries one by one: first work out how many images each series has, then open the page for each individual image, take the real image address from it, and request that address to get the data and save it as a picture.
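To make the pattern concrete, this is the URL scheme the loops above rely on (the gallery id 1234 is a made-up example):

http://www.mmjpg.com/mm/1234      <- page 1 of gallery 1234
http://www.mmjpg.com/mm/1234/5    <- page 5 of the same gallery

Each such page contains one .content img tag, and its src attribute is the address of the actual image file.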
Remember to include the Referer header: without it, the site's hotlink protection kicked in and every "image" I downloaded kept turning out to be the same very pretty landscape photo, ha ha.
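As a minimal sketch of the difference (the image address below is made up; substitute any src value scraped above):

# -*- coding: utf-8 -*-
import requests

img_url = 'http://img.mmjpg.com/2017/1234/1.jpg'  # hypothetical example address

no_referer = requests.get(img_url)
with_referer = requests.get(img_url, headers={'Referer': 'http://www.mmjpg.com'})

# If hotlink protection is active, the first response is the placeholder
# landscape photo, so the two byte counts should differ.
print len(no_referer.content), len(with_referer.content)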
Feel free to leave a comment if anything is unclear. Enjoy!