前阵子接触到Python语言的威力,于是动手写了一段代码,直接抓取自己平时喜欢看的案例并分文件夹保存下载。
代码:
# Python 3 crawler
# A simple crawler that scrapes images
import urllib.request
from bs4 import BeautifulSoup
import os
def get_content(url, data=None):
    """Fetch *url* with browser-like headers and return the body as UTF-8 text.

    Args:
        url: page URL to fetch (always requested via GET).
        data: accepted for backward compatibility; currently unused.

    Returns:
        The response body decoded as UTF-8.
    """
    webheader = {
        'Access-Control-Allow-Credentials': 'true',
        'Access-Control-Allow-Headers': 'x-requested-with,content-type,Cache-Control,Pragma,Date,x-timestamp',
        'Access-Control-Allow-Methods': 'POST, GET, OPTIONS, PUT, DELETE',
        'Access-Control-Expose-Headers': 'WWW-Authenticate, Server-Authorization'
    }
    req = urllib.request.Request(url=url, headers=webheader)
    # Original code never closed the response, leaking the socket;
    # the context manager closes it deterministically.
    with urllib.request.urlopen(req) as web_page:
        return web_page.read().decode('UTF-8')
def get_case_url(html):
    """Parse a listing page and collect every case link.

    Args:
        html: HTML text of a listing page.

    Returns:
        A list of single-element lists, each wrapping one case URL
        (the nested shape is what the caller iterates over).
    """
    soup = BeautifulSoup(html, "html.parser")
    left_column = soup.body.find(attrs={"class": "rows-left"})
    group = left_column.find(attrs={"class": "grouppic"})
    links = []
    for story in group.find_all(attrs={"class": "story"}):
        href = story.find('h4').find('a').get('href')
        links.append([href])
    return links
def save_case_pic(case_html):
    """Download every image on a case page into ``TpCases/<page title>/``.

    Known thumbnail file names are skipped.  A failure while downloading
    one image is reported and does not abort the remaining downloads.

    Args:
        case_html: HTML text of a single case page.
    """
    if not os.path.exists('TpCases'):
        os.mkdir('TpCases')  # root folder shared by all cases
    bs = BeautifulSoup(case_html, "html.parser")
    title = bs.title.string  # page title names the per-case folder
    content = bs.body.find('div', id='main').find('div', class_="pdt10").find_all('img')
    # Build the exclusion set once (original rebuilt a list named `list`,
    # shadowing the builtin, on every iteration).
    excluded = {'120_120.gif', '53bbb45c07c5c_thumb.jpg'}
    for case_pic in content:
        pic_url = case_pic.get('src')
        # Slice off everything up to the last '/': the original
        # `pic_url.replace(prefix, '')` would misbehave if the prefix
        # substring occurred more than once in the URL.
        name = pic_url[pic_url.rindex('/') + 1:]
        if name in excluded:
            continue
        mkdir(title)  # create the per-case folder (no-op if it exists)
        savepath = 'TpCases/' + title + '/' + name
        try:
            urllib.request.urlretrieve(pic_url, savepath)  # download the image
            print(name + ' save success!')
        except Exception:
            # Was a bare `except:`, which also swallowed KeyboardInterrupt.
            print('失败')
# Create a new directory
def mkdir(path):
    """Create ``TpCases/<path>`` if it does not yet exist.

    Leading/trailing whitespace in *path* is stripped first.

    Args:
        path: folder name, relative to the ``TpCases`` root.

    Returns:
        True when the directory was created, False when it already existed.
    """
    folder = path.strip()
    target = 'TpCases/' + folder
    if os.path.exists(target):
        # Already present: report that nothing was created.
        return False
    print(u"偷偷新建了名字叫做", folder, u'的文件夹')
    os.makedirs(target)
    return True
if __name__ == '__main__':
    # Crawl the first two listing pages and download every case's images.
    # (Original used `for y in range(0, 2)` plus `page = y + 1` and a dead
    # `y = 1` initializer; `range(1, 3)` yields the same pages 1 and 2.)
    for page in range(1, 3):
        url = "http://www.topthink.com/group/1525/%s" % page
        html = get_content(url)
        # get_case_url() returns a list of one-element lists of URLs.
        for case_links in get_case_url(html):
            for case_link in case_links:
                case_html = get_content(case_link)   # fetch the case page
                save_case_pic(case_html)             # save its images