import requests
import os
from lxml import etree
from requests.exceptions import RequestException
url_list = []
base_url = 'http://online.sccnn.com/html/cion/index-'
# Directory where downloaded icon files are stored
pach = './ico/'
# Maximum page number to crawl
max_page = 20
# Page number to start crawling from
start_page = 1
def get_page(url):
    """
    Fetch the HTML of a listing page.

    :param url: URL of a listing (index) page
    :return: raw response body (bytes) suitable for lxml parsing,
             or None when the request fails or the status is not 200
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3)'
                      ' AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/65.0.3325.162 Safari/537.36'
    }
    try:
        # Without a timeout, a stalled server would hang the scraper forever.
        html = requests.get(url, headers=headers, timeout=10)
        print('正在解析当前页面的信息……')
        if html.status_code == 200:
            return html.content
        return None
    except RequestException:
        return None
def parse_html(content):
    """
    Extract the links of the individual icon-set pages from a listing page.

    :param content: raw HTML bytes of a listing page
    :return: list of icon-set page URLs, de-duplicated,
             first-seen order preserved
    """
    html = etree.HTML(content)
    result = html.xpath('//table//table//table//td//a/@href')
    print("正在获取页面内各个图片集的链接……")
    # Keep only links into the icon-set section; dict.fromkeys de-duplicates
    # in O(n) while preserving insertion order (vs. O(n^2) list membership).
    wanted = 'http://online.sccnn.com/html/cion/png'
    return list(dict.fromkeys(link for link in result if wanted in link))
def make_dir(file_path):
    """
    Create the directory if it does not already exist.

    :param file_path: path of the directory to create
    """
    if not os.path.exists(file_path):
        # exist_ok=True guards against the race where the directory appears
        # between the exists() check above and this makedirs() call.
        os.makedirs(file_path, exist_ok=True)
        print("【%s】文件夹不存在,正在创建!" % file_path)
def get_ico(url_list):
    """
    Visit each icon-set page, collect the icon image URLs, and save every
    icon under the storage path as <pach>/<set name>/<index>.png.

    :param url_list: list of icon-set page URLs
    :return: None
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3)'
                      ' AppleWebKit/537.36 (KHTML, like Gecko)'
                      ' Chrome/65.0.3325.162 Safari/537.36'
    }
    for url in url_list:
        try:
            html = requests.get(url, headers=headers, timeout=10)
            if html.status_code != 200:
                continue
            href = etree.HTML(html.content)
            main_name = href.xpath('//td[@style="WORD-BREAK: break-all"]/text()')
            if not main_name:
                # Malformed page: no title cell, nothing to name the folder by.
                continue
            print('正在爬取【%s】目录下的图标' % main_name[0])
            ico_src = href.xpath('//img[@title="点击查看原图"]/@src')
            # Some sets are empty placeholders with no real content;
            # skip any page carrying only a handful of image tags.
            if len(ico_src) > 3:
                make_dir(pach + main_name[0])
                for index_num, li in enumerate(ico_src, start=1):
                    # Was '%s' % main_name — printed the whole list instead
                    # of the set name; use the first (only) text node.
                    print('正在爬取%s第%d张' % (main_name[0], index_num))
                    # li carries a relative prefix; dropping the first 8 chars
                    # rebuilds the absolute image URL (original behavior).
                    get_jpg = requests.get('http://online.sccnn.com/' + li[8:],
                                           timeout=10)
                    with open(pach + main_name[0] + '/' + str(index_num) + '.png',
                              'wb') as f:
                        f.write(get_jpg.content)
        except RequestException:
            # One failed page must not abort the remaining URLs
            # (the original returned, dropping everything after it).
            continue
if __name__ == '__main__':
    # Crawl listing pages start_page..max_page inclusive; the original
    # range(start_page, max_page) stopped one page short of max_page.
    for num in range(start_page, max_page + 1):
        print('正在爬取第%s页的图标……' % num)
        home_page = get_page(base_url + str(num) + '.htm')
        # get_page returns None on any failure; etree.HTML(None) would raise,
        # so skip the page instead of crashing the whole run.
        if home_page is None:
            continue
        url_list = parse_html(home_page)
        get_ico(url_list)
# Scrapes PPT/icon image assets using the requests and lxml libraries.
# (Scraped-page artifact, preserved: latest recommended article published 2023-05-16 13:57:57.)