Scraping PPT Templates with Python (requests + BeautifulSoup + multiprocessing)
With graduation-project season coming up, I needed some good-looking PPT templates and found plenty at http://www.ypppt.com. But I like some and not others, and downloading and opening them one by one was far too slow, so I decided to write a crawler to batch-download them for me.
# Install the libraries below with pip before running
# Target site: http://www.ypppt.com
import urllib.request as req  # page-fetching library
from bs4 import BeautifulSoup  # HTML parsing library
import requests
import os
import time
from multiprocessing import Pool  # process pool for parallel downloads
class PPT():
    # The whole crawler lives in this class
    def __init__(self):
        self.baseUrl = "http://www.ypppt.com/moban/"
        # Spoof a browser User-Agent so the site serves us normally
        self.header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                                     ' (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36'}
        self.page_count = 0
    def getResponseContent(self, url):
        # Fetch a page and return the decoded HTML, or None on failure
        try:
            req_str = req.Request(url, headers=self.header)
            response = req.urlopen(req_str, timeout=10)
        except Exception as e:
            print("Request failed:", url, e)
            return None
        else:
            return response.read().decode('utf-8')
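    # (My addition, not in the original script) If the site drops the odd
    # connection, a retrying fetch built on requests can stand in for the
    # method above; a minimal sketch:
    def getWithRetry(self, url, tries=3):
        for attempt in range(tries):
            try:
                resp = requests.get(url, headers=self.header, timeout=10)
                resp.raise_for_status()
                return resp.text
            except requests.RequestException as e:
                print("Retry", attempt + 1, "after error:", e)
                time.sleep(2)
        return None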
    def getPageInfo(self, url):
        # Read the total number of list pages from the pagination bar
        html = self.getResponseContent(url)
        if html is None:
            return
        # 'html.parser' is the parser bundled with the standard library;
        # BeautifulSoup uses it to build the tag tree we query below
        soup = BeautifulSoup(html, 'html.parser')
        page_info = soup.find('div', attrs={'class': 'page-navi'})
        a_list = page_info.find_all('a')
        last_a = a_list[-1]  # the last link points at the final page, e.g. list-30.html
        href = last_a['href']
        page_count = href.replace('list-', '').replace('.html', '')
        self.page_count = int(page_count)
    def spyder(self, url):
        # Parse one list page and collect the links to each template's detail page
        html = self.getResponseContent(url)
        if html is None:
            return
        soup = BeautifulSoup(html, 'html.parser')
        divs = soup.find_all('div', attrs={'class': 'wrapper'})
        div = divs[1]  # the second wrapper div holds the template list
        ul = div.find_all('ul')[3]
        li_list = ul.find_all('li')
        ppt_link_list = []
        for li in li_list:
            # the second <a> in each <li> is the detail-page link
            aTag_href = li.find_all('a')[1]['href']
            ppt_link = "http://www.ypppt.com" + aTag_href
            ppt_link_list.append(ppt_link)
        # Fan the downloads for this page out across 4 worker processes
        pool = Pool(processes=4)
        pool.map(self.PPT_info, ppt_link_list)
        pool.close()
        pool.join()
    def PPT_info(self, url):
        # Follow a template's detail page to its download page
        html = self.getResponseContent(url)
        if html is None:
            return
        soup = BeautifulSoup(html, 'html.parser')
        down_button = soup.find('a', attrs={'class': 'down-button'})['href']
        down_url = "http://www.ypppt.com" + down_button
        self.DL_PPT(down_url)
        time.sleep(1)  # be polite to the server between downloads
    def DL_PPT(self, url):
        # Pull the archive link off the download page and save the file
        html = self.getResponseContent(url)
        if html is None:
            return
        soup = BeautifulSoup(html, 'html.parser')
        ul = soup.find('ul', attrs={'class': 'down clear'})
        rar_link = ul.find_all('a')[0]['href']
        # A few download links are relative, so prepend the host when needed
        if '.com' not in rar_link:
            rar_link = 'http://www.ypppt.com' + rar_link
        ppt_name = soup.find('h1').text
        if ppt_name.find('-') > 0:
            ppt_name = ppt_name.split('-')[0].strip()
        f = requests.get(rar_link, headers=self.header)
        # Keep the archive's own extension (.rar, .zip, ...) rather than assuming .rar
        ext = os.path.splitext(rar_link)[1] or '.rar'
        with open(ppt_name + ext, 'wb') as rar:
            rar.write(f.content)
        print(ppt_name, "downloaded...")
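    # (My addition, not in the original script) Template titles can contain
    # characters Windows forbids in file names (\ / : * ? " < > |), which
    # would make open() above fail; scrubbing the name first avoids that:
    def cleanName(self, name):
        for ch in '\\/:*?"<>|':
            name = name.replace(ch, '_')
        return name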
if __name__ == "__main__":  # the guard is required on Windows for multiprocessing
    ppt = PPT()
    start_time = time.time()
    ppt.getPageInfo(ppt.baseUrl)
    ppt.spyder(ppt.baseUrl)  # the first list page has no list-N suffix
    for page in range(2, ppt.page_count + 1):
        url = ppt.baseUrl + "list-{}.html".format(page)
        ppt.spyder(url)
        time.sleep(1)
        print(url, "done...")
    end_time = time.time()
    print("Elapsed:", end_time - start_time, "seconds")
And just like that, the crawler pulled down a pile of PPT templates for me to pick through at my leisure.