import math
import os

import requests
from bs4 import BeautifulSoup  # the 'lxml' parser backend must also be installed

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77'
}
num = 0    # templates downloaded so far
in_se = 0  # number of templates the user asked for


def get_url():
    # Ask how many templates to crawl and walk the paginated template list
    # (20 templates per list page).
    select = int(input("How many templates do you want to crawl: ").strip())
    global in_se
    in_se = select
    pages = math.ceil(select / 20)
    url_list = [f"http://www.1ppt.com/moban/ppt_moban_{i + 1}.html" for i in range(pages)]
    for url in url_list:
        page_text = requests.get(url=url, headers=headers)
        page_text.encoding = 'gb2312'
        soup = BeautifulSoup(page_text.text, 'lxml')
        li_url = soup.select(".tplist>li>a")
        li_name = soup.select(".tplist>li>h2>a")
        for i in range(len(li_url)):
            detail_url = "http://www.1ppt.com" + li_url[i]['href']
            name = li_name[i].text
            get_down(detail_url, name)


def get_down(url, name):
    # Open the template's detail page and pick up the link to its download page.
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    li_down = soup.select(".downurllist>li>a")
    down_url = "http://www.1ppt.com/" + li_down[0]['href']
    down(down_url, name)


def down(down_page, name):
    # Follow the download page to the real archive link and save it under ./PPT/.
    page_text = requests.get(url=down_page, headers=headers).text
    soup = BeautifulSoup(page_text, 'lxml')
    down_url = soup.select(".downloadlist>li>a")[0]['href']
    data = requests.get(url=down_url, headers=headers).content
    if not os.path.exists('./PPT'):
        os.mkdir('./PPT')
    path = "./PPT/" + name + ".zip"
    global num
    if num != in_se:
        with open(path, 'wb') as f:
            f.write(data)
        print(f"{name} downloaded successfully!")
        num += 1
    else:
        print(f"All done, {num} PPT templates downloaded!")
        exit()


if __name__ == "__main__":
    get_url()
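One caveat, as a hedged aside: `name` comes straight from the page and can contain characters that are not legal in file names, which would make the `open(path, 'wb')` call fail. A minimal sketch of a sanitizing helper that could be applied before building `path` (the name `safe_name` is hypothetical, not part of the script above):

import re

def safe_name(name):
    # Replace characters that Windows/Unix forbid in file names with "_".
    return re.sub(r'[\\/:*?"<>|]', '_', name).strip()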
Crawl by selected category
import os

import requests
from bs4 import BeautifulSoup  # the 'lxml' parser backend must also be installed

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.128 Safari/537.36 Edg/89.0.774.77'
}
num = 0  # templates downloaded so far


def Select():
    # Query the site's Baidu in-site search with a keyword and crawl the first
    # matching category page; re-prompt if the keyword returns nothing usable.
    data = input("Search keyword: ").strip()
    page = {
        'q': data,
        'click': 1,
        'cc': '1ppt.com',
        's': '',
        'nsid': ''
    }
    url = "http://zhannei.baidu.com/cse/site?"
    page_text = requests.get(url=url, params=page, headers=headers)
    page_text.encoding = 'utf-8'
    soup = BeautifulSoup(page_text.text, 'lxml')
    try:
        url_li = soup.select('.c-title>a')[0]['href']
        get_url(url_li)
    except Exception:
        print("Please enter a different keyword")
        Select()


def get_url(url):
    # Crawl every template listed on the category page found by Select().
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    li_url = soup.select(".tplist>li>a")
    li_name = soup.select(".tplist>li>h2>a")
    for i in range(len(li_url)):
        detail_url = "http://www.1ppt.com" + li_url[i]['href']
        name = li_name[i].text
        get_down(detail_url, name)


def get_down(url, name):
    # Open the template's detail page and pick up the link to its download page.
    page_text = requests.get(url=url, headers=headers)
    page_text.encoding = 'gb2312'
    soup = BeautifulSoup(page_text.text, 'lxml')
    li_down = soup.select(".downurllist>li>a")
    down_url = "http://www.1ppt.com/" + li_down[0]['href']
    down(down_url, name)


def down(down_page, name):
    # Follow the download page to the real archive link and save it under ./PPT/.
    page_text = requests.get(url=down_page, headers=headers).text
    soup = BeautifulSoup(page_text, 'lxml')
    down_url = soup.select(".downloadlist>li>a")[0]['href']
    data = requests.get(url=down_url, headers=headers).content
    if not os.path.exists('./PPT'):
        os.mkdir('./PPT')
    path = "./PPT/" + name + ".zip"
    with open(path, 'wb') as f:
        f.write(data)
    global num
    num += 1
    print(f"{name}\033[0;35m... downloaded successfully, {num} in total\033[0m")


if __name__ == "__main__":
    Select()
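In both versions, the last request in `down()` writes whatever bytes come back straight into a `.zip` file, so a dead download link silently produces a corrupt archive. A small sketch of a guard that could be used in place of the bare `requests.get(...).content` call (the helper name `fetch_zip_bytes` is hypothetical, not part of the original scripts):

import requests

def fetch_zip_bytes(down_url, name, headers):
    # Fetch the archive, but refuse to hand back the bytes if the server
    # answered with an HTML error page rather than a real file.
    resp = requests.get(url=down_url, headers=headers)
    if resp.ok and not resp.headers.get('Content-Type', '').startswith('text/html'):
        return resp.content
    print(f"{name}: download link did not return an archive, skipping")
    return None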