中小学教材资源网站:http://www.xscbs.com
所用到的模块
- requests
- urllib
- BeautifulSoup
- re
- os
网站截图
Python 源码
# -*- coding: utf-8 -*-
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import os
import re
# Base URL of the textbook site and the local folder the PDFs land in.
root_url = r"http://www.xscbs.com"
root_path = "C:\\Users\\Administrator\\Desktop\\教学资源"
# Mutable module-level state: the current grade / book directory,
# filled in progressively as the crawl descends the site hierarchy.
grade_path = ""
book_path = ""
# A single shared session so the login cookie is reused by every request.
session = requests.session()
if not os.path.exists(root_path):
    os.mkdir(root_path)
#登陆
def login():
    """Log in to the site so the shared session carries the auth cookie,
    then start the crawl from the grade index page.

    Side effects: POSTs the credentials below, then calls get_grade_url(),
    which drives the whole grade -> book -> chapter download.
    """
    global session
    # Form fields captured from a real login request — fill in your own
    # account, password and the site's __hash__ token before running.
    login_data = {
        'username': '你的账号',
        'password': '你的密码',
        '__hash__': ' hash值'
    }
    session.post('http://www.xscbs.com/index.php/Index/login', data=login_data)
    # Logged in; walk the resource tree.
    get_grade_url()
#下载pdf
def down_pdf(url, path, name):
    """Download the PDF at *url* into directory *path* as ``<name>.pdf``.

    Uses the shared logged-in session. Prints a completion message.
    """
    global session
    # os.path.join produces the same path as the old '\\' concatenation on
    # Windows, and also works on other platforms.
    file_path = os.path.join(path, name + ".pdf")
    data = session.get(url).content
    # 'with' guarantees the handle is closed even if write() raises
    # (the original open/close pair leaked the handle on error).
    with open(file_path, "wb") as f:
        f.write(data)
    print(name + "完成下载!")
#获取网页html
def get_html(url):
    """Fetch *url* with the shared session and return the body as text."""
    global session
    response = session.get(url)
    # Decode as UTF-8 regardless of the charset the response declares.
    response.encoding = 'utf-8'
    return response.text
#网站的资源是按年级上下期->学科->学科章节
def get_grade_url():
    """Crawl the grade index and descend into every grade's book list.

    The site is organized as grade/term -> subject book -> chapters.
    For each grade link found, a folder is created under root_path and
    get_book_url() continues the crawl from that grade's page.
    """
    # This page links to every grade's first/second-term resource list.
    grade_url = r"http://www.xscbs.com/index.php/Resource/index/types/1"
    # Explicit parser: BeautifulSoup without one emits a warning and may
    # pick a different parser per machine.
    soup = BeautifulSoup(get_html(grade_url), "html.parser")
    # Raw string with escaped dots — the original non-raw pattern relied on
    # '.' matching any character and on the deprecated '\d' string escape.
    link_re = re.compile(r"/index\.php/Resource/index/grade/\d*/types/1")
    links = soup.find_all('a', attrs={"href": link_re})
    global grade_path
    for link in links:
        # One folder per grade, named after the link text.
        grade_path = os.path.join(root_path, link.string)
        if not os.path.exists(grade_path):
            os.mkdir(grade_path)
        get_book_url(root_url + link["href"])
def get_book_url(url):
    """From a grade page, pick the first listed book and crawl its chapters.

    Only the first entry of the .zhiyuanlistbg list is used (originally
    intended to be the maths book, though the site's ordering is not
    guaranteed — TODO confirm against the live page).
    """
    soup = BeautifulSoup(get_html(url), "html.parser")
    # First three book links on the page; only the first one is downloaded.
    book_links = soup.select(".zhiyuanlistbg > li > span > a")[0:3]
    if not book_links:
        # Empty grade or changed page layout — the original code crashed
        # with IndexError here.
        return
    global book_path
    book_path = os.path.join(grade_path, book_links[0].string)
    if not os.path.exists(book_path):
        os.mkdir(book_path)
    download_chapter(root_url + book_links[0]["href"])
def download_chapter(url):
    """Given a book's page URL, download every chapter PDF into book_path."""
    soup = BeautifulSoup(get_html(url), "html.parser")
    for link in soup.select('.affff > a'):
        # Each chapter page holds the real PDF link inside the .pageflt
        # breadcrumb block; fetch the page and pull it out.
        chapter_soup = BeautifulSoup(get_html(root_url + link["href"]), "html.parser")
        pdf_href = chapter_soup.select(".wrapper > .mxfl > .pageflt > font > a")[0]["href"]
        # Clean the chapter title: trim edges and drop full-width spaces.
        chapter_name = link.string.strip().replace("\u3000", "")
        down_pdf(root_url + pdf_href, book_path, chapter_name)
login()