# coding: utf-8
import http.client
import http.cookiejar
import logging
import os
import urllib.parse
import urllib.request

from lxml import etree
# Endpoints of the ks5u site used by this script.
LOGIN_URL = 'http://www.ks5u.com//User/Chk_UserLogin.asp'
INDEX_URL = 'http://old.ks5u.com/shiti/gaokao/2016/'
DOWNLOAD_URL = 'http://www.ks5u.com/USER/INC/Downsch.asp?id='

# POST parameters of the login request (placeholders for real credentials).
LOGIN_VALUES = {'userid': '******', 'userpass': '******'}


def file_id_from_href(href):
    """Return the numeric file id encoded in *href*, or None.

    The id is the last path component of the link with everything after
    its first dot removed (e.g. ``.../12345.shtml`` -> ``'12345'``).
    None is returned when *href* is missing or the component is not made
    of digits only.
    """
    if not href:
        return None
    file_id = href.split('/')[-1].split('.')[0]
    return file_id if file_id.isdigit() else None


def decode_header_filename(raw_name):
    """Turn a filename taken from the response headers into a safe local name.

    The header value is read as ISO-8859-1 text although the site sends it
    as GBK, so it is re-encoded and decoded as GBK.  If that round trip is
    not possible the value is used unchanged.  Directory components are
    removed so a server-supplied name cannot point outside the current
    directory.  Returns None when no usable name remains.
    """
    if not raw_name:
        return None
    try:
        name = raw_name.encode('ISO-8859-1').decode('gbk')
    except (UnicodeEncodeError, UnicodeDecodeError):
        name = raw_name
    # Treat backslashes as separators too, then keep only the last component.
    name = os.path.basename(name.replace('\\', '/'))
    if name in ('', '.', '..'):
        return None
    return name


def login():
    """Install a cookie-aware global opener and post the login form.

    The cookies set by the login response are kept by the opener and sent
    with every later ``urllib.request.urlopen`` call.
    """
    cookie_jar = http.cookiejar.CookieJar()
    opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(cookie_jar))
    urllib.request.install_opener(opener)

    # The request body must be bytes.
    data = urllib.parse.urlencode(LOGIN_VALUES).encode('ascii')
    req = urllib.request.Request(LOGIN_URL, data)
    logging.info('开始登陆网站!')
    with urllib.request.urlopen(req) as response:
        response.read()
    # NOTE: the response body is not inspected; this only means the
    # request itself completed without an HTTP error.
    logging.info('登陆成功!')


def fetch_index():
    """Download the 2016 exam index page and return it as text."""
    logging.info('访问2016高考真题下载页面!')
    with urllib.request.urlopen(INDEX_URL) as response:
        return response.read().decode('utf-8')


def download_file(file_id, referer):
    """Download the document *file_id* into the current directory.

    *referer* is sent as the ``Referer`` header: the site checks that
    header as an anti-hotlinking measure.  Header values must not contain
    non-latin-1 characters, otherwise the request cannot be encoded.

    The file is skipped when the server sends no filename or when a file
    with that name already exists.  The body is read completely before
    the output file is created, so a failed transfer does not leave an
    empty file behind that a later run would treat as already downloaded.
    """
    headers = {'Referer': referer}
    req = urllib.request.Request(DOWNLOAD_URL + file_id, None, headers)
    logging.info('开始下载编号为%s的文件', file_id)
    with urllib.request.urlopen(req) as response:
        filename = decode_header_filename(response.headers.get_filename())
        if filename is None:
            logging.warning('编号为%s的文件未返回文件名,跳过', file_id)
            return
        if os.path.exists(filename):
            logging.info('文件%s已经存在!', filename)
            return
        logging.info('获得文件名%s', filename)
        content = response.read()

    with open(filename, 'wb') as outfile:
        outfile.write(content)
    logging.info('保存文件%s成功!', filename)


def main():
    """Log in, scan the index page and download every linked document."""
    logging.basicConfig(level=logging.INFO)
    login()

    root = etree.HTML(fetch_index())
    if root is None:
        # Nothing could be parsed from the page, so there is nothing to do.
        return

    # XPath: any <div> whose class attribute equals "sub_jiexi".
    for div in root.xpath('//div[@class="sub_jiexi"]'):
        for link in div:
            if link.text != '解析':
                continue
            href = link.get('href')
            file_id = file_id_from_href(href)
            if file_id is None:
                continue
            try:
                download_file(file_id, href)
            except (OSError, http.client.HTTPException):
                # Best effort: report the failure and go on with the next file.
                logging.exception('下载编号为%s的文件失败', file_id)


if __name__ == '__main__':
    main()
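# Using urllib to download the 2016 gaokao (college entrance exam) papers from ks5u.
# (Scraped page footer: latest recommended article published 2024-07-22 20:15:41.)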