When crawling Baidu Wenku, you will find that requests made with Python's requests library and requests made from a browser give different results: when loading a document such as a PPT or Word file, the HTML contains only the page skeleton, with no data in sight. It quickly becomes clear that the browser loads the data dynamically via scripts.
By analyzing the page, we can see the data returned in JSON format, so let's look at the request headers of that response:
The request uses the GET method, and the URL carries quite a few parameters. Now we need to analyze this URL and try to obtain those parameters from the HTML or JS code so that we can assemble the URL ourselves.
Sure enough, the data we want turns up in the HTML:
This is the URL copied from the headers of the JSON response just now: https://wenku.baidu.com/browse/getbcsurl?doc_id=44b5cb0abb68a98271fefa4f&pn=1&rn=99999&type=ppt&callback=jQuery1101016035481154345121_1545140616465&_=1545140616466. The part &callback=jQuery1101016035481154345121_1545140616465&_=1545140616466 could not be found anywhere in the HTML, but after some testing I discovered that https://wenku.baidu.com/browse/getbcsurl?doc_id=44b5cb0abb68a98271fefa4f&pn=1&rn=99999&type=ppt returns the JSON data I want just as well, so we are all set.
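(The callback parameter is just the name of the jQuery JSONP callback and _ is a cache-busting timestamp, which is why both can be dropped.) As a quick sanity check, the trimmed request can also be written with the params argument of requests; a minimal sketch using the doc_id from above:

import requests

# callback and _ are jQuery JSONP artifacts and can simply be omitted
params = {"doc_id": "44b5cb0abb68a98271fefa4f", "pn": 1, "rn": 99999, "type": "ppt"}
resp = requests.get("https://wenku.baidu.com/browse/getbcsurl", params=params)
print(resp.text[:200])  # the JSON containing the image URLs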
Putting it together:
import requests
import re

# fetch the document page and pull the docId out of the page source
req = requests.get("https://wenku.baidu.com/view/aa31a84bcf84b9d528ea7a2c.html")
text = req.text
docid = re.findall(r"'docId': '[\d\w]+'", text)[0]
docid = docid[10:-1]  # strip the surrounding 'docId': '...' wrapper
print(docid)

# query the getbcsurl interface with the extracted docId
url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=%s&pn=1&rn=99999&type=ppt" % (docid)
value = requests.get(url).text
print(value)
The result:
At this point, all that is left is to request each of these image URLs and save the responses as image files, and the task is done.
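A minimal version of that download loop might look like the following sketch (the URL regex mirrors the one used in the full code below; the sequential file names are my own choice):

import re
import requests

def save_ppt_images(json_text, out_dir):
    # pull the escaped image URLs out of the JSON and strip the backslashes
    picUrls = [u.replace('\\', '') for u in re.findall(r"http[:\\/\w\d\.\?&_=-]+", json_text)]
    for i, picUrl in enumerate(picUrls):
        with open("%s/%d.png" % (out_dir, i), "wb") as f:
            f.write(requests.get(picUrl).content)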
The detailed code, including the UI, follows:
import tkinter
import tkinter.filedialog
import tkinter as TK
import re
import requests

def saveLocationSelect(results):
    detailUI = TK.Tk()
    detailUI.geometry("800x600")
    label = TK.Label(detailUI, width=800)
    label.pack()
    # ask the user for a folder to save the slide images in
    filename = tkinter.filedialog.askdirectory()
    if filename != '':
        print(filename)
        # extract the escaped image URLs from the JSON and strip the backslashes
        picUrls = re.findall(r"http[:\\/\w\d\.\?&_=-]+", results)
        for picUrl in picUrls:
            picUrl = picUrl.replace('\\', '')
            print(picUrl)
            # build a local file name from the tail of the URL (e.g. "png=1" -> "1.png")
            picName = re.findall(r"[\w\d=-]+$", picUrl)[0]
            pn = picName.split('=')
            localAddr = filename + "/" + pn[1] + "." + pn[0]
            print(localAddr)
            with open(localAddr, "wb") as f:
                f.write(requests.get(picUrl).content)
        label.config(text="Data loaded")
    else:
        label.config(text="No folder was selected")
    detailUI.mainloop()

def getPPT(response):
    # extract the docId from the page source and query the getbcsurl interface
    docid = re.findall(r"docId:[\d\w\s']+", response)[0]
    docid = docid[8:-1]
    url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=%s&pn=1&rn=99999&type=ppt" % (docid)
    value = requests.get(url).text
    return value

headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36"}  # mimic a mobile browser
url = "https://wenku.baidu.com/view/aa31a84bcf84b9d528ea7a2c.html"
response = requests.get(url, headers=headers).text
saveLocationSelect(getPPT(response))
Part 2: Crawling Word documents
As before, we start by analyzing the page in the browser, where we can see the data we want.
By searching the page source for part of the request URL string, we can locate the position shown below:
Copying out a part of it:
It is easy to see that this is the JSON data link we need, with \x22 (the escaped double quote) used as the delimiter:
Now let's request this URL in a browser (note: the backslashes in the URL must be removed first, e.g. https:\\\/\\\/wkbjcloudbos.bdimg.com\\\/v1\\\/docconvert4826\\\/wk\\\/6833e… must be rewritten as https://wkbjcloudbos.bdimg.com/v1/docconvert4826/wk/6833e…).
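Both the \x22 delimiter and the backslashes can be handled with plain string operations; a minimal sketch on a made-up fragment in the same shape as that data:

# a made-up fragment shaped like the embedded data, delimited by \x22
fragment = r'\x22https:\\\/\\\/wkbjcloudbos.bdimg.com\\\/v1\\\/docconvert4826\\\/wk\\\/6833e\x22'
# split on the \x22 delimiter, keep the URL pieces, and strip the backslashes
urls = [p.replace('\\', '') for p in fragment.split(r'\x22') if p.startswith('https')]
print(urls)  # ['https://wkbjcloudbos.bdimg.com/v1/docconvert4826/wk/6833e']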
This is the data we get. It looks right, but a careful reader will spot the problem: the encoding is wrong. Indeed, I went back and forth on this for quite a while.
Let's analyze the following example: the \u5b8b matched out of the JSON and the output of print("\u5b8b") are not the same, while what we want to see is the Chinese text.
How do we solve this?
The JSON text contains literal \uXXXX escape sequences, so we can simply parse it with the json module, which decodes those escapes into real Chinese characters, and then match the Chinese text out of the result; Chinese characters fall in the range \u4e00-\u9fa5 (a short demo follows the punctuation regex below).
// matches these Chinese punctuation marks: 。 ? ! , 、 ; : “ ” ‘ ' ( ) 《 》 〈 〉 【 】 『 』 「 」 ﹃ ﹄ 〔 〕 … — ~ ﹏ ¥
var reg = /[\u3002|\uff1f|\uff01|\uff0c|\u3001|\uff1b|\uff1a|\u201c|\u201d|\u2018|\u2019|\uff08|\uff09|\u300a|\u300b|\u3008|\u3009|\u3010|\u3011|\u300e|\u300f|\u300c|\u300d|\ufe43|\ufe44|\u3014|\u3015|\u2026|\u2014|\uff5e|\ufe4f|\uffe5]/;
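A short demonstration of the escape problem itself (the sample JSON string here is made up):

import json

raw = r'{"c": "\u5b8b"}'     # the raw text contains a literal backslash-u sequence
print(raw)                   # prints {"c": "\u5b8b"}: six characters, not one Chinese character
print(json.loads(raw)["c"])  # prints 宋, because json.loads decodes \uXXXX escapes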
The code:
import requests
import json
import re
url = "https://wkbjcloudbos.bdimg.com/v1/docconvert4826/wk/6833e38d435c232bc108c33d2c5d7fa5/0.json?responseContentType=application%2Fjavascript&responseCacheControl=max-age%3D3888000&responseExpires=Sun%2C%2003%20Feb%202019%2022%3A16%3A31%20%2B0800&authorization=bce-auth-v1%2Ffa1126e91489401fa7cc85045ce7179e%2F2018-12-20T14%3A16%3A31Z%2F3600%2Fhost%2Fb8a795caa40c03202d9de6ad0121631ce36e09ee8ffe4032472dda92383e456c&x-bce-range=44356-60806&token=eyJ0eXAiOiJKSVQiLCJ2ZXIiOiIxLjAiLCJhbGciOiJIUzI1NiIsImV4cCI6MTU0NTMxODk5MSwidXJpIjp0cnVlLCJwYXJhbXMiOlsicmVzcG9uc2VDb250ZW50VHlwZSIsInJlc3BvbnNlQ2FjaGVDb250cm9sIiwicmVzcG9uc2VFeHBpcmVzIiwieC1iY2UtcmFuZ2UiXX0%3D.mvMZ5OHTegHoFK1r4e3i2HwYD0oheO8%2BP%2F%2BNweCbiyo%3D.1545318991"
req = requests.get(url)
# strip the JSONP callback wrapper, then re-serialize with ensure_ascii=False
# so the \uXXXX escapes are decoded into real Chinese characters
mjson = json.dumps(json.loads(str(req.text)[8:-1]), ensure_ascii=False)
# match runs of Chinese characters (\u4e00-\u9fa5) and Chinese punctuation
result = re.findall("[\u4e00-\u9fa5\u3002|\uff1f|\uff01|\uff0c|\u3001|\uff1b|\uff1a|\u201c|\u201d|\u2018|\u2019|\uff08|\uff09|\u300a|\u300b|\u3008|\u3009|\u3010|\u3011|\u300e|\u300f|\u300c|\u300d|\ufe43|\ufe44|\u3014|\u3015|\u2026|\u2014|\uff5e|\ufe4f|\uffe5]+", mjson)
sss = ''.join(result)
print(sss[2:])
At this point we have solved all the hard problems. The other Wenku formats work similarly, so I won't go through them here.
Let's look at the overall running result:
List view:
Detail view:
PPT crawling:
I hope this post is helpful to you, and I welcome your questions so we can learn from each other.
The full code is as follows:
# UI.py
import tkinter
import tkinter as TK
from tkinter.filedialog import askdirectory
import homework.baiduwenku.reptile as rep
import homework.baiduwenku.download as download
import re
import requests
class View:
def __init__(self):
self.__reptile = rep.Reptile()
self.__root = TK.Tk()
        self.__root.title("Baidu Wenku")
self.__root.geometry(self.__getLocation())
        self.__addcomponent()
self.__root.mainloop()
    def __addcomponent(self):
self.__searchFrame = TK.Frame(self.__root,width=100,height=20)
self.__search = TK.Entry(self.__searchFrame, width=100)
self.__search.grid(column=0,row=0,columnspan=5,padx=10)
        self.__searchButton = TK.Button(self.__searchFrame, width=5, text="Search", command=self.buttonClick)
self.__searchButton.grid(row=0,column=7,columnspan=1,padx=10)
self.__searchFrame.pack(anchor='nw',pady=10)
self.__canvas = TK.Canvas(self.__root,width=750,height=450,scrollregion=(0,0,0,5000))
self.__displayList = TK.Frame(self.__canvas)
self.__scroller = TK.Scrollbar(self.__root,orient="vertical",command=self.__canvas.yview)
self.__canvas.configure(yscrollcommand=self.__scroller.set)
self.__scroller.pack(side="right",fill="y")
self.__canvas.pack(side="left")
self.__canvas.create_window((0,0),window=self.__displayList,anchor='nw')
def buttonClick(self):
        keyword = self.__search.get()  # avoid shadowing the built-in str
        docName, docIntro, self.docUrl = self.__reptile.getWenKuData(keyword)
self.__reptile.clear()
for i in range(0,len(docName)):
label = TK.Label(self.__displayList,text=docName[i],font=20,fg='red',justify=TK.LEFT)
label.grid(row=i*2,column=0,sticky=TK.W)
label.bind("<Button-1>",self.lookMore)
try:
label = TK.Text(self.__displayList,height=5)
label.insert(TK.INSERT,docIntro[i])
label.grid(row=i*2+1,column=0,sticky=TK.W)
except:
pass
def lookMore(self,event):
index = re.search("[\d]+$",str(event.widget))
        if index is None:
            index = 0
        else:
            index = int(index[0]) - 1
        docType, result = download.urljudge(self.docUrl[index])
if docType=="doc" or docType == "docx" or docType == "pdf" or docType=="word":
self.disDOCContent(result)
elif docType == "ppt":
self.saveLocationSelect(result)
else:
print(docType)
def disDOCContent(self,result):
detailUI = TK.Tk()
detailUI.geometry("800x600")
textContent = TK.Text(detailUI, height=600, width=800)
scroller = TK.Scrollbar(detailUI, command=textContent.yview, orient="vertical")
textContent.configure(yscrollcommand=scroller.set)
scroller.pack(side="right", fill="y")
textContent.insert(TK.INSERT, result)
textContent.pack(side="left")
detailUI.mainloop()
def saveLocationSelect(self,results):
# print(results)
detailUI = TK.Tk()
detailUI.geometry("800x600")
label = TK.Label(detailUI,width=800)
label.pack()
filename = tkinter.filedialog.askdirectory()
if filename != '':
print(filename)
picUrls = re.findall("http[:\\\/\w\d\.\?&_=-]+",results)
for picUrl in picUrls:
picUrl = picUrl.replace('\\','')
print(picUrl)
picName = re.findall("[\w\d=-]+$", picUrl)[0]
pn = picName.split('=')
localAddr = filename+"/"+pn[1]+"."+pn[0]
print(localAddr)
                with open(localAddr, "wb") as f:
                    f.write(requests.get(picUrl).content)
            label.config(text="Data loaded")
else:
label.config(text="您没有选择任何文件")
detailUI.mainloop()
def __getLocation(self):
frame_w = 800
frame_h = 600
screen_w,screen_h = self.__root.maxsize()
        left_distance = (screen_w - frame_w) / 2
        top_distance = (screen_h - frame_h) / 2
        res = "%dx%d+%d+%d" % (frame_w, frame_h, left_distance, top_distance)
return res
def addListContent(self,listName,listIntro,listUrl):
print()
if __name__ == "__main__":
view = View()
view.addListContent()
# reptile.py
import requests
from bs4 import BeautifulSoup
import re
class Reptile:
def __init__(self):
self.docUrl = list()
self.docName = list()
self.docIntro = list()
def clear(self):
self.docUrl=[]
self.docName=[]
self.docIntro=[]
def getWenKuData(self,searchStr):
self.netUrl = "https://wenku.baidu.com/search?word=%s&lm=0&od=0&fr=top_home&ie=utf-8" % (searchStr)
print(self.netUrl)
req = requests.get(self.netUrl)
text = req.content.decode('gbk')
bs = BeautifulSoup(text,"lxml")
res = bs.find_all(name='dl')
for item in res:
try:
self.docUrl.append(item.dt.p.a["href"])
self.docName.append(item.dt.p.a["title"])
self.docIntro.append(item.dd.div.p.text)
except:
print(item)
pass
return self.docName,self.docIntro,self.docUrl
if __name__ == "__main__":
reptile = Reptile()
reptile.getWenKuData("计算机")
# download.py
import requests
import re
import json
import time
headers = {
"User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36"
}  # mimic a mobile browser
# ,response
# def get_num(url,response):
# # response = requests.get(url, headers=headers).text
# print("************************************************8\n")
# # print(response)
# resstring = " "
# result = re.search(
#         r'&md5sum=(.*)&sign=(.*)&rtcs_flag=(.*)&rtcs_ver=(.*?)".*rsign":"(.*?)",', response, re.M | re.I)  # find the parameters
# try:
# reader = {
# "md5sum": result.group(1),
# "sign": result.group(2),
# "rtcs_flag": result.group(3),
# "rtcs_ver": result.group(4),
# "width": 176,
# "type": "org",
# "rsign": result.group(5)
# }
# except:
# return None
#
# result_page = re.findall(
#         r'merge":"(.*?)".*?"page":(.*?)}', response)  # get each page's tag
#     doc_url = "https://wkretype.bdimg.com/retype/merge/" + url[29:-5]  # the URL prefix
# n = 0
#     for i in range(len(result_page)):  # crawl at most 10 pages at a time
#         if i % 10 == 0:
# doc_range = '_'.join([k for k, v in result_page[n:i]])
# reader['pn'] = n + 1
# reader['rn'] = 10
# reader['callback'] = 'sf_edu_wenku_retype_doc_jsonp_%s_10' % (
# reader.get('pn'))
# reader['range'] = doc_range
# n = i
# resstring+=get_page(doc_url, reader)
#         else:  # the remaining pages (fewer than 10)
# doc_range = '_'.join([k for k, v in result_page[n:i + 1]])
# reader['pn'] = n + 1
# reader['rn'] = i - n + 1
# reader['callback'] = 'sf_edu_wenku_retype_doc_jsonp_%s_%s' % (
# reader.get('pn'), reader.get('rn'))
# reader['range'] = doc_range
# resstring+=get_page(doc_url, reader)
# # print(resstring)
# return resstring
# def get_page(url, data):
# print(url)
# print(data)
# print(headers)
# print("\n\n\n")
# response = requests.get(url, headers=headers, params=data).text
#     response = response.encode(
#         'utf-8').decode('unicode_escape')  # decode the unicode escapes into Chinese text
#     response = re.sub(r',"no_blank":true', '', response)  # clean the data
#     result = re.findall(r'c":"(.*?)"}', response)  # find the text matches
# result = '\n'.join(result)
# # print(result)
# # print("\n\n"+result)
# return result
def getWord(url):
    # fetch the document page, cut out the WkInfo.htmlUrls block, and strip the backslashes
    string = ''
    req = requests.get(url)
    result = re.findall(r"WkInfo.htmlUrls[\w\W]+WkInfo.verify_user_info", req.text)[0].replace('\\', '')
    print(result)
    # collect the per-page JSON URLs
    urladdr = re.findall("https[^}]+", result)
try:
i=0
for uu in urladdr:
print(uu[:-3])
# i+=1
# if i > 8:
# i=0
# time.sleep(10)
# time.sleep(2)
string += getPage(uu[:-3])
    except:
        # if a page fails, return whatever has been collected so far
        return string
return string
def getPage(url):
    req = requests.get(url)
    # strip the JSONP callback wrapper, then re-serialize with ensure_ascii=False
    # so the \uXXXX escapes are decoded into real Chinese characters
    mjson = json.dumps(json.loads(str(req.text)[8:-1]), ensure_ascii=False)
result = re.findall(
"[\u4e00-\u9fa5\u3002|\uff1f|\uff01|\uff0c|\u3001|\uff1b|\uff1a|\u201c|\u201d|\u2018|\u2019|\uff08|\uff09|\u300a|\u300b|\u3008|\u3009|\u3010|\u3011|\u300e|\u300f|\u300c|\u300d|\ufe43|\ufe44|\u3014|\u3015|\u2026|\u2014|\uff5e|\ufe4f|\uffe5]+",
mjson)
sss = ''
for res in result:
sss += res
print(sss[2:])
return sss[2:]
def getPPT(response):
docid = re.findall(r"docId:[\d\w\s']+", response)[0]
docid = docid[8:-1]
url = "https://wenku.baidu.com/browse/getbcsurl?doc_id=%s&pn=1&rn=99999&type=ppt" % (docid)
value = requests.get(url).text
return value
def urljudge(url):
response = requests.get(url,headers=headers).text
try:
docType = re.findall(r"docTypeName[\s\w:']+", response)
docType = re.findall("'[\w]+'$", docType[0])[0][1:-1]
if docType=="doc" or docType == "docx" or docType == "pdf" or docType=="word":
print(docType)
reptileRes=getWord(url)
elif docType == "ppt" or docType == "pptx":
print(docType)
reptileRes =getPPT(response)
elif docType == "xls" or docType=="xlsx":
print(docType)
reptileRes =getWord(response)
elif docType == "txt":
print(docType)
reptileRes =getWord(response)
elif docType == "wps":
print(docType)
reptileRes =getWord(response)
else:
print(docType)
reptileRes =getWord(response)
except:
print("文档类型读取错误")
reptileRes =getWord(response)
if reptileRes==None or reptileRes=="":
reptileRes = "加载数据失败"
return docType,reptileRes
if __name__ == '__main__':
# url = "https://wenku.baidu.com/view/aa31a84bcf84b9d528ea7a2c.html"
url="https://wenku.baidu.com/view/10759d4408a1284ac9504320.html"
# get_num(url)
Thank you!