python爬取文库文档_python 解析爬取某度文库 !

原理:把浏览器切换成手机版(移动端)后分析请求规律——文档内容存放在 webapp… 开头的 URL 里,想研究的可以自己去看看;其中主要是翻页参数的处理比较复杂。

目前还不完善,但能解析大部分某度文库的文档和图片(文档中有图片时会一并下载)。Python资源共享群:626017123

# -*- coding: utf-8 -*-
"""Save the text and images of a Baidu Wenku document.

The document page is requested with a mobile User-Agent; the parameters
found in that page are then used to query the ``rtcs/webapp`` (text) and
``rtcs/image`` (images) endpoints.  Text is appended to
``<name>/<name>.doc`` and images are written as ``<name>/<n>.jpg``.
"""
import os
import re
from json import loads

import requests
from tqdm import tqdm


class Baidu(object):
    """Scraper for a single Baidu Wenku document.

    Call :meth:`run` with the document URL.  The other public methods are
    the individual steps and rely on attributes set by :meth:`get_info`.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the script forever.
    REQUEST_TIMEOUT = 30

    # Directory / file name used when no usable title is found in the page.
    DEFAULT_NAME = '百度文库百度文库'

    # The text endpoints answer with the JSON payload wrapped in extra
    # characters: this many leading characters and one trailing character
    # are cut off before parsing.
    # NOTE(review): presumably a JSONP-style ``callback(...)`` wrapper;
    # the lengths are taken as-is from the original script - confirm.
    _PAGES_PREFIX_LEN = 5
    _FIRST_PAGE_PREFIX_LEN = 32

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36'
        }
        self.rtcs_flag = '1'
        self.rtcs_ver = '3.1'
        self.base_url = 'http://wkrtcs.bdimg.com/rtcs/webapp'
        self.base_img = 'https://wkrtcs.bdimg.com/rtcs/image'
        # Cleared by down_img() once an image request fails; ends run()'s loop.
        self.flag = True
        # Index of the next image to request.
        self.cout = 1

    @staticmethod
    def _first_match(pattern, text, field):
        """Return group 1 of the first match of *pattern* in *text*.

        Raises:
            IndexError: if the pattern does not match.  The type matches what
                ``re.findall(...)[0]`` raised before; the message now names
                the missing *field*.
        """
        found = re.search(pattern, text)
        if found is None:
            raise IndexError('%s not found in the document page' % field)
        return found.group(1)

    @staticmethod
    def _extract_text(document, progress=False):
        """Concatenate the text runs of a parsed page payload.

        Walks ``document['document.xml']`` -> ``['c']`` -> ``['c']``, starting
        a new line for every second-level entry and keeping only third-level
        entries whose ``'c'`` value is a string.

        Args:
            document: Payload decoded from the endpoint's JSON.
            progress: Show a tqdm progress bar over the top-level entries.
        """
        blocks = document['document.xml']
        if progress:
            blocks = tqdm(blocks)
        pieces = []
        for block in blocks:
            for paragraph in block['c']:
                pieces.append('\n')
                for run in paragraph['c']:
                    # Entries without a string 'c' are skipped rather than
                    # treated as errors.
                    if isinstance(run, dict) and isinstance(run.get('c'), str):
                        pieces.append(run['c'])
        return ''.join(pieces)

    def _append_text(self, text):
        """Append *text* to ``<path><name>.doc`` (UTF-8)."""
        with open(self.path + self.name + '.doc', 'a', encoding='utf-8') as f:
            f.write(text)

    def get_info(self, url):
        """Fetch the document page and extract the request parameters.

        Sets ``bucketNum``, ``sign``, ``rsign``, ``md5sum``, ``page_list``,
        ``page_count``, ``firstpageurl``, ``name`` and ``path``, and creates
        the output directory ``name`` if it does not exist.

        Args:
            url: Address of the document page.

        Raises:
            IndexError: if a required parameter is missing from the page.
            requests.RequestException: if the page cannot be fetched.
        """
        raw = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        ).content
        # Decode the bytes already downloaded instead of requesting the page
        # a second time when UTF-8 fails.
        try:
            page = raw.decode()
        except UnicodeDecodeError:
            print('编码错误,切换编码!')
            page = raw.decode('gbk')

        self.bucketNum = self._first_match(r'"bucketNum":(\d+),', page, 'bucketNum')
        self.sign = self._first_match(r'&sign=(.*?)&', page, 'sign')
        self.rsign = self._first_match(r'"rsign":"(.*?)",', page, 'rsign')
        self.md5sum = self._first_match(r'&md5sum=(.*?)&', page, 'md5sum')
        self.page_list = self._first_match(
            r'"rtcs_range_info":(.*),"rtcs_flow"', page, 'rtcs_range_info')
        self.page_count = self._first_match(
            r'"rtcs_page_count":(.*?),', page, 'rtcs_page_count')
        # The attribute value is HTML-escaped; turn "&amp;" back into "&".
        self.firstpageurl = self._first_match(
            r'data-firstpageurl="(.*?)"', page, 'data-firstpageurl'
        ).replace('&amp;', '&')

        # NOTE(review): the tags of the original title pattern were lost when
        # this script was published; <title> is an assumption - confirm
        # against a live page.
        title = re.search(r'<title>(.*?)</title>', page, re.S)
        name = title.group(1).strip() if title else ''
        # The title becomes a directory and file name, so replace characters
        # that are not allowed in file names.
        name = re.sub(r'[\\/:*?"<>|\r\n]+', '_', name).strip()
        self.name = name or self.DEFAULT_NAME

        os.makedirs(self.name, exist_ok=True)
        self.path = self.name + '/'

    # Work out the paging parameters
    def parse(self):
        """Write the first page, then fetch the remaining pages in batches.

        The first batch starts at page 2 and asks for 4 pages; every later
        batch asks for 5.  At least one batch is attempted, and batching
        stops once the start page exceeds ``page_count``.  A failing batch
        is reported and skipped so the remaining batches are still tried.
        """
        print('页数:', self.page_count)
        page_dics = loads(self.page_list)
        total = int(self.page_count)

        self.get_first()
        pn, rn = 2, 4
        while True:
            batch = page_dics[pn - 1:pn + rn - 1]
            if batch:  # nothing to ask for when no range info is left
                try:
                    # tqdm shows the progress bar
                    joined = '_'.join(item.get('range') for item in tqdm(batch))
                    self.get_pages(pn, rn, joined)
                except (requests.RequestException, ValueError, LookupError,
                        TypeError, AttributeError) as exc:
                    print('解析错误', exc)
            pn += rn
            rn = 5
            if pn > total:
                break

    # Fetch a batch of pages and write the text
    def get_pages(self, pn, rn, ranges):
        """Fetch one batch of pages and append its text to the output file.

        Args:
            pn: Number of the first page in the batch.
            rn: Number of pages requested.
            ranges: The pages' ``range`` values joined with ``'_'``.
        """
        params = {
            'bucketNum': self.bucketNum,
            'pn': pn,
            'rn': rn,
            'md5sum': self.md5sum,
            'sign': self.sign,
            'rtcs_flag': self.rtcs_flag,
            'rtcs_ver': self.rtcs_ver,
            'range': ranges,
            'rsign': self.rsign,
        }
        body = requests.get(
            self.base_url, params=params, headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        ).text[self._PAGES_PREFIX_LEN:-1]
        self._append_text(self._extract_text(loads(body)))

    # Parse the first page
    def get_first(self):
        """Fetch the first page from ``firstpageurl`` and append its text."""
        print(self.firstpageurl)
        body = requests.get(
            url=self.firstpageurl, headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        ).text[self._FIRST_PAGE_PREFIX_LEN:-1]
        self._append_text(self._extract_text(loads(body), progress=True))
        print('第一页解析完成!!!')

    # Download an image
    def down_img(self, cout, num):
        """Download image number *cout* and save it as ``<path><num>.jpg``.

        ``image<cout>.png`` is requested first; if the response status is not
        200, ``image<cout>_1.png`` is tried.  When both fail, ``self.flag``
        is set to ``False`` so that :meth:`run` stops asking for images.

        Args:
            cout: Image index used in the request.
            num: Number used for the saved file name.
        """
        for attempt, image_id in enumerate((str(cout), '%s_1' % cout)):
            if attempt:
                print(image_id)
            params = {
                'md5sum': self.md5sum,
                'sign': self.sign,
                'rtcs_ver': '3',
                'bucketNum': self.bucketNum,
                'ipr': '{"c":"word/media/image%s.png"}' % image_id,
            }
            response = requests.get(
                url=self.base_img, params=params, timeout=self.REQUEST_TIMEOUT)
            if response.status_code == 200:
                with open(self.path + str(num) + '.jpg', 'wb') as f:
                    f.write(response.content)
                print(self.name + '下载完成!')
                return
        self.flag = False

    def run(self, url):
        """Save the text of the document at *url*, then its images.

        Images are requested with increasing indices until one cannot be
        downloaded.
        """
        # Reset per-document state so one instance can process several URLs.
        self.flag = True
        self.cout = 1

        self.get_info(url)
        self.parse()
        print('页面写入完成!!!' + '-' * 20 + '下载图片>>>>>>')
        num = 0
        while self.flag:
            num += 1
            self.down_img(self.cout, num)
            self.cout += 1


if __name__ == '__main__':
    url = input('请输入网址:')
    b = Baidu()
    b.run(url)

免责声明:仅限用于学习和研究目的;不得将上述内容用于商业或者非法用途,否则,一切后果请用户自负。您必须在下载后的24个小时之内,从您的电脑或手机中彻底删除上述内容。

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值