Note: the code below is for technical research and analysis only; please delete anything you download within 24 hours.
During the Spring Festival break, my wife (an English teacher) asked me to help her download English exam papers from the web. After aligning on the goal with this "English product manager", what we really wanted was to batch-crawl some of the better documents from Baidu Wenku.
After some research, the task breaks down into roughly two steps:
1. Search Baidu Wenku and parse out the document URLs to download, page by page.
2. Batch-download the documents at those URLs.
Searching online, there is plenty of Python code for crawling Baidu search results, but I couldn't find any for Baidu Wenku. Comparing the Baidu search URL with the Baidu Wenku search URL, the differences are small: just a few parameter names, which Baidu search abbreviates.
Baidu search results URL:
http://www.baidu.com/s?wd=%E5%8D%97%E4%BA%AC+%E5%8E%86%E5%B9%B4+%E8%8B%B1%E8%AF%AD+%E7%9C%9F%E9%A2%98&pn=0&cl=3&rn=100
Baidu Wenku search results URL:
https://wenku.baidu.com/search?word=%C4%CF%BE%A9+%C0%FA%C4%EA+%D3%A2%D3%EF+%D5%E6%CC%E2&org=0&fd=0&lm=0&od=0&pn=20
Looking at the two, the differences are minor; only small changes to the code are needed:
1. http -> https
2. wd -> word
3. s -> search
Once the search-results URL is worked out, pagination is identical on both sites: the pn=? parameter sets the result offset. So the first step is just parsing each results page, extracting the matching document URLs, and saving them; a minimal sketch follows.
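To make the mapping concrete, here is a minimal sketch of building the Wenku search URL and paging with pn, assuming only the differences listed above. The function name, the use of requests, and the page size of 10 are my own choices for illustration; the full version I actually ran is the step 1 code further down.

# Sketch only; the real implementation is in the step 1 code below.
import requests

def wenku_search_html(keyword, page, page_size=10):  # illustrative name and page size
    # Baidu search:  http://www.baidu.com/s?wd=<keyword>&pn=<offset>
    # Baidu Wenku:   https://wenku.baidu.com/search?word=<keyword>&pn=<offset>
    params = {"word": keyword, "pn": page * page_size}
    return requests.get("https://wenku.baidu.com/search", params=params).text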
The second step is just iterating over the saved document URLs and downloading each one in turn. The only wrinkle is that the request has to simulate a mobile browser, because the desktop page splits the document across pages and a complete download isn't possible; a small example is sketched below.
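As a minimal, illustrative sketch of that mobile-browser trick (it reuses the Android User-Agent string and one of the sample document URLs from the step 2 script; everything else is just for demonstration):

# Sketch only: fetch a document page while pretending to be a phone.
import requests

mobile_headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36"
}
url = "https://wenku.baidu.com/view/aa31a84bcf84b9d528ea7a2c.html"  # sample URL from the step 2 script
html = requests.get(url, headers=mobile_headers).text  # the mobile page embeds the parameters that step 2 parses out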
Those are roughly the steps; the code follows.
Step 1 code:
# coding=utf8
import urllib2
import string
import traceback
import urllib
import re
import chardet
import random
from utils.FileUtils import writeContentStr2File, writeAppend2File
from utils.word.Alphabet import cleanWord
# Set up multiple user_agents to keep Baidu from blocking our IP
user_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20130406 Firefox/23.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:18.0) Gecko/20100101 Firefox/18.0',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533+ (KHTML, like Gecko) Element Browser 5.0',
    'IBM WebExplorer /v0.94',
    'Galaxy/1.0 [en] (Mac OS X 10.5.6; U; en)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14',
    'Mozilla/5.0 (iPad; CPU OS 6_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/6.0 Mobile/10A5355d Safari/8536.25',
    'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0; TheWorld)',
]
def baidu_search(keyword, pn):
    # p = {'wd': keyword}
    p = {'word': keyword}
    # url = ("http://www.baidu.com/s?" + urllib.urlencode(p) + "&pn={0}&cl=3&rn=100").format(pn)
    url = ("https://wenku.baidu.com/search?" + urllib.urlencode(p) + "&pn={0}&cl=3&rn=100").format(pn)
    print url
    res = urllib2.urlopen(url)
    html = res.read()
    # print html
    return html

def getList(regex, text):
    arr = []
    res = re.findall(regex, text)
    if res:
        for r in res:
            arr.append(r)
    return arr

def getMatch(regex, text):
    res = re.findall(regex, text)
    if res:
        return res[0]
    return ""

def clearTag(text):
    p = re.compile(u'<[^>]+>')
    retval = p.sub("", text)
    return retval

def downloadurl(url):
    domain = urllib2.Request(url)
    r = random.randint(0, 9)
    domain.add_header('User-agent', user_agents[r])  # pick a random User-Agent
    domain.add_header('connection', 'keep-alive')
    response = urllib2.urlopen(domain)
    data = response.read()
    return data

def geturl(keyword):
    for page in range(10):
        pn = page * 10
        html = baidu_search(keyword, pn)
        content = html
        writeContentStr2File(content, "baidu.result.html", "GBK")  # keep the raw results page for debugging
        postfix = u"https://wenku.baidu.com/view"  # despite the name, this is the URL prefix of document pages
        arrList = getList(u"href=\"" + postfix + u"(.*?)\"", content)
        for item in arrList:
            url = postfix + item
            url = url.replace("?from=search", "")
            print url
            writeAppend2File(url + '\n', "wenku.doc.url.txt")

if __name__ == '__main__':
    geturl('南京 历年 英语 真题')  # keyword: "Nanjing past-year English exam papers"
Step 2 code:
# -*- coding: utf-8 -*-
# Created: huashan
import requests,traceback
import re
import json
from utils.FileUtils import getFileContentList, writeAutoEncodeContentStr2File
headers = {
    "User-Agent": "Mozilla/5.0 (Linux; Android 5.0; SM-G900P Build/LRX21T) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Mobile Safari/537.36"
}  # simulate a mobile browser
def get_num(url):
    response = requests.get(url, headers=headers).text
    # print response
    if "md5sum" not in response:
        return
    result = re.search(
        r'&md5sum=(.*)&sign=(.*)&rtcs_flag=(.*)&rtcs_ver=(.*?)".*rsign":"(.*?)",', response, re.M | re.I)  # find the request parameters
    reader = {
        "md5sum": result.group(1),
        "sign": result.group(2),
        "rtcs_flag": result.group(3),
        "rtcs_ver": result.group(4),
        "width": 176,
        "type": "org",
        "rsign": result.group(5)
    }
    # print reader
    result_page = re.findall(
        r'merge":"(.*?)".*?"page":(.*?)}', response)  # get the range tag of each page
    doc_url = "https://wkretype.bdimg.com/retype/merge/" + url[29:-5]  # strip the view prefix and ".html" to build the merge URL
    n = 0
    for i in range(len(result_page)):  # fetch at most 10 pages per request
        if i % 10 == 0:
            doc_range = '_'.join([k for k, v in result_page[n:i]])
            reader['pn'] = n + 1
            reader['rn'] = 10
            reader['callback'] = 'sf_edu_wenku_retype_doc_jsonp_%s_10' % (
                reader.get('pn'))
            reader['range'] = doc_range
            n = i
            get_page(doc_url, reader)
        else:  # the remaining pages (fewer than 10)
            doc_range = '_'.join([k for k, v in result_page[n:i + 1]])
            reader['pn'] = n + 1
            reader['rn'] = i - n + 1
            reader['callback'] = 'sf_edu_wenku_retype_doc_jsonp_%s_%s' % (
                reader.get('pn'), reader.get('rn'))
            reader['range'] = doc_range
            get_page(doc_url, reader)
def get_page(url, data):
    response = requests.get(url, headers=headers, params=data).text
    response = response.encode(
        'utf-8').decode('unicode_escape')  # decode the \u escapes back into readable text
    response = re.sub(r',"no_blank":true', '', response)  # clean the data
    result = re.findall(r'c":"(.*?)"}', response)  # extract the text fragments
    result = '\n'.join(result)
    title = result.split('\n')[0]
    print title
    writeAutoEncodeContentStr2File(result, title[:30] + ".txt")
def downloadFile():
    content = getFileContentList("wenku.doc.url.txt")  # URLs collected in step 1
    for url in content:
        try:
            get_num(url)
        except:
            print 'traceback.print_exc():'
            traceback.print_exc()
            continue
if __name__ == '__main__':
    # url = "https://wenku.baidu.com/view/aa31a84bcf84b9d528ea7a2c.html"
    # url = "https://wenku.baidu.com/view/9868c6fea300a6c30c229fe6.html"
    # get_num(url)
    downloadFile()