Zhihu's anti-scraping measures are quite strict:
1. The question page URL does not change as you page through answers, so the pagination state cannot be read from it.
2. To collect every image you have to page through the answers, and you cannot use the question's front-page URL for that; the answers come from an ajax API, the pagination info travels with each response, and the data field holds almost everything you need (see the response sketch after this list).
3. Request-rate limiting, which badly hurts the crawler's speed; I wrapped the fragile calls in try-except in several places so the program recovers instead of crashing.
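Roughly, each call to the answers API returns JSON shaped like the sketch below. This is only a reconstruction based on the fields the code actually reads (data[*].content, paging.totals, paging.next, paging.is_end); the real response carries many more fields, and the example values here are placeholders:

{
    "data": [
        {"content": "<p>answer body as an HTML string, with img tags inside</p>"},
        ...
    ],
    "paging": {
        "is_end": false,
        "totals": 1234,
        "next": "https://www.zhihu.com/api/v4/questions/39838691/answers?...&offset=23&limit=20"
    }
}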
These are a few lessons learned from scraping Zhihu; there is plenty of room for improvement, and corrections are welcome.
The detailed approach is in the code and its comments:
# A crawler for the images posted under the answers to a Zhihu question
# Hard parts: answers load asynchronously, and the paging info is not in the page HTML
import requests
import json
from bs4 import BeautifulSoup
import re
import os
import random
from time import sleep
jsError = 0  # global counter for json decode failures
# A pool of user-agent strings collected from the web; a random one is used for each request
headerstr = '''Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1
Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11
Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11
Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11
Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.101 Safari/537.36'''
def headerChange():
    headerList = headerstr.split('\n')
    return random.choice(headerList)  # pick a random user-agent from the pool
def get_ip_list():
    # scrape a list of free HTTP proxies (ip:port) from xicidaili
    url = 'http://www.xicidaili.com/nn/'
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 Safari/537.36"
    }
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ips = soup.find_all('tr')
    ip_list = []
    for i in range(1, len(ips)):  # skip the table header row
        ip_info = ips[i]
        tds = ip_info.find_all('td')
        ip_list.append(tds[1].text + ':' + tds[2].text)
    return ip_list
def get_random_ip(ip_list):
    # pick a random proxy and wrap it in the dict format that requests expects
    proxy_list = []
    for ip in ip_list:
        proxy_list.append('http://' + ip)
    proxy_ip = random.choice(proxy_list)
    proxies = {'http': proxy_ip}
    return proxies
ipList = get_ip_list()  # fetch the proxy pool once at startup
def getHTMLTxt(url):
    querystring = {"status": "P"}
    headers = {  # the authorization and cookie values are session-specific; substitute your own
        'accept': "application/json, text/plain, */*",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.8",
        'authorization': "Bearer 2|1:0|10:1503143696|4:z_c0|80:MS4xQ0FCLUFnQUFBQUFtQUFBQVlBSlZUUkMwdjFtRk9yTVVjTi00Y29LUWJwZDRDMFlRazlZdjZBPT0=|700a3696133f8ceb69dc40a6249d2842f00c8147f02a9bfa09a8622cc6cc3ad2",
        'connection': "keep-alive",
        'cookie': "aliyungf_tc=AQAAAH6uiDX0DAYAOBXAer/FOseNpMyh; q_c1=95e8df94610e4278a356191bc9c8990c|1503143679000|1503143679000; q_c1=8b0cd17f5bcb45498611cd1b9b6579e5|1503143678000|1503143678000; _zap=3325fbf1-941e-4f26-a574-bc8a9ac27590; capsion_ticket=\"2|1:0|10:1503143681|14:capsion_ticket|44:MmNkMGNkYjcxYTg3NGFlN2JkYWYwNzIxZTI2MjM2ZTc=|c49c700a6aeb9ea8d5120e25938862991b4a1d2fedb893fd3ca3fc2001f0e9e6\"; z_c0=\"2|1:0|10:1503143696|4:z_c0|80:MS4xQ0FCLUFnQUFBQUFtQUFBQVlBSlZUUkMwdjFtRk9yTVVjTi00Y29LUWJwZDRDMFlRazlZdjZBPT0=|700a3696133f8ceb69dc40a6249d2842f00c8147f02a9bfa09a8622cc6cc3ad2\"; _xsrf=0daa4bf6-b573-46b2-a1c3-2289804939ec",
        'host': "www.zhihu.com",
        'referer': "https://www.zhihu.com/question/36759180",
        'user-agent': headerChange(),
        'cache-control': "no-cache",
        'postman-token': "7e2c0f78-046a-09cd-2c11-f648d713b009"
    }
html = ""
while html == "": #因为请求可能被知乎拒绝,采用循环+sleep的方式重复发送,但保持频率不太高
try:
proxies = get_random_ip(ipList)
print("\r这次试用ip:{}".format(proxies))
r = requests.request("GET", url, headers=headers, params=querystring, proxies=proxies, timeout=5)
html = r.text
return html
except:
print("Let me sleep for 3 seconds")
sleep(3)
print("Was a nice sleep, now let me continue...")
continue
def getPicURL(url, urlList):  # collect the image urls from every answer on this page
    global jsError
    flag = 1
    while flag == 1:  # retry until the response parses as json
        try:
            html = getHTMLTxt(url)
            js = json.loads(html)
            flag = 0
        except Exception:
            jsError += 1
            print("\rjson decode error #{}".format(jsError))
            continue
    # Checking with type() confirms that content under data is a plain string of HTML,
    # so it can be parsed with BeautifulSoup
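    # For illustration only, a hypothetical content value might look like the
    # sketch below (the surrounding tags are an assumption; the code relies only
    # on img tags whose src matches the https://...jpg pattern):
    #   <p>some answer text</p>
    #   <figure><img src="https://pic1.zhimg.com/v2-abc123.jpg"></figure>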
    for ans in js['data']:
        soup = BeautifulSoup(ans['content'], 'html.parser')
        for img in soup.find_all('img'):
            src = img.get('src')
            if src is None:  # skip img tags that carry no src attribute
                continue
            match = re.match(r'https://.*?\.jpg', src)
            if match:
                urlList.append(match.group(0))
    return len(js['data'])  # number of answers on this page; main() uses this for progress
def findNextURL(urlPre):
    # read the next-page url and the end flag from the paging field of the response
    global jsError
    flag = 1
    while flag == 1:
        try:
            html = getHTMLTxt(urlPre)
            js = json.loads(html)
            flag = 0
        except Exception:
            jsError += 1
            print("json decode error #{}".format(jsError))
            continue
    return js['paging']['next'], js['paging']['is_end']
def picDownload(url):
    # save one image under root, skipping files that already exist
    root = "D:/PY/CrawZhihuPic/pics/"  # forward slashes avoid escape-character trouble on Windows
    path = root + url.split('/')[-1]
    try:
        if not os.path.exists(root):
            os.makedirs(root)  # create intermediate directories too
        if not os.path.exists(path):
            r = requests.get(url)
            with open(path, 'wb') as f:  # the with block closes the file automatically
                f.write(r.content)
    except Exception:
        print("Error while saving the file")
def main():
    urlPre = "https://www.zhihu.com/api/v4/questions/39838691/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.mark_infos%5B*%5D.url%3Bdata%5B*%5D.author.follower_count%2Cbadge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=3&limit=20&sort_by=default"
    isEnd = False
    text = getHTMLTxt(urlPre)
    js = json.loads(text)
    countOfAnswers = js['paging']['totals']  # total number of answers under this question
    print("This question has {} answers in total".format(countOfAnswers))
    urlList = []
    count = 0  # answers crawled so far, for progress reporting
    countOfPics = 0  # images downloaded so far
    while not isEnd and count < countOfAnswers:
        count += getPicURL(urlPre, urlList)
        print('\rProgress: crawled {} of {} answers'.format(count, countOfAnswers), end="")
        (urlNext, isEnd) = findNextURL(urlPre)
        urlPre = urlNext
    print("Image url list ready: {} images in total, downloading...".format(len(urlList)))
    for urlPic in urlList:
        picDownload(urlPic)
        countOfPics += 1
        print("\rProgress: downloaded {} of {} images".format(countOfPics, len(urlList)), end="")
    print("Download finished!")
main()