# -*- coding: utf-8 -*-
# @time : 2019/7/1 14:56
import requests
import time
# Crawl Baidu Baike entry URLs by probing sequential numeric view IDs.
# Every `countToSleep` requests the script pauses briefly and flushes the
# output files, to avoid tripping the server's anti-crawl throttling.
baseUrl = 'http://baike.baidu.com/view/'
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
countToSleep = 300  # requests remaining until the next rest pause

# NOTE: the original used `//` for these comments, which is floor division in
# Python and crashed the script with a NameError on the first of these lines.
writer = open("itemUrl.txt", "a+", encoding="utf8")  # entry URLs that resolved successfully
filedWriter = open("filedItemUrl.txt", "a+", encoding="utf8")  # IDs that failed due to the server's anti-crawl disconnects; re-crawl these later by reassembling the URL
errorNumber = open("errorNumberItemUrl.txt", "a+", encoding="utf8")  # IDs whose assembled URL does not exist (server redirected to an error page)

try:
    for i in range(1, 15500000):
        try:
            # Always issue the request. (The original decremented the counter
            # BEFORE this check and skipped the fetch when it reached zero, so
            # every 300th ID was silently lost — never fetched, never recorded.)
            response = requests.get(baseUrl + str(i), headers=headers)
            if 'error' in response.url:
                errorNumber.write(str(i) + '\n')
            else:
                writer.write(response.url + '\n')
                print("第" + str(i) + "个;当前url:" + response.url)
        except requests.RequestException:
            # Server dropped the connection (anti-crawl measure): record the ID
            # for a later retry pass, then back off before continuing.
            # Narrowed from a bare `except:` so Ctrl+C / SystemExit still work.
            filedWriter.write(str(i) + '\n')
            print("服务端断开连接,重新连接爬取...")
            time.sleep(4)
        finally:
            countToSleep -= 1
            if countToSleep <= 0:
                # Periodic rest to stay under the server's rate limits; flush
                # so progress survives an abrupt termination.
                time.sleep(2)
                print("休息-------------------")
                countToSleep = 300
                writer.flush()
                errorNumber.flush()
                filedWriter.flush()
finally:
    # Ensure buffered output reaches disk even on interrupt or crash.
    writer.close()
    filedWriter.close()
    errorNumber.close()