一.爬虫爬取三国演义
爬取网站url:https://www.shicimingju.com/book/sanguoyanyi.html
所用知识requests库,bs4解析库,time库。
二.安装这三个库:(1).pip install requests
(2).pip install bs4
(3).time是Python标准库,无需安装,直接import即可。
import requests
from bs4 import BeautifulSoup
import time
# Browser-style User-Agent header so the site serves pages to the scraper
# instead of rejecting it as a bot.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
def zhangjie():
    """Fetch the table-of-contents page and return the chapter lists.

    Returns:
        tuple[list[str], list[str]]: (list1, list2) where list1 holds the
        absolute chapter URLs and list2 holds the chapter titles, in page
        order.
    """
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    # Bug fix: the original passed the headers dict via `params=` (which
    # appends it to the query string); it must go in `headers=` so the
    # User-Agent is actually sent. Also add a timeout, matching neirong().
    res = requests.get(url=url, headers=headers, timeout=40).text
    time.sleep(4)  # be polite to the server before continuing
    soup = BeautifulSoup(res, 'lxml')
    menu = soup.find('div', class_='book-mulu')
    # One <a> per chapter; select once instead of re-querying inside loops.
    anchors = menu.select('ul > li > a')
    list1 = ['https://www.shicimingju.com' + a['href'] for a in anchors]
    list2 = [a.text for a in anchors]
    return list1, list2
def neirong():
    """Download every chapter and append "title + body" to '三国演义2.txt'.

    Uses zhangjie() for the chapter URL/title lists, sleeps between requests
    to avoid hammering the server, and prints a progress line per chapter.
    """
    list1, list2 = zhangjie()
    # `with` guarantees the file is closed even if a request raises
    # (the original opened the file and never closed it).
    with open('三国演义2.txt', 'a+', encoding='utf-8') as fp:
        for i in range(len(list1)):
            res = requests.get(url=list1[i], headers=headers, timeout=40).text
            soup = BeautifulSoup(res, 'lxml')
            content = soup.find('div', class_='chapter_content').text
            fp.write(list2[i] + '\n' + content + '\n' * 3)
            print('第{}打印成功'.format(i + 1))
            # Bug fix: the original wrote `a = i; if a == i:` which is always
            # true, so every other branch was dead code, and `i>30 & i<60`
            # mis-parsed because `&` binds tighter than the comparisons.
            # Restructured to scale the delay down for later chapters, which
            # appears to be the intent (NOTE(review): exact thresholds are a
            # reconstruction — confirm desired pacing).
            if i < 30:
                time.sleep(i / 10)
            elif i < 60:
                time.sleep(i / 15)
            else:
                time.sleep(i / 25)
# Script entry point: run the scraper only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    neirong()
本人小白,代码可能冗余,勿喷。代码运行时间可能会比较长。
有没有机会来个关注呢?