import re
import requests
# Scrape the table of contents of a novel from shushun.cc and print each
# chapter title together with its absolute chapter URL.
#
# NOTE(review): this script does network I/O at module level and depends on
# the site's current HTML layout — the regexes below silently return empty
# lists if the markup changes.
url = 'http://www.shushun.cc/read_9444/'  # index page of the novel to scrape
req = requests.get(url, timeout=10)  # timeout so a dead server cannot hang the script
req.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
req.encoding = 'gbk'  # the site serves GBK-encoded pages

# Book title: text of the first <h1> on the index page.
book_name = re.findall(r'<h1>(.*?)</h1>', req.text)[0]

# Chapter titles: anchor texts carrying the site's chapter-link CSS class.
mulu = re.findall(r'class="f-green shushu">(.*?)</a>', req.text)

# Chapter paths: href values ending in ".html". The dot is escaped here —
# the original pattern's bare "." matched any character before "html".
wangzhi = re.findall(r'<a href="(.*?)\.html"', req.text)
for path in wangzhi:
    print(f'http://www.shushun.cc{path}.html')

# Map chapter title -> absolute chapter URL. zip() pairs the two result
# lists and stops at the shorter one, so a length mismatch between the two
# regexes can no longer raise IndexError as the index-based loop could.
dict1 = {title: f'http://www.shushun.cc{path}.html'
         for title, path in zip(mulu, wangzhi)}
for title, link in dict1.items():
    print(title, link)
# 网络爬虫 ("web crawler" — stray category tag pasted from the source blog page;
# commented out so the file parses)
# 最新推荐文章于 2021-03-21 17:41:03 发布 (blog publish-date line, same paste residue)