# 爬取武侠小说-天龙八部(精简版)
# 一、爬取天龙八部五十章:
#三个功能函数,一个主函数
import requests
from bs4 import BeautifulSoup
#1.获取网页源代码的函数
def getHTMLText(url):
    """Download ``url`` and return the response body as text.

    The body is decoded with the encoding detected from its content
    (``apparent_encoding``) instead of the one declared by the server.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or an empty string when the request fails
        (invalid URL, connection error, timeout or HTTP error status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: a failed download yields "" so the caller can go on.
        # Only request failures are caught, so Ctrl-C still stops the crawl.
        return ""
    r.encoding = r.apparent_encoding
    return r.text
#2.解析源代码,输出url
def fillurlList(ulist, html, base_url='http://wuxia.net.cn'):
    """Collect the links found in the first ``<dl>`` element of a page.

    Each ``href`` is resolved against ``base_url`` and appended to ``ulist``.
    Anchors without an ``href`` attribute are skipped, and a page without
    any ``<dl>`` element leaves ``ulist`` unchanged.

    Args:
        ulist: List that receives the URLs; it is modified in place.
        html: HTML source of the page to scan.
        base_url: URL against which every ``href`` is resolved.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    first_dl = soup.find('dl')
    if first_dl is None:
        # Nothing to collect (e.g. the download failed and html is empty).
        return
    # href=True keeps only anchors that actually carry an href attribute, so
    # one malformed link no longer discards all the links that follow it.
    for anchor in first_dl.find_all('a', href=True):
        ulist.append(urljoin(base_url, anchor['href']))
#3.解析源代码,输出文本
def fillUnivList(html, max_paragraphs=300):
    """Print the text of the leading ``<p>`` elements of an HTML page.

    Args:
        html: HTML source of the page.
        max_paragraphs: Positive upper bound on the number of paragraphs
            printed; a page with fewer paragraphs is printed in full.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # A single limited search replaces one full search per printed paragraph
    # and removes the need to end the loop through an IndexError.
    for paragraph in soup.find_all('p', limit=max_paragraphs):
        print(paragraph.text)
#主函数
def main():
    """Fetch the novel's index page and print every page linked from it."""
    index_page = getHTMLText('http://wuxia.net.cn/book/tianlongbabu.html')
    chapter_urls = []
    # fillurlList appends the discovered URLs to the list in place.
    fillurlList(chapter_urls, index_page)
    for chapter_url in chapter_urls:
        fillUnivList(getHTMLText(chapter_url))
# Start the crawl only when the file is executed as a script, so importing
# the module does not trigger network requests.
if __name__ == "__main__":
    main()