This article explains how to scrape some free novels from a novel site.
The process breaks down into three steps:
Step 1: fetch a single chapter page and extract its body text.
from urllib import request
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # URL of a single chapter page
    url = 'http://www.44txt.com/read/11276/5270031.html'
    # Send a browser User-Agent so the site serves the page normally
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    req = request.Request(url, headers=head)
    response = request.urlopen(req)
    html = response.read()
    soup = BeautifulSoup(html, 'lxml')
    # The chapter body sits in the page's first <p> tag
    soup_texts = soup.find('p')
    # Break the text into lines at each Chinese full stop
    print(soup_texts.text.replace('。', '\n'))
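Network fetches like this one can fail mid-run, so it is worth wrapping them in a try/except. A minimal sketch using only the standard urllib.error exceptions; the helper name fetch and its parameters are illustrative, not part of the original script:

from urllib import request, error

def fetch(url, headers, timeout=10):
    # Return the raw page bytes, or None if the request fails
    try:
        req = request.Request(url, headers=headers)
        return request.urlopen(req, timeout=timeout).read()
    except error.HTTPError as e:
        print('HTTP error', e.code, 'for', url)   # e.g. 404, 503
    except error.URLError as e:
        print('network error', e.reason, 'for', url)
    return None

With a helper like this, each step can check if the result is None before handing it to BeautifulSoup.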
Step 2: fetch the novel's table-of-contents page and locate the url and tag of every chapter.
from urllib import request
from bs4 import BeautifulSoup

# URL of the novel's table-of-contents page
url = 'http://www.44txt.com/read/11276.html'
head = {}
head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
req = request.Request(url, headers=head)
response = request.urlopen(req)
html = response.read()
# Parse the table-of-contents page
soup = BeautifulSoup(html, 'lxml')
# The chapter list sits in <div class="book_con_list">
soup_texts = soup.find('div', class_="book_con_list")
# hh = soup_texts.find_all('a')  # each <a> inside holds one chapter link
print(soup_texts)
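The full script in step 3 builds each chapter's absolute URL by prepending a hard-coded prefix. The standard library's urllib.parse.urljoin can do the same resolution against the table-of-contents URL; a short sketch, assuming (as step 3 implies) that each href is a relative path:

from urllib import request
from urllib.parse import urljoin
from bs4 import BeautifulSoup

toc_url = 'http://www.44txt.com/read/11276.html'
req = request.Request(toc_url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(request.urlopen(req).read(), 'lxml')
# Each <a> in the chapter list holds a title and a relative link
for a in soup.find('div', class_='book_con_list').find_all('a'):
    print(a.string, '->', urljoin(toc_url, a.get('href')))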
Step 3: put the two steps together: walk the table of contents, fetch every chapter, and write the whole novel to a text file.

from urllib import request
from bs4 import BeautifulSoup

if __name__ == '__main__':
    # Chapter titles and URLs, keyed by position in the table of contents
    name = {}
    urls = {}
    k = 0
    f = open('六界之主.txt', 'w', encoding='utf-8')
    # Table-of-contents page, plus the prefix for the relative chapter links
    url = 'http://www.44txt.com/read/11276.html'
    url1 = 'http://www.44txt.com/read/'
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    req = request.Request(url, headers=head)
    response = request.urlopen(req)
    html = response.read()
    # Parse the table-of-contents page
    soup = BeautifulSoup(html, 'lxml')
    # The chapter list sits in <div class="book_con_list">
    soup_texts = soup.find('div', class_="book_con_list")
    hh = soup_texts.find_all('a')
    for i in hh:
        # Record each chapter's title and absolute URL
        name[k] = i.string
        urls[k] = url1 + i.get('href')
        k = k + 1
    # Walk the collected links in reverse (the page appears to list the
    # newest chapter first), so the file reads from the first chapter on
    for _ in hh:
        k = k - 1
        f.write(name[k] + '\n')
        req = request.Request(urls[k], headers=head)
        response = request.urlopen(req)
        html = response.read()
        soup = BeautifulSoup(html, 'lxml')
        # As in step 1, the chapter body sits in the first <p> tag
        soup_texts = soup.find('p')
        f.write(soup_texts.text.replace('。', '\n'))
    f.close()
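Downloading hundreds of chapters in a tight loop risks getting the scraper blocked, and an exception midway would leave the file unclosed. A sketch of the download loop with a context manager and a pause between requests; the helper name download, its parameters, and the one-second delay are illustrative choices, and the titles and URLs are assumed to already be in first-chapter-first order:

import time
from urllib import request
from bs4 import BeautifulSoup

def download(titles, chapter_urls, head, out_path='六界之主.txt', delay=1.0):
    # 'with' closes the file even if a request raises midway
    with open(out_path, 'w', encoding='utf-8') as f:
        for title, chapter_url in zip(titles, chapter_urls):
            req = request.Request(chapter_url, headers=head)
            soup = BeautifulSoup(request.urlopen(req).read(), 'lxml')
            f.write(title + '\n')
            f.write(soup.find('p').text.replace('。', '\n'))
            # Pause so the requests do not hammer the server
            time.sleep(delay)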