以爬取起点中文网免费页面为例讲解(以主神黑店为例讲解)
获取要爬取小说的第一章节的url：
url = https://read.qidian.com/chapter/CTxPsgzdPBfu4xLcYRGW6w2/7t9v4ciILvngn4SMoDUcDQ2
进行伪装，防止服务器不返回数据
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'}
下面进行第一章爬取
从这里得到章节名
从这里获取内容
代码如下
import requests
from bs4 import BeautifulSoup  # NOTE: the class is BeautifulSoup; "Beautifulsoup" raises ImportError

# First chapter's URL. The original string had a stray leading space,
# which makes requests reject it as an invalid URL.
url = 'https://read.qidian.com/chapter/CTxPsgzdPBfu4xLcYRGW6w2/7t9v4ciILvngn4SMoDUcDQ2'
# Pretend to be a regular browser so the server returns the page.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'}

response = requests.get(url=url, headers=headers)
html = response.text

soup = BeautifulSoup(html, 'lxml')
# Chapter title. The CSS class is 'j_chapterName' — the original typo
# 'j_chapterNmae' makes find() return None and .getText() crash.
novel_name = soup.find('h3', {'class': 'j_chapterName'}).getText()
# .getText() is required here: concatenating a bs4 Tag with a str below
# would raise TypeError.
novel_txt = soup.find('div', {'class': 'read-content j_readContent'}).getText()
novel = novel_name + '\n' + novel_txt

save_path = 'C:\\Users\\23219\\Desktop\\pycharm_project\\小说\\主神黑店'
save_name = '\\主神黑店' + '.txt'
full_path = save_path + save_name
# 'with' closes the file even if write() raises; explicit UTF-8 avoids
# UnicodeEncodeError under the Windows default codec (gbk).
with open(full_path, 'w', encoding='utf-8') as fp:
    fp.write(novel)
这样第一章就被爬取下来了
用while循环进行整本小说的爬取
这里获取下一章的url
代码如下：
next_url = soup.find('a', {'id': 'j_chapterNext'})
url = 'https:' + next_url['href']
这样就获得了下一章的url
使用while循环进行批量爬取小说。
代码整理如下：
import requests
from bs4 import BeautifulSoup
#定义请求函数
# Request helper.
def get_html(url, headers):
    """Download *url* and return the response body as text.

    BUG FIX: headers must be passed as a keyword argument. The original
    ``requests.get(url, headers)`` bound the dict to the second positional
    parameter (``params``), so the User-Agent header was never actually
    sent — defeating the browser disguise entirely.
    """
    response = requests.get(url, headers=headers)
    return response.text
#定义解析函数
# Parsing helper.
def get_novel(html):
    """Parse one chapter page; return '<title>\\n<body text>'.

    The replace() call turns the site's paragraph-padding space character
    into line breaks so paragraphs survive as plain text.
    NOTE(review): the exact space character in the replace() literal looks
    like the fullwidth ideographic space qidian uses — confirm before
    editing this line.
    """
    soup = BeautifulSoup( html , 'lxml' )
    # 'h3.j_chapterName' holds the chapter title.
    novel_name = soup.find( 'h3' , { 'class' : 'j_chapterName' } ).getText()
    # 'div.read-content j_readContent' holds the chapter body.
    novel_txt = soup.find( 'div' , { 'class' : 'read-content j_readContent' }).getText().replace(" ","\n")
    return novel_name + '\n' + novel_txt
#定义获取下一章节的url
# Next-chapter lookup.
def get_next_url(html):
    """Return the protocol-relative href of the 'next chapter' link."""
    link = BeautifulSoup(html, 'lxml').find('a', {'id': 'j_chapterNext'})
    return link['href']
# Storage helper.
def save(novel, i):
    """Write one chapter's text to disk as 主神黑店<i>.doc.

    FIX: use a context manager so the handle is closed even if write()
    raises, and write explicit UTF-8 — the Windows default codec (gbk)
    can raise UnicodeEncodeError on some characters.
    """
    save_path = 'C:\\Users\\23219\\Desktop\\pycharm_project\\小说\\主神黑店'
    save_name = '\\主神黑店' + str(i) + '.doc'
    full_path = save_path + save_name
    with open(full_path, 'w', encoding='utf-8') as fp:
        fp.write(novel)
#定义主函数
# Main driver.
def main():
    """Crawl chapters starting from the first one, saving each to disk.

    FIXES vs. the original:
    * i was incremented *before* the first save, so files started at 2.
    * ``while i:`` never terminated — the loop ran until get_next_url
      crashed with a TypeError at the last free chapter, and because the
      next url was computed before saving, that final chapter was fetched
      but never written. Now each chapter is saved first, and the loop
      ends cleanly when no "next chapter" link exists.
    """
    url = 'https://read.qidian.com/chapter/CTxPsgzdPBfu4xLcYRGW6w2/7t9v4ciILvngn4SMoDUcDQ2'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:61.0) Gecko/20100101 Firefox/61.0'}
    i = 1
    while url:
        # Fetch and parse the current chapter.
        html = get_html(url=url, headers=headers)
        novel = get_novel(html=html)
        # Save before advancing so the last chapter is not lost.
        save(novel=novel, i=i)
        i += 1
        # find() returns None (TypeError on subscript) or the link may lack
        # an href (KeyError) once the free chapters run out — stop cleanly.
        try:
            url = 'https:' + get_next_url(html=html)
        except (TypeError, KeyError):
            url = None


if __name__ == '__main__':
    # Guard so importing this module does not start the crawl.
    main()