Python novel scraper: writing a Python novel scraper from scratch


This post walks through the implementation of a simple novel scraper. If you spot any problems in the code, please point them out.

1. Imports. You generally only need these two packages; I won't cover how to install them here, since instructions are easy to find online.

import requests
from bs4 import BeautifulSoup

2. As with a Java crawler, the core of scraping is fetching the page content for a given URL.

def download_page(url):
    data = requests.get(url).content
    return data
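The plain requests.get above is enough to follow along. As a rough hardening sketch (the header value and the timeout are my own assumptions, not part of the original post), a slightly more defensive fetch might look like this:

def download_page_safe(url):
    # Hypothetical variant: the browser-like User-Agent and the timeout are assumptions here
    headers = {'User-Agent': 'Mozilla/5.0'}
    response = requests.get(url, headers=headers, timeout=10)
    response.raise_for_status()  # fail loudly instead of parsing an error page
    return response.content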

3. Next comes the scraping logic. Open the target page in a browser, find the content you need, and then locate it in the HTML through its parent node.

def parse_html(html):
    # Convert the raw HTML returned by download_page into a BeautifulSoup object
    soup = BeautifulSoup(html, 'html.parser')
    # Grab the search-result table
    movie_list_soup = soup.find('table')
    # print(movie_list_soup)
    # Book URLs
    movie_list = []
    # Book titles
    movie_name_list = []
    if movie_list_soup is not None:
        i = 1
        # Walk every tr in the table
        for movie_li in movie_list_soup.find_all('tr'):
            # Skip the header row
            if movie_li.find_all('th'):
                continue
            # The a tag in the first td carries the book's URL
            a_ = movie_li.find_all('td', attrs={'class': 'odd'})[0].find('a')
            print(i, '.', a_.text)
            movie_list.append(a_['href'])  # store the book URL
            movie_name_list.append(a_.text)
            i = i + 1
        # Let the user pick a book by its number
        count = int(input('Enter the book number: ')) - 1
        page = BeautifulSoup(download_page(movie_list[count]), 'html.parser')
        # Each dd on the book page holds one chapter link
        dd_s = page.find_all('dd')
        # Note: the D:/SanMu/ directory must already exist
        file_handle = open('D:/SanMu/' + movie_name_list[count] + '.txt', mode='w', encoding='utf-8')
        for dd in dd_s:
            beautiful_soup = BeautifulSoup(download_page(dd.find('a')['href']), 'html.parser')
            # Chapter title
            name = beautiful_soup.find_all('h1')[0].text
            file_handle.write(name)
            file_handle.write('\r\n')
            # Chapter body: strip the wrapper markup and split it into paragraphs.
            # The tag strings below were lost when the post was rendered; the values
            # used here are assumptions based on the chapter page's content div markup.
            catalogue_html = str(beautiful_soup.find('div', attrs={'id': 'content'}))
            html_replace = catalogue_html.replace('<div id="content">', '')
            replace = html_replace.replace('/n', '').replace('</div>', '').replace('</p>', '')
            split = replace.split('<p>')
            for p_ in split:
                file_handle.write(p_)
                file_handle.write('\r\n')
        file_handle.close()
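The chained replace calls above are tied to the exact markup of the content div, which varies across biquge mirrors. A rough alternative sketch (not from the original post) is to let BeautifulSoup extract the text directly:

content_div = beautiful_soup.find('div', attrs={'id': 'content'})
if content_div is not None:
    # One paragraph per line; strip=True trims the full-width indent spaces
    text = content_div.get_text(separator='\r\n', strip=True)
    file_handle.write(text)
    file_handle.write('\r\n')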

Finally, all that's left is to call the two functions.

def main():
    parse_html(download_page("https://www.biquge5200.com/modules/article/search.php?searchkey=" + input("Search: ")))

main()
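Two small optional refinements, not from the original post: percent-encoding the search keyword so a Chinese title survives in the query string (UTF-8 is assumed to be what the site expects), and guarding the entry point so the module can be imported without immediately prompting for input.

from urllib.parse import quote

def main():
    keyword = input('Search: ')
    # quote() percent-encodes the keyword; UTF-8 encoding is an assumption here
    url = 'https://www.biquge5200.com/modules/article/search.php?searchkey=' + quote(keyword)
    parse_html(download_page(url))

if __name__ == '__main__':
    main()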

And with that, a scraper for biquge (笔趣阁) novels is done. Pretty simple, right? If you have questions, please leave a comment. The complete code is attached below.

import requests
from bs4 import BeautifulSoup

def download_page(url):
    data = requests.get(url).content
    return data

def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    movie_list_soup = soup.find('table')
    # print(movie_list_soup)
    movie_list = []
    movie_name_list = []
    if movie_list_soup is not None:
        i = 1
        for movie_li in movie_list_soup.find_all('tr'):
            if movie_li.find_all('th'):
                continue
            a_ = movie_li.find_all('td', attrs={'class': 'odd'})[0].find('a')
            print(i, '.', a_.text)
            movie_list.append(a_['href'])
            movie_name_list.append(a_.text)
            i = i + 1
        count = int(input('Enter the book number: ')) - 1
        page = BeautifulSoup(download_page(movie_list[count]), 'html.parser')
        dd_s = page.find_all('dd')
        file_handle = open('D:/SanMu/' + movie_name_list[count] + '.txt', mode='w', encoding='utf-8')
        for dd in dd_s:
            beautiful_soup = BeautifulSoup(download_page(dd.find('a')['href']), 'html.parser')
            name = beautiful_soup.find_all('h1')[0].text
            file_handle.write(name)
            file_handle.write('\r\n')
            catalogue_html = str(beautiful_soup.find('div', attrs={'id': 'content'}))
            # The tag strings here are assumed; the originals were lost when the post was rendered
            html_replace = catalogue_html.replace('<div id="content">', '')
            replace = html_replace.replace('/n', '').replace('</div>', '').replace('</p>', '')
            split = replace.split('<p>')
            for p_ in split:
                file_handle.write(p_)
                file_handle.write('\r\n')
        file_handle.close()

def main():
    parse_html(download_page("https://www.biquge5200.com/modules/article/search.php?searchkey=" + input("Search: ")))

main()
