This scrapes the "free" (pirated) copy of the novel 《三寸人间》 from a piracy site (please support the official release when reading novels).
Here is the source code:
from urllib import request
from bs4 import BeautifulSoup
import re
# Fetch the HTML source of the novel's index page
response = request.urlopen("http://www.biquge.com.tw/14_14055/")
html = response.read()
# Parse the HTML
soup = BeautifulSoup(html, "html.parser")
# Find the chapter hyperlinks we need via a regex match on href
all_href = soup.find_all(href=re.compile("^/14_14055/"))
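# (The regex "^/14_14055/" matches every href that begins with the book's
# path, i.e. the per-chapter links on the index page.)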
# Split the stringified result so each entry holds one title and its link
all_href_name = str(all_href).split(",")
# Build a dict mapping each link (key) to its chapter title (value)
all_href_name_dict = {}
for each_href in all_href_name:
    soup_href = BeautifulSoup(each_href, "html.parser")
    key = soup_href.a["href"]
    value = soup_href.get_text()
    all_href_name_dict[key] = value
print(all_href_name_dict)
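# A sketch of an alternative (my addition, not in the original post): building
# the dict straight from the tag list avoids the str()/split(",") round-trip
# and would survive a chapter title that itself contains a comma:
# all_href_name_dict = {tag["href"]: tag.get_text() for tag in all_href}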
# del all_href_name_dict['/14_14055/']  # uncomment to drop the index page's own link
# Fetch the HTML source of a page
def get_html(url):
    response = request.urlopen(url)
    html = response.read()
    return html
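# A sketch, and my addition rather than part of the original code: some
# servers reject urllib's default User-Agent, so a browser-like header can be
# passed via request.Request; the header value below is only an example.
def get_html_with_headers(url):
    req = request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    with request.urlopen(req) as response:
        return response.read()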
# Parse a chapter page and extract the novel content
def get_content(url):
    content_html = get_html(url)
    soup = BeautifulSoup(content_html, "html.parser")
    txt_show = soup.find_all('div', attrs={'id': 'content'})
    return txt_show[0]
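# A sketch (my addition): soup.find(id="content") returns one tag or None, so
# a missing chapter page is easy to detect, and get_text() drops the markup
# without the manual replace chain used below.
def get_content_text(url):
    tag = BeautifulSoup(get_html(url), "html.parser").find(id="content")
    return tag.get_text("\n") if tag is not None else ""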
# Append a chapter to a local txt file
def write_to_txt(context, title):
    with open('sancunrenjian.txt', 'a', encoding='utf-8') as f:
        f.write('\n' + title + '\n' + context)
# Main flow: walk the chapter dict and save every chapter
for k, v in all_href_name_dict.items():
    chapter_url = "http://www.biquge.com.tw" + k
    print(chapter_url)
    chapter_txt = get_content(chapter_url)
    # Strip the wrapper markup we don't want in the output
    txt = str(chapter_txt).replace("<div id=\"content\">", "").replace("<br/>", "").replace("</div>", "")
    write_to_txt(txt, v)
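# A sketch of the same loop using the get_content_text() helper above (my
# addition); it is commented out so it does not re-download everything:
# for k, v in all_href_name_dict.items():
#     write_to_txt(get_content_text("http://www.biquge.com.tw" + k), v)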
Part of the output: