This post is just a memo for my own reference.
The key piece during development is BeautifulSoup: once you know how to use it, the features below are easy to implement.
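As a warm-up, here is a minimal sketch of the two BeautifulSoup calls everything below relies on: parsing a document and picking nodes with select() CSS selectors. The toy HTML string is made up purely for illustration.

from bs4 import BeautifulSoup

html = '<div id="content"><p>Hello<br/>World</p></div>'  # toy HTML, for illustration only
soup = BeautifulSoup(html, 'lxml')
for p in soup.select('#content > p'):  # same selector style as the scripts below
    print(p.get_text())                # -> HelloWorld (<br/> contributes no text)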
1. Parse the content of an HTML page into novel text and save it
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re


class getnovel:
    def __init__(self, url, path):
        self.url = url
        self.path = path

    def doget(self):
        req = requests.get(self.url)
        # The site serves GBK pages; let requests sniff the real encoding
        # instead of falling back to ISO-8859-1.
        req.encoding = req.apparent_encoding
        soup = BeautifulSoup(req.text, 'lxml')
        content = str(soup.select("#content > p")[0])
        # Drop whitespace artifacts (spaces, NBSP, ideographic space, tabs, CR, LF) ...
        pattern = re.compile(r'\n| |\xa0|\u3000|\t|\r')
        clean_str = pattern.sub('', content)
        # ... then turn <br> tags into real line breaks and strip the <p> wrapper.
        clean_str = clean_str.replace('<br>', '\n').replace('<br/>', '\n').replace('<p>', '').replace('</p>', '')
        title = soup.select('div[align="center"] > h2')[0].get_text().strip() + '.txt'
        article = ""
        for s in clean_str.split('\n'):
            # Chapter headings contain “第…章”; set them off with blank lines.
            # Ordinary lines are concatenated into one block per chapter.
            if '第' in s and '章' in s:
                article += "\n\n" + s + "\n"
            elif s != "":
                article += s
        with open(self.path + title, 'w', encoding='utf-8') as fw:
            fw.write(article)


if __name__ == '__main__':
    url = 'http://www.二级域名.me/12/12126/972764.html'
    a = getnovel(url=url, path='')
    a.doget()
Running this file saves the novel locally as a .txt file named after the novel.
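One detail worth calling out: the page content is GBK-encoded (the chapter markers 第/章 used to show up as the GBK bytes \xb5\xda and \xd5\xc2), and when the response headers declare no charset, requests decodes text responses as ISO-8859-1 by default. The fix used in doget() is to let requests sniff the real charset; a minimal sketch of that pattern, where url stands for any chapter page on the site:

import requests

url = 'http://www.二级域名.me/12/12126/972764.html'  # any chapter page
req = requests.get(url)
req.encoding = req.apparent_encoding  # sniff the real charset (GBK here)
text = req.text                       # now a properly decoded str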
2. Scrape the URLs of all novels shown in the front-page list
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests


class getUrlFromAllvisit:
    def __init__(self, url):
        self.url = url

    def doget(self):
        req = requests.get(self.url)
        soup = BeautifulSoup(req.text, 'lxml')
        # Each entry on the ranking page sits under
        # #alist > #alistbox > .info > .title > h2 > a
        lianjie = soup.select('div[id="alist"] > div[id="alistbox"] > div[class="info"] > div[class="title"] > h2 > a')
        resList = []
        for s in lianjie:
            resList.append(s['href'])
        return resList


if __name__ == '__main__':
    url = "http://www.二级域名.me/modules/article/toplist.php?sort=allvisit&page=1"
    test = getUrlFromAllvisit(url)
    resList = test.doget()
    for s in resList:
        print(s)
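The toplist URL takes a page query parameter, so the same class can be reused to walk several ranking pages. A hedged sketch (the page count of 5 is an arbitrary example, not something read from the site):

allUrls = []
for page in range(1, 6):  # 5 pages is an arbitrary example
    url = ('http://www.二级域名.me/modules/article/toplist.php'
           '?sort=allvisit&page=%d' % page)
    allUrls.extend(getUrlFromAllvisit(url).doget())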
3. Scrape the URLs of all chapters of a given novel
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests


class getUrlFromBookHTML:
    def __init__(self, url):
        self.url = url

    def doget(self):
        req = requests.get(self.url)
        soup = BeautifulSoup(req.text, 'lxml')
        # Chapter links live under .con > .ti > a and are site-relative,
        # so the domain is prefixed by hand.
        lianjie = soup.select('div[class="con"] > p[class="ti"] > a')
        resList = []
        for s in lianjie:
            resList.append('http://www.二级域名.me' + s['href'])
        return resList


if __name__ == '__main__':
    url = "http://www.二级域名.me/book/6571.html"
    test = getUrlFromBookHTML(url)
    resList = test.doget()
    for s in resList:
        print(s)
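Since the chapter hrefs are site-relative, the domain is concatenated by hand above. urllib.parse.urljoin does the same job and also copes with hrefs that are already absolute; a small sketch, using the chapter page from section 1 as the example href:

from urllib.parse import urljoin

base = 'http://www.二级域名.me/book/6571.html'
href = '/12/12126/972764.html'  # example relative href (the chapter page from section 1)
print(urljoin(base, href))      # -> http://www.二级域名.me/12/12126/972764.html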
4. Test
# -*- coding: utf-8 -*-
from getUrlFromAllvisit import getUrlFromAllvisit
from getUrlFromBookHTML import getUrlFromBookHTML
from getnovel import getnovel

# Collect every novel on page 1 of the ranking list ...
url = "http://www.二级域名.me/modules/article/toplist.php?sort=allvisit&page=1"
resList1 = getUrlFromAllvisit(url).doget()

# ... expand each novel into its chapter URLs ...
resList2 = []
for s in resList1:
    temp = getUrlFromBookHTML(s).doget()
    for ss in temp:
        resList2.append(ss)

# ... and download every chapter into the novel/ directory.
for s in resList2:
    getnovel(s, 'novel/').doget()
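Chained like this, a single failed chapter request aborts the whole run, and the requests fire back to back. A hedged variant of the final loop with a pause and basic error handling (the 1-second delay is an arbitrary choice):

import time

for s in resList2:
    try:
        getnovel(s, 'novel/').doget()
    except Exception as e:
        print('failed: %s (%s)' % (s, e))
    time.sleep(1)  # arbitrary pause to be gentle on the server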