一、创建reptileWuxia.py文件,使用BeautifulSoup模块。
1、环境安装:
1)安装Python 3.6.1。
2)配置环境变量:
测试:python --version
3)安装BeautifulSoup:
pip install beautifulsoup4
二、代码实现(用到多线程)
# coding=utf-8
import urllib.request
import re
import time
import os
import threading
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from urllib.parse import urljoin
from threading import Thread
from concurrent import futures
#from concurrent.futures import ThreadPoolExecutor
class BookProperty:
    """Value object describing a book (or author) entry.

    Attributes:
        name: book or author name.
        url: entry URL for the work.
        worksDict: mapping of {url: work name} for related works.
    """

    def __init__(self, name, url, worksDict):
        self.name = name  # book or author name
        self.url = url  # entry URL
        self.worksDict = worksDict  # {url: work name}

    def __repr__(self):
        # Added for debuggability; the class previously had no repr.
        return (f'{type(self).__name__}(name={self.name!r}, '
                f'url={self.url!r}, worksDict={self.worksDict!r})')
class OutputContent:
    """Writes crawled chapter text to disk."""

    def createDirectory(self, directory):
        """Create *directory* under the current working directory if needed
        and return its absolute path."""
        localDir = os.path.join(os.getcwd(), directory)
        # exist_ok avoids the check-then-create race of the original code.
        os.makedirs(localDir, exist_ok=True)
        return localDir

    def createFile(self, newfile):
        """Create (or truncate) *newfile* as an empty UTF-8 text file."""
        with open(newfile, 'w', encoding='utf-8'):
            pass

    def writeContent(self, fileName, chapterList):
        """Write every entry of *chapterList* to *fileName*, replacing any
        previous content. Errors are printed, never raised."""
        try:
            # The original truncated via createFile() and then reopened in
            # append mode; a single 'w' open produces the identical file.
            with open(fileName, 'w', encoding='utf-8') as f:
                f.writelines(chapterList)
        except Exception as e:
            print('save file error.' + str(e))
class ParserPage:
    """Fetches pages and extracts author/book/chapter data from the site's HTML."""

    def loadPage(self, url):
        """Download *url* and return a BeautifulSoup tree, or None on failure."""
        html = None
        try:
            response = urllib.request.urlopen(url)
            # The site serves GB2312-encoded pages; skip undecodable bytes.
            html = response.read().decode('gb2312', 'ignore')
        except Exception as e:
            print(e)
        if html is None:
            # Fetch failed -- don't hand None to BeautifulSoup (the original
            # relied on the except below to mask the resulting error).
            return None
        try:
            return BeautifulSoup(html, 'html.parser')
        except Exception as ex:
            print(ex)
            return None

    def __urlHandle(self, *url):
        """With two args: join base + relative href. With one arg: return the
        site root as 'scheme://netloc'."""
        if len(url) > 1:
            return urljoin(url[0], url[1])
        parts = urlparse(url[0])
        return parts.scheme + '://' + parts.netloc

    def __parsetAuthorWorks(self, url, soup):
        """Collect {absolute_url: link_text} from elements whose class matches
        'style2'/'style3' (case-insensitive; the site mixes cases)."""
        worksDict = {}
        for tag in soup.find_all(class_=re.compile('style2|style3', re.IGNORECASE)):
            children = tag.contents
            if children and children[0].name == 'a' and children[0].get_text() != '':
                href = self.__urlHandle(url, children[0].get('href'))
                worksDict[href] = children[0].get_text()  # url -> author or book name
        return worksDict

    def parserOneAuthorWorks(self, url):
        """Return {'authorName', 'worksDict'} for one author page, or None if
        the page could not be fetched."""
        soup = self.loadPage(url)
        if soup is None:
            return None
        # BUG FIX: authorName was unbound (NameError) when the breadcrumb
        # had fewer than two entries; default to an empty string instead.
        authorName = ''
        navList = soup.select('.LinkPath')  # breadcrumb; 2nd entry is the author
        if len(navList) > 1:
            authorName = navList[1].get_text()
        worksDict = self.__parsetAuthorWorks(url, soup)
        return {'authorName': authorName, 'worksDict': worksDict}

    def parserAllAuthorName(self, url, authorName):
        """Return {'authorName', 'url', 'worksDict'} for an author index page,
        or None if the page could not be fetched."""
        soup = self.loadPage(url)
        if soup is None:
            return None
        return {'authorName': authorName, 'url': url,
                'worksDict': self.__parsetAuthorWorks(url, soup)}

    def parserCatalogue(self, url):
        """Parse a book's table of contents; return a list of absolute chapter
        URLs, or None if the page could not be fetched."""
        soup = self.loadPage(url)
        if soup is None:
            return None
        domain = self.__urlHandle(url)
        # Chapter links carry class="1"; their hrefs are site-root relative.
        return [domain + a.attrs['href'] for a in soup.find_all('a', {'class': '1'})]

    def parserOnePage(self, url):
        """Parse one chapter page; return {'content', 'nextUrl'} or None."""
        soup = self.loadPage(url)
        if soup is None:
            return None
        content = self.__parserPageContent(soup)
        return {'content': content[0], 'nextUrl': self.__isNextPage(soup, url)}

    def parsetOnePageNotCatalog(self, url):
        """Like parserOnePage, but for books without a catalogue page: the
        'next' link is only followed while it stays in the same chapter."""
        soup = self.loadPage(url)
        if soup is None:
            return None
        content = self.__parserPageContent(soup)
        return {'content': content[0],
                'nextUrl': self.__isNextPage(soup, url, content[1])}

    def __parserPageContent(self, soup):
        """Return (chapter_text, chapter_title); either may be None if the
        corresponding element is missing from the page."""
        h3Tag = soup.find('h3')
        spanTag = soup.find('span')
        chapterData = chapterName = None
        if h3Tag is not None:
            chapterName = h3Tag.get_text()  # chapter title
            chapterData = chapterName + '\n'
        if spanTag is not None:
            chapterContent = spanTag.get_text()  # chapter body text
            if chapterContent is not None:
                chapterContent = ''.join(chapterContent.split())  # drop all whitespace
            if chapterData is not None:
                chapterData = chapterData + chapterContent + '\n'
            else:
                chapterData = chapterContent + '\n'
        return chapterData, chapterName

    def __isNextPage(self, *args):
        """args = (soup, url[, chapterName]). Return the absolute next-page
        URL, or None when there is no next link or (when chapterName is
        given) the next link points at a different chapter."""
        nextUrl = None
        # The catalogue does not list every page, so chapters are chained
        # through this link; its href lacks the domain.
        nextATag = args[0].find('a', {'class': 'LinkNextArticle'})
        if nextATag is not None:
            domain = self.__urlHandle(args[1])
            nextUrl = domain + nextATag.attrs['href']
            if len(args) > 2 and args[2] is not None:
                nextText = ''.join(nextATag.get_text().split())
                chapterName = ''.join(args[2].split())
                # A differing 2-char prefix means a new chapter has started.
                if nextText[0:2] != chapterName[0:2]:
                    nextUrl = None
        return nextUrl
class ReptileManager:
    """Drives the crawl: one book, one author's books, or the whole site."""

    def __init__(self, url):
        self.url = url  # entry URL of the crawl
        self.parser = ParserPage()
        self.output = OutputContent()

    def reptileBook(self, url, fileName):
        """Crawl one book starting at *url* (catalogue page or first chapter
        page) and write all chapter text to *fileName*. Returns *fileName*,
        or None when the catalogue/first page could not be fetched."""
        urls = self.parser.parserCatalogue(url)
        if urls is None:
            return None
        contentList = []
        if urls:
            # Book has a catalogue: walk every listed chapter, then follow
            # 'next page' links (the catalogue does not list every page).
            for chapterUrl in urls:
                result = self.parser.parserOnePage(chapterUrl)
                if result is None:
                    continue
                contentList.append(result['content'])
                nextUrl = result['nextUrl']
                while nextUrl:
                    result = self.parser.parserOnePage(nextUrl)
                    if result is None:
                        break
                    contentList.append(result['content'])
                    nextUrl = result['nextUrl']
        else:
            # No catalogue: start at the page itself and chain 'next' links.
            result = self.parser.parsetOnePageNotCatalog(url)
            if result is None:
                return None
            contentList.append(result['content'])
            nextUrl = result['nextUrl']
            while nextUrl:
                result = self.parser.parsetOnePageNotCatalog(nextUrl)
                # BUG FIX: the original had no None guard here (unlike the
                # catalogue branch) and crashed when a fetch failed.
                if result is None:
                    break
                contentList.append(result['content'])
                nextUrl = result['nextUrl']
        # BUG FIX: the original tested 'not contentList is None', which is
        # always true for a list; only write when something was collected.
        if contentList:
            self.output.writeContent(fileName, contentList)
        return fileName

    def reptileOneAuthorWorksBooks(self):
        """Crawl every book of the single author page at self.url."""
        print('解析开始时间:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        works = self.parser.parserOneAuthorWorks(self.url)
        self.__reptileMuchBooks(works)
        print('解析完成时间:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    def reptileAllAuthorAllWorks(self, url):
        """Fetch every author's works index concurrently; return a list of
        {'authorName', 'url', 'worksDict'} dicts (empty on index failure)."""
        worksList = []
        futureList = []
        result = self.parser.parserAllAuthorName(url, '')
        # BUG FIX: the original indexed result['worksDict'] without checking
        # for a failed fetch, raising TypeError on None.
        if result is None:
            return worksList
        with futures.ThreadPoolExecutor(max_workers=10) as executor:
            for authorUrl, authorName in result['worksDict'].items():
                futureList.append(
                    executor.submit(self.parser.parserAllAuthorName,
                                    authorUrl, authorName))
            for future in futures.as_completed(futureList):
                worksList.append(future.result())
        return worksList

    def reptileAllAuthorBoos(self):
        """Crawl every book of every author reachable from self.url."""
        print('开始时间:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
        worksList = self.reptileAllAuthorAllWorks(self.url)
        i = 0
        print(len(worksList))
        for works in worksList:
            i += 1
            # NOTE(review): 'i > 89' looks like leftover resume logic that
            # skips the first 89 authors -- confirm before removing.
            if i > 89:
                self.__reptileMuchBooks(works)
        print('结束时间:' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

    def __reptileMuchBooks(self, works):
        """Crawl all books in works['worksDict'] concurrently, saving each as
        '<n><title>.txt' in a directory named after the author."""
        bookIndex = 0
        futureList = []
        filePath = self.output.createDirectory(works['authorName'])
        with futures.ThreadPoolExecutor(max_workers=10) as executor:
            for bookUrl, bookName in works['worksDict'].items():
                bookIndex += 1
                # Numeric prefix keeps duplicate titles from colliding.
                fileName = os.path.join(filePath, str(bookIndex) + bookName + '.txt')
                futureList.append(executor.submit(self.reptileBook, bookUrl, fileName))
            for future in futures.as_completed(futureList):
                print(future.result())
if __name__ == '__main__':
    # Script entry point: crawl all books of all authors from the site root.
    manager = ReptileManager('this is URL')  # TODO: replace with the real site URL
    manager.reptileAllAuthorBoos()
参考:
'''
https://beautifulsoup.readthedocs.io/zh_CN/latest/#
https://www.jianshu.com/p/62145aed2d49
https://www.jianshu.com/p/b9b3d66aa0be
https://github.com/yanbober/SmallReptileTraining/tree/master/ConcurrentSpider
https://www.gulongbbs.com/wuxia/ 测试使用
'''