Scraping Novels from a Website with python3 + beautifulSoup 4.6 (Part 4): Multithreaded Scraping

In the previous article, the output used a two-level directory layout: a root directory "小说" (novels), a second-level directory named after each work, and the novel files inside it.

This article reworks part of the code so that the layout becomes root directory -> author directory -> work directory -> chapter .txt files.

That, however, is not the main point of this article. The point is that when running this crawler, the program was frequently interrupted by network problems such as packet loss.

My first idea was to poll the site's status in a loop and re-issue the request, but that didn't seem to help much. Then I came across multithreading in 虫师 (Chongshi)'s book on Selenium, gave it a try, and found it was much faster. Cool!
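For reference, the retry idea I mentioned looked roughly like the sketch below. This is only an illustration of that attempt; the helper name fetch_with_retry and the retry count are made up here and are not part of the scraper's final code.

from time import sleep
from urllib import request

def fetch_with_retry(url, headers, retries=3):
    # Naive retry loop: re-issue the request after a short pause
    # whenever the connection fails. In practice this alone did not
    # make the crawl reliable.
    for attempt in range(retries):
        try:
            req = request.Request(url, headers=headers)
            return request.urlopen(req).read()
        except OSError as error:
            print('retry', attempt + 1, 'after error:', error)
            sleep(2)
    return None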

The multithreading code below is largely adapted from 虫师 (Chongshi)'s Selenium 2 book.

Importing and using threading

import threading

Starting a function in a thread: threading.Thread(target=music, args=('music argument 1', music_argument_2))

from time import sleep, ctime
import threading

def music(func, loop):
    for i in range(loop):
        print('music', func, ctime())
        sleep(2)

def movie(func, loop):
    for i in range(loop):
        print('movie', func, ctime())
        sleep(4)

def testOne():
    music('简单的歌', 2)
    movie('两杆大烟枪', 2)
    print('all end', ctime())

def testTwo():
    threads = []
    t1 = threading.Thread(target=music, args=('喜欢的人', 2))
    threads.append(t1)
    t2 = threading.Thread(target=movie, args=('搏击俱乐部', 2))
    threads.append(t2)
    t3 = threading.Thread(target=music, args=('喜欢的人2', 2))
    threads.append(t3)

    for t in threads:
        t.start()
    for t in threads:
        t.join()

    print('all end', ctime())

if __name__ == '__main__':
    testOne()
    # testTwo()
    # testThree()
    # threadsRun()

The t.join() call blocks the caller until that thread has finished, which guarantees that the 'all end' line is printed only after every thread is done.

Creating a thread wrapper class

The class inherits from Thread right in its definition: class MyThread(threading.Thread)

class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)

self: the class instance, the implicit first parameter

func: the function to be called

args: the arguments passed to that function

name: the function's name, e.g. func.__name__

Full code:

class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)


def super_play(file_, time):
    for i in range(3):
        print('play', file_, ctime())
        sleep(time)


def time(args):
    pass


def testThree():
    threads = []
    lists = {'气球.mp3': 3, '电影.rmvb': 4, 'last.avg': 2}
    for file_, time_ in lists.items():
        t = MyThread(super_play, (file_, time_), super_play.__name__)
        threads.append(t)

    files = range(len(lists))

    for f in files:
        threads[f].start()
    for f in files:
        threads[f].join()

    print('all end', ctime())


Adapting the novel scraper

With multithreading covered, how do we hook it into the novel-scraper class we already wrote? It's simple.

First, rework pageOne:

def readPageOneByThread(self, page, time_):
    page_url = str(self.two_page_url)
    new_page_url = page_url.replace("?", page)
    print('第', page, '页---', new_page_url)
    path = self.folder_path
    self.readPageTwo(new_page_url, path)
    sleep(time_)
# end readPageOneByThread ---------------------------------------

In the __init__ method, self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
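In other words, the "?" in two_page_url is a placeholder that gets swapped for the page number. A minimal sketch of that substitution (the URL is the one from __init__; the page numbers below are only for illustration):

two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"

for page in ('1', '2', '3'):  # illustrative page numbers only
    new_page_url = two_page_url.replace("?", page)
    print(new_page_url)
    # -> .../jingpinxiaoshuo/5_1.html, 5_2.html, 5_3.html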

Next, write the method that creates and runs the threads:

def threadsRun(self):
    # self.readPageOne(122)
    for i in range(1, 123):
        page = str(i)
        t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
        # t = threading.Thread(target=self.testRun, args=( str(i) ))
        self.threads.append(t)

    for t in self.threads:
        t.start()
    for t in self.threads:
        t.join()
        # t.join()

    print('all end: %s' % ctime())


class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)

I took a shortcut here and hard-coded the total number of pages. You could instead reuse the original pageOne approach of reading the "last" pagination link to get the page count, as sketched below.
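A sketch of that alternative, reusing the same selector the existing readPageOne() uses (soup.find("a", 'last') and int(last.string)). The method name getPageCount is only an illustration of a method that could be added to the Capture class; it is not part of the code below.

def getPageCount(self):
    # Read the category index page and take the total page count
    # from the text of the "last" pagination link, exactly as
    # readPageOne() already does when no count is passed in.
    soup = self.getSoup(self.one_page_url)
    last = soup.find("a", 'last')
    return int(last.string)

# threadsRun() could then replace the hard-coded 123 with:
# for i in range(1, self.getPageCount() + 1):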

The full code is below:

# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
from time import sleep, ctime
import os
import threading
import re
import random

'''
Scrape web pages with BeautifulSoup
version: 0.5  switched to locally cached links
author: yaowei
date: 2018-03-23
'''


class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '绯色/'
        self.href_list = []
        self.head = {}
        self.threads = []
        # set the User-Agent header
        self.head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # end __init__ ---------------------------------------

    # build a BeautifulSoup object for the given URL
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        return soup
        # soup = BeautifulSoup(html, 'html5lib')

    # read the category page and open each pagination link
    def readPageOne(self, count=None, time_=2):

        print('count=====', count)

        # total number of pages
        if count:
            item_size = count
        else:
            # read the page to find the count
            soup = self.getSoup(self.one_page_url)
            last = soup.find("a", 'last')
            item_size = int(last.string)

        print('item_size=====', item_size)
        page_url = str(self.two_page_url)

        # loop over the pagination links and read each listing page
        for item in range(item_size):
            page = str(item + 1)
            new_page_url = page_url.replace("?", page)
            print('第', page, '页---', new_page_url)
            path = self.folder_path
            self.readPageTwo(new_page_url, path)

            sleep(time_)
    # end readPageOne ---------------------------------------

    def readPageOneByThread(self, page, time_):
        page_url = str(self.two_page_url)
        new_page_url = page_url.replace("?", page)
        print('第', page, '页---', new_page_url)
        path = self.folder_path
        self.readPageTwo(new_page_url, path)
        sleep(time_)
    # end readPageOneByThread ---------------------------------------

    # read one listing (pagination) page
    def readPageTwo(self, page_url, path):
        soup = self.getSoup(page_url)
        # first div[id="newscontent"]->div[class="l"]
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        # first div[id="newscontent"]->div[class="l"]->all span[class="s2"]
        span_list = con_div.find_all('span', {'class': 's2'})

        # iterate over the spans
        for span in span_list:
            # find span[class="s5"] under the parent node; the author name is used as the folder name
            author = span.parent.find('span', {'class': 's5'}).get_text()

            # span[class="s2"]->a
            a_href = span.find('a')
            href = a_href.get('href')  # link to a single work
            folder_name = a_href.get_text()  # title of the work
            print('a_href', href, '---folder_name', folder_name)
            new_path = path + '/' + author + '/' + folder_name
            self.createFolder(new_path)  # create the folder

            self.readPageThree(href, new_path)  # read the single work

            # t = threading.Thread(target=self.readPageThree, args={href, new_path})
            # self.threads.append(t)
        # end for

    # end readPage ---------------------------------------

    # open a work's page and iterate over its chapters
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)  # the work's page
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0  # chapter index
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            file_path = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('file_a_href', href, '---file_path', file_path)

            '''
            new_path = self.isTxt(file_path)
            if new_path:
                print(new_path)
                file_object = open('网页链接//hrefs.txt', 'w', encoding='utf-8')
                file_object.write(href+','+new_path)
                file_object.close()
            '''
            self.readPageFour(href, file_path)

            # self.href_list.append({'href': href, 'file_path': file_path})

            # multithreaded variant (disabled)
            # t = threading.Thread(target=self.readPageFour, args={href, file_path})
            # t.start()
            # t.join(15)

    # end readPageThree ---------------------------------------

    # read one chapter's content and write it to a file
    def readPageFour(self, page_url, path):
        new_path = self.isTxt(path)  # returns '' if the file exists, otherwise a sanitized file name
        if new_path:
            soup = self.getSoup(page_url)
            con_div = soup.find('div', {'id': 'content'})  # read the text content
            content = con_div.get_text().replace('<br/>', '\n').replace('\xa0', ' ')
            # content = content.replace('&','').replace('amp;','').replace('rdquo;','').replace('ldquo;','')
            # content = content.rstrip("& amp;rdquo;amp;& amp;ldquo;")

            self.writeTxt(new_path, content)  # write to file

    # end readPageFour ---------------------------------------

    def readPageHtml(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('\xa0', ' ')

    def createFolder(self, path):
        path = path.strip()
        # strip a trailing \ character
        path = path.rstrip("\\")
        rstr = r"[\:\*\?\"\\|]"  # '/ \ : * ? " < > |'
        new_path = re.sub(rstr, "_", path)  # replace illegal characters with underscores
        is_exists = os.path.exists(new_path)
        # create the directory if it does not exist
        if not is_exists:
            os.makedirs(new_path)
            print('目录:', new_path + 'create')
        else:
            print(new_path + '目录已存在')

    # end createFolder ---------------------------------------

    def isTxt(self, path):
        path = path.strip()
        # strip a trailing \ character
        path = path.rstrip("\\")
        rstr = r"[\:\*\?\"\\|]"  # '/ \ : * ? " < > |'
        new_path = re.sub(rstr, "_", path)  # replace illegal characters with underscores
        isExists = os.path.exists(new_path)
        if isExists:
            print(new_path, '已存在')
            return ''
        else:
            return new_path

    # end createTxt ---------------------------------------

    def writeTxt(self, file_name, content):
        isExists = os.path.exists(file_name)
        if isExists:
            print(file_name, '已存在')
        else:
            file_object = open(file_name, 'w', encoding='utf-8')
            file_object.write(content)
            file_object.close()

    # end writeTxt ------------------------------------------

    def run(self):
        try:
            self.readPageOne()
        except BaseException as error:
            print('error--', error)

    def runTest(self):
        try:
            page_url = 'http://www.cuiweijuxs.com/4_4508/'
            path = '小说/runTest'
            self.readPageThree(page_url, path)
        except BaseException as error:
            print('error--', error)

    def testRun(self, num, time_):
        for i in range(3):
            print('num=', num, ctime())
            sleep(time_)

    def threadsRun(self):

        # self.readPageOne(122)

        for i in range(1, 123):
            page = str(i)
            t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
            # t = threading.Thread(target=self.testRun, args=( str(i) ))
            self.threads.append(t)

        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()
            # t.join()

        print('all end: %s' % ctime())


class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)


Capture().threadsRun()

