In the previous article, the crawler used a two-level directory layout: a root directory named “小说” (novels), a second-level directory named after the work, and the novel files inside it.
This article reworks part of the code so that the layout becomes root directory -> author directory -> work directory -> chapter .txt files.
That is not the main point here, though. The main point is that while crawling with this program, it was frequently interrupted by network problems such as packet loss.
My first idea was to poll the site's status in a loop and re-issue the request, but that did not seem to help much. Then I came across multithreading in 虫师's book on Selenium, so I gave it a try, and it turned out to be much faster. Cool!
The code below is largely taken from 虫师's Selenium 2 book.
Importing the threading module:
import threading
Calling it: threading.Thread(target=music, args=('music arg 1', 'music arg 2')), where target is the function to run and args is the tuple of arguments passed to it.
from time import sleep, ctime
import threading

def music(func, loop):
    for i in range(loop):
        print('music', func, ctime())
        sleep(2)

def movie(func, loop):
    for i in range(loop):
        print('movie', func, ctime())
        sleep(4)

def testOne():
    music('简单的歌', 2)
    movie('两杆大烟枪', 2)
    print('all end', ctime())

def testTwo():
    threads = []
    t1 = threading.Thread(target=music, args=('喜欢的人', 2))
    threads.append(t1)
    t2 = threading.Thread(target=movie, args=('搏击俱乐部', 2))
    threads.append(t2)
    t3 = threading.Thread(target=music, args=('喜欢的人2', 2))
    threads.append(t3)

    for t in threads:
        t.start()
    for t in threads:
        t.join()

    print('all end', ctime())

if __name__ == '__main__':
    testOne()
    # testTwo()
    # testThree()
    # threadsRun()
The t.join() method makes the main thread wait for each worker thread to finish, which guarantees that the 'all end' line is printed last.
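For comparison, here is a minimal sketch (not from the book, just to illustrate the point): without the join() loop, the main thread reaches the final print immediately, so 'all end' would usually appear before the workers finish.

import threading
from time import sleep, ctime

def worker(n):
    sleep(1)
    print('worker', n, 'done', ctime())

threads = [threading.Thread(target=worker, args=(i,)) for i in range(3)]
for t in threads:
    t.start()
# join() blocks until each thread ends; remove this loop and
# 'all end' is printed before any 'worker ... done' line.
for t in threads:
    t.join()
print('all end', ctime())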
Creating a thread wrapper class
The class inherits from threading.Thread right in its declaration: class MyThread(threading.Thread)
class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)
self: the class instance, passed implicitly
func: the function to call
args: the arguments for that function
name: the thread name, set to the function's __name__ (see the usage sketch below)
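As a quick usage sketch (not from the original post; it reuses the music() function from the example above), the constructor arguments line up like this:

# func=music, args=the tuple unpacked into music(), name=music's __name__
t = MyThread(music, ('简单的歌', 2), music.__name__)
t.start()
t.join()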
Complete code:
class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)

def super_play(file_, time):
    for i in range(3):
        print('play', file_, ctime())
        sleep(time)

def time(args):
    pass

def testThree():
    threads = []
    lists = {'气球.mp3': 3, '电影.rmvb': 4, 'last.avg': 2}
    for file_, time_ in lists.items():
        t = MyThread(super_play, (file_, time_), super_play.__name__)
        threads.append(t)

    files = range(len(lists))

    for f in files:
        threads[f].start()
    for f in files:
        threads[f].join()

    print('all end', ctime())
Adapting the novel crawler
Now that multithreading is covered, how do we hook it into the novel crawler class we wrote? It's simple.
First, adapt pageOne:
def readPageOneByThread(self, page, time_):
    page_url = str(self.two_page_url)
    new_page_url = page_url.replace("?", page)
    print('第', page, '页---', new_page_url)
    path = self.folder_path
    self.readPageTwo(new_page_url, path)
    sleep(time_)
# end readPageOneByThread ---------------------------------------
In the __init__ method, self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"; the "?" placeholder is replaced with the page number to build each page's URL.
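For illustration (a minimal standalone sketch, not part of the crawler class), this is what the replacement produces:

two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
page = str(3)
new_page_url = two_page_url.replace("?", page)
print(new_page_url)  # http://www.cuiweijuxs.com/jingpinxiaoshuo/5_3.html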
Next, write the method that creates and starts the threads:
def threadsRun(self):
    # self.readPageOne(122)
    for i in range(1, 123):
        page = str(i)
        t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
        # t = threading.Thread(target=self.testRun, args=( str(i) ))
        self.threads.append(t)

    for t in self.threads:
        t.start()
    for t in self.threads:
        t.join()
    # t.join()

    print('all end: %s' % ctime())
class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)
I took a shortcut here and hard-coded the total page count (122); you could instead read the page count from the "last" link, the way the original readPageOne method does, as sketched below.
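A sketch of that variant (it reuses getSoup, one_page_url, and the same "last" pager lookup that readPageOne already performs; this is a drop-in replacement for threadsRun, not code from the original post):

def threadsRun(self):
    # read the total page count from the "last" pager link instead of hard-coding it
    soup = self.getSoup(self.one_page_url)
    item_size = int(soup.find("a", 'last').string)

    for i in range(1, item_size + 1):
        page = str(i)
        t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
        self.threads.append(t)

    for t in self.threads:
        t.start()
    for t in self.threads:
        t.join()

    print('all end: %s' % ctime())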
Here is the complete code:
# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
from time import sleep, ctime
import os
import threading
import re
import random

'''
Scrape web pages with BeautifulSoup
version: 0.5  switched to locally cached links
author: yaowei
date: 2018-03-23
'''


class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '绯色/'
        self.href_list = []
        self.head = {}
        self.threads = []
        # set the User-Agent header
        self.head['User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Safari/535.19'

    # end __init__ ---------------------------------------

    # build a BeautifulSoup object for the given URL
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        return soup
        # soup = BeautifulSoup(html, 'html5lib')

    # read the category page and open each pagination link
    def readPageOne(self, count, time_):

        print('count=====', count)

        # total number of pages
        if count:
            item_size = count
        else:
            # read the page to find the page count
            soup = self.getSoup(self.one_page_url)
            last = soup.find("a", 'last')
            item_size = int(last.string)

        print('item_size=====', item_size)
        page_url = str(self.two_page_url)

        # open each pagination link in turn and read it
        for item in range(item_size):
            page = str(item + 1)
            new_page_url = page_url.replace("?", page)
            print('第', page, '页---', new_page_url)
            path = self.folder_path
            self.readPageTwo(new_page_url, path)

            sleep(time_)
    # end readPageOne ---------------------------------------

    def readPageOneByThread(self, page, time_):
        page_url = str(self.two_page_url)
        new_page_url = page_url.replace("?", page)
        print('第', page, '页---', new_page_url)
        path = self.folder_path
        self.readPageTwo(new_page_url, path)
        sleep(time_)
    # end readPageOneByThread ---------------------------------------

    # read one pagination page
    def readPageTwo(self, page_url, path):
        soup = self.getSoup(page_url)
        # first div[id="newscontent"]->div[class="l"]
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        # first div[id="newscontent"]->div[class="l"]->all span[class="s2"]
        span_list = con_div.find_all('span', {'class': 's2'})

        # iterate over the spans
        for span in span_list:
            # find span[class="s5"] under the parent node; the author name is used as the folder name
            author = span.parent.find('span', {'class': 's5'}).get_text()

            # span[class="s2"]->a
            a_href = span.find('a')
            href = a_href.get('href')  # link to a single work
            folder_name = a_href.get_text()  # name of the work
            print('a_href', href, '---folder_name', folder_name)
            new_path = path + '/' + author + '/' + folder_name
            self.createFolder(new_path)  # create the folder

            self.readPageThree(href, new_path)  # read the single work

            # t = threading.Thread(target=self.readPageThree, args={href, new_path})
            # self.threads.append(t)
        # end for

    # end readPage ---------------------------------------

    # open the link to a work and iterate over its chapters
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)  # the work's page
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0  # sequence number
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            file_path = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('file_a_href', href, '---file_path', file_path)

            '''
            new_path = self.isTxt(file_path)
            if new_path:
                print(new_path)
                file_object = open('网页链接//hrefs.txt', 'w', encoding='utf-8')
                file_object.write(href+','+new_path)
                file_object.close()
            '''
            self.readPageFour(href, file_path)

            # self.href_list.append({'href': href, 'file_path': file_path})

            # multithreading
            # t = threading.Thread(target=self.readPageFour, args={href, file_path})
            # t.start()
            # t.join(15)

    # end readPageThree ---------------------------------------

    # read one chapter and write it to disk
    def readPageFour(self, page_url, path):
        new_path = self.isTxt(path)  # returns '' if the file exists, otherwise a sanitized file name
        if new_path:
            soup = self.getSoup(page_url)
            con_div = soup.find('div', {'id': 'content'})  # read the text content
            content = con_div.get_text().replace('<br/>', '\n').replace('\xa0', ' ')
            # content = content.replace('&','').replace('amp;','').replace('rdquo;','').replace('ldquo;','')
            # content = content.rstrip("& amp;rdquo;amp;& amp;ldquo;")

            self.writeTxt(new_path, content)  # write to file

    # end readPageFour ---------------------------------------

    def readPageHtml(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('\xa0', ' ')

    def createFolder(self, path):
        path = path.strip()
        # strip trailing backslashes
        path = path.rstrip("\\")
        rstr = r"[\:\*\?\"\\|]"  # '/ \ : * ? " < > |'
        new_path = re.sub(rstr, "_", path)  # replace illegal characters with underscores
        is_exists = os.path.exists(new_path)
        # create the directory if it does not exist
        if not is_exists:
            os.makedirs(new_path)
            print('目录:', new_path + 'create')
        else:
            print(new_path + '目录已存在')

    # end createFolder ---------------------------------------

    def isTxt(self, path):
        path = path.strip()
        # strip trailing backslashes
        path = path.rstrip("\\")
        rstr = r"[\:\*\?\"\\|]"  # '/ \ : * ? " < > |'
        new_path = re.sub(rstr, "_", path)  # replace illegal characters with underscores
        isExists = os.path.exists(new_path)
        if isExists:
            print(new_path, '已存在')
            return ''
        else:
            return new_path

    # end createTxt ---------------------------------------

    def writeTxt(self, file_name, content):
        isExists = os.path.exists(file_name)
        if isExists:
            print(file_name, '已存在')
        else:
            file_object = open(file_name, 'w', encoding='utf-8')
            file_object.write(content)
            file_object.close()

    # end writeTxt ------------------------------------------

    def run(self):
        try:
            self.readPageOne()
        except BaseException as error:
            print('error--', error)

    def runTest(self):
        try:
            page_url = 'http://www.cuiweijuxs.com/4_4508/'
            path = '小说/runTest'
            self.readPageThree(page_url, path)
        except BaseException as error:
            print('error--', error)

    def testRun(self, num, time_):
        for i in range(3):
            print('num=', num, ctime())
            sleep(time_)

    def threadsRun(self):

        # self.readPageOne(122)

        for i in range(1, 123):
            page = str(i)
            t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
            # t = threading.Thread(target=self.testRun, args=( str(i) ))
            self.threads.append(t)

        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()
        # t.join()

        print('all end: %s' % ctime())


class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)


Capture().threadsRun()