Scraping a novel site with python3 + BeautifulSoup 4.6 (Part 4): multithreaded scraping

The previous article used a two-level directory layout: a root directory 小说 (novels), a second-level directory named after each work, and the novel files inside that.

This installment reworks part of the code so the layout becomes root directory -> author directory -> work directory -> chapter .txt files.

That isn't the main point of this installment, though. The main point is that while scraping with this crawler, the program was frequently interrupted by network packet loss and similar failures.

My first idea was to poll the site's status in a loop and re-issue the request, but that didn't seem to help. Then I ran across multithreading in 虫师's book on Selenium, so I gave it a try, and it turned out to be fast. Cool!

The code below is largely taken from 虫师's Selenium2 book.

Importing the threading module

import threading

Invocation: threading.Thread(target=music, args=('first music argument', second_music_argument))

from time import sleep,ctime
import threading

def music(func,loop):
    for i in range(loop):
        print('music',func,ctime())
        sleep(2)

def movie(func,loop):
    for i in range(loop):
        print('movie',func,ctime())
        sleep(4)

def testOne():
    music('简单的歌', 2)
    movie('两杆大烟枪', 2)
    print('all end', ctime())

def testTwo():
    threads = []
    t1 = threading.Thread(target=music, args=('喜欢的人', 2))
    threads.append(t1)
    t2 = threading.Thread(target=movie, args=('搏击俱乐部', 2))
    threads.append(t2)
    t3 = threading.Thread(target=music, args=('喜欢的人2', 2))
    threads.append(t3)

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    print('all end', ctime())

if __name__ == '__main__':
    testOne()
    #testTwo()
    #testThree()
    #threadsRun()
The t.join() calls block the calling thread until each worker finishes, which guarantees that the 'all end' line prints last. The speedup is easy to see: testOne runs everything serially and takes about 2×2 + 2×4 = 12 seconds, while testTwo finishes in roughly the time of its slowest thread, about 8 seconds.
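To see what join buys you, here is a variant of testTwo with the join loop removed (a sketch, not in the original post); the final print no longer waits for the workers:

def testTwoNoJoin():
    threads = [
        threading.Thread(target=music, args=('喜欢的人', 2)),
        threading.Thread(target=movie, args=('搏击俱乐部', 2)),
    ]
    for t in threads:
        t.start()
    # without join, this line prints right away, before either worker finishes
    print('all end', ctime())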
Creating a thread wrapper class

The class inherits from Thread right in its declaration: class MyThread(threading.Thread)

class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)

self: the instance, passed implicitly

func: the callable the thread should run

args: the tuple of arguments passed to func

name: the thread name, conventionally func.__name__
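For example, wrapping the earlier music function (a usage sketch with the names defined above):

t = MyThread(music, ('简单的歌', 2), music.__name__)
t.start()
t.join()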

Full code:

class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)


def super_play(file_, time_):
    for i in range(3):
        print('play', file_, ctime())
        sleep(time_)


def testThree():
    threads = []
    lists = {'气球.mp3': 3, '电影.rmvb': 4, 'last.avg': 2}
    # one thread per file, each sleeping for its own duration
    for file_, time_ in lists.items():
        t = MyThread(super_play, (file_, time_), super_play.__name__)
        threads.append(t)

    for t in threads:
        t.start()
    for t in threads:
        t.join()

    print('all end', ctime())

Reworking the novel crawler

With multithreading covered, how do we hook it into the novel-scraping class we wrote earlier? It's simple.

First, rework pageOne:

    def readPageOneByThread(self, page, time_):
        page_url = str(self.two_page_url)
        new_page_url = page_url.replace("?", page)  # substitute the page number
        print('第', page, '页---', new_page_url)
        path = self.folder_path
        self.readPageTwo(new_page_url, path)
        sleep(time_)
    # end readPageOneByThread  ---------------------------------------

In the __init__ method, self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html", where the ? is a placeholder that gets replaced with each page number.
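As a quick illustration of the substitution (values taken from __init__ above):

page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
print(page_url.replace("?", "3"))  # http://www.cuiweijuxs.com/jingpinxiaoshuo/5_3.html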

Next, write the method that creates and runs the threads:

    def threadsRun(self):

        #self.readPageOne(122)

        for i in range(1, 123):
            page = str(i)
            t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
            #t = threading.Thread(target=self.testRun, args=( str(i) ))
            self.threads.append(t)

        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()

        print('all end: %s' % ctime())


class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)

I took a shortcut here and hard-coded the total page count (122 pages, hence range(1, 123)); you could instead reuse the original pageOne logic, which reads the "last" link's div to discover the page count.
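A minimal sketch of that alternative, reusing the same "last"-link lookup that readPageOne already performs (the method name getPageCount is mine):

    def getPageCount(self):
        # read the category index and take the text of the "last page" link
        soup = self.getSoup(self.one_page_url)
        last = soup.find("a", 'last')
        return int(last.string)

threadsRun would then iterate over range(1, self.getPageCount() + 1) instead of the hard-coded range(1, 123).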

The full code is below:

# -*- coding: UTF-8 -*-
from urllib import request
from bs4 import BeautifulSoup
from time import sleep, ctime
import os
import threading
import re
import random

'''
Scrape the site with BeautifulSoup
version: 0.5, updated to cache links locally
author: yaowei
date: 2018-03-23
'''


class Capture():

    def __init__(self):
        self.index_page_url = 'http://www.cuiweijuxs.com/'
        self.one_page_url = 'http://www.cuiweijuxs.com/jingpinxiaoshuo/'
        self.two_page_url = "http://www.cuiweijuxs.com/jingpinxiaoshuo/5_?.html"
        self.folder_path = '绯色/'
        self.href_list = []
        self.head = {}
        self.threads = []
        # set the User-Agent header
        self.head[
            'User-Agent'] = 'Mozilla/5.0 (Linux; Android 4.1.1; Nexus 7 Build/JRO03D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166  Safari/535.19'

    # end __init__ ---------------------------------------

    # fetch a page and return a BeautifulSoup object
    def getSoup(self, query_url):
        req = request.Request(query_url, headers=self.head)
        webpage = request.urlopen(req)
        html = webpage.read()
        soup = BeautifulSoup(html, 'html.parser')
        return soup
        # soup = BeautifulSoup(html, 'html5lib')

    # read the category page and open each pagination link
    def readPageOne(self, count, time_):

        print('count=====', count)

        # total number of pages
        if count:
            item_size = count
        else:
            # read the index page to discover it
            soup = self.getSoup(self.one_page_url)
            last = soup.find("a", 'last')
            item_size = int(last.string)

        print('item_size=====', item_size)
        page_url = str(self.two_page_url)

        # open each pagination link in turn and read that page
        for item in range(item_size):
            page = str(item + 1)
            new_page_url = page_url.replace("?", page)
            print('第', page, '页---', new_page_url)
            path = self.folder_path
            self.readPageTwo(new_page_url, path)

        sleep(time_)
    # end readPageOne  ---------------------------------------

    def readPageOneByThread(self, page, time_):
        page_url = str(self.two_page_url)
        new_page_url = page_url.replace("?", page)
        print('第', page, '页---', new_page_url)
        path = self.folder_path
        self.readPageTwo(new_page_url, path)
        sleep(time_)
    # end readPageOneByThread  ---------------------------------------

    # read one pagination page
    def readPageTwo(self, page_url, path):
        soup = self.getSoup(page_url)
        # first div[id="newscontent"]->div[class="l"]
        con_div = soup.find('div', {'id': 'newscontent'}).find('div', {'class': 'l'})
        # first div[id="newscontent"]->div[class="l"]->all span[class="s2"]
        span_list = con_div.find_all('span', {'class': 's2'})

        # iterate over the spans
        for span in span_list:
            # find span[class="s5"] under the same parent; the author becomes the folder name
            author = span.parent.find('span', {'class': 's5'}).get_text()

            # span[class="s2"]->a
            a_href = span.find('a')
            href = a_href.get('href')  # link to a single work
            folder_name = a_href.get_text()  # title of the work
            print('a_href', href, '---folder_name', folder_name)
            new_path = path + '/' + author + '/' + folder_name
            self.createFolder(new_path)  # create the folder

            self.readPageThree(href, new_path)  # read the single work

            # t = threading.Thread(target=self.readPageThree, args=(href, new_path))
            # self.threads.append(t)
            # end for

    # end readPage  ---------------------------------------

    # open a work's page and iterate over its chapters
    def readPageThree(self, page_url, path):
        soup = self.getSoup(page_url)  # the work's page
        print('readPageThree--', page_url)
        a_list = soup.find('div', {'id': 'list'}).find_all('a')
        idx = 0  # running chapter number
        for a_href in a_list:
            idx = idx + 1
            href = self.index_page_url + a_href.get('href')
            file_path = path + '/' + str(idx) + '_' + a_href.get_text() + '.txt'
            print('file_a_href', href, '---file_path', file_path)

            '''
            new_path = self.isTxt(file_path)
            if new_path:
                print(new_path)
                file_object = open('网页链接//hrefs.txt', 'w', encoding='utf-8')
                file_object.write(href+','+new_path)
                file_object.close()
             '''
            self.readPageFour(href, file_path)

            #self.href_list.append({'href': href, 'file_path': file_path})

            # multithreaded variant
            #t = threading.Thread(target=self.readPageFour, args=(href, file_path))
            #t.start()
            #t.join(15)

    # end readPageThree  ---------------------------------------

    # read a single chapter and write it out
    def readPageFour(self, page_url, path):
        new_path = self.isTxt(path)  # '' if the file already exists, else a sanitized file name
        if new_path:
            soup = self.getSoup(page_url)
            con_div = soup.find('div', {'id': 'content'})  # read the text body
            content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')
            # content = content.replace('&amp;','').replace('amp;','').replace('rdquo;','').replace('ldquo;','')
            # content = content.rstrip("& amp;rdquo;amp;& amp;ldquo;")

            self.writeTxt(new_path, content)  # write the file

    # end readPageFour  ---------------------------------------

    def readPageHtml(self, page_url, path):
        soup = self.getSoup(page_url)
        con_div = soup.find('div', {'id': 'content'})
        content = con_div.get_text().replace('<br/>', '\n').replace('&nbsp;', ' ')

    def createFolder(self, path):
        path = path.strip()
        # strip a trailing \ if present
        path = path.rstrip("\\")
        rstr = r"[\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
        new_path = re.sub(rstr, "_", path)  # replace illegal characters with underscores
        is_exists = os.path.exists(new_path)
        # create the folder if it does not exist
        if not is_exists:
            os.makedirs(new_path)
            print('目录:', new_path + ' create')
        else:
            print(new_path + ' 目录已存在')

    # end createFolder  ---------------------------------------

    def isTxt(self, path):
        path = path.strip()
        # strip a trailing \ if present
        path = path.rstrip("\\")
        rstr = r"[\:\*\?\"\<\>\|]"  # '/ \ : * ? " < > |'
        new_path = re.sub(rstr, "_", path)  # replace illegal characters with underscores
        isExists = os.path.exists(new_path)
        if isExists:
            print(new_path, '已存在')
            return ''
        else:
            return new_path

    # end createTxt ---------------------------------------

    def writeTxt(self, file_name, content):
        isExists = os.path.exists(file_name)
        if isExists:
            print(file_name, '已存在')
        else:
            file_object = open(file_name, 'w', encoding='utf-8')
            file_object.write(content)
            file_object.close()

    # end writeTxt ------------------------------------------

    def run(self):
        try:
            self.readPageOne(0, 2)  # 0 means: read the page count from the site
        except BaseException as error:
            print('error--', error)

    def runTest(self):
        try:
            page_url = 'http://www.cuiweijuxs.com/4_4508/'
            path = '小说/runTest'
            self.readPageThree(page_url, path)
        except BaseException as error:
            print('error--', error)

    def testRun(self, num, time_):
        for i in range(3):
            print('num=', num, ctime())
            sleep(time_)

    def threadsRun(self):

        #self.readPageOne(122)

        for i in range(1, 123):
            page = str(i)
            t = MyThread(self.readPageOneByThread, (page, 2), self.readPageOneByThread.__name__)
            #t = threading.Thread(target=self.testRun, args=( str(i) ))
            self.threads.append(t)

        for t in self.threads:
            t.start()
        for t in self.threads:
            t.join()

        print('all end: %s' % ctime())


class MyThread(threading.Thread):

    def __init__(self, func, args, name):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        self.func(*self.args)


Capture().threadsRun()
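One design note: threadsRun starts all 122 page threads at once. If that ever proves too aggressive for the site (or for local sockets), a threading.Semaphore can cap how many pages are fetched concurrently. A sketch, not part of the original code; the limit of 10 and the wrapper name readPageOneThrottled are my own:

    # hypothetical additions to Capture, alongside readPageOneByThread
    page_sem = threading.Semaphore(10)  # allow at most 10 page fetches in flight

    def readPageOneThrottled(self, page, time_):
        with self.page_sem:
            self.readPageOneByThread(page, time_)

threadsRun would then create its MyThread instances with self.readPageOneThrottled as the target instead.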

Reposted from: https://www.cnblogs.com/yaomaomao/p/8746231.html
