import re
import socket
import threading
import time
import urllib.request

from bs4 import BeautifulSoup
# Set the process-wide default socket timeout (in seconds).
socket.setdefaulttimeout(10)
def getHTMLData(strURL):
    """Download the raw body of ``strURL``, retrying once on failure.

    Args:
        strURL: URL passed unchanged to ``urllib.request.urlopen``.

    Returns:
        The response body as ``bytes``.

    Raises:
        Exception: Whatever the second attempt raises if it fails as well.
    """

    def _read():
        # The context manager closes the response even if read() raises.
        with urllib.request.urlopen(strURL) as response:
            return response.read()

    try:
        return _read()
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must not
        # trigger a silent second download.
        print('i get the error')
        return _read()
def getDataOnMatch(data, name, att, match):
    """Select tags from ``data`` and optionally regex-filter their markup.

    Args:
        data: HTML document handed to ``BeautifulSoup`` with "html.parser".
        name: Tag name given to ``find_all``.
        att: Second positional argument of ``find_all`` (per the Beautiful
            Soup docs a plain string here filters on the CSS class).
        match: Regular expression applied to ``str()`` of the tag list; the
            empty string disables regex filtering.

    Returns:
        The ``find_all`` result when ``match`` is "", otherwise the list
        produced by ``findall`` on the stringified result.
    """
    tags = BeautifulSoup(data, "html.parser").find_all(name, att)
    if match == "":
        return tags
    pattern = re.compile(match)
    return pattern.findall(str(tags))
# Stores book information.
class ListBookInfo:
    """Lock-protected FIFO list of ``[book name, book URL]`` entries."""

    def __init__(self):
        self.lock = threading.Lock()
        # Number of entries currently held in lsBookInfo.
        self.nLsBookCnt = 0
        self.lsBookInfo = []

    def AddBookInfo(self, strBookName, strBookURL):
        """Append ``[strBookName, strBookURL]`` to the tail of the list."""
        # ``with`` releases the lock even if the body raises.
        with self.lock:
            self.lsBookInfo.append([strBookName, strBookURL])
            self.nLsBookCnt += 1

    def GetHeadBookInfo(self):
        """Remove and return the oldest entry, or ``0`` when none is left."""
        with self.lock:
            if self.nLsBookCnt > 0:
                bookInfo = self.lsBookInfo.pop(0)
                self.nLsBookCnt -= 1
                return bookInfo
            return 0

    def GetSize(self):
        """Return the current entry count."""
        with self.lock:
            return self.nLsBookCnt

    def ClearLsBook(self):
        """Drop every entry and reset the count to zero."""
        with self.lock:
            self.lsBookInfo.clear()
            self.nLsBookCnt = 0
# Stores a book name together with its chapter info (chapter name, chapter URL).
class BookPageInfo:
    """Lock-protected FIFO list of ``[book name, chapter list]`` entries."""

    def __init__(self):
        self.lock = threading.Lock()
        # Number of entries currently held in lsBookPageInfo.
        self.nBookPageCnt = 0
        self.lsBookPageInfo = []

    def AddBookPageInfo(self, strBookName, lsBookPageURL):
        """Append ``[strBookName, lsBookPageURL]`` to the tail of the list."""
        # ``with`` releases the lock even if the body raises.
        with self.lock:
            self.lsBookPageInfo.append([strBookName, lsBookPageURL])
            self.nBookPageCnt += 1

    def GetHeadBookPageInfo(self):
        """Remove and return the oldest entry, or ``0`` when none is left."""
        with self.lock:
            if self.nBookPageCnt > 0:
                bookInfo = self.lsBookPageInfo.pop(0)
                self.nBookPageCnt -= 1
                return bookInfo
            return 0

    def GetSize(self):
        """Return the current entry count."""
        with self.lock:
            return self.nBookPageCnt

    def ClearLsBookPage(self):
        """Drop every entry and reset the count to zero."""
        with self.lock:
            self.lsBookPageInfo.clear()
            self.nBookPageCnt = 0
def getArticleType(data):
    """Extract the links found in the "channel-nav-list" ``<ul>`` of ``data``.

    Args:
        data: HTML document handed to ``BeautifulSoup``.

    Returns:
        The ``findall`` result: one tuple per matched link. Callers use
        element 0 as a URL and this function prints element 1 as its label.
    """
    # Name the parser explicitly, as getDataOnMatch does, so the result does
    # not depend on which optional parsers happen to be installed.
    soup = BeautifulSoup(data, "html.parser")
    data_ul = soup.find_all("ul", "channel-nav-list")
    print(data_ul)
    # NOTE(review): the pattern in the source was truncated to '\(.*)\' (the
    # markup inside the literal was lost, leaving an unterminated string).
    # This anchor pattern is a reconstruction based on how the two groups are
    # used; confirm it against the real page markup.
    re_pat = re.compile(r'<a href="(.*?)">(.*)</a>')
    categories = re_pat.findall(str(data_ul))
    print(categories)
    for category in categories:
        print("%s-->%s" % (category[1], category[0]))
    return categories
def getArticle(strURL):
    """Collect ``[book URL, book name]`` pairs from the page at ``strURL``.

    For every list entry matched inside the "seeWell cf" ``<ul>``, the href
    captured from that entry is downloaded; the first href found in that
    page's "main b-detail" ``<section>`` becomes the book URL, and the first
    ``alt="..."`` value in the entry becomes the book name.

    Args:
        strURL: URL of the listing page.

    Returns:
        A list of ``[book URL, book name]`` lists. Entries for which either
        value cannot be extracted are skipped.
    """
    data = getHTMLData(strURL)
    print(data)
    # NOTE(review): the leading/trailing delimiters of this pattern were lost
    # in the source (only stray backslashes remained). "<li>"/"</li>" is a
    # reconstruction; confirm it against the real page markup.
    entries = getDataOnMatch(data, "ul", "seeWell cf",
                             r'<li>(.*?)href="(.*?)"(.*?)</li>')
    books = []
    for entry in entries:
        detailLinks = getDataOnMatch(getHTMLData(entry[1]), "section",
                                     "main b-detail",
                                     r'(.*?)href="(.*?)"(.*?)')
        titles = re.findall(r'(.*?)alt="(.*?)"(.*?)', entry[2])
        # Skip entries that did not match instead of raising IndexError,
        # which would abort the whole listing.
        if not detailLinks or not titles:
            continue
        books.append([detailLinks[0][1], titles[0][1]])
    return books


def getArticlePageContent(strURL):
    """Collect ``[chapter URL, chapter title]`` pairs for the book at ``strURL``.

    Args:
        strURL: URL of the book's chapter index; also used as the prefix of
            every chapter URL (``strURL + '/' + href``).

    Returns:
        A list of ``[chapter URL, chapter title]`` lists, in match order.
    """
    data = getHTMLData(strURL)
    # NOTE(review): "<li>"/"</li>" delimiters reconstructed, as in getArticle;
    # confirm against the real page markup.
    chapters = getDataOnMatch(
        data, "div", "clearfix dirconone",
        r'<li>(.*?)href="(.*?)" title="(.*?)"(.*?)</li>')
    return [[strURL + '/' + chapter[1], chapter[2]] for chapter in chapters]


def getArticleContent(strURL):
    """Return the "mainContenr" ``<div>`` tags of the page at ``strURL``.

    The download-and-parse step is attempted a second time if the first
    attempt raises.

    Args:
        strURL: URL of the chapter page.

    Returns:
        The ``find_all`` result produced by ``getDataOnMatch``.

    Raises:
        Exception: Whatever the second attempt raises if it fails as well.
    """

    def _extract():
        return getDataOnMatch(getHTMLData(strURL), "div", "mainContenr", "")

    try:
        return _extract()
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt/SystemExit must propagate.
        print('i get the error')
        return _extract()
class CleverBookSys:
    """Creates the shared queues and events, then starts all worker threads."""

    def __init__(self):
        # Workers loop while this stays 0.
        self.bExit = 0
        self.eventBook = threading.Event()
        self.eventPage = threading.Event()
        self.lsBookInfo = ListBookInfo()
        self.lsBookPageInfo = BookPageInfo()
        self.thrParseBook = ThreadForParseAllBook(self, "ThreadForParseAllBook")
        self.thrParseBookPage = ThreadForParseBookPage(self, "ThreadForParseBookPage")
        self.thrDownLoad = []
        # Start ten download threads; adjust to the available bandwidth.
        for workerNo in range(10):
            worker = ThreadForDownloadTxt(self, "ThreadForDownloadTxt", workerNo)
            worker.start()
            self.thrDownLoad.append(worker)
        self.thrParseBook.start()
        self.thrParseBookPage.start()
# Scrapes the names and URLs of all books on the site
# (unfinished; only roughly sketched).
class ThreadForParseAllBook(threading.Thread):
    """Thread that pushes ``(name, URL)`` of each scraped book to the parent."""

    def __init__(self, parent, strThrName):
        super().__init__()
        self.parent = parent
        self.strThrName = strThrName
        # The parsing here is specific to this one website.
        homePage = getHTMLData("http://www.quanshuwang.com/")
        self.lsArticle = getArticleType(homePage)

    def run(self):
        """Queue every book of every category on the parent's book list."""
        print("Thread %s is Start!!!" % (self.strThrName))
        for category in self.lsArticle:
            # Each entry: [0] is the book URL, [1] is the book name.
            for entry in getArticle(category[0]):
                self.parent.lsBookInfo.AddBookInfo(entry[1], entry[0])
                # NOTE(review): the source's indentation was lost, so the
                # nesting level of this set() call is an assumption.
                self.parent.eventBook.set()
# Scrapes all chapter names and URLs of a single book.
class ThreadForParseBookPage(threading.Thread):
    """Thread that turns queued books into queued chapter lists."""

    def __init__(self, parent, strThrName):
        super().__init__()
        self.parent = parent
        self.strThrName = strThrName

    def run(self):
        """Loop until the parent's ``bExit`` becomes non-zero."""
        print("Thread %s is Start!!!" % (self.strThrName))
        while self.parent.bExit == 0:
            pending = self.parent.lsBookInfo.GetSize()
            print("ThreadForParseBookPage-->%d" % pending)
            if pending <= 0:
                # Nothing queued: block until the producer signals, then
                # reset the event so the next empty pass blocks again.
                print("self.parent.eventBook.wait()")
                self.parent.eventBook.wait()
                self.parent.eventBook.clear()
                print("self.parent.eventBook.run()")
                continue
            book = self.parent.lsBookInfo.GetHeadBookInfo()
            print("ThreadForParseBookPage->%s" % book)
            chapters = getArticlePageContent(book[1])
            print("ThreadForParseBookPage2->%s" % chapters)
            self.parent.lsBookPageInfo.AddBookPageInfo(book[0], chapters)
            self.parent.eventPage.set()
# Fetches all chapter contents of a single book and saves them as a TXT file.
class ThreadForDownloadTxt(threading.Thread):
    """Thread that writes each queued book's chapters to one text file."""

    def __init__(self, parent, strThrName, nThrNO):
        threading.Thread.__init__(self)
        self.parent = parent
        self.strThrName = strThrName
        # Worker index, used only in log output.
        self.nThrNO = nThrNO

    def run(self):
        """Loop until the parent's ``bExit`` becomes non-zero."""
        print("Thread %s%d is Start!!!" % (self.strThrName, self.nThrNO))
        while self.parent.bExit == 0:
            nSize = self.parent.lsBookPageInfo.GetSize()
            print("ThreadForDownloadTxt-->%d" % nSize)
            if nSize <= 0:
                # Nothing queued: block until a producer signals, then reset
                # the event so the next empty pass blocks again.
                print("self.parent.eventPage.wait()")
                self.parent.eventPage.wait()
                self.parent.eventPage.clear()
                print("self.parent.eventPage.run()")
                continue
            bookPageInfoList = self.parent.lsBookPageInfo.GetHeadBookPageInfo()
            if not bookPageInfoList:
                # Several download threads share the queue, so another one
                # may have taken the entry after GetSize(); the queue then
                # returns 0, which must not be indexed.
                continue
            print("ThreadForDownloadTxt%d-->%s" % (self.nThrNO, bookPageInfoList))
            fileName = 'D:\\txt\\' + bookPageInfoList[0] + '.txt'
            # ``with`` closes the file even if a chapter download raises.
            with open(fileName, 'w', encoding='utf-8') as file_object:
                for pageInfo in bookPageInfoList[1]:
                    print("ThreadForDownloadTxt%d-->%s" % (self.nThrNO, pageInfo))
                    content = getArticleContent(pageInfo[0])
                    print(content)
                    file_object.write(pageInfo[1])
                    file_object.write(str(content))
                    file_object.flush()
# Constructing CleverBookSys starts all of its worker threads.
cleverBook = CleverBookSys()