python 爬qidian小说

最新推荐文章于 2024-07-31 14:30:10 发布

diaocoutan2075

最新推荐文章于 2024-07-31 14:30:10 发布

阅读量66

点赞数

文章标签： python 爬虫

原文链接：http://www.cnblogs.com/jjj-fly/p/6901022.html

版权

  1 import re
  2 import urllib.request
  3 from bs4 import BeautifulSoup
  4 import time
  5 
  6 url=input("第一章网址：")
  7 
  8 def gethtml(url):
  9                                       #获取页面源代码html
 10     page=urllib.request.urlopen(url)
 11     html=page.read().decode('utf-8')  #html是一个列表
 12     soup=BeautifulSoup(html,'html.parser')
 13     
 14     return soup
 15 
 16 def getcontent(soup,load):
 17     
 18     content=soup.find_all("div",{"class":"read-content j_readContent"})
 19     
 20     content1=re.compile(r'<p>([\s\S]*?)</p>')       #匹配到段落内容
 21     
 22     content2=content1.findall(str(content))
 23    
 24     content3=re.sub("</?\w+[^>]*>",'',content2[0])  #除掉html标签
 25     
 26     content4=content3.replace('。','。\n\n\0\0\0')  #把以句号换位“。\n\n\0\0\0   两个换行符三个空格”            到此，将章节内容获取完毕
 27 
 28     contentname=re.compile(r'<h3 class="j_chapterName">(.*?)</h3>')
 29     
 30     contentname1=contentname.findall(str(soup))     #获取章节名称
 31 
 32     book="----------------------------------------------------------------"+contentname1[0]+"------------------------------------------------------------\n\n\n"+content4   
 33 
 34     with open(load, 'a') as f:
 35 
 36         f.write(book)
 37 
 38     
 39 
 40 def nextcontent(soup):
 41 
 42     content=soup.find_all("div",{"class":"chapter-control dib-wrap"})
 43     
 44     #print(str(content))
 45     
 46     step=re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
 47 
 48     content1=step.findall(str(content))
 49 
 50     if content1 == []:                         #判断该页是否为最后一章，是，获取最后一章（特殊）的url，不是，以常规方法获取下一章url
 51 
 52         step1=re.compile(r'<a data-eid="qd_R118" href="(.*?)" id="j_chapterNext">')
 53 
 54         content2=step1.findall(str(content))
 55 
 56         url="http:"+content2[0]
 57 
 58         return url
 59     else:
 60         url="http:"+content1[0]
 61 
 62         return url
 63 
 64 def panduan(soup):
 65     
 66     content=soup.find_all("div",{"class":"chapter-control dib-wrap"})
 67     
 68     #print(str(content))
 69     
 70     step=re.compile(r'<a data-eid="qd_R109" href="(.*?)" id="j_chapterNext">')
 71     
 72     content1=step.findall(str(content))
 73     
 74     return content1
 75     #-------------------------------------------------------------------------
 76     
 77     
 78     
 79     #-------------------------------------------------------------------------
 80     
 81     
 82 soup=gethtml(url)
 83 bookname=re.findall(r'<h1>(.*?)</h1>' ,str(soup))          #匹配书名
 84 
 85  86 
 87 load="d:/88/%s.txt" % bookname[0]
 88 i=0
 89 while 1==1:
 90     soup=gethtml(url)
 91     getcontent(soup,load)
 92     url=nextcontent(soup)
 93     content1=panduan(soup)       #在该章里匹配下一章的url，若无法匹配到（输出为[]空），说明没有下一章
 94     i+=1
 95     print("第%d章下载完成" % i)
 96     
 97     if content1 == []:             #  
 98         break
 99        
100     time.sleep(0.2)
101

下一篇将结合本篇，写一个爬取某一页上所有小说的爬虫。

（本文仅供技术参考，请勿用于非法用途）

转载于:https://www.cnblogs.com/jjj-fly/p/6901022.html

diaocoutan2075

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python 爬qidian小说

1 import re 2 import urllib.request 3 from bs4 import BeautifulSoup 4 import time 5 6 url=input("第一章网址：") 7 8 def gethtml(url): 9 ...
复制链接

扫一扫