Python 程序:抓取新浪读书频道小说

版权声明

请尊重原创作品。转载请保持文章完整性,并以超链接形式注明原始作者“tingsking18”和主站点地址,方便其他朋友提问和指正。

二进制文件下载地址:

SinaGetBook

效果如图:

代码:

#!/usr/bin/env python
# coding=utf-8
"""wxPython GUI tool that scrapes a novel from the Sina book channel.

The user supplies the table-of-contents URL, the URL prefix of the chapter
pages and a list of text snippets to strip.  Pressing the button downloads
every chapter linked from the table of contents and appends its title and
body text to the rich-text control.

The script targets Python 2 (``urllib.urlopen``) and wxPython.
"""
import contextlib
import re
import sys
import traceback
import urllib

import wx
import wx.lib.buttonpanel as bp
import wx.richtext as rt

import Casing
import Debug

# Encoding the pages are decoded with (the original script used the same).
PAGE_ENCODING = 'gb2312'

# Chapter links in the table of contents.  Matching is case-insensitive so
# that upper-case markup is still recognised without lower-casing the page,
# which would also have lower-cased the captured URLs.
CHAPTER_LINK_RE = re.compile(
    r'''<a href="(chapter_+.*?)" target="_blank">''', re.IGNORECASE)
TITLE_RE = re.compile(r'''<h1>(.*?)</h1>''')
CONTENT_RE = re.compile(
    r'''<div id="contTxt" class="contTxt1"><p>([\s\S]*?)</p></div>''')


def trace_back():
    """Print the current exception's traceback to stderr.

    Returns whatever ``traceback.print_exc()`` returns (``None``), or an
    empty string if printing the traceback itself fails.
    """
    try:
        return traceback.print_exc()
    except Exception:
        return ''


def _fetch_page(url):
    """Download *url* and return its body decoded to unicode text.

    Undecodable bytes are dropped.  The connection is always closed, even
    when reading fails.
    """
    with contextlib.closing(urllib.urlopen(url)) as sock:
        raw = sock.read()
    return raw.decode(PAGE_ENCODING, 'ignore')


def _parse_chapter(html, unwanted):
    """Extract ``(title, text)`` from the unicode HTML of one chapter page.

    *unwanted* is an iterable of snippets removed from the chapter text in
    addition to paragraph tags and ``*`` characters.

    Raises ``ValueError`` when the page has no title or no content block.
    """
    title_match = TITLE_RE.search(html)
    content_match = CONTENT_RE.search(html)
    if title_match is None or content_match is None:
        raise ValueError("chapter page has no <h1> title or no contTxt block")

    text = content_match.group(1)
    for markup in (u"<p>", u"</p>", u"*"):
        text = text.replace(markup, u"")
    for snippet in unwanted:
        if isinstance(snippet, bytes):
            # Byte strings are treated as UTF-8, as the original code did.
            snippet = snippet.decode("utf-8", "ignore")
        if snippet:
            text = text.replace(snippet, u"")
    return title_match.group(1), text


class Window(wx.Frame):
    """Main frame: three input fields, a start button and an output area."""

    def __init__(self):
        """Build the widgets, lay them out in a grid and show the frame."""
        # The ``site`` module removes sys.setdefaultencoding during normal
        # interpreter start-up, so an unguarded call raises AttributeError.
        # Keep the call where the attribute still exists; the scraping code
        # below no longer depends on it.
        if hasattr(sys, "setdefaultencoding"):
            sys.setdefaultencoding("utf-8")

        wx.Frame.__init__(self, None, -1, u'新浪网图书频道抓取工具',
                          pos=wx.Point(0, 0), size=(800, 620))

        l1 = wx.StaticText(self, -1, u"目录URL:")
        self.t1 = wx.TextCtrl(
            self, -1, "http://vip.book.sina.com.cn/book/?book=27633",
            size=(500, -1))

        l2 = wx.StaticText(self, -1, u"内容URL前缀:")
        self.t2 = wx.TextCtrl(
            self, -1, "http://vip.book.sina.com.cn/book/", size=(500, -1))

        l3 = wx.StaticText(self, -1, u"替换的内容:")
        # One snippet per line; each line is stripped from every chapter.
        self.t3 = wx.TextCtrl(
            self, -1,
            u"阅读‘刘猛’的其他作品: \n"
            u"http://vip.book.sina.com.cn/book/?book=39011《狼牙》作者新作:冰是睡着的水\n"
            u"http://vip.book.sina.com.cn/book/?book=41217刘猛展示狙击手神秘生活:刺客\n"
            u"http://vip.book.sina.com.cn/book/?book=38884中国特种部队生存实录:狼牙\n"
            u"http://vip.book.sina.com.cn/book/?book=43226刘猛最新力作:如临大敌",
            size=(500, 100), style=wx.TE_MULTILINE | wx.TE_PROCESS_ENTER)
        self.t3.SetInsertionPoint(0)

        l4 = wx.StaticText(self, -1, u"内容")
        self.t4 = rt.RichTextCtrl(
            self, -1, "", size=(600, 400),
            style=wx.VSCROLL | wx.HSCROLL | wx.NO_BORDER)

        self.b = wx.Button(self, -1, u"开始抓取")
        self.Bind(wx.EVT_BUTTON, self.OnTestReplace, self.b)

        space = 2
        bsizer = wx.BoxSizer(wx.VERTICAL)
        bsizer.Add(self.b, 0, wx.GROW | wx.ALL, space)

        # Three columns: label, input, optional button ((0, 0) is a spacer).
        sizer = wx.FlexGridSizer(cols=3, hgap=space, vgap=space)
        sizer.AddMany([
            l1, self.t1, (0, 0),
            l2, self.t2, (0, 0),
            l3, self.t3, bsizer,
            l4, self.t4, (0, 0),
        ])

        border = wx.BoxSizer(wx.VERTICAL)
        border.Add(sizer, 0, wx.ALL, 15)
        self.SetSizer(border)
        self.SetAutoLayout(True)
        self.Show(True)

    def OnTestReplace(self, evt):
        """Button handler: scrape every chapter and append it to the output.

        The field values are read here, on the GUI thread; the download
        itself is handed to ``Casing.Casing(...).start_thread()``.
        """
        listurl = self.t1.GetValue()
        prefix = self.t2.GetValue()
        unwanted = self.t3.GetValue().split("\n")

        def scrape():
            try:
                index_html = _fetch_page(listurl)
                for link in CHAPTER_LINK_RE.findall(index_html):
                    try:
                        title, text = _parse_chapter(
                            _fetch_page(prefix + link), unwanted)
                    except Exception:
                        # Best effort: log the failed chapter and move on.
                        Debug.error.traceback()
                        continue
                    # NOTE(review): start_thread() presumably runs this on a
                    # worker thread; wx widgets must only be touched from the
                    # GUI thread, so updates are queued with wx.CallAfter.
                    # NOTE(review): the "test" label looks like a debugging
                    # leftover; it is kept to preserve visible behaviour.
                    wx.CallAfter(self.b.SetLabel, u"test")
                    # Unicode is passed straight to the control rather than
                    # GBK-encoded bytes, so the text no longer depends on the
                    # machine's default encoding.
                    wx.CallAfter(self.t4.AppendText, title + u"\n")
                    wx.CallAfter(self.t4.AppendText, text)
            except Exception:
                Debug.error.traceback()

        d = Casing.Casing(scrape)
        d.start_thread()


application = wx.PySimpleApp()
Window()
application.MainLoop()

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值