#coding:utf-8
"""Scrape a novel posted in a Baidu Tieba thread into a single text file.

For each thread page the script appends the matched thread title(s) and the
cleaned text of every post to one output file.  The code runs unchanged on
Python 2.7 and Python 3.
"""
import io
import re
from contextlib import closing

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the same Request / urlopen names.
    import urllib.request as urllib2


class BDTB(object):
    """Downloader for one Tieba thread.

    Attributes:
        baseUrl: thread URL up to (and including) the ``pn=`` page parameter;
            the page number is appended to it.
        outPath: file that titles and post text are written to.
    """

    # The page number must travel in the query string.  The previous value
    # ended in '#/pn=': a URL fragment is never sent to the server, so every
    # request returned the same first page.
    baseUrl = 'http://tieba.baidu.com/p/4896490947?see_lz=1&pn='
    outPath = 'G:\\text1.txt'

    # Patterns are compiled once here instead of once per post.
    _TITLE_RE = re.compile(u'title="【原创】(.*?)"')
    # re.S lets '.' span the line breaks inside a post body.
    _POST_RE = re.compile(
        r'class="d_post_content j_d_post_content "> (.*?)</div><br>', re.S)
    _ANCHOR_RE = re.compile(r'<a.*?>|</a>')
    _IMAGE_RE = re.compile(r'<img.*?>')
    # The dot before 'html' is escaped so only real '.html' links are removed.
    _LINK_RE = re.compile(r'http.*?\.html')

    def __init__(self, baseUrl=None, outPath=None):
        """Optionally override the thread URL prefix and the output file.

        Both arguments default to the class-level values, so ``BDTB()``
        behaves as before.
        """
        if baseUrl is not None:
            self.baseUrl = baseUrl
        if outPath is not None:
            self.outPath = outPath

    def getPage(self, pageNum):
        """Fetch one page of the thread.

        Args:
            pageNum: page number appended to ``baseUrl``.

        Returns:
            The raw response body, or ``None`` if the request failed (the
            error is printed, not raised).
        """
        url = self.baseUrl + str(pageNum)
        try:
            request = urllib2.Request(url)
            # closing() releases the connection on both Python 2 and 3.
            with closing(urllib2.urlopen(request)) as response:
                return response.read()
        except Exception as e:
            # Best-effort crawl: report the failure and let callers skip
            # this page instead of aborting.
            print(e)
            return None

    def _pageText(self, pageNum):
        """Return the page as text, or ``None`` when the fetch failed.

        Assumes the page is UTF-8 encoded (undecodable bytes are replaced).
        """
        raw = self.getPage(pageNum)
        if isinstance(raw, bytes):
            return raw.decode('utf-8', 'replace')
        return raw

    @staticmethod
    def _cleanPost(post):
        """Strip anchors, images, .html links and <br> tags from one post."""
        for pattern in (BDTB._ANCHOR_RE, BDTB._IMAGE_RE, BDTB._LINK_RE):
            post = pattern.sub(u'', post)
        return post.replace(u'<br>', u'')

    def Title(self, pageNum):
        """Write the title(s) found on a page to the output file.

        Page 1 starts a fresh file; later pages append.  Nothing is written
        when the page has no title match.

        Args:
            pageNum: page number to fetch.

        Returns:
            List of matched title strings (empty if the fetch failed).
        """
        html = self._pageText(pageNum)
        if html is None:
            return []
        items = self._TITLE_RE.findall(html)
        if items:
            # Open once per page: reopening with 'w' for every match made
            # each title on page 1 overwrite the previous one.
            mode = 'w' if pageNum == 1 else 'a'
            with io.open(self.outPath, mode, encoding='utf-8') as f:
                for item in items:
                    f.write(u'标题' + u'\t' + item)
        return items

    def Text(self, pageNum):
        """Append the cleaned text of every post on a page to the output file.

        On page 1 the first matched post is skipped.  A failed fetch or a
        page without posts leaves the file untouched.

        Args:
            pageNum: page number to fetch.
        """
        html = self._pageText(pageNum)
        if html is None:
            return
        posts = self._POST_RE.findall(html)
        if pageNum == 1:
            posts = posts[1:]
        if not posts:
            return
        with io.open(self.outPath, 'a', encoding='utf-8') as f:
            for post in posts:
                f.write(u'\n\n' + self._cleanPost(post))


def main():
    """Crawl pages 1 through 20 of the thread into the output file."""
    bdtb = BDTB()
    print('爬虫正在启动...')
    try:
        for i in range(1, 21):  # pages 1..20 inclusive
            print('正在爬取%s页的小说' % (i))
            bdtb.Title(i)
            bdtb.Text(i)
    except Exception as e:
        # Top-level boundary: report the error and still print the end marker.
        print(e)
    print('爬取结束...')


if __name__ == '__main__':
    main()
Python 爬取百度贴吧小说
最新推荐文章于 2023-10-15 21:38:16 发布