# coding: utf-8
# Purpose:
#   1. Fetch the content of a Baidu Tieba thread by its ID.
#   2. Optionally restrict output to posts by the original poster (see_lz).
#   3. Save the scraped data to a local text file.
#
# Ported from Python 2 (urllib2 / print statement / xrange / raw_input)
# to Python 3, which also fixes the eval-unsafe ``input()`` in the
# original __main__ block.
import re
import urllib.request


class BDTBSpider(object):
    """Scraper for a single Baidu Tieba thread.

    Downloads every page of the thread, extracts each post's author and
    body with regular expressions, and writes them floor by floor to a
    ``<title>.txt`` file in the current directory.
    """

    def __init__(self, base_url, see_lz):
        """
        :param base_url: thread URL, e.g. 'http://tieba.baidu.com/p/<id>'
        :param see_lz:   1 -> original poster only, 0 -> all posters
        """
        self.base_url = base_url
        # Query flag appended to every page request.
        self.lz = '?see_lz=' + str(see_lz)
        # Fixed: the original used the invalid header name 'User_Agent';
        # the HTTP header is 'User-Agent', so the UA was never sent.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) '
                          'Gecko/20100101 Firefox/50.0'
        }
        self.file = None    # output file handle, opened lazily by open_file()
        self.floor_num = 1  # running floor counter, shared across pages

    def get_page_code(self, page_number):
        """Fetch one page of the thread.

        :param page_number: 1-based page index
        :return: decoded HTML text, or None when the request fails
        """
        abs_url = self.base_url + self.lz + '&pn=' + str(page_number)
        request = urllib.request.Request(abs_url, headers=self.headers)
        try:
            response = urllib.request.urlopen(request)
        except Exception as e:
            print('第{}页链接失败,原因是:{}'.format(page_number, e))
            return None
        else:
            print('第{}页链接成功!'.format(page_number))
            # Decode here so the regex helpers operate on str, not bytes.
            return response.read().decode('utf-8', errors='ignore')

    def get_title(self, html):
        """Extract the thread title from *html*; None if not found."""
        pattern = re.compile(r'<h1 class="core_title_txt.*?>(.*?)</h1>', re.S)
        res = re.search(pattern, html)
        if res:
            return res.group(1).strip('\n')
        return None

    def get_total_page_number(self, html):
        """Extract the total page count (as a string) from *html*; None if not found."""
        pattern = re.compile(r'<span class="red">(.*?)</span>', re.S)
        number = re.search(pattern, html)
        if number:
            return number.group(1).strip()
        return None

    def get_content(self, html):
        """Return a list of (author, body) tuples for every post in *html*."""
        pattern = re.compile(
            r'<a.*?class="p_author_name.*?>(.*?)</a> '
            r'.*?<div id="post_content.*?>(.*?)</div>',
            re.S)
        return re.findall(pattern, html)

    def open_file(self, filename='百度贴吧'):
        """Open (and truncate) the output text file as UTF-8."""
        self.file = open(filename + '.txt', 'w', encoding='utf-8')

    def write_data(self, data_list):
        """Write each (author, body) tuple as one numbered floor."""
        for name, content in data_list:
            self.file.write('---------第{}楼----------'.format(self.floor_num))
            self.file.write('\n')
            self.file.write('昵称:{}'.format(name))
            self.file.write('\n')
            self.file.write('内容:{}'.format(content))
            self.file.write('\n')
            self.floor_num += 1

    def start_spider(self):
        """Crawl every page of the thread and dump it to the output file."""
        index_page = self.get_page_code(1)
        if not index_page:
            return
        title = self.get_title(index_page)
        total_number = self.get_total_page_number(index_page)
        # Guard: the original crashed here when either regex failed to match.
        if title is None or total_number is None:
            print('无法解析帖子标题或总页数')
            return
        print('该帖子一共有{}页数据'.format(total_number))
        self.open_file(title)
        try:
            for page_num in range(1, int(total_number) + 1):
                print('正在写入第{}页数据...'.format(page_num))
                html = self.get_page_code(page_num)
                # Skip pages that failed to download instead of crashing.
                if html:
                    self.write_data(self.get_content(html))
        finally:
            # Close the file even if the crawl raises (original leaked it).
            self.file.close()


if __name__ == '__main__':
    abs_url = 'http://tieba.baidu.com/p/' + input('输入文章ID')
    print('''
    1.只查看楼主信息
    0.查看该贴所有信息
    ''')
    # int(input()) replaces Python 2's eval-based input() — no code injection.
    select_number = int(input('选择操作:'))
    bdtb = BDTBSpider(abs_url, select_number)
    bdtb.start_spider()
百度贴吧(保存数据到文件)
最新推荐文章于 2024-07-01 11:16:19 发布