# coding:utf-8
"""Scrape every reply of one Baidu Tieba thread into a local text file.

The script downloads the first page of the thread, reads the thread title
and the total number of pages from it, then walks every page and writes
each reply's author nickname and cleaned-up text to ``<title>.txt``.

The code targets Python 2 (``urllib2``) and falls back to the equivalent
``urllib.request`` API so it can also run on Python 3.
"""
import contextlib
import random
import re

try:
    import urllib2  # Python 2
except ImportError:  # Python 3 exposes the same Request/urlopen API here
    import urllib.request as urllib2

# 1. URL of the thread to scrape (the thread ID is hard-coded).
absolute_url = "https://tieba.baidu.com/p/5147943292"

# 2. Request headers: one User-Agent is picked at random per run.
# NOTE(review): several entries contain a space after "/" that looks like a
# line-wrap artifact from the original paste; they are left untouched.
user_agent_list = [
    "Mozilla/5.0(Macintosh;IntelMacOSX10.6;rv:2.0.1)Gecko/ 20100101Firefox/4.0.1",
    "Mozilla/4.0(compatible;MSIE6.0;WindowsNT5.1)",
    "Opera/9.80(WindowsNT6.1;U;en)Presto/2.8.131Version/11.11",
    "Mozilla/5.0(Macintosh;IntelMacOSX10_7_0)AppleWebKit/ 535.11(KHTML,likeGecko)Chrome/17.0.963.56Safari/535.11",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1)",
    "Mozilla/4.0(compatible;MSIE7.0;WindowsNT5.1;Trident/ 4.0;SE2.XMetaSr1.0;SE2.XMetaSr1.0;.NETCLR2.0.50727;SE2.XMetaSr1.0)",
]
headers = {'User-Agent': random.choice(user_agent_list)}

# All patterns are compiled once here instead of once per reply.
# ``\s*`` tolerates optional whitespace right after the opening <span>; the
# original literal contained a single space at that position.
TOTAL_PAGES_PATTERN = re.compile(
    r'<li class="l_reply_num".*?<span class="red">\s*(.*?)\s*</span>', re.S)
TITLE_PATTERN = re.compile(r'<h1 class="core_title_txt.*?>(.*?)</h1>', re.S)
# The original literal was broken across two physical lines between the
# author anchor and the content <div>; ``.*?`` under re.S already spans any
# whitespace there, so the two halves are simply joined.
REPLY_PATTERN = re.compile(
    r'<a.*?class="p_author_name.*?>(.*?)</a>.*?'
    r'<div id="post_content.*?>(.*?)</div>', re.S)
TAG_PATTERN = re.compile(r'<.*?>', re.S)
BR_PATTERN = re.compile(r'<br\s*/?>')
NEWLINE_PATTERN = re.compile(r'\n')
SPACE_PATTERN = re.compile(r' ')
# Characters that are not allowed in file names on common file systems.
UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def to_text(raw):
    """Return *raw* as the native ``str`` type.

    On Python 2 the downloaded byte string already is ``str`` and is
    returned unchanged; on Python 3 the bytes are decoded as UTF-8.
    """
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8', 'replace')


def fetch_page(url, request_headers):
    """Download *url* with *request_headers* and return the body as ``str``."""
    request = urllib2.Request(url, headers=request_headers)
    # closing() guarantees the connection is released even if read() fails.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        return to_text(response.read())


def parse_total_pages(html):
    """Return the total page count found in the thread page *html*.

    Raises:
        ValueError: if the page counter is missing or is not an integer.
    """
    match = TOTAL_PAGES_PATTERN.search(html)
    if match is None:
        raise ValueError('total page count not found in page source')
    return int(match.group(1))


def parse_title(html):
    """Return the thread title found in *html*.

    Raises:
        ValueError: if no title element is present.
    """
    match = TITLE_PATTERN.search(html)
    if match is None:
        raise ValueError('thread title not found in page source')
    return match.group(1)


def clean_name(raw):
    """Strip newlines, spaces and HTML tags from an author nickname."""
    name = NEWLINE_PATTERN.sub('', raw)
    name = SPACE_PATTERN.sub('', name)
    return TAG_PATTERN.sub('', name)


def clean_content(raw):
    """Turn a reply's HTML into plain text.

    Source newlines are dropped, ``<br>`` tags become newlines, every other
    tag is removed and spaces are stripped.  ``<br>`` must be converted
    *before* the generic tag removal, otherwise it is deleted first and the
    line breaks of the reply are lost.
    """
    content = NEWLINE_PATTERN.sub('', raw)
    content = BR_PATTERN.sub('\n', content)
    content = TAG_PATTERN.sub('', content)
    return SPACE_PATTERN.sub('', content)


def parse_replies(html):
    """Return a list of ``(nickname, text)`` tuples for one page of *html*."""
    return [(clean_name(name), clean_content(content))
            for name, content in REPLY_PATTERN.findall(html)]


def output_filename(title):
    """Build the output file name ``<title>.txt`` from the thread *title*.

    Characters that are invalid in file names are replaced by ``_``.  On
    Python 2 the UTF-8 byte string is decoded to ``unicode`` so the file
    system receives the real characters.
    """
    filename = '{}.txt'.format(UNSAFE_FILENAME_CHARS.sub('_', title))
    if isinstance(filename, bytes):  # only true on Python 2
        filename = filename.decode('utf-8')
    return filename


def open_output(path):
    """Open *path* for writing the scraped text."""
    if str is bytes:
        # Python 2: the script writes raw UTF-8 byte strings.
        return open(path, 'w')
    return open(path, 'w', encoding='utf-8')


def main():
    """Scrape every page of the thread and write all replies to a file."""
    # 3. Parse the first page: thread title and total number of pages.
    html = fetch_page(absolute_url, headers)
    total_number = parse_total_pages(html)
    title = parse_title(html)

    # 4. Visit every page (1..total_number) and store the replies.
    with open_output(output_filename(title)) as file_test:
        for x in range(1, total_number + 1):
            print('正在爬取第{}页数据...'.format(x))
            every_page_url = absolute_url + '?pn={}'.format(x)
            for name, content in parse_replies(
                    fetch_page(every_page_url, headers)):
                file_test.write('用户昵称:{}'.format(name))
                file_test.write('\n')
                file_test.write('用户回复:{}'.format(content))
                file_test.write('\n')
    print('数据写入完成!')


if __name__ == '__main__':
    main()
# NOTE(review): stray trailing text "09-20" from the original paste, kept as a comment.