#!/usr/bin/env python
#coding=utf-8
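# A Python 2 crawler for Baidu Tieba: it walks a forum's thread list,
# follows every thread, and writes each floor (number, author id/name,
# content, timestamp) to a tab-separated text file.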
import httplib2
import json
from lxml import etree
def replace(s):
    # Thread links come back as relative paths ('/p/...'); prefix the Tieba host.
    return s.replace('/p/', 'http://tieba.baidu.com/p/')
def openhttp(url):
    # httplib2 caches responses under the .cache directory; html is a byte string.
    h2 = httplib2.Http('.cache')
    (resp2, html) = h2.request(url, 'GET')
    return html
def store_file(reply_sum):
    filehandle.write(reply_sum['topic'])
    filehandle.write('\n')
    try:
        for i in reply_sum['every_floor']:
            # One tab-separated line per floor: floor, id, name, content, time.
            filehandle.write('\t'.join([str(i['floor']), str(i['id']),
                                        i['name'], i['content'], i['time']]))
            filehandle.write('\n')
    except KeyError:
        # Floors that failed to parse may be missing fields; skip them.
        pass
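# Output layout produced by store_file (sketch):
#   <topic>
#   <floor>\t<author id>\t<author name>\t<content>\t<time>
#   ... one line per floor ...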
def parse_link(topic, link):
    original_link = link
    sub_html = openhttp(link)
    # Tieba serves its pages GBK-encoded.
    now_page = etree.HTML(sub_html.decode('gbk'))
    total_page = int(now_page.xpath(u'//*[@class="l_reply_num"]/span')[0].text)
    print "Total pages: %d" % total_page
    print 'start=========================='
    print "Topic: %s" % topic.encode('utf8')
    floor = 0
    reply_dict = {}
    reply_list = []
    reply_sum = {}
    reply_sum['topic'] = topic.encode('utf8')
    n = 1
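    # The data-field attribute parsed below is JSON shaped roughly like this
    # (a sketch; only the keys this script reads are shown):
    #   {"author": {"id": 123456, "name": "someone"},
    #    "content": {"date": "2013-04-01 12:34"}}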
    while n <= total_page:
        # Page n of a thread lives at <thread url>?pn=<n>.
        link = original_link + '?pn=' + str(n)
        print 'Fetching url:'
        print link
        sub_html = openhttp(link)
        now_page = etree.HTML(sub_html.decode('gbk'))
        # Every floor is a div whose class is exactly "l_post " or "l_post noborder".
        replies = now_page.xpath(u'//*[@class="l_post "]|//*[@class="l_post noborder"]')
        for reply in replies:
            try:
                print "Floor: %d" % (floor + 1)
                contents = reply.xpath(u'descendant::div[@class="d_post_content j_d_post_content"]')
                # Author and date ride along in the floor's data-field JSON attribute.
                json_str = reply.attrib['data-field']
                author_data = json.loads(json_str)
                author_id = author_data["author"]["id"]
                author_name = author_data["author"]["name"]
                author_time = author_data["content"]["date"]
                for content in contents:
                    reply_dict['floor'] = floor + 1
                    reply_dict['id'] = author_id
                    # string() gathers text from child nodes too; .text alone would
                    # stop at the first child element and can be None.
                    reply_dict['content'] = content.xpath('string()').strip().encode('utf8')
                    reply_dict['time'] = author_time.encode('utf8')
                    reply_dict['name'] = author_name.encode('utf8')
            except (KeyError, ValueError):
                # Some posts (e.g. ads) carry no parsable data-field; skip them.
                pass
            if reply_dict:
                reply_list.append(reply_dict)
            reply_dict = {}
            floor = floor + 1
        n = n + 1
    # Debug dump of everything collected for this thread.
    for i in reply_list:
        for m in i:
            print i[m]
    reply_sum['every_floor'] = reply_list
    store_file(reply_sum)
def main():
    pn = 0
    while pn < 50:
        # Forum thread list for the GBK URL-encoded keyword; Tieba paginates
        # this list with pn in steps of 50.
        url = 'http://tieba.baidu.com/f?kw=%B0%CD%C0%E5%B5%BA&pn=' + str(pn)
        print url
        main_html = openhttp(url)
        page = etree.HTML(main_html.decode('gbk'))
        # Thread-title anchors hold the relative link and the topic title.
        p = page.xpath(u'//a[@target="_blank"][@class="j_th_tit"]')
        for h in p:
            # .get() is safer than indexing .values(), which depends on attribute order.
            link = replace(h.get('href'))
            topic = h.get('title')
            parse_link(topic, link)
            print topic
        pn = pn + 50
if __name__ == '__main__':
    filehandle = open('aaaaaaaaa.txt', 'w')
    main()
    filehandle.close()
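# A minimal sketch of running this, assuming Python 2 with the httplib2 and
# lxml packages installed (the script's filename is not given; tieba_spider.py
# is a placeholder):
#   $ pip install httplib2 lxml
#   $ python tieba_spider.py
# Each thread is appended to aaaaaaaaa.txt as a topic line followed by one
# tab-separated line per floor.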