Collecting all thread listings from the 网络爬虫 (web crawler) Tieba
http://tieba.baidu.com/f?ie=utf-8&kw=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&fr=search
Problem-solving approach:
- Confirm where the required data lives: right-click and view the page source
- Use Fiddler to simulate sending the request (see the probe below)
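
A quick way to confirm both steps is to fetch the listing URL directly and inspect what comes back. A minimal probe, assuming the same Python 2 libraries the full script below uses (the 'data-field' substring check mirrors the XPath used there):

# -*- coding:utf-8 -*-
# Probe: does a plain GET return the thread listing, and in what encoding?
import urllib
import urllib2
import chardet

queryUrl = {'kw': '网络爬虫'}
url = 'http://tieba.baidu.com/f?ie=utf-8&' + urllib.urlencode(queryUrl) + '&fr=search'
resHtml = urllib2.urlopen(urllib2.Request(url)).read()
print chardet.detect(resHtml)      # expect utf-8, matching ie=utf-8 in the URL
print 'data-field' in resHtml      # True means the thread data is in the static HTML

If the substring is present, the data is served in the static HTML and no JavaScript rendering is needed.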
# -*- coding:utf-8 -*-
import urllib
import urllib2
import chardet
import json
from lxml import etree


def GetTimeByArticle(url):
    # Open the thread page itself and read the post time from the
    # second "tail-info" node.
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    resHtml = response.read()
    html = etree.HTML(resHtml)
    return html.xpath('.//*[@class="tail-info"]')[1].text


def main():
    output = open('tieba0628.json', 'w')
    queryUrl = {'kw': '网络爬虫'}
    request = urllib2.Request('http://tieba.baidu.com/f?ie=utf-8&' + urllib.urlencode(queryUrl) + '&fr=search')
    response = urllib2.urlopen(request)
    print 'response start'
    resHtml = response.read()
    print 'response read'
    print chardet.detect(resHtml)  # verify the page encoding really is UTF-8

    html = etree.HTML(resHtml)
    # Every thread on the listing page is an <li> carrying a data-field attribute.
    result = html.xpath('//li[@data-field]')
    print result
    print len(result)

    for site in result:
        # print etree.tostring(site, encoding='utf-8')
        title = site.xpath('.//a[@title]')[0].text
        # title = site.xpath('.//a/@title')[0]
        author = site.xpath('.//*[@class="frs-author-name-wrap"]/a')[0].text
        lastName = site.xpath('.//*[@class="tb_icon_author_rely j_replyer"]/a')[0].text
        reply_date = site.xpath('.//span[@class="threadlist_reply_date pull_right j_reply_data"]')[0].text.strip()
        # The trailing space in "j_th_tit " matches Tieba's own class attribute.
        Article_url = site.xpath('.//*[@class="j_th_tit "]')[0].attrib['href']
        # Overwrite the listing-page date with the exact time from the thread page.
        reply_date = GetTimeByArticle('http://tieba.baidu.com/' + Article_url)
        rep_num = site.xpath('.//*[@class="threadlist_rep_num center_text"]')[0].text
        # data-field holds a JSON blob of thread metadata.
        field = json.loads(site.attrib['data-field'])
        print title, author, lastName, reply_date, rep_num, field

        item = {
            'title': title,
            'author': author,
            'lastName': lastName,
            'reply_date': reply_date,
            'rep_num': rep_num,
            'field': field,
        }
        print item
        line = json.dumps(item, ensure_ascii=False)
        print line
        print type(line)
        output.write(line.encode('utf-8') + "\n")
        break  # only the first thread is written; remove to process the whole page

    output.close()
    print 'end'


if __name__ == '__main__':
    main()
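
Note the break: the script above writes only the first thread of the first listing page. To cover the whole bar, as the title promises, the listing's pn query parameter can be stepped page by page. A hedged sketch; the 50-threads-per-page stride and the max_pages cap are illustrative assumptions, not part of the original script:

# -*- coding:utf-8 -*-
# Sketch: iterate listing pages via the pn parameter.
# Assumption: Tieba paginates list pages in steps of 50 (pn=0, 50, 100, ...).
import urllib
import urllib2

def iter_listing_pages(kw, max_pages=3):
    # max_pages is an illustrative cap, not from the original script
    for page in range(max_pages):
        query = urllib.urlencode({'kw': kw, 'pn': page * 50})
        yield urllib2.urlopen(
            urllib2.Request('http://tieba.baidu.com/f?ie=utf-8&' + query)).read()

for resHtml in iter_listing_pages('网络爬虫'):
    print len(resHtml)  # raw HTML of one listing page, ready for etree.HTML()

Each yielded page can then go through the same etree.HTML() and //li[@data-field] extraction loop as above.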