Python Crawler Example 1: Scraping Baidu Tieba

Goal: collect the information for every thread in the 网络爬虫 (web crawler) tieba:

http://tieba.baidu.com/f?ie=utf-8&kw=%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB&fr=search

Approach:

  1. Confirm where the required data lives

    Right-click the page and view the source to check that the thread data is present in the raw HTML (a quick check is sketched after this list)

  2. Use Fiddler to capture the request, then replay it programmatically
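
Before writing the full scraper, step 1 can be verified in code. A minimal check (Python 2, matching the listing below; the 'data-field' attribute is the hook the scraper relies on):

# -*- coding:utf-8 -*-
import urllib
import urllib2

queryUrl = {'kw': '网络爬虫'}
url = 'http://tieba.baidu.com/f?ie=utf-8&' + urllib.urlencode(queryUrl) + '&fr=search'
resHtml = urllib2.urlopen(urllib2.Request(url)).read()
# If this prints True, the thread list is server-rendered and can be parsed
# straight from the page source with lxml, no AJAX inspection needed.
print 'data-field' in resHtml

If it printed False, the data would be loaded dynamically and step 2 (Fiddler) would be needed to find the real request.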

# -*- coding:utf-8 -*-
# Python 2 code: urllib2 and the print statement do not exist in Python 3
# (see the notes after the listing).
import urllib
import urllib2
import json
import chardet
from lxml import etree

def GetTimeByArticle(url):
    """Fetch a thread page and return its post time (the second 'tail-info' span)."""
    request = urllib2.Request(url)
    response = urllib2.urlopen(request)
    resHtml = response.read()
    html = etree.HTML(resHtml)
    return html.xpath('.//*[@class="tail-info"]')[1].text

def main():

    output = open('tieba0628.json', 'w')

    # urlencode percent-encodes the Chinese keyword for the query string.
    queryUrl = {'kw': '网络爬虫'}

    request = urllib2.Request('http://tieba.baidu.com/f?ie=utf-8&' + urllib.urlencode(queryUrl) + '&fr=search')
    response = urllib2.urlopen(request)
    print 'response start'
    resHtml = response.read()
    print 'response read'
    # Sanity-check the page encoding; chardet should report utf-8 here.
    print chardet.detect(resHtml)

    html = etree.HTML(resHtml)

    # Every thread on the list page is an <li> carrying a JSON 'data-field' attribute.
    result = html.xpath('//li[@data-field]')
    print len(result)
    for site in result:
        #print etree.tostring(site, encoding='utf-8')

        title = site.xpath('.//a[@title]')[0].text
        #title = site.xpath('.//a/@title')[0]
        author = site.xpath('.//*[@class="frs-author-name-wrap"]/a')[0].text

        lastName = site.xpath('.//*[@class="tb_icon_author_rely j_replyer"]/a')[0].text

        # Follow the thread link and take the post time from the thread page itself;
        # the list page only shows a coarse time in
        # .//span[@class="threadlist_reply_date pull_right j_reply_data"]
        Article_url = site.xpath('.//*[@class ="j_th_tit "]')[0].attrib['href']
        reply_date = GetTimeByArticle('http://tieba.baidu.com/' + Article_url)

        rep_num = site.xpath('.//*[@class="threadlist_rep_num center_text"]')[0].text

        # data-field itself is a JSON blob of thread metadata.
        field = json.loads(site.attrib['data-field'])

        print title, author, lastName, reply_date, rep_num, field

        item = {}
        item['title'] = title
        item['author'] = author
        item['lastName'] = lastName
        item['reply_date'] = reply_date
        item['rep_num'] = rep_num
        item['field'] = field
        print item

        # ensure_ascii=False makes dumps return unicode with readable Chinese,
        # so the line must be encoded back to utf-8 bytes before writing.
        line = json.dumps(item, ensure_ascii=False)
        print line

        output.write(line.encode('utf-8') + "\n")
        break  # debugging aid: stop after the first thread; remove to scrape the whole page
    output.close()
    print 'end'

if __name__ == '__main__':
    main()
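
The listing targets Python 2. For reference, a rough Python 3 port of the fetch-and-parse step (an untested sketch; urllib2 was split into urllib.request and urllib.parse, and print became a function):

# -*- coding: utf-8 -*-
from urllib.parse import urlencode
from urllib.request import urlopen

from lxml import etree

query = urlencode({'kw': '网络爬虫'})
url = 'http://tieba.baidu.com/f?ie=utf-8&' + query + '&fr=search'
res_html = urlopen(url).read()             # bytes
html = etree.HTML(res_html)                # lxml parses the byte string directly
threads = html.xpath('//li[@data-field]')  # same XPath as above
print(len(threads))

The extraction logic (the XPath expressions and json handling) carries over unchanged; only the I/O and print calls need updating.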

