新浪微博数据挖掘食谱之五: 保存篇 (json mongodb格式)

本文介绍了如何将数据挖掘得到的新浪微博数据以JSON格式进行整理,并详细讲解了如何利用MongoDB进行有效存储,为后续的数据分析与处理打下基础。
摘要由CSDN通过智能技术生成
#!/usr/bin/python 
# -*- coding: utf-8 -*-

'''
Created on 2015-1-1
@author: beyondzhou
@name: json_data_mongodb.py
'''

'''
Configuring a Windows service for MongoDB

Configure a Windows service for MongoDB

The following procedure assumes you have installed MongoDB using the MSI installer, with the default path C:\Program Files\MongoDB 2.6 Standard

1. Open an Administrator command prompt
windows/cmd

2. Create directories
cd c:
mkdir c:\data\db
mkdir c:\data\log

3. Create a configuration file
echo logpath=c:\data\log\mongod.log> "C:\Program Files\MongoDB 2.6 Standard\mongod.cfg"
echo dbpath=c:\data\db>> "C:\Program Files\MongoDB 2.6 Standard\mongod.cfg"

4. Create the MongoDB service
sc.exe create MongoDB binPath= "\"C:\Program Files\MongoDB 2.6 Standard\bin\mongod.exe\" --service --config=\"C:\Program Files\MongoDB 2.6 Standard\mongod.cfg\"" DisplayName= "MongoDB 2.6 Standard" start= "auto"

5. Start the MongoDB service
net start MongoDB

6. Stop or remove the MongoDB service as needed
To stop the MongoDB service, use the following command:
net stop MongoDB

To remove the MongoDB service, first stop the service and then run the following command
sc.exe delete MongoDB

'''

# Get public timeline of sina weibo and save json response data into mongodb
def json_data_mongodb():
    '''
    Fetch the Sina Weibo public timeline, store it in MongoDB and print it.

    The API response is saved into the 'publicTimeline' collection of the
    'public_timeline' database; the collection is then read back and dumped
    to stdout as indented JSON.
    '''

    # Local imports
    from login import weibo_login
    import json
    from data import save_to_mongo, load_from_mongo
    from bson import json_util

    # Access to sina api
    weibo_api = weibo_login()

    # Get public timeline
    public_timeline = weibo_api.statuses.public_timeline.get(count=200)

    # Save the json data into mongodb
    save_to_mongo(public_timeline, 'public_timeline', 'publicTimeline')

    # Read the json data from mongodb
    results = load_from_mongo('public_timeline', 'publicTimeline')

    # json_util.default serialises BSON-specific values (such as the ObjectId
    # MongoDB stores under '_id') that the plain json encoder cannot handle.
    # print(...) with one argument works the same on Python 2 and Python 3,
    # whereas the former print statement is a syntax error on Python 3.
    print(json.dumps(results, indent=1, default=json_util.default))
    
# Run the demo only when this file is executed as a script, not on import
if __name__ == '__main__':
    json_data_mongodb()
# Save json data into mongo
def save_to_mongo(data, mongo_db, mongo_db_coll, **mongo_conn_kw):
    '''
    Insert one document, or a list of documents, into a MongoDB collection.

    data          -- a dict (single document) or an iterable of dicts
    mongo_db      -- name of the target database
    mongo_db_coll -- name of the target collection in that database
    mongo_conn_kw -- keyword arguments forwarded to pymongo.MongoClient

    Returns the inserted _id for a single document, or the list of inserted
    _ids for several documents.
    '''

    import pymongo

    # Connect to the MongoDB server running on
    # localhost:27017 by default
    client = pymongo.MongoClient(**mongo_conn_kw)

    try:
        # Reference a particular collection in a particular database
        coll = client[mongo_db][mongo_db_coll]

        # Collection.insert() is deprecated since PyMongo 3.0 and removed in
        # 4.0. The check is made on the class because attribute access on a
        # Collection *instance* returns a sub-collection for unknown names,
        # so hasattr(coll, ...) would always be true.
        if not hasattr(type(coll), 'insert_one'):
            # Old driver: single call handles both one and many documents
            return coll.insert(data)

        if isinstance(data, dict):
            return coll.insert_one(data).inserted_id

        # Perform a bulk insert and return IDs
        return coll.insert_many(list(data)).inserted_ids
    finally:
        # Release the connection; the original never closed the client
        client.close()

# Load json data from mongo
def load_from_mongo(mongo_db, mongo_db_coll, return_cursor=False,
                    criteria=None, projection=None, **mongo_conn_kw):
    '''
    Query a MongoDB collection and return the matching documents.

    mongo_db      -- name of the database
    mongo_db_coll -- name of the collection in that database
    return_cursor -- when true, return the live cursor instead of a list
    criteria      -- query filter passed to find(); None selects everything
    projection    -- optional second argument to find() (fields to return)
    mongo_conn_kw -- keyword arguments forwarded to pymongo.MongoClient

    Returns a cursor when return_cursor is true, otherwise a list holding
    every document produced by the query.
    '''
    import pymongo

    client = pymongo.MongoClient(**mongo_conn_kw)

    # A returned cursor still needs the connection, so the client is only
    # closed when the results are fully materialised or an error occurs.
    keep_client_open = False
    try:
        coll = client[mongo_db][mongo_db_coll]

        if criteria is None:
            criteria = {}

        if projection is None:
            cursor = coll.find(criteria)
        else:
            cursor = coll.find(criteria, projection)

        # Returning a cursor for large number of data
        if return_cursor:
            keep_client_open = True
            return cursor

        return list(cursor)
    finally:
        if not keep_client_open:
            client.close()

Result:

    "reposts_count": 0, 
    "mid": "3794113072035799", 
    "idstr": "3794113072035799", 
    "geo": null, 
    "source": "<a href=\"http://app.weibo.com/t/feed/380tOv\" rel=\"nofollow\">\u7c89\u4e1d\u7ea2\u5305</a>", 
    "attitudes_count": 0, 
    "in_reply_to_screen_name": "", 
    "in_reply_to_user_id": "", 
    "in_reply_to_status_id": "", 
    "comments_count": 0, 
    "user": {
     "bi_followers_count": 5, 
     "domain": "", 
     "avatar_large": "http://tp4.sinaimg.cn/3077828255/180/22827608884/0", 
     "verified_source": "", 
     "ptype": 0, 
     "block_word": 0, 
     "star": 0, 
     "id": 3077828255, 
     "verified_reason_url": "", 
     "city": "10", 
     "allow_all_comment": true, 
     "credit_score": 80, 
     "block_app": 0, 
     "follow_me": false, 
     "verified_reason": "", 
     "followers_count": 47, 
     "location": "\u5c71\u4e1c \u5a01\u6d77", 
     "verified_trade": "", 
     "mbtype": 0, 
     "verified_source_url": "", 
     "profile_url": "u/3077828255", 
     "province": "37", 
     "avatar_hd": "http://tp4.sinaimg.cn/3077828255/180/22827608884/0", 
     "statuses_count": 183, 
     "description": "", 
     "friends_count": 293, 
     "online_status": 0, 
     "mbrank": 0, 
     "idstr": "3077828255", 
     "profile_image_url": "http://tp4.sinaimg.cn/3077828255/50/22827608884/0", 
     "allow_all_act_msg": false, 
     "verified": false, 
     "geo_enabled": true, 
     "class": 1, 
     "name": "\u738b\u97f3\u5947", 
     "lang": "zh-cn", 
     "weihao": "", 
     "remark": "", 
     "favourites_count": 7, 
     "screen_name": "\u738b\u97f3\u5947", 
     "url": "", 
     "gender": "f", 
     "created_at": "Tue Oct 30 16:55:43 +0800 2012", 
     "verified_type": -1, 
     "following": false, 
     "pagefriends_count": 0, 
     "urank": 13
    }, 
    "id": 3794113072035799, 
    "pic_urls": []
   }, 
   {
    "reposts_count": 0, 
    "truncated": false, 
    "text": "\u6709\u53f2\u4ee5\u6765 \u6536\u5230\u56de\u590d\u6700\u591a\u7684\u4e00\u6b21[doge][doge] \u624b\u673a\u90fd\u5feb\u70b8\u5f00\u4e86[\u7b11cry][\u7b11cry][\u7b11cry] \u4e00\u76f4\u55e1\u55e1\u55e1\u55e1\u7684 \u592a\u5413\u4eba\u4e86 \u54c8\u54c8\u54c8 \u5c0f\u4f19\u4f34\u4eec\u4e00\u5e74\u4e0d\u89c1\u5927\u5bb6\u90fd\u8fd8\u597d\u4e48 \u5341\u5206\u60f3\u5ff5\u4f60\u4eec\u554a\uff5e \u54c8\u54c8\u54c8\u54c8\u54c8\u30022015\u65e9\u4e0a\u597d\u554a\uff01  \u90fd\u7ed9\u4f60\u95ee\u5b89\u4e86 \u4f60\u53ef\u8981\u8ba9\u6211\u7f8e\u7f8e\u7684\u8fc7\u597d\u8fd9\u4e00\u5e74\u54df\uff5e \u4e48\u4e48\u54d2 \uff5e[\u7231\u4f60][\u7231\u4f60][\u7231\u4f60]", 
    "visible": {
     "type": 0, 
     "list_id": 0
    }, 
    "in_reply_to_status_id": "", 
    "bmiddle_pic": "http://ww3.sinaimg.cn/bmiddle/6a4c47adjw1entoz8g3qsj20f00qowfa.jpg", 
    "id": 3794113072035615, 
    "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/6a4c47adjw1entoz8g3qsj20f00qowfa.jpg", 
    "mid": "3794113072035615", 
    "source": "<a href=\"http://app.weibo.com/t/feed/3G5oUM\" rel=\"nofollow\">iPhone 5s</a>", 
    "attitudes_count": 0, 
    "in_reply_to_screen_name": "", 
    "in_reply_to_user_id": "", 
    "annotations": [
     {
      "client_mblogid": "iPhone-AA9C2CE3-2B7B-47CB-BE24-F053405C911D"
     }
    ], 
    "pic_urls": [
     {
      "thumbnail_pic": "http://ww3.sinaimg.cn/thumbnail/6a4c47adjw1entoz8g3qsj20f00qowfa.jpg"
     }
    ], 
    "darwin_tags": [], 
    "favorited": false, 
    "original_pic": "http://ww3.sinaimg.cn/large/6a4c47adjw1entoz8g3qsj20f00qowfa.jpg", 
    "idstr": "3794113072035615", 
    "source_type": 1, 
    "user": {
     "bi_followers_count": 129, 
     "domain": "leexiaoxi", 
     "avatar_large": "http://tp2.sinaimg.cn/1783383981/180/5714271615/0", 
     "verified_source": "", 
     "ptype": 0, 
     "cover_image_phone": "http://ww3.sinaimg.cn/crop.0.0.640.640.640/6ce2240djw1e9oc39jbm4j20hs0hs0tf.jpg", 
     "star": 0, 
     "id": 1783383981, 
     "verified_reason_url": "", 
     "city": "2", 
     "verified": false, 
     "credit_score": 80, 
     "block_app": 0, 
     "follow_me": false, 
     "verified_reason": "", 
     "followers_count": 792, 
     "location": "\u5317\u4eac \u897f\u57ce\u533a", 
     "verified_trade": "", 
     "mbtype": 0, 
     "verified_source_url": "", 
     "profile_url": "515217249", 
     "block_word": 0, 
     "avatar_hd": "http://ww3.sinaimg.cn/crop.0.1.640.640.1024/6a4c47adjw8enjfwbkcw4j20hs0hujsg.jpg", 
     "statuses_count": 706, 
     "description": "\u6211\u4e0d\u574f\uff0c\u4f46\u4e5f\u4e0d\u662f\u597d\u4eba\uff0c\u6211\u53ea\u662f\u968f\u7740\u6211\u7684\u6027\u5b50\u505a\u4e00\u4e9b\u559c\u6b22\u7684\u4e8b\u7f62\u4e86\uff01", 
     "friends_count": 170, 
     "online_status": 0, 
     "mbrank": 0, 
     "idstr": "1783383981", 
     "profile_image_url": "http://tp2.sinaimg.cn/1783383981/50/5714271615/0", 
     "allow_all_act_msg": false, 
     "allow_all_comment": true, 
     "geo_enabled": true, 
     "class": 1, 
     "name": "\u674e\u60f3_beedo", 
     "lang": "zh-tw", 
     "weihao": "515217249", 
     "remark": "", 
     "favourites_count": 77, 
     "screen_name": "\u674e\u60f3_beedo", 
     "url": "", 
     "province": "11", 
     "created_at": "Sat Jul 24 18:09:53 +0800 2010", 
     "verified_type": 220, 
     "gender": "f", 
     "following": false, 
     "pagefriends_count": 0, 
     "urank": 19
    }, 
    "geo": null, 
    "created_at": "Thu Jan 01 08:01:50 +0800 2015", 
    "mlevel": 0, 
    "comments_count": 0
   }, 
   {
    "reposts_count": 0, 
    "truncated": false, 
    "text": "\u540c\u4e8b\u4eec\u90fd\u8bf4\u6211\u6700\u8fd1\u597d\u6f6e\u597d\u65f6\u5c1a\u5462[\u5077\u7b11]\u6211\u624d\u4e0d\u4f1a\u544a\u8bc9\u4ed6\u4eec\uff0c\u6211\u662f\u5728\u4e00\u4e2a\u5de5\u5382\u5e97\u4e70\u7684\u5462\uff0c\u4ef7\u683c\u4e0d\u8d35\u4e0d\u8bf4\uff0c\u4e1c\u897f\u8d28\u91cf\u7279\u522b\u597d\uff0c\u6b3e\u5f0f\u4e5f\u591a\uff0c\u8fd8\u4f1a\u4e0d\u95f4\u65ad\u63a8\u51fa\u65b0\u6b3e\uff0c\u7b80\u76f4\u5c31\u662f\u8d64\u88f8\u88f8\u7684\u798f\u5229\u554a\uff01[\u5fc3][\u4e92\u7c89][\u5a01\u6b66]xin\uff1a3104359614", 
    "visible": {
     "type": 0, 
     "list_id": 0
    }, 
    "in_reply_to_status_id": "", 
    "bmiddle_pic": "http://ww4.sinaimg.cn/bmiddle/6c2e5c24jw1entoz8lesjj20j60j6tew.jpg", 
    "id": 3794113072035403, 
    "thumbnail_pic": "http://ww4.sinaimg.cn/thumbnail/6c2e5c24jw1entoz8lesjj20j60j6tew.jpg", 
    "mid": "3794113072035403", 
    "source": "<a href=\"http://app.weibo.com/t/feed/3KeSKP\" rel=\"nofollow\">WeicoPro</a>", 
    "attitudes_count": 0, 
    "in_reply_to_screen_name": "", 
    "in_reply_to_user_id": "", 
    "pic_urls": [
     {
      "thumbnail_pic": "http://ww4.sinaimg.cn/thumbnail/6c2e5c24jw1entoz8lesjj20j60j6tew.jpg"
     }
    ], 
    "darwin_tags": [], 
    "favorited": false, 
    "original_pic": "http://ww4.sinaimg.cn/large/6c2e5c24jw1entoz8lesjj20j60j6tew.jpg", 
    "idstr": "3794113072035403", 
    "source_type": 1, 
    "user": {
     "bi_followers_count": 2, 
     "domain": "", 
     "avatar_large": "http://tp1.sinaimg.cn/1814977572/180/5612144061/1", 
     "verified_source": "", 
     "ptype": 0, 
     "star": 0, 
     "id": 1814977572, 
     "verified_reason_url": "", 
     "city": "1000", 
     "verified": true, 
     "verified_contact_email": "", 
     "verified_reason_modified": "", 
     "credit_score": 80, 
     "block_app": 0, 
     "follow_me": false, 
     "verified_reason": "\u82cf\u5dde\u8717\u725b\u6570\u5b57\u79d1\u6280\u80a1\u4efd\u6709\u9650\u516c\u53f8\u6e38\u620f\u8717\u725b\u5e02\u573a\u7b56\u5212", 
     "followers_count": 64, 
     "location": "\u5176\u4ed6", 
     "verified_state": 0, 
     "verified_trade": "1321", 
     "mbtype": 0, 
     "verified_source_url": "", 
     "profile_url": "u/1814977572", 
     "block_word": 0, 
     "avatar_hd": "http://tp1.sinaimg.cn/1814977572/180/5612144061/1", 
     "statuses_count": 138, 
     "description": "\u8001\u751f\u6709\u7406\u4e86", 
     "friends_count": 8, 
     "online_status": 0, 
     "mbrank": 0, 
     "verified_level": 3, 
     "profile_image_url": "http://tp1.sinaimg.cn/1814977572/50/5612144061/1", 
     "idstr": "1814977572", 
     "verified_contact_mobile": "", 
     "allow_all_act_msg": false, 
     "allow_all_comment": true, 
     "geo_enabled": true, 
     "class": 1, 
     "name": "csmyg", 
     "lang": "zh-cn", 
     "weihao": "", 
     "remark": "", 
     "favourites_count": 0, 
     "screen_name": "csmyg", 
     "ability_tags": "\u6e38\u620f\u5e94\u7528", 
     "url": "", 
     "province": "100", 
     "created_at": "Tue Sep 14 20:36:46 +0800 2010", 
     "verified_contact_name": "", 
     "verified_type": 0, 
     "gender": "m", 
     "following": false, 
     "pagefriends_count": 0, 
     "urank": 5
    }, 
    "geo": null, 
    "created_at": "Thu Jan 01 08:01:50 +0800 2015", 
    "mlevel": 0, 
    "comments_count": 0
   }, 
   {
    "reposts_count": 0, 
    "truncated": false, 
    "text": "\u6211\u5728\u4e07\u91cc\u901aAPP\u6447\u5230\u4e8620\u79ef\u5206\u548c\u518d\u6765\u4e00\u6b21\uff01\u4e07\u91cc\u901aAPP\u4e13\u4eab\uff0c\u70b9\u8bc4\u7f8e\u98df\u56e2\u8d2d\u5347\u7ea7\uff0c\u8fd410\u500d\u79ef\u5206\uff01http://t.cn/RP3A5gR http://t.cn/8kcZwL0 ", 
    "visible": {
     "type": 0, 
     "list_id": 0
    }, 
    "in_reply_to_status_id": "", 
    "bmiddle_pic": "http://ww4.sinaimg.cn/bmiddle/9e61a97bjw1entoz5hnrsj2046046glj.jpg", 
    "id": 3794113072034439, 
    "thumbnail_pic": "http://ww4.sinaimg.cn/thumbnail/9e61a97bjw1entoz5hnrsj2046046glj.jpg", 
    "mid": "3794113072034439", 
    "source": "<a href=\"http://app.weibo.com/t/feed/4M9EaV\" rel=\"nofollow\">\u4e07\u91cc\u901a\u79ef\u5206</a>", 
    "attitudes_count": 0, 
    "in_reply_to_screen_name": "", 
    "in_reply_to_user_id": "", 
    "annotations": [
     {
      "client_mblogid": "iPhone-819A9206-E97C-47BC-AF88-B4DD36D10AB7"
     }
    ], 
    "pic_urls": [
     {
      "thumbnail_pic": "http://ww4.sinaimg.cn/thumbnail/9e61a97bjw1entoz5hnrsj2046046glj.jpg"
     }
    ], 
    "darwin_tags": [], 
    "favorited": false, 
    "original_pic": "http://ww4.sinaimg.cn/large/9e61a97bjw1entoz5hnrsj2046046glj.jpg", 
    "idstr": "3794113072034439", 
    "source_type": 1, 
    "user": {
     "bi_followers_count": 4, 
     "domain": "", 
     "avatar_large": "http://tp4.sinaimg.cn/2657200507/180/5636263014/0", 
     "verified_source": "", 
     "ptype": 0, 
     "block_word": 0, 
     "star": 0, 
     "id": 2657200507, 
     "verified_reason_url": "", 
     "city": "1000", 
     "allow_all_comment": true, 
     "credit_score": 80, 
     "block_app": 0, 
     "follow_me": false, 
     "verified_reason": "", 
     "followers_count": 25, 
     "location": "\u6d59\u6c5f", 
     "verified_trade": "", 
     "mbtype": 0, 
     "verified_source_url": "", 
     "profile_url": "u/2657200507", 
     "province": "33", 
     "avatar_hd": "http://tp4.sinaimg.cn/2657200507/180/5636263014/0", 
     "statuses_count": 378, 
     "description": "", 
     "friends_count": 19, 
     "online_status": 0, 
     "mbrank": 0, 
     "idstr": "2657200507", 
     "profile_image_url": "http://tp4.sinaimg.cn/2657200507/50/5636263014/0", 
     "allow_all_act_msg": false, 
     "verified": false, 
     "geo_enabled": true, 
     "class": 1, 
     "name": "\u4e0d\u5316\u5986\u7684\u5973\u5b69\u513f", 
     "lang": "zh-cn", 
     "weihao": "", 
     "remark": "", 
     "favourites_count": 67, 
     "screen_name": "\u4e0d\u5316\u5986\u7684\u5973\u5b69\u513f", 
     "url": "", 
     "gender": "f", 
     "created_at": "Fri Mar 09 15:12:25 +0800 2012", 
     "verified_type": -1, 
     "following": false, 
     "pagefriends_count": 0, 
     "urank": 9
    }, 
    "geo": null, 
    "created_at": "Thu Jan 01 08:01:50 +0800 2015", 
    "mlevel": 0, 
    "comments_count": 0
   }, 
   {
    "darwin_tags": [], 
    "favorited": false, 
    "mlevel": 0, 
    "source_type": 1, 
    "visible": {
     "type": 0, 
     "list_id": 0
    }, 
    "text": "#\u6e56\u5357\u536b\u89c6\u8de8\u5e74#\u8de8\u5e74\u5929\u4e0a\u6389\u7ea2\u5305\u5566\uff0c\u6211\u5728@\u6e56\u5357\u536b\u89c6 \u548c@\u5fae\u535a\u7535\u89c6 \u5171\u540c\u9001\u51fa\u7684\u7ea2\u5305\u91cc\u62bd\u5230\u4e86\u201c0.3\u5143\u73b0\u91d1\u201d\uff01\u60f3\u548c\u6211\u4e00\u6837\u4e48\uff1f\u901f\u6765\u62bd\u8de8\u5e74\u7ea2\u5305\u5427\uff01#\u536b\u89c6\u8de8\u5e74\u96f6\u70b9\u7ea2\u5305#http://t.cn/RZU44L0", 
    "created_at": "Thu Jan 01 08:01:50 +0800 2015", 
    "truncated": false, 
    "reposts_count": 0, 
    "mid": "3794113072034438", 
    "idstr": "3794113072034438", 
    "geo": null, 
    "source": "<a href=\"http://app.weibo.com/t/feed/380tOv\" rel=\"nofollow\">\u7c89\u4e1d\u7ea2\u5305</a>", 
    "attitudes_count": 0, 
    "in_reply_to_screen_name": "", 
    "in_reply_to_user_id": "", 
    "in_reply_to_status_id": "", 
    "comments_count": 0, 
    "user": {
     "bi_followers_count": 10, 
     "domain": "", 
     "avatar_large": "http://tp1.sinaimg.cn/1311128400/180/5704466286/0", 
     "verified_source": "", 
     "ptype": 0, 
     "cover_image_phone": "http://ww2.sinaimg.cn/crop.0.0.640.640.640/a1d3feabjw1ecassls6b2j20hs0hsq50.jpg", 
     "star": 0, 
     "id": 1311128400, 
     "verified_reason_url": "", 
     "city": "1", 
     "verified": false, 
     "credit_score": 80, 
     "block_app": 0, 
     "follow_me": false, 
     "verified_reason": "", 
     "followers_count": 64, 
     "location": "\u6e56\u5357 \u957f\u6c99", 
     "verified_trade": "", 
     "mbtype": 0, 
     "verified_source_url": "", 
     "profile_url": "u/1311128400", 
     "block_word": 0, 
     "avatar_hd": "http://ww2.sinaimg.cn/crop.0.0.640.640.1024/4e263b50jw8ejw8kwjp16j20hs0hsaao.jpg", 
     "statuses_count": 642, 
     "description": "\u77ee\u6cb9", 
     "friends_count": 157, 
     "online_status": 1, 
     "mbrank": 0, 
     "idstr": "1311128400", 
     "profile_image_url": "http://tp1.sinaimg.cn/1311128400/50/5704466286/0", 
     "allow_all_act_msg": false, 
     "allow_all_comment": false, 
     "geo_enabled": true, 
     "class": 1, 
     "name": "kekelin129", 
     "lang": "zh-cn", 
     "weihao&
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值