爬取分析雪球网实盘用户数据

对雪球网实盘用户(地理位置信息)进行爬取,数据分析。

结论:
1、遍历了约20%的雪球用户,检索到近9000个活跃实盘用户,预估所有实盘用户在50000个左右。
2、样本中约9000个实盘用户,大约一半近期实盘有操作;另外一半实盘被关闭,应该可以检索到历史数据。
3、基于目前样本,发现1265个有明确地理位置的用户,其地区分布与雪球官方公布的地区分布基本一致。

详细数据:
1、本次遍历雪球网,初步检索到 8253 个有实盘数据的用户,其中:
a. 现在依旧活跃的实盘用户:3808个
b. 目前已经关停实盘,只保留历史调仓记录的用户:3111个
c. 网站数据异常,需要再处理的用户:1118个
d. 网络连接超时,需要再处理的用户:276个

2、在3808个依旧活跃的实盘用户中,有明确地理位置的有1265个,地理位置分布如下:
(图:活跃实盘用户地理位置分布图,原图缺失)
3、下图是雪球官方公布的用户分布图:
(图:雪球官方公布的用户地区分布图,原图缺失)

爬虫代码:

    # Module-level pool of four cookie entries.  Each entry is a one-key dict
    # whose value is a complete raw "name=value; name=value; ..." cookie string.
    # The first entry uses the key ``cookies_are`` while the other three use
    # ``cookie_are``.
    # NOTE(review): get_location() below builds its own local list that is also
    # named ``cookie``, so within the visible source this module-level list is
    # never read — confirm whether it is still needed.
    # SECURITY: these look like logged-in session tokens; they should be loaded
    # from local configuration rather than kept in the source.
    cookie = [dict(cookies_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=18b7f7dec4f54032863219716eaf839ee940199d; xqat=18b7f7dec4f54032863219716eaf839ee940199d; xq_r_token=f27bcc9f6c7b6446279ee9448db195b118b8f17c; xq_token_expire=Wed%20Nov%2021%202018%2019%3A41%3A19%20GMT%2B0800%20(CST); xq_is_login=1; u=7147604028; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540641065; __utmb=1.37.9.1540640362715"),
              dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; __utmt=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=4458f8df93a013c35835d0320917b19dcaab0a24; xqat=4458f8df93a013c35835d0320917b19dcaab0a24; xq_r_token=4812b56991883e9913998e8816706912bff911e8; xq_is_login=1; u=6146826778; xq_token_expire=Wed%20Nov%2021%202018%2019%3A17%3A51%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540638965; __utmb=1.8.9.1540638934329"),
              dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xqat=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xq_r_token=bb8e27cca180872ab70314097a5077578ff119c8; xq_is_login=1; u=1559188240; xq_token_expire=Wed%20Nov%2021%202018%2019%3A24%3A59%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639362; __utmb=1.14.9.1540638934329"),
              dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xqat=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xq_r_token=b004ebba4649dfef7bba54f6ae7b703e5bca6a61; xq_token_expire=Wed%20Nov%2021%202018%2019%3A27%3A29%20GMT%2B0800%20(CST); xq_is_login=1; u=1497969916; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639507; __utmb=1.18.9.1540639426395")]


def get_location(num):
    """Fetch one Xueqiu user's profile fields and linked Weibo id.

    Two JSON endpoints are queried for the user id in ``num[3]``: the profile
    endpoint (``statuses/original/show.json``) and the Sina OAuth lookup
    (``account/oauth/user/show.json``).  Each request uses a randomly chosen
    header set and cookie entry, and each successfully decoded response is
    followed by a random pause (up to 16 s and 15 s respectively).

    Args:
        num: Indexable record.  ``num[3]`` is the user id and must be a
            ``str`` because it is concatenated into the URLs; ``num[2]`` is
            copied unchanged into the result.

    Returns:
        list: On success ``[num[2], num[3], screen_name, gender, province,
        city, followers_count, friends_count, status_count, stocks_count,
        weibo_uid]``.  The last item is the string ``"weibo地址不存在"`` when
        the Weibo lookup is not valid JSON or carries no ``id``.  When the
        profile response itself is not valid JSON the result is
        ``[num, "异常"]``.  The result is also printed before returning.

    Raises:
        KeyError: If the profile JSON lacks ``user`` or one of the expected
            fields.  This is deliberately left to the caller.

    Exceptions raised by ``requests`` for connection problems are not caught
    here.
    """
    # Imports stay function-local, as in the original; the visible file has no
    # top-level import section.
    import json
    import random
    import time

    import requests

    url = u"https://xueqiu.com/statuses/original/show.json?user_id=" + num[3]
    url_1 = u"https://xueqiu.com/account/oauth/user/show.json?source=sina&userid=" + num[3]
    # Pool of browser-like header sets; one is picked at random per request.
    headers = [{'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
                'Accept': 'text/html;q=0.9,*/*;q=0.8',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'},
               {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
                'Accept': 'application/json, text/plain, */*',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
                'Connection': 'close'}]
    # SECURITY: hard-coded session cookies; these should be loaded from local
    # configuration rather than kept in the source.
    # NOTE(review): requests treats a ``cookies`` dict as name -> value, so each
    # entry is sent as a single cookie named ``cookies_are``/``cookie_are`` whose
    # value is the whole raw string.  Presumably the raw string was meant to be
    # the Cookie header itself, and the differing key names are accidental —
    # confirm before changing, since that would alter what is sent.
    cookie = [dict(cookies_are=u"device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; aliyungf_tc=AQAAADY46BenmA0AefNZ2iIVV7Y6rtgH; __utmc=1; xq_a_token.sig=XglA1uiAYkfyfKlhbuJdRhRTTM4; xq_r_token.sig=jW7KrLgtGYffUvfG3DfPexDR8RQ; xq_a_token=7c41909f4604aa33eb26b7c175f0468a1df2152b; xqat=7c41909f4604aa33eb26b7c175f0468a1df2152b; xq_r_token=b1914a7d50798c67bb7852f09954b82aa41a4a0b; xq_token_expire=Thu%20Nov%2008%202018%2022%3A39%3A32%20GMT%2B0800%20(CST); xq_is_login=1; u=7147604028; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; snbim_minify=true; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539526506,1539528869,1539567244,1539567517; _gid=GA1.2.1869629804.1540126302; __utma=1.191434752.1526174181.1540126285.1540168048.33; __utmt=1; __utmb=1.31.10.1540168048; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540171090"),
              dict(cookie_are=u"device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; __utmt=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=4458f8df93a013c35835d0320917b19dcaab0a24; xqat=4458f8df93a013c35835d0320917b19dcaab0a24; xq_r_token=4812b56991883e9913998e8816706912bff911e8; xq_is_login=1; u=6146826778; xq_token_expire=Wed%20Nov%2021%202018%2019%3A17%3A51%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540638965; __utmb=1.8.9.1540638934329"),
              dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xqat=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xq_r_token=bb8e27cca180872ab70314097a5077578ff119c8; xq_is_login=1; u=1559188240; xq_token_expire=Wed%20Nov%2021%202018%2019%3A24%3A59%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639362; __utmb=1.14.9.1540638934329"),
              dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xqat=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xq_r_token=b004ebba4649dfef7bba54f6ae7b703e5bca6a61; xq_token_expire=Wed%20Nov%2021%202018%2019%3A27%3A29%20GMT%2B0800%20(CST); xq_is_login=1; u=1497969916; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639507; __utmb=1.18.9.1540639426395")]

    res = []
    # The context manager closes the session on every exit path, including the
    # KeyError that is allowed to propagate (the original leaked the session
    # there, and its ``s.close`` in the error branch was missing the call
    # parentheses).
    with requests.Session() as s:
        # NOTE(review): requests does not appear to document a ``keep_alive``
        # session attribute; the 'Connection: close' header above is what asks
        # for a non-persistent connection.  Kept as in the original — confirm.
        s.keep_alive = False
        # NOTE(review): no ``timeout`` is passed, so a stalled connection can
        # block indefinitely.  Adding one would surface timeout exceptions the
        # visible caller does not handle, so it is left for a follow-up.
        try:
            obj = s.get(url, headers=random.choice(headers), cookies=random.choice(cookie),
                        stream=True, allow_redirects=False).json()
        except json.decoder.JSONDecodeError:
            # Profile endpoint did not return JSON: record the whole input row
            # with an error marker.  No pause on this path, as before.
            res.append(num)
            res.append("异常")
            print(res)
            return res
        time.sleep(random.random() * 16)

        # Read every field before building the row; a missing key raises
        # KeyError, which the caller handles.
        user = obj['user']
        name = user['screen_name']
        gender = user['gender']
        province = user['province']
        city = user['city']
        followers_count = user['followers_count']
        friends_count = user['friends_count']
        status_count = user['status_count']
        stocks_count = user['stocks_count']
        res.extend([num[2], num[3], name, gender, province, city,
                    followers_count, friends_count, status_count, stocks_count])

        try:
            obj_1 = s.get(url_1, headers=random.choice(headers), cookies=random.choice(cookie),
                          stream=True, allow_redirects=False).json()
            time.sleep(random.random() * 15)
            weibo_uid = obj_1['id']
        except (KeyError, json.decoder.JSONDecodeError, IndexError, TypeError):
            # The original wrote ``except KeyError or ... or IndexError``, which
            # Python evaluates to ``except KeyError`` alone; a tuple catches
            # every listed type.  TypeError additionally covers a JSON payload
            # that is not an object (e.g. ``null``).
            weibo_uid = "weibo地址不存在"
        res.append(weibo_uid)

    print(res)
    return res

# Drop the first row of the previously loaded user table
# (presumably a header row — confirm against where xueqiu_all_data is built).
xueqiu_all = xueqiu_all_data[1:]

if __name__ == "__main__":
    # Sample every 40th record and collect whatever get_location returns.
    final = []
    for num in xueqiu_all[::40]:
        try:
            data = get_location(num)
        except KeyError:
            # get_location lets KeyError escape when the profile JSON is
            # missing an expected field; report it and move on to the next row.
            print("KeyError")
        else:
            final.append(data)
# Count and print the rows of ``test`` (first row skipped) that carry a usable
# location.  NOTE(review): columns 5 and 6 are presumably province and city —
# confirm against the layout of ``test``, which is defined outside this view.
t = 0
for i in test[1:]:
    # Skip rows that are neither in one of the three listed municipalities nor
    # have a meaningful value in column 6.
    if i[5] not in ("北京", "上海", "天津") and i[6] in ("", "未知", "其他", "异常", "不限", "城市/地区", None):
        continue
    t = t + 1
    print(i)
print(t)
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值