对雪球网实盘用户(地理位置信息)进行爬取,数据分析。
结论:
1、遍历了约20%的雪球用户,检索到近9000个活跃实盘用户,预估所有实盘用户在50000个左右。
2、样本中约9000个实盘用户,大约一半近期实盘有操作;另外一半实盘被关闭,应该可以检索到历史数据。
3、基于目前样本,发现1265个有明确地理位置的用户,其地区分布与雪球官方公布的地区分布基本一致。
详细数据:
1、本次遍历雪球网,初步检索到 8253 个有实盘数据的用户,其中:
a. 现在依旧活跃的实盘用户:3808个
b. 目前已经关停实盘,只保留历史调仓记录的用户:3111个
c. 网站数据异常,需要再处理的用户:1118个
d. 网络连接超时,需要再处理的用户:276个
2、在3808个依旧活跃的实盘用户中,有明确地理位置的有1265个,地理位置分布如下:
3、下图是雪球官方公布的用户分布图:
爬虫代码:
# Pool of logged-in Xueqiu browser cookie strings, one single-key dict per
# account. The first entry is keyed "cookies_are" while the others use
# "cookie_are"; the keys are reproduced as found.
# NOTE(review): these strings embed session tokens (xq_a_token / xq_r_token,
# xq_is_login=1). Consider loading them from the environment or an untracked
# config file instead of committing them.
# NOTE(review): get_location() builds its own local list that is also named
# `cookie`, so this module-level pool is shadowed inside that function.
cookie = [
    {"cookies_are": "device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=18b7f7dec4f54032863219716eaf839ee940199d; xqat=18b7f7dec4f54032863219716eaf839ee940199d; xq_r_token=f27bcc9f6c7b6446279ee9448db195b118b8f17c; xq_token_expire=Wed%20Nov%2021%202018%2019%3A41%3A19%20GMT%2B0800%20(CST); xq_is_login=1; u=7147604028; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540641065; __utmb=1.37.9.1540640362715"},
    {"cookie_are": "device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; __utmt=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=4458f8df93a013c35835d0320917b19dcaab0a24; xqat=4458f8df93a013c35835d0320917b19dcaab0a24; xq_r_token=4812b56991883e9913998e8816706912bff911e8; xq_is_login=1; u=6146826778; xq_token_expire=Wed%20Nov%2021%202018%2019%3A17%3A51%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540638965; __utmb=1.8.9.1540638934329"},
    {"cookie_are": "device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xqat=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xq_r_token=bb8e27cca180872ab70314097a5077578ff119c8; xq_is_login=1; u=1559188240; xq_token_expire=Wed%20Nov%2021%202018%2019%3A24%3A59%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639362; __utmb=1.14.9.1540638934329"},
    {"cookie_are": "device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xqat=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xq_r_token=b004ebba4649dfef7bba54f6ae7b703e5bca6a61; xq_token_expire=Wed%20Nov%2021%202018%2019%3A27%3A29%20GMT%2B0800%20(CST); xq_is_login=1; u=1497969916; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639507; __utmb=1.18.9.1540639426395"},
]
def get_location(num):
    """Fetch the Xueqiu profile and the linked Weibo uid for one user record.

    Two JSON endpoints are queried with a randomly chosen header set and
    cookie entry, with a random pause after each successful response.

    Args:
        num: An indexable record. ``num[2]`` is copied into the result
            unchanged; ``num[3]`` is the user id and is concatenated onto
            both request URLs, so it has to be a ``str``.

    Returns:
        A list, which is also printed before returning:

        * normally ``[num[2], num[3], screen_name, gender, province, city,
          followers_count, friends_count, status_count, stocks_count, x]``
          where ``x`` is the Weibo uid, or ``"weibo地址不存在"`` when the
          second lookup fails;
        * ``[num, "异常"]`` when the profile response is not valid JSON.

    Raises:
        KeyError: If the profile JSON has no ``user`` object or lacks one of
            the expected fields. This is deliberately left to the caller.
    """
    import json
    import random
    import time

    import requests

    url = u"https://xueqiu.com/statuses/original/show.json?user_id=" + num[3]
    url_1 = u"https://xueqiu.com/account/oauth/user/show.json?source=sina&userid=" + num[3]
    headers = [
        {'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
         'Accept': 'text/html;q=0.9,*/*;q=0.8',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
         'Accept': 'application/json, text/plain, */*',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
        {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
         'Accept': 'application/json, text/plain, */*',
         'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
         'Connection': 'close'},
    ]
    # NOTE(review): these are hard-coded login session cookies; consider moving
    # them out of the source. Each entry is a one-key dict whose value is a
    # whole cookie header string, and requests treats dict keys as cookie
    # names -- confirm this is the form the server is expected to receive.
    # This local list shadows the module-level `cookie`.
    cookie = [
        dict(cookies_are=u"device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; aliyungf_tc=AQAAADY46BenmA0AefNZ2iIVV7Y6rtgH; __utmc=1; xq_a_token.sig=XglA1uiAYkfyfKlhbuJdRhRTTM4; xq_r_token.sig=jW7KrLgtGYffUvfG3DfPexDR8RQ; xq_a_token=7c41909f4604aa33eb26b7c175f0468a1df2152b; xqat=7c41909f4604aa33eb26b7c175f0468a1df2152b; xq_r_token=b1914a7d50798c67bb7852f09954b82aa41a4a0b; xq_token_expire=Thu%20Nov%2008%202018%2022%3A39%3A32%20GMT%2B0800%20(CST); xq_is_login=1; u=7147604028; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; snbim_minify=true; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539526506,1539528869,1539567244,1539567517; _gid=GA1.2.1869629804.1540126302; __utma=1.191434752.1526174181.1540126285.1540168048.33; __utmt=1; __utmb=1.31.10.1540168048; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540171090"),
        dict(cookie_are=u"device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; __utmt=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; xq_a_token=4458f8df93a013c35835d0320917b19dcaab0a24; xqat=4458f8df93a013c35835d0320917b19dcaab0a24; xq_r_token=4812b56991883e9913998e8816706912bff911e8; xq_is_login=1; u=6146826778; xq_token_expire=Wed%20Nov%2021%202018%2019%3A17%3A51%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540638965; __utmb=1.8.9.1540638934329"),
        dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xqat=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xq_r_token=bb8e27cca180872ab70314097a5077578ff119c8; xq_is_login=1; u=1559188240; xq_token_expire=Wed%20Nov%2021%202018%2019%3A24%3A59%20GMT%2B0800%20(CST); Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639362; __utmb=1.14.9.1540638934329"),
        dict(cookie_are="device_id=06934df365e4a0fdf3e5c1efc4a302fd; __utmz=1.1526174181.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; s=fn15nngiri; _ga=GA1.2.191434752.1526174181; bid=ae1522508305909e11f0ccaefc21ae37_jn8z7lmc; aliyungf_tc=AQAAAEKOuhTXlwQABPNZ2lE9FlkxU221; snbim_minify=true; __utmc=1; __utma=1.191434752.1526174181.1540635588.1540638493.40; Hm_lvt_1db88642e346389874251b5a1eded6e3=1539567244,1539567517,1540450735,1540638620; _gid=GA1.2.1574455490.1540638621; __utmt=1; xq_a_token=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xqat=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xq_r_token=b004ebba4649dfef7bba54f6ae7b703e5bca6a61; xq_token_expire=Wed%20Nov%2021%202018%2019%3A27%3A29%20GMT%2B0800%20(CST); xq_is_login=1; u=1497969916; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540639507; __utmb=1.18.9.1540639426395"),
    ]
    profile_fields = ('screen_name', 'gender', 'province', 'city',
                      'followers_count', 'friends_count', 'status_count',
                      'stocks_count')

    res = []
    # The context manager closes the session on every exit path, including
    # the KeyError that is allowed to propagate to the caller.
    with requests.Session() as s:
        s.keep_alive = False
        try:
            obj = s.get(url, headers=random.choice(headers), cookies=random.choice(cookie),
                        stream=True, allow_redirects=False).json()
        except json.decoder.JSONDecodeError:
            # The profile endpoint did not return JSON: record the raw input
            # together with the "异常" marker.
            res.append(num)
            res.append("异常")
            print(res)
            return res
        time.sleep(random.random() * 16)

        # Read every profile field before touching `res`, so a missing key
        # raises KeyError without leaving a half-built row behind.
        user = obj['user']
        profile = [user[field] for field in profile_fields]
        res.append(num[2])
        res.append(num[3])
        res.extend(profile)

        try:
            obj_1 = s.get(url_1, headers=random.choice(headers), cookies=random.choice(cookie),
                          stream=True, allow_redirects=False).json()
            time.sleep(random.random() * 15)
            res.append(obj_1['id'])
        except (KeyError, json.decoder.JSONDecodeError, IndexError):
            # Must be a tuple: `except A or B or C` only ever catches A.
            res.append("weibo地址不存在")

    print(res)
    return res
# Drop the first entry of the externally loaded table
# (presumably a header row -- verify against where xueqiu_all_data is built).
xueqiu_all = xueqiu_all_data[1:]

if __name__ == "__main__":
    final = []
    # Only every 40th record is crawled.
    for record in xueqiu_all[::40]:
        try:
            located = get_location(record)
        except KeyError:
            # get_location lets KeyError escape when the profile JSON lacks
            # an expected field; such records are skipped.
            print("KeyError")
        else:
            final.append(located)
# Count and print the rows of `test` (first entry skipped) that carry a usable
# location: column 5 is one of the listed municipalities, or column 6 is not
# one of the placeholder values.
located_municipalities = ("北京", "上海", "天津")
placeholder_values = ("", "未知", "其他", "异常", "不限", "城市/地区", None)

t = 0
for row in test[1:]:
    if row[5] not in located_municipalities and row[6] in placeholder_values:
        continue
    t += 1
    print(row)
print(t)