雪球网交易数据爬取,Python 源码。
雪球是一个面向投资者的社交网络平台,下面的代码用于爬取其组合调仓(交易)数据。
代码:
def _build_trade_row(uid, entry):
    """Flatten one rebalancing record into a single result row.

    Returns ``[uid, trade_time, stock_name, stock_symbol, prev_weight,
    target_weight, price]``. Only the first element of
    ``entry["rebalancing_histories"]`` is used, exactly as before.
    """
    import time

    # "updated_at" is divided by 1000 before being handed to time.localtime(),
    # which expects seconds.
    seconds = int(round(entry["updated_at"]) / 1000)
    trade_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))
    first = entry["rebalancing_histories"][0]
    return [
        uid,
        trade_time,
        first["stock_name"],
        first["stock_symbol"],
        first["prev_weight_adjusted"],
        first["target_weight"],
        first["price"],
    ]


def get_trade_behavior(uid):
    """Download the rebalancing history of cube ``SP<uid>`` from xueqiu.com.

    Args:
        uid: Numeric part of the cube symbol, as a string (it is concatenated
            into the request URL).

    Returns:
        On success, a list of rows, one per history record, each shaped
        ``[uid, trade_time, stock_name, stock_symbol, prev_weight,
        target_weight, price]``. On any failure (network error, timeout,
        unexpected JSON layout) the two-element marker ``[uid, "异常"]`` is
        returned instead, so callers can collect the uid for a retry.
    """
    import random
    import time

    import requests

    headers = [
        {
            'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            'Accept': 'application/json, text/plain, */*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            'Accept': 'application/json, text/plain, */*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
    ]
    page_url = "https://xueqiu.com/service/tc/snowx/PAMID/cubes/rebalancing/history?cube_symbol=SP" + uid + "&count=20&page="
    result = []
    try:
        # The context manager closes the session on every exit path.
        # Every header set already sends "Connection: close".
        with requests.Session() as s:

            def fetch_page(page):
                # A timeout keeps one stalled connection from blocking the
                # whole crawl; it surfaces as the "异常" marker below.
                response = s.get(
                    page_url + str(page),
                    headers=random.choice(headers),
                    stream=True,
                    allow_redirects=False,
                    timeout=20,
                )
                payload = response.json()
                # Random pause between requests to throttle the crawl.
                time.sleep(random.random() * 3)
                return payload

            obj = fetch_page(1)
            maxpage = obj["maxPage"]
            for k in range(1, maxpage + 1):
                print("正在检索{%s}-第%d页-总共%d页" % (uid, k, maxpage))
                # Page 1 was already downloaded to learn maxPage; reuse it
                # instead of requesting it a second time.
                if k > 1:
                    obj = fetch_page(k)
                for i in obj["list"]:
                    result.append(_build_trade_row(uid, i))
        print("{%s} 检索完毕!" % uid)
        return result
    except Exception:
        # Deliberate best-effort: report the uid as failed and carry on.
        # Unlike a bare ``except``, Ctrl-C / SystemExit still propagate.
        print("{%s} 异常!" % uid)
        return [uid, "异常"]
def read_csv(name, directory="C:\\Users\\viemax\\Desktop\\", encoding=None):
    """Read ``<directory>/<name>.csv`` and return all of its rows.

    Args:
        name: File name without the ``.csv`` extension.
        directory: Folder containing the file. Defaults to the desktop path
            the script was originally hard-wired to.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default, which is what the original code used.

    Returns:
        A list of rows, each row being a list of strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    import csv
    import os

    path = os.path.join(directory, name + ".csv")
    # newline="" lets the csv module do its own line-ending handling, so
    # quoted fields containing line breaks are read back unchanged.
    # The with-block guarantees the handle is closed.
    with open(path, "r", newline="", encoding=encoding) as handle:
        return list(csv.reader(handle))
# Load the id table, skip its first two rows and keep the second column.
no_data_id = read_csv("no_data_id")
obj = [row[1] for row in no_data_id[2:]]

# Crawl every other id. Results are appended one at a time, so whatever was
# fetched so far stays in ``res`` if the run is interrupted.
res = []
res.extend(map(get_trade_behavior, obj[0::2]))
def _parse_cookie_string(raw):
    """Split a raw ``"k=v; k2=v2"`` cookie string into a name -> value dict."""
    cookies = {}
    for pair in raw.split(";"):
        # Split on the first "=" only: some values contain "=" themselves.
        name, sep, value = pair.strip().partition("=")
        if sep and name:
            cookies[name] = value
    return cookies


def xueqiu(num):
    """Fetch the page of cube ``SP<num>`` and report its creator and status.

    Args:
        num: Numeric part of the cube symbol, as a string.

    Returns:
        A three-element list whose first item is always ``num``:

        * ``[num, creator, status]`` when the creator link is present.
          ``creator`` is the link's ``href`` without its first character and
          ``status`` is the text of the "cube-closed" element, or "未关停!"
          when there is no such element.
        * ``[num, "NaN", title]`` when no creator link is found; ``title`` is
          the page title (``"404_雪球"`` for a missing cube).
        * ``[num, "AttributeError", "page_error"]`` when the page has no
          ``<title>`` either.
        * ``[num, "timeout", "timeout"]`` when the request times out.

    Raises:
        requests.exceptions.RequestException: For network failures other
            than a timeout (unchanged from the original behaviour).
    """
    import random
    import time

    import requests
    from bs4 import BeautifulSoup

    url = u"https://xueqiu.com/P/SP" + num
    headers = [
        {
            'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            'Accept': 'application/json, text/plain, */*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            'Accept': 'application/json, text/plain, */*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
    ]
    # NOTE(review): these are hard-coded login session cookies (xq_a_token,
    # xq_r_token, ...). They should live in configuration or the environment,
    # not in source control, and any still-valid token should be rotated.
    raw_cookies = [
        "device_id=33a80200aacb73cf594a45942b285a12; _ga=GA1.2.312459015.1529772425; s=ey177hmx06; bid=ae1522508305909e11f0ccaefc21ae37_jn93s7rs; __utmz=1.1539536073.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_fe218c11eab60b6ab1b6f84fb38bcc4a=1539591917; _gid=GA1.2.758749044.1540657586; aliyungf_tc=AQAAAIe8YFC/zwwAKvJZ2tC9k8DvMt34; __utmc=1; __utma=1.312459015.1529772425.1540825606.1540828390.19; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token.sig=p4pCAuWXphKrks3IjEzTbJFCcb4; xqat.sig=uWTQIYsOCqtgymFewPvkgLk8CyM; xq_r_token.sig=Q9P70D5S5ZuHuFEXVJ6umTRqL1o; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u.sig=Ra3Ht4oGmAXu5VtkPBpRXum-Ntc; Hm_lvt_1db88642e346389874251b5a1eded6e3=1540825899,1540828382,1540829378,1540829450; snbim_minify=true; __utmt=1; _gat_gtag_UA_16079156_4=1; xq_a_token=18b7f7dec4f54032863219716eaf839ee940199d; xqat=18b7f7dec4f54032863219716eaf839ee940199d; xq_r_token=f27bcc9f6c7b6446279ee9448db195b118b8f17c; xq_token_expire=Sat%20Nov%2024%202018%2001%3A55%3A26%20GMT%2B0800%20(CST); xq_is_login=1; u=7147604028; __utmb=1.52.10.1540828390; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540835763",
        "device_id=33a80200aacb73cf594a45942b285a12; _ga=GA1.2.312459015.1529772425; s=ey177hmx06; bid=ae1522508305909e11f0ccaefc21ae37_jn93s7rs; __utmz=1.1539536073.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_fe218c11eab60b6ab1b6f84fb38bcc4a=1539591917; _gid=GA1.2.758749044.1540657586; aliyungf_tc=AQAAAIe8YFC/zwwAKvJZ2tC9k8DvMt34; __utmc=1; __utma=1.312459015.1529772425.1540825606.1540828390.19; Hm_lvt_1db88642e346389874251b5a1eded6e3=1540825899,1540828382,1540829378,1540829450; snbim_minify=true; __utmt=1; xq_token_expire=Sat%20Nov%2024%202018%2001%3A55%3A26%20GMT%2B0800%20(CST); __utmb=1.52.10.1540828390; _gat_gtag_UA_16079156_4=1; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xq_a_token.sig=p4pCAuWXphKrks3IjEzTbJFCcb4; xqat=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xqat.sig=uWTQIYsOCqtgymFewPvkgLk8CyM; xq_r_token=bb8e27cca180872ab70314097a5077578ff119c8; xq_r_token.sig=Q9P70D5S5ZuHuFEXVJ6umTRqL1o; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1559188240; u.sig=Ra3Ht4oGmAXu5VtkPBpRXum-Ntc; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540835848",
        "device_id=33a80200aacb73cf594a45942b285a12; _ga=GA1.2.312459015.1529772425; s=ey177hmx06; bid=ae1522508305909e11f0ccaefc21ae37_jn93s7rs; __utmz=1.1539536073.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_fe218c11eab60b6ab1b6f84fb38bcc4a=1539591917; _gid=GA1.2.758749044.1540657586; aliyungf_tc=AQAAAIe8YFC/zwwAKvJZ2tC9k8DvMt34; __utmc=1; __utma=1.312459015.1529772425.1540825606.1540828390.19; Hm_lvt_1db88642e346389874251b5a1eded6e3=1540825899,1540828382,1540829378,1540829450; snbim_minify=true; __utmt=1; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token.sig=p4pCAuWXphKrks3IjEzTbJFCcb4; xqat.sig=uWTQIYsOCqtgymFewPvkgLk8CyM; xq_r_token.sig=Q9P70D5S5ZuHuFEXVJ6umTRqL1o; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u.sig=Ra3Ht4oGmAXu5VtkPBpRXum-Ntc; xq_a_token=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xqat=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xq_r_token=b004ebba4649dfef7bba54f6ae7b703e5bca6a61; xq_token_expire=Sat%20Nov%2024%202018%2001%3A58%3A30%20GMT%2B0800%20(CST); xq_is_login=1; u=1497969916; __utmb=1.56.10.1540828390; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540835925",
        "device_id=33a80200aacb73cf594a45942b285a12; _ga=GA1.2.312459015.1529772425; s=ey177hmx06; bid=ae1522508305909e11f0ccaefc21ae37_jn93s7rs; __utmz=1.1539536073.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_fe218c11eab60b6ab1b6f84fb38bcc4a=1539591917; _gid=GA1.2.758749044.1540657586; __utma=1.312459015.1529772425.1540825606.1540828390.19; xq_token_expire=Sat%20Nov%2024%202018%2001%3A58%3A30%20GMT%2B0800%20(CST); aliyungf_tc=AQAAAAVyoiWa1w4AKvJZ2ozyzTPwnciM; Hm_lvt_1db88642e346389874251b5a1eded6e3=1540829378,1540829450,1540836740,1540866196; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=4458f8df93a013c35835d0320917b19dcaab0a24; xq_a_token.sig=FfAS5LGC_XBO11rmXuA6Nb3o4VI; xqat=4458f8df93a013c35835d0320917b19dcaab0a24; xqat.sig=t2g7eE2UG80Frcg03R-7nudVIBA; xq_r_token=4812b56991883e9913998e8816706912bff911e8; xq_r_token.sig=R6AgMpKf0fhe6GkWdS_etJ0Y3Dw; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=6146826778; u.sig=h5P6Xki5cmObHzNcRMVufpWUnZc; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540866325",
    ]

    # The context manager closes the session on every path, including the
    # timeout return and any exception that propagates.
    with requests.Session() as s:
        try:
            # requests expects ``cookies`` as a name -> value mapping. The
            # original passed the whole raw string under one made-up key,
            # which mangled the first cookie of each set.
            cookies = _parse_cookie_string(random.choice(raw_cookies))
            obj = s.get(
                url,
                headers=random.choice(headers),
                cookies=cookies,
                stream=True,
                allow_redirects=False,
                timeout=20,
            )
            # Long randomized pause between page fetches.
            time.sleep(8 + random.random() * 3.2)
            bs = BeautifulSoup(obj.content, 'lxml')
        except requests.exceptions.Timeout:
            print([num, "timeout", "timeout"])
            return [num, "timeout", "timeout"]

    closed_tags = bs.find_all(attrs={"class": "cube-closed"})
    res_current = closed_tags[0].get_text() if closed_tags else "未关停!"

    creator_tags = bs.find_all(attrs={"class": "creator fn-clear"})
    if creator_tags:
        res_id = creator_tags[0].attrs["href"]
        print([num, res_id[1:], res_current])
        return [num, res_id[1:], res_current]

    # No creator link: fall back to the page title to tell what we got.
    title_tag = bs.find("title")
    if title_tag is None:
        print([num, "AttributeError", "page_error"])
        return [num, "AttributeError", "page_error"]

    # Previously only the "404_雪球" title produced a result and any other
    # title fell through to an implicit ``None``; now the title is always
    # reported so callers get the same three-element shape.
    res_404 = title_tag.get_text()
    print([num, "NaN", res_404])
    return [num, "NaN", res_404]
# Merge both result sets.
# NOTE(review): ``res_0`` is not defined in the visible part of this file;
# presumably it holds results from an earlier crawl - confirm.
res_final = []
res_final.extend(res)
res_final.extend(res_0)

# Drop empty results. Truthiness also drops ``None`` entries, which would
# otherwise raise TypeError in the checks below.
result = [entry for entry in res_final if entry]


def _is_failed(entry):
    """Return True for the ``[uid, "异常"]`` failure marker."""
    # The length guard matters: a successful result holding a single row has
    # no index 1, and the unguarded ``entry[1]`` raised IndexError for it.
    return len(entry) > 1 and entry[1] == "异常"


final = [entry for entry in result if not _is_failed(entry)]
except_id = [entry for entry in result if _is_failed(entry)]

# Flatten one level: concatenate the contents of every successful entry.
need = [item for entry in final for item in entry]