python爬取雪球网交易数据

本文给出爬取雪球网组合调仓(交易)数据的 Python 源码。
雪球是一个面向投资者的社交网络平台;下面的代码通过其接口抓取指定组合的调仓历史记录。

代码:

def get_trade_behavior(uid):
    """Download the rebalancing (trade) history of one Xueqiu cube.

    Pages through the ``cubes/rebalancing/history`` endpoint for the cube
    ``"SP" + uid`` (20 records per page) and turns every record into one row.

    Args:
        uid: Cube id without the ``SP`` prefix, as a string.

    Returns:
        On success, a list of rows of the form
        ``[uid, trade_time, stock_name, stock_symbol, prev_weight_adjusted,
        target_weight, price]``. ``trade_time`` is the record's ``updated_at``
        value (treated as milliseconds) formatted in local time.
        On any failure (network error, timeout, unexpected JSON shape) the
        two-element marker ``[uid, "异常"]`` is returned instead.

    Note:
        Only the first entry of each record's ``rebalancing_histories`` is
        kept; a record with an empty ``rebalancing_histories`` makes the whole
        call fail with the error marker.
    """
    import requests
    import random
    import time

    result = []
    # Browser-like header sets; one is picked at random for every request.
    headers = [
        {
            'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            'Accept': 'application/json, text/plain, */*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            'Accept': 'application/json, text/plain, */*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
    ]

    try:
        base_url = ("https://xueqiu.com/service/tc/snowx/PAMID/cubes/rebalancing/history?cube_symbol=SP"
                    + uid + "&count=20&page=")

        # The context manager guarantees the session is closed on every path.
        with requests.Session() as s:
            # NOTE(review): requests does not document a `keep_alive` session
            # attribute; the 'Connection: close' header above is what asks for
            # non-persistent connections. Kept as in the original - confirm.
            s.keep_alive = False

            def fetch_page(page):
                """Fetch one history page as parsed JSON, then pause briefly."""
                response = s.get(base_url + str(page),
                                 headers=random.choice(headers),
                                 stream=True,
                                 allow_redirects=False,
                                 timeout=20)  # never hang on a stalled socket
                payload = response.json()
                # Random pause of up to 3 seconds between requests.
                time.sleep(random.random() * 3)
                return payload

            page_data = fetch_page(1)
            maxpage = page_data["maxPage"]

            for k in range(1, maxpage + 1):
                print("正在检索{%s}-第%d页-总共%d页" % (uid, k, maxpage))
                # Page 1 was already downloaded to learn maxPage; reuse it
                # instead of requesting it a second time.
                if k > 1:
                    page_data = fetch_page(k)
                for record in page_data["list"]:
                    first_leg = record["rebalancing_histories"][0]
                    # `updated_at` is divided by 1000 to get epoch seconds.
                    seconds = int(round(record["updated_at"]) / 1000)
                    trade_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))
                    result.append([
                        uid,
                        trade_time,
                        first_leg["stock_name"],
                        first_leg["stock_symbol"],
                        first_leg["prev_weight_adjusted"],
                        first_leg["target_weight"],
                        first_leg["price"],
                    ])
        print("{%s} 检索完毕!" % uid)
        return result
    except Exception:
        # Deliberately broad: any failure marks this cube as failed so a batch
        # run can continue. Unlike a bare `except:`, Ctrl-C (KeyboardInterrupt)
        # and SystemExit still propagate.
        print("{%s} 异常!" % uid)
        return [uid, "异常"]





def read_csv(name, directory="C:\\Users\\viemax\\Desktop\\", encoding=None):
    """Read a CSV file and return all of its rows.

    Args:
        name: File name without the ``.csv`` extension.
        directory: Folder that contains the file. Defaults to the desktop
            folder that was previously hard-coded.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default, as before.

    Returns:
        A list with one entry per CSV row, each row being a list of strings.
        An empty file yields an empty list.
    """
    import csv
    import os

    path = os.path.join(directory, name + ".csv")
    # `with` closes the handle (the original leaked it); newline="" lets the
    # csv module handle line endings itself, as its documentation recommends.
    with open(path, "r", newline="", encoding=encoding) as handle:
        return list(csv.reader(handle))



# Load the id table from no_data_id.csv.
no_data_id = read_csv("no_data_id")

# Skip the first two rows and keep the second column of every remaining row.
obj = [row[1] for row in no_data_id[2:]]

# Crawl every other id (even positions) and collect the per-id results.
res = [get_trade_behavior(cube_id) for cube_id in obj[0::2]]



def xueqiu(num):
    """Fetch the page of cube ``"SP" + num`` and report its creator and status.

    Args:
        num: Cube id without the ``SP`` prefix, as a string.

    Returns:
        A three-element list whose first item is always ``num``:

        * ``[num, creator, status]`` - ``creator`` is the ``href`` of the
          first ``creator fn-clear`` element with its first character
          stripped; ``status`` is the text of the first ``cube-closed``
          element, or ``"未关停!"`` when there is none.
        * ``[num, "NaN", "404_雪球"]`` - no creator link and the page title is
          the 404 title.
        * ``[num, "AttributeError", "page_error"]`` - no creator link and no
          ``<title>`` element.
        * ``[num, "IndexError", "page_error"]`` - no creator link and some
          other title (this path used to return ``None`` implicitly).
        * ``[num, "timeout", "timeout"]`` - the request timed out.

    Raises:
        requests.exceptions.RequestException: for request failures other than
            a timeout; these are not caught here.
    """
    import requests
    from bs4 import BeautifulSoup
    import random
    import time

    url = u"https://xueqiu.com/P/SP" + num
    # Browser-like header sets; one is picked at random for the request.
    headers = [
        {
            'User-Agent': "Mozilla/5.0 (X11; CrOS x86_64 10066.0.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1 (KHTML, like Gecko) CriOS/69.0.3497.100 Mobile/13B143 Safari/601.1.46",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A",
            'Accept': 'text/html;q=0.9,*/*;q=0.8',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
            'Accept': 'application/json, text/plain, */*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
        {
            'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko",
            'Accept': 'application/json, text/plain, */*',
            'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            'Connection': 'close',
        },
    ]
    # Raw ``Cookie`` header values, one per account.
    # SECURITY NOTE(review): these are hard-coded login session tokens. They
    # should be loaded from configuration or the environment rather than
    # committed to source; kept here only to preserve the existing behaviour.
    cookie = [
        "device_id=33a80200aacb73cf594a45942b285a12; _ga=GA1.2.312459015.1529772425; s=ey177hmx06; bid=ae1522508305909e11f0ccaefc21ae37_jn93s7rs; __utmz=1.1539536073.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_fe218c11eab60b6ab1b6f84fb38bcc4a=1539591917; _gid=GA1.2.758749044.1540657586; aliyungf_tc=AQAAAIe8YFC/zwwAKvJZ2tC9k8DvMt34; __utmc=1; __utma=1.312459015.1529772425.1540825606.1540828390.19; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token.sig=p4pCAuWXphKrks3IjEzTbJFCcb4; xqat.sig=uWTQIYsOCqtgymFewPvkgLk8CyM; xq_r_token.sig=Q9P70D5S5ZuHuFEXVJ6umTRqL1o; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u.sig=Ra3Ht4oGmAXu5VtkPBpRXum-Ntc; Hm_lvt_1db88642e346389874251b5a1eded6e3=1540825899,1540828382,1540829378,1540829450; snbim_minify=true; __utmt=1; _gat_gtag_UA_16079156_4=1; xq_a_token=18b7f7dec4f54032863219716eaf839ee940199d; xqat=18b7f7dec4f54032863219716eaf839ee940199d; xq_r_token=f27bcc9f6c7b6446279ee9448db195b118b8f17c; xq_token_expire=Sat%20Nov%2024%202018%2001%3A55%3A26%20GMT%2B0800%20(CST); xq_is_login=1; u=7147604028; __utmb=1.52.10.1540828390; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540835763",
        "device_id=33a80200aacb73cf594a45942b285a12; _ga=GA1.2.312459015.1529772425; s=ey177hmx06; bid=ae1522508305909e11f0ccaefc21ae37_jn93s7rs; __utmz=1.1539536073.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_fe218c11eab60b6ab1b6f84fb38bcc4a=1539591917; _gid=GA1.2.758749044.1540657586; aliyungf_tc=AQAAAIe8YFC/zwwAKvJZ2tC9k8DvMt34; __utmc=1; __utma=1.312459015.1529772425.1540825606.1540828390.19; Hm_lvt_1db88642e346389874251b5a1eded6e3=1540825899,1540828382,1540829378,1540829450; snbim_minify=true; __utmt=1; xq_token_expire=Sat%20Nov%2024%202018%2001%3A55%3A26%20GMT%2B0800%20(CST); __utmb=1.52.10.1540828390; _gat_gtag_UA_16079156_4=1; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xq_a_token.sig=p4pCAuWXphKrks3IjEzTbJFCcb4; xqat=b2f21e25cd1817bf15c1c89cc72b25ad537495de; xqat.sig=uWTQIYsOCqtgymFewPvkgLk8CyM; xq_r_token=bb8e27cca180872ab70314097a5077578ff119c8; xq_r_token.sig=Q9P70D5S5ZuHuFEXVJ6umTRqL1o; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=1559188240; u.sig=Ra3Ht4oGmAXu5VtkPBpRXum-Ntc; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540835848",
        "device_id=33a80200aacb73cf594a45942b285a12; _ga=GA1.2.312459015.1529772425; s=ey177hmx06; bid=ae1522508305909e11f0ccaefc21ae37_jn93s7rs; __utmz=1.1539536073.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_fe218c11eab60b6ab1b6f84fb38bcc4a=1539591917; _gid=GA1.2.758749044.1540657586; aliyungf_tc=AQAAAIe8YFC/zwwAKvJZ2tC9k8DvMt34; __utmc=1; __utma=1.312459015.1529772425.1540825606.1540828390.19; Hm_lvt_1db88642e346389874251b5a1eded6e3=1540825899,1540828382,1540829378,1540829450; snbim_minify=true; __utmt=1; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token.sig=p4pCAuWXphKrks3IjEzTbJFCcb4; xqat.sig=uWTQIYsOCqtgymFewPvkgLk8CyM; xq_r_token.sig=Q9P70D5S5ZuHuFEXVJ6umTRqL1o; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u.sig=Ra3Ht4oGmAXu5VtkPBpRXum-Ntc; xq_a_token=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xqat=b70e7188d32f804237b6a42c052b5bcf74ebeea2; xq_r_token=b004ebba4649dfef7bba54f6ae7b703e5bca6a61; xq_token_expire=Sat%20Nov%2024%202018%2001%3A58%3A30%20GMT%2B0800%20(CST); xq_is_login=1; u=1497969916; __utmb=1.56.10.1540828390; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540835925",
        "device_id=33a80200aacb73cf594a45942b285a12; _ga=GA1.2.312459015.1529772425; s=ey177hmx06; bid=ae1522508305909e11f0ccaefc21ae37_jn93s7rs; __utmz=1.1539536073.4.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; Hm_lvt_fe218c11eab60b6ab1b6f84fb38bcc4a=1539591917; _gid=GA1.2.758749044.1540657586; __utma=1.312459015.1529772425.1540825606.1540828390.19; xq_token_expire=Sat%20Nov%2024%202018%2001%3A58%3A30%20GMT%2B0800%20(CST); aliyungf_tc=AQAAAAVyoiWa1w4AKvJZ2ozyzTPwnciM; Hm_lvt_1db88642e346389874251b5a1eded6e3=1540829378,1540829450,1540836740,1540866196; remember=1; remember.sig=K4F3faYzmVuqC0iXIERCQf55g2Y; xq_a_token=4458f8df93a013c35835d0320917b19dcaab0a24; xq_a_token.sig=FfAS5LGC_XBO11rmXuA6Nb3o4VI; xqat=4458f8df93a013c35835d0320917b19dcaab0a24; xqat.sig=t2g7eE2UG80Frcg03R-7nudVIBA; xq_r_token=4812b56991883e9913998e8816706912bff911e8; xq_r_token.sig=R6AgMpKf0fhe6GkWdS_etJ0Y3Dw; xq_is_login=1; xq_is_login.sig=J3LxgPVPUzbBg3Kee_PquUfih7Q; u=6146826778; u.sig=h5P6Xki5cmObHzNcRMVufpWUnZc; _gat_gtag_UA_16079156_4=1; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1540866325",
    ]

    # Send the chosen cookie string as a real ``Cookie`` header. The original
    # wrapped each string in dict(cookies_are=...), which made requests send a
    # single cookie literally named "cookies_are" instead of the real cookies.
    request_headers = dict(random.choice(headers))
    request_headers["Cookie"] = random.choice(cookie)

    # The context manager closes the session on every return path, including
    # the timeout path and the fall-through path that used to leak it.
    with requests.Session() as s:
        try:
            obj = s.get(url, headers=request_headers, stream=True, allow_redirects=False, timeout=20)
            # Pause roughly 8-11 seconds after each request.
            time.sleep(8 + random.random() * 3.2)
            bs = BeautifulSoup(obj.content, 'lxml')
        except requests.exceptions.Timeout:
            print([num, "timeout", "timeout"])
            return [num, "timeout", "timeout"]

        closed_nodes = bs.find_all(attrs={"class": "cube-closed"})
        res_current = closed_nodes[0].get_text() if closed_nodes else "未关停!"

        creator_nodes = bs.find_all(attrs={"class": "creator fn-clear"})
        if creator_nodes:
            res_id = creator_nodes[0].attrs["href"]
            print([num, res_id[1:], res_current])
            return [num, res_id[1:], res_current]

        # No creator link: classify the page by its <title>.
        title_node = bs.find("title")
        if title_node is None:
            print([num, "AttributeError", "page_error"])
            return [num, "AttributeError", "page_error"]

        res_404 = title_node.get_text()
        if res_404 == "404_雪球":
            print([num, "NaN", res_404])
            return [num, "NaN", res_404]

        # Any other page without a creator link used to fall off the end of
        # the function and return None; report it explicitly, following the
        # existing "<exception name>, page_error" convention.
        print([num, "IndexError", "page_error"])
        return [num, "IndexError", "page_error"]
# Merge the crawl batches into a single list.
# NOTE(review): `res_0` is not defined anywhere in the visible part of this
# file - presumably the results of a second batch; confirm where it comes from.
res_final = []
res_final.extend(res)
res_final.extend(res_0)

# Drop ids for which the crawl returned no rows at all.
result = [entry for entry in res_final if entry != []]

# Split into successful results and failure markers. A failure marker is the
# pair [uid, "异常"]. The length check matters: a successful crawl that yielded
# exactly one row is a one-element list, and indexing entry[1] on it used to
# raise IndexError.
final = []
except_id = []
for entry in result:
    if len(entry) > 1 and entry[1] == "异常":
        except_id.append(entry)
    else:
        final.append(entry)

# Flatten the per-id row lists into one list of rows.
need = []
for entry in final:
    need.extend(entry)
  • 2
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值