# -*- coding:utf-8 -*-
import os
import sys
import time
import traceback
import json
import bs4
import pandas as pd
from selenium import webdriver
'''
https://www.jianshu.com/p/4b89c92ff9b4
https://cuiqingcai.com/2577.html
'''
# Diagnostic: show the interpreter's default string encoding (helps debug
# the Chinese-text handling below).
print(sys.getdefaultencoding())
# Module-level headless browser shared by func_load_webpage for all lookups.
# NOTE(review): PhantomJS support was removed from Selenium 4 — this line
# requires an older selenium release and a local phantomjs.exe; confirm the
# pinned environment, or migrate to headless Chrome/Firefox.
driver = webdriver.PhantomJS(executable_path="./bin/phantomjs.exe")
def func_load_webpage(code):
    """Scrape key risk/size metrics for one fund from qieman.com.

    Loads ``https://qieman.com/funds/<code>`` with the module-level
    Selenium ``driver`` and walks the rendered ``<span>``/``<h1>``
    elements: each recognized Chinese label is immediately followed by
    the element holding its value.

    Args:
        code: fund code; must be a string of exactly 6 digits.

    Returns:
        A JSON string (non-ASCII preserved) with keys ``fundname``,
        ``scale``, ``withdrawal``, ``sharp``, ``volatility`` and
        ``code`` — or ``False`` when ``code`` is not a 6-digit string
        (kept for backward compatibility with existing callers).
    """
    if not code.isdigit() or len(code) != 6:
        return False
    base_url = 'https://qieman.com/funds/%s' % code
    dict_select = {
        'fundname': '',    # fund name
        'scale': '',       # latest assets under management
        'withdrawal': '',  # maximum drawdown
        'sharp': '',       # Sharpe ratio
        'volatility': '',  # volatility
        'code': code,      # fund code (echoed back)
    }
    # On-page label text -> result-dict key; replaces a long elif chain.
    label_to_key = {
        u'最新规模': 'scale',
        u'最大回撤': 'withdrawal',
        u'夏普比率': 'sharp',
        u'波动率': 'volatility',
    }
    driver.get(base_url)
    # Feed the unicode page source straight to bs4; re-encoding to bytes
    # only forces bs4 to re-detect the charset and risks mojibake.
    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
    target = ''
    for co in soup.find_all(['span', 'h1']):
        if target != '':
            # Previous element was a known label; this one is its value.
            dict_select[target] = co.text
            if target == 'volatility':
                break  # volatility is the last metric we collect
            target = ''
        if co.name == 'h1':
            dict_select['fundname'] = co.text
        target = label_to_key.get(co.text, target)
    return json.dumps(dict_select, ensure_ascii=False)
if __name__ == "__main__":
    # Funds to scrape; each entry is a 6-digit fund code.
    codelist = ['000216', '000961', '001071', '001513', '001549', '001550',
                '001632', '006751', '003095', '040046', '110011', '110023',
                '150303', '161033', '161122', '161130', '161725', '161903',
                '163406', '163415', '519674']
    rows = []  # one parsed metrics dict per successfully scraped fund
    for item in codelist:
        result = func_load_webpage(item)
        # func_load_webpage returns False for malformed codes; skip those
        # instead of letting json.loads crash on a non-string.
        if result:
            rows.append(json.loads(result))
        time.sleep(1)  # throttle requests to be polite to the server
    # Build the frame in one pass: DataFrame.append was deprecated and
    # removed in pandas 2.0, and appending row-by-row is quadratic.
    df = pd.DataFrame(rows)
    # Write starting at the first column of the sheet, without the index.
    df.to_excel('Fund.xlsx', sheet_name='Data', startcol=0, index=False)
# Trailing article text accidentally pasted from the source blog page;
# kept as a comment so the file remains valid Python:
# Python实现获取(且慢)基金夏普比率爬虫
# 最新推荐文章于 2023-12-04 22:19:30 发布