# -*- coding:utf-8 -*-
import os
import sys
import time
import traceback
import json
import bs4
import pandas as pd
from selenium import webdriver
'''
https://www.jianshu.com/p/4b89c92ff9b4
https://cuiqingcai.com/2577.html
'''
# Diagnostic: show the interpreter's default string encoding (helps debug
# the Chinese-text handling below).
print(sys.getdefaultencoding())
# Module-level headless browser shared by func_load_webpage for all lookups.
# NOTE(review): PhantomJS support was removed from Selenium 4 — this line
# requires an older selenium release and a local phantomjs.exe; confirm the
# pinned environment, or migrate to headless Chrome/Firefox.
driver = webdriver.PhantomJS(executable_path="./bin/phantomjs.exe")
def func_load_webpage(code):
    """Scrape key risk/size metrics for one fund from qieman.com.

    Loads ``https://qieman.com/funds/<code>`` with the module-level
    Selenium ``driver`` and walks the rendered ``<span>``/``<h1>``
    elements: each recognized Chinese label is immediately followed by
    the element holding its value.

    Args:
        code: fund code; must be a string of exactly 6 digits.

    Returns:
        A JSON string (non-ASCII preserved) with keys ``fundname``,
        ``scale``, ``withdrawal``, ``sharp``, ``volatility`` and
        ``code`` — or ``False`` when ``code`` is not a 6-digit string
        (kept for backward compatibility with existing callers).
    """
    if not code.isdigit() or len(code) != 6:
        return False
    base_url = 'https://qieman.com/funds/%s' % code
    dict_select = {
        'fundname': '',    # fund name
        'scale': '',       # latest assets under management
        'withdrawal': '',  # maximum drawdown
        'sharp': '',       # Sharpe ratio
        'volatility': '',  # volatility
        'code': code,      # fund code (echoed back)
    }
    # On-page label text -> result-dict key; replaces a long elif chain.
    label_to_key = {
        u'最新规模': 'scale',
        u'最大回撤': 'withdrawal',
        u'夏普比率': 'sharp',
        u'波动率': 'volatility',
    }
    driver.get(base_url)
    # Feed the unicode page source straight to bs4; re-encoding to bytes
    # only forces bs4 to re-detect the charset and risks mojibake.
    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
    target = ''
    for co in soup.find_all(['span', 'h1']):
        if target != '':
            # Previous element was a known label; this one is its value.
            dict_select[target] = co.text
            if target == 'volatility':
                break  # volatility is the last metric we collect
            target = ''
        if co.name == 'h1':
            dict_select['fundname'] = co.text
        target = label_to_key.get(co.text, target)
    return json.dumps(dict_select, ensure_ascii=False)
if __name__ == "__main__":
    # Funds to scrape; each entry is a 6-digit fund code.
    codelist = ['000216', '000961', '001071', '001513', '001549', '001550',
                '001632', '006751', '003095', '040046', '110011', '110023',
                '150303', '161033', '161122', '161130', '161725', '161903',
                '163406', '163415', '519674']
    rows = []  # one parsed metrics dict per successfully scraped fund
    for item in codelist:
        result = func_load_webpage(item)
        # func_load_webpage returns False for malformed codes; skip those
        # instead of letting json.loads crash on a non-string.
        if result:
            rows.append(json.loads(result))
        time.sleep(1)  # throttle requests to be polite to the server
    # Build the frame in one pass: DataFrame.append was deprecated and
    # removed in pandas 2.0, and appending row-by-row is quadratic.
    df = pd.DataFrame(rows)
    # Write starting at the first column of the sheet, without the index.
    df.to_excel('Fund.xlsx', sheet_name='Data', startcol=0, index=False)
# Trailing article text accidentally pasted from the source blog page;
# kept as a comment so the file remains valid Python:
# Python实现获取(且慢)基金夏普比率爬虫
# 最新推荐文章于 2023-12-04 22:19:30 发布