# -*- coding: utf-8 -*-
from urllib import request
import ssl
# NOTE(security): this swaps the ssl module's default HTTPS context factory for
# the unverified one, so HTTPS requests that use the default context in this
# process are made without certificate verification.
ssl._create_default_https_context = ssl._create_unverified_context
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import json
import os
import time
def main():
    """Scrape the balance-sheet pages for a fixed set of stock codes.

    Each code is handed to ``scrapy_finance_indices``, which writes one
    ``balance_sheet_<code>.json`` file into ``detail_dir``.  Because
    ``overwrite`` is False, codes whose result file already exists are
    skipped.
    """
    codes = {
        "000977",
        "002230",
        "603019",
        "300457",
        "002049",
    }
    # Offsets counted back from the current year: [0, 6) requests the
    # current year and the five years before it.
    syear, eyear = 0, 6
    detail_dir = './'
    overwrite = False
    # Placeholders are filled with (stock code, report year) by the scraper.
    detail_url = "https://money.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/%s/ctrl/%s/displaytype/4.phtml"
    tr_xpath = '//*[@id="BalanceSheetNewTable0"]/tbody/tr'
    data_sign = 'balance_sheet'
    # A set has no defined iteration order; sort so every run requests the
    # codes in the same sequence.
    for code in sorted(codes):
        scrapy_finance_indices(code, detail_url, syear, eyear, tr_xpath, data_sign, detail_dir, overwrite)
def _merge_table_rows(rows, date_dict):
    """Fold the text cells of one report table into ``date_dict`` in place.

    ``rows`` is a list of cell-text lists, one per table row.  A row with a
    single cell is a section title for the rows that follow it.  The first
    multi-cell row supplies the report dates; every later multi-cell row is
    stored as ``date_dict[date][section][row title] = value`` (or directly
    under ``date_dict[date]`` while no section title has been seen).
    """
    header_title = None
    report_dates = None
    for cells in rows:
        if not cells:
            continue
        title_key = cells[0]
        # One column means a parent-level (section) title.
        if len(cells) == 1:
            header_title = title_key
            continue
        values = cells[1:]
        # The first data row carries the report dates; finance rows follow.
        if not report_dates:
            report_dates = values
            continue
        # zip() stops at the shorter sequence, so a row with more cells than
        # the date row cannot index past the known dates.
        for report_date, value in zip(report_dates, values):
            info_dict = date_dict.setdefault(report_date, {})
            if header_title:
                target = info_dict.setdefault(header_title, {})
            else:
                target = info_dict
            target[title_key] = value.strip()


def scrapy_finance_indices(code, detail_url, syear, eyear, tr_xpath, data_sign, detail_dir, overwrite):
    """Download one stock's yearly report tables and save them as JSON.

    Args:
        code: stock code substituted into ``detail_url``.
        detail_url: URL template with two ``%s`` placeholders, filled with
            ``(code, year)``.
        syear: first year offset (inclusive) counted back from the current year.
        eyear: last year offset (exclusive) counted back from the current year.
        tr_xpath: XPath selecting the table rows to parse.
        data_sign: prefix of the result file name.
        detail_dir: directory the result file is written to.
        overwrite: when false, an existing result file causes a skip.

    Returns:
        False when nothing was attempted (result file already present, or
        empty year range); True otherwise.

    The result is written once, after every requested year has been
    fetched, so an exception raised by a request does not leave a partial
    file behind that a later run would treat as complete.
    """
    res_file = "%s/%s_%s.json" % (detail_dir, data_sign, code)
    if not overwrite and os.path.isfile(res_file):
        print("skip scrapy, result exist in:" + res_file)
        return False
    if syear >= eyear:
        print("skip scrapy, no vaild year range (%d, %d)" % (syear, eyear))
        return False

    # Year parameters, most recent first.
    nyear = int(time.strftime('%Y', time.localtime()))
    pdate_list = [str(nyear - i) for i in range(syear, eyear)]

    date_dict = {}
    for year_parameter in pdate_list:
        page_url = detail_url % (code, year_parameter)
        # The context manager closes the connection on every path.
        with request.urlopen(page_url, timeout=30) as resp:
            print("request url:" + resp.geturl())
            rt_code = resp.getcode()
            content = resp.read() if rt_code == 200 else None
        if content is None:
            print("error return code: %s" % rt_code)
            continue
        # Use the URL that was actually requested, not the unformatted template.
        response = HtmlResponse(url=page_url, body=content)
        rows = [
            tr_sel.xpath('td//text()').extract()
            for tr_sel in Selector(response=response).xpath(tr_xpath)
        ]
        _merge_table_rows(rows, date_dict)

    write_file(date_dict, res_file)
    return True
def write_file(info_dict, res_file):
    """Write ``info_dict`` to ``res_file`` as a single line of JSON.

    Nothing is written when ``info_dict`` is empty.  I/O failures are
    reported on stdout instead of being raised.  ``json.dumps`` keeps its
    default ASCII escaping, so the file content is pure ASCII.
    """
    if not info_dict:
        return
    # Serialize before opening so a serialization error cannot leave a
    # truncated file behind.
    payload = json.dumps(info_dict) + '\n'
    try:
        # ``with`` closes the handle even when the write itself fails.
        with open(res_file, 'w', encoding='utf-8') as fb:
            fb.write(payload)
    except IOError as err:
        print('IO Error:', err)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
# ===================== 1: risk-score regression on the scraped balance sheets =====================
import pandas as pd
from sklearn.linear_model import LinearRegression
import datetime
# Module-level accumulator: get_risk_score_by_json appends one single-row
# DataFrame per report date it processes.
clear_data = []
def _to_amount(text):
    """Convert a thousands-separated number string such as '1,234.5' to float."""
    return float(text.replace(',', ''))


def get_risk_score_by_json(json_filename, next_date):
    """Fit a linear trend to a company's risk score and extrapolate it.

    For every report date in the JSON file a risk score is computed as
    ``0.5 * debt ratio + 0.3 * current ratio + 0.2 * quick ratio``.  A
    linear regression of score against the report date's timestamp is then
    used to predict the score at ``next_date``.

    Args:
        json_filename: path to a balance-sheet JSON file, read with
            ``pandas.read_json``.  The code assumes the column labels come
            back as timestamps, since it calls ``.timestamp()`` on them.
        next_date: date to predict for, parsed with ``'%Y-%m-%d'``.

    Returns:
        The array returned by ``LinearRegression.predict`` for the single
        ``next_date`` row.

    Raises:
        ValueError: if the file contains no report dates.

    Side effect: each per-date DataFrame is appended to the module-level
    ``clear_data`` list.
    """
    raw_data = pd.read_json(json_filename, encoding='utf-8')
    frames = []
    for date, items in raw_data.items():
        # One single-row frame per report date.
        data = pd.DataFrame({
            '日期': [date],
            '流动资产合计': [_to_amount(items['流动资产']['流动资产合计'])],
            '非流动资产合计': [_to_amount(items['非流动资产']['非流动资产合计'])],
            '资产总计': [_to_amount(items['非流动资产']['资产总计'])],
            '流动负债合计': [_to_amount(items['流动负债']['流动负债合计'])],
            '非流动负债合计': [_to_amount(items['非流动负债']['非流动负债合计'])],
            '负债合计': [_to_amount(items['非流动负债']['负债合计'])],
            '存货': [_to_amount(items['流动资产']['存货'])],
        })
        # The same object is extended below, so the frames kept in
        # clear_data also carry the ratio and score columns.
        clear_data.append(data)
        # Debt-to-asset ratio.
        data['资产负债率'] = data['负债合计'] / data['资产总计']
        # Current ratio.
        data['流动比率'] = data['流动资产合计'] / data['流动负债合计']
        # Quick ratio.
        data['速动比率'] = (data['流动资产合计'] - data['存货']) / data['流动负债合计']
        data['Risk_Score'] = data['资产负债率'] * 0.5 + data['流动比率'] * 0.3 + data['速动比率'] * 0.2
        # Numeric form of the date, used as the regression feature.
        data['日期n'] = data['日期'].apply(lambda x: x.timestamp())
        frames.append(data)

    if not frames:
        raise ValueError("no report dates found in " + str(json_filename))
    # Concatenate once instead of growing a frame inside the loop.
    data_df = pd.concat(frames)

    x_train = data_df[['日期n']]
    y_train = data_df['Risk_Score']
    # Route the target date through pandas so it is converted the same way
    # as the training dates.  A naive datetime.timestamp() uses local time,
    # while pandas treats naive timestamps as UTC.
    target = pd.Timestamp(datetime.datetime.strptime(next_date, '%Y-%m-%d'))
    x_test = pd.DataFrame({'日期n': [target.timestamp()]})

    model = LinearRegression()
    model.fit(x_train, y_train)
    # Predicted Risk_Score for next_date.
    next_date_score = model.predict(x_test)
    return next_date_score
def main():
    """Print each company's predicted risk score, lowest score first.

    For every stock code the score is predicted for "2023-6-30" from the
    file ``balance_sheet_<code>.json`` in the current directory.
    """
    # Each value is [company name, 0]; only the name is read here.
    codes = {
        "000977": ["浪潮信息", 0],
        "002230": ["科大讯飞", 0],
        "603019": ["中科曙光", 0],
        "300457": ["赢合科技", 0],
        "002049": ["紫光国微", 0],
    }
    corporate_risk_score = {}
    for code, items in codes.items():
        label = items[0] + "(" + code + ")"
        # predict() returns an array with one element for the single date.
        corporate_risk_score[label] = get_risk_score_by_json('balance_sheet_' + code + '.json',
                                                             "2023-6-30")[0]
    sorted_dict = sorted(corporate_risk_score.items(), key=lambda x: x[1])
    for k, v in sorted_dict:
        print(k, v)
# Run the risk-score report only when this file is executed as a script.
if __name__ == '__main__':
    main()
# ===================== 1: balance-sheet scraper (repeated copy) =====================
# -*- coding: utf-8 -*-
from urllib import request
import ssl
# NOTE(security): this swaps the ssl module's default HTTPS context factory for
# the unverified one, so HTTPS requests that use the default context in this
# process are made without certificate verification.
ssl._create_default_https_context = ssl._create_unverified_context
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import json
import os
import time
def main():
    """Scrape the balance-sheet pages for a fixed set of stock codes.

    Each code is handed to ``scrapy_finance_indices``, which writes one
    ``balance_sheet_<code>.json`` file into ``detail_dir``.  Because
    ``overwrite`` is False, codes whose result file already exists are
    skipped.
    """
    codes = {
        "000977",
        "002230",
        "603019",
        "300457",
        "002049",
    }
    # Offsets counted back from the current year: [0, 6) requests the
    # current year and the five years before it.
    syear, eyear = 0, 6
    detail_dir = './'
    overwrite = False
    # Placeholders are filled with (stock code, report year) by the scraper.
    detail_url = "https://money.finance.sina.com.cn/corp/go.php/vFD_BalanceSheet/stockid/%s/ctrl/%s/displaytype/4.phtml"
    tr_xpath = '//*[@id="BalanceSheetNewTable0"]/tbody/tr'
    data_sign = 'balance_sheet'
    # A set has no defined iteration order; sort so every run requests the
    # codes in the same sequence.
    for code in sorted(codes):
        scrapy_finance_indices(code, detail_url, syear, eyear, tr_xpath, data_sign, detail_dir, overwrite)
def _merge_table_rows(rows, date_dict):
    """Fold the text cells of one report table into ``date_dict`` in place.

    ``rows`` is a list of cell-text lists, one per table row.  A row with a
    single cell is a section title for the rows that follow it.  The first
    multi-cell row supplies the report dates; every later multi-cell row is
    stored as ``date_dict[date][section][row title] = value`` (or directly
    under ``date_dict[date]`` while no section title has been seen).
    """
    header_title = None
    report_dates = None
    for cells in rows:
        if not cells:
            continue
        title_key = cells[0]
        # One column means a parent-level (section) title.
        if len(cells) == 1:
            header_title = title_key
            continue
        values = cells[1:]
        # The first data row carries the report dates; finance rows follow.
        if not report_dates:
            report_dates = values
            continue
        # zip() stops at the shorter sequence, so a row with more cells than
        # the date row cannot index past the known dates.
        for report_date, value in zip(report_dates, values):
            info_dict = date_dict.setdefault(report_date, {})
            if header_title:
                target = info_dict.setdefault(header_title, {})
            else:
                target = info_dict
            target[title_key] = value.strip()


def scrapy_finance_indices(code, detail_url, syear, eyear, tr_xpath, data_sign, detail_dir, overwrite):
    """Download one stock's yearly report tables and save them as JSON.

    Args:
        code: stock code substituted into ``detail_url``.
        detail_url: URL template with two ``%s`` placeholders, filled with
            ``(code, year)``.
        syear: first year offset (inclusive) counted back from the current year.
        eyear: last year offset (exclusive) counted back from the current year.
        tr_xpath: XPath selecting the table rows to parse.
        data_sign: prefix of the result file name.
        detail_dir: directory the result file is written to.
        overwrite: when false, an existing result file causes a skip.

    Returns:
        False when nothing was attempted (result file already present, or
        empty year range); True otherwise.

    The result is written once, after every requested year has been
    fetched, so an exception raised by a request does not leave a partial
    file behind that a later run would treat as complete.
    """
    res_file = "%s/%s_%s.json" % (detail_dir, data_sign, code)
    if not overwrite and os.path.isfile(res_file):
        print("skip scrapy, result exist in:" + res_file)
        return False
    if syear >= eyear:
        print("skip scrapy, no vaild year range (%d, %d)" % (syear, eyear))
        return False

    # Year parameters, most recent first.
    nyear = int(time.strftime('%Y', time.localtime()))
    pdate_list = [str(nyear - i) for i in range(syear, eyear)]

    date_dict = {}
    for year_parameter in pdate_list:
        page_url = detail_url % (code, year_parameter)
        # The context manager closes the connection on every path.
        with request.urlopen(page_url, timeout=30) as resp:
            print("request url:" + resp.geturl())
            rt_code = resp.getcode()
            content = resp.read() if rt_code == 200 else None
        if content is None:
            print("error return code: %s" % rt_code)
            continue
        # Use the URL that was actually requested, not the unformatted template.
        response = HtmlResponse(url=page_url, body=content)
        rows = [
            tr_sel.xpath('td//text()').extract()
            for tr_sel in Selector(response=response).xpath(tr_xpath)
        ]
        _merge_table_rows(rows, date_dict)

    write_file(date_dict, res_file)
    return True
def write_file(info_dict, res_file):
    """Write ``info_dict`` to ``res_file`` as a single line of JSON.

    Nothing is written when ``info_dict`` is empty.  I/O failures are
    reported on stdout instead of being raised.  ``json.dumps`` keeps its
    default ASCII escaping, so the file content is pure ASCII.
    """
    if not info_dict:
        return
    # Serialize before opening so a serialization error cannot leave a
    # truncated file behind.
    payload = json.dumps(info_dict) + '\n'
    try:
        # ``with`` closes the handle even when the write itself fails.
        with open(res_file, 'w', encoding='utf-8') as fb:
            fb.write(payload)
    except IOError as err:
        print('IO Error:', err)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
# ===================== 1: risk-score regression (repeated copy) =====================
import pandas as pd
from sklearn.linear_model import LinearRegression
import datetime
# Module-level accumulator: get_risk_score_by_json appends one single-row
# DataFrame per report date it processes.
clear_data = []
def _to_amount(text):
    """Convert a thousands-separated number string such as '1,234.5' to float."""
    return float(text.replace(',', ''))


def get_risk_score_by_json(json_filename, next_date):
    """Fit a linear trend to a company's risk score and extrapolate it.

    For every report date in the JSON file a risk score is computed as
    ``0.5 * debt ratio + 0.3 * current ratio + 0.2 * quick ratio``.  A
    linear regression of score against the report date's timestamp is then
    used to predict the score at ``next_date``.

    Args:
        json_filename: path to a balance-sheet JSON file, read with
            ``pandas.read_json``.  The code assumes the column labels come
            back as timestamps, since it calls ``.timestamp()`` on them.
        next_date: date to predict for, parsed with ``'%Y-%m-%d'``.

    Returns:
        The array returned by ``LinearRegression.predict`` for the single
        ``next_date`` row.

    Raises:
        ValueError: if the file contains no report dates.

    Side effect: each per-date DataFrame is appended to the module-level
    ``clear_data`` list.
    """
    raw_data = pd.read_json(json_filename, encoding='utf-8')
    frames = []
    for date, items in raw_data.items():
        # One single-row frame per report date.
        data = pd.DataFrame({
            '日期': [date],
            '流动资产合计': [_to_amount(items['流动资产']['流动资产合计'])],
            '非流动资产合计': [_to_amount(items['非流动资产']['非流动资产合计'])],
            '资产总计': [_to_amount(items['非流动资产']['资产总计'])],
            '流动负债合计': [_to_amount(items['流动负债']['流动负债合计'])],
            '非流动负债合计': [_to_amount(items['非流动负债']['非流动负债合计'])],
            '负债合计': [_to_amount(items['非流动负债']['负债合计'])],
            '存货': [_to_amount(items['流动资产']['存货'])],
        })
        # The same object is extended below, so the frames kept in
        # clear_data also carry the ratio and score columns.
        clear_data.append(data)
        # Debt-to-asset ratio.
        data['资产负债率'] = data['负债合计'] / data['资产总计']
        # Current ratio.
        data['流动比率'] = data['流动资产合计'] / data['流动负债合计']
        # Quick ratio.
        data['速动比率'] = (data['流动资产合计'] - data['存货']) / data['流动负债合计']
        data['Risk_Score'] = data['资产负债率'] * 0.5 + data['流动比率'] * 0.3 + data['速动比率'] * 0.2
        # Numeric form of the date, used as the regression feature.
        data['日期n'] = data['日期'].apply(lambda x: x.timestamp())
        frames.append(data)

    if not frames:
        raise ValueError("no report dates found in " + str(json_filename))
    # Concatenate once instead of growing a frame inside the loop.
    data_df = pd.concat(frames)

    x_train = data_df[['日期n']]
    y_train = data_df['Risk_Score']
    # Route the target date through pandas so it is converted the same way
    # as the training dates.  A naive datetime.timestamp() uses local time,
    # while pandas treats naive timestamps as UTC.
    target = pd.Timestamp(datetime.datetime.strptime(next_date, '%Y-%m-%d'))
    x_test = pd.DataFrame({'日期n': [target.timestamp()]})

    model = LinearRegression()
    model.fit(x_train, y_train)
    # Predicted Risk_Score for next_date.
    next_date_score = model.predict(x_test)
    return next_date_score
def main():
    """Print each company's predicted risk score, lowest score first.

    For every stock code the score is predicted for "2023-6-30" from the
    file ``balance_sheet_<code>.json`` in the current directory.
    """
    # Each value is [company name, 0]; only the name is read here.
    codes = {
        "000977": ["浪潮信息", 0],
        "002230": ["科大讯飞", 0],
        "603019": ["中科曙光", 0],
        "300457": ["赢合科技", 0],
        "002049": ["紫光国微", 0],
    }
    corporate_risk_score = {}
    for code, items in codes.items():
        label = items[0] + "(" + code + ")"
        # predict() returns an array with one element for the single date.
        corporate_risk_score[label] = get_risk_score_by_json('balance_sheet_' + code + '.json',
                                                             "2023-6-30")[0]
    sorted_dict = sorted(corporate_risk_score.items(), key=lambda x: x[1])
    for k, v in sorted_dict:
        print(k, v)
# Run the risk-score report only when this file is executed as a script.
if __name__ == '__main__':
    main()
# ===================== 1: yearly liability ratio per company (uses clear_data) =====================
# Split the flat clear_data list into one slice of frames per company.
companies_data = {}
num_companies = 5
# NOTE(review): despite the name, this value is only used as
# "frames per company minus one" (each slice below holds 21 frames).
# Confirm it matches the number of report dates actually scraped per company.
quarters_per_year = 20
for i in range(num_companies):
    company_start_index = i * (quarters_per_year + 1)
    company_end_index = company_start_index + quarters_per_year + 1
    company_data = clear_data[company_start_index:company_end_index]
    companies_data[f'Company {i+1}'] = company_data

# Print every company's data.
for company, data in companies_data.items():
    print(f"{company} Data:")
    for entry in data:
        print(entry)
    print("=" * 50)

# Compute the liability ratio per year and store the result.
liabilities_ratios_by_year = {}
for company, data_frames in companies_data.items():
    # Per-year running totals of total liabilities and total assets.
    yearly_liabilities = {}
    yearly_assets = {}
    for df in data_frames:
        year = df['日期'].iloc[0].year  # assumes the date column holds timestamps
        yearly_liabilities[year] = yearly_liabilities.get(year, 0) + df['负债合计'].iloc[0]
        yearly_assets[year] = yearly_assets.get(year, 0) + df['资产总计'].iloc[0]
    # Both dicts are filled together and share the same keys, so pair each
    # year's liabilities with the same year's assets.  Two nested ``for``
    # clauses would instead divide the last year's liabilities by every
    # year's assets.
    liabilities_ratios_by_year[company] = {
        year: (yearly_liabilities[year] / assets if assets != 0 else None)
        for year, assets in yearly_assets.items()
    }

# Print the computed ratios.
for company, ratios_by_year in liabilities_ratios_by_year.items():
    print(f"{company} 的负债率如下:")
    for year, ratio in ratios_by_year.items():
        print(f"{year}: {ratio}")
# ===================== 2: liability-ratio trend chart =====================
import pandas as pd
import matplotlib.pyplot as plt
# Liability ratio per company (keyed by stock code) and year.
companies_data = {
    '000977': {
        2022: 0.33782028921390805,
        2021: 0.38122553365530343,
        2020: 0.4269912156296573,
        2019: 0.6147255061188343,
        2018: 0.6601938767030497
    },
    '002230': {
        2022: 0.1937557406842251,
        2021: 0.22159547006297672,
        2020: 0.2717665662231457,
        2019: 0.33819468982170936,
        2018: 0.42798182086290537
    },
    '603019': {
        2022: 0.25472321965381806,
        2021: 0.30979914956635357,
        2020: 0.41427850181554887,
        2019: 0.45239789459122415,
        2018: 0.6687705744490087
    },
    '300457': {
        2022: 0.15622576789820164,
        2021: 0.20946994930684334,
        2020: 0.319094753398101,
        2019: 0.40719460546266983,
        2018: 0.48014811975158794
    },
    '002049': {
        2022: 0.1308696989339981,
        2021: 0.17703936060827447,
        2020: 0.2518915792080338,
        2019: 0.28782042409524033,
        2018: 0.3256419971764099
    }
}

# Draw one line per company.
plt.figure(figsize=(10, 6))
for company, ratios_by_year in companies_data.items():
    # Plot in chronological order regardless of how the dict was written.
    years = sorted(ratios_by_year)
    ratios = [ratios_by_year[year] for year in years]
    plt.plot(years, ratios, marker='o', label=company)
plt.title('Asset Liability Ratio Trends')
plt.xlabel('Year')
plt.ylabel('Asset Liability Ratio')
plt.legend()
plt.grid(True)
plt.show()
# ===================== 3: accounts-receivable bar chart =====================
import json
import os
# Mapping from stock code to the JSON file that holds its balance sheet.
file_mapping = {
    "000977": "balance_sheet_000977.json",
    "002049": "balance_sheet_002049.json",
    "002230": "balance_sheet_002230.json",
    "300457": "balance_sheet_300457.json",
    "603019": "balance_sheet_603019.json"
}

# Parsed JSON content per stock code.
stock_data = {}
for stock_code, file_name in file_mapping.items():
    file_path = os.path.join("./", file_name)  # replace with the directory that holds your data files
    # Read with an explicit encoding instead of the platform default.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    stock_data[stock_code] = data

# Rows of [company code, date as YYYY/MM/DD, accounts-receivable text].
data_list = []
# First level: company code.
for company_code, quarters in stock_data.items():
    # Second level: report date; third level: asset section.
    for quarter_date, assets in quarters.items():
        # A report without the current-assets section is skipped rather
        # than raising KeyError.
        current_assets = assets.get('流动资产', {})
        if '应收账款' in current_assets:
            receivables_value = current_assets['应收账款']
            # Normalise the date separator.
            formatted_date = quarter_date.replace('-', '/')
            data_list.append([company_code, formatted_date, receivables_value])
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType
# Arrange the rows of data_list into the shape the chart needs.
companies = set()
years = [2018, 2019, 2020, 2021, 2022]  # the five years shown in the chart
series_data = {year: [] for year in years}
# Per year: "<company code> <MM/DD>" -> receivables, so all yearly series
# can be aligned to one shared category axis.
value_by_year = {year: {} for year in years}
for entry in data_list:
    company_code = entry[0]
    year_text, _, month_day = entry[1].partition('/')
    year = int(year_text)
    receivables = float(entry[2].replace(',', ''))
    companies.add(company_code)
    if year in series_data:
        series_data[year].append((company_code, receivables))
        value_by_year[year][f'{company_code} {month_day}'] = receivables

# Every (company, report day) that occurs in any of the selected years.
categories = sorted({label for values in value_by_year.values() for label in values})

# Draw the grouped bar chart with pyecharts.
bar = Bar(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
# Set the x axis once.  Calling add_xaxis per year with lists of different
# content leaves the series positions disconnected from the axis labels.
bar.add_xaxis(categories)
for year in years:
    # None marks a company/quarter that has no value in this year.
    bar.add_yaxis(f'{year}', [value_by_year[year].get(label) for label in categories])
bar.set_global_opts(
    title_opts=opts.TitleOpts(title="近五年每个季度的应收账款"),
    xaxis_opts=opts.AxisOpts(axislabel_opts=opts.LabelOpts(rotate=45)),
    datazoom_opts=[opts.DataZoomOpts(range_start=0, range_end=100)],
    toolbox_opts=opts.ToolboxOpts(),
)
bar.render("bar_chart.html")  # save the chart as an HTML file