毕设部分代码（测试）：处理专利数据+经纬度计算距离+插值法构建利率曲线，后续补充

輕栀

已于 2023-12-12 14:24:45 修改

阅读量104

点赞数

文章标签： python 算法

于 2023-10-15 16:52:36 首次发布

本文链接：https://blog.csdn.net/mnwl12_0/article/details/133844579

版权

# https://zhuanlan.zhihu.com/p/365897117
# https://blog.csdn.net/weixin_50646402/article/details/130780623
# https://blog.csdn.net/jtj2002/article/details/131263607

# Patent application counts per company (Stkcd) and period (Accper).
#
# The character at index 4 of each application number (AppliNo) is used as the
# patent-type code: '1' is counted into GPfaming and '2' into GPshiyong
# (presumably invention vs. utility-model patents -- confirm against the data
# dictionary). GPtotal is the sum of the two.
#
# Fixes relative to the earlier loop version:
#   * the type code is compared as a string ('1'/'2'); comparing the character
#     to the integers 1/2 never matched, so every count was 0;
#   * counting is restricted to the rows of the given company AND period
#     (the old inner loop walked all periods of the company);
#   * exactly one output row is produced per (Stkcd, Accper) pair instead of
#     one per source row;
#   * the per-row debug prints and the unused debug counter are gone.

# astype(str) lets this work whether AppliNo was loaded as text or as numbers;
# values shorter than 5 characters yield NaN and are counted in neither type.
patent_type = jian_zhuanlidata['AppliNo'].astype(str).str[4]

result = (
    pd.DataFrame({
        'Stkcd': jian_zhuanlidata['Stkcd'],
        'Accper': jian_zhuanlidata['Accper'],
        'GPfaming': (patent_type == '1').astype(int),
        'GPshiyong': (patent_type == '2').astype(int),
    })
    # sort=False keeps the groups in order of first appearance in the data.
    .groupby(['Stkcd', 'Accper'], sort=False)[['GPfaming', 'GPshiyong']]
    .sum()
    .reset_index()
)
result['GPtotal'] = result['GPfaming'] + result['GPshiyong']

# Same column order as the original result table.
result = result[['Stkcd', 'Accper', 'GPtotal', 'GPfaming', 'GPshiyong']]
print(result)


# for code in jian_zhaunlidata['Stkcd'].unique():
#     format_jian_zhaunlidata = jian_zhaunlidata[jian_zhaunlidata['Stkcd'] == code]
#     for nianfen in format_jian_zhaunlidata['Accper']:
#         nianfen_format_jian_zhaunlidata = format_jian_zhaunlidata[format_jian_zhaunlidata['Accper'] == nianfen]
#         nianfen_format_jian_zhaunlidata['GPtotal'] = len(nianfen_format_jian_zhaunlidata.index)
#         nianfen_format_jian_zhaunlidata['GPfaming'] = len(nianfen_format_jian_zhaunlidata[nianfen_format_jian_zhaunlidata['AppliNo'] // 10000 % 10 == 1].index)
#         nianfen_format_jian_zhaunlidata['GPshiyong'] = len(nianfen_format_jian_zhaunlidata[nianfen_format_jian_zhaunlidata['AppliNo'] // 10000 % 10 == 2].index)
#         result = pd.concat([result, nianfen_format_jian_zhaunlidata], ignore_index=True)

# 输出结果
# print(result.head())

经纬度查询距离：

pip install cpca
from geopy.distance import geodesic
 
# geodesic() takes two points, each given as a (latitude, longitude) tuple.
coord_1 = (39.917978, 116.396288)  # Tiananmen, Beijing
coord_2 = (31.230416, 121.473701)  # downtown Shanghai

# Geodesic distance between the two points, expressed in kilometres.
distance = geodesic(coord_1, coord_2).kilometers

# Report the distance rounded to two decimals.
print(f"距离为：{distance:.2f}千米")

pip install geopy

pip install chinese_province_city_area_mapper
# Option 1: pass a custom mapping directly to CPCATransformer
# (here the district "朝阳区" is mapped to the city "北京市").
# NOTE(review): CPCATransformer is not imported and location_str is not
# defined anywhere in this snippet -- confirm where they come from.
cpca = CPCATransformer({"朝阳区":"北京市"})
df = cpca.transform(location_str)
 
# Option 2: use the default address dictionary from the built-in umap module
from chinese_province_city_area_mapper import myumap
cpca = CPCATransformer(myumap.umap)
df = cpca.transform(location_str)
from chinese_province_city_area_mapper.infrastructure import SuperMap
# District-to-city mapping: a dict (key = district name, value = the city it
# belongs to). Note that it contains duplicated district names.
SuperMap.area_city_mapper
# List of duplicated district names; if a district name is in this list, the
# area_city_mapper lookup for it is not reliable.
SuperMap.rep_areas
# City-to-province mapping: a dict (key = city name, value = province name).
SuperMap.city_province_mapper
# Coordinates of provinces/cities/districts nationwide: a dict
# (key = "province,city,district", value = (latitude, longitude)).
SuperMap.lat_lon_mapper
# Look up the coordinates of Chaoyang District, Beijing.
SuperMap.lat_lon_mapper.get("北京市,北京市,朝阳区")

插值法构建整个区间内的利率曲线:

from scipy.interpolate import CubicSpline
import numpy as np

import pandas as pd  # used for the result table; harmless if already imported

# Quoted tenors and the matching spot (zero) yields, one rate per tenor.
# Source per the original note: ChinaBond spot yields, taken from Eastmoney's
# bond database (rate trends -> spot yield).
tenors = ['1M', '3M', '6M', '9M', '1Y', '2Y', '3Y', '5Y', '7Y', '10Y', '15Y', '20Y', '30Y', '40Y', '50Y']
zeroRates = [1.7394,2.1815,2.2711,2.2586,2.2743,2.3635,2.4480,2.5514,2.7146,2.7103,2.8371,2.8882,3.0855,3.2349,3.2833]

# Convert every tenor label to a maturity in years ('M' = months, 'Y' = years).
tenors_in_years = []
for tenor in tenors:
    unit = tenor[-1]
    if unit == 'M':
        tenors_in_years.append(int(tenor[:-1])/12)
    elif unit == 'Y':
        tenors_in_years.append(int(tenor[:-1]))
    else:
        # Silently skipping a label would leave the x and y arrays with
        # different lengths and fail later with a less helpful message.
        raise ValueError("unsupported tenor unit in {!r}".format(tenor))

# Cubic-spline interpolation of the spot curve through the quoted points.
cs = CubicSpline(tenors_in_years, zeroRates)

# Evaluate the curve on a monthly grid from 1M to 600M (50 years).
# The grid is built from integer month numbers: np.arange with a fractional
# step has a length that depends on floating-point rounding, which could make
# the rate array and the label list disagree in size.
months = np.arange(1, 601)
spotRates = cs(months / 12)

# One label per grid point: '1M', '2M', ..., '600M'.
sequence = [f"{m}M" for m in range(1, 601)]

# Result table: term label and interpolated yield.
df = pd.DataFrame({'term': sequence, 'ytm': spotRates})
df  # bare expression so a notebook cell displays the table

-- Drop the date_time and bondCode columns from ks_crawler1.k_cbt_daily_report.
ALTER TABLE ks_crawler1.k_cbt_daily_report
DROP COLUMN date_time,
DROP COLUMN bondCode;
-- Delete every row whose bondCode is not NULL.
-- NOTE(review): this statement and the SELECT below filter on bondCode, which
-- the ALTER TABLE above drops. Run top to bottom they would presumably fail on
-- the missing column -- confirm the intended execution order before running.
DELETE FROM ks_crawler1.k_cbt_daily_report WHERE bondCode IS NOT NULL;
-- Count the rows that still have a non-NULL bondCode.
SELECT  count(*)  FROM ks_crawler1.k_cbt_daily_report WHERE bondCode IS NOT NULL;