一、pandas实现数据处理
1.pandas读取数据
import pandas as pd
io = r'D:\study\多标签企业数据(推荐用).xlsx'
data = pd.read_excel(io, sheet_name = 0)
data.head(10)
结果如下图:
发现字段里面出现冗余数据‘-’,故尝试用0替换掉(上图数据量有限,未展示完全)
2.数据清洗
import re
registered_capital = data['注册资本']
capital = []
for i in registered_capital:
m = str(i).replace('-','0') #匹配到“-”并替换为“0”
now = re.match("\d*", m).group() #group(0) 同 group() 就是匹配正则表达式整体结果
capital.append(now)
data['注册资本新'] = pd.DataFrame(capital) #数据存入Excel列
data.to_excel(io) #数据保存
3.货币按汇率转换,并写入Excel
import regex
new_money = []
for item in data['注册资本新']:
m1=regex.compile('\d+\.?\d*')
money=float(m1.findall(item)[0])
print(money)
if '人民币' in item: money = money
if '美' in item: money = money * 6.4
if '欧' in item:money = money * 7.8
if '英' in item:money = money * 9.1
if '法' in item:money = money * 7.1
if '德' in item:money = money * 10.97
if '加元' in item:money = money * 5.3
if '新加坡' in item:money = money * 4.82
if '港' in item:money = money * 0.82
if '台币' in item:money = money * 0.232
money2 = str(round(money,1)) #取小数点后一位
new_money.append(money2)
data['汇率转换后的注册资本'] = pd.DataFrame(new_money)
data.to_excel(io)
二.企业经营领域划分
根据企业所属行业和经营范围匹配领域知识库,划分领域
import os
import numpy as np
import pandas as pd
class RecommendTo:
def __init__(self):
self.projectfilename = "D:/智慧园区20210426数据整合/多标签企业数据(推荐用).xlsx"
self.__keyworddict = self.test02()
projs = self.test()
projs = self.__evaluate_projects(projs)
self.myprojs = projs
def test02(self):
dir = "D:/NoteBook/area_keywords/"
keywordfilenames = []
for filename in os.listdir(dir):
if filename.endswith(".csv"):
keywordfilenames.append(filename)
else:
continue
keyworddict = {}
for fname in keywordfilenames:
keyworddict[fname.split('.')[0]] = np.concatenate(pd.read_csv(dir + fname).values)
return keyworddict
def test(self):
projectfilename = self.projectfilename
dir = "D:/NoteBook/area_keywords/"
projs = pd.read_excel(projectfilename)
return projs
def __evaluate_projects(self, projs):
# evaluate projects
npprojs = projs.values
projs_strs = []
for item in npprojs:
s = ""
for area, keywords in self.__keyworddict.items():
count = 0
for keyword in keywords:
try:
# print(item[2],item[3])
if item[9].find(keyword) != -1:
count += 1
if item[12].find(keyword) != -1:
count += 1
except:
pass
if count != 0:
s += area + ":" + str(count) + ","
s = s.strip(",")
projs_strs.append(s)
projs["领域符合度"] = pd.DataFrame(projs_strs)
projs.to_excel(self.projectfilename)
return projs
RecommendTo()
三.数据展示
data.head()
1.领域符合度计算
import re
usedate = data2['领域符合度']
shuju = []
for i in usedate:
try :
result = i.split(':')
a=len(result)-1
shuju.append(a)
except:
a = 0
shuju.append(a)
data['领域数量'] = pd.DataFrame(shuju)
data.to_excel(io)
2.数据描述
data2.describe()
可以发现参保人数并没有展示出来,去excel发现数值需转换为数字
数据清理,替换掉“-”
import re
insured = data2['参保人数']
insured_people = []
for i in insured:
m = str(i).replace('-','0') #匹配到“-”并替换为“0”
now = re.match("\d*", m).group() #group(0) 同 group() 就是匹配正则表达式整体结果
insured_people.append(now)
data2['参保人数新'] = pd.DataFrame(insured_people) #数据存入Excel列
data2.to_excel(r'D:\\多标签企业数据(推荐用).xlsx')
round(data2.describe(),1) #数据描述,并保留一位小数
由于极大值和极小值相差较大,且平均数受影响较大,于是考虑采用中位数,分别对注册资本和参保人数进行数据缺失补全。
注册资金根据中位数缺失数据补全:
import re
shiyong=data2['注册资本新']
shuju = []
for i in shiyong:
if i == 0:
i = 1000
shuju.append(i)
else:
i = i
shuju.append(i)
# print(shuju)
data2['注册资本补全'] = pd.DataFrame(shuju)
data2.to_excel(r'D:\多标签企业数据(推荐用).xlsx')
参保人数根据中位数缺失数据补全:
import re
shiyong=data2['参保人数新']
shuju = []
for i in shiyong:
if i == 0:
i = 3
shuju.append(i)
else:
i = i
shuju.append(i)
# print(shuju)
data2['参保人数补全'] = pd.DataFrame(shuju)
data2.to_excel(r'D:/多标签企业数据(推荐用).xlsx')
当前数据格式
data2.head(10)
四.数据权重归一化
from sklearn.preprocessing import MinMaxScaler
x1 = data[['注册资本补全']]
x2 = data[['参保人数补全']]
x3 = data[['领域数量']]
min_max_scaler = MinMaxScaler()
X_train_minmax1 = min_max_scaler.fit_transform(x1)#注册资本归一化后的结果
X_train_minmax2 = min_max_scaler.fit_transform(x2)#参保人数归一化后的结果
X_train_minmax3 = min_max_scaler.fit_transform(x3)#领域数量归一化后的结果
score = 10*(X_train_minmax1+X_train_minmax2+X_train_minmax3)+84 #数据加权化到[0,100]
round(score.max(),1) #保留一位小数
输出:99.0
score.min() 的输出为84.0,就是说整体权重划分到[84.100],在图上展示结果
import seaborn as sns
ax = sns.distplot(score)
ax.set_title('score')
归一化和加权得分写入Excel
data['score'] = pd.DataFrame(score)
data.to_excel(io)
data.head()
得到的最终结果,如下图所示
以上就是对通过数据清洗、补全、归一化、加权得到的最终企业综合评分。