import sys
import csv
import warnings
warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler, Imputer
reload(sys)
sys.setdefaultencoding('utf8')
from Tools import Tools
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import tree
# from xgboost import XGBRegressor
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
1、忽略警告
import warnings
warnings.filterwarnings("ignore")
2、matplotlib显示中文
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号
3、print中文
print "---------------加载数据----------------".decode('utf-8')
4、读取中文内容的表格
quality_1_df = read_tool.read_csv(filename=file_name.decode('utf-8'), encoding1='gbk', encoding2='utf-8',header=0)
import codecs
import pandas as pd
class Tools(object):
    """Utility helpers for reading CSV files whose file names or column
    names contain non-ASCII (e.g. Chinese) characters."""

    def read_csv(self, filename, encoding1='utf-8', encoding2='utf-8', header=None):
        """Read a CSV file, handling non-ASCII file names and column names.

        ``pd.read_csv`` alone struggles with non-ASCII file names, and a
        plain ``open`` struggles with non-ASCII column headers; opening the
        file via ``codecs.open`` and handing the stream to pandas solves
        both problems at once.

        :param filename: path to the CSV file (may contain non-ASCII chars)
        :param encoding1: encoding used to decode the file; use ``'gbk'``
            when the source file is saved in ANSI format
        :param encoding2: encoding passed through to ``pd.read_csv``
        :param header: row number to use as the column names
            (pandas ``read_csv`` semantics; ``None`` means no header row)
        :return: ``pandas.DataFrame`` with the file contents
        """
        with codecs.open(filename, 'rb', encoding1) as f:
            df = pd.read_csv(f, encoding=encoding2, header=header)
        return df
4、使用pandas处理表格
处理一列的文件
quality_1_df = pd.DataFrame(quality_1_df.loc[lambda x: x['归属团队'.decode('utf-8')] == 'Runners'],columns=['归属团队'.decode('utf-8'), '领域'.decode('utf-8'), '位置'.decode('utf-8'),'提交归属迭代'.decode('utf-8'), '关闭迭代'.decode('utf-8'),'发现活动分类'.decode('utf-8')])
#分割
quality_1_df['迭代修改'.decode('utf-8')] = quality_1_df['提交归属迭代'.decode('utf-8')].str.split('(', expand=True)[0]
quality_1_df['迭代修改'.decode('utf-8')] = quality_1_df['迭代修改'.decode('utf-8')].str.split('/S', expand=True)[1]
#拼接
quality_1_Runners_df['领域/位置'.decode('utf-8')] = quality_1_Runners_df['领域'.decode('utf-8')].str.cat(quality_1_Runners_df['位置'.decode('utf-8')], sep='/')
#删除
quality_1_Runners_df = quality_1_Runners_df.drop('领域'.decode('utf-8'), axis=1)
#新建并,根据其他列设置内容
group处理
team_size = pd.DataFrame(commit_1_df.groupby(
['团队'.decode('utf-8'), '领域'.decode('utf-8'), '迭代'.decode('utf-8')]).sum())
对一列中的数据挨个处理
commit_1_df['完整迭代'.decode('utf-8')] = diedai_S(commit_1_df['迭代修改'.decode('utf-8')])
def diedai_S(df1):
    """Extract the numeric iteration (sprint) number from each entry.

    Each non-float entry is expected to look like ``'<number>-<suffix>'``;
    the part before the first ``'-'`` is converted to ``int``.  Float
    entries (typically the NaN values pandas produces for missing cells)
    are skipped entirely, so the result may be shorter than the input.

    :param df1: iterable of iteration strings (e.g. a pandas Series)
    :return: list of int iteration numbers, missing values omitted
    """
    numbers = []
    for entry in df1:
        # pandas represents missing cells as float NaN -- skip them
        if isinstance(entry, float):
            continue
        numbers.append(int(entry.split('-')[0]))
    return numbers
填充列
commit_1_df['迭代'.decode('utf-8')].fillna('S'.decode('utf-8'), inplace=True)
合并数据
result = pd.merge(quality_1_df, commit_1_df, on=['团队'.decode('utf-8'), '迭代'.decode('utf-8')], how='left')
result = pd.merge(result, code_1_df, on=['团队'.decode('utf-8'), '迭代'.decode('utf-8')], how='left')
#left,以第一个dataframe的索引为索引,第二个空值保留
插入一列
result.insert(5, '迭代'.decode('utf-8'), tmp)
求dataframe的相关系数
pearson相关系数:标准化后的数据求欧氏距离平方并经过简单的线性变化
1)pearson:相关系数来衡量两个数据集合是否在一条线上面,即针对线性数据的相关系数计算,针对非线性数据便会有误差。
2)kendall:用于反映分类变量相关性的指标,即针对无序序列的相关系数,非正态分布的数据
3)spearman:非线性的,非正态分布的数据的相关系数
result = read_tool.read_csv(filename=file_name.decode('utf-8'), encoding1='utf-8', encoding2='utf-8',header=0)
print '---------------corr-----------------'
result_pearson_corr = result.corr()
result_kendall_corr = result.corr(method='kendall')
result_spearman_corr = result.corr(method='spearman')
result_spearman_corr.to_csv("../../../../resource/处理后/合并/相关系数/spearman_corr.csv".decode('utf-8'), encoding='utf-8')
5、sklearn 求特征重要性
处理CSV文件变为可训练(np形式的)
def get_data_1(index1, index2, target_index):
    """Load the merged result CSV and build a sklearn-style Bunch dataset.

    Rows ``[index1:index2)`` of the CSV are used as samples; columns from
    index 5 onward are the features and column ``target_index`` is the
    target.  Empty feature cells become ``np.nan`` (to be filled by an
    Imputer later); values like ``'3.0'`` have their fractional part
    dropped before the ``int`` conversion.

    :param index1: first row (inclusive) of the slice to load
    :param index2: last row (exclusive) of the slice to load
    :param target_index: column index of the target variable
    :return: ``Bunch`` with ``data``, ``target``, ``feature_names`` and
        ``target_names``
    """
    file_name = "../../../../resource/处理后/合并/result.csv".decode('utf-8')
    # use a context manager so the file handle is always closed
    # (the original left it open)
    with open(file_name, "r") as csv_file:
        cancer = np.array([row for row in csv.reader(csv_file)])
    attribute_names = cancer[0, 5:]  # header row, feature columns only
    attribute_data = cancer[index1:index2, 5:]
    data = []
    for row in attribute_data:
        temp = []
        for cell in row:
            if not cell:
                # empty cell -> NaN so an Imputer can fill it downstream
                temp.append(np.nan)
            else:
                # a string such as '3.0' cannot be fed to int() directly;
                # drop the fractional part first
                if '.' in cell:
                    cell = cell.split('.')[0]
                temp.append(int(cell))
        data.append(temp)
    target = []
    # NOTE(review): target cells are assumed to be plain integers --
    # a '3.0'-style target would raise here; confirm against the data.
    for value in cancer[index1:index2, target_index]:
        target.append(int(value))
    target_names = cancer[0, target_index]
    from sklearn.datasets.base import Bunch
    real_data = Bunch(data=data, target=target,
                      feature_names=attribute_names, target_names=target_names)
    return real_data
CSV到np
csvFile = open(file_name, "r")
csv_data = csv.reader(csvFile)
cancer = np.array([i for i in csv_data])
将训练数据字符串形式的,变为int,空变为np.nan
attribute_names = cancer[0, 5:]
data = []
for i in attribute_data:
temp = []
for j in i:
if not j:
temp.append(np.nan)
else:
if '.' in j:
j = j.split('.')[0]
#形如'3.0'的字符串不能直接转成int,需先去掉小数部分再转成int
temp.append(int(j))
data.append(temp)
训练数据集
from sklearn.datasets.base import Bunch
real_data = Bunch(data=data, target=target, feature_names=attribute_names, target_names=target_names)
X, y = data.data, data.target
feature_names = data.feature_names
target_names = data.target_names
数据集中空值处理并归一化
imp = Imputer(strategy='mean')
X, y = data.data, data.target
feature_names = data.feature_names
target_names = data.target_names
X2 = imp.fit_transform(X)
X = MinMaxScaler().fit_transform(X2)
使用算法模型,训练数据,获得重要性
rf = RandomForestRegressor()
rf.fit(X, y)
importance = rf.feature_importances_
#重要性数据
important_data = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), feature_names), reverse=True)
for (x, y) in important_data:
print x, y.decode('utf-8')
rf = tree.DecisionTreeClassifier()
rf.fit(X, y)
important_data = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), feature_names), reverse=True)
6、使用matplotlib画图
import matplotlib.pyplot as plt
plt.subplot(2, 1, 1)
plt.title(target_names.decode('utf-8') + "与各特征之间的关系(决策树):".decode('utf-8'))
x_tick = [x.decode('utf-8') for x in feature_names]
plt.bar(x_tick, importance, width=0.35)
#设置横坐标倾斜
plt.xticks(x_tick, x_tick, rotation=30)
plt.show()
7、将字符串写入TXT文件
text = ''
for i in range(1, 5):
data_1 = get_data_1(index1, index2, i)
text += handle_data_1(data_1)
print text
write_file_path = "../../../../resource/处理后/合并/important_{3}{0}to{1}_{2}.txt".decode('utf-8').format(str(index1),str(index2),str(i),'tree')
with open(write_file_path, 'w') as f:
f.write(text)
8、format使用
write_file_path = "../../../../resource/处理后/合并/important_{3}{0}to{1}_{2}.txt".decode('utf-8').format(str(index1),str(index2),str(i),'tree')