Notes from a small data-analysis project

import sys
import csv
import warnings

warnings.filterwarnings("ignore")
from sklearn.ensemble import RandomForestRegressor
# Imputer lived in sklearn.preprocessing in old scikit-learn; it was removed in 0.22
from sklearn.preprocessing import MinMaxScaler, Imputer

# Python 2 only: force the default string encoding to UTF-8
reload(sys)
sys.setdefaultencoding('utf8')
from Tools import Tools
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn import tree

# from xgboost import XGBRegressor

plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

1. Suppressing warnings

import warnings
warnings.filterwarnings("ignore")

2. Displaying Chinese text in matplotlib

plt.rcParams['font.sans-serif'] = ['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly

3. Printing Chinese text (Python 2)

print "---------------加载数据----------------".decode('utf-8')

4. Reading a CSV that contains Chinese

quality_1_df = read_tool.read_csv(filename=file_name.decode('utf-8'), encoding1='gbk', encoding2='utf-8',header=0)
import codecs
import pandas as pd
class Tools(object):
    # def __init__(self):

    def read_csv(self, filename, encoding1='utf-8', encoding2='utf-8', header=None):
        """
        直接采用pd.read_csv难以处理文件名中带有中文的文件,需要借助open函数。
        直接采用with open('xx.csv') as f 的方式难以处理csv的中文列名。
        故最终采用with + codecs.open,能够同时解决上述两个问题。
        注意:如果源文件是ansi格式,则应将encoding1设置为'gbk'.
        """
        with codecs.open(filename, 'rb', encoding1) as f:
            df = pd.read_csv(f, encoding=encoding2, header=header)
        return df

5. Working with tables in pandas

Filtering rows and selecting columns

    quality_1_df = pd.DataFrame(
        quality_1_df.loc[lambda x: x['归属团队'.decode('utf-8')] == 'Runners'],
        columns=['归属团队'.decode('utf-8'), '领域'.decode('utf-8'), '位置'.decode('utf-8'),
                 '提交归属迭代'.decode('utf-8'), '关闭迭代'.decode('utf-8'), '发现活动分类'.decode('utf-8')])
# split a column
    quality_1_df['迭代修改'.decode('utf-8')] = quality_1_df['提交归属迭代'.decode('utf-8')].str.split('(', expand=True)[0]
    quality_1_df['迭代修改'.decode('utf-8')] = quality_1_df['迭代修改'.decode('utf-8')].str.split('/S', expand=True)[1]
# concatenate two columns
    quality_1_Runners_df['领域/位置'.decode('utf-8')] = quality_1_Runners_df['领域'.decode('utf-8')].str.cat(quality_1_Runners_df['位置'.decode('utf-8')], sep='/')
# drop a column
    quality_1_Runners_df = quality_1_Runners_df.drop('领域'.decode('utf-8'), axis=1)
# create a new column whose values are derived from other columns (sketch below)
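
The note above stops before showing the code. A minimal sketch, with hypothetical column names and data, of deriving a new column from existing ones:

import numpy as np
import pandas as pd

df = pd.DataFrame({'team': ['Runners', 'Walkers'], 'bugs': [12, 3]})  # made-up data
# option 1: vectorized arithmetic on whole columns
df['bugs_per_member'] = df['bugs'] / 5.0
# option 2: conditional values with np.where
df['busy'] = np.where(df['bugs'] > 10, 'yes', 'no')
# option 3: row-wise logic with apply (slower, but flexible)
df['label'] = df.apply(lambda row: '%s-%d' % (row['team'], row['bugs']), axis=1)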

groupby operations

team_size = pd.DataFrame(commit_1_df.groupby(
        ['团队'.decode('utf-8'), '领域'.decode('utf-8'), '迭代'.decode('utf-8')]).sum())

Processing a column element by element

commit_1_df['完整迭代'.decode('utf-8')] = diedai_S(commit_1_df['迭代修改'.decode('utf-8')])

def diedai_S(df1):
    list_tmp = []
    for s in df1:  # avoid shadowing the builtin str
        if type(s) == float:
            # NaN cells come through as float; append NaN to keep the rows aligned
            tmp = np.nan
        else:
            # e.g. '12-3' -> 12
            tmp = int(s.decode('utf-8').split('-')[0])
        list_tmp.append(tmp)
    return list_tmp

Filling a column

commit_1_df['迭代'.decode('utf-8')].fillna('S'.decode('utf-8'), inplace=True)

Merging data

result = pd.merge(quality_1_df, commit_1_df, on=['团队'.decode('utf-8'), '迭代'.decode('utf-8')], how='left')
result = pd.merge(result, code_1_df, on=['团队'.decode('utf-8'), '迭代'.decode('utf-8')], how='left')
# how='left': keep every row of the first DataFrame; unmatched right-side values stay NaN
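
A minimal sketch of the how='left' semantics, using made-up data:

import pandas as pd

left = pd.DataFrame({'team': ['A', 'B'], 'iteration': [1, 2]})
right = pd.DataFrame({'team': ['A'], 'iteration': [1], 'loc': [500]})
merged = pd.merge(left, right, on=['team', 'iteration'], how='left')
print(merged)
#   team  iteration    loc
# 0    A          1  500.0
# 1    B          2    NaN   <- unmatched row kept, right-side column becomes NaN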

Inserting a column

result.insert(5, '迭代'.decode('utf-8'), tmp)

Computing correlation coefficients on a DataFrame

Pearson correlation: take the squared Euclidean distance between the standardized (z-scored) data and apply a simple linear transformation.
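
The formula image is lost; the relation described above is r = 1 - d²/(2n) for z-scored vectors of length n, where d is their Euclidean distance. A minimal numpy check with made-up data:

import numpy as np

x = np.array([1.0, 2.0, 3.0, 5.0, 8.0])
y = np.array([2.0, 3.0, 5.0, 8.0, 13.0])
n = len(x)

zx = (x - x.mean()) / x.std()  # z-score with population std (ddof=0)
zy = (y - y.mean()) / y.std()

d2 = ((zx - zy) ** 2).sum()          # squared Euclidean distance
r_from_distance = 1 - d2 / (2 * n)   # the simple linear transformation
r_numpy = np.corrcoef(x, y)[0, 1]

print(r_from_distance)  # identical to the next line up to floating-point error
print(r_numpy)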


1) pearson: measures how well two data sets lie on a single straight line, i.e. a correlation coefficient for linear data; applied to nonlinear data it is biased.

2) kendall: a rank-based measure of association for categorical variables, i.e. a correlation coefficient for unordered sequences and non-normally distributed data.

3) spearman: a rank correlation for nonlinear, non-normally distributed data (see the sketch below).
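
A minimal sketch of the linear-vs-monotonic distinction, with made-up data: Spearman reports a perfect monotonic relation where Pearson does not.

import pandas as pd

df = pd.DataFrame({'x': range(1, 11)})
df['y'] = df['x'] ** 3  # monotonic but nonlinear

print(df.corr(method='pearson').loc['x', 'y'])   # < 1.0: linearity is violated
print(df.corr(method='spearman').loc['x', 'y'])  # exactly 1.0: the ranks match perfectly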

result = read_tool.read_csv(filename=file_name.decode('utf-8'), encoding1='utf-8', encoding2='utf-8',header=0)
print '---------------corr-----------------'
result_pearson_corr = result.corr()
result_kendall_corr = result.corr(method='kendall')
result_spearman_corr = result.corr(method='spearman')
result_spearman_corr.to_csv("../../../../resource/处理后/合并/相关系数/spearman_corr.csv".decode('utf-8'), encoding='utf-8')

6. Feature importance with sklearn

Turning a CSV file into trainable (numpy-form) data

def get_data_1(index1, index2, target_index):
    file_name = "../../../../resource/处理后/合并/result.csv".decode('utf-8')
    with open(file_name, "r") as csvFile:
        csv_data = csv.reader(csvFile)
        cancer = np.array([i for i in csv_data])
    attribute_names = cancer[0, 5:]            # header row holds the feature names
    attribute_data = cancer[index1:index2, 5:]
    data = []
    for i in attribute_data:
        temp = []
        for j in i:
            if not j:
                temp.append(np.nan)            # empty cell -> NaN
            else:
                if '.' in j:
                    j = j.split('.')[0]        # int() cannot parse '3.0'; drop the fraction
                temp.append(int(j))
        data.append(temp)
    target = []
    for i in cancer[index1:index2, target_index]:
        target.append(int(i))
    target_names = cancer[0, target_index]
    from sklearn.datasets.base import Bunch    # newer sklearn exposes Bunch via sklearn.utils
    real_data = Bunch(data=data, target=target, feature_names=attribute_names, target_names=target_names)
    return real_data

CSV to numpy array

with open(file_name, "r") as csvFile:
    csv_data = csv.reader(csvFile)
    cancer = np.array([i for i in csv_data])

Converting the string-valued training data to int; empty cells become np.nan

attribute_names = cancer[0, 5:]
data = []
for i in attribute_data:
    temp = []
    for j in i:
        if not j:
            temp.append(np.nan)
        else:
            if '.' in j:
                # int() cannot parse a string like '3.0' directly,
                # so strip the fractional part first
                j = j.split('.')[0]
            temp.append(int(j))
    data.append(temp)

Building the training dataset

from sklearn.datasets.base import Bunch
real_data = Bunch(data=data, target=target, feature_names=attribute_names, target_names=target_names)
X, y = real_data.data, real_data.target
feature_names = real_data.feature_names
target_names = real_data.target_names

Imputing missing values and scaling the dataset

imp = Imputer(strategy='mean')        # replace each NaN with the column mean
X2 = imp.fit_transform(X)
X = MinMaxScaler().fit_transform(X2)  # scale every feature into [0, 1]
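
Imputer was deprecated in scikit-learn 0.20 and removed in 0.22; on a current install the equivalent is SimpleImputer. A self-contained sketch with a toy matrix:

import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0, 2.0], [np.nan, 3.0], [7.0, np.nan]])  # toy matrix with missing cells
X2 = SimpleImputer(strategy='mean').fit_transform(X)      # NaN -> column mean
X = MinMaxScaler().fit_transform(X2)                      # scale into [0, 1]
print(X)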

Fit a model on the training data and read off the feature importances

rf = RandomForestRegressor()
rf.fit(X, y)
importance = rf.feature_importances_
# importance scores, sorted highest first; avoid reusing y as the loop variable
important_data = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), feature_names), reverse=True)
for (score, name) in important_data:
    print score, name.decode('utf-8')



rf = tree.DecisionTreeClassifier()
rf.fit(X, y)
important_data = sorted(zip(map(lambda x: round(x, 4), rf.feature_importances_), feature_names), reverse=True)

7. Plotting with matplotlib

import matplotlib.pyplot as plt
plt.subplot(2, 1, 1)
plt.title(target_names.decode('utf-8') + "与各特征之间的关系(决策树):".decode('utf-8'))
x_tick = [x.decode('utf-8') for x in feature_names]
plt.bar(x_tick, importance, width=0.35)
# rotate the x-axis tick labels
plt.xticks(x_tick, x_tick, rotation=30)
plt.show()

8. Writing a string to a TXT file

text = ''
for i in range(1, 5):
    data_1 = get_data_1(index1, index2, i)
    text += handle_data_1(data_1)
print text
write_file_path = "../../../../resource/处理后/合并/important_{3}{0}to{1}_{2}.txt".decode('utf-8').format(str(index1),str(index2),str(i),'tree')
with open(write_file_path, 'w') as f:
    f.write(text)

9. Using str.format

write_file_path = "../../../../resource/处理后/合并/important_{3}{0}to{1}_{2}.txt".decode('utf-8').format(str(index1),str(index2),str(i),'tree')
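
The indices inside the braces are positional, so arguments can be referenced out of order. With hypothetical values index1=1, index2=100, i=4:

path = "important_{3}{0}to{1}_{2}.txt".format('1', '100', '4', 'tree')
print(path)  # important_tree1to100_4.txt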

 
