# Data import
# Three approaches are shown in this file: csv.reader, pandas and numpy.
# CSV traits: comma-separated values; file header: field attributes.
from csv import reader
import numpy as np
filename = 'D:/0520代码+数据/第3、4次课:代码+数据/pima_data.csv'
# newline='' leaves newline handling to the csv module, as the csv
# documentation recommends for file objects passed to reader().
with open(filename, 'rt', newline='') as raw_data:
    readers = reader(raw_data, delimiter=',')
    # The reader is lazy: collect the rows while the file is still open.
    x = list(readers)
# csv.reader yields strings, so convert the whole table to floats.
data = np.array(x).astype('float')
print(data.shape)
# Import with pandas
from pandas import read_csv
filename = 'D:/0520代码+数据/第3、4次课:代码+数据/pima_data.csv'
# Column labels handed to read_csv through its `names` argument.
names = [
    'preq', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names)
print(data.shape)
# Import with numpy
from numpy import loadtxt
filename = 'D:/0520代码+数据/第3、4次课:代码+数据/pima_data.csv'
with open(filename, 'rt') as raw_data:
    # Parse inside the with-block so the handle is still open for loadtxt.
    data = loadtxt(raw_data, delimiter=',')
print(data.shape)
# Inspecting the data
from pandas import read_csv
filename = 'D:/0520代码+数据/第3、4次课:代码+数据/pima_data.csv'
names = [
    'preq', 'plas', 'pres', 'skin', 'test',
    'mass', 'pedi', 'age', 'class',
]
data = read_csv(filename, names=names)
print(data.shape)
# Preview the first 10 rows
head = data.head(n=10)
print(head)
# Dimensions of the data
print(data.shape)
# Attributes (columns) and their dtypes
print(data.dtypes)
# Descriptive statistics
print(data.describe())
# Data distribution: number of rows for each value of 'class'
print(data.groupby(by='class').size())
# Correlation between columns: Pearson correlation coefficient
print(data.corr(method='pearson'))
# Distribution analysis (Gaussian): skewness of each column
print(data.skew())
# The rest of this file covers Python basics, plus some NumPy and pandas fundamentals.
# Basic data types
# Python has 6 data types
# Strings: indexing, slicing, length, then the whole string
data = 'Hello, Math'
print(data[0], data[1:5], len(data), data, sep='\n')
# Numbers: an integer and a float
Value = 521
Value_2 = 1.2
print(Value, Value_2, sep='\n')
# Booleans
true, false = True, False
print(true, false, sep='\n')
# Variable assignment: bind three names to three values
a = 1
b = 'hello'
c = True
# First all on one line, then one value per line
print(a, b, c)
print(a, b, c, sep='\n')
# Null value: both names end up bound to None
a = b = None
print(a, b, sep='\n')
# Control statements, three kinds: conditionals, for loops, conditional
# (while) loops
# Conditional statement
value = 1
if value == 1:
    print('This is true')
elif value > 20:
    print('it is bigger than 20?')
else:
    print('This is false')
# Loop statement: iterate over 0..4
for i in range(5):
    print(i)
# Conditional loop: repeat while the condition holds (prints 0..49)
i = 0
while i < 50:
    print(i)
    i = i + 1
# Compound data types
# Tuple: read-only, its elements cannot be reassigned
a = (1, 2, 3)
print(a)
print(a[1])
# List
a = [1, 2, 3]
print(a)
# Add: append a new element at the end
a.append(4)
print(a)
print(a[3])
# Update a list item in place
a[2] = 5
print(a)
# Iterate over the elements
for i in a:
    print(i)
# Dictionary: a mutable container type
mydict = {'a': 6.18, 'b': 'str', 'c': True}
print('A value: %.2f' % mydict['a'])
# Update: assigning to an existing key replaces its value
mydict['a'] = 523
print('A value: %d' % mydict['a'])
print('key: %s' % mydict.keys())
print('value: %s' % mydict.values())
# Iterating a dict yields its keys; look up each value by key
for key in mydict:
    print(mydict[key])
# Functions: reusable code, declared with def and returning with return
# Define a function
def mysum(x, y):
    """Return the result of ``x + y``."""
    return x + y


# Call it with keyword arguments and show the result
result = mysum(x=1, y=2)
print(result)
# NumPy crash course
# Creating arrays
import numpy as np
# One-dimensional array: print it, then its shape
myarray = np.array([1, 2, 3])
print(myarray, myarray.shape, sep='\n')
# Two-dimensional array built from nested lists
myarray = np.array([
    [1, 2, 3],
    [2, 3, 4],
    [3, 4, 5],
])
print(myarray, myarray.shape, sep='\n')
# Accessing data: first row, last row, third column
print('第一行数据: %s' % (myarray[0],))
print('最后一行行数据: %s' % (myarray[-1],))
print('整列(3列)的数据: %s' % (myarray[:, 2],))
# pandas crash course
# Series: a one-dimensional labelled array (compare: list)
import pandas as pd
myarray = np.array([1, 2, 3])
index = ['a', 'b', 'c']
myseries = pd.Series(myarray, index=index)
print(myseries)
print('series的第一个元素:')
# Positional access goes through .iloc: with string labels, myseries[0]
# relies on a deprecated positional fallback of Series.__getitem__.
print(myseries.iloc[0])
print('series的c index的元素:')
# Label-based access uses the index value directly
print(myseries['c'])
# DataFrame: a two-dimensional array whose rows and columns can be labelled
rowindex = ['row1', 'row2', 'row3']
colname = ['col1', 'col2', 'col3']
# 2-D input data, one inner list per row
myarray = np.array([
    [1, 2, 3],
    [2, 3, 4],
    [3, 4, 5],
])
mydataframe = pd.DataFrame(myarray, index=rowindex, columns=colname)
print(mydataframe)
# Select a single column by its label
print('访问col3的数据:', mydataframe['col3'])