Pandas and numpy 学习笔记(代码)

numpy部分

# import numpy as np
#
# #a = np.array([1, 2, 3, 4, 5], ndmin=1)
# #a = np.array([[1,1,2],[1,2,3]])
# #a = np.arange(12).reshape((3,4))
#a = np.linspace(0,10,5) #从几到几,共取几个等间距的点(含两端)
#a = np.linspace(0,10,10).reshape(2,5)
#a = np.array([1,2,3,4]).reshape(2,2)
# d = np.array([[1,1],[2,3]])
# b = np.arange(4)
#c = 10*np.sin(a)
# c = a*d
# c_dot = np.dot(a,d)#矩阵乘法 ,等价写法:c_dot_2 = a.dot(d)
#print (a)
#print (c)
# #print(a+b)
# #print(b<3) #b = np.arange(4)   [ True  True  True False]
# print (c,c_dot)



# #random

# import numpy as np
# a = np.random.random((2,4))
#
# print(a)
# print(np.sum(a,axis =1)) #还有min,max,axis 是维度 ,
# print(np.sum(a,axis =0))# axis=1:沿列方向求和(每行得到一个结果);axis=0:沿行方向求和(每列得到一个结果)
# # [[0.10429317 0.15675549 0.31883109 0.42421789]
# #  [0.76801578 0.33735384 0.97900856 0.00229303]]
# # [1.00409764 2.08667122]
# # [0.87230895 0.49410933 1.29783965 0.42651092]



#基础运算

# a = np.arange(12).reshape((3,4))
# print(a)
# print(np.argmax(a))
# print(np.mean(a)) #平均
# print(np.median(a)) #中位数
# print(np.cumsum(a)) #累加
# print(np.diff(a)) #相邻元素之差
# # sort ,transpose ,
# print(np.clip(a,3,5)) #截断
# print(np.mean(a,axis=1 ))
#


#5
#
# a = np.arange(12).reshape((3,4))
# # print(a)
# # print(a[2])
# # print(a[:,2])#第三列(列索引为2)
# # print(a[1,1:3])
# #for循环
# for row in a:#输出行
#    print (row)
# for col in a.T: # a.T 为a的转置,逐行遍历 a.T 即逐列遍历 a
#     print (col)
#
# for item in a.flat:
#     print(item) #矩阵拉成一行
#


#6
#a = np.array([[1,1,2],[1,2,1]])
# b = np.array([[2,1,2],[3,2,3]])
#
# c = np.vstack((a,b))#上下合并
# c = np.hstack((a,b))#左右
# print (c.shape,a.shape) #
# print (c)
# #改变维度
# a = np.array([1,1,1])[:,np.newaxis]
# b = np.array([2,2,2])[:,np.newaxis]
# # print(a.shape)
# # print(a+b)
# c = np.concatenate((a,a,b),axis = 0) #多个合并
# print(a)
# print(c)


#7
import pylab as p

# Every earlier `import numpy as np` in these notes is commented out, so NumPy
# must be imported here; without it the next statement raises NameError.
import numpy as np

# 3x4 matrix holding 0..11, the input for the split examples below.
a = np.arange(12).reshape(3, 4)
# print(np.split(a, 3, axis=0))  # equal split into 3 blocks along axis 0
# print(np.array_split(a, 4, axis=0))  # unequal split: (array, number of blocks, axis)


#8
# a = np.arange(4)
# b = a
# c = a
# d = b
# a[0] = 11 #类似引用
# print(a == b)
# print( a == d)
# b = a.copy() #deep copy

pandas部分

#pandas
import pandas as pd

#1

# s = pd.Series([1,3,6,np.nan,44,1])
# # print(s)
# dates = pd.date_range('20220101',periods = 6)
# # print(dates)
#
# p = np.random.randn(6,4)
# #print(p)
# df = pd.DataFrame(np.random.randn(6,4 ),index = dates,columns=['a','b','c','d'])
# #创造一个类似二维数组的东西,填充数组大小,列名(或者内容),行名
# #print(df)
#
# df2 = pd.DataFrame({'a':['1','2','3','4'],
#                     'b':pd.Timestamp('20220101'),
#                     'c':pd.Series(1,index = list(range(4)),dtype='float32'),
#                     'd':np.array([3]*4,dtype='int32'),
#                     'f':'end',
#                     'g':range(3,11,2)})
# # print(df2)
# # print(df2.columns)
# # print(df2.values)
# # print(df2.describe())
# print(df2.T)


#2

# Six timestamps starting at 2022-01-01, used as the row index of the frame.
dates = pd.date_range(start='20220101', periods=6)
# Alternative: fill the frame with random normal values instead.
# df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
# 6x4 frame holding 0..23, rows labelled by date and columns labelled 'a'..'d'.
df = pd.DataFrame(
    data=np.arange(24).reshape((6, 4)),
    index=dates,
    columns=list('abcd'),
)
# print(df.b)  # attribute access; selects the same column as df['b']
# print(df[0:3], df['20220103':'20220106'])  # row slices by position and by date label

#select by label:loc  loc的意思是基于标签(label-based selection),
# 输入为标签,也就是,行是索引标签(此处为日期,如 '20220103'),列是('a','b','c','d')中的值。
# print(df.loc[:,['a','b']])
# print(df.loc[['20220103'],['a','b']])

#select by position:iloc  iloc的意思是基于整数位置(integer position-based selection),
# 输入为位置序号,也就是,行是(0,1,2,3,4,5)中的值,列是(0,1,2,3)中的值。
#print(df.iloc[3:5,1:3])
#print(df.iloc[[1,3,5],1:4])

#mixed selection:ix #融合了前两者(标签与位置混用);注意:.ix 已在 pandas 1.0 中移除,请改用 loc / iloc
#print(df.ix[:3,['a','c']])

#boolean indexing
# print(df)
# print(df[df.a<8])


#3
# dates = pd.date_range('20220101',periods = 6)
# #df = pd.DataFrame(np.random.randn(6,4 ),index = dates,columns=['a','b','c','d'])
# df = pd.DataFrame(np.arange(24).reshape(6,4),index = dates,columns=['a','b','c','d'])
# pd.Series([1,2,3,4,5,6],index=pd.date_range('20220101',periods = 6)) #序列,一一对应
# # df.iloc[2,2]= 1111
# # print(df)
# # df.loc['20220104','b'] = 2222
# print(df)
# df[df.a>4] = 33
# print(df)
# df.a[df.a>4] = 99
# print(df)
# df['f'] = np.arange(1,7)
# print(df)


#4
# dates = pd.date_range('20220101',periods = 6)
# #df = pd.DataFrame(np.random.randn(6,4 ),index = dates,columns=['a','b','c','d'])
# df = pd.DataFrame(np.arange(24).reshape(6,4),index = dates,columns=['a','b','c','d'])
# #pd.Series([1,2,3,4,5,6],index=pd.date_range('20220101',periods = 6)) #序列,一一对应
# df.iloc[0,1] = np.nan
# df.iloc[1,2] = np.nan
# print(df.dropna(axis = 1 ,how ='any'))#how = {'any','all'},默认为 'any'
# print(df.dropna(axis = 1 ,how ='all'))
# print(df.fillna(value = 0)) #填入
# print(df.isnull()) #检查是否缺失数据
# print(np.any(df.isnull()) == True) #检查


#5 读取录入目录
#C:\\Users\\txjoe\\Desktop\\2022年综合测评优秀学生干部评选原始分数.xlsx
# data = pd.read_excel('C:\\Users\\txjoe\\Desktop\\2022年综合测评优秀学生干部评选原始分数.xlsx')
# print(data)
# data.to_excel()
# pd.read_pickle('student.excel')
# #print(p)


#6 concatenating
# df1 = pd.DataFrame(np.arange(24).reshape(6,4),index = dates,columns=['a','b','c','d'])
# df2 = pd.DataFrame(np.arange(24,48).reshape(6,4),index = dates,columns=['a','b','c','d'])
# df3 = pd.DataFrame(np.arange(48,72).reshape(6,4),index = dates,columns=['a','b','c','d'])
# # print(df1)
# # print(df2)
# # print(df3)
# res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)#合并
# print(res)

# #join,['inner','outer']
# df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index = [1,2,3])
# df2 = pd.DataFrame(np.ones((3,4))*2,columns=['b','c','d','e'],index = [2,3,4])
# df3 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index = [2,3,4])
#res = pd.concat([df1,df2])#默认outer,还有inner
# res = pd.concat([df1,df2],join='inner',ignore_index=True)
# print(res)
#append(注意:DataFrame.append 已在 pandas 2.0 中移除,请改用 pd.concat)
# res = df1.append(df2,ignore_index=True)
# res = df1.append([df3,df2],ignore_index=True)
# print(res)
# s1 = pd.Series([1,2,3,4],index=['a','b','c','d'])
# res = df1.append(s1,ignore_index = True)
# print(res)



#7 merge

# left = pd.DataFrame({'key':['L','M','N','K'],
#                      'key2':['1','2','3','4'],
#                      'A':['a','b','c','d']})
# right = pd.DataFrame({'key':['L','M','N','K'],
#                       'key2':['1','1','1','1'],
#                       'C ':['a','b','c','d']})
# # res = pd.merge(left,right,on = 'key')
# res = pd.merge(left,right,on = ['key','key2'],how='right')
# # res = pd.merge(left,right,on = ['key','key2'],how='inner')
# # res = pd.merge(left,right,on = ['key','key2'],how='outer')
#
# # print(left)
# # print((right))
#
# print(res)



#8
# import matplotlib.pyplot as plt
#
# data = pd.Series(np.random.randn(1000),index = np.arange(1000))
#
#
# data = pd.DataFrame(np.random.randn(1000,4),
#                     index=np.arange(1000),
#                     columns=list('ABCD'))
# data = data.cumsum()
# #print(data)
# ax = data.plot.scatter(x= 'A',y = 'B',color = 'r',label = 'Class 1')
#
# data.plot.scatter(x= 'A',y = 'C',color = 'DarkGreen',label = 'Class 2',ax = ax)
# plt.show()

92讲视频课+16大项目实战+课件源码  为什么学习数据分析?       人工智能、大数据时代有什么技能是可以运用在各种行业的?数据分析就是。      从海量数据中获得别人看不见的信息,创业者可以通过数据分析来优化产品,营销人员可以通过数据分析改进营销策略,产品经理可以通过数据分析洞察用户习惯,金融从业者可以通过数据分析规避投资风险,程序员可以通过数据分析进一步挖掘出数据价值,它和编程一样,本质上也是一个工具,通过数据来对现实事物进行分析和识别的能力。不管你从事什么行业,掌握了数据分析能力,往往在其岗位上更有竞争力。   本课程共包含五大模块: 一、先导篇: 通过分析数据分析师的一天,让学员了解全面了解成为一个数据分析师的所有必修功法,对数据分析师不在迷惑。  二、基础篇: 围绕Python基础语法介绍、数据预处理、数据可视化以及数据分析与挖掘......这些核心技能模块展开,帮助你快速而全面的掌握和了解成为一个数据分析师的所有必修功法。 三、数据采集篇: 通过网络爬虫实战解决数据分析的必经之路:数据从何来的问题,讲解常见的爬虫套路并利用三大实战帮助学员扎实数据采集能力,避免没有数据可分析的尴尬。  四、分析工具篇: 讲解数据分析避不开的科学计算库Numpy、数据分析工具Pandas及常见可视化工具Matplotlib。  五、算法篇: 算法是数据分析的精华,课程精选10大算法,包括分类、聚类、预测3大类型,每个算法都从原理和案例两个角度学习,让你不仅能用起来,了解原理,还能知道为什么这么做。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值