1. numpy
numpy 为数值定义了一套固定宽度的独立数据类型(例如 int32、int64、float64),因此数组元素的类型是 numpy 自己的标量类型,而不是 Python 内置的 int / float
import numpy as np
# The same three integers, once as a plain Python list and once as an ndarray.
python_data = [1, 2, 3]
data = np.array([1, 2, 3])

# An ndarray element is a NumPy scalar type, not the built-in int
# (commonly numpy.int64; numpy.int32 on Windows with NumPy older than 2.0).
type(data[0])
# An element of a plain list keeps the built-in type: int.
type(python_data[0])
1.1 数组的基本信息
- 维度 ndim
# A 2 x 2 array written as a nested list.
data_base = np.array(
    [
        [1, 2],
        [3, 4],
    ]
)
# ndim is 2 here: the number of opening brackets at the start of the literal
# equals the number of dimensions.
data_base.ndim
- 行列数 shape 几行几列
# A 3-D array: 2 blocks, each holding 2 rows of 2 values.
data_base = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
# shape lists the length of every axis; the value is shown as a comment
# because writing it directly after the attribute would be parsed as a call.
data_base.shape  # (2, 2, 2)
# Reading the printed array next to the shape makes the axes easier to see:
data_base
# array([[[1, 2],
#         [3, 4]],
#
#        [[5, 6],
#         [7, 8]]])
1.2 切片(重要)
data_slice = np.array(
    [
        [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]],
        [[11, 12, 13, 14, 15], [16, 17, 18, 19, 20]],
    ]
)
# One slice per axis, separated by commas: three slices for a 3-D array.
# Each slice may be narrowed with start:stop:step, for example 1:2:1.
data_slice[:, :, :]
# Output:
# array([[[ 1,  2,  3,  4,  5],
#         [ 6,  7,  8,  9, 10]],
#
#        [[11, 12, 13, 14, 15],
#         [16, 17, 18, 19, 20]]])

# Interview question
p = np.arange(1, 10)  # the integers 1..9
# Forward step: the start is included, the stop is excluded -> 2, 4, 6.
p[1:-2:2]
# Negative step walks backwards, so the start position (-2, i.e. index 7) has
# to lie after the stop position (index 1, still excluded) -> 8, 6, 4.
p[-2:1:-2]
使用切片进行切图操作
# Read the image file into an array so it can be sliced like any other array.
# NOTE(review): plt is not imported anywhere in the visible file - presumably
# matplotlib.pyplot; confirm and add the import.
face = plt.imread("face.jpg")
# Crop: keep rows 0-169 and columns 0-149, and everything on the last axis.
face_rect = face[0:170,0:150,:]
# Reverse the order of the last axis (presumably the colour channels - confirm).
# NOTE(review): this reassignment replaces the crop from the previous line, so
# only the reversed, uncropped image reaches imshow below.
face_rect = face[:,:,::-1]
plt.imshow(face_rect)
变形 reshape
# Start from a 2 x 3 array; reshape returns a new view/array and leaves
# data_shape itself unchanged.
data_shape = np.array([[1, 2, 3], [4, 5, 6]])
# -1 on its own flattens to one dimension: [1, 2, 3, 4, 5, 6].
data_shape.reshape((-1,))
# Explicit target: 3 rows, 2 columns.
data_shape.reshape((3, 2))
# Fix 3 columns and let the row count be computed.
data_shape.reshape((-1, 3))
# Fix 3 rows and let the column count be computed.
data_shape.reshape((3, -1))
转换数据类型
data_type = np.array([1, 2, 3, 4, 5, 6])
# Convert to floating point: array([1., 2., 3., 4., 5., 6.]).
# np.float64 is spelled out because the np.float alias was deprecated in
# NumPy 1.20 and removed in 1.24. astype returns a new array; data_type
# itself keeps its integer dtype.
data_type.astype(np.float64)
numpy 广播机制
a = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
c = 2
b = np.array([10, 100])
# A scalar is broadcast to every element of the array.
a * c
# array([[[ 2,  4],
#         [ 6,  8]],
#
#        [[10, 12],
#         [14, 16]]])
# b has shape (2,): it is lined up with the last axis of a and reused for
# every row, so the first column is multiplied by 10 and the second by 100.
a * b
# array([[[ 10, 200],
#         [ 30, 400]],
#
#        [[ 50, 600],
#         [ 70, 800]]])
pandas
数据类型
- Series
import pandas as pd
# A Series with custom row labels instead of the default 0..n-1 index.
s = pd.Series([1, 2, 3, 4, 5], index=["张三", "李四", "王五", "赵六", "钱七"])
# Result:
# 张三    1
# 李四    2
# 王五    3
# 赵六    4
# 钱七    5
# dtype: int64

# Slicing by label includes BOTH endpoints.
s["张三":"李四"]
# Slicing by position excludes the stop, like a Python list, so the first two
# rows are 0:2. iloc makes the positional intent explicit.
s.iloc[0:2]
# Both expressions give:
# 张三    1
# 李四    2
# dtype: int64
- DataFrames
# Load the grade table used by all DataFrame examples below.
grade_pd = pd.read_csv("grade.csv")
# CRUD: query
# grade_pd['语文'] > 90 evaluates to a boolean Series such as
# [False, False, False, True, True]; indexing the frame with it keeps only
# the rows whose entry is True.
grade_pd[grade_pd['语文'] > 90]
# How the query works: it is NumPy's selection by boolean mask.
a = np.array([1, 2, 3, 4, 5])
b = np.array([True, False, True, False, True])
# Indexing with a mask keeps the elements whose mask entry is True: [1, 3, 5].
a[b]
# A comparison produces such a mask: array([False, False, False, True, True]).
a > 3
# Same mechanism as a[b], with the mask built inline: [4, 5].
a[a > 3]
# Keyword matching
# Exact match: rows whose 性别 equals '男'.
grade_pd[grade_pd["性别"].eq("男")]
# Substring match: rows whose 性别 contains '女'.
grade_pd[grade_pd["性别"].str.contains("女")]
# Multiple conditions
# AND: both masks must be True for a row to be kept.
grade_pd[grade_pd["语文"].gt(90) & grade_pd["数学"].lt(90)]
# OR: a row is kept when at least one mask is True.
grade_pd[grade_pd["语文"].gt(90) | grade_pd["数学"].lt(90)]
# Selecting rows and columns with loc[rows, columns].
# Label slices include both endpoints: rows 1 through 2, and every column
# from 数学 through 英语.
grade_pd.loc[1:2, "数学":"英语"]
# The row selector may also be a boolean condition.
grade_pd.loc[grade_pd['性别'].str.contains("男"), "音乐":"体育"]
# A list picks exactly the named columns. A slice cannot be written inside a
# list literal, so the two column names are listed explicitly.
grade_pd.loc[grade_pd['性别'].str.contains("男"), ["音乐", "体育"]]
# Delete
# drop returns a new frame without rows 1 and 2; grade_pd itself is unchanged.
grade_pd.drop(index=[1, 2])
# Modify
# Add 2 to every 数学 score and write the column back.
grade_pd["数学"] = grade_pd["数学"].apply(lambda score: score + 2)
# Modify with a custom function
def add_score(x):
    """Return the score *x* increased by 2."""
    return x + 2
# Apply the custom function to every 数学 score; the function object is
# passed directly instead of being wrapped in a lambda.
grade_pd["数学"] = grade_pd["数学"].apply(add_score)
# Modify the source data in place through a condition:
# rows whose 性别 contains '男' get the value 1 in the 性别 column.
grade_pd.loc[grade_pd["性别"].str.contains("男"), "性别"] = 1
# Add a column
# Row-wise total of the numeric columns. numeric_only=True skips text columns
# such as 性别; without it pandas 2.x raises TypeError when it tries to add
# numbers and strings.
grade_pd['总分'] = grade_pd.sum(axis=1, numeric_only=True)
# Sort
# Highest total first; sort_values returns a new frame.
grade_pd.sort_values("总分", ascending=False)
# Inspect the DataFrame
# info() prints a summary of the frame.
grade_pd.info()
IMDB
# Load the IMDB table used by every analysis step below.
movies = pd.read_csv("IMDB.csv")
# 1. Explore the data
# Summary of the frame.
movies.info()
# Descriptive statistics.
movies.describe()
# First three rows.
movies.head(n=3)
# 2 .数据清洗与预处理 (省略)
# 3. 数据分析 确定业务指标,对业务有指导性意义的指标
# (1) The 10 directors with the most movies
# value_counts counts the rows per director, most frequent first.
director_count = movies["Director"].value_counts()
director_count.head(10)
# (2) The 10 directors with the highest total revenue
# Keep only the two columns needed, then sum the revenue per director.
director_sum = (
    movies[["Director", "Revenue (Millions)"]]
    .groupby("Director")
    .sum()
)
type(director_sum)
# Largest total first, then take the first ten rows.
director_sum.sort_values("Revenue (Millions)", ascending=False).head(10)
# (3) Count how many movies belong to each genre
count_dict = {}
a = movies["Genre"]
a.values
for genres in a.values:
    # One cell holds several genres separated by commas; count each of them.
    for sing_genres in genres.split(","):
        # get(..., 0) starts a genre at zero the first time it is seen.
        count_dict[sing_genres] = count_dict.get(sing_genres, 0) + 1
print(count_dict)
# (4) Total revenue per year
# Turn every year into a string before grouping on it.
movies["Year"] = movies["Year"].apply(str)
movies[["Year", "Revenue (Millions)"]].groupby("Year").sum()
# (5) Which movie has the highest revenue
# Option 1: sort by revenue, largest first, and read the top row.
movies.loc[:, ["Title", "Revenue (Millions)"]].sort_values(by="Revenue (Millions)", ascending=False)
# Option 2: idxmax returns the index LABEL of the row holding the maximum
# (NumPy's argmax, by contrast, returns a position), so the row is selected
# by label with .loc. A one-element list keeps the result a DataFrame.
ind = movies["Revenue (Millions)"].idxmax()
movies.loc[[ind]]
# (5.2) Which movie has the highest rating
start = movies["Rating"].idxmax()
movies.loc[[start]]
# (6) Breakdown statistics
# Drilling down into the data and rolling it back up.
# All movies directed by Ridley Scott.
result_Ridley = movies[movies["Director"].eq("Ridley Scott")]
# All movies whose genre string contains "Sci-Fi" ...
result_sci = movies[movies["Genre"].str.contains("Sci-Fi")]
# ... counted per director.
result_sci["Director"].value_counts()