数据科学原理与数据处理
Jupyter notebook
安装:pip install jupyter
运行:在命令行处cd到源代码文件目录,然后jupyter notebook
魔法命令:
%run
%run ./runable.py
%load
%load ./runable.py
print('run')
def runable(x):
    """Print ``x`` to standard output and return ``None``."""
    print(x)
runable('abc')
from runable import runable
runable('aa')
# notebook对同一个文件只会导入一次
%timeit
%timeit li = [i**2 for i in range(1,100)]
#60.1 µs ± 10.3 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
# %timeit在执行时间较短时为了避免调度时间的偏差,会进行多次调用取平均值,在执行时间长时会减少调用次数
# %timeit后面只跟一句代码,测试代码块的执行时间用%%timeit
%%timeit
li = []
for i in range(1,1000):
li.append(i**2)
# 注释不要写在魔法命令前,会报错
# 在python中使用列表生成式比for高效
# 649 µs ± 101 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%time
# %time只会测量一次代码的执行时间
%time li = [i**2 for i in range(1,10000)]
# Wall time: 11 ms
%%time
li = []
for i in range(1,10000):
li.append(i**2)
# Wall time: 15 ms
import random
li = [random.random() for i in range(1,10000)]
%timeit li.sort()
# python的sort函数对已经排序过的序列有优化;%timeit会对同一个列表重复排序,第一次之后列表已经有序,后面的速度会越来越快,因此用timeit和time来计算sort得到的结果差很多
# 127 µs ± 13 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
li = [random.random() for i in range(1,10000)]
%time li.sort()
# Wall time: 2 ms
%%html
%%html
<div class = 'abc'>html content </div>
#hhhhh
%%js
document.querySelector('.abc').innerHTML='hhhhh'
%%writefile
%%writefile 'abcd.py'
import random
li = [random.random() for i in range(1,10000)]
%timeit li.sort()
# Writing abcd.py
Numpy
安装 pip install numpy
导入 import numpy as np
import numpy as np
# Python list
li = list(range(10))
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
li[5] = '5'
# [0, 1, 2, 3, 4, '5', 6, 7, 8, 9]
# list灵活,但效率低下,处理大型数据、复杂数据时效率低
# Python array
import array
arr = array.array('i',list(range(10)))
# array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
array[5] = '5'
# TypeError: 'module' object does not support item assignment
arr[5] = 20
#array('i', [0, 1, 2, 3, 4, 20, 6, 7, 8, 9])
# array处理效率比list高,且array类元素类型统一,但是在多维列表情况下array无法操作
numpy.ndarray
nparr = np.array(list(range(10)))
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
type(nparr)
# numpy.ndarray
nparr[5] = 5.333
# array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# 整数类型的ndarray放入小数,会截取小数前整数
nparr2 = np.array([1,2,3.3])
nparr2.dtype
# dtype('float64')
nparr3 = np.array([1,2,3],dtype=float)
# array([1., 2., 3.])
# ndarray方便处理多维度数组或者矩阵之间的运算
# 在处理多维数组或者矩阵,运算效率高
def test1(n):
a = [i**2 for i in range(n)]
def test1(n):
a = [i**2 for i in range(n)]
b = [i**2 for i in range(n)]
c = []
for i in range(n):
c.append(a[i]+b[i])
return c
test1(10)
# [0, 2, 8, 18, 32, 50, 72, 98, 128, 162]
def test2(n):
a = np.arange(n)**2
b = np.arange(n)**2
c = a+b
return c
test2(10)
# array([ 0, 2, 8, 18, 32, 50, 72, 98, 128, 162], dtype=int32)
矩阵和随机数的生成
np.array([1,2])
np.array(range(10))
np.arange(10)
np.zeros
np.zeros(10)
# array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])
np.zeros(10,dtype=int)
# array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
np.zeros(shape=(3,5),dtype=int)
# array([[0, 0, 0, 0, 0],
# [0, 0, 0, 0, 0],
# [0, 0, 0, 0, 0]])
np.ones
np.ones(shape=(3,5),dtype=int)
#array([[1, 1, 1, 1, 1],
# [1, 1, 1, 1, 1],
# [1, 1, 1, 1, 1]])
np.full
np.full((3,5),99)
# array([[99, 99, 99, 99, 99],
# [99, 99, 99, 99, 99],
# [99, 99, 99, 99, 99]])
np.linspace
np.linspace(0,9,4)# 从0到9生成4个数的等差数列
# array([0., 3., 6., 9.])
生成随机数
np.random.randint(0,10,size=(3,5))
#array([[7, 8, 3, 4, 4],
# [2, 0, 4, 0, 5],
# [7, 7, 3, 8, 3]])
#0到1的浮点数
np.random.random((2,3))#0到1的浮点数
# array([[0.11293923, 0.16618284, 0.04187717],
# [0.05758509, 0.54190623, 0.74469047]])
ndarray基础操作
A = np.ones(shape=(3,5))
# reshape修改维度
a = A.reshape(5,-1)
print(a)
a = A.reshape(-1,3)
print(a)
a = A.reshape(5,3)
a
# [[1. 1. 1.]
# [1. 1. 1.]
# [1. 1. 1.]
# [1. 1. 1.]
# [1. 1. 1.]]
# [[1. 1. 1.]
# [1. 1. 1.]
# [1. 1. 1.]
# [1. 1. 1.]
# [1. 1. 1.]]
# array([[1., 1., 1.],
# [1., 1., 1.],
# [1., 1., 1.],
# [1., 1., 1.],
# [1., 1., 1.]])
取值
x = np.arange(15).reshape(3,5)
print(x)
print(x[0])
print(x[-1])
print(x[0][1])#对多维数组可能取不出来,不推荐
print(x[(0,1)])
print(x[0,1])#这种方式用的比较多
# [[ 0 1 2 3 4]
# [ 5 6 7 8 9]
# [10 11 12 13 14]]
# [0 1 2 3 4]
# [10 11 12 13 14]
# 1
# 1
# 1
二维数组切片
# 取x前两行前三列
print(x)
print(x[0:2,0:3])
print(x[:3][:2])#注意这种写法是先取前三行,再在前三行的基础上取前两行
print(x[::-1,::-1])#-1是反置
print(x.T)# 转置
# [[ 0 1 2 3 4]
# [ 5 6 7 8 9]
# [10 11 12 13 14]]
# [[0 1 2]
# [5 6 7]]
# [[0 1 2 3 4]
# [5 6 7 8 9]]
# [[14 13 12 11 10]
# [ 9 8 7 6 5]
# [ 4 3 2 1 0]]
# [[ 0 5 10]
# [ 1 6 11]
# [ 2 7 12]
# [ 3 8 13]
# [ 4 9 14]]
矩阵的合并
x1 = np.array([
[22,188],
[23,166]
])
x2 = np.array([
[0],[1]
])
x3 = np.concatenate([x1,x2],axis=1)#axis=0代表行合并,=1代表列合并
print(x3)
# [[ 22 188 0]
# [ 23 166 1]]
聚合操作
print(x)
print(x.sum())
print(x.max())
print(x.mean())#平均值
print(x)
print(x.sum())
print(x.max())
print(x.mean())#平均值
# [[ 0 1 2 3 4]
# [ 5 6 7 8 9]
# [10 11 12 13 14]]
# 105
# 14
# 7.0
Pandas
安装: pip install pandas