一、NumPy
import numpy as np
1.numpy.array
a = np.array([1, 2, 3, 4, 5], ndmin = 2)
a
array([[1, 2, 3, 4, 5]])
b = np.array([1, 2, 3, 4, 5], dtype=complex)
b
array([1.+0.j, 2.+0.j, 3.+0.j, 4.+0.j, 5.+0.j])
2.条件运算
score = np.array([[79, 90],[72, 50], [54, 75], [81, 95]])
a = score > 80
b = np.where(score < 60, 0, 90)
a
array([[False, True],
[False, False],
[False, False],
[ True, True]])
b
array([[90, 90],
[90, 0],
[ 0, 90],
[90, 90]])
3.统计运算
result = np.amax(score, axis=0)
result
array([81, 95])
result = np.amax(score, axis=1)
result
array([90, 72, 75, 95])
4.数组运算
score[:, 0] = score[:, 0]+5
score
array([[84, 90],
[77, 50],
[59, 75],
[86, 95]])
5.矩阵运算
q = np.array([[0.4], [0.6]])
score = np.array([[79, 90],[72, 50], [54, 75], [81, 95]])
result = np.dot(score, q)
result
array([[85.6],
[58.8],
[66.6],
[89.4]])
矩阵的拼接
v1 = np.array([1, 2, 3, 4])
v2 = np.array([10, 20, 30, 40])
result = np.vstack((v1, v2))
result
array([[ 1, 2, 3, 4],
[10, 20, 30, 40]])
result = np.hstack((v1, v2))
result
array([ 1, 2, 3, 4, 10, 20, 30, 40])
二、Pandas
import pandas as pd
import numpy as np
1.Series实例
s = pd.Series(np.random.randn(4))
s
0 -1.046153
1 0.991661
2 0.492546
3 -0.137610
dtype: float64
s = pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd'], name='创建Series')
s
a -0.273078
b 0.555600
c -0.792856
d 1.186401
Name: 创建Series, dtype: float64
[attr for attr in dir(s) if not attr.startswith('_')]
['T',
'a',
'abs',
'add',
'add_prefix',
'add_suffix',
'agg',
'aggregate',
'align',
'all',
'any',
'append',
'apply',
'argmax',
'argmin',
'argsort',
'array',
'asfreq',
'asof',
'astype',
'at',
'at_time',
'attrs',
'autocorr',
'axes',
'b',
'backfill',
'between',
'between_time',
'bfill',
'bool',
'c',
'clip',
'combine',
'combine_first',
'compare',
'convert_dtypes',
'copy',
'corr',
'count',
'cov',
'cummax',
'cummin',
'cumprod',
'cumsum',
'd',
'describe',
'diff',
'div',
'divide',
'divmod',
'dot',
'drop',
'drop_duplicates',
'droplevel',
'dropna',
'dtype',
'dtypes',
'duplicated',
'empty',
'eq',
'equals',
'ewm',
'expanding',
'explode',
'factorize',
'ffill',
'fillna',
'filter',
'first',
'first_valid_index',
'flags',
'floordiv',
'ge',
'get',
'groupby',
'gt',
'hasnans',
'head',
'hist',
'iat',
'idxmax',
'idxmin',
'iloc',
'index',
'infer_objects',
'info',
'interpolate',
'is_monotonic',
'is_monotonic_decreasing',
'is_monotonic_increasing',
'is_unique',
'isin',
'isna',
'isnull',
'item',
'items',
'iteritems',
'keys',
'kurt',
'kurtosis',
'last',
'last_valid_index',
'le',
'loc',
'lt',
'mad',
'map',
'mask',
'max',
'mean',
'median',
'memory_usage',
'min',
'mod',
'mode',
'mul',
'multiply',
'name',
'nbytes',
'ndim',
'ne',
'nlargest',
'notna',
'notnull',
'nsmallest',
'nunique',
'pad',
'pct_change',
'pipe',
'plot',
'pop',
'pow',
'prod',
'product',
'quantile',
'radd',
'rank',
'ravel',
'rdiv',
'rdivmod',
'reindex',
'reindex_like',
'rename',
'rename_axis',
'reorder_levels',
'repeat',
'replace',
'resample',
'reset_index',
'rfloordiv',
'rmod',
'rmul',
'rolling',
'round',
'rpow',
'rsub',
'rtruediv',
'sample',
'searchsorted',
'sem',
'set_axis',
'set_flags',
'shape',
'shift',
'size',
'skew',
'slice_shift',
'sort_index',
'sort_values',
'squeeze',
'std',
'sub',
'subtract',
'sum',
'swapaxes',
'swaplevel',
'tail',
'take',
'to_clipboard',
'to_csv',
'to_dict',
'to_excel',
'to_frame',
'to_hdf',
'to_json',
'to_latex',
'to_list',
'to_markdown',
'to_numpy',
'to_period',
'to_pickle',
'to_sql',
'to_string',
'to_timestamp',
'to_xarray',
'transform',
'transpose',
'truediv',
'truncate',
'tz_convert',
'tz_localize',
'unique',
'unstack',
'update',
'value_counts',
'values',
'var',
'view',
'where',
'xs']
2.DataFrame的创建
data = [['小明', 20], ['小红', 23], ['小芳', 20], ['小丽', 22]]
index = ['001', '002', '003', '004']
columns = ['姓名', '年龄']
df = pd.DataFrame(data, index, columns)
df
|     | 姓名 | 年龄 |
|---|---|---|
| 001 | 小明 | 20 |
| 002 | 小红 | 23 |
| 003 | 小芳 | 20 |
| 004 | 小丽 | 22 |
3.数据集的清洗
查看数据集
df = pd.read_csv('C:/Users/Frank/Desktop/movie_metadata.csv')
df
| color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | ... | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes |
---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | ... | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 1.78 | 33000 |
---|
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | ... | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 2.35 | 0 |
---|
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | ... | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 2.35 | 85000 |
---|
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | ... | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 2.35 | 164000 |
---|
4 | NaN | Doug Walker | NaN | NaN | 131.0 | NaN | Rob Walker | 131.0 | NaN | Documentary | ... | NaN | NaN | NaN | NaN | NaN | NaN | 12.0 | 7.1 | NaN | 0 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
5038 | Color | Scott Smith | 1.0 | 87.0 | 2.0 | 318.0 | Daphne Zuniga | 637.0 | NaN | Comedy|Drama | ... | 6.0 | English | Canada | NaN | NaN | 2013.0 | 470.0 | 7.7 | NaN | 84 |
---|
5039 | Color | NaN | 43.0 | 43.0 | NaN | 319.0 | Valorie Curry | 841.0 | NaN | Crime|Drama|Mystery|Thriller | ... | 359.0 | English | USA | TV-14 | NaN | NaN | 593.0 | 7.5 | 16.00 | 32000 |
---|
5040 | Color | Benjamin Roberds | 13.0 | 76.0 | 0.0 | 0.0 | Maxwell Moody | 0.0 | NaN | Drama|Horror|Thriller | ... | 3.0 | English | USA | NaN | 1400.0 | 2013.0 | 0.0 | 6.3 | NaN | 16 |
---|
5041 | Color | Daniel Hsia | 14.0 | 100.0 | 0.0 | 489.0 | Daniel Henney | 946.0 | 10443.0 | Comedy|Drama|Romance | ... | 9.0 | English | USA | PG-13 | NaN | 2012.0 | 719.0 | 6.3 | 2.35 | 660 |
---|
5042 | Color | Jon Gunn | 43.0 | 90.0 | 16.0 | 16.0 | Brian Herzlinger | 86.0 | 85222.0 | Documentary | ... | 84.0 | English | USA | PG | 1100.0 | 2004.0 | 23.0 | 6.6 | 1.85 | 456 |
---|
5043 rows × 28 columns
过滤掉所有含有空值(NaN)的行
df.dropna()
| color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | ... | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes |
---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | ... | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 1.78 | 33000 |
---|
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | ... | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 2.35 | 0 |
---|
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | ... | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 2.35 | 85000 |
---|
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | ... | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 2.35 | 164000 |
---|
5 | Color | Andrew Stanton | 462.0 | 132.0 | 475.0 | 530.0 | Samantha Morton | 640.0 | 73058679.0 | Action|Adventure|Sci-Fi | ... | 738.0 | English | USA | PG-13 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 2.35 | 24000 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
5026 | Color | Olivier Assayas | 81.0 | 110.0 | 107.0 | 45.0 | Béatrice Dalle | 576.0 | 136007.0 | Drama|Music|Romance | ... | 39.0 | French | France | R | 4500.0 | 2004.0 | 133.0 | 6.9 | 2.35 | 171 |
---|
5027 | Color | Jafar Panahi | 64.0 | 90.0 | 397.0 | 0.0 | Nargess Mamizadeh | 5.0 | 673780.0 | Drama | ... | 26.0 | Persian | Iran | Not Rated | 10000.0 | 2000.0 | 0.0 | 7.5 | 1.85 | 697 |
---|
5033 | Color | Shane Carruth | 143.0 | 77.0 | 291.0 | 8.0 | David Sullivan | 291.0 | 424760.0 | Drama|Sci-Fi|Thriller | ... | 371.0 | English | USA | PG-13 | 7000.0 | 2004.0 | 45.0 | 7.0 | 1.85 | 19000 |
---|
5035 | Color | Robert Rodriguez | 56.0 | 81.0 | 0.0 | 6.0 | Peter Marquardt | 121.0 | 2040920.0 | Action|Crime|Drama|Romance|Thriller | ... | 130.0 | Spanish | USA | R | 7000.0 | 1992.0 | 20.0 | 6.9 | 1.37 | 0 |
---|
5042 | Color | Jon Gunn | 43.0 | 90.0 | 16.0 | 16.0 | Brian Herzlinger | 86.0 | 85222.0 | Documentary | ... | 84.0 | English | USA | PG | 1100.0 | 2004.0 | 23.0 | 6.6 | 1.85 | 456 |
---|
3756 rows × 28 columns
标准缺失值与非标准缺失值(isnull() 只能识别 NaN 这类标准缺失值)
df['actor_3_facebook_likes'].isnull()
0 False
1 False
2 False
3 False
4 True
...
5038 False
5039 False
5040 False
5041 False
5042 False
Name: actor_3_facebook_likes, Length: 5043, dtype: bool
重新定义缺失值(读取时通过 na_values 将 "na"、"--" 等非标准标记也识别为 NaN)
mis_values = ["na", "--"]
df = pd.read_csv('C:/Users/Frank/Desktop/movie_metadata.csv', na_values=mis_values)
df
| color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | ... | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes |
---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | ... | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 1.78 | 33000 |
---|
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | ... | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 2.35 | 0 |
---|
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | ... | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 2.35 | 85000 |
---|
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | ... | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 2.35 | 164000 |
---|
4 | NaN | Doug Walker | NaN | NaN | 131.0 | NaN | Rob Walker | 131.0 | NaN | Documentary | ... | NaN | NaN | NaN | NaN | NaN | NaN | 12.0 | 7.1 | NaN | 0 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
5038 | Color | Scott Smith | 1.0 | 87.0 | 2.0 | 318.0 | Daphne Zuniga | 637.0 | NaN | Comedy|Drama | ... | 6.0 | English | Canada | NaN | NaN | 2013.0 | 470.0 | 7.7 | NaN | 84 |
---|
5039 | Color | NaN | 43.0 | 43.0 | NaN | 319.0 | Valorie Curry | 841.0 | NaN | Crime|Drama|Mystery|Thriller | ... | 359.0 | English | USA | TV-14 | NaN | NaN | 593.0 | 7.5 | 16.00 | 32000 |
---|
5040 | Color | Benjamin Roberds | 13.0 | 76.0 | 0.0 | 0.0 | Maxwell Moody | 0.0 | NaN | Drama|Horror|Thriller | ... | 3.0 | English | USA | NaN | 1400.0 | 2013.0 | 0.0 | 6.3 | NaN | 16 |
---|
5041 | Color | Daniel Hsia | 14.0 | 100.0 | 0.0 | 489.0 | Daniel Henney | 946.0 | 10443.0 | Comedy|Drama|Romance | ... | 9.0 | English | USA | PG-13 | NaN | 2012.0 | 719.0 | 6.3 | 2.35 | 660 |
---|
5042 | Color | Jon Gunn | 43.0 | 90.0 | 16.0 | 16.0 | Brian Herzlinger | 86.0 | 85222.0 | Documentary | ... | 84.0 | English | USA | PG | 1100.0 | 2004.0 | 23.0 | 6.6 | 1.85 | 456 |
---|
5043 rows × 28 columns
new_df = df.dropna()
new_df
| color | director_name | num_critic_for_reviews | duration | director_facebook_likes | actor_3_facebook_likes | actor_2_name | actor_1_facebook_likes | gross | genres | ... | num_user_for_reviews | language | country | content_rating | budget | title_year | actor_2_facebook_likes | imdb_score | aspect_ratio | movie_facebook_likes |
---|
0 | Color | James Cameron | 723.0 | 178.0 | 0.0 | 855.0 | Joel David Moore | 1000.0 | 760505847.0 | Action|Adventure|Fantasy|Sci-Fi | ... | 3054.0 | English | USA | PG-13 | 237000000.0 | 2009.0 | 936.0 | 7.9 | 1.78 | 33000 |
---|
1 | Color | Gore Verbinski | 302.0 | 169.0 | 563.0 | 1000.0 | Orlando Bloom | 40000.0 | 309404152.0 | Action|Adventure|Fantasy | ... | 1238.0 | English | USA | PG-13 | 300000000.0 | 2007.0 | 5000.0 | 7.1 | 2.35 | 0 |
---|
2 | Color | Sam Mendes | 602.0 | 148.0 | 0.0 | 161.0 | Rory Kinnear | 11000.0 | 200074175.0 | Action|Adventure|Thriller | ... | 994.0 | English | UK | PG-13 | 245000000.0 | 2015.0 | 393.0 | 6.8 | 2.35 | 85000 |
---|
3 | Color | Christopher Nolan | 813.0 | 164.0 | 22000.0 | 23000.0 | Christian Bale | 27000.0 | 448130642.0 | Action|Thriller | ... | 2701.0 | English | USA | PG-13 | 250000000.0 | 2012.0 | 23000.0 | 8.5 | 2.35 | 164000 |
---|
5 | Color | Andrew Stanton | 462.0 | 132.0 | 475.0 | 530.0 | Samantha Morton | 640.0 | 73058679.0 | Action|Adventure|Sci-Fi | ... | 738.0 | English | USA | PG-13 | 263700000.0 | 2012.0 | 632.0 | 6.6 | 2.35 | 24000 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
5026 | Color | Olivier Assayas | 81.0 | 110.0 | 107.0 | 45.0 | Béatrice Dalle | 576.0 | 136007.0 | Drama|Music|Romance | ... | 39.0 | French | France | R | 4500.0 | 2004.0 | 133.0 | 6.9 | 2.35 | 171 |
---|
5027 | Color | Jafar Panahi | 64.0 | 90.0 | 397.0 | 0.0 | Nargess Mamizadeh | 5.0 | 673780.0 | Drama | ... | 26.0 | Persian | Iran | Not Rated | 10000.0 | 2000.0 | 0.0 | 7.5 | 1.85 | 697 |
---|
5033 | Color | Shane Carruth | 143.0 | 77.0 | 291.0 | 8.0 | David Sullivan | 291.0 | 424760.0 | Drama|Sci-Fi|Thriller | ... | 371.0 | English | USA | PG-13 | 7000.0 | 2004.0 | 45.0 | 7.0 | 1.85 | 19000 |
---|
5035 | Color | Robert Rodriguez | 56.0 | 81.0 | 0.0 | 6.0 | Peter Marquardt | 121.0 | 2040920.0 | Action|Crime|Drama|Romance|Thriller | ... | 130.0 | Spanish | USA | R | 7000.0 | 1992.0 | 20.0 | 6.9 | 1.37 | 0 |
---|
5042 | Color | Jon Gunn | 43.0 | 90.0 | 16.0 | 16.0 | Brian Herzlinger | 86.0 | 85222.0 | Documentary | ... | 84.0 | English | USA | PG | 1100.0 | 2004.0 | 23.0 | 6.6 | 1.85 | 456 |
---|
3756 rows × 28 columns
重复值
person={
'brand':['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
'style':['cup', 'cup', 'cup', 'pack', 'pack'],
'rating':[4, 4, 3.4, 15, 5]}
df = pd.DataFrame(person)
df
|   | brand | style | rating |
|---|---|---|---|
| 0 | Yum Yum | cup | 4.0 |
| 1 | Yum Yum | cup | 4.0 |
| 2 | Indomie | cup | 3.4 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
df.duplicated()
0 False
1 True
2 False
3 False
4 False
dtype: bool
df.duplicated(keep='last')
0 True
1 False
2 False
3 False
4 False
dtype: bool
df.duplicated(keep=False)
0 True
1 True
2 False
3 False
4 False
dtype: bool
删除重复值
df.drop_duplicates()
|   | brand | style | rating |
|---|---|---|---|
| 0 | Yum Yum | cup | 4.0 |
| 2 | Indomie | cup | 3.4 |
| 3 | Indomie | pack | 15.0 |
| 4 | Indomie | pack | 5.0 |
异常值替换
df.replace('Indomie', 'xyz')
|   | brand | style | rating |
|---|---|---|---|
| 0 | Yum Yum | cup | 4.0 |
| 1 | Yum Yum | cup | 4.0 |
| 2 | xyz | cup | 3.4 |
| 3 | xyz | pack | 15.0 |
| 4 | xyz | pack | 5.0 |