1.1 Pandas和NumPy的使用

文章介绍了Python的NumPy库,包括数组创建、条件运算、统计运算和矩阵运算。接着讲解了Pandas的Series和DataFrame,展示了如何创建、操作以及进行数据清洗,如过滤缺失值和处理异常值。
摘要由CSDN通过智能技术生成

一、NumPy

import numpy as np

1.numpy.array

a = np.array([1, 2, 3, 4, 5], ndmin = 2)
a
array([[1, 2, 3, 4, 5]])
b = np.array([1, 2, 3, 4, 5], dtype=complex)
b
array([1.+0.j, 2.+0.j, 3.+0.j, 4.+0.j, 5.+0.j])

2.条件运算

score = np.array([[79, 90],[72, 50], [54, 75], [81, 95]])
a = score > 80
b = np.where(score < 60, 0, 90)      #小于60的替换为0,其余替换为90
a
array([[False,  True],
       [False, False],
       [False, False],
       [ True,  True]])
b
array([[90, 90],
       [90,  0],
       [ 0, 90],
       [90, 90]])

3.统计运算

result = np.amax(score, axis=0)          #求每一列最大值
result
array([81, 95])
result = np.amax(score, axis=1)         #求每一行最大值
result
array([90, 72, 75, 95])

4.数组运算

score[:, 0] = score[:, 0]+5            #将数组第零列所有分数加5分
score
array([[84, 90],
       [77, 50],
       [59, 75],
       [86, 95]])

5.矩阵运算

q = np.array([[0.4], [0.6]])
score = np.array([[79, 90],[72, 50], [54, 75], [81, 95]])
result = np.dot(score, q)
result
array([[85.6],
       [58.8],
       [66.6],
       [89.4]])

矩阵的拼接

v1 = np.array([1, 2, 3, 4])
v2 = np.array([10, 20, 30, 40])
result = np.vstack((v1, v2))
result
array([[ 1,  2,  3,  4],
       [10, 20, 30, 40]])
result = np.hstack((v1, v2))
result
array([ 1,  2,  3,  4, 10, 20, 30, 40])

二、Pandas

import pandas as pd
import numpy as np

1.Series实例

s = pd.Series(np.random.randn(4))
s
0   -1.046153
1    0.991661
2    0.492546
3   -0.137610
dtype: float64
#指定索引值及name
s = pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd'], name='创建Series')
s
a   -0.273078
b    0.555600
c   -0.792856
d    1.186401
Name: 创建Series, dtype: float64
#查看Series对象的全部公开属性和方法(不以下划线开头的名称,其中也包含索引标签 a、b、c、d)
[attr for attr in dir(s) if not attr.startswith('_')]
['T',
 'a',
 'abs',
 'add',
 'add_prefix',
 'add_suffix',
 'agg',
 'aggregate',
 'align',
 'all',
 'any',
 'append',
 'apply',
 'argmax',
 'argmin',
 'argsort',
 'array',
 'asfreq',
 'asof',
 'astype',
 'at',
 'at_time',
 'attrs',
 'autocorr',
 'axes',
 'b',
 'backfill',
 'between',
 'between_time',
 'bfill',
 'bool',
 'c',
 'clip',
 'combine',
 'combine_first',
 'compare',
 'convert_dtypes',
 'copy',
 'corr',
 'count',
 'cov',
 'cummax',
 'cummin',
 'cumprod',
 'cumsum',
 'd',
 'describe',
 'diff',
 'div',
 'divide',
 'divmod',
 'dot',
 'drop',
 'drop_duplicates',
 'droplevel',
 'dropna',
 'dtype',
 'dtypes',
 'duplicated',
 'empty',
 'eq',
 'equals',
 'ewm',
 'expanding',
 'explode',
 'factorize',
 'ffill',
 'fillna',
 'filter',
 'first',
 'first_valid_index',
 'flags',
 'floordiv',
 'ge',
 'get',
 'groupby',
 'gt',
 'hasnans',
 'head',
 'hist',
 'iat',
 'idxmax',
 'idxmin',
 'iloc',
 'index',
 'infer_objects',
 'info',
 'interpolate',
 'is_monotonic',
 'is_monotonic_decreasing',
 'is_monotonic_increasing',
 'is_unique',
 'isin',
 'isna',
 'isnull',
 'item',
 'items',
 'iteritems',
 'keys',
 'kurt',
 'kurtosis',
 'last',
 'last_valid_index',
 'le',
 'loc',
 'lt',
 'mad',
 'map',
 'mask',
 'max',
 'mean',
 'median',
 'memory_usage',
 'min',
 'mod',
 'mode',
 'mul',
 'multiply',
 'name',
 'nbytes',
 'ndim',
 'ne',
 'nlargest',
 'notna',
 'notnull',
 'nsmallest',
 'nunique',
 'pad',
 'pct_change',
 'pipe',
 'plot',
 'pop',
 'pow',
 'prod',
 'product',
 'quantile',
 'radd',
 'rank',
 'ravel',
 'rdiv',
 'rdivmod',
 'reindex',
 'reindex_like',
 'rename',
 'rename_axis',
 'reorder_levels',
 'repeat',
 'replace',
 'resample',
 'reset_index',
 'rfloordiv',
 'rmod',
 'rmul',
 'rolling',
 'round',
 'rpow',
 'rsub',
 'rtruediv',
 'sample',
 'searchsorted',
 'sem',
 'set_axis',
 'set_flags',
 'shape',
 'shift',
 'size',
 'skew',
 'slice_shift',
 'sort_index',
 'sort_values',
 'squeeze',
 'std',
 'sub',
 'subtract',
 'sum',
 'swapaxes',
 'swaplevel',
 'tail',
 'take',
 'to_clipboard',
 'to_csv',
 'to_dict',
 'to_excel',
 'to_frame',
 'to_hdf',
 'to_json',
 'to_latex',
 'to_list',
 'to_markdown',
 'to_numpy',
 'to_period',
 'to_pickle',
 'to_sql',
 'to_string',
 'to_timestamp',
 'to_xarray',
 'transform',
 'transpose',
 'truediv',
 'truncate',
 'tz_convert',
 'tz_localize',
 'unique',
 'unstack',
 'update',
 'value_counts',
 'values',
 'var',
 'view',
 'where',
 'xs']

2.DataFrame的创建

data = [['小明', 20], ['小红', 23], ['小芳', 20], ['小丽', 22]]
index = ['001', '002', '003', '004']
columns = ['姓名', '年龄']
df = pd.DataFrame(data, index, columns)
df
     姓名  年龄
001  小明  20
002  小红  23
003  小芳  20
004  小丽  22

3.数据集的清洗

查看数据集

df = pd.read_csv('C:/Users/Frank/Desktop/movie_metadata.csv')
df
      color  director_name  num_critic_for_reviews  duration  director_facebook_likes  actor_3_facebook_likes  actor_2_name  actor_1_facebook_likes  gross  genres  ...  num_user_for_reviews  language  country  content_rating  budget  title_year  actor_2_facebook_likes  imdb_score  aspect_ratio  movie_facebook_likes
0     Color  James Cameron  723.0  178.0  0.0  855.0  Joel David Moore  1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi  ...  3054.0  English  USA  PG-13  237000000.0  2009.0  936.0  7.9  1.78  33000
1     Color  Gore Verbinski  302.0  169.0  563.0  1000.0  Orlando Bloom  40000.0  309404152.0  Action|Adventure|Fantasy  ...  1238.0  English  USA  PG-13  300000000.0  2007.0  5000.0  7.1  2.35  0
2     Color  Sam Mendes  602.0  148.0  0.0  161.0  Rory Kinnear  11000.0  200074175.0  Action|Adventure|Thriller  ...  994.0  English  UK  PG-13  245000000.0  2015.0  393.0  6.8  2.35  85000
3     Color  Christopher Nolan  813.0  164.0  22000.0  23000.0  Christian Bale  27000.0  448130642.0  Action|Thriller  ...  2701.0  English  USA  PG-13  250000000.0  2012.0  23000.0  8.5  2.35  164000
4     NaN  Doug Walker  NaN  NaN  131.0  NaN  Rob Walker  131.0  NaN  Documentary  ...  NaN  NaN  NaN  NaN  NaN  NaN  12.0  7.1  NaN  0
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
5038  Color  Scott Smith  1.0  87.0  2.0  318.0  Daphne Zuniga  637.0  NaN  Comedy|Drama  ...  6.0  English  Canada  NaN  NaN  2013.0  470.0  7.7  NaN  84
5039  Color  NaN  43.0  43.0  NaN  319.0  Valorie Curry  841.0  NaN  Crime|Drama|Mystery|Thriller  ...  359.0  English  USA  TV-14  NaN  NaN  593.0  7.5  16.00  32000
5040  Color  Benjamin Roberds  13.0  76.0  0.0  0.0  Maxwell Moody  0.0  NaN  Drama|Horror|Thriller  ...  3.0  English  USA  NaN  1400.0  2013.0  0.0  6.3  NaN  16
5041  Color  Daniel Hsia  14.0  100.0  0.0  489.0  Daniel Henney  946.0  10443.0  Comedy|Drama|Romance  ...  9.0  English  USA  PG-13  NaN  2012.0  719.0  6.3  2.35  660
5042  Color  Jon Gunn  43.0  90.0  16.0  16.0  Brian Herzlinger  86.0  85222.0  Documentary  ...  84.0  English  USA  PG  1100.0  2004.0  23.0  6.6  1.85  456

5043 rows × 28 columns

过滤所有含有空值(NaN)的行

df.dropna()
      color  director_name  num_critic_for_reviews  duration  director_facebook_likes  actor_3_facebook_likes  actor_2_name  actor_1_facebook_likes  gross  genres  ...  num_user_for_reviews  language  country  content_rating  budget  title_year  actor_2_facebook_likes  imdb_score  aspect_ratio  movie_facebook_likes
0     Color  James Cameron  723.0  178.0  0.0  855.0  Joel David Moore  1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi  ...  3054.0  English  USA  PG-13  237000000.0  2009.0  936.0  7.9  1.78  33000
1     Color  Gore Verbinski  302.0  169.0  563.0  1000.0  Orlando Bloom  40000.0  309404152.0  Action|Adventure|Fantasy  ...  1238.0  English  USA  PG-13  300000000.0  2007.0  5000.0  7.1  2.35  0
2     Color  Sam Mendes  602.0  148.0  0.0  161.0  Rory Kinnear  11000.0  200074175.0  Action|Adventure|Thriller  ...  994.0  English  UK  PG-13  245000000.0  2015.0  393.0  6.8  2.35  85000
3     Color  Christopher Nolan  813.0  164.0  22000.0  23000.0  Christian Bale  27000.0  448130642.0  Action|Thriller  ...  2701.0  English  USA  PG-13  250000000.0  2012.0  23000.0  8.5  2.35  164000
5     Color  Andrew Stanton  462.0  132.0  475.0  530.0  Samantha Morton  640.0  73058679.0  Action|Adventure|Sci-Fi  ...  738.0  English  USA  PG-13  263700000.0  2012.0  632.0  6.6  2.35  24000
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
5026  Color  Olivier Assayas  81.0  110.0  107.0  45.0  Béatrice Dalle  576.0  136007.0  Drama|Music|Romance  ...  39.0  French  France  R  4500.0  2004.0  133.0  6.9  2.35  171
5027  Color  Jafar Panahi  64.0  90.0  397.0  0.0  Nargess Mamizadeh  5.0  673780.0  Drama  ...  26.0  Persian  Iran  Not Rated  10000.0  2000.0  0.0  7.5  1.85  697
5033  Color  Shane Carruth  143.0  77.0  291.0  8.0  David Sullivan  291.0  424760.0  Drama|Sci-Fi|Thriller  ...  371.0  English  USA  PG-13  7000.0  2004.0  45.0  7.0  1.85  19000
5035  Color  Robert Rodriguez  56.0  81.0  0.0  6.0  Peter Marquardt  121.0  2040920.0  Action|Crime|Drama|Romance|Thriller  ...  130.0  Spanish  USA  R  7000.0  1992.0  20.0  6.9  1.37  0
5042  Color  Jon Gunn  43.0  90.0  16.0  16.0  Brian Herzlinger  86.0  85222.0  Documentary  ...  84.0  English  USA  PG  1100.0  2004.0  23.0  6.6  1.85  456

3756 rows × 28 columns

标准缺失与非标准缺失

df['actor_3_facebook_likes'].isnull()
0       False
1       False
2       False
3       False
4        True
        ...  
5038    False
5039    False
5040    False
5041    False
5042    False
Name: actor_3_facebook_likes, Length: 5043, dtype: bool

重新定义缺失值

mis_values = ["na", "--"]
df = pd.read_csv('C:/Users/Frank/Desktop/movie_metadata.csv', na_values=mis_values)
df
      color  director_name  num_critic_for_reviews  duration  director_facebook_likes  actor_3_facebook_likes  actor_2_name  actor_1_facebook_likes  gross  genres  ...  num_user_for_reviews  language  country  content_rating  budget  title_year  actor_2_facebook_likes  imdb_score  aspect_ratio  movie_facebook_likes
0     Color  James Cameron  723.0  178.0  0.0  855.0  Joel David Moore  1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi  ...  3054.0  English  USA  PG-13  237000000.0  2009.0  936.0  7.9  1.78  33000
1     Color  Gore Verbinski  302.0  169.0  563.0  1000.0  Orlando Bloom  40000.0  309404152.0  Action|Adventure|Fantasy  ...  1238.0  English  USA  PG-13  300000000.0  2007.0  5000.0  7.1  2.35  0
2     Color  Sam Mendes  602.0  148.0  0.0  161.0  Rory Kinnear  11000.0  200074175.0  Action|Adventure|Thriller  ...  994.0  English  UK  PG-13  245000000.0  2015.0  393.0  6.8  2.35  85000
3     Color  Christopher Nolan  813.0  164.0  22000.0  23000.0  Christian Bale  27000.0  448130642.0  Action|Thriller  ...  2701.0  English  USA  PG-13  250000000.0  2012.0  23000.0  8.5  2.35  164000
4     NaN  Doug Walker  NaN  NaN  131.0  NaN  Rob Walker  131.0  NaN  Documentary  ...  NaN  NaN  NaN  NaN  NaN  NaN  12.0  7.1  NaN  0
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
5038  Color  Scott Smith  1.0  87.0  2.0  318.0  Daphne Zuniga  637.0  NaN  Comedy|Drama  ...  6.0  English  Canada  NaN  NaN  2013.0  470.0  7.7  NaN  84
5039  Color  NaN  43.0  43.0  NaN  319.0  Valorie Curry  841.0  NaN  Crime|Drama|Mystery|Thriller  ...  359.0  English  USA  TV-14  NaN  NaN  593.0  7.5  16.00  32000
5040  Color  Benjamin Roberds  13.0  76.0  0.0  0.0  Maxwell Moody  0.0  NaN  Drama|Horror|Thriller  ...  3.0  English  USA  NaN  1400.0  2013.0  0.0  6.3  NaN  16
5041  Color  Daniel Hsia  14.0  100.0  0.0  489.0  Daniel Henney  946.0  10443.0  Comedy|Drama|Romance  ...  9.0  English  USA  PG-13  NaN  2012.0  719.0  6.3  2.35  660
5042  Color  Jon Gunn  43.0  90.0  16.0  16.0  Brian Herzlinger  86.0  85222.0  Documentary  ...  84.0  English  USA  PG  1100.0  2004.0  23.0  6.6  1.85  456

5043 rows × 28 columns

#再次清洗
new_df = df.dropna()
new_df
      color  director_name  num_critic_for_reviews  duration  director_facebook_likes  actor_3_facebook_likes  actor_2_name  actor_1_facebook_likes  gross  genres  ...  num_user_for_reviews  language  country  content_rating  budget  title_year  actor_2_facebook_likes  imdb_score  aspect_ratio  movie_facebook_likes
0     Color  James Cameron  723.0  178.0  0.0  855.0  Joel David Moore  1000.0  760505847.0  Action|Adventure|Fantasy|Sci-Fi  ...  3054.0  English  USA  PG-13  237000000.0  2009.0  936.0  7.9  1.78  33000
1     Color  Gore Verbinski  302.0  169.0  563.0  1000.0  Orlando Bloom  40000.0  309404152.0  Action|Adventure|Fantasy  ...  1238.0  English  USA  PG-13  300000000.0  2007.0  5000.0  7.1  2.35  0
2     Color  Sam Mendes  602.0  148.0  0.0  161.0  Rory Kinnear  11000.0  200074175.0  Action|Adventure|Thriller  ...  994.0  English  UK  PG-13  245000000.0  2015.0  393.0  6.8  2.35  85000
3     Color  Christopher Nolan  813.0  164.0  22000.0  23000.0  Christian Bale  27000.0  448130642.0  Action|Thriller  ...  2701.0  English  USA  PG-13  250000000.0  2012.0  23000.0  8.5  2.35  164000
5     Color  Andrew Stanton  462.0  132.0  475.0  530.0  Samantha Morton  640.0  73058679.0  Action|Adventure|Sci-Fi  ...  738.0  English  USA  PG-13  263700000.0  2012.0  632.0  6.6  2.35  24000
...   ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...
5026  Color  Olivier Assayas  81.0  110.0  107.0  45.0  Béatrice Dalle  576.0  136007.0  Drama|Music|Romance  ...  39.0  French  France  R  4500.0  2004.0  133.0  6.9  2.35  171
5027  Color  Jafar Panahi  64.0  90.0  397.0  0.0  Nargess Mamizadeh  5.0  673780.0  Drama  ...  26.0  Persian  Iran  Not Rated  10000.0  2000.0  0.0  7.5  1.85  697
5033  Color  Shane Carruth  143.0  77.0  291.0  8.0  David Sullivan  291.0  424760.0  Drama|Sci-Fi|Thriller  ...  371.0  English  USA  PG-13  7000.0  2004.0  45.0  7.0  1.85  19000
5035  Color  Robert Rodriguez  56.0  81.0  0.0  6.0  Peter Marquardt  121.0  2040920.0  Action|Crime|Drama|Romance|Thriller  ...  130.0  Spanish  USA  R  7000.0  1992.0  20.0  6.9  1.37  0
5042  Color  Jon Gunn  43.0  90.0  16.0  16.0  Brian Herzlinger  86.0  85222.0  Documentary  ...  84.0  English  USA  PG  1100.0  2004.0  23.0  6.6  1.85  456

3756 rows × 28 columns

重复值

person={
    'brand':['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
    'style':['cup', 'cup', 'cup', 'pack', 'pack'],
    'rating':[4, 4, 3.4, 15, 5]}
df = pd.DataFrame(person)
df
     brand style  rating
0  Yum Yum   cup     4.0
1  Yum Yum   cup     4.0
2  Indomie   cup     3.4
3  Indomie  pack    15.0
4  Indomie  pack     5.0
df.duplicated()
0    False
1     True
2    False
3    False
4    False
dtype: bool
df.duplicated(keep='last')
0     True
1    False
2    False
3    False
4    False
dtype: bool
df.duplicated(keep=False)
0     True
1     True
2    False
3    False
4    False
dtype: bool

删除重复值

df.drop_duplicates()
     brand style  rating
0  Yum Yum   cup     4.0
2  Indomie   cup     3.4
3  Indomie  pack    15.0
4  Indomie  pack     5.0

异常值替换

df.replace('Indomie', 'xyz')
     brand style  rating
0  Yum Yum   cup     4.0
1  Yum Yum   cup     4.0
2      xyz   cup     3.4
3      xyz  pack    15.0
4      xyz  pack     5.0

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值