十套练习使用pandas数据分析.6
import pandas as pd
import numpy as np
# Load the whitespace-delimited wind dataset. Columns 0-2 are combined and
# parsed into one date column, which pandas names "Yr_Mo_Dy".
# The separator is a raw string so that "\s" reaches the regex engine
# verbatim instead of being processed as an (invalid) string escape.
# NOTE(review): nested-list parse_dates is deprecated in recent pandas
# releases; confirm the installed version before upgrading.
data = pd.read_table(
    "D:/东华研/数据分析/pandas_exercise/exercise_data/wind.data",
    sep=r'\s+',
    parse_dates=[[0, 1, 2]],
)
# Preview the first rows.
data.head()
Yr_Mo_Dy RPT VAL ROS KIL SHA BIR DUB CLA MUL CLO BEL MAL 0 2061-01-01 15.04 14.96 13.17 9.29 NaN 9.87 13.67 10.25 10.83 12.58 18.50 15.04 1 2061-01-02 14.71 NaN 10.83 6.50 12.62 7.67 11.50 10.04 9.79 9.67 17.54 13.83 2 2061-01-03 18.50 16.88 12.33 10.13 11.17 6.17 11.25 NaN 8.50 7.67 12.75 12.71 3 2061-01-04 10.58 6.63 11.75 4.58 4.54 2.88 8.63 1.79 5.83 5.88 5.46 10.88 4 2061-01-05 13.33 13.25 11.42 6.17 10.71 8.21 11.92 6.54 10.92 10.34 12.92 11.83
import datetime
def fix_century(x, cutoff_year=1989):
    """Return the date of *x* with its year moved back one century if needed.

    Any year strictly greater than ``cutoff_year`` has 100 subtracted from
    it; other years are kept unchanged.

    Args:
        x: Any object exposing ``year``, ``month`` and ``day`` attributes
            (for example a ``datetime.date`` or a pandas ``Timestamp``).
        cutoff_year: Largest year that is left untouched. Defaults to 1989.

    Returns:
        A ``datetime.date`` with the (possibly corrected) year and the
        original month and day.

    Raises:
        ValueError: If the shifted date does not exist, e.g. 29 February
            moved into a non-leap year.
    """
    year = x.year - 100 if x.year > cutoff_year else x.year
    return datetime.date(year, x.month, x.day)
# Run the century correction on every value of the parsed date column.
data[ 'Yr_Mo_Dy' ] = data[ 'Yr_Mo_Dy' ] . apply ( fix_century)
# Preview the first rows to check the corrected years.
data. head( )
Yr_Mo_Dy RPT VAL ROS KIL SHA BIR DUB CLA MUL CLO BEL MAL 0 1961-01-01 15.04 14.96 13.17 9.29 NaN 9.87 13.67 10.25 10.83 12.58 18.50 15.04 1 1961-01-02 14.71 NaN 10.83 6.50 12.62 7.67 11.50 10.04 9.79 9.67 17.54 13.83 2 1961-01-03 18.50 16.88 12.33 10.13 11.17 6.17 11.25 NaN 8.50 7.67 12.75 12.71 3 1961-01-04 10.58 6.63 11.75 4.58 4.54 2.88 8.63 1.79 5.83 5.88 5.46 10.88 4 1961-01-05 13.33 13.25 11.42 6.17 10.71 8.21 11.92 6.54 10.92 10.34 12.92 11.83
# fix_century returns datetime.date objects, so convert the column back to
# pandas datetimes.
data[ "Yr_Mo_Dy" ] = pd. to_datetime( data[ "Yr_Mo_Dy" ] )
# Print the column dtypes and non-null counts.
data. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6574 entries, 0 to 6573
Data columns (total 13 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Yr_Mo_Dy 6574 non-null datetime64[ns]
1 RPT 6568 non-null float64
2 VAL 6571 non-null float64
3 ROS 6572 non-null float64
4 KIL 6569 non-null float64
5 SHA 6572 non-null float64
6 BIR 6574 non-null float64
7 DUB 6571 non-null float64
8 CLA 6572 non-null float64
9 MUL 6571 non-null float64
10 CLO 6573 non-null float64
11 BEL 6574 non-null float64
12 MAL 6570 non-null float64
dtypes: datetime64[ns](1), float64(12)
memory usage: 667.8 KB
# Make the date column the row index; set_index returns a new frame, so the
# name is rebound to the result.
data= data. set_index( 'Yr_Mo_Dy' )
# Preview the re-indexed frame.
data. head( )
RPT VAL ROS KIL SHA BIR DUB CLA MUL CLO BEL MAL Yr_Mo_Dy 1961-01-01 15.04 14.96 13.17 9.29 NaN 9.87 13.67 10.25 10.83 12.58 18.50 15.04 1961-01-02 14.71 NaN 10.83 6.50 12.62 7.67 11.50 10.04 9.79 9.67 17.54 13.83 1961-01-03 18.50 16.88 12.33 10.13 11.17 6.17 11.25 NaN 8.50 7.67 12.75 12.71 1961-01-04 10.58 6.63 11.75 4.58 4.54 2.88 8.63 1.79 5.83 5.88 5.46 10.88 1961-01-05 13.33 13.25 11.42 6.17 10.71 8.21 11.92 6.54 10.92 10.34 12.92 11.83
# Number of missing values in each column.
data.isna().sum()
RPT 6
VAL 3
ROS 2
KIL 5
SHA 2
BIR 0
DUB 3
CLA 2
MUL 3
CLO 1
BEL 0
MAL 4
dtype: int64
# Number of non-missing values in each column.
data.notna().sum()
RPT 6568
VAL 6571
ROS 6572
KIL 6569
SHA 6572
BIR 6574
DUB 6571
CLA 6572
MUL 6571
CLO 6573
BEL 6574
MAL 6570
dtype: int64
# Collapse the per-column means into a single number.
# NOTE(review): this weights every column equally; because the columns have
# different numbers of missing values, it is not identical to the mean taken
# over all individual non-null observations. Confirm which one is wanted.
data. mean( ) . mean( )
10.227982360836924
# Per-column summary: one row per column of `data`, with that column's
# minimum, maximum, mean and variance as the four output columns.
lco_stars = pd.DataFrame(
    {
        'min': data.min(),
        'max': data.max(),
        'mean': data.mean(),
        'var': data.var(),
    }
)
lco_stars
min max mean var RPT 0.67 35.80 12.362987 31.566564 VAL 0.21 33.37 10.644314 27.745044 ROS 1.50 33.84 11.660526 25.084571 KIL 0.00 28.46 6.306468 13.001874 SHA 0.13 37.54 10.455834 24.365332 BIR 0.00 26.16 7.092254 15.750446 DUB 0.00 30.37 9.797343 24.776050 CLA 0.00 31.08 8.495053 20.245042 MUL 0.00 25.88 8.493590 17.362826 CLO 0.04 28.21 8.707332 20.285606 BEL 0.13 42.38 13.121007 34.047657 MAL 0.67 42.54 15.599079 44.887233
# Per-row summary: for every row of `data`, the minimum, maximum, mean and
# variance taken across its columns (axis=1).
lco_stars_1 = pd.DataFrame(
    {
        'min': data.min(axis=1),
        'max': data.max(axis=1),
        'mean': data.mean(axis=1),
        'var': data.var(axis=1),
    }
)
lco_stars_1.head()
min max mean var Yr_Mo_Dy 1961-01-01 9.29 18.50 13.018182 7.889776 1961-01-02 6.50 17.54 11.336364 10.169685 1961-01-03 6.17 18.50 11.641818 13.556476 1961-01-04 1.79 11.75 6.619167 10.228008 1961-01-05 6.17 13.33 10.630000 5.979764
# Copy the index values into a regular column named 'date'.
data[ 'date' ] = data. index
# Preview the frame with the new column.
data. head( )
RPT VAL ROS KIL SHA BIR DUB CLA MUL CLO BEL MAL date Yr_Mo_Dy 1961-01-01 15.04 14.96 13.17 9.29 NaN 9.87 13.67 10.25 10.83 12.58 18.50 15.04 1961-01-01 1961-01-02 14.71 NaN 10.83 6.50 12.62 7.67 11.50 10.04 9.79 9.67 17.54 13.83 1961-01-02 1961-01-03 18.50 16.88 12.33 10.13 11.17 6.17 11.25 NaN 8.50 7.67 12.75 12.71 1961-01-03 1961-01-04 10.58 6.63 11.75 4.58 4.54 2.88 8.63 1.79 5.83 5.88 5.46 10.88 1961-01-04 1961-01-05 13.33 13.25 11.42 6.17 10.71 8.21 11.92 6.54 10.92 10.34 12.92 11.83 1961-01-05
# Mean of each numeric column over all rows whose index month is January.
# The frame also holds the datetime 'date' column, so numeric_only=True is
# passed explicitly: relying on the implicit default raises a FutureWarning
# and would change the result once datetime columns are included.
data[data.index.month == 1].mean(numeric_only=True)
C:\WINDOWS\TEMP/ipykernel_5984/279758014.py:1: FutureWarning: DataFrame.mean and DataFrame.median with numeric_only=None will include datetime64 and datetime64tz columns in a future version.
data[data.index.month==1].mean()
RPT 14.847325
VAL 12.914560
ROS 13.299624
KIL 7.199498
SHA 11.667734
BIR 8.054839
DUB 11.819355
CLA 9.512047
MUL 9.543208
CLO 10.053566
BEL 14.550520
MAL 18.028763
dtype: float64
# Split the 'date' column into calendar components. The vectorized .dt
# accessor replaces a Python-level lambda call per row.
data['month'] = data['date'].dt.month
data['year'] = data['date'].dt.year
data['day'] = data['date'].dt.day
# Keep only the January rows, then average the columns from 'RPT' through
# 'MAL' (label slice, both ends inclusive) so the helper columns are excluded.
january_winds = data.query('month == 1')
january_winds.loc[:, 'RPT':'MAL'].mean()
RPT 14.847325
VAL 12.914560
ROS 13.299624
KIL 7.199498
SHA 11.667734
BIR 8.054839
DUB 11.819355
CLA 9.512047
MUL 9.543208
CLO 10.053566
BEL 14.550520
MAL 18.028763
dtype: float64
# Display the frame, which now also holds the date, month, year and day columns.
data
RPT VAL ROS KIL SHA BIR DUB CLA MUL CLO BEL MAL date month year day Yr_Mo_Dy 1961-01-01 15.04 14.96 13.17 9.29 NaN 9.87 13.67 10.25 10.83 12.58 18.50 15.04 1961-01-01 1 1961 1 1961-01-02 14.71 NaN 10.83 6.50 12.62 7.67 11.50 10.04 9.79 9.67 17.54 13.83 1961-01-02 1 1961 2 1961-01-03 18.50 16.88 12.33 10.13 11.17 6.17 11.25 NaN 8.50 7.67 12.75 12.71 1961-01-03 1 1961 3 1961-01-04 10.58 6.63 11.75 4.58 4.54 2.88 8.63 1.79 5.83 5.88 5.46 10.88 1961-01-04 1 1961 4 1961-01-05 13.33 13.25 11.42 6.17 10.71 8.21 11.92 6.54 10.92 10.34 12.92 11.83 1961-01-05 1 1961 5 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 1978-12-27 17.58 16.96 17.62 8.08 13.21 11.67 14.46 15.59 14.04 14.00 17.21 40.08 1978-12-27 12 1978 27 1978-12-28 13.21 5.46 13.46 5.00 8.12 9.42 14.33 16.25 15.25 18.05 21.79 41.46 1978-12-28 12 1978 28 1978-12-29 14.00 10.29 14.42 8.71 9.71 10.54 19.17 12.46 14.50 16.42 18.88 29.58 1978-12-29 12 1978 29 1978-12-30 18.50 14.04 21.29 9.13 12.75 9.71 18.08 12.87 12.46 12.12 14.67 28.79 1978-12-30 12 1978 30 1978-12-31 20.33 17.41 27.29 9.59 12.08 10.13 19.25 11.63 11.58 11.38 12.08 22.08 1978-12-31 12 1978 31
6574 rows × 16 columns
# Rows where both month and day equal 1, i.e. 1 January of each year.
data. query( 'month==1 and day==1' )
RPT VAL ROS KIL SHA BIR DUB CLA MUL CLO BEL MAL date month year day Yr_Mo_Dy 1961-01-01 15.04 14.96 13.17 9.29 NaN 9.87 13.67 10.25 10.83 12.58 18.50 15.04 1961-01-01 1 1961 1 1962-01-01 9.29 3.42 11.54 3.50 2.21 1.96 10.41 2.79 3.54 5.17 4.38 7.92 1962-01-01 1 1962 1 1963-01-01 15.59 13.62 19.79 8.38 12.25 10.00 23.45 15.71 13.59 14.37 17.58 34.13 1963-01-01 1 1963 1 1964-01-01 25.80 22.13 18.21 13.25 21.29 14.79 14.12 19.58 13.25 16.75 28.96 21.00 1964-01-01 1 1964 1 1965-01-01 9.54 11.92 9.00 4.38 6.08 5.21 10.25 6.08 5.71 8.63 12.04 17.41 1965-01-01 1 1965 1 1966-01-01 22.04 21.50 17.08 12.75 22.17 15.59 21.79 18.12 16.66 17.83 28.33 23.79 1966-01-01 1 1966 1 1967-01-01 6.46 4.46 6.50 3.21 6.67 3.79 11.38 3.83 7.71 9.08 10.67 20.91 1967-01-01 1 1967 1 1968-01-01 30.04 17.88 16.25 16.25 21.79 12.54 18.16 16.62 18.75 17.62 22.25 27.29 1968-01-01 1 1968 1 1969-01-01 6.13 1.63 5.41 1.08 2.54 1.00 8.50 2.42 4.58 6.34 9.17 16.71 1969-01-01 1 1969 1 1970-01-01 9.59 2.96 11.79 3.42 6.13 4.08 9.00 4.46 7.29 3.50 7.33 13.00 1970-01-01 1 1970 1 1971-01-01 3.71 0.79 4.71 0.17 1.42 1.04 4.63 0.75 1.54 1.08 4.21 9.54 1971-01-01 1 1971 1 1972-01-01 9.29 3.63 14.54 4.25 6.75 4.42 13.00 5.33 10.04 8.54 8.71 19.17 1972-01-01 1 1972 1 1973-01-01 16.50 15.92 14.62 7.41 8.29 11.21 13.54 7.79 10.46 10.79 13.37 9.71 1973-01-01 1 1973 1 1974-01-01 23.21 16.54 16.08 9.75 15.83 11.46 9.54 13.54 13.83 16.66 17.21 25.29 1974-01-01 1 1974 1 1975-01-01 14.04 13.54 11.29 5.46 12.58 5.58 8.12 8.96 9.29 5.17 7.71 11.63 1975-01-01 1 1975 1 1976-01-01 18.34 17.67 14.83 8.00 16.62 10.13 13.17 9.04 13.13 5.75 11.38 14.96 1976-01-01 1 1976 1 1977-01-01 20.04 11.92 20.25 9.13 9.29 8.04 10.75 5.88 9.00 9.00 14.88 25.70 1977-01-01 1 1977 1 1978-01-01 8.33 7.12 7.71 3.54 8.50 7.50 14.71 10.00 11.83 10.00 15.09 20.46 1978-01-01 1 1978 1
# Rows where day equals 1, i.e. the first day of every month.
data. query( 'day==1' )
RPT VAL ROS KIL SHA BIR DUB CLA MUL CLO BEL MAL date month year day Yr_Mo_Dy 1961-01-01 15.04 14.96 13.17 9.29 NaN 9.87 13.67 10.25 10.83 12.58 18.50 15.04 1961-01-01 1 1961 1 1961-02-01 14.25 15.12 9.04 5.88 12.08 7.17 10.17 3.63 6.50 5.50 9.17 8.00 1961-02-01 2 1961 1 1961-03-01 12.67 13.13 11.79 6.42 9.79 8.54 10.25 13.29 NaN 12.21 20.62 NaN 1961-03-01 3 1961 1 1961-04-01 8.38 6.34 8.33 6.75 9.33 9.54 11.67 8.21 11.21 6.46 11.96 7.17 1961-04-01 4 1961 1 1961-05-01 15.87 13.88 15.37 9.79 13.46 10.17 9.96 14.04 9.75 9.92 18.63 11.12 1961-05-01 5 1961 1 ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 1978-08-01 19.33 15.09 20.17 8.83 12.62 10.41 9.33 12.33 9.50 9.92 15.75 18.00 1978-08-01 8 1978 1 1978-09-01 8.42 6.13 9.87 5.25 3.21 5.71 7.25 3.50 7.33 6.50 7.62 15.96 1978-09-01 9 1978 1 1978-10-01 9.50 6.83 10.50 3.88 6.13 4.58 4.21 6.50 6.38 6.54 10.63 14.09 1978-10-01 10 1978 1 1978-11-01 13.59 16.75 11.25 7.08 11.04 8.33 8.17 11.29 10.75 11.25 23.13 25.00 1978-11-01 11 1978 1 1978-12-01 21.29 16.29 24.04 12.79 18.21 19.29 21.54 17.21 16.71 17.83 17.75 25.70 1978-12-01 12 1978 1
216 rows × 16 columns