numpy能够帮助我们处理数值,但是pandas除了处理数值之外(基于numpy),还能够帮助我们处理其他类型的数据(字符串、时间序列等等)
Series
import pandas as pd
import numpy as np
import string
# Integer Series 0..9 labelled with the first ten uppercase letters (A..J).
t = pd.Series(
    data=np.arange(10),
    index=list(string.ascii_uppercase[:10]),
)
t
A 0
B 1
C 2
D 3
E 4
F 5
G 6
H 7
I 8
J 9
dtype: int64
# Check the Python type of t (a pandas Series).
type ( t)
pandas.core.series.Series
# Map each of the first ten uppercase letters to its position: {'A': 0, ..., 'J': 9}.
a = {
    letter: position
    for position, letter in enumerate(string.ascii_uppercase[:10])
}
a
{'A': 0,
'B': 1,
'C': 2,
'D': 3,
'E': 4,
'F': 5,
'G': 6,
'H': 7,
'I': 8,
'J': 9}
# Build a Series directly from the dict: keys become the index labels, values the data.
pd. Series( a)
A 0
B 1
C 2
D 3
E 4
F 5
G 6
H 7
I 8
J 9
dtype: int64
# Same dict, but with an explicit index F..O. Labels that are not keys of the
# dict (K..O) come out as NaN, and the dtype becomes float64 to hold them.
pd. Series( a, index= list ( string. ascii_uppercase[ 5 : 15 ] ) )
F 5.0
G 6.0
H 7.0
I 8.0
J 9.0
K NaN
L NaN
M NaN
N NaN
O NaN
dtype: float64
# Display t again as the reference for the indexing examples below.
t
A 0
B 1
C 2
D 3
E 4
F 5
G 6
H 7
I 8
J 9
dtype: int64
# Label-based lookup: the value stored under index label "F".
t[ "F" ]
5
# Positional lookup (second element). Use .iloc: on a Series with string labels,
# an integer key in plain [] only works through a deprecated positional fallback.
t.iloc[1]
1
# Select several elements by position. Use .iloc: a list of integers in plain []
# on a string-labelled Series relies on a deprecated positional fallback.
t.iloc[[2, 4, 5]]
C 2
E 4
F 5
dtype: int64
# A list of labels selects several entries at once and returns a Series.
t[ [ "A" , "G" ] ]
A 0
G 6
dtype: int64
# The labels of the Series, exposed as a pandas Index.
t. index
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')
# The data of the Series, exposed as a NumPy array.
t. values
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
# Types of the two attributes shown above.
type ( t. index)
pandas.core.indexes.base.Index
type ( t. values)
numpy.ndarray
DataFrame
# 3x4 DataFrame holding 0..11 in row-major order, with default integer row and column labels.
tdf = pd.DataFrame(data=np.arange(12).reshape(3, 4))
tdf
# (rows, columns)
tdf.shape
(3, 4)
# dtype of every column, one entry per column label.
tdf. dtypes
0 int64
1 int64
2 int64
3 int64
dtype: object
# Number of dimensions (2 for this DataFrame).
tdf. ndim
2
# Row labels.
tdf. index
RangeIndex(start=0, stop=3, step=1)
# Column labels.
tdf. columns
RangeIndex(start=0, stop=4, step=1)
# The data as a 2-D NumPy array.
tdf. values
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
# First 2 rows.
tdf. head( 2 )
# Last 3 rows (the whole frame here, since tdf only has 3 rows).
tdf. tail( 3 )
# Print a summary: index range, per-column non-null counts and dtypes, memory usage.
tdf. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 3 non-null int64
1 1 3 non-null int64
2 2 3 non-null int64
3 3 3 non-null int64
dtypes: int64(4)
memory usage: 224.0 bytes
# Load the dog-name frequency table (columns: Row_Labels, Count_AnimalName).
df = pd.read_csv("./day04/code/dogNames2.csv")
# Keep names used more than 800 but fewer than 1000 times. Both bounds must hold,
# so the two masks are combined with &. With | every row would pass, because any
# count is either above 800 or below 1000.
print(df[(df["Count_AnimalName"] > 800) & (df["Count_AnimalName"] < 1000)], '\n')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would print a stray "None").
df.info()
Row_Labels Count_AnimalName
0 1 1
1 2 2
2 40804 1
3 90201 1
4 90203 1
... ... ...
16215 37916 1
16216 38282 1
16217 38583 1
16218 38948 1
16219 39743 1
[16220 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16220 entries, 0 to 16219
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Row_Labels 16217 non-null object
1 Count_AnimalName 16220 non-null int64
dtypes: int64(1), object(1)
memory usage: 253.6+ KB
None
DataFrame是由Series组成的
# A slice inside [] selects rows: here, the first 20 rows.
df[ : 20 ]
Row_Labels Count_AnimalName 0 1 1 1 2 2 2 40804 1 3 90201 1 4 90203 1 5 102201 1 6 3010271 1 7 MARCH 2 8 APRIL 51 9 AUGUST 14 10 DECEMBER 4 11 SUNDAY 13 12 MONDAY 4 13 FRIDAY 19 14 JAN 1 15 JUN 1 16 JANUARY 1 17 JUNE 24 18 JULY 9 19 MON 2
# A column name inside [] selects that single column.
df[ "Count_AnimalName" ]
0 1
1 2
2 1
3 1
4 1
..
16215 1
16216 1
16217 1
16218 1
16219 1
Name: Count_AnimalName, Length: 16220, dtype: int64
# A single selected column is a pandas Series.
type ( df[ "Count_AnimalName" ] )
pandas.core.series.Series
pandas的loc和iloc
# .loc with one row label and one column label returns a single value.
df. loc[ 1 , "Row_Labels" ]
'2'
# Lists of row labels and column labels select the matching rows and columns.
df. loc[ [ 1 , 2 , 4 , 5 ] , [ "Row_Labels" , "Count_AnimalName" ] ]
Row_Labels Count_AnimalName 1 2 2 2 40804 1 4 90203 1 5 102201 1
# A one-element list of row labels still produces a table (one row with both columns).
df. loc[ [ 4 ] , [ "Row_Labels" , "Count_AnimalName" ] ]
Row_Labels Count_AnimalName 4 90203 1
# .loc slices are label-based and include both endpoints: rows 1 through 8.
df. loc[ 1 : 8 , [ "Row_Labels" , "Count_AnimalName" ] ]
Row_Labels Count_AnimalName 1 2 2 2 40804 1 3 90201 1 4 90203 1 5 102201 1 6 3010271 1 7 MARCH 2 8 APRIL 51
# .iloc slices are positional and exclude the stop: rows 1 through 5, columns 0 and 1.
df. iloc[ 1 : 6 , [ 0 , 1 ] ]
Row_Labels Count_AnimalName 1 2 2 2 40804 1 3 90201 1 4 90203 1 5 102201 1
# Boolean indexing: keep only the rows whose count is greater than 999.
df[ df[ "Count_AnimalName" ] > 999 ]
Row_Labels Count_AnimalName 1156 BELLA 1195 9140 MAX 1153
# Keep rows whose name has at least 4 characters AND whose count is above 700.
# Each comparison is parenthesised because & binds tighter than >= and >.
df[
    (df["Row_Labels"].str.len() >= 4)
    & (df["Count_AnimalName"] > 700)
]
Row_Labels Count_AnimalName 1156 BELLA 1195 2660 CHARLIE 856 3251 COCO 852 8417 LOLA 795 8552 LUCKY 723 8560 LUCY 710 12368 ROCKY 823
# Load the IMDB movie table and preview its first row.
file_path51 = "./day05/code/IMDB-Movie-Data.csv"
df51 = pd.read_csv(file_path51)
print(df51.head(1))

# Average of the Rating column.
print('rating-meaning:', df51["Rating"].mean())

# Each Actors cell is a single ", "-separated string. Split every cell into a
# list of names, then flatten the per-movie lists into one list.
temp_actors_list = df51["Actors"].str.split(", ").tolist()
actors_list = [name for cast in temp_actors_list for name in cast]

# Number of distinct actor names.
actors_num = len(set(actors_list))
print(actors_num)
Rank Title Genre \
0 1 Guardians of the Galaxy Action,Adventure,Sci-Fi
Description Director \
0 A group of intergalactic criminals are forced ... James Gunn
Actors Year Runtime (Minutes) \
0 Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121
Rating Votes Revenue (Millions) Metascore
0 8.1 757074 333.13 76.0
rating-meaning: 6.723199999999999
2015
import pandas as pd
from matplotlib import pyplot as plt
file_path52 = "./day05/code/starbucks_store_worldwide.csv"
df52 = pd.read_csv(file_path52)

# Stores per country: count the non-null "Brand" entries in each Country group,
# sort descending and keep the ten largest.
data1 = (
    df52.groupby(by="Country")["Brand"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)
print(data1, '\n')

# Country labels for the x axis, store counts for the bar heights.
_x = data1.index
_y = data1.values

# One bar per country at positions 0..len-1, each tick labelled with its country.
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x)
plt.show()
Country
US 13608
CN 2734
CA 1468
JP 1237
KR 993
GB 901
MX 579
TW 394
TR 326
PH 298
Name: Brand, dtype: int64
import pandas as pd
from matplotlib import pyplot as plt
file_path61 = "./day06/code/PM2.5/BeijingPM20100101_20151231.csv"
df61 = pd.read_csv(file_path61)

# Assemble one timestamp per row from the separate year/month/day/hour columns.
# pd.to_datetime accepts such a frame directly. The PeriodIndex(year=..., ...)
# constructor form and the "H" frequency alias are deprecated in recent pandas.
period = pd.to_datetime(df61[["year", "month", "day", "hour"]])
df61["datetime"] = period
print('head():\n', df61.head(), '\n')

# Index by time, then average every 7-day window. numeric_only=True skips the
# text column (cbwd), which cannot be averaged.
df61.set_index("datetime", inplace=True)
df61 = df61.resample("7D").mean(numeric_only=True)
print('resample:\n', df61.head())

# The two PM2.5 series to compare.
data = df61["PM_US Post"]
data_china = df61["PM_Nongzhanguan"]
print(data_china.head())

# Tick labels: the start date of each window, formatted as YYYYMMDD.
_x = [i.strftime("%Y%m%d") for i in data.index]
_x_china = [i.strftime("%Y%m%d") for i in data_china.index]
# Both series come from the same resampled frame, so the two lengths should match.
print(len(_x), len(_x_china))

_y = data.values
_y_china = data_china.values

plt.figure(figsize=(20, 8), dpi=80)
plt.plot(range(len(_x)), _y, label="US_POST", alpha=0.7)
plt.plot(range(len(_x_china)), _y_china, label="CN_POST", alpha=0.7)
# Label only every 10th window to keep the x axis readable.
plt.xticks(range(0, len(_x_china), 10), _x_china[::10], rotation=45)
plt.legend(loc="best")
plt.show()
head():
No year month day hour season PM_Dongsi PM_Dongsihuan \
0 1 2010 1 1 0 4 NaN NaN
1 2 2010 1 1 1 4 NaN NaN
2 3 2010 1 1 2 4 NaN NaN
3 4 2010 1 1 3 4 NaN NaN
4 5 2010 1 1 4 4 NaN NaN
PM_Nongzhanguan PM_US Post DEWP HUMI PRES TEMP cbwd Iws \
0 NaN NaN -21.0 43.0 1021.0 -11.0 NW 1.79
1 NaN NaN -21.0 47.0 1020.0 -12.0 NW 4.92
2 NaN NaN -21.0 43.0 1019.0 -11.0 NW 6.71
3 NaN NaN -21.0 55.0 1019.0 -14.0 NW 9.84
4 NaN NaN -20.0 51.0 1018.0 -12.0 NW 12.97
precipitation Iprec datetime
0 0.0 0.0 2010-01-01 00:00
1 0.0 0.0 2010-01-01 01:00
2 0.0 0.0 2010-01-01 02:00
3 0.0 0.0 2010-01-01 03:00
4 0.0 0.0 2010-01-01 04:00
resample:
No year month day hour season PM_Dongsi \
datetime
2010-01-01 84.5 2010.0 1.000000 4.000000 11.5 4.0 NaN
2010-01-08 252.5 2010.0 1.000000 11.000000 11.5 4.0 NaN
2010-01-15 420.5 2010.0 1.000000 18.000000 11.5 4.0 NaN
2010-01-22 588.5 2010.0 1.000000 25.000000 11.5 4.0 NaN
2010-01-29 756.5 2010.0 1.571429 14.285714 11.5 4.0 NaN
PM_Dongsihuan PM_Nongzhanguan PM_US Post DEWP HUMI \
datetime
2010-01-01 NaN NaN 71.627586 -18.255952 54.395833
2010-01-08 NaN NaN 69.910714 -19.035714 49.386905
2010-01-15 NaN NaN 163.654762 -12.630952 57.755952
2010-01-22 NaN NaN 68.069307 -17.404762 34.095238
2010-01-29 NaN NaN 53.583333 -17.565476 34.928571
PRES TEMP Iws precipitation Iprec
datetime
2010-01-01 1027.910714 -10.202381 43.859821 0.066667 0.786905
2010-01-08 1030.035714 -10.029762 45.392083 0.000000 0.000000
2010-01-15 1030.386905 -4.946429 17.492976 0.000000 0.000000
2010-01-22 1026.196429 -2.672619 54.854048 0.000000 0.000000
2010-01-29 1025.273810 -2.083333 26.625119 0.000000 0.000000
datetime
2010-01-01 NaN
2010-01-08 NaN
2010-01-15 NaN
2010-01-22 NaN
2010-01-29 NaN
Freq: 7D, Name: PM_Nongzhanguan, dtype: float64
313 313