import numpy as np
import pandas as pd
# Show pandas version info, then build Series three ways: from a list,
# an ndarray, and a dict (dict keys become the index).
print(pd.__version__)
# Bug fix: show_versions() prints its report itself and returns None, so
# wrapping it in print() emitted a stray "None" line.
pd.show_versions(as_json=True)
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = dict(zip(mylist, myarr))
s1 = pd.Series(mylist)
s2 = pd.Series(myarr)
s3 = pd.Series(mydict)
print("*" * 50)
print(s1.head())
print(s2.head())
print(s3.head())
0.25.1
{'system': {'commit': None, 'python': '3.7.4.final.0', 'python-bits': 64, 'OS': 'Windows', 'OS-release': '10', 'machine': 'AMD64', 'processor': 'Intel64 Family 6 Model 78 Stepping 3, GenuineIntel', 'byteorder': 'little', 'LC_ALL': 'None', 'LANG': 'None', 'LOCALE': 'None.None'}, 'dependencies': {'pandas': '0.25.1', 'numpy': '1.17.2', 'pytz': '2019.2', 'dateutil': '2.8.0', 'pip': '19.3.1', 'setuptools': '41.4.0', 'Cython': None, 'pytest': '4.3.0', 'hypothesis': None, 'sphinx': None, 'blosc': None, 'feather': None, 'xlsxwriter': '1.2.5', 'lxml.etree': '4.4.1', 'html5lib': None, 'pymysql': '0.9.2', 'psycopg2': None, 'jinja2': '2.10.1', 'IPython': '7.9.0', 'pandas_datareader': None, 'bs4': '4.8.0', 'bottleneck': None, 'fastparquet': None, 'gcsfs': None, 'matplotlib': '3.1.1', 'numexpr': None, 'odfpy': None, 'openpyxl': None, 'pandas_gbq': None, 'pyarrow': None, 'pytables': None, 's3fs': None, 'scipy': '1.3.1', 'sqlalchemy': '1.3.10', 'tables': None, 'xarray': None, 'xlrd': None, 'xlwt': '1.3.0'}}
None
**************************************************
0 a
1 b
2 c
3 e
4 d
dtype: object
0 0
1 1
2 2
3 3
4 4
dtype: int32
a 0
b 1
c 2
e 3
d 4
dtype: int64
mylist = list ( 'abcedfghijklmnopqrstuvwxyz' )
myarr = np. arange( 26 )
mydict = dict ( zip ( mylist, myarr) )
ser = pd. Series( mydict)
df= ser. to_frame( ) . reset_index( )
df. head( )
ser1 = pd. Series( list ( 'abcedfghijklmnopqrstuvwxyz' ) )
ser2 = pd. Series( np. arange( 26 ) )
df1= pd. concat( [ ser1, ser2] , axis= 1 )
df2= pd. DataFrame( { "col_1" : ser1, "col_2" : ser2} )
print ( df2. head( ) )
col_1 col_2
0 a 0
1 b 1
2 c 2
3 e 3
4 d 4
ser = pd. Series( list ( 'abcedfghijklmnopqrstuvwxyz' ) )
ser. name= "alphabets"
print ( ser. head( ) )
0 a
1 b
2 c
3 e
4 d
Name: alphabets, dtype: object
ser1 = pd. Series( [ 1 , 2 , 3 , 4 , 5 ] )
ser2 = pd. Series( [ 4 , 5 , 6 , 7 , 8 ] )
print ( ser1[ ~ ser1. isin( ser2) ] )
0 1
1 2
2 3
dtype: int64
ser1 = pd. Series( [ 1 , 2 , 3 , 4 , 5 ] )
ser2 = pd. Series( [ 4 , 5 , 6 , 7 , 8 ] )
ser_u = pd. Series( np. union1d( ser1, ser2) )
print ( ser_u)
ser_i = pd. Series( np. intersect1d( ser1, ser2) )
print ( "-" * 50 )
print ( ser_i)
print ( "-" * 50 )
print ( ser_u[ ~ ser_u. isin( ser_i) ] )
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
dtype: int64
--------------------------------------------------
0 4
1 5
dtype: int64
--------------------------------------------------
0 1
1 2
2 3
5 6
6 7
7 8
dtype: int64
state = np. random. RandomState( 100 )
ser = pd. Series( state. normal( 10 , 5 , 25 ) )
print ( ser)
result= np. percentile( ser, q= [ 0 , 25 , 50 , 75 , 100 ] )
print ( result)
0 1.251173
1 11.713402
2 15.765179
3 8.737820
4 14.906604
5 12.571094
6 11.105898
7 4.649783
8 9.052521
9 11.275007
10 7.709865
11 12.175817
12 7.082025
13 14.084235
14 13.363604
15 9.477944
16 7.343598
17 15.148663
18 7.809322
19 4.408409
20 18.094908
21 17.708026
22 8.740604
23 5.787821
24 10.922593
dtype: float64
[ 1.25117263 7.70986507 10.92259345 13.36360403 18.0949083 ]
ser = pd. Series( np. take( list ( 'abcdefgh' ) , np. random. randint( 8 , size= 30 ) ) )
print ( ser. value_counts( ) )
g 7
e 6
c 5
f 5
h 3
a 2
b 2
dtype: int64
# Keep only the two most frequent values; relabel everything else "Other".
# Bug fix: the original created RandomState(100) but never used it, so the
# draw was not reproducible; sample from the seeded generator instead.
state = np.random.RandomState(100)
ser = pd.Series(state.randint(1, 5, 12))
top2 = ser.value_counts().index[:2]
# where() returns a new object-dtype Series, avoiding an in-place
# incompatible-dtype assignment into the int series.
ser = ser.where(ser.isin(top2), other="Other")
print(ser)
0 Other
1 1
2 1
3 Other
4 4
5 4
6 Other
7 4
8 Other
9 1
10 4
11 1
dtype: object
ser = pd. Series( np. random. randint( 1 , 10 , 35 ) )
df= pd. DataFrame( ser. values. reshape( 7 , 5 ) )
print ( df)
0 1 2 3 4
0 7 9 2 3 9
1 4 8 9 9 3
2 7 7 8 6 5
3 4 2 8 4 6
4 1 9 4 2 5
5 8 4 8 3 1
6 4 4 5 2 7
ser = pd. Series( np. random. randint( 1 , 10 , 7 ) )
print ( ser)
result= np. argwhere( ser% 3 == 0 )
result
0 1
1 6
2 5
3 4
4 9
5 6
6 1
dtype: int32
array([[1],
[4],
[5]], dtype=int64)
ser = pd. Series( list ( 'abcdefghijklmnopqrstuvwxyz' ) )
pos = [ 0 , 4 , 8 , 14 , 20 ]
result= ser. take( pos)
result
0 a
4 e
8 i
14 o
20 u
dtype: object
# Stack two Series vertically and horizontally.
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))
# Bug fix: ser1.append(ser2) returned a NEW series that the original code
# discarded (it then printed ser1 unchanged). pd.concat is the current,
# non-deprecated way to stack vertically (Series.append was removed in 2.0).
ser_vertical = pd.concat([ser1, ser2])
print(ser_vertical)
df = pd.concat([ser1, ser2], axis=1)
df
0 0
1 1
2 2
3 3
4 4
dtype: int64
ser1 = pd. Series( [ 10 , 9 , 6 , 5 , 3 , 1 , 12 , 8 , 13 ] )
ser2 = pd. Series( [ 1 , 3 , 10 , 13 ] )
result= [ np. where( i== ser1) [ 0 ] . tolist( ) [ 0 ] for i in ser2]
print ( result)
result= [ pd. Index( ser1) . get_loc( i) for i in ser2]
result
[5, 4, 0, 8]
[5, 4, 0, 8]
truth = pd. Series( range ( 10 ) )
pred = pd. Series( range ( 10 ) ) + np. random. random( 10 )
np. mean( ( truth- pred) ** 2 )
0.42318488444073726
ser = pd. Series( [ 'how' , 'to' , 'kick' , 'ass?' ] )
pd. Series( [ i. title( ) for i in ser] )
0 How
1 To
2 Kick
3 Ass?
dtype: object
ser = pd. Series( [ 'how' , 'to' , 'kick' , 'ass?' ] )
ser. map ( lambda x: len ( x) )
0 3
1 2
2 4
3 4
dtype: int64
ser = pd. Series( [ 1 , 3 , 6 , 10 , 15 , 21 , 27 , 35 ] )
print ( ser. tolist( ) )
print ( ser. diff( ) . tolist( ) )
[1, 3, 6, 10, 15, 21, 27, 35]
[nan, 2.0, 3.0, 4.0, 5.0, 6.0, 6.0, 8.0]
ser = pd. Series( [ '01 Jan 2010' , '02-02-2011' , '20120303' ,
'2013/04/04' , '2014-05-05' , '2015-06-06T12:20' ] )
pd. to_datetime( ser)
0 2010-01-01 00:00:00
1 2011-02-02 00:00:00
2 2012-03-03 00:00:00
3 2013-04-04 00:00:00
4 2014-05-05 00:00:00
5 2015-06-06 12:20:00
dtype: datetime64[ns]
ser = pd. Series( [ '01 Jan 2010' , '02-02-2011' , '20120303' ,
'2013/04/04' , '2015-11-27' , '2019-11-27T12:20' ] )
from dateutil. parser import parse
ser_ts = ser. map ( lambda x: parse( x) )
print ( "这个月的第几天: " , ser_ts. dt. day. tolist( ) )
print ( "这一年的第几周: " , ser_ts. dt. weekofyear. tolist( ) )
print ( "这一年的第几天: " , ser_ts. dt. dayofyear. tolist( ) )
print ( "这一天的星期几: " , ser_ts. dt. weekday_name. tolist( ) )
这个月的第几天: [1, 2, 3, 4, 27, 27]
这一年的第几周: [53, 5, 9, 14, 48, 48]
这一年的第几天: [1, 33, 63, 94, 331, 331]
这一天的星期几: ['Friday', 'Wednesday', 'Saturday', 'Thursday', 'Friday', 'Wednesday']
ser = pd. Series( [ 'Jan 2010' , 'Feb 2011' , 'Mar 2012' ] )
from dateutil. parser import parse
ser. map ( lambda x: parse( '04 ' + x) )
0 2010-01-04
1 2011-02-04
2 2012-03-04
dtype: datetime64[ns]
ser = pd. Series( [ 'Apple' , 'Orange' , 'Plan' , 'Python' , 'Money' ] )
from collections import Counter
mask = ser. map ( lambda x: sum ( [ Counter( x. lower( ) ) . get( i, 0 )
for i in list ( 'aeiou' ) ] ) >= 2 )
ser[ mask]
0 Apple
1 Orange
4 Money
dtype: object
import re
emails = pd. Series( [ 'buying books at amazom.com' , 'rameses@egypt.com' ,
'matt@t.co' , 'narendra@modi.com' ] )
pattern = '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,4}'
emails. str . findall( pattern, flags= re. IGNORECASE)
0 []
1 [rameses@egypt.com]
2 [matt@t.co]
3 [narendra@modi.com]
dtype: object
fruit = pd. Series( np. random. choice( [ 'apple' , 'banana' , 'carrot' ] , 10 ) )
weights = pd. Series( np. linspace( 1 , 10 , 10 ) )
weights. groupby( fruit) . mean( )
apple 5.428571
banana 7.500000
carrot 2.000000
dtype: float64
# Euclidean distance between two Series, computed two equivalent ways:
# an explicit sum-of-squares and numpy's vector norm.
p = pd.Series(list(range(1, 11)))
q = pd.Series(list(range(10, 0, -1)))
result = ((p - q) ** 2).sum() ** 0.5
print(result)
result = np.linalg.norm(p - q)
result
18.16590212458495
18.16590212458495
ser = pd. Series( [ 2 , 10 , 3 , 4 , 9 , 10 , 2 , 7 , 3 ] )
dd= np. diff( np. sign( np. diff( ser) ) )
print ( dd)
peak_locs= np. where( dd== - 2 ) [ 0 ] + 1
print ( peak_locs)
[-2 2 0 0 -2 2 -2]
[1 5 7]
my_str = 'dbc deb abed gade'
ser = pd. Series( list ( 'dbc deb abed gade' ) )
freq = ser. value_counts( )
print ( freq)
least_freq = freq. dropna( ) . index[ - 1 ]
result= "" . join( ser. replace( ' ' , least_freq) )
result
d 4
3
e 3
b 3
a 2
g 1
c 1
dtype: int64
'dbccdebcabedcgade'
ser = pd. Series( np. random. randint( 1 , 10 , 10 ) ,
pd. date_range( '2000-01-01' , periods= 10 , freq= 'W-SAT' ) )
print ( ser)
2000-01-01 7
2000-01-08 7
2000-01-15 3
2000-01-22 1
2000-01-29 4
2000-02-05 8
2000-02-12 7
2000-02-19 8
2000-02-26 3
2000-03-04 4
Freq: W-SAT, dtype: int32
ser = pd. Series( [ 1 , 10 , 3 , np. nan] , index= pd. to_datetime( [
'2000-01-01' , '2000-01-03' , '2000-01-06' , '2000-01-08' ] ) )
print ( ser)
result= ser. resample( 'D' ) . ffill( )
print ( result)
2000-01-01 1.0
2000-01-03 10.0
2000-01-06 3.0
2000-01-08 NaN
dtype: float64
2000-01-01 1.0
2000-01-02 1.0
2000-01-03 10.0
2000-01-04 10.0
2000-01-05 10.0
2000-01-06 3.0
2000-01-07 3.0
2000-01-08 NaN
Freq: D, dtype: float64
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv' , chunksize= 50 )
df2 = pd. concat( [ chunk. iloc[ 0 ] for chunk in df] , axis= 1 )
df2 = df2. transpose( )
df2
crim zn indus chas nox rm age dis rad tax ptratio b lstat medv 0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1.0 296.0 15.3 396.90 4.98 24.0 50 0.08873 21.0 5.64 0.0 0.439 5.963 45.7 6.8147 4.0 243.0 16.8 395.56 13.45 19.7 100 0.14866 0.0 8.56 0.0 0.520 6.727 79.9 2.7778 5.0 384.0 20.9 394.76 9.42 27.5 150 1.65660 0.0 19.58 0.0 0.871 6.122 97.3 1.6180 5.0 403.0 14.7 372.80 14.10 21.5 200 0.01778 95.0 1.47 0.0 0.403 7.135 13.9 7.6534 3.0 402.0 17.0 384.30 4.45 32.9 250 0.14030 22.0 5.86 0.0 0.431 6.487 13.0 7.3967 7.0 330.0 19.1 396.28 5.90 24.4 300 0.04417 70.0 2.24 0.0 0.400 6.871 47.4 7.8278 5.0 358.0 14.8 390.86 6.07 24.8 350 0.06211 40.0 1.25 0.0 0.429 6.490 44.4 8.7921 1.0 335.0 19.7 396.90 5.98 22.9 400 25.04610 0.0 18.10 0.0 0.693 5.987 100.0 1.5888 24.0 666.0 20.2 396.90 26.77 5.6 450 6.71772 0.0 18.10 0.0 0.713 6.749 92.6 2.3236 24.0 666.0 20.2 0.32 17.44 13.4 500 0.22438 0.0 9.69 0.0 0.585 6.027 79.7 2.4982 6.0 391.0 19.2 396.90 14.33 16.8
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv' ,
converters= { 'medv' : lambda x: 'High' if float ( x) > 25 else 'Low' } )
df
crim zn indus chas nox rm age dis rad tax ptratio b lstat medv 0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 Low 1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 Low 2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 High 3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 High 4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 High ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 501 0.06263 0.0 11.93 0 0.573 6.593 69.1 2.4786 1 273 21.0 391.99 9.67 Low 502 0.04527 0.0 11.93 0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90 9.08 Low 503 0.06076 0.0 11.93 0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90 5.64 Low 504 0.10959 0.0 11.93 0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45 6.48 Low 505 0.04741 0.0 11.93 0 0.573 6.030 80.8 2.5050 1 273 21.0 396.90 7.88 Low
506 rows × 14 columns
L = pd.Series(range(15))


def gen_strides(a, stride_len=5, window_len=5):
    """Return overlapping windows of *a* as a 2-D array.

    A window starts every ``stride_len`` positions and holds
    ``window_len`` consecutive elements; a trailing partial window
    is dropped.
    """
    n_full = ((a.size - window_len) // stride_len) + 1
    starts = np.arange(0, a.size, stride_len)[:n_full]
    return np.array([a[start:start + window_len] for start in starts])


gen_strides(L, stride_len=2, window_len=4)
array([[ 0, 1, 2, 3],
[ 2, 3, 4, 5],
[ 4, 5, 6, 7],
[ 6, 7, 8, 9],
[ 8, 9, 10, 11],
[10, 11, 12, 13]], dtype=int64)
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv' ,
usecols= [ 'crim' , 'medv' ] )
df. head( )
crim medv 0 0.00632 24.0 1 0.02731 21.6 2 0.02729 34.7 3 0.03237 33.4 4 0.06905 36.2
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
df. describe( )
Min.Price Price Max.Price MPG.city MPG.highway EngineSize Horsepower RPM Rev.per.mile Fuel.tank.capacity Passengers Length Wheelbase Width Turn.circle Rear.seat.room Luggage.room Weight count 86.000000 91.000000 88.000000 84.000000 91.000000 91.000000 86.000000 90.000000 87.000000 85.000000 91.000000 89.000000 92.000000 87.000000 88.000000 89.000000 74.000000 86.000000 mean 17.118605 19.616484 21.459091 22.404762 29.065934 2.658242 144.000000 5276.666667 2355.000000 16.683529 5.076923 182.865169 103.956522 69.448276 38.954545 27.853933 13.986486 3104.593023 std 8.828290 9.724280 10.696563 5.841520 5.370293 1.045845 53.455204 605.554811 486.916616 3.375748 1.045953 14.792651 6.856317 3.778023 3.304157 3.018129 3.120824 600.129993 min 6.700000 7.400000 7.900000 15.000000 20.000000 1.000000 55.000000 3800.000000 1320.000000 9.200000 2.000000 141.000000 90.000000 60.000000 32.000000 19.000000 6.000000 1695.000000 25% 10.825000 12.350000 14.575000 18.000000 26.000000 1.800000 100.750000 4800.000000 2017.500000 14.500000 4.000000 174.000000 98.000000 67.000000 36.000000 26.000000 12.000000 2647.500000 50% 14.600000 17.700000 19.150000 21.000000 28.000000 2.300000 140.000000 5200.000000 2360.000000 16.500000 5.000000 181.000000 103.000000 69.000000 39.000000 27.500000 14.000000 3085.000000 75% 20.250000 23.500000 24.825000 25.000000 31.000000 3.250000 170.000000 5787.500000 2565.000000 19.000000 6.000000 192.000000 110.000000 72.000000 42.000000 30.000000 16.000000 3567.500000 max 45.400000 61.900000 80.000000 46.000000 50.000000 5.700000 300.000000 6500.000000 3755.000000 27.000000 8.000000 219.000000 119.000000 78.000000 45.000000 36.000000 22.000000 4105.000000
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
max_price= df. loc[ df. Price == np. max ( df. Price) , [ 'Manufacturer' , 'Model' , 'Type' ] ]
print ( max_price)
row, col = np. where( df. values == np. max ( df. Price) )
print ( row, col)
print ( df. iloc[ row[ 0 ] , col[ 0 ] ] )
print ( df. at[ row[ 0 ] , 'Price' ] )
Manufacturer Model Type
58 Mercedes-Benz 300E Midsize
[58] [4]
61.9
61.9
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
df. columns = df. columns. map ( lambda x: x. replace( '.' , '_' ) )
print ( df. columns)
Index(['Manufacturer', 'Model', 'Type', 'Min_Price', 'Price', 'Max_Price',
'MPG_city', 'MPG_highway', 'AirBags', 'DriveTrain', 'Cylinders',
'EngineSize', 'Horsepower', 'RPM', 'Rev_per_mile', 'Man_trans_avail',
'Fuel_tank_capacity', 'Passengers', 'Length', 'Wheelbase', 'Width',
'Turn_circle', 'Rear_seat_room', 'Luggage_room', 'Weight', 'Origin',
'Make'],
dtype='object')
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
df. isnull( ) . values. any ( )
True
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
count= df. apply ( lambda x: x. isnull( ) . sum ( ) )
print ( count)
print ( count. idxmax( ) )
df = pd. DataFrame( np. arange( 20 ) . reshape( - 1 , 5 ) , columns= list ( 'abcde' ) )
print ( df)
print ( type ( df[ [ 'a' ] ] ) )
print ( type ( df. loc[ : , [ 'a' ] ] ) )
print ( type ( df. iloc[ : , [ 0 ] ] ) )
print ( type ( df. a) )
print ( type ( df[ 'a' ] ) )
print ( type ( df. loc[ : , 'a' ] ) )
print ( type ( df. iloc[ : , 1 ] ) )
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
df = pd.DataFrame(np.arange(20).reshape(-1, 5), columns=list('abcde'))
print(df)


def switch_columns(df, col_1=None, col_2=None):
    """Return *df* with the positions of columns *col_1* and *col_2* swapped."""
    order = list(df.columns)
    a, b = order.index(col_1), order.index(col_2)
    order[a], order[b] = order[b], order[a]
    return df[order]


df1 = switch_columns(df, 'a', 'c')
print(df1)
# Columns sorted in reverse lexical order (does not modify df).
print(sorted(df.columns, reverse=True))
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
c b a d e
0 2 1 0 3 4
1 7 6 5 8 9
2 12 11 10 13 14
3 17 16 15 18 19
['e', 'd', 'c', 'b', 'a']
import pandas as pd
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
pd. set_option( "display.max_columns" , 10 )
pd. set_option( 'display.max_rows' , 10 )
df
Manufacturer Model Type Min.Price Price ... Rear.seat.room Luggage.room Weight Origin Make 0 Acura Integra Small 12.9 15.9 ... 26.5 NaN 2705.0 non-USA Acura Integra 1 NaN Legend Midsize 29.2 33.9 ... 30.0 15.0 3560.0 non-USA Acura Legend 2 Audi 90 Compact 25.9 29.1 ... 28.0 14.0 3375.0 non-USA Audi 90 3 Audi 100 Midsize NaN 37.7 ... 31.0 17.0 3405.0 non-USA Audi 100 4 BMW 535i Midsize NaN 30.0 ... 27.0 13.0 3640.0 non-USA BMW 535i ... ... ... ... ... ... ... ... ... ... ... ... 88 Volkswagen Eurovan Van 16.6 19.7 ... 34.0 NaN 3960.0 NaN Volkswagen Eurovan 89 Volkswagen Passat Compact 17.6 20.0 ... 31.5 14.0 2985.0 non-USA Volkswagen Passat 90 Volkswagen Corrado Sporty 22.9 23.3 ... 26.0 15.0 2810.0 non-USA Volkswagen Corrado 91 Volvo 240 Compact 21.8 22.7 ... 29.5 14.0 2985.0 non-USA Volvo 240 92 NaN 850 Midsize 24.8 26.7 ... 30.0 15.0 3245.0 non-USA Volvo 850
93 rows × 27 columns
df = pd. DataFrame( np. random. random( 4 ) , columns= [ 'random' ] )
print ( df. round ( 4 ) )
out= df. style. format ( {
'random' : '{0:.2%}' . format ,
} )
out
random
0 0.8620
1 0.7903
2 0.0159
3 0.5417
<tr>
<th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row0" class="row_heading level0 row0" >0</th>
<td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row0_col0" class="data row0 col0" >86.20%</td>
</tr>
<tr>
<th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row1" class="row_heading level0 row1" >1</th>
<td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row1_col0" class="data row1 col0" >79.03%</td>
</tr>
<tr>
<th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row2" class="row_heading level0 row2" >2</th>
<td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row2_col0" class="data row2 col0" >1.59%</td>
</tr>
<tr>
<th id="T_75726664_11f2_11ea_b9d2_cc2f7187c201level0_row3" class="row_heading level0 row3" >3</th>
<td id="T_75726664_11f2_11ea_b9d2_cc2f7187c201row3_col0" class="data row3 col0" >54.17%</td>
</tr>
</tbody></table>
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' )
df. iloc[ : : 20 , : ] [ [ 'Manufacturer' , 'Model' , 'Type' ] ]
Manufacturer Model Type 0 Acura Integra Small 20 Chrysler LeBaron Compact 40 Honda Prelude Sporty 60 Mercury Cougar Midsize 80 Subaru Loyale Small
import pandas as pd
df = pd. read_csv( 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv' ,
usecols= [ 0 , 1 , 2 , 3 , 5 ] )
df[ [ 'Manufacturer' , 'Model' , 'Type' ] ] = df[ [ 'Manufacturer' , 'Model' , 'Type' ] ] . fillna( 'missing' )
df. index = df. Manufacturer + '_' + df. Model + '_' + df. Type
print ( df. index. is_unique)
import pandas as pd
import numpy as np
df = pd. DataFrame( np. random. randint( 1 , 30 , 30 ) . reshape( 10 , - 1 ) , columns= list ( 'abc' ) )
print ( df[ 'a' ] )
print ( df[ 'a' ] . argsort( ) )
df[ 'a' ] . argsort( ) [ : : - 1 ] [ 5 ]
0 25
1 16
2 12
3 8
4 6
5 17
6 15
7 24
8 16
9 28
Name: a, dtype: int32
0 4
1 3
2 2
3 6
4 1
5 8
6 5
7 7
8 0
9 9
Name: a, dtype: int64
8
ser = pd. Series( np. random. randint( 1 , 100 , 15 ) )
print ( 'ser: ' , ser. tolist( ) , 'mean: ' , round ( ser. mean( ) ) )
np. argwhere( ser> ser. mean( ) )
ser: [54, 77, 49, 74, 24, 95, 94, 14, 7, 50, 69, 65, 72, 72, 58] mean: 58.0
array([[ 1],
[ 3],
[ 5],
[ 6],
[10],
[11],
[12],
[13]], dtype=int64)
df = pd. DataFrame( np. random. randint( 10 , 40 , 60 ) . reshape( - 1 , 4 ) )
rowsums = df. apply ( np. sum , axis= 1 )
print ( np. where( rowsums > 100 ) [ 0 ] [ - 2 : ] )
last_two_rows = df. iloc[ np. where( rowsums > 100 ) [ 0 ] [ - 2 : ] , : ]
last_two_rows
[11 14]
ser = pd.Series(np.logspace(-2, 2, 30))


def cap_outliers(ser, low_perc, high_perc):
    """Return a copy of *ser* with values clipped to the given quantiles.

    Values below the ``low_perc`` quantile are raised to it; values above
    the ``high_perc`` quantile are lowered to it. Bug fix: the original
    clipped the caller's Series in place; we now work on a copy so the
    input is left untouched.
    """
    low, high = ser.quantile([low_perc, high_perc])
    print(low_perc, '%ile: ', low, '|', high_perc, '%ile: ', high)
    capped = ser.copy()
    capped[capped < low] = low
    capped[capped > high] = high
    return capped


capped_ser = cap_outliers(ser, .05, .95)
print(capped_ser)
0.05 %ile: 0.016049294076965887 | 0.95 %ile: 63.876672220183934
0 0.016049
1 0.016049
2 0.018874
3 0.025929
4 0.035622
5 0.048939
6 0.067234
7 0.092367
8 0.126896
9 0.174333
10 0.239503
11 0.329034
12 0.452035
13 0.621017
14 0.853168
15 1.172102
16 1.610262
17 2.212216
18 3.039195
19 4.175319
20 5.736153
21 7.880463
22 10.826367
23 14.873521
24 20.433597
25 28.072162
26 38.566204
27 52.983169
28 63.876672
29 63.876672
dtype: float64
# Collect the positive entries of a random frame into the largest possible
# n x n square (row-major, original order preserved).
df = pd.DataFrame(np.random.randint(-20, 50, 100).reshape(10, -1))
arr = df[df > 0].values.flatten()
arr_qualified = arr[~np.isnan(arr)]
print(arr_qualified)
# Bug fix: n must be computed BEFORE it is used -- the original referenced
# n in the print below prior to defining it, raising NameError.
n = int(np.floor(arr_qualified.shape[0] ** .5))
# NOTE(review): argsort()[::] keeps ascending order, so the first n**2
# indexes select the SMALLEST positives; use [::-1] if the largest n**2
# values are intended.
top_indexes = np.argsort(arr_qualified)[::]
print(top_indexes[:n ** 2])
output = np.take(arr_qualified, sorted(top_indexes[:n ** 2])).reshape(n, -1)
print(output)
[ 6. 29. 48. 22. 14. 10. 49. 9. 18. 42. 31. 42. 16. 35. 45. 10. 2. 27.
48. 2. 16. 48. 22. 12. 23. 13. 34. 38. 18. 10. 12. 48. 39. 18. 49. 24.
35. 13. 16. 30. 35. 22. 44. 46. 8. 30. 1. 5. 30. 7. 15. 22. 6. 43.
47. 8. 32. 21. 46. 5. 20. 39. 9. 17.]
[46 19 16 59 47 0 52 49 44 55 62 7 5 15 29 30 23 25 37 4 50 12 38 20
63 8 33 28 60 57 22 51 3 41 24 35 17 1 45 39 48 10 56 26 40 13 36 27
32 61 11 9 53 42 14 43 58 54 2 18 21 31 34 6]
[[ 6. 29. 48. 22. 14. 10. 49. 9.]
[18. 42. 31. 42. 16. 35. 45. 10.]
[ 2. 27. 48. 2. 16. 48. 22. 12.]
[23. 13. 34. 38. 18. 10. 12. 48.]
[39. 18. 49. 24. 35. 13. 16. 30.]
[35. 22. 44. 46. 8. 30. 1. 5.]
[30. 7. 15. 22. 6. 43. 47. 8.]
[32. 21. 46. 5. 20. 39. 9. 17.]]
df = pd.DataFrame(np.arange(25).reshape(5, -1))
print(df)


def swap_rows(df, i1, i2):
    """Swap rows *i1* and *i2* of *df* in place and return the frame."""
    tmp = df.iloc[i1, :].copy()
    df.iloc[i1, :] = df.iloc[i2, :].copy()
    df.iloc[i2, :] = tmp
    return df


result = swap_rows(df, 1, 2)
result
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
4 20 21 22 23 24
0 1 2 3 4 0 0 1 2 3 4 1 10 11 12 13 14 2 5 6 7 8 9 3 15 16 17 18 19 4 20 21 22 23 24
df = pd. DataFrame( np. arange( 25 ) . reshape( 5 , - 1 ) )
print ( df)
df. iloc[ : : - 1 , : ]
0 1 2 3 4
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
4 20 21 22 23 24
0 1 2 3 4 4 20 21 22 23 24 3 15 16 17 18 19 2 10 11 12 13 14 1 5 6 7 8 9 0 0 1 2 3 4
df = pd. DataFrame( np. arange( 25 ) . reshape( 5 , - 1 ) , columns= list ( 'abcde' ) )
print ( df)
result= pd. get_dummies( df[ 'a' ] )
df_onehot= pd. concat( [ result, df[ list ( 'bcde' ) ] ] , axis= 1 )
df_onehot
a b c d e
0 0 1 2 3 4
1 5 6 7 8 9
2 10 11 12 13 14
3 15 16 17 18 19
4 20 21 22 23 24
0 5 10 15 20 b c d e 0 1 0 0 0 0 1 2 3 4 1 0 1 0 0 0 6 7 8 9 2 0 0 1 0 0 11 12 13 14 3 0 0 0 1 0 16 17 18 19 4 0 0 0 0 1 21 22 23 24
df = pd. DataFrame( np. random. randint( 1 , 100 , 40 ) . reshape( 10 , - 1 ) )
print ( df)
print ( df. apply ( np. argmax, axis= 1 ) )
print ( df. apply ( np. argmax, axis= 1 ) . value_counts( ) )
print ( df. apply ( np. argmax, axis= 1 ) . value_counts( ) . index[ 0 ] )
0 1 2 3
0 10 87 19 43
1 5 83 50 80
2 19 24 10 77
3 36 15 95 78
4 8 20 89 48
5 17 17 81 46
6 88 74 52 72
7 91 53 36 61
8 25 53 22 90
9 3 93 86 63
0 1
1 1
2 3
3 2
4 2
5 2
6 0
7 0
8 3
9 1
dtype: int64
2 3
1 3
3 2
0 2
dtype: int64
2
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) ,
index= list ( 'abcdefgh' ) , columns= list ( 'pqrstuvwxy' ) )
print ( df)
print ( df. corr( ) )
abs_corrmat = np. abs ( df. corr( ) )
print ( abs_corrmat)
max_corr = abs_corrmat. apply ( lambda x: sorted ( x) [ - 2 ] )
print ( max_corr)
print ( np. round ( max_corr. tolist( ) , 2 ) )
p q r s t u v w x y
a 41 72 5 31 67 26 45 65 21 60
b 15 56 72 91 99 32 38 14 52 36
c 7 92 96 84 26 79 81 12 75 50
d 73 46 42 15 80 76 10 34 45 5
e 15 72 55 14 17 54 9 35 36 18
f 12 73 47 84 85 9 31 67 13 64
g 25 43 56 76 62 43 93 25 53 99
h 80 70 30 68 40 74 2 41 7 47
p q r s t u v \
p 1.000000 -0.348616 -0.606492 -0.406070 0.056932 0.461015 -0.543905
q -0.348616 1.000000 0.213753 0.159442 -0.554585 0.088275 0.023752
r -0.606492 0.213753 1.000000 0.501096 -0.190063 0.290160 0.440666
s -0.406070 0.159442 0.501096 1.000000 0.260183 -0.243481 0.505580
t 0.056932 -0.554585 -0.190063 0.260183 1.000000 -0.568596 -0.009954
u 0.461015 0.088275 0.290160 -0.243481 -0.568596 1.000000 -0.118254
v -0.543905 0.023752 0.440666 0.505580 -0.009954 -0.118254 1.000000
w 0.207508 0.125992 -0.797659 -0.285267 0.192809 -0.562124 -0.358931
x -0.452943 -0.029441 0.798096 0.187066 -0.122999 0.376021 0.637259
y -0.294716 -0.043568 -0.043181 0.579817 0.083632 -0.434149 0.729595
w x y
p 0.207508 -0.452943 -0.294716
q 0.125992 -0.029441 -0.043568
r -0.797659 0.798096 -0.043181
s -0.285267 0.187066 0.579817
t 0.192809 -0.122999 0.083632
u -0.562124 0.376021 -0.434149
v -0.358931 0.637259 0.729595
w 1.000000 -0.835494 0.145546
x -0.835494 1.000000 -0.030812
y 0.145546 -0.030812 1.000000
p q r s t u v \
p 1.000000 0.348616 0.606492 0.406070 0.056932 0.461015 0.543905
q 0.348616 1.000000 0.213753 0.159442 0.554585 0.088275 0.023752
r 0.606492 0.213753 1.000000 0.501096 0.190063 0.290160 0.440666
s 0.406070 0.159442 0.501096 1.000000 0.260183 0.243481 0.505580
t 0.056932 0.554585 0.190063 0.260183 1.000000 0.568596 0.009954
u 0.461015 0.088275 0.290160 0.243481 0.568596 1.000000 0.118254
v 0.543905 0.023752 0.440666 0.505580 0.009954 0.118254 1.000000
w 0.207508 0.125992 0.797659 0.285267 0.192809 0.562124 0.358931
x 0.452943 0.029441 0.798096 0.187066 0.122999 0.376021 0.637259
y 0.294716 0.043568 0.043181 0.579817 0.083632 0.434149 0.729595
w x y
p 0.207508 0.452943 0.294716
q 0.125992 0.029441 0.043568
r 0.797659 0.798096 0.043181
s 0.285267 0.187066 0.579817
t 0.192809 0.122999 0.083632
u 0.562124 0.376021 0.434149
v 0.358931 0.637259 0.729595
w 1.000000 0.835494 0.145546
x 0.835494 1.000000 0.030812
y 0.145546 0.030812 1.000000
p 0.606492
q 0.554585
r 0.798096
s 0.579817
t 0.568596
u 0.568596
v 0.729595
w 0.835494
x 0.835494
y 0.729595
dtype: float64
[0.61 0.55 0.8 0.58 0.57 0.57 0.73 0.84 0.84 0.73]
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) )
print ( df)
min_by_max = np. min ( df, axis= 1 ) / np. max ( df, axis= 1 )
min_by_max
0 1 2 3 4 5 6 7 8 9
0 85 63 99 34 13 14 64 33 58 16
1 64 45 77 68 19 45 61 2 11 15
2 78 66 76 51 51 52 20 53 35 64
3 68 85 2 81 52 66 14 28 41 34
4 37 40 99 62 57 70 37 15 14 56
5 13 88 12 51 43 1 54 18 70 67
6 55 19 79 43 19 8 52 6 15 77
7 79 93 54 68 78 61 80 33 72 92
0 0.131313
1 0.025974
2 0.256410
3 0.023529
4 0.141414
5 0.011364
6 0.075949
7 0.354839
dtype: float64
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) )
result= df. apply ( lambda x: x. sort_values( ) . unique( ) [ - 2 ] , axis= 1 )
df[ 'penultimate' ] = result
df
0 1 2 3 4 5 6 7 8 9 penultimate 0 50 12 77 25 22 97 49 40 27 18 77 1 14 52 78 3 67 5 77 17 43 53 77 2 92 53 10 39 55 34 63 89 60 41 89 3 9 89 66 50 88 4 46 19 87 75 88 4 97 95 75 50 91 60 65 3 24 59 95 5 31 38 4 81 9 1 52 71 84 57 81 6 59 7 19 33 49 40 54 60 48 4 59 7 90 21 77 44 3 50 98 23 84 30 90
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) )
print ( df)
result= df. apply ( lambda x: (
( x - x. mean( ) ) / x. std( )
) . round ( 2 ) )
result
0 1 2 3 4 5 6 7 8 9
0 73 77 53 35 9 80 96 47 35 26
1 58 72 39 80 86 57 41 98 31 90
2 45 76 22 27 5 15 78 90 87 92
3 89 84 97 78 29 70 23 95 97 90
4 55 32 83 49 99 63 22 75 44 26
5 74 42 70 49 57 26 88 77 1 5
6 56 29 42 28 75 16 21 11 38 50
7 99 26 74 74 39 50 61 3 23 3
0 1 2 3 4 5 6 7 8 9 0 0.24 0.90 -0.28 -0.79 -1.16 1.31 1.36 -0.40 -0.30 -0.57 1 -0.57 0.70 -0.83 1.24 1.03 0.39 -0.41 0.96 -0.42 1.10 2 -1.28 0.86 -1.51 -1.15 -1.28 -1.28 0.78 0.75 1.32 1.15 3 1.10 1.18 1.47 1.15 -0.59 0.91 -0.99 0.88 1.63 1.10 4 -0.74 -0.92 0.91 -0.16 1.40 0.63 -1.02 0.35 -0.02 -0.57 5 0.29 -0.52 0.40 -0.16 0.20 -0.84 1.10 0.40 -1.35 -1.11 6 -0.68 -1.04 -0.71 -1.10 0.72 -1.24 -1.05 -1.36 -0.20 0.06 7 1.64 -1.16 0.56 0.97 -0.31 0.11 0.23 -1.57 -0.67 -1.17
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) )
print ( df)
result = df. apply ( lambda x: (
1 - ( x. max ( ) - x) / ( x. max ( ) - x. min ( ) )
) . round ( 2 ) )
result
0 1 2 3 4 5 6 7 8 9
0 78 7 59 4 77 81 93 66 39 28
1 60 51 88 19 23 29 70 82 10 24
2 2 80 7 59 72 51 82 28 38 25
3 36 88 3 8 43 7 87 60 28 99
4 29 69 89 84 87 15 95 87 75 54
5 82 78 60 57 15 29 41 93 57 13
6 72 28 63 2 20 25 6 72 71 32
7 60 2 13 87 82 97 41 23 81 16
0 1 2 3 4 5 6 7 8 9 0 0.95 0.06 0.65 0.02 0.86 0.82 0.98 0.61 0.41 0.17 1 0.72 0.57 0.99 0.20 0.11 0.24 0.72 0.84 0.00 0.13 2 0.00 0.91 0.05 0.67 0.79 0.49 0.85 0.07 0.39 0.14 3 0.43 1.00 0.00 0.07 0.39 0.00 0.91 0.53 0.25 1.00 4 0.34 0.78 1.00 0.96 1.00 0.09 1.00 0.91 0.92 0.48 5 1.00 0.88 0.66 0.65 0.00 0.24 0.39 1.00 0.66 0.00 6 0.88 0.30 0.70 0.00 0.07 0.20 0.00 0.70 0.86 0.22 7 0.72 0.00 0.12 1.00 0.93 1.00 0.39 0.00 1.00 0.03
df = pd. DataFrame( np. random. randint( 1 , 100 , 80 ) . reshape( 8 , - 1 ) )
print ( df)
print ( [ i for i in range ( df. shape[ 0 ] ) [ : - 1 ] ] )
result= [ df. iloc[ i] . corr( df. iloc[ i+ 1 ] ) for i in range ( df. shape[ 0 ] ) [ : - 1 ] ]
result
0 1 2 3 4 5 6 7 8 9
0 27 3 37 9 76 68 91 31 44 7
1 11 15 20 47 33 86 65 47 9 30
2 39 1 72 19 35 42 87 77 55 40
3 60 7 8 28 37 14 17 5 3 7
4 47 99 76 28 77 57 32 57 24 16
5 2 50 95 89 84 46 59 84 1 2
6 78 27 58 67 78 1 7 28 89 20
7 12 86 54 81 20 19 77 1 8 56
[0, 1, 2, 3, 4, 5, 6]
[0.5182965633327684,
0.2595376913412023,
-0.23874062518280761,
0.005261734793477499,
0.4687394611664755,
-0.06555011633952691,
-0.30907671467693215]
df = pd. DataFrame( np. random. randint( 1 , 100 , 100 ) . reshape( 10 , - 1 ) )
rows= df. shape[ 0 ]
for i in range ( rows) :
df. iat[ i, i] = 0
df. iat[ rows- i- 1 , i] = 0
df
0 1 2 3 4 5 6 7 8 9 0 0 65 92 82 10 1 51 71 32 0 1 79 0 11 99 28 68 24 8 0 83 2 34 4 0 35 11 91 83 0 41 29 3 84 72 5 0 65 76 0 25 25 64 4 98 14 2 10 0 0 2 94 40 84 5 75 8 8 27 0 0 23 62 73 95 6 23 43 38 0 36 43 0 7 65 6 7 80 96 0 82 92 79 64 0 61 67 8 29 0 96 96 76 21 94 72 0 4 9 0 26 27 65 95 19 19 1 90 0
df = pd. DataFrame( { 'col1' : [ 'apple' , 'banana' , 'orange' ] * 3 ,
'col2' : np. random. rand( 9 ) ,
'col3' : np. random. randint( 0 , 15 , 9 ) } )
print ( df)
df. groupby( df[ 'col1' ] ) . get_group( 'apple' )
col1 col2 col3
0 apple 0.703158 12
1 banana 0.535815 13
2 orange 0.177147 8
3 apple 0.159570 2
4 banana 0.411271 10
5 orange 0.279007 11
6 apple 0.576264 4
7 banana 0.578607 9
8 orange 0.242959 6
col1 col2 col3 0 apple 0.703158 12 3 apple 0.159570 2 6 apple 0.576264 4
df = pd. DataFrame( { 'fruit' : [ 'apple' , 'banana' , 'orange' ] * 4 ,
'taste' : np. random. rand( 12 ) ,
'price' : np. random. randint( 0 , 15 , 12 ) } )
banana= df[ 'taste' ] . groupby( df[ 'fruit' ] ) . get_group( 'banana' )
print ( banana)
print ( "特定结果:" , banana. sort_values( ) . iloc[ - 2 ] )
df
1 0.209485
4 0.549818
7 0.498802
10 0.006632
Name: taste, dtype: float64
特定结果: 0.4988018517868045
fruit taste price 0 apple 0.510446 7 1 banana 0.209485 1 2 orange 0.632166 1 3 apple 0.865764 4 4 banana 0.549818 9 5 orange 0.744718 5 6 apple 0.069171 0 7 banana 0.498802 14 8 orange 0.011808 2 9 apple 0.103222 13 10 banana 0.006632 6 11 orange 0.017787 13
df = pd. DataFrame( { 'fruit' : [ 'apple' , 'banana' , 'orange' ] * 3 ,
'rating' : np. random. rand( 9 ) ,
'price' : np. random. randint( 0 , 15 , 9 ) } )
print ( df)
out = df. groupby( df[ 'fruit' ] , as_index= False ) [ 'price' ] . mean( )
out
fruit rating price
0 apple 0.090672 2
1 banana 0.019506 0
2 orange 0.354463 5
3 apple 0.466694 14
4 banana 0.807733 8
5 orange 0.488868 4
6 apple 0.640913 8
7 banana 0.977691 8
8 orange 0.390033 0
fruit price 0 apple 8.000000 1 banana 5.333333 2 orange 3.000000
df1 = pd. DataFrame( { 'fruit' : [ 'apple' , 'banana' , 'orange' ] * 3 ,
'weight' : [ 'high' , 'medium' , 'low' ] * 3 ,
'price' : np. random. randint( 0 , 15 , 9 ) } )
df2 = pd. DataFrame( { 'pazham' : [ 'apple' , 'orange' , 'pine' ] * 2 ,
'pounds' : [ 'high' , 'low' ] * 3 ,
'price' : np. random. randint( 0 , 15 , 6 ) } )
print ( df1)
print ( df2)
pd. merge( df1, df2, how= 'inner' ,
left_on= [ 'fruit' , 'weight' ] ,
right_on= [ 'pazham' , 'pounds' ] ,
suffixes= [ '_left' , '_right' ] )
fruit weight price
0 apple high 13
1 banana medium 5
2 orange low 14
3 apple high 2
4 banana medium 2
5 orange low 6
6 apple high 7
7 banana medium 1
8 orange low 6
pazham pounds price
0 apple high 10
1 orange low 14
2 pine high 12
3 apple low 12
4 orange high 11
5 pine low 6
fruit weight price_left pazham pounds price_right 0 apple high 13 apple high 10 1 apple high 2 apple high 10 2 apple high 7 apple high 10 3 orange low 14 orange low 14 4 orange low 6 orange low 14 5 orange low 6 orange low 14
df = pd. DataFrame( { 'fruit1' : np. random. choice( [ 'apple' , 'orange' , 'banana' ] , 10 ) ,
'fruit2' : np. random. choice( [ 'apple' , 'orange' , 'banana' ] , 10 ) } )
print ( df)
np. where( df. fruit1 == df. fruit2) [ 0 ]
fruit1 fruit2
0 orange orange
1 orange apple
2 orange apple
3 banana banana
4 apple banana
5 orange orange
6 orange banana
7 banana orange
8 apple banana
9 orange banana
array([0, 3, 5], dtype=int64)
df = pd. DataFrame( np. random. randint( 1 , 100 , 20 ) . reshape( - 1 , 4 ) ,
columns = list ( 'abcd' ) )
print ( df)
df[ 'a_lag' ] = df[ 'a' ] . shift( 1 )
print ( df)
df[ 'b_lead' ] = df[ 'b' ] . shift( - 1 )
df
a b c d
0 31 79 72 32
1 8 18 82 25
2 98 23 41 79
3 2 87 74 76
4 16 89 12 86
a b c d a_lag
0 31 79 72 32 NaN
1 8 18 82 25 31.0
2 98 23 41 79 8.0
3 2 87 74 76 98.0
4 16 89 12 86 2.0
a b c d a_lag b_lead 0 31 79 72 32 NaN 18.0 1 8 18 82 25 31.0 23.0 2 98 23 41 79 8.0 87.0 3 2 87 74 76 98.0 89.0 4 16 89 12 86 2.0 NaN
df = pd. DataFrame( np. random. randint( 1 , 10 , 20 ) . reshape( - 1 , 4 ) ,
columns = list ( 'abcd' ) )
ravel= df. values. ravel( )
print ( ravel)
pd. value_counts( ravel)
[6 1 6 8 7 8 3 4 1 8 7 7 1 1 9 8 5 8 5 5]
8 5
1 4
7 3
5 3
6 2
9 1
4 1
3 1
dtype: int64
# One CSV-like string per row: split each on commas, then promote the
# first split row to become the column header.
df = pd.DataFrame(["STD, City,State",
                   "33, Kolkata,West Bengal",
                   "44, Chennai,Tamil Nadu",
                   "40, Hyderabad ,Telengana",
                   "80, Bangalore,Karnataka"],
                  columns=['row'])
expanded = df.row.str.split(",", expand=True)
df_out = expanded.iloc[1:]
df_out.columns = expanded.iloc[0]
print(df_out)
0 STD City State
1 33 Kolkata West Bengal
2 44 Chennai Tamil Nadu
3 40 Hyderabad Telengana
4 80 Bangalore Karnataka