1、series如何将一组日期字符串转换为时间
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib. pyplot as plt
import os
import csv
# Date strings written in six different formats.
date_strings = [
    '01 Jan 2010',
    '02-02-2011',
    '20120303',
    '2013/04/04',
    '2014-05-05',
    '2015-06-06T12:20',
]

# Parse every string on its own.  Passing the whole mixed-format collection to
# pd.to_datetime() makes pandas 2.x infer a single format from the first
# element and raise on the others; element-wise parsing works on old and new
# pandas alike.
s1 = pd.Series(pd.DatetimeIndex([pd.to_datetime(text) for text in date_strings]))

# Overwrite one entry; build the Timestamp explicitly instead of relying on
# implicit string coercion during assignment.
s1[4] = pd.Timestamp("2014-05-5T2:00")
2、series如何从时间序列中提取年/月/天/小时/分钟/秒
# Pull the individual date/time components out of the datetime Series through
# the .dt accessor.  The day component was missing even though the section
# title lists it.
s1.dt.year
s1.dt.month
s1.dt.day
s1.dt.hour
s1.dt.minute
s1.dt.second
3、从series中找出包含两个以上元音字母的单词
def is2w(word):
    """Return *word* if it contains at least two vowel letters, else ``None``.

    Every vowel occurrence is counted (upper or lower case), so repeated
    vowels such as the two "o" in "moon" count twice.
    """
    vowels = set("aeiouAEIOU")
    count = sum(1 for ch in word if ch in vowels)
    if count >= 2:
        return word
    return None
# Apply the vowel filter to every word; words that fail come back as missing
# values and are dropped by the final selection.
s2 = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])
result = s2.map(is2w)
result[result.notnull()]
4、如何过滤掉series中的无效电子邮件
import re

emails = pd.Series([
    'buying books at amazom.com',
    'rameses@egypt.com',
    'matt@t.co',
    'narendra@modi.com',
])

# local-part "@" domain "." TLD.  The TLD needs at least two letters: the
# previous {3,4} bound rejected valid two-letter TLDs such as "t.co", and an
# upper bound would truncate longer TLDs because findall is not anchored.
pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}'
valid = emails.str.findall(pattern, flags=re.IGNORECASE)

# Each entry of `valid` is the list of matches for one string; keep the first
# match of every string that produced at least one.
valid_emails = [matches[0] for matches in valid if matches]
valid_emails
5、series A 以series B为分组依据, 然后计算分组后的平均值
# Fixed seed so the random sample is reproducible.
np.random.seed(2)

# Keyword arguments are evaluated left to right, so the fruit labels are drawn
# before the sale amounts.
d1 = pd.DataFrame(dict(
    fruit=np.random.choice(["apple", "banana", "orange"], 10),
    sale=np.random.uniform(0, 10, 10),
))

# Mean sale per fruit.
d1.groupby("fruit")["sale"].mean()
6、如何计算两个系列之间的欧氏距离
# Two integer series: 1..10 ascending and 10..1 descending.
p = pd.Series(range(1, 11))
q = pd.Series(range(10, 0, -1))

# Euclidean distance: square root of the summed squared differences.
delta = p - q
(delta ** 2).sum() ** 0.5
7、如何在数字系列中查找所有局部最大值(或峰值)
s3 = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Sign of the first difference is +1 while rising and -1 while falling; a
# second difference of -2 therefore marks a rise immediately followed by a
# fall, i.e. a local maximum.
dd = np.diff(np.sign(np.diff(s3.to_numpy())))

# Shift by one because the double difference is offset by one position
# relative to the original series.
peak_locs = (dd == -2).nonzero()[0] + 1
peak_locs

# Same computation with pandas only; the first two entries are NaN.
s4 = np.sign(s3.diff()).diff()
8、如何创建一个以'2000-01-02'开始包含10个周五的TimeSeries
# "W-FRI" is the weekly frequency anchored on Friday, so the range starts at
# the first Friday on or after the given date.  The uppercase spelling is the
# documented alias.
fridays = pd.date_range('2000-01-02', periods=10, freq='W-FRI')

# Random integers in [1, 10) indexed by the ten Fridays.
pd.Series(np.random.randint(1, 10, 10), index=fridays)
9、如何填补TimeSeries的缺失日期
# Irregularly spaced observation dates with gaps between them.
datetime = ["2019-1-1", "2019-1-5", "2019-1-7", "2019-1-11", "2019-1-15"]
s5 = pd.Series(np.random.uniform(1, 10, 5), index=pd.to_datetime(datetime))

# Resample to daily frequency and fill each missing day with the next known
# observation.  "D" (uppercase) is the documented alias for calendar days.
s5_filled = s5.resample("D").bfill()
s5_filled
10、如何计算series的自相关
# Linear trend 1..20 plus standard-normal noise from a dedicated, seeded
# generator so the series is reproducible.
noise = np.random.RandomState(1).normal(0, 1, 20)
s6 = pd.Series(np.arange(1, 21, 1) + noise)

# Autocorrelation at lags 0 through 4, rounded to two decimals.
auto1 = []
for lag in range(0, 5):
    auto1.append(s6.autocorr(lag).round(2))
11、读取csv时, 间隔几行读取数据
# Work in the author's desktop folder when it exists; on any other machine
# stay in the current directory instead of crashing on a missing path.
work_dir = "C:/Users/ABC/其他/Desktop"
if os.path.isdir(work_dir):
    os.chdir(work_dir)

fpath = "test.csv"

# Sample data: 50 numbered rows with a randomly chosen fruit label.
dt1 = pd.DataFrame({
    "a": range(50),
    "fruit": np.random.choice(["apple", "bananas", "orange"], 50),
})
dt1.to_csv(fpath, index=False)

# Keep the header (line 0) and then every fifth line of the file.  The csv
# module expects files opened with newline="" so it can handle line endings
# itself.
with open(fpath, "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f)
    out = [row for i, row in enumerate(reader) if i % 5 == 0]

# First kept row is the header; the value of column "a" doubles as the index.
pd.DataFrame(out[1:], columns=out[0], index=[row[0] for row in out[1:]])
12、读取csv时进行数据转换
def _bucket_a(raw):
    """Map the raw text of column "a" to "low" (below 50) or "high"."""
    if int(raw) < 50:
        return "low"
    return "high"


# Convert column "a" while the file is being parsed and show the first rows.
pd.read_csv(fpath, converters={"a": _bucket_a}).head()
13、读取数据时,只读取某列
# Load only the "fruit" column from the file and show the first rows.
wanted_columns = ["fruit"]
pd.read_csv(fpath, usecols=wanted_columns).head()
14、了解数据框每列的数据类型
# One dtype per column of the frame.
dt1.dtypes
15、读取dataframe的行数和列数
# 100-row demo frame: a counter, uniform random floats, a repeating 1..4
# cycle, and a repeating three-fruit cycle cut to 100 entries.
dt2 = pd.DataFrame(dict(
    a=range(100),
    b=np.random.rand(100),
    c=[1, 2, 3, 4] * 25,
    d=(['apple', 'banana', 'carrot'] * 34)[:100],
))

# (number of rows, number of columns)
dt2.shape
16、dataframe每列的基本描述统计
# Summary statistics for the frame's columns.
dt2.describe()
17、dataframe中找到a列最大值对应的行
# Boolean mask marking the row(s) where column "a" reaches its maximum.
is_max_a = dt2["a"] == dt2["a"].max()
dt2[is_max_a]
18、dataframe中获取c列最大值所在的行号
# Positions of every row whose "c" value equals the column maximum.
c_max = dt2["c"].max()
np.where(dt2["c"] == c_max)
19、在dataframe中根据行列数读取某个值
# Scalar lookup by integer position: second row, third column.
row_pos, col_pos = 1, 2
dt2.iat[row_pos, col_pos]
20、在dataframe中根据index和列名称读取某个值
# Scalar lookup by index label and column name.
row_label, col_name = 1, "a"
dt2.at[row_label, col_name]
21、dataframe中重命名某一列
# rename() returns a new frame with column "d" shown as "fruit"; dt2 itself
# is left unchanged.
new_names = {"d": "fruit"}
dt2.rename(columns=new_names).head()