一、pandas中 字符串 处理(拆分与去除特殊字符)
对字符串 进行拆分连接(去空格或者其它符号)
import re
# Sample text whose words are separated by mixed runs of spaces and tabs.
text = 'foo bar \t good \tssda'
# Split on any run of whitespace. The pattern is a raw string so that `\s`
# reaches the regex engine as-is instead of being an invalid string escape
# (which triggers a warning on recent Python versions).
strlist = re.split(r'\s+', text)
输出1 :
['foo' , 'bar' , 'good' , 'ssda' ]
"#" .join (strlist)
输出2 :
'foo#bar#good#ssda'
# One-step variant: split on whitespace runs, then join the pieces with "#".
# Raw string for the pattern so `\s` is not treated as a string escape.
strlist = "#".join(re.split(r'\s+', text))
输出3 :
'foo#bar#good#ssda'
方法 说明
count 返回子串在字符串中的出现次数(非重叠)
endswith, startswith 如果字符串以某个后缀结尾(以某个前缀开头),则返回True
join 将字符串用作连接其他字符串序列的分隔符,例如 "#".join(str1):用"#"连接str1中的各个字符串
index 如果在字符串中找到子串,则返回子串第一个字符所在的位置。如果没有找到,则引发ValueError
find 如果在字符串中找到子串,则返回第一个发现的子串的第一个字符所在的位置。如果没有找到,则返回-1
rfind 如果在字符串中找到子串,则返回最后一个发现的子串的第一个字符所在的位置。如果没有找到,则返回-1
replace 用另一个字符串替换指定子串
strip, rstrip, lstrip 去除空白符(包括换行符)
split 通过指定的分隔符将字符串拆分为一组子串
lower(小写), upper(大写) 分别将字母字符转换为小写或大写
ljust, rjust 用空格(或其他字符)填充字符串的空白侧以返回符合最低宽度的字符串
二、时间戳
from datetime import datetime
from datetime import timedelta
from dateutil.parser import parse
2.1 python 时间戳
获取当前时间并格式化
# Capture the current local time.
now = datetime.now()
stamp = datetime.now()
# Format as YYYY-MM-DD. The directives must not be separated by spaces,
# otherwise the result looks like '2018 -06 -22 ' instead of '2018-06-22'.
stamp.strftime("%Y-%m-%d")
输出:
'2018-06-22'
推导式:
datastrs = ["1/6/2001" ,"12/9/2013" ]
[datetime.strptime (x ,"%d/%m/%Y" ) for x in datastrs]
>>>
[datetime.datetime (2001 , 6 , 1 , 0 , 0 ), datetime.datetime (2013 , 9 , 12 , 0 , 0 )]
parse("2011-03-23" )
>>>
datetime.datetime (2011 , 3 , 23 , 0 , 0 )
2.2 pandas 时间戳
from pandas import DataFrame,Series
import pandas as pd
import numpy as np
from datetime import datetime
字符串、时间戳—转成时间
datastrs = ["1/6/2001" ,"12/9/2013" ]
pd.to_datetime(datastrs)
>>>
DatetimeIndex(['2001-01-06' , '2013-12-09' ], dtype='datetime64[ns]' , freq=None )
------------------------------------------------------
import sys
import datetime
# Convert the Unix timestamp 878887116 into an ISO weekday number
# (1 = Monday ... 7 = Sunday), interpreted in the local timezone.
weekday = datetime.datetime.fromtimestamp(878887116.0).isoweekday()
判断时间:NaT
idx = pd.to_datetime(datastrs+[None ])
>>>
DatetimeIndex(['2001-01-06' , '2013-12-09' , 'NaT' ], dtype='datetime64[ns]' , freq=None )
mask = pd.isnull(idx)
>>>
array([False , False , True ])
idx[~mask]
>>>
DatetimeIndex(['2001-01-06' , '2013-12-09' ], dtype='datetime64[ns]' , freq=None )
创建一组时间日期
# Six irregularly spaced days in January 2011, used as the Series index.
dates = [datetime(2011, 1, day) for day in (2, 5, 7, 8, 10, 12)]
# One random standard-normal value per date.
ts = Series(np.random.randn(6), index=dates)
>>>
2011-01-02    0.374627
2011-01-05   -1.154850
2011-01-07   -0.144595
2011-01-08   -0.331414
2011-01-10    0.859761
2011-01-12   -0.475668
dtype: float64
stamp = ts.index[2 ]
>>>
Timestamp('2011-01-07 00:00:00' )
ts["2011-01" ]
>>>
2011-01-02    0.374627
2011-01-05   -1.154850
2011-01-07   -0.144595
2011-01-08   -0.331414
2011-01-10    0.859761
2011-01-12   -0.475668
dtype: float64
longer_ts = Series(np.random.randn(1000 ),
index = pd.date_range("1/1/2000" ,periods=1000 ))
longer_ts["2001" ]
>>>
2001-01-01    0.404751
2001-01-02   -1.290699
2001-01-03    1.522409
2001-01-04   -1.349135
2001-01-05    0.797811
2001-01-06   -0.318972
……
2001-12-28   -1.255813
2001-12-29    0.342102
2001-12-30   -0.683155
2001-12-31   -0.031639
Freq: D, Length: 365, dtype: float64
index = pd.date_range("1/23/2001" ,"5/23/2001" )
>>>
DatetimeIndex(['2001-01-23' , '2001-01-24' , '2001-01-25' , '2001-01-26' ,
'2001-01-27' , '2001-01-28' , '2001-01-29' , '2001-01-30' ,
'2001-01-31' , '2001-02-01' ,
...
'2001-05-14' , '2001-05-15' , '2001-05-16' , '2001-05-17' ,
'2001-05-18' , '2001-05-19' , '2001-05-20' , '2001-05-21' ,
'2001-05-22' , '2001-05-23' ],
dtype='datetime64[ns]' , length=121 , freq='D' )
index = pd.date_range(start="1/23/2001" ,end="5/23/2002" )
index = pd.date_range(end="1/23/2001" ,periods=20 )
index = pd.date_range(start="1/23/2001" ,periods=20 )
>>>
方法3 (start="1/23/2001", periods=20) 的输出:
DatetimeIndex(['2001-01-23' , '2001-01-24' , '2001-01-25' , '2001-01-26' ,
'2001-01-27' , '2001-01-28' , '2001-01-29' , '2001-01-30' ,
'2001-01-31' , '2001-02-01' , '2001-02-02' , '2001-02-03' ,
'2001-02-04' , '2001-02-05' , '2001-02-06' , '2001-02-07' ,
'2001-02-08' , '2001-02-09' , '2001-02-10' , '2001-02-11' ],
dtype='datetime64[ns]' , freq='D' )
pandas读取csv 处理时间列
import datetime
import pandas as pd
import numpy as np
import pickle
def dateparse(dates):
    """Parse a 'day/month/year hour:minute' string into a ``datetime``.

    Uses the standard-library ``datetime`` module directly, because the old
    ``pd.datetime`` alias is deprecated and absent from current pandas.

    Raises:
        ValueError: if ``dates`` does not match ``'%d/%m/%Y %H:%M'``.
    """
    return datetime.datetime.strptime(dates, '%d/%m/%Y %H:%M')


# Merge the 'date' and '(UTC)' columns into one parsed 'timeline' column.
# NOTE(review): ``date_parser`` is deprecated since pandas 2.0; when upgrading,
# consider loading the raw columns and converting them with ``pd.to_datetime``.
rawdata = pd.read_csv(
    'RealMarketPriceDataPT.csv',
    parse_dates={'timeline': ['date', '(UTC)']},
    date_parser=dateparse,
)
def datestr2num(s):
    """Return the proleptic Gregorian ordinal of a 'YYYY-MM-DD HH:MM:SS' string.

    The time-of-day part is parsed but then discarded, because
    ``toordinal()`` only counts whole days.

    Raises:
        ValueError: if ``s`` does not match ``'%Y-%m-%d %H:%M:%S'``.
    """
    parsed = datetime.datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    return parsed.toordinal()
# Build parallel lists: day ordinals (x / new_date) and prices (y).
x = []
y = []
new_date = []
for i in range(rawdata.shape[0]):
    # Column 0 is the parsed timeline; turn it into an integer day ordinal.
    # ``DataFrame.ix`` was removed in pandas 1.0; ``iloc`` is the purely
    # positional accessor that this row/column-number loop needs.
    x_convert = int(datestr2num(str(rawdata.iloc[i, 0])))
    new_date.append(x_convert)
    # Column 1 is the price; np.float32(...) also works when the cell is a
    # plain Python number without an ``astype`` method.
    y_convert = np.float32(rawdata.iloc[i, 1])
    x.append(x_convert)
    y.append(y_convert)
# The ordinals end up as a float32 array; ``new_date`` keeps the integer values.
x = np.array(x).astype(np.float32)
"""
with open('price.pickle','wb') as f:
pickle.dump((x,y),f)
"""
# Spot-check the conversion: show entries 0, 10, ..., 50 as a date next to
# the ordinal it was rebuilt from.
for sample_idx in range(0, 60, 10):
    ordinal = new_date[sample_idx]
    print(datetime.datetime.fromordinal(ordinal), '------>>>>>>', ordinal)
PySpark 读取文本文件 处理时间戳列–将时间戳转成类别特征
# Load the ratings file line by line (`sc` is presumably a SparkContext
# created elsewhere). The path has to be a string: a bare `u.data` is an
# attribute lookup on the undefined name `u`.
rating_data = sc.textFile("u.data")
rating_data.first()
# Split each line on tabs and keep field 3 (0-based) as an integer; the
# values are later passed to `fromtimestamp`, i.e. treated as Unix timestamps.
timestamps = (
    rating_data
    .map(lambda line: line.split("\t"))
    .map(lambda fields: int(fields[3]))
)
def extract_datatime(ts):
    """Convert a Unix timestamp (seconds) into a local-time ``datetime``.

    Args:
        ts: Seconds since the epoch, as an int or float.

    Returns:
        A naive ``datetime.datetime`` in the local timezone.
    """
    # The module is spelled `datetime`; the previous `datatime` spelling
    # raised ImportError. Importing inside the function avoids depending on
    # whether the module-level name `datetime` is the module or the class.
    import datetime

    return datetime.datetime.fromtimestamp(ts)
# Reduce every timestamp to the hour component of its datetime, then
# preview five of the results.
hour_of_day = timestamps.map(lambda stamp: extract_datatime(stamp).hour)
hour_of_day.take(5)
def assign_today(hr):
    """Map an hour of the day to a named part of the day.

    Args:
        hr: Integer hour, expected in 0-23.

    Returns:
        "morning" (7-11), "lunch" (12-13), "afternoon" (14-17),
        "evening" (18-22), "prenight" (23) or "night" (0-6).
        Any value outside 0-23 matches no bucket and yields ``None``.
    """
    time_of_day = {
        "morning": range(7, 12),
        "lunch": range(12, 14),
        "afternoon": range(14, 18),
        "evening": range(18, 23),
        "prenight": range(23, 24),
        "night": range(0, 7),
    }
    for k, v in time_of_day.items():
        if hr in v:
            return k
    # No bucket matched (hour outside 0-23).
    return None
# Label each hour with its part-of-day bucket and preview five results.
time_of_day = hour_of_day.map(assign_today)
time_of_day.take(5)