本文只是我平时处理数据时的一些常用操作,一个备忘录而已。
##################################################读取
# 1. csv
# Read train.csv with the standard csv module. Python 3 form: the file is
# opened in text mode with newline='' (as the csv module documentation
# requires) and is closed automatically when the with-block exits.
with open('train.csv', newline='') as csv_file:
    csv_file_object = csv.reader(csv_file)  # Load in the csv file
    header = next(csv_file_object)  # Skip the first line as it is a header
    # Collect every remaining row, then convert from a list to an array.
    data = np.array(list(csv_file_object))
# 2. dataframe
# Load the training file into a DataFrame; header=0 takes the column names
# from the first row of the file.
train_df = pd.read_csv(
    'train.csv',
    header=0,
)
##########对于大文件,分批读入
def read_csv_chunk_ukt(filename, chunk_size=100000,
                       columns=("user_id", "sku_id", "type", "time")):
    """Read a large CSV file in chunks and return only the selected columns.

    The file is parsed ``chunk_size`` rows at a time, ``columns`` are kept
    from each chunk, and the pieces are concatenated into a single DataFrame
    with a fresh ``0..n-1`` index.

    Args:
        filename: Path of the CSV file (anything ``pd.read_csv`` accepts).
        chunk_size: Number of rows to parse per chunk.
        columns: Column labels to keep, in the order they should appear in
            the result. Defaults to the four columns this helper has always
            selected.

    Returns:
        A ``pandas.DataFrame`` containing ``columns`` for every row read.

    Raises:
        KeyError: If a requested column is not present in the file.
    """
    # Select with a list: a tuple would be interpreted as a single key.
    wanted = list(columns)
    reader = pd.read_csv(filename, chunksize=chunk_size)
    try:
        # Iterating the reader yields successive DataFrames and stops at EOF,
        # so no manual StopIteration handling or loop flag is needed.
        chunks = [chunk[wanted] for chunk in reader]
    finally:
        # Release the file handle even when a column lookup fails.
        reader.close()
    print("Iteration is stopped.")
    df = pd.concat(chunks, ignore_index=True)
    return df
##################创建dataframe
# Build a one-row DataFrame pairing a user id with its 'del_buy' value.
# `result` and `group` are defined outside this snippet; presumably `group`
# is one group of a groupby on user_id -- confirm against the caller.
# columns= fixes the column order to user_id, del_buy.
# NOTE(review): the name `re` shadows the standard-library `re` module if that
# module is imported in the same scope; kept because other code may use it.
re = pd.DataFrame(
    {
        'user_id': [group.user_id.values[0]],
        'del_buy': [result],
    },
    columns=['user_id', 'del_buy'],
)
###############查看pandas 数据基本信息
# Summary statistics of train_df (count, mean, std, min, max, ...).
# The result is not assigned, so it is only useful where the value is echoed,
# e.g. in an interactive session.
train_df.describe()
###############################################类型转换
1.
train_df['Gender'] = train_df['Sex'].