Pandas
- 解压
import zipfile

# Unpack both competition archives into the current working directory.
for archive_path in ("../input/train.zip", "../input/test1.zip"):
    with zipfile.ZipFile(archive_path, "r") as z:
        z.extractall(".")
- 创建dataframe
# Collect every file name in the training folder.
filenames = os.listdir("../input/train/train")

# Derive the label from the file-name prefix (the text before the first '.'):
# 'dog' -> 1, anything else -> 0.
categories = [1 if name.split('.')[0] == 'dog' else 0 for name in filenames]

# Build the dataframe that pairs each file name with its label.
df = pd.DataFrame({
    'filename': filenames,
    'category': categories,
})

# Inspect the first and last rows.
df.head()
df.tail()
- 观察标签
# Count how many rows carry each label and draw the counts as a bar chart.
label_counts = df['category'].value_counts()
label_counts.plot.bar()
- 观察图片
# Pick one training file at random and display it.
train_dir = "../input/train/train/"
sample = random.choice(filenames)
image = load_img(train_dir + sample)
plt.imshow(image)
- train_test_split for dataframe
注意:split 之后,training data 和 validation data 的图片文件依然在同一个文件夹里(被拆分的只是 dataframe)
# Hold out 20% of the rows for validation; random_state fixes the shuffle seed.
# reset_index(drop=True) discards the original index and numbers each part from 0.
train_df, validate_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.20, random_state=42)
)
- dataframe大小
# Number of rows in each split.
total_train, total_validate = len(train_df), len(validate_df)
- 采样一行
# Draw a single random row, then renumber it so it sits at index 0.
one_row = train_df.sample(n=1)
example_df = one_row.reset_index(drop=True)
- submission
# Build the submission table from test_df without mutating it:
#   id    = file name up to the first '.'
#   label = the 'category' column
# then drop the two source columns and write the result without the index.
submission_df = (
    test_df
    .assign(
        id=test_df['filename'].str.split('.').str[0],
        label=test_df['category'],
    )
    .drop(columns=['filename', 'category'])
)
submission_df.to_csv('submission.csv', index=False)
- Replace
# Encode the string labels as integers: 'dog' -> 1, 'cat' -> 0.
label_to_int = {'dog': 1, 'cat': 0}
test_df['category'] = test_df['category'].replace(label_to_int)
CV2
# Label encoder written as a lambda: 'dog' -> 1, anything else -> 0.
# Two equivalent spellings are shown; the later assignment is the one left bound to `convert`.
convert = lambda category: 1 if category == 'dog' else 0
convert = lambda category: int(category == 'dog')
# Read one image and its label.
# NOTE(review): this looks like the body of a per-file loop; `path`, `filename`,
# `category`, `X` and `y` are defined outside this excerpt — confirm against the notebook.
category = convert(category)
img_array = cv2.imread(os.path.join(path,filename),
cv2.IMREAD_GRAYSCALE) # grayscale image
# Resize to 80x80 before collecting the image and its label.
new_img_array = cv2.resize(img_array, dsize=(80, 80))
X.append(new_img_array)
y.append(category)
# Convert to NumPy arrays, reshape, and normalise.
# The images are stacked into one array of shape (n_images, 80, 80, 1).
X = np.array(X).reshape(-1, 80, 80, 1)
y = np.array(y)
# Divide out-of-place: an in-place `X /= 255.0` raises a casting error when X has
# an integer dtype (as arrays built from cv2 images normally do), because the
# float result cannot be written back into the integer array. The new X is a
# float array; for 8-bit pixels its values lie in [0, 1].
X = X / 255.0
保存/读取数据集(下面的代码是读取已保存的数据)
import pickle

# Load the previously pickled feature and label objects from the working directory.
# NOTE: pickle.load can execute arbitrary code; only open files you created yourself.
with open("train_x", "rb") as fp:
    X = pickle.load(fp)
with open("train_y", "rb") as fp:
    y = pickle.load(fp)
print(X.shape)
Transpose&Reshape
import numpy as np

# 24 consecutive integers, viewed as 2 blocks of 3 rows x 4 columns.
arr_1d_bigger = np.arange(24)
arr_3d = arr_1d_bigger.reshape(2, 3, 4)
print(arr_3d)

# Keep axis 0 and swap the last two axes: each 3x4 block becomes 4x3.
arr = np.transpose(arr_3d, (0, 2, 1))
print(arr)
[[[ 0 1 2 3]
[ 4 5 6 7]
[ 8 9 10 11]]
[[12 13 14 15]
[16 17 18 19]
[20 21 22 23]]]
[[[ 0 4 8]
[ 1 5 9]
[ 2 6 10]
[ 3 7 11]]