12月初甘肃省AI竞赛心得记录:
本次人工智能应用赛试题分为五部分:
1.人工智能开发环境的部署(机器学习/深度学习库的安装与测试)
2.样本数据预处理:将数据聚类并可视化,所给数据为10000个样本的特征数据,每一行表示一个样本的3维特征,将原始数据读取并在二维坐标轴上可视化;对数据进行聚类(聚为4类);将聚类结果在二维坐标轴上可视化,每一类的数据需要用不同颜色进行区分。代码如下:
from sklearn.datasets import make_circles
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
import pandas as pd
import time
import functools
def time_cost(func):
    """Decorate ``func`` so that every call is timed and reported.

    The wrapped callable runs ``func`` exactly once, prints the elapsed
    wall-clock time labelled with the first positional argument (or with
    the function's name when it is called without positional arguments),
    and returns a ``(result, elapsed_seconds)`` tuple.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        t0 = time.time()
        # Call func only once: it may be expensive or stateful (e.g. fitting
        # a model), so the timed call must also be the one whose result is
        # handed back to the caller.
        result = func(*args, **kwargs)
        elapsed = time.time() - t0
        label = args[0] if args else func.__name__
        print(label, ':%.2fs' % elapsed)
        return result, elapsed
    return wrapper
def load_data(file):
    """Read a CSV of sample features and project it to 2-D with PCA.

    Args:
        file: Path of the CSV file to read.

    Returns:
        A ``(x, pca_result)`` tuple: ``x`` holds the raw values read from
        the CSV and ``pca_result`` is the 2-component PCA projection of
        ``x``.

    Raises:
        ValueError: If ``file`` is empty.
    """
    # Validate explicitly; an ``assert`` would be stripped under ``python -O``.
    if not file:
        raise ValueError('file must be a non-empty path')
    # Read the path the caller passed in rather than a hard-coded location.
    df = pd.read_csv(file)
    x = df.values
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(x)
    return x, pca_result
@time_cost
def cluster_function(model_name, model, data):
    """Cluster ``data`` with ``model`` and return the predicted labels.

    ``model_name`` is not used in the body; it is the first positional
    argument, which the ``time_cost`` decorator prints as the label of the
    timing line. Because of that decorator, callers receive a
    ``(labels, elapsed_seconds)`` tuple rather than the labels alone.
    """
    return model.fit_predict(data)
if __name__ == '__main__':
    # Clustering models to run, keyed by the name shown in each subplot title.
    model_list = {"KMeans": KMeans(n_clusters=4, random_state=10)}

    x, pca_result = load_data(r'C:\Users\yaya\Desktop\试题数据\第三部分\test.csv')
    first_axis = pca_result[:, 0]
    second_axis = pca_result[:, 1]

    # Raw samples, uncoloured, drawn before the clustering figure is created.
    plt.scatter(first_axis, second_axis, marker='.')

    # One subplot per model in a 2x3 grid, points coloured by cluster label.
    fig = plt.figure(figsize=(15, 10))
    for i, (name, estimator) in enumerate(model_list.items(), start=1):
        fig.add_subplot(2, 3, i)
        # cluster_function is timed, so it yields (labels, elapsed seconds).
        labels, elapsed = cluster_function(name, estimator, x)
        plt.scatter(first_axis, second_axis, marker='.', c=labels)
        score = silhouette_score(x, labels)
        plt.title("{}({})".format(name, score))
        # Elapsed time in the lower-right corner of the axes.
        timing = ('%.2fs' % (elapsed)).lstrip('0')
        plt.text(.99, .01, timing, transform=plt.gca().transAxes,
                 horizontalalignment='right')
    plt.show()
3.统计词频并输出高频词汇
统计txt文件中出现频率最高的十个单词,输出对应的单词及其出现次数。代码如下:
import string

# Number of times each lower-cased, punctuation-free word appears.
word_appear_time = {}
sans_punctuation_documents = []

# Build the punctuation-stripping table once instead of once per line.
trantab = str.maketrans('', '', string.punctuation)

# ``with`` closes the file even if reading fails part-way through.
with open('./data.txt', 'r', encoding='UTF-8') as file:
    for i in file:
        sans_punctuation_documents.append(i.translate(trantab))
print(sans_punctuation_documents)

for line in sans_punctuation_documents:
    for word in line.lower().split():
        word_appear_time[word] = word_appear_time.get(word, 0) + 1

# Sort (count, word) pairs in descending order: by count, ties by word.
word_list = [(times, word) for word, times in word_appear_time.items()]
word_list.sort(reverse=True)

top_ten = word_list[:10]
for times, word in top_ten:
    print(word, times)
print(top_ten)

# Key the summary by word. Keying by count first would silently drop every
# word that shares its count with another word in the top ten.
mi = {word: times for times, word in top_ten}
print(mi)
4.深度学习算法应用:
使用预训练的 ResNet18 网络模型进行微调,代码如下:
from __future__ import print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
plt.ion()  # interactive mode

# Preprocessing pipeline per phase: random crop + horizontal flip for
# training, a fixed resize + centre crop for validation. Both finish with
# the same tensor conversion and per-channel normalisation.
data_transforms = dict(
    train=transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
    val=transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ]),
)

# Images are read from <data_dir>/train and <data_dir>/val.
data_dir = r'./data'
image_datasets = {
    phase: datasets.ImageFolder(os.path.join(data_dir, phase),
                                data_transforms[phase])
    for phase in ('train', 'val')
}

# NOTE(review): num_workers=4 starts worker processes. On platforms that
# spawn workers (e.g. Windows), PyTorch expects the code that iterates these
# loaders to run under an `if __name__ == '__main__':` guard; confirm the
# target platform.
dataloaders = {
    phase: torch.utils.data.DataLoader(image_datasets[phase], batch_size=4,
                                       shuffle=True, num_workers=4)
    for phase in ('train', 'val')
}
dataset_sizes = {phase: len(subset) for phase, subset in image_datasets.items()}
class_names = image_datasets['train'].classes

# Use the first GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
def train_model(model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it loaded with its best validation weights.

    Every epoch runs a 'train' phase followed by a 'val' phase over the
    module-level ``dataloaders``. Batches are moved to the module-level
    ``device`` and per-phase averages are computed with ``dataset_sizes``.

    Args:
        model: Network to optimise; its parameters are updated in place.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per epoch, after
            the training phase.
        num_epochs: Number of epochs to run.

    Returns:
        ``model``, loaded with the weights from the epoch that reached the
        highest validation accuracy.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics: loss.item() is the batch mean, so weight it by
                # the batch size before dividing by the dataset size below.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels)

            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    # '.4f' (precision), not '4f' (minimum width): four decimals, matching
    # the per-epoch accuracy lines above.
    print('Best val Acc: {:.4f}'.format(best_acc))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model
def visualize_model(model, num_images=6):
    """Plot predictions for the first ``num_images`` validation images.

    Images are drawn in a two-column grid on a new figure, each titled with
    the predicted class name. The model is put in eval mode for the
    duration of the call and its previous training/eval mode is restored
    before returning, including when an error occurs.

    Relies on the module-level ``dataloaders``, ``device`` and
    ``class_names``, and on an ``imshow`` helper that is not defined in the
    visible part of this file.

    Args:
        model: Network used to produce the predictions.
        num_images: Maximum number of images to display.
    """
    was_training = model.training
    model.eval()
    images_so_far = 0
    # Two columns; round the row count up so an odd num_images still fits
    # (num_images // 2 rows would leave no slot for the last image).
    rows = (num_images + 1) // 2
    plt.figure()

    try:
        with torch.no_grad():
            for inputs, _labels in dataloaders['val']:
                inputs = inputs.to(device)

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)

                for j in range(inputs.size()[0]):
                    images_so_far += 1
                    ax = plt.subplot(rows, 2, images_so_far)
                    ax.axis('off')
                    ax.set_title('predicted: {}'.format(class_names[preds[j]]))
                    imshow(inputs.cpu().data[j])

                    if images_so_far == num_images:
                        return
    finally:
        # Restore whichever mode the caller had, on every exit path.
        model.train(mode=was_training)
# Start from a ResNet-18 with pretrained weights and replace its final
# fully connected layer with a freshly initialised one for this dataset.
model_ft = models.resnet18(pretrained=True)
num_ftrs = model_ft.fc.in_features
# Size the new output layer from the classes found in the training folder
# instead of hard-coding the class count.
model_ft.fc = nn.Linear(num_ftrs, len(class_names))
model_ft = model_ft.to(device)

criterion = nn.CrossEntropyLoss()

# Every parameter is handed to the optimizer, so the whole network is
# fine-tuned rather than only the new output layer.
optimizer_ft = optim.SGD(model_ft.parameters(), lr=0.001, momentum=0.9)

# Multiply the learning rate by 0.1 every 7 scheduler steps.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=7, gamma=0.1)

model_ft = train_model(model_ft, criterion, optimizer_ft, exp_lr_scheduler,
                       num_epochs=10)