蓝桥杯人工智能赛-数据预处理篇

最新推荐文章于 2024-09-10 14:49:32 发布

藤宫博野

最新推荐文章于 2024-09-10 14:49:32 发布

阅读量295

点赞数 2

分类专栏：蓝桥杯刷题机器学习深度学习文章标签：蓝桥杯机器学习

本文链接：https://blog.csdn.net/m0_73745065/article/details/138754065

版权

蓝桥杯刷题同时被 3 个专栏收录

9 篇文章 0 订阅

订阅专栏

深度学习

8 篇文章 0 订阅

订阅专栏

机器学习

4 篇文章 0 订阅

订阅专栏

这次的博客对四期模拟赛的预处理操作做一个总结，涉及到机器学习的各种文件的预处理。

传统机器学习数据预处理

import pandas as pd

data = pd.read_csv('songs_origin.csv')
data = data.fillna(data.mean())  # 直接用fillna对缺失值进行填充，缺失值在pandas里表现为NAN值
data = data[(data['acousticness_yr'] <= 1) & (data['acousticness_yr'] >= 0)]  # 反向思考，用索引的方式去除
data = data.drop_duplicates()  # 直接调用去除重复行的attribute

data.to_csv('songs_processed.csv', index=False)

图像数据预处理（本题使用OpenCV库）

这道题只需要完善Resize-Cut-Norm三个操作并编写在img_processor函数中即可，然而我认为这道题更大的价值在于如何处理图像文件数据，代码中给出处理图像的注释。

#task-start
import numpy as np
import json
import cv2
import os

def img_processor(data_path, dst_size = (224,224)):
    Image_std = [0.229, 0.224, 0.225]
    Image_mean = [0.485, 0.456, 0.406]
    _std = np.array(Image_std).reshape((1,1,3))
    _mean = np.array(Image_mean).reshape((1,1,3))
    image_src = cv2.imread(data_path)
    #TODO
    
    # resize操作，可以用cv2的resize，也可以用PIL的resize，这道题由于用CV2读取，PIL的写法更为繁琐
    image = cv2.resize(image_src, (256, 256))
    # 中心裁剪
    startx, starty = (256 - dst_size[0]) // 2, (256 - dst_size[1]) // 2
    image = image[startx:startx + dst_size[0], starty:starty+dst_size[1], :]
    # 中心化
    image = (image - _mean) / _std
    return image_src, image, (startx,starty)


def simple_generator(data_list, json_file, dst_size = (224, 224)):
    with open(json_file, 'r') as f:
        data = json.load(f)
    folder_map = {v[0]: (int(k), v[1]) for k,v in data.items()}

    for img_path in data_list:
        image_src, image, (startx,starty) = img_processor(img_path, dst_size)
        label = folder_map[img_path.split('/')[-2]]
        yield image_src, image, label, (startx,starty)

def main():
    Image_path = 'Imagedata/images'
    Json_path = 'Imagedata/image_class_index.json'   # json文件读取后为字典，key为标签索引[0...n]，Value为标签代号以及class name
    data_list = []
    for dirname, _, filenames in os.walk(Image_path):  # os.walk输出三元组，dirpath， dirnames, files
        # 例如文件为 'Imagedata/images/1/4.jpg' 以及 'Imagedata/images/2/3.jpg', walk输出则为['Imagedata/images/1', ['Imagedata/images/2']， ['1', '2'], ['4.jpg', '3.jpg']
        # 上述注释仅为便于理解
        if os.path.basename(dirname).startswith('n'):  # 判断dir文件夹是否为n开头，标签代号都为n开头
            for filename in filenames:
                data_list.append(os.path.join(dirname, filename)) 

    # 创建生成器
    generator = simple_generator(data_list, Json_path)  # 用yield创建生成器

    # 查看示例，检查图像、标签等属性的正确性
    num_samples = 5
    for _ in range(num_samples):
        image_src, image, label, (startx,starty) = next(generator)
        print("SrcImage shape:", image_src.shape)
        print("Image shape:", image.shape)
        print("Label:", label)
        print("startx and starty:",(startx,starty))

if __name__ == '__main__':
    main()
#task-end

文本同义词替换

#task-start
import random
import pandas as pd
from torch.utils.data import Dataset


class MakeDataset(Dataset):

    def __init__(self):
        self.data = pd.read_csv('data.csv')[['text_id', 'text']].values
        self.locs = open('loc.txt', 'r').read().split('\n')
        self.pers = open('per.txt', 'r').read().split('\n')

    def __getitem__(self, item):
        text_id, text = self.data[item]
        text, aug_info = self.augment(text)
        return text_id, text, aug_info

    def __len__(self):
        return len(self.data)

    def augment(self, text):
        aug_info = {'locs': [], 'pers': []}

        # TODO
        loc_temp, per_temp = [], []   # 存储text中有多少个需要替换的词
        for loc in self.locs:
            if loc in text:
                loc_temp.append(loc)
        for loc in loc_temp:
            loc_replace = random.choice(self.locs)
            text = text.replace(loc,loc_replace)
            aug_info['locs'].append({'original':loc,'replacement':loc_replace}) 
        for per in self.pers:
            if per in text:
                per_temp.append(per)
        for per in per_temp:
            per_replace = random.choice(self.pers)
            text = text.replace(per,per_replace)
            aug_info['pers'].append({'original':per,'replacement':per_replace})
        return text, aug_info


def main():
    dataset = MakeDataset()
    for data in dataset:
        print(data)

if __name__ == '__main__':
    main()
#task-end

这道题代码写的繁琐了，后续可以优化一下代码