data preprocessing

最新推荐文章于 2022-06-30 15:01:30 发布

xiaocainiao_nixi

最新推荐文章于 2022-06-30 15:01:30 发布

阅读量267

点赞数

分类专栏： machine learning python

machine learning 同时被 2 个专栏收录

8 篇文章 0 订阅

订阅专栏

python

4 篇文章 0 订阅

订阅专栏

import pandas as pd

# Load the annotation table. The file is parsed as tab-separated (sep="\t")
# even though its name ends in .csv.

newdata = pd.read_csv('annotations_final.csv',sep="\t")

# Preview the first five rows.
# NOTE: bare expressions such as this one only display a result in an
# interactive session (notebook / REPL); run as a script they are no-ops.

newdata.head(5)

# Print a summary of the DataFrame (info() writes to stdout itself).

newdata.info()

# List the column labels.

newdata.columns

# Select only the "clip_id" and "no voice" columns. This is a column
# selection, not a concatenation: nothing is joined or assigned here.

newdata[["clip_id","no voice"]]

# Some of the tags in the dataset are really close to each other. Lets merge them together
synonyms = [['beat', 'beats'],
            ['chant', 'chanting'],
            ['choir', 'choral'],
            ['classical', 'clasical', 'classic'],
            ['drum', 'drums'],
            ['electro', 'electronic', 'electronica', 'electric'],
            ['fast', 'fast beat', 'quick'],
            ['female', 'female singer', 'female singing', 'female vocals', 'female vocal', 'female voice', 'woman', 'woman singing', 'women'],
            ['flute', 'flutes'],
            ['guitar', 'guitars'],
            ['hard', 'hard rock'],
            ['harpsichord', 'harpsicord'],
            ['heavy', 'heavy metal', 'metal'],
            ['horn', 'horns'],
            ['india', 'indian'],
            ['jazz', 'jazzy'],
            ['male', 'male singer', 'male vocal', 'male vocals', 'male voice', 'man', 'man singing', 'men'],
            ['no beat', 'no drums'],
            ['no singer', 'no singing', 'no vocal','no vocals', 'no voice', 'no voices', 'instrumental'],
            ['opera', 'operatic'],
            ['orchestra', 'orchestral'],
            ['quiet', 'silence'],
            ['singer', 'singing'],
            ['space', 'spacey'],
            ['string', 'strings'],
            ['synth', 'synthesizer'],
            ['violin', 'violins'],
            ['vocal', 'vocals', 'voice', 'voices'],
            ['strange', 'weird']]

# Collapse every synonym group into one column named after the group's first
# tag, then remove the group's remaining columns from the table.
#
# Example:
#   'beat', 'beats'                     -> kept as 'beat'
#   'classical', 'clasical', 'classic'  -> kept as 'classical'
for tag_group in synonyms:
    keep, redundant = tag_group[0], tag_group[1:]
    # Row-wise maximum over all columns of the group.
    # NOTE(review): presumably the tag columns are 0/1 flags, so the maximum
    # acts as a logical OR -- confirm against the dataset.
    merged = newdata[tag_group].max(axis=1)
    newdata[keep] = merged
    newdata.drop(columns=redundant, inplace=True)