项目：识别Twitter用户性别

最新推荐文章于 2024-02-08 08:00:00 发布

weixin_43579079

最新推荐文章于 2024-02-08 08:00:00 发布

阅读量1.9k

点赞数 1

分类专栏：深度学习

本文链接：https://blog.csdn.net/weixin_43579079/article/details/99666006

版权

1.相关知识点

网络爬虫
文本特征提取
图像特征提取
使用scikit-learn完成机器学习

2.分析步骤

1.查看数据
df_obj.info()
df_obj.shape
df_obj.head()
2.明确分析目标
3.数据清洗
df_obj.dropna()
df_obj.fillna()
4.特征工程 model.fit()
特征提取、归一化、降维处理
5.选择模型
训练模型
交叉验证
6.模型测试
model.predict()

3.代码及注释

main.py

# -*- coding: utf-8 -*-

import os
import pandas as pd
from common_tools import get_dataset_filename, unzip, cal_acc
from pd_tools import inspect_dataset, check_profile_image, \
    split_train_test, clean_text, proc_text, get_word_list_from_data, \
    extract_tf_idf, extract_rgb_feat, extract_rgb_hist_feat
import nltk
from nltk.text import TextCollection
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA


# Dataset locations used by run_main().
# Directory that contains the zip archive.
dataset_path = './dataset'
# File name of the zip archive.
zip_filename = 'twitter-user-gender-classification.zip'
# Full path to the zip archive (directory + file name).
zip_filepath = os.path.join(dataset_path, zip_filename)
# Path of the CSV file that stores the cleaned data.
cln_datapath = './cln_data.csv'

# First-run switch: when True, run_main() unzips the archive, cleans the raw
# data and writes it to cln_datapath before reading that file back.
is_first_run = False


def run_main():
    """
        主函数
    """
    # 声明变量
    dataset_filename = get_dataset_filename(zip_filepath)  # 数据集文件名（在zip中）
    dataset_filepath = os.path.join(dataset_path, dataset_filename)  # 数据集文件路径

    if is_first_run:

        print('解压zip...', end='')
        unzip(zip_filepath, dataset_path)
        print('完成.')

        # 读取数据
        data = pd.read_csv(dataset_filepath, encoding='latin1',
                           usecols=['gender', 'description', 'link_color',
                                    'profileimage', 'sidebar_color', 'text'])
        # 1. 查看加载的数据集
        inspect_dataset(data)

        # 2. 数据清洗
        # 2.1. 根据 'gender' 列过滤数据
        filtered_data = data[(data['gender'] == 'male') | (data['gender'] == 'female')]

        # 2.2 过滤掉 'description' 列为空的数据
        filtered_data = filtered_data.dropna(subset=['description'])

        # 2.3 过滤掉 'link_color' 列和 'sidebar_color' 列非法的16进制数据
        filtered_data = filtered_data[filtered_data['link_color'].str.len() == 6]
        filtered_data = filtered_data[filtered_data['sidebar_color'].str.len() == 6]

        # 2.4 清洗文本数据
        print('清洗文本数据...')
        cln_desc = filtered_data['description'].apply(clean_text)
        cln_text = filtered_data['text'].apply(clean_text)
        filtered_data['cln_desc'] = cln_desc
        filtered_data['cln_text'] = cln_text

        # 2.5 根据profileimage的链接判断头像图片是否有效，
        # 并生成新的列代表头像图片保存的路径
        print('下载头像数据...')
        saved_img_s = filtered_data['profileimage'].apply(check_profile_image)
        filtered_data['saved_image'] = saved_img_s
        # 过滤掉无效的头像数据
        filtered_data = filtered_data[filtered_data['saved_image'] != '']

        # 保存处理好的数据
        filtered_data.to_csv(cln_datapath, index=False)

    # 读取处理好的数据
    clean_data = pd.read_csv(cln_datapath, encoding='latin1',
                             usecols=['gender', 'cln_desc', 'cln_text',
                                      'link_color', 'sidebar_color', 'saved_image'])

    # 查看label的分布
    print(clean_data.groupby('gender').size())

    # 替换male->0, female->1
    clean_data

最低0.47元/天解锁文章