sofasofa-机器读中文:根据名字判断性别

题目地址:机器读中文:根据名字判断性别

简单单字划分法(python)

使用朴素贝叶斯方法。sofasofa官方答案是用python2写的,在python3中直接照抄会出错。这里是经过改写的答案,外加我对代码的一些理解。主要修改了encode('utf-8')、set方法和map方法的相关用法。
map方法在python2中返回list,在python3中返回iterator,所以我在这里套了个list(),将返回值转换回list。

import pandas as pd
import numpy as np
from collections import Counter

# Load the data. train.txt and test.txt are comma-separated; the separator is
# passed by keyword because pandas 2.x no longer accepts it as a positional
# argument (it raises TypeError).
train = pd.read_table('train.txt', sep=',')
test = pd.read_table('test.txt', sep=',')
submit = pd.read_csv('sample_submit.csv')

# All male names (gender == 1), concatenated into one long string.
# "".join uses the empty string as separator, so the names are simply glued
# together character after character.
male_mask = train['gender'] == 1
train_male = train.loc[male_mask]
names_male = "".join(train_male['name'])

# All female names (gender == 0), concatenated the same way.
female_mask = train['gender'] == 0
train_female = train.loc[female_mask]
names_female = "".join(train_female['name'])

# top_chars_male:   the n_top most frequent characters across all male names.
# top_chars_female: the n_top most frequent characters across all female names.
# A Python 3 str is already a sequence of Unicode characters, so the
# characters are counted directly; no per-character encode('utf-8') is needed.
# most_common() returns (char, count) pairs, e.g. [('a', 5), ('r', 2)];
# only the characters are kept, and building sets makes lookups O(1).
n_top = 250
top_chars_male = {ch for ch, _ in Counter(names_male).most_common(n_top)}
top_chars_female = {ch for ch, _ in Counter(names_female).most_common(n_top)}

# Prediction rule, checked in this order for every test name:
#   1. the name shares a character with top_chars_male   -> male (1)
#   2. the name shares a character with top_chars_female -> female (0)
#   3. otherwise                                         -> random 0 or 1
preds = []
for name in test['name']:
    # isdisjoint() walks the characters of the name once, instead of
    # re-encoding the name for every candidate character.
    if not top_chars_male.isdisjoint(name):
        pred = 1
    elif not top_chars_female.isdisjoint(name):
        pred = 0
    else:
        pred = np.random.choice(2)
    preds.append(pred)

# Store the predictions in the 'gender' column of the sample submission and
# write the result to my_top250_prediction.csv (without the index column).
gender_column = np.array(preds)
submit['gender'] = gender_column
submit.to_csv('my_top250_prediction.csv', index=False)

基于TF的GBDT模型(Python)

修改方式同上

import pandas as pd
import numpy as np
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier

# Load the data. train.txt and test.txt are comma-separated; the separator is
# passed by keyword because pandas 2.x no longer accepts it as a positional
# argument (it raises TypeError).
train = pd.read_table('train.txt', sep=',')
test = pd.read_table('test.txt', sep=',')
submit = pd.read_csv('sample_submit.csv')

# Male rows (gender == 1): how many there are, and all their names
# concatenated into a single string.
male_mask = train['gender'] == 1
train_male = train.loc[male_mask]
m_cnt = train_male.shape[0]
names_male = "".join(train_male['name'])

# Female rows (gender == 0): same two quantities.
female_mask = train['gender'] == 0
train_female = train.loc[female_mask]
f_cnt = train_female.shape[0]
names_female = "".join(train_female['name'])

# Total number of occurrences of each character in male / female names.
# A Python 3 str iterates character by character, so Counter can consume the
# concatenated names directly; no per-character encode('utf-8') is needed.
counts_male = Counter(names_male)
counts_female = Counter(names_female)


def _tf_features(name):
    """Return the term-frequency features [1_f, 1_m, 2_f, 2_m] for one name.

    For the first and second character of ``name``, the feature is that
    character's occurrence count in female (``_f``) or male (``_m``) training
    names divided by the number of female or male training rows.

    Only the first two characters are used, so a longer name can neither
    overflow the four feature slots nor overwrite the label appended by the
    caller. A name shorter than two characters leaves the unused slots at 0.
    A character never seen in training has a Counter value of 0, giving a
    frequency of 0.0 without raising.
    """
    feats = [0., 0., 0., 0.]
    for pos, ch in enumerate(name[:2]):
        feats[2 * pos] = counts_female[ch] / f_cnt
        feats[2 * pos + 1] = counts_male[ch] / m_cnt
    return feats


# Term frequency (TF) features for every training row, with the gender label
# appended as the fifth element.
train_encoded = [
    _tf_features(name) + [gender]
    for name, gender in zip(train['name'], train['gender'])
]

# Term frequency (TF) features for every test row (no label available).
test_encoded = [_tf_features(name) for name in test['name']]

# Convert the encoded rows to pandas.DataFrame objects.
# 1_f: frequency of the person's first character among the female names in the training set
# 2_f: frequency of the person's second character among the female names in the training set
# 1_m: frequency of the person's first character among the male names in the training set
# 2_m: frequency of the person's second character among the male names in the training set
feature_cols = ['1_f', '1_m', '2_f', '2_m']
train_encoded = pd.DataFrame(train_encoded, columns=feature_cols + ['gender'])
test_encoded = pd.DataFrame(test_encoded, columns=feature_cols)

# Train a GBDT model with default hyper-parameters: every column except
# 'gender' is a feature, 'gender' is the target. Then predict the test set.
train_features = train_encoded.drop(columns='gender')
train_labels = train_encoded['gender']
clf = GradientBoostingClassifier()
clf.fit(train_features, train_labels)
preds = clf.predict(test_encoded)

# Store the predictions in the 'gender' column of the sample submission and
# write the result to my_TF_GBDT_prediction.csv (without the index column).
gender_column = np.array(preds)
submit['gender'] = gender_column
submit.to_csv('my_TF_GBDT_prediction.csv', index=False)
  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值