NGender 根据中文姓名猜测其性别

NGender

根据中文姓名猜测其性别

  • 不到20行纯Python代码(核心部分)
  • 无任何依赖库
  • 兼容python3, python2, pypy
  • 82%的准确率
  • 可用于猜测性别
  • 也可用于判断名字的男性化/女性化程度

使用

pip install ngender

或者(OSX)

brew install https://raw.githubusercontent.com/observerss/homebrew/61b3623967dc9507958dfb517e7f746baa96dcf1/Library/Formula/ngender.rb

然后在命令行中

$ ng 赵本山 宋丹丹
name: 赵本山 => gender: male, probability: 0.9836229687547046
name: 宋丹丹 => gender: female, probability: 0.9759486128949907

当然也可以在Python程序中用

>>> import ngender
>>> ngender.guess('赵本山')
('male', 0.9836229687547046)

>>> ngender.guess('宋丹丹')
('female', 0.9759486128949907)

>>> %timeit guess('宋丹丹')
100000 loops, best of 3: 4.01 µs per loop

原理

数学

贝叶斯公式: P(Y|X) = P(X|Y) * P(Y) / P(X)

当X条件独立时, P(X|Y) = P(X1|Y) * P(X2|Y) * ...

应用到猜名字上

P(gender=男|name=本山) 
= P(name=本山|gender=男) * P(gender=男) / P(name=本山)
= P(name has 本|gender=男) * P(name has 山|gender=男) * P(gender=男) / P(name=本山)

计算

模型数据来自1.csv(即下文问答中提到的charfreq.csv), 每行给出一个字分别在男性、女性名字中出现的次数, 内容如下

char,male,female
明,378860,63221
伟,378757,51232
军,378096,29518
建,366515,51477
华,344928,174529
文,314939,114048
国,314608,29055

测试代码
step1:统计每个字分别在男性、女性名字用字中所占的比例
step2:p的初始值为性别的先验概率, 即P(gender=男)或P(gender=女)
step3:依次乘以名字中每个字在该性别用字中出现的概率
step4:比较男、女两个结果, 按二者的相对占比推断出性别及其概率

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os

__all__ = ['guess']


def py2compat(name):
    """Return ``name`` as text, decoding UTF-8 byte strings when needed.

    On Python 2 a plain ``str`` literal is a byte string, so callers may pass
    either bytes or unicode; on Python 3 they normally pass ``str``. Only
    byte strings are decoded; every other value is returned unchanged.

    :param name: the name to normalise (bytes, bytearray, text, or anything
        else, which is passed through untouched).
    :return: the decoded text for valid UTF-8 byte input, otherwise ``name``
        itself.
    """
    if isinstance(name, (bytes, bytearray)):
        try:
            return name.decode('utf-8')
        except UnicodeDecodeError:
            # Best effort: bytes that are not valid UTF-8 are handed back
            # as-is instead of raising. Only the decode error is caught, so
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return name
    return name


class Guesser(object):
    """Naive-Bayes gender guesser driven by a per-character frequency table.

    The model file is a UTF-8 CSV with a header line followed by
    ``char,male,female`` rows, where ``male`` / ``female`` are integer counts
    of how often the character occurs in male / female names.

    Attributes set by loading the model:
      * ``male_total`` / ``female_total`` -- sum of the respective columns.
      * ``total`` -- ``male_total + female_total``.
      * ``freq`` -- maps each character to ``(female_ratio, male_ratio)``,
        i.e. the character's count divided by that gender's total. Note the
        order: index 0 is female, index 1 is male.

    :param model_path: path of the CSV model; defaults to ``1.csv`` next to
        this module.
    :param verbose: when true (the default) the intermediate values of every
        step are printed, which is what this walkthrough relies on; pass
        ``False`` to use the class silently.
    """

    def __init__(self, model_path=None, verbose=True):
        if model_path is None:
            model_path = os.path.join(os.path.dirname(__file__), '1.csv')
        self.model_path = model_path
        self.verbose = verbose
        self._load_model()

    def _trace(self, value):
        """Print ``value`` unless tracing has been switched off."""
        # getattr keeps instances created without __init__ working.
        if getattr(self, 'verbose', True):
            print(value)

    def _load_model(self):
        """Read the CSV model and turn raw counts into per-gender ratios."""
        self.male_total = 0
        self.female_total = 0
        self.freq = {}

        with open(self.model_path, 'rb') as f:
            # Skip the header line.
            next(f)
            for raw in f:
                line = raw.decode('utf-8')
                if not line.strip():
                    # A blank (e.g. trailing) line would break the 3-way
                    # unpacking below, so ignore it.
                    continue
                char, male, female = line.split(',')
                male_count = int(male)
                female_count = int(female)
                self.male_total += male_count
                self.female_total += female_count
                # Stored as (female, male) to match the gender indices used
                # by prob_for_gender.
                self.freq[char] = (female_count, male_count)

        self.total = self.male_total + self.female_total
        self._trace(self.total)
        self._trace(self.male_total)
        self._trace(self.female_total)
        for char in self.freq:
            female_count, male_count = self.freq[char]
            self.freq[char] = (1. * female_count / self.female_total,
                               1. * male_count / self.male_total)
        # Step 1: share of each character among female / male name characters.
        self._trace(self.freq)

    def guess(self, name):
        """Guess the gender for ``name``.

        Everything after the first character of ``name`` is treated as the
        given name; the first character is ignored. If nothing is left after
        that, the result is decided by the gender priors alone.

        :param name: the full name, as text or UTF-8 bytes.
        :return: ``('male', p)`` or ``('female', p)`` where ``p`` is the
            winning score divided by the sum of both scores, or
            ``('unknown', 0)`` when both scores are equal.
        :raises AssertionError: if a given-name character lies outside
            U+4E00..U+9FA0.
        """
        name = py2compat(name)
        firstname = name[1:]
        for char in firstname:
            if not (u'\u4e00' <= char <= u'\u9fa0'):
                # Raised explicitly (instead of via ``assert``) so the check
                # still runs under ``python -O``; the exception type is the
                # same as before.
                raise AssertionError(u'姓名必须为中文')

        pf = self.prob_for_gender(firstname, 0)

        self._trace('------------')
        pm = self.prob_for_gender(firstname, 1)

        # Step 4: compare both scores and report the winner's relative share.
        if pm > pf:
            return ('male', 1. * pm / (pm + pf))
        elif pm < pf:
            return ('female', 1. * pf / (pm + pf))
        else:
            return ('unknown', 0)

    def prob_for_gender(self, firstname, gender=0):
        """Return the unnormalised score of ``firstname`` for one gender.

        :param firstname: the given-name characters to score.
        :param gender: ``0`` for female, anything else for male; for male the
            value is also used as the index into the ``freq`` tuples, so it
            should be ``1``.
        :return: the gender prior multiplied by the ratio of every character;
            a character missing from the model contributes a factor of 0.
        """
        if gender == 0:
            p = 1. * self.female_total / self.total
        else:
            p = 1. * self.male_total / self.total
        # Step 2: p starts out as the prior probability of the gender.
        self._trace(p)
        for char in firstname:
            # Step 3: multiply by the character's ratio for this gender.
            p *= self.freq.get(char, (0, 0))[gender]
            self._trace(char)
            self._trace(p)
        return p


# Shared instance: the model is loaded once, when this module is imported.
guesser = Guesser()


def guess(name):
    """Guess the gender for ``name`` using the shared ``guesser`` instance.

    :param name: the full name to analyse.
    :return: whatever ``Guesser.guess`` returns for ``name``.
    """
    outcome = guesser.guess(name)
    return outcome


if __name__ == '__main__':
    # Demo run: guess one sample name and print the resulting tuple.
    demo_result = guess("张结论")
    print(demo_result)
  1. 文件charfreq.csv(即上文代码中读取的1.csv)是怎么来的?

    曾经有个东西叫开房记录.avi(雾),里面有名字和性别, 2000w条, 统计一下得出

  2. 怎么算 P(name has 本|gender=男)?

    “本”在男性名字中出现的次数 / 男性字出现的总次数

  3. 怎么算 P(gender=男)?

    男性名字用字出现的总次数 / 所有名字用字出现的总次数(即代码中的 male_total / total)

  4. 怎么算 P(name=本山)?

    不用算。男、女两个概率的分母都是P(name=本山), 最终结果取 pm / (pm + pf) 这样的比值, 分母会互相约去

>>> ngender.guess('李胜男')
('male', 0.851334658742)
  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
等基本信息。下面是一个基于Qt的对话框示例,可以输入姓名性别、学号和生日: ```c++ #include <QDialog> #include <QLabel> #include <QLineEdit> #include <QComboBox> #include <QDateEdit> #include <QDialogButtonBox> #include <QHBoxLayout> #include <QVBoxLayout> class InfoDialog : public QDialog { Q_OBJECT public: InfoDialog(QWidget *parent = nullptr); QString getName() const; QString getGender() const; QString getStudentID() const; QDate getBirthday() const; private slots: void accept(); private: QLabel *nameLabel; QLineEdit *nameEdit; QLabel *genderLabel; QComboBox *genderComboBox; QLabel *studentIDLabel; QLineEdit *studentIDEdit; QLabel *birthdayLabel; QDateEdit *birthdayEdit; QDialogButtonBox *buttonBox; }; InfoDialog::InfoDialog(QWidget *parent) : QDialog(parent) { nameLabel = new QLabel(tr("Name:")); nameEdit = new QLineEdit; genderLabel = new QLabel(tr("Gender:")); genderComboBox = new QComboBox; genderComboBox->addItem(tr("Male")); genderComboBox->addItem(tr("Female")); studentIDLabel = new QLabel(tr("Student ID:")); studentIDEdit = new QLineEdit; birthdayLabel = new QLabel(tr("Birthday:")); birthdayEdit = new QDateEdit; birthdayEdit->setDisplayFormat("yyyy-MM-dd"); birthdayEdit->setCalendarPopup(true); birthdayEdit->setMaximumDate(QDate::currentDate()); buttonBox = new QDialogButtonBox(QDialogButtonBox::Ok | QDialogButtonBox::Cancel); connect(buttonBox, SIGNAL(accepted()), this, SLOT(accept())); connect(buttonBox, SIGNAL(rejected()), this, SLOT(reject())); QHBoxLayout *nameLayout = new QHBoxLayout; nameLayout->addWidget(nameLabel); nameLayout->addWidget(nameEdit); QHBoxLayout *genderLayout = new QHBoxLayout; genderLayout->addWidget(genderLabel); genderLayout->addWidget(genderComboBox); QHBoxLayout *studentIDLayout = new QHBoxLayout; studentIDLayout->addWidget(studentIDLabel); studentIDLayout->addWidget(studentIDEdit); QHBoxLayout *birthdayLayout = new QHBoxLayout; birthdayLayout->addWidget(birthdayLabel); birthdayLayout->addWidget(birthdayEdit); QVBoxLayout *mainLayout = new QVBoxLayout; 
mainLayout->addLayout(nameLayout); mainLayout->addLayout(genderLayout); mainLayout->addLayout(studentIDLayout); mainLayout->addLayout(birthdayLayout); mainLayout->addWidget(buttonBox); setLayout(mainLayout); setWindowTitle(tr("Information")); } QString InfoDialog::getName() const { return nameEdit->text(); } QString InfoDialog::getGender() const { return genderComboBox->currentText(); } QString InfoDialog::getStudentID() const { return studentIDEdit->text(); } QDate InfoDialog::getBirthday() const { return birthdayEdit->date(); } void InfoDialog::accept() { if (nameEdit->text().isEmpty() || studentIDEdit->text().isEmpty()) { QMessageBox::warning(this, tr("Warning"), tr("Name and Student ID are required.")); } else { QDialog::accept(); } } ``` 在主窗口中使用该对话框的示例代码: ```c++ #include <QApplication> #include <QPushButton> #include <QMessageBox> #include "infodialog.h" int main(int argc, char *argv[]) { QApplication a(argc, argv); QPushButton button("Show Info Dialog"); QObject::connect(&button, &QPushButton::clicked, [&](){ InfoDialog dialog; if (dialog.exec() == QDialog::Accepted) { QString name = dialog.getName(); QString gender = dialog.getGender(); QString studentID = dialog.getStudentID(); QDate birthday = dialog.getBirthday(); QMessageBox::information(nullptr, "Information", QString("Name: %1\nGender: %2\nStudent ID: %3\nBirthday: %4") .arg(name).arg(gender).arg(studentID).arg(birthday.toString("yyyy-MM-dd"))); } }); button.show(); return a.exec(); } ``` 运行后点击“Show Info Dialog”按钮,就可以弹出一个对话框来输入基本信息。如果输入的姓名或学号为空,点击“OK”按钮时会弹出一个警告框提示用户。如果姓名和学号都已填写,点击“OK”按钮后`exec()`会返回`QDialog::Accepted`,然后可以通过对话框的公共成员函数(getName()、getGender()等)获取输入的信息。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值