基于Tensorflow的英文评论二分类模型
前言
通过机器学习训练得到的模型,可以判断英文评论表达的是肯定还是否定含义,从而减轻人工工作量,使对大量意见进行归集和判断成为可能。
==>源代码Github下载
导读
你可以逐行阅读源码和我的注释,然后运行感兴趣的Python文件,即可让模型工作起来。
源码列表
性质 | 内容 | 详细信息 |
---|---|---|
源码 | Python文件4个 | data_helpers.py |
- | - | train.py |
- | - | eval.py |
- | - | text_cnn.py |
数据集 | 训练测试数据 | rt-polarity.neg |
- | - | rt-polarity.pos |
data_helpers.py 源码及分析
import numpy as np
import re
import itertools
from collections import Counter
# Ordered (pattern, replacement) rewrite rules used by clean_str.
# Order matters: unsupported characters are blanked first, contractions and
# punctuation are then split off into their own tokens, and whitespace runs
# are collapsed last.
# The replacements for "(", ")" and "?" are raw strings: the previous non-raw
# literals contained invalid escape sequences, which newer Python versions
# report as a SyntaxWarning. The resulting text is unchanged.
_CLEAN_STR_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", r" \( "),
        (r"\)", r" \) "),
        (r"\?", r" \? "),
        (r"\s{2,}", " "),
    )
)


def clean_str(string):
    """Tokenize and clean a sentence for all datasets except SST.

    Original taken from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Characters other than ASCII letters, digits and ``( ) , ! ? ' ` `` are
    replaced by spaces, common English contraction suffixes ('s, 've, n't,
    're, 'd, 'll) and the punctuation marks ``, ! ( ) ?`` are separated from
    neighbouring text by spaces, runs of whitespace are collapsed to one
    space, and the result is stripped and lower-cased.

    NOTE: ``(``, ``)`` and ``?`` come out preceded by a literal backslash
    (for example ``"a(b"`` becomes ``"a \\( b"``), because ``re`` leaves these
    escapes in the replacement template untouched. This is kept on purpose so
    that the tokens stay identical to what this function has always produced.

    Args:
        string: The raw sentence.

    Returns:
        The cleaned, lower-cased sentence with tokens separated by single
        spaces.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = pattern.sub(replacement, string)
    return string.strip().lower()
def _read_stripped_lines(path):
    """Return every line of the text file at *path*, stripped of whitespace."""
    # The context manager closes the file handle even if reading fails.
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


def load_data_and_labels(positive_data_file, negative_data_file):
    """Load the MR polarity data from files and generate labels.

    Every line of each file is treated as one example. Positive examples are
    placed first, followed by the negative ones, and each sentence is
    normalized with ``clean_str``.

    Labels are two-element one-hot rows:
    positive examples are labelled ``[0, 1]``, negative examples ``[1, 0]``.

    Args:
        positive_data_file: Path of the file holding the positive examples.
        negative_data_file: Path of the file holding the negative examples.

    Returns:
        A two-item list ``[x_text, y]`` where ``x_text`` is the list of
        cleaned sentences and ``y`` is an integer array of shape ``(N, 2)``
        whose rows line up with ``x_text``.
    """
    # Load data from files
    positive_examples = _read_stripped_lines(positive_data_file)
    negative_examples = _read_stripped_lines(negative_data_file)

    # Clean the sentences, positives first so they line up with the labels
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]

    # Generate labels
    labels = ([[0, 1] for _ in positive_examples]
              + [[1, 0] for _ in negative_examples])
    # reshape keeps the (N, 2) layout even when one or both files are empty,
    # a case where concatenating the two label lists would fail or lose the
    # second dimension.
    y = np.array(labels, dtype=int).reshape(-1, 2)
    return [x_text, y]
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """Generate batches of a dataset over a number of epochs.

    The data is converted to a NumPy array once. For every epoch the rows
    are optionally reshuffled (like shuffling a deck of cards) and then
    yielded lazily, one slice of at most ``batch_size`` rows at a time, so
    only the current batch is handed to the caller.

    Args:
        data: Sequence of examples; it is passed through ``np.array``.
        batch_size: Maximum number of rows per batch.
        num_epochs: Number of full passes over the data.
        shuffle: When true, draw a new random row order at every epoch.

    Yields:
        Array slices of the (possibly shuffled) data. The last batch of an
        epoch may be shorter than ``batch_size``. An empty dataset yields no
        batches at all.
    """
    data = np.array(data)
    data_size = len(data)
    for _ in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        # Stepping through the start offsets with integers avoids the float
        # division previously used to count batches, which produced a
        # spurious empty batch when the dataset was empty.
        for start_index in range(0, data_size, batch_size):
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]
train.py 源码及分析
#! /usr/bin/env python
import tensorflow as tf
import numpy as np
import os
import time
import datetime
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
# Parameters
# ==================================================
# Corpus file path definitions.
# Data loading params: the fraction of the training data set aside for
# validation, and the paths of the positive and negative corpus files.
tf.flags.DEFINE_float("dev_sample_percentage", .1, "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file", "./data/rt-polaritydata/rt-polarity.pos", "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file", "./data/rt-polaritydata/rt-polarity.neg", "Data source for the negative data.")
# 模型关键参数定义,卷积想象成这样
#[1] [0] [0]
#[0] [1] [0]
#[0] [0] [1]
# 如上,类推3/4/5 ==>filter_sizes