Below is a simple 1-gram implementation based on the paper. Five of the models are implemented; the reading-based model is missing because the reading material could not be obtained. The corpus this code was run on is not published.
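All five models score an essay in the same way and differ only in how the per-word weight W is chosen. As computed by the code below, the score of an essay E is

    score(E) = ( Σ_w W(w) · C(w, E) ) / N(E)

where the sum runs over the lecture's stopword-filtered token list, C(w, E) counts how often lecture word w occurs in the essay, and N(E) is the essay's total token count, so a longer essay is not favored merely for its length.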
# -*- coding: utf-8 -*-
import sys
import re
import os
reload(sys)
sys.setdefaultencoding('utf8')

# Path to the lecture transcript
lecture_url = 'firstproject/lecture.txt'

class model_1_gram(object):
    # Collect the lecture's 1-gram tokens with stopwords removed and
    # return them as a list
    def lecture_1_gram(self):
        with open(lecture_url, 'r') as lecture:
            # The corpus files are GBK-encoded (a Windows artifact), so
            # re-encode the text to UTF-8 first
            content = lecture.read().strip().decode('gbk').encode('utf-8')
        # Tokenize: keep runs of letters and apostrophes
        lecture_list = re.findall('([A-Za-z\']+)', content)
        # Load the stopword list and filter it out case-insensitively
        with open('firstproject/stopword.txt', 'r') as stopword:
            stopword_content = stopword.read().strip().decode('gbk').encode('utf-8')
        stopword_list = re.split(' \n', stopword_content)
        lecture_list_new = []
        for word in lecture_list:
            if word.lower() not in stopword_list:
                lecture_list_new.append(word)
        return lecture_list_new
    # Return the number of times `word` occurs in the essay at `url`
    def CxE(self, word, url):
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
        essay_list = re.findall('([A-Za-z\']+)', results)
        return essay_list.count(word)
    # Return the total token count of the file at `url` (the n used to
    # normalize the weights and scores)
    def getNum(self, url):
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
        return len(re.findall('([A-Za-z\']+)', results))
    # Weight for the first (naive) model: every lecture word counts equally
    def naive_getWeight(self):
        return 1

    # Weight for the second model: the word's relative frequency in the lecture
    def prob_getWeight(self, word):
        num = self.CxE(word, lecture_url)
        lecture_num = self.getNum(lecture_url)
        return num * 1.0 / lecture_num
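    # For example, a word that occurs 5 times in a 500-token lecture gets
    # weight 5/500 = 0.01 (numbers assumed for illustration)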
    # Weight for the third model: the relative position of the word's
    # first occurrence in the lecture
    def position_getWeight(self, word):
        with open(lecture_url, 'r') as lecture:
            results = lecture.read().strip().decode('gbk').encode('utf-8')
        lecture_list = re.findall('([A-Za-z\']+)', results)
        # n becomes the 1-based index of the first occurrence of `word`
        n = 0
        for unit in lecture_list:
            n += 1
            if unit == word:
                break
        lecture_num = self.getNum(lecture_url)
        return n * 1.0 / lecture_num
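    # For example, if the lecture has 500 tokens and `word` first appears
    # as its 25th token, the weight is 25/500 = 0.05, so words mentioned
    # early in the lecture get smaller weights (numbers assumed for
    # illustration)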
    # Shared helper: the fraction of essays under the given directories
    # that contain `word` at least once (its document frequency)
    def doc_freq(self, word, dirs):
        num = 0
        total = 0
        for d in dirs:
            # os.listdir yields every essay file in the score folder
            for single_file in os.listdir(d):
                with open(d + single_file, 'r') as essay:
                    results = essay.read().strip().decode('gbk').encode('utf-8')
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
                total += 1
        return num * 1.0 / total

    # Weight for the fifth model: document frequency of the word among
    # the good (4- and 5-point) essays
    def Good_getWeight(self, word):
        return self.doc_freq(word, ['firstproject/essay/4/', 'firstproject/essay/5/'])
    # Weight for the sixth model: document frequency among the good (4/5)
    # essays minus document frequency among the bad (1/2) essays
    def GoodVsBad_getWeight(self, word):
        Good_W = self.doc_freq(word, ['firstproject/essay/4/', 'firstproject/essay/5/'])
        Bad_W = self.doc_freq(word, ['firstproject/essay/1/', 'firstproject/essay/2/'])
        return Good_W - Bad_W
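    # For example, if 60% of the 4/5-point essays contain the word but only
    # 20% of the 1/2-point essays do, its weight is 0.6 - 0.2 = 0.4; a word
    # typical of bad essays gets a negative weight (percentages assumed for
    # illustration)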
    # Score every essay: for each lecture word that appears in the essay,
    # look up its weight under the chosen model and accumulate
    # weight * count, then normalize by the essay's length
    def getScore(self):
        # Collect the lecture words once instead of re-reading the lecture
        # for every essay
        lecture_list = self.lecture_1_gram()
        for n in range(1, 6):
            # Build the path of the score-n folder
            url = 'firstproject/essay/' + str(n)
            # os.listdir yields every essay file in that folder
            for single_file in os.listdir(url):
                with open(url + '/' + single_file, 'r') as essay:
                    content = essay.read().strip().decode('gbk').encode('utf-8')
                essay_list = re.findall('([A-Za-z\']+)', content)
                # Each essay is scored independently
                score = 0
                for unit in lecture_list:
                    if unit in essay_list:
                        # Pick one model's weight here; the other models
                        # are kept commented out for easy switching
                        W = self.naive_getWeight()
                        # W = self.prob_getWeight(unit)
                        # W = self.position_getWeight(unit)
                        # W = self.Good_getWeight(unit)
                        # W = self.GoodVsBad_getWeight(unit)
                        cxe = self.CxE(unit, url + '/' + single_file)
                        score += W * cxe * 1.0
                # Normalize by the essay's token count
                num = self.getNum(url + '/' + single_file)
                score /= num
                print str(score) + ' '

if __name__ == '__main__':
    scorer = model_1_gram()
    scorer.getScore()
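Switching models currently means commenting lines in and out inside getScore. A small dispatch helper would avoid that; the sketch below is a hypothetical refactor rather than part of the original code, and the model names in it are made up for illustration:

# Hypothetical helper (not in the original code): map a model name to the
# corresponding weight method of a model_1_gram instance.
def get_weight(model, name, word):
    if name == 'naive':
        return model.naive_getWeight()
    if name == 'prob':
        return model.prob_getWeight(word)
    if name == 'position':
        return model.position_getWeight(word)
    if name == 'good':
        return model.Good_getWeight(word)
    if name == 'good_vs_bad':
        return model.GoodVsBad_getWeight(word)
    raise ValueError('unknown model: ' + name)

getScore could then take the model name as a parameter and replace the hard-coded W = ... lines with a single call such as W = get_weight(self, name, unit).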