# -*-coding:utf-8 -*-
import sys
import re
import os
from jieba import *
import jieba
reload(sys)
sys.setdefaultencoding('utf8')
import scipy
import numpy
import sklearn
from scipy.stats import pearsonr
# lecture地址
lecture_url = '/home/hjs/Downloads/firstproject/lecture.txt'
class model_1_gram(object):
    """Unigram keyword model: scores essays by how much lecture vocabulary they reuse.

    Each lecture word x gets a weight W(x) from one of several schemes
    (naive / probability / position / good-sample frequency / good-vs-bad),
    and an essay's score is sum(W(x) * C(x|E)) / n(E).  getScore() prints the
    Pearson correlation between these scores and the human grades (1-5).

    NOTE(review): this module targets Python 2 (`str.decode(...).encode(...)`
    round-trips, `reload(sys)` at file top) — confirm the runtime before porting.
    """

    def lecture_1_gram(self):
        """Return the lecture's words as a list with stop words removed."""
        with open('/home/hjs/Downloads/firstproject/lecture.txt', 'r') as lecture:
            # Input is GBK-encoded (written on Windows); normalize to UTF-8 bytes.
            content = lecture.read().strip().decode('gbk').encode('utf-8')
        # Tokenize on runs of letters/apostrophes (splits at spaces, commas, periods).
        lecture_list = re.findall('([A-Za-z\']+)', content)
        with open('/home/hjs/Downloads/firstproject/stopword.txt', 'r') as stopword:
            stopword_content = stopword.read().strip().decode('gbk').encode('utf-8')
        # The stop-word file separates entries with " \n" (space then newline).
        stopword_content = re.split(' \n', stopword_content)
        lecture_list_new = []
        for word in lecture_list:
            # Case-insensitive stop-word filtering; findall never yields '', but
            # the guard is kept for safety.
            if word.lower() not in stopword_content and word != '':
                lecture_list_new.append(word)
        return lecture_list_new

    def CxE(self, word, url):
        """Return C(x|E): number of occurrences of `word` in the document at `url`."""
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
        essay_list = re.findall('([A-Za-z\']+)', results)
        return essay_list.count(word)

    def getNum(self, url):
        """Return n: the total word count of the document at `url`."""
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
        return len(re.findall('([A-Za-z\']+)', results))

    def naive_getWeight(self):
        """Model 1: every lecture word gets the same weight."""
        return 1

    def prob_getWeight(self, word):
        """Model 2: weight = relative frequency of `word` inside the lecture."""
        num = self.CxE(word, lecture_url)
        lecture_num = self.getNum(lecture_url)
        return num * 1.0 / lecture_num

    def position_getWeight(self, word):
        """Model 3: weight = 1-based position of the first occurrence of `word`
        in the lecture, normalized by the lecture length.

        If `word` never occurs, the scan runs off the end and the weight is 1.0
        (unchanged from the original fall-through behavior).
        """
        with open(lecture_url, 'r') as lecture:
            results = lecture.read().strip().decode('gbk').encode('utf-8')
        lecture_list = re.findall('([A-Za-z\']+)', results)
        n = 0
        for unit in lecture_list:
            n += 1
            if unit == word:
                break
        lecture_num = self.getNum(lecture_url)
        return n * 1.0 / lecture_num

    def _containing_fraction(self, word, directories):
        """Return the fraction of essays under `directories` containing `word`."""
        num = 0
        total = 0
        for directory in directories:
            for single_file in os.listdir(directory):
                with open(directory + single_file, 'r') as essay:
                    results = essay.read().strip().decode('gbk').encode('utf-8')
                if word in re.findall('([A-Za-z\']+)', results):
                    num += 1
                total += 1
        # Float division; `total` is the number of essays scanned.
        return num * 1.0 / total

    def Good_getWeight(self, word):
        """Model 5: fraction of high-scoring (grade 4/5) essays containing `word`.

        BUG FIX: the original computed `num / total * 1.0`, which in Python 2
        performs *integer* division first and therefore almost always returned
        0.0.  Now computed as `num * 1.0 / total` (matching GoodVsBad_getWeight).
        """
        return self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/4/',
                   '/home/hjs/Downloads/firstproject/essay/5/'))

    def GoodVsBad_getWeight(self, word):
        """Model 6: good-essay (grade 4/5) containment rate minus bad-essay
        (grade 1/2) containment rate."""
        good = self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/4/',
                   '/home/hjs/Downloads/firstproject/essay/5/'))
        bad = self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/1/',
                   '/home/hjs/Downloads/firstproject/essay/2/'))
        return good - bad

    def getScore(self):
        """Score every essay under grade folders 1..5 and print the Pearson
        correlation between the computed scores and the human grades."""
        x = []
        y = []
        # The lecture word list does not change per essay; the original
        # re-read and re-filtered the lecture inside the per-essay loop.
        lecture_list = self.lecture_1_gram()
        for n in range(1, 6):
            url = '/home/hjs/Downloads/firstproject/essay/' + str(n)
            print(str(n) + '分下的文章分数')
            for single_file in os.listdir(url):
                score = 0
                path = url + '/' + single_file
                with open(path, 'r') as essay:
                    content = essay.read()
                essay_list = re.findall('([A-Za-z\']+)', content)
                for unit in lecture_list:
                    if unit in essay_list:
                        # Swap in another *_getWeight call to try other models:
                        # GoodVsBad_getWeight / naive_getWeight /
                        # prob_getWeight / position_getWeight.
                        W = self.Good_getWeight(unit)
                        score += W * self.CxE(unit, path) * 1.0
                score /= self.getNum(path)
                x.append(float(score))
                y.append(float(n))
        print(scipy.stats.pearsonr(x, y))
        print('\n\n\n')
class model_4_gram(object):
    """4-gram variant of the keyword model: documents are represented as lists
    of space-joined 4-word sequences, then scored like the unigram model."""

    @staticmethod
    def _stride_4_grams(words):
        """Return every consecutive 4-gram of `words`, space-joined.

        Enumeration order matches the original hand-unrolled code: four
        stride-4 passes starting at offsets 0..3 (a permutation of natural
        left-to-right order; all count/membership uses are order-independent).
        The original repeated this loop body eight times across two methods.
        """
        grams = []
        length = len(words)
        for start in range(4):
            i = start
            while i < length - 1:
                if (i + 3) > (length - 1):
                    break
                grams.append(' '.join(words[i:i + 4]))
                i += 4
        return grams

    def lecture_4_gram(self):
        """Return all 4-grams of the lecture transcript."""
        with open('/home/hjs/Downloads/firstproject/lecture.txt', 'r') as lecture:
            # GBK-encoded input (Windows); normalize to UTF-8 bytes.
            content = lecture.read().strip().decode('gbk').encode('utf-8')
        return self._stride_4_grams(re.findall('([A-Za-z\']+)', content))

    def essay_4_gram(self, url):
        """Return all 4-grams of the essay at `url`."""
        with open(url, 'r') as essay:
            content = essay.read().strip().decode('gbk').encode('utf-8')
        return self._stride_4_grams(re.findall('([A-Za-z\']+)', content))

    def CxE(self, word, url):
        """Return C(x|E): occurrences of the 4-gram `word` in the essay at `url`."""
        return self.essay_4_gram(url).count(word)

    def getNum(self, url):
        """Return the total (unigram) word count of the document at `url`."""
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
        return len(re.findall('([A-Za-z\']+)', results))

    def naive_getWeight(self):
        """Model 1: constant weight."""
        return 1

    def prob_getWeight(self, word):
        """Model 2: frequency of the 4-gram in the lecture over the lecture's
        word count."""
        return self.CxE(word, lecture_url) * 1.0 / self.getNum(lecture_url)

    def position_getWeight(self, word):
        """Model 3: 1-based position of the 4-gram's first word in the lecture,
        normalized by lecture word count.

        BUG FIX: the original scanner reset its match state on a mismatch
        without re-testing the current token against the first target word, so
        it could skip real occurrences (e.g. finding "a b c d" after "a a b c d"
        failed).  A direct sliding-window comparison cannot miss.
        """
        with open(lecture_url, 'r') as lecture:
            results = lecture.read().strip().decode('gbk').encode('utf-8')
        lecture_list = re.findall('([A-Za-z\']+)', results)
        parts = word.split(' ')
        # Default mirrors the original fall-through when the scan ran off the
        # end: position = lecture length minus 3.
        n = len(lecture_list) - 3
        for i in range(len(lecture_list) - 3):
            if lecture_list[i:i + 4] == parts:
                n = i + 1  # 1-based index of the gram's first word
                break
        return n * 1.0 / self.getNum(lecture_url)

    def _containing_fraction(self, word, directories):
        """Return the fraction of essays under `directories` whose 4-grams
        include `word`."""
        num = 0
        total = 0
        for directory in directories:
            for single_file in os.listdir(directory):
                if word in self.essay_4_gram(directory + single_file):
                    num += 1
                total += 1
        return num * 1.0 / total

    def Good_getWeight(self, word):
        """Model 5: fraction of high-scoring (grade 4/5) essays containing `word`."""
        return self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/4/',
                   '/home/hjs/Downloads/firstproject/essay/5/'))

    def GoodVsBad_getWeight(self, word):
        """Model 6: good-essay (grade 4/5) containment rate minus bad-essay
        (grade 1/2) containment rate."""
        good = self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/4/',
                   '/home/hjs/Downloads/firstproject/essay/5/'))
        bad = self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/1/',
                   '/home/hjs/Downloads/firstproject/essay/2/'))
        return good - bad

    def getScore(self):
        """Score every essay under grade folders 1..5 and print the Pearson
        correlation between the computed scores and the human grades."""
        x = []
        y = []
        # Lecture 4-grams are invariant across essays; the original rebuilt
        # them inside the per-essay loop.
        lecture_list = self.lecture_4_gram()
        for n in range(1, 6):
            url = '/home/hjs/Downloads/firstproject/essay/' + str(n)
            print(str(n) + '分下的文章分数')
            for single_file in os.listdir(url):
                score = 0
                path = url + '/' + single_file
                essay_list_new = self.essay_4_gram(path)
                for unit in lecture_list:
                    if unit in essay_list_new:
                        # Swap in another *_getWeight call to try other models:
                        # GoodVsBad_getWeight / prob_getWeight /
                        # position_getWeight / Good_getWeight.
                        W = self.naive_getWeight()
                        score += W * self.CxE(unit, path) * 1.0
                score /= self.getNum(path)
                x.append(float(score))
                y.append(float(n))
        print(scipy.stats.pearsonr(x, y))
        print('\n\n\n')
class model_2_gram(object):
    """Bigram variant: documents are represented as overlapping 2-word grams,
    then scored like the unigram model."""

    def lecture_2_gram(self, url):
        """Return the document at `url` as a list of space-joined word bigrams.

        Two findall passes are needed because findall consumes its match: the
        first pass yields pairs starting at even word positions, the second
        (with the first word stripped off) yields the odd-position pairs.
        Order matters downstream: getFPx relies on this exact list order.
        """
        with open(url, 'r') as lecture:
            # GBK-encoded input (Windows); undecodable bytes ignored.
            content = lecture.read().strip().decode('gbk', 'ignore').encode('utf-8')
        # Collapse whitespace runs (e.g. 4-space paragraph indents) to one space.
        content = re.sub(r"\s{2,}", " ", content)
        pair_pattern = '([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)'
        result = []
        for pair in re.findall(pair_pattern, content):
            result.append(str(pair[0]) + ' ' + str(pair[2]))
        # Second pass: drop the leading word, pair again.
        content1 = re.search('[A-Za-z\']+\s([\s\S]*)', content)
        for pair in re.findall(pair_pattern, content1.group(1)):
            result.append(str(pair[0]) + ' ' + str(pair[2]))
        return result

    def getNum(self, url):
        """Return nL: total number of bigrams (both passes) in the document.

        Delegates to lecture_2_gram so counting and extraction cannot drift
        apart (the original duplicated the extraction here, and decoded
        without 'ignore', so it could raise on input lecture_2_gram accepts).
        """
        return len(self.lecture_2_gram(url))

    def getCxL(self, world, url):
        """Return C(x|L): occurrences of bigram `world` in the document at `url`."""
        return self.lecture_2_gram(url).count(world)

    def getFPx(self, world, url):
        """Return FP(x): 1-based position of the first occurrence of `world`
        in the document's bigram list (ValueError if absent, as before)."""
        return self.lecture_2_gram(url).index(world) + 1

    def naive_getWeight(self):
        """Model 1: constant weight."""
        return 1

    def prob_getWeight(self, world):
        """Model 2: relative frequency of the bigram within the lecture."""
        return self.getCxL(world, lecture_url) * 1.0 / self.getNum(lecture_url)

    def position_getWeight(self, world):
        """Model 3: normalized first position of the bigram in the lecture."""
        return self.getFPx(world, lecture_url) * 1.0 / self.getNum(lecture_url)

    def _containing_fraction(self, word, directories):
        """Return the fraction of essays under `directories` whose bigrams
        include `word`."""
        num = 0
        total = 0
        for directory in directories:
            for single_file in os.listdir(directory):
                if word in self.lecture_2_gram(directory + single_file):
                    num += 1
                total += 1
        return num * 1.0 / total

    def Good_getWeight(self, word):
        """Model 5: fraction of high-scoring (grade 4/5) essays containing `word`."""
        return self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/4/',
                   '/home/hjs/Downloads/firstproject/essay/5/'))

    def GoodVsBad_getWeight(self, word):
        """Model 6: good-essay (grade 4/5) containment rate minus bad-essay
        (grade 1/2) containment rate."""
        good = self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/4/',
                   '/home/hjs/Downloads/firstproject/essay/5/'))
        bad = self._containing_fraction(
            word, ('/home/hjs/Downloads/firstproject/essay/1/',
                   '/home/hjs/Downloads/firstproject/essay/2/'))
        return good - bad

    def getScore(self):
        """Score every essay under grade folders 1..5 and print the Pearson
        correlation between the computed scores and the human grades.

        BUG FIX: the original initialized `score` once before all loops, so
        each essay's score silently accumulated every previous essay's
        normalized score.  It is now reset per essay, consistent with the
        1-gram and 4-gram models.  Also removed a `with open(...)` that never
        read the file.
        """
        x = []
        y = []
        # Lecture bigrams are invariant across essays; build them once.
        lecture_list = self.lecture_2_gram(lecture_url)
        for n in range(1, 6):
            url = '/home/hjs/Downloads/firstproject/essay/' + str(n)
            print(str(n) + '分下的文章分数')
            for single_file in os.listdir(url):
                score = 0
                path = url + '/' + single_file
                essay_list = self.lecture_2_gram(path)
                for unit in essay_list:
                    if unit in lecture_list:
                        # Swap in another *_getWeight call to try other models:
                        # naive_getWeight / prob_getWeight /
                        # Good_getWeight / GoodVsBad_getWeight.
                        W = self.position_getWeight(unit)
                        score += W * self.getCxL(unit, path) * 1.0
                score /= self.getNum(path)
                x.append(float(score))
                y.append(float(n))
        print(scipy.stats.pearsonr(x, y))
        print('\n\n\n')
# Driver: run the unigram model end-to-end.  The original bound the instance
# to the class's own name (`model_1_gram = model_1_gram()`), shadowing the
# class and making a second instantiation impossible; use distinct names.
unigram_model = model_1_gram()
unigram_model.getScore()
# bigram_model = model_2_gram()
# bigram_model.getScore()
# fourgram_model = model_4_gram()
# fourgram_model.getScore()
# NOTE(review): everything that used to follow here was an accidental
# byte-for-byte duplicate of the entire file above (the same imports,
# `reload(sys)` setup, the three model_*_gram classes, and the driver code
# pasted a second time).  Executing it redefined every class and re-ran the
# whole getScore() pass twice.  The duplicate has been removed; the single
# definitions above are the only copy.