# -*- coding: utf-8 -*-
import sys
import re
import os
reload(sys)
sys.setdefaultencoding('utf8')
# path to the lecture file (the full path is truncated in the original post)
lecture_url = '/home/hjs/Downloads/'
class model_2_gram(object):
    # Collect the word bigrams of a lecture or essay and return them
    # as a list of "word1 word2" strings.
    def lecture_2_gram(self, url):
        with open(url, 'r') as lecture:
            # Decode as GBK and re-encode as UTF-8 to work around
            # encoding problems on Windows
            content = lecture.read().strip().decode('gbk', 'ignore').encode('utf-8')
            # Collapse runs of whitespace (e.g. the four spaces that open
            # each paragraph) into a single space
            content = re.sub(r"\s{2,}", " ", content)
            # First pass: take non-overlapping word pairs from the start
            # of the text and store them in lecture_list_new
            lecture_2_list = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content)
            lecture_list_new = []
            for i in lecture_2_list:
                word = str(i[0]) + ' ' + str(i[2])
                lecture_list_new.append(word)
            # Second pass: drop the first word and take word pairs again,
            # so the bigrams skipped by the first pass are also collected
            content1 = re.search('[A-Za-z\']+\s([\s\S]*)', content)
            lecture_2_list1 = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content1.group(1))
            for j in lecture_2_list1:
                word = str(j[0]) + ' ' + str(j[2])
                lecture_list_new.append(word)
            return lecture_list_new
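    # For illustration, on the hypothetical input "We like cats and dogs"
    # the first pass yields the non-overlapping pairs ['We like', 'cats and']
    # and the second pass, run on "like cats and dogs", yields
    # ['like cats', 'and dogs'], so together the two passes cover every
    # adjacent bigram of the text.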
    # Compute n_L: the total number of bigrams in the text at url
    def getNum(self, url):
        with open(url, 'r') as lecture:
            content = lecture.read().strip().decode('gbk', 'ignore').encode('utf-8')
            content = re.sub(r"\s{2,}", " ", content)
            lecture_list = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content)
            num1 = len(lecture_list)
            content1 = re.search('[A-Za-z\']+\s([\s\S]*)', content)
            lecture_list1 = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content1.group(1))
            num2 = len(lecture_list1)
            num = num1 + num2
            return num
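    # Note: this counts exactly the bigrams that lecture_2_gram collects,
    # so it is equivalent to len(self.lecture_2_gram(url)).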
    # Compute C(x|L): how many times bigram x occurs in the text at url
    def getCxL(self, word, url):
        lecture_list = self.lecture_2_gram(url)
        num = lecture_list.count(word)
        return num
    # Compute FP(x): the 1-based position of the first occurrence of
    # bigram x (raises ValueError if x is absent)
    def getFPx(self, word, url):
        lecture_list = self.lecture_2_gram(url)
        num = lecture_list.index(word)
        return num + 1
    # Weight of the first model (naive): every lecture bigram counts equally
    def naive_getWeight(self):
        return 1
    # Weight of the second model (probability): relative frequency of x
    # in the lecture
    def prob_getWeight(self, word):
        num = self.getCxL(word, lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = num * 1.0 / lecture_num
        return W
    # Weight of the third model (position): first position of x in the
    # lecture, normalized by n_L
    def position_getWeight(self, word):
        FPX = self.getFPx(word, lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = FPX * 1.0 / lecture_num
        return W
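    # In formulas, for a lecture L containing n_L bigrams:
    #   W_naive(x)    = 1
    #   W_prob(x)     = C(x|L) / n_L
    #   W_position(x) = FP(x) / n_L
    # e.g. a bigram occurring 3 times in a 600-bigram lecture gets
    # W_prob = 3 / 600 = 0.005.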
    # Weight of the fifth model (good essays): fraction of good essays
    # that contain bigram x
    def Good_getWeight(self, word):
        # directories of the good (high-scoring) essays; the full paths
        # are truncated in the original post
        url4 = '/home/hjs/Downloads/'
        url5 = '/home/hjs/Downloads/'
        # list every file in each directory
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        num = 0
        total = 0
        for single_file4 in file_name4:
            essay_list = self.lecture_2_gram(url4 + single_file4)
            if word in essay_list:
                num += 1
            total += 1
        for single_file5 in file_name5:
            essay_list = self.lecture_2_gram(url5 + single_file5)
            if word in essay_list:
                num += 1
            total += 1
        W = num * 1.0 / total
        print W
        return W
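    # i.e. W_good(x) = |good essays containing x| / |good essays|,
    # the document frequency of bigram x among the good essays.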
    # Weight of the sixth model (good vs. bad essays)
    def GoodVsBad_getWeight(self, word):
        # fraction of good essays that contain bigram x; the full paths
        # are truncated in the original post
        url4 = '/home/hjs/Downloads/'
        url5 = '/home/hjs/Downloads/'
        # list every file in each directory
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        num = 0
        total = 0
        for single_file4 in file_name4:
            essay_list = self.lecture_2_gram(url4 + single_file4)
            if word in essay_list:
                num += 1
            total += 1
        for single_file5 in file_name5:
            essay_list = self.lecture_2_gram(url5 + single_file5)
            if word in essay_list:
                num += 1
            total += 1
        Good_W = num * 1.0 / total
        # fraction of bad essays that contain bigram x
        url1 = '/home/hjs/Downloads/'
        url2 = '/home/hjs/Downloads/'
        # list every file in each directory
        file_name1 = os.listdir(url1)
        file_name2 = os.listdir(url2)
        num = 0
        total = 0
        for single_file1 in file_name1:
            essay_list = self.lecture_2_gram(url1 + single_file1)
            if word in essay_list:
                num += 1
            total += 1
        for single_file2 in file_name2:
            essay_list = self.lecture_2_gram(url2 + single_file2)
            if word in essay_list:
                num += 1
            total += 1
        Bad_W = num * 1.0 / total
        W = Good_W - Bad_W
        return W
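    # i.e. W_goodvsbad(x) = P(x | good essay) - P(x | bad essay); unlike
    # the other weights this difference can be negative, penalizing
    # bigrams that are more typical of low-scoring essays.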
    # Score each essay with the chosen model, using the paper's formula
    def getScore(self):
        lecture_list = self.lecture_2_gram(lecture_url)
        n = 1
        # essays are grouped into one directory per human score, 1 to 5
        while n < 6:
            url = '/home/hjs/Downloads/' + str(n)
            print url
            file_name = os.listdir(url)
            for single_file in file_name:
                essay_list = self.lecture_2_gram(url + '/' + single_file)
                # reset the score for each essay
                score = 0
                # sum W(x) * C(x|E) over the distinct bigrams x that the
                # essay shares with the lecture
                for x in set(essay_list):
                    if x in lecture_list:
                        # pick one of the weight models:
                        # W = self.naive_getWeight()
                        # W = self.prob_getWeight(x)
                        # W = self.position_getWeight(x)
                        # W = self.Good_getWeight(x)
                        W = self.GoodVsBad_getWeight(x)
                        cxe = self.getCxL(x, url + '/' + single_file)
                        score += W * cxe * 1.0
                # normalize by the number of bigrams in the essay
                nE = self.getNum(url + '/' + single_file)
                score /= nE
                print score
            n += 1
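# The loop above implements the scoring formula from the paper:
#   Score(E) = (1/n_E) * sum over bigrams x in both E and L of W(x) * C(x|E)
# where L is the lecture, C(x|E) is how often bigram x occurs in essay E,
# and n_E is the number of bigrams in E; the models differ only in W(x).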
a = model_2_gram()
a.getScore()
# a.lecture_2_gram(lecture_url)
# a.getNum(lecture_url)
# a.getCxL('a', lecture_url)
# a.getFPx('It', lecture_url)
# a.Good_getWeight('a')
A simple 2-gram implementation based on the paper "Content Importance Models for Scoring Writing From Sources".
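The implemented weight models are: a constant weight for every shared bigram (naive), the bigram's relative frequency in the lecture (probability), its normalized first position in the lecture (position), the fraction of good essays that contain it, and the difference between that fraction and the corresponding fraction for bad essays (good vs. bad).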