# Content Importance Models for Scoring Writing From Sources

# -*-coding:utf-8 -*-
import sys
import re
import os
from jieba import *
import jieba
reload(sys)
sys.setdefaultencoding('utf8')
import scipy
import numpy
import sklearn
from scipy.stats import pearsonr
 
# Path to the lecture transcript file
lecture_url = '/home/hjs/Downloads/firstproject/lecture.txt'
 
 
class model_1_gram(object):
    # 得到一元条件下的对于lecture的词的采集(去除了停用词),把采集结果放到一个list里面返回
    def lecture_1_gram(self):
        with open('/home/hjs/Downloads/firstproject/lecture.txt','r') as lecture:
            # Windows下面有编码问题,这一步解决
            content = lecture.read().strip().decode('gbk').encode('utf-8')
            # print content
            # 按空格,逗号或句号分词
            # lecture_list = re.split(', |\. | "|" |\.|\n| ',content)
            lecture_list = re.findall('([A-Za-z\']+)', content)
            # print lecture_list
 
            # 下面这一大块都是去除停用词
            with open('/home/hjs/Downloads/firstproject/stopword.txt','r') as stopword:
                stopword_content = stopword.read().strip().decode('gbk').encode('utf-8')
                stopword_content = re.split('  \n',stopword_content)
                lecture_list_new = []
                for word in lecture_list:
                    if word.lower() not in stopword_content:
                        if word != '':
                            # lecture_list.remove(word)
                            lecture_list_new.append(word)
        # print lecture_list_new
        return lecture_list_new
 
    # 这个函数返回word这个词在essay里面出现的次数
    def CxE(self,word,url):
        with open(url,'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
            essay_list = re.findall('([A-Za-z\']+)', results)
            # essay_list = re.split(', |\. | "|" |\.|\n| ', results)
            num = essay_list.count(word)
            return num
 
    # 返回lecture的所有词也就是n
    def getNum(self,url):
        with open(url,'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
            lecture_list = re.findall('([A-Za-z\']+)', results)
            # lecture_list = re.split(', |\. | "|" |\.|\n| ', results)
            num = len(lecture_list)
            return num
 
    # 第一个模型的权值
    def naive_getWeight(self):
        return 1
 
    # 第二个模型权值
    def prob_getWeight(self,word):
        num = self.CxE(word,lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = num*1.0/lecture_num
        # print W
        return W
 
 
    # 第三个模型权值
    def position_getWeight(self,word):
        with open(lecture_url,'r') as lecture:
            results = lecture.read().strip().decode('gbk').encode('utf-8')
            # lecture_list = re.split(', |\. | "|" |\.|\n| ', results)
            lecture_list = re.findall('([A-Za-z\']+)', results)
            lecture_list_new = []
            for i in lecture_list:
                if i!='':
                    lecture_list_new.append(i)
            n = 0
            for unit in lecture_list_new:
                n += 1
                if unit == word:
                    break
            lecture_num = self.getNum(lecture_url)
            W = n*1.0/lecture_num
            return W
 
 
    # 第五个模型权值
 
    def Good_getWeight(self, word):
        url4 = '/home/hjs/Downloads/firstproject/essay/4/'
        url5 = '/home/hjs/Downloads/firstproject/essay/5/'
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        num = 0
        total = 0
        for single_file4 in file_name4:
            with open(url4 + single_file4, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                # essay_list = re.split(', |\. | "|" |\.|\n| ', results)
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
            total += 1
 
        for single_file5 in file_name5:
            with open(url5 + single_file5, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                # essay_list = re.split(', |\. | "|" |\.|\n| ', results)
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
            total += 1
 
        W = num / total*1.0
        return W
 
    # 第六个模型权值
    def GoodVsBad_getWeight(self,word):
        url4 = '/home/hjs/Downloads/firstproject/essay/4/'
        url5 = '/home/hjs/Downloads/firstproject/essay/5/'
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        num = 0
        total = 0
        for single_file4 in file_name4:
            with open(url4 + single_file4, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                # essay_list = re.split(', |\. | "|" |\.|\n| ', results)
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
            total += 1
 
        for single_file5 in file_name5:
            with open(url5 + single_file5, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                # essay_list = re.split(', |\. | "|" |\.|\n| ', results)
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    num += 1
            total += 1
        Good_W = num * 1.0 / total
        # 上面是算好的,下面是算坏的
        url1 = '/home/hjs/Downloads/firstproject/essay/1/'
        url2 = '/home/hjs/Downloads/firstproject/essay/2/'
        file_name1 = os.listdir(url1)
        file_name2 = os.listdir(url2)
        Bad_num = 0
        Bad_total = 0
        for single_file1 in file_name1:
            with open(url1 + single_file1, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                # essay_list = re.split(', |\. | "|" |\.|\n| ', results)
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    Bad_num += 1
            Bad_total += 1
 
        for single_file2 in file_name2:
            with open(url2 + single_file2, 'r') as essay:
                results = essay.read().strip().decode('gbk').encode('utf-8')
                # essay_list = re.split(', |\. | "|" |\.|\n| ', results)
                essay_list = re.findall('([A-Za-z\']+)', results)
                if word in essay_list:
                    Bad_num += 1
            Bad_total += 1
 
        Bad_W = Bad_num * 1.0 / Bad_total
        W = Good_W-Bad_W
        # print W
        return W
 
 
 
 
    # 从模型得到权值然后算分的函数
    def getScore(self):
        x= []
        y= []
        n=0
        while n<5:
            n += 1
            # 拼接完整的url
            url = '/home/hjs/Downloads/firstproject/essay/'+str(n)
            # 调用os获取一个文件夹下的所有url,然后循环调用
            file_name = os.listdir(url)
            print str(n) + '分下的文章分数'
            for single_file in file_name:
                score = 0
                with open(url+'/'+single_file,'r') as essay:
                    content = essay.read()
                    # essay_list = re.split(', |\. | "|" |\.|\n| ', content)
                    essay_list = re.findall('([A-Za-z\']+)', content)
                    lecture_list = self.lecture_1_gram()
                    for unit in lecture_list:
                        if unit in essay_list:
                            # W = self.GoodVsBad_getWeight(unit)
                            # W = self.naive_getWeight()
                            # W = self.prob_getWeight(unit)
                            # W = self.position_getWeight(unit)
                            W = self.Good_getWeight(unit)
                            cxe = self.CxE(unit,url+'/'+single_file)
                            score += W*cxe*1.0
                    num = self.getNum(url+'/'+single_file)
                    score /= num
                    # print str(score)+' '
            x.append(float(score))
            y.append(float(n))
        print scipy.stats.pearsonr(x, y)
        print '\n\n\n'
 
 
 
class model_4_gram():
    def lecture_4_gram(self):
        with open('/home/hjs/Downloads/firstproject/lecture.txt','r') as lecture:
            content = lecture.read().strip().decode('gbk').encode('utf-8')
            lecture_list = re.findall('([A-Za-z\']+)', content)
            lecture_list_new = []
            length = len(lecture_list)
            i = 0
            while i< length - 1:
                if (i+3) > (length-1):
                    break
                lecture_list_new.append(lecture_list[i]+' '+lecture_list[i+1]+' '+lecture_list[i+2]+' '+lecture_list[i+3])
                i += 4
            i = 1
            while i < length - 1:
                if (i+3) > (length-1):
                    break
                lecture_list_new.append(lecture_list[i]+' '+lecture_list[i+1]+' '+lecture_list[i+2]+' '+lecture_list[i+3])
                i += 4
            i = 2
            while i < length - 1:
                if (i+3) > (length-1):
                    break
                lecture_list_new.append(lecture_list[i]+' '+lecture_list[i+1]+' '+lecture_list[i+2]+' '+lecture_list[i+3])
                i += 4
            i = 3
            while i < length - 1:
                if (i+3) > (length-1):
                    break
                lecture_list_new.append(lecture_list[i]+' '+lecture_list[i+1]+' '+lecture_list[i+2]+' '+lecture_list[i+3])
                i += 4
        return lecture_list_new
 
    def essay_4_gram(self,url):
        with open(url, 'r') as essay:
            content = essay.read().strip().decode('gbk').encode('utf-8')
            essay_list = re.findall('([A-Za-z\']+)', content)
            essay_list_new = []
            length = len(essay_list)
            i = 0
            while i < length - 1:
                if (i + 3) > (length - 1):
                    break
                essay_list_new.append(
                    essay_list[i] + ' ' + essay_list[i + 1] + ' ' + essay_list[i + 2] + ' ' +
                    essay_list[i + 3])
                i += 4
            i = 1
            while i < length - 1:
                if (i + 3) > (length - 1):
                    break
                essay_list_new.append(
                    essay_list[i] + ' ' + essay_list[i + 1] + ' ' + essay_list[i + 2] + ' ' +
                    essay_list[i + 3])
                i += 4
            i = 2
            while i < length - 1:
                if (i + 3) > (length - 1):
                    break
                essay_list_new.append(
                    essay_list[i] + ' ' + essay_list[i + 1] + ' ' + essay_list[i + 2] + ' ' +
                    essay_list[i + 3])
                i += 4
            i = 3
            while i < length - 1:
                if (i + 3) > (length - 1):
                    break
                essay_list_new.append(
                    essay_list[i] + ' ' + essay_list[i + 1] + ' ' + essay_list[i + 2] + ' ' +
                    essay_list[i + 3])
                i += 4
        return essay_list_new
    # 这个函数返回word这个词在essay里面出现的次数
    def CxE(self, word, url):
        essay_list = self.essay_4_gram(url)
        num = essay_list.count(word)
        # with open(url, 'r') as essay:
        #     results = essay.read().strip().decode('gbk').encode('utf-8')
        #     essay_list = re.findall('([A-Za-z\']+)', results)
        #     num = essay_list.count(word)
        return num
    # 返回lecture的所有词也就是n
    def getNum(self, url):
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
            lecture_list = re.findall('([A-Za-z\']+)', results)
            num = len(lecture_list)
            return num
    # 第一个模型的权值
    def naive_getWeight(self):
        return 1
    # 第二个模型权值
    def prob_getWeight(self, word):
        num = self.CxE(word, lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = num * 1.0 / lecture_num
        return W
    # 第三个模型权值
    def position_getWeight(self, word):
        with open(lecture_url, 'r') as lecture:
            results = lecture.read().strip().decode('gbk').encode('utf-8')
            lecture_list = re.findall('([A-Za-z\']+)', results)
            single_word = re.search('([A-Za-z\']+) ([A-Za-z\']+) ([A-Za-z\']+) ([A-Za-z\']+)',word)
            n = 0
            flag = 0
            num = 1
            for unit in lecture_list:
                n += 1
                if unit == single_word.group(num):
                    flag += 1
                    num += 1
                else:
                    num = 1
                    flag = 0
                if flag == 4:
                    break
            # print n
            n = n-3
            lecture_num = self.getNum(lecture_url)
            W = n * 1.0 / lecture_num
            return W
    # 第五个模型权值
    def Good_getWeight(self, word):
        url4 = '/home/hjs/Downloads/firstproject/essay/4/'
        url5 = '/home/hjs/Downloads/firstproject/essay/5/'
        # os获取目录url
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        num = 0
        total = 0
        for single_file4 in file_name4:
            essay_list = self.essay_4_gram(url4 + single_file4)
            if word in essay_list:
                num += 1
            total += 1
 
        for single_file5 in file_name5:
            essay_list = self.essay_4_gram(url5 + single_file5)
            if word in essay_list:
                num += 1
            total += 1
 
        W = num * 1.0 / total
        return W
    # 第六个模型权值
    def GoodVsBad_getWeight(self, word):
        url4 = '/home/hjs/Downloads/firstproject/essay/4/'
        url5 = '/home/hjs/Downloads/firstproject/essay/5/'
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        num = 0
        total = 0
        for single_file4 in file_name4:
            essay_list = self.essay_4_gram(url4 + single_file4)
            if word in essay_list:
                num += 1
            total += 1
 
        for single_file5 in file_name5:
            essay_list = self.essay_4_gram(url5 + single_file5)
            if word in essay_list:
                num += 1
            total += 1
        Good_W = num * 1.0 / total
        # 上面是算好的,下面是算坏的
        url1 = '/home/hjs/Downloads/firstproject/essay/1/'
        url2 = '/home/hjs/Downloads/firstproject/essay/2/'
        file_name1 = os.listdir(url1)
        file_name2 = os.listdir(url2)
        Bad_num = 0
        Bad_total = 0
        for single_file1 in file_name1:
            essay_list = self.essay_4_gram(url1 + single_file1)
            if word in essay_list:
                Bad_num += 1
            Bad_total += 1
 
        for single_file2 in file_name2:
            essay_list = self.essay_4_gram(url2 + single_file2)
            if word in essay_list:
                Bad_num += 1
            Bad_total += 1
 
        Bad_W = Bad_num * 1.0 / Bad_total
        W = Good_W - Bad_W
        return W
 
    # 从模型得到权值然后算分的函数
    def getScore(self):
        x = []
        y = []
        n = 0
        while n < 5:
            n += 1
            # 拼接完整的url
            url = '/home/hjs/Downloads/firstproject/essay/' + str(n)
            # 调用os获取一个文件夹下的所有url,然后循环调用
            file_name = os.listdir(url)
            print str(n) + '分下的文章分数'
            for single_file in file_name:
                score = 0
                essay_list_new = self.essay_4_gram(url + '/' + single_file)
                lecture_list = self.lecture_4_gram()
                for unit in lecture_list:
                    if unit in essay_list_new:
                        # W = self.GoodVsBad_getWeight(unit)
                        W = self.naive_getWeight()
                        # W = self.prob_getWeight(unit)
                        # W = self.position_getWeight(unit)
                        # W = self.Good_getWeight(unit)
                        cxe = self.CxE(unit, url + '/' + single_file)
                        score += W * cxe * 1.0
                num = self.getNum(url + '/' + single_file)
                score /= num
                # print str(score)
            x.append(float(score))
            y.append(float(n))
        print scipy.stats.pearsonr(x,y)
 
        print '\n\n\n'
 
class model_2_gram(object):
    # 得到二元条件下的对于lecture或essay的词的采集(去除了停用词),把采集结果放到一个list里面返回
    def lecture_2_gram(self, url):
        with open(url, 'r') as lecture:
            # Windows下面有编码问题,这一步解决
            content = lecture.read().strip().decode('gbk', 'ignore').encode('utf-8')
            # 将每段开头的4个空格改为1个
            content = re.sub(r"\s{2,}", " ", content)
            # print content
            # 从开头依次截取2个单位长度的单词存入列表lecture_list_new
            lecture_2_list = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content)
            # print lecture_2_list
            lecture_list_new = []
            for i in lecture_2_list:
                word = str(i[0]) + str(" ") + str(i[2])
                # print word
                lecture_list_new.append(word)
 
            # 除去第一个单词,依次截取2个单位长度的单词存入列表lecture_list_new
            content1 = re.search('[A-Za-z\']+\s([\s\S]*)', content)
            # print content1.group(1)
            lecture_2_list1 = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content1.group(1))
            for j in lecture_2_list1:
                word = str(j[0]) + str(" ") + str(j[2])
                # print word
                lecture_list_new.append(word)
 
            # print lecture_list_new
            return lecture_list_new
 
    # 得到nL
    def getNum(self, url):
        with open(url, 'r')as lecture:
            content = lecture.read().strip().decode('gbk').encode('utf-8')
            content = re.sub(r"\s{2,}", " ", content)
            lecture_list = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content)
            num1 = len(lecture_list)
            content1 = re.search('[A-Za-z\']+\s([\s\S]*)', content)
            lecture_list1 = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content1.group(1))
            num2 = len(lecture_list1)
            num = num1 + num2
            # print num
            return num
 
    # 得到C(x/L)
    def getCxL(self, world, url):
        lecture_list = self.lecture_2_gram(url)
        num = lecture_list.count(world)
        # print num
        return num
 
    # 得到FP(x)
    def getFPx(self, world, url):
        lecture_list = self.lecture_2_gram(url)
        num = lecture_list.index(world)
        # print num+1
        # print lecture_list
        return num + 1
 
    # 第一个模型的权值
    def naive_getWeight(self):
        return 1
 
    # 第二个模型的权值
    def prob_getWeight(self, world):
        num = self.getCxL(world, lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = num * 1.0 / lecture_num
        # print W
        return W
 
    # 第三个模型的权值
    def position_getWeight(self, world):
        FPX = self.getFPx(world, lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = FPX * 1.0 / lecture_num
        # print W
        return W
 
    # 第五个模型的权值
    def Good_getWeight(self, word):
        url4 = '/home/hjs/Downloads/firstproject/essay/4/'
        url5 = '/home/hjs/Downloads/firstproject/essay/5/'
        # 获取目录下每个文件
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        # print file_name4
        # print file_name5
        num = 0
        total = 0
        for single_file4 in file_name4:
            essay_list = self.lecture_2_gram(url4 + single_file4)
            if word in essay_list:
                num += 1
            total += 1
 
        for single_file5 in file_name5:
            essay_list = self.lecture_2_gram(url5 + single_file5)
            if word in essay_list:
                num += 1
            total += 1
 
        W = num * 1.0 / total * 1.0
        # print W
        return W
 
    # 第六个模型的权值
    def GoodVsBad_getWeight(self, word):
        # 计算出现过X的好的样本占所有好的样本比例
        url4 = '/home/hjs/Downloads/firstproject/essay/4/'
        url5 = '/home/hjs/Downloads/firstproject/essay/5/'
        # 获取目录下每个文件
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        # print file_name4
        # print file_name5
        num = 0
        total = 0
        for single_file4 in file_name4:
            essay_list = self.lecture_2_gram(url4 + single_file4)
            if word in essay_list:
                num += 1
            total += 1
 
        for single_file5 in file_name5:
            essay_list = self.lecture_2_gram(url5 + single_file5)
            if word in essay_list:
                num += 1
            total += 1
        Good_W = num * 1.0 / total
 
        # 计算出现过X的差的样本占所有差的样本比例
        url1 = '/home/hjs/Downloads/firstproject/essay/1/'
        url2 = '/home/hjs/Downloads/firstproject/essay/2/'
        # 获取目录下每个文件
        file_name1 = os.listdir(url1)
        file_name2 = os.listdir(url2)
        # print file_name1
        # print file_name2
        num = 0
        total = 0
        for single_file1 in file_name1:
            essay_list = self.lecture_2_gram(url1 + single_file1)
            if word in essay_list:
                num += 1
            total += 1
 
        for single_file2 in file_name2:
            essay_list = self.lecture_2_gram(url2 + single_file2)
            if word in essay_list:
                num += 1
            total += 1
        Bad_W = num * 1.0 / total
        W = Good_W - Bad_W
        # print W
        return W
 
    # 通过公式计算不同模型的分数
    def getScore(self):
        x = []
        y = []
        url = '/home/hjs/Downloads/内容组作文据(李霞)/答题作文/'
        score = 0
        n = 1
        while n < 6:
            url = '/home/hjs/Downloads/firstproject/essay/' + str(n)
            # print url
            print str(n) + '分下的文章分数'
            file_name = os.listdir(url)
            for single_file in file_name:
                with open(url + '/' + single_file, 'r')as essay:
                    # content = essay.read()
                    essay_list = self.lecture_2_gram(url + '/' + single_file)
                    # print essay_list
                    lecture_list = self.lecture_2_gram(lecture_url)
                    # print lecture_list
                    for unit in essay_list:
                        if unit in lecture_list:
                            # W=self.naive_getWeight()
                            # W=self.prob_getWeight(unit)
                            W=self.position_getWeight(unit)
                            # W=self.Good_getWeight(unit)
                            # W = self.GoodVsBad_getWeight(unit)
                            cxe = self.getCxL(unit, url + '/' + single_file)
                            # print W
                            score += W * cxe * 1.0
                    nE = self.getNum(url + '/' + single_file)
                    # print nE
                    score /= nE
                    # print score
            x.append(float(score))
            y.append(float(n))
            n += 1
        print scipy.stats.pearsonr(x, y)
        print '\n\n\n'
 
 
 
if __name__ == '__main__':
    # Bind instances to fresh names: the original rebound each CLASS
    # name to its instance (model_1_gram = model_1_gram()), making the
    # class unreachable afterwards.
    unigram_model = model_1_gram()
    unigram_model.getScore()

    # bigram_model = model_2_gram()
    # bigram_model.getScore()

    # fourgram_model = model_4_gram()
    # fourgram_model.getScore()
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值