基于论文《Content Importance Models for Scoring Writing From Sources》的简单 1-gram 实现

以下是该论文在 1-gram 设定下的简单实现,共包含五种模型;由于没有拿到 reading 材料,基于 reading 的那个模型尚未实现。本代码所对应的语料不公布。




# -*-coding:utf-8 -*-
import sys
import re
import os
from jieba import *
import jieba
reload(sys)
sys.setdefaultencoding('utf8')

# lecture地址
lecture_url = 'firstproject/lecture.txt'


class model_1_gram(object):
    """1-gram implementation of the content-importance scoring models from
    "Content Importance Models for Scoring Writing From Sources".

    Implements the naive, probability, position, good-essay and good-vs-bad
    weighting models; the reading-based model is omitted (reading material
    unavailable).  All corpus files are read as GBK and re-encoded to UTF-8
    (Python 2 text handling, matching the rest of this script).
    """

    def lecture_1_gram(self):
        """Tokenize the lecture into 1-grams, dropping stop words.

        Returns the lecture's words (letters/apostrophes only), in order,
        excluding any word whose lowercase form is in the stop-word list.
        """
        with open('firstproject/lecture.txt', 'r') as lecture:
            # Corpus files are GBK-encoded (Windows); normalize to UTF-8.
            content = lecture.read().strip().decode('gbk').encode('utf-8')
            lecture_list = re.findall('([A-Za-z\']+)', content)

        with open('firstproject/stopword.txt', 'r') as stopword:
            stopword_content = stopword.read().strip().decode('gbk').encode('utf-8')
            # NOTE(review): the stop-word file apparently separates entries
            # with two spaces + newline — confirm against the actual file.
            stopwords = set(re.split('  \n', stopword_content))

        # set membership is O(1) vs the original O(len(stopwords)) list scan.
        return [word for word in lecture_list
                if word != '' and word.lower() not in stopwords]

    def CxE(self, word, url):
        """Return the number of occurrences of `word` in the essay at `url`."""
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
        return re.findall('([A-Za-z\']+)', results).count(word)

    def getNum(self, url):
        """Return the total word count n of the file at `url`."""
        with open(url, 'r') as essay:
            results = essay.read().strip().decode('gbk').encode('utf-8')
        return len(re.findall('([A-Za-z\']+)', results))

    def naive_getWeight(self):
        """Model 1: every lecture word has unit weight."""
        return 1

    def prob_getWeight(self, word):
        """Model 2: relative frequency of `word` within the lecture."""
        return self.CxE(word, lecture_url) * 1.0 / self.getNum(lecture_url)

    def position_getWeight(self, word):
        """Model 3: 1-based position of the first occurrence of `word` in
        the lecture, normalized by lecture length.

        If the word never occurs the scan runs to the end, so the weight
        degrades to 1.0 (original behaviour, kept intentionally).
        """
        with open(lecture_url, 'r') as lecture:
            results = lecture.read().strip().decode('gbk').encode('utf-8')
        words = [w for w in re.findall('([A-Za-z\']+)', results) if w != '']
        position = 0
        for token in words:
            position += 1
            if token == word:
                break
        return position * 1.0 / self.getNum(lecture_url)

    def _doc_frequency(self, word, dir_urls):
        """Fraction of essays under the given directories containing `word`.

        Shared helper for models 5 and 6 (the original code duplicated this
        loop four times).
        """
        hits = 0
        total = 0
        for dir_url in dir_urls:
            for name in os.listdir(dir_url):
                with open(dir_url + name, 'r') as essay:
                    text = essay.read().strip().decode('gbk').encode('utf-8')
                    if word in re.findall('([A-Za-z\']+)', text):
                        hits += 1
                total += 1
        return hits * 1.0 / total

    def Good_getWeight(self, word):
        """Model 5: document frequency of `word` in high-scoring essays
        (grade folders 4 and 5)."""
        return self._doc_frequency(
            word, ['firstproject/essay/4/', 'firstproject/essay/5/'])

    def GoodVsBad_getWeight(self, word):
        """Model 6: doc-frequency in good essays (4, 5) minus doc-frequency
        in bad essays (1, 2)."""
        good = self._doc_frequency(
            word, ['firstproject/essay/4/', 'firstproject/essay/5/'])
        bad = self._doc_frequency(
            word, ['firstproject/essay/1/', 'firstproject/essay/2/'])
        return good - bad

    def getScore(self):
        """Score every essay in grade folders 1..5 and print one score each.

        score(essay) = sum over lecture words present in the essay of
        weight(word) * count(word, essay), divided by the essay's length.
        """
        # Loop-invariant: the lecture token list is identical for every
        # essay, so compute it once instead of once per file.
        lecture_list = self.lecture_1_gram()
        for grade in range(1, 6):
            folder = 'firstproject/essay/' + str(grade)
            for single_file in os.listdir(folder):
                path = folder + '/' + single_file
                with open(path, 'r') as essay:
                    content = essay.read()
                # NOTE(review): unlike the other readers this one never
                # decoded from GBK; kept as-is to preserve behaviour.
                essay_words = set(re.findall('([A-Za-z\']+)', content))
                # BUGFIX: reset the score for every essay.  Previously it
                # was initialized once per folder, so each essay's leftover
                # (already-normalized) score leaked into the next sum.
                score = 0.0
                for unit in lecture_list:
                    if unit in essay_words:
                        # Swap in another *_getWeight call to try a
                        # different importance model.
                        W = self.naive_getWeight()
                        # W = self.prob_getWeight(unit)
                        # W = self.position_getWeight(unit)
                        # W = self.Good_getWeight(unit)
                        # W = self.GoodVsBad_getWeight(unit)
                        score += W * self.CxE(unit, path) * 1.0
                score /= self.getNum(path)
                print(str(score) + ' ')
model_1_gram = model_1_gram()
model_1_gram.getScore()







评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

_我走路带风

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值