# 基于论文 "Content Importance Models for Scoring Writing From Sources" 的简单 2-gram 实现
# (A simple 2-gram implementation of the content-importance models from that paper.)

# -*-coding:utf-8 -*-
import io
import os
import re
import sys

import jieba
from jieba import *
reload(sys)
sys.setdefaultencoding('utf8')

# lecture地址
lecture_url ='/home/hjs/Downloads/



class model_2_gram(object):
    # 得到二元条件下的对于lecture或essay的词的采集(去除了停用词),把采集结果放到一个list里面返回
    def lecture_2_gram(self,url):
        with open(url,'r') as lecture:
            #Windows下面有编码问题,这一步解决
            content = lecture.read().strip().decode('gbk','ignore').encode('utf-8')
            #将每段开头的4个空格改为1个
            content = re.sub(r"\s{2,}", " ", content)
            # print content
            #从开头依次截取2个单位长度的单词存入列表lecture_list_new
            lecture_2_list = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)',content)
            # print lecture_2_list
            lecture_list_new = []
            for i in lecture_2_list:
                word = str(i[0])+str(" ")+str(i[2])
                # print word
                lecture_list_new.append(word)

            #除去第一个单词,依次截取2个单位长度的单词存入列表lecture_list_new
            content1 = re.search('[A-Za-z\']+\s([\s\S]*)',content)
            # print content1.group(1)
            lecture_2_list1 = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)', content1.group(1))
            for j in lecture_2_list1:
                word = str(j[0])+str(" ")+str(j[2])
                # print word
                lecture_list_new.append(word)

            # print lecture_list_new
            return lecture_list_new

    #得到nL
    def getNum(self,url):
        with open(url,'r')as lecture:
            content = lecture.read().strip().decode('gbk').encode('utf-8')
            content = re.sub(r"\s{2,}", " ", content)
            lecture_list = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)',content)
            num1 = len(lecture_list)
            content1 = re.search('[A-Za-z\']+\s([\s\S]*)',content)
            lecture_list1 = re.findall('([A-Za-z\']+)(, |\. | "|" |\.|\n| )([A-Za-z\']+)',content1.group(1))
            num2 = len(lecture_list1)
            num =num1+num2
            # print num
            return num

    #得到C(x/L)
    def getCxL(self,world,url):
        lecture_list = self.lecture_2_gram(url)
        num = lecture_list.count(world)
        # print num
        return num

    #得到FP(x)
    def getFPx(self,world,url):
        lecture_list = self.lecture_2_gram(url)
        num = lecture_list.index(world)
        # print num+1
        # print lecture_list
        return num+1

    #第一个模型的权值
    def naive_getWeight(self):
        return 1

    #第二个模型的权值
    def prob_getWeight(self,world):
        num = self.getCxL(world,lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = num*1.0/lecture_num
        # print W
        return W

    # 第三个模型的权值
    def position_getWeight(self,world):
        FPX = self.getFPx(world,lecture_url)
        lecture_num = self.getNum(lecture_url)
        W = FPX*1.0/lecture_num
        # print W
        return W

    #第五个模型的权值
    def Good_getWeight(self,word):
        url4 = '/home/hjs/Downloads/
        url5 = '/home/hjs/Downloads/
        #获取目录下每个文件
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        # print file_name4
        # print file_name5
        num = 0
        total = 0
        for single_file4 in file_name4:
            essay_list = self.lecture_2_gram(url4+single_file4)
            if word in essay_list:
                num+=1
            total +=1

        for single_file5 in file_name5:
            essay_list =self.lecture_2_gram(url5+single_file5)
            if word in essay_list:
                num+=1
            total+=1


        W = num*1.0/total*1.0
        print W
        return W

    #第六个模型的权值
    def GoodVsBad_getWeight(self,word):
        #计算出现过X的好的样本占所有好的样本比例
        url4 = '/home/hjs/Downloads/
        url5 = '/home/hjs/Downloads/
        # 获取目录下每个文件
        file_name4 = os.listdir(url4)
        file_name5 = os.listdir(url5)
        # print file_name4
        # print file_name5
        num = 0
        total = 0
        for single_file4 in file_name4:
            essay_list = self.lecture_2_gram(url4 + single_file4)
            if word in essay_list:
                num += 1
            total += 1

        for single_file5 in file_name5:
            essay_list = self.lecture_2_gram(url5 + single_file5)
            if word in essay_list:
                num += 1
            total += 1
        Good_W = num*1.0/total

        #计算出现过X的差的样本占所有差的样本比例
        url1 = '/home/hjs/Downloads/
        url2 = '/home/hjs/Downloads/
        # 获取目录下每个文件
        file_name1 = os.listdir(url1)
        file_name2 = os.listdir(url2)
        # print file_name1
        # print file_name2
        num = 0
        total = 0
        for single_file1 in file_name1:
            essay_list = self.lecture_2_gram(url1 + single_file1)
            if word in essay_list:
                num += 1
            total += 1

        for single_file2 in file_name2:
            essay_list = self.lecture_2_gram(url2 + single_file2)
            if word in essay_list:
                num += 1
            total += 1
        Bad_W = num*1.0/ total
        W = Good_W - Bad_W
        # print W
        return W

    #通过公式计算不同模型的分数
    def getScore(self):
        url = '/home/hjs/Downloads/
        score = 0
        n=1
        while n<6:
            url = '/home/hjs/Downloads/ + str(n)
            print url
            file_name = os.listdir(url)
            for single_file in file_name:
                with open(url+'/'+single_file,'r')as essay:
                    content = essay.read()
                    essay_list = self.lecture_2_gram(url+'/'+single_file)
                    # print essay_list
                    lecture_list = self.lecture_2_gram(lecture_url)
                    # print lecture_list
                    for x in essay_list:
                        if x in lecture_list:
                            # W=self.naive_getWeight()
                            # W=self.prob_getWeight(x)
                            # W=self.position_getWeight(x)
                            # W=self.Good_getWeight(x)
                            W=self.GoodVsBad_getWeight(x)
                            cxe = self.getCxL(x,url+'/'+single_file)
                            # print W
                            score += W * cxe * 1.0
                    nE = self.getNum(url+'/'+single_file)
                    # print nE
                    score /= nE
                    print score
            n+=1

a = model_2_gram()
a.getScore()






# lecture_2_gram()
# getNum(lecture_url)
# getCxL('a',lecture_url)
# getFPx('It',lecture_url)
# Good_getWeight('a')
# (CSDN blog-page footer -- like/favorite counters and paywall widget text --
#  removed: scraping residue, not part of the source program.)