Information Retrieval and Data Mining: Inverted Index 2

Information Retrieval Lab Report

Experiment Topic

Ranked retrieval model

Experiment Requirements

  • Implement a basic ranked retrieval model on top of Homework 1.1;
  • Use SMART notation: lnc.ltc (the weighting scheme is written out after this list);
  • Store each term's DF in the Dictionary and posting list;
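
For reference, in SMART notation lnc.ltc the document vector uses logarithmic tf, no idf, and cosine normalization (lnc), while the query vector uses logarithmic tf, idf, and cosine normalization (ltc):

$$
w_{t,d}=\frac{1+\ln \mathrm{tf}_{t,d}}{\sqrt{\sum_{t'\in d}\left(1+\ln \mathrm{tf}_{t',d}\right)^{2}}},\qquad
w_{t,q}=\frac{\left(1+\ln \mathrm{tf}_{t,q}\right)\ln\frac{N}{\mathrm{df}_{t}}}{\sqrt{\sum_{t'\in q}\left(\left(1+\ln \mathrm{tf}_{t',q}\right)\ln\frac{N}{\mathrm{df}_{t'}}\right)^{2}}},\qquad
\mathrm{score}(q,d)=\sum_{t\in q}w_{t,q}\,w_{t,d}
$$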

Experiment Process

  • Storing tf and df

Add the data structures needed to record TF and DF:

postings = defaultdict(dict)
document_frequency = defaultdict(int)

Record tf while iterating over the tweets:

unique_terms = set(line)
for term in unique_terms:
    postings[term][tweetid] = line.count(term)
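
As a side note, calling line.count(term) once per unique term rescans the token list repeatedly; a minimal sketch of an equivalent single pass with collections.Counter (same postings dict and the same line/tweetid variables as above):

from collections import Counter

# Count every token of this tweet in one pass, then copy the counts into postings.
term_counts = Counter(line)            # term -> tf within this tweet
for term, tf in term_counts.items():
    postings[term][tweetid] = tf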

After recording tf, the output file (Invertedtf.txt) looks as follows:
[screenshot: contents of Invertedtf.txt]
Record df:

for term in postings:
    document_frequency[term] = len(postings[term])

After recording df, the output file (Inverteddf.txt) looks as follows:

[screenshot: contents of Inverteddf.txt]
Note: len here counts the number of documents (tweets) in which the term appears.
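
To make the two structures concrete, here is a tiny, purely hypothetical illustration (the tweetids and terms are made up and not taken from the dataset):

from collections import defaultdict

# Toy index over two made-up tweets, showing how postings and document_frequency relate.
postings = defaultdict(dict)
document_frequency = defaultdict(int)

toy_tweets = {"111": ["happy", "happy", "day"], "222": ["happy", "monday"]}
for tweetid, tokens in toy_tweets.items():
    for term in set(tokens):
        postings[term][tweetid] = tokens.count(term)   # tf of term in this tweet
for term in postings:
    document_frequency[term] = len(postings[term])     # number of tweets containing term

print(postings["happy"])             # {'111': 2, '222': 1}
print(document_frequency["happy"])   # 2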

  • Query implementation

  • Finding relevant_tweetids

The query is implemented on top of the existing data structures. The input query string goes through the same tokenization as the tweets. To speed up retrieval and avoid iterating over every tweet to compute each F(q, d), we first extract the tweetids that are relevant to the query, giving the relevant_tweetids list; we then compute a score only for those tweetids and output the top-k relevant tweets in descending order of score.

  unique_query = set(query)
  relevant_tweetids = Union([set(postings[term].keys()) for term in unique_query])
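
Union here is not a Python built-in; it is the small helper defined at the end of the full listing (a functools.reduce over set.union). An equivalent built-in form, shown only as a sketch and assuming the same postings and unique_query as above, would be:

  # Union of the posting-list keys of every query term.
  relevant_tweetids = set().union(*(postings[term].keys() for term in unique_query))

Unlike the reduce-based helper, this form also returns an empty set for an empty query instead of raising, although do_search already exits on an empty query.
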
  • Computing the score

Then, for each relevant tweetid, compute a score from each term in the query:

    for term in unique_query:
        wtq = query.count(term) / len(query)

        if (term in postings) and (id in postings[term].keys()):
            wtd = (1 + math.log(postings[term][id]) * math.log((document_numbers + 1) / document_frequency[term]))
            similarity = wtq*wtd
  • Sorting

Call the sorted function to rank the scores:

        scores = sorted([(id, similarity_PLN(query, id))
                         for id in relevant_tweetids],
                        key=lambda x: x[1],
                        reverse=True)
  • Output

Output the top k results:

        i = 1
        for (id, score) in scores:
            if i <= 10:
                result.append(id)
                print(str(score) + ": " + id)
                i = i + 1
            else:
                break
        print("finished")

Running the program gives the experiment results:

Search query >> you and me

<<<<<Score(PLN)--Tweetid>>>>>

A total of 5955 relevant tweets! The top 10 are:

-2.7199255236867512: 31906716899610625

-2.7199255236867512: 32102808354295808

-2.7199255236867512: 311720996262514688

-2.7199255236867512: 307623685546708992

-2.7199255236867512: 310524290028158976

-2.7199255236867512: 305453745569943553

-2.94681270696307: 302167869108654080

-2.94681270696307: 297854975344791553

-3.51945038316577: 31880216670380032

-3.51945038316577: 624975258089619456

finished

Code

import sys
import math
from functools import reduce  # py3
from textblob import TextBlob
from textblob import Word
from collections import defaultdict

uselessTerm = ["username", "text", "tweetid"]
postings = defaultdict(dict)
document_frequency = defaultdict(int)
document_lengths = defaultdict(int)
document_numbers = len(document_lengths)
avdl = 0


def main():
    get_postings_dl()
    initialize_document_frequencies()
    initialize_avdl()
    # print(postings)
    # print(document_lengths)
    # print("平均tweet 长度为:" + str(avdl))
    # 可以修改do_search方法中的返回数据,得到使用不同模型的result
    # my_result_PLN.txt/my_result_BM25.txt/
    #result_name = "my_result_PLN_BM25.txt"

    #get_result(result_name)
    while True:
        do_search()

def tokenize_tweet(document):
    global uselessTerm
    document = document.lower()
    a = document.index("username")
    b = document.index("clusterno")
    c = document.rindex("tweetid") - 1
    d = document.rindex("errorcode")
    e = document.index("text")
    f = document.index("timestr") - 3
    # Extract the three main fields: tweetid, username, and tweet text
    document = document[c:d] + document[a:b] + document[e:f]
    terms = TextBlob(document).words.singularize()

    result = []
    for word in terms:
        expected_str = Word(word)
        expected_str = expected_str.lemmatize("v")
        if expected_str not in uselessTerm:
            result.append(expected_str)

    return result


# Same tokenization as in the previous assignment (Homework 1.1)


def get_postings_dl():
    global postings, document_lengths
    f = open(
        r"C:\Users\ASUS\Desktop\tweets.txt")
    lines = f.readlines()  # read the whole file

    for line in lines:
        line = tokenize_tweet(line)
        tweetid = line[0]  # record the tweetid, then pop it from the token list
        line.pop(0)
        document_lengths[tweetid] = len(line)  # record the number of tokens as the document length
        unique_terms = set(line)
        for te in unique_terms:
            postings[te][tweetid] = line.count(te)
    # Sorting postings by key would return a list and lose the key-value structure, so it stays commented out
    # postings = sorted(postings.items(),key = lambda asd:asd[0],reverse=False)
    mylog = open(r"C:\Users\ASUS\Desktop\Invertedtf.txt", mode='a', encoding='utf-8')

    print(postings,file=mylog)


def initialize_document_frequencies():
    global document_frequency, postings
    for term in postings:
        document_frequency[term] = len(postings[term])
    mylog2 = open(r"C:\Users\ASUS\Desktop\Inverteddf.txt", mode='a', encoding='utf-8')
    print(document_frequency,file=mylog2)

# Compute the average document (tweet) length
def initialize_avdl():
    global document_lengths, avdl
    count = 0
    for twid in document_lengths:
        count += document_lengths[twid]
    avdl = count / len(document_lengths)


# Tokenize and normalize the input query
def token(doc):
    doc = doc.lower()
    terms = TextBlob(doc).words.singularize()

    result = []
    for word in terms:
        expected_str = Word(word)
        expected_str = expected_str.lemmatize("v")
        result.append(expected_str)

    return result


# Run the search for every query and write the results to a file
def get_result(file_name):
    with open(file_name, 'w', encoding='utf-8') as f_out:

        Quaries = get_queries()
        qkeys = Quaries.keys()
        # the keys are the query ids
        for key in qkeys:

            q_result = do_search(Quaries[key])
            # Quaries[key] is the query string for this id
            for tweetid in q_result:
                f_out.write(str(key) + ' ' + tweetid + '\n')


# queries maps each query id to its query string
def get_queries():
    # Input: qrels2014.txt
    # Output: query id + query string for each query
    # This part can be ignored for interactive search
    queries = {}
    keyid = 171
    fq = open(
        r"C:\Users\ASUS\Desktop\qrels2014.txt")
    lines = fq.readlines()
    for line in lines:
        index1 = line.find("<query>")
        if index1 >= 0:
            # extract the query text between <query> and </query>
            index1 += 8
            index2 = line.find("</query>")
            # print(line[index1:index2])
            # add it to the queries dict
            queries[keyid] = line[index1:index2]
            keyid += 1
    return queries


# The two functions above run the queries from the given qrels file; delete them if they are not needed
def do_search():
    query = token(input("Search query >> "))
    result = []  # the ranked list of tweetids returned for the query
    #query = token(query)

    if query == []:
        sys.exit()

    unique_query = set(query)
    # To avoid scanning every tweet, first collect the relevant tweetids: a tweet is relevant if it contains at least one query term
    relevant_tweetids = Union([set(postings[term].keys()) for term in unique_query])

    # print(relevant_tweetids)
    print ("<<<<<Score(PLN)--Tweeetid>>>>>")
    print("PLN一共有"+str(len(relevant_tweetids))+"条相关tweet!")
    if not relevant_tweetids:
        print("No tweets matched any query terms for")
        print(query)
    # The core scoring and ranking logic follows
    else:
        # PLN
        scores3 = sorted([(id, similarity_PLN(query, id)) for id in relevant_tweetids], key=lambda x: x[1], reverse=True)
        # BM25
        #        scores2 = sorted([(id,similarity_BM25(query,id))
        #                         for id in relevant_tweetids],
        #                        key=lambda x: x[1],
        #                        reverse=False)

        # PLN+BM25
        '''scores3 = sorted([(id, similarity_BM25(query, id) + similarity_PLN(query, id))
                          for id in relevant_tweetids],
                         key=lambda x: x[1],
                         reverse=False)'''
        i = 1
        for (id, score) in scores3:
            if i<=10:
                result.append(id)
                print(str(score) + ": " + id)
                i = i + 1
            else:
                break
        print("finished")
   # return result

#    for (id,score) in scores1:
#        print (str(score)+": "+id)
#
#    print ("<<<<<Score(BM25)--Tweeetid>>>>>")
#    print("BM25一共有"+str(len(scores2))+"条相关tweet!")
#    for (id,score) in scores2:
#        print (str(score)+": "+id)

def similarity_PLN(query, id):
    global postings, avdl
    fenmu = 1 - 0.1 + 0.1 * (document_lengths[id] / avdl)
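    # fenmu ("denominator") is the pivoted length-normalization factor
    # 1 - b + b * (|d| / avdl) with b = 0.1; it is only used by the
    # commented-out variant below, not by the weight that is returned.
    # The weight actually computed per matching term is
    #   wtq = tf(t, q) / |q|   and   wtd = 1 + ln(tf(t, d)) * ln((N + 1) / df(t)).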
    similarity = 0.0
    unique_query = set(query)
    for term in unique_query:
        wtq = query.count(term) / len(query)

        if (term in postings) and (id in postings[term].keys()):
            wtd = (1 + math.log(postings[term][id]) * math.log((document_numbers + 1) / document_frequency[term]))
            similarity = wtq*wtd
            # With ln(1 + ln(C(w,d) + 1)) the relevance scores all came out negative and very small
            #similarity += ((query.count(term)) * (math.log(math.log(postings[term][id] + 1) + 1)) * math.log(
               #(document_numbers + 1) / document_frequency[term])) / fenmu

    return similarity


#

def similarity_BM25(query, id):
    global postings, avdl
    fenmu = 1 - 0.2 + 0.2 * (document_lengths[id] / avdl)
    k = 1
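    # BM25-style weight as implemented here: for each matching query term,
    #   tf(t, q) * (k + 1) * C(t, d) * ln((N + 1) / df(t)) / (k * fenmu + C(t, d))
    # with k = 1 and fenmu = 1 - b + b * (|d| / avdl), b = 0.2.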
    similarity = 0.00

    unique_query = set(query)
    for term in unique_query:
        if (term in postings) and (id in postings[term].keys()):
            C_wd = postings[term][id]
            # With ln(1 + ln(C(w,d) + 1)) the relevance scores all came out negative and very small
            similarity += (query.count(term) * (k + 1) * C_wd * math.log(
                (document_numbers + 1) / document_frequency[term])) / (k * fenmu + C_wd)

    return similarity


def Union(sets):
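    # Folds set.union over the given sets, e.g. Union([{"a"}, {"a", "b"}]) -> {"a", "b"}.
    # Note: reduce raises TypeError on an empty list, but do_search exits earlier on an empty query.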
    return reduce(set.union, [s for s in sets])


if __name__ == "__main__":
    main()
