【news retrieval system:基于pysolr的新闻检索】

本文首先对十类新闻文本语料库中的语料进行预处理,然后将预处理后的语料通过pysolr接口载入到solr中,通过solr实现一般检索功能。(这里需要首先配置solr服务并开启)

  1. dataProcessing.py
    该部分针对不同排版格式的新闻,分别使用对应的正则表达式匹配并提取新闻标题和正文。
import os
import re
from detect_text_formatting import detect_text_formatting
# Corpus root directory, given relative to the current working directory.
dataset_path="./文本分类语料库"


def _category_from_dir(root, dataset_path):
    """Return the category label for files that live in directory *root*.

    The label is the leading run of CJK characters in the name of the first
    directory level below *dataset_path* (e.g. ``交通214`` -> ``交通``).  The
    path is split with ``os.sep`` so the result is the same on Windows and
    POSIX.  When the folder name has no CJK prefix the raw folder name is
    returned, and files sitting directly in *dataset_path* get the
    "未找到匹配项" placeholder instead of raising.
    """
    relative_dir = os.path.relpath(root, dataset_path)
    top_level = relative_dir.split(os.sep)[0]
    if top_level == os.curdir:
        return "未找到匹配项"
    leading_cjk = re.match(r'^[\u4e00-\u9fff]+', top_level)
    return leading_cjk.group() if leading_cjk else top_level


def _split_on_marker(content, marker):
    """Split *content* on *marker* into ``(title, text)``.

    The title is the stripped text before the first occurrence of *marker*;
    the text is everything after it, with any further occurrences of
    *marker* removed.
    """
    pieces = content.split(marker)
    return pieces[0].strip(), "".join(pieces[1:])


def _extract_title_and_text(layout, content):
    """Extract ``(title, text)`` from *content* according to *layout*.

    Returns ``None`` when *layout* is not one of the four known layout
    names, so the caller can apply its own fallback.  Whenever a layout was
    detected but the expected marker cannot be located, the placeholder
    "未找到匹配项" is used instead of raising.
    """
    not_found = "未找到匹配项"

    if layout == "pattern1":
        # Bracketed header fields: the title is the rest of the title line,
        # the body is everything after the body marker line.
        title = content.split('【 标  题 】')[-1].split('\n')[0]
        text = content.split('【 正  文 】\n')[-1]
        return title, text

    if layout == "pattern2":
        title_match = re.search(r'标题:(.*)\n', content)
        station_match = re.search(r'发信站:(.*)\n', content)
        title = title_match.group(1) if title_match else not_found
        if station_match is None:
            text = not_found
        elif station_match.group(1):
            # Body is whatever follows the last occurrence of the station line.
            text = content.split(station_match.group(1))[-1]
        else:
            # Empty station line: str.split("") would raise ValueError,
            # so slice right after the matched (empty) group instead.
            text = content[station_match.end(1):]
        return title, text

    if layout == "pattern3":
        # NOTE(review): the doubled parentheses below are nested capture
        # groups, not literal parentheses - confirm this is the intended
        # dateline pattern.
        dateline = re.search(r'\n(.*)电((.*))', content)
        if dateline is None:
            return not_found, not_found
        return _split_on_marker(content, dateline.group())

    if layout == "pattern4":
        view_counter = re.search(r'浏览次数:(\d+)', content)
        if view_counter is None:
            return not_found, not_found
        return _split_on_marker(content, view_counter.group())

    return None


def dataProcessing(dataset_path):
    """Parse every ``.txt``/``.TXT`` file under *dataset_path* into records.

    Each file is read as GBK (undecodable bytes are dropped), its layout is
    classified with ``detect_text_formatting`` and the title and body are
    extracted with the rules for that layout.

    Args:
        dataset_path: Root directory of the corpus.  The first directory
            level below it supplies the category of the files it contains.

    Returns:
        A list of dicts with the keys ``"cata"`` (category), ``"title"`` and
        ``"text"``.  Files whose layout is not recognised keep their whole
        content as ``"text"`` and get ``"<category>-<file name>"`` as title.
    """
    # Maps a regex that identifies a layout to the name of that layout.
    # Insertion order is the order in which the layouts are tried.
    layout_patterns = {
        r"【 日  期 】": "pattern1",
        r"日月光华 --": "pattern2",
        r".*?社.*?\d+日电": "pattern3",
        r"浏览次数:(\d+)": "pattern4",
    }

    records = []
    for root, _dirs, files in os.walk(dataset_path):
        category = _category_from_dir(root, dataset_path)
        for file in files:
            if not file.endswith(('.txt', '.TXT')):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='gbk', errors="ignore") as f:
                file_content = f.read()

            layout = detect_text_formatting(layout_patterns, file_content)
            extracted = _extract_title_and_text(layout, file_content)
            if extracted is None:
                # Unknown layout: index the raw content under a synthetic title.
                title, text = category + "-" + file, file_content
            else:
                title, text = extracted
            records.append({"cata": category, "title": title, "text": text})
    return records
  2. detect_text_formatting.py
    该部分实现对新闻排版格式的检测。
def detect_text_formatting(patterns,str):
    """Return the name of the first pattern that matches the given text.

    Args:
        patterns: Mapping of regular-expression source -> layout name.  The
            entries are tried in the mapping's iteration order.
        str: The text to classify.  (The name shadows the builtin; it is
            kept unchanged so existing callers keep working.)

    Returns:
        The name associated with the first pattern for which ``re.search``
        finds a match anywhere in the text, or ``"NO"`` when none matches.
    """
    # Imported locally so the function also works when this module is used
    # on its own, where no top-level ``import re`` is present.
    import re

    for pattern, name in patterns.items():
        if re.search(pattern, str):
            return name
    return "NO"
  3. solr_client.py
    该部分定义了封装solr常用操作(增、删、改、查)的SolrClient类。
from pysolr import Solr


class SolrClient:
    """Small convenience wrapper around a ``pysolr.Solr`` connection.

    Every method delegates directly to the wrapped ``Solr`` object, which
    is available as ``self.solr``.
    """

    def __init__(self, solr_url='http://127.0.0.1:8983/solr/newsCore', always_commit=True):
        """Create the underlying ``Solr`` object.

        Args:
            solr_url: URL of the Solr core to talk to.
            always_commit: Passed through unchanged to ``Solr``.
        """
        self.solr = Solr(solr_url, always_commit=always_commit)

    def add_document(self, documents):
        """Add *documents* (passed as-is to ``Solr.add``) to the index."""
        self.solr.add(documents)

    def delete_document(self, document_id):
        """Delete the document whose id is *document_id*."""
        self.solr.delete(id=document_id)

    def update_document(self, document):
        """Submit a single *document* through ``Solr.add``.

        NOTE(review): presumably this overwrites an indexed document with
        the same unique key - confirm against the core's schema.
        """
        self.solr.add([document])

    def search_documents(self, query, rows=10):
        """Run *query* and return the matching documents.

        Args:
            query: Query string handed to ``Solr.search``.
            rows: Value of the ``rows`` search parameter; defaults to 10,
                the value that used to be hard-coded.

        Returns:
            The ``docs`` attribute of the search result.
        """
        results = self.solr.search(query, rows=rows)
        return results.docs

    def clear_all_documents(self):
        """Delete everything matched by the query ``*:*``."""
        self.solr.delete(q='*:*')
  4. retrievalPage.py
    该部分实现交互式的新闻标题检索。
from solr_client import SolrClient

from dataProcessing import dataProcessing
dataset_path="./文本分类语料库"
c=dataProcessing(dataset_path)
solr_client = SolrClient()

# Indexing step, left disabled: uncomment to send the parsed corpus to Solr.
# solr_client.add_document(c)


print("="*10,"新闻查询系统","="*10)
while True:
    print("尊敬的鲍鱼先生,请输入查询:")
    try:
        q = input().strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or Ctrl-C: leave the prompt loop without a traceback.
        break
    if not q:
        # A blank line would produce the malformed query "title:"; ask again.
        continue
    # NOTE(review): q is inserted verbatim, so query-syntax characters typed
    # by the user are interpreted by Solr rather than matched literally.
    query = "title:" + q
    results = solr_client.search_documents(query)
    for result in results:
        print(result["title"][0])
  • 5
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值