本文首先对十类新闻文本语料库中的语料进行预处理，然后通过 pysolr 接口将预处理后的语料载入 Solr，并借助 Solr 实现一般检索功能。（注意：运行前需要先配置并启动 Solr 服务。）
- dataProcessing.py
该部分针对不同排版格式的新闻，使用不同的正则表达式匹配并提取新闻标题和正文。
import os
import re
from detect_text_formatting import detect_text_formatting
# Location of the news corpus root, relative to the current working directory.
dataset_path="./文本分类语料库"
# Placeholder stored when a title or body cannot be extracted from a document.
_NOT_FOUND = "未找到匹配项"

# Leading run of CJK ideographs in a folder name, e.g. "交通214" -> "交通".
_CATEGORY_RE = re.compile(r'^[\u4e00-\u9fff]+')

# Layout signatures handed to detect_text_formatting(). The dict is passed in
# insertion order, so pattern1 is offered to the detector first.
_LAYOUT_PATTERNS = {
    r"【 日 期 】": "pattern1",
    r"日月光华 --": "pattern2",
    r".*?社.*?\d+日电": "pattern3",
    r"浏览次数:(\d+)": "pattern4",
}


def _category_of(root, dataset_path):
    """Return the category label for files located in directory *root*.

    The label is the leading CJK run of the first folder below
    *dataset_path*. Working on the relative path (instead of splitting the
    full path on a backslash) keeps this correct on every OS and for any
    spelling of *dataset_path*.
    """
    top_level = os.path.relpath(root, dataset_path).split(os.sep)[0]
    if top_level == os.curdir:
        # File sits directly in the corpus root: there is no category folder.
        return _NOT_FOUND
    matched = _CATEGORY_RE.match(top_level)
    # Fall back to the raw folder name rather than crashing on None.group().
    return matched.group() if matched else top_level


def _split_at(content, marker_regex):
    """Return (title, text) taken from before/after the *marker_regex* match.

    Every occurrence of the matched string is used as a separator: the first
    piece (stripped) is the title and the remaining pieces are concatenated
    into the text. Both values are the placeholder when nothing matches.
    """
    found = re.search(marker_regex, content)
    if found is None:
        return _NOT_FOUND, _NOT_FOUND
    pieces = content.split(found.group())
    return pieces[0].strip(), "".join(pieces[1:])


def _parse_pattern1(content):
    """Extract (title, text) from the layout that uses 【 标 题 】/【 正 文 】 tags."""
    title = content.split('【 标 题 】')[-1].split('\n')[0]
    text = content.split('【 正 文 】\n')[-1]
    return title, text


def _parse_pattern2(content):
    """Extract (title, text) from the layout with 标题:/发信站: header lines."""
    title_match = re.search(r'标题:(.*)\n', content)
    station_match = re.search(r'发信站:(.*)\n', content)
    title = title_match.group(1) if title_match else _NOT_FOUND
    if station_match is None:
        text = _NOT_FOUND
    elif station_match.group(1):
        # Body is whatever follows the content of the 发信站 line.
        text = content.split(station_match.group(1))[-1]
    else:
        # Empty header value: str.split("") would raise ValueError.
        text = content[station_match.end(1):]
    return title, text


def _parse_pattern3(content):
    """Extract (title, text) around the first line that contains 电."""
    return _split_at(content, r'\n(.*)电((.*))')


def _parse_pattern4(content):
    """Extract (title, text) around the 浏览次数 counter."""
    return _split_at(content, r'浏览次数:(\d+)')


# Layout name reported by the detector -> parser for that layout.
_PARSERS = {
    "pattern1": _parse_pattern1,
    "pattern2": _parse_pattern2,
    "pattern3": _parse_pattern3,
    "pattern4": _parse_pattern4,
}


def dataProcessing(dataset_path):
    """Walk the corpus and turn every text file into a document dict.

    Files ending in ``.txt`` / ``.TXT`` below *dataset_path* are read as GBK
    (undecodable bytes are ignored), their layout is identified with
    ``detect_text_formatting`` and the matching parser extracts the title
    and body.

    Args:
        dataset_path: Root directory of the corpus.

    Returns:
        A list of dicts with the keys ``"cata"`` (category), ``"title"`` and
        ``"text"``. Files whose layout is not recognised keep their whole
        content as text and get ``"<category>-<file name>"`` as title.
    """
    documents = []
    for root, _dirs, files in os.walk(dataset_path):
        category = _category_of(root, dataset_path)
        for file in files:
            if not file.endswith(('.txt', '.TXT')):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='gbk', errors="ignore") as f:
                file_content = f.read()
            layout = detect_text_formatting(_LAYOUT_PATTERNS, file_content)
            parser = _PARSERS.get(layout)
            if parser is None:
                # Unrecognised layout: index the raw file under a synthetic
                # title (the original str.join chain garbled the file name).
                title, text = category + "-" + file, file_content
            else:
                title, text = parser(file_content)
            documents.append({"cata": category, "title": title, "text": text})
    return documents
- detect_text_formatting.py
该部分实现对新闻排版格式的检测
def detect_text_formatting(patterns, str):
    """Return the name of the first layout pattern found in the text.

    Args:
        patterns: Mapping of regular-expression source to layout name. The
            patterns are tried in the mapping's iteration order, so earlier
            entries take priority when several would match.
        str: Text to inspect. The name shadows the builtin but is kept so
            existing keyword callers keep working.

    Returns:
        The name mapped to the first pattern that ``re.search`` finds
        anywhere in the text, or ``"NO"`` when none matches.
    """
    # Imported here because the detect_text_formatting.py listing shows no
    # top-level import of re, which would make re.search a NameError.
    import re

    for pattern, name in patterns.items():
        if re.search(pattern, str):
            return name
    return "NO"
- solr_client.py
该部分定义了封装 Solr 常用操作（添加、删除、更新、检索、清空）的客户端类。
from pysolr import Solr
class SolrClient:
    """Small convenience wrapper around a pysolr ``Solr`` connection."""

    def __init__(self, solr_url='http://127.0.0.1:8983/solr/newsCore', always_commit=True):
        """Open a connection to the Solr core at *solr_url*.

        Args:
            solr_url: URL of the Solr core to talk to.
            always_commit: Forwarded unchanged to ``pysolr.Solr``.
        """
        self.solr = Solr(solr_url, always_commit=always_commit)

    def add_document(self, documents):
        """Send *documents* to ``Solr.add`` for indexing."""
        self.solr.add(documents)

    def delete_document(self, document_id):
        """Delete the document whose id equals *document_id*."""
        self.solr.delete(id=document_id)

    def update_document(self, document):
        """Re-submit a single *document* through ``Solr.add``.

        NOTE(review): this relies on Solr replacing the stored document that
        shares the same unique key; confirm against the core's schema.
        """
        self.solr.add([document])

    def search_documents(self, query, rows=10):
        """Run *query* and return the matching documents.

        Args:
            query: Query string passed to ``Solr.search``.
            rows: Maximum number of documents to request. Defaults to 10,
                the value that used to be hard-coded.

        Returns:
            The ``docs`` attribute of the pysolr result object.
        """
        results = self.solr.search(query, rows=rows)
        return results.docs

    def clear_all_documents(self):
        """Delete every document in the core (query ``*:*``)."""
        self.solr.delete(q='*:*')
- retrievalPage.py
该部分实现检索
from solr_client import SolrClient
from dataProcessing import dataProcessing
# Location of the news corpus root, relative to the current working directory.
dataset_path = "./文本分类语料库"
# Parse the corpus up front so it is ready for the optional indexing step.
c = dataProcessing(dataset_path)
solr_client = SolrClient()
# One-time indexing step: uncomment on the first run (or after the corpus
# changes) to load the parsed documents into Solr.
# solr_client.add_document(c)
print("=" * 10, "新闻查询系统", "=" * 10)
while True:
    print("尊敬的鲍鱼先生,请输入查询:")
    try:
        q = input()
    except (EOFError, KeyboardInterrupt):
        # stdin closed or Ctrl-C: leave the prompt loop instead of crashing.
        break
    if not q.strip():
        # A blank line would send a bare "title:" query; ask again instead.
        continue
    query = "title:" + q
    results = solr_client.search_documents(query)
    for result in results:
        title = result.get("title")
        if not title:
            # Document without a stored title: nothing to display.
            continue
        # Multi-valued fields come back as lists; show the first value.
        print(title[0] if isinstance(title, (list, tuple)) else title)