使用whoosh 针对自己的博客完成全文搜索功能
安装whoosh 和jieba(用于中文分词处理)
可以直接使用 pip 安装这两个包:`pip install whoosh jieba`
先将博客数据保存为 json格式,注意需要使用headers
from bs4 import BeautifulSoup
import requests
import time
import json

# Parse the locally saved blog-list page and collect each post's URL and title.
with open('blogs_20180729.html', encoding='utf8', mode='r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')
blog_urls = [{'url': i['href'], 'title': i.text} for i in soup.select('.blog h2 a[name]')]

# Browser-like headers: the site rejects requests without a realistic User-Agent.
headers = {
    'Host': 'my.oschina.net',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Mobile Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8',
}
print(len(blog_urls))

blogs = []
for blog in blog_urls:
    url = blog['url']
    title = blog['title']
    try:
        # The network call is the part that can fail, so it belongs inside
        # the try (the original wrapped only the append, which cannot raise).
        html = requests.get(url, headers=headers).text
        blogs.append(
            {
                'url': url,
                'title': title,
                'html': html,
            }
        )
    except Exception as e:
        # Report the failing URL, not the whole accumulated list.
        print(url, e)
    time.sleep(0.2)  # be polite: throttle requests to the server

with open('blogs.json', mode='w', encoding='utf8') as f:
    # ensure_ascii=False keeps Chinese text readable and the file compact.
    json.dump(blogs, f, ensure_ascii=False)
创建索引
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from jieba.analyse import ChineseAnalyzer
import json
import os

# jieba-based analyzer so Chinese text is segmented into searchable terms.
analyser = ChineseAnalyzer()

# stored=True keeps the raw field value so it can be shown in search results.
# Example with other field types:
# schema = Schema(phone_name=TEXT(stored=True, analyzer=analyser),
#                 price=NUMERIC(stored=True),
#                 phoneid=ID(stored=True))
schema = Schema(
    title=TEXT(stored=True, analyzer=analyser),
    # The analyzer was missing here originally: without it the Chinese body
    # text is not segmented by jieba, so Term("html", word) queries against
    # Chinese words cannot match.
    html=TEXT(stored=True, analyzer=analyser),
    url=TEXT(stored=True),
)

# "path" is the directory the index lives in, 'blog_index' its name.
# create_in raises if the directory does not exist, so create it first.
os.makedirs("path", exist_ok=True)
ix = create_in("path", schema=schema, indexname='blog_index')

writer = ix.writer()
# Load the scraped posts and add one document per post.
with open('blogs.json', mode='r', encoding='utf8') as f:
    blogs = json.load(f)
for blog in blogs:
    writer.add_document(
        title=blog['title'],
        html=blog['html'],
        url=blog['url']
    )
writer.commit()
print("建立完成一个索引")
执行查询
查询条件为,只要关键字在题目和html中出现就匹配
from whoosh.index import open_dir
from whoosh.query import And, Or, Term

# Open the index built earlier under "path".
index = open_dir("path", indexname='blog_index')


def find(words):
    """Print title and URL of every post in which each word of *words*
    occurs in either the title or the html body."""
    # Every word must match (And), in at least one of the two fields (Or).
    query = And([
        Or([Term("html", word), Term("title", word)])
        for word in words
    ])
    with index.searcher() as searcher:
        hits = searcher.search(query, limit=None)
        for hit in hits:
            print(hit['title'], hit['url'])


s = 'vue webpack'
words = [w.strip() for w in s.split()]
find(words)
结果
高亮检索到的单词
# Highlight the matched terms in each hit's title.
# NOTE: must run inside the `with index.searcher()` block, while `results`
# (the Results object returned by searcher.search) is still open.
# Original snippet was missing the loop-body indentation (a syntax error).
for res in results:
    print(res.highlights('title'))
转载至链接:https://my.oschina.net/ahaoboy/blog/1919654