1 First download the Wikipedia dump
$wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
2 Extract the text with wikiextractor
$git clone https://github.com/attardi/wikiextractor
$python wikiextractor/WikiExtractor.py INPUT_FILE -o OUTPUT_PATH
The extracted files (extracted/AA/wiki_00, wiki_01, ...) wrap each article in <doc id=... url=... title=...> ... </doc> blocks. The cleaning code in step 3 expects this default format and strips the tags, so do not pass --json here.
3 Clean the Wikipedia data as follows
- First, use the code below to strip the <doc> tags and blank lines, keeping only the article text
import os
import re

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer


def deal_enwiki(read_path, write_path):  # strip the <doc> tags and blank lines, keep only the text
    regex_str = r"^(?!</?doc)"  # matches lines that are NOT <doc ...> / </doc> markers
    file = open(read_path, "r", encoding="utf-8")
    output = open(write_path, "w+", encoding="utf-8")
    content_line = file.readline()
    article_contents = ""
    while content_line:
        match_obj = re.match(regex_str, content_line)
        if content_line.isspace():
            content_line = file.readline()
            continue
        else:
            content_line = content_line.strip("\n")
            if len(content_line) > 0:
                if match_obj:
                    # ordinary text line: append it to the current article
                    article_contents += content_line + " "
                else:
                    # a <doc>/</doc> marker: write the accumulated article out as one line
                    if len(article_contents) > 0:
                        output.write(article_contents + "\n")
                        article_contents = ""
        content_line = file.readline()
    file.close()
    output.close()
def generate_corpus():
    wiki_path = "/Wiki_processing/wikiextractor/extracted/AA"
    save_path = "/Wiki_processing/wikiextractor/extracted/AA"
    for i in range(14):  # wikiextractor produced the files wiki_00 ... wiki_13
        file_path = os.path.join(wiki_path, "wiki_%02d" % i)
        deal_enwiki(file_path, os.path.join(save_path, "wiki_corpus%s" % str(i)))
        print("Finished generating file %d" % i)
- Merge all the cleaned files into a single file, wiki_corpus_not_divide
def merge_corpus():  # merge the cleaned corpus files into a single file
    output = open("/Wiki_processing/wikiextractor/extracted/AA/wiki_corpus_not_divide", "w", encoding="utf-8")
    input_dir = "/Wiki_processing/wikiextractor/extracted/AA"
    for i in range(14):
        file_path = os.path.join(input_dir, "wiki_corpus%s" % str(i))
        file = open(file_path, "r", encoding="utf-8")
        line = file.readline()
        while line:
            output.writelines(line)
            line = file.readline()
        file.close()
        print("Finished merging file %d" % i)
    output.close()
- Split wiki_corpus_not_divide so that each line contains exactly one sentence
def divide_sentence(read_path, write_path):  # split the text into one sentence per line
    punkt_param = PunktParameters()
    abbreviation = ['i.e', 'e.g', 'U.S']  # abbreviations Punkt should not treat as sentence boundaries
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    file = open(read_path, "r", encoding="utf-8")
    output = open(write_path, "w+", encoding="utf-8")
    content_line = file.readline()
    while content_line:
        sentence_list = tokenizer.tokenize(content_line)
        for sentence in sentence_list:
            output.write(sentence + "\n")
        content_line = file.readline()
    file.close()
    output.close()
    print("finish divide")


if __name__ == "__main__":
    # generate_corpus()
    # print("generate_corpus finished")
    # merge_corpus()
    # print("merge_corpus finished")
    divide_sentence("/Wiki_processing/wikiextractor/extracted/AA/wiki_corpus_not_divide",
                    "/Wiki_processing/wikiextractor/extracted/AA/wiki_corpus")
4 Index and search the data with Elasticsearch
- Go into the Elasticsearch installation directory and start it as a daemon
$cd elasticsearch-5.0.0/
$./bin/elasticsearch -d
- Open another terminal and test that the node is up
$curl 'localhost:9200/_cat/health?v'
If status is green, the node is healthy (on a single-node setup, yellow is also normal, because replica shards cannot be assigned).
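The same check can also be done from Python with the client used in the rest of this section (a minimal sketch; it assumes the elasticsearch package is installed and a local node is listening on the default port 9200):

from elasticsearch import Elasticsearch

es = Elasticsearch()  # defaults to localhost:9200
print(es.cluster.health()["status"])  # expect "green" (or "yellow" on a single-node setup)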
- Create the index (mainCreateIndex)
from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes a local node on the default localhost:9200


def deleteInices(my_index):
    if True and es.indices.exists(my_index):  # change to True only when you really want to delete
        es.indices.delete(index=my_index)
        print("delete is complete")


def createIndex(my_index, my_doc):
    # index settings: an integer field for the sentence id and a text field for the sentence itself
    settings = \
        {
            "mappings": {
                my_doc: {
                    "properties": {
                        "my_id": {"type": "integer"},
                        "my_word": {"type": "text"}
                    }
                }
            }
        }
    # create index
    es.indices.create(index=my_index, ignore=400, body=settings)
    print("creating index succeeded!")
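mainCreateIndex itself is not shown in the listing above; a minimal sketch, assuming it simply removes any stale index of the same name and then recreates it:

def mainCreateIndex():
    # assumed wiring: delete any old index, then create a fresh one
    my_index = "my_index"
    my_doc = "my_doc"
    deleteInices(my_index)
    createIndex(my_index, my_doc)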
if __name__ == "__main__":
    mainCreateIndex()
After running the .py file above, open a terminal and check whether the index was created successfully:
$curl 'localhost:9200/_cat/indices?v'
If the index appears in the listing, it was created successfully.
- Insert the Wikipedia sentences (mainInsert)
from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm

es = Elasticsearch()  # assumes a local node on the default localhost:9200


def insertData(words, my_index, my_doc, one_bulk):
    # insert the data in bulks; one_bulk is the number of documents per bulk request
    body = []
    body_count = 0  # number of documents currently in body
    # the last bulk may not be full, but it still has to be inserted
    print("need to insert %d" % len(words))
    pbar = tqdm(total=len(words))
    for id, word in words:
        data1 = {"my_id": id,      # sentence id
                 "my_word": word}  # sentence text
        every_body = \
            {
                "_index": my_index,  # index name
                "_type": my_doc,     # document type
                "_source": data1     # document body
            }
        if body_count < one_bulk:
            body.append(every_body)
            body_count += 1
        else:
            helpers.bulk(es, body)  # bulk insert; inserting one document at a time is far too slow
            pbar.update(one_bulk)
            body_count = 0
            body = []
            body.append(every_body)
            body_count += 1
    if len(body) > 0:
        # insert whatever is left in the last, possibly partial, bulk
        helpers.bulk(es, body)
        # pbar.update(len(body))
        print('done2')
    pbar.close()
    # res = es.index(index=my_index, doc_type=my_doc, id=my_key_id, body=data1)  # single insert
    print("insert data completed!")
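getAllWords is not shown in the original listing; a minimal sketch, assuming the input file contains one sentence per line and that each sentence is paired with its line number as my_id:

def getAllWords(path):
    # read one sentence per line and pair it with a running id
    words = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if line:
                words.append((i, line))
    return words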
def mainInsert():
    # insert the data
    my_index = "my_index"
    my_doc = "my_doc"
    # the one-sentence-per-line file produced in step 3
    words = getAllWords(path="/Wiki_processing/wikiextractor/extracted/AA/wikipedia_sentences.txt")
    insertData(words, my_index, my_doc, one_bulk=5000)


if __name__ == "__main__":
    mainInsert()
After running the .py file above, open a terminal again; you can verify that the data was inserted by looking at store.size:
$curl 'localhost:9200/_cat/indices?v'
store.size should have grown noticeably, which means the data was inserted successfully.
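The document count can also be checked from Python (a minimal sketch using the same client; the count should match the number of sentences you inserted):

from elasticsearch import Elasticsearch

es = Elasticsearch()
print(es.count(index="my_index")["count"])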
- Search the index with your own query (mainSearch); the keywordSearch helper it calls is sketched after the code below
def mainSearch():
    # run a search against the index
    my_index = "my_index"
    my_doc = "my_doc"
    keywords1 = "Vaccinations needed come Doha"
    keywordSearch(keywords1, my_index, my_doc)


if __name__ == "__main__":
    mainSearch()
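keywordSearch is not shown in the original listing; a minimal sketch, assuming a plain match query on the my_word field that prints the top hits:

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes a local node on the default localhost:9200


def keywordSearch(keywords, my_index, my_doc):
    # full-text match query against the my_word field
    query = {
        "query": {
            "match": {
                "my_word": keywords
            }
        }
    }
    res = es.search(index=my_index, doc_type=my_doc, body=query, size=10)
    for hit in res["hits"]["hits"]:
        print(hit["_score"], hit["_source"]["my_word"])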
Those are the detailed steps for indexing and searching Elasticsearch from Python. If anything is unclear, feel free to leave a comment.