1 First download the Wikipedia dump
$wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
2 Extract the text with wikiextractor
$git clone https://github.com/attardi/wikiextractor
$python wikiextractor/WikiExtractor.py INPUT_FILE -o OUTPUT_PATH
The extracted files (extracted/AA/wiki_00, wiki_01, ...) wrap each article in <doc id=... url=... title=...> ... </doc> blocks. The cleaning code in step 3 expects this default format and strips the tags, so do not pass --json here.
3 Clean the Wikipedia data as follows
- First, use the code below to strip the <doc> tags and blank lines, keeping only the article text
import os
import re

from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer


def deal_enwiki(read_path, write_path):  # strip the <doc> tags and blank lines, keep only the text
    regex_str = r"^(?!</?doc)"  # matches lines that are NOT <doc ...> / </doc> markers
    file = open(read_path, "r", encoding="utf-8")
    output = open(write_path, "w+", encoding="utf-8")
    content_line = file.readline()
    article_contents = ""
    while content_line:
        match_obj = re.match(regex_str, content_line)
        if content_line.isspace():
            content_line = file.readline()
            continue
        else:
            content_line = content_line.strip("\n")
            if len(content_line) > 0:
                if match_obj:
                    # ordinary text line: append it to the current article
                    article_contents += content_line + " "
                else:
                    # a <doc>/</doc> marker: write the accumulated article out as one line
                    if len(article_contents) > 0:
                        output.write(article_contents + "\n")
                        article_contents = ""
        content_line = file.readline()
    file.close()
    output.close()
def generate_corpus():
    wiki_path = "/Wiki_processing/wikiextractor/extracted/AA"
    save_path = "/Wiki_processing/wikiextractor/extracted/AA"
    for i in range(14):  # wikiextractor produced the files wiki_00 ... wiki_13
        file_path = os.path.join(wiki_path, "wiki_%02d" % i)
        deal_enwiki(file_path, os.path.join(save_path, "wiki_corpus%s" % str(i)))
        print("Finished generating file %d" % i)
- Merge all the cleaned files into a single file, wiki_corpus_not_divide
def merge_corpus():  # merge the cleaned corpus files into a single file
    output = open("/Wiki_processing/wikiextractor/extracted/AA/wiki_corpus_not_divide", "w", encoding="utf-8")
    input_dir = "/Wiki_processing/wikiextractor/extracted/AA"
    for i in range(14):
        file_path = os.path.join(input_dir, "wiki_corpus%s" % str(i))
        file = open(file_path, "r", encoding="utf-8")
        line = file.readline()
        while line:
            output.writelines(line)
            line = file.readline()
        file.close()
        print("Finished merging file %d" % i)
    output.close()
- Split wiki_corpus_not_divide so that each line contains exactly one sentence
def divide_sentence(read_path, write_path):  # split the text into one sentence per line
    punkt_param = PunktParameters()
    abbreviation = ['i.e', 'e.g', 'U.S']  # abbreviations Punkt should not treat as sentence boundaries
    punkt_param.abbrev_types = set(abbreviation)
    tokenizer = PunktSentenceTokenizer(punkt_param)
    file = open(read_path, "r", encoding="utf-8")
    output = open(write_path, "w+", encoding="utf-8")
    content_line = file.readline()
    while content_line:
        sentence_list = tokenizer.tokenize(content_line)
        for sentence in sentence_list:
            output.write(sentence + "\n")
        content_line = file.readline()
    file.close()
    output.close()
    print("finish divide")


if __name__ == "__main__":
    # generate_corpus()
    # print("generate_corpus finished")
    # merge_corpus()
    # print("merge_corpus finished")
    divide_sentence("/Wiki_processing/wikiextractor/extracted/AA/wiki_corpus_not_divide",
                    "/Wiki_processing/wikiextractor/extracted/AA/wiki_corpus")
4 Index and search the data with Elasticsearch
- Go into the Elasticsearch installation directory and start it as a daemon
$cd elasticsearch-5.0.0/
$./bin/elasticsearch -d
- Open another terminal and test that the node is up
$curl 'localhost:9200/_cat/health?v'
If status is green, the node is healthy (on a single-node setup, yellow is also normal, because replica shards cannot be assigned).
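The same check can also be done from Python with the client used in the rest of this section (a minimal sketch; it assumes the elasticsearch package is installed and a local node is listening on the default port 9200):

from elasticsearch import Elasticsearch

es = Elasticsearch()  # defaults to localhost:9200
print(es.cluster.health()["status"])  # expect "green" (or "yellow" on a single-node setup)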
- Create the index (mainCreateIndex)
from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes a local node on the default localhost:9200


def deleteInices(my_index):
    if True and es.indices.exists(my_index):  # change to True only when you really want to delete
        es.indices.delete(index=my_index)
        print("delete is complete")


def createIndex(my_index, my_doc):
    # index settings: an integer field for the sentence id and a text field for the sentence itself
    settings = \
        {
            "mappings": {
                my_doc: {
                    "properties": {
                        "my_id": {"type": "integer"},
                        "my_word": {"type": "text"}
                    }
                }
            }
        }
    # create index
    es.indices.create(index=my_index, ignore=400, body=settings)
    print("creating index succeeded!")
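mainCreateIndex itself is not shown in the listing above; a minimal sketch, assuming it simply removes any stale index of the same name and then recreates it:

def mainCreateIndex():
    # assumed wiring: delete any old index, then create a fresh one
    my_index = "my_index"
    my_doc = "my_doc"
    deleteInices(my_index)
    createIndex(my_index, my_doc)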
if __name__ == "__main__":
    mainCreateIndex()
After running the .py file above, open a terminal and check whether the index was created successfully:
$curl 'localhost:9200/_cat/indices?v'
If the index appears in the listing, it was created successfully.
- Insert the Wikipedia sentences (mainInsert)
from elasticsearch import Elasticsearch, helpers
from tqdm import tqdm

es = Elasticsearch()  # assumes a local node on the default localhost:9200


def insertData(words, my_index, my_doc, one_bulk):
    # insert the data in bulks; one_bulk is the number of documents per bulk request
    body = []
    body_count = 0  # number of documents currently in body
    # the last bulk may not be full, but it still has to be inserted
    print("need to insert %d" % len(words))
    pbar = tqdm(total=len(words))
    for id, word in words:
        data1 = {"my_id": id,      # sentence id
                 "my_word": word}  # sentence text
        every_body = \
            {
                "_index": my_index,  # index name
                "_type": my_doc,     # document type
                "_source": data1     # document body
            }
        if body_count < one_bulk:
            body.append(every_body)
            body_count += 1
        else:
            helpers.bulk(es, body)  # bulk insert; inserting one document at a time is far too slow
            pbar.update(one_bulk)
            body_count = 0
            body = []
            body.append(every_body)
            body_count += 1
    if len(body) > 0:
        # insert whatever is left in the last, possibly partial, bulk
        helpers.bulk(es, body)
        # pbar.update(len(body))
        print('done2')
    pbar.close()
    # res = es.index(index=my_index, doc_type=my_doc, id=my_key_id, body=data1)  # single insert
    print("insert data completed!")
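getAllWords is not shown in the original listing; a minimal sketch, assuming the input file contains one sentence per line and that each sentence is paired with its line number as my_id:

def getAllWords(path):
    # read one sentence per line and pair it with a running id
    words = []
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            line = line.strip()
            if line:
                words.append((i, line))
    return words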
def mainInsert():
    # insert the data
    my_index = "my_index"
    my_doc = "my_doc"
    # the one-sentence-per-line file produced in step 3
    words = getAllWords(path="/Wiki_processing/wikiextractor/extracted/AA/wikipedia_sentences.txt")
    insertData(words, my_index, my_doc, one_bulk=5000)


if __name__ == "__main__":
    mainInsert()
After running the .py file above, open a terminal again; you can verify that the data was inserted by looking at store.size:
$curl 'localhost:9200/_cat/indices?v'
store.size should have grown noticeably, which means the data was inserted successfully.
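The document count can also be checked from Python (a minimal sketch using the same client; the count should match the number of sentences you inserted):

from elasticsearch import Elasticsearch

es = Elasticsearch()
print(es.count(index="my_index")["count"])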
- Search the index with your own query (mainSearch); the keywordSearch helper it calls is sketched after the code below
def mainSearch():
    # run a search against the index
    my_index = "my_index"
    my_doc = "my_doc"
    keywords1 = "Vaccinations needed come Doha"
    keywordSearch(keywords1, my_index, my_doc)


if __name__ == "__main__":
    mainSearch()
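keywordSearch is not shown in the original listing; a minimal sketch, assuming a plain match query on the my_word field that prints the top hits:

from elasticsearch import Elasticsearch

es = Elasticsearch()  # assumes a local node on the default localhost:9200


def keywordSearch(keywords, my_index, my_doc):
    # full-text match query against the my_word field
    query = {
        "query": {
            "match": {
                "my_word": keywords
            }
        }
    }
    res = es.search(index=my_index, doc_type=my_doc, body=query, size=10)
    for hit in res["hits"]["hits"]:
        print(hit["_score"], hit["_source"]["my_word"])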
Those are the detailed steps for indexing and searching Elasticsearch from Python. If anything is unclear, feel free to leave a comment.