【Elasticsearch】使用IMDB学习ES(4)导入数据

目录

本系列正在更新,点击下方查看

导入初始数据集

在上一篇文章中,我们设计了映射结构,这是为导入第一步中准备好的数据集所做的准备。

在实际业务中,上线ES服务时,我们通常会先导入之前已经存在的业务数据,以保证版本上线之后业务数据的一致。

导入脚本

这里使用python作为导入数据的脚本语言。

脚本内容如下

import datetime
import logging
import time
import csv
from elasticsearch import Elasticsearch
import logging

# One log file per calendar day (local time), e.g. ./imdb_2021-04-21.log
log_file = datetime.datetime.now().strftime('%Y-%m-%d')
logging.basicConfig(
    filename='./imdb_' + log_file + '.log',
    level=logging.DEBUG,
)

def read_from_tsv(file_path: str, column_names: list) -> list:
    """Stream a tab-separated file and hand every data row to ``insertToEs``.

    The file is read row by row, so it is never loaded into memory as a whole.
    A row whose ``titleId`` equals the literal column name is treated as the
    header and skipped; a row with an empty ``titleId`` ends the import.

    Args:
        file_path: Path of the TSV file to import (read as UTF-8).
        column_names: Column names, in file order. The rows are accessed by
            the keys ``titleId``, ``ordering``, ``title``, ``region``,
            ``language``, ``types``, ``attributes`` and ``isOriginalTitle``.

    Returns:
        The document ids (``titleId`` followed by ``ordering``) of all rows
        that were passed to ``insertToEs``, in file order. The list grows with
        the number of rows in the file.

    Raises:
        ValueError: If a row's ``ordering`` is not an integer.
        KeyError: If ``column_names`` does not contain ``titleId``.
    """

    def _text(row: dict, key: str) -> str:
        # DictReader fills columns missing from a short row with None;
        # treat those as empty instead of crashing on None.replace().
        return (row.get(key) or '').replace("\n", "")

    submitted_ids = []
    # newline='' is what the csv module expects; UTF-8 avoids depending on the
    # platform's default encoding.
    with open(file_path, "r", encoding="utf-8", newline="") as tsv_file:
        # QUOTE_NONE: fields are separated by tabs only, so a double quote
        # inside a title must be kept as a literal character and must not
        # start a quoted field that swallows the following tabs/newlines.
        reader = csv.DictReader(
            tsv_file,
            fieldnames=column_names,
            delimiter='\t',
            quoting=csv.QUOTE_NONE,
        )
        for row in reader:
            logging.debug("row: %s", row)
            title_id = row['titleId']
            if title_id == 'titleId':
                # Header line: the value is the column name itself.
                continue
            if title_id == '':
                break

            ordering_text = _text(row, 'ordering')
            # NOTE(review): plain concatenation can collide when titleIds have
            # different lengths (e.g. 'tt1234567' + '12' vs 'tt12345671' + '2').
            # The format is kept so ids match documents that are already indexed.
            doc_id = title_id + ordering_text
            insertToEs(
                doc_id,
                title_id,
                int(ordering_text),
                row['title'],
                _text(row, 'region'),
                _text(row, 'language'),
                _text(row, 'types'),
                # Read the row's own 'attributes' column (not 'types').
                _text(row, 'attributes'),
                # The flag is stored as text; bool('0') would be True, so
                # compare against '1' explicitly.
                _text(row, 'isOriginalTitle').strip() == '1',
            )
            submitted_ids.append(doc_id)

    return submitted_ids

def insertToEs(id,titleId,ordering,title,region,language,types,attributes,isOriginalTitle):
    """Index one row into the ``imdb`` index, best-effort.

    Builds the document from the arguments and sends it through the
    module-level ``es`` client. ``types`` and ``attributes`` are split on
    single spaces into lists. Any failure is logged together with the document
    id and then swallowed, so one bad row does not abort a long import.

    Args:
        id: Document id; also stored in the document's ``id`` field.
        titleId: Value for the ``titleId`` field.
        ordering: Value for the ``ordering`` field.
        title: Value for the ``title`` field.
        region: Value for the ``region`` field.
        language: Value for the ``language`` field.
        types: Space-separated string, stored as a list.
        attributes: Space-separated string, stored as a list.
        isOriginalTitle: Value for the ``isOriginalTitle`` field.

    Returns:
        None.
    """
    try:
        data = {
            'id': id,
            'titleId': titleId,
            'ordering': ordering,
            'title': title,
            'region': region,
            'language': language,
            'types': types.split(' '),
            'attributes': attributes.split(' '),
            'isOriginalTitle': isOriginalTitle,
        }
        # NOTE(review): doc_type looks version-dependent in the Elasticsearch
        # client - confirm against the installed client before upgrading.
        es.index(
            index="imdb",
            id=id,
            doc_type="_doc",
            body=data
        )
        logging.info("cursor:%s", id)
    except Exception:
        # Record the failing id and the traceback in the log so the row can be
        # found and re-imported later; printing the bare exception lost both.
        logging.exception("failed to index document %s", id)

if __name__ == "__main__":
    # Script entry point: import title.akas.tsv into the local Elasticsearch
    # node and report the elapsed wall-clock time in seconds.
    started_at = time.time()

    # Module-level client; insertToEs looks this name up when indexing.
    es = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}], timeout=6000)

    akas_columns = [
        'titleId',
        'ordering',
        'title',
        'region',
        'language',
        'types',
        'attributes',
        'isOriginalTitle',
    ]
    read_from_tsv('title.akas.tsv', akas_columns)

    elapsed = time.time() - started_at
    print("done with: ", elapsed)


检查数据

查看索引

我们可以在索引管理中直接查看

在这里插入图片描述

查询

我们可以直接在开发工具中进行操作,也可以实验性质地拼接查询条件,然后再在自己的系统中完成查询方法的封装。

GET imdb

{
  "imdb" : {
    "aliases" : { },
    "mappings" : {
      "properties" : {
        "attributes" : {
          "type" : "keyword"
        },
        "id" : {
          "type" : "text"
        },
        "isOriginalTitle" : {
          "type" : "boolean"
        },
        "language" : {
          "type" : "keyword"
        },
        "ordering" : {
          "type" : "integer"
        },
        "region" : {
          "type" : "keyword"
        },
        "title" : {
          "type" : "text"
        },
        "titleId" : {
          "type" : "text"
        },
        "types" : {
          "type" : "keyword"
        }
      }
    },
    "settings" : {
      "index" : {
        "routing" : {
          "allocation" : {
            "include" : {
              "_tier_preference" : "data_content"
            }
          }
        },
        "number_of_shards" : "1",
        "provided_name" : "imdb",
        "creation_date" : "1618961115453",
        "number_of_replicas" : "1",
        "uuid" : "nCG_YzsHQV-YmvJfOxOZZg",
        "version" : {
          "created" : "7120099"
        }
      }
    }
  }
}

GET imdb/_search
{
  "query":{
    "match":{
      "title":"clown"
    }
  }
}


{
  "took" : 747,
  "timed_out" : false,
  "_shards" : {
    "total" : 1,
    "successful" : 1,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : {
      "value" : 41,
      "relation" : "eq"
    },
    "max_score" : 9.891902,
    "hits" : [
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00065201",
        "_score" : 9.891902,
        "_source" : {
          "id" : "tt00065201",
          "titleId" : "tt0006520",
          "ordering" : 1,
          "title" : "The Clown",
          "region" : "US",
          "language" : """\N""",
          "types" : [
            """\N"""
          ],
          "attributes" : [
            """\N"""
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00136041",
        "_score" : 9.891902,
        "_source" : {
          "id" : "tt00136041",
          "titleId" : "tt0013604",
          "ordering" : 1,
          "title" : "Le clown",
          "region" : "FR",
          "language" : """\N""",
          "types" : [
            "imdbDisplay"
          ],
          "attributes" : [
            "imdbDisplay"
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00162122",
        "_score" : 9.891902,
        "_source" : {
          "id" : "tt00162122",
          "titleId" : "tt0016212",
          "ordering" : 2,
          "title" : "O clown",
          "region" : "GR",
          "language" : """\N""",
          "types" : [
            """\N"""
          ],
          "attributes" : [
            """\N"""
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00177613",
        "_score" : 9.891902,
        "_source" : {
          "id" : "tt00177613",
          "titleId" : "tt0017761",
          "ordering" : 3,
          "title" : "The Clown",
          "region" : "US",
          "language" : """\N""",
          "types" : [
            """\N"""
          ],
          "attributes" : [
            """\N"""
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00177614",
        "_score" : 9.891902,
        "_source" : {
          "id" : "tt00177614",
          "titleId" : "tt0017761",
          "ordering" : 4,
          "title" : "The Clown",
          "region" : """\N""",
          "language" : """\N""",
          "types" : [
            "original"
          ],
          "attributes" : [
            "original"
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00311621",
        "_score" : 9.891902,
        "_source" : {
          "id" : "tt00311621",
          "titleId" : "tt0031162",
          "ordering" : 1,
          "title" : "Clown Princes",
          "region" : "US",
          "language" : """\N""",
          "types" : [
            """\N"""
          ],
          "attributes" : [
            """\N"""
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00356199",
        "_score" : 9.891902,
        "_source" : {
          "id" : "tt00356199",
          "titleId" : "tt0035619",
          "ordering" : 9,
          "title" : "Akrovatis clown",
          "region" : "GR",
          "language" : """\N""",
          "types" : [
            """\N"""
          ],
          "attributes" : [
            """\N"""
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00000191",
        "_score" : 8.570279,
        "_source" : {
          "id" : "tt00000191",
          "titleId" : "tt0000019",
          "ordering" : 1,
          "title" : "The Clown Barber",
          "region" : "GB",
          "language" : """\N""",
          "types" : [
            """\N"""
          ],
          "attributes" : [
            """\N"""
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00071241",
        "_score" : 8.570279,
        "_source" : {
          "id" : "tt00071241",
          "titleId" : "tt0007124",
          "ordering" : 1,
          "title" : "The New Clown",
          "region" : "GB",
          "language" : """\N""",
          "types" : [
            """\N"""
          ],
          "attributes" : [
            """\N"""
          ],
          "isOriginalTitle" : true
        }
      },
      {
        "_index" : "imdb",
        "_type" : "_doc",
        "_id" : "tt00123942",
        "_score" : 8.570279,
        "_source" : {
          "id" : "tt00123942",
          "titleId" : "tt0012394",
          "ordering" : 2,
          "title" : "The Little Clown",
          "region" : "US",
          "language" : """\N""",
          "types" : [
            """\N"""
          ],
          "attributes" : [
            """\N"""
          ],
          "isOriginalTitle" : true
        }
      }
    ]
  }
}

总结

到此为止,基础的ES业务实践系列完结。
接下来的一段时间,我们将开始学习复杂ES业务中的技术细节。

雨果虾滑猫 CSDN认证博客专家 PHP MySQL Python
CSDN博客专家。PHP/MYSQL/Elasticsearch,PMP项目管理 ,产品设计。6年服务端开发,2年产品,连续创业者,曾经参与区块链、新零售行业创业。Gitchat作者,曾在Gitchat发布智能合约相关课程。
相关推荐
©️2020 CSDN 皮肤主题: 博客之星2020 设计师:CY__ 返回首页
实付 19.90元
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值