导入tmdb
TMDB 是电影数据集,它的数据量很大,非常适合用作 ES 实践。直接在谷歌搜索 "kaggle tmdb" 即可下载(下文代码读取的是 tmdb_5000_movies.csv)。
首先还是要在es上建立mapping:
PUT /movie
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 1
},
"mappings": {
"properties": {
"title":{"type": "text","analyzer": "english"},
"tagline":{"type": "text","analyzer": "english"},
"release_date":{"type": "date","format": "8yyyy/MM/dd||yyyy/M/dd||yyyy/MM/d||yyyy/M/d"},
"popularity":{"type": "double"},
"overview":{"type": "text","analyzer": "english"},
"cast":{
"type": "object",
"properties": {
"character":{"type":"text","analyzer":"standard"},
"name":{"type":"text","analyzer":"standard"}
}
}
}
}
}
接下来创建一个 Spring Boot 程序,先在 pom.xml 中加入以下依赖,然后编写 ES 配置类和 Controller:
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.elasticsearch</groupId>
<artifactId>elasticsearch</artifactId>
<version>7.6.1</version>
</dependency>
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>transport</artifactId>
<version>7.6.1</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.elasticsearch.plugin</groupId>
<artifactId>transport-netty4-client</artifactId>
<version>7.6.1</version>
</dependency>
<dependency>
<groupId>com.opencsv</groupId>
<artifactId>opencsv</artifactId>
<version>4.2</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.58</version>
</dependency>
/**
 * Spring configuration that exposes an Elasticsearch {@link TransportClient} bean.
 *
 * <p>The client joins the cluster named {@code dianping-app} and is given three
 * transport addresses on {@code 127.0.0.1} (ports 9300, 9301 and 9302) —
 * presumably the three nodes of a local test cluster.
 */
@Configuration
public class ESConfig {
    private static final String CLUSTER_NAME = "dianping-app";
    private static final String HOST = "127.0.0.1";
    private static final int[] TRANSPORT_PORTS = {9300, 9301, 9302};

    /**
     * Builds the transport client and registers every configured node address.
     *
     * @return a configured, never-null transport client
     * @throws IllegalStateException if an address cannot be resolved or registered;
     *         the partially built client is closed before the exception is thrown
     */
    @Bean
    public TransportClient getClient() {
        Settings settings = Settings.builder()
                .put("cluster.name", CLUSTER_NAME)
                .build();
        TransportClient transportClient = new PreBuiltTransportClient(settings);
        try {
            for (int port : TRANSPORT_PORTS) {
                transportClient.addTransportAddress(
                        new TransportAddress(InetAddress.getByName(HOST), port));
            }
        } catch (Exception e) {
            // Fail fast with the root cause instead of handing Spring a null or
            // half-configured bean, and release the resources the client already holds.
            transportClient.close();
            throw new IllegalStateException(
                    "Failed to configure Elasticsearch transport client for cluster " + CLUSTER_NAME, e);
        }
        return transportClient;
    }
}
@Controller("/es")
@RequestMapping("/es")
public class ESController {
@Autowired
private TransportClient transportClient;
/**
 * Fetches a single document from the {@code movie} index by its id.
 *
 * @param id document id, read from the {@code id} request parameter
 * @return 200 with the document source as the body, or 404 with no body when
 *         the index holds no document with that id
 */
@RequestMapping("/get")
@ResponseBody
public ResponseEntity<?> get(@RequestParam(name="id")Integer id){
    GetResponse getResponse = transportClient.prepareGet("movie",null,id.toString()).get();
    if (!getResponse.isExists()) {
        // A missing document has no source to return; report it as 404 rather
        // than a 200 response with an empty body.
        return new ResponseEntity<>(HttpStatus.NOT_FOUND);
    }
    return new ResponseEntity<>(getResponse.getSource(), HttpStatus.OK);
}
@RequestMapping("/importdata")
@ResponseBody
public ResponseEntity importdata() throws IOException {
//批量插入
BulkRequest bulkRequest = new BulkRequest();
int lineId = 0;
InputStreamReader in = new InputStreamReader(new FileInputStream("./tmdb_5000_movies.csv"), Charset.forName("UTF-8"));
CSVReader reader = new CSVReader(in, ',');
List<String[]> allRecords = reader.readAll();
for (String[] records : allRecords) {
lineId++;
if(lineId == 1){
continue;
}
try{
JSONArray castJsonArray = JSONArray.parseArray(records[20]);
String character = (String) castJsonArray.getJSONObject(0).get("character");
String name = (String) castJsonArray.getJSONObject(0).get("name");
JSONObject cast = new JSONObject();
cast.put("character",character);
cast.put("name",name);
String date = records[11];
if(date == null || date.equals("")){
date = "1970/01/01";
}
//IndexRequest一条索引记录
bulkRequest.add(new IndexRequest("movie", "_doc", String.valueOf(lineId-1)).source(XContentType.JSON,
"title", records[17],
"tagline",records[16],
"release_date",date,
"popularity",records[8],
"cast",cast,