1 构建工程
/**
 * Plain data holder for one parsed HTML page: a numeric id, the page title,
 * the extracted text content and the page URL.
 *
 * <p>All fields are mutable and start at their Java defaults
 * ({@code 0} for the id, {@code null} for the strings).
 */
public class HtmlBean {
    /** Numeric identifier of the page. */
    private int id;
    /** Page title. */
    private String title;
    /** Text content of the page. */
    private String content;
    /** URL (or relative path) of the page. */
    private String url;

    /** @return the numeric identifier */
    public int getId() {
        return id;
    }

    /** @param id the numeric identifier to store */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the page title, or {@code null} if none was set */
    public String getTitle() {
        return title;
    }

    /** @param title the page title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the text content, or {@code null} if none was set */
    public String getContent() {
        return content;
    }

    /** @param content the text content to store */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the page URL, or {@code null} if none was set */
    public String getUrl() {
        return url;
    }

    /** @param url the page URL to store */
    public void setUrl(String url) {
        this.url = url;
    }
}
package com.sxt.es;
import java.util.ArrayList;
import java.util.List;
/**
 * Pagination state for a result list: page size, current page number, total
 * record count, derived total page count, the window of page numbers to show
 * in a pager, and the records of the current page.
 *
 * <p>The derived values (total page count and page-number window) are kept in
 * sync whenever the total count, the page size or the current page changes,
 * so the setters may be called in any order.
 *
 * @param <T> type of the records held in the page
 */
public class PageBean<T> {
    /** Maximum number of page links shown in the pager window. */
    private static final int MAX_VISIBLE_PAGES = 10;

    /** Records per page. */
    private int size = 10;
    /** Current page number (1-based). */
    private int index = 1;
    /** Total number of records. */
    private int totalCount = 0;
    /** Total number of pages. */
    private int totalPageCount = 1;
    /** Page numbers to display in the pager. */
    private int[] numbers;
    /** Records to display on the current page. */
    protected List<T> list;

    /**
     * Returns the zero-based offset of the first record of the current page.
     *
     * @return {@code (index - 1) * size}
     */
    public int getStartRow() {
        return (index - 1) * size;
    }

    /**
     * Returns the exclusive end offset of the current page. The value is not
     * clamped to the total record count.
     *
     * @return {@code index * size}
     */
    public int getEndRow() {
        return index * size;
    }

    /**
     * @return the number of records per page
     */
    public int getSize() {
        return size;
    }

    /**
     * Sets the page size. Non-positive values are ignored. When a total count
     * is already known, the total page count and the page-number window are
     * recomputed for the new size.
     *
     * @param size records per page; must be greater than 0 to take effect
     */
    public void setSize(int size) {
        if (size > 0) {
            this.size = size;
            if (totalCount > 0) {
                // Keep derived values consistent regardless of setter order.
                setTotalPageCountByRs();
            }
        }
    }

    /**
     * @return the current page number, or 0 when there are no pages
     */
    public int getIndex() {
        if (totalPageCount == 0) {
            return 0;
        }
        return index;
    }

    /**
     * Sets the current page number. Non-positive values are ignored. When a
     * total count is already known, the page-number window is recomputed
     * around the new page, replacing any window set through
     * {@link #setNumbers(int[])}.
     *
     * @param index current page number (1-based); must be greater than 0 to
     *              take effect
     */
    public void setIndex(int index) {
        if (index > 0) {
            this.index = index;
            if (totalCount > 0) {
                // The window is centred on the current page, so refresh it.
                setNumbers(totalPageCount);
            }
        }
    }

    /**
     * @return the total number of records
     */
    public int getTotalCount() {
        return totalCount;
    }

    /**
     * Sets the total record count and recomputes the total page count and the
     * page-number window. Negative values are ignored.
     *
     * @param totalCount total number of records; must be {@code >= 0} to take
     *                   effect
     */
    public void setTotalCount(int totalCount) {
        if (totalCount >= 0) {
            this.totalCount = totalCount;
            setTotalPageCountByRs();
        }
    }

    /**
     * @return the total number of pages
     */
    public int getTotalPageCount() {
        return this.totalPageCount;
    }

    /**
     * Derives the total page count from the total record count and the page
     * size (rounding up), then refreshes the page-number window. The page
     * count is 0 when there are no records.
     */
    private void setTotalPageCountByRs() {
        if (this.size > 0 && this.totalCount > 0) {
            int fullPages = this.totalCount / this.size;
            boolean hasPartialPage = this.totalCount % this.size != 0;
            this.totalPageCount = hasPartialPage ? fullPages + 1 : fullPages;
        } else {
            this.totalPageCount = 0;
        }
        setNumbers(totalPageCount);
    }

    /**
     * @return the page numbers to display; {@code null} until a window has
     *         been computed or set
     */
    public int[] getNumbers() {
        return numbers;
    }

    /**
     * Computes the window of page numbers to display for the given page count.
     *
     * <p>At most {@value #MAX_VISIBLE_PAGES} consecutive page numbers are
     * produced. With fewer pages than that, all pages are listed. Otherwise
     * the window starts five pages before the current page, clamped so that
     * it never starts before page 1 and never runs past the last page.
     * When {@code totalPageCount} is not positive the window is reset to an
     * empty array so that no stale page numbers remain.
     *
     * @param totalPageCount total number of pages
     */
    public void setNumbers(int totalPageCount) {
        if (totalPageCount <= 0) {
            this.numbers = new int[0];
            return;
        }
        int length = Math.min(totalPageCount, MAX_VISIBLE_PAGES);
        // Zero-based offset of the first page shown: placed before the current
        // page, but never before page 1 nor so late that the window would
        // extend beyond the last page.
        int centred = index - (length / 2 + 1);
        int latestStart = totalPageCount - length;
        int first = Math.max(0, Math.min(centred, latestStart));

        int[] window = new int[length];
        for (int k = 0; k < length; k++) {
            window[k] = first + k + 1;
        }
        this.numbers = window;
    }

    /**
     * Replaces the page-number window with the given array as-is.
     *
     * @param numbers page numbers to display
     */
    public void setNumbers(int[] numbers) {
        this.numbers = numbers;
    }

    /**
     * @return the records of the current page; {@code null} if none were set
     */
    public List<T> getList() {
        return list;
    }

    /**
     * @param list the records of the current page
     */
    public void setList(List<T> list) {
        this.list = list;
    }

    /**
     * Appends one record to the page, creating the backing list on first use.
     *
     * @param bean record to append
     */
    public void setBean(T bean) {
        if (this.list == null) {
            list = new ArrayList<T>();
        }
        list.add(bean);
    }
}
/**
 * Builds the {@code bjsxt} Elasticsearch index and fills it with documents
 * parsed from the HTML files found under {@link #DATA_DIR}.
 */
public class IndexService {
    /** Directory that holds the HTML files to index. */
    public static String DATA_DIR="d:\\data\\";

    /** Shared transport client; stays {@code null} if the static setup below fails. */
    private static Client client;

    static {
        // Connect to cluster "bjsxt-es" through the transport port (9300) of three nodes.
        Settings settings = Settings.settingsBuilder()
                .put("cluster.name", "bjsxt-es").build();
        try {
            client = TransportClient
                    .builder()
                    .settings(settings)
                    .build()
                    .addTransportAddress(
                            new InetSocketTransportAddress(InetAddress
                                    .getByName("node01"), 9300))
                    .addTransportAddress(
                            new InetSocketTransportAddress(InetAddress
                                    .getByName("node02"), 9300))
                    .addTransportAddress(
                            new InetSocketTransportAddress(InetAddress
                                    .getByName("node03"), 9300));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * (Re)creates the {@code bjsxt} index and installs the mapping for type
     * {@code htmlbean}.
     *
     * <p>An existing index is deleted first, then a fresh one is created, and
     * finally the {@code title} and {@code content} fields are mapped as
     * stored strings analysed with {@code ik_max_word}.
     *
     * @throws Exception if any of the Elasticsearch requests fails
     */
    public void createIndex() throws Exception {
        IndicesExistsResponse resp = client.admin().indices().prepareExists("bjsxt")
                .execute().actionGet();
        if (resp.isExists()) {
            // Drop the old index so the mapping below is applied to a clean one.
            client.admin().indices().prepareDelete("bjsxt").execute().actionGet();
        }
        // The index must exist before a mapping can be put on it.
        client.admin().indices().prepareCreate("bjsxt").execute().actionGet();

        XContentBuilder builder = XContentFactory.jsonBuilder().startObject()
                .startObject("htmlbean").startObject("properties")
                .startObject("title").field("type", "string")
                .field("store", "yes").field("analyzer", "ik_max_word")
                .field("search_analyzer", "ik_max_word").endObject()
                .startObject("content").field("type", "string")
                .field("store", "yes").field("analyzer", "ik_max_word")
                .field("search_analyzer", "ik_max_word").endObject()
                // No explicit mapping is declared for "url":
                // .startObject("url").field("type", "string")
                // .field("store", "yes").field("analyzer", "ik_max_word")
                // .field("search_analyzer", "ik_max_word").endObject()
                .endObject().endObject().endObject();
        PutMappingRequest mapping = Requests.putMappingRequest("bjsxt").type("htmlbean").source(builder);
        client.admin().indices().putMapping(mapping).actionGet();
    }

    /**
     * Indexes every HTML file found under {@link #DATA_DIR}.
     */
    @Test
    public void addHtmlToES(){
        readHtml(new File(DATA_DIR));
    }

    /**
     * Recursively walks {@code file}. Directories are descended into; every
     * other file is parsed with {@code HtmlTool.parserHtml} and, when a bean
     * is returned, indexed into {@code bjsxt/htmlbean} with its title,
     * content and url.
     *
     * <p>Failures on a single file are printed and do not stop the walk.
     *
     * @param file directory or file to process
     */
    public void readHtml(File file){
        if (file.isDirectory()) {
            File[] children = file.listFiles();
            if (children == null) {
                // listFiles() returns null on an I/O error or an unreadable directory.
                return;
            }
            for (File child : children) {
                readHtml(child);
            }
            return;
        }
        try {
            HtmlBean bean = HtmlTool.parserHtml(file.getPath());
            if (bean != null) {
                Map<String, String> dataMap = new HashMap<String, String>();
                dataMap.put("title", bean.getTitle());
                dataMap.put("content", bean.getContent());
                dataMap.put("url", bean.getUrl());
                // Write the document to the index.
                client.prepareIndex("bjsxt", "htmlbean").setSource(dataMap).execute().actionGet();
            }
        } catch (Throwable e) {
            // Best effort: report the failing file and continue with the rest.
            e.printStackTrace();
        }
    }
}
package com.sxt.util;
import java.io.File;
import com.sxt.es.HtmlBean;
import com.sxt.es.IndexService;
import net.htmlparser.jericho.CharacterReference;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.Source;
/**
 * Helper that turns an HTML file on disk into an {@link HtmlBean}.
 */
public class HtmlTool {
    /**
     * Parses the HTML file at {@code path} and extracts its title, text
     * content and url.
     *
     * <p>The url is {@code path} with the leading {@code IndexService.DATA_DIR}
     * removed. If {@code path} does not start with that directory, the path is
     * used unchanged instead of being cut at an arbitrary position.
     *
     * @param path path of the HTML file
     * @return the populated bean, or {@code null} when the document has no
     *         {@code <title>} element
     * @throws Throwable if the file cannot be read or parsed
     */
    public static HtmlBean parserHtml(String path) throws Throwable {
        Source source = new Source(new File(path));
        // Call fullSequentialParse manually as most of the source will be parsed.
        source.fullSequentialParse();

        Element titleElement = source.getFirstElement(HTMLElementName.TITLE);
        if (titleElement == null) {
            return null;
        }

        HtmlBean bean = new HtmlBean();
        bean.setTitle(CharacterReference.decodeCollapseWhiteSpace(titleElement.getContent()));
        bean.setContent(source.getTextExtractor().setIncludeAttributes(true).toString());

        String dataDir = IndexService.DATA_DIR;
        // Only strip the data-directory prefix when it is actually present;
        // a blind substring would throw or return a meaningless suffix.
        String url = path.startsWith(dataDir) ? path.substring(dataDir.length()) : path;
        bean.setUrl(url);
        return bean;
    }

    /**
     * Manual check: parses one sample file and prints its extracted content.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            HtmlBean bean = parserHtml("d:\\data\\news.cctv.com\\2019\\01\\02\\ARTIr9zKNrYdpwG0N8d1fkpC190102.shtml");
            if (bean == null) {
                // parserHtml returns null for documents without a <title>.
                System.out.println("No <title> element found; nothing to print.");
            } else {
                System.out.println(bean.getContent());
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}
JUnit 测试如下:
访问 Elasticsearch 服务:http://node03:9200/_plugin/head/