仅供记录,代码存在缺失(下方依赖列表并不完整:源码还引用了 hutool、fastjson、gson、lombok、commons-lang3、httpclient、htmlcleaner、spring 等库)
依赖:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>3.11</version>
<!-- <version>4.1.0</version>-->
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<!-- <version>4.1.0</version>-->
<version>3.11</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.11</version>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
<version>4.0.1</version>
<scope>provided</scope>
</dependency>
import cn.hutool.Hutool;
import cn.hutool.core.io.file.FileWriter;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpUtil;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.google.gson.JsonObject;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.directory.api.util.Strings;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.utils.HttpClientUtils;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import restclient.com.fasterxml.jackson.annotation.JsonProperty;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Utility class: file reading/writing, HTML download and XPath extraction,
 * regular-expression helpers and Excel (.xlsx) reading.
 *
 * author: hyxu.xhy
 * create time: 2021/8/22
 */
@Component
@Slf4j
public class SmallUtils {

    /** Host of the HTTP proxy used by {@link #getHtmlProxy(String)}. */
    private static final String PROXY_HOST = "10.200.125.65";

    /** Port of the HTTP proxy used by {@link #getHtmlProxy(String)}. */
    private static final int PROXY_PORT = 8080;

    /** Connect and socket timeout (milliseconds) applied to proxied requests. */
    private static final int PROXY_TIMEOUT_MILLIS = 6000;

    /** Charset name used to decode HTTP response bodies. */
    private static final String HTML_CHARSET = "utf-8";

    /** Chunk size (bytes) used when draining an input stream. */
    private static final int READ_CHUNK_SIZE = 1024;

    /**
     * Writes {@code content} to the file at {@code filePath}.
     *
     * NOTE(review): {@code FileWriter} here is the Hutool class named in the file's
     * imports, not {@code java.io.FileWriter}; presumably its {@code write} opens and
     * closes the file itself - confirm against the Hutool version in use.
     *
     * @param filePath path of the file to write
     * @param content  text to write
     * @throws IOException declared for callers; kept for interface compatibility
     */
    public void writeFile(String filePath, String content) throws IOException {
        FileWriter writer = new FileWriter(filePath);
        writer.write(content);
    }

    /**
     * Reads the whole content of a classpath resource as UTF-8 text.
     *
     * The resource is looked up through this class's class loader, so
     * {@code filePath} is a classpath-relative name, not a file-system path.
     *
     * @param filePath classpath-relative resource name
     * @return the complete resource content
     * @throws FileNotFoundException if no such resource exists on the classpath
     * @throws IOException           if reading the resource fails
     */
    public String readFile(String filePath) throws IOException {
        try (InputStream in = getClass().getClassLoader().getResourceAsStream(filePath)) {
            if (in == null) {
                throw new FileNotFoundException("Classpath resource not found: " + filePath);
            }
            // Drain the stream completely: a single read() may return fewer bytes than
            // are available, and a fixed-size buffer would truncate larger resources.
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] chunk = new byte[READ_CHUNK_SIZE];
            int count;
            while ((count = in.read(chunk)) != -1) {
                collected.write(chunk, 0, count);
            }
            return new String(collected.toByteArray(), StandardCharsets.UTF_8);
        }
    }

    /**
     * Reads all lines of the file at the given file-system path
     * (decoded as UTF-8 by {@link Files#readAllLines(Path)}).
     *
     * @param filePath file-system path of the file to read
     * @return the lines of the file, or {@code null} if the file could not be read
     */
    public List<String> readFileLines(String filePath) {
        try {
            return Files.readAllLines(Paths.get(filePath));
        } catch (IOException e) {
            e.printStackTrace();
            // null is the existing failure signal callers may test for; keep it.
            return null;
        }
    }

    /**
     * Evaluates an XPath expression against an HTML page and concatenates the
     * values of all matching nodes.
     *
     * Only nodes whose {@code getNodeValue()} is non-blank contribute to the result,
     * so the expression should select text or attribute nodes.
     *
     * @param xPath XPath expression to evaluate
     * @param html  HTML source to search
     * @return concatenation of the non-blank node values; empty string when nothing
     *         matches or evaluation fails
     */
    public String getByXpath(String xPath, String html) {
        TagNode root = new HtmlCleaner().clean(html);
        Object evaluated = null;
        try {
            org.w3c.dom.Document doc = new DomSerializer(new CleanerProperties()).createDOM(root);
            XPath evaluator = XPathFactory.newInstance().newXPath();
            evaluated = evaluator.evaluate(xPath, doc, XPathConstants.NODESET);
        } catch (Exception e) {
            System.out.println("Extract value error. " + e.getMessage());
            e.printStackTrace();
        }

        if (!(evaluated instanceof NodeList)) {
            return "";
        }
        NodeList matches = (NodeList) evaluated;
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < matches.getLength(); i++) {
            String text = matches.item(i).getNodeValue();
            if (StringUtils.isNotBlank(text)) {
                joined.append(text);
            }
        }
        return joined.toString();
    }

    /**
     * Downloads a page with a default HTTP client.
     *
     * @param url address to fetch
     * @return the response body decoded as UTF-8 when the status is 200;
     *         {@code null} for any other status or when the request fails
     */
    public String getHtml(String url) {
        return fetch(HttpClients.createDefault(), url);
    }

    /**
     * Downloads a page through the fixed HTTP proxy, with connect and socket timeouts.
     *
     * @param url address to fetch
     * @return the response body decoded as UTF-8 when the status is 200;
     *         {@code null} for any other status or when the request fails
     */
    public String getHtmlProxy(String url) {
        RequestConfig proxiedConfig = RequestConfig.custom()
                .setConnectTimeout(PROXY_TIMEOUT_MILLIS)
                .setSocketTimeout(PROXY_TIMEOUT_MILLIS)
                .setProxy(new HttpHost(PROXY_HOST, PROXY_PORT))
                .build();
        CloseableHttpClient client = HttpClients.custom()
                .setDefaultRequestConfig(proxiedConfig)
                .build();
        return fetch(client, url);
    }

    /**
     * Executes a GET request with the given client and always closes the client and
     * the response afterwards.
     */
    private static String fetch(CloseableHttpClient httpClient, String url) {
        CloseableHttpResponse response = null;
        String html = null;
        try {
            // Built inside the try so the client is still closed if the URL is rejected.
            response = httpClient.execute(new HttpGet(url));
            // The entity is null for body-less responses; EntityUtils.toString rejects null.
            HttpEntity entity = response.getEntity();
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                if (entity != null) {
                    html = EntityUtils.toString(entity, HTML_CHARSET);
                }
            } else {
                // Non-200 status (e.g. 404): report it and dump the body, if any.
                System.out.println("返回状态不是200");
                if (entity != null) {
                    System.out.println(EntityUtils.toString(entity, HTML_CHARSET));
                }
            }
        } catch (IOException e) {
            // Also covers ClientProtocolException, which is an IOException subclass.
            e.printStackTrace();
        } finally {
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
        return html;
    }

    /**
     * Returns the first match of {@code regex} in {@code content}.
     *
     * When the pattern has at least one capturing group, group 1 of the first match
     * is returned; when it has none, the whole match is returned.
     *
     * @param content text to search; may be {@code null}
     * @param regex   regular expression
     * @return the captured text, or {@code null} if {@code content} is {@code null}
     *         or nothing matches
     */
    public static String regexFirst(String content, String regex) {
        if (content == null) {
            return null;
        }
        Matcher matcher = Pattern.compile(regex).matcher(content);
        return matcher.find() ? capture(matcher) : null;
    }

    /**
     * Returns every match of {@code regex} in {@code content}, in order.
     *
     * When the pattern has at least one capturing group, group 1 of each match is
     * collected (an entry is {@code null} if group 1 did not take part in that match);
     * when it has none, the whole match is collected.
     *
     * @param content text to search; may be {@code null}
     * @param regex   regular expression
     * @return a mutable list of captured texts; empty if {@code content} is
     *         {@code null} or nothing matches
     */
    public static List<String> regexAll(String content, String regex) {
        List<String> found = new ArrayList<>();
        if (content == null) {
            return found;
        }
        Matcher matcher = Pattern.compile(regex).matcher(content);
        while (matcher.find()) {
            found.add(capture(matcher));
        }
        return found;
    }

    /** Returns group 1 of the current match, or the whole match if the pattern has no groups. */
    private static String capture(Matcher matcher) {
        return matcher.groupCount() >= 1 ? matcher.group(1) : matcher.group();
    }

    /**
     * Loads an .xlsx workbook from the classpath and returns one of its sheets.
     *
     * @param filePath classpath-relative name of the workbook resource
     * @param sheetNum zero-based index passed to {@code Workbook.getSheetAt}
     * @return the requested sheet
     * @throws IllegalArgumentException if the resource does not exist on the classpath
     * @throws UncheckedIOException     if the workbook cannot be read
     */
    public Sheet getSheet(String filePath, int sheetNum) {
        try (InputStream is = this.getClass().getClassLoader().getResourceAsStream(filePath)) {
            if (is == null) {
                throw new IllegalArgumentException("Classpath resource not found: " + filePath);
            }
            Workbook workbook = new XSSFWorkbook(is);
            return workbook.getSheetAt(sheetNum);
        } catch (IOException e) {
            throw new UncheckedIOException("Failed to read workbook: " + filePath, e);
        }
    }

    /**
     * Reads a sheet into a list of row maps keyed by the header row.
     *
     * The first non-empty row is the header and fixes the column range; every later
     * non-empty row becomes one map from header text to cell text. Missing cells map
     * to the empty string. Cells are converted to string type before being read, which
     * changes the type of the in-memory cell.
     *
     * @param filePath classpath-relative name of the workbook resource
     * @param sheetNum zero-based sheet index
     * @return one map per data row, in sheet order; empty if the sheet has no data rows
     */
    public List<Map<String, String>> readExcel(String filePath, int sheetNum) {
        Sheet sheet = getSheet(filePath, sheetNum);
        List<Map<String, String>> dataList = new ArrayList<>();
        List<String> headers = null;
        int firstColumn = 0;
        int endColumn = 0;

        for (int rowIndex = 0; rowIndex <= sheet.getLastRowNum(); rowIndex++) {
            Row row = sheet.getRow(rowIndex);
            if (isRowEmpty(row)) {
                continue;
            }
            if (headers == null) {
                // First non-empty row: remember the column window and the header texts.
                firstColumn = row.getFirstCellNum();
                endColumn = row.getLastCellNum();
                headers = new ArrayList<>();
                for (int col = firstColumn; col < endColumn; col++) {
                    headers.add(cellText(row.getCell(col)));
                }
            } else {
                // LinkedHashMap keeps the columns in sheet order.
                Map<String, String> record = new LinkedHashMap<>();
                for (int col = firstColumn; col < endColumn; col++) {
                    // headers is 0-based while col is an absolute column index.
                    record.put(headers.get(col - firstColumn), cellText(row.getCell(col)));
                }
                dataList.add(record);
            }
        }
        return dataList;
    }

    /** Returns the cell's content as text, or the empty string for a missing cell. */
    private static String cellText(Cell cell) {
        if (cell == null) {
            return "";
        }
        cell.setCellType(Cell.CELL_TYPE_STRING);
        return cell.toString();
    }

    /**
     * Tells whether a row is empty, i.e. it is {@code null} or every cell in it is
     * either missing or of blank type.
     *
     * @param row row to inspect; may be {@code null}
     * @return {@code true} if the row holds no non-blank cell
     */
    public static boolean isRowEmpty(Row row) {
        if (row == null) {
            return true;
        }
        for (int i = row.getFirstCellNum(); i < row.getLastCellNum(); i++) {
            Cell cell = row.getCell(i);
            if (cell != null && cell.getCellType() != Cell.CELL_TYPE_BLANK) {
                return false;
            }
        }
        return true;
    }
}