最近因为项目需求,抓取了大大小小多个网站的新闻。刚开始用的是jsoup解析页面,每个站点都要写一套解析方案,效率较慢;后来改用xpath解析,开发速度有了很大的提升,在一周内完成了一百多个站点的新闻抓取。
下面是我的一个简单示例。博主刚毕业,还是个技术小白,如有写得不对或不妥的地方,请评论指出来,大家共同进步。下图是测试效果,不同的网站只需要更改xpath即可。
为了帮助有需要的朋友,下面贴上我写的代码模型。由于新闻网站一般没有反爬,所以demo中没有反爬的相关策略。一般的爬虫项目由下载器、调度器、解析器组成,本demo中没有实现调度器。
1、项目是基于maven搭建的,首先引入相关依赖
<properties>
<!-- Build settings and shared dependency versions; the version properties are referenced as ${...} in the dependencies section. -->
<!-- Source file encoding used by the Maven build -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Apache HttpClient version, shared by the httpclient, fluent-hc and httpmime artifacts -->
<httpclient_version>4.5.1</httpclient_version>
<!-- HtmlCleaner version (HTML to DOM conversion used by the parser) -->
<htmlcleaner_version>2.16</htmlcleaner_version>
<!-- log4j 1.x version (backing logger of LoggerUtils) -->
<log4j_version>1.2.17</log4j_version>
</properties>
<dependencies>
<!-- Logging backend: org.apache.log4j.Logger is used by LoggerUtils -->
<!-- https://mvnrepository.com/artifact/log4j/log4j -->
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>${log4j_version}</version>
</dependency>
<!-- Guava: Lists.newArrayList is used by DefaultRequestRetryHandler -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>19.0</version>
</dependency>
<!-- Commons IO: IOUtils is used by ClientMethodUtils -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.4</version>
</dependency>
<!-- Commons BeanUtils: not referenced by the code shown in this article -->
<dependency>
<groupId>commons-beanutils</groupId>
<artifactId>commons-beanutils</artifactId>
<version>1.9.2</version>
</dependency>
<!-- Commons Lang 2.x: org.apache.commons.lang.StringUtils is used by the downloader, the parser and the StringUtils helper -->
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>
<!-- HtmlCleaner: turns HTML into a W3C DOM that BaseParse evaluates XPath against -->
<!-- https://mvnrepository.com/artifact/net.sourceforge.htmlcleaner/htmlcleaner -->
<dependency>
<groupId>net.sourceforge.htmlcleaner</groupId>
<artifactId>htmlcleaner</artifactId>
<version>${htmlcleaner_version}</version>
</dependency>
<!-- JUnit, test scope only -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>3.8.1</version>
<scope>test</scope>
</dependency>
<!-- HttpClient fluent API: not referenced by the code shown in this article -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
<version>${httpclient_version}</version>
</dependency>
<!-- Apache HttpClient: connection pool, request types and retry handler interface used by the downloader -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${httpclient_version}</version>
</dependency>
<!-- Apache HttpCore: org.apache.http.config, Header, HttpContext and EntityUtils come from here; version is pinned literally rather than via a property -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.3</version>
</dependency>
<!-- HttpMime (multipart support): not referenced by the code shown in this article -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
<version>${httpclient_version}</version>
</dependency>
</dependencies>
2、下载器使用的是apache的开源项目httpclient,包含了httpclient连接池,工具类等
2.1 HttpClient连接池
package com.zhb.ims.utils.httpclient;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import javax.net.ssl.HostnameVerifier;
import javax.net.ssl.SSLContext;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.ssl.SSLContexts;
import org.apache.http.ssl.TrustStrategy;
/**
 * Lazily created singleton that owns one pooled HTTP connection manager and
 * hands out {@link CloseableHttpClient} instances backed by that pool.
 *
 * <p>Access to the singleton and to the pool life cycle is serialized through
 * a single static lock.
 *
 * <p>NOTE(security): {@link #init()} installs a trust strategy that accepts
 * every certificate and a hostname verifier that accepts every host, i.e. TLS
 * peer verification is disabled. That is a deliberate convenience for crawling
 * arbitrary public sites; do not reuse this class where the identity of the
 * server matters.
 */
public class HttpClientManger {
    /** Maximum number of pooled connections across all routes. */
    private static final int MAX_TOTAL_CONNECTIONS = 800;
    /** Maximum number of pooled connections for a single route. */
    private static final int MAX_CONNECTIONS_PER_ROUTE = 20;

    private static HttpClientManger httpClientManger;
    private static final Lock lock = new ReentrantLock();

    private PoolingHttpClientConnectionManager connectionManager;
    private HttpRequestRetryHandler httpRequestRetryHandler;
    /** {@code true} while there is no usable connection pool (never initialised, or destroyed). */
    private final AtomicBoolean isShutDown = new AtomicBoolean(true);

    private HttpClientManger() {
        super();
        init();
    }

    /**
     * Returns the process-wide instance, creating it on first use.
     *
     * @return the shared manager, never {@code null}
     */
    public static HttpClientManger newInstance() {
        lock.lock();
        try {
            if (httpClientManger == null) {
                httpClientManger = new HttpClientManger();
            }
            return httpClientManger;
        } finally {
            // Always release, even if construction throws.
            lock.unlock();
        }
    }

    /**
     * Builds a fresh connection pool and retry handler and marks the manager as
     * running. Failures are reported on stderr and leave the previous state in
     * place (best effort, as before).
     */
    public void init() {
        try {
            SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, new DefaultTrustStrategy()).build();
            @SuppressWarnings("deprecation")
            HostnameVerifier hostnameVerifier = SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER;
            SSLConnectionSocketFactory sslsf = new SSLConnectionSocketFactory(sslContext, hostnameVerifier);
            Registry<ConnectionSocketFactory> socketFactoryRegistry = RegistryBuilder.<ConnectionSocketFactory> create()
                    .register("http", PlainConnectionSocketFactory.getSocketFactory())
                    .register("https", sslsf)
                    .build();
            PoolingHttpClientConnectionManager manager = new PoolingHttpClientConnectionManager(socketFactoryRegistry);
            manager.setMaxTotal(MAX_TOTAL_CONNECTIONS);
            manager.setDefaultMaxPerRoute(MAX_CONNECTIONS_PER_ROUTE);
            // Publish only a fully configured pool.
            connectionManager = manager;
            httpRequestRetryHandler = new DefaultRequestRetryHandler();
            isShutDown.set(false);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Returns a new client that uses the shared connection pool. The pool is
     * rebuilt first if it has been destroyed.
     *
     * <p>The previous implementation used {@code compareAndSet(false, true)} as
     * its "is running" test, which flipped the flag on every successful call and
     * therefore rebuilt (and abandoned) the pool on every second call. The flag
     * is now only read here.
     *
     * <p>The pool is registered as shared, so closing a returned client does not
     * shut the pool down for other clients; use {@link #destory()} for that.
     *
     * @return a client bound to the shared pool
     */
    public CloseableHttpClient getClient() {
        lock.lock();
        try {
            if (isShutDown.get()) {
                init();
            }
            return HttpClients.custom()
                    .setConnectionManager(connectionManager)
                    .setConnectionManagerShared(true)
                    .setRetryHandler(httpRequestRetryHandler)
                    .build();
        } finally {
            lock.unlock();
        }
    }

    /**
     * Shuts the connection pool down. Calling this more than once is harmless;
     * a later {@link #getClient()} builds a new pool.
     */
    public void destory() {
        lock.lock();
        try {
            if (isShutDown.compareAndSet(false, true) && connectionManager != null) {
                connectionManager.shutdown();
            }
        } finally {
            lock.unlock();
        }
    }

    /** Trust strategy that accepts every certificate chain (see the class-level security note). */
    class DefaultTrustStrategy implements TrustStrategy {
        @Override
        public boolean isTrusted(X509Certificate[] arg0, String arg1) throws CertificateException {
            return true;
        }
    }
}
2.2默认的重连策略
package com.zhb.ims.utils.httpclient;
import java.io.IOException;
import java.io.InterruptedIOException;
import java.net.UnknownHostException;
import java.util.Iterator;
import java.util.List;
import javax.net.ssl.SSLException;
import javax.net.ssl.SSLHandshakeException;
import org.apache.http.NoHttpResponseException;
import org.apache.http.client.HttpRequestRetryHandler;
import org.apache.http.conn.ConnectTimeoutException;
import org.apache.http.protocol.HttpContext;
import com.google.common.collect.Lists;
/**
 * Retry policy for HttpClient driven by two lists of exception types.
 *
 * <ul>
 * <li>{@code ignoreException}: failures that are never retried;</li>
 * <li>{@code dealException}: failures that are retried until the attempt limit
 * is reached.</li>
 * </ul>
 *
 * An exception matches a list entry when it is an instance of that type, so
 * subclasses match too. Exceptions that match neither list are not retried.
 */
public class DefaultRequestRetryHandler implements HttpRequestRetryHandler {
    /** Attempt limit used when none (or a non-positive one) is supplied to a defaulting constructor. */
    private static final int DEFAULT_EXECUTION_COUNT = 5;

    private int executionCount;
    List<Class<? extends Exception>> ignoreException;
    List<Class<? extends Exception>> dealException;

    /** Creates a handler with the default attempt limit and the default exception lists. */
    public DefaultRequestRetryHandler() {
        super();
        Init();
    }

    /**
     * Creates a handler with the default exception lists.
     *
     * @param executionCount attempt limit; a non-positive value selects the default
     */
    public DefaultRequestRetryHandler(int executionCount) {
        super();
        this.executionCount = executionCount;
        // The fields are still null here; the old code called isEmpty() on them and always threw.
        Init();
    }

    /**
     * Creates a handler with a caller-supplied "never retry" list and the
     * default "retry" list.
     *
     * @param executionCount attempt limit; a non-positive value selects the default
     * @param ignoreException exception types that must not be retried; {@code null}
     *        selects the default list, an empty list is kept as given
     */
    public DefaultRequestRetryHandler(int executionCount, List<Class<? extends Exception>> ignoreException) {
        super();
        this.executionCount = executionCount <= 0 ? DEFAULT_EXECUTION_COUNT : executionCount;
        this.ignoreException = ignoreException == null ? defaultIgnoreException() : ignoreException;
        this.dealException = defaultDealException();
    }

    /**
     * Creates a fully caller-configured handler.
     *
     * @param executionCount attempt limit, used as given
     * @param ignoreException exception types that must not be retried; {@code null} selects the default list
     * @param dealException exception types that are retried; {@code null} selects the default list
     */
    public DefaultRequestRetryHandler(int executionCount, List<Class<? extends Exception>> ignoreException,
            List<Class<? extends Exception>> dealException) {
        super();
        this.executionCount = executionCount;
        // A null list would otherwise only fail later, inside retryRequest.
        this.ignoreException = ignoreException == null ? defaultIgnoreException() : ignoreException;
        this.dealException = dealException == null ? defaultDealException() : dealException;
    }

    /** Fills in defaults for every setting that is unset, non-positive or empty. */
    private void Init() {
        executionCount = executionCount <= 0 ? DEFAULT_EXECUTION_COUNT : executionCount;
        ignoreException = (ignoreException == null || ignoreException.isEmpty()) ? defaultIgnoreException() : ignoreException;
        dealException = (dealException == null || dealException.isEmpty()) ? defaultDealException() : dealException;
    }

    /** Default list of failures that are not worth retrying. */
    @SuppressWarnings("unchecked")
    private static List<Class<? extends Exception>> defaultIgnoreException() {
        return Lists.newArrayList(ConnectTimeoutException.class, SSLException.class, UnknownHostException.class,
                InterruptedIOException.class, SSLHandshakeException.class);
    }

    /** Default list of failures that are retried. */
    @SuppressWarnings("unchecked")
    private static List<Class<? extends Exception>> defaultDealException() {
        return Lists.newArrayList(NoHttpResponseException.class);
    }

    /**
     * Decides whether a failed request is attempted again.
     *
     * @param exception the failure reported by HttpClient
     * @param executionCount number of attempts made so far
     * @param context execution context (unused)
     * @return {@code true} only when the attempt limit is not yet reached, the
     *         exception is not on the ignore list and it is on the retry list
     */
    @Override
    public boolean retryRequest(IOException exception, int executionCount, HttpContext context) {
        if (executionCount >= this.executionCount) {
            return false;
        }
        if (matchesAny(ignoreException, exception)) {
            return false;
        }
        if (matchesAny(dealException, exception)) {
            return true;
        }
        // Unclassified failure: report it and give up.
        exception.printStackTrace();
        return false;
    }

    /**
     * Tells whether {@code exception} is an instance of any listed type. The old
     * check, {@code exception.getClass().isAssignableFrom(listed)}, had the
     * operands reversed: it missed subclasses of a listed type and matched
     * superclasses instead.
     */
    private static boolean matchesAny(List<Class<? extends Exception>> types, IOException exception) {
        for (Iterator<Class<? extends Exception>> iterator = types.iterator(); iterator.hasNext();) {
            Class<? extends Exception> type = iterator.next();
            if (type != null && type.isInstance(exception)) {
                return true;
            }
        }
        return false;
    }
}
2.3工具类
package com.zhb.ims.utils.httpclient;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.imageio.ImageIO;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.http.Header;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;
/**
 * Static helpers around Apache HttpClient: adding headers and form parameters
 * to requests, downloading a page as text with charset detection, and
 * downloading an image to a file.
 */
public class ClientMethodUtils {
    /** Charset used when neither the response header nor the page declares a usable one. */
    private static final String DefaultCharSet = "utf-8";

    /**
     * Extracts the charset token from a Content-Type header value, e.g.
     * {@code text/html; charset="GBK"}. The previous pattern ended in a lazy
     * group followed by an optional ';' and therefore always captured "".
     */
    private static final Pattern HEADER_CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

    /**
     * Extracts the charset from a meta tag; covers both
     * {@code <meta charset="utf-8">} and
     * {@code <meta http-equiv="Content-Type" content="text/html; charset=gbk">}.
     */
    private static final Pattern META_CHARSET_PATTERN =
            Pattern.compile("<meta[^>]*?charset\\s*=\\s*[\"']?([^\\s;\"'/>]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Sets every entry of {@code map} as a header on the request.
     *
     * @param t the request (HttpGet, HttpPost, ...) to modify
     * @param map header names mapped to header values
     * @return the same request instance, for chaining; unchanged if an argument
     *         is {@code null} or the map is empty
     */
    public static <T extends HttpRequestBase> T addHeader(T t, Map<String, String> map) {
        if (t == null || map == null || map.isEmpty()) {
            System.out.println("Parama Is Illegal!");
            return t;
        }
        for (Entry<String, String> entry : map.entrySet()) {
            try {
                t.setHeader(entry.getKey(), entry.getValue());
            } catch (Exception e) {
                // One bad header must not prevent the remaining ones from being set.
                System.out.println(e.getMessage());
            }
        }
        return t;
    }

    /**
     * Attaches the given parameters to a POST request as a URL-encoded form.
     * Entries with a {@code null} key are skipped; {@code null} values are sent
     * as empty strings.
     *
     * @param post the request to modify
     * @param paramsMap form field names mapped to values
     * @param charSet charset name used to encode the form
     * @return the same request instance
     */
    public static HttpPost addPostWithParams(HttpPost post, Map<String, String> paramsMap, String charSet) {
        if (post == null || paramsMap == null || paramsMap.isEmpty()) {
            System.out.println("Params Is Illegal!");
            return post;
        }
        List<BasicNameValuePair> nvps = new ArrayList<>();
        try {
            for (Entry<String, String> entry : paramsMap.entrySet()) {
                String key = entry.getKey();
                if (key == null) {
                    continue;
                }
                String value = entry.getValue();
                nvps.add(new BasicNameValuePair(key, value == null ? "" : value));
            }
            post.setEntity(new UrlEncodedFormEntity(nvps, charSet));
        } catch (Exception e) {
            System.out.println("Add Params Error!");
        }
        return post;
    }

    /**
     * Looks for a charset declaration in a meta tag of the page.
     *
     * @param htmlPage page text (any ASCII-compatible decoding is sufficient)
     * @return the declared charset name, or {@code null} if none is found
     */
    private static String getCharSet(final String htmlPage) {
        if (StringUtils.isBlank(htmlPage)) {
            return null;
        }
        Matcher matcher = META_CHARSET_PATTERN.matcher(htmlPage);
        return matcher.find() ? matcher.group(1) : null;
    }

    /**
     * Reads the charset from the Content-Type header(s) of a response.
     *
     * @param response the response to inspect; may be {@code null}
     * @return the lower-cased charset name of the first Content-Type header that
     *         declares one, or {@code null} if there is none
     */
    public static String charSet(final CloseableHttpResponse response) {
        if (response == null) {
            return null;
        }
        for (Header header : response.getHeaders("Content-Type")) {
            String value = header.getValue();
            if (value == null) {
                continue;
            }
            Matcher matcher = HEADER_CHARSET_PATTERN.matcher(value);
            if (matcher.find()) {
                // Locale.ROOT keeps names such as "ISO-8859-1" intact under e.g. a Turkish default locale.
                return matcher.group(1).toLowerCase(java.util.Locale.ROOT);
            }
        }
        return null;
    }

    /** Tells whether {@code name} is a charset name this JVM can decode. */
    private static boolean isUsableCharSet(String name) {
        if (StringUtils.isBlank(name)) {
            return false;
        }
        try {
            return java.nio.charset.Charset.isSupported(name.trim());
        } catch (IllegalArgumentException e) {
            // Syntactically illegal charset name, e.g. picked up from malformed markup.
            return false;
        }
    }

    /**
     * Executes the request and returns the response body as text.
     *
     * <p>The charset is taken from the Content-Type response header; if that is
     * missing or unusable, from a meta tag in the page; otherwise UTF-8 is used.
     * The response is always closed and the request aborted afterwards, also
     * when an error occurs.
     *
     * @param client the client to execute with
     * @param httpRequestBase the request to execute
     * @return the decoded body, or an empty string if an argument is
     *         {@code null} or the request fails (the failure is printed)
     */
    public static String getContent(CloseableHttpClient client, HttpRequestBase httpRequestBase) {
        if (client == null || httpRequestBase == null) {
            System.out.println("Params Is Illegal!");
            return "";
        }
        String pageContent = "";
        try (CloseableHttpResponse response = client.execute(httpRequestBase)) {
            // toByteArray drains and closes the entity stream, which hands the connection back to the pool.
            byte[] body = response.getEntity() == null ? null : EntityUtils.toByteArray(response.getEntity());
            if (body == null) {
                body = new byte[0];
            }
            String charSet = charSet(response);
            if (!isUsableCharSet(charSet)) {
                // ISO-8859-1 maps every byte to one char, so the ASCII markup of the meta tag
                // survives regardless of the real encoding and of the platform default.
                charSet = getCharSet(new String(body, java.nio.charset.StandardCharsets.ISO_8859_1));
            }
            pageContent = new String(body, isUsableCharSet(charSet) ? charSet.trim() : DefaultCharSet);
        } catch (Exception e) {
            System.out.println(e.getMessage());
            e.printStackTrace();
        } finally {
            // Same post-condition as before: the request object is left aborted.
            httpRequestBase.abort();
        }
        return pageContent;
    }

    /**
     * Downloads an image (e.g. a captcha) to {@code filePath} and decodes it.
     *
     * @param client the client to execute with
     * @param url address of the image
     * @param filePath file the raw bytes are written to (created or overwritten)
     * @return the decoded image, or {@code null} if the download fails or the
     *         data cannot be decoded as an image
     */
    public static BufferedImage getImageByNet(CloseableHttpClient client, String url, String filePath) {
        HttpGet get = new HttpGet(url);
        BufferedImage image = null;
        try {
            File f = new File(filePath);
            // All three resources are closed in reverse order even if the copy fails.
            try (CloseableHttpResponse response = client.execute(get);
                    InputStream is = response.getEntity().getContent();
                    FileOutputStream fos = new FileOutputStream(f)) {
                IOUtils.copy(is, fos);
            }
            image = ImageIO.read(f);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            get.abort();
        }
        return image;
    }
}
2.4简单下载器
package com.lhh.request;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.impl.client.CloseableHttpClient;
import com.zhb.ims.utils.httpclient.ClientMethodUtils;
import com.zhb.ims.utils.httpclient.HttpClientManger;
/**
 * Minimal downloader for the demo: fetches a URL with the shared HTTP client
 * and returns the page text.
 */
public class BaseTestRequest {

    /**
     * Downloads {@code url} with an HTTP GET.
     *
     * @param url address to fetch
     * @return the text returned by {@link ClientMethodUtils#getContent}
     */
    public static String getContent(String url) {
        final HttpGet request = new HttpGet(url);
        return ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), request);
    }

    /**
     * Downloads {@code url} with an HTTP POST that carries no body.
     *
     * @param url address to fetch
     * @return the text returned by {@link ClientMethodUtils#getContent}
     */
    public static String postContent(String url) {
        final HttpPost request = new HttpPost(url);
        return ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), request);
    }
}
3、解析器代码
package com.lhh.parse;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import javax.xml.namespace.QName;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringUtils;
import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.DomSerializer;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import com.lhh.util.LoggerUtils;
import com.lhh.util.ObjectUtils;
/**
 * XPath-based HTML parser: cleans a page with HtmlCleaner, converts it to a
 * W3C DOM and evaluates XPath expressions against it.
 */
public class BaseParse {

    /**
     * Flattens an XPath result into one string. Each node contributes its
     * trimmed value with line feeds removed, followed by a single space, so
     * callers can split the result on spaces.
     *
     * <p>Nodes without a node value (for example elements) contribute their
     * text content instead; the previous code threw a NullPointerException on
     * them.
     *
     * @param result a {@link NodeList} or a single {@link Node}
     * @return the concatenated text ("" for an empty list), or {@code null} if
     *         {@code result} is {@code null} or of another type
     */
    public static String getNodeValue(final Object result) {
        if (result == null) {
            LoggerUtils.warn("Result Is Null");
            return null;
        }
        final StringBuilder text = new StringBuilder();
        // NodeList is tested first: some DOM implementations implement both interfaces.
        if (result instanceof NodeList) {
            final NodeList nodeList = (NodeList) result;
            for (int i = 0; i < nodeList.getLength(); i++) {
                appendNodeText(text, nodeList.item(i));
            }
            return text.toString();
        }
        if (result instanceof Node) {
            appendNodeText(text, (Node) result);
            return text.toString();
        }
        LoggerUtils.warn("Result Is Not A Node Or NodeList");
        return null;
    }

    /** Appends the normalised text of one node plus a trailing space; nodes without any text are skipped. */
    private static void appendNodeText(final StringBuilder out, final Node node) {
        if (node == null) {
            return;
        }
        String value = node.getNodeValue();
        if (value == null) {
            value = node.getTextContent();
        }
        if (value == null) {
            return;
        }
        out.append(value.trim().replace("\n", "")).append(' ');
    }

    /** Cleans the HTML and converts it into a DOM document that XPath can be evaluated on. */
    private static Document toDom(final String htmlPage) throws Exception {
        final TagNode tagNode = new HtmlCleaner().clean(htmlPage);
        return new DomSerializer(new CleanerProperties()).createDOM(tagNode);
    }

    /**
     * Builds an object of type {@code clazz} from a page: every map entry names
     * a field of the object and the XPath whose flattened result is stored in it.
     *
     * @param clazz type to instantiate; needs an accessible no-argument constructor
     * @param htmlPage page source
     * @param itemMap field name mapped to XPath expression; entries with a blank
     *        key or expression are skipped with a warning; {@code null} is
     *        treated as empty
     * @return the populated instance
     * @throws Exception if the page cannot be converted, an expression is
     *         invalid, or the object cannot be instantiated
     */
    public static <T> T parseObject(Class<T> clazz, String htmlPage, Map<String, String> itemMap) throws Exception {
        final Document dom = toDom(htmlPage);
        final XPath xPath = XPathFactory.newInstance().newXPath();
        final Map<Object, Object> resultMap = new HashMap<>();
        if (itemMap != null) {
            for (Entry<String, String> entry : itemMap.entrySet()) {
                final String key = entry.getKey();
                final String xpathStr = entry.getValue();
                if (StringUtils.isNotBlank(key) && StringUtils.isNotBlank(xpathStr)) {
                    final Object result = xPath.evaluate(xpathStr, dom, XPathConstants.NODESET);
                    resultMap.put(key, getNodeValue(result));
                } else {
                    LoggerUtils.warn("Key Or Xpath Is Blank!");
                }
            }
        }
        final T t = clazz.getDeclaredConstructor().newInstance();
        ObjectUtils.copyWithMap(t, resultMap);
        return t;
    }

    /**
     * Evaluates a single XPath expression against a page.
     *
     * @param htmlPage page source
     * @param xPathStr XPath expression
     * @param qName requested result type, e.g. {@code XPathConstants.NODESET}
     * @return the evaluation result, or {@code null} if the page or the
     *         expression is blank
     * @throws Exception if the page cannot be converted or the expression is invalid
     */
    public static Object parse(String htmlPage, String xPathStr, QName qName) throws Exception {
        if (StringUtils.isBlank(htmlPage) || StringUtils.isBlank(xPathStr)) {
            // The old text ("Key Or Xpath Is Blank!") named an argument this method does not have.
            LoggerUtils.warn("Html Page Or Xpath Is Blank!");
            return null;
        }
        final XPath xPath = XPathFactory.newInstance().newXPath();
        return xPath.evaluate(xPathStr, toDom(htmlPage), qName);
    }
}
4、用到的工具类
4.1、Logger工具类
package com.lhh.util;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import org.apache.log4j.Logger;
//import org.apache.log4j.Logger;
/**
 * Static logging facade over log4j 1.x. Every call looks up its caller from
 * the current stack trace, logs through the logger named after the calling
 * class and prefixes the message with
 * {@code class.method(File.java:line) - }.
 *
 * @author liuhang
 */
public class LoggerUtils {
    /**
     * Index of the application caller in {@code Thread.getStackTrace()} for the
     * call chain {@code getStackTrace <- getInvokeInfo <- getLoggerWapper <-
     * debug/info/warn/error/fatal <- caller}. Assumes the JVM reports
     * {@code getStackTrace} itself as element 0 (as HotSpot does). It must be
     * adjusted if a frame is added to or removed from that chain.
     */
    private static final int CALLER_STACK_DEPTH = 4;

    /** Everything resolved for one log call: the caller location, the logger and the decorated message. */
    static class LoggerWapper {
        private Logger logger;
        private StackTraceElement stackTraceElement;
        private String methodName;
        private int lineNum;
        private Object message;
        private Object wapperMessage;
        private Class<?> clazz;

        public LoggerWapper(Object message) {
            super();
            this.message = message;
        }

        public Logger getLogger() {
            return logger;
        }

        public void setLogger(Logger logger) {
            this.logger = logger;
        }

        public StackTraceElement getStackTraceElement() {
            return stackTraceElement;
        }

        public void setStackTraceElement(StackTraceElement stackTraceElement) {
            this.stackTraceElement = stackTraceElement;
        }

        public String getMethodName() {
            return methodName;
        }

        public void setMethodName(String methodName) {
            this.methodName = methodName;
        }

        public int getLineNum() {
            return lineNum;
        }

        public void setLineNum(int lineNum) {
            this.lineNum = lineNum;
        }

        public Object getMessage() {
            return message;
        }

        public void setMessage(Object message) {
            this.message = message;
        }

        public Object getWapperMessage() {
            return wapperMessage;
        }

        public void setWapperMessage(Object wapperMessage) {
            this.wapperMessage = wapperMessage;
        }

        public Class<?> getClazz() {
            return clazz;
        }

        public void setClazz(Class<?> clazz) {
            this.clazz = clazz;
        }

        @Override
        public String toString() {
            return "LoggerWapper [logger=" + logger + ", stackTraceElement=" + stackTraceElement + ", methodName="
                    + methodName + ", lineNum=" + lineNum + ", message=" + message + ", wapperMessage=" + wapperMessage
                    + ", clazz=" + clazz + "]";
        }
    }

    /** Best-effort load of the calling class; {@code null} if it is not visible to this class loader. */
    private static Class<?> getInvokeClass(StackTraceElement stackTraceElement) {
        if (stackTraceElement == null) {
            return null;
        }
        try {
            return Class.forName(stackTraceElement.getClassName());
        } catch (ClassNotFoundException | LinkageError e) {
            // Not fatal: the logger is looked up by name and does not need the Class object.
            return null;
        }
    }

    private static String getInvokeMethodName(StackTraceElement stackTraceElement) {
        return stackTraceElement == null ? null : stackTraceElement.getMethodName();
    }

    private static int getInvokeLineNum(StackTraceElement stackTraceElement) {
        return stackTraceElement == null ? 0 : stackTraceElement.getLineNumber();
    }

    /** Name of the source file of the frame; derived from the class name when the frame does not carry one. */
    private static String sourceFileName(StackTraceElement stackTraceElement) {
        String fileName = stackTraceElement.getFileName();
        if (fileName != null) {
            return fileName;
        }
        String className = stackTraceElement.getClassName();
        String simpleName = className.substring(className.lastIndexOf('.') + 1);
        return simpleName.substring(simpleName.lastIndexOf('$') + 1) + ".java";
    }

    /**
     * Prefixes the message with the caller location. The message is always
     * kept; previously it was replaced by an empty string whenever the frame
     * had no line number, and a caller class that could not be loaded caused a
     * NullPointerException.
     *
     * @return the decorated message, or {@code message} itself when no caller
     *         frame is available
     */
    private static Object msgWapper(Object message, StackTraceElement stackTraceElement) {
        if (stackTraceElement == null) {
            return message;
        }
        StringBuilder decorated = new StringBuilder();
        decorated.append(stackTraceElement.getClassName()).append('.').append(getInvokeMethodName(stackTraceElement));
        int lineNum = getInvokeLineNum(stackTraceElement);
        if (lineNum > 0) {
            decorated.append('(').append(sourceFileName(stackTraceElement)).append(':').append(lineNum).append(')');
        }
        decorated.append(" - ").append(message);
        return decorated.toString();
    }

    /**
     * Returns the stack frame at index {@code num} of the current thread, or
     * {@code null} if the index is negative or beyond the trace. No locking is
     * needed: a thread's own stack trace is not shared state.
     */
    private static StackTraceElement getInvokeInfo(int num) {
        if (num < 0) {
            return null;
        }
        StackTraceElement[] stackTraceElements = Thread.currentThread().getStackTrace();
        if (stackTraceElements != null && stackTraceElements.length > num) {
            return stackTraceElements[num];
        }
        return null;
    }

    /** Resolves the caller, its logger and the decorated message for one log call. */
    private static LoggerWapper getLoggerWapper(Object message) {
        LoggerWapper loggerWapper = new LoggerWapper(message);
        StackTraceElement stackTraceElement = getInvokeInfo(CALLER_STACK_DEPTH);
        loggerWapper.setStackTraceElement(stackTraceElement);
        loggerWapper.setClazz(getInvokeClass(stackTraceElement));
        // Looked up by name so that an unloadable caller class cannot break logging;
        // falls back to this class when the caller frame is unavailable.
        Logger logger = stackTraceElement != null
                ? Logger.getLogger(stackTraceElement.getClassName())
                : Logger.getLogger(LoggerUtils.class);
        loggerWapper.setLogger(logger);
        loggerWapper.setMethodName(getInvokeMethodName(stackTraceElement));
        loggerWapper.setLineNum(getInvokeLineNum(stackTraceElement));
        loggerWapper.setWapperMessage(msgWapper(message, stackTraceElement));
        return loggerWapper;
    }

    /** Logs {@code message} at DEBUG level on behalf of the caller. */
    public static void debug(Object message) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().debug(loggerWapper.getWapperMessage());
    }

    /** Logs {@code message} and the stack trace of {@code t} at DEBUG level on behalf of the caller. */
    public static void debug(Object message, Throwable t) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().debug(loggerWapper.getWapperMessage(), t);
    }

    /** Logs {@code message} at ERROR level on behalf of the caller. */
    public static void error(Object message) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().error(loggerWapper.getWapperMessage());
    }

    /** Logs {@code message} and the stack trace of {@code t} at ERROR level on behalf of the caller. */
    public static void error(Object message, Throwable t) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().error(loggerWapper.getWapperMessage(), t);
    }

    /** Logs {@code message} at FATAL level on behalf of the caller. */
    public static void fatal(Object message) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().fatal(loggerWapper.getWapperMessage());
    }

    /** Logs {@code message} and the stack trace of {@code t} at FATAL level on behalf of the caller. */
    public static void fatal(Object message, Throwable t) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().fatal(loggerWapper.getWapperMessage(), t);
    }

    /** Logs {@code message} at INFO level on behalf of the caller. */
    public static void info(Object message) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().info(loggerWapper.getWapperMessage());
    }

    /** Logs {@code message} and the stack trace of {@code t} at INFO level on behalf of the caller. */
    public static void info(Object message, Throwable t) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().info(loggerWapper.getWapperMessage(), t);
    }

    /** Logs {@code message} at WARN level on behalf of the caller. */
    public static void warn(Object message) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().warn(loggerWapper.getWapperMessage());
    }

    /** Logs {@code message} and the stack trace of {@code t} at WARN level on behalf of the caller. */
    public static void warn(Object message, Throwable t) {
        LoggerWapper loggerWapper = getLoggerWapper(message);
        loggerWapper.getLogger().warn(loggerWapper.getWapperMessage(), t);
    }
}
4.2、字符串工具类
package com.lhh.util;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Project string helpers on top of commons-lang {@code StringUtils}.
 */
public class StringUtils extends org.apache.commons.lang.StringUtils {

    /**
     * Returns one capture group of the first match of {@code regex} in
     * {@code str}.
     *
     * @param str text to search
     * @param regex regular expression
     * @param index number of the capture group to return; must be at least 1
     * @return the text of the group for the first match; an empty string if the
     *         expression matches but has fewer than {@code index} groups;
     *         {@code null} if an argument is blank, the index is below 1, or
     *         there is no match
     */
    public static String getRegexIndex(final String str, final String regex, final int index) {
        if (isBlank(regex) || isBlank(str)) {
            LoggerUtils.warn("Str Or Regex Is Blank!");
            return null;
        }
        if (index < 1) {
            LoggerUtils.warn("Index Is Illegal!");
            return null;
        }
        final Matcher firstMatch = Pattern.compile(regex).matcher(str);
        if (!firstMatch.find()) {
            return null;
        }
        if (firstMatch.groupCount() >= index) {
            return firstMatch.group(index);
        }
        LoggerUtils.warn("Index Is OutOfBounds!");
        return "";
    }
}
4.3、对象工具类
package com.lhh.util;
import java.lang.reflect.Field;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
/**
 * Reflection helpers that populate the declared fields of an object.
 */
public class ObjectUtils {

    /**
     * Copies every declared field of {@code r} into the declared field of
     * {@code t} that has the same name. Fields that only exist on one side are
     * left alone, and a field whose value cannot be assigned is skipped. Only
     * fields declared directly on the two classes are considered.
     *
     * @param t target object; nothing happens if {@code null}
     * @param r source object; nothing happens if {@code null}
     */
    public static <T, R> void copy(final T t, final R r) {
        if (t == null || r == null) {
            return;
        }
        final Field[] sourceFields = r.getClass().getDeclaredFields();
        final Field[] targetFields = t.getClass().getDeclaredFields();
        for (final Field sourceField : sourceFields) {
            sourceField.setAccessible(true);
            for (final Field targetField : targetFields) {
                if (!sourceField.getName().equals(targetField.getName())) {
                    continue;
                }
                targetField.setAccessible(true);
                try {
                    targetField.set(t, sourceField.get(r));
                } catch (Exception e) {
                    // Value not assignable: give up on this source field and move to the next one.
                    break;
                }
            }
        }
    }

    /**
     * Stores each map value in the declared field of {@code t} whose name equals
     * the string form of the key. Entries with a {@code null} key or without a
     * matching field are ignored, and a value that cannot be assigned is skipped.
     *
     * @param t target object; nothing happens if {@code null}
     * @param resMap field name mapped to value; nothing happens if {@code null}
     */
    public static <T> void copyWithMap(final T t, final Map<Object, Object> resMap) {
        if (t == null || resMap == null) {
            return;
        }
        final Field[] targetFields = t.getClass().getDeclaredFields();
        for (final Entry<Object, Object> entry : resMap.entrySet()) {
            if (entry == null) {
                continue;
            }
            final Object key = entry.getKey();
            final Object value = entry.getValue();
            if (key == null) {
                continue;
            }
            for (final Field targetField : targetFields) {
                if (!key.toString().equals(targetField.getName())) {
                    continue;
                }
                targetField.setAccessible(true);
                try {
                    targetField.set(t, value);
                } catch (Exception e) {
                    // Value not assignable: give up on this entry and move to the next one.
                    break;
                }
            }
        }
    }
}
5、模型对象
package com.lhh.model;
/**
 * Plain data holder for one scraped news article. All values are kept as the
 * strings extracted from the page.
 */
public class NewModel {
    /** Headline of the article. */
    private String title;
    /** Body text of the article. */
    private String content;
    /** Publication time as it appears on the page. */
    private String time;
    /** Source of the article. */
    private String source;

    /** @return the headline */
    public String getTitle() {
        return this.title;
    }

    /** @param title the headline */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the body text */
    public String getContent() {
        return this.content;
    }

    /** @param content the body text */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the publication time text */
    public String getTime() {
        return this.time;
    }

    /** @param time the publication time text */
    public void setTime(String time) {
        this.time = time;
    }

    /** @return the source */
    public String getSource() {
        return this.source;
    }

    /** @param source the source */
    public void setSource(String source) {
        this.source = source;
    }

    /** Renders all four fields in the form {@code NewModel [title=..., content=..., time=..., source=...]}. */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("NewModel [title=");
        text.append(title);
        text.append(", content=").append(content);
        text.append(", time=").append(time);
        text.append(", source=").append(source);
        text.append("]");
        return text.toString();
    }
}
6、测试
package com.lhh.test;
import java.util.HashMap;
import java.util.Map;
import javax.xml.xpath.XPathConstants;
import org.apache.http.client.methods.HttpGet;
import com.lhh.model.NewModel;
import com.lhh.parse.BaseParse;
import com.lhh.request.BaseTestRequest;
import com.lhh.util.LoggerUtils;
import com.zhb.ims.utils.httpclient.ClientMethodUtils;
import com.zhb.ims.utils.httpclient.HttpClientManger;
public class Test {
public static void main(String[] args) throws Exception {
//新闻列表url
String newListUrl = "http://roll.news.sina.com.cn/s/channel.php?ch=01#col=89&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=60&asc=&page=1";
String newListPage = BaseTestRequest.getContent(newListUrl);
//获取新闻列表页面上新闻Url
String xpath = "//div[@id='d_list']/ul/li/span[@class='c_tit']/a/@href";
Object result = BaseParse.parse(newListPage, xpath, XPathConstants.NODESET);
String urlList = BaseParse.getNodeValue(result);
String [] urlArray = urlList.split(" ");
for (int i = 0; i < urlArray.length; i++) {
Map<String, String> map = new HashMap<>();
//配置新闻标题的xpath
<span style="white-space:pre"> </span>map.put("title", "//*[@id='main_title']/text() | //*[@id='artibodyTitle']/text()");
<span style="white-space:pre"> </span>//配置新闻发布时间的xpath
<span style="white-space:pre"> </span>map.put("time", "//*[@id='page-tools']/span/span[@class='titer']/text() | //*[@id='navtimeSource']/text()");
<span style="white-space:pre"> </span>//配置新闻正文内容的xpath
<span style="white-space:pre"> </span>map.put("content", "//*[@id='artibody']/p/text()");
HttpGet get = new HttpGet(urlArray[i]);
//使用下载器下载页面元素
String page = ClientMethodUtils.getContent(HttpClientManger.newInstance().getClient(), get);
//调用解析取解析页面数据
NewModel weatherPojo = BaseParse.parseObject(NewModel.class, page, map);
LoggerUtils.error(weatherPojo.toString());
}
}
}