java

2 篇文章 0 订阅
1 篇文章 0 订阅

package xml;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.net.HttpURLConnection;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
import java.util.Map.Entry;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.w3c.dom.Document;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

import xml.util.EscapeChars;

/**
 * Small command-line demo that parses a locally saved page ({@code forum.txt}),
 * locates the elements whose {@code class} attribute is {@code wenda_con},
 * downloads every image referenced inside them into the {@code image/}
 * directory, and prints the serialized content with each original image
 * {@code src} value replaced by the local file name.
 */
public class TestXpathFromXml {

    /** XPath selecting the content container element(s). */
    private static final String CONTENT_XPATH = "//*[@class=\"wenda_con\"]";

    /**
     * XPath selecting the images inside the content container. The
     * {@code [@src]} predicate leaves out {@code <img>} elements that carry no
     * {@code src} attribute, which would otherwise cause a NullPointerException.
     */
    private static final String IMAGE_XPATH = CONTENT_XPATH + "/descendant::img[@src]";

    /** URI of the page the document was saved from; relative image paths are resolved against it. */
    private static final String BASE_PAGE_URI = "http://www.XXXX.com/xx.html";

    /** Connect and read timeout for image downloads, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10 * 1000;

    /**
     * Entry point: parse the page, download its images, print the rewritten content.
     *
     * @param args ignored
     * @throws ParserConfigurationException if no XML parser can be created
     * @throws SAXException                 if {@code forum.txt} is not well-formed
     * @throws IOException                  if the input cannot be read or a download fails
     * @throws URISyntaxException           if the base page URI is invalid
     * @throws TransformerException         if the content nodes cannot be serialized
     */
    public static void main(String[] args) throws ParserConfigurationException,
            SAXException, IOException, URISyntaxException, TransformerException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // Parse namespace-aware; the XPath below gets a namespace context built from this document.
        factory.setNamespaceAware(true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        Document doc = builder.parse("forum.txt");

        XPath xpath = XPathFactory.newInstance().newXPath();
        xpath.setNamespaceContext(new UniversalNamespaceResolver(doc));

        // original src value -> local file name
        Map<String, String> imageSrcMap = new HashMap<String, String>();
        // original src value -> absolute download URL
        Map<String, String> abimageMap = new HashMap<String, String>();
        try {
            collectImages(doc, xpath, imageSrcMap, abimageMap);

            // Download each distinct image once.
            for (Entry<String, String> image : abimageMap.entrySet()) {
                String localFile = imageSrcMap.get(image.getKey());
                System.out.println("get image:" + image.getKey() + "\n\t" + localFile);
                fetchContentByJDKConnection(image.getValue(), localFile);
            }

            System.out.println(renderContent(doc, xpath, imageSrcMap));
        } catch (XPathExpressionException e) {
            e.printStackTrace();
        }
    }

    /**
     * Finds the images inside the content container and fills both maps, keyed
     * by the image's original {@code src} value. Empty, repeated, unparsable and
     * non-HTTP(S) sources are skipped.
     *
     * @param doc         parsed page
     * @param xpath       XPath evaluator to use
     * @param imageSrcMap receives original src -> generated local file name
     * @param abimageMap  receives original src -> absolute URL to download
     */
    private static void collectImages(Document doc, XPath xpath,
            Map<String, String> imageSrcMap, Map<String, String> abimageMap)
            throws XPathExpressionException, URISyntaxException, IOException {
        XPathExpression expr = xpath.compile(IMAGE_XPATH);
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);
        URI base = new URI(BASE_PAGE_URI);

        for (int i = 0; i < nodes.getLength(); i++) {
            String imagesrc = nodes.item(i).getAttributes().getNamedItem("src").getNodeValue();
            // An empty src would resolve to the page itself, and an empty key would
            // later be "found" between every character of the content.
            if (imagesrc.trim().length() == 0 || imageSrcMap.containsKey(imagesrc)) {
                continue;
            }

            URI abs;
            try {
                // Resolve the (possibly relative) src against the page URI.
                abs = base.resolve(imagesrc);
            } catch (IllegalArgumentException e) {
                System.err.println("skip image, src is not a valid URI: " + imagesrc);
                continue;
            }

            // The downloader casts to HttpURLConnection, so only http/https can be fetched
            // (this also filters out inline data: images).
            String scheme = abs.getScheme();
            if (!"http".equalsIgnoreCase(scheme) && !"https".equalsIgnoreCase(scheme)) {
                System.err.println("skip image, unsupported scheme: " + imagesrc);
                continue;
            }

            URL absURL = abs.toURL();
            String fileName = "image/" + UUID.randomUUID().toString().replace("-", "") + ".jpg";
            imageSrcMap.put(imagesrc, fileName);
            abimageMap.put(imagesrc, absURL.toString());
        }
    }

    /**
     * Serializes the content container element(s), unescapes XML entities in the
     * result and substitutes every known image src with its local file name.
     *
     * @param doc         parsed page
     * @param xpath       XPath evaluator to use
     * @param imageSrcMap original src -> local file name
     * @return the rewritten content markup
     */
    private static String renderContent(Document doc, XPath xpath,
            Map<String, String> imageSrcMap)
            throws XPathExpressionException, TransformerException {
        XPathExpression expr = xpath.compile(CONTENT_XPATH);
        NodeList nodes = (NodeList) expr.evaluate(doc, XPathConstants.NODESET);

        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        // Set once: the property does not change between nodes.
        transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");

        StringWriter buffer = new StringWriter();
        for (int i = 0; i < nodes.getLength(); i++) {
            transformer.transform(new DOMSource(nodes.item(i)), new StreamResult(buffer));
        }

        String content = StringEscapeUtils.unescapeXml(buffer.toString());
        for (Entry<String, String> entry : imageSrcMap.entrySet()) {
            // Literal (non-regex) substitution: neither the src nor the file name
            // is interpreted as a pattern or as a replacement template.
            content = content.replace(entry.getKey(), entry.getValue());
        }
        return content;
    }

    /**
     * Downloads {@code contentUrl} over HTTP(S) and writes the response body to
     * {@code fileName}, creating the parent directory when it does not exist yet.
     * When the server reports the resource as missing, a message is written to
     * standard error and no file is created.
     *
     * @param contentUrl absolute http/https URL to fetch
     * @param fileName   path of the file to write
     * @throws IOException if connecting, reading or writing fails
     */
    private static void fetchContentByJDKConnection(String contentUrl, String fileName) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(contentUrl).openConnection();
        // Bound both phases so an unreachable or stalled host cannot hang the program.
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        try {
            connection.connect();

            // The request is actually sent here.
            InputStream input;
            try {
                input = connection.getInputStream();
            } catch (FileNotFoundException e) {
                System.err.println(contentUrl + " is not found.");
                return;
            }

            OutputStream output = null;
            try {
                File file = new File(fileName);
                File parent = file.getParentFile();
                if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
                    throw new IOException("Cannot create directory " + parent);
                }
                output = new FileOutputStream(file);
                IOUtils.copy(input, output);
                // Close explicitly on the success path so a failing flush/close is reported.
                output.close();
            } finally {
                // Guarantee both streams are released, also on failure.
                IOUtils.closeQuietly(output);
                IOUtils.closeQuietly(input);
            }
        } finally {
            connection.disconnect();
        }
    }
}

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值