cyberneko和dom4j解析html

最新推荐文章于 2022-08-26 17:20:54 发布

yangxiaojun9238

最新推荐文章于 2022-08-26 17:20:54 发布

阅读量1k

点赞数

闲来无事，用 CyberNeko 和 dom4j 写了一个解析 HTML 页面中图片的小程序，主要代码如下：

Java代码

/**
 * Demo driver: loads the iteye.com front page, collects the {@code src}
 * attribute of every {@code IMG} element and hands the resulting paths to
 * {@link #insertPhoto} together with a connection from {@code getConn()}.
 */
public void testPaseHtml(){
    final String pageUrl = "http://www.iteye.com/";
    // The "xmlns" prefix is bound to the XHTML namespace inside getAttr.
    final String imgSrcXPath = "//xmlns:IMG/@src";

    Document page = this.getDoc(pageUrl);
    List srcAttributes = this.getAttr(page, imgSrcXPath);
    String[] photoUrl = this.getUrl(srcAttributes);

    this.insertPhoto(this.getConn(), photoUrl);
}
/**
 * Evaluates an XPath expression against a document and returns the matches.
 * The prefix {@code xmlns} is bound to the XHTML namespace
 * ({@code http://www.w3.org/1999/xhtml}) before evaluation, so expressions
 * may use it to address XHTML elements.
 *
 * @param document the document to search
 * @param gz the XPath expression to evaluate
 * @return the nodes selected by the expression
 */
public List getAttr(Document document,String gz){
    Map<String, String> prefixToUri = new HashMap<String, String>();
    prefixToUri.put("xmlns","http://www.w3.org/1999/xhtml");

    XPath expression = new DefaultXPath(gz);
    expression.setNamespaceContext(new SimpleNamespaceContext(prefixToUri));
    return expression.selectNodes(document);
}
/**
 * Parses the HTML at the given location and converts the resulting W3C DOM
 * into a dom4j {@code Document} via {@code DOMReader}.
 *
 * <p>A parse failure is no longer printed and ignored: continuing after a
 * failed parse meant converting whatever the parser happened to hold
 * (possibly nothing), so the failure is now reported immediately with the
 * original exception attached as the cause.
 *
 * @param url the location passed to the parser's {@code parse(String)}
 * @return the dom4j document built from the parsed page
 * @throws IllegalStateException if the page cannot be read or parsed
 */
public Document getDoc(String url){
    DOMParser parser = new DOMParser();
    try {
        parser.parse(url);
    } catch (SAXException e) {
        throw new IllegalStateException("Failed to parse HTML from " + url, e);
    } catch (IOException e) {
        throw new IllegalStateException("Failed to read HTML from " + url, e);
    }
    org.w3c.dom.Document w3cDocument = parser.getDocument();
    return new DOMReader().read(w3cDocument);
}
/**
 * Extracts the value of every attribute in the list, preserving order.
 *
 * @param nodes list whose elements are cast to {@code Attribute}; may be null
 * @return one value per list element, or {@code null} when {@code nodes} is null
 */
public String[] getUrl(List nodes){
    if (nodes == null) {
        return null;
    }
    String[] values = new String[nodes.size()];
    int next = 0;
    for (Object node : nodes) {
        values[next++] = ((Attribute) node).getValue();
    }
    return values;
}
/**
 * Opens a stream for reading the resource at the given URL.
 *
 * <p>Every failure (malformed URL, connection error, error opening the
 * stream) is reported on stderr and results in {@code null}; previously a
 * malformed URL or a failed {@code openConnection()} was printed and then
 * dereferenced, ending in a NullPointerException. Output is no longer
 * enabled on the connection because nothing is ever written to it.
 *
 * @param path absolute URL of the resource
 * @return an open stream the caller must close, or {@code null} on failure
 */
public InputStream getInput(String path){
    try {
        URLConnection conn = new URL(path).openConnection();
        return conn.getInputStream();
    } catch (MalformedURLException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}
/**
 * Downloads each image and inserts its bytes into the {@code photo} table
 * as one batch inside a single transaction.
 *
 * <p>Null or empty entries and images that cannot be downloaded are
 * skipped. On a database error the transaction is rolled back. The
 * statement and the connection are always closed before returning, so the
 * caller must not reuse {@code conn} afterwards.
 *
 * @param conn open connection; closed by this method
 * @param photoUrl image paths, resolved against {@code http://www.iteye.com/};
 *                 {@code null} is treated as an empty array
 */
public void insertPhoto(Connection conn,String[] photoUrl){
    String sql = "insert into photo set photo = ?";
    PreparedStatement ps = null;
    try {
        conn.setAutoCommit(false);
        ps = conn.prepareStatement(sql);
        int count = (photoUrl == null) ? 0 : photoUrl.length;
        for (int i = 0; i < count; i++) {
            String src = photoUrl[i];
            if (src == null || "".equals(src)) {
                continue;
            }
            byte[] image = this.readPhoto(src);
            if (image == null) {
                continue;
            }
            // Bind fully buffered bytes with their exact length: available()
            // is not the total size of a network stream, and a live stream
            // closed before executeBatch() may no longer be readable by then.
            ps.setBinaryStream(1, new java.io.ByteArrayInputStream(image), image.length);
            ps.addBatch();
        }
        ps.executeBatch();
        conn.commit();
    } catch (SQLException e) {
        e.printStackTrace();
        try {
            conn.rollback();
        } catch (SQLException ignored) {
            // Nothing more can be done; the original failure is already reported.
        }
    } finally {
        if (ps != null) {
            try {
                ps.close();
            } catch (SQLException ignored) {
                // Best-effort cleanup.
            }
        }
        try {
            conn.close();
        } catch (SQLException ignored) {
            // Best-effort cleanup.
        }
    }
}

/**
 * Resolves an image path against the site root and reads the whole image
 * into memory. Resolving (rather than concatenating) keeps absolute and
 * root-relative paths intact.
 *
 * @return the image bytes, or {@code null} if the image could not be read
 */
private byte[] readPhoto(String src){
    InputStream in = null;
    try {
        String absolute = new URL(new URL("http://www.iteye.com/"), src).toString();
        in = this.getInput(absolute);
        if (in == null) {
            return null;
        }
        java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int read;
        while ((read = in.read(chunk)) != -1) {
            buffer.write(chunk, 0, read);
        }
        return buffer.toByteArray();
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    } finally {
        if (in != null) {
            try {
                in.close();
            } catch (IOException ignored) {
                // Best-effort cleanup.
            }
        }
    }
}

yangxiaojun9238

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
cyberneko和dom4j解析html

闲着无聊用cyberneko和dom4j写了个解析html图片的小程序，在这里贴出主要代码如下Java代码 public void testPaseHtml(){ String url = "http://www.iteye.com/"; Document document = this.getDoc(
复制链接

扫一扫