使用HtmlParser读取论坛图片

最新推荐文章于 2024-06-26 09:25:55 发布

ccjjyy

最新推荐文章于 2024-06-26 09:25:55 发布

阅读量1.9k

点赞数

分类专栏：web测试优化　文章标签：path url string filter buffer null

web测试优化专栏收录该内容

10 篇文章 0 订阅

订阅专栏

心血来潮，写了一个读取论坛图片的程序，能够自动把图片保存到硬盘上，使用的是 HtmlParser 组件。

http://hintcnuie.javaeye.com/blog/172132

package com.chen;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
// HttpGet: forum image downloader built on the HtmlParser library.
/**
 * Crawls forum index pages with the HtmlParser library, follows the
 * {@code .shtml} links found on them and saves every image of the linked
 * pages below a fixed directory on the local disk.
 */
public class HttpGet {
    /** Size in bytes of the buffer used while copying a download to disk. */
    private static final int BUFFER_SIZE = 8096;

    /** Root directory below which all downloaded images are stored. */
    private static final String IMAGE_ROOT =
            "D:" + File.separator + "temp" + File.separator + "images";

    /**
     * Downloads an HTTP resource and stores it as a file.
     *
     * <p>The file is written to {@code IMAGE_ROOT/<title>/<name>} (or directly
     * to {@code IMAGE_ROOT} when {@code title} is null or empty), where
     * {@code <name>} is derived from the last path segment of the URL. Failures
     * are reported on {@code System.err}; nothing is thrown to the caller.
     *
     * @param destUrl address of the resource to download
     * @param title   name of the sub-directory to store the file in; may be
     *                null or empty
     */
    public static void saveToFile(String destUrl, String title) {
        if (destUrl == null || destUrl.length() == 0) {
            System.err.println("saveToFile: empty URL, nothing to download");
            return;
        }
        String fileName = fileNameFromUrl(destUrl);
        if (fileName.length() == 0) {
            // URL ends with '/': there is no file name to save under.
            System.err.println("saveToFile: cannot derive a file name from " + destUrl);
            return;
        }

        System.out.println("title: " + title);
        File dir = new File(IMAGE_ROOT);
        if (null != title && !"".equals(title)) {
            dir = new File(dir, title);
        }
        // Create the directory in every case, not only when a title is given.
        dir.mkdirs();
        if (!dir.isDirectory()) {
            System.err.println("saveToFile: cannot create directory " + dir.getPath());
            return;
        }
        File target = new File(dir, fileName);
        System.out.print("\t" + target.getPath());

        HttpURLConnection httpUrl = null;
        BufferedInputStream bis = null;
        FileOutputStream fos = null;
        boolean saved = false;
        try {
            URL url = new URL(destUrl);
            httpUrl = (HttpURLConnection) url.openConnection();
            httpUrl.connect();
            // Open the network stream first so that a failed request does not
            // leave an empty file behind.
            bis = new BufferedInputStream(httpUrl.getInputStream());
            fos = new FileOutputStream(target);
            byte[] buf = new byte[BUFFER_SIZE];
            int size;
            while ((size = bis.read(buf)) != -1) {
                fos.write(buf, 0, size);
            }
            saved = true;
        } catch (MalformedURLException e) {
            System.err.println("saveToFile: malformed URL " + destUrl + ": " + e);
        } catch (IOException ex) {
            System.err.println("saveToFile: failed to download " + destUrl + ": " + ex);
        } finally {
            // Release everything on both the success and the failure path.
            if (fos != null) {
                try {
                    fos.close();
                } catch (IOException closeError) {
                    // A failed close may mean the data never reached the disk.
                    saved = false;
                    System.err.println("saveToFile: failed to close "
                            + target.getPath() + ": " + closeError);
                }
            }
            if (bis != null) {
                try {
                    bis.close();
                } catch (IOException ignored) {
                    // The input has been fully read or has already failed;
                    // nothing more can be done here.
                }
            }
            if (httpUrl != null) {
                httpUrl.disconnect();
            }
        }
        if (saved) {
            System.out.println(" \tsave completely");
        } else {
            // Terminate the line started by the path print above.
            System.out.println();
        }
    }

    /**
     * Derives a local file name from a URL: the text after the last '/', or
     * (when the URL has no '/') its last ten characters at most. Characters
     * that are not allowed in Windows file names are replaced by '_'.
     */
    private static String fileNameFromUrl(String destUrl) {
        int slash = destUrl.lastIndexOf('/');
        String name;
        if (slash != -1) {
            name = destUrl.substring(slash + 1);
        } else {
            // Guard against URLs shorter than ten characters.
            name = destUrl.substring(Math.max(0, destUrl.length() - 10));
        }
        return name.replaceAll("[\\\\/:*?\"<>|]", "_");
    }

    /**
     * Entry point: crawls the numbered index pages 2..105 and then the start
     * page. A page that fails is reported and skipped; the remaining pages are
     * still processed.
     *
     * @param argv command-line arguments (unused)
     */
    public static void main(String argv[]) {
        String url = "http://xxx.com";
        for (int i = 2; i <= 105; i++) {
            crawlPage("http://xx.com/html/13/13_" + i + ".shtml");
        }
        crawlPage(url);
    }

    /** Processes one index page and reports, rather than propagates, any failure. */
    private static void crawlPage(String page) {
        try {
            getPageLinks(page);
        } catch (Exception e) {
            System.err.println("failed to process page " + page + ": " + e);
        }
    }

    /**
     * Parses an index page and, for every anchor whose target ends with
     * {@code .shtml} and whose cleaned-up text is longer than two characters,
     * downloads the images of the linked page into a directory named after
     * that text.
     *
     * @param page URL of the index page
     * @throws ParserException if the index page cannot be fetched or parsed
     */
    private static void getPageLinks(String page) throws ParserException {
        Parser myParser = new Parser(page);
        myParser.setEncoding("UTF-8");
        NodeFilter filter = new TagNameFilter("a");
        NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
        System.out.println("size: " + nodeList.size());
        for (int i = 0; i < nodeList.size(); i++) {
            Object node = nodeList.elementAt(i);
            if (!(node instanceof LinkTag)) {
                continue;
            }
            LinkTag linkTag = (LinkTag) node;
            String link = linkTag.getLink();
            String text = TextProcess(linkTag.getLinkText());
            if (link != null && link.endsWith(".shtml") && text.length() > 2) {
                try {
                    getImages(link, text);
                } catch (Exception e) {
                    // One broken topic page must not stop the other links.
                    System.err.println("failed to read images from " + link + ": " + e);
                }
            }
        }
    }

    /**
     * Turns a link text into a string usable as a directory name: strips
     * markup and separator characters, drops everything up to and including a
     * ":-" marker, cuts at the first full-width or ASCII dash and removes dots
     * and commas.
     *
     * @param text raw link text; null is treated as empty
     * @return the cleaned text, never null
     */
    private static String TextProcess(String text) {
        if (text == null) {
            return "";
        }
        text = text.trim();
        // Plain literal replacement; no regular expressions are needed here.
        text = text.replace(">", "");
        text = text.replace("<", "");
        text = text.replace("/", "");
        text = text.replace(" ", "");
        int pos = text.indexOf(":-");
        if (pos != -1) {
            text = text.substring(pos + 2);
        }
        pos = text.indexOf("－");
        if (pos != -1) {
            text = text.substring(0, pos);
        }
        pos = text.indexOf("-");
        if (pos != -1) {
            text = text.substring(0, pos);
        }
        text = text.replace(".", "");
        text = text.replace(",", "");
        text = text.replace("，", "");
        return text;
    }

    /**
     * Reads the title of a page. Not called by {@link #main(String[])} at
     * present.
     *
     * @param url URL of the page
     * @return the title, or null unless the page has exactly one title tag
     * @throws ParserException if the page cannot be fetched or parsed
     */
    private static String getTitle(String url) throws ParserException {
        Parser myParser = new Parser(url);
        myParser.setEncoding("UTF-8");
        NodeFilter titleFilter = new TagNameFilter("title");
        NodeList titleList = myParser.extractAllNodesThatMatch(titleFilter);
        String title = null;
        if (titleList.size() == 1) {
            TitleTag titleT = (TitleTag) titleList.elementAt(0);
            title = titleT.getTitle();
        }
        return title;
    }

    /**
     * Downloads every distinct image referenced by a page.
     *
     * @param resource URL of the page to scan for {@code img} tags
     * @param title    sub-directory name handed on to
     *                 {@link #saveToFile(String, String)}
     * @throws Exception if the page cannot be fetched or parsed
     */
    public static void getImages(String resource, String title)
            throws Exception {
        // URLs already handled on this page, so repeated images are saved once.
        Set<String> imagesSet = new HashSet<String>();
        Parser myParser = new Parser(resource);
        myParser.setEncoding("UTF-8");
        NodeFilter filter = new TagNameFilter("img");
        NodeList nodeList = myParser.extractAllNodesThatMatch(filter);
        System.out.println("size: " + nodeList.size());
        for (int i = 0; i < nodeList.size(); i++) {
            Object node = nodeList.elementAt(i);
            if (!(node instanceof ImageTag)) {
                continue;
            }
            String imageUrl = ((ImageTag) node).getImageURL();
            System.out.println("iamge " + i + ": " + imageUrl);
            // add() returns false when the URL was seen before; the set must be
            // updated here or the duplicate check can never succeed.
            if (imagesSet.add(imageUrl)) {
                System.out.print("\t saving ...");
                saveToFile(imageUrl, title);
            } else {
                System.out.print("\t exist already,no need to save");
            }
        }
    }
}

ccjjyy

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录