以下是全部代码。
其中还测试了 log4j 在非 web 项目中也可以使用(需要 log4j 的配置文件:log4j.properties 或者 log4j.xml)。
在代码中加载配置文件、获得 logger 即可,详细代码如下:
package net.rytong.myspider;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Web crawler used to download the e-book "Gui Chui Deng" page by page.
 *
 * <p>{@link #main(String[])} collects every link found on the index page via
 * {@link #getAUrl(String)}; a fixed number of worker threads then fetch the linked pages and
 * save the plain text of each page (its title tag plus any tag whose id is "content") to a
 * file of its own.
 *
 * <p>All mutable static state is safe to use from the worker threads: the visited set is a
 * synchronized set, the counters are atomic, and log4j is configured under the class lock.
 *
 * @author zhou_dong
 */
public class MyHtmlParser {
    /** Location of the log4j configuration file loaded by {@link #loadLog4j()}. */
    private static final String LOG4J_CONFIG =
            "E:/rytong/myeclipsework/my_spider/config/log4j.properties";
    /** Directory that receives the downloaded text files. */
    private static final String OUTPUT_DIR = "E:/rytong/mytext/test4/";
    /** Character encoding the parser is told to use for the crawled pages. */
    private static final String PAGE_ENCODING = "gb2312";

    // Links whose content has been saved successfully; shared by all worker threads.
    private static Set<String> vistedLinks = Collections.synchronizedSet(new HashSet<String>());
    // Number of downloads that ended in an exception.
    private static final AtomicInteger failureCount = new AtomicInteger();
    // Running index used in output file names; atomic so concurrent workers never reuse a value.
    private static final AtomicInteger chapterIndex = new AtomicInteger();
    // Whether log4j has been configured already; only touched inside loadLog4j().
    private static boolean log4jConfigured = false;

    /**
     * Tells whether the given link has already been downloaded.
     *
     * @param url link to look up
     * @return {@code true} if the link is in the visited set
     */
    private static boolean judgeUrl(String url) {
        return vistedLinks.contains(url);
    }

    /**
     * Returns the logger used by the crawler, configuring log4j from the properties file on
     * the first call only. The original re-read the configuration on every call, which
     * happened once per downloaded page.
     *
     * @return the logger named by the empty string
     */
    public static synchronized Logger loadLog4j() {
        if (!log4jConfigured) {
            PropertyConfigurator.configure(LOG4J_CONFIG);
            log4jConfigured = true;
        }
        return Logger.getLogger("");
    }

    /**
     * Entry point: collects the links of the index page and downloads them with five threads.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String url = "http://www.bxwx.org/b/3/3870/";
        // Collect every link found on the index page.
        final Set<String> aUrl = getAUrl(url);
        System.out.println("总url数:" + aUrl.size());
        // Download with several worker threads.
        myThread(aUrl, 5);
    }

    /**
     * Starts {@code count} worker threads that share the download work.
     *
     * <p>The links are copied into a concurrent queue; each worker repeatedly takes the next
     * link and downloads it, and terminates once the queue is empty. Every link is therefore
     * handled by exactly one worker, and the JVM can exit when all of them are done.
     *
     * @param aUrl  links to download
     * @param count number of worker threads to start
     */
    private static void myThread(final Set<String> aUrl, Integer count) {
        final Queue<String> pending = new ConcurrentLinkedQueue<String>(aUrl);
        for (int i = 0; i < count; i++) {
            Thread thread = new Thread(new Runnable() {
                public void run() {
                    String nextUrl;
                    while ((nextUrl = pending.poll()) != null) {
                        myImportTxt(nextUrl);
                    }
                }
            }, "Thread" + i);
            thread.start();
        }
    }

    /**
     * Downloads one page and saves its text to a new file in the output directory.
     *
     * <p>A link that is already in the visited set is skipped. Any failure is reported on the
     * console and counted, and never propagates, so one bad link cannot kill a worker thread.
     *
     * @param myUrl link of the page to download
     */
    private static void myImportTxt(String myUrl) {
        // Skip links that have been saved already.
        if (judgeUrl(myUrl)) {
            System.out.println("myUrl:" + myUrl + "已移除");
            return;
        }
        try {
            String myText = fetchText(myUrl);
            // The index is taken only after a successful fetch; it reflects download order,
            // which is not necessarily the order of the chapters in the book.
            int y = chapterIndex.incrementAndGet();
            long currentTimeMillis = System.currentTimeMillis();
            String bookTxt = OUTPUT_DIR + "鬼吹灯" + y + "(" + String.valueOf(currentTimeMillis) + ").txt";
            // Bytes are produced with the platform default charset, as before.
            writeFile(bookTxt, myText.getBytes());
            Logger myLogger = loadLog4j();
            myLogger.info(bookTxt + "导入成功");
            // Remember the link only once its content is safely on disk.
            vistedLinks.add(myUrl);
        } catch (Exception e) {
            // Broad catch on purpose: this is the boundary of a worker's unit of work.
            e.printStackTrace();
            failureCount.incrementAndGet();
            System.out.println(myUrl);
        }
    }

    /**
     * Fetches a page and returns the plain text of its title tag and of every tag whose id
     * attribute is "content", each followed by a CRLF line break.
     *
     * @param myUrl link of the page to fetch
     * @return the extracted text
     * @throws ParserException if the page cannot be fetched or parsed
     */
    private static String fetchText(String myUrl) throws ParserException {
        // Filters are created per call so that no filter instance is shared between threads.
        HasAttributeFilter contentFilter = new HasAttributeFilter("id", "content");
        OrFilter divContext = new OrFilter(new NodeClassFilter(TitleTag.class), contentFilter);
        Parser parser = new Parser(myUrl);
        parser.setEncoding(PAGE_ENCODING);
        NodeList list = parser.extractAllNodesThatMatch(divContext);
        StringBuilder text = new StringBuilder();
        for (int i = 0; i < list.size(); i++) {
            text.append(list.elementAt(i).toPlainTextString()).append("\r\n");
        }
        return text.toString();
    }

    /**
     * Writes the bytes to the given file, always closing the stream afterwards.
     *
     * @param path  file to create or overwrite
     * @param bytes content to write
     * @throws IOException if the file cannot be opened, written or closed
     */
    private static void writeFile(String path, byte[] bytes) throws IOException {
        OutputStream output = new FileOutputStream(new File(path));
        try {
            output.write(bytes, 0, bytes.length);
            output.flush();
        } finally {
            output.close();
        }
    }

    /**
     * Collects the targets of all link tags found on the given page.
     *
     * <p>If the page cannot be fetched or parsed, the stack trace is printed and the links
     * gathered so far (possibly none) are returned.
     *
     * @param url address of the page to scan
     * @return the set of link targets; never {@code null}
     */
    @SuppressWarnings("serial")
    public static Set<String> getAUrl(String url) {
        Set<String> myUrls = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            parser.setEncoding(PAGE_ENCODING);
            // Filter for frame tags. Its matches are dropped by the instanceof check below;
            // it is kept only as an example of how to write a custom filter.
            NodeFilter frameFilter = new NodeFilter() {
                public boolean accept(Node node) {
                    String text = node.getText();
                    return text.startsWith("frame src = ");
                }
            };
            OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
            // Extract every node accepted by either filter.
            NodeList nodeList = parser.extractAllNodesThatMatch(linkFilter);
            for (int i = 0; i < nodeList.size(); i++) {
                Node readNode = nodeList.elementAt(i);
                if (readNode instanceof LinkTag) {
                    // Keep only the targets of link tags.
                    String link = ((LinkTag) readNode).getLink();
                    myUrls.add(link);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return myUrls;
    }
}