网上看到很多 Python 写的爬虫,不得不说 Python 很强大。
为了加深对 Java 的了解,我也用 Java 写了一个。嘎嘎。
废话少说,上代码。
package com.app;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class URLClient {
StringBuilder sb = new StringBuilder();
/**
* 连接到服务器,并请求给定的文档
*
* @param urlString
* :文档地址
* @return:String
*/
public String getDocumentAt(String urlString) {
try {
URL url = new URL(urlString);// 生成url对象
URLConnection urlConnection = url.openConnection();// 打开url连接
BufferedReader br = new BufferedReader(new InputStreamReader(
urlConnection.getInputStream()));
String line = null;
while ((line = br.readLine()) != null) {
sb.append(line + "\n");
}
} catch (MalformedURLException e) {
System.out.println("不能连接到URL:" + urlString);
e.printStackTrace();
} catch (IOException e) {
System.out.println("连接到URL抛出异常信息:" + urlString);
e.printStackTrace();
}
return sb.toString();
}
/**
* @param args
*/
public static List<String> result(String url,String match ){
al=new ArrayList<String>();
//Pattern pattern2 = Pattern.compile("<span id=\"text110\">([\\w\\W]*?)</span>");
//Pattern pattern = Pattern.compile("<a href=\"(.*?)\"target=\"_blank\" >");
Pattern pattern = Pattern.compile(match);
Matcher matcher = pattern.matcher(url);
String buffer =new String() ;
while(matcher.find()){
buffer=matcher.group(1);
// String[] s=buffer.split("<P>");
// System.out.println(""+buffer.toString()+"/r/n");
String fuck=buffer.replaceAll("<P>|</P>|<BR>","");
al.add(fuck);
}
return al;
}
/**
* 写入文件
*
* @param str 是写入的笑话
*
* @param path 是写入的路径
*/
public static void write(String str,String path) throws Exception{
File f=new File(path);
FileOutputStream in = new FileOutputStream(f,true); //追加方式写入。
// 向文件写入内容(输出流)
byte bt[] = new byte[1024];
bt = str.getBytes();
in.write(bt, 0, bt.length);
in.close();
}
static List<String> al;
static List<String> JokeList;
public static void main(String[] args) throws Exception {
URLClient client = new URLClient();
String url = client.getDocumentAt("http://www.jokeji.cn/list.htm");
String home = "http://www.jokeji.cn/";
// System.out.println(url);
al=result(url, "<a href=\"(.*?)\"target=\"_blank\" >");
for(String link:al){
String xiaohua=client.getDocumentAt(home+link);//得到笑话的链接。
System.out.println(link+"/n");
JokeList=result(xiaohua, "<span id=\"text110\">([\\w\\W]*?)</span>");//提取笑话
}
for(String joke: JokeList){
//写入文件
write(""+joke+"\n\n","E://xiaohua.txt");
//System.out.println(joke+"/n");
}
}
}
</pre><pre>
// thanks... 每天进步一点点,一周就是一大步。嘎嘎。