// Java实现网络爬虫比较麻烦,还是Python比较好用一些。这里放一些简单的代码,没有实现防爬虫网站的爬取,仅作为简单参考。
// 如果想了解爬虫,建议读一下《Python网络爬虫》等书。
package com.get.image;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal image crawler example: fetches the HTML of one fixed page,
 * extracts {@code <img>} tag src URLs with regular expressions, and saves
 * each image as a numbered .jpg under {@code D:/image}.
 *
 * <p>Reference code only — no anti-crawler handling, no robots.txt support.
 */
public class Main extends Thread {
    /** Page to crawl (query string is GBK percent-encoded). */
    private static final String URL =
            "http://tieba.baidu.com/f?kw=%C9%BD%B6%AB%D3%A2%B2%C5%D1%A7%D4%BA&fr=ala0&tpl=5";

    /** Matches a whole {@code <img ...>} tag. */
    private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";

    /**
     * Matches a scheme://... URL (up to the next whitespace) inside a tag.
     * Fixed: the original used {@code [a-zA-z]}, whose A–z range also matches
     * the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `).
     */
    private static final String IMGSRC_REG = "[a-zA-Z]+://[^\\s]*";

    // Patterns are immutable and thread-safe: compile once, not per call.
    private static final Pattern IMGURL_PATTERN = Pattern.compile(IMGURL_REG);
    private static final Pattern IMGSRC_PATTERN = Pattern.compile(IMGSRC_REG);

    public static void main(String[] args) {
        new Main().start();
    }

    /**
     * Crawl pipeline: fetch HTML, extract img tags, extract src URLs,
     * download. (The original created a second Main instance and slept 10 s
     * after finishing — both served no purpose and were removed.)
     */
    @Override
    public void run() {
        try {
            // 获得html文本内容
            String html = getHtml(URL);
            // 获取图片标签
            List<String> imgTags = getImageUrl(html);
            // 获取图片src地址
            List<String> imgSrc = getImageSrc(imgTags);
            // 下载图片
            download(imgSrc);
        } catch (Exception e) {
            // Keep the stack trace instead of a bare "an error occurred" line.
            e.printStackTrace();
        }
    }

    /**
     * Downloads the page at {@code url} and returns its body as text,
     * one '\n' appended per line.
     *
     * <p>NOTE(review): decodes with the platform default charset, as the
     * original did; the target page is presumably GBK — confirm before
     * hard-coding a charset.
     *
     * @param url page address
     * @return page body text
     * @throws IOException if the connection or read fails (the original
     *         swallowed this and then NPE'd on a null StringBuffer)
     */
    static String getHtml(String url) throws IOException {
        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the whole reader chain even on failure.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new URL(url).openStream()))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Returns every complete {@code <img ...>} tag found in {@code html}.
     *
     * @param html page text to scan
     * @return matched img tags, in document order (possibly empty)
     */
    static List<String> getImageUrl(String html) {
        List<String> tags = new ArrayList<String>();
        Matcher matcher = IMGURL_PATTERN.matcher(html);
        while (matcher.find()) {
            tags.add(matcher.group());
        }
        return tags;
    }

    /**
     * Extracts the src URL from each img tag.
     *
     * @param imgTags img tags as returned by {@link #getImageUrl}
     * @return one URL per match (possibly empty)
     */
    static List<String> getImageSrc(List<String> imgTags) {
        List<String> srcs = new ArrayList<String>();
        for (String tag : imgTags) {
            Matcher matcher = IMGSRC_PATTERN.matcher(tag);
            while (matcher.find()) {
                String match = matcher.group();
                // [^\s]* also consumes the closing attribute quote; drop the
                // final character to recover the bare URL (same trimming as
                // the original code).
                srcs.add(match.substring(0, match.length() - 1));
            }
        }
        return srcs;
    }

    /**
     * Downloads each URL into {@code D:/image} as {@code 1.jpg, 2.jpg, ...},
     * skipping files that already exist. A failure on one image no longer
     * aborts the remaining downloads.
     *
     * @param listImgSrc image URLs to fetch
     */
    private void download(List<String> listImgSrc) {
        File dir = new File("D:/image");
        if (!dir.exists()) {
            dir.mkdirs();
            // Only report creation when we actually created it.
            System.out.println(dir + ":创建成功");
        }
        int num = 1;
        long beginMillis = System.currentTimeMillis();
        for (String url : listImgSrc) {
            long fileBeginMillis = System.currentTimeMillis();
            String imageName = url.substring(url.lastIndexOf('/') + 1);
            File file = new File(dir, (num++) + ".jpg");
            if (file.exists()) {
                System.out.println(file + "已存在");
                continue;
            }
            System.out.println("开始下载:" + url);
            // try-with-resources: streams are closed even if the copy fails
            // (the original leaked both streams on any IOException).
            try (BufferedInputStream biStream =
                         new BufferedInputStream(new URL(url).openStream());
                 BufferedOutputStream boStream =
                         new BufferedOutputStream(new FileOutputStream(file))) {
                byte[] buf = new byte[1024];
                int length;
                while ((length = biStream.read(buf)) != -1) {
                    boStream.write(buf, 0, length);
                }
                System.out.println(imageName + "下载完成");
                double seconds = (System.currentTimeMillis() - fileBeginMillis) / 1000.0;
                System.out.println("耗时:" + seconds + "s");
            } catch (IOException e) {
                // Report and move on to the next image.
                e.printStackTrace();
            }
        }
        double totalSeconds = (System.currentTimeMillis() - beginMillis) / 1000.0;
        System.out.println("总耗时:" + totalSeconds + "s");
    }
}