Web Crawler: Batch Image Download

Lately I keep seeing crawler tools posted in open-source communities for downloading images from various Tieba boards. Most of them are written in Python, so I followed the trend and wrote a basic Java crawler of my own for downloading images from Baidu Tieba.

What it does: it crawls Tieba pages with a breadth-first traversal. I originally planned to store the crawl results in a database, but since I wanted to keep things simple for now, I just hold them in memory with an ArrayList and a HashSet, which is only suitable for small-scale crawls. The downloaded images are saved to the downloadImage folder under the project directory.
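The crawl itself is just the standard breadth-first pattern: a frontier of unvisited URLs plus a set of already-seen ones. As a rough conceptual sketch only (the class and variable names here are illustrative, not the ones used in the actual program below):

package spider;

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

public class BfsSketch {
    public static void main(String[] args) {
        Queue<String> frontier = new ArrayDeque<String>(); // URLs not yet visited
        Set<String> visited = new HashSet<String>();       // URLs already seen

        String seed = "http://tieba.baidu.com/f?kw=example"; // hypothetical seed URL
        frontier.add(seed);
        visited.add(seed);

        while (!frontier.isEmpty() && visited.size() < 300) {
            String url = frontier.poll();
            // 1. fetch the page body for url
            // 2. extract image links from the body and download them
            // 3. extract new page links; enqueue only the ones not seen before:
            //    if (visited.add(newUrl)) frontier.add(newUrl);
        }
    }
}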

The main program code is as follows:

package spider;

import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;


public class SpiderOne {
    ArrayList<String> noViewURL;    // BFS frontier: URLs not yet visited
    HashSet<String> alreadyViewURL; // URLs already seen, to avoid revisiting
    String webBody;                 // HTML body of the page currently being processed
    public static String tieba = "http://tieba.baidu.com";
    public static int maxNum = 300; // upper bound on the frontier size

    public SpiderOne(String orignURL) {
        noViewURL = new ArrayList<String>();
        alreadyViewURL = new HashSet<String>();
        alreadyViewURL.add("http://tieba.baidu.com/f?kw=%BD%E3%CD%D1");
        noViewURL.add(orignURL);
    }

    public void analysisWeb() {
        while (noViewURL.size() > 0) {
            System.out.println("Reading page...");
            try {
                readWebSecond(noViewURL.remove(0));
            } catch (Exception e) {
                e.printStackTrace();
            }
            // If no page body could be read, skip this URL
            if (webBody == null) continue;
            System.out.println("Page read.");
            System.out.println("Extracting useful URLs...");
            analysisURL();
            System.out.println("URL extraction done.");
            System.out.println("Downloading images...");
            DownloadImage();
            System.out.println("Images for this page downloaded.");
        }
    }
    /**
     * Extract useful URLs (thread links and board links) from the page body.
     */
    public void analysisURL() {
        int beginPosition = 0;
        int endPosition = 0;
        String oneUrl = "";
        // Thread links look like: a href="/p/12345"
        while (true) {
            if (noViewURL.size() > maxNum) break;
            beginPosition = webBody.indexOf("a href=\"/p/", endPosition);
            if (beginPosition == -1) break;
            endPosition = webBody.indexOf("\"", beginPosition + 10);
            if (endPosition == -1) break;
            oneUrl = tieba + webBody.substring(beginPosition + 8, endPosition);
            if (oneUrl.indexOf("?") == -1 && !alreadyViewURL.contains(oneUrl)) {
                noViewURL.add(oneUrl);
                alreadyViewURL.add(oneUrl);
                System.out.println("Stored URL: " + oneUrl);
            }
        }
        // Board links look like: a href="/f?kw=..."
        beginPosition = 0;
        endPosition = 0;
        while (true) {
            if (noViewURL.size() > maxNum) break;
            beginPosition = webBody.indexOf("a href=\"/f?kw=", endPosition);
            if (beginPosition == -1) break;
            endPosition = webBody.indexOf("\"", beginPosition + 14);
            if (endPosition == -1) break;
            oneUrl = tieba + webBody.substring(beginPosition + 8, endPosition);
            if (!alreadyViewURL.contains(oneUrl)) {
                noViewURL.add(oneUrl);
                alreadyViewURL.add(oneUrl);
                System.out.println("Stored board URL: " + oneUrl);
            }
        }
    }
    /**
     * Download the images referenced in the current page body.
     */
    synchronized public void DownloadImage() {
        int beginPosition = 0;
        int endPosition = 0;
        String oneUrl = "";
        // Image links look like: http://imgsrc.baidu.com/...
        while (true) {
            beginPosition = webBody.indexOf("http://imgsrc.", endPosition);
            if (beginPosition == -1) break;
            endPosition = webBody.indexOf("\"", beginPosition + 14);
            if (endPosition == -1) break;
            oneUrl = webBody.substring(beginPosition, endPosition);
            // Download each image in its own thread
            DownloadFile df = new DownloadFile(oneUrl);
            Thread thread = new Thread(df);
            thread.start();
            // Crude throttling: wait while more than 10 download tasks are alive
            while (DownloadFile.number > 10) {
                System.out.println("Too many threads, waiting...");
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Fetch a page, detect its charset from the response headers,
     * and store the decoded HTML in webBody.
     */
    public String readWebSecond(String url) throws Exception {
        System.out.println("Reading URL: " + url);
        // In-memory buffer for the page bytes; grows automatically
        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        // Open the connection and get the page input stream
        URL startUrl = new URL(url);
        URLConnection urlConnection = startUrl.openConnection();
        InputStream is = urlConnection.getInputStream();

        // Determine the charset from the Content-Type response header
        String strencoding = "";
        Map<String, List<String>> map = urlConnection.getHeaderFields();
        Set<String> keys = map.keySet();
        Iterator<String> iterator = keys.iterator();
        while (iterator.hasNext()) {
            String key = iterator.next();
            if (key != null && key.equals("Content-Type")) {
                String tmp = map.get(key).toString().toLowerCase();
                int m = tmp.indexOf("charset=");
                if (m != -1) {
                    strencoding = tmp.substring(m + 8).replace("]", "");
                }
            }
        }

        // Read the page content into the byte buffer
        int oneByte = is.read();
        int readflag = 0;
        while (oneByte > 0) {
            // Skip whitespace and control characters at the start of the page
            if (readflag == 0 && oneByte <= 32) {
                oneByte = is.read();
                continue;
            }
            // If the first real character is not '<' (code 60), this is not an HTML page: stop
            if (readflag == 0 && oneByte != 60)
                break;
            else
                readflag = 1;

            // Append the byte and read the next one
            baos.write(oneByte);
            oneByte = is.read();
        }
        is.close();

        // Decode the buffered bytes; fall back to UTF-8 if no charset header was present
        if (readflag == 1) {
            if (strencoding.length() != 0) webBody = new String(baos.toByteArray(), strencoding);
            else webBody = new String(baos.toByteArray(), "UTF-8");
        }
        return webBody;
    }
    public static void main(String[] args) {
        SpiderOne so = new SpiderOne("http://tieba.baidu.com/f?kw=%BD%E3%CD%D1&tp=0&pn=");
        so.analysisWeb();
    }
}
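A side note on analysisURL: it scans the HTML with indexOf and hard-coded offsets (+8, +10, +14), which works but is fragile. The same thread-link extraction could be done with a regular expression; the sketch below is my illustration of that idea, not part of the program above:

package spider;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LinkExtractorSketch {
    // Matches thread links of the form href="/p/12345"; links with query strings won't match
    private static final Pattern THREAD_LINK = Pattern.compile("href=\"(/p/\\d+)\"");

    public static List<String> extractThreadLinks(String webBody) {
        List<String> urls = new ArrayList<String>();
        Matcher m = THREAD_LINK.matcher(webBody);
        while (m.find()) {
            urls.add("http://tieba.baidu.com" + m.group(1));
        }
        return urls;
    }
}

Deduplication against alreadyViewURL would still happen in the caller, just as analysisURL does now.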

The image download class:

package spider;

import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;

public class DownloadFile implements Runnable {
    String urlString;                // URL of the image to download
    static volatile int number = 0;  // number of download tasks currently alive

    public DownloadFile(String url) {
        urlString = url;
        number++;
    }

    public void run() {
        downLoadFile();
    }

    public synchronized void downLoadFile() {
        // Information about the remote file
        StringBuffer info = new StringBuffer();
        try {
            // Open a connection to the image URL
            URL url = new URL(urlString);
            System.out.println(url.toString());
            URLConnection urlConn = url.openConnection();

            // Collect and print some metadata about the remote file
            info.append("Host: " + url.getHost() + "\n");
            info.append("Port: " + url.getDefaultPort() + "\n");
            info.append("Content type: " + urlConn.getContentType() + "\n");
            info.append("Length: " + urlConn.getContentLength() + "\n");
            info.append("Downloading...");
            System.out.println(info.toString());

            // Input stream for the remote file
            InputStream is = urlConn.getInputStream();

            // Use the last path segment of the URL as the local file name
            String localFileName = url.getFile().substring(url.getFile().lastIndexOf("/") + 1);

            // Make sure the target directory exists, then open the local output stream
            new File("downloadImage").mkdirs();
            FileOutputStream fos = new FileOutputStream("downloadImage/" + localFileName);

            // Copy the remote file to the local file byte by byte
            int data;
            while ((data = is.read()) != -1) {
                fos.write(data);
            }

            // Close the streams
            is.close();
            fos.close();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        }
        System.out.println("Download finished!");
        number--;
    }
}
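One more remark: DownloadImage caps concurrency by busy-waiting on the static DownloadFile.number counter. A fixed-size thread pool would do the same job more cleanly. The sketch below is an alternative I am suggesting, not how the program above actually works, and it assumes the class lives in the same spider package:

package spider;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class DownloadPoolSketch {
    public static void main(String[] args) throws InterruptedException {
        // At most 10 downloads run at once; additional tasks simply queue up
        ExecutorService pool = Executors.newFixedThreadPool(10);

        String[] imageUrls = { /* image URLs extracted from the page body */ };
        for (String url : imageUrls) {
            pool.submit(new DownloadFile(url)); // DownloadFile already implements Runnable
        }

        pool.shutdown();                          // stop accepting new tasks
        pool.awaitTermination(1, TimeUnit.HOURS); // wait for the queued downloads to finish
    }
}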

