最近在开源社区时不时看到有人上传爬虫软件,用于下载姐脱吧图片等,可谓屌丝神器。不过基本上都是python写的,我就跟风编了个初级的java版爬虫工具,用于下载百度贴吧的图片。
功能介绍:采用广度优先遍历搜索贴吧网页。本来想把搜索结果存入数据库,但为了先简单实现,暂时用ArrayList和HashSet保存在内存中,因此只适用于小规模爬取。最终将爬取到的图片存入工程目录下的downloadImage文件夹。
主程序代码如下:
package spider;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Small breadth-first crawler for Baidu Tieba pages.
 *
 * <p>Starting from one address it repeatedly takes the oldest queued page,
 * fetches it, queues the thread ({@code /p/...}) and forum ({@code /f?kw=...})
 * links found in it, and hands every {@code http://imgsrc.} address on the page
 * to a {@link DownloadFile} running on its own thread.
 *
 * <p>All state is kept in memory, and the class is meant to be driven from a
 * single thread.
 */
public class SpiderOne {
    /** Pages still to be fetched, oldest first. */
    ArrayList<String> noViewURL;
    /** Every address that has ever been queued; used to avoid queueing one twice. */
    HashSet<String> alreadyViewURL;
    /** Decoded text of the page fetched last, or {@code null} if that fetch yielded no text. */
    String webBody;
    /** Site prefix put in front of the relative links found in a page. */
    public static String tieba = "http://tieba.baidu.com";
    /** Link collection stops once the queue holds more than this many pages. */
    public static int maxNum = 300;

    /** Length of the literal {@code a href="} that precedes the link target in the markers. */
    private static final int HREF_PREFIX_LENGTH = 8;
    /** Length of the literal {@code http://imgsrc.} used to find image addresses. */
    private static final int IMAGE_MARKER_LENGTH = 14;
    /** Upper bound for establishing a connection, so one dead host cannot stall the crawl. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Upper bound for a single blocking read from the page. */
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Creates a crawler whose queue initially holds only {@code orignURL}.
     *
     * @param orignURL address of the first page to fetch
     */
    public SpiderOne(String orignURL){
        noViewURL = new ArrayList<String>();
        alreadyViewURL = new HashSet<String>();
        alreadyViewURL.add("http://tieba.baidu.com/f?kw=%BD%E3%CD%D1");
        // Remember the seed as well, otherwise a link back to it would be queued again.
        alreadyViewURL.add(orignURL);
        noViewURL.add(orignURL);
    }

    /**
     * Runs the crawl until the queue is empty. A page that cannot be fetched,
     * or that is not text, is skipped instead of being analysed.
     */
    public void analysisWeb(){
        while(noViewURL.size()>0){
            System.out.println("网页读取中···");
            String page = null;
            try {
                page = readWebSecond(noViewURL.remove(0));
            } catch (Exception e) {
                e.printStackTrace();
            }
            if (page == null) {
                // Nothing was read for this address; analysing would only hit
                // a missing body, so move on to the next queued page.
                continue;
            }
            System.out.println("网页读取完毕!");
            System.out.println("读取有用网址中...");
            analysisURL();
            System.out.println("网址读取完毕!");
            System.out.println("下载图片中···");
            DownloadImage();
            System.out.println("一个网页图片下载完毕!");
        }
    }

    /**
     * Queues the links found in {@link #webBody}: first thread links
     * ({@code /p/...}, ignoring those that carry a query string), then forum
     * links ({@code /f?kw=...}). Does nothing when no page text is available.
     */
    public void analysisURL(){
        if (webBody == null) {
            return;
        }
        collectLinks("a href=\"/p/", false, "存储了网址:");
        collectLinks("a href=\"/f?kw=", true, "存储了跳转网址:");
    }

    /**
     * Scans {@link #webBody} for anchors starting with {@code marker} and queues
     * each target that has not been queued before.
     *
     * @param marker     anchor prefix to look for, beginning with {@code a href="}
     * @param allowQuery whether targets containing {@code ?} are accepted
     * @param logPrefix  text printed in front of every newly queued address
     */
    private void collectLinks(String marker, boolean allowQuery, String logPrefix){
        int endPosition = 0;
        while (noViewURL.size() <= maxNum) {
            int beginPosition = webBody.indexOf(marker, endPosition);
            if (beginPosition == -1) {
                break;
            }
            // The closing quote of the attribute ends the link target.
            endPosition = webBody.indexOf("\"", beginPosition + marker.length());
            if (endPosition == -1) {
                break;
            }
            String oneUrl = tieba + webBody.substring(beginPosition + HREF_PREFIX_LENGTH, endPosition);
            if (!allowQuery && oneUrl.indexOf("?") != -1) {
                continue;
            }
            // add() returns false for an address that was already known.
            if (alreadyViewURL.add(oneUrl)) {
                noViewURL.add(oneUrl);
                System.out.println(logPrefix + oneUrl);
            }
        }
    }

    /**
     * Starts one {@link DownloadFile} thread for every {@code http://imgsrc.}
     * address in {@link #webBody}, pausing while more than ten downloads are
     * outstanding. Does nothing when no page text is available, and stops
     * early if the calling thread is interrupted while waiting.
     */
    synchronized public void DownloadImage(){
        String body = webBody;
        if (body == null) {
            return;
        }
        int endPosition = 0;
        while (true) {
            int beginPosition = body.indexOf("http://imgsrc.", endPosition);
            if (beginPosition == -1) {
                break;
            }
            endPosition = body.indexOf("\"", beginPosition + IMAGE_MARKER_LENGTH);
            if (endPosition == -1) {
                break;
            }
            String oneUrl = body.substring(beginPosition, endPosition);
            new Thread(new DownloadFile(oneUrl)).start();
            while(DownloadFile.number>10){
                System.out.println("线程数过多,等待中!");
                try {
                    Thread.sleep(1000);
                } catch (InterruptedException e) {
                    // Restore the flag for the caller and give up; looping on
                    // would make every further sleep fail immediately.
                    Thread.currentThread().interrupt();
                    return;
                }
            }
        }
    }

    /**
     * Fetches {@code url} and, when the content looks like markup (its first
     * byte above the space character is {@code <}), stores the decoded text in
     * {@link #webBody}. Otherwise {@link #webBody} is left {@code null}.
     *
     * <p>The text is decoded with the charset named in the Content-Type
     * header, or UTF-8 when none is given or the named one is unsupported.
     *
     * @param url address of the page to read
     * @return the decoded page text, or {@code null} if the content is not markup
     * @throws Exception if the address is malformed or the page cannot be read
     */
    public String readWebSecond(String url) throws Exception{
        System.out.println("读入网址:"+url);
        // Forget the previous page so a failed or non-text fetch is never
        // mistaken for fresh content.
        webBody = null;
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        URLConnection urlConnection = new URL(url).openConnection();
        urlConnection.setConnectTimeout(CONNECT_TIMEOUT_MS);
        urlConnection.setReadTimeout(READ_TIMEOUT_MS);
        boolean isText = false;
        String strencoding;
        InputStream is = new java.io.BufferedInputStream(urlConnection.getInputStream());
        try {
            strencoding = charsetFromContentType(urlConnection.getContentType());
            int oneByte = is.read();
            // read() signals end of stream with -1; a zero byte is ordinary data.
            while (oneByte != -1) {
                // Skip blanks and control bytes in front of the content.
                if (!isText && oneByte <= 32) {
                    oneByte = is.read();
                    System.out.println("内层oneByte:"+oneByte);
                    continue;
                }
                // Content that does not open with '<' (60) is not treated as a text page.
                if (!isText && oneByte != 60) {
                    break;
                }
                isText = true;
                baos.write(oneByte);
                oneByte = is.read();
            }
        } finally {
            is.close();
        }
        if (isText) {
            webBody = new String(baos.toByteArray(), strencoding);
        }
        return webBody;
    }

    /**
     * Extracts the charset parameter from a Content-Type value.
     *
     * @param contentType header value such as {@code text/html; charset=gbk}, may be {@code null}
     * @return the lower-cased charset name, or {@code "UTF-8"} when it is
     *         absent, malformed or not supported by this JVM
     */
    private static String charsetFromContentType(String contentType){
        if (contentType != null) {
            String lower = contentType.toLowerCase(java.util.Locale.ENGLISH);
            int at = lower.indexOf("charset=");
            if (at != -1) {
                String name = lower.substring(at + 8);
                int semicolon = name.indexOf(';');
                if (semicolon != -1) {
                    name = name.substring(0, semicolon);
                }
                name = name.replace("\"", "").trim();
                try {
                    if (name.length() > 0 && java.nio.charset.Charset.isSupported(name)) {
                        return name;
                    }
                } catch (IllegalArgumentException ignored) {
                    // The header carried a syntactically illegal charset name; use the default.
                }
            }
        }
        return "UTF-8";
    }

    /** Crawls the forum page hard-coded below. */
    public static void main(String[] args){
        SpiderOne so = new SpiderOne("http://tieba.baidu.com/f?kw=%BD%E3%CD%D1&tp=0&pn=");
        so.analysisWeb();
    }
}
下载图片类:
package spider;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
/**
 * Task that copies one remote file into the local {@code downloadImage}
 * directory, naming it after the last segment of the address.
 *
 * <p>{@link #number} counts the tasks that have been created but have not yet
 * finished; it is raised in the constructor and lowered when
 * {@link #downLoadFile()} ends, whether the copy succeeded or not.
 */
public class DownloadFile implements Runnable{
    /** Address of the remote file. */
    String urlString;
    /**
     * Tasks created and not yet finished. Other classes may read it; every
     * write goes through {@link #adjustNumber(int)} because {@code ++} and
     * {@code --} on a volatile field are not atomic.
     */
    static volatile int number = 0;

    /** Directory, relative to the working directory, that receives the files. */
    private static final String DOWNLOAD_DIR = "downloadImage";
    /** Size of the chunk copied per read/write call. */
    private static final int BUFFER_SIZE = 8192;
    /** Upper bound for establishing the connection. */
    private static final int CONNECT_TIMEOUT_MS = 10000;
    /** Upper bound for a single blocking read, so a stalled server cannot hold a slot forever. */
    private static final int READ_TIMEOUT_MS = 30000;

    /**
     * Creates a task for {@code url} and counts it as outstanding.
     *
     * @param url address of the file to download
     */
    public DownloadFile(String url){
        urlString = url;
        adjustNumber(1);
    }

    /** Changes {@link #number} by {@code delta} under the class lock. */
    private static synchronized void adjustNumber(int delta){
        number += delta;
    }

    /** Runs {@link #downLoadFile()}. */
    public void run(){
        downLoadFile();
    }

    /**
     * Downloads the file and lowers {@link #number} when done. Any failure is
     * reported on standard output and does not propagate. The target
     * directory is created when it does not exist yet.
     */
    public synchronized void downLoadFile(){
        // Details about the remote file, printed before the copy starts.
        StringBuilder info = new StringBuilder();
        InputStream is = null;
        FileOutputStream fos = null;
        try{
            URL url = new URL(urlString);
            System.out.println(url.toString());
            URLConnection urlConn = url.openConnection();
            urlConn.setConnectTimeout(CONNECT_TIMEOUT_MS);
            urlConn.setReadTimeout(READ_TIMEOUT_MS);
            info.append("主机:"+url.getHost()+"\n");
            info.append("端口:"+url.getDefaultPort()+"\n");
            info.append("网络文件的类型:"+urlConn.getContentType()+"\n");
            info.append("长度:"+urlConn.getContentLength()+"\n");
            info.append("正在下载···");
            System.out.println(info.toString());
            is = urlConn.getInputStream();
            // Local name: everything after the last '/' of the address.
            String localFileName = url.getFile().substring(url.getFile().lastIndexOf("/")+1);
            java.io.File dir = new java.io.File(DOWNLOAD_DIR);
            // mkdirs() also returns false when another thread created the
            // directory first, hence the second check.
            if(!dir.mkdirs() && !dir.isDirectory()){
                throw new java.io.IOException("cannot create directory " + dir.getAbsolutePath());
            }
            fos = new FileOutputStream(DOWNLOAD_DIR + "/" + localFileName);
            byte[] buffer = new byte[BUFFER_SIZE];
            int count;
            while((count = is.read(buffer)) != -1){
                fos.write(buffer, 0, count);
            }
        }catch(Exception e){
            System.out.println(e.getMessage());
        }finally{
            // Runs on every path so neither the streams nor the slot leak.
            closeAndReport(is);
            closeAndReport(fos);
            System.out.println("下载完毕!");
            adjustNumber(-1);
        }
    }

    /** Closes {@code stream} if it was opened, printing the message of a failed close. */
    private static void closeAndReport(java.io.Closeable stream){
        if(stream == null){
            return;
        }
        try{
            stream.close();
        }catch(java.io.IOException e){
            System.out.println(e.getMessage());
        }
    }
}