爬图片URL

想要获取到100条图片链接,还是自己写来得方便些。代码还有不足之处,之后再继续改进^_^

package zw;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.junit.experimental.theories.Theories;
import org.springframework.core.io.UrlResource;

import com.mysql.fabric.xmlrpc.base.Array;
import com.sleepycat.utilint.StringUtils;

/**
 * Breadth-first crawler that collects image URLs (.jpg / .png) found in the
 * text of web pages and follows links to .html / .jsp / .php pages.
 *
 * <p>All state lives in static fields and is not synchronized, so the class is
 * not thread-safe. Collected URLs, the pending queue and the set of known
 * pages accumulate across calls to {@link #startCrawl(String, int)}.
 */
public class ImageCrawler {
	/** Pages waiting to be fetched, in discovery order. */
	public static Queue<String> htmlQueue=new LinkedList<String>();
	/** Distinct image URLs collected so far, in discovery order. */
	public static ArrayList<String> imageUrlArrayList=new ArrayList<>();
	/** Number of URLs that have been added to {@link #imageUrlArrayList}. */
	public static int imageUrlCount;
	/** Maximum number of image URLs to collect. */
	public static int maxCount=200;
	/** True once {@link #maxCount} image URLs have been collected. */
	public static boolean flagEnd;

	/*
	 * URL extraction is regex-based and therefore approximate.
	 *   https?://      "http://" or "https://"
	 *   [^\s"'<>]+     URL body; stops at whitespace, quotes and tag brackets so
	 *                  that neighbouring attributes/URLs are not swallowed
	 *   \.(?:...)      a literal dot followed by one of the listed extensions
	 */
	private static final Pattern IMAGE_URL=Pattern.compile("https?://[^\\s\"'<>]+\\.(?:jpg|png)");
	private static final Pattern PAGE_URL=Pattern.compile("https?://[^\\s\"'<>]+\\.(?:html|jsp|php)");

	/** Every page URL that has been queued or fetched; prevents re-crawling. */
	private static final Set<String> seenPages=new HashSet<String>();

	/** Connect/read timeout so one unresponsive host cannot stall the crawl. */
	private static final int TIMEOUT_MILLIS=10000;

	/**
	 * Crawls breadth-first from {@code url} until the queue is exhausted or
	 * {@code maxImageUrlCount} image URLs have been collected in total.
	 *
	 * @param url              seed page to start from
	 * @param maxImageUrlCount maximum size of {@link #imageUrlArrayList}
	 */
	public static void startCrawl(String url,int maxImageUrlCount) {
		maxCount=maxImageUrlCount;
		// Re-evaluate the stop flag so a finished earlier crawl does not block this one.
		flagEnd=imageUrlCount>=maxCount;
		if(flagEnd) {
			return;
		}
		seenPages.add(url);
		htmlQueue.add(url);
		while(!flagEnd&&!htmlQueue.isEmpty()) {
			crawl(htmlQueue.remove());
		}
	}

	/**
	 * Fetches one page, records the new image URLs it contains and queues the
	 * page links that have not been seen before. Sets {@link #flagEnd} and
	 * returns early as soon as the image limit is reached.
	 *
	 * @param url page to fetch
	 */
	public static void crawl(String url) {
		if(imageUrlCount>=maxCount) {
			flagEnd=true;
			return;
		}
		seenPages.add(url);
		String result=request(url);

		// Collect image URLs.
		Matcher matcher=IMAGE_URL.matcher(result);
		while(matcher.find()) {
			String imageUrl=matcher.group();
			if(imageUrlArrayList.contains(imageUrl)) {
				continue;
			}
			imageUrlArrayList.add(imageUrl);
			imageUrlCount++;
			System.out.println("imageUrl="+imageUrl+" imageUrlCount="+imageUrlCount);
			if(imageUrlCount>=maxCount) {
				flagEnd=true;
				return;
			}
		}

		// Queue page links; Set.add returns false for pages already known.
		matcher=PAGE_URL.matcher(result);
		while(matcher.find()) {
			String pageUrl=matcher.group();
			if(seenPages.add(pageUrl)) {
				htmlQueue.add(pageUrl);
			}
		}
	}


	/**
	 * Downloads the body of {@code url} and returns it as text, decoded as GBK,
	 * with every line terminated by CRLF.
	 *
	 * @param url address to fetch; only HTTP(S) URLs are supported
	 * @return the text read so far; empty if the URL is malformed, not HTTP(S),
	 *         or the request fails before any line is read
	 */
	public static String request(String url) {
		StringBuilder result=new StringBuilder();
		try {
			URLConnection connection=new URL(url).openConnection();
			if(!(connection instanceof HttpURLConnection)) {
				System.err.println("Skipping non-HTTP URL: "+url);
				return "";
			}
			connection.setConnectTimeout(TIMEOUT_MILLIS);
			connection.setReadTimeout(TIMEOUT_MILLIS);

			try(BufferedReader bufferedReader=new BufferedReader(
					new InputStreamReader(connection.getInputStream(),"gbk"))) {
				String line;
				while((line=bufferedReader.readLine())!=null) {
					result.append(line).append("\r\n");
				}
			}
		} catch (IOException e) {
			// Best effort: report the failure and keep whatever was read.
			System.err.println("Failed to fetch "+url+": "+e);
		}
		return result.toString();
	}

}

package zw;

import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;

import com.mysql.fabric.xmlrpc.base.Array;

/**
 * Command-line driver: crawls http://news.qq.com/ for up to 100 image URLs and
 * writes them to a text file formatted as a Java {@code String[]} initializer.
 */
public class ImageCrawlerTest{

	/** Output file used when no path is given on the command line. */
	private static final String DEFAULT_OUTPUT_PATH="f:/imageurl.txt";

	/**
	 * Runs the crawl and writes the result file.
	 *
	 * @param args optional; {@code args[0]} overrides the output file path
	 */
	public static void main(String[] args) {
		String outputPath=(args!=null&&args.length>0)?args[0]:DEFAULT_OUTPUT_PATH;

		ImageCrawler.startCrawl("http://news.qq.com/",100);
		ArrayList<String> arrayList=ImageCrawler.imageUrlArrayList;

		// try-with-resources closes (and flushes) the writer, and is safe when
		// the file cannot be opened, unlike closing a possibly-null writer in finally.
		try(BufferedWriter writer=new BufferedWriter(new FileWriter(outputPath))) {
			writer.write("String[] imageUrl=new String[]{\n");
			int last=arrayList.size()-1;
			for(int i=0;i<=last;i++) {
				// Every entry ends with a comma except the last, which closes the array.
				String terminator=(i<last)?",":"};";
				writer.write("\""+arrayList.get(i)+"\""+terminator);
				writer.newLine();
			}
			if(arrayList.isEmpty()) {
				// Still close the initializer so the output is well-formed.
				writer.write("};");
				writer.newLine();
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}
得到的imageurl.txt文件:

String[] imageUrl=new String[]{
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/a/ap8ymiq8ac6ot78.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/4/482kbm3ynxy5hod.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/s/s98gibd18rpnh8x.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/v/v7ficapjz5lp9wk.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/9/9h8c52fr93m5a1d.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/r/rr8egv2y28ryszt.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/d/ddwbsu03l3riujw.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/f/f7lwk6wvsuze56o.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/g/gp4luezse7zydgb.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/g/g9bh6v53j4f9k4q.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/o/o9thh6b3y7k6w5k.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/n/n6j4h0i09jdfmvj.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/a/atige61a93darmk.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/5/5gx8y0zjb3wmmbu.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/q/qb7hxq8pn7scjk5.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/h/h7prazd5bqgtcob.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/y/y0u420qkxxequ20.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/p/p804hd60rbze39m.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/h/h81ii0umt6an3pf.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/m/m9pxexwf0drpnmp.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/8/81qsl9dzx9u3qsy.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/y/yx4t32ljyoi3u3f.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/n/nlidnunqfxozg06.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/o/oaaonxbpo2vug9z.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/j/jipf150styr6rzz.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/q/qfjtkh8xj1g6qlu.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/v/vj7ly51u3duxl0m.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/7/7kcf8a09yt0mkzh.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/r/r0y22kbbxwqrka0.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/4/4h6yggtuy79e0l2.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/6/6q6xn0f30eicfg6.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/d/dhcvxrp01wjp6mx.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/c/cp5204jkj4sz4ej.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/i/idzj991qx2yz1d6.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/o/o9l1w0min9rkswt.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/s/slow8lu2fd47jmk.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/z/z39368mvancetak.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/n/nynyqh4j3ogjros.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/d/dymgkvvf7r71o01.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/h/h130ic47259w194.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/t/tgzp8w3o6vsixsk.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/r/rw8ti5h44xagmdc.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/j/jcj12yel6i21unz.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/9/9j1cato2kzi36td.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/b/bzck5lw3aoxq30t.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideohori/j/jzzcn4i0k9q3v8x.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/d/dt9sevrupdt1eqr.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/q/qwvcfzl987ydm63.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/z/zwyy7umzhkroixx.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/z/z6dbu03dirzxqw3.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/f/f9nnlzuo3t1ivqr.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/5/50rljwzmo3ur1xk.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/c/ceusrubximxm0g8.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/f/fse52rd4klx7qn2.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/e/e0jts33la7wbnfk.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/v/vjz5aa3svg25k9o.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/p/plcojs8omlnmx0u.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/1/1wo8nqtcxiywyt0.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/j/jxj5pth9h3a71sg.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/k/ky9n4c5q142hoie.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/l/lesch825u8n3f7q.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/f/fmy3srlfpa5wr53.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/b/b5plj5f6ubqefhl.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/0/0jg7avmrqufxs1c.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/z/z9r9fm4ly2rfvpd.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/y/ytt1vehodssd72f.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/e/e5ut86o83jknzb4.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/q/q66b2paz6v51hit.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/c/cgrk72z9g8ki79m.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/b/bmwttzsp8ynzgob.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/r/ru614whbc3iohz9.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/0/0kehru9lyz4jgza.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/a/a31m5tpokxzs2d2.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/t/tq3gi5f7bk9mdpo.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/i/idx47vmqorfcejz.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/s/shdkyyel6l054gi.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/u/ugojzwahhig6mrt.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/u/u9odb4umfd24clo.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/3/32xk4q5adald1c5.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/4/4iis09kg00j2jad.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/o/oztunl1y10j0c0o.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/z/ztsspja63l5unjy.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/i/iwc4pwpge54ke28.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/2/2y2sozvwasnzied.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/4/4fapbxqqoib0rmp.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/k/k94tbbn7zz7mmih.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/l/ldqd9xqar0b2ubz.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/t/tlnbop5i4hizeua.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/i/iasejxg44sniqsc.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/3/3v6gtppa4kjqpiv.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/d/dim0ikmncob56tu.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/t/tfn9ak1h0u6ip2o.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/3/3wwvag2dw1brrhk.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/6/6oysp96a4pku9up.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/b/b4e8yjwgokhr0lj.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/n/nuijxf6k13t6z9b.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/z/zze649rclsb5ymh.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/n/nlbv1rg9ddl85gp.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/p/pw3qbsy8y9u6jcu.jpg",
"https://i.gtimg.cn/qqlive/img/jpgcache/files/qqvideo/hori/2/21u6d3wzccq4g2y.jpg"};



评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值