//只能抓取一部分图片,像折800有些子路径的一行图片代码有好多个img,而且排列不规律,我的能力根本就没法截取下来
package test;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal image scraper: reads one HTML page, collects the image URLs found in
 * {@code src}, {@code background} and {@code background-image} attributes, and
 * saves each image into a local directory under a sequential numeric file name.
 *
 * <p>Not thread-safe: the pending-URL list and the file counter are plain,
 * unsynchronized instance state.
 */
public class getImageByUrl4 {

    /** Directory used by the no-argument constructor. */
    private static final String DEFAULT_OUTPUT_DIR = "H:\\pic\\";

    /** Image extensions that are recognised; the first one is the fallback. */
    private static final String[] KNOWN_EXTENSIONS = {"jpg", "png", "gif"};

    /**
     * Matches one quoted image attribute, e.g. {@code src="http://host/a.jpg"}
     * or {@code SRC = '/img/b.png'}. Group 1 is the opening quote (the closing
     * quote must be the same character), group 2 is the image reference.
     * The value may not contain quotes, whitespace or angle brackets, so a
     * single match can never span several attributes or tags; this is what
     * allows any number of images on one line to be picked up individually.
     */
    private static final Pattern IMAGE_ATTRIBUTE = Pattern.compile(
            "(?:src|background-image|background)\\s*=\\s*(['\"])"
                    + "([^'\"\\s<>]+\\.(?:jpg|png|gif))\\1",
            Pattern.CASE_INSENSITIVE);

    /** Image URLs collected from the page and not yet downloaded. */
    private final List<String> imageUrl = new ArrayList<String>();

    /** Directory the downloaded images are written to. */
    private final File outputDir;

    /** Number used for the most recently written image file. */
    private int count = 0;

    /** Creates a scraper that stores images in the default directory. */
    public getImageByUrl4() {
        this(new File(DEFAULT_OUTPUT_DIR));
    }

    /**
     * Creates a scraper that stores images in the given directory.
     *
     * @param outputDir target directory; created on first download if missing
     * @throws IllegalArgumentException if {@code outputDir} is {@code null}
     */
    public getImageByUrl4(File outputDir) {
        if (outputDir == null) {
            throw new IllegalArgumentException("outputDir must not be null");
        }
        this.outputDir = outputDir;
    }

    /**
     * Command-line entry point: scrapes the hard-coded start page.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String netUrl = "http://www.zhe800.com"; // page to crawl
        new getImageByUrl4().init(netUrl);
    }

    /**
     * Scans the page at {@code netUrl} and downloads every image found on it.
     * The pending-URL list is empty again when this method returns.
     *
     * @param netUrl address of the HTML page to scan
     */
    public void init(String netUrl) {
        getPage(netUrl);
        // Work on a snapshot so the pending list can be reset in one step
        // instead of repeatedly removing the head of an ArrayList.
        List<String> pending = new ArrayList<String>(imageUrl);
        imageUrl.clear();
        for (String url : pending) {
            getImage(url);
        }
    }

    /**
     * Returns a snapshot of the image URLs collected so far and not yet
     * downloaded, in discovery order.
     *
     * @return a new list; modifying it does not affect this scraper
     */
    public List<String> getImageUrls() {
        return new ArrayList<String>(imageUrl);
    }

    /**
     * Extracts every image reference from one line of HTML and appends the
     * resulting URLs to the pending list.
     *
     * <p>References starting with {@code http://} or {@code https://} are
     * stored unchanged. Anything else is resolved against {@code netUrl} with
     * the standard {@link URL} rules, so {@code /img/a.jpg} is relative to the
     * host and {@code img/a.jpg} is relative to the page's directory.
     *
     * @param line   one line of page source; {@code null} is ignored
     * @param netUrl address of the page the line came from
     */
    public void getImageUrl(String line, String netUrl) {
        if (line == null) {
            return;
        }
        URL base = null; // parsed only if a relative reference shows up
        Matcher matcher = IMAGE_ATTRIBUTE.matcher(line);
        while (matcher.find()) {
            String reference = matcher.group(2);
            if (isAbsoluteHttp(reference)) {
                imageUrl.add(reference);
                continue;
            }
            try {
                if (base == null) {
                    base = new URL(netUrl);
                }
                imageUrl.add(new URL(base, reference).toString());
            } catch (MalformedURLException e) {
                // Report instead of silently dropping the reference.
                System.out.println("getImageUrl url异常");
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads the page at {@code netUrl} line by line (decoded as UTF-8) and
     * feeds each line to {@link #getImageUrl(String, String)}.
     *
     * @param netUrl address of the HTML page to read
     */
    public void getPage(String netUrl) {
        try {
            URLConnection connection = new URL(netUrl).openConnection();
            try (InputStream in = connection.getInputStream();
                 BufferedReader reader =
                         new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    getImageUrl(line, netUrl);
                }
            }
        } catch (MalformedURLException e) {
            System.out.println("getPage url异常");
        } catch (IOException e) {
            System.out.println("url连接异常");
            e.printStackTrace();
        }
    }

    /**
     * Downloads one image into the output directory. The file is named with
     * the next free number (one above the highest numeric file name already
     * present) and keeps the image's own extension.
     *
     * @param imageUrl absolute URL of the image to download
     */
    public void getImage(String imageUrl) {
        try {
            if (!outputDir.isDirectory() && !outputDir.mkdirs()) {
                throw new IOException("cannot create output directory " + outputDir);
            }
            // Continue numbering after whatever is already on disk.
            count = Math.max(count, highestExistingIndex());

            URLConnection connection = new URL(imageUrl).openConnection();
            try (InputStream in = connection.getInputStream()) {
                // Pick the file name only once the connection is open, so a
                // failed request does not consume a number.
                File target = new File(outputDir, (++count) + "." + extensionOf(imageUrl));
                try (BufferedOutputStream out =
                             new BufferedOutputStream(new FileOutputStream(target))) {
                    byte[] buffer = new byte[8192];
                    int read;
                    while ((read = in.read(buffer)) != -1) {
                        out.write(buffer, 0, read);
                    }
                }
            }
        } catch (MalformedURLException e) {
            System.out.println("getImage url异常");
            e.printStackTrace();
        } catch (IOException e) {
            System.out.println("下载图片url连接异常");
            e.printStackTrace();
        }
    }

    /** Tells whether the reference already carries an http(s) scheme. */
    private static boolean isAbsoluteHttp(String reference) {
        return reference.regionMatches(true, 0, "http://", 0, 7)
                || reference.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Returns the highest number used as a file name (the part before the
     * first dot) in the output directory, or 0 if there is none. Files whose
     * names are not numeric are ignored.
     */
    private int highestExistingIndex() {
        int highest = 0;
        File[] existing = outputDir.listFiles();
        if (existing == null) { // directory missing or unreadable
            return highest;
        }
        for (File file : existing) {
            String name = file.getName();
            int dot = name.indexOf('.');
            String stem = dot < 0 ? name : name.substring(0, dot);
            try {
                highest = Math.max(highest, Integer.parseInt(stem));
            } catch (NumberFormatException ignored) {
                // Not one of the numbered image files; skip it.
            }
        }
        return highest;
    }

    /**
     * Returns the lower-case image extension the URL ends with, or the first
     * known extension when the URL does not end with a recognised one.
     */
    private static String extensionOf(String url) {
        int dot = url.lastIndexOf('.');
        if (dot >= 0) {
            String tail = url.substring(dot + 1);
            for (String known : KNOWN_EXTENSIONS) {
                if (known.equalsIgnoreCase(tail)) {
                    return known;
                }
            }
        }
        return KNOWN_EXTENSIONS[0];
    }
}