java抓取图片_Java 抓取网页上的图片

最新推荐文章于 2022-04-26 12:36:06 发布

周同强

最新推荐文章于 2022-04-26 12:36:06 发布

阅读量247

点赞数

文章标签： java抓取图片

本文链接：https://blog.csdn.net/weixin_35618325/article/details/114053216

版权

该博客展示了如何使用Java实现从网页中抓取并下载图片。通过正则表达式匹配图片链接，然后将图片保存到本地。代码包括获取网页源码、解析图片URL、下载图片等步骤。

摘要由CSDN通过智能技术生成

/**
 * Plain data holder describing one image found on a scraped page.
 *
 * <p>All three properties are optional and start out as {@code null}.
 */
public class Picture {

    /** Title text captured for the image. */
    private String title;

    /** URL the image was found at. */
    private String source;

    /** Local path the image was saved to. */
    private String upPath;

    /**
     * @return the title captured for the image, or {@code null} if never set
     */
    public String getTitle() {
        return title;
    }

    /**
     * @param title the title captured for the image
     */
    public void setTitle(String title) {
        this.title = title;
    }

    /**
     * @return the URL of the image, or {@code null} if never set
     */
    public String getSource() {
        return source;
    }

    /**
     * @param source the URL of the image
     */
    public void setSource(String source) {
        this.source = source;
    }

    /**
     * @return the local path the image was saved to, or {@code null} if never set
     */
    public String getUpPath() {
        return upPath;
    }

    /**
     * @param upPath the local path the image was saved to
     */
    public void setUpPath(String upPath) {
        this.upPath = upPath;
    }

}

import java.io.BufferedReader;

import java.io.File;

import java.io.FileOutputStream;

import java.io.InputStream;

import java.io.InputStreamReader;

import java.io.OutputStream;

import java.net.URL;

import java.net.URLConnection;

import java.util.ArrayList;

import java.util.Calendar;

import java.util.List;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import com.sun.xml.internal.fastinfoset.stax.events.Util;

public class CatchPicture {

/**

* @param args

public static void main(String[] args) {

// TODO Auto-generated method stub

//定义抓取图片的正则表达式

String regular="[*].*? $\'(.*?)\'$

List list=new CatchPicture().lookWeiboPic("http://gaoxiao.jokeji.cn/GrapHtml/dongtai/20120921221658.htm","GBK",regular,"2,1");

System.out.println(list.size());

}

//根据URL查看网站上的图片

public List lookWeiboPic(String url,String charset,String regular,String attIndex){

List list=new ArrayList();

try {

//获取填写的url

//判断所属网站获取正则表达式

//获取图片存放到 list集合

if(!Util.isEmptyString(url)){

String htmls = getPageSource(url.trim(),charset);

Pattern pattern =null;

pattern = Pattern.compile(regular.trim());

if(!Util.isEmptyString(htmls)){

Matcher matcher = pattern.matcher(htmls);

//得到参数属性顺序

String[] sort = regular.trim().split(","); //下标：0 表示标题title ， 1 表示图片路径

//判断后缀后得到网站的请求头部 http://www.moonbasa.com/p-032111106.html-->得到 http://www.moonbasa.com

String[] suffix;

suffix =url.trim().split("cn");

String httphread = "";

if (suffix.length > 1) {

httphread = suffix[0] + "cn";

} else {

suffix = url.trim().split("com");

httphread = suffix[0] + "com";

}

//循环匹配找到的

while(matcher.find()){

Picture picture=new Picture();

//匹配出title

if (-1 == Integer.parseInt(sort[0])) {

// 页面上抓不到标题

picture.setTitle("");

} else {

// 去标题的#

String title=matcher.group(Integer.parseInt(sort[0])).replace("#", " ");

picture.setTitle(title);

}

//匹配出source

if (-1 == Integer.parseInt(sort[1])) {

// 页面上抓不到图片路径

picture.setSource("");

}else{

String webImgUrl=matcher.group(Integer.parseInt(sort[1]));

//判断是绝对路径还是相对路径

String[] pathType=webImgUrl.split(":");

if(pathType.length>1){

//绝对路径

picture.setSource(webImgUrl);

}else{

//判断相对路径是否含有..

pathType=webImgUrl.split("\\.\\.");

if(pathType.length>1){

picture.setSource(httphread+pathType[1]);

}else{

if(webImgUrl.startsWith("/")){

picture.setSource(httphread+pathType[0]);

}else{

picture.setSource(httphread+"/"+pathType[0]);

}

String upPath=upload(picture.getSource(),"d:\\image\\");

picture.setUpPath(upPath);

list.add(picture);

}//--end while

}

}catch (Exception e) {

e.printStackTrace();

}

return list;

}

/**

* 根据网路路径获取页面源码

* @param pageUrl

* @param encoding

* @return

public String getPageSource(String pageUrl,String encoding) {

StringBuffer sb = new StringBuffer();

try {

//构建一URL对象

URL url = new URL(pageUrl);

//使用openStream得到一输入流并由此构造一个BufferedReader对象

BufferedReader in = new BufferedReader(new InputStreamReader(url

.openStream(), encoding));

String line;

//读取www资源

while ((line = in.readLine()) != null) {

sb.append(line);

sb.append("\n");

}

in.close();

} catch (Exception ex) {

System.err.println(ex);

}

return sb.toString();

}

/**

* 上传图片

* @param urlStr

* @param path

* @return

* @throws Exception

public String upload(String urlStr,String path) throws Exception{

Calendar calendar = Calendar.getInstance();

String month = calendar.get(Calendar.YEAR) + "/"

+ (calendar.get(Calendar.MONTH) + 1);

String filename = java.util.UUID.randomUUID().toString()

+ getExtension(urlStr);

path =path + month + "/";

download(urlStr,path,filename);

return path+month + "/" + filename;

}

/**

* 根据路径下载图片然后保存到对应的目录下

* @param urlString

* @param filename

* @param savePath

* @return

* @throws Exception

public void download(String urlString, String filename,String savePath) throws Exception {

// 构造URL

URL url = new URL(urlString);

// 打开连接

URLConnection con = url.openConnection();

//设置请求的路径

con.setConnectTimeout(5*1000);

// 输入流

InputStream is = con.getInputStream();

// 1K的数据缓冲

byte[] bs = new byte[1024];

// 读取到的数据长度

int len;

// 输出的文件流

File sf=new File(savePath);

if(!sf.exists()){

sf.mkdirs();

}

OutputStream os = new FileOutputStream(sf.getPath()+"\\"+filename);

// 开始读取

while ((len = is.read(bs)) != -1) {

os.write(bs, 0, len);

}

// 完毕，关闭所有链接

os.close();

is.close();

}

/**

* 根据文件名获取文件的后缀名

* @param fileUrl

* @return

public String getExtension(String fileUrl){

return fileUrl.substring(fileUrl.lastIndexOf("."), fileUrl.length());

}

顶

踩

分享到：

2012-09-23 17:28

你这个正则我试了一下，匹配不到图片。楼主是怎么抓到图片的呢？

周同强

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java抓取图片_Java 抓取网页上的图片

public class Picture {private String title;private String source;private String upPath;//get set ...}import java.io.BufferedReader;import java.io.File;import java.io.FileOutputStream;import java.io.In...
复制链接

扫一扫