java 爬取 豆瓣_java 爬虫 爬取豆瓣 请不要害羞 图片

import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

/*** Created by liwj on 2017/5/25.*/

/**
 * Minimal crawler for the Douban "haixiuzu" group: it walks the discussion list, opens every
 * topic linked from it and saves the full-size topic images to a local directory.
 *
 * <p>Pages and images are both fetched with {@link HttpURLConnection}, so this class itself
 * needs nothing beyond the JDK (the {@code org.apache.http} imports of the file are not used
 * by it).
 */
public class Spider {

    /** Full-size topic image URLs on the img1 CDN host. Dots are escaped so they match literally. */
    private static final String IMAGE_REG =
            "https://img1\\.doubanio\\.com/view/group_topic/large/public/p[0-9]+\\.jpg";

    /** Links to individual group topics. */
    private static final String HTTP_REG = "https://www\\.douban\\.com/group/topic/[0-9]+/";

    /** Numeric file name ({@code 12345.jpg}) contained in an image URL. */
    private static final String FILE_NAME = "[0-9]+\\.jpg";

    /** Compiled once because it is applied to every image URL. */
    private static final Pattern FILE_NAME_PATTERN = Pattern.compile(FILE_NAME);

    private static final String PAGE_USER_AGENT =
            "Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.1 Safari/525.13";
    private static final String IMAGE_USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.110 Safari/537.36";

    private static final int CONNECT_TIMEOUT_MILLIS = 3 * 1000;
    /** Per-read timeout so a stalled server cannot block the crawl forever. */
    private static final int READ_TIMEOUT_MILLIS = 30 * 1000;

    private static final String LIST_URL = "https://www.douban.com/group/haixiuzu/discussion?start=";
    /** Offsets of the first and last list page to crawl; by default only the page at 25. */
    private static final int FIRST_OFFSET = 25;
    private static final int LAST_OFFSET = 25;
    /** Number of topics per list page, i.e. the step between two {@code start} offsets. */
    private static final int PAGE_SIZE = 25;
    private static final String DEFAULT_SAVE_PATH = "E:\\image\\";

    /** Only used from the single crawling thread; {@link SimpleDateFormat} is not thread-safe. */
    private static final SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    /**
     * Downloads the page behind {@code url} and returns its source.
     *
     * <p>Lines are concatenated without separators. Failures are reported on standard error and
     * turned into an empty result so that one broken page does not stop the crawl.
     *
     * @param url address of the page to fetch
     * @return the page source, or an empty string when the page could not be fetched
     */
    private static String getResultByUrl(String url) {
        HttpURLConnection connection = null;
        try {
            connection = openConnection(url, PAGE_USER_AGENT);
            // The body is read as plain text, so ask explicitly for an unencoded response.
            connection.setRequestProperty("Accept-Encoding", "identity");
            StringBuilder html = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "utf-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    html.append(line);
                }
            }
            return html.toString();
        } catch (IOException | RuntimeException e) {
            System.err.println(timestamp() + " 页面获取失败------[" + url + "] " + e);
            return "";
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
    }

    /**
     * Collects every distinct substring of {@code html} that matches {@code reg}.
     *
     * @param reg  regular expression describing the wanted URLs (topic links or image links)
     * @param html page source to search; {@code null} or empty yields an empty list
     * @return the matches in order of first appearance, without duplicates
     */
    private static List<String> getAllUrl(String reg, String html) {
        if (html == null || html.isEmpty()) {
            return new ArrayList<>();
        }
        // LinkedHashSet drops repeated links while keeping the order they appear in the page.
        Set<String> urls = new LinkedHashSet<>();
        Matcher matcher = Pattern.compile(reg).matcher(html);
        while (matcher.find()) {
            urls.add(matcher.group());
        }
        return new ArrayList<>(urls);
    }

    /**
     * Extracts the numeric file name from an image URL.
     *
     * @param imageUrl URL of an image
     * @return the file name such as {@code 12345.jpg}, or an empty string if none is present
     */
    private static String extractFileName(String imageUrl) {
        Matcher matcher = FILE_NAME_PATTERN.matcher(imageUrl);
        return matcher.find() ? matcher.group() : "";
    }

    /**
     * Downloads a file and stores it as {@code fileName} inside {@code savePath}.
     *
     * <p>The target directory is created when missing. A file that was only partially written
     * because the transfer failed is removed again.
     *
     * @param fileUrl  address of the file to download
     * @param fileName name of the file to create
     * @param savePath directory to store the file in, with or without trailing separator
     * @throws IOException if the directory cannot be created or the transfer fails
     */
    private static void downloadFileFromUrl(String fileUrl, String fileName, String savePath)
            throws IOException {
        File saveDir = new File(savePath);
        if (!saveDir.isDirectory() && !saveDir.mkdirs() && !saveDir.isDirectory()) {
            throw new IOException("cannot create directory " + saveDir);
        }
        File target = new File(saveDir, fileName);

        HttpURLConnection connection = openConnection(fileUrl, IMAGE_USER_AGENT);
        boolean started = false;
        boolean complete = false;
        try (InputStream in = connection.getInputStream();
             OutputStream out = new FileOutputStream(target)) {
            started = true;
            byte[] buffer = new byte[8192];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            complete = true;
        } finally {
            connection.disconnect();
            // Both streams are closed at this point; best-effort removal of a truncated file.
            if (started && !complete) {
                target.delete();
            }
        }
    }

    /** Opens an HTTP connection to {@code url} with the shared timeouts and the given user agent. */
    private static HttpURLConnection openConnection(String url, String userAgent) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);
        connection.setRequestProperty("User-Agent", userAgent);
        return connection;
    }

    /** Current time formatted for log lines. */
    private static String timestamp() {
        return df.format(new Date());
    }

    /**
     * Crawls the configured list pages and downloads every topic image found.
     *
     * @param args optional; {@code args[0]} overrides the default save directory
     */
    public static void main(String[] args) {
        String savePath = (args != null && args.length > 0) ? args[0] : DEFAULT_SAVE_PATH;

        for (int page = FIRST_OFFSET; page <= LAST_OFFSET; page += PAGE_SIZE) {
            String html = getResultByUrl(LIST_URL + page);
            for (String webPage : getAllUrl(HTTP_REG, html)) {
                String webHtml = getResultByUrl(webPage);
                for (String image : getAllUrl(IMAGE_REG, webHtml)) {
                    String fileName = extractFileName(image);
                    if (fileName.isEmpty()) {
                        // Without a name the download would target the directory itself.
                        System.err.println(timestamp() + " 图片保存失败------[" + fileName + "]");
                        continue;
                    }
                    try {
                        downloadFileFromUrl(image, fileName, savePath);
                        System.out.println(timestamp() + " 图片保存成功------[" + fileName + "]");
                    } catch (IOException | RuntimeException e) {
                        System.err.println(timestamp() + " 图片保存失败------[" + fileName + "] " + e);
                    }
                }
            }
        }
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值