java爬取百度贴吧_JAVA爬取百度贴吧图片

package com.wang.xiaowei.utils;

// NOTE: com.sun.image.codec.jpeg is a JDK-internal API that no longer exists in JDK 9+;
// these two imports only resolve on JDK 8 and earlier.
import com.sun.image.codec.jpeg.JPEGCodec;
import com.sun.image.codec.jpeg.JPEGImageEncoder;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.UUID;

/**
 * @author WXW on 2017/11/22.
 */

public classTieBaImageDownload {private static final RequestConfig REQUEST_CONFIG =RequestConfig.custom()

.setSocketTimeout(15000)

.setConnectTimeout(15000)

.setConnectionRequestTimeout(15000)

.build();/**贴吧主路径*/

private static final String TB_BASE_URL = "https://tieba.baidu.com";/**用于存放url*/

private static MapURL_MAP;/**图片保存的路径*/

private static final String IMAGE_SAVE_DIRECT = "E:/baiduimage/";/**HttpClient对象*/

private static CloseableHttpClient httpClient = null;/**每页有多少条帖子*/

private static final int EVERY_PAGE_COUNT_SIZE = 50;/**水印图片路径*/

private static final String WATER_IMAGE_PATH = "E://baiduimage//3.png";/**透明度*/

private static final float WATER_IMAGE_ALPHA = 0.5F;/**X间距*/

private static final int WATER_IMAGE_MARGIN_Y = 100;/**Y间距*/

private static final int WATER_IMAGE_MARGIN_X = 100;/**水印图片选中角度*/

private static final int WATER_IMAGE_RADIANS = 30;/*** 获取指定贴吧的全部内容

*@paramkey 贴吧关键字

*@parammaxPage 最大页码

*@paramonlySeeLz 是否只看楼主

*@paramaddWaterImage 是否添加水印

*@throwsException e*/

public static void getHttpUrl(String key,int maxPage,boolean onlySeeLz,boolean addWaterImage) throwsException{//初始化httpclient对象

httpClient =HttpClients.createDefault();//开始按照页面爬取内容

while(maxPage > 0){

System.out.println("=============正在处理第"+maxPage+"页===================");//每页有50条数据

int pageIndex = (maxPage - 1) *EVERY_PAGE_COUNT_SIZE;//路径

String spaderUrl = TB_BASE_URL + "/f?kw=" + key + "&ie=utf-8&pn="+pageIndex;

System.out.println("spaderUrl==== "+spaderUrl);

String responseContent=getHtmlContent(spaderUrl);

processHtml(responseContent,onlySeeLz);

maxPage--;

}

httpClient.close();

downLoadImage(addWaterImage);

}/***

* 根据url获取页面源码内容

*@paramurl url

*@return页面源码内容

*@throwsException e*/

private static String getHtmlContent(String url) throwsException {//get方式获取页面内容

HttpGet get = newHttpGet(url);

get.setConfig(REQUEST_CONFIG);

HttpEntity entity=httpClient.execute(get).getEntity();return EntityUtils.toString(entity, "UTF-8");

}/***

* 处理html内容

*@paramresponseContent html内容

*@paramonlySeeLz 是否只看楼主

*@throwsException e*/

private static void processHtml(String responseContent,boolean onlySeeLz) throwsException {

Document doc=Jsoup.parse(responseContent);//获取所有 class=j_th_tit 的 a标签;帖子的具体连接

Elements urls = doc.select("a.j_th_tit");for(Element e : urls){//帖子标题

String tText =e.text();//帖子连接

String tUrl = e.attr("href");

tUrl= TB_BASE_URL + "" +tUrl;//只看楼主

if(onlySeeLz){

tUrl= tUrl + "?see_lz=1";

}//将获取到的帖子url放入Map

URL_MAP.put(tText,tUrl);

}

}/***

* 获取每个帖子内容中的图片信息

*@paramaddWaterImage 是否加水印

*@throwsException e*/

private static void downLoadImage(boolean addWaterImage) throwsException {for(String str : URL_MAP.values()){//帖子的url

System.out.println("帖子的url=== "+str);

Document doc=Jsoup.connect(str).get();//帖子中 class=img.BDE_Image的元素

Elements images = doc.select("img.BDE_Image");for(Element e : images){//获取图片url

String imageUrl = e.attr("src");

System.out.println("imageUrl============ "+imageUrl);

saveImage(imageUrl,addWaterImage);

}

}

}/*** 将图片保存到本地

*@paramimageUrl imageUrl

*@paramaddWaterImage addWaterImage

*@throwsException e*/

private static void saveImage(String imageUrl,boolean addWaterImage) throwsException{//每天创建一个目录

SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");

String filePath= sdf.format(newDate());

File imageFile= new File(IMAGE_SAVE_DIRECT+"//"+filePath);if(!imageFile.exists()){if(imageFile.mkdirs()){

System.out.println("---------创建目录成功-------------");

}

}//随机生成图片名称

String fileName = UUID.randomUUID().toString().replaceAll("-","");

URL url= newURL(imageUrl);

InputStream is=url.openStream();

OutputStream os= new FileOutputStream(IMAGE_SAVE_DIRECT+"//"+filePath + "//" + fileName +".jpg");if(addWaterImage){

addWaterImage(url,os);

}if(!addWaterImage){

saveImageWithoutWaterImage(is,os);

}

}private static void saveImageWithoutWaterImage(InputStream is,OutputStream os) throwsException{byte[] buff = new byte[1024];intreaded;while ((readed = is.read(buff)) != -1) {

os.write(buff,0, readed);

}

is.close();

os.close();

}/***

* 打印水印

*@paramsourceImagePath 原图片路径

*@paramos os

*@throwsException e*/

private static void addWaterImage(URL sourceImagePath,OutputStream os) throwsException{//根据图片路径生成图片对象。获取图片的宽度高度

Image image =ImageIO.read(sourceImagePath);int width = image.getWidth(null);int height = image.getHeight(null);//根据图片的宽高,生成画布,将原图画到画布

BufferedImage bufferedImage = newBufferedImage(width, height, BufferedImage.TYPE_INT_RGB);

Graphics2D graphics2d=bufferedImage.createGraphics();

graphics2d.drawImage(image,0, 0, width, height, null);//水印图片

Image waterImage = ImageIO.read(newFile(WATER_IMAGE_PATH));int waterImageWidth = waterImage.getWidth(null);int waterImageHeight = waterImage.getHeight(null);//水印透明设置

graphics2d.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_ATOP, WATER_IMAGE_ALPHA));//旋转 rotate(选中度数,圆心x坐标,圆心y坐标)

graphics2d.rotate(Math.toRadians(WATER_IMAGE_RADIANS), bufferedImage.getWidth()/2, bufferedImage.getHeight()/2);//循环打印水印图片

int waterImageX = -width / 2;while(waterImageX < width * 1.5){int waterImageY = -height / 2;while(waterImageY < height * 1.5){

graphics2d.drawImage(waterImage, waterImageX, waterImageY,null);

waterImageY+= waterImageHeight +WATER_IMAGE_MARGIN_Y;

}

waterImageX+= waterImageWidth +WATER_IMAGE_MARGIN_X;

}

graphics2d.dispose();//创建图像编码工具类

JPEGImageEncoder en =JPEGCodec.createJPEGEncoder(os);//使用图像编码工具类,输出缓存图像到目标文件

en.encode(bufferedImage);

os.close();

}public static voidmain(String[] args){try{

URL_MAP= new HashMap<>();boolean onlySeeLz = true;

String key= "柳岩";int maxPage = 1;boolean addWaterImage = true;

getHttpUrl(key,maxPage,onlySeeLz,addWaterImage);

}catch(Exception e) {

e.printStackTrace();

}

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值