java中简单爬取网站信息以及生成图片的缩略图

java中简单爬取网站信息以及生成图片的缩略图

获取当前网页的HTML源码

 /**
     * Downloads the page at the given address and returns its body as one string.
     *
     * <p>The response is decoded as UTF-8 and read line by line; line terminators
     * are dropped, so the returned text is the concatenation of all lines with no
     * separators between them.
     *
     * @param httpUrl address of the page to fetch
     * @return the page body with line terminators removed (empty if the body is empty)
     * @throws IOException if the URL is malformed, the connection fails, or reading fails
     */
    public static String getHtmlCode(String httpUrl) throws IOException {
        URL url = new URL(httpUrl);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // No setDoOutput(true): this is a body-less GET, and on some runtimes
        // (e.g. Android) enabling output silently turns the request into a POST.
        connection.setRequestMethod("GET");
        connection.connect();
        String fileEncode = "UTF-8";
        // StringBuilder avoids the quadratic cost of String += inside the loop.
        StringBuilder content = new StringBuilder();
        // try-with-resources closes the reader (and the underlying stream) even
        // when reading fails part-way through.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), fileEncode))) {
            String input;
            while ((input = reader.readLine()) != null) {
                content.append(input);
            }
        }
        return content.toString();
    }

引入jsoup包

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.9.2</version>
</dependency>

然后使用jsoup进行数据的提取,这里以博库网为例(其中doc为列表页解析后得到的jsoup Document对象,例如Jsoup.parse(GetWeb.getHtmlCode(列表页地址)))

	// Walk every book entry on the list page; for each one, read its fields,
	// fetch its detail page, download the cover image and create a thumbnail.
	// NOTE(review): the closing brace of this for loop is not shown in this excerpt.
	for (Element element1 : doc.select("div[class=wd-980 fl scale-box br-1-e8]").select("div[class=wd-640 fl]")) {
			// Extract publication date, detail-page link, title, price, author,
			// publisher and subtitle/description with jsoup.
			String publishingtime = element1.select("div[class=wd-30p fl to-hd cl-9]").text();
			String a = element1.select("a[class=db fs-16 lh-30 to-hd fw-bd hover]").attr("href");
			
			String title = element1.select("a[class=db fs-16 lh-30 to-hd fw-bd hover]").text();
			String price = element1.select("div[class=lh-30]").select("del[class=cl-9 mr-10]").text();
			// NOTE(review): this reads the link's href, not its text — confirm that
			// the author field is really meant to hold a URL.
			String author = element1.select("div[class=wd-30p fl to-hd mr-10]").select("a").attr("href");
			String publisher = element1.select("div[class=wd-30p fl to-hd cl-9 mr-10]").select("a[class=hover]").text();
			//String oprice=element1.select("div[class=lh-30]").select("span[class=fs-21 cl-rd-l fw-bd mr-20]").text();
			String sub_desc=element1.select("div[class=cl-9 lh-20 ht-40 oh fs-12]").text();
			// Fetch the book's detail page through the link found above (the href
			// has no scheme, so "https:" is prepended). The original note says the
			// synopsis and ISBN come from this page; this excerpt only reads the
			// cover image address from it.
			String D= GetWeb.getHtmlCode("https:"+a);
			Document HTML1 = Jsoup.parse(D);
			// Read the image address from the slider's data-thumb attribute.
			// NOTE(review): presumably this is empty when no element matches, which
			// would make new URL(...) below throw — verify and guard if needed.
			String imags1=HTML1.select("div[id=slider]").select("li[class=pr cp]").attr("data-thumb");
			// Download the image.
			// Give the image a new, random file name.
			String filename1=UUID.randomUUID().toString();
			// NOTE(review): UUID.randomUUID().toString() never returns null, so this
			// condition is always true.
			if(filename1!=null) {
	
				// Fetch the image.
				URL imgURL = new URL(imags1.trim());// build the URL
				HttpURLConnection urlConn = (HttpURLConnection) imgURL.openConnection();// open the connection
				urlConn.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.79 Safari/537.36");
				urlConn.connect();
				// Directory the images are stored in.
				String path = PathUtil.getClasspath() + "/uploadFiles/bookimages";
				if (urlConn.getResponseCode() == 200) {// status 200 means the request succeeded
					// NOTE(review): ins is never closed, and fos is not closed if the
					// copy throws — consider try-with-resources.
					InputStream ins = urlConn.getInputStream(); // response body stream
					FileOutputStream fos = FileUtils.openOutputStream(new File(path + "/" + filename1 + ".jpg"));// open the target file for writing
					IOUtils.copy(ins, fos);// copy the downloaded bytes into the file
					fos.close();
				}
				// Create the thumbnail.
				// NOTE(review): this runs even when the status was not 200, and
				// downloadCompressedPicture opens the image URL again, so the image
				// is fetched twice.
				String pathname=path+"/"+filename1+"yasuo.jpg";// same directory and name, with a "yasuo" suffix
				File file=new File(pathname);
				ReduceImg.downloadCompressedPicture(file, imags1);
		}

获取缩略图

public class ReduceImg {
    /**
     * Downloads the image at {@code urlstr}, scales it down to one sixth of its
     * width and height, and writes the result to {@code file}.
     *
     * <p>Each target dimension is clamped to at least one pixel so that sources
     * smaller than six pixels still produce a valid thumbnail.
     *
     * <p>NOTE(review): the thumbnail is encoded as BMP regardless of the
     * extension of {@code file}; this is kept for backward compatibility, but
     * callers that pass a {@code .jpg} name get BMP content — confirm whether
     * that is intended.
     *
     * <p>Requires JDK 8.
     *
     * @param file   destination file for the thumbnail
     * @param urlstr URL of the source image
     * @return {@code true} if the thumbnail was written; {@code false} if the URL
     *         is malformed, the download or write fails, the content is not a
     *         readable image, or no suitable image writer is available
     */
    public static boolean downloadCompressedPicture(File file,String urlstr){
        try{
            // 1. Read the source image; the stream is closed as soon as decoding ends.
            BufferedImage preImage;
            try(BufferedInputStream bufferedInputStream=new BufferedInputStream(new URL(urlstr).openStream())){
                preImage=ImageIO.read(bufferedInputStream);
            }
            // ImageIO.read returns null (it does not throw) when no registered
            // reader understands the content.
            if(preImage==null){
                System.out.println("Not a readable image: "+urlstr);
                return false;
            }
            // 2. Target size: one sixth of each dimension, never below one pixel
            //    (BufferedImage rejects zero-sized images).
            int width=Math.max(1, preImage.getWidth()/6);
            int height=Math.max(1, preImage.getHeight()/6);
            // 3. Draw the scaled image onto a new RGB canvas.
            BufferedImage image=new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
            Graphics graphic=image.createGraphics();
            try{
                graphic.drawImage(preImage, 0, 0, width, height, null);
            }finally{
                graphic.dispose(); // release the graphics context's resources
            }
            // 4. Write the thumbnail; the stream is closed even if encoding fails.
            try(BufferedOutputStream bufferedOutputStream=new BufferedOutputStream(new FileOutputStream(file))){
                // ImageIO.write returns false when no writer for the format exists.
                return ImageIO.write(image, "bmp", bufferedOutputStream);
            }
        }catch(IOException e){
            System.out.println(e);
        }

        return false;
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值