Jsoup爬取网络内容(包括图片文件),保存到本地和保存到数据库(一)

背景:

项目需要某个区县的天气数据,需要从中国气象局的官网爬取。但是,中国气象局服务器接口返回的数据中没有我想要的信息,比如未来24小时的气温、气压、风速等,这些数据只能从页面中获取,所以使用了Jsoup。另外,有一些信息是以图片的形式返回的,需要将图片下载下来并存储到数据库中,使用的数据库是PostgreSQL。用到的工具类库是Hutool和Jsoup。

爬取的目标信息

target.xml

<div id=day0 class="clearfix pull-left">
    <div class="hour3 hbg">
        <div> 11:00 </div>
        <div class=hourimg>
            <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
            </div>
            <div> - </div>
            <div class=tmp_lte_20> 16.2℃ </div>
            <div> 1.7m/s </div>
            <div> 东北风 </div>
            <div> 1002.1hPa </div>
            <div> 56% </div>
            <div class=hide> 0% </div>
        </div>
        <div class="hour3 hbg">
            <div> 14:00 </div>
            <div class=hourimg>
                <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
                </div>
                <div> - </div>
                <div class=tmp_lte_20> 17.8℃ </div>
                <div> 1.8m/s </div>
                <div> 东北风 </div>
                <div> 1000.5hPa </div>
                <div> 50.5% </div>
                <div class=hide> 0% </div>
            </div>
            <div class="hour3 hbg">
                <div> 17:00 </div>
                <div class=hourimg>
                    <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
                    </div>
                    <div> - </div>
                    <div class=tmp_lte_15> 13.1℃ </div>
                    <div> 1.3m/s </div>
                    <div> 东风 </div>
                    <div> 1000.4hPa </div>
                    <div> 61.6% </div>
                    <div class=hide> 0% </div>
                </div>
                <div class="hour3 hbg">
                    <div> 20:00 </div>
                    <div class=hourimg>
                        <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
                        </div>
                        <div> - </div>
                        <div class=tmp_lte_10> 8.3℃ </div>
                        <div> 0.8m/s </div>
                        <div> 东风 </div>
                        <div> 1002hPa </div>
                        <div> 65.5% </div>
                        <div class=hide> 0.2% </div>
                    </div>
                    <div class="hour3 hbg">
                        <div> 23:00 </div>
                        <div class=hourimg>
                            <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
                            </div>
                            <div> - </div>
                            <div class=tmp_lte_10> 6.6℃ </div>
                            <div> 0.4m/s </div>
                            <div> 东南风 </div>
                            <div> 1001hPa </div>
                            <div> 71.7% </div>
                            <div class=hide> 10.1% </div>
                        </div>
                        <div class="hour3 ">
                            <div> 14日02:00 </div>
                            <div class=hourimg>
                                <img src="http://image.nmc.cn/assets/img/w/40x40/3/0.png">
                                </div>
                                <div> - </div>
                                <div class=tmp_lte_10> 6.2℃ </div>
                                <div> 0.5m/s </div>
                                <div> 南风 </div>
                                <div> 1000.5hPa </div>
                                <div> 81.9% </div>
                                <div class=hide> 10.9% </div>
                            </div>
                            <div class="hour3 ">
                                <div> 05:00 </div>
                                <div class=hourimg>
                                    <img src="http://image.nmc.cn/assets/img/w/40x40/3/1.png">
                                    </div>
                                    <div> - </div>
                                    <div class=tmp_lte_10> 7.8℃ </div>
                                    <div> 0.7m/s </div>
                                    <div> 南风 </div>
                                    <div> 1000.1hPa </div>
                                    <div> 93.6% </div>
                                    <div class=hide> 52.6% </div>
                                </div>
                                <div class="hour3 ">
                                    <div> 08:00 </div>
                                    <div class=hourimg>
                                        <img src="http://image.nmc.cn/assets/img/w/40x40/3/1.png">
                                        </div>
                                        <div> - </div>
                                        <div class=tmp_lte_10> 9.1℃ </div>
                                        <div> 0.9m/s </div>
                                        <div> 南风 </div>
                                        <div> 1000.3hPa </div>
                                        <div> 84.5% </div>
                                        <div class=hide> 61.4% </div>
                                    </div>
                                </div>

爬取的目标数据展示

建立实体类

TwentyFourHoursDomain.java

package com.imegaware.crawler.weatherForecast.psd;

import java.io.File;

/**
 * One three-hourly entry of the 24-hour weather forecast.
 * <p>
 * Every measurement is stored as the raw text taken from the page, so values
 * keep their unit suffix (for example {@code "16.2℃"} or {@code "1002.1hPa"}).
 *
 * @author pshdhx
 */
public class TwentyFourHoursDomain {
	private String monitorTime; // forecast time slot
	private String url; // remote weather icon URL, e.g. http://image.nmc.cn/assets/img/w/40x40/3/0.png (sunny)
	private File file; // weather icon as a file on the local disk
	private String rain; // precipitation ("-" when the page shows none)
	private String temperature; // air temperature
	private String windSpeed; // wind speed
	private String windDirection; // wind direction
	private String airPressure; // air pressure
	private String humidity; // relative humidity
	private String url2; // local path of the weather icon

	public String getUrl2() {
		return url2;
	}

	public void setUrl2(String url2) {
		this.url2 = url2;
	}

	public String getMonitorTime() {
		return monitorTime;
	}

	public void setMonitorTime(String monitorTime) {
		this.monitorTime = monitorTime;
	}

	public String getUrl() {
		return url;
	}

	public void setUrl(String url) {
		this.url = url;
	}

	public File getFile() {
		return file;
	}

	public void setFile(File file) {
		this.file = file;
	}

	public String getRain() {
		return rain;
	}

	public void setRain(String rain) {
		this.rain = rain;
	}

	public String getTemperature() {
		return temperature;
	}

	public void setTemperature(String temperature) {
		this.temperature = temperature;
	}

	public String getWindSpeed() {
		return windSpeed;
	}

	public void setWindSpeed(String windSpeed) {
		this.windSpeed = windSpeed;
	}

	public String getWindDirection() {
		return windDirection;
	}

	public void setWindDirection(String windDirection) {
		this.windDirection = windDirection;
	}

	public String getAirPressure() {
		return airPressure;
	}

	public void setAirPressure(String airPressure) {
		this.airPressure = airPressure;
	}

	public String getHumidity() {
		return humidity;
	}

	public void setHumidity(String humidity) {
		this.humidity = humidity;
	}

	/**
	 * Renders every field, including the local icon path {@code url2}, for debugging output.
	 */
	@Override
	public String toString() {
		return "TwentyFourHoursDomain [monitorTime=" + monitorTime + ", url=" + url + ", file=" + file + ", rain="
				+ rain + ", temperature=" + temperature + ", windSpeed=" + windSpeed + ", windDirection="
				+ windDirection + ", airPressure=" + airPressure + ", humidity=" + humidity + ", url2=" + url2
				+ "]";
	}

}

JdbcUtil.java

package com.imegaware.crawler.util;

import java.io.InputStream;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.Properties;

/**
 * Minimal JDBC helper.
 * <p>
 * Connection settings ({@code jdbc.username}, {@code jdbc.password},
 * {@code jdbc.driver}, {@code jdbc.url}) are read from {@code jdbc.properties}
 * on the classpath when the class is initialised. The {@code close} overloads
 * accept {@code null} and print, rather than propagate, any {@link SQLException}.
 *
 * @author pshdhx
 */
public class JdbcUtil {

	private static String USERNAME;
	private static String PASSWORD;
	private static String DRIVER;
	private static String URL;

	static {
		loadConfig();
	}

	/**
	 * Loads the database settings from {@code jdbc.properties} on the classpath.
	 *
	 * @throws RuntimeException if the file is missing or cannot be read; the
	 *                          underlying failure is attached as the cause
	 */
	public static void loadConfig() {
		// try-with-resources skips close() for a null resource, so the
		// "file not found" case can be detected inside the block.
		try (InputStream inStream = JdbcUtil.class.getClassLoader().getResourceAsStream("jdbc.properties")) {
			if (inStream == null) {
				throw new IllegalStateException("jdbc.properties not found on classpath");
			}
			Properties prop = new Properties();
			prop.load(inStream);
			USERNAME = prop.getProperty("jdbc.username");
			PASSWORD = prop.getProperty("jdbc.password");
			DRIVER = prop.getProperty("jdbc.driver");
			URL = prop.getProperty("jdbc.url");
		} catch (Exception e) {
			// Keep the original message but preserve the root cause for diagnosis.
			throw new RuntimeException("读取数据库配置文件异常!", e);
		}
	}

	public JdbcUtil() {
		super();
	}

	/**
	 * Opens a new database connection using the loaded settings.
	 *
	 * @return a new connection; the caller is responsible for closing it
	 * @throws RuntimeException if the driver cannot be loaded or the connection fails
	 */
	public static Connection getConnection() throws RuntimeException {
		try {
			Class.forName(DRIVER);
			return DriverManager.getConnection(URL, USERNAME, PASSWORD);
		} catch (Exception e) {
			throw new RuntimeException("无法获取数据库连接!", e);
		}
	}

	/**
	 * Releases the given resources in the order result set, statement, connection.
	 * Each argument may be {@code null}; a failure to close one resource does not
	 * prevent the others from being closed.
	 *
	 * @param connection        connection to close, or {@code null}
	 * @param preparedStatement statement to close, or {@code null}
	 * @param resultSet         result set to close, or {@code null}
	 */
	public static void close(Connection connection, PreparedStatement preparedStatement, ResultSet resultSet) {
		close(resultSet);
		close(preparedStatement);
		close(connection);
	}

	/**
	 * Closes the result set and then the statement; either may be {@code null}.
	 *
	 * @param preparedStatement statement to close, or {@code null}
	 * @param resultSet         result set to close, or {@code null}
	 */
	public static void close(PreparedStatement preparedStatement, ResultSet resultSet) {
		close(resultSet);
		close(preparedStatement);
	}

	/**
	 * Closes the statement if it is not {@code null}, printing any {@link SQLException}.
	 *
	 * @param preparedStatement statement to close, or {@code null}
	 */
	public static void close(PreparedStatement preparedStatement) {
		if (preparedStatement != null) {
			try {
				preparedStatement.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Closes the result set if it is not {@code null}, printing any {@link SQLException}.
	 *
	 * @param resultSet result set to close, or {@code null}
	 */
	public static void close(ResultSet resultSet) {
		if (resultSet != null) {
			try {
				resultSet.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Closes the connection if it is not {@code null}, printing any {@link SQLException}.
	 *
	 * @param connection connection to close, or {@code null}
	 */
	public static void close(Connection connection) {
		if (connection != null) {
			try {
				connection.close();
			} catch (SQLException e) {
				e.printStackTrace();
			}
		}
	}

}

ImageUtil.java

package com.imegaware.crawler.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
/**
 * Image file helpers: open a local image as a stream, and write a binary
 * stream (for example one read from a database column) out to a file.
 *
 * @author pshdhx
 */
public class ImageUtil {

    /**
     * Opens the local file at {@code path} for reading.
     *
     * @param path path of the image file
     * @return an open stream; the caller must close it
     * @throws IOException if the file does not exist or cannot be opened
     */
    public static FileInputStream readImage(String path) throws IOException {
        return new FileInputStream(new File(path));
    }

    /**
     * Copies everything from {@code in} into the file at {@code targetPath},
     * creating missing parent directories first. This is best-effort: failures
     * are printed to standard error and not propagated. The input stream is
     * not closed by this method.
     *
     * @param in         source of the image bytes
     * @param targetPath destination file path
     */
    public static void readBin2Image(InputStream in, String targetPath) {
        File file = new File(targetPath);
        // getParentFile() is null for a bare file name, which needs no directory;
        // mkdirs() (unlike mkdir()) also creates missing intermediate directories.
        File parentDir = file.getParentFile();
        if (parentDir != null && !parentDir.exists()) {
            parentDir.mkdirs();
        }
        try (FileOutputStream fos = new FileOutputStream(file)) {
            byte[] buf = new byte[1024];
            int len;
            while ((len = in.read(buf)) != -1) {
                fos.write(buf, 0, len);
            }
            fos.flush();
        } catch (Exception e) {
            // Deliberately best-effort, matching the method's void/no-throws contract.
            e.printStackTrace();
        }
    }
}

爬取主类

TwentyFourHoursWeather.java

package com.imegaware.crawler.weatherForecast.psd;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.imegaware.crawler.util.ImageUtil;
import com.imegaware.crawler.util.JdbcUtil;
import com.ruoyi.common.utils.StringUtils;

import cn.hutool.core.io.FileUtil;
import cn.hutool.http.HttpUtil;

/**
 * Uses jsoup to scrape the next-24-hours forecast from the weather bureau page
 * (time slot, weather icon, precipitation, temperature, wind speed, wind
 * direction, air pressure, humidity), downloads each weather icon to local
 * disk and inserts one database row per time slot.
 *
 * @author pshdhx
 */
public class TwentyFourHoursWeather {

	// Local directory the weather icons are downloaded into
	private static String myurl = "/home/sdzw/JavaPaChong/ruoyi/imw-crawler/src/main/java/com/imegaware/crawler/weatherForecast/psd/";

	/**
	 * Fetches the forecast page, converts every child element of {@code #day0}
	 * into a {@link TwentyFourHoursDomain}, downloads its icon and finally
	 * stores all entries via {@link #saveData(List)}.
	 *
	 * @return the parsed entries; empty when the page could not be fetched
	 * @throws IllegalStateException if the page has no {@code #day0} element
	 * @throws RuntimeException      if storing the entries fails
	 */
	private static List<TwentyFourHoursDomain> getItemContent() {
		List<TwentyFourHoursDomain> list = new ArrayList<>();
		try {
			Document doc = Jsoup.connect("http://www.nmc.cn/publish/forecast/ASD/zichuan.html").get();
			Element day0 = doc.getElementById("day0");
			if (day0 == null) {
				throw new IllegalStateException("Forecast container #day0 not found in page");
			}

			// Iterate element children only: childNode(i) would also return the
			// whitespace text nodes between tags and break the positional lookups.
			for (Element hourBlock : day0.children()) {
				List<Element> cells = hourBlock.children();
				if (cells.size() < 8) {
					// Not a complete forecast column; skip instead of failing on an index.
					continue;
				}
				String url = cells.get(1).getElementsByTag("img").attr("src");
				if (url.isEmpty()) {
					continue;
				}

				// The icon is saved into myurl under the last path segment of its URL.
				HttpUtil.downloadFile(url, FileUtil.file(myurl));
				String localPath = myurl + url.substring(url.lastIndexOf('/') + 1);

				TwentyFourHoursDomain pojo = new TwentyFourHoursDomain();
				pojo.setMonitorTime(cells.get(0).text());
				pojo.setUrl(url);
				pojo.setFile(new File(localPath));
				pojo.setUrl2(localPath);
				pojo.setRain(cells.get(2).text());
				pojo.setTemperature(cells.get(3).text());
				pojo.setWindSpeed(cells.get(4).text());
				pojo.setWindDirection(cells.get(5).text());
				pojo.setAirPressure(cells.get(6).text());
				pojo.setHumidity(cells.get(7).text());
				list.add(pojo);
			}
			saveData(list);

		} catch (IOException e) {
			e.printStackTrace();
		}
		return list;
	}

	/**
	 * Inserts all entries into {@code twenty_four_hour_weather} as one batch.
	 * The connection, the statement and every icon stream are released even
	 * when the insert fails.
	 *
	 * @param list entries to store
	 * @throws RuntimeException if any step of the insert fails (cause attached)
	 */
	private static void saveData(List<TwentyFourHoursDomain> list) throws RuntimeException {
		if (list == null || list.isEmpty()) {
			// Nothing to insert; avoid opening a connection for an empty batch.
			System.out.println("入库完成,共插入" + 0 + "行数据");
			return;
		}

		Connection conn = null;
		PreparedStatement pstmt = null;
		// Streams bound with setBinaryStream are read when the batch executes,
		// so they must stay open until then and are closed together afterwards.
		List<FileInputStream> iconStreams = new ArrayList<>();
		try {
			conn = JdbcUtil.getConnection();
			String sql = "INSERT INTO twenty_four_hour_weather(id,monitor_time,url,file,rain,temperature,wind_speed,wind_direction,air_pressure,humidity,region_name,region_code,task_time) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,now());";
			pstmt = conn.prepareStatement(sql);
			for (TwentyFourHoursDomain item : list) {
				// 32-character random identifier instead of concatenated random doubles.
				pstmt.setString(1, java.util.UUID.randomUUID().toString().replace("-", ""));
				pstmt.setString(2, item.getMonitorTime() + "");
				pstmt.setString(3, item.getUrl());
				FileInputStream in = ImageUtil.readImage(item.getUrl2());
				iconStreams.add(in);
				pstmt.setBinaryStream(4, in, in.available());
				pstmt.setString(5, item.getRain());
				pstmt.setString(6, item.getTemperature());
				pstmt.setString(7, item.getWindSpeed());
				pstmt.setString(8, item.getWindDirection());
				pstmt.setString(9, item.getAirPressure());
				pstmt.setString(10, item.getHumidity());
				pstmt.setString(11, "淄川区");
				pstmt.setString(12, "370302");
				pstmt.addBatch();
			}

			int rows = 0;
			for (int affected : pstmt.executeBatch()) {
				// Negative entries are driver status codes, not row counts.
				if (affected > 0) {
					rows += affected;
				}
			}
			System.out.println("入库完成,共插入" + rows + "行数据");
		} catch (Exception e) {
			throw new RuntimeException("数据入库失败!", e);
		} finally {
			JdbcUtil.close(conn, pstmt, null);
			for (FileInputStream in : iconStreams) {
				try {
					in.close();
				} catch (IOException ignored) {
					// Closing a read-only file stream; nothing useful to do on failure.
				}
			}
		}
	}

	/**
	 * Runs one crawl, then prints every parsed entry followed by the entry count.
	 */
	public static void main(String[] args) {
		List<TwentyFourHoursDomain> itemContent = TwentyFourHoursWeather.getItemContent();
		for (TwentyFourHoursDomain item : itemContent) {
			System.out.println(item.toString());
		}
		System.out.println(itemContent.size());
	}
}

pom.xml

<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.11.3</version>
</dependency>

<dependency>
			<groupId>cn.hutool</groupId>
			<artifactId>hutool-all</artifactId>
			<version>5.4.7</version>
</dependency>

功能实现截图:

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值