有搜索条件根据url抓取网页数据(java爬取网页数据)

最近有一个任务:抓取如下图所示的网页数据。需要按日期筛选出前一天的数据,逐页抓取后存入数据库。

 

如果只是抓取当前页的数据,没有查询条件也没有翻页,那就比较简单了;但这里既要筛选前一天的数据,又要处理分页。

一开始的思路就想错了(最初想的是模拟触发查询按钮和翻页按钮),导致任务一度没有进展,后来在技术经理的协助下才搞定。

话不多说 直接贴出代码

 

<dependency>
 	<groupId>org.jsoup</groupId>
  	<artifactId>jsoup</artifactId>
  	<version>1.11.3</version>
</dependency>

 

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.Properties;

import org.apache.http.HttpStatus;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;



/**
 * Scheduled job that scrapes the previous day's monitoring records for every configured
 * company and inserts them one row at a time.
 *
 * <p>Companies are read from the {@code value} key of the classpath resource
 * {@code config/syqy.properties}. The value is a {@code ;}-separated list of
 * {@code name,url} pairs. For each company the first POST (date only) is used to read
 * the page count, then every page is fetched and each table row after the header row
 * is mapped to a {@code WryVoluntarilyMonitor} and inserted.
 *
 * @throws IOException if the properties resource is missing or cannot be read
 * @throws ClientProtocolException declared by the HTTP helpers this method calls
 * @throws ParseException if a scraped detection time is not in {@code yyyy-MM-dd HH:mm:ss} form
 */
@Scheduled(cron = "0 0 03 * * ?")// fetch the data every day at 03:00
	//@Scheduled(cron="0/10 * *  * * ? ")  // testing: run every 10 seconds
	//@Scheduled(cron="0 */10 * * * ?") // testing: run every 10 minutes
	@Transactional
	public void getNotice() throws ClientProtocolException, IOException, ParseException {
		// Query date: the day before "now" (Calendar.getInstance() is already set to the current time).
		Calendar calendar = Calendar.getInstance();
		calendar.add(Calendar.DAY_OF_MONTH, -1);
		SimpleDateFormat df = new SimpleDateFormat("yyyy-MM-dd");
		String format = df.format(calendar.getTime());

		// Load the company list from the classpath. The stream is closed as soon as it has been
		// read; the reader is given an explicit charset so Chinese company names are not garbled.
		Properties properties = new Properties();
		try (InputStream in = WryVoluntarilyMonitorService.class.getClassLoader()
				.getResourceAsStream("config/syqy.properties")) {
			if (in == null) {
				throw new IOException("Classpath resource not found: config/syqy.properties");
			}
			properties.load(new InputStreamReader(in, "utf-8"));
		}
		String property = properties.getProperty("value");
		if (property == null) {
			throw new IllegalStateException("Key 'value' is missing in config/syqy.properties");
		}
		String[] split = property.split(";");
		System.out.println(split.length);

		for (String s : split) {
			// split2[0] is the company name, split2[1] is the company URL
			String[] split2 = s.split(",");
			String companyName = split2[0];
			String companyUrl = split2[1];

			String html = getHtmlByUrl(companyUrl, format);
			if (html == null || "".equals(html)) {
				continue;
			}

			// The page count for the selected date is read from the 5th <input> element's value.
			Document doc1 = Jsoup.parse(html);
			Elements inputs = doc1.select("input");
			int pageCount = Integer.parseInt(inputs.get(4).attr("value"));

			// Fetch every page and write its rows to the database.
			for (int k = 1; k <= pageCount; k++) {
				String htmlByUrlData = getHtmlByUrlData(companyUrl, format, k);
				if (htmlByUrlData == null || "".equals(htmlByUrlData)) {
					// The helper returns null when the request failed; skip this page instead of
					// passing null to Jsoup, mirroring the guard on the first request above.
					continue;
				}
				Document doc = Jsoup.parse(htmlByUrlData);
				Elements linksElements = doc.select(".tb_ls >tbody >tr");
				// Start at 1: row 0 is the table header.
				for (int i = 1; i < linksElements.size(); i++) {
					Elements cells = linksElements.get(i).select(">td");
					// The first cell normally holds the sequence number; the placeholder text
					// below means the table has no data for this query.
					if (cells.get(0).text().equals("暂无数据!")) {
						continue;
					}
					String aqi1 = cells.get(1).text();// detection point
					String aqi2 = cells.get(2).text();// detection time
					String aqi3 = cells.get(3).text();// detection item
					String aqi4 = cells.get(4).text();// detection result
					String aqi5 = cells.get(5).text();// limit value
					String aqi6 = cells.get(6).text();// unit
					String aqi7 = cells.get(7).text();// whether the standard is met
					String aqi8 = cells.get(8).text();// exceedance multiple
					// Columns 9 and 10 are read from the title attribute rather than the cell text
					// (the original author notes the content is too long to capture fully otherwise).
					String att9 = cells.get(9).getElementsByTag("td").attr("title");// evaluation standard
					String aqi10 = cells.get(10).getElementsByTag("td").attr("title");// discharge destination
					String aqi11 = cells.get(11).text();// discharge method
					String aqi12 = cells.get(12).text();// remarks

					WryVoluntarilyMonitor wryVoluntarilyMonitor = new WryVoluntarilyMonitor();
					wryVoluntarilyMonitor.setPkid(keyGenerator.getNext());
					wryVoluntarilyMonitor.setCompanyName(companyName);
					wryVoluntarilyMonitor.setDetectionPoint(aqi1);
					wryVoluntarilyMonitor.setDetectionTime(StringToDate(aqi2));
					wryVoluntarilyMonitor.setDetectionProject(aqi3);
					wryVoluntarilyMonitor.setDetectionResult(aqi4);
					wryVoluntarilyMonitor.setStandardLimitingValue(aqi5);
					wryVoluntarilyMonitor.setUnit(aqi6);
					wryVoluntarilyMonitor.setIsStandards(aqi7);
					wryVoluntarilyMonitor.setExceedingMultiple(aqi8);
					wryVoluntarilyMonitor.setEvaluationCriterion(att9);
					wryVoluntarilyMonitor.setEmissionsTo(aqi10);
					wryVoluntarilyMonitor.setEmissionsWay(aqi11);
					wryVoluntarilyMonitor.setRemarks(aqi12);
					super.insert(wryVoluntarilyMonitor);
				}
			}
		}
		System.out.println("执行成功");
	}

	/**
	 * Parses a timestamp string into a {@link Date}.
	 *
	 * @param times text in {@code yyyy-MM-dd HH:mm:ss} form
	 * @return the parsed date
	 * @throws ParseException if {@code times} cannot be parsed with that pattern
	 */
	public Date StringToDate(String times) throws ParseException {
		// A new formatter is created per call, so no formatter instance is shared between callers.
		final String pattern = "yyyy-MM-dd HH:mm:ss";
		Date parsed = new SimpleDateFormat(pattern).parse(times);
		return parsed;
	}
	
	/**
	 * POSTs the query date to {@code url} as a form field and returns the response body as text.
	 *
	 * <p>The request body is {@code startTime=<date>} sent as
	 * {@code application/x-www-form-urlencoded}. Any exception raised while sending the request
	 * or reading the response is printed and swallowed; in that case {@code null} is returned.
	 *
	 * @param url  address to POST to
	 * @param date value of the {@code startTime} field; concatenated as-is, not URL-encoded
	 * @return the response body when the status is 200 and a body is present, otherwise {@code null}
	 * @throws IOException kept in the signature for existing callers
	 * @throws ClientProtocolException kept in the signature for existing callers
	 */
	public static String getHtmlByUrl(String url,String date) throws ClientProtocolException, IOException{
		String html = null;
		// try-with-resources releases the content stream, the response and the client in that
		// order, including on the failure paths where the original left the response open.
		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
			HttpPost httppost = new HttpPost(url);
			String query = "startTime="+date;
			StringEntity stringEntity = new StringEntity(query,"UTF-8");
			stringEntity.setContentType("application/x-www-form-urlencoded");
			httppost.setEntity(stringEntity);
			try (CloseableHttpResponse response = httpClient.execute(httppost)) {
				int resStatu = response.getStatusLine().getStatusCode();
				// Only a 200 response with a body is treated as usable.
				if (resStatu==HttpStatus.SC_OK && response.getEntity()!=null) {
					try (InputStream content = response.getEntity().getContent()) {
						if (content!=null) {
							// The body is decoded by getStreamString rather than EntityUtils.toString;
							// the original author reports garbled text with the latter.
							html = getStreamString(content);
						}
					}
				}
			}
		} catch (Exception e) {
			// Best effort: a failed request yields null rather than an exception.
			e.printStackTrace();
		}
		return html;
	}

	/**
	 * POSTs the query date and page number to {@code url} and returns the response body as text.
	 *
	 * <p>The request body is {@code startTime=<date>&pageIndex=<page>} sent as
	 * {@code application/x-www-form-urlencoded}. Any exception raised while sending the request
	 * or reading the response is printed and swallowed; in that case {@code null} is returned.
	 *
	 * @param url  address to POST to
	 * @param date value of the {@code startTime} field; concatenated as-is, not URL-encoded
	 * @param page value of the {@code pageIndex} field
	 * @return the response body when the status is 200 and a body is present, otherwise {@code null}
	 * @throws ClientProtocolException kept in the signature for existing callers
	 * @throws IOException kept in the signature for existing callers
	 */
	public static String getHtmlByUrlData(String url,String date,Integer page) throws ClientProtocolException, IOException{
		String html = null;
		// try-with-resources releases the content stream, the response and the client in that
		// order, including on the failure paths where the original left the response open.
		try (CloseableHttpClient httpClient = HttpClients.createDefault()) {
			HttpPost httppost = new HttpPost(url);
			String query = "startTime="+date+"&pageIndex="+page;
			StringEntity stringEntity = new StringEntity(query,"UTF-8");
			stringEntity.setContentType("application/x-www-form-urlencoded");
			httppost.setEntity(stringEntity);
			try (CloseableHttpResponse response = httpClient.execute(httppost)) {
				int resStatu = response.getStatusLine().getStatusCode();
				// Only a 200 response with a body is treated as usable.
				if (resStatu==HttpStatus.SC_OK && response.getEntity()!=null) {
					try (InputStream content = response.getEntity().getContent()) {
						if (content!=null) {
							// The body is decoded by getStreamString rather than EntityUtils.toString;
							// the original author reports garbled text with the latter.
							html = getStreamString(content);
						}
					}
				}
			}
		} catch (Exception e) {
			// Best effort: a failed request yields null rather than an exception.
			e.printStackTrace();
		}
		return html;
	}
	
	
	/**
	 * Reads an input stream to its end and returns the content decoded as UTF-8 text.
	 *
	 * <p>The stream is read line by line and every line is re-terminated with {@code '\n'},
	 * so the result always ends with a newline when at least one line was read, and
	 * {@code \r\n} line endings come back as {@code \n}. The stream is not closed here.
	 *
	 * @param tInputStream stream to read; may be {@code null}
	 * @return the decoded text, or {@code null} if the stream is {@code null} or reading fails
	 */
	public static String getStreamString(InputStream tInputStream) {
		if (tInputStream == null) {
			return null;
		}
		try {
			BufferedReader reader = new BufferedReader(new InputStreamReader(tInputStream, "utf-8"));
			// StringBuilder: the buffer never leaves this method, so StringBuffer's locking is not needed.
			StringBuilder text = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				text.append(line).append('\n');
			}
			return text.toString();
		} catch (Exception ex) {
			// Best effort: a read failure is reported and mapped to null.
			ex.printStackTrace();
			return null;
		}
	}

循环

代码截图:为什么从 1 开始循环?因为第一行(下标 0)是表格的表头。

页面检查截图

 

读取title

 

 

贴出页面检查截图:startTime 是所选的时间,pageIndex 是当前页码,这两个参数都是通过 POST 表单提交的。

相关推荐
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页