使用 Selenium 抓取页面数据并写入数据库

Java是一门强大的编程语言,有很多库和框架可以用于网页抓取。常用的包括Jsoup、Selenium和HttpClient等。其中,Jsoup适用于抓取静态网页,Selenium适用于抓取动态网页,HttpClient适用于发送HTTP请求。根据实际需求选择合适的工具。

开发环境为内网(局域网),工程类型是 web 项目,JDK 版本为 1.8,Tomcat 版本为 8。

web项目: 启动时使用tomcat, tomcat会先加载web.xml配置文件里内容;
maven项目: 和web项目的区别是,需要的jar包,交给maven维护,不用自己导入jar包;
java项目: 启动时,使用main方法,没有web.xml配置文件;

一、创建项目
在这里插入图片描述
web.xml:web项目的配置文件,随着tomcat启动而加载;
TimerConfig.xml:Spring 定时任务(task 命名空间)配置文件,由 web.xml 中的 contextConfigLocation 指定加载;
proxool.xml: 配置数据库连接池;
log4j.properties:日志文件;
hibernate.cfg.xml: hibernate配置文件;
LoadsRealTimeTask: 定时任务类;

二、环境准备

1、导入selenium所需要的包
在这里插入图片描述
2、导入项目所需要的包
3、安装chromedriver.exe
因为我使用谷歌浏览器(Chrome)打开页面,所以需要把与浏览器版本匹配的 chromedriver.exe 放到 Chrome 的安装目录下;代码中通过系统属性 webdriver.chrome.driver 指向该文件的完整路径。
在这里插入图片描述

三、编写代码

web.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Deployment descriptor of the GDreptile web application (Servlet 3.0 schema). -->
<web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xmlns="http://java.sun.com/xml/ns/javaee" 
xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_3_0.xsd"
 id="WebApp_ID" version="3.0">
 
  <display-name>GDreptile</display-name>
  <welcome-file-list>
    <welcome-file>index.html</welcome-file>
    <welcome-file>index.htm</welcome-file>
    <welcome-file>index.jsp</welcome-file>
    <welcome-file>default.html</welcome-file>
    <welcome-file>default.htm</welcome-file>
    <welcome-file>default.jsp</welcome-file>
  </welcome-file-list>
  
  <!-- Scheduler (timer) configuration file: contextConfigLocation names the Spring
       context file that the ContextLoaderListener declared below is given. -->
  <context-param> 
      <param-name>contextConfigLocation</param-name> 
      <param-value>/WEB-INF/TimerConfig.xml</param-value> 
  </context-param>
  
  <listener> 
      <listener-class> 
          org.springframework.web.context.ContextLoaderListener 
      </listener-class> 
  </listener> 
  
  <!-- Proxool: the ServletConfigurator servlet receives /WEB-INF/proxool.xml through
       its xmlFile init-param and is loaded at startup (load-on-startup = 1). -->
  <servlet>
       <servlet-name>ServletConfigurator</servlet-name>
       <servlet-class>
              org.logicalcobwebs.proxool.configuration.ServletConfigurator
        </servlet-class>
        <init-param>
              <param-name>xmlFile</param-name>
              <param-value>/WEB-INF/proxool.xml</param-value>
         </init-param>
         <load-on-startup>1</load-on-startup>
    </servlet>
    <!-- Admin/monitoring servlet shipped with Proxool; shows the current database
         connections. If it does not run successfully, remove this servlet entry.
         NOTE(review): /admin is mapped without any security-constraint in this
         file; confirm that access is restricted elsewhere. -->
    <servlet>
    	<servlet-name>Admin</servlet-name>
		<servlet-class>org.logicalcobwebs.proxool.admin.servlet.AdminServlet</servlet-class>
    </servlet>
    <servlet-mapping>
    	<servlet-name>Admin</servlet-name>
    	<url-pattern>/admin</url-pattern>
    </servlet-mapping>
</web-app>

TimerConfig.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Spring context that declares the two scheduled crawler tasks.
     NOTE(review): the mvc, context and tx prefixes are declared on the root element
     but no element in this file uses them. -->
<beans xmlns="http://www.springframework.org/schema/beans"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:context="http://www.springframework.org/schema/context"
    xmlns:mvc="http://www.springframework.org/schema/mvc" xmlns:tx="http://www.springframework.org/schema/tx"
    xmlns:task="http://www.springframework.org/schema/task"
    xsi:schemaLocation="http://www.springframework.org/schema/beans
        http://www.springframework.org/schema/beans/spring-beans-3.1.xsd
        http://www.springframework.org/schema/context
        http://www.springframework.org/schema/context/spring-context-3.1.xsd
        http://www.springframework.org/schema/tx
        http://www.springframework.org/schema/tx/spring-tx-3.1.xsd
        http://www.springframework.org/schema/task 
        http://www.springframework.org/schema/task/spring-task-3.1.xsd">

    <!-- Scheduler switch: turns on the task namespace's annotation-driven support. -->
    <task:annotation-driven />
    <!-- Task beans; each one is invoked through its run() method by the schedule below. -->
    <bean id="historyTask" class="com.sgcc.gridDispa.LoadsHistoryTask"></bean>
    <bean id="realTimeTask" class="com.sgcc.gridDispa.LoadsRealTimeTask"></bean>
    
    <task:scheduled-tasks>
        <task:scheduled ref="historyTask" method="run" cron="0 30 7 * * ?" />   <!-- runs once a day at 07:30 --> 
        <task:scheduled ref="realTimeTask" method="run" cron="0 20 0-23 * * ?" />  <!-- runs once an hour, at minute 20 -->
    </task:scheduled-tasks>
    
</beans>  

proxool.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Proxool connection-pool definition; web.xml passes this file to the
     ServletConfigurator servlet.
     NOTE(review): the root element name looks like a placeholder; presumably Proxool
     only looks for the nested <proxool> element. Confirm against the Proxool docs. -->
<something-else-entirely>
 <proxool>  
        <!-- Pool alias; hibernate.cfg.xml refers to it via hibernate.proxool.pool_alias. -->
        <alias>proxoolpool</alias>
       
        <!-- Oracle thin-driver URL; host, service name and credentials are masked with X. -->
		<driver-url>jdbc:oracle:thin:@XX.XX.XX.XX:1521/XXXXX</driver-url> 
        <driver-class>oracle.jdbc.driver.OracleDriver</driver-class> 
        <driver-properties>   
             <property name="user" value="XXXXX" />  
             <property name="password" value="XXXXX" /> 
        </driver-properties> 

        <!-- Pool sizing and house-keeping.
             NOTE(review): house-keeping-sleep-time is presumably in milliseconds (30 s);
             verify units and the exact meaning of each setting in the Proxool
             properties reference. -->
        <maximum-connection-count>200</maximum-connection-count>
        <minimum-connection-count>10</minimum-connection-count>     
        <house-keeping-sleep-time>30000</house-keeping-sleep-time>  
        <maximum-new-connections>10</maximum-new-connections>    
        <prototype-count>5</prototype-count>    
        <!-- Connections are tested before use; the statement below is the test query. -->
        <test-before-use>true</test-before-use>  
        <house-keeping-test-sql>select sysdate from dual</house-keeping-test-sql>  
    </proxool>  

</something-else-entirely>

log4j.properties

# log4j 1.x configuration for the crawler.
#
# Values must not carry trailing whitespace: String-typed options such as
# ConversionPattern are used verbatim, so blanks after "%n" would be written
# at the start of every following log line.

# Root logger: DEBUG level, routed to the "console" and "FILE" appenders.
log4j.rootLogger=DEBUG,console,FILE

# Console appender; its threshold lets only INFO and above through.
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.threshold=INFO
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yyyy-MM-dd HH\:mm\:ss} [%5p] - %c -%F(%L) -%m%n

# Rolling file appender; appends to D:/log/pachong/logs, INFO and above,
# rolling over once the file reaches 10MB.
log4j.appender.FILE=org.apache.log4j.RollingFileAppender
log4j.appender.FILE.Append=true
log4j.appender.FILE.File=D:/log/pachong/logs
log4j.appender.FILE.Threshold=INFO
log4j.appender.FILE.layout=org.apache.log4j.PatternLayout
log4j.appender.FILE.layout.ConversionPattern=%d{yyyy-MM-dd HH\:mm\:ss} [%5p] - %c -%F(%L) -%m%n
log4j.appender.FILE.MaxFileSize=10MB

hibernate.cfg.xml

<!DOCTYPE hibernate-configuration PUBLIC
	"-//Hibernate/Hibernate Configuration DTD 3.0//EN"
	"http://hibernate.sourceforge.net/hibernate-configuration-3.0.dtd">

<hibernate-configuration>
<session-factory>

	<!-- Hibernate's own settings. The dialect is the project class BlobOracleDialect. -->
	<property name="dialect">
		com.sgcc.gridDispa.utils.BlobOracleDialect
	</property>
	<!-- <property name="hbm2ddl.auto">update</property> -->
	<property name="hibernate.jdbc.batch_size">50</property>
	<!-- Connection count setting (disabled) -->
	<!-- <property name="connection.pool_size">60</property> -->
	<property name="show_sql">false</property>
	<property name="format_sql">false</property>
	<!-- "thread": getCurrentSession() is scoped to the calling thread. -->
	<property name="current_session_context_class">thread</property>
	<!-- Release the JDBC connection after each transaction -->
	<property name="connection.release_mode">
		after_transaction
	</property>
	<!-- Close the session automatically once the transaction has been committed -->
	<property name="transaction.auto_close_session">true</property>
	<!-- Statistics collection -->
	<property name="hibernate.generate_statistics">true</property>

	<!-- Proxool connection pool. pool_alias matches the <alias> declared in proxool.xml.
	     NOTE(review): existing_pool=true presumably makes Hibernate reuse a pool that is
	     already registered (web.xml loads proxool.xml through ServletConfigurator);
	     confirm against the Hibernate Proxool provider documentation. -->
	<property name="hibernate.proxool.pool_alias">proxoolpool</property>
	<property name="hibernate.proxool.xml">proxool.xml</property>
	<property name="hibernate.connection.provider_class">
		org.hibernate.connection.ProxoolConnectionProvider
	</property>
	<property name="hibernate.proxool.existing_pool">true</property>

	<!-- Registration of the mapping files -->
	<mapping resource="com/sgcc/gridDispa/po/LoadsHistory.hbm.xml" />
	<mapping resource="com/sgcc/gridDispa/po/LoadsToday.hbm.xml" />
	<mapping resource="com/sgcc/gridDispa/po/TgridLoads.hbm.xml" />
    <mapping resource="com/sgcc/gridDispa/po/LoadsRealTime.hbm.xml" />

</session-factory>
</hibernate-configuration>    

LoadsRealTimeTask

package com.sgcc.gridDispa;

import java.util.TimerTask;

import com.sgcc.gridDispa.impl.LoadsHistoryImpl;
import com.sgcc.gridDispa.impl.LoadsRealTimeImpl;

/**
 * Scheduled entry point for the real-time load crawl.
 *
 * <p>TimerConfig.xml registers this class as the {@code realTimeTask} bean and
 * schedules its {@link #run()} method. Every invocation hands the shared
 * {@link LoadsRealTimeImpl} instance to a freshly created thread, so the
 * scheduler call returns as soon as that thread has been started.
 */
public class LoadsRealTimeTask extends TimerTask {

	/** Crawl job that is reused by every scheduled run. */
	LoadsRealTimeImpl tsk = new LoadsRealTimeImpl();

	/**
	 * Starts the crawl on a new thread; any exception raised while starting
	 * the thread is printed and otherwise ignored.
	 */
	@Override
	public void run() {
		try {
			new Thread(tsk).start();
		} catch (Exception startFailure) {
			startFailure.printStackTrace();
		}
	}
}

utils文件夹下
在这里插入图片描述
WebDriverUtil.java

package com.sgcc.gridDispa.utils;

import java.util.concurrent.TimeUnit;

import org.openqa.selenium.Dimension;
import org.openqa.selenium.Platform;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.chrome.ChromeDriver;
import org.openqa.selenium.chrome.ChromeDriverService;
import org.openqa.selenium.remote.CapabilityType;

/**
 * Factory helper that creates the Selenium WebDriver used by the crawler.
 *
 * @Description:crawler
 * @Author: old
 * @CreateTime:2017-11-15 :15:16:16
 */
public class WebDriverUtil {

    /**
     * Creates a Chrome WebDriver with a 1200-second page-load timeout and a
     * 1024x768 window.
     *
     * <p>The {@code webdriver.chrome.driver} system property is set to
     * {@code path} before the driver is started. If configuring the started
     * driver fails, the driver is quit before the failure is rethrown so that
     * no browser process is left behind.
     *
     * @param path location of the chromedriver executable; must not be
     *             {@code null}, empty or blank
     * @return a started and configured driver; the caller is responsible for
     *         calling {@code quit()} on it
     * @throws Exception if {@code path} is missing, or if the driver cannot be
     *                   started or configured
     */
    public static WebDriver createChromeWebDriver(String path) throws Exception {
        if (path == null || path.trim().isEmpty()) {
            throw new Exception("配置错误, 没有配置:chrome path");
        }
        System.setProperty("webdriver.chrome.driver", path);
        WebDriver webDriver = new ChromeDriver();
        try {
            webDriver.manage().timeouts().pageLoadTimeout(1200, TimeUnit.SECONDS);
            webDriver.manage().window().setSize(new Dimension(1024, 768));
        } catch (RuntimeException configFailure) {
            // The browser is already running here; shut it down so a failed
            // set-up does not leak a chromedriver/Chrome process.
            try {
                webDriver.quit();
            } catch (RuntimeException ignored) {
                // Best effort: the original configuration failure is the one to report.
            }
            throw configFailure;
        }
        return webDriver;
    }

}

LogWriter.java

package com.sgcc.gridDispa.utils;

import org.apache.log4j.Logger;


/**
 * Static convenience wrapper around a single log4j {@link Logger}, plus a
 * helper that renders a throwable's stack trace as text.
 */
public class LogWriter {
	private static Logger logger  = Logger.getLogger(LogWriter.class);

	/** Logs {@code obj} at ERROR level. */
	public static void error(Object obj){
		logger.error(obj);
	}

	/** Logs {@code message} at ERROR level together with the throwable {@code obj}. */
	public static void error(Object message,Throwable obj){
		logger.error(message,obj);
	}

	/** Logs {@code obj} at INFO level. */
	public static void info(Object obj){
		logger.info(obj);
	}

	/**
	 * Renders a throwable as text: its {@code toString()} on the first line,
	 * followed by one tab-indented "at ..." line per stack frame. Each line is
	 * terminated with the platform line separator. Causes are not included.
	 *
	 * @param e the throwable to render
	 * @return the rendered text
	 */
	public static String getError(Throwable e){
		String newline = System.getProperty("line.separator");
		StringBuilder text = new StringBuilder();
		text.append(e.toString()).append(newline);
		for (StackTraceElement frame : e.getStackTrace()) {
			text.append("\tat ").append(frame).append(newline);
		}
		return text.toString();
	}
}

JDBCUtil.java

package com.sgcc.gridDispa.utils;

import org.hibernate.Session;
import org.hibernate.SessionFactory;
import org.hibernate.cfg.Configuration;
import org.hibernate.stat.SessionStatistics;
import org.hibernate.stat.Statistics;
/**
 * 获得hibernate session对象
 * @author kuang
 *
 */
public final class JDBCUtil {
	private static SessionFactory sessionFactory ;
	private JDBCUtil(){}
	static{
		try{
			sessionFactory = new Configuration().configure("hibernate.cfg.xml").buildSessionFactory();
		
		}catch(Exception e){
			e.printStackTrace();
			LogWriter.error(e);
		}
	}
	public static Session getThreadSession(){
		return sessionFactory.getCurrentSession();
	}
	public static Session noOpen(){
		return getThreadSession();
	}
	public static  Session open(){
		getThreadSession().beginTransaction();
		return getThreadSession();
	}
	public static void commit(){
		getThreadSession().getTransaction().commit();
		getThreadSession().close();
	}

	public static void close(){
		if(getThreadSession()!=null)
			getThreadSession().close();
	}
	
	public static void getStatistics(){
		SessionStatistics ss = getThreadSession().getStatistics();
		LogWriter.info("SessionStatistics:"+ss);
		Statistics st = sessionFactory.getStatistics();
		LogWriter.info("Statistics:"+st);
	}
	public static void rollback(){
		getThreadSession().getTransaction().rollback();
	}
}

BlobOracleDialect.java

package com.sgcc.gridDispa.utils;

import java.sql.Types;
import org.hibernate.Hibernate;
import org.hibernate.dialect.OracleDialect;



/**
 * {@link OracleDialect} variant that additionally registers the JDBC type
 * {@link Types#LONGVARBINARY} under the name of Hibernate's BLOB type.
 * hibernate.cfg.xml selects this class through its {@code dialect} property.
 */
public class BlobOracleDialect extends OracleDialect {

	/** Builds the standard Oracle dialect, then adds the LONGVARBINARY registration. */
	public BlobOracleDialect(){
		// The superclass constructor is invoked implicitly before this body runs.
		String blobTypeName = Hibernate.BLOB.getName();
		registerHibernateType(Types.LONGVARBINARY, blobTypeName);
	}
}

BasicDaoImpl.java

package com.sgcc.gridDispa.utils;

import java.util.List;

import org.hibernate.HibernateException;
import org.hibernate.Query;
import org.hibernate.SQLQuery;
import org.hibernate.Session;
import org.hibernate.transform.Transformers;



//obj的属性名
public class BasicDaoImpl<T>  {
	
	/*
	 * 保存实体对象
	 */
	public void saveOrUpdate(T t) {
		try {
			JDBCUtil.open().save(t);
			JDBCUtil.commit();
		} catch (HibernateException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}finally{
			JDBCUtil.close();
		}
	}
	
	/*
	 * 查询sql,返回list
	 */
	public List queryListBySql(String sql){
		try {
			Session session =JDBCUtil.open();
			SQLQuery sqlQuery = session.createSQLQuery(sql);
			List result = sqlQuery.list();
			JDBCUtil.commit();
			return result;
		} catch (HibernateException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}finally{
			JDBCUtil.close();
		}
		return null;
	}
	
	/*
	 * 查询sql,返回list Map
	 */
	public List queryListMapBySql(String sql){
		try {
			Session session =JDBCUtil.open();
			SQLQuery sqlQuery = session.createSQLQuery(sql);
			Query query =sqlQuery.setResultTransformer(Transformers.ALIAS_TO_ENTITY_MAP);
			List result = sqlQuery.list();
			JDBCUtil.commit();
			return result;
		} catch (HibernateException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}finally{
			JDBCUtil.close();
		}
		return null;
	}
	
	
	/**
	 * 对获取到的气象数据进行过滤,对无效、null进行处理
	 * @return 过滤后的数据
	 */
	protected  String filterMothed(String object) {
		if(object.contains("9999")){
			return "";
		}
		
		if(object.equals("null")){
			return "";
		}
		
		if(object==null){
			return "";
		}
		return object.trim();
	}
}

impl文件夹下:

LoadsRealTimeImpl.java

package com.sgcc.gridDispa.impl;

import java.math.BigDecimal;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.Map;

import org.apache.log4j.Logger;
import org.openqa.selenium.By;
import org.openqa.selenium.WebDriver;
import org.openqa.selenium.WebElement;

import com.sgcc.gridDispa.po.LoadsHistory;
import com.sgcc.gridDispa.po.LoadsRealTime;
import com.sgcc.gridDispa.utils.BasicDaoImpl;
import com.sgcc.gridDispa.utils.WebDriverUtil;

/**
 * Crawl job that opens the real-time load page in Chrome, reads the text of
 * the page body, and stores one {@link LoadsRealTime} record per data line.
 *
 * <p>Only body lines {@value #FIRST_DATA_LINE} to {@value #LAST_DATA_LINE}
 * (zero-based, after splitting the body text on newlines) are treated as data.
 * Each line is split into columns on runs of two or more whitespace characters;
 * column 1 is the grid/company name, column 2 the load value (divided by 10 and
 * rounded before storing), and column 3 contains a quoted timestamp.
 * NOTE(review): the line range and column layout are tied to the current page
 * markup; verify against the live page.
 */
public class LoadsRealTimeImpl  extends BasicDaoImpl implements Runnable{

	/** Location of the chromedriver executable on the crawler host. */
	private static final String CHROME_DRIVER_PATH = "D:\\chrome\\Chrome\\Application\\chromedriver.exe";
	/** Page that is scraped. */
	private static final String SOURCE_URL = "http://10.19.13.50:8080//MWWebSite//PROJECT-HOME//exchange//YYJC//AJBZHDPSJ.jsp";
	/** Fixed wait after navigation before the body text is read. */
	private static final long PAGE_SETTLE_MILLIS = 3000L;
	/** First body line (zero-based) that holds a data row. */
	private static final int FIRST_DATA_LINE = 49;
	/** Last body line (zero-based) that holds a data row. */
	private static final int LAST_DATA_LINE = 85;
	/** Format of the timestamp embedded in column 3. */
	private static final String TIMESTAMP_PATTERN = "yyyy-MM-dd HH:mm:ss";

	private Logger logger=Logger.getLogger(LoadsRealTimeImpl.class);

	/** Thread entry point: logs a banner and runs one crawl. */
	@Override
	public void run() {
		logger.info("=======各省实时负荷数据【定时任务】===============");
		saveAllRealTimeLoad();
	}

	/**
	 * Writes the real-time load data of every grid/province to the database.
	 *
	 * <p>A row that cannot be parsed is logged and skipped so that the remaining
	 * rows are still stored. Failures outside row handling are logged. The
	 * browser is always quit before returning.
	 */
	public synchronized void saveAllRealTimeLoad(){
		WebDriver webDriver = null;
		try {
			webDriver = WebDriverUtil.createChromeWebDriver(CHROME_DRIVER_PATH);
			webDriver.get(SOURCE_URL);
			Thread.sleep(PAGE_SETTLE_MILLIS);
			logger.info(webDriver.getTitle());
			if (logger.isDebugEnabled()) {
				logger.debug(webDriver.getPageSource());
			}

			WebElement webBody = webDriver.findElement(By.xpath("//body"));
			String[] bodyLines = webBody.getText().split("\n");
			SimpleDateFormat timestampFormat = new SimpleDateFormat(TIMESTAMP_PATTERN);
			int lastLine = Math.min(LAST_DATA_LINE, bodyLines.length - 1);
			for (int i = FIRST_DATA_LINE; i <= lastLine; i++) {
				try {
					saveRow(bodyLines[i], timestampFormat);
				} catch (Exception rowFailure) {
					// One malformed row must not prevent the other rows from being stored.
					logger.error("Skipping load row " + i + " that could not be processed: " + bodyLines[i], rowFailure);
				}
			}
		} catch (InterruptedException e) {
			// Preserve the interrupt for whoever owns this thread.
			Thread.currentThread().interrupt();
			logger.error("Real-time load crawl was interrupted", e);
		} catch (Exception e) {
			logger.error("Real-time load crawl failed", e);
		} finally {
			if (webDriver != null) {
				webDriver.quit();
			}
		}
	}

	/** Parses one data line and stores it as a LoadsRealTime record. */
	private void saveRow(String line, SimpleDateFormat timestampFormat) throws java.text.ParseException {
		String[] columns = line.replaceAll("\\s{2,}", ",").trim().split(",");
		if (columns.length < 4) {
			throw new IllegalArgumentException("expected at least 4 columns but found " + columns.length);
		}

		String gridName = columns[1];
		LoadsRealTime loadRealTime = new LoadsRealTime();
		loadRealTime.setDeptName(gridName); // grid name

		// Resolve the id of the regional grid or provincial company.
		List<Map> matches = isRegionalGrid(gridName)
				? queryAreaIdByname(gridName)
				: queryCompanyIdByname(gridName);
		if (matches != null) { // null means the lookup query itself failed
			for (Map match : matches) {
				// When several rows match, the last one wins.
				loadRealTime.setDeptId(match.get("COMPANY_ID").toString());
			}
		}

		float realtimeLoad = Float.parseFloat(columns[2]) / 10; // real-time load
		loadRealTime.setRealtimeLoad(Math.round(realtimeLoad) + "");

		// The timestamp is the 19 characters that follow the first single quote.
		int quote = columns[3].indexOf("'");
		String rkTime = columns[3].substring(quote + 1, quote + 20);
		loadRealTime.setRkTie(timestampFormat.parse(rkTime));
		this.saveOrUpdate(loadRealTime);
	}

	/** Tells whether the name is one of the six regional grids, which are looked up by area. */
	private static boolean isRegionalGrid(String name) {
		switch (name) {
			case "华北电网":
			case "华东电网":
			case "华中电网":
			case "东北电网":
			case "西北电网":
			case "西南电网":
				return true;
			default:
				return false;
		}
	}

	/** Doubles single quotes so that scraped text cannot terminate the SQL string literal it is embedded in. */
	private static String escapeSqlLiteral(String value) {
		return value.replace("'", "''");
	}

	/**
	 * Looks up company ids by provincial company name (rows with dwjb = '3').
	 *
	 * @param companyName name fragment to match; when null or empty no name filter is applied
	 * @return list of maps holding COMPANY_ID, or {@code null} when the query failed
	 */
	private List queryCompanyIdByname(String companyName){
		String sql= "select y.company_id from t_company y  where  y.dwjb='3' ";
		if(companyName != null && !companyName.equals("")){
			sql +=  "and y.company_name  like '%" + escapeSqlLiteral(companyName) + "%' ";
		}
		return this.queryListMapBySql(sql);
	}

	/**
	 * Looks up branch ids by regional grid name (rows with dwjb = '2'); the
	 * "电网" suffix is removed from the name before matching.
	 *
	 * @param companyName grid name to match; when null or empty no name filter is applied
	 * @return list of maps holding COMPANY_ID, or {@code null} when the query failed
	 */
	private List queryAreaIdByname(String companyName){
		String sql= "select y.company_id from t_company y  where  y.dwjb='2' ";
		if(companyName != null && !companyName.equals("")){
			companyName = companyName.replace("电网", "");
			sql +=  "and y.company_name  like '%" + escapeSqlLiteral(companyName) + "%' ";
		}
		return this.queryListMapBySql(sql);
	}
}

四、页面中数据

在这里插入图片描述

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

大数据同盟会

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值