如何使用JDBC快速处理大数据

最新推荐文章于 2023-02-22 16:30:29 发布

iteye_12675

最新推荐文章于 2023-02-22 16:30:29 发布

阅读量288

点赞数

文章标签：大数据数据库

在实习工作中，要处理一张存有204万记录的表，由于记录是从互联网上取到的，所以里面有一些不太理想的词，比如一些词里混有特殊字符，标点符号，或者有些词根本就是标点符号等等。我写这个程序的目的就是把这些不理想的词找出来，能修改的就修改，没有必要修改的就直接删除。

for(int i=0;i<205;i++)
			{
				String sql="select * from cat_keyword order by id limit "+i*10001+",10000";			
				String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000";
				rs=stmt.executeQuery(best);
				go.filt(rs);
			}

经试验，第一条sql语句的执行效率是明显不如第二条的。

String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000";

这条SQL语句尤其适用于offset特别大的情景，因为200多万条记录，到中后期，这个offset是很大的，使用未经优化的SQL语句，到后期是越来越慢。

另外limit的取值也很有考究，我试过1000,10000,70000,100000的，最后发现limit取10000的时候速度是最快的，这个跟数据的大小，电脑以及数据库内存的分配都有一定的关系，大家可以根据具体情况具体讨论。我这里仅仅提一下需要注意，可以优化的point。

我真正想说的是我想的另外一种解决方案，运用JDBC

下面给大家说下我这个程序的构造，首先是一个实现将需要删除和修改的词插入另一张表的方法

//将找到的违规词插入cat_garbage表里
   public void insert(ResultSet rs) throws SQLException
   {
	   Connection conn=null;
		Statement stmt=null;
	   try{			
			conn=Good.getConnection();
				stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE
	 				, ResultSet.CONCUR_UPDATABLE);
				String query="select * from cat_garbage where id='"+rs.getInt("id")+"'";
				ResultSet rst=stmt.executeQuery(query);
				if(!rst.next())
				{
					String sql1="insert into cat_garbage(id,cid,name,keyword) values('"+rs.getInt("id")+"','"+rs.getInt("cid")+"','"+rs.getString("name")+"','"+rs.getString("keyword")+"')";
					stmt.executeUpdate(sql1);
				}
		}
		catch(SQLException|ClassNotFoundException|IOException e)
		{
			e.printStackTrace();
		}
		finally
		{
			if(stmt!= null) 
				stmt.close(); 		
			if(conn!= null) 
				conn.close(); 
		}
   }

然后是一个将可以修复的词修复的方法

public void modify(ResultSet rs,String str,String reg) throws SQLException
   {
	  
	   Connection conn=null;
	   Statement stmt=null;
	   Statement stmt1=null;
	   try{					
		    conn=Good.getConnection();
			stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE
	 				, ResultSet.CONCUR_UPDATABLE);
			stmt1= conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE
	 				, ResultSet.CONCUR_UPDATABLE);
			String ok=null;
			if(str.indexOf(reg)==str.length()-1)
			{
				 ok=str.substring(0,str.indexOf(reg));
				 System.out.println(ok);
				 System.out.println(ok);
				 System.out.println(1);
			}
			else if(str.indexOf(reg)==0)
			{
				 ok=str.substring(1);
				 while(ok.indexOf(reg)!=-1)
					  ok=ok.substring(1);
				 System.out.println(2);
			}
			else
			{
				System.out.println(3);
				System.out.println(str);
				ok=str.substring(0,str.indexOf(reg));
				
			}
			System.out.println(ok);
			new Good().insert(rs);
			//String query="select * from cat_keyword where cid='"+rs.getInt("cid")+"'and keyword='"+ok+"'";
			String sql="update cat_garbage1 set new='"+ok+"' where id='"+rs.getInt(1)+"'";
			//String sql1="update cat_keyword set keyword='"+ok+"' where id='"+rs.getInt("id")+"'";
			stmt.executeUpdate(sql);
	   }
		
		catch(SQLException|ClassNotFoundException|IOException e)
		{
			e.printStackTrace();
		}
		finally
		{
			if(stmt!= null) 
				stmt.close(); 	
			if(stmt1!= null) 
				stmt1.close(); 
			if(conn!= null) 
				conn.close(); 
		}
   }

最后就是一个核心的取词过滤方法，里面有很多正则表达式实现的取词规则。

public void filt(ResultSet rs)
	{		
	 try
	  {
		String str;		
		double a=0;double b=0;double c=0;double d=0;
		int num=0;
		Good good=new Good();
		while(rs.next())
	   {	
		str=rs.getString("keyword");
		if(str.equals(",")|str.equals(".")|str.equals(":")|str.equals("-")|str.equals("=")|str.equals("+")|str.equals("*")|str.equals("/")|str.equals("?")|str.equals("|")|str.equals("\\")|str.equals("%")|str.equals("#")|str.equals("!")||str.equals("~")||str.equals("、"))
		{
			good.insert(rs);
			a++;
		}
		//字符串后面带逗号
		if(rule(str,"^\\S+[,]$").matches())
		{
			good.modify(rs,str,",");
			num++;
		}
		//字符串后面带句号
		if(rule(str,"^\\S+[.]$").matches())
		{
			good.modify(rs,str,".");
			num++;
		}
		//字符串前面带句号
		if(rule(str,"^[.]\\S+$").matches())
		{
			System.out.println(123);
			if(str.indexOf(".net")==-1)
			{
				if(str.indexOf(".com")==-1)
				{
					if(str.indexOf(".cn")==-1)
					{
						if(str.indexOf(".org")==-1)
						{
							good.modify(rs, str, ".");
							num++;
						}
					}
				}
			}
		}
		//字符串后面带-
		if(rule(str,"^\\S+[-]$").matches())
		{
			good.modify(rs, str, "-");
			num++;
		}
		//字符串前面带-
		if(rule(str,"^[-]\\S+$").matches())
		{
			good.modify(rs, str, "-");
			num++;
		}
		//字符串前面带冒号
		if(rule(str,"^[:]\\S+$").matches())
		{
			good.modify(rs, str, ":");
			num++;
		}
		//字符串后面带冒号
		if(rule(str,"^\\S+[:]$").matches())
		{
			good.modify(rs, str, ":");
			num++;
		}
		//字符串后面带/
		if(rule(str,"^\\S+[/]$").matches())
		{
			good.modify(rs, str, "/");
			num++;
		}
		//字符串前面带/
		if(rule(str,"^[/]\\S+$").matches())
		{
			good.modify(rs, str, "/");
			num++;
		}
		//字符串后面带？
		if(rule(str,"^\\S+[?]$").matches())
		{
			good.modify(rs, str, "?");
			num++;
		}
		//字符串前面带？
		if(rule(str,"^[?]\\S+$").matches())
		{
			good.modify(rs, str, "?");
			num++;
		}
		//字符串前面带|
		if(rule(str,"^[|]\\S+$").matches())
		{
			good.modify(rs, str, "|");
			num++;
		}
		//字符串后面带|
		if(rule(str,"^\\S+[|]$").matches())
		{
			good.modify(rs, str, "|");
			num++;
		}
		//字符串前面带(
		if(rule(str,"^[(][\\S\\s]+$").matches())
		{
			if(rule(str,"^[(]\\S+[)]\\S*$").matches())
			{
				str=str.replaceAll("\\)", " ");
				good.modify(rs, str, "(");
				num++;
			}
			else
			{
				good.modify(rs, str, "(");
				num++;
			}
		}
		//字符串前面带[
		if(rule(str,"^[\\[][\\S\\s]+$").matches())
		{
			if(rule(str,"^[\\[]\\S+[\\]]\\S*$").matches())
			{
				str=str.replaceAll("\\]", " ");
				good.modify(rs, str, "[");
				num++;
			}
			else
			{
				good.modify(rs, str, "[");
				num++;
			}
		}
		//字符串后面带)或者里面带()
		if(rule(str,"^\\S+\\s*\\S+[)]$").matches())
		{
			System.out.println("qwe");
			if(rule(str,"^\\S+[(]\\S+[)]$").matches())
			{
				str=str.replaceAll("\\(", " ");
				good.modify(rs, str, ")");
				num++;
			}
			else if(rule(str,"^\\S+\\s*[(]\\S+[)]$").matches())
			{
				str=str.replaceAll("\\(", "");
				good.modify(rs, str, ")");
				num++;
			}
			else
				{
					good.modify(rs, str, ")");
					num++;
				}
		}
		//字符串后面带]或者里面带[]
				if(rule(str,"^\\S+\\s*\\S+[\\]]$").matches())
				{
					System.out.println("qwe");
					if(rule(str,"^\\S+[\\[]\\S+[\\]]$").matches())
					{
						str=str.replaceAll("\\[", " ");
						good.modify(rs, str, "]");
						num++;
					}
					else if(rule(str,"^\\S+\\s*[\\[]\\S+[\\]]$").matches())
					{
						str=str.replaceAll("\\[", "");
						good.modify(rs, str, "]");
						num++;
					}
					else
						{
							good.modify(rs, str, "]");
							num++;
						}
				}
		//字符串内包含书名号
		if(rule(str,"^[\\s\\S]*[《][\\s\\S]+[》]").matches())
		{
			str=str.replaceAll("》", " ");
			good.modify(rs, str, "《");
			num++;
		}
		//字符串内包含中文（）
		if(rule(str,"^[\\s\\S]*[（][\\s\\S]+[）]").matches())
		{
			str=str.replaceAll("（", " ");
			good.modify(rs, str, "）");
			num++;
		}
		//纯数字
		if(rule(str,"^[1-9]*|0$").matches())
		{
			good.insert(rs);
			b++;
		}
		//单字母
		if(rule(str,"^[a-zA-Z]$").matches())
		{
			int flag=0;
			String shit=rs.getString(3);
			for(int i=0;i<shit.length();i++)
				{
					if(str.charAt(0)!=shit.charAt(i))				
					flag++;							
					if(flag==shit.length())
					{
						good.insert(rs);
						c++;						
					}				
				}
		}
		//单汉字
		if(rule(str,"^[\u4e00-\u9fa5]$").matches())
		{
			int flag=0;
			String shit=rs.getString(3);
			for(int i=0;i<shit.length();i++)
			{				
				if(str.charAt(0)!=shit.charAt(i))										
				flag++;													
				if(flag==shit.length())
				{
					good.insert(rs);
					d++;						
				}									
			}					
		}
	  }
		System.out.println("单标点百分比："+100*a/2046036+"%");
		System.out.println("纯数字百分比："+100*b/2046036+"%");
		System.out.println("单字母百分比："+100*c/2046036+"%");
		System.out.println("单汉字百分比："+100*d/2046036+"%");
		System.out.println(num);
		}
		catch(SQLException e)
		{
			e.printStackTrace();
		}	
	 	
	}

下面是程序的主方法

public static void main(String[] args) throws SQLException 
	{
		Connection conn=null;
		Statement stmt=null;
		try{
			long a=System.currentTimeMillis();
			ResultSet rs;
			conn=Good.getConnection();
			stmt = conn.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE
     				, ResultSet.CONCUR_UPDATABLE);
			Good go=new Good();
			/*for(int i=0;i<205;i++)
			{
				//String sql="select * from cat_keyword order by id limit "+i*10001+",10000";			
				String best="select * from cat_keyword where id>=(select id from cat_keyword order by id limit "+i*10001+",1)limit 10000";
				rs=stmt.executeQuery(best);
				go.filt(rs);
			}*/
			String sql="select * from cat_keyword";
			rs=stmt.executeQuery(sql);
			go.filt(rs);
			long b=System.currentTimeMillis();
			System.out.println("耗时： "+(b-a)/60000+"分钟");
			}
		catch(ClassNotFoundException|SQLException|IOException e)
		{
			e.printStackTrace();
		}
		finally
		{
			if(stmt!= null) 
				stmt.close(); 		
			if(conn!= null) 
				conn.close(); 
		}
	}

用一个SQL语句，将整个表的数据一次性作为参数传给负责取词过滤的filt()方法，然后利用ResultSet rs这个指针依次向下对每个词进行处理，这个方案运行下来处理完全表204万记录只需11分钟，而是用优化之后的limit语句获得的结果集再传给filt()方法那个方案，却需要26分钟。可见使用JDBC，直接传入所有数据的结果集，比使用limit多次限量导入数据要快的多，而且不止是一倍的关系。前者11分钟完胜后者的26分钟。

iteye_12675

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
如何使用JDBC快速处理大数据

在实习工作中，要处理一张存有204万记录的表，由于记录是从互联网上取到的，所以里面有一些不太理想的词，比如一些词里混有特殊字符，标点符号，或者有些词根本就是标点符号等等。我写这个程序的目的就是把这些不理想的词找出来，能修改的就修改，没有必要修改的就直接删除。for(int i=0;i&lt;205;i++) { String sql="select * from cat_keywor...
复制链接

扫一扫