最近由于公司业务需要,需要从某网站爬取数据,在正则表达式上费了一番功夫,其他地方还算OK,这篇文章从网站爬取数据开始(分页的),到用jdbc插入MySQL数据库,然后从MySQL数据库中导出数据,一条龙!!!话不多说,直接上代码:
package com;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CrawlInfoUtil {
publicstatic void main(String[] args) throws Exception{
//1、趴取网页上信息,拼装List
for(int i =1; i < 50; i++){
StringBuffersb = newStringBuffer("http://www.chinaplasonline.com/ExhibitorList16/lang-simp/page-");
sb.append(i).append("/src-12/s-cps11/od-0/hall-/asc-1/ChemicalZone.aspx");
System.out.println("-----------第"+i+"页开始-----------");
System.out.println("第"+i+"页URL:"+sb.toString());
Stringcontent = httpRequest(sb.toString());
List list =htmlFilter(content);
//2、连接数据库
Connectionconn = connectDataBase();
//3、将拼装的数据存入数据库中
insertData2DataBase(list, conn);
}
}
privatestatic String httpRequest(String requestUrl) {
StringBufferbuffer = null;
BufferedReader bufferedReader = null;
InputStreamReader inputStreamReader = null;
InputStreaminputStream = null;
HttpURLConnection httpUrlConn = null;
try {
//建立get请求
URL url =new URL(requestUrl);
httpUrlConn= (HttpURLConnection) url.openConnection();
httpUrlConn.setDoInput(true);
httpUrlConn.setRequestMethod("GET");
//获取输入流
inputStream= httpUrlConn.getInputStream();
inputStreamReader = new InputStreamReader(inputStream,"utf-8");
bufferedReader = new BufferedReader(inputStreamReader);
//从输入流读取结果
buffer = newStringBuffer();
String str =null;
while ((str= bufferedReader.readLine()) != null) {
buffer.append(str);
}
} catch(Exception e) {
e.printStackTrace();
} finally{
//释放资源
if(bufferedReader != null) {
try {
bufferedReader.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(inputStreamReader != null) {
try {
inputStreamReader.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(inputStream != null) {
try {
inputStream.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(httpUrlConn != null) {
httpUrlConn.disconnect();
}
}
returnbuffer.toString();
}
// NOTE(review): this method is TRUNCATED in the paste — it cuts off
// mid-statement below, and the Pattern literals have visibly lost their
// HTML tags to markup stripping (e.g. "()(.*?)()" was presumably
// something like "(<tr...>)(.*?)(</tr>)" — recover from the original
// source before using). Code left byte-identical; comments only.
privatestatic List htmlFilter(String html){
List list =new ArrayList();
System.out.println("----------------开始解析-----------------");
// Row-matching regex; group(2) below is the inner content of each match.
// TODO confirm the intended pattern — the literal here is gutted.
Pattern p =Pattern.compile("()(.*?)()");
Matcher m =p.matcher(html);
int i =0;
while(m.find()){
// EnterprisInfoVo is a project value object declared elsewhere.
EnterprisInfoVo vo = new EnterprisInfoVo();
StringinnerTR = m.group(2);
// Extract the exhibitor name and URL
// (link that opens the exhibitor's detail page)
PatterntdPattern = Pattern.compile("(.*)(
package com;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class CrawlInfoUtil {
publicstatic void main(String[] args) throws Exception{
//1、趴取网页上信息,拼装List
for(int i =1; i < 50; i++){
StringBuffersb = newStringBuffer("http://www.chinaplasonline.com/ExhibitorList16/lang-simp/page-");
sb.append(i).append("/src-12/s-cps11/od-0/hall-/asc-1/ChemicalZone.aspx");
System.out.println("-----------第"+i+"页开始-----------");
System.out.println("第"+i+"页URL:"+sb.toString());
Stringcontent = httpRequest(sb.toString());
List list =htmlFilter(content);
//2、连接数据库
Connectionconn = connectDataBase();
//3、将拼装的数据存入数据库中
insertData2DataBase(list, conn);
}
}
privatestatic String httpRequest(String requestUrl) {
StringBufferbuffer = null;
BufferedReader bufferedReader = null;
InputStreamReader inputStreamReader = null;
InputStreaminputStream = null;
HttpURLConnection httpUrlConn = null;
try {
//建立get请求
URL url =new URL(requestUrl);
httpUrlConn= (HttpURLConnection) url.openConnection();
httpUrlConn.setDoInput(true);
httpUrlConn.setRequestMethod("GET");
//获取输入流
inputStream= httpUrlConn.getInputStream();
inputStreamReader = new InputStreamReader(inputStream,"utf-8");
bufferedReader = new BufferedReader(inputStreamReader);
//从输入流读取结果
buffer = newStringBuffer();
String str =null;
while ((str= bufferedReader.readLine()) != null) {
buffer.append(str);
}
} catch(Exception e) {
e.printStackTrace();
} finally{
//释放资源
if(bufferedReader != null) {
try {
bufferedReader.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(inputStreamReader != null) {
try {
inputStreamReader.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(inputStream != null) {
try {
inputStream.close();
} catch(IOException e) {
e.printStackTrace();
}
}
if(httpUrlConn != null) {
httpUrlConn.disconnect();
}
}
returnbuffer.toString();
}
// NOTE(review): this method is TRUNCATED in the paste — it cuts off
// mid-statement below, and the Pattern literals have visibly lost their
// HTML tags to markup stripping (e.g. "()(.*?)()" was presumably
// something like "(<tr...>)(.*?)(</tr>)" — recover from the original
// source before using). Code left byte-identical; comments only.
privatestatic List htmlFilter(String html){
List list =new ArrayList();
System.out.println("----------------开始解析-----------------");
// Row-matching regex; group(2) below is the inner content of each match.
// TODO confirm the intended pattern — the literal here is gutted.
Pattern p =Pattern.compile("()(.*?)()");
Matcher m =p.matcher(html);
int i =0;
while(m.find()){
// EnterprisInfoVo is a project value object declared elsewhere.
EnterprisInfoVo vo = new EnterprisInfoVo();
StringinnerTR = m.group(2);
// Extract the exhibitor name and URL
// (link that opens the exhibitor's detail page)
PatterntdPattern = Pattern.compile("(.*)(