今天因工作需要,做了一个从 www.chemicalbook.com 上抓取数据的任务。
步骤为:1、先弄清目标网站的 URL 规则;2、用正则表达式编写抓取规则;3、把抓取结果保存到同一个 Excel 文件中。
代码如下:
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
/**
 * Crawls the supplier list pages of www.chemicalbook.com, extracts the
 * company rows whose country column equals "中国" and stores them in a
 * single Excel (.xls) workbook.
 *
 * <p>Each table row on a page is matched with a regular expression after all
 * whitespace has been removed from the HTML. Note that this also removes
 * spaces inside the extracted values (for example inside company names).
 */
public class GetChemicalbookSiteCompanyUtil {

    /** Common prefix of every supplier list page URL. */
    private static final String BASE_URL = "http://www.chemicalbook.com/ProductSupplierlist_";

    /** Location of the generated workbook. */
    private static final String OUTPUT_PATH = "F:\\材料\\chemicalbook_DBinfo.xls";

    /**
     * Matches one supplier table row in whitespace-stripped HTML (hence
     * "tdwidth" without a space). Groups: 1 = company name, 2 = telephone,
     * 3 = e-mail, 4 = website, 5 = country.
     */
    private static final Pattern ROW_PATTERN = Pattern.compile(
            "<tr><tdwidth=\"230\">(.+?)</td>"
            + "<tdwidth=\"230\">(.+?)</td>"
            + "<tdwidth=\"210\">(.+?)</td>"
            + "<tdwidth=\"200\">(.+?)</td>"
            + "<tdwidth=\"80\">(.+?)</td></tr>");

    /** Opening and closing anchor tags wrapped around a cell value. */
    private static final Pattern ANCHOR_TAGS = Pattern.compile("<a.*?>|</a>");

    /** Opening and closing span tags wrapped around a cell value. */
    private static final Pattern SPAN_TAGS = Pattern.compile("<span.*?>|</span>");

    /** Extracts the charset parameter from a Content-Type header value. */
    private static final Pattern CHARSET_PARAM =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    /**
     * Entry point: downloads every list page, collects the matching companies
     * and writes the workbook once at the end.
     *
     * <p>A page that cannot be downloaded is reported on stderr and skipped so
     * that one network error does not discard the rest of the crawl.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        HSSFWorkbook workbook = new HSSFWorkbook();
        HSSFSheet sheet = workbook.createSheet("公司基本信息");
        // Explicit counter: older POI versions report getLastRowNum() == 0 for
        // both an empty sheet and a sheet with one row, which would leave the
        // first row blank.
        int nextRow = 0;
        try {
            // Company index pages; per the original author's note, ids 36..61
            // correspond to the letters a-z.
            for (int index = 36; index <= 61; index++) {
                // Each index is paged by record offset: 0, 100, ..., 600.
                for (int offset = 0; offset < 700; offset += 100) {
                    String pageUrl = BASE_URL + index + "_" + offset + ".htm";
                    System.out.println(pageUrl);

                    String html;
                    try {
                        html = fetchPage(pageUrl);
                    } catch (IOException e) {
                        System.err.println("Skipping " + pageUrl + ": " + e);
                        continue;
                    }
                    if (html == null) {
                        continue; // non-200 response, nothing to parse
                    }

                    for (String[] company : parseChineseCompanies(html)) {
                        HSSFRow row = sheet.createRow(nextRow++);
                        for (int col = 0; col < company.length; col++) {
                            row.createCell((short) col).setCellValue(company[col]);
                        }
                    }
                }
            }
        } catch (RuntimeException e) {
            // Boundary handler: report the problem but still save what was collected.
            e.printStackTrace();
        } finally {
            try {
                saveWorkbook(workbook, new File(OUTPUT_PATH));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Downloads one page with an HTTP GET request. Redirects are not followed.
     *
     * @param pageUrl absolute URL of the page
     * @return the decoded response body, or {@code null} when the server did
     *         not answer with status 200
     * @throws IOException if the connection or the transfer fails
     */
    private static String fetchPage(String pageUrl) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(pageUrl).openConnection();
        try {
            connection.setInstanceFollowRedirects(false);
            connection.setRequestMethod("GET");
            if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
                return null;
            }
            InputStream in = connection.getInputStream();
            try {
                // Collect the raw bytes first and decode once, so a multi-byte
                // character split across two reads is not corrupted.
                ByteArrayOutputStream body = new ByteArrayOutputStream();
                byte[] chunk = new byte[4096];
                int read;
                while ((read = in.read(chunk)) != -1) {
                    body.write(chunk, 0, read);
                }
                return new String(body.toByteArray(), resolveCharset(connection.getContentType()));
            } finally {
                in.close();
            }
        } finally {
            connection.disconnect();
        }
    }

    /**
     * Picks the charset used to decode a response body.
     *
     * @param contentType value of the Content-Type header, may be {@code null}
     * @return the charset named in the header, or the platform default charset
     *         when the header names none or names one this JVM does not support
     */
    private static Charset resolveCharset(String contentType) {
        if (contentType != null) {
            Matcher m = CHARSET_PARAM.matcher(contentType);
            if (m.find()) {
                try {
                    return Charset.forName(m.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Illegal or unsupported charset name: use the default below.
                }
            }
        }
        return Charset.defaultCharset();
    }

    /**
     * Extracts the supplier rows whose country column is exactly "中国".
     *
     * @param html raw HTML of one supplier list page
     * @return one array per company, in spreadsheet column order:
     *         company name, e-mail, telephone, website, country
     */
    private static List<String[]> parseChineseCompanies(String html) {
        List<String[]> companies = new ArrayList<String[]>();
        // ROW_PATTERN expects markup without any whitespace.
        String compact = html.replaceAll("\\s", "");
        Matcher row = ROW_PATTERN.matcher(compact);
        while (row.find()) {
            String country = SPAN_TAGS.matcher(row.group(5)).replaceAll("");
            if (!"中国".equals(country)) {
                continue;
            }
            String companyName = ANCHOR_TAGS.matcher(row.group(1)).replaceAll("");
            String tel = SPAN_TAGS.matcher(row.group(2)).replaceAll("");
            String email = SPAN_TAGS.matcher(row.group(3)).replaceAll("");
            String website = ANCHOR_TAGS.matcher(row.group(4)).replaceAll("");
            companies.add(new String[] {companyName, email, tel, website, country});
        }
        return companies;
    }

    /**
     * Writes the workbook to {@code target}, creating missing parent
     * directories first.
     *
     * @param workbook workbook to persist
     * @param target   destination file
     * @throws IOException if the parent directory cannot be created or the
     *                     file cannot be written
     */
    private static void saveWorkbook(HSSFWorkbook workbook, File target) throws IOException {
        File parent = target.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            throw new IOException("Cannot create directory " + parent);
        }
        FileOutputStream output = new FileOutputStream(target);
        try {
            workbook.write(output);
        } finally {
            output.close();
        }
    }
}