最近无聊,写了个小程序抓取阿里巴巴企业的详细信息,用 HTMLParser 解析。
不多说了,直接上代码:
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.List;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.TagNameFilter;
public class TestYehoo {
/**
 * Program entry point: crawls a fixed range of search-result pages
 * (488 through 500) and prints a separator line when the crawl returns.
 *
 * The search URL carries percent-encoded province/city parameters and ends
 * with {@code begin_page=}, to which the page number is appended by
 * {@link #getAlibabaCompanyInfo(String, int, int)}.
 *
 * @param args command-line arguments (not used)
 * @throws Exception whatever the crawl propagates
 */
public static void main(String[] args) throws Exception
{
    final String searchUrlPrefix = "http://search.china.alibaba.com/search/company_search.htm?province=%D5%E3%BD%AD&city=%C4%FE%B2%A8&filt=y&begin_page=";
    final int firstPage = 488;
    final int lastPage = 500;

    getAlibabaCompanyInfo(searchUrlPrefix, firstPage, lastPage);
    System.out.println("====================================");
}
/**
 * Crawls Alibaba company search-result pages and stores the companies found.
 *
 * For each page number from {@code from} to {@code to} (inclusive) the page at
 * {@code content + pageNumber} is parsed, every {@code <a class="l">} anchor is
 * treated as one company, its profile and contact pages are fetched, and the
 * collected records of that page are handed to {@code doInsertDB}.
 *
 * A company whose anchor cannot be parsed, or whose profile/contact page fails,
 * is reported on stdout and skipped; the rest of the page is still processed.
 *
 * @param content search URL prefix; the page number is appended to it
 * @param from    first page number to fetch (1-based, inclusive)
 * @param to      last page number to fetch (inclusive)
 * @throws Exception if a result page cannot be fetched or parsed, or the
 *                   database insert for a page fails as a whole
 */
public static void getAlibabaCompanyInfo(String content,int from,int to) throws Exception
{
    for (int j = from - 1; j < to; j++)
    {
        int page = j + 1;
        System.out.println("=========开始取第"+page+"页数据=============");
        List<QyxxDomain> list = new ArrayList<QyxxDomain>();
        Parser myParser = new Parser(content + page);

        // Company name and shop URL live in anchors of the form <a class="l" href="...">.
        NodeFilter aFilter = new TagNameFilter ("A");
        NodeFilter classfilter_l = new HasAttributeFilter("class","l");
        AndFilter companyNameFilter = new AndFilter();
        companyNameFilter.setPredicates(new NodeFilter[]{aFilter,classfilter_l});

        Node[] nodes = myParser.parse(companyNameFilter).toNodeArray();
        for (int i = 0; i < nodes.length; i++) {
            Node anode = nodes[i];
            // Record id derived from page and position; assumes 30 results per page — TODO confirm.
            int num = (j*30+i+1);
            // NOTE: a duplicate check via countRecord(num) used to wrap this body and is currently disabled.
            String name = anode.toPlainTextString();
            System.out.println(num);
            System.out.println("公司名:"+name);

            // Previously these substring operations were unguarded and a single
            // unexpected anchor aborted the whole crawl; now the entry is skipped.
            String website = extractWebsite(anode.getText());
            if (website == null) {
                System.out.println("解析"+name.trim()+"网址时发生错误,已跳过!");
                continue;
            }
            System.out.println("网址:"+website);
            String shopId = extractShopId(website);
            if (shopId == null) {
                System.out.println("解析"+name.trim()+"网址时发生错误,已跳过!");
                continue;
            }
            String url = website+"athena/companyprofile/"+shopId+".html";
            String url1 = website+"athena/contact/"+shopId+".html";

            QyxxDomain domain = new QyxxDomain();
            domain.setQymc(name.trim());
            domain.setId(String.valueOf(num));
            try {
                getCompanyDetail(url,domain);
            } catch (Exception e) {
                System.out.println("取"+domain.getQymc()+"详细信息时发生错误!");
                e.printStackTrace();
                continue;
            }
            try {
                getCompanyContact(url1,domain);
            } catch (Exception e) {
                System.out.println("取"+domain.getQymc()+"联系信息时发生错误!");
                e.printStackTrace();
                continue;
            }
            list.add(domain);
        }
        System.out.println("=========结束取第"+page+"页数据=============");
        System.out.println("=========开始插入第"+page+"页数据=============");
        doInsertDB(list);
        System.out.println("=========结束插入第"+page+"页数据=============");
    }
}

/**
 * Extracts the href value (up to and including its trailing slash) from the
 * raw text of an anchor tag, or returns null when the expected
 * {@code href="} ... {@code /"} shape is not present.
 */
private static String extractWebsite(String tagText)
{
    if (tagText == null) {
        return null;
    }
    int marker = tagText.indexOf("href=\"");
    if (marker < 0) {
        return null;
    }
    int start = marker + 6;
    int end = tagText.indexOf("/\"", start);
    if (end < 0) {
        return null;
    }
    return tagText.substring(start, end + 1);
}

/**
 * Returns the part of the shop URL between {@code http://} and
 * {@code .cn.alibaba.com/}, or null when the URL does not have that shape.
 */
private static String extractShopId(String website)
{
    int scheme = website.indexOf("http://");
    int suffix = website.indexOf(".cn.alibaba.com/");
    if (scheme < 0 || suffix < scheme + 7) {
        return null;
    }
    return website.substring(scheme + 7, suffix);
}
/**
 * Counts the rows in table {@code qyxx} whose {@code id} equals the given value.
 *
 * Opens a new MySQL connection for the query and closes the result set,
 * statement and connection before returning, also when the query fails.
 *
 * @param id record id to look up
 * @return the count as returned by the database, as a string; an empty
 *         string if the query produced no row
 * @throws Exception if the driver cannot be loaded or the query fails
 */
private static String countRecord(int id)throws Exception{
    // JDBC driver class name
    String driverName="com.mysql.jdbc.Driver";
    // database user
    String userName="root";
    // database password
    String userPasswd="root";
    // database name
    String dbName="qiyetong";
    // connection string
    String url="jdbc:mysql://localhost/"+dbName+"?user="+userName+"&password="+userPasswd;
    Class.forName(driverName).newInstance();

    Connection connection=DriverManager.getConnection(url);
    try {
        Statement statement = connection.createStatement();
        try {
            // id is an int, so concatenating it cannot inject SQL.
            String sql = "select count(*) as count from qyxx where id="+id;
            ResultSet rs = statement.executeQuery(sql);
            try {
                String count="";
                while (rs.next()) {
                    count=rs.getString("count");
                }
                return count;
            } finally {
                rs.close();
            }
        } finally {
            statement.close();
        }
    } finally {
        connection.close();
    }
}
/**
 * Inserts every record of the list into table {@code qyxx}, one INSERT per record.
 *
 * A record whose INSERT fails is reported (with stack trace) and skipped; the
 * remaining records are still attempted. The number of successful inserts is
 * printed at the end. The statement and connection are closed before returning.
 *
 * @param list records to insert; each element is cast to {@code QyxxDomain}
 * @throws InstantiationException if the JDBC driver cannot be instantiated
 * @throws IllegalAccessException if the JDBC driver cannot be accessed
 * @throws ClassNotFoundException if the JDBC driver class is not on the classpath
 * @throws SQLException if the connection cannot be opened or closed
 */
@SuppressWarnings("unchecked")
private static void doInsertDB(List list) throws InstantiationException,
IllegalAccessException, ClassNotFoundException, SQLException {
    // JDBC driver class name
    String driverName="com.mysql.jdbc.Driver";
    // database user
    String userName="root";
    // database password
    String userPasswd="root";
    // database name
    String dbName="qiyetong";
    // connection string
    String url="jdbc:mysql://localhost/"+dbName+"?user="+userName+"&password="+userPasswd;
    Class.forName(driverName).newInstance();

    int succNum = 0;
    Connection connection=DriverManager.getConnection(url);
    try {
        Statement statement = connection.createStatement();
        try {
            for (int k=0;k<list.size();k++)
            {
                try {
                    QyxxDomain domain = (QyxxDomain)list.get(k);
                    // NOTE(review): values come from scraped pages and are escaped only by
                    // SQLFilter.filtrateSQL; a PreparedStatement would be the safer choice.
                    String sql = "insert into qyxx(id,qymc,gsgk,zycphfw,zyhy,qylx,jyms,zczb,gszcd,ygrs,gsclsj,fddbr,zykh,nyye,zyyydd,zysc,khyh,yhzh,oem,zlkz,ncke,yfbmrs,cfmj,ycl,gszy,lxr,dh,yddh,cz,dz,yb)" +
                    "VALUES(" +domain.getId()+",'"+
                    SQLFilter.filtrateSQL(domain.getQymc(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getGsgk(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getZycphfw(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getZyhy(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getQylx(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getJyms(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getZczb(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getGszcd(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getYgrs(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getGsclsj(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getFddbr(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getZykh(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getNyye(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getZyyydd(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getZysc(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getKhyh(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getYhzh(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getOem(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getZlkz(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getNcke(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getYfbmrs(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getCfmj(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getYcl(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getGszy(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getLxr(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getDh(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getYddh(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getCz(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getDz(), 1)+"','"+
                    SQLFilter.filtrateSQL(domain.getYb(), 1)+"');";
                    System.out.println(sql);
                    statement.execute(sql);
                    succNum++;
                } catch (Exception e) {
                    // Best effort per record: report the cause and move on to the next one.
                    System.out.println("插入第"+k+"条信息时发生错误!");
                    e.printStackTrace();
                }
            }
        } finally {
            statement.close();
        }
    } finally {
        connection.close();
    }
    System.out.println("=========成功插入"+succNum+"条数据=============");
}
/**
 * Fetches a company profile page and copies its fields into {@code domain}.
 *
 * Two things are extracted:
 * the company overview, taken from child index 6 of each
 * {@code <div class="companyinfo mainTextColor">} (set to "" when that child
 * cannot be read), and the detail table, where each
 * {@code <td class="S lh15">} whose text contains a known label is followed by
 * the cell holding the value. A label that is the last matching cell has no
 * value cell and is ignored instead of failing the whole company.
 *
 * @param content location of the profile page, passed to the HTML parser
 * @param domain  record that receives the extracted values
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void getCompanyDetail(String content,QyxxDomain domain) throws Exception
{
    // One parser per extraction pass, as each parse consumes the page.
    Parser myParser = new Parser(content);
    Parser myParser1 = new Parser(content);
    // filter building blocks
    NodeFilter divFilter = new TagNameFilter ("DIV");
    NodeFilter tdFilter = new TagNameFilter ("TD");
    NodeFilter classfilter_companyinfo = new HasAttributeFilter("class","companyinfo mainTextColor");
    NodeFilter classfilter_Slh15 = new HasAttributeFilter("class","S lh15");

    // company overview
    AndFilter detailFilter1 = new AndFilter();
    detailFilter1.setPredicates(new NodeFilter[]{divFilter,classfilter_companyinfo});
    Node[] nodes1 = myParser1.parse(detailFilter1).toNodeArray();
    for (int i = 0; i < nodes1.length; i++) {
        Node anode = nodes1[i];
        String gsgk;
        try {
            gsgk = splitAndFilterString(anode.getChildren().toNodeArray()[6].toPlainTextString());
        } catch (Exception e) {
            // Deliberate best effort: a page without the expected child just has no overview.
            gsgk = "";
        }
        domain.setGsgk(gsgk);
        System.out.println(domain.getGsgk());
    }

    // company detail table: label cell followed by value cell
    AndFilter detailFilter = new AndFilter();
    detailFilter.setPredicates(new NodeFilter[]{tdFilter,classfilter_Slh15});
    Node[] nodes = myParser.parse(detailFilter).toNodeArray();
    // Stop one short of the end so nodes[i + 1] always exists.
    for (int i = 0; i + 1 < nodes.length; i++) {
        String label = nodes[i].toPlainTextString();
        String value = splitAndFilterString(nodes[i + 1].toPlainTextString());

        // Independent checks (not else-if): a cell matching several labels sets several fields.
        if (label.contains("主营产品或服务")) {
            domain.setZycphfw(value);
        }
        if (label.contains("主营行业")) {
            domain.setZyhy(value);
        }
        if (label.contains("企业类型")) {
            domain.setQylx(value);
        }
        if (label.contains("经营模式")) {
            domain.setJyms(value);
        }
        if (label.contains("注册资本")) {
            domain.setZczb(value);
        }
        if (label.contains("公司注册地")) {
            domain.setGszcd(value);
        }
        if (label.contains("员工人数")) {
            domain.setYgrs(value);
        }
        if (label.contains("公司成立时间")) {
            domain.setGsclsj(value);
        }
        if (label.contains("法定代表人/负责人")) {
            domain.setFddbr(value);
        }
        if (label.contains("主要客户")) {
            domain.setZykh(value);
        }
        if (label.contains("年营业额")) {
            domain.setNyye(value);
        }
        if (label.contains("主要经营地点")) {
            domain.setZyyydd(value);
        }
        if (label.contains("主要市场")) {
            domain.setZysc(value);
        }
        if (label.contains("开户银行")) {
            domain.setKhyh(value);
        }
        if (label.contains("银行帐号")) {
            domain.setYhzh(value);
        }
        if (label.contains("是否提供OEM代加工")) {
            domain.setOem(value);
        }
        if (label.contains("质量控制")) {
            domain.setZlkz(value);
        }
        if (label.contains("年出口额")) {
            domain.setNcke(value);
        }
        if (label.contains("研发部门人数")) {
            domain.setYfbmrs(value);
        }
        if (label.contains("厂房面积")) {
            domain.setCfmj(value);
        }
        if (label.contains("月产量")) {
            domain.setYcl(value);
        }
        if (label.contains("公司主页")) {
            domain.setGszy(value);
        }
    }
}
/**
 * Fetches a company contact page and copies the contact fields into {@code domain}.
 *
 * The contact person is built from children 1 and 2 of each
 * {@code <div class="title ml15 b mb20 mt20 mainTextColor">}. Phone, mobile,
 * fax, address and postcode are taken from {@code <li>} elements whose text
 * contains the corresponding label; the label prefix is cut off the cleaned
 * text. Entries that are shorter than their label, or a contact div with fewer
 * than three children, are tolerated instead of failing the whole company.
 *
 * @param content location of the contact page, passed to the HTML parser
 * @param domain  record that receives the extracted values
 * @throws Exception if the page cannot be fetched or parsed
 */
public static void getCompanyContact(String content,QyxxDomain domain) throws Exception
{
    // One parser per extraction pass, as each parse consumes the page.
    Parser myParser = new Parser(content);
    Parser myParser1 = new Parser(content);
    // filter building blocks
    NodeFilter divFilter = new TagNameFilter ("DIV");
    NodeFilter liFilter = new TagNameFilter ("LI");
    NodeFilter classfilter_companyinfo = new HasAttributeFilter("class","title ml15 b mb20 mt20 mainTextColor");

    // contact person
    AndFilter detailFilter1 = new AndFilter();
    detailFilter1.setPredicates(new NodeFilter[]{divFilter,classfilter_companyinfo});
    Node[] nodes1 = myParser1.parse(detailFilter1).toNodeArray();
    for (int i = 0; i < nodes1.length; i++) {
        Node anode = nodes1[i];
        if (anode.getChildren() == null) {
            continue;
        }
        Node[] children = anode.getChildren().toNodeArray();
        if (children.length < 3) {
            continue;
        }
        domain.setLxr(splitAndFilterString(children[1].toPlainTextString())
                + splitAndFilterString(children[2].toPlainTextString()));
    }

    // contact details
    Node[] nodes = myParser.parse(liFilter).toNodeArray();
    for (int i = 0; i < nodes.length; i++) {
        String raw = nodes[i].toPlainTextString();
        // Cleaning removes the space inside the label, so the prefix to drop
        // is the two label characters plus one separator (3), or 5 for the mobile label.
        String cleaned = splitAndFilterString(raw);
        if (raw.contains("电 话")) {
            domain.setDh(dropPrefix(cleaned, 3));
        }
        if (raw.contains("移动电话")) {
            domain.setYddh(dropPrefix(cleaned, 5));
        }
        if (raw.contains("传 真")) {
            domain.setCz(dropPrefix(cleaned, 3));
        }
        if (raw.contains("地 址")) {
            domain.setDz(dropPrefix(cleaned, 3));
        }
        if (raw.contains("邮 编")) {
            domain.setYb(dropPrefix(cleaned, 3));
        }
    }
}

/**
 * Returns {@code text} without its first {@code prefixLength} characters,
 * or an empty string when the text is not longer than the prefix.
 */
private static String dropPrefix(String text, int prefixLength)
{
    if (text.length() <= prefixLength) {
        return "";
    }
    return text.substring(prefixLength);
}
/**
 * Reads a text file into a string.
 *
 * Every line read is followed by {@code "\r\n"} in the result, regardless of
 * the line terminator used in the file. Failures are not propagated: the stack
 * trace is printed and whatever was read up to that point (possibly the empty
 * string) is returned. The file is closed in all cases.
 *
 * @param sFileName path of the file to read
 * @param sEncode   name of the charset used to decode the file
 * @return the file content, or the part read before an error occurred
 */
public static String readTextFile(String sFileName, String sEncode)
{
    StringBuilder sbStr = new StringBuilder();
    FileInputStream in = null;
    try
    {
        in = new FileInputStream(new File(sFileName));
        BufferedReader ins = new BufferedReader(new InputStreamReader(in, sEncode));
        String dataLine;
        while ((dataLine = ins.readLine()) != null)
        {
            sbStr.append(dataLine);
            sbStr.append("\r\n");
        }
    }
    catch (Exception e)
    {
        e.printStackTrace();
    }
    finally
    {
        // Close the underlying stream even when decoding or reading failed.
        if (in != null)
        {
            try
            {
                in.close();
            }
            catch (Exception ignored)
            {
                // Nothing useful can be done if close fails; the content is already read.
            }
        }
    }
    return sbStr.toString();
}
/**
 * Tells whether a string is empty once leading and trailing whitespace is removed.
 *
 * @param astr the string to test; may be null
 * @return true if {@code astr} is null, has length 0, or trims to an empty
 *         string; false otherwise
 */
public static boolean isTrimEmpty(String astr)
{
    if (astr == null)
    {
        return true;
    }
    return astr.length() == 0 || isBlank(astr.trim());
}
/**
 * Tells whether a string is null or has length 0.
 * Whitespace-only strings are NOT considered blank by this method.
 *
 * @param astr the string to test; may be null
 * @return true if {@code astr} is null or empty, false otherwise
 */
public static boolean isBlank(String astr)
{
    return astr == null || astr.length() == 0;
}
/**
 * Strips HTML remnants and whitespace from a piece of text.
 *
 * In order: named character entities such as {@code &nbsp;} are removed, then
 * complete tags, then any remaining {@code ( ) / < >} characters, and finally
 * all spaces, carriage returns, line feeds and tabs.
 *
 * @param input the text to clean; may be null
 * @return the cleaned text, or an empty string if {@code input} is null or
 *         contains nothing but whitespace
 */
public static String splitAndFilterString(String input) {
    if (input == null) {
        return "";
    }
    if (input.trim().length() == 0) {
        return "";
    }
    // Entities first, so that their '&' and ';' do not survive tag removal.
    String withoutEntities = input.replaceAll("\\&[a-zA-Z]{1,10};", "");
    String withoutTags = withoutEntities.replaceAll("<[^>]*>", "");
    // Leftover markup characters from unbalanced tags, plus parentheses and slashes.
    String withoutMarkup = withoutTags.replaceAll("[(/>)<]", "");
    // Spaces, CR, LF and TAB in a single pass.
    return withoutMarkup.replaceAll("[ \\r\\n\\t]", "");
}
最近找了个网络爬虫工具 Web-Harvest,目前只是跑了个例子,打算摸索一下用它抓取阿里巴巴的企业信息,看看效果怎么样,到时候发出来给大家分享!