百度搜索-爬虫保存结果

百度搜索-爬虫保存结果
BaiduResult.java

package com.reptileBaidu.domain;

public class BaiduResult {
    /** Title of the search hit. */
    private String title;
    /** Address (URL) of the search hit. */
    private String url;
    /** Overview / abstract text of the search hit. */
    private String assumably;
    /** Keyword the search was run with. */
    private String searchContent;

    /**
     * Creates a result with every field, including the keyword, left as {@code null}.
     */
    public BaiduResult() {
        this(null);
    }

    /**
     * Creates a result that only carries the search keyword.
     *
     * @param searchContent keyword the search was run with
     */
    public BaiduResult(String searchContent) {
        this.searchContent = searchContent;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title to store */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the address */
    public String getUrl() {
        return this.url;
    }

    /** @param url the address to store */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the overview text */
    public String getAssumably() {
        return this.assumably;
    }

    /** @param assumably the overview text to store */
    public void setAssumably(String assumably) {
        this.assumably = assumably;
    }

    /** @return the search keyword */
    public String getSearchContent() {
        return this.searchContent;
    }

    /** @param searchContent the search keyword to store */
    public void setSearchContent(String searchContent) {
        this.searchContent = searchContent;
    }

    /**
     * Renders title, url and overview; the search keyword is not part of the output.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("[title=");
        text.append(title);
        text.append(", url=").append(url);
        text.append(", assumably=").append(assumably);
        text.append("]");
        return text.toString();
    }
}

ConnnectionManager.java

package com.reptileBaidu.sql.util;

import java.sql.DriverManager;

import com.mysql.jdbc.Connection;

public class ConnnectionManager {

    /** Holds one connection per calling thread. */
    private static final ThreadLocal<Connection> connectionHolder = new ThreadLocal<Connection>();

    // NOTE(review): the JDBC URL embeds the host, user name and password in source;
    // consider moving these to external configuration.
    private static final String BETADBURL = "jdbc:mysql://192.168.1.10:3306/reptilebaidu?useUnicode=true&characterEncoding=utf8&autoReconnect=true&user=root&password=pass4you";

    /**
     * Returns the connection cached for the current thread, opening and caching a
     * new one when none is cached or the cached one reports itself closed.
     *
     * @return the thread's connection; {@code null} when the closed-check fails
     *         or when a new connection could not be opened
     */
    public static Connection getConnectionFromThreadLocal() {
        Connection cached = connectionHolder.get();
        try {
            boolean reusable = cached != null && !cached.isClosed();
            if (reusable) {
                return cached;
            }
            Connection fresh = getConnection();
            // getConnection() may have returned null; that value is cached as-is
            // and replaced on the next call.
            connectionHolder.set(fresh);
            System.out.println("[Thread]" + Thread.currentThread().getName());
            return fresh;
        } catch (Exception e) {
            System.out.println("[ThreadLocal Get Connection Error]" + e.getMessage());
            return null;
        }
    }

    /**
     * Loads the MySQL driver class and opens a new connection to {@code BETADBURL}.
     *
     * @return the new connection, or {@code null} when loading the driver or
     *         connecting fails (the failure message is printed to standard output)
     */
    public static Connection getConnection() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            return (Connection) DriverManager.getConnection(BETADBURL);
        } catch (Exception e) {
            System.out.println("[Get Connection Error]" + e.getMessage());
            return null;
        }
    }
}

DataUpdater.java

package com.reptileBaidu.sql.util;

import java.sql.SQLException;
import java.util.List;

import com.mysql.jdbc.PreparedStatement;
import com.reptileBaidu.domain.BaiduResult;

public class DataUpdater implements Runnable {

    /** Parameterized insert used for every row of the batch. */
    private static final String SQL = "insert into reptilebaidu (`title` ,`url` , `assumably` , searchContent) VALUES (?, ? ,?,?)";

    /** Rows to insert when this task runs. */
    private final List<BaiduResult> baiduResults;

    /**
     * Creates a task that inserts the given results.
     *
     * @param baiduResults rows to insert; {@code null} or empty makes {@link #run()} a no-op
     */
    public DataUpdater(List<BaiduResult> baiduResults) {
        this.baiduResults = baiduResults;
    }

    /**
     * Inserts all results as one JDBC batch using the current thread's connection.
     * Failures are reported on standard error and never propagate to the caller.
     */
    public void run() {
        if (baiduResults == null || baiduResults.isEmpty()) {
            // Nothing to insert: skip acquiring a connection and preparing a statement.
            return;
        }
        // The statement is local so that one task instance never shares it between runs.
        PreparedStatement pst = null;
        try {
            java.sql.Connection connection = ConnnectionManager.getConnectionFromThreadLocal();
            if (connection == null) {
                // The connection manager signals failure with null; report it
                // explicitly instead of surfacing a NullPointerException as "null".
                System.err.println("[SQL ERROR MESSAGE]no database connection available");
                return;
            }
            pst = (PreparedStatement) connection.prepareStatement(SQL);
            for (BaiduResult baiduResult : baiduResults) {
                pst.setString(1, baiduResult.getTitle());
                pst.setString(2, baiduResult.getUrl());
                pst.setString(3, baiduResult.getAssumably());
                pst.setString(4, baiduResult.getSearchContent());
                pst.addBatch();
            }
            pst.executeBatch();
        } catch (Exception e) {
            System.err.println("[SQL ERROR MESSAGE]" + e.getMessage());
        } finally {
            close(pst);
        }
    }

    /**
     * Closes the statement, reporting (not throwing) any failure.
     *
     * @param pst statement to close; {@code null} is ignored
     */
    public void close(PreparedStatement pst) {
        if (pst != null) {
            try {
                pst.close();
            } catch (SQLException e) {
                System.err.println("[Close Statement Error]" + e.getMessage());
            }
        }
    }
}

QunarThreadPoolExecutor.java

package com.reptileBaidu.sql.util;

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
/**
 * A {@link ThreadPoolExecutor} that records how many tasks it ran and how long they
 * took, and prints the count and the average duration when the pool terminates.
 * @author 玮
 *
 */
public class QunarThreadPoolExecutor  extends ThreadPoolExecutor {

    // Start time (ms) of the task currently running on each worker thread.
    private final ThreadLocal<Long> start = new ThreadLocal<Long>();

    // Sum of the durations (ms) of all finished tasks.
    private final AtomicLong totals = new AtomicLong();

    // Number of tasks the pool has finished.
    private final AtomicInteger tasks = new AtomicInteger();

    public QunarThreadPoolExecutor (int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory, RejectedExecutionHandler handler) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler);
    }

    public QunarThreadPoolExecutor (int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, RejectedExecutionHandler handler) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler);
    }

    public QunarThreadPoolExecutor (int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory);
    }

    /**
     * Creates a pool with the default thread factory and rejection handler.
     * @param corePoolSize core pool size
     * @param maximumPoolSize maximum pool size
     * @param keepAliveTime maximum idle time for threads beyond corePoolSize;
     *        allowCoreThreadTimeOut(true) applies it to core threads as well
     * @param unit time unit of keepAliveTime
     * @param workQueue blocking queue holding tasks waiting to run
     */
    public QunarThreadPoolExecutor (int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
    }

    /**
     * Runs on the worker thread before each task: records the task's start time.
     */
    @Override
    protected void beforeExecute(Thread t, Runnable r) {
        super.beforeExecute(t, r);
        start.set(System.currentTimeMillis());
    }

    /**
     * Runs on the worker thread after each task: counts the task and adds its
     * elapsed time to the running total.
     */
    @Override
    protected void afterExecute(Runnable r, Throwable t) {
        super.afterExecute(r, t);
        tasks.incrementAndGet();
        totals.addAndGet(System.currentTimeMillis() - start.get());
    }

    /**
     * Runs once when the pool has terminated: prints the number of finished tasks
     * and their average duration.
     */
    @Override
    protected void terminated() {
        super.terminated();
        int finished = tasks.get();
        // A pool shut down before running any task would otherwise divide by zero,
        // and the ArithmeticException would escape from shutdown().
        long average = finished == 0 ? 0L : totals.get() / finished;
        System.out.println("完成"+ finished +"个任务,平均耗时: [" + average + "] ms");
    }

}


DataUpdaterMain.java

package com.reptileBaidu.sql.util;

import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

import com.reptileBaidu.domain.BaiduResult;
/**
 * Saves search results through a thread pool.
 * @author 玮
 *
 */
public class DataUpdaterMain {

    /** Unbounded queue of save tasks waiting for a worker. */
    private final LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();

    /** Pool that runs the save tasks. */
    private final QunarThreadPoolExecutor qunarThreadPoolExecutor = new QunarThreadPoolExecutor(5, 8, 5, TimeUnit.MINUTES, queue);

    /**
     * Shuts the thread pool down: stops accepting tasks, waits up to 20 minutes for
     * submitted tasks to finish, and cancels whatever is still pending after that.
     * If the waiting thread is interrupted, the pool is cancelled immediately and
     * the thread's interrupt status is restored.
     */
    public void shutThreadPool() {
        if (qunarThreadPoolExecutor == null) {
            return;
        }
        qunarThreadPoolExecutor.shutdown();
        try {
            if (!qunarThreadPoolExecutor.awaitTermination(20 , TimeUnit.MINUTES)) {
                qunarThreadPoolExecutor.shutdownNow();
            }
        } catch (InterruptedException e) {
            // Stop waiting, cancel outstanding work and keep the interrupt visible
            // to the caller instead of swallowing it.
            qunarThreadPoolExecutor.shutdownNow();
            Thread.currentThread().interrupt();
            System.err.println("[ThreadPool Close Error]" + e.getMessage());
        } catch (Exception e) {
            System.err.println("[ThreadPool Close Error]" + e.getMessage());
        }
    }

    /**
     * Queues the given results to be saved on the pool.
     *
     * @param baiduResults results to save
     * @return always {@code true}; the save itself has not necessarily run yet
     */
    public boolean update(List<BaiduResult> baiduResults) {
        qunarThreadPoolExecutor.execute(new DataUpdater(baiduResults));
        return true;
    }

}

ReptileBaidu.java

package com.reptileBaidu.sql.util;

import java.util.ArrayList;
import java.util.List;

import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.json.JSONException;
import org.json.JSONObject;

import com.reptileBaidu.domain.BaiduResult;

public class ReptileBaidu {

    /**
     * Runs a Baidu search and parses the result page into {@link BaiduResult}s.
     * <p>
     * The page is scanned for {@code div.c-abstract} and {@code div.c-tools} nodes.
     * A div without an {@code id} attribute supplies the overview text; a div with
     * an {@code id} supplies title and url from its {@code data-tools} JSON and
     * completes the current result.
     *
     * @param searchContent search keyword; it is URL-encoded as UTF-8 before use
     * @param startNum index of the first result, default 0
     * @param onePageNum maximum results per page, at most 50
     * @return the parsed results; empty when the page could not be fetched or parsed
     * @throws JSONException if a {@code data-tools} attribute is not valid JSON or
     *         lacks {@code title} / {@code url}
     */
    public static List<BaiduResult> baiduSearch(String searchContent,int startNum,int onePageNum) throws JSONException{
        // Encode the keyword so non-ASCII text, spaces, '&' or '#' cannot corrupt the query string.
        String url = "http://www.baidu.com/s?word="+encodeQueryValue(searchContent)+"&cl=3&pn="+startNum+"&rn="+onePageNum;
        List<BaiduResult> baiduList = new ArrayList<BaiduResult>();
        try {
            Parser parser = new Parser(url);
            // Collect every div whose class is "c-abstract" or "c-tools".
            AndFilter abstractDivs = new AndFilter(new HasAttributeFilter("class","c-abstract"),new NodeClassFilter(Div.class));
            AndFilter toolsDivs = new AndFilter(new HasAttributeFilter("class","c-tools"),new NodeClassFilter(Div.class));
            NodeList list = parser.extractAllNodesThatMatch(new OrFilter(abstractDivs, toolsDivs));
            // Result under construction, pre-filled with the keyword.
            BaiduResult baidu = new BaiduResult(searchContent);
            for (int i = 0; i < list.size(); i++) {
                Div div = (Div)list.elementAt(i);
                if(div.getAttribute("id") == null){
                    // Overview text.
                    baidu.setAssumably(div.getStringText());
                }else if(!"".equals(baidu.getAssumably())){
                    // Compare by content: "!=" on strings only compares references,
                    // so the empty-overview check never took effect.
                    String data = div.getAttribute("data-tools");
                    if(data != null){
                        JSONObject  json1 = new JSONObject(data);
                        // Title.
                        baidu.setTitle(json1.getString("title"));
                        // Address.
                        baidu.setUrl(json1.getString("url"));
                        System.out.println(baidu.toString());
                        baiduList.add(baidu);
                        baidu = new BaiduResult(searchContent);
                    }
                }
            }
        } catch (ParserException e) {
            System.out.println(url + "-->不存在");
            System.out.println(e.getMessage());
        }
        return baiduList;
    }

    /** URL-encodes a query-string value as UTF-8; {@code null} is encoded as the text "null". */
    private static String encodeQueryValue(String value) {
        try {
            return java.net.URLEncoder.encode(String.valueOf(value), "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            // Every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 encoding is not available", e);
        }
    }

    /**
     * Starts ten crawler threads for one fixed keyword; thread {@code i} covers
     * result offsets {@code 1000*(i-1)} up to (excluding) {@code 1000*i}.
     */
    public static void main(String[] args) {
        for(int i=1;i<11;i++){
            SearchRunnable r0 = new SearchRunnable("阿拉善", 1000*(i-1), 1000*i);
            Thread t0 = new Thread(r0);// create the thread
            t0.start(); // start it
        }
    }
}
class SearchRunnable implements Runnable {

    private static Logger logger = Logger.getLogger(SearchRunnable.class);

    /** Search keyword. */
    private String searchContent;
    /** Offset of the first result to fetch. */
    private int startNum = 0;
    /** Offset at which fetching stops (exclusive). */
    private int endNum = 10000;

    /**
     * @param searchContent search keyword
     * @param startNum offset of the first result to fetch
     * @param endNum offset at which fetching stops (exclusive)
     */
    public SearchRunnable(String searchContent, int startNum, int endNum) {
        this.searchContent = searchContent;
        this.startNum = startNum;
        this.endNum = endNum;
    }

    /**
     * Crawls the offset range in pages of 50, hands the collected results to a
     * {@link DataUpdaterMain} whenever at least 100 have accumulated (plus once more
     * at the end), then shuts the updater's pool down and logs the elapsed time and
     * the number of results handed over.
     */
    public void run() {
        System.out.println("开启线程");
        final long startedAt = System.currentTimeMillis();
        DataUpdaterMain updater = new DataUpdaterMain();
        List<BaiduResult> pending = new ArrayList<BaiduResult>();
        int handedOver = 0;
        try {
            int offset = startNum;
            while (offset < endNum) {
                // Crawl one page.
                List<BaiduResult> page = ReptileBaidu.baiduSearch(searchContent, offset, 50);
                pending.addAll(page);
                if (pending.size() >= 100) {
                    // Hand the batch over and continue with a fresh list, because
                    // the submitted list now belongs to the save task.
                    updater.update(pending);
                    handedOver += pending.size();
                    pending = new ArrayList<BaiduResult>();
                }
                offset += 50;
            }
            // Hand over whatever is left (possibly nothing).
            updater.update(pending);
            handedOver += pending.size();
        } catch (Exception e) {
            System.out.println(e.getMessage());
        } finally {
            updater.shutThreadPool();
            long elapsed = System.currentTimeMillis() - startedAt;
            logger.info("耗时[" + elapsed + "]ms,保存"+handedOver+"条数据");
        }
    }
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值