百度搜索-爬虫保存结果
BaiduResult.java
package com.reptileBaidu.domain;
/**
 * Value holder for a single Baidu search hit: its title, target URL,
 * abstract text and the keyword that produced it.
 */
public class BaiduResult {
    /** Title of the hit. */
    private String title;
    /** Address the hit points to. */
    private String url;
    /** Abstract (summary) text of the hit. */
    private String assumably;
    /** Keyword that was searched for. */
    private String searchContent;

    /** Creates a result with every field unset. */
    public BaiduResult() {
    }

    /**
     * Creates a result that already knows its search keyword.
     *
     * @param searchContent the keyword that was searched for
     */
    public BaiduResult(String searchContent) {
        this.searchContent = searchContent;
    }

    /** @return the keyword that was searched for */
    public String getSearchContent() {
        return this.searchContent;
    }

    /** @param searchContent the keyword that was searched for */
    public void setSearchContent(String searchContent) {
        this.searchContent = searchContent;
    }

    /** @return the title of the hit */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title of the hit */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the address the hit points to */
    public String getUrl() {
        return this.url;
    }

    /** @param url the address the hit points to */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the abstract text of the hit */
    public String getAssumably() {
        return this.assumably;
    }

    /** @param assumably the abstract text of the hit */
    public void setAssumably(String assumably) {
        this.assumably = assumably;
    }

    /**
     * Renders title, url and abstract; the search keyword is not part of the text.
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("[title=");
        text.append(title);
        text.append(", url=").append(url);
        text.append(", assumably=").append(assumably);
        text.append("]");
        return text.toString();
    }
}
ConnnectionManager.java
package com.reptileBaidu.sql.util;
import java.sql.DriverManager;
import com.mysql.jdbc.Connection;
/**
 * Hands out MySQL connections, caching one connection per calling thread.
 */
public class ConnnectionManager {
    /** Connection cached for the current thread. */
    private static final ThreadLocal<Connection> connectionHolder = new ThreadLocal<Connection>();
    /** JDBC URL; host, schema, user and password are all embedded in it. */
    private static final String BETADBURL = "jdbc:mysql://192.168.1.10:3306/reptilebaidu?useUnicode=true&characterEncoding=utf8&autoReconnect=true&user=root&password=pass4you";

    /**
     * Returns the connection cached for the calling thread, opening and caching
     * a new one when none is cached yet or the cached one is closed.
     *
     * @return the thread's connection; {@code null} when opening a connection
     *         failed or when checking the cached connection threw
     */
    public static Connection getConnectionFromThreadLocal() {
        try {
            Connection cached = connectionHolder.get();
            boolean usable = cached != null && !cached.isClosed();
            if (usable) {
                return cached;
            }
            Connection fresh = getConnection();
            connectionHolder.set(fresh);
            System.out.println("[Thread]" + Thread.currentThread().getName());
            return fresh;
        } catch (Exception e) {
            System.out.println("[ThreadLocal Get Connection Error]" + e.getMessage());
            return null;
        }
    }

    /**
     * Loads the MySQL driver and opens a brand-new connection.
     *
     * @return the new connection, or {@code null} when the driver could not be
     *         loaded or the connection could not be opened (the error message
     *         is printed to standard output)
     */
    public static Connection getConnection() {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            return (Connection) DriverManager.getConnection(BETADBURL);
        } catch (Exception e) {
            System.out.println("[Get Connection Error]" + e.getMessage());
            return null;
        }
    }
}
DataUpdater.java
package com.reptileBaidu.sql.util;
import java.sql.SQLException;
import java.util.List;
import com.mysql.jdbc.PreparedStatement;
import com.reptileBaidu.domain.BaiduResult;
/**
 * Task that inserts one batch of search results into the {@code reptilebaidu}
 * table with a single batched prepared statement.
 *
 * <p>Failures are reported on standard error and never propagate out of
 * {@link #run()}.
 */
public class DataUpdater implements Runnable {
    /** Insert statement; parameters are title, url, abstract and search keyword. */
    private static final String SQL = "insert into reptilebaidu (`title` ,`url` , `assumably` , searchContent) VALUES (?, ? ,?,?)";

    /** Results this task writes. */
    private final List<BaiduResult> baiduResults;

    /**
     * @param baiduResults the results to insert; a {@code null} or empty list
     *                     makes {@link #run()} a no-op
     */
    public DataUpdater(List<BaiduResult> baiduResults) {
        this.baiduResults = baiduResults;
    }

    /**
     * Writes every result as one JDBC batch on the calling thread's connection.
     */
    public void run() {
        // Nothing to write: do not open a connection or prepare a statement.
        if (baiduResults == null || baiduResults.isEmpty()) {
            return;
        }
        // Kept local so the statement is not shared state of the task object.
        PreparedStatement pst = null;
        try {
            java.sql.Connection connection = ConnnectionManager.getConnectionFromThreadLocal();
            if (connection == null) {
                // The connection manager reports failure by returning null;
                // say so instead of failing with a NullPointerException.
                System.err.println("[SQL ERROR MESSAGE]" + "no database connection available");
                return;
            }
            pst = (PreparedStatement) connection.prepareStatement(SQL);
            for (BaiduResult baiduResult : baiduResults) {
                pst.setString(1, baiduResult.getTitle());
                pst.setString(2, baiduResult.getUrl());
                pst.setString(3, baiduResult.getAssumably());
                pst.setString(4, baiduResult.getSearchContent());
                pst.addBatch();
            }
            pst.executeBatch();
        } catch (Exception e) {
            // This runs on a pool thread, so report the failure rather than let it escape.
            System.err.println("[SQL ERROR MESSAGE]" + e.getMessage());
        } finally {
            close(pst);
        }
    }

    /**
     * Closes the statement, reporting (not throwing) any failure.
     *
     * @param pst the statement to close; {@code null} is ignored
     */
    public void close(PreparedStatement pst) {
        if (pst == null) {
            return;
        }
        try {
            pst.close();
        } catch (SQLException e) {
            System.err.println("[Close Statement Error]" + e.getMessage());
        }
    }
}
QunarThreadPoolExecutor.java
package com.reptileBaidu.sql.util;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.RejectedExecutionHandler;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
/**
 * A {@link ThreadPoolExecutor} that counts the tasks it has run, accumulates
 * their run time, and prints the average task duration when the pool terminates.
 *
 * @author 玮
 */
public class QunarThreadPoolExecutor extends ThreadPoolExecutor {
    /** Start time (ms) of the task currently running on each worker thread. */
    private final ThreadLocal<Long> start = new ThreadLocal<Long>();
    /** Total run time (ms) of all finished tasks. */
    private final AtomicLong totals = new AtomicLong();
    /** Number of finished tasks. */
    private final AtomicInteger tasks = new AtomicInteger();

    /** See the {@link ThreadPoolExecutor} constructor with the same parameters. */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory, RejectedExecutionHandler handler) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory, handler);
    }

    /** See the {@link ThreadPoolExecutor} constructor with the same parameters. */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, RejectedExecutionHandler handler) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, handler);
    }

    /** See the {@link ThreadPoolExecutor} constructor with the same parameters. */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue, ThreadFactory threadFactory) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue, threadFactory);
    }

    /**
     * Creates a pool with the default thread factory and rejection handler.
     *
     * @param corePoolSize    core pool size
     * @param maximumPoolSize maximum pool size
     * @param keepAliveTime   maximum idle time of threads beyond {@code corePoolSize};
     *                        {@code allowCoreThreadTimeOut(true)} applies it to core threads too
     * @param unit            time unit of {@code keepAliveTime}
     * @param workQueue       blocking queue that holds pending tasks
     */
    public QunarThreadPoolExecutor(int corePoolSize, int maximumPoolSize, long keepAliveTime, TimeUnit unit,
            BlockingQueue<Runnable> workQueue) {
        super(corePoolSize, maximumPoolSize, keepAliveTime, unit, workQueue);
    }

    /**
     * Called on the worker thread before each task runs; records the start time.
     */
    @Override
    protected void beforeExecute(Thread t, Runnable r) {
        super.beforeExecute(t, r);
        start.set(System.currentTimeMillis());
    }

    /**
     * Called on the worker thread after each task has run; updates the task
     * count and the accumulated run time.
     */
    @Override
    protected void afterExecute(Runnable r, Throwable t) {
        super.afterExecute(r, t);
        Long startedAt = start.get();
        // Clear the slot so a stale start time can never be charged to a later task.
        start.remove();
        tasks.incrementAndGet();
        if (startedAt != null) {
            totals.addAndGet(System.currentTimeMillis() - startedAt.longValue());
        }
    }

    /**
     * Called once when the pool has terminated; prints the number of finished
     * tasks and their average duration.
     */
    @Override
    protected void terminated() {
        super.terminated();
        int finished = tasks.get();
        // A pool shut down before running anything must not divide by zero:
        // the exception would escape from shutdown() into the caller.
        long average = finished == 0 ? 0L : totals.get() / finished;
        System.out.println("完成"+ finished +"个任务,平均耗时: [" + average + "] ms");
    }
}
DataUpdaterMain.java
package com.reptileBaidu.sql.util;
import java.util.List;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import com.reptileBaidu.domain.BaiduResult;
/**
 * Saves search results asynchronously by handing each batch to a thread pool.
 *
 * @author 玮
 */
public class DataUpdaterMain {
    /** Unbounded queue of pending save tasks. */
    private final LinkedBlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();
    // NOTE(review): per the ThreadPoolExecutor documentation an unbounded queue
    // keeps the pool at its core size, so the maximum of 8 is presumably never
    // reached — confirm whether that is intended.
    private final QunarThreadPoolExecutor qunarThreadPoolExecutor = new QunarThreadPoolExecutor(5, 8, 5, TimeUnit.MINUTES, queue);

    /**
     * Shuts the pool down: stops accepting tasks, waits up to 20 minutes for
     * queued tasks to finish, then cancels whatever is still running.
     *
     * <p>If the waiting thread is interrupted, remaining tasks are cancelled
     * and the thread's interrupt flag is restored so callers can observe it.
     */
    public void shutThreadPool() {
        qunarThreadPoolExecutor.shutdown();
        try {
            if (!qunarThreadPoolExecutor.awaitTermination(20, TimeUnit.MINUTES)) {
                qunarThreadPoolExecutor.shutdownNow();
            }
        } catch (InterruptedException e) {
            // Do not leave the pool running behind an interrupted caller.
            qunarThreadPoolExecutor.shutdownNow();
            Thread.currentThread().interrupt();
            System.err.println("[ThreadPool Close Error]" + e.getMessage());
        } catch (RuntimeException e) {
            System.err.println("[ThreadPool Close Error]" + e.getMessage());
        }
    }

    /**
     * Queues one batch of results for saving.
     *
     * @param baiduResults the results to save; the list is handed to the save
     *                     task as-is, so the caller must not modify it afterwards
     * @return always {@code true}; it only means the task was accepted, not
     *         that the rows were written
     */
    public boolean update(List<BaiduResult> baiduResults) {
        qunarThreadPoolExecutor.execute(new DataUpdater(baiduResults));
        return true;
    }
}
ReptileBaidu.java
package com.reptileBaidu.sql.util;
import java.util.ArrayList;
import java.util.List;
import org.apache.log4j.Logger;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.Div;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.json.JSONException;
import org.json.JSONObject;
import com.reptileBaidu.domain.BaiduResult;
/**
 * Fetches Baidu result pages and extracts title, URL and abstract of each hit.
 */
public class ReptileBaidu {
    /**
     * Runs one Baidu search request and parses the result page.
     *
     * <p>The page is scanned for {@code div.c-abstract} and {@code div.c-tools}
     * elements. A matched div without an {@code id} attribute supplies the
     * abstract of the pending result; a div with an {@code id} supplies title
     * and URL from the JSON in its {@code data-tools} attribute and completes
     * the result.
     *
     * @param searchContent the search keyword; it is URL-encoded as UTF-8
     *                      before being placed in the request
     * @param startNum      index of the first result, default 0
     * @param onePageNum    maximum number of results per page, at most 50
     * @return the parsed results; empty, or partial, when the page could not
     *         be fetched or parsed
     * @throws JSONException if a {@code data-tools} attribute is not valid JSON
     *                       or lacks the {@code title} or {@code url} key
     */
    public static List<BaiduResult> baiduSearch(String searchContent,int startNum,int onePageNum) throws JSONException{
        String url = "http://www.baidu.com/s?word=" + encodeQuery(searchContent)
                + "&cl=3&pn=" + startNum + "&rn=" + onePageNum;
        List<BaiduResult> baiduList = new ArrayList<BaiduResult>();
        try {
            Parser parser = new Parser(url);
            // Keep only the abstract and tools divs.
            AndFilter abstractDivs = new AndFilter(
                    new HasAttributeFilter("class", "c-abstract"), new NodeClassFilter(Div.class));
            AndFilter toolsDivs = new AndFilter(
                    new HasAttributeFilter("class", "c-tools"), new NodeClassFilter(Div.class));
            NodeList list = parser.extractAllNodesThatMatch(new OrFilter(abstractDivs, toolsDivs));
            // Result being assembled; it carries the raw (unencoded) keyword.
            BaiduResult baidu = new BaiduResult(searchContent);
            for (int i = 0; i < list.size(); i++) {
                Div div = (Div) list.elementAt(i);
                if (div.getAttribute("id") == null) {
                    // Abstract text of the pending result.
                    baidu.setAssumably(div.getStringText());
                } else if (!"".equals(baidu.getAssumably())) {
                    // Compare by content: the former reference comparison (!= "")
                    // was true for practically every value. A result whose
                    // abstract is the empty string is skipped; an unset (null)
                    // abstract still passes.
                    String data = div.getAttribute("data-tools");
                    if (data != null) {
                        JSONObject json1 = new JSONObject(data);
                        baidu.setTitle(json1.getString("title"));
                        baidu.setUrl(json1.getString("url"));
                        System.out.println(baidu.toString());
                        baiduList.add(baidu);
                        baidu = new BaiduResult(searchContent);
                    }
                }
            }
        } catch (ParserException e) {
            System.out.println(url + "-->不存在");
            System.out.println(e.getMessage());
        }
        return baiduList;
    }

    /**
     * URL-encodes a query value as UTF-8 so that non-ASCII keywords, spaces and
     * characters such as {@code &} cannot corrupt the request URL.
     */
    private static String encodeQuery(String text) {
        try {
            // String.valueOf keeps the former rendering of a null keyword ("null").
            return java.net.URLEncoder.encode(String.valueOf(text), "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            // Every JVM is required to support UTF-8.
            throw new IllegalStateException("UTF-8 is not supported by this JVM", e);
        }
    }

    /**
     * Starts ten crawler threads for one fixed keyword, each covering its own
     * window of 1000 result positions.
     */
    public static void main(String[] args) {
        for (int i = 1; i < 11; i++) {
            SearchRunnable r0 = new SearchRunnable("阿拉善", 1000 * (i - 1), 1000 * i);
            Thread t0 = new Thread(r0);
            t0.start();
        }
    }
}
/**
 * Crawls one window of Baidu result positions for a keyword and hands the
 * results to a {@link DataUpdaterMain} in batches.
 */
class SearchRunnable implements Runnable {
    private static final Logger logger = Logger.getLogger(SearchRunnable.class);
    /** Number of results requested per page. */
    private static final int PAGE_SIZE = 50;
    /** Number of collected results that triggers a save. */
    private static final int BATCH_SIZE = 100;

    /** Search keyword. */
    private final String searchContent;
    /** First result position (inclusive). */
    private final int startNum;
    /** Last result position (exclusive). */
    private final int endNum;

    /**
     * @param searchContent the search keyword
     * @param startNum      first result position to fetch (inclusive)
     * @param endNum        result position to stop at (exclusive)
     */
    public SearchRunnable(String searchContent, int startNum, int endNum) {
        this.searchContent = searchContent;
        this.startNum = startNum;
        this.endNum = endNum;
    }

    /**
     * Fetches the window page by page, submits the results for saving in
     * batches, then shuts the save pool down and logs the elapsed time and the
     * number of results submitted.
     */
    public void run() {
        System.out.println("开启线程");
        long start = System.currentTimeMillis();
        List<BaiduResult> baiduList = new ArrayList<BaiduResult>();
        DataUpdaterMain dataUpdaterMain = new DataUpdaterMain();
        int size = 0;
        try {
            try {
                for (int i = startNum; i < endNum; i += PAGE_SIZE) {
                    baiduList.addAll(ReptileBaidu.baiduSearch(searchContent, i, PAGE_SIZE));
                    if (baiduList.size() >= BATCH_SIZE) {
                        dataUpdaterMain.update(baiduList);
                        size += baiduList.size();
                        // Start a new list: the submitted one now belongs to the save task.
                        baiduList = new ArrayList<BaiduResult>();
                    }
                }
            } catch (Exception e) {
                // Keep the stack trace; the crawl of this window stops here.
                logger.error("Crawl failed for [" + searchContent + "] in range ["
                        + startNum + ", " + endNum + ")", e);
            }
            // Save what was collected, including results gathered before a
            // failure; an empty remainder is not submitted.
            if (!baiduList.isEmpty()) {
                dataUpdaterMain.update(baiduList);
                size += baiduList.size();
            }
        } catch (Exception e) {
            logger.error("Saving the remaining results failed for [" + searchContent + "]", e);
        } finally {
            dataUpdaterMain.shutThreadPool();
            logger.info("耗时[" + (System.currentTimeMillis() - start) + "]ms,保存"+size+"条数据");
        }
    }
}