### 一、下载驱动
大家可以去MongoDB官网下载,我也整理了MongoDB的Java驱动包上传到了CSDN,点击下载
### 二、代码简介
本爬虫基于HttpClient+Jsoup编写,数据库采用MongoDB。功能是根据提供的CSDN博主id,将该博主的博文信息采集入库。大体过程是:先通过HttpClient访问网页,用Jsoup解析出博文地址(URL),存入集合 blog 中并加上访问标志位;然后从集合 blog 中取出尚未访问的URL,通过URL获取博文详情并入库;最后将访问过的URL标记为已访问状态。
### 三、完整代码
1.MongoDBJDBC.java文件
package com.csdn.dao;
import java.util.ArrayList;
import java.util.List;
import org.bson.Document;
import com.mongodb.BasicDBObject;
import com.mongodb.MongoClient;
import com.mongodb.MongoClientOptions;
import com.mongodb.MongoClientOptions.Builder;
import com.mongodb.MongoException;
import com.mongodb.ServerAddress;
import com.mongodb.client.FindIterable;
import com.mongodb.client.MongoCollection;
import com.mongodb.client.MongoCursor;
import com.mongodb.client.MongoDatabase;
import com.mongodb.client.result.UpdateResult;
/**
 * Small data-access helper that wraps one {@link MongoClient} and offers
 * find / insert / update shortcuts addressed by database and collection name.
 *
 * <p>A shared instance connected to 127.0.0.1:20001 is created when the class
 * is initialized and is available through {@link #getMongoDBJDBC()}. The
 * public constructor remains available for connecting to other servers.
 */
public class MongoDBJDBC {
    /** Value of {@code num} in {@link #find} that means "return every match". */
    private static final int NO_LIMIT = -1;

    private MongoClient mongoClient = null;

    /**
     * Creates a helper whose client targets the given MongoDB server.
     *
     * <p>If the driver throws a {@link MongoException} while the client is
     * being created, the stack trace is printed and the client stays
     * {@code null}; later calls on this instance will then fail.
     *
     * @author ouyang
     * @param ip   host name or IP address of the MongoDB server
     * @param port port of the MongoDB server
     */
    public MongoDBJDBC(String ip, int port) {
        ServerAddress serverAddress = new ServerAddress(ip, port);
        // Connection settings.
        Builder builder = MongoClientOptions.builder()
                .connectTimeout(1000 * 60)   // connect timeout: 60 s
                .maxWaitTime(1000 * 60 * 2)  // max wait time: 120 s
                .connectionsPerHost(50);     // at most 50 connections
        MongoClientOptions options = builder.build();
        try {
            mongoClient = new MongoClient(serverAddress, options);
        } catch (MongoException e) {
            e.printStackTrace();
        }
    }

    /* ---- Singleton: the instance is created eagerly at class initialization ---- */
    private static final MongoDBJDBC mongoDBJDBC = new MongoDBJDBC(
            "127.0.0.1", 20001);

    /**
     * Static factory for the shared instance.
     *
     * @author ouyang
     * @return the shared helper connected to 127.0.0.1:20001
     */
    public static MongoDBJDBC getMongoDBJDBC() {
        return mongoDBJDBC;
    }
    /* ---- End of singleton declaration ---- */

    /**
     * Finds documents whose fields equal the given key/value pairs.
     *
     * @author ouyang
     * @param dbName         database name
     * @param collectionName collection name
     * @param keys           field names to match; must pair up with {@code values}
     * @param values         expected value for the field at the same index in {@code keys}
     * @param num            maximum number of documents to return, or {@code -1} for all matches
     * @return the matching documents; an empty list when {@code keys} or
     *         {@code values} is {@code null}, when their lengths differ, when
     *         {@code num} is zero or negative (other than {@code -1}), or when
     *         the query fails (the failure is printed, not rethrown)
     */
    public List<Document> find(String dbName, String collectionName,
            String[] keys, Object[] values, int num) {
        List<Document> resultList = new ArrayList<Document>();
        if (keys == null || values == null || keys.length != values.length) {
            // Malformed query parameters: nothing to search for.
            return resultList;
        }
        if (num != NO_LIMIT && num <= 0) {
            // An explicit limit that admits no documents.
            return resultList;
        }

        // Build an equality filter from the key/value pairs.
        BasicDBObject queryObj = new BasicDBObject();
        for (int i = 0; i < keys.length; i++) {
            queryObj.put(keys[i], values[i]);
        }

        try {
            FindIterable<Document> matches = mongoClient.getDatabase(dbName)
                    .getCollection(collectionName)
                    .find(queryObj);
            if (num != NO_LIMIT) {
                // Let the server cap the result instead of fetching everything.
                matches = matches.limit(num);
            }
            MongoCursor<Document> cursor = matches.iterator();
            try {
                while (cursor.hasNext()) {
                    resultList.add(cursor.next());
                }
            } finally {
                // Always release the server-side cursor.
                cursor.close();
            }
        } catch (Exception e) {
            // Best effort, as before: report and return what was collected.
            e.printStackTrace();
        }
        return resultList;
    }

    /**
     * Inserts the given documents into a collection.
     *
     * @author ouyang
     * @param dbName         database name
     * @param collectionName collection name
     * @param documents      documents to insert
     * @return {@code false} when {@code documents} is {@code null} or empty
     *         (nothing is inserted), {@code true} after the insert was issued
     */
    public Boolean inSert(String dbName, String collectionName, List<Document> documents) {
        if (documents == null || documents.isEmpty()) {
            return false;
        }
        MongoCollection<Document> dbCollection = getCollection(dbName, collectionName);
        dbCollection.insertMany(documents);
        return true;
    }

    /**
     * Applies a {@code $set} of {@code updateDoc} to every document matching
     * {@code whereDoc}.
     *
     * @author ouyang
     * @param dbName         database name
     * @param collectionName collection name
     * @param whereDoc       filter selecting the documents to update
     * @param updateDoc      fields and values to set
     * @return {@code false} when either argument is {@code null} or empty
     *         (no update is issued), {@code true} after the update was issued
     */
    public Boolean update(String dbName, String collectionName,
            BasicDBObject whereDoc, BasicDBObject updateDoc) {
        // Refuse an empty filter (would touch every document) or an empty $set.
        if (whereDoc == null || updateDoc == null
                || whereDoc.isEmpty() || updateDoc.isEmpty()) {
            return false;
        }
        MongoCollection<Document> dbCollection = getCollection(dbName, collectionName);
        UpdateResult updateManyResult = dbCollection.updateMany(whereDoc,
                new Document("$set", updateDoc));
        System.out.println("更新成功:" + updateManyResult + "个");
        return true;
    }

    /**
     * Applies a {@code $set} of {@code updateDoc} to a single document
     * matching {@code whereDoc}.
     *
     * @author ouyang
     * @param dbName         database name
     * @param collectionName collection name
     * @param whereDoc       filter selecting the document to update
     * @param updateDoc      fields and values to set
     * @return {@code false} when either argument is {@code null} or empty
     *         (no update is issued), {@code true} after the update was issued
     */
    public Boolean updateOne(String dbName, String collectionName,
            BasicDBObject whereDoc, BasicDBObject updateDoc) {
        // Either argument being empty makes the update meaningless.
        if (whereDoc == null || updateDoc == null
                || whereDoc.isEmpty() || updateDoc.isEmpty()) {
            return false;
        }
        MongoCollection<Document> dbCollection = getCollection(dbName, collectionName);
        UpdateResult updateManyResult = dbCollection.updateOne(whereDoc,
                new Document("$set", updateDoc));
        System.out.println("更新成功:" + updateManyResult + "个");
        return true;
    }

    /**
     * Returns the named collection of the named database.
     *
     * @author ouyang
     * @param dbName         database name
     * @param collectionName collection name
     * @return the collection handle
     */
    public MongoCollection<Document> getCollection(String dbName,
            String collectionName) {
        return mongoClient.getDatabase(dbName).getCollection(collectionName);
    }

    /**
     * Returns the database with the given name.
     *
     * @author ouyang
     * @param dbName database name
     * @return the database handle
     */
    public MongoDatabase getDatabase(String dbName) {
        return mongoClient.getDatabase(dbName);
    }
}
2.GetHttpResponse.java文件
package com.csdn.crawler;
import java.io.IOException;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.HttpClients;
/**
 * Issues HTTP GET requests for the crawler through one shared client.
 */
public class GetHttpResponse {
    /** Timeout, in milliseconds, applied to each phase of a request. */
    private static final int TIMEOUT_MILLIS = 5000;

    /**
     * Shared client. Creating a new client per request, as was done before,
     * leaves one unclosed connection pool behind for every page fetched.
     */
    private static final HttpClient HTTP_CLIENT = HttpClients.createDefault();

    /** Request settings reused for every GET. */
    private static final RequestConfig REQUEST_CONFIG = RequestConfig.custom()
            .setConnectTimeout(TIMEOUT_MILLIS)           // establishing the connection
            .setConnectionRequestTimeout(TIMEOUT_MILLIS) // obtaining a connection from the pool
            .setSocketTimeout(TIMEOUT_MILLIS)            // waiting for response data
            .setCookieSpec(CookieSpecs.IGNORE_COOKIES)   // do not process cookies
            .build();

    /**
     * Performs a GET request for {@code url} with browser-like headers.
     *
     * <p>The caller must fully consume the response entity (for example with
     * {@code EntityUtils.toString}) so the underlying connection is released
     * back to the shared client.
     *
     * @param url address to fetch
     * @return the HTTP response
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection problem or timeout
     */
    public static HttpResponse getHttpClient(String url) throws ClientProtocolException, IOException {
        HttpGet httpGet = new HttpGet(url);
        httpGet.setConfig(REQUEST_CONFIG);
        // Header values kept from the original, which notes the page cannot be fetched without them.
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.8");
        httpGet.setHeader("User-Agent",
                "Mozilla/5.0 (Windows NT 6.1; rv:6.0.2) Gecko/20100101 Firefox/6.0.2");
        return HTTP_CLIENT.execute(httpGet);
    }
}
3.GetCSDNInfo.java文件
package com.csdn.crawler;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpResponse;
import org.apache.http.util.EntityUtils;
import org.bson.types.ObjectId;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.csdn.dao.MongoDBJDBC;
import com.mongodb.BasicDBObject;
/**
 * Crawls the posts of one CSDN blogger into MongoDB.
 *
 * <p>List pages follow the pattern
 * {@code https://blog.csdn.net/<user name>/article/list/<page number>}.
 * {@link #getUrl()} stores each listed post in collection {@code blog} of
 * database {@code crawler} with {@code status = "0"}; {@link #getBlogInfo()}
 * then fetches every post still at status "0", stores its details in
 * collection {@code blogInfo} and sets the status to "1".
 */
public class GetCSDNInfo {
    /** How many times in a row one list page is attempted before giving up. */
    private static final int MAX_FETCH_ATTEMPTS = 3;

    public String userName = "qq_24598601";
    public static String url = "https://blog.csdn.net/";

    /**
     * Creates a crawler for the given blogger.
     *
     * <p>The {@code url} argument is only validated; list pages are always
     * built from the static {@link #url} field.
     *
     * @param url      base address of the site; must not be {@code null} or empty
     * @param userName CSDN id of the blogger; must not be {@code null} or empty
     * @throws IllegalArgumentException if either argument is {@code null} or empty
     */
    public GetCSDNInfo(String url, String userName) {
        super();
        if (userName == null || "".equals(userName)) {
            throw new IllegalArgumentException("用户名不能为空");
        }
        if (url == null || "".equals(url)) {
            throw new IllegalArgumentException("url不正确");
        }
        this.userName = userName;
    }

    /**
     * Walks the blogger's list pages, starting at page 1, and stores one
     * document per listed post (fields {@code is}, {@code title}, {@code url},
     * {@code status}) in {@code crawler.blog}.
     *
     * <p>Stops at the first page that has no {@code .article-list} element or
     * no {@code h4 a} links, or when the same page fails to download
     * {@code MAX_FETCH_ATTEMPTS} times in a row.
     *
     * @author 欧阳
     */
    public void getUrl() {
        System.out.println("**********开始获取" + userName + "发布的博文信息**********");
        int pageNum = 1;   // current list page
        int count = 0;     // number of posts found so far
        int failures = 0;  // consecutive download failures for the current page
        while (true) {
            String urlStr = url +
                    userName +
                    "/article/list/" +
                    String.valueOf(pageNum);
            Document page;
            try {
                page = fetchPage(urlStr);
                failures = 0;
            } catch (IOException e) {
                e.printStackTrace();
                System.out.println("发生未知错误!");
                failures++;
                if (failures >= MAX_FETCH_ATTEMPTS) {
                    // Give up instead of retrying the same page forever.
                    break;
                }
                continue;
            }

            Elements links = page.select("h4 a");
            if (page.select(".article-list").isEmpty() || links.isEmpty()) {
                // No list container or no post links: past the last page.
                System.out.println("已经没有博文了!");
                break;
            }

            List<org.bson.Document> listD = new ArrayList<org.bson.Document>();
            for (Element link : links) {
                listD.add(toBlogEntry(link));
            }
            count += listD.size();

            Boolean flag = MongoDBJDBC.getMongoDBJDBC().inSert("crawler", "blog", listD);
            System.out.print("url=" + urlStr + "--");
            if (flag) {
                System.out.println(listD.size() + "条存入数据库");
            } else {
                System.out.println("存入数据库失败!");
            }
            pageNum++;  // move on to the next list page
        }
        System.out.println("成功找到并存入数据库:" + count + "条博文");
        System.out.println("***********结束获取" + userName + "发布的博文信息*********");
    }

    /**
     * Fetches the details of every post in {@code crawler.blog} whose status
     * is "0", in batches of up to 10 requested from the data layer.
     *
     * <p>For each post the publish time ({@code .time}) and read count
     * ({@code .read-count}) are stored in {@code crawler.blogInfo} and the
     * post's status is set to "1". The loop ends when no unvisited post is
     * left, or when a whole batch fails, since repeating it would not make
     * progress.
     *
     * @author 欧阳
     */
    public void getBlogInfo() {
        System.out.println("*******************开始获取博文信息******************");
        // status "0" marks a post that has not been visited yet.
        String[] key = {"status"};
        Object[] value = {"0"};
        List<org.bson.Document> documents = MongoDBJDBC.getMongoDBJDBC()
                .find("crawler", "blog", key, value, 10);
        boolean stuck = false;
        while (documents.size() > 0) {
            boolean progressed = false;
            for (org.bson.Document document : documents) {
                if (crawlBlogDetail(document)) {
                    progressed = true;
                }
            }
            if (!progressed) {
                // Every post in the batch failed and is still at status "0";
                // querying again would return the same posts indefinitely.
                stuck = true;
                break;
            }
            documents = MongoDBJDBC.getMongoDBJDBC()
                    .find("crawler", "blog", key, value, 10);
        }
        if (stuck) {
            System.out.println("仍有节点无法访问,停止重试!");
        } else {
            System.out.println("所有节点都已经被访问!");
        }
        System.out.println("*******************结束获取博文信息******************");
    }

    /** Downloads {@code pageUrl} and parses the body, decoded as UTF-8, with Jsoup. */
    private static Document fetchPage(String pageUrl) throws IOException {
        HttpResponse httpResponse = GetHttpResponse.getHttpClient(pageUrl);
        String html = EntityUtils.toString(httpResponse.getEntity(), "UTF-8");
        return Jsoup.parse(html);
    }

    /**
     * Builds the {@code blog} document for one post link.
     *
     * <p>The link text is split at its first space: the part before it is
     * stored as {@code is}, the rest as {@code title}, so titles containing
     * spaces are kept whole. Text without a space is stored entirely as the
     * title with an empty {@code is}.
     */
    private static org.bson.Document toBlogEntry(Element link) {
        String[] parts = link.text().trim().split(" ", 2);
        boolean hasMarker = parts.length > 1;
        org.bson.Document entry = new org.bson.Document();
        entry.put("is", hasMarker ? parts[0] : "");
        entry.put("title", hasMarker ? parts[1].trim() : parts[0]);
        entry.put("url", link.attr("href"));
        entry.put("status", "0");
        return entry;
    }

    /**
     * Fetches one post, stores its details and marks it visited.
     *
     * @return {@code true} if the details were stored and the post was marked
     *         as visited, {@code false} if the post could not be processed
     */
    private boolean crawlBlogDetail(org.bson.Document document) {
        String blogUrl = document.getString("url");
        ObjectId id = document.getObjectId("_id");
        if (blogUrl == null || "".equals(blogUrl)) {
            System.out.println("未能解析博文详情:" + blogUrl);
            return false;
        }
        try {
            Document docs = fetchPage(blogUrl);
            Elements creatTime = docs.select(".time");      // publish time
            Elements readNum = docs.select(".read-count");  // read count
            if (creatTime.isEmpty() || readNum.isEmpty()) {
                // The page does not have the expected layout.
                System.out.println("未能解析博文详情:" + blogUrl);
                return false;
            }
            // Keep the part after the colon; fall back to the whole text if there is none.
            String[] readParts = readNum.get(0).text().split(":");
            String reads = readParts.length > 1 ? readParts[1] : readParts[0];

            List<org.bson.Document> listD = new ArrayList<org.bson.Document>();
            org.bson.Document detail = new org.bson.Document();
            detail.put("pkid", id);
            detail.put("creatTime", creatTime.get(0).text());
            detail.put("readNum", reads);
            listD.add(detail);

            Boolean flag = MongoDBJDBC.getMongoDBJDBC().inSert("crawler", "blogInfo", listD);
            if (!flag) {
                System.out.println("存入数据库失败!");
                return false;
            }
            // Mark the post as visited.
            BasicDBObject whereDoc = new BasicDBObject();
            whereDoc.put("_id", id);
            BasicDBObject updateDoc = new BasicDBObject();
            updateDoc.put("status", "1");
            MongoDBJDBC.getMongoDBJDBC().updateOne("crawler", "blog", whereDoc, updateDoc);
            return true;
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("发生未知错误!");
            return false;
        }
    }
}
4.测试文件GetInfoTest.java
package com.csdn.crawler.test;
import com.csdn.crawler.GetCSDNInfo;
/**
 * Manual entry point: crawls the post list of one blogger, then the details
 * of every post that has not been visited yet.
 */
public class GetInfoTest {
    /**
     * Runs the two crawl phases once.
     *
     * <p>{@code getUrl()} inserts every listed post each time it is called,
     * so repeating it would store and re-crawl the same posts again.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        GetCSDNInfo info = new GetCSDNInfo("https://blog.csdn.net/",
                "qq_24598601");
        info.getUrl();
        info.getBlogInfo();
    }
}
### 四、结束语
Java操作MongoDB数据库还有许多方法:MongoDB API Documentation for Java,这是MongoDB-Java的API。