暑假学习了一下基于Java的爬虫,拿b站的排行榜来练下手。
我用的框架如下
com.db :连接MySQL的类及设置MySQL账户信息的设定。
com.main :主程序。
com.model :对每个视频的信息进行包装。
com.parse :对需求爬取的信息的筛选过滤。
com.util :连接b站排行榜页面的类。
—————————————————————————————————————————————
com.db包:
package com.db;
import javax.sql.DataSource;
import org.apache.commons.dbcp2.BasicDataSource;
//设定MySQL的信息的类
//Factory for the MySQL DataSource (connection-pool) configuration.
public class MyDataSource {

    /**
     * Builds a pooled DataSource for the given JDBC URL using the
     * default account placeholders configured below.
     *
     * @param URL full JDBC connection URL
     * @return a configured BasicDataSource
     */
    public static DataSource getDataSource(String URL) {
        // Delegate to the generalized overload so credentials are no
        // longer hard-wired into the pool-construction logic.
        return getDataSource(URL, "账号", "密码");
    }

    /**
     * Builds a pooled DataSource for the given JDBC URL and credentials.
     * Generalized overload so callers can supply an account instead of
     * editing this class.
     *
     * @param URL      full JDBC connection URL
     * @param username database account name
     * @param password database account password
     * @return a configured BasicDataSource
     */
    public static DataSource getDataSource(String URL, String username, String password) {
        BasicDataSource ds = new BasicDataSource();
        ds.setUsername(username);
        ds.setPassword(password);
        ds.setUrl(URL);
        return ds;
    }
}
package com.db;
import java.util.List;
import java.util.ArrayList;
import javax.sql.DataSource;
import org.apache.commons.dbutils.QueryRunner;
import com.model.videoModel;
//Handles persistence of scraped video records into MySQL via DbUtils.
public class MySQLControl {
    // DataSource built from the connection URL (note the charset/timezone
    // parameters), then handed to QueryRunner for statement execution.
    static DataSource ds = MyDataSource.getDataSource("jdbc:mysql://127.0.0.1:3306/bilicrawler?useSSL=false&useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC");
    static QueryRunner qr = new QueryRunner(ds);

    /**
     * Batch-inserts every scraped video into the rankinfo table.
     * Exceptions are reported to stderr and otherwise swallowed,
     * matching the caller's best-effort expectation.
     */
    public static void Insert(List<videoModel> data) {
        // Flatten each model into one row of bind parameters:
        // rank, title, author, playNum, danmuNum, point.
        Object[][] rows = new Object[data.size()][6];
        int r = 0;
        for (videoModel v : data) {
            rows[r][0] = v.getRank();
            rows[r][1] = v.getTitle();
            rows[r][2] = v.getAuthor();
            rows[r][3] = v.getPlayNum();
            rows[r][4] = v.getDanmuNum();
            rows[r][5] = v.getPoint();
            r++;
        }
        try {
            // One parameterized insert executed per row of the batch.
            qr.batch("insert into rankinfo(rank_index, title, author, playNum, danmuNum, point)VALUES(?, ?, ?, ?, ?, ?)", rows);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
1、编码方面:注意我连接url后面的那几个参数,我的机器一定要有这几个设定才可以运行。
2、注意你定义的字段名不能和MySQL的保留字重合(如rank)。不然就会像我一样排查一上午的错误QAQ
—————————————————————————————————————————————
com.model包:
package com.model;
//一个简单的bean类。
//A simple bean holding one ranked video's scraped fields.
public class videoModel {
    private int rank;        // ranking position on the chart
    private String title;    // video title
    private String playNum;  // play count as displayed text (kept as-is from the page)
    private String danmuNum; // danmaku (bullet-comment) count as displayed text
    private String author;   // uploader name
    private String point;    // composite score shown on the ranking page

    public int getRank() {
        return rank;
    }

    /**
     * Sets the rank from the raw page text.
     * The text is trimmed first: node text scraped via XPath often carries
     * surrounding whitespace/newlines that would make Integer.parseInt throw.
     *
     * @param rank numeric rank as text, possibly whitespace-padded
     * @throws NumberFormatException if the trimmed text is not an integer
     */
    public void setRank(String rank) {
        this.rank = Integer.parseInt(rank.trim());
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getPlayNum() {
        return playNum;
    }

    public void setPlayNum(String playNum) {
        this.playNum = playNum;
    }

    public String getDanmuNum() {
        return danmuNum;
    }

    public void setDanmuNum(String danmuNum) {
        this.danmuNum = danmuNum;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    public String getPoint() {
        return point;
    }

    public void setPoint(String point) {
        this.point = point;
    }
}
—————————————————————————————————————————————
com.parse包:
package com.parse;
import java.util.ArrayList;
import java.util.List;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import com.model.videoModel;
//Extracts video data from the ranking-page HTML via HtmlCleaner + XPath.
public class parse {

    /**
     * Parses the ranking page and returns one videoModel per list entry.
     *
     * @param html raw HTML of the bilibili ranking page
     * @return the scraped videos in page order
     * @throws XPatherException if an XPath expression fails to evaluate
     * @throws Exception        propagated from downstream parsing
     */
    public static List<videoModel> getData(String html) throws XPatherException, Exception {
        List<videoModel> dataList = new ArrayList<videoModel>();
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode node = cleaner.clean(html);
        // One result array per field; all arrays are indexed in the same
        // list order, so index i of each belongs to the same video.
        Object[] videos = node.evaluateXPath("//*[@class='rank-list']/li");
        Object[] title = node.evaluateXPath("//*[@class='rank-list']/li//div[@class='info']/a");
        Object[] playNum = node.evaluateXPath("//*[@class='rank-list']/li//div[@class='detail']/span[1]");
        Object[] danmuNum = node.evaluateXPath("//*[@class='rank-list']/li//div[@class='detail']/span[2]");
        Object[] author = node.evaluateXPath("//*[@class='rank-list']/li//div[@class='detail']/a/span");
        Object[] point = node.evaluateXPath("//*[@class='rank-list']/li//div[@class='info']//div[@class='pts']/div");
        Object[] rank = node.evaluateXPath("//*[@class='rank-list']/li/div[1]");
        for (int i = 0; i < videos.length; i++) {
            videoModel model = new videoModel();
            model.setTitle(((TagNode) title[i]).getText().toString());
            model.setPlayNum(((TagNode) playNum[i]).getText().toString());
            model.setDanmuNum(((TagNode) danmuNum[i]).getText().toString());
            model.setAuthor(((TagNode) author[i]).getText().toString());
            model.setPoint(((TagNode) point[i]).getText().toString());
            model.setRank(((TagNode) rank[i]).getText().toString());
            dataList.add(model);
            // Fix: the original called showData(model) here, but that debug
            // method was commented out, leaving a dangling reference that
            // does not compile. The call and the dead code are removed.
        }
        return dataList;
    }
}
1、这里我把每一个信息都开辟了一个object来储存,属实有点浪费空间…下次可以把object放入for里面来。
2、注意如果你xpath定位到"…/text()"这里的话,那后面的转化成tagnode就不能用getText()方法。否则会报错。所以不用定位到text()也可以的。
—————————————————————————————————————————————
com.util包:
package com.util;
import java.io.IOException;
import java.net.http.HttpClient;
import java.net.http.HttpHeaders;
import java.net.http.HttpResponse;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.*;
import org.apache.http.cookie.Cookie;
import org.apache.http.cookie.CookieSpec;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.DefaultHttpRequestRetryHandler;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicHeader;
import org.apache.http.util.EntityUtils;
//Wraps Apache HttpClient GET requests with the default crawler headers.
public class HttpRequestUtil {
    private CloseableHttpClient httpClient;
    private List<Header> headList = new ArrayList<Header>(); // default request headers

    /**
     * Executes a GET request against the given URL and returns the
     * response entity, or null if the request failed.
     *
     * @param url the address to fetch
     * @return the response entity, or null on connection failure
     */
    public HttpEntity getEntityByGetMethod(String url) {
        initDefaultHeaders();
        // Fix: build the client once and reuse it. The original rebuilt it
        // on every call, leaking clients and (combined with the old
        // initDefaultHeaders) re-sending duplicated default headers.
        if (httpClient == null) {
            httpClient = HttpClients.custom()
                    .setRetryHandler(new DefaultHttpRequestRetryHandler())
                    .setDefaultHeaders(headList)
                    .build();
        }
        RequestConfig defaultConfig = RequestConfig.custom().setCookieSpec(CookieSpecs.STANDARD).build();
        HttpGet get = new HttpGet(url);
        get.setConfig(defaultConfig); // cookie policy for the request
        CloseableHttpResponse response = null;
        try {
            response = httpClient.execute(get);
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Fix: execute() may have thrown, leaving response null; the
        // original dereferenced it unconditionally and threw an NPE.
        if (response == null) {
            return null;
        }
        return response.getEntity();
    }

    /**
     * Fetches the page at the given URL and decodes it with the given
     * charset, or returns null if the fetch/decoding failed.
     *
     * @param url  the address to fetch
     * @param code charset name used to decode the body (e.g. "utf-8")
     * @return the decoded page content, or null on failure
     */
    public String getContentByGetMethod(String url, String code) {
        try {
            return EntityUtils.toString(getEntityByGetMethod(url), code);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Populates the default header list. Now idempotent: the original
     * appended on every call, so a second request duplicated all headers.
     *
     * @return the (shared) default header list
     */
    public List<Header> initDefaultHeaders() {
        if (headList.isEmpty()) {
            headList.add(new BasicHeader("accept", "***"));
            headList.add(new BasicHeader("accept-encoding", "***"));
            headList.add(new BasicHeader("accept-language", "***"));
            headList.add(new BasicHeader("if-modified-since", "***"));
            headList.add(new BasicHeader("user-agent", "***"));
        }
        return headList;
    }
}
—————————————————————————————————————————————
com.main包:
package com.main;
import java.util.ArrayList;
import java.util.List;
import org.htmlcleaner.XPatherException;
import com.db.MySQLControl;
import com.model.videoModel;
import com.parse.parse;
import com.util.HttpRequestUtil;
//Entry point: fetch the ranking page, parse it, store results in MySQL.
public class biliCrawler {
    static HttpRequestUtil httpRequest = new HttpRequestUtil();

    public static void main(String[] args) throws Exception {
        String url = "https://www.bilibili.com/ranking?spm_id_from=333.851.b_7072696d61727950616765546162.3";
        String html = httpRequest.getContentByGetMethod(url, "utf-8");
        // Fix: getContentByGetMethod returns null when the fetch fails;
        // the original passed that null straight into the parser (NPE).
        if (html == null) {
            System.err.println("Failed to fetch ranking page: " + url);
            return;
        }
        List<videoModel> datalist = parse.getData(html);
        MySQLControl.Insert(datalist);
    }
}
//调用前面几个类的一个main程序。
导入到mysql后,用navicat查看,效果如下:
目前来说还只能爬这种静态页面,还需继续学习!