解析具体模块列表页并优化解析实现类
例如 CSDN 主页边栏的 5G 模块
给 Page 类添加 url 列表属性
// Stores the module's URLs (both list-page URLs and detail-page URLs).
private List<String> urlList= new ArrayList<String>();
连接数据库模块 并提供对数据库的各种操作接口
package work.spider.util;
import java.sql.Connection;
import java.sql.Date;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
/**
 * Database helper for the crawler (created 2020/3/20).
 *
 * <p>Every call opens its own JDBC connection to the MySQL server at
 * {@code 127.0.0.1:3306} and closes it again before returning. Database
 * failures are best-effort: they are reported on the console and turned into
 * a neutral result ({@code 0} or {@code false}) instead of being thrown.
 */
public class DatabaseUtil {

    private static final String DRIVER_CLASS = "com.mysql.cj.jdbc.Driver";
    private static final String USER = "root";
    private static final String PASSWORD = "root";

    /**
     * Executes a data-modifying SQL statement (INSERT / UPDATE / DELETE).
     *
     * <p>{@code command} is sent to the database verbatim, so the caller is
     * responsible for making sure it contains no untrusted, unescaped input.
     *
     * @param dbname  name of the database (schema) to connect to
     * @param command complete SQL statement to execute
     * @return the row count reported by the driver, or {@code 0} when the
     *         connection could not be opened or the statement failed
     */
    public static int insert(String dbname, String command) {
        // try-with-resources closes the statement and the connection on every
        // path, including when executeUpdate throws.
        try (Connection con = openConnection(dbname);
             Statement stmt = con.createStatement()) {
            return stmt.executeUpdate(command);
        } catch (SQLException e) {
            e.printStackTrace();
            return 0;
        }
    }

    /**
     * Checks whether the {@code csdn} table already holds a row for a URL.
     *
     * @param dbname name of the database (schema) to connect to
     * @param URL    value to look for in the {@code url} column; a
     *               {@code null} value never matches
     * @return {@code true} if at least one matching row exists; {@code false}
     *         if none exists, the connection could not be opened, or the
     *         query failed
     */
    public static boolean isfind(String dbname, String URL) {
        // The URL is bound as a parameter rather than concatenated into the
        // SQL text: crawled URLs may contain quotes and must not be able to
        // alter the query.
        String query = "SELECT 1 FROM csdn WHERE url=? LIMIT 1";
        try (Connection con = openConnection(dbname);
             PreparedStatement stmt = con.prepareStatement(query)) {
            stmt.setString(1, URL);
            try (ResultSet rs = stmt.executeQuery()) {
                // One row is enough to answer the question.
                return rs.next();
            }
        } catch (SQLException e) {
            e.printStackTrace();
            return false;
        }
    }

    /**
     * Opens a connection to the given database on the local MySQL server.
     *
     * @param dbname name of the database (schema) to connect to
     * @return an open connection that the caller must close
     * @throws SQLException if no connection can be established
     */
    private static Connection openConnection(String dbname) throws SQLException {
        try {
            Class.forName(DRIVER_CLASS);
        } catch (ClassNotFoundException e) {
            // Not fatal by itself: DriverManager may still locate a driver,
            // and if it cannot, getConnection below reports the failure.
            System.out.println("forNameError:" + e);
        }
        String url = "jdbc:mysql://127.0.0.1:3306/" + dbname + "?serverTimezone=UTC";
        return DriverManager.getConnection(url, USER, PASSWORD);
    }
}
对解析类的优化、保存一个列表中每一篇文章的url
package work.spider.service.impl;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import work.spider.entity.Page;
import work.spider.service.IProcessService;
import work.spider.util.HtmlUtil;
import work.spider.util.LoadPropertyUtil;
import work.spider.util.RegexUtil;
/**
 * Page-parsing implementation for CSDN.
 *
 * <p>Pages whose URL starts with {@code https://blog.csdn.net/} are handled as
 * blog detail pages; every other page is handled as a list page from which the
 * detail-page links are collected.
 *
 * @author lwr
 * created 2020-03-13
 */
public class CSDNProcessService implements IProcessService {

    /**
     * Parses the content of {@code page} and stores the result back on it.
     *
     * @param page page whose content and URL are read, and which receives
     *             either the parsed detail fields or the collected URLs
     */
    public void process(Page page) {
        String html = page.getContent();
        TagNode root = new HtmlCleaner().clean(html);

        // Blog detail page: extract its fields and stop.
        if (page.getUrl().startsWith("https://blog.csdn.net/")) {
            parseDetail(page, root);
            return;
        }

        // Otherwise collect the detail-page URLs.
        try {
            // Links in the page as it is before dynamic loading.
            gatherDetailUrls(page, root, "hrefPath1", false);
            // Links in the page after loading; only these are echoed to the console.
            gatherDetailUrls(page, root, "hrefPath2", true);
        } catch (XPatherException e) {
            e.printStackTrace();
        }
    }

    /**
     * Adds the {@code href} attribute of every node matched by a configured
     * XPath expression to the page's URL list.
     *
     * @param page     page that receives the URLs
     * @param root     root node the XPath expression is evaluated against
     * @param xpathKey CSDN configuration key that holds the XPath expression
     * @param echo     whether each URL is also printed to standard output
     * @throws XPatherException if the XPath expression cannot be evaluated
     */
    private void gatherDetailUrls(Page page, TagNode root, String xpathKey, boolean echo)
            throws XPatherException {
        Object[] matches = root.evaluateXPath(LoadPropertyUtil.getCSDN(xpathKey));
        for (Object match : matches) {
            String href = ((TagNode) match).getAttributeByName("href");
            page.addUrl(href);
            if (echo) {
                System.out.println("detailurl:" + href);
            }
        }
    }

    /**
     * Parses a blog detail page.
     *
     * <p>Each field is obtained through {@code HtmlUtil.getFieldByRegex} using
     * an XPath expression and a regular expression that are both looked up in
     * the CSDN configuration.
     *
     * @param page     page that receives the extracted fields
     * @param rootNode root node of the cleaned detail page
     */
    public void parseDetail(Page page, TagNode rootNode) {
        // Total read count.
        page.setAllnumber(
                HtmlUtil.getFieldByRegex(
                        rootNode,
                        LoadPropertyUtil.getCSDN("readNumberPath"),
                        LoadPropertyUtil.getCSDN("allnumberRegex")));

        // Comment count.
        page.setCommentNuber(
                HtmlUtil.getFieldByRegex(
                        rootNode,
                        LoadPropertyUtil.getCSDN("commentNumberPath"),
                        LoadPropertyUtil.getCSDN("commentnumberRegex")));

        // Publish time.
        page.setPublishtime(
                HtmlUtil.getFieldByRegex(
                        rootNode,
                        LoadPropertyUtil.getCSDN("publishTimePath"),
                        LoadPropertyUtil.getCSDN("publishTime")));

        // Like count.
        page.setSupportNumber(
                HtmlUtil.getFieldByRegex(
                        rootNode,
                        LoadPropertyUtil.getCSDN("supportNumberPath"),
                        LoadPropertyUtil.getCSDN("supportNumberRegex")));

        // Blogger id.
        page.setBlogger(
                HtmlUtil.getFieldByRegex(
                        rootNode,
                        LoadPropertyUtil.getCSDN("bloggerPath"),
                        LoadPropertyUtil.getCSDN("bloggerRegex")));

        // Blog title.
        page.setBlogTitle(
                HtmlUtil.getFieldByRegex(
                        rootNode,
                        LoadPropertyUtil.getCSDN("blogTitlePath"),
                        LoadPropertyUtil.getCSDN("blogTitleRegex")));
    }
}
当前遇到一个问题就是关于 csdn博客的下拉动态加载问题,怎样获取加载出的新的文章的url
通过 Chrome 浏览器的开发者工具 → Network 面板可以查看到:
当下拉页面触发动态加载时,请求列表中会出现新的请求;根据其 request 地址打开后,可以看到返回的内容是 JSON 格式。
可以使用 fastjson 解析该内容,获得每个博主的 id,然后即可访问每个博主的博文主页,便可以依次爬取每篇文章的数据了。