结果部分截图如下
我使用的是 IntelliJ IDEA,首先建立一个普通的 Maven 项目
先创建一个实体类来存储电影信息
/**
* Entity class that stores the information scraped from the page for one movie.
* All scraped values are kept as strings; only the id is numeric.
*/
public class Page {
// id
private int id;
// number of comments
private String commentcount;
// movie title
private String title;
// movie introduction
private String jieshao;
// movie score
private String fenshu;
// ... getters and setters omitted here
}
创建一个通过URL来获取网页html的工具类PageDownLoadUtil,用来把网页内容下载下来
package util;
import org.apache.http.HttpEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import java.io.IOException;
/**
 * Utility that downloads the body of a web page over HTTP.
 */
public class PageDownLoadUtil {
    /**
     * Issues a GET request for {@code url} and returns the response body as text.
     *
     * <p>The HTTP client and the response are both closed before this method returns,
     * so no connection is left open after the call.
     *
     * @param url address of the page to download
     * @return the response body, or {@code null} when the request fails with an
     *         {@link IOException} or the response carries no entity
     */
    public static String getPageContent(String url) {
        String content = null;
        // try-with-resources releases the response first and then the client, even on failure.
        try (CloseableHttpClient client = HttpClients.custom().build();
             CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            HttpEntity entity = response.getEntity();
            if (entity != null) {
                // UTF-8 is only a fallback: a charset declared by the response still wins.
                content = EntityUtils.toString(entity, "UTF-8");
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content;
    }
}
做一个提取标签属性值的工具类 htmlUtil
package util;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
/**
 * Helper for reading attribute values out of a cleaned HTML tree.
 */
public class htmlUtil {
    /**
     * Evaluates {@code xpath} against {@code tagNode} and reads one attribute from the first match.
     *
     * @param tagNode node the XPath expression is evaluated on
     * @param xpath   XPath expression selecting the element of interest
     * @param att     name of the attribute to read
     * @return the attribute value of the first match, or {@code null} when nothing matches
     *         or the XPath evaluation fails (the stack trace is printed in that case)
     */
    public static String getAttributeByName(TagNode tagNode, String xpath, String att) {
        try {
            Object[] matches = tagNode.evaluateXPath(xpath);
            if (matches.length == 0) {
                return null;
            }
            TagNode first = (TagNode) matches[0];
            return first.getAttributeByName(att);
        } catch (XPatherException e) {
            e.printStackTrace();
            return null;
        }
    }
}
工具类都做好了,使用工具类解析提取网页内容
创建一个解析页面的接口
package service;
/**
 * Contract for a component that processes one page of the crawl.
 */
public interface ProcessService {
    /**
     * Processes a single page.
     *
     * @param url the page to process; NOTE(review): despite the name, the implementation
     *            and caller in this project pass the downloaded HTML text here, not an address
     */
    void process(String url);
}
接下来是它的实现类
package service;
import Pojo.Page;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;
import java.io.IOException;
/**
* 解析页面
*/
public class ProcessServiceImpl implements ProcessService {
pageService pageService = new pageImpl();
public void process(String content) {
Page page = new Page();
HtmlCleaner htmlCleaner = new HtmlCleaner();
TagNode rootNode = htmlCleaner.clean(content);
//获取标题
try {
for (int i = 1; i <= 25; i++) {
//标题
Object[] objects = rootNode.evaluateXPath("//*[@id=\"content\"]/div/div[1]/ol/li["+i+"]/div/div[2]/div[1]/a/span[1]");
if (objects.length > 0) {//有东西
TagNode node = (TagNode) objects[0];
System.out.print(node.getText().toString()+ " ");
page.setTitle(node.getText().toString());
}
Object[] objects1 = rootNode.evaluateXPath("//*[@id=\"content\"]/div/div[1]/ol/li["+i+"]/div/div[2]/div[2]/div/span[2]");
//分数
if (objects1.length > 0) {
TagNode node = (TagNode) objects1[0];
System.out.print(node.getText().toString() + " ");
page.setFenshu(node.getText().toString());
}
//评论数
Object[] objects2 = rootNode.evaluateXPath("//*[@id=\"content\"]/div/div[1]/ol/li["+i+"]/div/div[2]/div[2]/div/span[4]");
if (objects2.length > 0) {
TagNode node = (TagNode) objects2[0];
System.out.print(node.getText().toString() + " ");
page.setCommentcount(node.getText().toString());
}
//介绍
Object[] objects3 = rootNode.evaluateXPath("//*[@id=\"content\"]/div/div[1]/ol/li["+i+"]/div/div[2]/div[2]/p[2]/span");
if (objects3.length > 0) {
TagNode node = (TagNode) objects3[0];
System.out.println(node.getText().toString());
page.setJieshao(node.getText().toString());
}
int result = pageService.save(page);
if (result == 0){
System.out.println("保存失败");
}
}
} catch (XPatherException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
}
}
主要说一下xpath的获取:
我用的是谷歌浏览器,按 F12 可以查看网页代码,找到电影名称所在的元素,然后右键点击 Copy -> Copy XPath
代码中的一个 li 标签代表一个电影,编号是从1到25(每页只有25个电影)
翻页的操作可以用工具类 htmlUtil 来提取"下一页"链接的 href 属性值,然后在主函数中的 url 后面加入这个属性值就相当于翻页了;下面的主函数采用了更简单的做法,直接让 start 参数每次增加 25 来翻页
主函数
import service.ProcessService;
import service.ProcessServiceImpl;
import util.PageDownLoadUtil;
/**
 * Entry point: downloads the list pages one after another and hands each to the parser.
 */
public class Main {
    /** Address of the list without paging parameters. */
    private static final String BASE_URL = "https://movie.douban.com/top250";

    /** Total number of list entries to crawl. */
    private static final int TOTAL_ITEMS = 250;

    /** Number of entries per page; also the step of the {@code start} parameter. */
    private static final int PAGE_SIZE = 25;

    /**
     * Crawls every page of the list.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // The service is the same for every page, so it is created once.
        ProcessService ps = new ProcessServiceImpl();
        for (int start = 0; start < TOTAL_ITEMS; start += PAGE_SIZE) {
            String url = BASE_URL + "?start=" + start + "&filter=";
            String content = PageDownLoadUtil.getPageContent(url);
            if (content == null) {
                // getPageContent returns null when the request fails; skip instead of parsing null.
                System.err.println("页面下载失败,已跳过: " + url);
                continue;
            }
            ps.process(content);
        }
    }
}
剩下的是使用Mybatis框架保存数据了
Pagemapper
package mapper;
import Pojo.Page;
import java.util.ArrayList;
/**
 * Mapper interface for {@code Page} rows. The method names match the statement ids
 * declared under the {@code mapper.Pagemapper} namespace in pagemapper.xml.
 */
public interface Pagemapper {
// Inserts one page (statement id "addPage").
void addPage(Page page);
// Selects every row of the page table (statement id "selectAll").
ArrayList<Page> selectAll();
}
pagemapper.xml
<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE mapper
PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN"
"http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<!-- SQL statements for the mapper.Pagemapper namespace. -->
<mapper namespace="mapper.Pagemapper">
<!-- Inserts one Pojo.Page. NOTE(review): no column list is given, so the five values bind by position and rely on the page table's column order; confirm against the schema. -->
<insert id="addPage" parameterType="Pojo.Page">
insert into page values (#{id} ,#{commentcount} ,#{title} ,#{jieshao} ,#{fenshu})
</insert>
<!-- Returns every row of the page table, mapped to Pojo.Page. -->
<select id="selectAll" resultType="Pojo.Page">
select * from page
</select>
</mapper>
mybatis-config.xml
<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE configuration
        PUBLIC "-//mybatis.org//DTD Config 3.0//EN"
        "http://mybatis.org/dtd/mybatis-3-config.dtd">
<configuration>
    <!-- "default" must reference the id of one of the environments below. -->
    <environments default="mybatis1">
        <environment id="mybatis1">
            <transactionManager type="JDBC"/>
            <dataSource type="POOLED">
                <property name="driver" value="com.mysql.cj.jdbc.Driver"/>
                <!-- The URL stays on one line so no whitespace ends up inside it, and each
                     ampersand is written as an entity reference, as XML attribute values require. -->
                <property name="url" value="jdbc:mysql://localhost:3306/pa?useSSL=false&amp;serverTimezone=UTC&amp;characterEncoding=utf-8&amp;allowPublicKeyRetrieval=true"/>
                <property name="username" value="root"/>
                <!-- NOTE(review): plain-text credential; consider moving it to an external properties file. -->
                <property name="password" value="qqq1234"/>
            </dataSource>
        </environment>
    </environments>
    <mappers>
        <mapper resource="mapper/pagemapper.xml"/>
    </mappers>
</configuration>
pageService
package service;
import Pojo.Page;
import java.io.IOException;
import java.util.List;
/**
 * Persistence operations for {@code Page} entities.
 */
public interface pageService {
// Stores one page and returns an int result; the caller in ProcessServiceImpl treats 0 as a failed save.
int save(Page page) throws IOException;
// Returns the stored pages as a list.
List<Page> selectAll() throws IOException;
}
pageImpl
package service;
import Pojo.Page;
import org.apache.ibatis.io.Resources;
import org.apache.ibatis.session.SqlSession;
import org.apache.ibatis.session.SqlSessionFactory;
import org.apache.ibatis.session.SqlSessionFactoryBuilder;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
/**
 * MyBatis-backed implementation of {@code pageService}.
 *
 * <p>The {@link SqlSessionFactory} is built once and shared by all instances, and every
 * {@link SqlSession} is closed after use so its database connection is released.
 */
public class pageImpl implements pageService {
    /** Classpath location of the MyBatis configuration. */
    private static final String CONFIG_RESOURCE = "mybatis-config.xml";

    /** Lazily created factory; guarded by the class lock in {@link #sessionFactory()}. */
    private static SqlSessionFactory factory;

    /**
     * Returns the shared factory, building it from the configuration on first use.
     *
     * @throws IOException if the configuration resource cannot be read
     */
    private static synchronized SqlSessionFactory sessionFactory() throws IOException {
        if (factory == null) {
            try (InputStream is = Resources.getResourceAsStream(CONFIG_RESOURCE)) {
                factory = new SqlSessionFactoryBuilder().build(is);
            }
        }
        return factory;
    }

    /**
     * Inserts one page through the {@code addPage} statement.
     *
     * @param page entity to store
     * @return the value returned by the insert (number of affected rows)
     * @throws IOException if the MyBatis configuration cannot be read
     */
    @Override
    public int save(Page page) throws IOException {
        // openSession(true) enables auto-commit, so no explicit commit is needed.
        try (SqlSession sqlSession = sessionFactory().openSession(true)) {
            return sqlSession.insert("addPage", page);
        }
    }

    /**
     * Loads every stored page through the {@code selectAll} statement.
     *
     * @return the rows mapped to {@code Page} objects
     * @throws IOException if the MyBatis configuration cannot be read
     */
    @Override
    public List<Page> selectAll() throws IOException {
        try (SqlSession sqlSession = sessionFactory().openSession()) {
            return sqlSession.selectList("selectAll");
        }
    }
}
到这就可以爬取豆瓣电影top250了