唐诗之诗词爬取
1,项目简介
通过抓取网页上的唐诗三百首,对数据进行清洗、存储、分析,最后生成可视化文档。
2,项目思路
从网络上爬取古诗词并保存至数据库中,主要划分为以下步骤:
- 在数据库中创建唐诗表结构
- 获取古诗词列表的html
- 从列表页html提取每首诗的信息
- 进入每首诗的详情页获取(标题,朝代,作者,正文等信息)
- 计算sha256(标题+正文)保证数据不重复
- 调用一个分词第三方库,对内容进行分词
- 将数据保存到数据库的表中
3,数据抓取阶段需要的第三方依赖
数据库操作:mysql
html页面请求解析需要:htmlunit
实现分词需要:ansj_seg
Servlet:servlet
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.36.0</version>
</dependency>
<dependency>
<groupId>org.ansj</groupId>
<artifactId>ansj_seg</artifactId>
<version>5.1.6</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.47</version>
</dependency>
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>javax.servlet-api</artifactId>
<version>3.1.0</version>
<scope>provided</scope>
</dependency>
4,项目编写
- 数据库的创建
-- Table holding one row per crawled Tang poem.
CREATE TABLE TANGSHI(
    -- Surrogate primary key.
    id INT PRIMARY KEY AUTO_INCREMENT,
    -- SHA-256 hex digest of (title + content); UNIQUE enforces de-duplication (see step list above).
    sha256 CHAR(64) NOT NULL UNIQUE,
    -- Dynasty of the poem (fixed typo: column was misspelled "dynastay").
    dynasty VARCHAR(10) NOT NULL,
    title VARCHAR(30) NOT NULL,
    author VARCHAR(10) NOT NULL,
    -- Full poem text.
    content TEXT NOT NULL,
    -- Word-segmentation result of content (produced by ansj_seg).
    words TEXT NOT NULL
);
- 网页抓取第三方库htmlunit的使用
/**
 * Demo: fetch the Tang-poem list page with HtmlUnit, save it to disk,
 * and print every {@code <div class="typecont">} element.
 */
public class HtmlUnitDemo {
    public static void main(String[] args) throws IOException {
        // Headless browser. try-with-resources guarantees the WebClient is
        // closed (the original version leaked it).
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // We only need the static HTML, so skip the JS and CSS engines —
            // this also makes the fetch much faster.
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setCssEnabled(false);
            // Request the concrete list-page URL.
            HtmlPage page = webClient.getPage("https://www.gushiwen.org/gushi/tangshi.aspx");
            System.out.println(page);
            // NOTE: save() may only be called once per page.
            page.save(new File("唐诗三百首\\列表页.html"));
            // Extract all <div class="typecont"> elements from the page body.
            HtmlElement body = page.getBody();
            List<HtmlElement> elements = body.getElementsByAttribute(
                    "div",       // element (tag) name
                    "class",     // attribute name
                    "typecont"); // attribute value
            for (HtmlElement element : elements) {
                System.out.println(element);
            }
        }
    }
}
- 从列表页提取每首诗信息
/**
 * Demo: extract the detail-page link ({@code href}) of every poem from the
 * list page, printing each link and the total count.
 */
public class 列表页提取Demo {
    public static void main(String[] args) throws IOException {
        // try-with-resources closes the WebClient (the original leaked it).
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // Static HTML is enough — disable the JS and CSS engines.
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setCssEnabled(false);
            String url = "https://www.gushiwen.org/gushi/tangshi.aspx";
            HtmlPage page = webClient.getPage(url);
            HtmlElement body = page.getBody();
            // Each <div class="typecont"> holds one group of poem links.
            List<HtmlElement> elements = body.getElementsByAttribute(
                    "div",
                    "class",
                    "typecont"
            );
            int count = 0;
            // Every <a> inside a group points at one poem's detail page.
            for (HtmlElement element : elements) {
                List<HtmlElement> aElements = element.getElementsByTagName("a");
                for (HtmlElement a : aElements) {
                    System.out.println(a.getAttribute("href"));
                    count++;
                }
            }
            System.out.println(count);
        }
    }
}
- 从诗详情页提取作者,朝代,诗词等信息…
/**
 * Demo: from a single poem's detail page, extract and print the title,
 * dynasty, author, and poem text via XPath.
 */
public class 详情页提取Demo {
    public static void main(String[] args) throws Exception {
        // try-with-resources closes the WebClient (the original leaked it).
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            webClient.getOptions().setJavaScriptEnabled(false);
            webClient.getOptions().setCssEnabled(false);
            String url = "https://so.gushiwen.org/shiwenv_45c396367f59.aspx";
            HtmlPage page = webClient.getPage(url);
            HtmlElement body = page.getBody();
            // Title: the <h1> inside <div class="cont">.
            System.out.println(firstText(body, "//div[@class='cont']/h1/text()"));
            // Dynasty: first <a> inside <p class="source">.
            System.out.println(firstText(body, "//div[@class='cont']/p[@class='source']/a[1]/text()"));
            // Author: second <a> inside <p class="source">.
            System.out.println(firstText(body, "//div[@class='cont']/p[@class='source']/a[2]/text()"));
            // Poem body: the whole <div class="contson">, trimmed.
            String xpath = "//div[@class='cont']/div[@class='contson']";
            Object o = body.getByXPath(xpath).get(0);
            HtmlElement element = (HtmlElement) o;
            System.out.println(element.getTextContent().trim());
        }
    }

    /** Returns the text of the first DomText node matched by {@code xpath}. */
    private static String firstText(HtmlElement body, String xpath) {
        Object o = body.getByXPath(xpath).get(0);
        DomText domText = (DomText) o;
        return domText.asText();
    }
}
方法解释:
1,获取div标签中class属性值为typecont的html元素
`List<HtmlElement> elements = body.getElementsByAttribute(
"div",//元素名称
"class",//属性名称
"typecont");//属性值`
2,XPath
通过XPath定位class为cont的div:其下h1为标题;class为source的p标签中,第一个a标签为朝代,第二个a标签为作者;class为contson的div为诗文正文。
- 实现分词需要的ansj_seg
import org.ansj.domain.Term;
import org.ansj.splitWord.analysis.NlpAnalysis;
import java.util.List;
/**
 * Demo: segment a Chinese sentence with the ansj NLP analyser and print
 * each term as "partOfSpeech:word".
 */
public class 分词Demo {
    public static void main(String[] args) {
        // Sentence to segment.
        String sentence = "靡不有初,鲜克有终";
        // Iterate the segmentation result directly; one line per term.
        for (Term term : NlpAnalysis.parse(sentence).getTerms()) {
            String line = term.getNatureStr() + ":" + term.getName();
            System.out.println(line);
        }
    }
}
5,项目实现
项目实现代码