1.爬虫模块
public class Crawler {
//打印日志
private final Logger logger = LoggerFactory.getLogger(Crawler.class);
//放置文档页面(超链接)
private final Queue<Page> docQueue = new LinkedBlockingQueue<>();
//放置详情页面(处理完成,数据在dataSet)
private final Queue<Page> detailQueue = new LinkedBlockingQueue<>();
//采集器
private final WebClient webClient;
//解析器
private final List<Parse> parseList = new LinkedList<>();
//清洗器——管道
private final List<Pipeline> pipelineList = new LinkedList<>();
//线程调度器
private final ExecutorService executorService;
public Crawler(){
this.webClient = new WebClient(BrowserVersion.FIREFOX_38);
this.executorService = Executors.newFixedThreadPool(8,
new ThreadFactory() {
private final AtomicInteger id = new AtomicInteger(0);
@Override
public Thread newThread(Runnable r) {
Thread thread = new Thread(r);
thread.setName("Crawler-Thread"+id.getAndIncrement());
return thread;
}
});
}
//启动爬虫
public void start(){
//1.爬取+解析
//2.清洗
this.executorService.submit(this::parse);
this.executorService.submit(this::pipeline);
}
private void parse(){
while (true){
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
logger.error("Parse occur Exception{}.",e.getMessage());
}
final Page page = this.docQueue.poll();
if (page == null){
continue;
}
this.executorService.submit(new Runnable() {
@Override
public void run() {
try {
//采集
HtmlPage htmlPage = Crawler.this.webClient.getPage(page.getUrl());
page.setHtmlPage(htmlPage);
for (Parse parse:Crawler.this.parseList){
parse.parse(page);
}
if (page.isDetail()){
Crawler.this.detailQueue.add(page);
}else{
Iterator<Page> iterator = page.getSubPage().iterator();
while (iterator.hasNext()){
Page subPage = iterator.next();
Crawler.this.docQueue.add(subPage);
iterator.remove();
}
}
} catch (IOException e) {
logger.error("Parse occur Exception{}.",e.getMessage());
}
}
});
}
}
private void pipeline(){
while (true){
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
logger.error("Pipeline occur Exception{}.",e.getMessage());
}
final Page page = this.detailQueue.poll();
if (page == null){
continue;
}
this.executorService.submit(new Runnable() {
@Override
public void run() {
for (Pipeline pipeline : Crawler.this.pipelineList){
pipeline.pipeline(page);
}
}
});
}
}
//爬虫停止
public void stop(){
if (this.executorService!=null && !this.executorService.isShutdown()){
this.executorService.shutdown();
}
logger.info("Crawler stopped...");
}
public void addParse(Parse parse){
this.parseList.add(parse);
}
public void addPipeline(Pipeline pipeline){
this.pipelineList.add(pipeline);
}
public void addPage(Page page){
this.docQueue.add(page);
}
}
//---------------------------------Common-----------------------------------
@Data
public class Page {
//数据网站的根地址
// 比如:https://so.gushiwen.org
private final String base;
//具体网页的路径
private final String path;
//网页DOM对象
private HtmlPage htmlPage;
//标识网页是否是详情页
private final boolean detail;
//子页面对象集合
private Set<Page> subPage = new HashSet<>();
//数据对象(封装后的hashMap)
private DataSet dataSet = new DataSet();
public String getUrl(){
return this.base+this.path;
}
}
public class DataSet {
//data把DOM解析,清洗之后存储的数据
//比如:
//标题:送孟浩然之广陵
//作者:李白
//正文:XXX
private Map<String,Object> data = new HashMap<>();
public void putData(String key,Object value){
this.data.put(key,value);
}
public Object getData(String key){
return this.data.get(key);
}
public Map<String,Object> getData(){
return new HashMap<>(this.data);
}
}
//---------------------------------解析--------------------------------------
public interface Parse {
//解析页面
void parse(Page page);
}
//链接解析
public class DocumentParse implements Parse {
@Override
public void parse(final Page page) {
if (page.isDetail()){
return;
}
HtmlPage htmlPage = page.getHtmlPage();
//分析网页源码
htmlPage.getBody().getElementsByAttribute("div",
"class","typecont").
forEach(div -> {
DomNodeList<HtmlElement> anodeList = div.
getElementsByTagName("a");
anodeList.forEach(
aNode -> {
String path = aNode.getAttribute("href");
Page subpage = new Page(page.getBase(), path, true);
page.getSubPage().add(subpage);
}
);
});
}
}
//详情页面解析
public class DataPageParse implements Parse {
@Override
public void parse(final Page page) {
if(!page.isDetail()){
return;
}
HtmlPage htmlPage = page.getHtmlPage();
HtmlElement body = htmlPage.getBody();
//标题htmlunit
String titlePath = "//div[@class='cont']/h1/text()";
DomText titleDom =(DomText)body.getByXPath(titlePath).get(0);
String title = titleDom.asText();
//作者
String authorPath = "//div[@class='cont']/p/a[2]";
HtmlAnchor authorDom = (HtmlAnchor) body.getByXPath(authorPath).get(0);
String author = authorDom.asText();
//朝代
String dynastyPath = "//div[@class='cont']/p/a[1]";
HtmlAnchor dynastyDom = (HtmlAnchor) body.getByXPath(dynastyPath).get(0);
String dynasty = authorDom.asText();
//正文
String contextPath = "//div[@class='cont']/div[@class='contson']";
HtmlDivision contentDom = (HtmlDivision) body.getByXPath(contextPath).get(0);
String content = contentDom.asText();
page.getDataSet().putData("title",title);
page.getDataSet().putData("dynasty",dynasty);
page.getDataSet().putData("author",author);
page.getDataSet().putData("content",content);
}
}
//---------------------------------管道------------------------------------
public interface Pipeline {
//管道处理page中的数据
void pipeline(final Page page);
}
//将数据插入数据库中
public class DatabasePopeline implements Pipeline{
private final Logger logger = LoggerFactory.getLogger(DatabasePopeline.class);
private final DataSource dataSource;
public DatabasePopeline(DataSource dataSource) {
this.dataSource = dataSource;
}
@Override
public void pipeline(final Page page) {
String title = ((String) page.getDataSet().getData("title"));
String dynasty = (String) page.getDataSet().getData("dynasty");
String author = ((String) page.getDataSet().getData("author"));
String content = ((String) page.getDataSet().getData("content"));
String sql = "insert into poetry_info values (?,?,?,?);";
try (Connection connection = dataSource.getConnection();
PreparedStatement statement =connection.prepareStatement(sql)){
statement.setString(1,title);
statement.setString(2,dynasty);
statement.setString(3,author);
statement.setString(4,content);
statement.executeUpdate();
}catch (SQLException e){
logger.error("Database insert occur Exception{}.",e.getMessage());
}
}
}
//打印处理后的数据
public class ConsolePipeline implements Pipeline {
@Override
public void pipeline(final Page page) {
Map<String,Object> data = page.getDataSet().getData();
//存储
System.out.println(data);
}
}
2.配置模块
@Data
public class ConfigProperties {
private String crawlerBase;
private String crawlerPath;
private boolean crawlerDetail;
private String dbUsername;
private String dbPassword;
private String dbUrl;
private String dbDriverClass;
private boolean enableConsole ;
public ConfigProperties(){
//从外部文件加载
InputStream inputStream = ConfigProperties.class.getClassLoader().
getResourceAsStream("Config.properties");
Properties p = new Properties();
try {
p.load(inputStream);
} catch (IOException e) {
e.printStackTrace();
}
this.crawlerBase = String.valueOf(p.get("crawler.base"));
this.crawlerPath = String.valueOf(p.get("crawler.path"));
this.crawlerDetail = Boolean.parseBoolean(String.valueOf(p.get("crawler.detail")));
this.dbUsername = String.valueOf(p.get("db.username"));
this.dbPassword = String.valueOf(p.get("db.password"));
this.dbUrl = String.valueOf(p.get("db.url"));
this.dbDriverClass = String.valueOf(p.get("db.driver_class"));
this.enableConsole = Boolean.valueOf(String.valueOf(p.getProperty("config.enable_console","false")));
}
public static void main(String[] args) {
new ConfigProperties();
}
}
public class ObjectFactory {
private static final ObjectFactory instance = new ObjectFactory();//单例模式
//存放所有对象
private final Map<Class,Object> objectHashMap = new HashMap<>();
private ObjectFactory(){
//1.初始化配置对象
initConfigProperties();
//2.数据源对象
initDataSource();
//3爬虫对象
initCrawler();
//4.web对象
initWebController();
}
private void initWebController() {
DataSource dataSource = getObject(DataSource.class);
AnalyzeDao analyzeDao = new AnalyzeDaoImpl(dataSource);
AnalyzeService analyzeService = new AnalyzeServiceImpl(analyzeDao);
WebController webController = new WebController(analyzeService);
objectHashMap.put(WebController.class,webController);
}
private void initCrawler() {
ConfigProperties configProperties = new ConfigProperties();
DruidDataSource dataSource = getObject(DataSource.class);
final Page page = new Page(configProperties.getCrawlerBase(),
configProperties.getCrawlerPath(),
configProperties.isCrawlerDetail());
Crawler crawler = new Crawler();
crawler.addParse(new DocumentParse());
crawler.addParse(new DataPageParse());
if (configProperties.isEnableConsole()){
crawler.addPipeline(new ConsolePipeline());
}
crawler.addPipeline(new DatabasePopeline(dataSource));
crawler.addPage(page);
objectHashMap.put(Crawler.class,crawler);
}
private void initConfigProperties(){
ConfigProperties configProperties = new ConfigProperties();
objectHashMap.put(ConfigProperties.class,configProperties);
}
private void initDataSource(){
ConfigProperties configProperties = getObject(ConfigProperties.class);
DruidDataSource dataSource = new DruidDataSource();
dataSource.setUsername(configProperties.getDbUsername());
dataSource.setPassword(configProperties.getDbPassword());
dataSource.setDriverClassName(configProperties.getDbDriverClass());
dataSource.setUrl(configProperties.getDbUrl());
objectHashMap.put(DataSource.class,dataSource);
}
public <T> T getObject(Class classz){
if (!objectHashMap.containsKey(classz)){
throw new IllegalArgumentException("Class"+classz
.getName()+"not found Object");
}
return (T)objectHashMap.get(classz);
}
public static ObjectFactory getInstance(){
return instance;
}
/*
private void printObjectList(){
System.out.println("----------------ObjectFactoryList--------------");
for (Map.Entry<Class,Object> entry : objectHashMap.entrySet()){
System.out.println(String.format("\t[%s]==>[%s]",
entry.getKey().getCanonicalName(),
entry.getValue().getClass().getCanonicalName()));
}
System.out.println("------------------------------------------------");
}*/
}
3.分析模块
//------------------------------存放诗的信息------------------------------------
@Data
public class PoetryInfo {
private String title;
private String dynasty;
private String author;
private String content;
}
public interface AnalyzeDao {
//分析唐诗中作者的创作数量
List<AuthorCount> analyzeAuthorCount();
//查询所有诗文,提供给业务层进行分析
List<PoetryInfo> queryAllPoetryInfo();
}
public class AnalyzeDaoImpl implements AnalyzeDao {
private final DataSource dataSource;
public AnalyzeDaoImpl(DataSource dataSource) {
this.dataSource = dataSource;
}
@Override
public List<AuthorCount> analyzeAuthorCount() {
List<AuthorCount> datas = new ArrayList<>();
//try()自动关闭
String sql = "select count(*) as count,author from" +
" poetry_info group by author;";
try (Connection connection = dataSource.getConnection();
PreparedStatement statement = connection.prepareStatement(sql);
ResultSet rs = statement.executeQuery();
){
while (rs.next()){
AuthorCount authorCount = new AuthorCount();
authorCount.setAuthor(rs.getString("author"));
authorCount.setCount(rs.getInt("count"));
datas.add(authorCount);
}
} catch (SQLException e) {
e.printStackTrace();
}
return datas;
}
@Override
public List<PoetryInfo> queryAllPoetryInfo() {
List<PoetryInfo> datas = new ArrayList<>();
String sql = "select title,dynasty,author,content from poetry_info;";
try(Connection connection = dataSource.getConnection();
PreparedStatement statement = connection.prepareStatement(sql);
ResultSet rs = statement.executeQuery();
){
while (rs.next()){
PoetryInfo poetryInfo = new PoetryInfo();
poetryInfo.setTitle(rs.getString("title"));
poetryInfo.setDynasty(rs.getString("dynasty"));
poetryInfo.setAuthor(rs.getString("author"));
poetryInfo.setContent(rs.getString("content"));
datas.add(poetryInfo);
}
} catch (SQLException e) {
e.printStackTrace();
}
return datas;
}
}
public interface AnalyzeService {
//分析唐诗中作者的创作数量
List<AuthorCount> analyzeAuthorCount();
//词云分析
List<WordCount> analyzeWordCloud();
}
public class AnalyzeServiceImpl implements AnalyzeService{
private final AnalyzeDao analyzeDao;
public AnalyzeServiceImpl(AnalyzeDao analyzeDao) {
this.analyzeDao = analyzeDao;
}
@Override
public List<AuthorCount> analyzeAuthorCount() {
//排序方式:
//1.DAO层SQL排序
//2.业务层Service进行数据排序
List<AuthorCount> authorCounts = analyzeDao.analyzeAuthorCount();
//降序排序
authorCounts.sort((o1, o2) -> o2.getCount() - o1.getCount());
return authorCounts;
}
@Override
public List<WordCount> analyzeWordCloud() {
//1.查询出所有数据
//2.取出title content
//3.分词-过滤w、null、空-length<2
//4.统计
//<词:词频>
Map<String,Integer> map = new HashMap<>();
List<PoetryInfo> poetryInfos = analyzeDao.queryAllPoetryInfo();
for (PoetryInfo poetryInfo :poetryInfos){
List<Term> terms = new ArrayList<>();
String title = poetryInfo.getTitle();
String content = poetryInfo.getContent();
terms.addAll(NlpAnalysis.parse(title).getTerms());
terms.addAll(NlpAnalysis.parse(content).getTerms());
Iterator<Term>iterator = terms.iterator();
while (iterator.hasNext()){
Term term = iterator.next();
//词性的过滤
if (term.getNatureStr() == null || term.getNatureStr().equals("w")){
iterator.remove();
continue;
}
//词的过滤
if(term.getRealName().length()<2){
iterator.remove();
continue;
}
//统计
String realName = term.getRealName();
Integer count = 0;
if (map.containsKey(realName)){
count = map.get(realName)+1;
}else {
count = 1;
}
map.put(realName,count);
}
}
List<WordCount> wordCounts = new ArrayList<>();
for (Map.Entry<String,Integer> entry: map.entrySet()){
WordCount wordCount = new WordCount();
wordCount.setCount(entry.getValue());
wordCount.setWord(entry.getKey());
wordCounts.add(wordCount);
}
return wordCounts;
}
/* 测试
public static void main(String[] args) {
Result result = NlpAnalysis.parse("松下问童子,言师采药去。" +
"只在此山中,云深不知处。 ");
List<Term> terms = result.getTerms();
for (Term term:terms) {
System.out.println(term);
}
}*/
}
@Data
public class AuthorCount {
private String author;
private Integer count;
}
public class WordCount {
private String word;
private Integer count;
}
4.web模块
public class WebController {
private final AnalyzeService analyzeService;
public WebController(AnalyzeService analyzeService) {
this.analyzeService = analyzeService;
}
//-> http://127.0.0.1:4567/
//->/analyze/author_count
private List<AuthorCount> analyzeAythorCount(){
return analyzeService.analyzeAuthorCount();
}
//-> http://127.0.0.1:4567/
//->/analyze/word_cloud
private List<WordCount> analyzeWordCloud(){
return analyzeService.analyzeWordCloud();
}
public void launch(){
ResponseTransformer transformer = new JSONResponseTransformer();
Spark.staticFileLocation("/static");
Spark.get("/analyze/author_count", ((request, response) ->
analyzeAythorCount()),transformer);
Spark.get("/analyze/word_cloud",((request, response) ->
analyzeWordCloud()),transformer);
}
public static class JSONResponseTransformer implements ResponseTransformer{
//Object——>Gson
private Gson gson = new Gson();
@Override
public String render(Object o) throws Exception {
return gson.toJson(o);
}
}
5.项目主类
public class TangshiAnalyzeApplication {
private static final Logger LOGGER = LoggerFactory.
getLogger(TangshiAnalyzeApplication.class);
public static void main(String[] args) {
WebController webController = ObjectFactory.getInstance().getObject(WebController.class);
//运行了web服务,提供接口
LOGGER.info("Web Server Launch...");
webController.launch();
/* if (args.length==1&&args[0].equals("run-crawler")){
Crawler crawler = ObjectFactory.getInstance().getObject(Crawler.class);
LOGGER.info("Crawler started...");
crawler.start();
}*/
}
}
6.前端模块
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>唐诗分析报告</title>
<link type="text/css" href="assets/css/base.css">
<script type="text/javascript" src="assets/js/echarts.min.js"></script>
<script type="text/javascript" src="assets/js/echarts-wordcloud.min.js"></script>
<script type="text/javascript" src="assets/js/jquery-3.3.1.min.js"></script>
<script type="text/javascript" src="assets/js/api.js"></script>
<script>
</script>
</head>
<body id="box">
<h1>唐诗分析报告</h1>
<div>
<div>
<input type="button" value="创作数量排行榜" onclick="creationRanking('content')">
<input type="button" value="诗词用词云图" onclick="cloudWorld('content')">
</div>
<div id="content" style="width: 800px ; height: 500px;">
</div>
</div>
</body>
</html>
/*
api.js
1. 主要用于前端页面与后端的交互
前端:HTML+CSS+JavaScript开发的浏览器识别的Web程序
后端:Java开发服务程序,通过HTTP协议提供Web API接口
前端和后端交互:通信协议用的是HTTP协议
*/
function creationRanking(id) {
//HTTP Method : GET
//HTTP URL : 请求地址(服务端提供的API接口)
$.get({
url: "/analyze/author_count",
dataType: "json",
method: "get",
success: function (data, status, xhr) {
//echarts图表对象
var myChart = echarts.init(document.getElementById(id));
var options = {
//图标的标题
title: {
text: '唐诗创作排行榜'
},
tooltip: {},
//柱状图的提示信息
legend: {
data: ['数量(首)']
},
//X轴的数据:作者
xAxis: {
data: []
},
//Y轴的数据:创作的数量
yAxis: {},
series: [{
name: '创作数量',
type: 'bar',
data: []
}]
};
//List<AuthorCount>
for (var i=0; i< data.length; i++) {
var authorCount = data[i];
options.xAxis.data.push(authorCount.author);
options.series[0].data.push(authorCount.count);
}
myChart.setOption(options, true);
},
error: function (xhr, status, error) {
}
});
}
function cloudWorld(id) {
$.get({
url: "/analyze/word_cloud",
dataType: "json",
method: "get",
success: function (data, status, xhr) {
var myChart = echarts.init(document.getElementById(id));
var options = {
series: [{
type: 'wordCloud',
shape: 'pentagon',
left: 'center',
top: 'center',
width: '80%',
height: '80%',
right: null,
bottom: null,
sizeRange: [12, 60],
rotationRange: [-90, 90],
rotationStep: 45,
gridSize: 8,
drawOutOfBound: false,
textStyle: {
normal: {
fontFamily: 'sans-serif',
fontWeight: 'bold',
color: function () {
//rgb(r,g,b)
return 'rgb(' + [
Math.round(Math.random() * 160),
Math.round(Math.random() * 160),
Math.round(Math.random() * 160)
].join(',') + ')';
}
},
emphasis: {
shadowBlur: 10,
shadowColor: '#333'
}
},
// Data is an array. Each array item must have name and value property.
data: []
}]
};
for (var i=0 ;i<data.length; i++) {
var wordCount = data[i];
//wordCount => 词 : 词频
options.series[0].data.push({
name: wordCount.word,
value: wordCount.count,
textStyle: {
normal: {},
emphasis: {}
}
});
}
myChart.setOption(options, true);
},
error: function (xhr, status, error) {
}
});
}
效果展示