Java项目之seek-poetry（爬虫）

最新推荐文章于 2023-08-07 16:41:22 发布

perfectmatch_G

最新推荐文章于 2023-08-07 16:41:22 发布

阅读量452

点赞数

分类专栏： Java

本文链接：https://blog.csdn.net/perfectmatch_g/article/details/100104873

版权

Java 专栏收录该内容

22 篇文章 0 订阅

订阅专栏

本文详细介绍了Java实现的诗词爬虫项目seek-poetry，包括爬虫模块负责数据抓取，配置模块管理项目设置，分析模块进行数据解析，web模块构建后台服务，项目主类启动入口，以及前端模块展示效果。

摘要由CSDN通过智能技术生成

1.爬虫模块


public class Crawler {
	//打印日志
    private final Logger logger = LoggerFactory.getLogger(Crawler.class);

    //放置文档页面（超链接）
    private final Queue<Page> docQueue = new LinkedBlockingQueue<>();
    //放置详情页面(处理完成，数据在dataSet)
    private final Queue<Page> detailQueue = new LinkedBlockingQueue<>();
    //采集器
    private final WebClient webClient;
    //解析器
    private final List<Parse> parseList = new LinkedList<>();
    //清洗器——管道
    private final List<Pipeline> pipelineList = new LinkedList<>();
    //线程调度器
    private final ExecutorService executorService;

    public Crawler(){
        this.webClient = new WebClient(BrowserVersion.FIREFOX_38);
        this.executorService = Executors.newFixedThreadPool(8,
                new ThreadFactory() {
            private final AtomicInteger id = new AtomicInteger(0);
            @Override
            public Thread newThread(Runnable r) {
                Thread thread = new Thread(r);
                thread.setName("Crawler-Thread"+id.getAndIncrement());
                return thread;
            }
        });
    }
    //启动爬虫
    public void start(){
        //1.爬取+解析
        //2.清洗
        this.executorService.submit(this::parse);
        this.executorService.submit(this::pipeline);
    }
    private void parse(){
        while (true){
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error("Parse occur Exception{}.",e.getMessage());
            }
            final Page page = this.docQueue.poll();
            if (page == null){
                continue;
            }
            this.executorService.submit(new Runnable() {
                @Override
                public void run() {
                    try {
                        //采集
                        HtmlPage htmlPage = Crawler.this.webClient.getPage(page.getUrl());
                        page.setHtmlPage(htmlPage);
                        for (Parse parse:Crawler.this.parseList){
                            parse.parse(page);
                        }
                        if (page.isDetail()){
                            Crawler.this.detailQueue.add(page);
                        }else{
                            Iterator<Page> iterator = page.getSubPage().iterator();
                            while (iterator.hasNext()){
                                Page subPage = iterator.next();
                                Crawler.this.docQueue.add(subPage);
                                iterator.remove();
                            }
                        }
                    } catch (IOException e) {
                        logger.error("Parse occur Exception{}.",e.getMessage());
                    }
                }
            });
        }
    }

    private void pipeline(){
        while (true){
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                logger.error("Pipeline occur Exception{}.",e.getMessage());
            }
            final Page page = this.detailQueue.poll();
            if (page == null){
                continue;
            }
            this.executorService.submit(new Runnable() {
                @Override
                public void run() {
                    for (Pipeline pipeline : Crawler.this.pipelineList){
                        pipeline.pipeline(page);
                    }
                }
            });

        }
    }

    //爬虫停止
    public void stop(){
        if (this.executorService!=null && !this.executorService.isShutdown()){
            this.executorService.shutdown();
        }
        logger.info("Crawler stopped...");
    }
    public void addParse(Parse parse){
        this.parseList.add(parse);
    }
    public void addPipeline(Pipeline pipeline){
        this.pipelineList.add(pipeline);
    }
    public void addPage(Page page){
        this.docQueue.add(page);
    }

}
//---------------------------------Common-----------------------------------
@Data
public class Page {
    //数据网站的根地址
    // 比如：https://so.gushiwen.org
    private final String base;
    //具体网页的路径
    private final String path;
    //网页DOM对象
    private HtmlPage htmlPage;
    //标识网页是否是详情页
    private final boolean detail;
    //子页面对象集合
    private Set<Page> subPage = new HashSet<>();
    //数据对象(封装后的hashMap）
    private DataSet dataSet = new DataSet();
    public String getUrl(){
        return this.base+this.path;
    }
}


public class DataSet {
    //data把DOM解析，清洗之后存储的数据
    //比如：
    //标题：送孟浩然之广陵
    //作者：李白
    //正文：XXX
    private Map<String,Object> data = new HashMap<>();

    public void putData(String key,Object value){

        this.data.put(key,value);
    }
    public Object getData(String key){

        return this.data.get(key);
    }
    public Map<String,Object> getData(){

        return new HashMap<>(this.data);
    }
}
//---------------------------------解析--------------------------------------

public interface Parse {
    //解析页面
    void parse(Page page);
}

//链接解析
public class DocumentParse implements Parse {
    @Override
    public void parse(final Page page) {
        if (page.isDetail()){
            return;
        }
        HtmlPage htmlPage = page.getHtmlPage();
        //分析网页源码
        htmlPage.getBody().getElementsByAttribute("div",
                "class","typecont").
                forEach(div -> {
                    DomNodeList<HtmlElement> anodeList = div.
                            getElementsByTagName("a");
                    anodeList.forEach(
                            aNode -> {
                                String path = aNode.getAttribute("href");
                                Page subpage = new Page(page.getBase(), path, true);
                                page.getSubPage().add(subpage);
                            }
                    );
                });
    }
}

//详情页面解析
public class DataPageParse implements Parse {
    @Override
    public void parse(final Page page) {
        if(!page.isDetail()){
            return;
        }

        HtmlPage htmlPage = page.getHtmlPage();
        HtmlElement body = htmlPage.getBody();
        //标题htmlunit
        String titlePath = "//div[@class='cont']/h1/text()";
        DomText titleDom =(DomText)body.getByXPath(titlePath).get(0);
        String title = titleDom.asText();

        //作者
        String authorPath = "//div[@class='cont']/p/a[2]";
        HtmlAnchor authorDom = (HtmlAnchor) body.getByXPath(authorPath).get(0);
        String author = authorDom.asText();

        //朝代
        String dynastyPath = "//div[@class='cont']/p/a[1]";
        HtmlAnchor dynastyDom = (HtmlAnchor) body.getByXPath(dynastyPath).get(0);
        String dynasty = authorDom.asText();

        //正文
        String contextPath = "//div[@class='cont']/div[@class='contson']";
        HtmlDivision contentDom = (HtmlDivision) body.getByXPath(contextPath).get(0);
        String content = contentDom.asText();

        page.getDataSet().putData("title",title);
        page.getDataSet().putData("dynasty",dynasty);
        page.getDataSet().putData("author",author);
        page.getDataSet().putData("content",content);

    }
}




//---------------------------------管道------------------------------------
public interface Pipeline {

    //管道处理page中的数据
    void pipeline(final Page page);
}


//将数据插入数据库中
public class DatabasePopeline implements Pipeline{
    private final Logger logger = LoggerFactory.getLogger(DatabasePopeline.class);

    private final DataSource dataSource;

    public DatabasePopeline(DataSource dataSource) {
        this.dataSource = dataSource;
    }

    @Override
    public void pipeline(final Page page) {

        String title = ((String) page.getDataSet().getData("title"));
        String dynasty = (String) page.getDataSet().getData("dynasty");
        String author = ((String) page.getDataSet().getData("author"));
        String content = ((String) page.getDataSet().getData("content"));

        String sql = "insert into poetry_info values (?,?,?,?);";
        try (Connection connection = dataSource.getConnection();
             PreparedStatement statement =connection.prepareStatement(sql)){
            statement.setString(1,title);
            statement.setString(2,dynasty);
            statement.setString(3,author);
            statement.setString(4,content);
            statement.executeUpdate();
        }catch (SQLException e){
            logger.error("Database insert occur Exception{}.",e.getMessage());
        }
    }
}

//打印处理后的数据
public class ConsolePipeline implements Pipeline {
    @Override
    public void pipeline(final Page page) {
        Map<String,Object> data = page.getDataSet().getData();
        //存储
        System.out.println(data);
    }
}

2.配置模块


@Data
public class ConfigProperties {
    private String crawlerBase;
    private String crawlerPath;
    private boolean crawlerDetail;

    private String dbUsername;
    private String dbPassword;
    private String dbUrl;
    private String dbDriverClass;
    private boolean enableConsole ;

    public ConfigProperties(){
        //从外部文件加载
        InputStream inputStream = ConfigProperties.class.getClassLoader().
                getResourceAsStream("Config.properties");
        Properties p = new Properties();
        try {
            p.load(inputStream);

        } catch (IOException e) {
            e.printStackTrace();
        }
        this.crawlerBase = String.valueOf(p.get("crawler.base"));
        this.crawlerPath = String.valueOf(p.get("crawler.path"));
        this.crawlerDetail = Boolean.parseBoolean(String.valueOf(p.get("crawler.detail")));
        this.dbUsername = String.valueOf(p.get("db.username"));
        this.dbPassword = String.valueOf(p.get("db.password"));
        this.dbUrl = String.valueOf(p.get("db.url"));
        this.dbDriverClass = String.valueOf(p.get("db.driver_class"));
        this.enableConsole = Boolean.valueOf(String.valueOf(p.getProperty("config.enable_console","false")));

    }

    public static void main(String[] args) {
        new ConfigProperties();
    }
}






public class ObjectFactory {
    private static final ObjectFactory instance = new ObjectFactory();//单例模式

    //存放所有对象
    private  final Map<Class,Object> objectHashMap = new HashMap<>();
    private ObjectFactory(){
        //1.初始化配置对象
        initConfigProperties();
        //2.数据源对象
        initDataSource();
        //3爬虫对象
        initCrawler();
        //4.web对象
        initWebController();
    }

    private void initWebController() {
        DataSource dataSource = getObject(DataSource.class);
        AnalyzeDao analyzeDao = new AnalyzeDaoImpl(dataSource);
        AnalyzeService analyzeService = new AnalyzeServiceImpl(analyzeDao);

        WebController webController = new WebController(analyzeService);
        objectHashMap.put(WebController.class,webController);
    }

    private void initCrawler() {
        ConfigProperties configProperties = new ConfigProperties();
        DruidDataSource dataSource = getObject(DataSource.class);
        final Page page = new Page(configProperties.getCrawlerBase(),
                configProperties.getCrawlerPath(),
                configProperties.isCrawlerDetail());
        Crawler crawler = new Crawler();
        crawler.addParse(new DocumentParse());
        crawler.addParse(new DataPageParse());
        if (configProperties.isEnableConsole()){
            crawler.addPipeline(new ConsolePipeline());
        }
        crawler.addPipeline(new DatabasePopeline(dataSource));
        crawler.addPage(page);
        objectHashMap.put(Crawler.class,crawler);
    }

    private void initConfigProperties(){
        ConfigProperties configProperties = new ConfigProperties();
        objectHashMap.put(ConfigProperties.class,configProperties);


    }

    private void initDataSource(){
        ConfigProperties configProperties = getObject(ConfigProperties.class);
        DruidDataSource dataSource = new DruidDataSource();
        dataSource.setUsername(configProperties.getDbUsername());
        dataSource.setPassword(configProperties.getDbPassword());
        dataSource.setDriverClassName(configProperties.getDbDriverClass());
        dataSource.setUrl(configProperties.getDbUrl());

        objectHashMap.put(DataSource.class,dataSource);
    }

    public  <T> T getObject(Class classz){
        if (!objectHashMap.containsKey(classz)){
            throw new IllegalArgumentException("Class"+classz
            .getName()+"not found Object");
        }
        return (T)objectHashMap.get(classz);
    }
    public static ObjectFactory getInstance(){
        return instance;
    }
/*
    private void printObjectList(){
        System.out.println("----------------ObjectFactoryList--------------");
        for (Map.Entry<Class,Object> entry : objectHashMap.entrySet()){
            System.out.println(String.format("\t[%s]==>[%s]",
                    entry.getKey().getCanonicalName(),
                    entry.getValue().getClass().getCanonicalName()));
        }
        System.out.println("------------------------------------------------");
    }*/
}

3.分析模块

//------------------------------存放诗的信息------------------------------------
@Data
public class PoetryInfo {
    private String title;
    private String dynasty;
    private String author;
    private String content;
}



public interface AnalyzeDao {
    //分析唐诗中作者的创作数量
    List<AuthorCount> analyzeAuthorCount();
    //查询所有诗文，提供给业务层进行分析
    List<PoetryInfo> queryAllPoetryInfo();
}

public class AnalyzeDaoImpl implements AnalyzeDao {
    private final DataSource dataSource;
    public AnalyzeDaoImpl(DataSource dataSource) {
        this.dataSource = dataSource;
    }

    @Override
    public List<AuthorCount> analyzeAuthorCount() {
        List<AuthorCount> datas = new ArrayList<>();
        //try()自动关闭
        String sql = "select count(*) as count,author from" +
                " poetry_info group by author;";
        try (Connection connection = dataSource.getConnection();
             PreparedStatement statement = connection.prepareStatement(sql);
             ResultSet rs = statement.executeQuery();
             ){
            while (rs.next()){
                AuthorCount authorCount = new AuthorCount();
                authorCount.setAuthor(rs.getString("author"));
                authorCount.setCount(rs.getInt("count"));
                datas.add(authorCount);
            }

        } catch (SQLException e) {
            e.printStackTrace();
        }
        return datas;
    }

    @Override
    public List<PoetryInfo> queryAllPoetryInfo() {
        List<PoetryInfo> datas = new ArrayList<>();
        String sql = "select title,dynasty,author,content from poetry_info;";
        try(Connection connection = dataSource.getConnection();
            PreparedStatement statement = connection.prepareStatement(sql);
            ResultSet rs = statement.executeQuery();
                ){
            while (rs.next()){
                PoetryInfo poetryInfo = new PoetryInfo();
                poetryInfo.setTitle(rs.getString("title"));
                poetryInfo.setDynasty(rs.getString("dynasty"));
                poetryInfo.setAuthor(rs.getString("author"));
                poetryInfo.setContent(rs.getString("content"));
                datas.add(poetryInfo);
            }

        } catch (SQLException e) {
            e.printStackTrace();
        }
        return datas;
    }
}


public interface AnalyzeService {
    //分析唐诗中作者的创作数量
    List<AuthorCount> analyzeAuthorCount();
    //词云分析
    List<WordCount> analyzeWordCloud();
}

public class AnalyzeServiceImpl implements  AnalyzeService{
    private final AnalyzeDao analyzeDao;

    public AnalyzeServiceImpl(AnalyzeDao analyzeDao) {
        this.analyzeDao = analyzeDao;
    }

    @Override
    public List<AuthorCount> analyzeAuthorCount() {

        //排序方式：
        //1.DAO层SQL排序
        //2.业务层Service进行数据排序
        List<AuthorCount> authorCounts = analyzeDao.analyzeAuthorCount();
        //降序排序
        authorCounts.sort((o1, o2) -> o2.getCount() - o1.getCount());
        return authorCounts;
    }

    @Override
    public List<WordCount> analyzeWordCloud() {
        //1.查询出所有数据
        //2.取出title content
        //3.分词-过滤w、null、空-length<2
        //4.统计
        //<词：词频>
        Map<String,Integer> map = new HashMap<>();
        List<PoetryInfo> poetryInfos = analyzeDao.queryAllPoetryInfo();
        for (PoetryInfo poetryInfo :poetryInfos){
            List<Term> terms = new ArrayList<>();
            String title = poetryInfo.getTitle();
            String content = poetryInfo.getContent();

            terms.addAll(NlpAnalysis.parse(title).getTerms());
            terms.addAll(NlpAnalysis.parse(content).getTerms());
            Iterator<Term>iterator = terms.iterator();

            while (iterator.hasNext()){
                Term term = iterator.next();
                //词性的过滤
                if (term.getNatureStr() == null || term.getNatureStr().equals("w")){
                    iterator.remove();
                    continue;
                }
                //词的过滤
                if(term.getRealName().length()<2){
                    iterator.remove();
                    continue;
                }
                //统计
                String realName = term.getRealName();
                Integer count = 0;
                if (map.containsKey(realName)){
                    count =  map.get(realName)+1;
                }else {
                    count = 1;
                }
                map.put(realName,count);
            }
        }
        List<WordCount> wordCounts = new ArrayList<>();
        for (Map.Entry<String,Integer> entry: map.entrySet()){
            WordCount wordCount = new WordCount();
            wordCount.setCount(entry.getValue());
            wordCount.setWord(entry.getKey());
            wordCounts.add(wordCount);
        }
        return wordCounts;
    }

/*   测试
 public static void main(String[] args) {
         Result result = NlpAnalysis.parse("松下问童子，言师采药去。" +
                 "只在此山中，云深不知处。 ");
         List<Term> terms =  result.getTerms();
        for (Term term:terms) {
            System.out.println(term);
        }
    }*/
}


@Data
public class AuthorCount {
    private String author;
    private Integer count;
}


public class WordCount {
    private String word;
    private Integer count;
}

4.web模块

public class WebController {
    private final AnalyzeService analyzeService;

    public WebController(AnalyzeService analyzeService) {
        this.analyzeService = analyzeService;
    }
    //-> http://127.0.0.1:4567/
    //->/analyze/author_count
    private List<AuthorCount> analyzeAythorCount(){
        return analyzeService.analyzeAuthorCount();
    }
    //-> http://127.0.0.1:4567/
    //->/analyze/word_cloud
    private List<WordCount> analyzeWordCloud(){
        return analyzeService.analyzeWordCloud();
    }

    public void launch(){
        ResponseTransformer transformer = new JSONResponseTransformer();

        Spark.staticFileLocation("/static");
        Spark.get("/analyze/author_count", ((request, response) ->
            analyzeAythorCount()),transformer);
        Spark.get("/analyze/word_cloud",((request, response) ->
                analyzeWordCloud()),transformer);
    }

    public static class JSONResponseTransformer implements ResponseTransformer{
        //Object——>Gson
       private Gson gson = new Gson();
        @Override
        public String render(Object o) throws Exception {
            return gson.toJson(o);
        }
    }

5.项目主类

public class TangshiAnalyzeApplication {
    private static final Logger LOGGER = LoggerFactory.
            getLogger(TangshiAnalyzeApplication.class);

    public static void main(String[] args) {

        WebController webController = ObjectFactory.getInstance().getObject(WebController.class);

        //运行了web服务，提供接口
        LOGGER.info("Web Server Launch...");
        webController.launch();

      /*  if (args.length==1&&args[0].equals("run-crawler")){
            Crawler crawler = ObjectFactory.getInstance().getObject(Crawler.class);
            LOGGER.info("Crawler started...");
            crawler.start();
        }*/
    }
}

6.前端模块

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>唐诗分析报告</title>
    <link type="text/css" href="assets/css/base.css">
    <script type="text/javascript" src="assets/js/echarts.min.js"></script>
    <script type="text/javascript" src="assets/js/echarts-wordcloud.min.js"></script>
    <script type="text/javascript" src="assets/js/jquery-3.3.1.min.js"></script>
    <script type="text/javascript" src="assets/js/api.js"></script>
    <script>
    </script>
</head>
<body id="box">
<h1>唐诗分析报告</h1>
<div>
    <div>
        <input type="button" value="创作数量排行榜" onclick="creationRanking('content')">
        <input type="button" value="诗词用词云图" onclick="cloudWorld('content')">
    </div>
    <div id="content" style="width: 800px ; height: 500px;">

    </div>
</div>
</body>
</html>

/*

 api.js
 1. 主要用于前端页面与后端的交互
 前端：HTML+CSS+JavaScript开发的浏览器识别的Web程序
 后端：Java开发服务程序，通过HTTP协议提供Web API接口
 前端和后端交互：通信协议用的是HTTP协议
 */
function creationRanking(id) {
    //HTTP Method : GET
    //HTTP URL : 请求地址（服务端提供的API接口）
    $.get({
        url: "/analyze/author_count",
        dataType: "json",
        method: "get",
        success: function (data, status, xhr) {
            //echarts图表对象
            var myChart = echarts.init(document.getElementById(id));
            var options = {
                //图标的标题
                title: {
                    text: '唐诗创作排行榜'
                },

                tooltip: {},
                //柱状图的提示信息
                legend: {
                    data: ['数量(首)']
                },
                //X轴的数据：作者
                xAxis: {
                    data: []
                },
                //Y轴的数据：创作的数量
                yAxis: {},
                series: [{
                    name: '创作数量',
                    type: 'bar',
                    data: []
                }]
            };

            //List<AuthorCount>
            for (var i=0; i< data.length; i++) {
                var authorCount  = data[i];
                options.xAxis.data.push(authorCount.author);
                options.series[0].data.push(authorCount.count);
            }
            myChart.setOption(options, true);
        },
        error: function (xhr, status, error) {

        }
    });
}

function cloudWorld(id) {
    $.get({
        url: "/analyze/word_cloud",
        dataType: "json",
        method: "get",
        success: function (data, status, xhr) {
            var myChart = echarts.init(document.getElementById(id));
            var options = {
                series: [{
                    type: 'wordCloud',
                    shape: 'pentagon',
                    left: 'center',
                    top: 'center',
                    width: '80%',
                    height: '80%',
                    right: null,
                    bottom: null,
                    sizeRange: [12, 60],
                    rotationRange: [-90, 90],
                    rotationStep: 45,
                    gridSize: 8,
                    drawOutOfBound: false,
                    textStyle: {
                        normal: {
                            fontFamily: 'sans-serif',
                            fontWeight: 'bold',
                            color: function () {
                                //rgb(r,g,b)
                                return 'rgb(' + [
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160)
                                ].join(',') + ')';
                            }
                        },
                        emphasis: {
                            shadowBlur: 10,
                            shadowColor: '#333'
                        }
                    },
                    // Data is an array. Each array item must have name and value property.
                    data: []
                }]
            };
            for (var i=0 ;i<data.length; i++) {
                var wordCount = data[i];
                //wordCount => 词 ： 词频
                options.series[0].data.push({
                    name: wordCount.word,
                    value: wordCount.count,
                    textStyle: {
                        normal: {},
                        emphasis: {}
                    }
                });
            }
            myChart.setOption(options, true);
        },
        error: function (xhr, status, error) {

        }
    });
}