java热词_信息化领域热词分类分析及解释实战

import requests

import re

import xlwt

# Address of the recommended-news listing on news.cnblogs.com. The name `url`
# is rebound by the `for url in urls` loop later in this script.
url = 'https://news.cnblogs.com/n/recommend'

# Request headers passed to requests.get by get_page; only a User-Agent is set.
headers = {

"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"

}

def get_page(url, timeout=10):
    """Download ``url`` and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request itself failed).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort crawler: report the failure and let the caller skip
        # this page instead of aborting the whole run.
        print(e)
        return None

    if response.status_code != 200:
        print('获取网页失败')
        return None

    print('获取网页成功')
    print(response.encoding)
    return response.text

# Workbook that collects every scraped headline in the first column.
f = xlwt.Workbook(encoding='utf-8')
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
sheet01.write(0, 0, '博客最热新闻')  # header cell: first row, first column

urls = ['https://news.cnblogs.com/n/recommend?page={}'.format(i) for i in range(100)]

# NOTE(review): this pattern appears to have lost its HTML tags when the
# article was extracted; only the residue below is visible. Restore the
# original markup before running, otherwise nothing useful is captured.
title_pattern = re.compile('\n\n.*?(.*?)', re.S)

temp = 0  # headlines written so far (row 0 holds the header)
num = 0   # pages processed successfully

for url in urls:
    print(url)
    page = get_page(url)
    if page is None:
        # get_page returns None on failure; matching a regex against None
        # would raise TypeError, so skip this page and keep going.
        continue

    items = title_pattern.findall(page)
    print(len(items))
    print(items)

    # Append this page's headlines directly below the ones already written.
    for offset, item in enumerate(items, start=1):
        sheet01.write(temp + offset, 0, item)
    temp += len(items)

    num += 1
    print("已打印完第"+str(num)+"页")

print("打印完!!!")
f.save('Hotword.xls')

爬取结果截图:

1ddaed5a1cc168312eb5a1d86fedfaea.png

然后继续在爬取结果里面进行筛选,选出100个出现频率最高的信息热词。

Python代码:

import jieba

import pandas as pd

import re

from collections import Counter

def count_words(tokens, stopwords):
    """Count the tokens that qualify as candidate hot words.

    Args:
        tokens: Iterable of word strings, e.g. the output of ``jieba.cut``.
        stopwords: Container of words to ignore.

    Returns:
        A ``collections.Counter`` mapping each kept token to its frequency.
        Stop words, single-character tokens and the CRLF line-break token
        are dropped.
    """
    return Counter(
        word
        for word in tokens
        if word not in stopwords and len(word) > 1 and word != '\r\n'
    )


if __name__ == '__main__':
    # Context managers guarantee both input files are closed, even on error.
    with open("Hotword.txt", "r", encoding='utf-8') as filehandle:
        mystr = filehandle.read()

    seg_list = jieba.cut(mystr)  # accurate mode is the default

    with open(r'final.txt', encoding='UTF-8') as stopword_file:
        stopwords = {line.rstrip() for line in stopword_file}

    c = count_words(seg_list, stopwords)

    print('\n词频统计结果:')
    for k, v in c.most_common(100):  # the 100 most frequent words
        print("%s:%d" % (k, v))

其中的 final.txt 是停用词表:像“我们”、“什么”、“中国”、“没有”这类在句子中出现频率很高、但与信息化领域无关的词语都列在里面,统计词频时先把它们排除。

final.txt:

6f9791d35ffdfbd292c24f455ebda252.png

这个txt有需要的,联系Q:893225523

运行结果:

2e93553ccf8e9b29a3ba05dd748be982.png

然后将它们存入 txt 文件,再导入 MySQL。

之后我们继续进行爬取,爬取百度百科每个热词的解释。

Python源代码:

import requests

import re

import xlwt

import linecache

# Baidu Baike site root. The name `url` is rebound by the `for url in urls`
# loop later in this script.
url = 'https://baike.baidu.com/'

# Request headers passed to requests.get by get_page; only a User-Agent is set.
headers = {

"user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"

}

def get_page(url, timeout=10):
    """Download ``url`` and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The response text when the server answers with HTTP 200, otherwise
        ``None`` (non-200 status, or the request itself failed).
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # Best-effort crawler: report the failure and let the caller skip
        # this page instead of aborting the whole run.
        print(e)
        return None

    # Force UTF-8 decoding instead of relying on the encoding requests picks.
    response.encoding = 'utf-8'

    if response.status_code != 200:
        print('获取网页失败')
        return None

    print('获取网页成功')
    return response.text

# Text file with one hot word per line.
HOTWORD_FILE = 'C:\\Users\\hp\\Desktop\\final_hotword2.txt'

# Workbook with three columns: hot word, explanation, source URL.
f = xlwt.Workbook(encoding='utf-8')
sheet01 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
sheet01.write(0, 0, '热词')      # header: first row, first column
sheet01.write(0, 1, '热词解释')  # header: first row, second column
sheet01.write(0, 2, '网址')      # header: first row, third column

# Read the words once and close the file right away.
with open(HOTWORD_FILE, 'r', encoding='utf-8') as fopen:
    lines = fopen.readlines()

# Strip line breaks up front and ignore blank lines so every URL is clean.
words = [line.strip() for line in lines if line.strip()]
urls = ['https://baike.baidu.com/item/{}'.format(word) for word in words]

# NOTE(review): this pattern is empty in the article as extracted (its HTML
# markup appears to have been stripped). An empty pattern only yields empty
# matches; restore the original expression before running.
explain_pattern = re.compile('', re.S)

i = 0  # number of hot words written so far (row 0 holds the headers)
for word, url in zip(words, urls):
    print(url)
    page = get_page(url)
    if page is None:
        # get_page returns None on failure; skip instead of crashing in findall.
        continue

    items = explain_pattern.findall(page)
    print(items)

    if items:
        # Pair the word with its own URL directly; looking the word up again
        # by the success counter would drift after the first skipped entry.
        sheet01.write(i + 1, 0, word)
        sheet01.write(i + 1, 1, items[0])
        sheet01.write(i + 1, 2, url)
        i += 1

print("总爬取完毕数量:" + str(i))
print("打印完!!!")
f.save('hotword_explain.xls')

刚开始我爬取的时候,在确定正则表达式正确的情况下,爬取结果一直都是乱码。然后加上 response.encoding = 'utf-8',就OK了。

爬取结果:

cbcbb1c5d2722f9b7aa7b8f84fb4b9b5.png

将其存入数据库。

之后打开eclipse,用javaweb实现数据可视化和热词目录。

jsp源代码:

pageEncoding="UTF-8"%>

Insert title here

// Create the chart on the element with id "main" and show a loading
// animation until the data request finishes.
var myChart = echarts.init(document.getElementById('main'));
var statisticsData = [];
myChart.showLoading();

$.ajax({
    type : "post",
    async : true, // asynchronous request (a synchronous one would lock the browser until it completes)
    url : "servlet?method=find", // request is sent to the servlet
    data : {},
    dataType : "json", // response is expected as JSON
    // On success the response carries the name + num pairs.
    success : function(result) {
        // Always clear the loading animation, even for an empty response.
        myChart.hideLoading();

        // result is the JSON object returned by the server
        if (result) {
            // Copy every row into the {name, value} shape used as series data.
            for (var i = 0; i < result.length; i++) {
                var statisticsObj = {name:'',value:''};
                statisticsObj.name = result[i].hotwords;
                statisticsObj.value = result[i].num;
                statisticsData.push(statisticsObj);
            }

            // Replace the chart options with the freshly loaded data.
            var z1_option = {
                title : {
                    text:'热词图'
                },
                series: [{
                    type: 'wordCloud',
                    gridSize: 20,
                    sizeRange: [12, 50],
                    rotationRange: [-90, 90],
                    shape: 'pentagon',
                    textStyle: {
                        normal: {
                            // Random colour per word; channels capped at 160.
                            color: function() {
                                return 'rgb(' + [
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160),
                                    Math.round(Math.random() * 160)
                                ].join(',') + ')';
                            }
                        },
                        emphasis: {
                            shadowBlur: 10,
                            shadowColor: '#333'
                        }
                    },
                    data: statisticsData
                }]
            };
            myChart.setOption(z1_option, true);
        }
    },
    error : function() {
        // Without this the loading animation would stay up forever on failure.
        myChart.hideLoading();
    }
});

dao层代码:

package com.hotwords.dao;

import java.sql.Connection;

import java.sql.DriverManager;

import java.sql.ResultSet;

import java.sql.Statement;

import java.util.ArrayList;

import java.util.List;

import com.hotwords.entity.entity;

/**
 * Data-access object that reads the hot-word tables from the local MySQL
 * database.
 */
public class dao {

    // Connection settings shared by every query in this class.
    private static final String DRIVER = "com.mysql.jdbc.Driver";
    private static final String URL = "jdbc:mysql://localhost:3306/xinwen?useUnicode=true&characterEncoding=utf-8";
    private static final String USERNAME = "root";
    // NOTE(review): credentials are hard-coded in source; consider moving them to configuration.
    private static final String PASSWORD = "893225523";

    /** Loads the JDBC driver and opens a new connection to the database. */
    private Connection openConnection() throws Exception {
        Class.forName(DRIVER);
        return DriverManager.getConnection(URL, USERNAME, PASSWORD);
    }

    /**
     * Reads every row of the final_hotword table.
     *
     * @return one entity per row with its hot word and count set; if a
     *         database error occurs the stack trace is printed and the rows
     *         read so far (possibly none) are returned
     */
    public List<entity> list1() {
        List<entity> list = new ArrayList<entity>();
        String sql = "select * from final_hotword";
        // try-with-resources closes the result set, statement and connection
        // in reverse order even when reading a row throws.
        try (Connection conn = openConnection();
             Statement statement = conn.createStatement();
             ResultSet resultSet = statement.executeQuery(sql)) {
            while (resultSet.next()) {
                entity book = new entity();
                book.setHotwords(resultSet.getString("热词"));
                book.setNum(resultSet.getString("次数"));
                list.add(book);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }

    /**
     * Reads every row of the website table.
     *
     * @return one entity per row with its hot word, explanation and URL set;
     *         if a database error occurs the stack trace is printed and the
     *         rows read so far (possibly none) are returned
     */
    public List<entity> list2() {
        List<entity> list = new ArrayList<entity>();
        String sql = "select * from website";
        try (Connection conn = openConnection();
             Statement statement = conn.createStatement();
             ResultSet resultSet = statement.executeQuery(sql)) {
            while (resultSet.next()) {
                entity book = new entity();
                book.setHotwords(resultSet.getString("热词"));
                book.setExplain(resultSet.getString("解释"));
                book.setWebsite(resultSet.getString("网址"));
                list.add(book);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }

}

servlet层:

package com.hotwords.servlet;

import java.io.IOException;

import java.util.ArrayList;

import java.util.List;

import javax.servlet.ServletException;

import javax.servlet.annotation.WebServlet;

import javax.servlet.http.HttpServlet;

import javax.servlet.http.HttpServletRequest;

import javax.servlet.http.HttpServletResponse;

import javax.servlet.http.HttpSession;

import com.hotwords.dao.dao;

import com.hotwords.entity.entity;

import com.google.gson.Gson;

/**
 * Servlet mapped to "/servlet" that dispatches on the "method" request
 * parameter: "find" returns the hot-word counts as JSON, "find2" forwards
 * the hot-word explanations to NewFile1.jsp.
 */
@WebServlet("/servlet")
public class servlet extends HttpServlet {

    private static final long serialVersionUID = 1L;

    dao dao1=new dao();

    /**
     * @see HttpServlet#HttpServlet()
     */
    public servlet() {
        super();
    }

    /**
     * Routes the request to the handler named by the "method" parameter.
     * Any other value is ignored and produces an empty response.
     */
    protected void service(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        // Set once here; the handlers below run after this call.
        request.setCharacterEncoding("utf-8");
        String method = request.getParameter("method");
        if ("find".equals(method)) {
            find(request, response);
        } else if ("find2".equals(method)) {
            find2(request, response);
        }
    }

    /**
     * Writes the rows returned by dao.list1() to the response as a JSON array.
     */
    private void find(HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException {
        List<?> hotwords = dao1.list1();
        System.out.println(hotwords.size());

        String json = new Gson().toJson(hotwords);
        System.out.println(json);

        // The body is JSON, so declare it as such rather than as HTML.
        response.setContentType("application/json;charset=UTF-8");
        response.getWriter().write(json);
    }

    /**
     * Stores the rows returned by dao.list2() in the "list" request attribute
     * and forwards to NewFile1.jsp.
     */
    private void find2(HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException {
        request.setAttribute("list", dao1.list2());
        request.getRequestDispatcher("NewFile1.jsp").forward(request, response);
    }

}

项目结构:

b86165e861947f60fe14f810b00851bf.png

记得一定要引入 echarts-all.js,否则热词图无法显示。

运行结果:

b14715711c32c8182c1880cfe1ac3aee.png

71720142653a1875a818dd6c111638c6.png

那么基本功能就完成了,但是那个导出word还没有实现。

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值