代码和用到的jar包我放到了这里 https://github.com/MiniBuck/Lucene
主要做了这么几件事:
A)把室友爬到的5000多篇新闻搬运到了我的数据库
B)从MySQL中读取出新闻,基于TFIDF进行了关键词的抽取
C)使用Lucene建立索引。关键词的抽取与建立索引同时完成;关键词作为一个域,与新闻的其他域(如标题、正文等)同等对待,一并写入索引。
D)简单查询,简单对比一下效果
效果:
评价标准:取自动打分高的前10篇,主观分为好中差三等
评价:抽取关键词后对其建立索引,与直接对内容建立索引相比,最后的结果差距不大,而且直接对内容搜索时排名靠前的结果主观上可能更好一些。但是,对关键词域搜索时返回的结果更少了。
可能原因:主观看,也就看个大概;我自己实现的TFIDF可能不太准;TFIDF方法本身就不太准;Lucene本身就是全文搜索,关键字抽取建立索引和全文切分建立索引区别不是特别大,可能对Lucene自带的打分函数有影响,这一点我还没有细看
心得:如果换一种关键词抽取技术,甚至是关键字生成技术,效果可能会很好,因为范围确实缩减了。
以下是代码部分:
新闻模型:
package Model;
import java.sql.Timestamp;
/**
 * Plain data holder for one news article as read from the database.
 *
 * <p>Fields, in constructor order: id, title, content, url, jtitle,
 * subtitle, original_title, author, source, source_url, category, rawhtml,
 * posted_at, saved_at.
 *
 * <p>NOTE(review): the original author stated they did not know what the
 * {@code rawhtml} field represents; presumably it is the raw HTML of the
 * crawled page — confirm against the crawler.
 *
 * <p>Accessors use the non-standard {@code putX}/{@code getX} naming; the
 * names are kept because other classes call them.
 *
 * @author miniBuck
 */
public class News {

    private int id;
    private String title;
    private String content;
    private String url;
    private String jtitle;
    private String subtitle;
    private String original_title;
    private String author;
    private String source;
    private String source_url;
    private String category;
    private String rawhtml;
    private Timestamp posted_at;
    private Timestamp saved_at;

    /**
     * Creates a fully populated article. No argument is validated or copied;
     * every value is stored as given.
     */
    public News(int id, String title, String content, String url, String jtitle, String subtitle,
            String original_title, String author, String source, String source_url, String category,
            String rawhtml, Timestamp posted_at, Timestamp saved_at) {
        this.id = id;
        this.title = title;
        this.content = content;
        this.url = url;
        this.jtitle = jtitle;
        this.subtitle = subtitle;
        this.original_title = original_title;
        this.author = author;
        this.source = source;
        this.source_url = source_url;
        this.category = category;
        this.rawhtml = rawhtml;
        this.posted_at = posted_at;
        this.saved_at = saved_at;
    }

    // ---- id ----

    /** @return the article id */
    public int getid() {
        return this.id;
    }

    /** @param id the new article id */
    public void putid(int id) {
        this.id = id;
    }

    // ---- title ----

    /** @return the article title */
    public String gettitle() {
        return this.title;
    }

    /** @param title the new article title */
    public void puttitle(String title) {
        this.title = title;
    }

    // ---- content ----

    /** @return the article body text */
    public String getcontent() {
        return this.content;
    }

    /** @param content the new article body text */
    public void putcontent(String content) {
        this.content = content;
    }

    // ---- url ----

    /** @return the article url */
    public String geturl() {
        return this.url;
    }

    /** @param url the new article url */
    public void puturl(String url) {
        this.url = url;
    }

    // ---- jtitle ----

    /** @return the jtitle value */
    public String getjtitle() {
        return this.jtitle;
    }

    /** @param title the new jtitle value (only jtitle is changed, not title) */
    public void putjtitle(String title) {
        this.jtitle = title;
    }

    // ---- subtitle ----

    /** @return the subtitle */
    public String getsubtitle() {
        return this.subtitle;
    }

    /** @param subtitle the new subtitle */
    public void putsubtitle(String subtitle) {
        this.subtitle = subtitle;
    }

    // ---- original_title ----

    /** @return the original title */
    public String getoriginal_title() {
        return this.original_title;
    }

    /** @param original_title the new original title */
    public void putoriginal_title(String original_title) {
        this.original_title = original_title;
    }

    // ---- author ----

    /** @return the author */
    public String getauthor() {
        return this.author;
    }

    /** @param author the new author */
    public void putauthor(String author) {
        this.author = author;
    }

    // ---- source ----

    /** @return the source */
    public String getsource() {
        return this.source;
    }

    /** @param source the new source */
    public void putsource(String source) {
        this.source = source;
    }

    // ---- source_url ----

    /** @return the source url */
    public String getsource_url() {
        return this.source_url;
    }

    /** @param source_url the new source url */
    public void putsource_url(String source_url) {
        this.source_url = source_url;
    }

    // ---- category ----

    /** @return the category */
    public String getcategory() {
        return this.category;
    }

    /** @param category the new category */
    public void putcategory(String category) {
        this.category = category;
    }

    // ---- rawhtml ----

    /** @return the rawhtml value */
    public String getrawhtml() {
        return this.rawhtml;
    }

    /** @param rawhtml the new rawhtml value */
    public void putrawhtml(String rawhtml) {
        this.rawhtml = rawhtml;
    }

    // ---- posted_at ----

    /** @return the posted-at timestamp */
    public Timestamp getposted_at() {
        return this.posted_at;
    }

    /** @param posted_at the new posted-at timestamp */
    public void putposted_at(Timestamp posted_at) {
        this.posted_at = posted_at;
    }

    // ---- saved_at ----

    /** @return the saved-at timestamp */
    public Timestamp getsaved_at() {
        return this.saved_at;
    }

    /** @param saved_at the new saved-at timestamp */
    public void putsaved_at(Timestamp saved_at) {
        this.saved_at = saved_at;
    }
}
TFIDF工具:
package Utils;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.lionsoul.jcseg.analyzer.JcsegAnalyzer5X;
import org.lionsoul.jcseg.core.JcsegTaskConfig;
import Model.News;//对新闻的封装
//import Test.test1;//只在测试时使用一下
/**
 * Utility for extracting keywords from news articles with TF-IDF.
 *
 * <p>Typical use is the single entry point {@link #getkeyword(List)}, which
 * chains {@link #tfAllFiles(List)}, {@link #idf(HashMap)},
 * {@link #tf_idf(HashMap, HashMap)} and {@link #topNkeywoed(HashMap)}.
 * The intermediate steps are public so they can be inspected individually.
 *
 * <p>All methods are static and keep no state between calls.
 *
 * @author miniBuck
 */
public class TFIDFDB {

    /** Number of keywords kept per article by {@link #topNkeywoed(HashMap)}. */
    private static final int DEFAULT_TOP_N = 10;

    /**
     * Orders entries by score, highest first; equal scores are ordered by word
     * so the result does not depend on hash-map iteration order.
     *
     * <p>The scores are compared with {@link Float#compare(float, float)}.
     * Subtracting the two floats and casting to {@code int} truncates every
     * difference smaller than 1 to 0, which would leave the list unsorted.
     */
    private static final Comparator<Map.Entry<String, Float>> BY_SCORE_DESC = (left, right) -> {
        int byScore = Float.compare(right.getValue(), left.getValue());
        return byScore != 0 ? byScore : left.getKey().compareTo(right.getKey());
    };

    /**
     * Tokenizes a string with the given analyzer and returns the tokens in order.
     *
     * <p>This is best-effort: if tokenization fails, the stack trace is printed
     * and the tokens collected so far are returned.
     *
     * @param analyzeStr
     *            text to tokenize; {@code null} or empty yields an empty list
     * @param analyzer
     *            Lucene analyzer used for tokenization
     * @return list of token texts, never {@code null}
     */
    public static ArrayList<String> getAnalyseResult(String analyzeStr, Analyzer analyzer) {
        ArrayList<String> response = new ArrayList<String>();
        if (analyzeStr == null || analyzeStr.isEmpty()) {
            return response;
        }
        // The field name is only passed through to the analyzer; "keyword" is kept from the original code.
        try (TokenStream tokenstream = analyzer.tokenStream("keyword", new StringReader(analyzeStr))) {
            // Attribute exposing the text of the current token.
            CharTermAttribute attr = tokenstream.addAttribute(CharTermAttribute.class);
            // TokenStream workflow: reset() -> incrementToken()* -> end() -> close().
            tokenstream.reset();
            while (tokenstream.incrementToken()) {
                response.add(attr.toString());
            }
            tokenstream.end();
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return response;
    }

    /**
     * Creates the Jcseg analyzer used for keyword extraction: complex mode with
     * stop-word removal switched on (it is off by default in the Jcseg version used).
     */
    private static Analyzer newAnalyzer() {
        JcsegAnalyzer5X jcseg = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);
        jcseg.getTaskConfig().setClearStopwords(true);
        return jcseg;
    }

    /**
     * Tokenizes a string with a freshly created Jcseg analyzer.
     *
     * <p>A new analyzer is built and closed on every call. To tokenize many
     * texts, prefer {@link #tfAllFiles(List)} / {@link #normalTFAllFiles(List)},
     * which share one analyzer for the whole batch.
     *
     * @param str
     *            text to tokenize
     * @return list of tokens
     */
    public static ArrayList<String> cutWords(String str) {
        try (Analyzer analyzer = newAnalyzer()) {
            return getAnalyseResult(str, analyzer);
        }
    }

    /**
     * Counts how many times each word occurs.
     *
     * @param cutwords
     *            tokenized text
     * @return map from word to its occurrence count
     */
    public static HashMap<String, Integer> normalTF(ArrayList<String> cutwords) {
        HashMap<String, Integer> resTF = new HashMap<String, Integer>();
        for (String word : cutwords) {
            resTF.merge(word, 1, Integer::sum);
        }
        return resTF;
    }

    /**
     * Computes the relative frequency of each word: count / total number of tokens.
     *
     * @param cutwords
     *            tokenized text
     * @return map from word to its term frequency; empty when there are no tokens
     */
    public static HashMap<String, Float> tf(ArrayList<String> cutwords) {
        HashMap<String, Float> resTF = new HashMap<String, Float>();
        int wordLen = cutwords.size();
        for (Map.Entry<String, Integer> count : normalTF(cutwords).entrySet()) {
            resTF.put(count.getKey(), count.getValue() / (float) wordLen);
        }
        return resTF;
    }

    /**
     * Computes the raw word counts of every article's content.
     *
     * @param list
     *            articles to process
     * @return map from article id to (word -> occurrence count)
     */
    public static HashMap<Integer, HashMap<String, Integer>> normalTFAllFiles(List<News> list) {
        HashMap<Integer, HashMap<String, Integer>> allNormalTF = new HashMap<Integer, HashMap<String, Integer>>();
        // One analyzer for the whole batch instead of one per article.
        try (Analyzer analyzer = newAnalyzer()) {
            for (News file : list) {
                allNormalTF.put(file.getid(), normalTF(getAnalyseResult(file.getcontent(), analyzer)));
            }
        }
        return allNormalTF;
    }

    /**
     * Computes the term frequencies of every article's content.
     *
     * @param list
     *            articles to process
     * @return map from article id to (word -> term frequency)
     */
    public static HashMap<Integer, HashMap<String, Float>> tfAllFiles(List<News> list) {
        HashMap<Integer, HashMap<String, Float>> allTF = new HashMap<Integer, HashMap<String, Float>>();
        // One analyzer for the whole batch instead of one per article.
        try (Analyzer analyzer = newAnalyzer()) {
            for (News file : list) {
                allTF.put(file.getid(), tf(getAnalyseResult(file.getcontent(), analyzer)));
            }
        }
        return allTF;
    }

    /**
     * Computes the inverse document frequency of every word:
     * {@code ln(number of documents / number of documents containing the word)}.
     *
     * <p>A word that occurs in every document therefore gets an IDF of 0.
     *
     * @param all_tf
     *            per-article term frequencies; only the key sets are used
     * @return map from word to its IDF
     */
    public static HashMap<String, Float> idf(HashMap<Integer, HashMap<String, Float>> all_tf) {
        int docNum = all_tf.size();
        // word -> number of documents that contain it
        HashMap<String, Integer> docFreq = new HashMap<String, Integer>();
        for (HashMap<String, Float> perDoc : all_tf.values()) {
            for (String word : perDoc.keySet()) {
                docFreq.merge(word, 1, Integer::sum);
            }
        }
        HashMap<String, Float> resIdf = new HashMap<String, Float>();
        for (Map.Entry<String, Integer> df : docFreq.entrySet()) {
            resIdf.put(df.getKey(), (float) Math.log(docNum / (float) df.getValue()));
        }
        return resIdf;
    }

    /**
     * Multiplies every term frequency by the word's IDF.
     *
     * @param all_tf
     *            per-article term frequencies
     * @param idfs
     *            IDF of every word; a word missing from this map is scored 0
     * @return map from article id to (word -> TF-IDF score)
     */
    public static HashMap<Integer, HashMap<String, Float>> tf_idf(HashMap<Integer, HashMap<String, Float>> all_tf,
            HashMap<String, Float> idfs) {
        HashMap<Integer, HashMap<String, Float>> resTfIdf = new HashMap<Integer, HashMap<String, Float>>();
        for (Map.Entry<Integer, HashMap<String, Float>> doc : all_tf.entrySet()) {
            HashMap<String, Float> tfidf = new HashMap<String, Float>();
            for (Map.Entry<String, Float> term : doc.getValue().entrySet()) {
                Float idfValue = idfs.get(term.getKey());
                float weight = idfValue == null ? 0f : idfValue;
                tfidf.put(term.getKey(), term.getValue() * weight);
            }
            resTfIdf.put(doc.getKey(), tfidf);
        }
        return resTfIdf;
    }

    /**
     * Picks the 10 highest-scoring words of every article.
     *
     * @param tf_idf
     *            per-article TF-IDF scores
     * @return map from article id to its keywords, best first
     */
    public static HashMap<Integer, ArrayList<String>> topNkeywoed(HashMap<Integer, HashMap<String, Float>> tf_idf) {
        return topNkeywoed(tf_idf, DEFAULT_TOP_N);
    }

    /**
     * Picks the {@code n} highest-scoring words of every article.
     *
     * @param tf_idf
     *            per-article TF-IDF scores
     * @param n
     *            maximum number of keywords per article; must not be negative
     * @return map from article id to its keywords, best first (ties ordered by word)
     * @throws IllegalArgumentException
     *             if {@code n} is negative
     */
    public static HashMap<Integer, ArrayList<String>> topNkeywoed(HashMap<Integer, HashMap<String, Float>> tf_idf,
            int n) {
        if (n < 0) {
            throw new IllegalArgumentException("n must not be negative: " + n);
        }
        HashMap<Integer, ArrayList<String>> result = new HashMap<Integer, ArrayList<String>>();
        for (Map.Entry<Integer, HashMap<String, Float>> doc : tf_idf.entrySet()) {
            List<Map.Entry<String, Float>> ranked = new ArrayList<>(doc.getValue().entrySet());
            ranked.sort(BY_SCORE_DESC);
            ArrayList<String> keywords = new ArrayList<>();
            for (int i = 0; i < n && i < ranked.size(); i++) {
                keywords.add(ranked.get(i).getKey());
            }
            result.put(doc.getKey(), keywords);
        }
        return result;
    }

    /**
     * Entry point: extracts the top-10 TF-IDF keywords of every article in the list.
     * IDF is computed over the given list only.
     *
     * @param newslist
     *            articles to process
     * @return map from article id to its keywords, best first
     */
    public static HashMap<Integer, ArrayList<String>> getkeyword(List<News> newslist) {
        HashMap<Integer, HashMap<String, Float>> all_tf = tfAllFiles(newslist);
        HashMap<String, Float> idfs = idf(all_tf);
        HashMap<Integer, HashMap<String, Float>> tfidf = tf_idf(all_tf, idfs);
        return topNkeywoed(tfidf);
    }
}
建立索引、检索
package Test;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.FileSystems;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.*;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.lionsoul.jcseg.analyzer.JcsegAnalyzer5X;
import org.lionsoul.jcseg.core.JcsegTaskConfig;
import Model.News;
import Utils.TFIDFDB;
public class test1 {
private static String url = "jdbc:mysql://localhost:3306/新闻";
private static String user = "***";
private static String password = "******";
private static String driver = "com.mysql.jdbc.Driver";
private static final String tablename = "dangjian_people_com_cn";
/**
* 从数据库读取id 范围的数据,返回news列表
*
* @param from id范围[from to]
*
* @param to
* @return
*/
public List<News> connectDBAndRead(int from, int to) {
Connection connection = null;
Statement statement = null;
ResultSet resultset = null;
ArrayList<News> list = new ArrayList<News>();
try {
String selectSql = "SELECT * FROM " + tablename + " WHERE id >= " + from + " and id <= " + to;
Class.forName(driver);
connection = DriverManager.getConnection(url, user, password);
statement = connection.createStatement();
resultset = statement.executeQuery(selectSql);
while (resultset.next()) {
int id = resultset.getInt("id");
String title = resultset.getString("title");
String content = resultset.getString("content");
String url = resultset.getString("url");
String jtitle = resultset.getString("jtitle");
String subtitle = resultset.getString("subtitle");
String original_title = resultset.getString("original_title");
String author = resultset.getString("author");
String source = resultset.getString("source");
String source_url = resultset.getString("source_url");
String category = resultset.getString("category");
String rawhtml = resultset.getString("raw_html");
Timestamp posted_at = resultset.getTimestamp("posted_at");
Timestamp saved_at = resultset.getTimestamp("saved_at");
News n = new News(id, title, content, url, jtitle, subtitle, original_title, author, source, source_url,
category, rawhtml, posted_at, saved_at);
list.add(n);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
return list;
}
/**
* 对新闻内容进行了存储索引 通过新闻对象列表,写入索引到指定目录
*
* @param list
* 新闻对象列表
* @param indexPath
* 索引路径
*/
public void WriteIndexByNewsList(IndexWriter writer, List<News> list,
HashMap<Integer, ArrayList<String>> keyword_By_tfidf) {
try {
// 写索引
for (News n : list) {
Document doc = new Document();
//使用tfidf工具,获取新闻列表中每条新闻中的关键词(tfidf前十的词)
ArrayList<String> l = keyword_By_tfidf.get(n.getid());
StringBuilder sb = new StringBuilder();
for (String str : l) {
sb.append(str + " ");
}
//关键词作为普通的域存入Lucene
TextField keyword = new TextField("keyword", sb.toString(), Store.YES);
StoredField id = new StoredField("id", n.getid());
TextField title = new TextField("title", n.gettitle(), Store.YES);
TextField content = new TextField("content", n.getcontent(), Store.YES);
StringField url = new StringField("url", n.geturl(), Store.YES);
TextField jtitle = new TextField("jtitle", n.getjtitle(), Store.YES);
TextField subtitle = new TextField("subtitle", n.getsubtitle(), Store.YES);
TextField original_title = new TextField("original_title", n.getoriginal_title(), Store.YES);
StringField author = new StringField("author", n.getauthor(), Store.YES);
StringField source = new StringField("source", n.getsource(), Store.YES);
StringField source_url = new StringField("source_url", n.getsource_url(), Store.YES);
StringField category = new StringField("category", n.getcategory(), Store.YES);
StoredField rawhtml = new StoredField("rawhtml", n.getrawhtml());
LongPoint posted_at = new LongPoint("posted_at", n.getsaved_at().getTime());
StoredField posted_ats = new StoredField("posted_at", n.getsaved_at().getTime());
LongPoint saved_at = new LongPoint("saved_at", n.getsaved_at().getTime());
StoredField saved_ats = new StoredField("saved_at", n.getsaved_at().getTime());
doc.add(keyword);
doc.add(id);
doc.add(title);
doc.add(content);
doc.add(url);
doc.add(jtitle);
doc.add(subtitle);
doc.add(original_title);
doc.add(author);
doc.add(source);
doc.add(source_url);
doc.add(category);
doc.add(rawhtml);
doc.add(rawhtml);
doc.add(posted_at);
doc.add(posted_ats);
doc.add(saved_at);
doc.add(saved_ats);
writer.addDocument(doc);
}
writer.commit();
} catch (IOException e) {
System.out.println("catch a" + e.getClass() + "\n with a message" + e.getMessage());
}
}
/**
* 入口 将数据库中id为[from,to]的文件写入索引
* @param from
* @param to
* @param indexpath 将要写入的索引文件目录
*/
public void indexUsingDBById(int from, int to, String indexpath) {
long Starttime = System.nanoTime();
try {
System.out.println("Indexing to directory \'" + indexpath + "\'...");
// 创建字典目录存于文件系统
FSDirectory directory = FSDirectory.open(Paths.get(indexpath, new String[0]));
//使用的Jcseg分词,开启停用词表,启用它自带的复杂模式(就是加了几种过滤条件)
Analyzer analyzer = new JcsegAnalyzer5X(JcsegTaskConfig.COMPLEX_MODE);
JcsegAnalyzer5X jcseg = (JcsegAnalyzer5X) analyzer;
JcsegTaskConfig config = jcseg.getTaskConfig();
config.setClearStopwords(true);
// 修改索引修改配置
IndexWriterConfig writerconfig = new IndexWriterConfig(analyzer);
// 默认设成追加模式
writerconfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
// 索引writer
IndexWriter writer = new IndexWriter(directory, writerconfig);
// 写索引
List<News> l = connectDBAndRead(from, to);//从数据库读取新闻
HashMap<Integer, ArrayList<String>> keyword_By_tfidf = TFIDFDB.getkeyword(l);
//获取关键字
WriteIndexByNewsList(writer, l, keyword_By_tfidf);
//关键字连同新闻一起建索引
// 关闭writer
writer.close();
System.out.println("此次操作用时: " + (System.nanoTime() - Starttime));
} catch (IOException e) {
System.out.println("catch a" + e.getClass() + "\n with a message" + e.getMessage());
}
}
/**
* 获取制定分词器 的分词结果
*
* @param analyzeStr
* 要分的字符串
* @param analyzer
* 分词器
*
* @return
*/
public List<String> getAnalyseResult(String analyzeStr, Analyzer analyzer) {
List<String> response = new ArrayList<String>();
TokenStream tokenstream = null;
try {
// 返回适用于fieldName的TokenStream
tokenstream = analyzer.tokenStream("keyword", new StringReader(analyzeStr));
// 词汇单元对应的文本
CharTermAttribute attr = tokenstream.addAttribute(CharTermAttribute.class);
// 消费者在使用incrementToken 开始消费之前调用此方法
// 将次流充值未干净状态。有状态的实现必须实现这种方法,以便他们可以被重用,就像他们被被构建一样
tokenstream.reset();
// Consumer(即IndexWriter)使用此方法将流推送到下一个token
while (tokenstream.incrementToken()) {
response.add(attr.toString());
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (tokenstream != null) {
try {
tokenstream.close();
} catch (Exception e) {
e.printStackTrace();
}
}
}
return response;
}
/**
* 一个简单的搜索,我也放在这里了
* @param filepathars 索引的目录
* @param field 想要查询的域
* @param content 查询的内容,这里使用最简单的term查询,lucene还有其他查询
* @throws IOException
*/
public void search(String filepathars, String field, String content) throws IOException {
Path path = FileSystems.getDefault().getPath(filepathars);
// 定义索引目录
Directory directory = FSDirectory.open(path);
// 定义索引查看器
IndexReader indexReader = DirectoryReader.open(directory);
// 定义索引搜索器
IndexSearcher indexSearcher = new IndexSearcher(indexReader);
// 定义搜索词条
Term term = new Term(field, content);
// 定义查询
Query query = new TermQuery(term);
// 命中前十条文档
TopDocs topdocs = indexSearcher.search(query, 10);
// 打印命中数
System.out.println("命中数:+" + topdocs.totalHits);
// 取出文档
ScoreDoc[] scoreDocs = topdocs.scoreDocs;
// 遍历取出数据
for (ScoreDoc scoreDoc : scoreDocs) {
Document doc = indexSearcher.doc(scoreDoc.doc);
System.out.println("id" + doc.get("id"));
System.out.println("content" + doc.get("content"));
System.out.println("keyword" + doc.get("keyword"));
}
}
//因为自己写的tfidf效率过低,所以每次查询50篇新闻抽取关键字,下一步考虑使用word2vec抽取
public static void main(String[] args) {
test1 t = new test1();
// 入口,参数为ID范围[1,5517],索引位置
//for (int i = 1; i <= 5500; i = i + 50) {
// t.indexUsingDBById(i, i + 49, "C:\\Users\\hp\\Desktop\\index");
//}
try {
t.search("C:\\Users\\hp\\Desktop\\index", "content", "主要矛盾");
System.out.println("**************************************************");
t.search("C:\\Users\\hp\\Desktop\\index", "keyword", "主要矛盾");
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}