PanGu分词器

https://top.chinaz.com/


https://sourceforge.net/projects/ktdictseg/
https://blog.csdn.net/lijun7788/article/details/7719439
http://www.cftea.com/c/2017/06/7991.asp
https://blog.csdn.net/wudiyong22/article/details/48289965
https://github.com/stanzhai/IKAnalyzer.NET
Lucene.net(4.8.0)+PanGu分词器
Install-Package jieba.NET -Version 0.42.2

https://github.com/anderscui/jieba.NET
var segmenter = new JiebaSegmenter();
var segments = segmenter.Cut("我来到北京清华大学", cutAll: true);
Console.WriteLine("【全模式】:{0}", string.Join("/ ", segments));

segments = segmenter.Cut("我来到北京清华大学");  // 默认为精确模式
Console.WriteLine("【精确模式】:{0}", string.Join("/ ", segments));

segments = segmenter.Cut("他来到了网易杭研大厦");  // 默认为精确模式,同时也使用HMM模型
Console.WriteLine("【新词识别】:{0}", string.Join("/ ", segments));

segments = segmenter.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造"); // 搜索引擎模式
Console.WriteLine("【搜索引擎模式】:{0}", string.Join("/ ", segments));

segments = segmenter.Cut("结过婚的和尚未结过婚的");
Console.WriteLine("【歧义消除】:{0}", string.Join("/ ", segments));

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;

using System.IO;
using Lucene.Net.Index;
using Lucene.Net.Search;
using Lucene.Net.Store;
using Lucene.Net.Util;
using Lucene.Net.Documents;
using Lucene.Net.Analysis;
using Lucene.Net.Analysis.Standard;
using JiebaNet.Segmenter;

namespace WindowsFormsApp1
{
    // References:
    // http://www.zhuzhusoft.com/article.php?id=151
    // Install-Package Lucene.Net -Pre
    // Install-Package Lucene.Net.Analysis.Common -Version 4.8.0-beta00014

    /// <summary>
    /// Demo form. button1 builds a Lucene.NET full-text index over all *.txt files in a
    /// user-chosen folder, tokenizing each file's content with the PanGu Chinese segmenter
    /// (Jieba output is written to the console for comparison only). button2 runs a
    /// single-term phrase query against that index and binds the top hits to the grid.
    /// </summary>
    public partial class Form1 : Form
    {
        // Pin index format/behavior to Lucene 4.8 for backward compatibility.
        // Shared by both handlers (was duplicated in each).
        private const LuceneVersion AppLuceneVersion = LuceneVersion.LUCENE_48;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Machine-independent location of the index: %ProgramData%\index.
        /// Used by both the indexing and the search handler so they always agree.
        /// </summary>
        private static string GetIndexPath()
        {
            var basePath = Environment.GetFolderPath(
                Environment.SpecialFolder.CommonApplicationData);
            return Path.Combine(basePath, "index");
        }

        /// <summary>
        /// Folder picker + indexer. For each *.txt file: segment the content with PanGu,
        /// join the words into a space-separated string stored in the tokenized
        /// "favoritePhrase" field, and store the file path in the untokenized "name" field.
        /// </summary>
        private void button1_Click(object sender, EventArgs e)
        {
            if (this.folderBrowserDialog1.ShowDialog() != DialogResult.OK)
            {
                return;
            }

            this.textBox1.Text = this.folderBrowserDialog1.SelectedPath;

            // using-blocks guarantee the directory, analyzer and writer are released
            // even when indexing throws (the original leaked them on failure and only
            // disposed the writer on the success path).
            using (var dir = FSDirectory.Open(GetIndexPath()))
            using (var analyzer = new StandardAnalyzer(AppLuceneVersion))
            {
                var indexConfig = new IndexWriterConfig(AppLuceneVersion, analyzer);
                using (var writer = new IndexWriter(dir, indexConfig))
                {
                    PanGu.Segment.Init();
                    PanGu.Segment segment = new PanGu.Segment();

                    string[] files = System.IO.Directory.GetFiles(this.textBox1.Text.Trim(), "*.txt");
                    foreach (string file in files)
                    {
                        // Read once and reuse (the original read each file twice).
                        string content = File.ReadAllText(file);

                        // Iterate DoSegment's result directly so the loop variable keeps
                        // its element type (the original's non-generic ICollection made
                        // `word` an object, breaking `word.Word`). StringBuilder avoids
                        // O(n^2) string concatenation.
                        var favoritePhrase = new StringBuilder();
                        foreach (var word in segment.DoSegment(content))
                        {
                            favoritePhrase.Append(' ').Append(word.Word);
                            Console.WriteLine(word.Word);
                        }

                        // Jieba full-mode segmentation is logged for comparison only;
                        // the indexed text above comes from PanGu.
                        var segmenter = new JiebaSegmenter();
                        var segments = segmenter.Cut(content, cutAll: true);
                        Console.WriteLine("【全模式】:{0}", string.Join(" ", segments));

                        var doc = new Document
                        {
                            // StringField indexes but doesn't tokenize
                            new StringField("name", file, Field.Store.YES),
                            new TextField("favoritePhrase", favoritePhrase.ToString(), Field.Store.YES)
                        };

                        writer.AddDocument(doc);
                    }

                    // One durable commit at the end instead of a non-committing
                    // Flush(triggerMerge: false, applyAllDeletes: false) per file.
                    writer.Commit();
                }
            }
        }

        /// <summary>
        /// Queries the index built by button1 with a phrase made of the single term in
        /// textBox2 and shows the top 20 hits in dataGridView1 (also echoed to console).
        /// </summary>
        private void button2_Click(object sender, EventArgs e)
        {
            // Search with a phrase
            var phrase = new MultiPhraseQuery
            {
                new Term("favoritePhrase", this.textBox2.Text.Trim()),
            };

            // Open the same on-disk index location used by the indexing handler;
            // dispose reader/directory deterministically when the search is done.
            using (var dir = FSDirectory.Open(GetIndexPath()))
            using (var reader = DirectoryReader.Open(dir))
            {
                var searcher = new IndexSearcher(reader);
                var hits = searcher.Search(phrase, 20 /* top 20 */).ScoreDocs;

                DataTable dataTable = new DataTable();
                dataTable.Columns.Add("name");
                dataTable.Columns.Add("favoritePhrase");

                // Display the output in a table
                Console.WriteLine($"{"Score",10}" +
                    $" {"Name",-15}" +
                    $" {"Favorite Phrase",-40}");
                foreach (var hit in hits)
                {
                    var foundDoc = searcher.Doc(hit.Doc);
                    DataRow dr = dataTable.NewRow();
                    dr["name"] = foundDoc.Get("name");
                    dr["favoritePhrase"] = foundDoc.Get("favoritePhrase");
                    dataTable.Rows.Add(dr);
                    Console.WriteLine($"{hit.Score:f8}" +
                        $" {foundDoc.Get("name"),-15}" +
                        $" {foundDoc.Get("favoritePhrase"),-40}");
                }
                this.dataGridView1.DataSource = dataTable;
            }
        }
    }
}

it自媒体

1、做长远计划 ;2、精准人群定位 ;3、差异化竞争 ;4、有营销点;5、吸粉更容易 ;6、打造知名度 ;7、时间定位运营 ;8、低成本高收益 ;9、增加客户粘性 ;10、内容服务为王。

https://www.svgrepo.com/
https://kalendar.altinselimi.com/
https://www.yuque.com/explore/headlines
Mybatis.net
https://xiaoluoboding.github.io/monthly/2021/2021-03.html#%E5%B7%A5%E5%85%B7
https://xiaoluoboding.github.io/monthly/2019/#%F0%9F%8D%AD-%E8%AE%BE%E8%AE%A1%E5%88%9B%E6%84%8F
http://www.chinavalue.net/Wiki/%E8%87%AA%E5%AA%92%E4%BD%93.aspx
https://www.163.com/dy/article/G3MQE1MQ0511GV8V.html
https://www.infoq.cn/
https://www.infoq.cn/article/W4leI4XZ32eSTqFJ8qPl
https://xiaoluoboding.github.io/monthly/2019/2019-01.html#%E6%95%99%E7%A8%8B
http://yixiaoer.coozf.com/
SpringBoot+SpringMVC+Mybatis+Redis+ELK+Quartz+Websocket+vue.js
https://activity.feishu.cn/


https://www.yuque.com/woniu666/tech_doc/pueka0

网站的排行榜
https://top.chinaz.com/

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
NodeJieba "结巴"分词的Node.js版本Introduction NodeJieba只是CppJieba简单包装而成的node扩展,用来进行中文分词。 详见NodeJiebaBlogInstallnpm install nodejieba 因为npm速度很慢而且经常因为墙的原因出现莫名其妙的问题,在此强烈建议使用cnpm,命令如下:npm --registry=http://r.cnpmjs.org install nodejieba默认分词算法初始化var segment = require("nodejieba"); segment.loadDict("./node_modules/nodejieba/dict/jieba.dict.utf8", "./node_modules/nodejieba/dict/hmm_model.utf8");阻塞式调用var wordList = segment.cutSync("阻塞模式分词"); if (wordList.constructor == Array) // just for tutorial, this is always be true  {     wordList.forEach(function(word) { console.log(word);          }); }非阻塞式调用segment.cut("非阻塞模式分词", function(wordList) {     wordList.forEach(function(word) { console.log(word);          }); });初始化var segment = require("nodejieba"); segment.queryLoadDict("./node_modules/nodejieba/dict/jieba.dict.utf8", "./node_modules/nodejieba/dict/hmm_model.utf8");阻塞式调用var wordList = segment.queryCutSync("阻塞模式分词"); if (wordList.constructor == Array) // just for tutorial, this is always be true  {     wordList.forEach(function(word) { console.log(word);          }); }非阻塞式调用segment.queryCut("非阻塞模式分词", function(wordList) {     wordList.forEach(function(word) { console.log(word);          }); }); 具体用法可以参考 test/segment.js test/query_segment.jsTesting 在node v0.10.2下测试通过http://cppjieba-webdemo.herokuapp.com/ (chrome is suggested)ThanksJieba中文分词 标签:nodejieba

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值