TF-IDF 提取关键词

最新推荐文章于 2024-07-23 13:00:00 发布

荣华富贵8

最新推荐文章于 2024-07-23 13:00:00 发布

阅读量413

点赞数 1

文章标签：经验分享

本文链接：https://blog.csdn.net/s13596191285/article/details/128360670

版权

<?php

/**
 * A single text document together with its term-frequency (TF) and TF-IDF weights.
 *
 * The text is lower-cased and split into words on whitespace (with any
 * punctuation adjacent to that whitespace) and on leading/trailing punctuation.
 * TF is computed eagerly in the constructor; TF-IDF is computed on demand by
 * build_tfidf() from caller-supplied IDF value(s).
 */
class Document
{
    /** @var array|null Lower-cased words in document order; null when built from null. */
    protected $words;

    /** @var array|null Map word => occurrences / total words, sorted descending. */
    protected $tf_matrix;

    /** @var array|null Map word => TF * IDF, sorted descending; null until build_tfidf() succeeds. */
    protected $tfidf_matrix;

    /**
     * @param string|null $string Raw document text. Null creates an empty shell
     *                            whose getters all return null.
     */
    public function __construct($string)
    {
        $this->tfidf_matrix = null;
        $this->tf_matrix = null;

        if (!isset($string)) {
            $this->words = null;
            return;
        }

        // Prefer the multibyte-aware lower-casing when the extension is loaded,
        // so non-ASCII letters are folded too and UTF-8 bytes are never mangled.
        $string = function_exists('mb_strtolower')
            ? mb_strtolower($string, 'UTF-8')
            : strtolower($string);

        // Separators: leading punctuation, whitespace with any punctuation
        // touching it, trailing punctuation. \p{P} = Unicode punctuation.
        $pattern = '/((^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/';

        // Try UTF-8 mode first; preg_split() returns false on invalid UTF-8,
        // in which case fall back to byte mode rather than losing the text.
        $tokens = preg_split($pattern . 'u', $string, -1, PREG_SPLIT_NO_EMPTY);
        if ($tokens === false) {
            $tokens = preg_split($pattern, $string, -1, PREG_SPLIT_NO_EMPTY);
        }
        $this->words = ($tokens === false) ? array() : $tokens;

        $this->build_tf();
    }

    /**
     * Compute the term-frequency map (occurrences / total word count).
     *
     * Does nothing when a non-empty TF map already exists. Otherwise it also
     * invalidates any previously computed TF-IDF map.
     *
     * @return void
     */
    public function build_tf()
    {
        if (isset($this->tf_matrix) && $this->tf_matrix) {
            return;
        }
        $this->tfidf_matrix = null;

        // A document built from null has no words at all: leave TF as null.
        if (!is_array($this->words)) {
            return;
        }

        $this->tf_matrix = array();
        $words_count = count($this->words);
        // Empty text: keep the empty map and avoid sorting/dividing on nothing.
        if ($words_count === 0) {
            return;
        }

        foreach (array_count_values($this->words) as $word => $amount) {
            $this->tf_matrix[$word] = $amount / $words_count;
        }
        arsort($this->tf_matrix);
    }

    /**
     * Compute TF-IDF weights from the TF map.
     *
     * @param array|int|float $idf Either a map word => IDF, or one scalar IDF
     *                             applied to every word. Words missing from the
     *                             map are weighted with an IDF of 0.
     *
     * @return bool True when a TF-IDF map is available (freshly built or cached);
     *              false when there is no TF data or $idf is empty/zero.
     */
    public function build_tfidf($idf)
    {
        if (isset($this->tfidf_matrix) && $this->tfidf_matrix) {
            return true;
        }
        if (!isset($this->tf_matrix) || !$this->tf_matrix) {
            return false;
        }
        if (!isset($idf) || !$idf) {
            return false;
        }

        $weights = array();
        if (is_array($idf)) {
            foreach ($this->tf_matrix as $word => $word_tf) {
                // Unknown word: use 0 instead of reading an undefined key.
                $word_idf = isset($idf[$word]) ? $idf[$word] : 0;
                $weights[$word] = $word_tf * $word_idf;
            }
        } else {
            foreach ($this->tf_matrix as $word => $word_tf) {
                $weights[$word] = $word_tf * $idf;
            }
        }
        arsort($weights);
        $this->tfidf_matrix = $weights;

        return true;
    }

    /**
     * @return array|null Words in document order, or null for a null document.
     */
    public function getWords()
    {
        return $this->words;
    }

    /**
     * @return array|null Map word => term frequency, highest first.
     */
    public function getTf()
    {
        return $this->tf_matrix;
    }

    /**
     * @return array|null Map word => TF-IDF weight, highest first; null until built.
     */
    public function getTfidf()
    {
        return $this->tfidf_matrix;
    }
}

/*
第一步，计算词频。
考虑到文章有长短之分，为了便于不同文章的比较，进行"词频"标准化：
词频（TF）= 某个词在文章中的出现次数 / 文章的总词数。

第二步，计算逆文档频率。
这时，需要一个语料库（corpus），用来模拟语言的使用环境。
逆文档频率（IDF）= log( 语料库的文档总数 / (包含该词的文档数 + 1) )。
如果一个词越常见，那么分母就越大，逆文档频率就越小越接近0。分母之所以要加1，是为了避免分母为0（即所有文档都不包含该词）。log表示对得到的值取对数。

第三步，计算TF-IDF。
TF-IDF = 词频（TF）× 逆文档频率（IDF）。
可以看到，TF-IDF与一个词在文档中的出现次数成正比，与该词在整个语言中的出现次数成反比。所以，自动提取关键词的算法就很清楚了，就是计算出文档的每个词的TF-IDF值，然后按降序排列，取排在最前面的几个词。
*/
// Demo: tokenise a short sample sentence and dump its words, TF and TF-IDF.
$text = 'i very good, ha , i very nice, i is good';

$document = new Document($text);

// Term frequency (TF): occurrences of a word / total number of words.
// The constructor has already built it, so this call returns immediately.
$document->build_tf();

// Inverse document frequency (IDF): log(total documents / documents containing
// the word). Here a single value (3 documents, 2 containing) is used for all words.
$idf = log(3 / 2);
$document->build_tfidf($idf);

// Higher value = more frequent / more important. 88 and 99 are only visual
// separators between the three dumps.
var_dump($document->getWords());
var_dump(88);
var_dump($document->getTf());
var_dump(99);
var_dump($document->getTfidf());

荣华富贵8

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
打赏
0
评论
TF-IDF 提取关键词

<?phpclass Document{ protected $words; protected $tf_matrix; protected $tfidf_matrix; public function __construct($string) { $this->tfidf_matrix = null; if (isset($string)) { $string = strtol
复制链接

扫一扫