判断文本信息是否是高重复率的内容
用 JavaScript 实现一个判断函数:当文本长度大于 10 个字符,且重复内容(包括重复的字符或词组)所占比例达到或超过 80% 时,判定该文本为高重复率内容。
/**
 * Decide whether a text consists mostly of repeated content.
 *
 * A position of the text counts as "covered" when it lies inside some
 * occurrence of a substring that is at least `minRepeatLen` long and occurs
 * more than once in the text (occurrences may overlap). The text is highly
 * repetitive when it is longer than `minLen` and the share of covered
 * positions reaches `threshold` percent.
 *
 * Notes:
 * - Lengths and positions are measured in UTF-16 code units (`text.length`).
 * - With the default `minRepeatLen` of 1, every character that appears twice
 *   anywhere is covered, so ordinary prose usually exceeds 80%. Pass a larger
 *   `minRepeatLen` (e.g. 3 or 4) to look for repeated phrases only.
 * - A text too short to contain two windows of `minRepeatLen` (including a
 *   single-character text) has no repetition and yields `false`.
 *
 * @param {string} text The text to inspect; any non-string yields `false`.
 * @param {number} [threshold=80] Percentage (0-100) of covered positions at or
 *   above which the text is reported as repetitive. Out-of-range or NaN values
 *   trigger a warning and fall back to 80.
 * @param {number} [minLen=10] Texts whose length is `<= minLen` yield `false`.
 * @param {number} [minRepeatLen=1] Minimum length of a repeated substring.
 *   Values below 1 (or NaN) trigger a warning and fall back to 1; fractional
 *   values are rounded up, since substring lengths are whole numbers.
 * @returns {boolean} `true` when the covered share is `>= threshold`.
 */
function isHighlyRepetitive(
  text,
  threshold = 80,
  minLen = 10,
  minRepeatLen = 1 // by default a single repeated character already counts
) {
  if (typeof text !== 'string' || text.length <= minLen) {
    return false;
  }

  // Written as a negated "in range" test so that NaN also takes the fallback
  // instead of silently making the final comparison always false.
  let percentLimit = threshold;
  if (!(threshold >= 0 && threshold <= 100)) {
    console.warn("阈值应在 0 到 100 之间。使用默认值 80。");
    percentLimit = 80;
  }

  let windowLen = 1;
  if (minRepeatLen >= 1) {
    // "length >= 2.5" means "length >= 3" for whole-number lengths.
    windowLen = Math.ceil(minRepeatLen);
  } else {
    console.warn("minRepeatLen 必须至少为 1。使用默认值 1。");
  }

  const n = text.length;
  if (n === 0) {
    // Only reachable with a negative minLen; avoids a 0/0 division below.
    return false;
  }

  const covered = new Array(n).fill(false);
  let coveredCount = 0;
  const markWindow = (start) => {
    for (let k = start; k < start + windowLen; k++) {
      if (!covered[k]) {
        covered[k] = true;
        coveredCount++;
      }
    }
  };

  // Only windows of exactly `windowLen` need to be examined: if a longer
  // substring occurs twice, every window of `windowLen` inside it occurs twice
  // as well (at the same offset in each occurrence), and those windows tile
  // the longer occurrence completely. Scanning longer lengths adds nothing.
  const FIRST_ALREADY_MARKED = -1;
  const firstStartByWindow = new Map();
  for (let i = 0; i + windowLen <= n; i++) {
    const windowText = text.substring(i, i + windowLen);
    const firstStart = firstStartByWindow.get(windowText);
    if (firstStart === undefined) {
      // First sighting: remember where it was, it is not a repeat yet.
      firstStartByWindow.set(windowText, i);
      continue;
    }
    if (firstStart !== FIRST_ALREADY_MARKED) {
      // Second sighting: the remembered first occurrence is now a repeat too.
      markWindow(firstStart);
      firstStartByWindow.set(windowText, FIRST_ALREADY_MARKED);
    }
    markWindow(i);
  }

  return (coveredCount / n) * 100 >= percentLimit;
}
// --- Examples ---
// Each trailing comment gives the value the call evaluates to with the
// function's default arguments unless others are passed explicitly.
console.log(`"abcabcabcabcabcabc" (len 18): ${isHighlyRepetitive("abcabcabcabcabcabc")}`); // true (100%)
console.log(`"aaaaaaaaaaaaaaaaaa" (len 18): ${isHighlyRepetitive("aaaaaaaaaaaaaaaaaa")}`); // true (100%)
console.log(`"ababababababababab" (len 18): ${isHighlyRepetitive("ababababababababab")}`); // true (100%)
console.log(`"This is a test. This is a test." (len 31): ${isHighlyRepetitive("This is a test. This is a test.")}`); // true (100%: every character occurs at least twice)
console.log(`"abcdefghijklmno" (len 15): ${isHighlyRepetitive("abcdefghijklmno")}`); // false (0%)
console.log(`"abcabcabcxyz123" (len 15): ${isHighlyRepetitive("abcabcabcxyz123")}`); // false (60% covered by "abc")
console.log(`"abcabcabcabcxyz" (len 15): ${isHighlyRepetitive("abcabcabcabcxyz")}`); // true (80% covered by "abc")
console.log(`"short text" (len 10): ${isHighlyRepetitive("short text")}`); // false (length <= minLen)
// NOTE: despite the label, this prints true. With minRepeatLen = 1 every
// character that occurs twice is covered; only 'T', 'y', 'v', 'f' and '.' are
// unique here, so coverage is about 95%.
console.log(`"very long text without much repetition..." (len > 10): ${isHighlyRepetitive("This is a relatively long sentence without significant repeating phrases that would trigger the threshold.")}`); // true (~95% with single-character repeats)
console.log(`"" (empty string): ${isHighlyRepetitive("")}`); // false
console.log(`null: ${isHighlyRepetitive(null)}`); // false
console.log(`12345678901: ${isHighlyRepetitive(12345678901)}`); // false (not a string)
console.log(`"go go go go go go go go go go" (len 29): ${isHighlyRepetitive("go go go go go go go go go go")}`); // true (100%)
console.log(`"测试阈值 50": ${isHighlyRepetitive("abcabcabcxyz123", 50)}`); // true (60% >= 50%)
console.log(`"测试 minLen 20": ${isHighlyRepetitive("abcabcabcabcabcabc", 80, 20)}`); // false (length 18 <= minLen 20)
console.log(`"测试 minRepeatLen 2": ${isHighlyRepetitive("aaaaaaaaaaaaaaaaaa", 80, 10, 2)}`); // true (covered by "aa")
// NOTE: despite the label, this prints true. "aba", "aca" and "cac" each occur
// several times and together cover all 18 characters.
console.log(`"测试 minRepeatLen 3 (无三字符重复)": ${isHighlyRepetitive("abababababacacacac", 80, 10, 3)}`); // true (100% covered by "aba", "aca", "cac")
// NOTE: the "(len 18)" label below is inaccurate; the string is 96 characters long.
console.log(`"你行不行,行不行,太秀了,行不行,行不行,行不行,行不行,行不行你行不行,行不行,太秀了,行不行,行不行,行不行,行不行,行不行你行不行,行不行,太秀了,行不行,行不行,行不行,行不行,行不行" (len 18): ${isHighlyRepetitive("你行不行,行不行,太秀了,行不行,行不行,行不行,行不行,行不行你行不行,行不行,太秀了,行不行,行不行,行不行,行不行,行不行你行不行,行不行,太秀了,行不行,行不行,行不行,行不行,行不行")}`); // true
console.log(`"你好,我怎么样才能成为技术大佬" (len 15): ${isHighlyRepetitive("你好,我怎么样才能成为技术大佬")}`); // false (no character occurs twice)