算法:js 数组 array 查重,并显示所有重复的元素(包括第一次出现的那一项)
遇到这样一个问题:在一个数组中查重,并提取出所有重复的项目(包括第一次出现的那一项),而不仅仅是多出来的那几项。
[1,2,2,3,4,5,6,7,3,2,3]
得出 [2,2,2,3,3,3]
而不只是得出 [2,3,2,3]
一、问题:
需要处理的数据是这样的,前面是词条,后面是编码
需要实现的是:
- 查出所有词条文字一样的词条,不只是多余的词条
斯蒂芬 aaaw
劳苦功高 aaay
式子 aabb
草草了事 aabg
茕茕孑立 aabu
戒除 aabw
落落难合 aacw
式 aad
葚 aad
工友 aadc
工友 aad1
工友 aad2
工厂 aadg
工友 aad3
工厂 aad1
工厂 aad2
工厂 aad3
落荒而逃 aadi
匿 aadk
花花太岁 aadm
慝 aadn
工厂 aad4
葚 aadn
落花有意 aadu
二、处理源数据
将其拆分后处理得出可操作数据
{"id":73334,"code":"aaaw","word":"斯蒂芬","priority":null},
{"id":73335,"code":"aaay","word":"劳苦功高","priority":null},
{"id":73336,"code":"aabb","word":"式子","priority":null},
{"id":73337,"code":"aabg","word":"草草了事","priority":null},
{"id":73338,"code":"aabu","word":"茕茕孑立","priority":null},
{"id":73339,"code":"aabw","word":"戒除","priority":null},
{"id":73340,"code":"aacw","word":"落落难合","priority":null},
{"id":73341,"code":"aad","word":"式","priority":null},
{"id":73342,"code":"aad","word":"葚","priority":null},
{"id":73343,"code":"aadc","word":"工友","priority":null},
{"id":73344,"code":"aad1","word":"工友","priority":null},
{"id":73345,"code":"aad2","word":"工友","priority":null},
{"id":73346,"code":"aadg","word":"工厂","priority":null},
{"id":73347,"code":"aad3","word":"工友","priority":null},
{"id":73348,"code":"aad1","word":"工厂","priority":null},
{"id":73349,"code":"aad2","word":"工厂","priority":null},
{"id":73350,"code":"aad3","word":"工厂","priority":null},
{"id":73351,"code":"aadi","word":"落荒而逃","priority":null},
{"id":73352,"code":"aadk","word":"匿","priority":null},
{"id":73353,"code":"aadm","word":"花花太岁","priority":null},
{"id":73354,"code":"aadn","word":"慝","priority":null},
{"id":73355,"code":"aad4","word":"工厂","priority":null},
{"id":73356,"code":"aadn","word":"葚","priority":null},
{"id":73357,"code":"aadu","word":"落花有意","priority":null},
三、查重算法
由于需要提取出所有重复的元素,我的解决办法是:
- 遍历数组,以 `word` 为 key 生成一个对照的 Map()
- 遍历过程中,如果比较的词条已经在 Map 中了,保存对照的元素和被对照元素
- 以上操作会多出许多重复的元素,这里需要去重,只留一个
- 最后就是所有重复的元素
实现方法:
// 查重,返回重复定义的字词
// includeCharacter 当包含单字时
getRepetitionWords(){
let startPoint = new Date().getTime()
let wordMap = new Map() // 1. 定义一个字典文件
let repetitionWords = [] // 2. 盛放重复的元素
this.wordsOrigin.forEach(word => { // 3. 遍历
if (wordMap.has(word.word)){ // 3.1 如果 map 中已经有这个词,
repetitionWords.push(word) // 记录这个词
let matchedWord = wordMap.get(word.word) // 获取保存在 map 中的源元素
repetitionWords.push(matchedWord) // 记录 map 中的这个词。这样每重复一次都会多添加一次源元素,这个在其后进行处理
} else { // 如果 map 中没有这个词的记录,添加这个记录
wordMap.set(word.word, word) // 3.2 如果 map 中没有这个词,记录下来,用于比较
}
})
// 排序后再去除重复项
repetitionWords.sort((a, b) => {
// log(a.word + a.code, b.word + b.code)
return (a.word + a.code) > (b.word + b.code) ? 1 : -1 // 以词条+编码 作为排序依据
})
log('重复词条数量:未去重之前 ', repetitionWords.length)
// 去除记录的词条里重复的元素
for (let i = 0; i < repetitionWords.length - 1; i++) {
if (repetitionWords[i].id === repetitionWords[i + 1].id ) {
log(repetitionWords[i].toString(), repetitionWords[i+1].toString())
repetitionWords.splice(i,1)
i = i - 1
}
}
log(`查重完成,用时 ${new Date().getTime() - startPoint} ms`)
log('词条字典数量: ', wordMap.size)
log('重复词条数量: ', repetitionWords.length)
log('重复 + 词条字典 = ', repetitionWords.length + wordMap.size)
repetitionWords.forEach(item => log( item.toString()))
return repetitionWords
}
四、结果
这是我查出的 8w 条数据中的重复项,很棒!(注意:这份结果里没有单字词条,例如上文数据中重复出现的"葚";而上面的代码并不会过滤单字,直接运行时这类词条也会出现在结果中。)
[{"id":73348,"code":"aad1","word":"工厂","priority":null},
{"id":73349,"code":"aad2","word":"工厂","priority":null},
{"id":73350,"code":"aad3","word":"工厂","priority":null},
{"id":73355,"code":"aad4","word":"工厂","priority":null},
{"id":73346,"code":"aadg","word":"工厂","priority":null},
{"id":73344,"code":"aad1","word":"工友","priority":null},
{"id":73345,"code":"aad2","word":"工友","priority":null},
{"id":73347,"code":"aad3","word":"工友","priority":null},
{"id":73343,"code":"aadc","word":"工友","priority":null},
{"id":117210,"code":"qtqt","word":"狡猾","priority":null},
{"id":117418,"code":"quqm","word":"狡猾","priority":null},
{"id":117211,"code":"qtqt","word":"狼狈","priority":null},
{"id":117927,"code":"qyqm","word":"狼狈","priority":null},
{"id":115794,"code":"qjqd","word":"猖獗","priority":null},
{"id":117209,"code":"qtqt","word":"猖獗","priority":null},
{"id":102180,"code":"khkh","word":"跟踪","priority":null},
{"id":103615,"code":"kvkp","word":"跟踪","priority":null},
{"id":102178,"code":"khkh","word":"跳跃","priority":null},
{"id":102476,"code":"kikt","word":"跳跃","priority":null},
{"id":101472,"code":"kckt","word":"踊跃","priority":null},
{"id":102179,"code":"khkh","word":"踊跃","priority":null}]