前情概要
有一天领导让我去对比两个标书之间是否有重复或者极为相似的文本段落,我一看标书每个都是3000+页。看不完根本看不完,所以就萌生了用代码帮我查重的想法。
到这里,作为一个程序员,我该做些什么呢!!!
代码实现思路
文本对比(node实现)
对于文本我采用的是将word文章按照段落进行拆分,然后两篇文章生成了两个数组,逐一将数组1的元素和数组2的元素进行比较看相似度,其中比较相似度是调用的string-similarity库去比较两段文字之间的差异。
当然由于文章过大所以我并没有简单粗暴的单纯用双重for循环而是做了一定的剪枝来加速。
具体代码实现如下:
index.js
const fs = require('fs')
const mammoth = require('mammoth')
const fuzzaldrin = require('fuzzaldrin-plus')
const stringSimilarity = require('string-similarity')
const XLSX = require('xlsx')
const { Document, Packer } = require('docx')
const readline = require("readline")
const extractImagesFromWord = require('./extract_images_from_word')
const { execFile, spawn } = require('child_process')
const { log } = require('console')
// Interactive CLI session: prompts go to stdout, answers are read from stdin.
const rl = readline.createInterface({ input: process.stdin, output: process.stdout })
/**
 * Wrap a callback-last function (e.g. rl.question) so it returns a Promise
 * resolving with the value passed to the callback.
 * The original declared the returned wrapper `async`, which only added a
 * redundant extra Promise layer around `new Promise(...)`; awaiting the
 * result is unchanged.
 * @param {Function} fn - function whose final argument is a callback
 * @returns {(...args: any[]) => Promise<any>} promisified wrapper
 */
function rlPromisify (fn) {
  return (...args) => new Promise(resolve => fn(...args, resolve))
}
const question = rlPromisify(rl.question.bind(rl))
/**
 * Split a .docx file into an array of paragraphs (mammoth separates
 * paragraphs with "\n\n") and keep only those longer than `step` characters.
 * Returns [] on read/parse failure — the original returned `undefined`,
 * which made callers crash later when iterating the result.
 * @param {string} filePath - path to the .docx file
 * @param {number|string} step - minimum paragraph length (readline gives strings)
 * @returns {Promise<string[]>} filtered paragraphs, [] on error
 */
async function splitParagraphsFromWord (filePath, step) {
  try {
    const data = await fs.promises.readFile(filePath)
    const result = await mammoth.extractRawText({ buffer: data })
    const minLength = Number(step) // coerce once instead of relying on implicit string comparison
    return result.value.split('\n\n').filter(str => str.length > minLength)
  } catch (err) {
    console.error(`读取或解析文档失败: ${filePath}`, err)
    return []
  }
}
// Similarity score (0..1) between two strings, delegated to the
// string-similarity library's Dice-coefficient comparison.
function compareStrings (str1, str2) {
  return stringSimilarity.compareTwoStrings(str1, str2)
}
/**
 * Compare every paragraph of document 1 against document 2 and collect the
 * pairs whose similarity exceeds the threshold.
 * @param {string} url1 - path of the first .docx file
 * @param {string} url2 - path of the second .docx file
 * @param {number|string} step - minimum paragraph length to consider
 * @param {number|string} similarityNum - similarity threshold in (0..1)
 * @returns {Promise<Array<{str1: string, str2: string, score: number}>>}
 */
async function wordDiff (url1, url2, step, similarityNum) {
  // readline answers arrive as strings — coerce once, not on every comparison.
  const threshold = Number(similarityNum)
  // The two documents are independent; read and split them in parallel.
  const [paragraphs1, paragraphs2] = await Promise.all([
    splitParagraphsFromWord(url1, step),
    splitParagraphsFromWord(url2, step),
  ])
  // Exact-match lookup set: identical paragraphs skip the inner loop entirely.
  const setParagraphs2 = new Set(paragraphs2)
  const matches = new Map()
  paragraphs1.forEach((str1, i) => {
    // Named `progress` — the original `process` shadowed the Node.js global.
    const progress = (((i + 1) / paragraphs1.length) * 100).toFixed(2)
    console.log(`相似度比对进度:${progress}%`)
    if (setParagraphs2.has(str1)) {
      matches.set(str1, { str2: str1, score: 1 })
      return
    }
    for (const str2 of paragraphs2) {
      if (!str1 || !str2) continue
      const score = compareStrings(str1, str2)
      if (score > threshold) {
        matches.set(str1, { str2, score })
        break // first sufficiently similar paragraph wins — prune the rest
      }
    }
  })
  // Flatten the Map into plain row objects for the Excel export.
  return Array.from(matches.entries()).map(([str1, { str2, score }]) => ({
    str1,
    str2,
    score,
  }))
}
// List every .docx file in the current working directory.
function getDocxFiles () {
  return fs
    .readdirSync('./')
    .filter((file) => file.endsWith('.docx'))
}
/**
 * Convert result rows into an XLSX worksheet.
 * The original accepted an unused `docxFiles` parameter; it has been dropped
 * (JS ignores extra arguments, so existing call sites keep working).
 * @param {Array<object>} data - rows to export, one object per sheet row
 * @returns worksheet object suitable for XLSX.utils.book_append_sheet
 */
function createWorksheet (data) {
  return XLSX.utils.json_to_sheet(data)
}
/**
 * Program entry point: ask for thresholds, extract embedded images from the
 * first two .docx files in the current directory, run the paragraph diff and
 * write the results to 文本相似度比较结果.xlsx.
 */
async function main () {
  // Clear output of any previous run.
  fs.rmSync('./img', { recursive: true, force: true })
  const startTime = Date.now()
  const defaultStep = 10
  const defaultSimilarity = 0.8
  // readline answers are strings — convert, falling back to the suggested
  // defaults on empty/invalid input.
  const similarityNum = Number(await question(`请输入相似度阈值(建议为${defaultSimilarity}):`)) || defaultSimilarity
  const step = Number(await question(`请输入步长(建议为${defaultStep}):`)) || defaultStep
  // Release stdin so the process can exit once main() finishes — the original
  // never closed the interface, which kept the event loop alive.
  rl.close()
  const docxFiles = getDocxFiles()
  if (docxFiles.length < 2) {
    console.error('当前目录下需要至少两个 .docx 文件')
    return
  }
  // The two extractions write to separate directories and are independent.
  await Promise.all([
    extractImagesFromWord(docxFiles[0], './img/original1/'),
    extractImagesFromWord(docxFiles[1], './img/original2/'),
  ])
  console.log('==========图片已经准备好,可以双击analysisPic.exe,进行图片处理了=============')
  // Image analysis is delegated to the external analysisPic.exe
  // (the Python script below), launched manually by the user.
  const res = await wordDiff(docxFiles[0], docxFiles[1], step, similarityNum)
  // Header row: file names plus a label for the score column.
  res.unshift({
    str1: docxFiles[0],
    str2: docxFiles[1],
    score: '相似度'
  })
  const wb = XLSX.utils.book_new()
  XLSX.utils.book_append_sheet(wb, createWorksheet(res), 'Sheet1')
  XLSX.writeFile(wb, '文本相似度比较结果.xlsx')
  // Elapsed time formatted HH:mm:ss.SSS via the ISO timestamp trick.
  console.log(`耗时:${new Date(Date.now() - startTime).toISOString().substr(11, 12)}`)
  console.log('已将结果输出至(文本相似度比较结果.xlsx)')
}
// Surface rejections instead of leaving the entry promise floating.
main().catch((err) => {
  console.error(err)
  process.exitCode = 1
})
extract_images_from_word.js
它主要是为了下面图片对比做铺垫
const fs = require('fs')
const path = require('path')
const unzipper = require('unzipper')
/**
 * Extract every image entry from a .docx file (a ZIP archive) into outputDir.
 * Fixes over the original:
 *  - file names include a per-call counter: `image_${Date.now()}` alone
 *    collided when several images arrived within the same millisecond,
 *    silently overwriting earlier extractions;
 *  - stream 'error' events now reject the promise (the try/catch around the
 *    stream setup could never catch asynchronous stream failures).
 * @param {string} docxFilePath - path to the .docx file
 * @param {string} outputDir - directory to write images into (created if missing)
 * @returns {Promise<string>} resolves with a success message once the archive is fully read
 */
async function extractImagesFromWord (docxFilePath, outputDir) {
  const IMAGE_EXTENSIONS = ['.png', '.jpg', '.jpeg', '.gif', '.bmp', '.emf', '.svg']
  return new Promise((resolve, reject) => {
    if (!fs.existsSync(outputDir)) {
      fs.mkdirSync(outputDir, { recursive: true })
    }
    let imageIndex = 0
    const fileStream = fs.createReadStream(docxFilePath)
    const zipFile = fileStream.pipe(unzipper.Parse())
    // Asynchronous failures (missing file, corrupt zip) must reject, not hang.
    fileStream.on('error', reject)
    zipFile.on('error', reject)
    zipFile.on('entry', entry => {
      const extension = path.extname(entry.path).toLowerCase()
      if (!IMAGE_EXTENSIONS.includes(extension)) {
        entry.autodrain() // non-image entry: drain so the zip stream keeps flowing
        return
      }
      const chunks = []
      entry.on('data', chunk => chunks.push(chunk))
      entry.on('end', () => {
        imageIndex += 1
        // Timestamp + counter guarantees a unique name per extracted image.
        const imgFileName = path.join(outputDir, `image_${Date.now()}_${imageIndex}${extension}`)
        fs.writeFileSync(imgFileName, Buffer.concat(chunks))
        console.log(`Image extracted to: ${imgFileName}`)
      })
    })
    // 'close' fires after every entry has been consumed.
    zipFile.on('close', () => {
      console.log('图片复制成功')
      resolve('图片复制成功')
    })
  })
}
module.exports = extractImagesFromWord
图片对比(python实现)
因为 word 中有的图片并不是 jpg、png,而是用 Visio 画出来的,所以 extract_images_from_word.js
导出的图片格式就是 emf。为了便于后续处理,我们需要统一图片格式为 png;又因为 Node.js 没有 emf 转 png 的库,所以我就将目光投向了 Python,毕竟是万能的 Python。
步骤1:将图片格式统一为png
步骤2:挨个分析图片相似度,这里采用的ssim(结构相似性)作为相似度
步骤3:将比较完认为是相似的图片组合到新的一张大图里
具体代码如下:
import os
import sys

import cv2
from PIL import Image, ImageDraw, ImageFont
from PIL import ImageFile
from skimage.metrics import structural_similarity as ssim
# Allow Pillow to load partially truncated images instead of raising.
# The flag lives on PIL.ImageFile — the original set it on PIL.Image,
# where it has no effect.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Disable the decompression-bomb pixel limit: tender documents embed huge images.
Image.MAX_IMAGE_PIXELS = None

# Read the two document names and the similarity threshold from argv;
# fall back to interactive input when launched without arguments
# (e.g. the packaged exe is double-clicked).
try:
    word1 = sys.argv[1]
    word2 = sys.argv[2]
    pic_similarityNum = sys.argv[3]
except IndexError:
    word1 = 'File1'
    word2 = 'File2'
    # The original also assigned 0.8 here, immediately overwritten by input().
    pic_similarityNum = input("请输入图片相似度阈值(建议为0.8):").strip().lower()

isFast = input("是否开启快速模式,适合内存较小的电脑,开启后精度会有所下降 (y/n):").strip().lower()
print(type(word1), word1)
print(type(word2), word2)
print(type(pic_similarityNum), pic_similarityNum)
def compare_images(image1_path, image2_path):
    """Return the SSIM similarity between two image files.

    Both images are resized to a common size (300x300 in fast mode, otherwise
    the element-wise maximum of the two sizes), converted to grayscale, and
    compared with structural similarity.

    Args:
        image1_path: path of the first image file.
        image2_path: path of the second image file.

    Returns:
        float: SSIM score, higher means more similar.

    Raises:
        ValueError: if either file cannot be decoded by OpenCV
            (cv2.imread returns None instead of raising — the original then
            crashed with an opaque AttributeError on `.shape`).
    """
    image1 = cv2.imread(image1_path)
    image2 = cv2.imread(image2_path)
    if image1 is None:
        raise ValueError('无法读取图片: ' + image1_path)
    if image2 is None:
        raise ValueError('无法读取图片: ' + image2_path)
    height1, width1, _ = image1.shape
    height2, width2, _ = image2.shape
    if isFast == 'y':
        # Fast mode: fixed small size — less memory, lower precision.
        target_height, target_width = 300, 300
    else:
        target_height = max(height1, height2)
        target_width = max(width1, width2)
    image1_resized = cv2.resize(image1, (target_width, target_height))
    image2_resized = cv2.resize(image2, (target_width, target_height))
    # SSIM is computed on grayscale images.
    gray_image1 = cv2.cvtColor(image1_resized, cv2.COLOR_BGR2GRAY)
    gray_image2 = cv2.cvtColor(image2_resized, cv2.COLOR_BGR2GRAY)
    return ssim(gray_image1, gray_image2)
# -------------------- Normalize all extracted images to PNG --------------------
dir_path1 = './img/original1'
dir_path2 = './img/original2'
new_path1 = './img/analysis_images1'
new_path2 = './img/analysis_images2'


def toPng(orgUrl, targetUrl):
    """Copy every image in orgUrl into targetUrl, converting non-PNG files to PNG.

    Uses os.path.splitext instead of the original str.split('.'), which raised
    ValueError for file names containing more than one dot.

    Args:
        orgUrl: source directory of extracted images.
        targetUrl: destination directory (created if missing).
    """
    os.makedirs(targetUrl, exist_ok=True)
    for file in os.listdir(orgUrl):
        name, ext = os.path.splitext(file)
        img_path = os.path.join(orgUrl, file)
        if ext.lower() == '.png':
            save_path = os.path.join(targetUrl, file)
        else:
            save_path = os.path.join(targetUrl, name + '.png')
        # NOTE(review): Pillow cannot decode .emf on most platforms — EMF
        # conversion may fail here; verify on the target environment.
        img = Image.open(img_path)
        img.save(save_path)
        print('图片格式统一为png =》' + save_path)
    print('图片转换完毕')


toPng(dir_path1, new_path1)
toPng(dir_path2, new_path2)
# -------------------- Pairwise image similarity analysis --------------------
num = 0
files1 = os.listdir(new_path1)
files2 = os.listdir(new_path2)
total = len(files1) * len(files2)
# Parse the threshold once instead of on every iteration of the O(n*m) loop.
threshold = float(pic_similarityNum)
result = []
for url1 in files1:
    for url2 in files2:
        num += 1
        print('图片分析进度:{:.2f}%'.format(num / total * 100))
        persent = compare_images(new_path1 + '/' + url1, new_path2 + '/' + url2)
        if persent > threshold:
            print('图片相似度大于' + str(pic_similarityNum) + ' url1:' + url1 + ',url2:' + url2 + ',相似度:' + str(persent))
            result.append({
                'url1': url1,
                'url2': url2,
                'similarity': persent
            })
# Sort matching pairs by similarity, highest first.
result = sorted(result, key=lambda x: x['similarity'], reverse=True)
print('图片相似度分析完毕!')
print('开始生成图片对比结果.........')
# -------------------- Stitch matching pairs into one comparison sheet --------------------
def create_comparison_image(pairs, target_size=(900, 900)):
    """Render each similar pair side by side, with file names and the score,
    into one tall PNG named 图片相似度比较结果.png.

    Fixes over the original:
      - the canvas height now includes the 40px gap advanced per pair
        (the original under-allocated, so later rows were silently cropped
        by paste(), which clips anything outside the canvas);
      - the canvas width includes the 40px gap between the two columns
        (the right image previously overflowed the canvas by 40px);
      - falls back to Pillow's default bitmap font when arial.ttf is absent;
      - an empty `pairs` list writes nothing instead of producing a
        degenerate zero-height image.

    Args:
        pairs: list of dicts with keys 'url1', 'url2', 'similarity'.
        target_size: (width, height) each image is resized to.
    """
    if not pairs:
        print('没有相似图片,跳过生成对比图')
        return
    try:
        font = ImageFont.truetype("arial.ttf", 40)
    except OSError:
        font = ImageFont.load_default()  # arial.ttf is not present on every system
    text_color = (0, 0, 0)   # black captions
    text_padding = 50        # left margin of the caption text
    text_height = 70         # height of the caption strip above each pair
    row_gap = 40             # vertical gap between pairs
    col_gap = 40             # horizontal gap between the two images
    row_height = target_size[1] + text_height + row_gap  # matches the per-pair y advance
    new_width = target_size[0] * 2 + col_gap
    new_height = row_height * len(pairs)
    new_img = Image.new('RGB', (new_width, new_height), color='gray')
    y_offset = 0
    for num, pair in enumerate(pairs, start=1):
        print('图片对比结果生成进度:{:.2f}%'.format(num / len(pairs) * 100))
        left_img = Image.open(new_path1 + '/' + pair['url1']).resize(target_size)
        right_img = Image.open(new_path2 + '/' + pair['url2']).resize(target_size)
        # Caption strip: left file name, right file name, similarity score.
        text_layer = Image.new('RGBA', (new_width, text_height), (255, 255, 255, 0))
        draw = ImageDraw.Draw(text_layer)
        draw.text((text_padding, 0), word1, fill=text_color, font=font)
        draw.text((target_size[0] + text_padding + col_gap, 0), word2, fill=text_color, font=font)
        draw.text((target_size[0] * 2 - 120, 0), str(pair['similarity']), fill=text_color, font=font)
        new_img.paste(left_img, (0, y_offset + text_height))
        new_img.paste(right_img, (target_size[0] + col_gap, y_offset + text_height))
        # Paste with the layer itself as mask to honor its transparency.
        new_img.paste(text_layer, (0, y_offset), text_layer)
        y_offset += row_height
    new_img.save('图片相似度比较结果.png')


create_comparison_image(result)
打包exe
nodejs打包exe
对于nodejs代码我采用的是pkg库
全局安装
npm i -g pkg
执行打包命令
pkg index.js --target win
python打包exe
conda install pyinstaller
或者
pip install pyinstaller
执行打包命令
pyinstaller -D .\你的python文件名.py
注意
pyinstaller 的问题也比较多,很有可能遇到如下报错:
File "c:\programdata\anaconda3\lib\site-packages\PyInstaller\utils\hooks\__init__.py", line 352, in get_module_attribute
raise AttributeError(
AttributeError: Module 'PyQt5' has no attribute '__version__'
方法一: 这个时候你换个conda环境往下降降python和pip版本然后再下pyinstaller再打包试试。
方法二:或者你直接去报错信息中提到的文件中
c:\programdata\anaconda3\lib\site-packages\PyInstaller\utils\hooks\__init__.py
,把那个报错代码改掉,我是下面这么改的,直接绕过PyQt5的检查报错
if attr_value == attr_value_if_undefined:
if module_name != "PyQt5":
raise AttributeError(
'Module %r has no attribute %r' % (module_name, attr_name))