最近在做一个文档分析系统,需求是:在上传的两千份左右的 Office 文件中,筛选出包含指定关键词(关键词约一千个)的文件。我以前的主业是 PHP 开发,但 PHP 在处理这类任务时有些“力不从心”,正好我研究了小半年的 Python,于是希望用 Python 和 PHP 混合开发,简单架构如下:
(为啥有Node,因为后边调用了 textract 来处理某些文件)
前台 PHP 上传那块不做赘述,就是常规的 CRUD 和 Upload;重点是后边 Python + Node 处理这一块,下面贴代码。
环境:CentOS 6.9、Python 3.6、Node 10.16、LibreOffice
框架:ThinkPHP 5.1、Layui
依赖都在代码声明里了。
# -*- coding: UTF-8 -*-
import sys
import getopt
import pymysql
import os
import re
import subprocess
import time
# 这个文件是处理主逻辑, 基本思路是将所有文件都转为 xhtml, 部分 LibreOffice 转不了的文件交给 textract 处理, 双保险; 再处理不了的, 返回提示, 让用户处理一下文件, 这种文件数量极少.
# init
# Open the MySQL connection shared by every function in this script.
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts
# positional arguments to connect().
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
db = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="root",
    database="document",
)
# Module-level cursor used for all queries below.
cursor = db.cursor()
# 接受一个参数, 文件id
def main(argv):
    """Parse the command-line options into a parameter dict.

    Args:
        argv: Argument list without the program name (``sys.argv[1:]``).

    Returns:
        A dict with two keys: ``'type'`` (0 when neither option was given,
        1 for ``-i/--id``, 2 for ``-p/--path``) and ``'args'`` (the raw
        option value, or ``''``). If both options are given, the last one
        on the command line wins.

    Exits with status 3 on an unknown or malformed option, and with
    status 0 after printing the help text for ``-h/--help``.
    """
    try:
        opts, _ = getopt.getopt(argv, "i:p:h", ["id=", "path=", "help"])
    except getopt.GetoptError:
        print('Wrong opts! you can use do.py -h to get help')
        sys.exit(3)

    params = {'type': 0, 'args': ''}
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print('if you want to deal files by id, this order will according to these id')
            print('main.py -i <ex:1,2,3,4,5,6,7,8>')
            print('if you want to deal files by path, this order will according to this path , get all files to deal')
            print('main.py -p <ex: ./pdf/>')
            sys.exit()
        # getopt reports long options WITHOUT the trailing '=' that is used
        # when declaring them, so compare against '--id' / '--path'.
        elif opt in ('-i', '--id'):
            params = {'type': 1, 'args': arg}
        elif opt in ('-p', '--path'):
            params = {'type': 2, 'args': arg}
    return params
def getFile(params):
    """Look up the requested files in ``doc_file`` and process each one.

    Only id-based requests (``params['type'] == 1``) are handled; any other
    type returns without doing anything. ``params['args']`` is either the
    literal ``'all'`` (every row whose ``delete_time`` is null) or a
    comma-separated list of integer ids. Each selected ``(id, path)`` row
    is handed to ``do``.

    Args:
        params: Dict with the keys ``'type'`` and ``'args'``.

    Raises:
        ValueError: If the id list contains anything that is not an integer.
    """
    if params['type'] != 1:
        return

    if params['args'] == 'all':
        cursor.execute("SELECT id, path from doc_file where delete_time is null")
    else:
        # The id list arrives from the command line, so validate it and bind
        # it as query parameters instead of splicing it into the SQL text.
        tokens = [tok.strip() for tok in params['args'].split(',')]
        try:
            ids = [int(tok) for tok in tokens if tok]
        except ValueError:
            raise ValueError(
                'ids must be comma-separated integers: %r' % (params['args'],)
            ) from None
        if not ids:
            # Nothing to select; "IN ()" would be a SQL syntax error.
            return
        placeholders = ', '.join(['%s'] * len(ids))
        cursor.execute(
            "SELECT id, path from doc_file where id IN (" + placeholders + ")",
            tuple(ids),
        )

    for file_id, path in cursor.fetchall():
        do(file_id, path)
def do(id, path):
    """Convert one document to XHTML and register it in ``doc_content``.

    ``dealFormat`` is tried first. If the path it returns exists, a
    ``doc_content`` row is inserted for the file. Otherwise the ``textract``
    command is run on ``path``, its output is written to that path, the row
    is inserted, and the file id is appended to ``bad.txt`` (ids separated
    by ``|``).

    Args:
        id: Primary key of the file in ``doc_file``.
        path: File path as stored in ``doc_file``.
    """
    insert_sql = "INSERT INTO doc_content(doc_id) VALUES (%s)"

    x_path = dealFormat(path)
    if os.path.exists(x_path):
        cursor.execute(insert_sql, (id,))
        db.commit()
        return

    # Fallback: let textract extract the text.
    # subprocess.run() drains stdout while waiting; polling poll() before
    # reading a PIPE can deadlock once the output fills the pipe buffer.
    # NOTE(review): textract receives `path` as stored in the database,
    # without the r_path prefix that dealFormat adds - confirm this matches
    # the working directory the script is started from.
    print('textract Waiting...')
    result = subprocess.run(
        ["textract", path],
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
    )
    content = result.stdout
    print(content)
    text = content.decode('utf-8', 'ignore')

    # Write UTF-8 explicitly: the search script reads the cache as UTF-8,
    # while the platform default encoding depends on the locale.
    with open(x_path, 'w', encoding='utf-8') as out:
        out.write(text)

    cursor.execute(insert_sql, (id,))
    db.commit()

    # Record the ids that needed the fallback so they can be reviewed.
    with open('bad.txt', 'a') as bad_log:
        bad_log.write(str(id) + '|')
    print('This file is bad:', id)
def dealFormat(path):
    """Convert a document to XHTML in the cache directory and return its path.

    The source file is ``r_path + path``. The target is
    ``r_path + 'cache3/' + <base name>.xhtml``. ``.html`` sources are copied
    with ``cp``; everything else is converted with LibreOffice's ``soffice``.
    If the target already exists, no conversion is run.

    Args:
        path: File path relative to the module-level ``r_path``.

    Returns:
        The path of the cached XHTML file. It is returned whether or not the
        conversion succeeded, so the caller must check that it exists.
    """
    src_path = r_path + path
    stem, ext = os.path.splitext(src_path)
    cache_dir = r_path + r'cache3/'
    x_path = cache_dir + os.path.basename(stem + '.xhtml')

    if os.path.exists(x_path):
        # Return the real path on a cache hit too; returning a message string
        # here would make the caller treat the file as unconverted.
        print('This file has exists, pass')
        return x_path

    print('Now:', src_path)
    if ext == '.html':
        command = [r"cp", src_path, x_path]
    else:
        command = [
            r"/opt/libreoffice6.1/program/soffice",
            "--convert-to", "xhtml",
            "--outdir", cache_dir,
            src_path,
        ]
    print('soffice Waiting...')
    # run() blocks until the child process exits, so no polling loop is needed.
    subprocess.run(command)
    return x_path
# 读html
def read(path=''):
    """Return the raw bytes of the file at ``path``.

    Args:
        path: Path of the file to read.

    Returns:
        The file content as ``bytes``; ``False`` when ``path`` is empty;
        ``None`` (after printing ``error``) when the file cannot be read.
    """
    if not path:
        return False
    try:
        # The context manager closes the handle even if read() fails.
        with open(path, "rb") as fp:
            return fp.read()
    except OSError:
        print('error')
        return None
if __name__ == "__main__":
    # Root directory of the website; dealFormat() prefixes file paths with it.
    r_path = r"/www_2/wwwroot/document/public/"
    # Parse the CLI options, then process the selected files.
    params = main(sys.argv[1:])
    getFile(params)
# -*- coding: UTF-8 -*-
# 文件获取两个参数 -i [文件id] -c [公司id]
import sys
import getopt
import pymysql
import os
# 这个文件是搜索关键词的逻辑: 直接让 Python 读取上一步处理好的 xhtml 文件, 利用 Python 处理字符串快的优势, 返回所需格式
# init
# Open the MySQL connection used by the keyword search.
# Keyword arguments are used because PyMySQL 1.0+ no longer accepts
# positional arguments to connect().
# NOTE(review): credentials are hard-coded; consider loading them from
# configuration or the environment.
db = pymysql.connect(
    host="127.0.0.1",
    user="root",
    password="root",
    database="document",
)
# Module-level cursor used for all queries below.
cursor = db.cursor()
# 接受两个参数 文件id 公司id
def main(argv):
    """Parse the command-line options of the keyword-search script.

    Args:
        argv: Argument list without the program name (``sys.argv[1:]``).

    Returns:
        A dict that holds ``'id'`` when ``-i/--id`` was given and ``'cid'``
        when ``-c/--cid`` was given, each mapped to the raw option value.
        Options that were not supplied are absent from the dict.

    Exits with status 3 on an unknown or malformed option, and with
    status 0 after printing the help text for ``-h/--help``.
    """
    try:
        opts, _ = getopt.getopt(argv, "i:c:h", ["id=", "cid=", "help"])
    except getopt.GetoptError:
        print('Wrong opts! you can use do.py -h to get help')
        sys.exit(3)

    params = {}
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            print('if you want to deal files by id, this order will according to these id')
            print('main.py -i <ex:1,2,3,4,5,6,7,8> -c <ex:1,2,3,4,5,6,7,8>')
            sys.exit()
        # getopt reports long options WITHOUT the trailing '=' that is used
        # when declaring them, so compare against '--id' / '--cid'.
        elif opt in ('-i', '--id'):
            params['id'] = arg
        elif opt in ('-c', '--cid'):
            params['cid'] = arg
    return params
def got(params):
    """Print which keywords occur in which converted documents.

    The cached XHTML file of every requested document is read as UTF-8
    (undecodable bytes are ignored) and searched for every non-deleted
    keyword of the requested companies. The result is printed as a list of
    ``[file_id, company_id, keyword]`` entries; an empty list is printed
    when either id list is empty.

    Args:
        params: Dict with ``'id'`` (comma-separated file ids) and ``'cid'``
            (comma-separated company ids).

    Raises:
        KeyError: If ``'id'`` or ``'cid'`` is missing from ``params``.
        ValueError: If either list contains a non-integer entry.
    """
    # Both lists come from the command line: validate them and bind them as
    # query parameters instead of splicing them into the SQL text.
    file_ids = _parse_id_csv(params['id'])
    company_ids = _parse_id_csv(params['cid'])
    if not file_ids or not company_ids:
        # Nothing can match, and "IN ()" would be a SQL syntax error.
        print([])
        return

    file_marks = ', '.join(['%s'] * len(file_ids))
    cursor.execute(
        "SELECT path, id FROM doc_file WHERE id IN (" + file_marks + ")",
        tuple(file_ids),
    )
    xhtml_files = []
    for path, file_id in cursor.fetchall():
        # Map the stored path to the cached file: <r_path>/cache3/<base>.xhtml
        stem, _ = os.path.splitext(path)
        cached_name = os.path.basename(stem + '.xhtml')
        xhtml_files.append((os.path.join(r_path, 'cache3', cached_name), file_id))

    company_marks = ', '.join(['%s'] * len(company_ids))
    cursor.execute(
        "SELECT company_id, name FROM doc_keyword WHERE delete_time is null"
        " AND company_id IN (" + company_marks + ")",
        tuple(company_ids),
    )
    keywords = [(name, company_id) for company_id, name in cursor.fetchall()]

    has_key = []
    for xhtml_path, file_id in xhtml_files:
        with open(xhtml_path, 'r', encoding='utf-8', errors='ignore') as xhtml:
            text = xhtml.read()
        has_key.extend(
            [file_id, company_id, word]
            for word, company_id in keywords
            if word in text
        )
    print(has_key)


def _parse_id_csv(raw):
    """Convert a comma-separated id string into a list of ints.

    Blank entries are skipped; a non-integer entry raises ValueError.
    """
    tokens = [tok.strip() for tok in raw.split(',')]
    try:
        return [int(tok) for tok in tokens if tok]
    except ValueError:
        raise ValueError(
            'ids must be comma-separated integers: %r' % (raw,)
        ) from None
if __name__ == "__main__":
    # Root directory of the website; got() looks for cached files below it.
    r_path = r"/www_2/wwwroot/document/public/"
    # Parse the CLI options, then run the keyword search.
    params = main(sys.argv[1:])
    got(params)
至于调用,在 PHP 中用 exec 调用即可,不过要注意 exec 的安全问题(例如对传入的参数做校验和转义)。