介绍
采用微信的 hook 插件接收消息,解析微信发来的图片,通过 OCR 识别出其中的数据,然后将数据写入 Excel 表格。同时,有权限的人可以导出数据表格。
流程图
代码片
文本消息处理流程_robot.py
elif msg.type == 0x01: # Text message
# Excerpt from the message handler in robot.py: handles admin text commands.
# The original indentation is not preserved in this listing.
# wxids of the administrators allowed to use the commands below
dba_user_list = ['wxid_uev4klri3snh22','zhangzijian0715','yanheng1984','wxid_30173uk0ohjd21']
# 8812131740734078818 -> example value of the id field
# 'wxid_uev4klri3snh22' -> example value of the sender field (the author's own wxid)
self.LOG.info("发送人的ID:" + msg.sender + ",发送内容:" + msg.content)
# Commands reserved for administrators
if msg.sender in dba_user_list:
if msg.content.startswith('数据文件'):
# Send the data workbook back to the admin who asked for it
self.wcf.send_file(data_execl_path,msg.sender)
elif msg.content.startswith('添加人员'):
# Dict that collects the new user's fields
data_dict = {}
# Whitespace-separated tokens; exactly 4 are expected:
# command, user name, department, wxid
data_list = msg.content.split()
if len(data_list) == 4:
data_dict['user_name']= data_list[1]
data_dict['user_department']= data_list[2]
data_dict['user_wxid']= data_list[3]
insert_data_result, insert_cause_str = insert_user_data_info(data_dict)
if insert_data_result == False:
# NOTE(review): write_to_excel can return (False, None); concatenating
# None here would raise TypeError - confirm and guard if needed.
insert_fail_info = ('数据插入失败,请联系管理员处理,cause: ' + insert_cause_str)
self.sendTextMsg(insert_fail_info, msg.sender)
return
else:
insert_success_info = ('数据插入成功')
self.sendTextMsg(insert_success_info, msg.sender)
return
else:
# Wrong number of tokens in the command
self.sendTextMsg('数据格式错误',msg.sender)
return
return
# Idea (disabled): make config loading more flexible so the bot can reload its
# own configuration; a scheduled task could also trigger the reload.
# if msg.from_self():
# if msg.content == "^更新$":
# self.config.reload()
# self.LOG.info("已更新")
# else:
# self.toChitchat(msg) # chitchat
图片消息处理流程_robot.py
elif msg.type == 3: # Image message
# Excerpt from the message handler in robot.py: downloads the image, runs OCR
# on it and stores the extracted weekly summary.
# The original indentation is not preserved in this listing.
try:
##### Check that the sender is an authorized user
# A falsy result (no matching name) is treated as "not authorized".
user_name = get_user_name_info_by_wxid(msg.sender)
if not user_name:
self.sendTextMsg('你的账号暂未授权,请联系管理员授权账号\nwxid = ' + msg.sender, msg.sender)
return
# msg.extra is the .dat path of the full-size image, which has to be
# decrypted/converted; a different field holds the thumbnail.
# DONE: sleep to give the image time to be written to disk, otherwise the
# file is not found.
time.sleep(1)
###### Download the image and work out where the converted file lands
current_datetime = datetime.datetime.now()
formatted_datetime = current_datetime.strftime("%Y_%m").strip()
# Target directory: <cwd>/temp_pic/<YYYY_MM>
target_path = os.path.join(os.path.join(os.getcwd(), 'temp_pic'), formatted_datetime)
if not os.path.exists(target_path):
os.makedirs(target_path)
self.wcf.download_image(msg.id, msg.extra, target_path)
##### Run OCR on the downloaded image to extract the required fields
## Standard-accuracy recognition (disabled)
# data_dict = ocr_wx_pic(os.path.join(target_path, pic_name).replace('.dat', '.jpg'))
## High-accuracy OCR recognition
pic_name = os.path.basename(msg.extra)
# The converted image is expected under the same base name with a .jpg suffix
new_pic_path = os.path.join(target_path, pic_name).replace('.dat', '.jpg')
if not os.path.exists(new_pic_path):
self.sendTextMsg('当前网络繁忙,图片解析失败,请稍后重试', msg.sender)
self.LOG.error('当前网络繁忙,图片解析失败,请稍后重试,如有疑问请联系管理员')
return
### Extra filter: guard against OCR failures and images that do not match
### the expected layout (ocr_wx_pic_hign returns None in those cases)
data_dict = ocr_wx_pic_hign(new_pic_path)
if data_dict == None:
self.sendTextMsg('图片不符合规范,请上传本周小结的内容', msg.sender)
self.LOG.error('图片不符合规范,请上传本周小结的内容,如有疑问请联系管理员')
return
#### Insert the record only if this user's data for that week is not stored yet
data_is_exit = get_user_week_info_is_exit(data_dict['week_date'], user_name)
data_dict['user_name'] = user_name
if data_is_exit == False:
insert_data_result,insert_cause_str = insert_user_week_data_info(data_dict)
if insert_data_result == False:
# NOTE(review): insert_cause_str may be None when the workbook could not
# be read; the concatenation below would then raise - confirm.
insert_fail_info = ('数据插入失败,请联系管理员处理,cause: '+insert_cause_str+"\n"
+ '[数据总览]\n'
+ '姓名 : ' + user_name + "\n"
+ '日期 : ' + data_dict['week_date'] + "\n"
+ '工作会话数 : ' + str(data_dict['week_work_num']) + "次\n"
+ '花费时长 : ' + str(data_dict['week_work_total_min']) + "分钟\n"
+ '最晚时间 : ' + data_dict['week_final_last_time']
)
self.sendTextMsg(insert_fail_info, msg.sender)
return
else:
send_succ_info = ('数据插入成功\n'
+ '[数据总览]\n'
+ '姓名 : ' + user_name + "\n"
+ '日期 : ' + data_dict['week_date'] + "\n"
+ '工作会话数 : ' + str(data_dict['week_work_num']) + "次\n"
+ '花费时长 : ' + str(data_dict['week_work_total_min']) + "分钟\n"
+ '最晚时间 : ' + data_dict['week_final_last_time']
)
# Reply with the summary of what was stored
self.sendTextMsg(send_succ_info, msg.sender)
return
else:
# Record already present: reply with the recognized values and do not insert
repeat_info = ('数据已存在,不可重复添加\n'
+ '[数据总览]\n'
+ '姓名 : ' + user_name + "\n"
+ '日期 : ' + data_dict['week_date'] + "\n"
+ '工作会话数 : ' + str(data_dict['week_work_num']) + "次\n"
+ '花费时长 : ' + str(data_dict['week_work_total_min']) + "分钟\n"
+ '最晚时间 : ' + data_dict['week_final_last_time']
)
self.sendTextMsg(repeat_info, msg.sender)
return
except Exception as e:
# Any failure above is reported to the sender and logged with its traceback
self.sendTextMsg('图片处理失败,请联系管理员处理, cause ' + str(e), msg.sender)
self.LOG.exception("图片处理失败,请联系管理员处理: %s", e)
Excel表格的代码处理_csv_util.py
import pandas as pd
import os
import logging
import datetime
# Module-level logger for the Excel helpers.
logs = logging.getLogger("csv_util")

# Both workbooks live in a "data" directory under the current working
# directory, so the paths depend on where the process is started from.
data_path = os.path.join(os.getcwd(), 'data')
data_execl_path, user_execl_path = (
    os.path.join(data_path, workbook_name)
    for workbook_name in ('weixin_data.xlsx', 'user_info.xlsx')
)
def read_excel(file_path):
    """Read an Excel workbook into a DataFrame.

    :param file_path: path of the workbook to read
    :return: the DataFrame produced by ``pd.read_excel``, or ``None`` when
        anything goes wrong (the error is logged, not raised)
    """
    try:
        # Kept from the original: relax the file permissions before reading.
        # NOTE(review): presumably this clears a read-only flag; 0o777 is
        # broader than reading requires - confirm whether it is still needed.
        os.chmod(file_path, 0o777)
        return pd.read_excel(file_path)
    except Exception as e:
        # logging formats lazily with %-style placeholders; the previous "{}"
        # placeholders made record formatting fail and the message was lost.
        logs.error("Error reading Excel file: %s , cause %s", file_path, e)
        return None
def write_to_excel(file_path, new_row):
    """Append one row to sheet ``Sheet1`` of an existing workbook.

    The workbook is read in full, the new row is concatenated to it and the
    whole frame is written back over the sheet.

    :param file_path: path of the workbook to update
    :param new_row: ``pd.Series`` whose index holds the column names
    :return: ``(True, None)`` on success, otherwise ``(False, cause)`` where
        ``cause`` is always a string, so callers can concatenate it into a
        user-facing message
    """
    def _log_row(level):
        # Dump the row that was (or was meant to be) written, for diagnostics.
        for key, value in new_row.items():
            logs.log(level, "Key: %s, Value: %s", key, value)

    try:
        df = read_excel(file_path)
        if df is None:
            _log_row(logging.ERROR)
            logs.warning("Failed to read Excel file., file info %s ", file_path)
            # Previously returned None as the cause, which made callers that
            # build "cause: " + cause fail with TypeError.
            return False, "failed to read Excel file: " + str(file_path)

        df = pd.concat([df, new_row.to_frame().T], ignore_index=True)
        # Append mode with 'overlay' rewrites Sheet1 in place and keeps any
        # other sheets of the workbook.
        with pd.ExcelWriter(file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
            df.to_excel(writer, index=False, sheet_name='Sheet1')
        # A successful write is not an error: log the row at INFO level.
        _log_row(logging.INFO)
        logs.info("Data written successfully. ")
        return True, None
    except Exception as e:
        _log_row(logging.ERROR)
        # %-style placeholders: "{}" is never substituted by logging.
        logs.exception("Error writing to Excel file., file info %s ,cause info %s",
                       file_path, e)
        return False, str(e)
def query_excel(file_path, field, value, return_field=None):
    """Select the rows of a workbook whose ``field`` column equals ``value``.

    :param file_path: path of the workbook to query
    :param field: name of the column to filter on
    :param value: value the column must equal
    :param return_field: optional column name; when given, only the value of
        that column in the first matching row is returned, as a string
    :return: without ``return_field``, the DataFrame of matching rows; with
        ``return_field``, the first match as ``str`` (``''`` when nothing
        matches or the cell is empty); ``False`` when the workbook cannot be
        read or the query fails
    """
    try:
        df = read_excel(file_path)
        if df is None:
            # %-style placeholders: "{}" is never substituted by logging.
            logs.warning("Failed to read Excel file %s no found , field %s , value %s , return_field %s",
                         file_path, field, value, return_field)
            return False

        result = df[df[field] == value]
        if not return_field:
            return result

        return_values = result[return_field]
        if return_values.empty:
            return ''
        first_value = return_values.iloc[0]
        # The old ''.join([...]) raised on non-string cells (numbers, NaN),
        # which turned a successful match into a False result.
        return '' if pd.isna(first_value) else str(first_value)
    except Exception as e:
        logs.exception(
            "Error querying Excel file %s , field %s , value %s , return_field %s, cause %s",
            file_path, field, value, return_field, e)
        return False
def get_user_name_info_by_wxid(wxid):
    """Look up a user's name in the user workbook by WeChat id.

    Returns whatever ``query_excel`` yields for the '姓名' column of the row
    whose '微信唯一标识' column equals ``wxid``.
    """
    user_name = query_excel(
        user_execl_path,
        field='微信唯一标识',
        value=wxid,
        return_field='姓名',
    )
    return user_name
def get_user_week_info_is_exit(data_str, user_name):
    """Tell whether a weekly record already exists for a user.

    :param data_str: value to match against the '日期' column
    :param user_name: name to look for in the '姓名' column of the matches
    :return: ``True`` when a row with that date and name exists; ``False``
        otherwise, including when the workbook could not be queried
    """
    result_df = query_excel(data_execl_path, '日期', data_str)
    # query_excel reports failure with False (never None), so the previous
    # "is None" guard never fired and False['姓名'] raised TypeError.
    if not isinstance(result_df, pd.DataFrame):
        return False
    return bool(user_name in result_df['姓名'].values)
def insert_user_data_info(data_dict):
    """Store one user record in the user workbook.

    Reads 'user_name', 'user_department' and 'user_wxid' from ``data_dict``,
    adds the current timestamp and returns the result of ``write_to_excel``.
    """
    stored_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # (workbook column, key in data_dict), in column order
    column_sources = (
        ('姓名', 'user_name'),
        ('部门', 'user_department'),
        ('微信唯一标识', 'user_wxid'),
    )
    record = {column: data_dict[source] for column, source in column_sources}
    record['入库时间'] = stored_at
    return write_to_excel(user_execl_path, pd.Series(record))
def insert_user_week_data_info(data_dict):
    """Store one weekly-summary record in the data workbook.

    Reads the name, date, session count, total minutes, latest time and year
    from ``data_dict``, adds the current timestamp and returns the result of
    ``write_to_excel``.
    """
    stored_at = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # (workbook column, key in data_dict), in column order
    column_sources = (
        ('姓名', 'user_name'),
        ('日期', 'week_date'),
        ('工作会话数', 'week_work_num'),
        ('花费时长', 'week_work_total_min'),
        ('最晚时间', 'week_final_last_time'),
        ('年份', 'data_year'),
    )
    record = {column: data_dict[source] for column, source in column_sources}
    record['入库时间'] = stored_at
    return write_to_excel(data_execl_path, pd.Series(record))
# Example usage
if __name__ == "__main__":
    # Manual check: does a weekly record exist for this date and name?
    # (The permission lookup can be tried the same way with
    # get_user_name_info_by_wxid('wxid_uev4klri3snh22').)
    week_exists = get_user_week_info_is_exit(20240603, '高垣')
    if week_exists is not None:
        print(week_exists)
图片ocr的图片处理_baidu_ocr.py
import base64
import urllib
import requests
import json
import re
import datetime
import os
import logging
import datetime
# Baidu OCR credentials. They are read from the environment so real keys do
# not have to be committed; the original placeholders remain the defaults.
API_KEY = os.environ.get("BAIDU_OCR_API_KEY", "XXXX")
SECRET_KEY = os.environ.get("BAIDU_OCR_SECRET_KEY", "XXXX")
# Module-level logger for the OCR helpers.
logs = logging.getLogger("baidu_ocr")
def main():
    """Send one hard-coded sample image to the general OCR endpoint.

    Development helper only: the image path is a fixed local file.

    :return: the decoded JSON response on HTTP 200, otherwise ``None``
        (the failure is logged)
    """
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general?access_token=" + get_access_token()
    # The image is sent base64-encoded and URL-encoded in a form body.
    # "&paragraph" had been garbled into a pilcrow character, which dropped
    # the paragraph parameter and corrupted the previous one.
    payload = ('image='
               + get_file_content_as_base64(r"E:\PythonCode\WeChatRobot\temp_pic\15866dbd4118eb7638c9a13b430dadf1.jpg", True)
               + "&detect_direction=false&detect_language=false&vertexes_location=false&paragraph=false&probability=false")
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
    if response.status_code == 200:
        return json.loads(response.text)
    # %-style placeholders: "{}" is never substituted by logging.
    logs.error('接口请求失败。status_code %s , reason %s', response.status_code, response.reason)
    return None
def get_file_content_as_base64(path, urlencoded=False):
    """Return the base64 encoding of a file's content.

    :param path: path of the file to read (opened in binary mode)
    :param urlencoded: when true, additionally URL-encode the base64 text so
        it can be placed in a form-encoded request body
    :return: the base64 (and optionally URL-encoded) text
    """
    # "import urllib" alone does not load the urllib.parse submodule; it only
    # worked when another import happened to load it first.
    import urllib.parse

    with open(path, "rb") as f:
        content = base64.b64encode(f.read()).decode("utf8")
    if urlencoded:
        # base64 output may contain '+', '/' and '=', which must be escaped
        content = urllib.parse.quote_plus(content)
    return content
def get_access_token():
    """Request an access token from the API key and secret key.

    :return: the ``access_token`` field of the response converted with
        ``str``; when the field is missing this is the string ``"None"``,
        not the ``None`` object
    """
    url = "https://aip.baidubce.com/oauth/2.0/token"
    params = {"grant_type": "client_credentials", "client_id": API_KEY, "client_secret": SECRET_KEY}
    # A timeout keeps a stalled connection from blocking the caller forever.
    response = requests.post(url, params=params, timeout=10)
    return str(response.json().get("access_token"))
def ocr_wx_pic(pic_full_path):
    """Run standard-accuracy OCR on an image and parse the weekly summary.

    :param pic_full_path: path of the image file to recognize
    :return: the dict produced by ``parse_pic_data``, or ``None`` when the
        request does not return HTTP 200 or the image is not recognized
    """
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general?access_token=" + get_access_token()
    # "&paragraph" had been garbled into a pilcrow character, which dropped
    # the paragraph parameter and corrupted the previous one.
    payload = ('image=' + get_file_content_as_base64(pic_full_path, True)
               + "&detect_direction=false&detect_language=false&vertexes_location=false&paragraph=false&probability=false")
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    # A timeout keeps a stalled connection from blocking the message handler.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
    if response.status_code == 200:
        # Extract the summary fields from the OCR response
        return parse_pic_data(response.text)
    # Use the module logger (like ocr_wx_pic_hign) instead of print
    logs.error('接口请求失败。status_code %s , reason %s', response.status_code, response.reason)
    return None
# High-accuracy variant
def ocr_wx_pic_hign(pic_full_path):
    """Run high-accuracy OCR on an image and parse the weekly summary.

    :param pic_full_path: path of the image file to recognize
    :return: the dict produced by ``parse_pic_data``, or ``None`` when the
        request does not return HTTP 200 or the image is not recognized
    """
    url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic?access_token=" + get_access_token()
    # "&paragraph" had been garbled into a pilcrow character, which dropped
    # the paragraph parameter and corrupted the previous one.
    payload = ('image=' + get_file_content_as_base64(pic_full_path, True)
               + "&detect_direction=false&paragraph=false&probability=false")
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Accept': 'application/json'
    }
    # A timeout keeps a stalled connection from blocking the message handler.
    response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
    if response.status_code == 200:
        # Extract the summary fields from the OCR response
        return parse_pic_data(response.text)
    # %-style placeholders: "{}" is never substituted by logging.
    logs.error('接口请求失败。status_code %s , reason %s', response.status_code, response.reason)
    return None
def parse_pic_data(orcStr):
    """Extract the weekly-summary fields from an OCR JSON response.

    The response is scanned for the label lines '一周小结', '处理工作会话',
    '总共花了' and '最晚时间'. The date is taken from the label line itself;
    the other values are taken from the line that follows their label.

    :param orcStr: JSON text of the OCR response
    :return: dict with 'week_date', 'week_work_num', 'week_work_total_min',
        'week_final_last_time' and 'data_year', or ``None`` when the response
        has no usable ``words_result`` list, the line count is outside 5..50,
        or any of the fields could not be extracted
    :raises ValueError: when ``orcStr`` is not valid JSON
    """
    pic_str = json.loads(orcStr)
    # Error payloads carry no "words_result"; treat them as unrecognized
    # instead of failing with KeyError.
    pic_list = pic_str.get('words_result') if isinstance(pic_str, dict) else None
    if not isinstance(pic_list, list):
        return None

    # TODO: more than 50 lines means the image is not the expected summary
    words_num = pic_str.get('words_result_num', len(pic_list))
    if words_num > 50 or words_num < 5:
        return None

    data_dict = {}
    line_count = len(pic_list)
    # Unrelated lines (status bar text, QR-code caption, ...) are skipped
    for idx, pic in enumerate(pic_list):
        words = pic['words']
        if '一周小结' in words:
            data_dict['week_date'] = words.replace('一周小结', "")
            continue
        # The remaining labels need a following line; a label on the very
        # last line used to raise IndexError.
        if idx + 1 >= line_count:
            continue
        next_words = pic_list[idx + 1]['words']
        if '处理工作会话' in words:
            session_count = extract_integer(next_words)
            # Do not store None: a value that could not be read must make
            # the 5-field check below fail.
            if session_count is not None:
                data_dict['week_work_num'] = session_count
        elif '总共花了' in words:
            total_minutes = extract_integer(next_words)
            if total_minutes is not None:
                data_dict['week_work_total_min'] = total_minutes
        elif '最晚时间' in words:
            data_dict['week_final_last_time'] = next_words

    # The year is not part of the picture; use the current year
    data_dict['data_year'] = datetime.datetime.today().year
    if len(data_dict) != 5:
        return None
    return data_dict
def extract_integer(text):
    """Return the first run of digits in ``text`` as an int.

    :param text: string to search
    :return: the integer value of the first digit run, or ``None`` when the
        string contains no digits
    """
    first_digits = re.search(r'\d+', text)
    return int(first_digits.group()) if first_digits else None
if __name__ == '__main__':
    # Offline check of the parser against a captured OCR response, so no
    # request is sent; calling main() instead would hit the OCR endpoint.
    sample_response = '{"words_result":[{"words":"中国移动15:36●","location":{"top":23,"left":200,"width":257,"height":81}},{"words":"中国联通","location":{"top":70,"left":213,"width":96,"height":24}},{"words":"@86令959l92乡","location":{"top":46,"left":667,"width":370,"height":44}},{"words":"一周小结5.27-5.31","location":{"top":424,"left":309,"width":358,"height":44}},{"words":"66","location":{"top":573,"left":242,"width":49,"height":37}},{"words":"金星虽然是不发光的行","location":{"top":570,"left":343,"width":502,"height":51}},{"words":"星,但却比许多恒星还要","location":{"top":640,"left":343,"width":555,"height":51}},{"words":"明亮","location":{"top":710,"left":341,"width":100,"height":51}},{"words":"处理工作会话","location":{"top":813,"left":341,"width":218,"height":44}},{"words":"826次","location":{"top":880,"left":341,"width":136,"height":46}},{"words":"总共花了","location":{"top":998,"left":341,"width":143,"height":44}},{"words":"441分钟","location":{"top":1066,"left":341,"width":177,"height":46}},{"words":"最晚时间","location":{"top":1190,"left":343,"width":141,"height":37}},{"words":"周三深夜11:25","location":{"top":1251,"left":345,"width":307,"height":51}},{"words":"回叠回","location":{"top":1620,"left":718,"width":172,"height":54}},{"words":"企业微信","location":{"top":1756,"left":184,"width":146,"height":44}},{"words":"分享到朋友圈","location":{"top":2095,"left":218,"width":213,"height":33}},{"words":"保存到相册","location":{"top":2092,"left":674,"width":177,"height":37}}],"words_result_num":18,"log_id":1797510538274524905}'
    parse_pic_data(sample_response)
部署流程
安装python_3.9
如果已经安装过 Python 3.9,可略过此步骤。
安装微信
- 先下载一个 3.9.10.19版本, 32位的。
- 登陆以后,设置里面点击 更新
- 升级到最新版,然后拿23版本 覆盖一下 就ok了
安装python依赖
4. 安装依赖
```sh
# 升级 pip
python -m pip install -U pip
# 安装必要依赖
pip install -r requirements_v1.txt
pip install baidu-aip
pip install pandas openpyxl
pip install xlrd
pip install pymem
```
运行微信
```sh
python main.py
# 需要停止按 Ctrl+C
```
已支持功能
- 图片ocr识别
- 添加人员
命令: 添加人员 xxx xxx服务部 wxid_huwcf7p637mxxx
- 查看Excel文件
命令:数据文件 (仅管理员权限的人支持)