import re
import xlrd
import openpyxl
import datetime
import os
# 检查结果表excel是否存在,没有就新建空excel,有则删掉重新建立一个excel
def exists_file(file):
if os.path.exists(file):
print(f"{file}文件存在")
os.remove(file)
else:
print(f"{file}文件不存在")
new_excel = openpyxl.Workbook()
new_excel.save(file)
# 获取订单号,匹配的规则是,"Order Number "这个字符串后面连续的数字部分,即为订单号,取第一页的订单号就可以
def get_order_no(text):
pattern = r"Order Number {1,}([0-9]{1,})"
order_nos = re.findall(pattern, text)
return order_nos[0]
# 获取客户公司,匹配的规则是,"Bill To "这个字符串后面除了换行符以外的所有字符,即为客户公司名称,取第一页的客户公司名称就可以
#def get_company_name(text):
# pattern = r"Bill To {1,3}(.*)"
# company_names = re.findall(pattern, text)
# return company_names[0]
def get_company_address(page_one_text):
real_address = ""
"""匹配第一页中Shipped From和Phone之间的所有字符串"""
pattern = r"Shipped From([\s\S]*)Phone"
try:
address = re.findall(pattern, page_one_text)[0]
for every_line in address.split("\n"):
# 如果行中包含Ship To 则匹配Ship和Bill之间的内容
if "Bill To" in every_line:
pattern = r"Bill To {1,3}(.*)"
temp = re.findall(pattern, every_line)
real_address += temp[0].replace("To", "").strip() + '\n'
# 如果包含了Phone,则不进行处理
elif "Phone" in every_line:
pass
# 如果是其他,则进行地址获取
else:
line_list = every_line.split(" ")
#print(line_list)
no_empty_strs = [i for i in line_list if i != '']
#print(no_empty_strs)
# 判断当前行只有一段连续字符串
if len(no_empty_strs) == 1:
# 如果这段字符串的位置是在中间,则把这段字符加入地址字符串
if line_list.index(no_empty_strs[0]) > 20:
real_address += no_empty_strs[0].strip() + '\n'
# 判断有1段段连续字符串
elif len(no_empty_strs) == 2:
# 如果这段字符串(真实有效的字符)的位置是在中间,则把这段字符加入地址字符串
if line_list.index(no_empty_strs[1]) > 20:
real_address += no_empty_strs[1].strip() + '\n'
elif len(no_empty_strs) == 3:
# 如果这段字符串(真实有效的字符)的位置是在中间,则把这段字符加入地址字符串
real_address += no_empty_strs[2].strip() + '\n'
else:
pass
except:
pass
return real_address
# 获取客户公司,匹配的规则是,"Bill To "这个字符串后面除了换行符以外的所有字符,即为客户公司名称,取第一页的客户公司名称就可以
def get_order_date(text):
pattern = r"Ordered {1,5}([0-9]{1,}[/][0-9]{1,}[/][0-9]*)"
order_date = re.findall(pattern, text)
return order_date[0].strip()
# 获取地址
def get_address(page_one_text):
real_address = ""
"""匹配第一页中Shipped From和Phone之间的所有字符串"""
pattern = r"Shipped From([\s\S]*)Phone"
address = re.findall(pattern, page_one_text)[0]
for every_line in address.split("\n"):
# 如果行中包含Ship To 则匹配Ship和Bill之间的内容
if "Ship To" in every_line:
pattern = r"Ship([\s\S]*)Bill"
temp = re.findall(pattern, every_line)
real_address += temp[0].replace("To", "").strip() + '\n'
# 如果包含了Phone,则不进行处理
elif "Phone" in every_line:
pass
# 如果是其他,则进行地址获取
else:
line_list = every_line.split(" ")
no_empty_strs = [i for i in line_list if i != '']
# 判断当前行只有一段连续字符串
if len(no_empty_strs) == 1:
# 如果这段字符串的位置是在中间,则把这段字符加入地址字符串
if line_list.index(no_empty_strs[0]) > 8 and line_list.index(no_empty_strs[0]) < 20:
real_address += no_empty_strs[0].strip() + '\n'
# 判断有1段段连续字符串
elif len(no_empty_strs) > 1:
# 如果这段字符串(真实有效的字符)的位置是在中间,则把这段字符加入地址字符串
if line_list.index(no_empty_strs[0]) > 8 and line_list.index(no_empty_strs[0]) < 20:
real_address += no_empty_strs[0].strip() + '\n'
elif line_list.index(no_empty_strs[1]) > 8 and line_list.index(no_empty_strs[1]) < 20:
real_address += no_empty_strs[1].strip() + '\n'
else:
pass
return real_address
# 匹配物料中的每一行数据内容,比如:1.000 0 EM9,PVT UNIT,SSS SUZHOU 2 EA 1.2400 EA 2.48 9/3/2021
def get_po_details(text):
# 找到每一页中的明细表格
po_details = []
data = text.split("PURCHASE ORDER")
for every_page in data:
if "By" in every_page:
pattern = r"UM {1,}(By[\s\S]*)"
po_details = po_details+re.findall(pattern, every_page)
# 定义一个新的list接受每行的明细
details_list = []
# 定义一个新的list接受每行的真实数据
result_list = []
for po_detail in po_details:
# 将每一页中的明细数据提取出来,every_page_info是指的每一页明细数据字段字符串内容
if "Sales Tax" in po_detail and ".000" in po_detail:
pattern = r"By([\s\S]*?)Sales Tax"
try:
every_page_info = re.findall(pattern, po_detail)[0]
except:
every_page_info = ""
elif "Sales Tax" in po_detail and ".000" not in po_detail:
every_page_info = ""
else:
every_page_info = po_detail.replace("By", "")
# 定义一个变量temp,目的是为了保存没一行的明细数据
temp = ""
if every_page_info:
line_totals = every_page_info.split("\n")
# 遍历每一行的数据,对其进行切割,将每一行数据之后的数据切割好之后,就将它写入到details_list这个数组中
for index in range(len(line_totals)):
if ".000 " in line_totals[index] or index == len(line_totals) - 1:
# print(temp)
if temp.strip():
details_list.append(temp)
temp = ""
temp += line_totals[index] + "\n"
# 遍历每一行的明细数据,并提取内容
for j in details_list:
# print("-------------")
# print(j)
descp = ""
# 处理每一行数据明细,并提取出描述信息部分将它写入到descp这个变量
for k in j.split("\n"):
lines = [i.strip() for i in k.split(" ") if i != '']
# print(lines)
if len(lines) >= 1 and len(lines) <= 2:
descp += lines[0] + "\n"
elif len(lines) >= 3 and len(lines[2].strip()) > 4 and "EA" not in lines[2]:
descp += lines[2] + "\n"
else:
pass
try:
pattern0 = r"([0-9]{7})"
order_item = re.findall(pattern0, descp)[0].strip()
except:
order_item = ""
# 提取line号
pattern1 = r"([0-9]{1,}[.][0-9]+ )"
try:
line_num = re.findall(pattern1, j)[0].strip()
except:
line_num = ""
# 提取rev号
try:
rev_num = j.split(" ")[1].strip()
except:
rev_num = ""
# 提取order数量
pattern1 = r"([0-9]{0,}) {1,3}EA"
try:
order_qty = re.findall(pattern1, j)[0]
except:
order_qty = ""
# 提取order单价
pattern2 = r"([0-9]{0,}[.][0-9]+) {1,3}EA"
try:
order_price = re.findall(pattern2, j)[0]
except:
order_price = ""
# 提取总价跟日期
try:
pattern3 = r"([0-9]{1,}[.][0-9]*.*[0-9]{1,}[/][0-9]{1,}[/][0-9]*)"
price_date = re.findall(pattern3, j)[0].split("EA")[-1]
pattern4 = r"([0-9,]{1,}[.][0-9]*.*[0-9]{1,}[/][0-9]{1,}[/][0-9]*)"
order_price_total = re.findall(pattern4, price_date)[0].strip().split(" ")[0]
order_date = re.findall(pattern4, price_date)[0].strip().split(" ")[1]
except:
order_price_total = ""
order_date = ""
result_list.append([line_num, rev_num, descp, order_item, order_qty, "EA", order_price, "EA", order_price_total, order_date])
return result_list
def main(in_file, out_file):
with open(in_file, "r", encoding='utf-8') as f: # 打开文件
data = f.read() # 读取文件
page_one = data.split("PURCHASE ORDER")[1]
order_date = get_order_date(page_one)
address = get_address(page_one)
#company = get_company_name(page_one)
company = get_company_address(page_one)
order_no = get_order_no(page_one)
# 打开结果Excel表
"""完整数据的文件名"""
excel = xlrd.open_workbook(out_file)
full_excel = openpyxl.Workbook()
full_sheet = full_excel.active
full_sheet.append(["客户OrderNo", "客户address", "客户company", "客户OrderDate",
"Line_No", "Rev_No", "Description", "Order_Item", "Order_QTY",
"Order_Price", "Order_Price_Total", "Order_Date"])
for i in get_po_details(data):
print(i)
full_sheet.append([order_no, address, company, order_date, i[0], i[1], i[2], i[3], i[4], i[6], i[8], i[9]])
full_excel.save(out_file)
if __name__ == '__main__':
in_file = r"C:\Users\maowe\Documents\Encoo\项目9\text.txt"
out_file = r"test.xlsx"
exists_file(out_file)
main(in_file, out_file)
2021-11-09
最新推荐文章于 2023-01-06 15:19:32 发布