三张表格数据(上万条)需要比对,看是否有重复存在,使用excel自身进行数据核对显然数据量比较庞大,所以考虑使用python脚本来实现。
from openpyxl import load_workbook, Workbook
wb1 = load_workbook('/Users/Mike/Downloads/ceshi/employee1.xlsx')
wb2 = load_workbook('/Users/Mike/Downloads/ceshi/employee2.xlsx')
wb3 = load_workbook('/Users/Mike/Downloads/ceshi/employee3.xlsx')
a_wb = wb1['employee']
b_wb = wb2['employee']
c_wb = wb3['employee']
a_wb_max_row = len(a_wb['B'])
b_wb_max_row = len(b_wb['B'])
c_wb_max_row = len(c_wb['B'])
'''
# 提取姓名
for row in a_wb.iter_cols(min_row=1, max_row=a_wb_max_row, min_col=1, max_col=1, values_only=True):
row1 = list(row)
print(row1)
for row in b_wb.iter_cols(min_row=1, max_row=b_wb_max_row, min_col=1, max_col=1, values_only=True):
row2 = list(row)
# print(row2)
for row in c_wb.iter_cols(min_row=1, max_row=c_wb_max_row, min_col=1, max_col=1, values_only=True):
row3 = list(row)
'''
# 提取邮箱
for row in a_wb.iter_cols(min_row=2, max_row=a_wb_max_row, min_col=2, max_col=2, values_only=True):
row1 = list(row)
# print(row1)
for row in b_wb.iter_cols(min_row=2, max_row=b_wb_max_row, min_col=2, max_col=2, values_only=True):
row2 = list(row)
# print(row2)
for row in c_wb.iter_cols(min_row=2, max_row=c_wb_max_row, min_col=2, max_col=2, values_only=True):
row3 = list(row)
# print(row3)
# 同时存在三张数据表格中
c = []
for i in row1:
if i in row2:
if i in row3:
c.append(i)
for i in c:
if i == None:
c.remove(i)
print(c)
wb3 = Workbook()
d_wb = wb3.active
for i in range(len(c)):
d = [c[i]]
d_wb.append(d)
wb3.save('/Users/Mike/Downloads/mynewdata1.xlsx')
# 1表v2表v3表不存在的数据
e = []
for i in row1:
if i not in row2:
if i not in row3:
e.append(i)
for i in e:
if i == None:
e.remove(i)
print(e)
wb4 = Workbook()
e_wb = wb4.active
for i in range(len(e)):
f = [e[i]]
e_wb.append(f)
wb4.save('/Users/Mike/Downloads/mynewdata2.xlsx')
# 2表v3表不存在的数据
f = []
for i in row2:
if i not in row3:
f.append(i)
for i in f:
if i == None:
f.remove(i)
print(f)
wb5 = Workbook()
f_wb = wb5.active
for i in range(len(f)):
g = [f[i]]
f_wb.append(g)
wb5.save('/Users/Mike/Downloads/mynewdata3.xlsx')
# 1表v2表不存在的数据
g = []
for i in row1:
if i not in row2:
g.append(i)
for i in g:
if i == None:
g.remove(i)
print(g)
wb6 = Workbook()
h_wb = wb6.active
for i in range(len(g)):
h = [g[i]]
h_wb.append(h)
wb6.save('/Users/Mike/Downloads/mynewdata4.xlsx')