由于在数据库中进行表映射时导致有一些数据是重复的,根据开发需求对表中数据进行去重处理
具体需求是 根据表中的三个字段进行查找,若是该三个字段皆为重复则保留其中字段数不为空最多的一条记录,将其标记为1。
import pymysql
connection = pymysql.connect(host='',
user='',
password='',
database='')
cursor = connection.cursor()
query1 = "SELECT is_delete, name, count(name) AS n, sequence, count(sequence) AS s, description, count(description) AS d " \
"FROM table" \
"WHERE website_source=%s and is_delete=0 " \
"GROUP BY is_delete, name, sequence, description " \
"HAVING n>1 OR s>1 OR d>1 " \
"ORDER BY n, s, d DESC;"
cursor.execute(query1, ('APD3',))
records = cursor.fetchall()
# 执行第二个查询语句和更新操作
for record in records:
print(record)
is_delete = record[0]
name = record[1]
sequence = record[3]
print(name,sequence)
query2 = """
SELECT id,name,sequence,sequence_length,description,patent,reference,UniProt_ID,PubMed_ID,SwissProt_ID,source
FROM table
WHERE website_source = %s
AND name = %s
AND sequence = %s;"""
cursor.execute(query2, ('APD3', name, sequence))
result = cursor.fetchall()
# 找出最长字段数量
max_length = 0
for row in result:
current_length = len([field for field in row if field and str(field).strip()]) # 获取当前记录的非空字段数量
if current_length > max_length:
max_length = current_length
print(max_length)
# 找出除了最长字段数量之外的全部id
ids = []
first_id = None
for row in result:
current_length = len([field for field in row if field and str(field).strip()])
if current_length != max_length: # 如果字段数量不等于最长字段数量
ids.append(row[0])
elif first_id is None: # 字段数量等于最长字段数量,且尚未添加第一条id
first_id = row[0] # 记录第一条id
else: # 字段数量等于最长字段数量,且已经添加过第一条id
ids.append(row[0])
print(ids)
for id in ids:
delete_query = "update table set is_delete=1 where id=%s and website_source = %s"
cursor.execute(delete_query, (id, 'APD3'))
connection.commit()
connection.close()