# Character proofreading code (字符校对代码)

import pandas as pd
import re
from tqdm import tqdm


# import warnings
# # warnings.filterwarnings("ignore")
# match: '3207882_python程序设计实验指导书.pdf'

# 物联网RFID原理与技术 第2版_高建良,贺建飚编著 .pdf
def match_book(original_book_name, original_book_name2, re_str):
    """Try one filename pattern against a normalized book name.

    Both the untouched name and the pattern are echoed to stdout before the
    search is attempted.

    Args:
        original_book_name: The book name as read from the input; it is only
            printed and copied into the result, never searched.
        original_book_name2: The normalized name that ``re_str`` is searched in.
        re_str: Regular expression (with named groups) to search for.

    Returns:
        ``(True, fields)`` when the pattern is found, where ``fields`` holds
        the pattern's named groups plus ``"original_book_name"``; otherwise
        ``(False, {})``.
    """
    print(original_book_name)
    print(re_str)

    found = re.search(re_str, original_book_name2)
    if found is None:
        return False, {}

    # Named groups become the columns of the result row.
    fields = found.groupdict()
    fields["original_book_name"] = original_book_name
    return True, fields


def IsChinese(character):
    """Return True when every character of *character* is a Chinese ideograph.

    A character counts as Chinese when it lies in the CJK Unified Ideographs
    range U+4E00..U+9FA5.  The lower bound was previously U+0E00, which also
    accepted Thai, kana, Hangul and CJK punctuation.

    Args:
        character: The string to inspect.

    Returns:
        True if all characters are in the range (an empty string is
        vacuously True), otherwise False.
    """
    return all('\u4e00' <= cha <= '\u9fa5' for cha in character)


if __name__ == '__main__':

    # Ordered table of filename patterns.  Patterns are tried top to bottom
    # and the first one that matches wins, so the catch-all entry is last.
    # The comment above each entry is a sample name that pattern targets.
    #
    # The literals are raw strings so that ``\d``, ``\[`` and ``\(`` reach the
    # regex engine without being invalid string escapes; the pattern text is
    # unchanged.
    #
    # NOTE(review): bracketed lists such as ``[译|著|编|译制]`` are character
    # classes (they match ONE listed character, '|' included), and the '.'
    # in front of ``pdf`` is unescaped.  Both are left as they were so the
    # set of matching names does not change -- confirm before tightening.
    re_str_dict = {
        # e.g. '3207882_python程序设计实验指导书.pdf'
        "match1": r'^\d+_(?P<book_name>.*?)(?:.html)?.pdf',
        # e.g. '[图灵]《24小时365天不间断服务》[张毅译][人民邮电出版社][978-7-115-38024-1][2015.1][p361].pdf'
        "match2": r".*?《(?P<book_name>.*?)》.*?\[(?P<book_author>.*?)[译|著|编|译制]\].*?\[(?P<book_publish>.*?[社|电子书])\](?:.*?).pdf",
        # e.g. 《机床电气线路安装与维修工作页》.pdf
        "match3": r"《(?P<book_name>.*?)》.pdf",
        # e.g. 半导体器件可靠性技术_(日)安食恒雄主编;日本松下电子工业株式会社编;周南生等译_西安:西安电子科技大学出版社
        "match4": r"(?P<book_name>.*?)_(?P<book_author>.*?[编|译|著|])_(?P<book_publish>.*?[出版社|组|厂]).pdf",
        # e.g. 半导体集成电路 姚金生
        "match5": r"(?P<book_name>.*?) (?P<book_author>.*?).pdf",
        # e.g. 电路、信号与系统 [徐昌彪 主编] 2012年.pdf
        "match6": r'(?P<book_name>.*?) \[(?P<book_author>.*?)[主编|著]\](?:.*?)(?:.html)?.pdf',
        # e.g. 电工电子技术实验 [席建中,陈松柏,何勇 主编;刘建生,罗小华,彭名华,刘西成,张雪平,李兴红副主编] 2014年.pdf
        "match7": r'(?P<book_name>.*?) \[(?P<book_author>.*?)[著|编著|主编] (?:.*?)[副主编]\](?:.*?)(?:.html)?.pdf',
        # e.g. 电工电子技术基础教程 第2版 [陈新龙,胡国庆 著] 2013年.pdf
        "match8": r'(?P<book_name>.*?) (?:.*?版) \[(?P<book_author>.*?)[编|著|编著|主编]\](?:.*?)(?:.html)?.pdf',
        # e.g. [OpenCL.Programming.Guide(第1版)].pdf
        "match9": r"^\[(?P<book_name>.*?)\].pdf$",
        # e.g. [web开发CSS系列].WebDevelopmentSolutions.pdf
        "match10": r"^\[.*?\].(?P<book_name>.*?).pdf$",
        # e.g. 《android底层开发实战》[电子书][p48].pdf
        'match11': r"《(?P<book_name>.*?)》.*?\[(?P<book_author>.*?[书|编著|英文|译制|笔记])\](?:.*?).pdf",
        # e.g. 【课件】图象工程(上册)(清华大学)_章毓晋.pdf
        "match13": r".*?】(?P<book_name>.*?)\(.*?\)_(?P<book_author>.*?).pdf",
        # e.g. c程序设计语言(第二版,中文版,b.w.kernighan、d.m.ritchie 著)
        # (the parentheses in this pattern are regex groups, not literals)
        "match14": r"(?P<book_name>.*?)((?P<book_author>.*?著)).pdf",

        # Disabled patterns, kept for reference:
        # "match15": "(?P<book_name>.*?)_(?P<book_author>.*?)_(?P<book_publish>.*?).pdf",
        # e.g. 高等学校规划教材 电路与电子学 王文辉
        # "match16":"(?:>*?材) (?P<book_name>.*?) (?P<book_author>.*?).pdf",
        # e.g. 电气运行_潘龙德主编
        # "match17": "(?P<book_name>.*?)_(?P<book_author>.*?).pdf",
        # e.g. 网络并购:并购交易的电子商务化 武建永 李斌
        # "match18": "(?P<book_name>.*?) (?P<book_author>.*?)",

        # e.g. 编译原理 by 蒋宗礼,姜守旭编著(z - lib.org)
        "match19": r"(?P<book_name>.*?) by (?P<book_author>.*?)\(.*?",

        # e.g. 高效程序员的45个习惯:敏捷开发修炼之道(中文版).(苏帕拉马尼亚姆).钱安川等.pdf
        # "match20": "(?P<book_name>.*?)\((?P<book_author>.*?)\).pdf",

        # Catch-all: everything in front of the extension is the book name.
        'mathch12': r"(?P<book_name>.*?)(?:.html)?.pdf",
    }

    # Site tags stripped from the lower-cased name before matching.  Built
    # once here instead of on every loop iteration.  The mixed-case entry can
    # never match a lower-cased name; it is kept only to leave the list intact.
    not_need_list = ["[大家网]", '[www.topsage.com]', '[www.TopSage.com]']

    book_df = pd.read_excel(r"..\data\第二批提交书籍清单.xlsx")
    book_df = book_df.drop_duplicates()
    # Blank cells load as NaN (a float) and would crash the string calls
    # below, so they are dropped up front.
    book_list = book_df["书名"].dropna().unique().tolist()

    result_list = []
    for original_book_name in book_list:
        # if original_book_name !="C程序设计语言(第二版,中文版,B.W.Kernighan、D.M.Ritchie 著).pdf":
        #     continue

        # Normalized form that the patterns are searched in.
        original_book_name2 = original_book_name.lower()
        for not_need in not_need_list:
            original_book_name2 = original_book_name2.replace(not_need, "")

        # Form used only for the all-Chinese test that gates match5: spaces
        # removed and a trailing ".pdf" cut off.  Without removing the
        # extension its ASCII letters make the test fail for every name that
        # match5 could match, leaving match5 unreachable.
        original_book_name1 = original_book_name.replace(" ", "")
        if original_book_name1.lower().endswith(".pdf"):
            original_book_name1 = original_book_name1[:-len(".pdf")]

        for re_id, re_str in re_str_dict.items():
            # match5 ("<title> <author>") only applies to pure-Chinese names.
            if re_id == "match5" and not IsChinese(original_book_name1):
                continue
            match_flag, result_dict = match_book(original_book_name, original_book_name2, re_str)
            if match_flag:
                result_list.append(result_dict)
                break
        else:
            # No pattern matched: keep the row with only the original name.
            result_list.append({"original_book_name": original_book_name})

    result_df = pd.DataFrame(result_list)
    result_df.to_excel(r"..\test_data\ICT通信中文书单(2)_结果_比对.xlsx", index=False)
    print(result_df)


    # result_df = pd.read_excel(r"..\test_data\ICT通信中文书单(2)_结果_比对.xlsx")
    # result_list = []
    # for row_index, row_df in result_df.iterrows():
    #     # book_name	original_book_name	book_author	book_publish
    #     result_list.append((row_df["book_name"], row_df["original_book_name"], row_df["book_author"],row_df["book_publish"]))
    # book_ict_df = pd.read_excel(r"..\data\ICT通信中文书单(2).xlsx")
    # book_ict_df= book_ict_df.drop_duplicates()
    # target_result_list =[]
    # for book_ict_index,book_ict_info_df in tqdm(book_ict_df.iterrows()):
    #     ict_book_name = book_ict_info_df["图书名称"]
    #     ict_book_author = book_ict_info_df["作者署名"]
    #     ict_book_publish = book_ict_info_df["出版社"]
    #     ishaving = book_ict_info_df["书籍是否已有"]
    #     if ishaving==1:
    #         target_result_list.append(pd.DataFrame(book_ict_info_df).T.copy())
    #     else:
    #         for book_info in result_list:
    #             book_name = book_info[0]
    #             original_book_name = book_info[1]
    #             book_author = book_info[2]
    #             book_publish =  book_info[3]
    #             if book_name != ict_book_name:
    #                 continue
    #             if book_publish==book_publish and ":" in book_publish:
    #                 book_publish = book_publish.split(":")[1]
    #                 if book_publish !=ict_book_publish:
    #                     continue
    #             book_ict_info_df["书籍是否已有"] =1
    #             book_ict_info_df["备注"] = "新增"
    #             target_result_list.append(pd.DataFrame(book_ict_info_df).T.copy())
    #             break
    #         else:
    #             target_result_list.append(pd.DataFrame(book_ict_info_df).T.copy())
    # target_df = pd.concat(target_result_list, ignore_index=True)
    # target_df.to_excel(r"..\test_data\ICT通信中文书单(2)_结果_比对2.xlsx", index=False)

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值