get_article_info

最新推荐文章于 2021-10-09 17:07:51 发布
mmdaphne
最新推荐文章于 2021-10-09 17:07:51 发布
阅读量1.2k
点赞数
分类专栏： Python
本文链接：https://blog.csdn.net/mmdaphne/article/details/43851375
版权
Python 专栏收录该内容
4 篇文章 0 订阅
订阅专栏
import os
import re
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_info():
    """Run every stage of the article-information pipeline in order.

    Each stage reads the files written by the previous ones, so the order
    of the tuple below matters.
    """
    print('Start getting info...')
    pipeline = (
        # Report article pages (e.g. 0001-8708\article\j.html) that have no DOI.
        journal_filter,
        # Build 'article_type': every section name seen in the article lists
        # together with the number of articles filed under it.
        get_article_type,
        # Classify articles as Paper / Review / Letter, drop non-papers and
        # write (index, type, title) tuples to 'article_select'.
        article_filter,
        # Extract author markup, footnotes and DOI from each selected page
        # and write them to 'author_list'.
        get_author_list,
        # Final stage: derive the per-article information from 'author_list'.
        get_article_info,
    )
    for stage in pipeline:
        stage()


def journal_filter():
    """Print the location of every article page that contains no DOI.

    For each journal directory returned by ``get_dir_list()`` the pickled
    ``article_list`` is loaded; its length determines how many pages
    ``article/0.html`` .. ``article/<n-1>.html`` are scanned for a
    ``SDM.doi = '...'`` assignment.  Pages without one are reported as
    ``<journal index> \\<folder>\\<j>.html`` on standard output.

    Returns:
        None.  The function only prints.

    Raises:
        OSError: if an expected article page cannot be opened.
    """
    # Per the original author's note, article_list is produced by
    # get_article_list() in get_article.py and holds, per article, the
    # relative URL, section, type and title.  Only its length is used here.
    name_article_list = 'article_list'
    name_article_dir = 'article'
    key_doi = re.compile(r'SDM.doi = \'(.*?)\'')
    for i, journal_dir in enumerate(get_dir_list()):
        article_list = pickle_read(os.path.join(journal_dir, name_article_list))
        folder = os.path.join(journal_dir, name_article_dir)  # e.g. 0001-8708\article
        for j in range(len(article_list)):
            # The context manager closes the page even if reading/decoding fails.
            with open(os.path.join(folder, '{}.html'.format(j)), mode='r', encoding='utf-8') as f:
                page = f.read()
            if not key_doi.search(page):
                # Backslashes are written as '\\' because '\{' is not a valid
                # escape sequence; the printed text is unchanged.
                print(i, '\\{}\\{}.html'.format(folder, j))


def get_article_type():
    """Count how many articles belong to each section across all journals.

    Reads the pickled ``article_list`` of every directory returned by
    ``get_dir_list()`` and tallies ``article[1]`` (the section the article is
    filed under).  The result, a list of ``(section, count)`` tuples in
    first-seen order, is written with ``pickle_write`` to ``article_type``
    and, as tab-separated text, to ``article_type.txt`` in the current
    working directory.

    Returns:
        None.
    """
    name_article_list = 'article_list'  # per article: relative URL, section, type, title
    name_article_type = 'article_type'
    name_article_type_list = 'article_type.txt'
    dir_list = get_dir_list()
    # A dict keeps insertion order, so the output order equals the order in
    # which sections are first encountered, while each update stays O(1).
    section_counts = {}
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_list = pickle_read(os.path.join(journal_dir, name_article_list))
        for article in article_list:
            section = article[1]
            section_counts[section] = section_counts.get(section, 0) + 1
    # Example content for 0001-8708: ('', 335), ('Corrigendum', 1), ('Erratum', 1)
    article_type = list(section_counts.items())
    pickle_write(article_type, name_article_type)
    # The context manager guarantees the text file is flushed and closed
    # even if writing fails part-way.
    with open(name_article_type_list, mode='w', encoding='utf-8') as f:
        f.writelines('{}\t{}\n'.format(section, count) for section, count in article_type)


def article_filter():
    """Classify every article as Paper, Review or Letter and drop the rest.

    Step 1 sorts the section names stored in ``article_type`` (written by
    ``get_article_type``) into four groups by regular expression:
    "except" (editor's choice), "exclude" (non-research material),
    "letter" and "review".

    Step 2 walks the ``article_list`` of every journal directory.  An
    article is a 4-field record: relative URL, section, type, title.

    * non-empty type: 'Original Research Article' -> Paper, anything
      else -> Review;
    * section in the "except" group -> Paper;
    * section in the "exclude" group, or title matching ``key_exclude``
      -> dropped;
    * section in the "letter" group, or title matching ``key_letter``
      -> Letter;
    * section in the "review" group, or title matching ``key_review``
      -> Review;
    * otherwise -> Paper.

    For each journal the kept articles are written with ``pickle_write`` to
    ``<journal>/article_select`` as ``(index, label, title)`` tuples, e.g.
    ``(4, 'Paper', 'Genetic and Epigenetic Regulation of ...')``.  After the
    last journal the cumulative Letter/Review/Paper totals are printed.

    Returns:
        None.
    """
    name_article_type = 'article_type'
    name_article_list = 'article_list'
    name_article_select = 'article_select'
    key_except = re.compile(r'editor.*?choice',re.I)    # re.I: case-insensitive
    key_exclude = re.compile(r'editor|book|news|acknowledgment|acknowledgement|education|retraction|erratum|'
                             r'introduction|in this issue|feature|foreword|topic|response|reply|comment|'
                             r'index|content|abstract|highlight|obituary|announcement|guideline|\sview|^view|list|'
                             r'presentation|survey|summary|correction|abbreviation', re.I)
    key_letter = re.compile(r'letter|correspondence', re.I)
    # The pattern must not contain an empty alternative: an empty branch
    # matches every string, which would label every remaining article as
    # 'Review' and make the final 'Paper' fallback unreachable.
    key_review = re.compile(r'review|insight', re.I)

    # Step 1: group the known section names.  Order of the tests matters:
    # "editor's choice" must win over the generic "editor" exclusion.
    type_except = set()
    type_exclude = set()    # e.g. 'Book reviews', "Editors' Acknowledgement", 'Commentary'
    type_letter = set()     # e.g. 'Correspondence'
    type_review = set()     # e.g. 'Review'
    for entry in pickle_read(name_article_type):
        section_name = entry[0]     # entry is (section, article count)
        if key_except.search(section_name):
            type_except.add(section_name)
        elif key_exclude.search(section_name):
            type_exclude.add(section_name)
        elif key_letter.search(section_name):
            type_letter.add(section_name)
        elif key_review.search(section_name):
            type_review.add(section_name)

    # Step 2: label the articles.  Totals accumulate over all journals.
    counts = {'Letter': 0, 'Review': 0, 'Paper': 0}
    dir_list = get_dir_list()
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_list = pickle_read(os.path.join(journal_dir, name_article_list))
        article_select = []
        for j, article in enumerate(article_list):
            section = article[1]
            declared_type = article[2]
            if declared_type:
                # An explicit type always decides; the section is ignored.
                if declared_type == 'Original Research Article':
                    label = 'Paper'
                else:
                    label = 'Review'
            elif section in type_except:
                label = 'Paper'
            elif section in type_exclude or key_exclude.search(article[3]):
                # Not a letter, review or paper.  The title is checked too
                # because an acceptable section may still contain e.g. an
                # "... Award Introduction: ..." item.
                continue
            elif section in type_letter or key_letter.search(article[3]):
                label = 'Letter'
            elif section in type_review or key_review.search(article[3]):
                label = 'Review'
            else:
                # Empty type and a section that is in none of the groups.
                label = 'Paper'
            article_select.append((j, label, article[3]))
            counts[label] += 1
        pickle_write(article_select, os.path.join(journal_dir, name_article_select))
    if dir_list:
        # Summary line: last journal directory followed by the running totals.
        print(dir_list[-1],counts['Letter'], counts['Review'], counts['Paper'])


def get_author_list():
    """Extract author markup, footnotes and DOI for every selected article.

    For each journal directory returned by ``get_dir_list()`` the pickled
    ``article_select`` list (``(index, type, title)`` tuples written by
    ``article_filter``) is loaded and the page ``article/<index>.html`` is
    scanned with three regular expressions.  One tuple per article,

        (author_all, author_equal, article_doi, index, type, title)

    is collected and written with ``pickle_write`` to
    ``<journal>/author_list``.

    Returns:
        None.

    Raises:
        OSError: if an article page cannot be opened.
        AttributeError: if a page contains no ``SDM.doi = '...'``
            assignment (``journal_filter`` reports such pages).
    """
    name_article_dir = 'article'
    name_article_select = 'article_select'
    name_author_list = 'author_list'
    # <li> items carrying an author name link or an author-degree span.
    key_author = re.compile(r'<li><a href="#" class="authorName.*?</li>|<li><span class="authorDegrees">.*?</li>')
    # Footnote blocks; these hold equal-contribution statements, addresses, etc.
    key_equal = re.compile(r'<dl class="footnote".*?</dl>')
    # DOI value between single quotes, e.g. 10.1016/j.ajhg.2013.05.001
    key_doi = re.compile(r'SDM.doi = \'(.*?)\'')
    dir_list = get_dir_list()
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_select = pickle_read(os.path.join(journal_dir, name_article_select))
        folder = os.path.join(journal_dir, name_article_dir)    # e.g. 0002-9297\article
        author_list = []
        for article in article_select:
            # article[0] is the article index and therefore the page file name.
            # The context manager closes the page even if reading/decoding fails.
            with open(os.path.join(folder, '{}.html'.format(article[0])), mode='r', encoding='utf-8') as f:
                page = f.read()
            author_all = key_author.findall(page)
            author_equal = key_equal.findall(page)
            article_doi = key_doi.search(page).groups()[0]
            author_list.append((author_all, author_equal, article_doi, article[0], article[1], article[2]))
        pickle_write(author_list, os.path.join(journal_dir, name_author_list))


def get_article_info():
    def judge_complete():   #判断equal_name_list[k]将非字符串类型转变成列表中元素
        nonlocal equal_name_list    #非全局变量，可以访问外部作用域
        nonlocal author_equal
        for k in range(len(equal_name_list)):
            if type(equal_name_list[k]) == str: #如果equal_name_list[k]是字符串类型的话，返回值为False
                return False
        author_equal.append([n[0] for n in equal_name_list])    #遍历equal_name_list，不是的话，将其每一行的不是字符串的项放进列表author_equal中，如：[['Xiaoyan Xu'], ['Qiang Fu'], ['Qun Zhang']]处理后变成： [['Xiaoyan Xu', 'Qiang Fu', 'Qun Zhang']]
        return True

    name_author_list = 'author_list' #author_list在函数get_author_list()中生成，包含了文章作者，脚注，doi值，序号，类型以及名字
    name_article_info = 'article_info'#article_info在此函数中生成
    key_author_name = re.compile(r'<a href="#" class="authorName.*?>(.*?)</a>')
    key_author_name_split = re.compile(r'\s')   #匹配空白字符
    key_author_name_hyphen = re.compile(r'-')   #匹配连字符
    key_author_name_split_hyphen = re.compile(r'\s|-')  #匹配空白字符或者连字符,切片函数，把匹配到的字符作为分界
    key_corr = re.compile(r'Corresponding author')  #匹配通讯作者
    key_equal = re.compile(r'contributed equally')  #匹配等同作者相关信息
    key_equal_split = re.compile(r'\s[^A-Z\s\.]*?\.\s|\)\.\s')  #匹配空白+[非A-Z 空格 .]集合+ .+空格或者).+空格   如：‘abstracted data. M.A.’中' data. '被匹配出来，相当于起到断句作用
    key_equal_1 = re.compile(r'class="intra_ref">(.*?)</a>')
    key_equal_1_not = re.compile(r'Appendix 1')
    key_equal_1_sub = re.compile(r'<.*?>(.*?)<.*?>')#( ) 标记一个子表达式的开始和结束位置。子表达式可以获取供以后使用。取两个标签之间的内容
    key_equal_1_spec = re.compile(r' and ')
    key_equal_1_single = re.compile(r'This author|The author|Co-first author')
    key_equal_2 = re.compile(r'>[\s\)\]\.,;:]*([^<]*?contributed equally)')#匹配集合[\s\)\.,;:]0次或多次，没有就只要括号>;先匹配括号()中的，括号中内容取以右尖括号开头的字符串
    #key_equal_2 = re.compile(r'>([^<]*?contributed equally)')#可是为什么一定要匹配这些空格、括号、逗号之类的符号，没有影响啊？
    key_equal_2_sub = re.compile(r'\s*contributed equally') #匹配空格和contributed equally
    key_equal_2_1 = re.compile(r'All authors|All of the authors|All the authors|^Authors$|^The authors$|^The authors do|'
                               r'^The three institutions|^The Tsimikas|These authors|^Northeast Normal University|'
                               r'^These author$')   #匹配这些词，或以这些词开头或结尾，多数表示等同的是所有作者
    key_equal_2_2 = re.compile(r'Both authors|Both first authors|The 1st 2 authors|The first 2 authors|'
                               r'The first two authors')
    key_equal_2_3 = re.compile(r'The first 3 authors|The first three authors|the first three authors')
    key_equal_2_4 = re.compile(r'The last 2 authors|The last two authors')
    key_equal_2_5 = re.compile(r'The last 3 authors')
    key_equal_2_6 = re.compile(r'The last four authors')
    key_equal_2_7 = re.compile(r'Second and third authors')
    key_equal_3_sub = re.compile(r' have$|^and has.*Merck. |,$|^As joint first authors, |^Author contributions: |'
                                 r' performed .*?$|^Author |^Authors |^Both | both$| equally$|^Note: Both |'
                                 r' as co-corresponding author$| are joint first authors and$|, these authors have|'
                                 r', MD|, PhD|^Professors |^Drs |^Drs. |^The authors | are co-first authors and$')  #匹配表示等同的一些修饰词
    key_equal_3_split = re.compile(r',\sand\s|,\s|\sand\s|\s&\s')   #匹配首尾有空格的连接词，如：and
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        folder = dir_list[i]
        print(folder)
        author_list = pickle_read(os.path.join(folder, name_author_list))   #author_list例：0002-9297\author_list
        article_info = []
        for article in author_list: #author_list中每一个article都包括文章作者，脚注，doi值，序号，类型以及名字。
            author_name = [key_author_name.search(n).groups()[0] for n in article[0]]   #匹配article[0]（即文章作者信息），形如：<li><a href="#" class="authorName S_C_authorName"。。。
            #print("{}\t{}".format(article[3],author_name))     # author_name例：108	['Xing Hua', 'Haiming Xu', 'Yaning Yang'。。。]（因为article[0]是重复匹配而来的，其中有多少作者，就有多少重复的<li><a href="#"这样的句子，所以n指每一对''中的句子 ）
            author_name_split = [key_author_name_split.split(n) for n in author_name]   #匹配author_name，删除空白字符，这里n指每一对''中的内容
            #print("{}\t{}".format(article[3],author_name_split))    #author_name_split例：108	[['Xing', 'Hua'], ['Haiming', 'Xu'], ['Yaning', 'Yang']。。。]
            author_name_split_hyphen = [key_author_name_split_hyphen.split(n) for n in author_name] #匹配author_name，删除其中空白字符或者连字符
            #print("{}\t{}".format(article[3],author_name_split_hyphen))    #例：如果author_name是'Moeenaldeen\xa0D. Al-Sayed'那么author_name_split_hyphen是['Moeenaldeen', 'D.', 'Al', 'Sayed']
            if not author_name:
                continue    #如果author_name是空的，那么忽略该文章，程序继续
            author_corr = []    #通讯作者的集合
            author_equal = []
            author_equal_all = []
            equal_flag_1 = []
            equal_flag_2 = []
            for j in range(len(article[0])):    #在一篇文章中遍历它的所有作者信息，article[0]是author_list中的第一项，包含作者信息
                if key_corr.search(article[0][j]):  #如果在article[0]的第j+1个句子中能够匹配到Corresponding author，
                    author_corr.append(author_name[j])#那么就把该文章的第j+1的作者加进列表author_corr，通讯作者的集合
            #print("{}\t{}".format(article[3],author_corr))#例：180	['Peter Calabrese', 'Norman Arnheim']
            for line in article[1]: #在一篇文章中遍历它的脚注信息，article[1]是author_list中的第二项，包含脚注信息
                if key_equal.search(line):  #如果在脚注信息中匹配到了contributed equally，将这项加进列表author_equal_all
                    author_equal_all.append(line)
            for line in author_equal_all:   #在一篇文章中，遍历author_equal_all列表，其中包含了等同作者的脚注信息
                if key_equal_1.search(line): #key_equal_1 = re.compile(r'class="intra_ref">(.*?)</a>')，如果能匹配到 ，则将匹配到的字符和被匹配的line赋给equal_flag_temp
                    equal_flag_temp = (key_equal_1.search(line).groups()[0], line)#line中有class="intra_ref">3</a>，那么equal_flag_temp就是3和line 例：'3' '<dl class="footnote"...
                    #print(equal_flag_temp)
                    if key_equal_1_not.search(equal_flag_temp[0]):  #遍历equal_flag_temp[0]，即equal_flag_temp的第一项（上标），如果能匹配到Appendix 1，则把匹配到的字符和line加进列表equal_flag_2
                        equal_flag_2.append((key_equal_2.search(line).groups()[0], line))
                        #print(equal_flag_2)
                    elif key_equal_1_spec.search(equal_flag_temp[0]):#如果在equal_flag_temp[0]中能够匹配到and，那么将匹配到的字符串去除and之后，和line加进列表equal_flag_1
                        equal_flag_1 += [(n, line) for n in key_equal_1_spec.split(equal_flag_temp[0])]
                        #print(equal_flag_1)
                    elif key_equal_1_sub.search(equal_flag_temp[0]):    #如果key_equal_1_sub能够在equal_flag_temp[0]中能够匹配到，将匹配到的字符串和line加进列表equal_flag_1
                        equal_flag_1.append((key_equal_1_sub.search(equal_flag_temp[0]).groups()[0], line))
                        #print(equal_flag_1)
                    else:       #如果以上三种匹配表达式都没有匹配成功的话，就将equal_flag_temp加给列表equal_flag_1
                        equal_flag_1.append(equal_flag_temp)
                        #print(equal_flag_1)
                else:
                    equal_flag_2.append((key_equal_2.search(line).groups()[0], line))#如果key_equal_1没有匹配成功的话，则在所有的line中用key_equal_2匹配，将匹配到的字符串和line赋给列表equal_flag_2
                    #print("{}\t{}".format(article[3],equal_flag_2))     #以文件夹1558-7673为例：['Drs Linton and Pond contributed equally','<dl class="footnote"...]
            for line in equal_flag_1:   #遍历每一篇文章中equal_flag_1(有关等同作者的上标)，根据文章的不同上标确定相应的匹配表达式key_equal_flag，在article[0]即作者信息中匹配找到有相同上标的作者集合
                if line[0] == '**':#根据equal_flag_1里第一项的形式，来确定key_equal_flag的形式
                    key_equal_flag = re.compile(r'<sup>\*\*</sup>|<sup>\*</sup>')   #\*表示符号*
                elif line[0] == '*':
                    key_equal_flag = re.compile(r'<sup>\*</sup>')
                elif line[0] == '+':
                    key_equal_flag = re.compile(r'<sup>\+</sup>')   #\+表示符号+
                else:       #如果以上这些特殊符号都没有的话，就将key_equal_flag设置成如下形式，例：如果equal_flag_1的某一项是：'3','<dl class="footnote"...
                    key_equal_flag = re.compile(r'<sup>' + line[0] + '</sup>')  #那么key_equal_flag = re.compile(r'<sup> 3 </sup>'),使用“+”号可连接字符串
                temp = []
                for k in range(len(article[0])):
                    if key_equal_flag.search(article[0][k]):
                        temp.append(author_name[k])
                #print("{}\t{}\t{}".format(article[3],line[0],temp)) #以文件夹0002-9297为例：180	3	['Song-Ro Yoon', 'Soo-Kung Choi']，180	4	['Peter Calabrese', 'Norman Arnheim']
                if len(temp) == 0:#如果根据equal_flag_1确定的key_equal_flag 在article[0]中没有匹配到等同作者的话，那么在equal_flag_1的line[1]（脚注信息）中再次用key_equal_2匹配
                    equal_flag_2.append((key_equal_2.search(line[1]).groups()[0], line[1]))#将在equal_flag_1中匹配到的字符串和line[1]加进列表equal_flag_2
                elif len(temp) == 1:    #如果该文章的temp只有一个的话，那么先在line[1]中匹配key_equal_1_single，成功的话，将temp加进列表author_name；
                    #print(len(temp))
                    if key_equal_1_single.search(line[1]):  #key_equal_1_single = re.compile(r'This author|The author|Co-first author')
                        author_name.append(temp)
                    else:               #否则，如果匹配不成功的话，在line[1]中匹配key_equal_2，将匹配到的字符串和line[1]加进列表equal_flag_2中
                        equal_flag_2.append((key_equal_2.search(line[1]).groups()[0], line[1]))
                else:       #如果该文章的temp多于一个的话，直接将temp加进列表author_equal中
                    author_equal.append(temp)
            for line in equal_flag_2:   #equal_flag_2形如：[('I.-Y.C. and C.J. contributed equally', '<dl class="footnote" id="item2" data-t="n"><dd><p>I.-Y.C. and C.J. contributed equally to this work.</p></dd></dl>')]
                equal_split = key_equal_split.split(line[0])    #以在line[0]中匹配到的字符串为分界，例：['E.G.R. and C.S. conducted literature searches and abstracted', 'M.A.W. performed statistical', 'All authors contributed equally']
                split_words = key_equal_split.findall(line[0])  #找出所有在line[0]中能匹配到的整个正则式,例：[' data. ', ' analyses. ']
                #print(line[0]) #该例子的line[0]是：E.G.R. and C.S. conducted literature searches and abstracted data. M.A.W. performed statistical analyses. All authors contributed equally
                for k in range(len(split_words)):   #遍历split_words，将split_words的第k+1项加入equal_split的第k+1项，之前把句子断开，现在再接上
                    equal_split[k] += split_words[k]#例：['E.G.R. and C.S. conducted literature searches and abstracted data. ', 'M.A.W. performed statistical analyses. ', 'All authors contributed equally']
                    #print(article[3],equal_split)  #以0010-7824文件夹为例
                for item in equal_split:    #遍历equal_split，如果在equal_split中匹配到了contributed equally，那么在item中用key_equal_2_sub匹配，并且将匹配到的字符串删除
                    if key_equal.search(item):  #即把contributed equally以及这之前的空格删去，并将剩下的字符串复制给equal_sentence，例：T.M. and A.M.
                        equal_sentence = key_equal_2_sub.sub('', item)  #以0010-7824文件夹为例，匹配到：All authors
                        #print(article[3],equal_sentence)
                if key_equal_2_1.search(equal_sentence):#如果在equal_sentence中能被key_equal_2_1匹配到，匹配如All authors这些词，则将这篇文章的所有作者都加进等同作者的列表author_equal
                    author_equal.append(author_name)
                    continue
                if key_equal_2_2.search(equal_sentence):#如果在equal_sentence中能被key_equal_2_2匹配到，匹配如Both authors这些词，则将author_name中的前两个作者加进等同作者列表
                    author_equal.append(author_name[:2])#[:2]表示从开始到下标为2的元素，但是不包括结束下标（此处2就为结束下标）
                    #print(author_equal)    #以文件夹0015-0282为例
                    continue
                if key_equal_2_3.search(equal_sentence):#如果在equal_sentence中能被key_equal_2_3匹配到，匹配如The first 3 authors这些词，则将author_name中的前三个作者加进等同作者列表
                    author_equal.append(author_name[:3])
                    continue
                if key_equal_2_4.search(equal_sentence):#如果在equal_sentence中能被key_equal_2_4匹配到，匹配如The last 2 authors这些词，则将author_name中的最后两个作者加进等同作者列表
                    author_equal.append(author_name[-2:])
                    continue
                if key_equal_2_5.search(equal_sentence):#如果在equal_sentence中能被key_equal_2_5匹配到，匹配如The last 3 authors这些词，则将author_name中的最后三个作者加进等同作者列表
                    author_equal.append(author_name[-3:])
                    continue
                if key_equal_2_6.search(equal_sentence):#如果在equal_sentence中能被key_equal_2_6匹配到，匹配如The last four authors这些词，则将author_name中的最后四个作者加进等同作者列表
                    author_equal.append(author_name[-4:])
                    continue
                if key_equal_2_7.search(equal_sentence):#如果在equal_sentence中能被key_equal_2_7匹配到，匹配如Second and third authors这些词，则将author_name中的第二个和第三个作者加进等同作者列表
                    author_equal.append(author_name[1:3])
                    continue
                equal_sentence = key_equal_3_sub.sub('', equal_sentence)#在equal_sentence中将能被key_equal_3_sub匹配到的字符串，如作为第一个单词的Authors这些词，删除
                #print(article[3],equal_sentence)    #以0015-0282文件夹为例：原来的equal_sentence是：Authors H.S.S. and Y.-M.L.，匹配处理之后是：H.S.S. and Y.-M.L.
                equal_name_list = key_equal_3_split.split(equal_sentence)   #以在equal_sentence中匹配到的字符串为分界，并将这些字符串组赋值给equal_name_list。
                #print(article[3],equal_name_list)  #key_equal_3_split主要匹配一些如and的连接词，承接上例：['H.S.S.', 'Y.-M.L.']
                author_name_modify = [' '.join(n) for n in author_name_split]   #author_name_split形如：[。。。 ['H.', 'Sunny', 'Sun'], ['Yung-Ming', 'Lin']]，此处又重新将姓和名组合起来了
                #print(article[3],author_name_modify)        # author_name_modify又重新将姓和名组合起来了，形如：[。。。'H. Sunny Sun', 'Yung-Ming Lin']
                for k in range(len(equal_name_list)):   #找出等同作者列表里的作者在文章作者列表中的完整名称
                    for l in range(len(author_name_modify)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():#lower()函数返回将字符串中所有大写字符转换为小写后生成的字符串。
                            equal_name_list[k] = [author_name[l]]#同为小写字母时判断，如果equal_name_list第k+1个字符串和author_name第l+1个字符串相同，则将author_name第l+1个元素赋值给equal_name_list第k+1个元素
                            #print(article[3],equal_name_list)   #形如：[['Xiaoyan Xu'], ['Qiang Fu'], ['Qun Zhang']]
                            break
                if judge_complete():    #将刚刚得到的列表equal_name_list的中的非字符串类型转变成列表中元素
                    #print(article[3],author_equal) #author_equal承接上例形如：[['Xiaoyan Xu', 'Qiang Fu', 'Qun Zhang']]
                    continue
                author_name_modify = [n[-1]+' '+' '.join(n[:-1]) for n in author_name_split] #将每一项的最后一个单词向前移两个，如：['Ya-Jing Tan', 'Yun Xiong', 'Guo-Lian Ding']经处理后变成：['Tan Ya-Jing', 'Xiong Yun', 'Ding Guo-Lian']
                #print(article[3],author_name_modify)
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:#如果equal_name_list中第k+1项是列表类型，那么继续
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():#重复前一个循环的比较过程
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[0][0]+'. '+' '.join(n[1:]) for n in author_name_split]#取第一个字符串的首字母+符号'.'+字符串第一个单词之后的内容
                #print(article[3],author_name_modify)    #承接上例，形如：['Y. Tan', 'Y. Xiong', 'G. Ding']
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[0][0]+' '+' '.join(n[1:]) for n in author_name_split]#相较于上一次改变，删去了符号'.',承接上例，形如： ['Y Tan', 'Y Xiong', 'G Ding']
                #print(article[3],author_name_modify)
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = ['.'.join([m[0] for m in n])+'.' for n in author_name_split]#将每一个字符串的每个单词都取其首字母,在同一个字符串中用符号'.'连接
                #print(article[3],author_name_modify)                                               #承接上例，形如：['Y.T.', 'Y.X.', 'G.D.']
                for k in range(len(equal_name_list)):   ##重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = ['. '.join([m[0] for m in n])+'.' for n in author_name_split]  #取每个元素首字母，之间用'. '连接，并在最后字母处也加上符号'.'，承接上例，形如：['J. C.', 'H. A.', 'W. L.']
                #print(article[3],author_name_modify)
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split]    #只取每个字符串的首字母，承接上例，形如：['YT', 'YX', 'GD']
                #print(article[3],author_name_modify)
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[0]+n[-1][0]+'.' for n in author_name_split] #取第一个单词和最后一个单词的首字母，并加上符号'.',承接上例，形如：['Ya-JingT.', 'YunX.', 'Guo-LianD.']
                #print(article[3],author_name_modify)
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [] #将author_name_modify清空
                for k in range(len(author_name_split)):#以0015-0282为例，author_name_split形如：[['Jennifer', 'L.', 'Herington'], ['Dana', 'R.', 'Glore']]
                    if len(author_name_split[k]) > 2:   #如果author_name_split第k+1项元素的个数大于2，那么取每个单词的首字母，并以符号'.'或符号'. '连接
                        author_name_modify.append(author_name_split[k][0][0]+'.'+author_name_split[k][1][0]+'. '+
                                                  ' '.join(author_name_split[k][2:]))   #取第一个和第二个元素的首字母，之间用符号'.'连接，以及第三个元素的全部，之间用符号'. '连接，此处是符号和空格
                        #print(article[3],author_name_modify)#形如：['J.L. Herington', 'D.R. Glore']
                    else:
                        author_name_modify.append('')   #否则删除
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [] #再次清空
                for k in range(len(author_name_split)):
                    if len(author_name_split[k]) > 2:#如果author_name_split第k+1项元素的个数大于2，那么取每个单词的首字母，并以符号'. '连接
                        author_name_modify.append(author_name_split[k][0][0]+'. '+author_name_split[k][1][0]+'. '+
                                                  ' '.join(author_name_split[k][2:]))#取第一个和第二个元素的首字母，以及第三个元素的全部,之间用符号'. '连接，符号和空格
                        #print(article[3],author_name_modify)# 形如：['J. L. Herington', 'D. R. Glore']
                    else:
                        author_name_modify.append('')
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split]#取每个单词首字母
                #print(article[3],author_name_modify)    #形如： ['JLH', 'DRG']
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [' '.join(n[:-1])+' '+n[-1][0] for n in author_name_split] #从第一个到倒数第二个元素用空格连接，并取最后一个单词的首字母
                #print(article[3],author_name_modify)   #形如：['Jennifer L. H', 'Dana R. G']
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[-1] for n in author_name_split]#取最后一个单词，形如：['Herington', 'Glore']
                #print(article[3],author_name_modify)
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [] #再次清空列表author_name_modify
                for k in range(len(author_name_split_hyphen)):  #如果author_name_split第k+1项元素的个数大于2，那么取第一个单词首字母并加上符号'.'
                    if len(author_name_split_hyphen[k]) > 2:    #取第二个单词首字母并加上符号'.'和空格，以及剩下的所有单词
                        author_name_modify.append(author_name_split_hyphen[k][0][0]+'.'+
                                                  author_name_split_hyphen[k][1][0]+'. '+
                                                  ' '.join(author_name_split_hyphen[k][2:]))
                    else:
                        author_name_modify.append('')
                #print(article[3],author_name_modify)    #形如：['J.L. Herington', 'D.R. Glore']
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_hyphen = key_author_name_hyphen.sub('', equal_name_list[k])  #删除equal_name_list中的连字符
                    for l in range(len(author_name)):
                        if equal_name_hyphen.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = []
                for k in range(len(author_name_split_hyphen)):  #如果author_name_split第k+1项元素的个数大于2，那么取其第一个单词首字母加上符号'.'和空格，
                    if len(author_name_split_hyphen[k]) > 2:    #第二个单词首字母加上符号'.'和空格，以及剩下的所有单词
                        author_name_modify.append(author_name_split_hyphen[k][0][0]+'. '+
                                                  author_name_split_hyphen[k][1][0]+'. '+
                                                  ' '.join(author_name_split_hyphen[k][2:]))
                    else:
                        author_name_modify.append('')
                #print(article[3],author_name_modify)    #形如：['J. L. Herington', 'D. R. Glore', 'K. L. Bruner Tran']
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程，但是增加了一次匹配过程，用于删除其中的连字符
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_hyphen = key_author_name_hyphen.sub('', equal_name_list[k])#因为上一步中author_name_modify是由author_name_split_hyphen匹配而来的，已经去除掉连字符了
                    #print(article[3],equal_name_hyphen) #形如：K.L.B.T.
                    for l in range(len(author_name)):
                        if equal_name_hyphen.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [' '.join(n) for n in author_name_split]   #将原先被分开的单词重新组合
                #print(article[3],author_name_modify)    #形如：['Ya-Jing Tan', 'Yun Xiong']
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue        #key_author_name_split是以空白字符为分界；取第一个分组的首字母，并加上符号'.'和空格
                    equal_name_temp = key_author_name_split.split(equal_name_list[k])[0][0] + '. ' + \
                                      ' '.join(key_author_name_split.split(equal_name_list[k])[1:])     #取从第二个到最后的所有分组，例如：'Y.-J. T.'变成'Y. T.'
                    #print(article[3],equal_name_temp)
                    for l in range(len(author_name)):
                        if equal_name_temp.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]   #将相同的那项以列表形式代替equal_name_list的第k+1项
                            break
                if judge_complete():
                    continue
                author_name_modify = ['.'.join([m[0] for m in n])+'.' for n in author_name_split_hyphen]#author_name_split_hyphen形如:['Ya', 'Jing', 'Tan'], ['Yun', 'Xiong']
                #print(article[3],author_name_modify)#取每个分组的首字母，用符号'.'连接，并在该分组中以符号'.'结尾
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_temp = re.sub('-', '', equal_name_list[k])#删除equal_name_list项中的连字符，如原本的：H.-K.A.
                    #print(article[3],equal_name_list)   #处理后的equal_name_temp形如：H.K.A.（因为equal_name_temp处理的是equal_name_list[k]，单个的某项）
                    for l in range(len(author_name)):
                        if equal_name_temp.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = ['.'.join([m[0] for m in n]) for n in author_name_split]#取author_name_split每个分组的首字母，用符号'.'连接
                #print(article[3],author_name_modify)   #形如：['Y.T', 'Y.X']
                for k in range(len(equal_name_list)):   #重复前一个循环的比较过程，我将其称为modify过程。
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_temp = re.sub('-', '', equal_name_list[k])
                    for l in range(len(author_name)):
                        if equal_name_temp.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split_hyphen]#取author_name_split_hyphen每个分组的首字母
                #print(article[3],author_name_modify)
                for k in range(len(equal_name_list)):#重复前一个循环的比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_temp = re.sub('-', '', equal_name_list[k])
                    for l in range(len(author_name)):
                        if equal_name_temp.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                key_author_name_temp = [re.compile(n[-1]) for n in author_name_split]   #匹配表达式，取每个分组的最后一个字符串
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if key_author_name_temp[l].search(equal_name_list[k]):  #如果能在equal_name_list中第k+1项能被第l+1个匹配式匹配到，就将author_name第l+1项赋给equal_name_list第k+1项
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                key_author_name_temp = [re.compile(n[0]) for n in author_name_split] #匹配表达式，取每个分组的第一个字符串
                for k in range(len(equal_name_list)):   #重复上一比较过程
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if key_author_name_temp[l].search(equal_name_list[k]):
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                if equal_name_list[0] in ['T.C-D.', ['Roy Phitayakorn'], 'CDA', 'ASm', 'H.-P.K',
                                          ['Francesca Moro'], 'Y.-J. T.', 'A.M.D.A.', 'LK.M.', ['Andras Hoffman'],
                                          'K.R.', 'M.dC.V.H.', 'Y-G.K.', ['Scott J. Robbie'], ['Seung Hoon Woo'],
                                          'M.S.M', 'C.J.C.T.', ['Klaas J. Wardenaar'], 'L.-Q. X.',
                                          ['Massimiliano Fusaro'], ['Oliver Husser'], ['Icela Palma'], 'W-M.L',
                                          'program. The project']:
                    author_equal.append(author_name[:2])  #如果equal_name_list的第一项在以上集合中，那么把author_name的前两个元素加入author_equal中
                    continue
                if equal_name_list[0] in ['M-T.M-G', ["Anthony V. D'Amico"], 'MC', 'CAZ', ['Arne Östman'],
                                          'J.J.V.P.']:   # 如果equal_name_list的第一项在以上集合中
                    author_equal.append(author_name[-2:]) #那么把author_name的最后两个元素加入author_equal中
                    continue
                if equal_name_list[0] in ['MH']:        #如果equal_name_list的第一项在以上集合中
                    author_equal.append(author_name[1:3]) #那么把author_name的第二个和第三个元素加入author_equal中
                    continue
                if equal_name_list[0] in [['Chunsheng Liu']]:   #如果equal_name_list的第一项在以上集合中，
                    author_equal.append(author_name[:3])  # 那么把author_name的前三个元素加入author_equal中
                    continue
                if equal_name_list[0] in [['Leonidas Chouliaras']]:#如果equal_name_list的第一项在以上集合中，
                    author_equal.append(author_name[:2]+author_name[-2:])#那么把author_name的前两个和最后两个元素加入author_equal中
                    continue
                if equal_name_list[0] in [['Karin Hek']]:  #如果equal_name_list的第一项在以上集合中
                    author_equal.append(author_name[:5])     #那么把author_name的前五个元素加入author_equal中
                    continue
                if equal_name_list[0] in [['Cornelia M. van Duijn']]: #如果equal_name_list的第一项在以上集合中
                    author_equal.append(author_name[-8:])      #那么把author_name的最后八个元素加入author_equal中
            for j in range(len(author_equal)):
                temp = []
                for line in author_equal[j]:  #将author_equal中的每一项加进列表temp中
                    if not line in temp:
                        temp.append(line)
                if len(temp) == 1 and not author_name[0] in temp:  #如果列表中只有一个元素，并且author_name中的第一个元素不在temp中
                    temp.append(author_name[0])                     #则将author_name中的第一个元素加进列表temp中
                author_equal[j] = temp
            article_index = article[3] #article_index：文章编号
            article_type = article[4]  #article_type： 文章类型
            article_title = article[5]  #article_title： 文章名称
            article_doi = article[2]    #article_doi：文章doi值
            article_info.append((author_name, author_corr, author_equal, article_index, article_type, article_title,
                                article_doi)) #将所有作者集合、通讯作者集合、等同作者集合、文章编号、文章类型、文章名称、文章doi值按序加进列表article_info中
        pickle_write(article_info, os.path.join(folder, name_article_info))
mmdaphne
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
get_article_info

import os
import re
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_info():
    print('Start getting info...')
复制链接

扫一扫