import os
import re
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write
def get_info():
    """Run every extraction stage of the pipeline, in order."""
    print('Start getting info...')
    pipeline = (
        # Report every downloaded article page (e.g. 0001-8708\article\j.html)
        # in which no DOI can be found.
        journal_filter,
        # Build 'article_type' from the article lists: each section label
        # together with the number of articles filed under it.
        get_article_type,
        # Sort the section labels into categories, classify every article as
        # Paper / Review / Letter (dropping non-papers) and store
        # (index, type, title) in 'article_select'.
        article_filter,
        # Scrape author entries, footnotes and the DOI from each selected
        # article page into 'author_list'.
        get_author_list,
        # Turn the scraped markup into the final 'article_info' records.
        get_article_info,
    )
    for stage in pipeline:
        stage()
def journal_filter():
    """Report every downloaded article page that contains no DOI.

    For each journal directory returned by ``get_dir_list()`` the pickled
    ``article_list`` is read only to learn how many pages exist; the pages
    ``article/0.html`` ... ``article/<n-1>.html`` are then scanned for the
    ``SDM.doi = '...'`` assignment.  Pages without one are printed together
    with the index of the journal directory.  Nothing is returned.
    """
    # 'article_list' is produced elsewhere (the original note says by
    # get_article_list() in get_article.py); each entry holds, in order, the
    # article's relative URL, section, type and title.
    name_article_list = 'article_list'
    name_article_dir = 'article'
    key_doi = re.compile(r'SDM.doi = \'(.*?)\'')
    dir_list = get_dir_list()
    for i, journal_dir in enumerate(dir_list):
        # e.g. reads 0001-8708\article_list
        article_list = pickle_read(os.path.join(journal_dir, name_article_list))
        # e.g. 0001-8708\article
        folder = os.path.join(journal_dir, name_article_dir)
        for j in range(len(article_list)):
            page_path = os.path.join(folder, '{}.html'.format(j))
            # The context manager closes the file even if reading fails.
            with open(page_path, mode='r', encoding='utf-8') as f:
                page = f.read()
            if not key_doi.search(page):
                # Backslashes are written explicitly; the printed text is
                # '\<folder>\<j>.html', exactly as before.
                print(i, '\\{}\\{}.html'.format(folder, j))
def get_article_type():
    """Count, over all journals, how many articles fall under each section.

    Reads ``article_list`` from every directory returned by
    ``get_dir_list()`` and tallies ``article[1]`` (the section the article is
    listed under).  The result, a list of ``(section, count)`` tuples in
    first-seen order, is stored with ``pickle_write`` under ``article_type``
    and also written as tab-separated text to ``article_type.txt``.
    For journal 0001-8708 the original notes show content such as
    ``('', 335), ('Corrigendum', 1), ('Erratum', 1)``.
    """
    # Each article_list entry holds, in order: relative URL, section, type,
    # title.
    name_article_list = 'article_list'
    name_article_type = 'article_type'
    name_article_type_list = 'article_type.txt'
    dir_list = get_dir_list()
    type_list = []    # sections in the order they were first seen
    type_count = {}   # section -> number of articles
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_list = pickle_read(os.path.join(journal_dir, name_article_list))
        for article in article_list:
            section = article[1]
            if section not in type_count:
                type_list.append(section)
                type_count[section] = 0
            type_count[section] += 1
    article_type = [(section, type_count[section]) for section in type_list]
    article_type_list = ['{}\t{}\n'.format(section, count)
                         for section, count in article_type]
    pickle_write(article_type, name_article_type)
    with open(name_article_type_list, mode='w', encoding='utf-8') as f:
        f.writelines(article_type_list)
def article_filter():
    """Classify every article as Paper, Review or Letter and drop the rest.

    Step 1: the section labels stored in ``article_type`` are sorted into
    four groups by keyword: *except* (e.g. "Editor's choice"), *exclude*
    (editorials, errata, book reviews, ...), *letter* and *review*.

    Step 2: for each journal directory, each entry of ``article_list``
    (relative URL, section, type, title) is classified:

    * a non-empty type decides directly: 'Original Research Article' is a
      Paper, anything else a Review;
    * otherwise a section in the *except* group is a Paper;
    * a section in the *exclude* group, or a title matching the exclude
      keywords, removes the article;
    * a *letter* section or title keyword gives Letter;
    * a *review* section or title keyword gives Review;
    * everything left is a Paper.

    The kept articles are written per journal to ``article_select`` as
    ``(index, kind, title)`` tuples, e.g.
    ``(4, 'Paper', 'Genetic and Epigenetic Regulation of Human lincRNA Gene
    Expression')``.  After each journal the running (cumulative) Letter,
    Review and Paper totals are printed.
    """
    name_article_type = 'article_type'      # written by get_article_type()
    name_article_list = 'article_list'
    name_article_select = 'article_select'
    key_except = re.compile(r'editor.*?choice', re.I)
    key_exclude = re.compile(r'editor|book|news|acknowledgment|acknowledgement|education|retraction|erratum|'
                             r'introduction|in this issue|feature|foreword|topic|response|reply|comment|'
                             r'index|content|abstract|highlight|obituary|announcement|guideline|\sview|^view|list|'
                             r'presentation|survey|summary|correction|abbreviation', re.I)
    key_letter = re.compile(r'letter|correspondence', re.I)
    # The pattern used to start with an empty alternative ('|reveiw|insight'),
    # which matches every string, so the default-to-Paper branch below could
    # never be reached; "review" was also misspelled.
    key_review = re.compile(r'review|insight', re.I)

    type_except = []
    type_exclude = []   # e.g. 'Book reviews', "Editors' Acknowledgement", 'Commentary'
    type_letter = []    # e.g. 'Correspondence'
    type_review = []
    for entry in pickle_read(name_article_type):
        section = entry[0]
        # Order matters: "Editor's choice" must be caught before the generic
        # 'editor' exclude keyword.
        if key_except.search(section):
            type_except.append(section)
        elif key_exclude.search(section):
            type_exclude.append(section)
        elif key_letter.search(section):
            type_letter.append(section)
        elif key_review.search(section):
            type_review.append(section)

    def classify(article):
        """Return 'Paper', 'Review', 'Letter', or None to drop the article."""
        section, declared_type, title = article[1], article[2], article[3]
        if declared_type:
            if declared_type == 'Original Research Article':
                return 'Paper'
            return 'Review'
        if section in type_except:
            return 'Paper'
        # The title is checked as well: e.g. '2012 William Allan Award
        # Introduction: Uta Francke' is dropped because of 'Introduction'.
        if section in type_exclude or key_exclude.search(title):
            return None
        if section in type_letter or key_letter.search(title):
            return 'Letter'
        if section in type_review or key_review.search(title):
            return 'Review'
        return 'Paper'

    # Totals accumulate across journals; they are not reset per journal.
    counts = {'Letter': 0, 'Review': 0, 'Paper': 0}
    dir_list = get_dir_list()
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_list = pickle_read(os.path.join(journal_dir, name_article_list))
        article_select = []
        for j, article in enumerate(article_list):
            kind = classify(article)
            if kind is None:
                continue
            article_select.append((j, kind, article[3]))
            counts[kind] += 1
        pickle_write(article_select, os.path.join(journal_dir, name_article_select))
        print(journal_dir, counts['Letter'], counts['Review'], counts['Paper'])
def get_author_list():
    """Scrape author markup, footnotes and DOI from each selected article.

    For every journal directory the ``article_select`` list written by
    ``article_filter()`` is read; each entry is ``(index, kind, title)``.
    The page ``article/<index>.html`` is searched for

    * author entries (``<li>`` items holding an ``authorName`` link or an
      ``authorDegrees`` span),
    * footnotes (``<dl class="footnote" ...>`` elements; the original notes
      say these carry equal-contribution statements, addresses, etc.),
    * the DOI from ``SDM.doi = '...'``.

    One tuple ``(author_items, footnotes, doi, index, kind, title)`` per
    article is collected and stored with ``pickle_write`` as ``author_list``
    in the journal directory.

    Raises:
        AttributeError: if a selected page contains no DOI assignment
            (``journal_filter()`` lists such pages).
    """
    name_article_dir = 'article'
    name_article_select = 'article_select'
    name_author_list = 'author_list'
    key_author = re.compile(r'<li><a href="#" class="authorName.*?</li>|<li><span class="authorDegrees">.*?</li>')
    key_equal = re.compile(r'<dl class="footnote".*?</dl>')
    key_doi = re.compile(r'SDM.doi = \'(.*?)\'')
    dir_list = get_dir_list()
    for i, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_select = pickle_read(os.path.join(journal_dir, name_article_select))
        folder = os.path.join(journal_dir, name_article_dir)  # e.g. 0002-9297\article
        author_list = []
        for article in article_select:
            # article[0] is the article index, which is also the file name.
            page_path = os.path.join(folder, '{}.html'.format(article[0]))
            # The context manager closes the file even if reading fails.
            with open(page_path, mode='r', encoding='utf-8') as f:
                page = f.read()
            # e.g. '<li><a href="#" class="authorName S_C_authorName" ...'
            author_all = key_author.findall(page)
            # e.g. '<dl class="footnote" id="fn1" ...><p>These authors
            # contributed equally to this work</p></dd></dl>'
            author_equal = key_equal.findall(page)
            # e.g. 10.1016/j.ajhg.2013.05.001
            article_doi = key_doi.search(page).groups()[0]
            author_list.append((author_all, author_equal, article_doi,
                                article[0], article[1], article[2]))
        pickle_write(author_list, os.path.join(journal_dir, name_author_list))
# --- Patterns used by get_article_info() and its helpers --------------------
_AUTHOR_NAME_RE = re.compile(r'<a href="#" class="authorName.*?>(.*?)</a>')
_WHITESPACE_RE = re.compile(r'\s')
_WHITESPACE_OR_HYPHEN_RE = re.compile(r'\s|-')
_CORRESPONDING_RE = re.compile(r'Corresponding author')
_EQUAL_RE = re.compile(r'contributed equally')
# Sentence end: a lower-case word followed by '. ', or ').' followed by space.
# e.g. in 'abstracted data. M.A.' the piece ' data. ' is the separator.
_SENTENCE_END_RE = re.compile(r'\s[^A-Z\s\.]*?\.\s|\)\.\s')
# Footnote label, i.e. the text of the back-reference link (e.g. '3').
_LABEL_RE = re.compile(r'class="intra_ref">(.*?)</a>')
_LABEL_APPENDIX_RE = re.compile(r'Appendix 1')
_LABEL_INNER_RE = re.compile(r'<.*?>(.*?)<.*?>')   # text between two tags
_LABEL_AND_RE = re.compile(r' and ')
_SINGLE_AUTHOR_RE = re.compile(r'This author|The author|Co-first author')
# Text that leads up to (and includes) 'contributed equally' inside a tag.
_EQUAL_PHRASE_RE = re.compile(r'>[\s\)\]\.,;:]*([^<]*?contributed equally)')
_EQUAL_TAIL_RE = re.compile(r'\s*contributed equally')
# Filler words stripped from the subject before it is split into names.
_FILLER_RE = re.compile(r' have$|^and has.*Merck. |,$|^As joint first authors, |^Author contributions: |'
                        r' performed .*?$|^Author |^Authors |^Both | both$| equally$|^Note: Both |'
                        r' as co-corresponding author$| are joint first authors and$|, these authors have|'
                        r', MD|, PhD|^Professors |^Drs |^Drs. |^The authors | are co-first authors and$')
_NAME_SEPARATOR_RE = re.compile(r',\sand\s|,\s|\sand\s|\s&\s')

# Subjects that designate authors by position instead of by name, tried in
# this order: (pattern, function selecting the authors from the name list).
_GROUP_PHRASES = (
    (re.compile(r'All authors|All of the authors|All the authors|^Authors$|^The authors$|^The authors do|'
                r'^The three institutions|^The Tsimikas|These authors|^Northeast Normal University|'
                r'^These author$'),
     lambda names: names[:]),
    (re.compile(r'Both authors|Both first authors|The 1st 2 authors|The first 2 authors|'
                r'The first two authors'),
     lambda names: names[:2]),
    (re.compile(r'The first 3 authors|The first three authors|the first three authors'),
     lambda names: names[:3]),
    (re.compile(r'The last 2 authors|The last two authors'),
     lambda names: names[-2:]),
    (re.compile(r'The last 3 authors'),
     lambda names: names[-3:]),
    (re.compile(r'The last four authors'),
     lambda names: names[-4:]),
    (re.compile(r'Second and third authors'),
     lambda names: names[1:3]),
)

# Hand-maintained fallbacks for sentences the automatic matching cannot
# resolve.  The key is the first entry of the name list after matching: a
# plain string if it stayed unresolved, a one-element list if it was resolved.
_MANUAL_GROUPS = (
    (['T.C-D.', ['Roy Phitayakorn'], 'CDA', 'ASm', 'H.-P.K',
      ['Francesca Moro'], 'Y.-J. T.', 'A.M.D.A.', 'LK.M.', ['Andras Hoffman'],
      'K.R.', 'M.dC.V.H.', 'Y-G.K.', ['Scott J. Robbie'], ['Seung Hoon Woo'],
      'M.S.M', 'C.J.C.T.', ['Klaas J. Wardenaar'], 'L.-Q. X.',
      ['Massimiliano Fusaro'], ['Oliver Husser'], ['Icela Palma'], 'W-M.L',
      'program. The project'],
     lambda names: names[:2]),
    (['M-T.M-G', ["Anthony V. D'Amico"], 'MC', 'CAZ', ['Arne Östman'],
      'J.J.V.P.'],
     lambda names: names[-2:]),
    (['MH'],
     lambda names: names[1:3]),
    ([['Chunsheng Liu']],
     lambda names: names[:3]),
    ([['Leonidas Chouliaras']],
     lambda names: names[:2] + names[-2:]),
    ([['Karin Hek']],
     lambda names: names[:5]),
    ([['Cornelia M. van Duijn']],
     lambda names: names[-8:]),
)


def _initials(tokens, joiner='', tail=''):
    """Join the first letter of every token, e.g. ['Ya', 'Tan'] -> 'Y.T.'."""
    return joiner.join([token[0] for token in tokens]) + tail


def _two_initials(tokens, gap):
    """Return e.g. 'J.L. Herington'; '' when there are fewer than 3 tokens."""
    if len(tokens) > 2:
        return tokens[0][0] + gap + tokens[1][0] + '. ' + ' '.join(tokens[2:])
    return ''


def _unchanged(name):
    """Return *name* as is (used when a footnote name needs no rewriting)."""
    return name


def _drop_hyphens(name):
    """Remove every hyphen, e.g. 'H.-K.A.' -> 'H.K.A.'."""
    return name.replace('-', '')


def _abbreviate_first_word(name):
    """Cut the first word down to its initial, e.g. 'Y.-J. T.' -> 'Y. T.'."""
    words = _WHITESPACE_RE.split(name)
    return words[0][0] + '. ' + ' '.join(words[1:])


# Ordered matching passes.  Each pass rewrites every author name into one
# spelling variant and compares it (case-insensitively) with the still
# unresolved footnote names:
#   (use the hyphen-split tokens?, variant builder, footnote-name rewriter)
_NAME_PASSES = (
    (False, lambda t: ' '.join(t), _unchanged),                        # 'Ya-Jing Tan'
    (False, lambda t: t[-1] + ' ' + ' '.join(t[:-1]), _unchanged),     # 'Tan Ya-Jing'
    (False, lambda t: t[0][0] + '. ' + ' '.join(t[1:]), _unchanged),   # 'Y. Tan'
    (False, lambda t: t[0][0] + ' ' + ' '.join(t[1:]), _unchanged),    # 'Y Tan'
    (False, lambda t: _initials(t, '.', '.'), _unchanged),             # 'Y.T.'
    (False, lambda t: _initials(t, '. ', '.'), _unchanged),            # 'Y. T.'
    (False, lambda t: _initials(t), _unchanged),                       # 'YT'
    (False, lambda t: t[0] + t[-1][0] + '.', _unchanged),              # 'Ya-JingT.'
    (False, lambda t: _two_initials(t, '.'), _unchanged),              # 'J.L. Herington'
    (False, lambda t: _two_initials(t, '. '), _unchanged),             # 'J. L. Herington'
    (False, lambda t: ' '.join(t[:-1]) + ' ' + t[-1][0], _unchanged),  # 'Jennifer L. H'
    (False, lambda t: t[-1], _unchanged),                              # 'Herington'
    (True, lambda t: _two_initials(t, '.'), _drop_hyphens),
    (True, lambda t: _two_initials(t, '. '), _drop_hyphens),
    (False, lambda t: ' '.join(t), _abbreviate_first_word),
    (True, lambda t: _initials(t, '.', '.'), _drop_hyphens),           # 'Y.J.T.'
    (False, lambda t: _initials(t, '.'), _drop_hyphens),               # 'Y.T'
    (True, lambda t: _initials(t), _drop_hyphens),                     # 'YJT'
)


def _equal_phrase(footnote):
    """Return the footnote text that ends in 'contributed equally'."""
    return _EQUAL_PHRASE_RE.search(footnote).groups()[0]


def _split_sentences(text):
    """Split *text* at sentence ends, keeping each terminator attached."""
    pieces = _SENTENCE_END_RE.split(text)
    for k, terminator in enumerate(_SENTENCE_END_RE.findall(text)):
        pieces[k] += terminator
    return pieces


def _authors_marked(label, author_items, author_name):
    """Return the names of the authors whose markup carries ``<sup>label</sup>``.

    The label is compared literally.  The label '**' also accepts a single
    '*' superscript.
    """
    if label == '**':
        needles = ('<sup>**</sup>', '<sup>*</sup>')
    else:
        needles = ('<sup>' + label + '</sup>',)
    return [name for item, name in zip(author_items, author_name)
            if any(needle in item for needle in needles)]


def _match_equal_names(equal_names, author_name, name_tokens, name_tokens_hyphen):
    """Resolve footnote names to full author names, in place.

    Every resolved entry of *equal_names* is replaced by a one-element list
    holding the full author name; unresolved entries stay strings.  Matching
    stops as soon as every entry is resolved.

    Returns:
        True if all entries were resolved, False otherwise.
    """
    def all_resolved():
        return not any(isinstance(entry, str) for entry in equal_names)

    for hyphen_split, build, rewrite in _NAME_PASSES:
        tokens = name_tokens_hyphen if hyphen_split else name_tokens
        # Built lazily, pass by pass, so a later variant that cannot be built
        # for some name never matters once an earlier pass has succeeded.
        variants = [build(t).lower() for t in tokens]
        for k, entry in enumerate(equal_names):
            if isinstance(entry, list):
                continue
            wanted = rewrite(entry).lower()
            for l, variant in enumerate(variants):
                if wanted == variant:
                    equal_names[k] = [author_name[l]]
                    break
        if all_resolved():
            return True

    # Last resort: the author's last word, then first word, occurring
    # anywhere inside the footnote name (literal, case-sensitive).
    for position in (-1, 0):
        needles = [t[position] for t in name_tokens]
        for k, entry in enumerate(equal_names):
            if isinstance(entry, list):
                continue
            for l, needle in enumerate(needles):
                if needle in entry:
                    equal_names[k] = [author_name[l]]
                    break
        if all_resolved():
            return True
    return False


def _authors_named_in(sentence, author_name, name_tokens, name_tokens_hyphen):
    """Return the authors a '... contributed equally' sentence refers to.

    Returns a list of author names, or None when the sentence cannot be
    resolved by position phrases, name matching or the manual table.
    """
    # e.g. 'T.M. and A.M. contributed equally' -> 'T.M. and A.M.'
    subject = _EQUAL_TAIL_RE.sub('', sentence)
    for pattern, pick in _GROUP_PHRASES:
        if pattern.search(subject):
            return pick(author_name)
    # e.g. 'Authors H.S.S. and Y.-M.L.' -> ['H.S.S.', 'Y.-M.L.']
    equal_names = _NAME_SEPARATOR_RE.split(_FILLER_RE.sub('', subject))
    if _match_equal_names(equal_names, author_name, name_tokens, name_tokens_hyphen):
        return [entry[0] for entry in equal_names]
    for keys, pick in _MANUAL_GROUPS:
        if equal_names[0] in keys:
            return pick(author_name)
    return None


def _dedupe_group(group, first_author):
    """Drop repeated names; pair a lone author with the first author."""
    unique = []
    for name in group:
        if name not in unique:
            unique.append(name)
    if len(unique) == 1 and first_author not in unique:
        unique.append(first_author)
    return unique


def _article_record(article):
    """Build one article_info tuple from one author_list entry.

    *article* is ``(author_items, footnotes, doi, index, kind, title)`` as
    written by get_author_list().  Returns None when no author name can be
    extracted, otherwise
    ``(author_name, author_corr, author_equal, index, kind, title, doi)``.
    """
    author_items, footnotes = article[0], article[1]
    # e.g. ['Xing Hua', 'Haiming Xu', 'Yaning Yang', ...]
    author_name = [_AUTHOR_NAME_RE.search(item).groups()[0] for item in author_items]
    if not author_name:
        return None
    # e.g. [['Xing', 'Hua'], ...] and, hyphen-split,
    # 'Moeenaldeen\xa0D. Al-Sayed' -> ['Moeenaldeen', 'D.', 'Al', 'Sayed']
    name_tokens = [_WHITESPACE_RE.split(name) for name in author_name]
    name_tokens_hyphen = [_WHITESPACE_OR_HYPHEN_RE.split(name) for name in author_name]

    author_corr = [name for item, name in zip(author_items, author_name)
                   if _CORRESPONDING_RE.search(item)]

    author_equal = []   # groups of authors stated to have contributed equally
    labelled = []       # (footnote label, footnote) pairs to look up by superscript
    phrases = []        # '... contributed equally' texts to resolve by wording
    for footnote in footnotes:
        if not _EQUAL_RE.search(footnote):
            continue
        label_match = _LABEL_RE.search(footnote)
        if label_match is None:
            phrases.append(_equal_phrase(footnote))
            continue
        label = label_match.groups()[0]
        if _LABEL_APPENDIX_RE.search(label):
            phrases.append(_equal_phrase(footnote))
        elif _LABEL_AND_RE.search(label):
            # One footnote shared by several labels, e.g. 'a and b'.
            labelled += [(part, footnote) for part in _LABEL_AND_RE.split(label)]
        else:
            inner = _LABEL_INNER_RE.search(label)
            labelled.append((inner.groups()[0] if inner else label, footnote))

    for label, footnote in labelled:
        # e.g. label '3' -> ['Song-Ro Yoon', 'Soo-Kung Choi']
        marked = _authors_marked(label, author_items, author_name)
        if len(marked) > 1:
            author_equal.append(marked)
        elif marked and _SINGLE_AUTHOR_RE.search(footnote):
            # A single marked author forms a group of their own here;
            # _dedupe_group() later pairs them with the first author.
            author_equal.append(marked)
        else:
            # No usable superscript: fall back to the footnote wording.
            phrases.append(_equal_phrase(footnote))

    for phrase in phrases:
        for sentence in _split_sentences(phrase):
            if not _EQUAL_RE.search(sentence):
                continue
            group = _authors_named_in(sentence, author_name, name_tokens, name_tokens_hyphen)
            if group is not None:
                author_equal.append(group)

    first_author = author_name[0]
    author_equal = [_dedupe_group(group, first_author) for group in author_equal]
    return (author_name, author_corr, author_equal,
            article[3],   # article index
            article[4],   # article kind
            article[5],   # article title
            article[2])   # article DOI


def get_article_info():
    """Derive author, corresponding-author and equal-contribution data.

    For every journal directory returned by ``get_dir_list()`` the
    ``author_list`` written by ``get_author_list()`` is read and each entry is
    converted to
    ``(author_name, author_corr, author_equal, index, kind, title, doi)``:

    * ``author_name``: names taken from the ``authorName`` links;
    * ``author_corr``: authors whose markup mentions 'Corresponding author';
    * ``author_equal``: list of author groups named by footnotes that contain
      'contributed equally'.  Groups are found through footnote superscripts,
      positional wording ('The first two authors', ...), name/initial
      matching, and finally a hand-maintained table.

    Entries without any author name are skipped.  The records are stored
    with ``pickle_write`` as ``article_info`` in the journal directory.
    """
    name_author_list = 'author_list'
    name_article_info = 'article_info'
    dir_list = get_dir_list()
    for i, folder in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        print(folder)
        author_list = pickle_read(os.path.join(folder, name_author_list))
        article_info = []
        for article in author_list:
            record = _article_record(article)
            if record is not None:
                article_info.append(record)
        pickle_write(article_info, os.path.join(folder, name_article_info))
# NOTE(review): this is a bare reference to the function, not a call, so
# evaluating it has no effect. Presumably a call such as `get_info()` was
# intended here, or this line is leftover page text -- confirm before changing.
get_article_info
# [Web-page residue, not part of the program] Latest recommended article published on 2021-10-09 17:07:51