# main.py
# Pipeline entry point; the remaining stages are commented out and can be
# enabled one at a time once the previous stage has finished.
from defined_function.convert_data import convert_data
from defined_function.data_initial import data_initial
from defined_function.get_article import get_article
from defined_function.get_info import get_info
from defined_function.get_issue import get_issue
from defined_function.get_journal import get_journal
DATA_DIR = 'Data'
data_initial(DATA_DIR)
# get_journal()
# get_issue()
# get_article()
# get_info()
# convert_data()
# defined_function/data_initial.py
import os


def data_initial(path):
    # Create the data directory if it does not exist, then work inside it.
    if not os.path.isdir(path):
        os.makedirs(path)
    os.chdir(path)
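# The defined_class.website.Website and defined_class.website_multi.WebsiteMulti
# helpers are imported throughout this pipeline but are not part of this
# listing. The sketch below is a minimal, hypothetical reconstruction of the
# interface the call sites rely on; the base URL, the urllib usage, and the
# failure-log format are assumptions, not the original implementation.
import os
import urllib.request

BASE_URL = 'http://www.sciencedirect.com'  # assumed host; sub-URLs are relative paths


class Website:
    def __init__(self, sub_url):
        self.url = BASE_URL + sub_url

    def get(self):
        # Fetch the page and return its body as text.
        with urllib.request.urlopen(self.url) as response:
            return response.read().decode('utf-8', errors='replace')


class WebsiteMulti:
    def __init__(self, sub_url_list, out_dir, fail_file, index):
        self.sub_url_list = sub_url_list
        self.out_dir = out_dir
        self.fail_file = fail_file
        self.index = index  # caller's loop counter, used here only for logging

    def get(self):
        # Download each page to '<out_dir>/<i>.html', appending failures
        # to fail_file instead of raising.
        for i, sub_url in enumerate(self.sub_url_list):
            try:
                page = Website(sub_url).get()
            except OSError:
                with open(self.fail_file, mode='a', encoding='utf-8') as f:
                    f.write('{}\t{}\t{}\n'.format(self.index, i, sub_url))
                continue
            with open(os.path.join(self.out_dir, '{}.html'.format(i)), mode='w', encoding='utf-8') as f:
                f.write(page)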
# defined_function/get_journal.py
import os
import re
import shutil
from defined_class.website import Website
from defined_class.website_multi import WebsiteMulti
from defined_function.read_data_list import read_data_list
from defined_function.pickle_var_file import pickle_read, pickle_write
def get_journal():
    get_journal_list()
    get_journal_page()


def get_journal_list():
    def judge_sci(issn):
        # True if the ISSN appears in the SCI journal list.
        for line in sci_list:
            if issn == line:
                return True
        return False

    print('Start getting journals...')
    name_sci_list = 'sci_list.txt'
    name_journal_list = 'journal_list'
    key_journal = re.compile(r'<span><a href=(.*?)</a></span>')
    key_journal_name = re.compile(r'>(.*?)$')
    key_journal_url = re.compile(r'"(.*?)"')
    # Drop journals whose names contain stray characters or non-English words.
    key_journal_delete = re.compile(r'[^A-Za-z,&:+*\(\)/\-\' ]| de | des | et | en | voor | der | i | d\'|^De |^Des |'
                                    r'^Et |^En |^Voor |^Der |^D\'|^de |^des |^et |^en |^voor |^der |^d\'| de$| des$|'
                                    r' et$| en$| voor$| der$')
    sub_url = '/science/browsescroll/journals/all/begidx/50/rwpos/0'
    website = Website(sub_url)
    page = website.get()
    sci_list = read_data_list(name_sci_list)
    journal_list_all = key_journal.findall(page)
    journal_list = []
    for journal in journal_list_all:
        journal_url = key_journal_url.search(journal).groups()[0]
        journal_name = key_journal_name.search(journal).groups()[0]
        # The ISSN is encoded in the last eight characters of the URL.
        journal_issn = journal_url[-8:-4] + '-' + journal_url[-4:]
        if judge_sci(journal_issn) and not key_journal_delete.search(journal_name):
            journal_list.append((journal_url, journal_issn, journal_name))
    pickle_write(journal_list, name_journal_list)
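# Worked example of the ISSN slicing above; the URL shape is a hypothetical
# value consistent with journal_url[-8:-4] and journal_url[-4:].
journal_url_example = '/science/journal/00220248'
assert journal_url_example[-8:-4] + '-' + journal_url_example[-4:] == '0022-0248'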
def get_journal_page():
    name_temp_dir = 'temp'
    name_journal_list = 'journal_list'
    name_journal_info = 'journal_info.txt'
    name_journal_page = 'journal.html'
    journal_list = pickle_read(name_journal_list)
    if os.path.isdir(name_temp_dir):
        shutil.rmtree(name_temp_dir)
    os.mkdir(name_temp_dir)
    journal_url_list = [n[0] for n in journal_list]
    website_multi = WebsiteMulti(journal_url_list, name_temp_dir, os.path.join(name_temp_dir, 'fail_journal.txt'), 0)
    website_multi.get()
    for i in range(len(journal_list)):
        # Move each downloaded page into a folder named after the ISSN.
        if not os.path.isdir(journal_list[i][1]):
            os.mkdir(journal_list[i][1])
        shutil.move(os.path.join(name_temp_dir, '{}.html'.format(i)),
                    os.path.join(journal_list[i][1], name_journal_page))
        with open(os.path.join(journal_list[i][1], name_journal_info), mode='w', encoding='utf-8') as f:
            f.write('Name:\t{}\nISSN:\t{}\n'.format(journal_list[i][2], journal_list[i][1]))
    if os.path.isdir(name_temp_dir):
        shutil.rmtree(name_temp_dir)
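# Layout produced under DATA_DIR by get_journal() (ISSN folder hypothetical):
#
#   Data/
#       journal_list            pickled list of (url, issn, name) tuples
#       0022-0248/
#           journal.html        the journal's volume/issue browse page
#           journal_info.txt    'Name:\t...' and 'ISSN:\t...' lines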
# defined_function/get_issue.py
import os
import re
import shutil
import time
from defined_class.website_multi import WebsiteMulti
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_issue():
    print('Start getting issues...')
    get_volume_list()
    get_volume_page()
    get_issue_list()
    get_issue_page()
def get_volume_list():
    name_journal_page = 'journal.html'
    name_volume_list = 'volume_list'
    key_volume = re.compile(r'<A HREF=.*?</A>')
    key_volume_url = re.compile(r'A HREF="(.*?)"')
    key_volume_name = re.compile(r'</span>([^<]*?)</A>')
    key_volume_year1 = re.compile(r'\((.*?) - (.*?)\)')  # year range, e.g. (2010 - 2014)
    key_volume_year2 = re.compile(r'\((.*?)\)')          # single year, e.g. (2013)
    dir_list = get_dir_list()
    for folder in dir_list:
        with open(os.path.join(folder, name_journal_page), mode='r', encoding='utf-8') as f:
            page = f.read()
        volume_list_all = key_volume.findall(page)
        if not volume_list_all:
            print(folder)
            continue
        volume_list = []
        for line in volume_list_all:
            if not key_volume_name.search(line):
                continue
            if key_volume_year1.search(line):
                year1 = int(key_volume_year1.search(line).groups()[0])
                year2 = int(key_volume_year1.search(line).groups()[1])
                if year1 <= 2013 <= year2:
                    volume_list.append((key_volume_url.search(line).groups()[0], (year1, year2)))
            else:
                year = int(key_volume_year2.search(line).groups()[0])
                if year == 2013:
                    volume_list.append((key_volume_url.search(line).groups()[0], year))
        if volume_list:
            pickle_write(volume_list, os.path.join(folder, name_volume_list))
        else:
            # No 2013 volume: drop the journal folder entirely.
            shutil.rmtree(folder)
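# A volume link survives only if 2013 lies inside its year range (or equals
# its single year). Example with hypothetical link text in the markup shape
# the two year regexes expect:
import re

_example = '<A HREF="/science/journal/00220248/360"><span></span>Volumes 351 - 360 (2010 - 2014)</A>'
_match = re.compile(r'\((.*?) - (.*?)\)').search(_example)
assert int(_match.groups()[0]) <= 2013 <= int(_match.groups()[1])  # kept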
def get_volume_page():
    sleep_time = 15
    sleep_step = 20
    name_volume_list = 'volume_list'
    name_volume_dir = 'volume'
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        folder = os.path.join(dir_list[i], name_volume_dir)
        volume_list = pickle_read(os.path.join(dir_list[i], name_volume_list))
        volume_url_list = [n[0] for n in volume_list]
        if not os.path.isdir(folder):
            os.mkdir(folder)
        website_multi = WebsiteMulti(volume_url_list, folder, 'fail_volume.txt', i)
        website_multi.get()
        if (i+1) % sleep_step == 0 and (i+1) != len(dir_list):
            print('Continue after {} seconds...'.format(sleep_time))
            time.sleep(sleep_time)
def get_issue_list():
    name_volume_list = 'volume_list'
    name_volume_dir = 'volume'
    name_issue_list = 'issue_list'
    key_volume = re.compile(r'<A HREF=.*?</A>')
    key_issue = re.compile(r'<A HREF="([^"]*?)"[^"]*?</A>[^"]*?2013')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        volume_list = pickle_read(os.path.join(dir_list[i], name_volume_list))
        folder_volume = os.path.join(dir_list[i], name_volume_dir)
        issue_list = []
        for j in range(len(volume_list)):
            # Volumes whose year (or end year) is 2013 are fetched directly.
            if type(volume_list[j][1]) == int:
                if volume_list[j][1] == 2013:
                    issue_list.append(volume_list[j][0])
            else:
                if volume_list[j][1][1] == 2013:
                    issue_list.append(volume_list[j][0])
            with open(os.path.join(folder_volume, '{}.html'.format(j)), mode='r', encoding='utf-8') as f:
                page = f.read()
            if not key_volume.findall(page):
                print(i, os.path.join(folder_volume, '{}.html'.format(j)))
            issue_list += key_issue.findall(page)
        pickle_write(issue_list, os.path.join(dir_list[i], name_issue_list))
def get_issue_page():
    sleep_time = 15
    sleep_step = 20
    name_issue_list = 'issue_list'
    name_issue_dir = 'issue'
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        folder_issue = os.path.join(dir_list[i], name_issue_dir)
        issue_list = pickle_read(os.path.join(dir_list[i], name_issue_list))
        if not os.path.isdir(folder_issue):
            os.mkdir(folder_issue)
        website_multi = WebsiteMulti(issue_list, folder_issue, 'fail_issue.txt', i)
        website_multi.get()
        if (i+1) % sleep_step == 0 and (i+1) != len(dir_list):
            print('Continue after {} seconds...'.format(sleep_time))
            time.sleep(sleep_time)
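# All the page-fetching loops in this pipeline pause every sleep_step
# journals. The same pattern as a small reusable helper (a sketch, not part
# of the original code):
def throttle(index, total, step=20, seconds=15):
    # Sleep after every step-th item, except after the last one.
    if (index + 1) % step == 0 and (index + 1) != total:
        print('Continue after {} seconds...'.format(seconds))
        time.sleep(seconds)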
# defined_function/get_article.py
import os
import re
import time
from defined_class.website_multi import WebsiteMulti
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_article():
    print('Start getting articles...')
    get_article_list()
    get_article_page()
def get_article_list():
    name_issue_dir = 'issue'
    name_article_list = 'article_list'
    key_section = re.compile(r'<h3 class="sectionH1">.*?</h3>')
    key_article = re.compile(r'<li class="title.*?</li>')
    key_article_url = re.compile(r'href="http://www.sciencedirect.com(.*?)"')
    key_article_name = re.compile(r'href=".*?">(.*?)</a>')
    key_sub1 = re.compile(r'<.*?>')          # strip embedded tags
    key_sub2 = re.compile(r'[\s]{2,}')       # collapse whitespace runs
    key_sub3 = re.compile(r'^[\s]+|[\s]+$')  # trim both ends
    key_article_type = re.compile(r'<span class="articleTypeLabel">(.*?)</span>')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        issue_folder = os.path.join(dir_list[i], name_issue_dir)
        issue_page_list = os.listdir(issue_folder)
        article_list = []
        for issue_page in issue_page_list:
            with open(os.path.join(issue_folder, issue_page), mode='r', encoding='utf-8') as f:
                page = f.read()
            if not key_article.findall(page):
                print(i, os.path.join(issue_folder, issue_page))
            # Split the issue page into sections; each article inherits the
            # heading of the section it appears under.
            section_title = key_section.findall(page)
            page_split = key_section.split(page)
            section_list = [''] + [n[22:-5] for n in section_title]
            for j in range(len(section_list)):
                article_list_section = key_article.findall(page_split[j])
                for article in article_list_section:
                    article_url = key_article_url.search(article).groups()[0] + '?np=y'
                    article_section = section_list[j]
                    if key_article_type.search(article):
                        article_type = key_article_type.search(article).groups()[0]
                    else:
                        article_type = ''
                    article_name = key_article_name.search(article).groups()[0]
                    if key_sub1.search(article_name):
                        article_name = key_sub1.sub('', article_name)
                    if key_sub2.search(article_name):
                        article_name = key_sub2.sub(' ', article_name)
                    if key_sub3.search(article_name):
                        article_name = key_sub3.sub('', article_name)
                    article_list.append((article_url, article_section, article_type, article_name))
        pickle_write(article_list, os.path.join(dir_list[i], name_article_list))
        print('{}\t{}'.format(len(article_list), dir_list[i]))
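# The three substitution patterns clean an article title in sequence: strip
# leftover tags, collapse whitespace runs, trim the ends. Hypothetical input:
import re

_title = ' Growth of  <i>GaN</i> layers '
_title = re.compile(r'<.*?>').sub('', _title)
_title = re.compile(r'[\s]{2,}').sub(' ', _title)
_title = re.compile(r'^[\s]+|[\s]+$').sub('', _title)
assert _title == 'Growth of GaN layers'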
def get_article_page():
    sleep_time = 15
    sleep_step = 2
    name_article_list = 'article_list'
    name_article_dir = 'article'
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals... {}'.format(i+1, len(dir_list), dir_list[i]))
        folder = os.path.join(dir_list[i], name_article_dir)
        article_list = pickle_read(os.path.join(dir_list[i], name_article_list))
        article_url_list = [n[0] for n in article_list]
        if not os.path.isdir(folder):
            os.mkdir(folder)
        website_multi = WebsiteMulti(article_url_list, folder, 'fail_article.txt', i)
        website_multi.get()
        if (i+1) % sleep_step == 0 and (i+1) != len(dir_list):
            print('Continue after {} seconds...'.format(sleep_time))
            time.sleep(sleep_time)
# defined_function/get_info.py
import os
import re
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_info():
    print('Start getting info...')
    journal_filter()
    get_article_type()
    article_filter()
    get_author_list()
    get_article_info()
def journal_filter():
    # Sanity check: report article pages from which no DOI can be read.
    name_article_list = 'article_list'
    name_article_dir = 'article'
    key_doi = re.compile(r'SDM.doi = \'(.*?)\'')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        article_list = pickle_read(os.path.join(dir_list[i], name_article_list))
        folder = os.path.join(dir_list[i], name_article_dir)
        for j in range(len(article_list)):
            with open(os.path.join(folder, '{}.html'.format(j)), mode='r', encoding='utf-8') as f:
                page = f.read()
            if not key_doi.search(page):
                print(i, os.path.join(folder, '{}.html'.format(j)))
def get_article_type():
    name_article_list = 'article_list'
    name_article_type = 'article_type'
    name_article_type_list = 'article_type.txt'
    dir_list = get_dir_list()
    type_list = []
    type_count = []
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_list = pickle_read(os.path.join(dir_list[i], name_article_list))
        for article in article_list:
            # Tally how often each section heading (used as the article
            # type) occurs across all journals.
            if not article[1] in type_list:
                type_list.append(article[1])
                type_count.append(1)
            else:
                for j in range(len(type_list)):
                    if type_list[j] == article[1]:
                        type_count[j] += 1
                        break
    article_type_list = list(map(lambda m, n: '{}\t{}\n'.format(m, n), type_list, type_count))
    article_type = list(map(lambda m, n: (m, n), type_list, type_count))
    pickle_write(article_type, name_article_type)
    with open(name_article_type_list, mode='w', encoding='utf-8') as f:
        f.writelines(article_type_list)
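# The parallel type_list/type_count lists above implement a frequency table
# by hand; collections.Counter expresses the same tally more directly. An
# equivalent sketch over hypothetical per-journal article lists:
from collections import Counter

_sample_article_lists = [
    [('url1', 'Research Articles', '', 'Title A'), ('url2', 'Editorial', '', 'Title B')],
    [('url3', 'Research Articles', '', 'Title C')],
]
_type_count = Counter()
for _article_list in _sample_article_lists:
    _type_count.update(article[1] for article in _article_list)
assert _type_count == Counter({'Research Articles': 2, 'Editorial': 1})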
def article_filter():
    name_article_type = 'article_type'
    name_article_list = 'article_list'
    name_article_select = 'article_select'
    key_except = re.compile(r'editor.*?choice', re.I)
    key_exclude = re.compile(r'editor|book|news|acknowledgment|acknowledgement|education|retraction|erratum|'
                             r'introduction|in this issue|feature|foreword|topic|response|reply|comment|'
                             r'index|content|abstract|highlight|obituary|announcement|guideline|\sview|^view|list|'
                             r'presentation|survey|summary|correction|abbreviation', re.I)
    key_letter = re.compile(r'letter|correspondence', re.I)
    # 'reveiw' deliberately catches a misspelling that occurs in the data.
    key_review = re.compile(r'review|reveiw|insight', re.I)
    article_type_all = pickle_read(name_article_type)
    # Bucket every observed type string; "editor's choice" types are kept
    # even though 'editor' alone would exclude them.
    type_except = []
    type_exclude = []
    type_letter = []
    type_review = []
    for i in range(len(article_type_all)):
        article_type = article_type_all[i][0]
        if key_except.search(article_type):
            type_except.append(article_type)
        elif key_exclude.search(article_type):
            type_exclude.append(article_type)
        elif key_letter.search(article_type):
            type_letter.append(article_type)
        elif key_review.search(article_type):
            type_review.append(article_type)
    count_letter = 0
    count_review = 0
    count_paper = 0
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_list = pickle_read(os.path.join(dir_list[i], name_article_list))
        article_select = []
        for j in range(len(article_list)):
            article = article_list[j]
            if article[2]:
                # An explicit articleTypeLabel wins over section heuristics.
                if article[2] == 'Original Research Article':
                    article_select.append((j, 'Paper', article[3]))
                    count_paper += 1
                else:
                    article_select.append((j, 'Review', article[3]))
                    count_review += 1
                continue
            if article[1] in type_except:
                article_select.append((j, 'Paper', article[3]))
                count_paper += 1
                continue
            if article[1] in type_exclude or key_exclude.search(article[3]):
                continue
            if article[1] in type_letter or key_letter.search(article[3]):
                article_select.append((j, 'Letter', article[3]))
                count_letter += 1
                continue
            if article[1] in type_review or key_review.search(article[3]):
                article_select.append((j, 'Review', article[3]))
                count_review += 1
                continue
            article_select.append((j, 'Paper', article[3]))
            count_paper += 1
        pickle_write(article_select, os.path.join(dir_list[i], name_article_select))
    print(count_letter, count_review, count_paper)
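# article_filter() classifies by precedence: an explicit articleTypeLabel
# wins, then the "editor's choice" exception, then exclusion, letters,
# reviews, and finally the default 'Paper'. A condensed, hypothetical sketch
# of that decision order (regexes abbreviated):
def classify(section_type, type_label, title,
             type_except=(), type_exclude=(), type_letter=(), type_review=()):
    if type_label:
        return 'Paper' if type_label == 'Original Research Article' else 'Review'
    if section_type in type_except:
        return 'Paper'
    if section_type in type_exclude or re.search(r'editor|book|news', title, re.I):
        return None  # dropped from the selection
    if section_type in type_letter or re.search(r'letter|correspondence', title, re.I):
        return 'Letter'
    if section_type in type_review or re.search(r'review|insight', title, re.I):
        return 'Review'
    return 'Paper'


assert classify('Research Articles', 'Original Research Article', 'T') == 'Paper'
assert classify('Book Review', '', 'Some title', type_exclude=('Book Review',)) is None
assert classify('Correspondence', '', 'On X', type_letter=('Correspondence',)) == 'Letter'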
def get_author_list():
    name_article_dir = 'article'
    name_article_select = 'article_select'
    name_author_list = 'author_list'
    key_author = re.compile(r'<li><a href="#" class="authorName.*?</li>|<li><span class="authorDegrees">.*?</li>')
    key_equal = re.compile(r'<dl class="footnote".*?</dl>')
    key_doi = re.compile(r'SDM.doi = \'(.*?)\'')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_select = pickle_read(os.path.join(dir_list[i], name_article_select))
        folder = os.path.join(dir_list[i], name_article_dir)
        author_list = []
        for article in article_select:
            with open(os.path.join(folder, '{}.html'.format(article[0])), mode='r', encoding='utf-8') as f:
                page = f.read()
            author_all = key_author.findall(page)
            author_equal = key_equal.findall(page)
            article_doi = key_doi.search(page).groups()[0]
            author_list.append((author_all, author_equal, article_doi, article[0], article[1], article[2]))
        pickle_write(author_list, os.path.join(dir_list[i], name_author_list))
def get_article_info():
    def judge_complete():
        # True once every footnote fragment has been resolved to a list of
        # author names; the resolved group is then recorded in author_equal.
        nonlocal equal_name_list
        nonlocal author_equal
        for k in range(len(equal_name_list)):
            if type(equal_name_list[k]) == str:
                return False
        author_equal.append([n[0] for n in equal_name_list])
        return True

    name_author_list = 'author_list'
    name_article_info = 'article_info'
    key_author_name = re.compile(r'<a href="#" class="authorName.*?>(.*?)</a>')
    key_author_name_split = re.compile(r'\s')
    key_author_name_hyphen = re.compile(r'-')
    key_author_name_split_hyphen = re.compile(r'\s|-')
    key_corr = re.compile(r'Corresponding author')
    key_equal = re.compile(r'contributed equally')
    key_equal_split = re.compile(r'\s[^A-Z\s\.]*?\.\s|\)\.\s')
    key_equal_1 = re.compile(r'class="intra_ref">(.*?)</a>')
    key_equal_1_not = re.compile(r'Appendix 1')
    key_equal_1_sub = re.compile(r'<.*?>(.*?)<.*?>')
    key_equal_1_spec = re.compile(r' and ')
    key_equal_1_single = re.compile(r'This author|The author|Co-first author')
    key_equal_2 = re.compile(r'>[\s\)\]\.,;:]*([^<]*?contributed equally)')
    key_equal_2_sub = re.compile(r'\s*contributed equally')
    key_equal_2_1 = re.compile(r'All authors|All of the authors|All the authors|^Authors$|^The authors$|^The authors do|'
                               r'^The three institutions|^The Tsimikas|These authors|^Northeast Normal University|'
                               r'^These author$')
    key_equal_2_2 = re.compile(r'Both authors|Both first authors|The 1st 2 authors|The first 2 authors|'
                               r'The first two authors')
    key_equal_2_3 = re.compile(r'The first 3 authors|The first three authors|the first three authors')
    key_equal_2_4 = re.compile(r'The last 2 authors|The last two authors')
    key_equal_2_5 = re.compile(r'The last 3 authors')
    key_equal_2_6 = re.compile(r'The last four authors')
    key_equal_2_7 = re.compile(r'Second and third authors')
    key_equal_3_sub = re.compile(r' have$|^and has.*Merck. |,$|^As joint first authors, |^Author contributions: |'
                                 r' performed .*?$|^Author |^Authors |^Both | both$| equally$|^Note: Both |'
                                 r' as co-corresponding author$| are joint first authors and$|, these authors have|'
                                 r', MD|, PhD|^Professors |^Drs |^Drs. |^The authors | are co-first authors and$')
    key_equal_3_split = re.compile(r',\sand\s|,\s|\sand\s|\s&\s')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        folder = dir_list[i]
        author_list = pickle_read(os.path.join(folder, name_author_list))
        article_info = []
        for article in author_list:
            author_name = [key_author_name.search(n).groups()[0] for n in article[0]]
            author_name_split = [key_author_name_split.split(n) for n in author_name]
            author_name_split_hyphen = [key_author_name_split_hyphen.split(n) for n in author_name]
            if not author_name:
                continue
            author_corr = []
            author_equal = []
            author_equal_all = []
            equal_flag_1 = []
            equal_flag_2 = []
            for j in range(len(article[0])):
                if key_corr.search(article[0][j]):
                    author_corr.append(author_name[j])
            for line in article[1]:
                if key_equal.search(line):
                    author_equal_all.append(line)
            # A footnote either carries a <sup> marker that can be matched
            # against the author list (flag 1) or only a free-text sentence
            # naming the equal contributors (flag 2).
            for line in author_equal_all:
                if key_equal_1.search(line):
                    equal_flag_temp = (key_equal_1.search(line).groups()[0], line)
                    if key_equal_1_not.search(equal_flag_temp[0]):
                        equal_flag_2.append((key_equal_2.search(line).groups()[0], line))
                    elif key_equal_1_spec.search(equal_flag_temp[0]):
                        equal_flag_1 += [(n, line) for n in key_equal_1_spec.split(equal_flag_temp[0])]
                    elif key_equal_1_sub.search(equal_flag_temp[0]):
                        equal_flag_1.append((key_equal_1_sub.search(equal_flag_temp[0]).groups()[0], line))
                    else:
                        equal_flag_1.append(equal_flag_temp)
                else:
                    equal_flag_2.append((key_equal_2.search(line).groups()[0], line))
            for line in equal_flag_1:
                # Escape markers that are regex metacharacters.
                if line[0] == '**':
                    key_equal_flag = re.compile(r'<sup>\*\*</sup>|<sup>\*</sup>')
                elif line[0] == '*':
                    key_equal_flag = re.compile(r'<sup>\*</sup>')
                elif line[0] == '+':
                    key_equal_flag = re.compile(r'<sup>\+</sup>')
                else:
                    key_equal_flag = re.compile(r'<sup>' + line[0] + '</sup>')
                temp = []
                for k in range(len(article[0])):
                    if key_equal_flag.search(article[0][k]):
                        temp.append(author_name[k])
                if len(temp) == 0:
                    equal_flag_2.append((key_equal_2.search(line[1]).groups()[0], line[1]))
                elif len(temp) == 1:
                    if key_equal_1_single.search(line[1]):
                        author_name.append(temp)
                    else:
                        equal_flag_2.append((key_equal_2.search(line[1]).groups()[0], line[1]))
                else:
                    author_equal.append(temp)
            for line in equal_flag_2:
                # Re-attach the punctuation removed by the splitter so each
                # fragment stays a complete sentence.
                equal_split = key_equal_split.split(line[0])
                split_words = key_equal_split.findall(line[0])
                for k in range(len(split_words)):
                    equal_split[k] += split_words[k]
                for item in equal_split:
                    if key_equal.search(item):
                        equal_sentence = key_equal_2_sub.sub('', item)
                        # Phrases naming a position range rather than the
                        # authors themselves.
                        if key_equal_2_1.search(equal_sentence):
                            author_equal.append(author_name)
                            continue
                        if key_equal_2_2.search(equal_sentence):
                            author_equal.append(author_name[:2])
                            continue
                        if key_equal_2_3.search(equal_sentence):
                            author_equal.append(author_name[:3])
                            continue
                        if key_equal_2_4.search(equal_sentence):
                            author_equal.append(author_name[-2:])
                            continue
                        if key_equal_2_5.search(equal_sentence):
                            author_equal.append(author_name[-3:])
                            continue
                        if key_equal_2_6.search(equal_sentence):
                            author_equal.append(author_name[-4:])
                            continue
                        if key_equal_2_7.search(equal_sentence):
                            author_equal.append(author_name[1:3])
                            continue
                        equal_sentence = key_equal_3_sub.sub('', equal_sentence)
                        equal_name_list = key_equal_3_split.split(equal_sentence)
                        # Try one name format after another until every
                        # fragment resolves to a known author.
                        # 1) The name exactly as written.
                        author_name_modify = [' '.join(n) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            for l in range(len(author_name_modify)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 2) Family name first.
                        author_name_modify = [n[-1]+' '+' '.join(n[:-1]) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 3) First initial with dot: 'J. Rest'.
                        author_name_modify = [n[0][0]+'. '+' '.join(n[1:]) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 4) First initial without dot: 'J Rest'.
                        author_name_modify = [n[0][0]+' '+' '.join(n[1:]) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 5) All initials, dotted: 'J.S.'.
                        author_name_modify = ['.'.join([m[0] for m in n])+'.' for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 6) All initials, dotted and spaced: 'J. S.'.
                        author_name_modify = ['. '.join([m[0] for m in n])+'.' for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 7) Bare initials: 'JS'.
                        author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 8) Given name plus family initial: 'John S.'.
                        author_name_modify = [n[0]+n[-1][0]+'.' for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 9) First two initials run together: 'J.M. Rest'
                        #    (names of three or more parts only).
                        author_name_modify = []
                        for k in range(len(author_name_split)):
                            if len(author_name_split[k]) > 2:
                                author_name_modify.append(author_name_split[k][0][0]+'.'+author_name_split[k][1][0]+'. '+
                                                          ' '.join(author_name_split[k][2:]))
                            else:
                                author_name_modify.append('')
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 10) First two initials spaced: 'J. M. Rest'.
                        author_name_modify = []
                        for k in range(len(author_name_split)):
                            if len(author_name_split[k]) > 2:
                                author_name_modify.append(author_name_split[k][0][0]+'. '+author_name_split[k][1][0]+'. '+
                                                          ' '.join(author_name_split[k][2:]))
                            else:
                                author_name_modify.append('')
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 11) Bare initials, retried after the longer formats.
                        author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 12) Family name reduced to an initial: 'John S'.
                        author_name_modify = [' '.join(n[:-1])+' '+n[-1][0] for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 13) Family name only.
                        author_name_modify = [n[-1] for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if equal_name_list[k].lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 14) Initials from the hyphen split, compared with
                        #     hyphens removed from the fragment.
                        author_name_modify = []
                        for k in range(len(author_name_split_hyphen)):
                            if len(author_name_split_hyphen[k]) > 2:
                                author_name_modify.append(author_name_split_hyphen[k][0][0]+'.'+
                                                          author_name_split_hyphen[k][1][0]+'. '+
                                                          ' '.join(author_name_split_hyphen[k][2:]))
                            else:
                                author_name_modify.append('')
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            equal_name_hyphen = key_author_name_hyphen.sub('', equal_name_list[k])
                            for l in range(len(author_name)):
                                if equal_name_hyphen.lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 15) As 14) but with spaced initials.
                        author_name_modify = []
                        for k in range(len(author_name_split_hyphen)):
                            if len(author_name_split_hyphen[k]) > 2:
                                author_name_modify.append(author_name_split_hyphen[k][0][0]+'. '+
                                                          author_name_split_hyphen[k][1][0]+'. '+
                                                          ' '.join(author_name_split_hyphen[k][2:]))
                            else:
                                author_name_modify.append('')
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            equal_name_hyphen = key_author_name_hyphen.sub('', equal_name_list[k])
                            for l in range(len(author_name)):
                                if equal_name_hyphen.lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 16) Normalize the fragment itself to 'J. Rest' and
                        #     compare against the full name.
                        author_name_modify = [' '.join(n) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            equal_name_temp = key_author_name_split.split(equal_name_list[k])[0][0] + '. ' + \
                                              ' '.join(key_author_name_split.split(equal_name_list[k])[1:])
                            for l in range(len(author_name)):
                                if equal_name_temp.lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 17) Dotted initials from the hyphen split, hyphens
                        #     removed from the fragment.
                        author_name_modify = ['.'.join([m[0] for m in n])+'.' for n in author_name_split_hyphen]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            equal_name_temp = re.sub('-', '', equal_name_list[k])
                            for l in range(len(author_name)):
                                if equal_name_temp.lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 18) Dotted initials without the trailing dot.
                        author_name_modify = ['.'.join([m[0] for m in n]) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            equal_name_temp = re.sub('-', '', equal_name_list[k])
                            for l in range(len(author_name)):
                                if equal_name_temp.lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 19) Bare initials from the hyphen split.
                        author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split_hyphen]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            equal_name_temp = re.sub('-', '', equal_name_list[k])
                            for l in range(len(author_name)):
                                if equal_name_temp.lower() == author_name_modify[l].lower():
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 20) Last resort: search for the family name inside
                        #     the fragment.
                        key_author_name_temp = [re.compile(n[-1]) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if key_author_name_temp[l].search(equal_name_list[k]):
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # 21) Or for the given name.
                        key_author_name_temp = [re.compile(n[0]) for n in author_name_split]
                        for k in range(len(equal_name_list)):
                            if type(equal_name_list[k]) == list:
                                continue
                            for l in range(len(author_name)):
                                if key_author_name_temp[l].search(equal_name_list[k]):
                                    equal_name_list[k] = [author_name[l]]
                                    break
                        if judge_complete():
                            continue
                        # Hand-curated fragments that none of the formats
                        # above resolve, mapped to author positions by hand.
                        if equal_name_list[0] in ['T.C-D.', ['Roy Phitayakorn'], 'CDA', 'ASm', 'H.-P.K',
                                                  ['Francesca Moro'], 'Y.-J. T.', 'A.M.D.A.', 'LK.M.', ['Andras Hoffman'],
                                                  'K.R.', 'M.dC.V.H.', 'Y-G.K.', ['Scott J. Robbie'], ['Seung Hoon Woo'],
                                                  'M.S.M', 'C.J.C.T.', ['Klaas J. Wardenaar'], 'L.-Q. X.',
                                                  ['Massimiliano Fusaro'], ['Oliver Husser'], ['Icela Palma'], 'W-M.L',
                                                  'program. The project']:
                            author_equal.append(author_name[:2])
                            continue
                        if equal_name_list[0] in ['M-T.M-G', ["Anthony V. D'Amico"], 'MC', 'CAZ', ['Arne Östman'],
                                                  'J.J.V.P.']:
                            author_equal.append(author_name[-2:])
                            continue
                        if equal_name_list[0] in ['MH']:
                            author_equal.append(author_name[1:3])
                            continue
                        if equal_name_list[0] in [['Chunsheng Liu']]:
                            author_equal.append(author_name[:3])
                            continue
                        if equal_name_list[0] in [['Leonidas Chouliaras']]:
                            author_equal.append(author_name[:2]+author_name[-2:])
                            continue
                        if equal_name_list[0] in [['Karin Hek']]:
                            author_equal.append(author_name[:5])
                            continue
                        if equal_name_list[0] in [['Cornelia M. van Duijn']]:
                            author_equal.append(author_name[-8:])
            # Deduplicate each equal-contribution group; a group reduced to
            # a single name is paired with the first author.
            for j in range(len(author_equal)):
                temp = []
                for line in author_equal[j]:
                    if not line in temp:
                        temp.append(line)
                if len(temp) == 1 and not author_name[0] in temp:
                    temp.append(author_name[0])
                author_equal[j] = temp
            article_index = article[3]
            article_type = article[4]
            article_title = article[5]
            article_doi = article[2]
            article_info.append((author_name, author_corr, author_equal, article_index, article_type, article_title,
                                 article_doi))
        pickle_write(article_info, os.path.join(folder, name_article_info))
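# Each attempt in the cascade above re-renders every author name in one
# plausible footnote format and retries the match. For a hypothetical
# author, the first few candidate renderings are:
import re

_parts = re.compile(r'\s').split('Jean-Pierre van der Berg')
_candidates = [
    ' '.join(_parts),                            # 'Jean-Pierre van der Berg'
    _parts[-1] + ' ' + ' '.join(_parts[:-1]),    # 'Berg Jean-Pierre van der'
    _parts[0][0] + '. ' + ' '.join(_parts[1:]),  # 'J. van der Berg'
    '.'.join(p[0] for p in _parts) + '.',        # 'J.v.d.B.'
    ''.join(p[0] for p in _parts),               # 'JvdB'
]
assert _candidates[2] == 'J. van der Berg'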
# defined_function/convert_data.py
import os
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read
from defined_function.read_data_list import read_data_list
def convert_data():
    def get_article_property():
        # Property codes collected per article:
        #   1 = several corresponding authors, 2 = exactly one, 3 = none;
        #   4 = the first author position is not an equal-contribution
        #       group, 5 = it is;
        #   6 = the first author (or a member of the first group) is
        #       corresponding, 7 = not;
        #   8 = an equal-contribution group occurs at a later position.
        article_property = []
        author_equal_order = []
        if author_equal:
            # Merge each equal-contribution group into author_name at the
            # position of its first member, removing the other members.
            for line_1 in author_equal:
                for i in range(len(author_name)):
                    if author_name[i] in line_1:
                        operation = (i, line_1)
                        break
                author_name[operation[0]] = operation[1]
                for line_2 in line_1:
                    if line_2 in author_name:
                        author_name.remove(line_2)
            for i in range(len(author_name)):
                if type(author_name[i]) == list:
                    author_equal_order.append(i+1)
        if len(author_corr) > 1:
            article_property.append(1)
        elif len(author_corr) == 1:
            article_property.append(2)
        else:
            article_property.append(3)
        if not 1 in author_equal_order:
            article_property.append(4)
        else:
            article_property.append(5)
        author_first = author_name[0]
        if type(author_first) == list:
            for line in author_first:
                if line in author_corr:
                    article_property.append(6)
                    break
            else:
                article_property.append(7)
        else:
            if author_first in author_corr:
                article_property.append(6)
            else:
                article_property.append(7)
        for line in author_equal_order:
            if line != 1:
                article_property.append(8)
                break
        article_property.sort()
        return article_property, author_first, author_equal_order
    def count_article():
        # Bucket the article into `count` by its property codes:
        #   count[0]      total multi-author articles;
        #   count[1..8]   the eight (corr, equal-first, first-corr) cells
        #                 of the overview header, (1 4 6) through (2 5 7);
        #   count[9]/[10] no corresponding author, (3 4) and (3 5);
        #   count[11]     articles with a later equal group, (8);
        #   count[12]     single-author articles (returned as flag 1).
        nonlocal count
        if len(author_name) == 1 and type(author_name[0]) == str:
            count[12] += 1
            return 1
        count[0] += 1
        if 8 in article_property:
            count[11] += 1
        if 3 in article_property:
            if 4 in article_property:
                count[9] += 1
            elif 5 in article_property:
                count[10] += 1
        elif 1 in article_property:
            if 4 in article_property:
                if 6 in article_property:
                    count[1] += 1
                elif 7 in article_property:
                    count[2] += 1
            elif 5 in article_property:
                if 6 in article_property:
                    count[3] += 1
                elif 7 in article_property:
                    count[4] += 1
        elif 2 in article_property:
            if 4 in article_property:
                if 6 in article_property:
                    count[5] += 1
                elif 7 in article_property:
                    count[6] += 1
            elif 5 in article_property:
                if 6 in article_property:
                    count[7] += 1
                elif 7 in article_property:
                    count[8] += 1
        return 0
    name_article_info = 'article_info'
    name_journal_info = 'journal_info.txt'
    name_article_overview = 'article_overview.txt'
    name_journal_overview = 'journal_overview.txt'
    name_overview = 'overview.txt'
    overview = ['Name\tISSN\tTotal\tTotal(Excluding single author)\t(1 4 6)\t(1 4 7)\t(1 5 6)\t(1 5 7)\t'
                '(2 4 6)\t(2 4 7)\t(2 5 6)\t(2 5 7)\t(3 4)\t(3 5)\t(8)\n']
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        folder = dir_list[i]
        article_info = pickle_read(os.path.join(folder, name_article_info))
        count = [0 for n in range(13)]
        article_overview = ['Title\tDOI\tSingle Author\tArticle Type\tArticle Property\tAuthor First\tAuthor Corr\t'
                            'Author Equal\tEqual Order\tAuthor All\tFile\n']
        for article in article_info:
            article_index = article[3]
            article_type = article[4]
            article_doi = article[6]
            article_title = article[5]
            author_name = article[0]
            author_corr = article[1]
            author_equal = article[2]
            article_property, author_first, author_equal_order = get_article_property()
            flag_single = count_article()
            article_overview.append('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
                article_title, article_doi, flag_single, article_type, article_property, author_first,
                author_corr, author_equal, author_equal_order, author_name, article_index))
        journal_info = read_data_list(os.path.join(dir_list[i], name_journal_info))
        journal_name = journal_info[0].split('\t')[1]
        journal_issn = journal_info[1].split('\t')[1]
        journal_overview = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
            journal_name, journal_issn, count[0]+count[12], count[0], count[1], count[2], count[3],
            count[4], count[5], count[6], count[7], count[8], count[9], count[10], count[11])
        overview.append(journal_overview)
        with open(os.path.join(folder, name_article_overview), mode='w', encoding='utf-8') as f:
            f.writelines(article_overview)
        with open(os.path.join(folder, name_journal_overview), mode='w', encoding='utf-8') as f:
            f.write(overview[0] + journal_overview)
    # Write the cumulative overview once all journals are processed.
    with open(name_overview, mode='w', encoding='utf-8') as f:
        f.writelines(overview)
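# Worked example of the property codes (rules as in get_article_property):
# authors A (first), B, C; A and B contributed equally; B is corresponding.
# The equal group is merged into position 1, giving
#   author_name = [['A', 'B'], 'C'], author_equal_order = [1]
# -> properties [2, 5, 6]: one corresponding author (2), the first position
#    is an equal-contribution group (5), and a member of that group is the
#    corresponding author (6); no 8, since no group sits at a later position.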
# defined_function/get_dir_list.py
import os


def get_dir_list():
    # Return the subdirectories (one per journal ISSN) of the current
    # working directory, skipping plain files.
    return [entry for entry in os.listdir() if os.path.isdir(entry)]
# defined_function/pickle_var_file.py
import os
import pickle
import sys


def pickle_read(file_name):
    if os.path.isfile(file_name):
        with open(file_name, 'rb') as file:
            return pickle.load(file)
    print('Error while opening {}!'.format(file_name))
    sys.exit(1)


def pickle_write(var, file_name):
    with open(file_name, 'wb') as file:
        pickle.dump(var, file)
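# Quick round trip through the helpers above (hypothetical data; guarded so
# that importing the module stays side-effect free):
if __name__ == '__main__':
    pickle_write([('/science/journal/00220248', '0022-0248', 'Journal of Crystal Growth')], 'journal_list')
    assert pickle_read('journal_list')[0][1] == '0022-0248'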
# defined_function/read_data_list.py
import os
import sys


def read_data_list(file_name):
    if os.path.isfile(file_name):
        # Strip only the trailing newline, so a final line without one is
        # not truncated (n[:-1] would drop its last character). The data
        # files written elsewhere in this pipeline are UTF-8.
        with open(file_name, encoding='utf-8') as file:
            return [n.rstrip('\n') for n in file]
    print('Error while opening {}!'.format(file_name))
    sys.exit(1)