PyCharm — program source (concatenated module listing)

main

from defined_function.convert_data import convert_data
from defined_function.data_initial import data_initial
from defined_function.get_article import get_article
from defined_function.get_info import get_info
from defined_function.get_issue import get_issue
from defined_function.get_journal import get_journal

# Directory that all scraped data is created in and written under;
# data_initial() also makes it the process working directory.
DATA_DIR = 'Data'
data_initial(DATA_DIR)
# Pipeline stages, in run order; uncomment to execute each scrape step.
# get_journal()
# get_issue()
# get_article()
# get_info()
# convert_data()

data_initial

import os

def data_initial(path):
    """Ensure *path* exists as a directory and make it the working directory.

    Args:
        path: Directory (created if missing, including parents) that all
            subsequent relative file operations should run inside.
    """
    # exist_ok avoids the race between a separate isdir() check and
    # makedirs(); behavior on an existing file is unchanged (raises).
    os.makedirs(path, exist_ok=True)
    os.chdir(path)

get_journal()

import os
import re
import shutil
from defined_class.website import Website
from defined_class.website_multi import WebsiteMulti
from defined_function.read_data_list import read_data_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_journal():
    """Run the journal-scraping stage: collect the list, then the pages."""
    for step in (get_journal_list, get_journal_page):
        step()


def get_journal_list():
    """Scrape the journal index page and pickle the SCI-listed journals.

    Writes 'journal_list': a list of (url, issn, name) tuples for every
    journal whose ISSN appears in sci_list.txt and whose name passes the
    character/language filter below.
    """
    print('Start getting journals...')
    name_sci_list = 'sci_list.txt'
    name_journal_list = 'journal_list'
    key_journal = re.compile(r'<span><a href=(.*?)</a></span>')
    key_journal_name = re.compile(r'>(.*?)$')
    key_journal_url = re.compile(r'"(.*?)"')
    # Rejects names with unexpected characters or non-English filler words.
    key_journal_delete = re.compile(r'[^A-Za-z,&:+*\(\)/\-\' ]| de | des | et | en | voor | der | i | d\'|^De |^Des |'
                                    r'^Et |^En |^Voor |^Der |^D\'|^de |^des |^et |^en |^voor |^der |^d\'| de$| des$|'
                                    r' et$| en$| voor$| der$')
    sub_url = '/science/browsescroll/journals/all/begidx/50/rwpos/0'
    page = Website(sub_url).get()
    # Set membership is O(1) per lookup instead of the original O(n)
    # linear scan over the SCI list for every journal.
    sci_set = set(read_data_list(name_sci_list))
    journal_list = []
    for journal in key_journal.findall(page):
        journal_url = key_journal_url.search(journal).groups()[0]
        journal_name = key_journal_name.search(journal).groups()[0]
        # ISSN is encoded in the last eight characters of the URL.
        journal_issn = journal_url[-8:-4] + '-' + journal_url[-4:]
        if journal_issn in sci_set and not key_journal_delete.search(journal_name):
            journal_list.append((journal_url, journal_issn, journal_name))
    pickle_write(journal_list, name_journal_list)


def get_journal_page():
    """Download each journal's front page and write per-journal info files.

    Creates one directory per ISSN containing 'journal.html' and a
    'journal_info.txt' holding the journal name and ISSN.  Pages are
    fetched in bulk into a temporary directory that is removed afterwards.
    """
    name_temp_dir = 'temp'
    name_journal_list = 'journal_list'
    name_journal_info = 'journal_info.txt'
    name_journal_page = 'journal.html'
    journal_list = pickle_read(name_journal_list)
    # Start from a clean temp directory.
    if os.path.isdir(name_temp_dir):
        shutil.rmtree(name_temp_dir)
    os.mkdir(name_temp_dir)
    journal_url_list = [n[0] for n in journal_list]
    website_multi = WebsiteMulti(journal_url_list, name_temp_dir,
                                 os.path.join(name_temp_dir, 'fail_journal.txt'), 0)
    website_multi.get()
    # Downloaded pages are named '<index>.html' in list order.
    for i, (url, issn, name) in enumerate(journal_list):
        if not os.path.isdir(issn):
            os.mkdir(issn)
        shutil.move(os.path.join(name_temp_dir, '{}.html'.format(i)),
                    os.path.join(issn, name_journal_page))
        # Context manager guarantees the handle is closed even on error.
        with open(os.path.join(issn, name_journal_info), mode='w', encoding='utf-8') as f:
            f.write('Name:\t{}\nISSN:\t{}\n'.format(name, issn))
    if os.path.isdir(name_temp_dir):
        shutil.rmtree(name_temp_dir)
get_issue()
import os
import re
import shutil
import time
from defined_class.website_multi import WebsiteMulti
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_issue():
    """Run the issue-scraping stage: volume list/pages, then issue list/pages."""
    print('Start getting issues...')
    for step in (get_volume_list, get_volume_page, get_issue_list, get_issue_page):
        step()


def get_volume_list(year=2013):
    """Select volume links covering *year* from each journal's front page.

    Parses 'journal.html' in every journal directory for volume anchors,
    keeps those whose year (or year range) includes *year*, and pickles
    them as 'volume_list' — entries are (url, year) or (url, (y1, y2)).
    Journal directories yielding no matching volume are removed.

    Args:
        year: Publication year to filter on.  Defaults to 2013, matching
            the original hard-coded behaviour.
    """
    name_journal_page = 'journal.html'
    name_volume_list = 'volume_list'
    key_volume = re.compile(r'<A HREF=.*?</A>')
    key_volume_url = re.compile(r'A HREF="(.*?)"')
    key_volume_name = re.compile(r'</span>([^<]*?)</A>')
    key_volume_year1 = re.compile(r'\((.*?) - (.*?)\)')
    key_volume_year2 = re.compile(r'\((.*?)\)')
    dir_list = get_dir_list()
    for folder in dir_list:
        with open(os.path.join(folder, name_journal_page), mode='r', encoding='utf-8') as f:
            page = f.read()
        volume_list_all = key_volume.findall(page)
        if not volume_list_all:
            # No volume anchors at all: report the folder and keep it.
            print(folder)
            continue
        volume_list = []
        for line in volume_list_all:
            if not key_volume_name.search(line):
                continue
            # Hoist each search: the original re-ran the same regex up to
            # three times per anchor.
            range_match = key_volume_year1.search(line)
            if range_match:
                year1 = int(range_match.groups()[0])
                year2 = int(range_match.groups()[1])
                if year1 <= year <= year2:
                    volume_list.append((key_volume_url.search(line).groups()[0], (year1, year2)))
            else:
                single_year = int(key_volume_year2.search(line).groups()[0])
                if single_year == year:
                    volume_list.append((key_volume_url.search(line).groups()[0], single_year))
        if volume_list:
            pickle_write(volume_list, os.path.join(folder, name_volume_list))
        else:
            # Nothing published in the target year: drop the journal.
            shutil.rmtree(folder)


def get_volume_page():
    """Download every volume page listed in each journal's 'volume_list'.

    Pages are stored in a per-journal 'volume' directory; a pause is
    inserted every `sleep_step` journals to throttle requests.
    """
    sleep_time = 15
    sleep_step = 20
    name_volume_list = 'volume_list'
    name_volume_dir = 'volume'
    dir_list = get_dir_list()
    total = len(dir_list)
    for idx, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(idx + 1, total))
        volume_dir = os.path.join(journal_dir, name_volume_dir)
        volumes = pickle_read(os.path.join(journal_dir, name_volume_list))
        volume_urls = [entry[0] for entry in volumes]
        if not os.path.isdir(volume_dir):
            os.mkdir(volume_dir)
        WebsiteMulti(volume_urls, volume_dir, 'fail_volume.txt', idx).get()
        if (idx + 1) % sleep_step == 0 and (idx + 1) != total:
            print('Continue after {} seconds...'.format(sleep_time))
            time.sleep(sleep_time)


def get_issue_list():
    """Collect 2013 issue URLs per journal and pickle them as 'issue_list'.

    Volumes recorded for 2013 (a single year equal to 2013, or a range
    whose end year is 2013) are included directly; every downloaded volume
    page is additionally scanned for issue anchors labelled 2013.
    """
    name_volume_list = 'volume_list'
    name_volume_dir = 'volume'
    name_issue_list = 'issue_list'
    key_volume = re.compile(r'<A HREF=.*?</A>')
    key_issue = re.compile(r'<A HREF="([^"]*?)"[^"]*?</A>[^"]*?2013')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        volume_list = pickle_read(os.path.join(dir_list[i], name_volume_list))
        folder_volume = os.path.join(dir_list[i], name_volume_dir)
        issue_list = []
        for j, (volume_url, volume_year) in enumerate(volume_list):
            # Entries are either (url, year) or (url, (year1, year2)).
            if isinstance(volume_year, int):
                if volume_year == 2013:
                    issue_list.append(volume_url)
            elif volume_year[1] == 2013:
                issue_list.append(volume_url)
            with open(os.path.join(folder_volume, '{}.html'.format(j)), mode='r', encoding='utf-8') as f:
                page = f.read()
            if not key_volume.findall(page):
                # Raw string: the original '\{' relied on a deprecated
                # invalid escape sequence; the printed text is unchanged.
                print(i, r'\{}\{}.html'.format(os.path.join(dir_list[i], name_volume_dir), j))
            issue_list += key_issue.findall(page)
        pickle_write(issue_list, os.path.join(dir_list[i], name_issue_list))


def get_issue_page():
    """Download every issue page listed in each journal's 'issue_list'.

    Pages are stored in a per-journal 'issue' directory; a pause is
    inserted every `sleep_step` journals to throttle requests.
    """
    sleep_time = 15
    sleep_step = 20
    name_issue_list = 'issue_list'
    name_issue_dir = 'issue'
    dir_list = get_dir_list()
    total = len(dir_list)
    for idx, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(idx + 1, total))
        issue_dir = os.path.join(journal_dir, name_issue_dir)
        issue_urls = pickle_read(os.path.join(journal_dir, name_issue_list))
        if not os.path.isdir(issue_dir):
            os.mkdir(issue_dir)
        WebsiteMulti(issue_urls, issue_dir, 'fail_issue.txt', idx).get()
        if (idx + 1) % sleep_step == 0 and (idx + 1) != total:
            print('Continue after {} seconds...'.format(sleep_time))
            time.sleep(sleep_time)
get_article()
import os
import re
import time
from defined_class.website_multi import WebsiteMulti
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_article():
    """Run the article-scraping stage: collect the list, then the pages."""
    print('Start getting articles...')
    for step in (get_article_list, get_article_page):
        step()


def get_article_list():
    """Parse each downloaded issue page into a per-journal article list.

    Pickles 'article_list' for every journal: tuples of
    (url, section, type_label, title), where section is taken from the
    <h3 class="sectionH1"> heading preceding the article (empty string
    for articles before the first heading).
    """
    name_issue_dir = 'issue'
    name_article_list = 'article_list'
    key_section = re.compile(r'<h3 class="sectionH1">.*?</h3>')
    key_article = re.compile(r'<li class="title.*?</li>')
    key_article_url = re.compile(r'href="http://www.sciencedirect.com(.*?)"')
    key_article_name = re.compile(r'href=".*?">(.*?)</a>')
    key_sub1 = re.compile(r'<.*?>')
    key_sub2 = re.compile(r'[\s]{2,}')
    key_sub3 = re.compile(r'^[\s]+|[\s]+$')
    key_article_type = re.compile(r'<span class="articleTypeLabel">(.*?)</span>')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        issue_folder = os.path.join(dir_list[i], name_issue_dir)
        issue_page_list = os.listdir(issue_folder)
        article_list = []
        for issue_page in issue_page_list:
            with open(os.path.join(issue_folder, issue_page), mode='r', encoding='utf-8') as f:
                page = f.read()
            if not key_article.findall(page):
                # Raw string: the original '\{' relied on a deprecated
                # invalid escape; the printed text is unchanged.
                print(i, r'\{}\{}'.format(issue_folder, issue_page))
            section_title = key_section.findall(page)
            page_split = key_section.split(page)
            # The chunk before the first heading has no section name;
            # slice [22:-5] strips the <h3 ...> / </h3> wrapper.
            section_list = [''] + [n[22:-5] for n in section_title]
            for j, article_section in enumerate(section_list):
                for article in key_article.findall(page_split[j]):
                    article_url = key_article_url.search(article).groups()[0] + '?np=y'
                    type_match = key_article_type.search(article)
                    article_type = type_match.groups()[0] if type_match else ''
                    article_name = key_article_name.search(article).groups()[0]
                    # re.sub is a no-op when the pattern is absent, so the
                    # original pre-search guards were redundant.
                    article_name = key_sub1.sub('', article_name)   # strip tags
                    article_name = key_sub2.sub(' ', article_name)  # collapse runs of whitespace
                    article_name = key_sub3.sub('', article_name)   # trim ends
                    article_list.append((article_url, article_section, article_type, article_name))
        pickle_write(article_list, os.path.join(dir_list[i], name_article_list))
        print('{}\t{}'.format(len(article_list), dir_list[i]))


def get_article_page():
    """Download every article page listed in each journal's 'article_list'.

    Pages are stored in a per-journal 'article' directory; a pause is
    inserted every `sleep_step` journals to throttle requests.
    """
    sleep_time = 15
    sleep_step = 2
    name_article_list = 'article_list'
    name_article_dir = 'article'
    dir_list = get_dir_list()
    total = len(dir_list)
    for idx, journal_dir in enumerate(dir_list):
        print('Getting {} of {} journals... {}'.format(idx + 1, total, journal_dir))
        article_dir = os.path.join(journal_dir, name_article_dir)
        articles = pickle_read(os.path.join(journal_dir, name_article_list))
        article_urls = [entry[0] for entry in articles]
        if not os.path.isdir(article_dir):
            os.mkdir(article_dir)
        WebsiteMulti(article_urls, article_dir, 'fail_article.txt', idx).get()
        if (idx + 1) % sleep_step == 0 and (idx + 1) != total:
            print('Continue after {} seconds...'.format(sleep_time))
            time.sleep(sleep_time)
get_info()
import os
import re
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read, pickle_write


def get_info():
    """Run the info-extraction stage of the pipeline in order."""
    print('Start getting info...')
    for step in (journal_filter, get_article_type, article_filter,
                 get_author_list, get_article_info):
        step()


def journal_filter():
    """Sanity-check downloaded article pages for the DOI marker.

    Prints the journal index and page path for every article page that
    does not contain the `SDM.doi` script variable, so broken downloads
    can be re-fetched manually.
    """
    name_article_list = 'article_list'
    name_article_dir = 'article'
    key_doi = re.compile(r'SDM.doi = \'(.*?)\'')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        article_list = pickle_read(os.path.join(dir_list[i], name_article_list))
        folder = os.path.join(dir_list[i], name_article_dir)
        for j in range(len(article_list)):
            # Context manager guarantees the handle is closed.
            with open(os.path.join(folder, '{}.html'.format(j)), mode='r', encoding='utf-8') as f:
                page = f.read()
            if not key_doi.search(page):
                # Raw string: the original '\{' relied on a deprecated
                # invalid escape; the printed text is unchanged.
                print(i, r'\{}\{}.html'.format(folder, j))


def get_article_type():
    """Tally section/type labels across every journal's article list.

    Counts occurrences of each label (element [1] of the article tuples)
    over all 'article_list' pickles, writes the (label, count) pairs to
    the 'article_type' pickle and a tab-separated 'article_type.txt'.
    """
    name_article_list = 'article_list'
    name_article_type = 'article_type'
    name_article_type_list = 'article_type.txt'
    dir_list = get_dir_list()
    # A dict preserves first-seen order (Python 3.7+), matching the
    # original parallel-list behaviour while replacing its O(n^2) inner
    # scan with O(1) updates.
    type_count = {}
    for i, folder in enumerate(dir_list):
        print('Getting {} of {} journals...'.format(i + 1, len(dir_list)))
        article_list = pickle_read(os.path.join(folder, name_article_list))
        for article in article_list:
            type_count[article[1]] = type_count.get(article[1], 0) + 1
    article_type = list(type_count.items())
    pickle_write(article_type, name_article_type)
    with open(name_article_type_list, mode='w', encoding='utf-8') as f:
        f.writelines('{}\t{}\n'.format(label, count) for label, count in article_type)


def article_filter():
    """Classify each article as Paper / Review / Letter and pickle the result.

    Builds keyword-based buckets from the global 'article_type' tally,
    then walks every journal's 'article_list' writing 'article_select':
    (index, category, title) tuples.  Editorial/front-matter material
    matching the exclusion keywords is dropped entirely.  Prints the
    final letter/review/paper totals.
    """
    name_article_type = 'article_type'
    name_article_list = 'article_list'
    name_article_select = 'article_select'
    key_except = re.compile(r'editor.*?choice', re.I)
    key_exclude = re.compile(r'editor|book|news|acknowledgment|acknowledgement|education|retraction|erratum|'
                             r'introduction|in this issue|feature|foreword|topic|response|reply|comment|'
                             r'index|content|abstract|highlight|obituary|announcement|guideline|\sview|^view|list|'
                             r'presentation|survey|summary|correction|abbreviation', re.I)
    key_letter = re.compile(r'letter|correspondence', re.I)
    key_review = re.compile(r'review|reveiw|insight', re.I)
    article_type_all = pickle_read(name_article_type)
    type_except = []
    type_exclude = []
    type_letter = []
    type_review = []
    # Bucket the known labels; the elif chain gives "editor's choice"
    # precedence over the generic 'editor' exclusion keyword.
    for article_type, _count in article_type_all:
        if key_except.search(article_type):
            type_except.append(article_type)
        elif key_exclude.search(article_type):
            type_exclude.append(article_type)
        elif key_letter.search(article_type):
            type_letter.append(article_type)
        elif key_review.search(article_type):
            type_review.append(article_type)
    count_letter = 0
    count_review = 0
    count_paper = 0
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_list = pickle_read(os.path.join(dir_list[i], name_article_list))
        article_select = []
        # Each article is (url, section_label, explicit_type_label, title).
        for j, article in enumerate(article_list):
            if article[2]:
                # An explicit type label overrides all keyword heuristics.
                if article[2] == 'Original Research Article':
                    article_select.append((j, 'Paper', article[3]))
                    count_paper += 1
                else:
                    article_select.append((j, 'Review', article[3]))
                    count_review += 1
                continue
            if article[1] in type_except:
                article_select.append((j, 'Paper', article[3]))
                count_paper += 1
                continue
            if article[1] in type_exclude or key_exclude.search(article[3]):
                continue
            if article[1] in type_letter or key_letter.search(article[3]):
                article_select.append((j, 'Letter', article[3]))
                count_letter += 1
                continue
            if article[1] in type_review or key_review.search(article[3]):
                article_select.append((j, 'Review', article[3]))
                count_review += 1
                continue
            # Default: anything not excluded or otherwise classified.
            article_select.append((j, 'Paper', article[3]))
            count_paper += 1
        pickle_write(article_select, os.path.join(dir_list[i], name_article_select))
    print(count_letter, count_review, count_paper)


def get_author_list():
    """Extract author markup, equal-contribution notes and DOI per article.

    For each selected article, pickles 'author_list' entries of
    (author_tags, footnote_tags, doi, index, category, title).
    """
    name_article_dir = 'article'
    name_article_select = 'article_select'
    name_author_list = 'author_list'
    key_author = re.compile(r'<li><a href="#" class="authorName.*?</li>|<li><span class="authorDegrees">.*?</li>')
    key_equal = re.compile(r'<dl class="footnote".*?</dl>')
    key_doi = re.compile(r'SDM.doi = \'(.*?)\'')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        article_select = pickle_read(os.path.join(dir_list[i], name_article_select))
        folder = os.path.join(dir_list[i], name_article_dir)
        author_list = []
        for article in article_select:
            # Context manager guarantees the handle is closed.
            with open(os.path.join(folder, '{}.html'.format(article[0])), mode='r', encoding='utf-8') as f:
                page = f.read()
            author_all = key_author.findall(page)
            author_equal = key_equal.findall(page)
            # NOTE(review): raises AttributeError when the DOI marker is
            # absent; journal_filter() is presumably run first to flag
            # such pages — confirm before running this standalone.
            article_doi = key_doi.search(page).groups()[0]
            author_list.append((author_all, author_equal, article_doi, article[0], article[1], article[2]))
        pickle_write(author_list, os.path.join(dir_list[i], name_author_list))


def get_article_info():
    def judge_complete():
        nonlocal equal_name_list
        nonlocal author_equal
        for k in range(len(equal_name_list)):
            if type(equal_name_list[k]) == str:
                return False
        author_equal.append([n[0] for n in equal_name_list])
        return True

    name_author_list = 'author_list'
    name_article_info = 'article_info'
    key_author_name = re.compile(r'<a href="#" class="authorName.*?>(.*?)</a>')
    key_author_name_split = re.compile(r'\s')
    key_author_name_hyphen = re.compile(r'-')
    key_author_name_split_hyphen = re.compile(r'\s|-')
    key_corr = re.compile(r'Corresponding author')
    key_equal = re.compile(r'contributed equally')
    key_equal_split = re.compile(r'\s[^A-Z\s\.]*?\.\s|\)\.\s')
    key_equal_1 = re.compile(r'class="intra_ref">(.*?)</a>')
    key_equal_1_not = re.compile(r'Appendix 1')
    key_equal_1_sub = re.compile(r'<.*?>(.*?)<.*?>')
    key_equal_1_spec = re.compile(r' and ')
    key_equal_1_single = re.compile(r'This author|The author|Co-first author')
    key_equal_2 = re.compile(r'>[\s\)\]\.,;:]*([^<]*?contributed equally)')
    key_equal_2_sub = re.compile(r'\s*contributed equally')
    key_equal_2_1 = re.compile(r'All authors|All of the authors|All the authors|^Authors$|^The authors$|^The authors do|'
                               r'^The three institutions|^The Tsimikas|These authors|^Northeast Normal University|'
                               r'^These author$')
    key_equal_2_2 = re.compile(r'Both authors|Both first authors|The 1st 2 authors|The first 2 authors|'
                               r'The first two authors')
    key_equal_2_3 = re.compile(r'The first 3 authors|The first three authors|the first three authors')
    key_equal_2_4 = re.compile(r'The last 2 authors|The last two authors')
    key_equal_2_5 = re.compile(r'The last 3 authors')
    key_equal_2_6 = re.compile(r'The last four authors')
    key_equal_2_7 = re.compile(r'Second and third authors')
    key_equal_3_sub = re.compile(r' have$|^and has.*Merck. |,$|^As joint first authors, |^Author contributions: |'
                                 r' performed .*?$|^Author |^Authors |^Both | both$| equally$|^Note: Both |'
                                 r' as co-corresponding author$| are joint first authors and$|, these authors have|'
                                 r', MD|, PhD|^Professors |^Drs |^Drs. |^The authors | are co-first authors and$')
    key_equal_3_split = re.compile(r',\sand\s|,\s|\sand\s|\s&\s')
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        folder = dir_list[i]
        author_list = pickle_read(os.path.join(folder, name_author_list))
        article_info = []
        for article in author_list:
            author_name = [key_author_name.search(n).groups()[0] for n in article[0]]
            author_name_split = [key_author_name_split.split(n) for n in author_name]
            author_name_split_hyphen = [key_author_name_split_hyphen.split(n) for n in author_name]
            if not author_name:
                continue
            author_corr = []
            author_equal = []
            author_equal_all = []
            equal_flag_1 = []
            equal_flag_2 = []
            for j in range(len(article[0])):
                if key_corr.search(article[0][j]):
                    author_corr.append(author_name[j])
            for line in article[1]:
                if key_equal.search(line):
                    author_equal_all.append(line)
            for line in author_equal_all:
                if key_equal_1.search(line):
                    equal_flag_temp = (key_equal_1.search(line).groups()[0], line)
                    if key_equal_1_not.search(equal_flag_temp[0]):
                        equal_flag_2.append((key_equal_2.search(line).groups()[0], line))
                    elif key_equal_1_spec.search(equal_flag_temp[0]):
                        equal_flag_1 += [(n, line) for n in key_equal_1_spec.split(equal_flag_temp[0])]
                    elif key_equal_1_sub.search(equal_flag_temp[0]):
                        equal_flag_1.append((key_equal_1_sub.search(equal_flag_temp[0]).groups()[0], line))
                    else:
                        equal_flag_1.append(equal_flag_temp)
                else:
                    equal_flag_2.append((key_equal_2.search(line).groups()[0], line))
            for line in equal_flag_1:
                if line[0] == '**':
                    key_equal_flag = re.compile(r'<sup>\*\*</sup>|<sup>\*</sup>')
                elif line[0] == '*':
                    key_equal_flag = re.compile(r'<sup>\*</sup>')
                elif line[0] == '+':
                    key_equal_flag = re.compile(r'<sup>\+</sup>')
                else:
                    key_equal_flag = re.compile(r'<sup>' + line[0] + '</sup>')
                temp = []
                for k in range(len(article[0])):
                    if key_equal_flag.search(article[0][k]):
                        temp.append(author_name[k])
                if len(temp) == 0:
                    equal_flag_2.append((key_equal_2.search(line[1]).groups()[0], line[1]))
                elif len(temp) == 1:
                    if key_equal_1_single.search(line[1]):
                        author_name.append(temp)
                    else:
                        equal_flag_2.append((key_equal_2.search(line[1]).groups()[0], line[1]))
                else:
                    author_equal.append(temp)
            for line in equal_flag_2:
                equal_split = key_equal_split.split(line[0])
                split_words = key_equal_split.findall(line[0])
                for k in range(len(split_words)):
                    equal_split[k] += split_words[k]
                for item in equal_split:
                    if key_equal.search(item):
                        equal_sentence = key_equal_2_sub.sub('', item)
                if key_equal_2_1.search(equal_sentence):
                    author_equal.append(author_name)
                    continue
                if key_equal_2_2.search(equal_sentence):
                    author_equal.append(author_name[:2])
                    continue
                if key_equal_2_3.search(equal_sentence):
                    author_equal.append(author_name[:3])
                    continue
                if key_equal_2_4.search(equal_sentence):
                    author_equal.append(author_name[-2:])
                    continue
                if key_equal_2_5.search(equal_sentence):
                    author_equal.append(author_name[-3:])
                    continue
                if key_equal_2_6.search(equal_sentence):
                    author_equal.append(author_name[-4:])
                    continue
                if key_equal_2_7.search(equal_sentence):
                    author_equal.append(author_name[1:3])
                    continue
                equal_sentence = key_equal_3_sub.sub('', equal_sentence)
                equal_name_list = key_equal_3_split.split(equal_sentence)
                author_name_modify = [' '.join(n) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    for l in range(len(author_name_modify)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[-1]+' '+' '.join(n[:-1]) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[0][0]+'. '+' '.join(n[1:]) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[0][0]+' '+' '.join(n[1:]) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = ['.'.join([m[0] for m in n])+'.' for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = ['. '.join([m[0] for m in n])+'.' for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[0]+n[-1][0]+'.' for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = []
                for k in range(len(author_name_split)):
                    if len(author_name_split[k]) > 2:
                        author_name_modify.append(author_name_split[k][0][0]+'.'+author_name_split[k][1][0]+'. '+
                                                  ' '.join(author_name_split[k][2:]))
                    else:
                        author_name_modify.append('')
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = []
                for k in range(len(author_name_split)):
                    if len(author_name_split[k]) > 2:
                        author_name_modify.append(author_name_split[k][0][0]+'. '+author_name_split[k][1][0]+'. '+
                                                  ' '.join(author_name_split[k][2:]))
                    else:
                        author_name_modify.append('')
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [' '.join(n[:-1])+' '+n[-1][0] for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [n[-1] for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if equal_name_list[k].lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = []
                for k in range(len(author_name_split_hyphen)):
                    if len(author_name_split_hyphen[k]) > 2:
                        author_name_modify.append(author_name_split_hyphen[k][0][0]+'.'+
                                                  author_name_split_hyphen[k][1][0]+'. '+
                                                  ' '.join(author_name_split_hyphen[k][2:]))
                    else:
                        author_name_modify.append('')
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_hyphen = key_author_name_hyphen.sub('', equal_name_list[k])
                    for l in range(len(author_name)):
                        if equal_name_hyphen.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = []
                for k in range(len(author_name_split_hyphen)):
                    if len(author_name_split_hyphen[k]) > 2:
                        author_name_modify.append(author_name_split_hyphen[k][0][0]+'. '+
                                                  author_name_split_hyphen[k][1][0]+'. '+
                                                  ' '.join(author_name_split_hyphen[k][2:]))
                    else:
                        author_name_modify.append('')
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_hyphen = key_author_name_hyphen.sub('', equal_name_list[k])
                    for l in range(len(author_name)):
                        if equal_name_hyphen.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [' '.join(n) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_temp = key_author_name_split.split(equal_name_list[k])[0][0] + '. ' + \
                                      ' '.join(key_author_name_split.split(equal_name_list[k])[1:])
                    for l in range(len(author_name)):
                        if equal_name_temp.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = ['.'.join([m[0] for m in n])+'.' for n in author_name_split_hyphen]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_temp = re.sub('-', '', equal_name_list[k])
                    for l in range(len(author_name)):
                        if equal_name_temp.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = ['.'.join([m[0] for m in n]) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_temp = re.sub('-', '', equal_name_list[k])
                    for l in range(len(author_name)):
                        if equal_name_temp.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                author_name_modify = [''.join([m[0] for m in n]) for n in author_name_split_hyphen]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    equal_name_temp = re.sub('-', '', equal_name_list[k])
                    for l in range(len(author_name)):
                        if equal_name_temp.lower() == author_name_modify[l].lower():
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                key_author_name_temp = [re.compile(n[-1]) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if key_author_name_temp[l].search(equal_name_list[k]):
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                key_author_name_temp = [re.compile(n[0]) for n in author_name_split]
                for k in range(len(equal_name_list)):
                    if type(equal_name_list[k]) == list:
                        continue
                    for l in range(len(author_name)):
                        if key_author_name_temp[l].search(equal_name_list[k]):
                            equal_name_list[k] = [author_name[l]]
                            break
                if judge_complete():
                    continue
                if equal_name_list[0] in ['T.C-D.', ['Roy Phitayakorn'], 'CDA', 'ASm', 'H.-P.K',
                                          ['Francesca Moro'], 'Y.-J. T.', 'A.M.D.A.', 'LK.M.', ['Andras Hoffman'],
                                          'K.R.', 'M.dC.V.H.', 'Y-G.K.', ['Scott J. Robbie'], ['Seung Hoon Woo'],
                                          'M.S.M', 'C.J.C.T.', ['Klaas J. Wardenaar'], 'L.-Q. X.',
                                          ['Massimiliano Fusaro'], ['Oliver Husser'], ['Icela Palma'], 'W-M.L',
                                          'program. The project']:
                    author_equal.append(author_name[:2])
                    continue
                if equal_name_list[0] in ['M-T.M-G', ["Anthony V. D'Amico"], 'MC', 'CAZ', ['Arne Östman'],
                                          'J.J.V.P.']:
                    author_equal.append(author_name[-2:])
                    continue
                if equal_name_list[0] in ['MH']:
                    author_equal.append(author_name[1:3])
                    continue
                if equal_name_list[0] in [['Chunsheng Liu']]:
                    author_equal.append(author_name[:3])
                    continue
                if equal_name_list[0] in [['Leonidas Chouliaras']]:
                    author_equal.append(author_name[:2]+author_name[-2:])
                    continue
                if equal_name_list[0] in [['Karin Hek']]:
                    author_equal.append(author_name[:5])
                    continue
                if equal_name_list[0] in [['Cornelia M. van Duijn']]:
                    author_equal.append(author_name[-8:])
            for j in range(len(author_equal)):
                temp = []
                for line in author_equal[j]:
                    if not line in temp:
                        temp.append(line)
                if len(temp) == 1 and not author_name[0] in temp:
                    temp.append(author_name[0])
                author_equal[j] = temp
            article_index = article[3]
            article_type = article[4]
            article_title = article[5]
            article_doi = article[2]
            article_info.append((author_name, author_corr, author_equal, article_index, article_type, article_title,
                                article_doi))
        pickle_write(article_info, os.path.join(folder, name_article_info))
convert_data()
import os
from defined_function.get_dir_list import get_dir_list
from defined_function.pickle_var_file import pickle_read
from defined_function.read_data_list import read_data_list


def convert_data():
    """Aggregate pickled per-journal 'article_info' data into TSV overviews.

    For each journal folder returned by get_dir_list(), classifies every
    article by its authorship structure, writes a per-article
    'article_overview.txt' and a one-line 'journal_overview.txt' into the
    folder, and finally writes a combined 'overview.txt' in the current
    working directory.
    """
    def get_article_property():
        """Classify the current article (closure over the author_* vars).

        Collapses each equal-contribution group inside ``author_name`` into a
        single list element, then returns
        (article_property, author_first, author_equal_order) where
        article_property is a sorted list of codes:
          1/2/3 - several / exactly one / no corresponding author
          4/5   - position 1 is not / is an equal-contribution group
          6/7   - first author is / is not a corresponding author
          8     - some equal-contribution group sits beyond position 1
        """
        article_property = []
        author_equal_order = []
        if author_equal:
            # Replace the first listed member of each equal-contribution
            # group with the whole group, then remove the other members.
            for line_1 in author_equal:
                for i in range(len(author_name)):
                    if author_name[i] in line_1:
                        operation = (i, line_1)
                        break
                # NOTE(review): if no member of line_1 matches, `operation`
                # is stale (or unbound for the very first group) -- assumes
                # every group contains at least one listed author; confirm
                # this holds for the pickled data.
                author_name[operation[0]] = operation[1]
                for line_2 in line_1:
                    if line_2 in author_name:
                        author_name.remove(line_2)
            # Record the 1-based positions of the collapsed groups.
            for i in range(len(author_name)):
                if isinstance(author_name[i], list):
                    author_equal_order.append(i + 1)
        if len(author_corr) > 1:
            article_property.append(1)
        elif len(author_corr) == 1:
            article_property.append(2)
        else:
            article_property.append(3)
        if 1 not in author_equal_order:
            article_property.append(4)
        else:
            article_property.append(5)
        author_first = author_name[0]
        if isinstance(author_first, list):
            # First slot is an equal-contribution group: code 6 if any
            # member is a corresponding author (for-else handles "none").
            for line in author_first:
                if line in author_corr:
                    article_property.append(6)
                    break
            else:
                article_property.append(7)
        else:
            if author_first in author_corr:
                article_property.append(6)
            else:
                article_property.append(7)
        for line in author_equal_order:
            if line != 1:
                article_property.append(8)
                break
        article_property.sort()
        return article_property, author_first, author_equal_order

    def count_article():
        """Tally the current article into ``count``.

        Returns 1 for a single-author article (counted only in count[12]),
        otherwise 0.  Buckets: count[0] total multi-author; count[1..8] the
        (corresponding-author x equal-contribution x first-author)
        combinations; count[9]/count[10] no corresponding author;
        count[11] articles carrying code 8 (late equal group).
        """
        nonlocal count
        if len(author_name) == 1 and isinstance(author_name[0], str):
            count[12] += 1
            return 1
        count[0] += 1
        if 8 in article_property:
            count[11] += 1
        if 3 in article_property:
            if 4 in article_property:
                count[9] += 1
            elif 5 in article_property:
                count[10] += 1
        elif 1 in article_property:
            if 4 in article_property:
                if 6 in article_property:
                    count[1] += 1
                elif 7 in article_property:
                    count[2] += 1
            elif 5 in article_property:
                if 6 in article_property:
                    count[3] += 1
                elif 7 in article_property:
                    count[4] += 1
        elif 2 in article_property:
            if 4 in article_property:
                if 6 in article_property:
                    count[5] += 1
                elif 7 in article_property:
                    count[6] += 1
            elif 5 in article_property:
                if 6 in article_property:
                    count[7] += 1
                elif 7 in article_property:
                    count[8] += 1
        return 0

    name_article_info = 'article_info'
    name_journal_info = 'journal_info.txt'
    name_article_overview = 'article_overview.txt'
    name_journal_overview = 'journal_overview.txt'
    name_overview = 'overview.txt'
    overview = ['Name\tISSN\tTotal\tTotal(Excluding single author)\t(1 4 6)\t(1 4 7)\t(1 5 6)\t(1 5 7)\t'
                '(2 4 6)\t(2 4 7)\t(2 5 6)\t(2 5 7)\t(3 4)\t(3 5)\t(8)\n']
    dir_list = get_dir_list()
    for i in range(len(dir_list)):
        print('Getting {} of {} journals...'.format(i+1, len(dir_list)))
        folder = dir_list[i]
        article_info = pickle_read(os.path.join(folder, name_article_info))
        count = [0] * 13  # classification buckets, see count_article()
        article_overview = ['Title\tDOI\tSingle Author\tArticle Type\tArticle Property\tAuthor First\tAuthor Corr\tAuthor Equal\t'
                            'Equal Order\tAuthor All\tFile\n']
        for article in article_info:
            # Tuple layout produced by get_info():
            # (names, corresponding, equal-groups, index, type, title, doi)
            article_index = article[3]
            article_type = article[4]
            article_doi = article[6]
            article_title = article[5]
            author_name = article[0]
            author_corr = article[1]
            author_equal = article[2]
            article_property, author_first, author_equal_order = get_article_property()
            flag_single = count_article()
            article_overview.append('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(article_title,
                                    article_doi, flag_single, article_type, article_property, author_first, author_corr,
                                    author_equal, author_equal_order, author_name, article_index))
        journal_info = read_data_list(os.path.join(dir_list[i], name_journal_info))
        journal_name = journal_info[0].split('\t')[1]
        journal_issn = journal_info[1].split('\t')[1]
        journal_overview = '{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(journal_name,
                            journal_issn, count[0]+count[12], count[0], count[1], count[2], count[3], count[4],
                            count[5], count[6], count[7], count[8], count[9], count[10], count[11])
        overview.append(journal_overview)
        # Context managers guarantee the handles are closed even on error.
        with open(os.path.join(folder, name_article_overview), mode='w', encoding='utf-8') as f:
            f.writelines(article_overview)
        with open(os.path.join(folder, name_journal_overview), mode='w', encoding='utf-8') as f:
            f.write(overview[0] + journal_overview)
    with open(name_overview, mode='w', encoding='utf-8') as f:
        f.writelines(overview)
 get_dir_list()
import os


def get_dir_list():
    """Return the names of all sub-directories of the current working
    directory, filtering out files and other non-directory entries.

    Replaces the original build-a-removal-list-then-remove pattern with a
    single comprehension; result contents are identical.
    """
    return [entry for entry in os.listdir() if os.path.isdir(entry)]

pickle_var_file()

import os
import pickle
import sys

def pickle_read(file_name):
    """Unpickle and return the object stored in *file_name*.

    Prints an error message and exits the process with status 1 if the
    file does not exist (the module's error-handling convention).
    """
    if not os.path.isfile(file_name):
        print('Error while opening {}!'.format(file_name))
        sys.exit(1)
    # `with` guarantees the handle is closed even if unpickling raises.
    with open(file_name, 'rb') as file:
        return pickle.load(file)

def pickle_write(var, file_name):
    """Pickle *var* to *file_name*, overwriting any existing file.

    Uses a context manager so the handle is closed even if pickling raises.
    """
    with open(file_name, 'wb') as file:
        pickle.dump(var, file)
read_data_list()

import os
import sys


def read_data_list(file_name):
    """Read *file_name* and return its lines with the trailing character
    stripped from each (normally the newline).

    NOTE(review): the [:-1] slice drops the last character of every line
    even when the final line has no trailing newline -- preserved as-is;
    assumes every line in the data files ends with a newline.

    Prints an error message and exits the process with status 1 if the
    file does not exist (the module's error-handling convention).
    """
    if not os.path.isfile(file_name):
        print('Error while opening {}!'.format(file_name))
        sys.exit(1)
    # `with` guarantees the handle is closed; iterating the file object
    # yields lines exactly as readlines() did.
    with open(file_name) as file:
        return [line[:-1] for line in file]




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值