# -*- coding: utf-8 -*-
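# CNKI search crawler, main driver.
# Reads the search settings from Config.conf, walks the result pages of
# http://search.cnki.com.cn, collects article URLs via spider_search_page,
# then calls spider_paper to scrape the details of every article.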
from configparser import ConfigParser
from urllib.parse import quote
import socket
import os
import math
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
import time
import spider_search_page
import spider_paper
if __name__ == '__main__':
print("123")
    start = time.perf_counter()  # time.clock() was removed in Python 3.8
cf = ConfigParser()
cf.read("Config.conf", encoding='utf-8')
    keyword = cf.get('base', 'keyword')  # search keyword
    maxpage = cf.getint('base', 'maxpage')  # page count (recomputed below)
    searchlocation = cf.get('base', 'searchlocation')  # field to search in
    currentpage = cf.getint('base', 'currentpage')  # page to resume from
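    # A minimal Config.conf in the layout this script expects (the values are
    # illustrative, not from the original project):
    #   [base]
    #   keyword = 深度学习
    #   searchlocation = 主题
    #   maxpage = 0
    #   currentpage = 0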
    # A fresh run starts from page 0; drop any stale output first.
    if os.path.exists('data-detail.txt') and currentpage == 0:
        print('Output file already exists; deleting it')
        os.remove('data-detail.txt')
    # Map the human-readable search field to CNKI's query prefix.
    values = {
        '全文': 'qw',        # full text
        '主题': 'theme',     # subject
        '篇名': 'title',     # article title
        '作者': 'author',    # author
        '摘要': 'abstract',  # abstract
    }
    keywordval = values[searchlocation] + ':' + keyword
    # quote() percent-encodes the Chinese characters so the URL is valid; the
    # page offset is appended to the trailing '&p=' below.
    index_url = 'http://search.cnki.com.cn/Search.aspx?q=' + quote(keywordval) + '&rank=&cluster=&val=&p='
print(index_url)
    # Determine the total number of result pages (15 hits per page); the hit
    # count is sliced out of the page-sum label.
    html = urllib.request.urlopen(index_url).read()
    soup = BeautifulSoup(html, 'html.parser')
    pagesum_text = soup.find('span', class_='page-sum').get_text()
    maxpage = math.ceil(int(pagesum_text[7:-1]) / 15)
    cf.set('base', 'maxpage', str(maxpage))
    with open('Config.conf', 'w', encoding='utf-8') as conf_file:
        cf.write(conf_file)
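    # Crawl each result page, persisting currentpage after every page so an
    # interrupted run can resume where it left off.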
for i in range(currentpage, maxpage):
        page_str_num = i * 15  # offset of the first hit on page i (15 hits per page)
        page_url = index_url + str(page_str_num)
print(page_url)
        attempts = 0
        success = False
        socket.setdefaulttimeout(10)  # time out hung connections after 10 seconds
        while attempts < 50 and not success:
            try:
                spider_search_page.get_paper_url(page_url)
                success = True
            except (socket.error, urllib.error.URLError):
                attempts += 1
                print('Retry #' + str(attempts))
        cf.set('base', 'currentpage', str(i))
        with open('Config.conf', 'w', encoding='utf-8') as conf_file:
            cf.write(conf_file)
    spider_paper.spider_paper()  # fill in the details for every collected article
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
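
# ---------------------------------------------------------------------------
# Module imported above as spider_paper (assumed filename: spider_paper.py).
# Reads the URLs collected in data-detail.txt, scrapes each article page for
# authors, affiliation, keywords, abstract and co-cited references, and
# writes everything to an Excel workbook.
# ---------------------------------------------------------------------------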
# -*- coding: utf-8 -*-
import socket
from bs4 import BeautifulSoup
import urllib.request
import urllib.error
import time
import xlwt
from configparser import ConfigParser
def spider_paper():
    start = time.perf_counter()
file = open("data-detail.txt", encoding='utf8')
cf = ConfigParser()
cf.read("Config.conf", encoding='utf-8')
    keyword = cf.get('base', 'keyword')  # search keyword, used in the output filename
    # Write the results to an Excel workbook.
    wb = xlwt.Workbook(encoding='utf-8')  # xlwt.Workbook() takes an encoding, not a filename
    sheet = wb.add_sheet("data-out")
    sheet.write(0, 0, '下载网址')  # download URL
    sheet.write(0, 1, '标题')      # title
    sheet.write(0, 2, '来源')      # source
    sheet.write(0, 3, '引用')      # citation count
    sheet.write(0, 4, '作者')      # authors
    sheet.write(0, 5, '作者单位')  # author affiliation
    sheet.write(0, 6, '关键词')    # keywords
    sheet.write(0, 7, '摘要')      # abstract
    sheet.write(0, 8, '共引文献')  # co-cited references
lines = file.readlines()
lin_num = 1
paper_list = []
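    # Each input line was written by spider_search_page as:
    #   url \t title \t source \t citation-count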
    for line in lines:
        fields = line.split('\t')
        paper_url = fields[0]
        if paper_url in paper_list:  # skip URLs already processed
            continue
        paper_list.append(paper_url)
        attempts = 0
        success = False
        socket.setdefaulttimeout(10)  # time out hung connections after 10 seconds
        while attempts < 50 and not success:
            try:
                html = urllib.request.urlopen(paper_url).read()
                soup = BeautifulSoup(html, 'html.parser')
                success = True
            except (socket.error, urllib.error.URLError):
                attempts += 1
                print('Retry #' + str(attempts))
        if not success:
            continue  # give up on this article rather than parse a stale page
        # The CNKI article page identifies these blocks only by their inline
        # styles, so match on the exact style strings.
        title = soup.find_all('div', style="text-align:center; width:740px; font-size: 28px;color: #0000a0; font-weight:bold; font-family:'宋体';")
        abstract = soup.find_all('div', style='text-align:left;word-break:break-all')
        author = soup.find_all('div', style='text-align:center; width:740px; height:30px;')
        # Extract the author names.
        for item in author:
            author = item.get_text()  # after the loop, author holds the last match's text
        # Concatenate every string fragment inside the abstract block.
        tmp = ''
        for thing in abstract:
            for string in thing.strings:
                tmp = tmp + string
        # Collapse all whitespace out of the abstract text. (The original
        # nested split loops did the same but dropped single-character
        # fragments; str.split() with no argument keeps them.)
        tstr = ''.join(tmp.split())
        # Collect the co-cited references ('共引文献') if the section exists.
        ifreferen = soup.find_all('td', class_='b14', rowspan='2')
        ref = ''
        for i in range(len(ifreferen)):
            if '【共引文献】' in ifreferen[i].get_text():
                referenceList = soup.find_all('div', id='div_Ref')  # co-cited reference list
                if len(referenceList) == 0:
                    referenceList = soup.find_all('div', class_='div_Ref')
                referenceList = referenceList[i]
                for tdref in referenceList.find_all('td', width='676'):
                    refitem = tdref.a.get("href").strip()
                    print(refitem)
                    ref = ref + refitem + ' ,'
        # Extract the author affiliation by scanning the info block from its
        # 【作者单位】 marker (or 【学位授予单位】 for theses).
        authorUnitScope = soup.find('div', style='text-align:left;', class_='xx_font')
        author_unit = ''
        author_unit_text = authorUnitScope.get_text()
        if '【作者单位】:' in author_unit_text:
            auindex = author_unit_text.find('【作者单位】:', 0)
        else:
            auindex = author_unit_text.find('【学位授予单位】:', 0)
        for k in range(auindex, len(author_unit_text)):
            if author_unit_text[k] in ('\n', '\t', '\r', '】'):
                continue
            if author_unit_text[k] == ' ' and k + 1 < len(author_unit_text) and author_unit_text[k + 1] == ' ':
                continue
            if author_unit_text[k] != '【':
                author_unit = author_unit + author_unit_text[k]
            if author_unit_text[k] == '【' and k != auindex:  # next 【...】 block starts
                break
        # Extract the keywords the same way, scanning from the 【关键词】 marker.
        key_word = ''
        kwindex = author_unit_text.find('【关键词】:', 0)
        for k in range(kwindex, len(author_unit_text)):
            if author_unit_text[k] in ('\n', '\t', '\r', '】'):
                continue
            if author_unit_text[k] == ' ' and k + 1 < len(author_unit_text) and author_unit_text[k + 1] == ' ':
                continue
            if author_unit_text[k] != '【':
                key_word = key_word + author_unit_text[k]
            if author_unit_text[k] == '【' and k != kwindex:
                break
        # Append the scraped fields to the original line and write one Excel row.
        line = line.strip('\n')
        line = line + '\t' + str(author) + '\t' + str(author_unit) + '\t' + str(key_word) + '\t' + str(tstr) + '\t' + str(ref) + '\n'
        outstring = line.split('\t')
        for i in range(len(outstring)):
            sheet.write(lin_num, i, outstring[i])
        print('Writing row ' + str(lin_num))
        lin_num += 1
wb.save('data_out_'+str(keyword)+'.xls')
file.close()
    end = time.perf_counter()
    print('Running time: %s Seconds' % (end - start))
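
# ---------------------------------------------------------------------------
# Module imported above as spider_search_page (assumed filename:
# spider_search_page.py). Scrapes one search-result page and appends each
# article's URL, title, source and citation count to data-detail.txt.
# ---------------------------------------------------------------------------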
from bs4 import BeautifulSoup
import urllib.request
import sys
import io
# Force UTF-8 stdout so Chinese titles print correctly regardless of the
# console's default encoding.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')
page_num = 15  # number of hits CNKI shows per result page
def get_paper_url(page_url):
    html = urllib.request.urlopen(page_url).read()
    soup = BeautifulSoup(html, 'html.parser')
    f = open('data-detail.txt', 'a+', encoding='utf-8')
    results = soup.find_all('div', class_='wz_content')  # one div per search hit
    for result in results:
        item = result.find('a', target='_blank')  # link wrapping the article title
        href = item.get('href')  # article URL
        title = item.get_text()  # article title
        year_count = result.find('span', class_='year-count')  # source and citation count
        publish = ''
        reference = ''
        for child in year_count:
            text = child.string
            if text is None:  # skip children without a single string
                continue
            text = text.replace('\n', '').replace('\r', '')
            if '被引次数' in text:
                reference = text  # citation count
            elif '年' in text:
                publish = text  # publication source (the entry containing a year)
        f.write(href + '\t' + title + '\t' + publish + '\t' + reference + '\n')
    f.close()