Collecting Related Articles from CNKI (知网)

import os
import requests
import json
import pandas as pd

# Crawl target: see the URL built inside crawl() below
# Write the scraped data from Python into an Excel file
def write_to_excel(result, path, sheet_name):
    """Append result to an Excel sheet, de-duplicating against existing rows."""
    if os.path.exists(path):
        # Merge with the rows already on disk and drop exact duplicates,
        # so repeated runs do not produce repeated records
        original_df = pd.read_excel(path, sheet_name=sheet_name)
        result = pd.concat([original_df, result], axis=0, join='outer')
        result = result.drop_duplicates()
    # ExcelWriter.save() was removed in pandas 1.5+; the context manager
    # closes and saves the file on exit
    with pd.ExcelWriter(path) as writer:
        result.to_excel(writer, sheet_name=sheet_name, index=False)
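
# Usage sketch for write_to_excel (file name and rows are hypothetical, for
# illustration only). Calling it twice with the same rows leaves a single
# copy of each record, thanks to the concat + drop_duplicates step above:
#   demo = pd.DataFrame({'Chinese_title': ['示例'], 'English_title': ['example'],
#                        'url': ['https://example.com']})
#   write_to_excel(demo, 'demo.xlsx', 'result')
#   write_to_excel(demo, 'demo.xlsx', 'result')  # no duplicate rows afterwards
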
def crawl(step, start_point, Save_path):
    """step为每批次爬取的页数,开始页数,保存路径"""
    df = pd.DataFrame()
    count = 0
    # step is the number of pages per batch; group_num is the number of
    # complete batches within the 2465 total pages
    group_num = int(2465 / step)
    start_group = int(start_point / step)  # e.g. start_point=2001, step=50 -> group 40
    # Record the page numbers that failed to fetch
    failure_get = []
    for group in range(start_group, group_num + 1):
        # For a full batch, loop over all step pages; for the final partial
        # batch, loop over only the remaining pages (+1 because range()
        # excludes its upper bound). max_page also avoids shadowing the
        # built-in max().
        if group != group_num:
            max_page = step + 1
        else:
            max_page = 2465 - step * group_num + 1
        for m in range(1, max_page):
            j = group * step + m
            try:
                print(f'Crawling page {j}')
                url = f'https://mci.cnki.net//statistics/query?q=&start={j}&size=20'
                User_Agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
                cookies = 'cangjieConfig_CHKD2=%7B%22status%22%3Atrue%2C%22startTime%22%3A%222021-12-27%22%2C%22endTime%22%3A%222022-06-24%22%2C%22type%22%3A%22mix%22%2C%22poolSize%22%3A%2210%22%2C%22intervalTime%22%3A10000%2C%22persist%22%3Afalse%7D; SID=012053; Ecp_ClientId=b230620141100392435; Ecp_IpLoginFail=230620219.137.5.106'
                headers_list = {'User-Agent': User_Agent, 'Cookie': cookies}
                # Bypass any system proxy that would otherwise break requests;
                # the timeout makes a hung connection fail fast and get
                # recorded in failure_get instead of stalling the run
                proxies = {"http": None, "https": None}
                issue_url_resp = requests.get(url=url, proxies=proxies, headers=headers_list, timeout=30)
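                # The endpoint is assumed to return JSON of roughly this shape
                # (inferred from the keys read below; other fields may exist):
                #   {"data": {"data": [
                #       {"title": "...", "entitle": "...", "url": "..."},
                #       ...
                #   ]}}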
                # Parse the response once instead of re-parsing it per field
                records = json.loads(issue_url_resp.text)['data']['data']
                for item in records:
                    df.loc[count, 'Chinese_title'] = item['title']
                    df.loc[count, 'English_title'] = item['entitle']
                    df.loc[count, 'url'] = item['url']
                    count += 1
            except Exception:
                print(f'Failed to fetch page {j}!')
                failure_get.append(j)
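        # Flush after every batch so progress survives a crash;
        # write_to_excel de-duplicates, so re-writing df each time is safe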
        write_to_excel(df, Save_path, 'result')
    print('Failed pages:', failure_get)


if __name__ == '__main__':
    Save_path = r'C:\Users\Childers\中国知网关键词.xlsx'
    # Page to resume from (e.g. the first page that failed on the previous run)
    start_point = 1
    crawl(50, start_point, Save_path)
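
Failed pages are only printed at the end, via failure_get. A minimal retry pass can re-fetch just those pages. This is a sketch under the same endpoint and response-shape assumptions as the script above; the helper name retry_pages and its header value are illustrative, not part of the original:

import json
import requests
import pandas as pd

def retry_pages(pages, save_path):
    """Re-fetch the page numbers recorded in failure_get and append
    any recovered rows to the same Excel file."""
    df = pd.DataFrame()
    count = 0
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    proxies = {"http": None, "https": None}
    for j in pages:
        url = f'https://mci.cnki.net//statistics/query?q=&start={j}&size=20'
        try:
            resp = requests.get(url, headers=headers, proxies=proxies, timeout=30)
            for item in json.loads(resp.text)['data']['data']:
                df.loc[count, 'Chinese_title'] = item['title']
                df.loc[count, 'English_title'] = item['entitle']
                df.loc[count, 'url'] = item['url']
                count += 1
        except Exception:
            print(f'Page {j} failed again')
    # write_to_excel is the helper defined in the script above
    write_to_excel(df, save_path, 'result')

# Example: retry_pages([13, 207], Save_path)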
