爬虫--获取机器之心 SOTA模型的数据

参考博客:http://c.biancheng.net/view/2011.html

SOTA模型:https://www.jiqizhixin.com/sota 

python将 str 转换成 json 的方法:

https://blog.csdn.net/Pythoncxy/article/details/95203732

进入自然语言模块,然后又返回前一层,观察变化情况,多出来的 tech_fields 就是我们要找的。

对应的 url: https://api.jiqizhixin.com/sota/tech_fields,方法是get  

代码如下:test1.py

#coding=utf-8
import json
import requests
import xlwt

def detect():
    """Probe the NLP tech-field endpoint and print each task's name and summary.

    Debug helper: dumps the decoded JSON payload first so the response
    structure can be inspected before deciding which keys to extract.
    """
    res = requests.get('https://applications.jiqizhixin.com/tech_fields/8a0ace81-a1e8-44ec-962f-dbe351a26e37/tech_tasks')
    payload = json.loads(res.text)
    print(payload)

    # Walk the task list contained in the decoded payload.
    for task in payload['tech_tasks']:
        print(task['name'])
        print(task['summary'])

def process1():             # first layer of the site
    """Fetch all top-level tech fields and write url/name/alias/summary to a txt file.

    Review fix: the API can return ``null`` for optional fields — ``process2``
    already guards ``summary`` — so ``alias_name``/``summary`` are now written
    only when present instead of raising ``TypeError`` on ``str + None``.
    """
    with open(r"C:\Users\sky\Desktop\爬虫\document_1.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        all_news = json.loads(res.text)
        print(all_news)
        for each_news in all_news:
            file.write('   ' + 'https://www.jiqizhixin.com/sota/tech-fields/' + each_news['id'] + '\n')
            file.write('   ' + each_news['name'] + '\n')
            # Optional fields may be null in the payload; skip the line
            # rather than crash on string concatenation with None.
            if each_news['alias_name'] is not None:
                file.write('   ' + each_news['alias_name'] + '\n')
            if each_news['summary'] is not None:
                file.write('   ' + each_news['summary'] + '\n')

def process2():            # second layer of the site
    """For every tech field, fetch its tech tasks and write name/summary to a txt file.

    Review fix: the original reused ``all_news``/``each_news`` for the inner
    request, rebinding the outer loop's names mid-iteration — it happened to
    work but was fragile; distinct names are used now. The redundant if/else
    (both branches wrote ``name``) is collapsed into a single write plus a
    null guard for ``summary``.
    """
    with open(r"C:\Users\sky\Desktop\爬虫\document_2.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        fields = json.loads(res.text)
        for field in fields:
            print(field['id'])
            url = 'https://applications.jiqizhixin.com/tech_fields/' + field['id'] + '/tech_tasks'
            task_res = requests.get(url)
            tasks = json.loads(task_res.text)
            for task in tasks['tech_tasks']:
                # Name is always present; summary may be null in the payload.
                file.write('   ' + task['name'] + '\n')
                if task['summary'] is not None:
                    file.write('   ' + task['summary'] + '\n')

def process_all():            # both layers of the site
    """Write every field (numbered name, alias, url, summary) followed by all
    of its tasks' names/summaries to document_all.txt.

    Review fixes: the original reused ``all_news``/``each_news`` for the inner
    request (fragile shadowing), and concatenated ``alias_name``/``summary``
    without a null guard even though its own inner loop guarded ``summary``.
    ``enumerate`` replaces the manual ``num`` counter.
    """
    with open(r"C:\Users\sky\Desktop\爬虫\document_all.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        fields = json.loads(res.text)
        for num, field in enumerate(fields, start=1):
            file.write(str(num) + '、' + field['name'] + '\n')
            # Optional fields may be null; skip rather than crash.
            if field['alias_name'] is not None:
                file.write('   ' + field['alias_name'] + '\n')
            file.write('   ' + 'https://www.jiqizhixin.com/sota/tech-fields/' + field['id'] + '\n')
            if field['summary'] is not None:
                file.write('   ' + field['summary'] + '\n')

            print(field['id'])     # id of the sub-module being fetched
            url = 'https://applications.jiqizhixin.com/tech_fields/' + field['id'] + '/tech_tasks'
            task_res = requests.get(url)
            tasks = json.loads(task_res.text)
            for task in tasks['tech_tasks']:
                file.write('   ' + task['name'] + '\n')
                if task['summary'] is not None:
                    file.write('   ' + task['summary'] + '\n')

def process_xls():      # write both layers into an xls file
    """Export fields and their tasks to ``file.xls``: one row per field
    (name / alias / url / summary in columns 0-3), then one row per task
    (name in column 1, summary in column 2, indented layout).

    Review fixes: distinct names for the inner request instead of rebinding
    ``all_news``/``each_news`` mid-iteration; the unused ``colum`` local is
    removed; the redundant if/else (both branches wrote ``name``) is a single
    write plus a null guard for ``summary``.
    """
    workbook = xlwt.Workbook(encoding='utf-8')    # create the workbook
    booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)     # create the sheet

    res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
    fields = json.loads(res.text)
    row = 0

    for field in fields:
        booksheet.write(row, 0, field['name'])
        booksheet.write(row, 1, field['alias_name'])
        booksheet.write(row, 2, 'https://www.jiqizhixin.com/sota/tech-fields/' + field['id'])
        booksheet.write(row, 3, field['summary'])
        row = row + 1

        url = 'https://applications.jiqizhixin.com/tech_fields/' + field['id'] + '/tech_tasks'
        task_res = requests.get(url)
        tasks = json.loads(task_res.text)
        for task in tasks['tech_tasks']:
            booksheet.write(row, 1, task['name'])
            # summary may be null; leave the cell empty in that case
            if task['summary'] is not None:
                booksheet.write(row, 2, task['summary'])
            row = row + 1

    # persist the workbook to disk
    workbook.save('file.xls')

if __name__=='__main__':
    # detect()         # inspect the raw API response structure
    # process1()       # first-layer data only
    # process2()       # second-layer data only
    # process_all()      # both layers, txt output
    process_xls()        # both layers, xls output

或者:test_yang.py

import json
import requests

# Browser-like User-Agent so the API does not reject the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
# BUG FIX: requests.get(url, headers) passed the dict as the second
# positional argument, which is `params` (the query string), NOT the HTTP
# headers — it must be the keyword argument headers=.
res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/', headers=headers)
all_data = json.loads(res.text)
index = 1

# open() for file I/O; the with statement closes the file automatically.
with open(r"data.txt", "w", encoding='utf-8') as file:
    # One numbered line per tech field, its tasks indented below it.
    for each_data in all_data:
        file.write(str(index) + '.' + each_data['name'] + '(' + each_data['alias_name'] + ')' + ':' + each_data[
            'summary'] + '\n')
        url = 'https://applications.jiqizhixin.com/tech_fields/' + each_data['id'] + '/tech_tasks'
        strhtml = requests.get(url, headers=headers)
        all_data1 = json.loads(strhtml.text)
        for each_data1 in all_data1['tech_tasks']:
            if each_data1['summary'] is None:
                file.write('    ' + each_data1['name'] + '\n')
            else:
                file.write('    ' + each_data1['name'] + ':' + each_data1['summary'] + '\n')
        index = index + 1

结果:

方法二:未使用 json 转换时(很麻烦)

# 参考文章:           http://c.biancheng.net/view/2011.html
# 机器之心SOTA模型:   https://www.jiqizhixin.com/sota
#coding=utf-8
import requests
from bs4 import BeautifulSoup

def spider():
    """Fetch the tech-field list and parse it by raw string splitting (no json module).

    NOTE(review): this naive split(',') / split(':') parsing breaks whenever a
    value itself contains a comma or colon (summaries often do) — the
    json-based version of this scraper is the robust one. Kept as a
    demonstration of the "hard way".
    """
    # Fetch the data with an HTTP GET request
    url='https://api.jiqizhixin.com/sota/tech_fields'
    strhtml = requests.get(url)        # GET the raw page body
    print(strhtml.text)

    # Crude tokenisation: split the raw JSON text on commas.
    a = strhtml.text.split(',')
    # print(a[1])
    # print(len(a))
    # for i in a:    # inspect the token contents
    #     print(i)
    with open(r"C:\Users\sky\Desktop\爬虫\document.txt", "w",encoding='utf-8') as file:
     for i in range(len(a)):
        # print(a[i])
        # Split each "key":"value" token and strip the quotes.
        a1 = a[i].split(':')
        a2=a1[1].replace('"', '')
        print(a2)
        # Every 4th token is assumed to be the field id — prefix it with the
        # public URL; presumably the payload has 4 keys per record (verify).
        if (i%4==0):
            file.write('https://www.jiqizhixin.com/sota/tech-fields/'+a2+'\n')
        else:
            file.write(a2 + '\n')

def spider_next():
    """Fetch one field's task list and dump the comma-split tokens for inspection.

    Exploration helper only: the write-out logic below is still commented out
    because the second-level payload is too irregular for the split-based
    approach (see the printed token dump).
    """
    # Fetch the data with an HTTP GET request
    url='https://applications.jiqizhixin.com/tech_fields/8a0ace81-a1e8-44ec-962f-dbe351a26e37/tech_tasks'
    strhtml = requests.get(url)        # GET the raw page body
    print(strhtml.text)

    # Crude tokenisation: split the raw JSON text on commas.
    a = strhtml.text.split(',')
    print(a)
    print(len(a))
    for i in a:    # inspect the token contents (843 lines in total)
        print(i)
    # with open(r"C:\Users\sky\Desktop\爬虫\document.txt", "w",encoding='utf-8') as file:
    #  for i in range(len(a)):
    #     # print(a[i])
    #     a1 = a[i].split(':')
    #     a2=a1[1].replace('"', '')
    #     print(a2)
    #     if (i%4==0):
    # https://www.jiqizhixin.com/sota/tech-task/32b9c966-605c-48ef-806c-c5ffb53d8c35
    #         file.write('https://www.jiqizhixin.com/sota/tech-task/'+a2+'\n')
    #     else:
    #         file.write(a2 + '\n')

if __name__=='__main__':
    spider()
    # spider_next()    # second-level dump — output is very messy

运行结果: 

运行第二个函数:分类很复杂

 

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值