Reference blog: http://c.biancheng.net/view/2011.html
SOTA models: https://www.jiqizhixin.com/sota
How to convert a str to JSON in Python:
https://blog.csdn.net/Pythoncxy/article/details/95203732
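In short, json.loads() turns a JSON string into Python objects (dicts/lists) and json.dumps() goes the other way; a minimal example (the sample string here is made up for illustration):

import json

s = '{"name": "Text Classification", "summary": null}'
obj = json.loads(s)                          # str -> dict; JSON null becomes Python None
print(obj['name'], obj['summary'])
print(json.dumps(obj, ensure_ascii=False))   # dict -> str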
Open the Natural Language module, then go back to the previous level and observe what changes in the network panel; the extra tech_fields request is the one we are looking for.
The corresponding URL is https://api.jiqizhixin.com/sota/tech_fields, and the request method is GET.
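Before writing the full script, a quick check of that endpoint helps confirm its shape (a minimal sketch; according to the scripts below, the response parses to a list of dicts with keys such as id, name, alias_name and summary):

import json
import requests

res = requests.get('https://api.jiqizhixin.com/sota/tech_fields')
data = json.loads(res.text)       # parse the JSON body into Python objects
print(type(data), len(data))      # expected: a list, one entry per tech field
print(data[0].keys())             # inspect the available keys before writing the real script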
The code is as follows (test1.py):
# coding=utf-8
import json
import requests
import xlwt


def detect():
    # res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
    res = requests.get('https://applications.jiqizhixin.com/tech_fields/8a0ace81-a1e8-44ec-962f-dbe351a26e37/tech_tasks')
    # res.encoding = 'gbk'  # convert the response to gbk encoding if needed
    all_news = json.loads(res.text)
    print(all_news)
    # Print res.text first and inspect its elements to decide which data to extract.
    # for i in all_news:
    #     print(i)
    # Iterate over the list parsed from the JSON data.
    for each_news in all_news['tech_tasks']:
        print(each_news['name'])
        print(each_news['summary'])


def process1():  # first layer of the site
    with open(r"C:\Users\sky\Desktop\爬虫\document_1.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        all_news = json.loads(res.text)
        print(all_news)
        for each_news in all_news:
            file.write(' ' + 'https://www.jiqizhixin.com/sota/tech-fields/' + each_news['id'] + '\n')
            file.write(' ' + each_news['name'] + '\n')
            file.write(' ' + each_news['alias_name'] + '\n')
            file.write(' ' + each_news['summary'] + '\n')


def process2():  # second layer of the site
    with open(r"C:\Users\sky\Desktop\爬虫\document_2.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        all_news = json.loads(res.text)
        for each_news in all_news:
            print(each_news['id'])
            url = 'https://applications.jiqizhixin.com/tech_fields/' + each_news['id'] + '/tech_tasks'
            res = requests.get(url)
            task_data = json.loads(res.text)
            for each_task in task_data['tech_tasks']:
                if each_task['summary'] is None:
                    file.write(' ' + each_task['name'] + '\n')
                else:
                    file.write(' ' + each_task['name'] + '\n')
                    file.write(' ' + each_task['summary'] + '\n')


def process_all():  # both layers
    with open(r"C:\Users\sky\Desktop\爬虫\document_all.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        all_news = json.loads(res.text)
        num = 1
        for each_news in all_news:
            file.write(str(num) + '、' + each_news['name'] + '\n')  # str() converts the number for concatenation
            file.write(' ' + each_news['alias_name'] + '\n')
            file.write(' ' + 'https://www.jiqizhixin.com/sota/tech-fields/' + each_news['id'] + '\n')
            file.write(' ' + each_news['summary'] + '\n')
            num = num + 1
            print(each_news['id'])  # each sub-module
            url = 'https://applications.jiqizhixin.com/tech_fields/' + each_news['id'] + '/tech_tasks'
            res = requests.get(url)
            task_data = json.loads(res.text)
            for each_task in task_data['tech_tasks']:
                if each_task['summary'] is None:
                    file.write(' ' + each_task['name'] + '\n')
                else:
                    file.write(' ' + each_task['name'] + '\n')
                    file.write(' ' + each_task['summary'] + '\n')


def process_xls():  # write to an xls file
    workbook = xlwt.Workbook(encoding='utf-8')  # create the workbook
    booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)  # create the sheet
    res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
    all_news = json.loads(res.text)
    # print(all_news)
    row = 0
    for each_news in all_news:
        # print(each_news['name'])
        booksheet.write(row, 0, each_news['name'])
        booksheet.write(row, 1, each_news['alias_name'])
        booksheet.write(row, 2, 'https://www.jiqizhixin.com/sota/tech-fields/' + each_news['id'])
        booksheet.write(row, 3, each_news['summary'])
        row = row + 1
        # print(each_news['id'])  # each sub-module
        url = 'https://applications.jiqizhixin.com/tech_fields/' + each_news['id'] + '/tech_tasks'
        res = requests.get(url)
        task_data = json.loads(res.text)
        # print(task_data)
        for each_task in task_data['tech_tasks']:
            if each_task['summary'] is None:
                booksheet.write(row, 1, each_task['name'])
            else:
                booksheet.write(row, 1, each_task['name'])
                booksheet.write(row, 2, each_task['summary'])
            row = row + 1
    # save the file
    workbook.save('file.xls')


if __name__ == '__main__':
    # detect()       # inspect the page data
    # process1()     # fetch the first-layer data
    # process2()     # fetch the second-layer data
    # process_all()  # fetch both layers, txt format
    process_xls()    # fetch both layers, xls format
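None of the functions above check the HTTP status before calling json.loads, so an error page from the server would surface as an unexplained JSONDecodeError. A small hardening sketch (raise_for_status(), timeout and res.json() are standard requests features; the helper name fetch_json is only for illustration):

import requests

def fetch_json(url):
    res = requests.get(url, timeout=10)
    res.raise_for_status()        # fail loudly on 4xx/5xx instead of parsing an error page
    return res.json()             # requests decodes JSON directly, same as json.loads(res.text)

# e.g. fields = fetch_json('https://api.jiqizhixin.com/sota/tech_fields/')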
Alternatively (test_yang.py):
import json
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/', headers=headers)
all_data = json.loads(res.text)
index = 1
# open() reads/writes the file; the with statement closes it automatically
with open(r"data.txt", "w", encoding='utf-8') as file:
    # Iterate over the list parsed from the JSON data.
    for each_data in all_data:
        # print(each_data['id'])
        file.write(str(index) + '.' + each_data['name'] + '(' + each_data['alias_name'] + ')'
                   + ':' + each_data['summary'] + '\n')
        url = 'https://applications.jiqizhixin.com/tech_fields/' + each_data['id'] + '/tech_tasks'
        strhtml = requests.get(url, headers=headers)
        all_data1 = json.loads(strhtml.text)
        # print(type(all_data1))
        # print(type(all_data1['tech_tasks']))
        for each_data1 in all_data1['tech_tasks']:
            if each_data1['summary'] is None:
                file.write(' ' + each_data1['name'] + '\n')
            else:
                file.write(' ' + each_data1['name'] + ':' + each_data1['summary'] + '\n')
        index = index + 1
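Note that test_yang.py concatenates name, alias_name and summary directly; if the API ever returns null for one of these first-layer fields, the concatenation raises a TypeError. A defensive variant of that write (a sketch; format_field is an illustrative helper name, and it simply substitutes an empty string for missing values):

def format_field(field, index):
    # Substitute empty strings for missing values so concatenation never fails.
    name = field.get('name') or ''
    alias = field.get('alias_name') or ''
    summary = field.get('summary') or ''
    return str(index) + '.' + name + '(' + alias + ')' + ':' + summary + '\n'

# usage inside the outer loop: file.write(format_field(each_data, index))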
Result:
Method 2: without JSON conversion (much more cumbersome)
# Reference article: http://c.biancheng.net/view/2011.html
# Jiqizhixin SOTA models: https://www.jiqizhixin.com/sota
# coding=utf-8
import requests
from bs4 import BeautifulSoup  # imported but not actually used in this version


def spider():
    # Fetch the data with a GET request
    url = 'https://api.jiqizhixin.com/sota/tech_fields'
    strhtml = requests.get(url)  # fetch the page data via GET
    print(strhtml.text)
    a = strhtml.text.split(',')
    # print(a[1])
    # print(len(a))
    # for i in a:  # inspect the string contents
    #     print(i)
    with open(r"C:\Users\sky\Desktop\爬虫\document.txt", "w", encoding='utf-8') as file:
        for i in range(len(a)):
            # print(a[i])
            a1 = a[i].split(':')
            a2 = a1[1].replace('"', '')
            print(a2)
            if i % 4 == 0:
                file.write('https://www.jiqizhixin.com/sota/tech-fields/' + a2 + '\n')
            else:
                file.write(a2 + '\n')


def spider_next():
    # Fetch the data with a GET request
    url = 'https://applications.jiqizhixin.com/tech_fields/8a0ace81-a1e8-44ec-962f-dbe351a26e37/tech_tasks'
    strhtml = requests.get(url)  # fetch the page data via GET
    print(strhtml.text)
    a = strhtml.text.split(',')
    print(a)
    print(len(a))
    for i in a:  # inspect the string contents, 843 lines in total
        print(i)
    # with open(r"C:\Users\sky\Desktop\爬虫\document.txt", "w", encoding='utf-8') as file:
    #     for i in range(len(a)):
    #         # print(a[i])
    #         a1 = a[i].split(':')
    #         a2 = a1[1].replace('"', '')
    #         print(a2)
    #         if i % 4 == 0:
    #             # https://www.jiqizhixin.com/sota/tech-task/32b9c966-605c-48ef-806c-c5ffb53d8c35
    #             file.write('https://www.jiqizhixin.com/sota/tech-task/' + a2 + '\n')
    #         else:
    #             file.write(a2 + '\n')


if __name__ == '__main__':
    spider()
    # spider_next()
Run result:
Running the second function: the task data is nested, so it is far too complicated to pull apart this way.
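The underlying problem with Method 2 is that splitting the raw response text on ',' and ':' breaks as soon as a name or summary itself contains one of those characters, while json.loads parses the structure correctly regardless; a tiny comparison (the sample string is made up for illustration):

import json

sample = '{"name": "NER", "summary": "Tags entities, e.g. person, place: LOC"}'
print(sample.split(','))                 # the naive split cuts the summary in half
print(json.loads(sample)['summary'])     # json.loads returns the field intact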