Reference blog: http://c.biancheng.net/view/2011.html
SOTA models: https://www.jiqizhixin.com/sota
How to convert a str to JSON in Python:
https://blog.csdn.net/Pythoncxy/article/details/95203732
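In short, json.loads() turns a JSON string into Python objects (dicts/lists) and json.dumps() goes the other way; a minimal example (the sample string here is made up for illustration):

import json

s = '{"name": "Text Classification", "summary": null}'
obj = json.loads(s)                          # str -> dict; JSON null becomes Python None
print(obj['name'], obj['summary'])
print(json.dumps(obj, ensure_ascii=False))   # dict -> str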
Open the Natural Language module, then go back to the previous level and observe what changes in the network panel; the extra tech_fields request is the one we are looking for.
The corresponding URL is https://api.jiqizhixin.com/sota/tech_fields, and the request method is GET.
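Before writing the full script, a quick check of that endpoint helps confirm its shape (a minimal sketch; according to the scripts below, the response parses to a list of dicts with keys such as id, name, alias_name and summary):

import json
import requests

res = requests.get('https://api.jiqizhixin.com/sota/tech_fields')
data = json.loads(res.text)       # parse the JSON body into Python objects
print(type(data), len(data))      # expected: a list, one entry per tech field
print(data[0].keys())             # inspect the available keys before writing the real script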
The code is as follows (test1.py):
# coding=utf-8
import json
import requests
import xlwt


def detect():
    # res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
    res = requests.get('https://applications.jiqizhixin.com/tech_fields/8a0ace81-a1e8-44ec-962f-dbe351a26e37/tech_tasks')
    # res.encoding = 'gbk'  # convert the response to gbk encoding if needed
    all_news = json.loads(res.text)
    print(all_news)
    # Print res.text first and inspect its elements to decide which data to extract.
    # for i in all_news:
    #     print(i)
    # Iterate over the list parsed from the JSON data.
    for each_news in all_news['tech_tasks']:
        print(each_news['name'])
        print(each_news['summary'])


def process1():  # first layer of the site
    with open(r"C:\Users\sky\Desktop\爬虫\document_1.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        all_news = json.loads(res.text)
        print(all_news)
        for each_news in all_news:
            file.write(' ' + 'https://www.jiqizhixin.com/sota/tech-fields/' + each_news['id'] + '\n')
            file.write(' ' + each_news['name'] + '\n')
            file.write(' ' + each_news['alias_name'] + '\n')
            file.write(' ' + each_news['summary'] + '\n')


def process2():  # second layer of the site
    with open(r"C:\Users\sky\Desktop\爬虫\document_2.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        all_news = json.loads(res.text)
        for each_news in all_news:
            print(each_news['id'])
            url = 'https://applications.jiqizhixin.com/tech_fields/' + each_news['id'] + '/tech_tasks'
            res = requests.get(url)
            task_data = json.loads(res.text)
            for each_task in task_data['tech_tasks']:
                if each_task['summary'] is None:
                    file.write(' ' + each_task['name'] + '\n')
                else:
                    file.write(' ' + each_task['name'] + '\n')
                    file.write(' ' + each_task['summary'] + '\n')


def process_all():  # both layers
    with open(r"C:\Users\sky\Desktop\爬虫\document_all.txt", "w", encoding='utf-8') as file:
        res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
        all_news = json.loads(res.text)
        num = 1
        for each_news in all_news:
            file.write(str(num) + '、' + each_news['name'] + '\n')  # str() converts the number for concatenation
            file.write(' ' + each_news['alias_name'] + '\n')
            file.write(' ' + 'https://www.jiqizhixin.com/sota/tech-fields/' + each_news['id'] + '\n')
            file.write(' ' + each_news['summary'] + '\n')
            num = num + 1
            print(each_news['id'])  # each sub-module
            url = 'https://applications.jiqizhixin.com/tech_fields/' + each_news['id'] + '/tech_tasks'
            res = requests.get(url)
            task_data = json.loads(res.text)
            for each_task in task_data['tech_tasks']:
                if each_task['summary'] is None:
                    file.write(' ' + each_task['name'] + '\n')
                else:
                    file.write(' ' + each_task['name'] + '\n')
                    file.write(' ' + each_task['summary'] + '\n')


def process_xls():  # write to an xls file
    workbook = xlwt.Workbook(encoding='utf-8')  # create the workbook
    booksheet = workbook.add_sheet('Sheet 1', cell_overwrite_ok=True)  # create the sheet
    res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/')
    all_news = json.loads(res.text)
    # print(all_news)
    row = 0
    for each_news in all_news:
        # print(each_news['name'])
        booksheet.write(row, 0, each_news['name'])
        booksheet.write(row, 1, each_news['alias_name'])
        booksheet.write(row, 2, 'https://www.jiqizhixin.com/sota/tech-fields/' + each_news['id'])
        booksheet.write(row, 3, each_news['summary'])
        row = row + 1
        # print(each_news['id'])  # each sub-module
        url = 'https://applications.jiqizhixin.com/tech_fields/' + each_news['id'] + '/tech_tasks'
        res = requests.get(url)
        task_data = json.loads(res.text)
        # print(task_data)
        for each_task in task_data['tech_tasks']:
            if each_task['summary'] is None:
                booksheet.write(row, 1, each_task['name'])
            else:
                booksheet.write(row, 1, each_task['name'])
                booksheet.write(row, 2, each_task['summary'])
            row = row + 1
    # save the file
    workbook.save('file.xls')


if __name__ == '__main__':
    # detect()       # inspect the page data
    # process1()     # fetch the first-layer data
    # process2()     # fetch the second-layer data
    # process_all()  # fetch both layers, txt format
    process_xls()    # fetch both layers, xls format
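None of the functions above check the HTTP status before calling json.loads, so an error page from the server would surface as an unexplained JSONDecodeError. A small hardening sketch (raise_for_status(), timeout and res.json() are standard requests features; the helper name fetch_json is only for illustration):

import requests

def fetch_json(url):
    res = requests.get(url, timeout=10)
    res.raise_for_status()        # fail loudly on 4xx/5xx instead of parsing an error page
    return res.json()             # requests decodes JSON directly, same as json.loads(res.text)

# e.g. fields = fetch_json('https://api.jiqizhixin.com/sota/tech_fields/')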
Alternatively (test_yang.py):
import json
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/'
                  '537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
res = requests.get('https://api.jiqizhixin.com/sota/tech_fields/', headers=headers)
all_data = json.loads(res.text)
index = 1
# open() reads/writes the file; the with statement closes it automatically
with open(r"data.txt", "w", encoding='utf-8') as file:
    # Iterate over the list parsed from the JSON data.
    for each_data in all_data:
        # print(each_data['id'])
        file.write(str(index) + '.' + each_data['name'] + '(' + each_data['alias_name'] + ')'
                   + ':' + each_data['summary'] + '\n')
        url = 'https://applications.jiqizhixin.com/tech_fields/' + each_data['id'] + '/tech_tasks'
        strhtml = requests.get(url, headers=headers)
        all_data1 = json.loads(strhtml.text)
        # print(type(all_data1))
        # print(type(all_data1['tech_tasks']))
        for each_data1 in all_data1['tech_tasks']:
            if each_data1['summary'] is None:
                file.write(' ' + each_data1['name'] + '\n')
            else:
                file.write(' ' + each_data1['name'] + ':' + each_data1['summary'] + '\n')
        index = index + 1
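Note that test_yang.py concatenates name, alias_name and summary directly; if the API ever returns null for one of these first-layer fields, the concatenation raises a TypeError. A defensive variant of that write (a sketch; format_field is an illustrative helper name, and it simply substitutes an empty string for missing values):

def format_field(field, index):
    # Substitute empty strings for missing values so concatenation never fails.
    name = field.get('name') or ''
    alias = field.get('alias_name') or ''
    summary = field.get('summary') or ''
    return str(index) + '.' + name + '(' + alias + ')' + ':' + summary + '\n'

# usage inside the outer loop: file.write(format_field(each_data, index))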
Result:
Method 2: without JSON conversion (much more cumbersome)
# Reference article: http://c.biancheng.net/view/2011.html
# Jiqizhixin SOTA models: https://www.jiqizhixin.com/sota
# coding=utf-8
import requests
from bs4 import BeautifulSoup  # imported but not actually used in this version


def spider():
    # Fetch the data with a GET request
    url = 'https://api.jiqizhixin.com/sota/tech_fields'
    strhtml = requests.get(url)  # fetch the page data via GET
    print(strhtml.text)
    a = strhtml.text.split(',')
    # print(a[1])
    # print(len(a))
    # for i in a:  # inspect the string contents
    #     print(i)
    with open(r"C:\Users\sky\Desktop\爬虫\document.txt", "w", encoding='utf-8') as file:
        for i in range(len(a)):
            # print(a[i])
            a1 = a[i].split(':')
            a2 = a1[1].replace('"', '')
            print(a2)
            if i % 4 == 0:
                file.write('https://www.jiqizhixin.com/sota/tech-fields/' + a2 + '\n')
            else:
                file.write(a2 + '\n')


def spider_next():
    # Fetch the data with a GET request
    url = 'https://applications.jiqizhixin.com/tech_fields/8a0ace81-a1e8-44ec-962f-dbe351a26e37/tech_tasks'
    strhtml = requests.get(url)  # fetch the page data via GET
    print(strhtml.text)
    a = strhtml.text.split(',')
    print(a)
    print(len(a))
    for i in a:  # inspect the string contents, 843 lines in total
        print(i)
    # with open(r"C:\Users\sky\Desktop\爬虫\document.txt", "w", encoding='utf-8') as file:
    #     for i in range(len(a)):
    #         # print(a[i])
    #         a1 = a[i].split(':')
    #         a2 = a1[1].replace('"', '')
    #         print(a2)
    #         if i % 4 == 0:
    #             # https://www.jiqizhixin.com/sota/tech-task/32b9c966-605c-48ef-806c-c5ffb53d8c35
    #             file.write('https://www.jiqizhixin.com/sota/tech-task/' + a2 + '\n')
    #         else:
    #             file.write(a2 + '\n')


if __name__ == '__main__':
    spider()
    # spider_next()
Run result:
Running the second function: the task data is nested, so it is far too complicated to pull apart this way.
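The underlying problem with Method 2 is that splitting the raw response text on ',' and ':' breaks as soon as a name or summary itself contains one of those characters, while json.loads parses the structure correctly regardless; a tiny comparison (the sample string is made up for illustration):

import json

sample = '{"name": "NER", "summary": "Tags entities, e.g. person, place: LOC"}'
print(sample.split(','))                 # the naive split cuts the summary in half
print(json.loads(sample)['summary'])     # json.loads returns the field intact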