python人工智能音箱实现
大家一定看过复仇者联盟吧,钢铁侠的贾维斯是不是很帅很厉害。学习了这么久PYTHON,我想到了一种模式,可以做一个自己专属的贾维斯,废话不多说,我们开始吧
贾维斯人工智能音箱具体逻辑原理
1.通过国外网站下载生成对应指纹声音,来实现唤醒使用,不唤醒休眠。(此网站把我账号封了,这功能暂时搁浅未做,有兴趣的朋友可自行百度)
2.通过PYTHON调用设备电脑或者手机本身的录音功能,设置录音具体时间。
3.将语音转换为编码格式通过调用三方的语音识别的API 去生成文字(我这里用的是百度的)
4.将转换的文字发送给第三方API,比如国内的文心一言、国外的chatgpt去生成具体答案,再把答案内容通过语音合成的API转换为语音,
5.再调用本地的播放器播放出语音。
此时形成了一个闭环 一次智能音箱的问答就已经完成。
一、使用步骤
1.具体实现代码
代码如下(示例):
# coding = utf-8
# coding=gb2312
# coding = gbk
import pyaudio
import wave
import requests
import json
import time
import base64
from aip import AipSpeech
from playsound import playsound
# Baidu AIP credentials (placeholders — fill in your own app's values).
APP_ID = 'XXXX'
API_KEY = 'xxxxxx'
SECRET_KEY = 'xxxxxxx'
# Shared Baidu speech client, used by baidutts() for text-to-speech.
client = AipSpeech(APP_ID,API_KEY,SECRET_KEY)
# Recording settings (consumed by record_audio()).
CHUNK = 1024 # frames read from the input stream per call
FORMAT = pyaudio.paInt16 # 16-bit signed samples
CHANNELS = 1 # mono
RATE = 16000 # sample rate in Hz (matches the "rate" sent to Baidu ASR)
RECORD_SECONDS = 15 # recording duration in seconds
WAVE_OUTPUT_FILENAME = "output.wav"
# 录音函数
def record_audio(filename, seconds):
    """Record `seconds` of audio from the default input device into a WAV file.

    Uses the module-level settings CHUNK, FORMAT, CHANNELS and RATE.

    :param filename: path of the WAV file to write
    :param seconds: recording duration in seconds
    """
    p = pyaudio.PyAudio()
    # Capture the sample width up front so the PyAudio instance is never
    # queried after terminate() (the original called get_sample_size()
    # on an already-terminated instance).
    sample_width = p.get_sample_size(FORMAT)
    frames = []
    try:
        stream = p.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=RATE,
                        input=True,
                        frames_per_buffer=CHUNK)
        try:
            for _ in range(int(RATE / CHUNK * seconds)):
                frames.append(stream.read(CHUNK))
        finally:
            # Release the audio stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()
    # Wave_write is a context manager, so the file is closed on error too.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))
def baidutts(text=""):
    """Synthesize Mandarin speech for `text` via Baidu TTS into ./output.mp3.

    On success `client.synthesis` returns raw MP3 bytes, which are written
    to disk; on failure it returns an error dict, which is printed instead.

    :param text: the text to speak
    """
    result = client.synthesis(text, 'zh', 1, {
        # BUGFIX: the original key 'sod' is not a valid Baidu TTS option and
        # was silently ignored; the intended option is 'spd' (speed, 0-15).
        'spd': 4,
        'vol': 5,  # volume, 0-15
        'per': 4   # voice selection (4 = emotional male voice)
    })
    # A dict result means the API returned an error instead of audio bytes.
    if not isinstance(result, dict):
        with open("./output.mp3", "wb") as f:
            f.write(result)
    else:
        print('语音合成识别', result)
def openai(query=None):
    """Ask the Baidu Qianfan AppBuilder conversation API for an answer.

    :param query: the question text; when None (the legacy call style),
        falls back to the module-level global `zsnr` set by the ASR step.
    :return: the decoded JSON response dict; its 'answer' key holds the reply.
    """
    url = "https://qianfan.baidubce.com/v2/app/conversation/runs"
    if query is None:
        # Backward-compatible: the original read the recognized text from
        # the global `zsnr` (which only exists after a successful ASR call).
        query = zsnr
    payload = json.dumps({
        "app_id": "xxxxxxxxx",
        "query": query,
        "stream": False,
        "conversation_id": "xxxxxxx"
    })
    headers = {
        'Content-Type': 'application/json',
        # SECURITY: hard-coded credential — move to an env var / config file.
        'X-Appbuilder-Authorization': 'Bearer bce-v3/ALTAK-hciofH1cZ00RTYVIbHXAA/08b6e696bdfa317d13d8bd54301bc7bd1c8453b1'
    }
    # A timeout keeps the assistant from hanging forever on a dead network.
    response = requests.request("POST", url, headers=headers, data=payload,
                                timeout=60)
    print(response)
    nr = json.loads(response.text)
    print(nr['answer'])
    return nr
# ---- Main flow: record -> Baidu ASR -> LLM answer -> TTS -> play ----
record_audio(WAVE_OUTPUT_FILENAME, RECORD_SECONDS)
# Read the recording once; the raw byte length is needed for the "len"
# field below, and the base64 form goes into "speech".
with open(WAVE_OUTPUT_FILENAME, "rb") as audio_file:
    audio_bytes = audio_file.read()
encoded_string = base64.b64encode(audio_bytes).decode()
payload = json.dumps({
    # BUGFIX: record_audio() writes a WAV container, so declare "wav";
    # the original sent "pcm", mislabeling the data.
    "format": "wav",
    "rate": 16000,
    "channel": 1,
    "cuid": "Lk2BdS11FbqHce94EzDOF4FwtzEZaTBX",
    # NOTE(review): this access token expires (~30 days) and should be
    # refreshed via Baidu's OAuth endpoint rather than hard-coded.
    "token": "24.f0539397c4aa7ec764483e8c08b0ff1e.2592000.1720767374.282335-81479669",
    "speech": encoded_string,
    # BUGFIX: must be the byte length of the RAW audio, not the
    # hard-coded 479276 from a previous recording.
    "len": len(audio_bytes)
})
headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json'
}
# POST to Baidu's short-speech recognition API.
url = "https://vop.baidu.com/server_api"
response = requests.request("POST", url, headers=headers, data=payload)
if response.status_code == 200:
    jieguo = json.loads(response.text)
    print(jieguo)
    # Baidu returns HTTP 200 even on errors (err_no set, no 'result' key),
    # so guard before indexing — the original raised KeyError there.
    if 'result' in jieguo:
        print("识别结果:", jieguo['result'])
        qq = jieguo['result'][0]
        # Wrap the recognized text in literal quotes for the LLM app
        # (preserves the original's exact escaping).
        zsnr = f"\"\\\"{qq}\\\"\""
        # BUGFIX: the rest of the pipeline only runs when recognition
        # succeeded; the original fell through after a failed request and
        # crashed with NameError on `zsnr`.
        nr2 = openai()
        print(nr2)
        answer_text = nr2['answer']
        print(answer_text)
        baidutts(text=f"{answer_text}")
        playsound('output.mp3')
    else:
        print("请求失败:", response.text)
else:
    print("请求失败:", response.text)
总结
这个脚本我写了很久了,这也算是我自己的一个小作品雏形,再过段时间有可能我都快忘了这件事情了,如果对智能音箱有兴趣的话,想要去完善该代码 可以直接私信我,或者有哪些看不懂的地方也可以直接评论区指出或者后台私信我哦。