Developer Huang Dinghua shares his experience of participating in the “Qianfan AppBuilder – Smart Hardware AIOT Creative Competition Phase 1”.
Introduction
- Children’s Companion: AI agents that provide entertaining companionship for children; common scenarios include storytelling, a knowledge encyclopedia, and character companionship.
- Learning and Education: learning-focused AI agents for preschool and K12 students; common scenarios include English speaking practice, an AI sports teacher, math problem solving, vocabulary memorization, and Chinese character learning.
- Entertainment and Interaction: entertainment AI agents for users of all ages; common scenarios include comic avatar generation, travel planning, role-playing, game strategy assistance, and pet emotion recognition.
- Health and Wellness for the Elderly: health and wellness AI agents for elderly users; common scenarios include health Q&A, a diet assistant for seniors, and healthy-lifestyle reminders.
- Health Monitoring: health-monitoring AI agents; specific scenarios include AI tongue diagnosis, a family AI doctor, and AI weight-loss companionship.
Background
Development Board Introduction



Project Introduction

- For the ASR part, consider deploying a local Whisper model or using an API model.
- For the large-model part, a locally deployed model can take full advantage of OpenVINO's inference platform capabilities; an API model is also an option.
- For the TTS part, consider deploying local TTS synthesis or using an API model; a minimal sketch of how the three parts chain together follows this list.
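As a rough illustration, the sketch below wires the three stages into one loop. The helper functions are hypothetical stand-ins rather than the project's actual code; each can be backed by a local model or a cloud API, as listed above.

def recognize_speech() -> str:
    # Hypothetical ASR stand-in: a local Whisper model or a speech API in the real project.
    return input("You (simulated ASR input): ")

def query_llm(text: str) -> str:
    # Hypothetical LLM stand-in: a local OpenVINO-served model or an API model in the real project.
    return "(model reply to: {})".format(text)

def speak(text: str) -> None:
    # Hypothetical TTS stand-in: local synthesis or a TTS API in the real project.
    print("Agent:", text)

if __name__ == "__main__":
    # Capture -> transcribe -> reason -> speak, repeated until the user quits.
    while True:
        user_text = recognize_speech()
        if user_text.strip().lower() in ("quit", "exit"):
            break
        speak(query_llm(user_text))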

MVP (Minimum Viable Product)
2.1 Software Development Environment
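The MVP below assumes a Python 3 environment with the pyaudio, numpy, openai-whisper, azure-cognitiveservices-speech, and appbuilder-sdk packages installed; these package names are inferred from the imports in the code that follows rather than from an official requirements list.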

2.2 ASR Part
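One local option for the ASR part is Whisper, with PyAudio handling microphone capture: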
import pyaudio
import numpy as np
import whisper

# Initialize Whisper model
model = whisper.load_model("base")  # Options: "tiny", "base", "small", "medium", "large"

# PyAudio settings
chunk = 1024              # Size of each audio block
format = pyaudio.paInt16  # Audio format
channels = 1              # Mono channel
rate = 16000              # Sample rate

p = pyaudio.PyAudio()

# Open audio stream
stream = p.open(format=format, channels=channels, rate=rate,
                input=True, frames_per_buffer=chunk)

print("Starting real-time speech recognition...")

try:
    while True:
        # Read audio data
        data = stream.read(chunk)
        audio_data = np.frombuffer(data, dtype=np.int16)

        # Convert audio data to the format required by Whisper
        audio_float = audio_data.astype(np.float32) / 32768.0  # Normalize to [-1, 1]
        audio_float = np.pad(audio_float, (0, 16000 - len(audio_float)), 'constant')  # Pad to one second (16000 samples)

        # Perform speech recognition
        result = model.transcribe(audio_float, fp16=False)  # fp16=False to avoid issues on some hardware
        print("Recognition result:", result['text'])
except KeyboardInterrupt:
    print("Stopping speech recognition.")
finally:
    # Close audio stream
    stream.stop_stream()
    stream.close()
    p.terminate()

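Alternatively, the ASR part can go through a cloud API. The sketch below uses the Azure Speech SDK for continuous recognition; the subscription key is a placeholder, and sent_to_model hands each recognized utterance to the LLM and TTS helpers described in the following parts.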
import time
import azure.cognitiveservices.speech as speechsdk

# The speech resource key and region are placeholders here; they can also be read
# from environment variables such as "SPEECH_KEY" and "SPEECH_REGION".
speech_config = speechsdk.SpeechConfig(subscription='xxx', region='eastasia')
speech_config.speech_recognition_language = "zh-CN"

audio_config = speechsdk.audio.AudioConfig(use_default_microphone=True)
speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

print("Speak into your microphone.")

done = False

def stop_cb(evt):
    print('CLOSING on {}'.format(evt))
    speech_recognizer.stop_continuous_recognition()
    global done
    done = True

def sent_to_model(text):
    # get_response is the LLM call (implemented as agent_query in the LLM part below)
    response = get_response(text)
    if response:
        print("model response:", response)
        text_to_speech(response)

# speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
speech_recognizer.recognized.connect(lambda evt: print('RECOGNIZED: {}'.format(evt)) or sent_to_model(evt.result.text))
speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))

speech_recognizer.session_stopped.connect(stop_cb)
speech_recognizer.canceled.connect(stop_cb)

speech_recognizer.start_continuous_recognition()
while not done:
    time.sleep(.5)
2.3 LLM Part
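The LLM part queries an agent built on Qianfan AppBuilder through its Python SDK (the APPBUILDER_TOKEN below is masked):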
import os
import appbuilder

os.environ['APPBUILDER_TOKEN'] = "xxxxxxxxxxxxxxxxxxxxxxxxxxx"
app_id = "acf19b27-1019-45fb-b163-a454d31ef014"

def agent_query(query: str):
    # Initialize Agent instance
    agent = appbuilder.AppBuilderClient(app_id)
    # Create conversation ID
    conversation_id = agent.create_conversation()
    print("Your AppBuilder App ID is: {}".format(app_id))
    print("processing")

    response_message = agent.run(conversation_id=conversation_id, query=query)
    description = response_message.content.answer

    return description

if __name__ == '__main__':
    prompt = 'Tell a story about Cinderella'
    print(agent_query(prompt))
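Note that agent_query creates a fresh conversation_id on every call, so each query is stateless; reusing a single conversation_id across calls would let the agent keep multi-turn context.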
2.4 TTS Part
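The TTS part uses the Azure Speech SDK again, synthesizing the model's reply through the default speaker: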
import azure.cognitiveservices.speech as speechsdk

# Called synchronously from sent_to_model above, so this is a plain function rather than a coroutine.
def text_to_speech(text):
    speech_config2 = speechsdk.SpeechConfig(subscription='xxx', region='eastasia')
    audio_config2 = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)

    # The neural multilingual voice can speak different languages based on the input text.
    speech_config2.speech_synthesis_voice_name = 'zh-CN-XiaoyiNeural'
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config2, audio_config=audio_config2)

    # Synthesize the reply text to the default speaker.
    print("tts=>>>", text)
    speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()

    if speech_synthesis_result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Speech synthesized for text [{}]".format(text))
    elif speech_synthesis_result.reason == speechsdk.ResultReason.Canceled:
        cancellation_details = speech_synthesis_result.cancellation_details
        print("Speech synthesis canceled: {}".format(cancellation_details.reason))
        if cancellation_details.reason == speechsdk.CancellationReason.Error:
            if cancellation_details.error_details:
                print("Error details: {}".format(cancellation_details.error_details))
                print("Did you set the speech resource key and region values?")
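text_to_speech is the helper invoked from sent_to_model in the ASR callback above, which closes the listen, think, and speak loop.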
Demonstration
Afterword
