from google import genai
from google.genai import types
import wave
# Set up the wave file to save the output:
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
with wave.open(filename, "wb") as wf:
wf.setnchannels(channels)
wf.setsampwidth(sample_width)
wf.setframerate(rate)
wf.writeframes(pcm)
client = genai.Client()
response = client.models.generate_content(
model="gemini-2.5-flash-preview-tts",
contents="Say cheerfully: Have a wonderful day!",
config=types.GenerateContentConfig(
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name='Kore',
data = response.candidates[0].content.parts[0].inline_data.data
file_name='out.wav'
wave_file(file_name, data) # Saves the file to current directory
如需查看更多代码示例,请参阅食谱库仓库中的“TTS - 使用入门”文件:
在 GitHub 上查看
JavaScript
import {GoogleGenAI} from '@google/genai';
import wav from 'wav';
async function saveWaveFile(
filename,
pcmData,
channels = 1,
rate = 24000,
sampleWidth = 2,
return new Promise((resolve, reject) => {
const writer = new wav.FileWriter(filename, {
channels,
sampleRate: rate,
bitDepth: sampleWidth * 8,
writer.on('finish', resolve);
writer.on('error', reject);
writer.write(pcmData);
writer.end();
async function main() {
const ai = new GoogleGenAI({});
const response = await ai.models.generateContent({
model: "gemini-2.5-flash-preview-tts",
contents: [{ parts: [{ text: 'Say cheerfully: Have a wonderful day!' }] }],
config: {
responseModalities: ['AUDIO'],
speechConfig: {
voiceConfig: {
prebuiltVoiceConfig: { voiceName:
'Kore' },
const data = response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
const audioBuffer = Buffer.from(data, 'base64');
const fileName = 'out.wav';
await saveWaveFile(fileName, audioBuffer);
await main();
REST
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-X POST \
-H "Content-Type: application/json" \
-d '{
"contents": [{
"parts":[{
"text": "Say cheerfully: Have a wonderful day!"
"generationConfig": {
"responseModalities": ["AUDIO"],
"speechConfig": {
"voiceConfig": {
"prebuiltVoiceConfig": {
"voiceName": "Kore"
"model": "gemini-2.5-flash-preview-tts",
}' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \
base64 --decode >out.pcm
# You may need to install ffmpeg.
ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav
多讲者文字转语音
对于多扬声器音频,您需要一个 MultiSpeakerVoiceConfig 对象,其中每个扬声器(最多 2 个)都配置为 SpeakerVoiceConfig。您需要使用与提示中使用的名称定义每个 speaker:
Python
from google import genai
from google.genai import types
import wave
# Set up the wave file to save the output:
def wave_file(filename, pcm, channels=1, rate=24000, sample_width=2):
with wave.open(filename, "wb") as wf:
wf.setnchannels(channels)
wf.setsampwidth(sample_width)
wf.setframerate(rate)
wf.writeframes(pcm)
client = genai.Client()
prompt = """TTS the following conversation between Joe and Jane:
Joe: How's it going today Jane?
Jane: Not too bad, how about you?"""
response = client.models.generate_content(
model="gemini-2.5-flash-preview-tts",
contents=prompt,
config=types.GenerateContentConfig(
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
speaker_voice_configs=[
types.SpeakerVoiceConfig(
speaker='Joe',
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name='Kore',
types.SpeakerVoiceConfig(
speaker='Jane',
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name='Puck',
data = response.candidates[0].content.parts[0].inline_data.data
file_name='out.wav'
wave_file(file_name, data) # Saves the file to current directory
JavaScript
import {GoogleGenAI} from '@google/genai';
import wav from 'wav';
async function saveWaveFile(
filename,
pcmData,
channels = 1,
rate = 24000,
sampleWidth = 2,
return new Promise((resolve, reject) =
> {
const writer = new wav.FileWriter(filename, {
channels,
sampleRate: rate,
bitDepth: sampleWidth * 8,
writer.on('finish', resolve);
writer.on('error', reject);
writer.write(pcmData);
writer.end();
async function main() {
const ai = new GoogleGenAI({});
const prompt = `TTS the following conversation between Joe and Jane:
Joe: How's it going today Jane?
Jane: Not too bad, how about you?`;
const response = await ai.models.generateContent({
model: "gemini-2.5-flash-preview-tts",
contents: [{ parts: [{ text: prompt }] }],
config: {
responseModalities: ['AUDIO'],
speechConfig: {
multiSpeakerVoiceConfig: {
speakerVoiceConfigs: [
speaker: 'Joe',
voiceConfig: {
prebuiltVoiceConfig: { voiceName: 'Kore' }
speaker: 'Jane',
voiceConfig: {
prebuiltVoiceConfig: { voiceName: 'Puck' }
const data = response.candidates?.[0]?.content?.parts?.[0]?.inlineData?.data;
const audioBuffer = Buffer.from(data, 'base64');
const fileName = 'out.wav';
await saveWaveFile(fileName, audioBuffer);
await main();
REST
curl "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash-preview-tts:generateContent" \
-H "x-goog-api-key: $GEMINI_API_KEY" \
-X POST \
-H "Content-Type: application/json" \
-d '{
"contents": [{
"parts":[{
"text": "TTS the following conversation between Joe and Jane:
Joe: Hows it going today Jane?
Jane: Not too bad, how about you?"
"generationConfig": {
"responseModalities": ["AUDIO"],
"speechConfig": {
"multiSpeakerVoiceConfig": {
"speakerVoiceConfigs": [{
"speaker": "Joe",
"voiceConfig": {
"prebuiltVoiceConfig": {
"voiceName": "Kore"
"speaker": "Jane",
"voiceConfig": {
"prebuiltVoiceConfig": {
"voiceName": "Puck"
"model": "gemini-2.5-flash-preview-tts",
}' | jq -r '.candidates[0].content.parts[0].inlineData.data' | \
base64 --decode > out.pcm
# You may need to install ffmpeg.
ffmpeg -f s16le -ar 24000 -ac 1 -i out.pcm out.wav
使用提示控制语音风格
您可以使用自然语言提示控制单声道和多声道 TTS 的语气、语调、口音和节奏。例如,在单音箱提示中,您可以说:
Say in an spooky whisper:
"By the pricking of my thumbs...
Something wicked this way comes"
在多讲者提示中,请向模型提供每位讲者的姓名和相应的转写内容。您还可以为每位发言人单独提供指导:
Make Speaker1 sound tired and bored, and Speaker2 sound excited and happy:
Speaker1: So... what's on the agenda today?
Speaker2: You're never going to guess!
尝试使用与您想要传达的风格或情感相符的语音选项,以进一步强调这些内容。例如,在上一个问题中,Enceladus 的喘息音可能强调“疲倦”和“无聊”,而 Puck 的乐观语气则可以与“兴奋”和“快乐”相得益彰。
生成要转换为音频的提示
TTS 模型只能输出音频,但您可以先使用其他模型生成转写内容,然后将该转写内容传递给 TTS 模型以大声朗读。
Python
from google import genai
from google.genai import types
client = genai.Client()
transcript = client.models.generate_content(
model="gemini-2.0-flash",
contents="""Generate a short transcript around 100 words that reads
like it was clipped from a podcast by excited herpetologists.
The hosts names are Dr. Anya and Liam.""").text
response = client.models.generate_content(
model="gemini-2.5-flash-preview-tts",
contents=transcript,
config=types.GenerateContentConfig(
response_modalities=["AUDIO"],
speech_config=types.SpeechConfig(
multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
speaker_voice_configs=[
types.SpeakerVoiceConfig(
speaker='Dr. Anya',
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name='Kore',
types.SpeakerVoiceConfig(
speaker='Liam',
voice_config=types.VoiceConfig(
prebuilt_voice_config=types.PrebuiltVoiceConfig(
voice_name='Puck',
# ...Code to stream or save the output
JavaScript
import { GoogleGenAI } from "@google/genai";
const ai = new GoogleGenAI({});
async function main() {
const transcript = await ai.models.generateContent({
model: "gemini-2.0-flash",
contents: "Generate a short transcript around 100 words that reads like it was clipped from a podcast by excited herpetologists. The hosts names are Dr. Anya and Liam.",
const response = await ai.models.generateContent({
model: "gemini-2.5-flash-preview-tts",
contents: transcript,
config: {
responseModalities: ['AUDIO'],
speechConfig: {
multiSpeakerVoiceConfig: {
speakerVoiceConfigs: [
speaker: "Dr. Anya",
voiceConfig: {
prebuiltVoiceConfig: {voiceName: "Kore"},
speaker: "Liam",
voiceConfig: {
prebuiltVoiceConfig: {voiceName: "Puck"},
// ..JavaScript code for exporting .wav file for output audio
await main();
TTS 模型支持 voice_name 字段中的以下 30 种语音选项:
如未另行说明,那么本页面中的内容已根据知识共享署名 4.0 许可获得了许可,并且代码示例已根据 Apache 2.0 许可获得了许可。有关详情,请参阅 Google 开发者网站政策。Java 是 Oracle 和/或其关联公司的注册商标。
最后更新时间 (UTC):2025-06-27。
[[["易于理解","easyToUnderstand","thumb-up"],["解决了我的问题","solvedMyProblem","thumb-up"],["其他","otherUp","thumb-up"]],[["没有我需要的信息","missingTheInformationINeed","thumb-down"],["太复杂/步骤太多","tooComplicatedTooManySteps","thumb-down"],["内容需要更新","outOfDate","thumb-down"],["翻译问题","translationIssue","thumb-down"],["示例/代码问题","samplesCodeIssue","thumb-down"],["其他","otherDown","thumb-down"]],["最后更新时间 (UTC):2025-06-27。"],[],[]]