Voice Transcribe
Transcribe audio files using OpenAI's gpt-4o-mini-transcribe model with vocabulary hints and text replacements. Requires uv (https://docs.astral.sh/uv/).
clawhub install voice-transcribeGenerate speech audio from text using HeyGen's Starfish TTS model. Use when: (1) Generating standalone speech audio files from text, (2) Converting text to s...
# Install Skill (downloads SKILL.md to .claude/skills/) clawhub install text-to-speech-heygen # Then just tell Claude: "use Text to Speech to help me..."
# Same install command — works with all SKILL.md-compatible AI coding tools clawhub install text-to-speech-heygen
This Skill is compatible with the OpenClaw standard. After installation, a SKILL.md file is auto-generated, usable by any OpenClaw-compatible AI Agent (Claude Code, Cursor, Windsurf, etc.).
X-Api-Key header. Set the HEYGEN_API_KEY environment variable.curl -X GET "https://api.heygen.com/v1/audio/voices" \
-H "X-Api-Key: $HEYGEN_API_KEY"
mcp__heygen__*), prefer them over direct HTTP API calls.mcp__heygen__list_audio_voices | GET /v1/audio/voices |
| Generate speech audio | mcp__heygen__text_to_speech | POST /v1/audio/text_to_speech |mcp__heygen__list_audio_voices (or GET /v1/audio/voices)mcp__heygen__text_to_speech (or POST /v1/audio/text_to_speech) with text and voice_idaudio_url to download or play the audioGET /v1/audio/voices — a different endpoint from the video voices API (GET /v2/voices). Not all video voices support Starfish TTS.curl -X GET "https://api.heygen.com/v1/audio/voices" \
-H "X-Api-Key: $HEYGEN_API_KEY"
interface TTSVoice {
voice_id: string;
language: string;
gender: "female" | "male" | "unknown";
name: string;
preview_audio_url: string | null;
support_pause: boolean;
support_locale: boolean;
type: string;
}
interface TTSVoicesResponse {
error: null | string;
data: {
voices: TTSVoice[];
};
}
async function listTTSVoices(): Promise<TTSVoice[]> {
const response = await fetch("https://api.heygen.com/v1/audio/voices", {
headers: { "X-Api-Key": process.env.HEYGEN_API_KEY! },
});
const json: TTSVoicesResponse = await response.json();
if (json.error) {
throw new Error(json.error);
}
return json.data.voices;
}
import requests
import os
def list_tts_voices() -> list:
response = requests.get(
"https://api.heygen.com/v1/audio/voices",
headers={"X-Api-Key": os.environ["HEYGEN_API_KEY"]}
)
data = response.json()
if data.get("error"):
raise Exception(data["error"])
return data["data"]["voices"]
{
"error": null,
"data": {
"voices": [
{
"voice_id": "f38a635bee7a4d1f9b0a654a31d050d2",
"name": "Chill Brian",
"language": "English",
"gender": "male",
"preview_audio_url": "https://resource.heygen.ai/text_to_speech/WpSDQvmLGXEqXZVZQiVeg6.mp3",
"support_pause": true,
"support_locale": false,
"type": "public"
}
]
}
}
POST https://api.heygen.com/v1/audio/text_to_speechtext | string | Y | Text content to convert to speech |
| voice_id | string | Y | Voice ID from GET /v1/audio/voices |
| speed | number | | Speech speed, 0.5-1.5 (default: 1) |
| pitch | integer | | Voice pitch, -50 to 50 (default: 0) |
| locale | string | | Accent/locale for multilingual voices (e.g., en-US, pt-BR) |
| elevenlabs_settings | object | | Advanced settings for ElevenLabs voices |model | string | Model selection (eleven_v3, eleven_turbo_v2_5, etc.) |
| similarity_boost | number | Voice similarity, 0.0-1.0 |
| stability | number | Output consistency, 0.0-1.0 |
| style | number | Style intensity, 0.0-1.0 |curl -X POST "https://api.heygen.com/v1/audio/text_to_speech" \
-H "X-Api-Key: $HEYGEN_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"text": "Hello! Welcome to our product demo.",
"voice_id": "YOUR_VOICE_ID",
"speed": 1.0
}'
interface TTSRequest {
text: string;
voice_id: string;
speed?: number;
pitch?: number;
locale?: string;
elevenlabs_settings?: {
model?: string;
similarity_boost?: number;
stability?: number;
style?: number;
};
}
interface WordTimestamp {
word: string;
start: number;
end: number;
}
interface TTSResponse {
error: null | string;
data: {
audio_url: string;
duration: number;
request_id: string;
word_timestamps: WordTimestamp[];
};
}
async function textToSpeech(request: TTSRequest): Promise<TTSResponse["data"]> {
const response = await fetch(
"https://api.heygen.com/v1/audio/text_to_speech",
{
method: "POST",
headers: {
"X-Api-Key": process.env.HEYGEN_API_KEY!,
"Content-Type": "application/json",
},
body: JSON.stringify(request),
}
);
const json: TTSResponse = await response.json();
if (json.error) {
throw new Error(json.error);
}
return json.data;
}
import requests
import os
def text_to_speech(
text: str,
voice_id: str,
speed: float = 1.0,
pitch: int = 0,
locale: str | None = None,
) -> dict:
payload = {
"text": text,
"voice_id": voice_id,
"speed": speed,
"pitch": pitch,
}
if locale:
payload["locale"] = locale
response = requests.post(
"https://api.heygen.com/v1/audio/text_to_speech",
headers={
"X-Api-Key": os.environ["HEYGEN_API_KEY"],
"Content-Type": "application/json",
},
json=payload,
)
data = response.json()
if data.get("error"):
raise Exception(data["error"])
return data["data"]
{
"error": null,
"data": {
"audio_url": "https://resource2.heygen.ai/text_to_speech/.../id=365d46bb.wav",
"duration": 5.526,
"request_id": "p38QJ52hfgNlsYKZZmd9",
"word_timestamps": [
{ "word": "<start>", "start": 0.0, "end": 0.0 },
{ "word": "Hey", "start": 0.079, "end": 0.219 },
{ "word": "there,", "start": 0.239, "end": 0.459 },
{ "word": "<end>", "start": 5.526, "end": 5.526 }
]
}
}
const result = await textToSpeech({
text: "Welcome to our quarterly earnings call.",
voice_id: "YOUR_VOICE_ID",
});
console.log(Audio URL: ${result.audio_url});
console.log(Duration: ${result.duration}s);
const result = await textToSpeech({
text: "We're thrilled to announce our newest feature!",
voice_id: "YOUR_VOICE_ID",
speed: 1.1,
});
const result = await textToSpeech({
text: "Bem-vindo ao nosso produto.",
voice_id: "MULTILINGUAL_VOICE_ID",
locale: "pt-BR",
});
async function generateSpeech(text: string, language: string): Promise<string> {
const voices = await listTTSVoices();
const voice = voices.find(
(v) => v.language.toLowerCase().includes(language.toLowerCase())
);
if (!voice) {
throw new Error(No TTS voice found for language: ${language});
}
const result = await textToSpeech({
text,
voice_id: voice.voice_id,
});
return result.audio_url;
}
const audioUrl = await generateSpeech("Hello and welcome!", "english");
word <break time="1s"/> word
s suffix: <break time="1.5s"/>GET /v1/audio/voices to find compatible voices — not all voices from GET /v2/voices support Starfish TTSsupport_locale before setting a locale — only multilingual voices support locale selectionpreview_audio_url before generating (may be null for some voices)word_timestamps in the response for caption syncing or timed text overlaysword <break time="1s"/> wordclawhub run text-to-speech-heygen --text "这是一个演示文本" --voice-id "zh-CN-female" --speed 1.0 --output output.mp3Create chapters, highlights, and show notes from podcast audio or transcripts. Use when a user wants chapter markers, highlight clips, or show-note drafts without publishing or distribution actions.
clawhub install podcast-chaptering-highlights