Speech-to-Text
Convert audio recordings to accurate text transcriptions using Mirako's advanced speech-to-text technology. Transform voice recordings, meetings, interviews, and any audio content into readable text.
Mirako's STT service provides high-quality transcription with support for multiple languages and audio formats. The service can handle various audio qualities and speaking styles, making it suitable for a wide range of applications from content creation to accessibility features.
Supported Languages and Formats
Below are the primary languages supported by Mirako's STT service. We will continue to expand our language offerings over time.
- English
- Cantonese (yue)
- Mandarin Chinese (mandarin)
Note: If you have specific language requirements, feel free to reach out to us on our Discord channel.
The following audio formats are supported for transcription:
- WAV (.wav)
- MP3 (.mp3)
- M4A (.m4a)
- FLAC (.flac)
- OGG (.ogg)
Quick Start
Basic Speech-to-Text
import requests
import base64
# API configuration
API_KEY = "your_api_key_here"  # Replace with your Mirako API key
BASE_URL = "https://mirako.co"  # Base URL of the Mirako API
# Common headers for every request: bearer-token auth plus JSON body type.
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}
def speech_to_text(audio_file_path):
    """Convert an audio file to text via the Mirako STT endpoint.

    Args:
        audio_file_path: Path to a supported audio file
            (.wav, .mp3, .m4a, .flac, .ogg).

    Returns:
        dict with 'text' (full transcript), 'duration' (input audio
        length in seconds) and 'segments' (timed transcription
        segments), or None if the request failed.
    """
    # The API expects the raw audio bytes base64-encoded in the JSON body.
    with open(audio_file_path, "rb") as audio_file:
        audio_b64 = base64.b64encode(audio_file.read()).decode('utf-8')

    payload = {"audio": audio_b64}

    # A timeout keeps the call from hanging forever on network problems;
    # transcribing long clips can take a while, so allow up to 120s.
    response = requests.post(
        f"{BASE_URL}/v1/speech/stt",
        headers=headers,
        json=payload,
        timeout=120
    )

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        print(response.text)
        return None

    result = response.json()['data']
    transcribed_text = result.get('text')
    input_duration = result.get('input_duration')
    transcription_segments = result.get('transcription')

    print("✅ Transcription completed")
    print(f"📝 Text: {transcribed_text}")
    print(f"⏱️ Audio duration: {input_duration}s")

    return {
        'text': transcribed_text,
        'duration': input_duration,
        'segments': transcription_segments
    }

# Transcribe audio
result = speech_to_text("path/to/your/audio.wav")
Batch Speech-to-Text Processing
import time
import os
def batch_stt(audio_files, delay=1.0):
    """Transcribe multiple audio files sequentially.

    Args:
        audio_files: Iterable of paths to audio files.
        delay: Seconds to sleep between requests (simple rate limiting).

    Returns:
        List of dicts; each contains 'file' plus either transcription
        data ('text', 'duration', 'segments') on success or an 'error'
        message on failure.
    """
    results = []
    for i, audio_file in enumerate(audio_files):
        print(f"Processing {i+1}/{len(audio_files)}: {os.path.basename(audio_file)}")
        try:
            with open(audio_file, "rb") as f:
                audio_b64 = base64.b64encode(f.read()).decode('utf-8')
            payload = {"audio": audio_b64}
            # A timeout prevents one stuck request from stalling the batch.
            response = requests.post(
                f"{BASE_URL}/v1/speech/stt",
                headers=headers,
                json=payload,
                timeout=120
            )
            if response.status_code == 200:
                result = response.json()['data']
                results.append({
                    "file": audio_file,
                    "text": result.get('text'),
                    "duration": result.get('input_duration'),
                    "segments": result.get('transcription')
                })
                # Guard against a missing/None 'text' field before slicing;
                # result.get('text')[:50] would raise TypeError on None.
                preview = (result.get('text') or '')[:50]
                print(f"✅ Success: {preview}...")
            else:
                print(f"❌ Failed: {response.text}")
                results.append({
                    "file": audio_file,
                    "error": response.text
                })
        except Exception as e:
            print(f"❌ Error processing {audio_file}: {str(e)}")
            results.append({
                "file": audio_file,
                "error": str(e)
            })
        # Rate limiting: pause between requests, but not after the last one.
        if i < len(audio_files) - 1:
            time.sleep(delay)
    return results
# Batch transcribe a set of recordings
audio_files = [
    "meeting_recording_1.wav",
    "interview_audio.mp3",
    "voice_memo.m4a",
]
batch_results = batch_stt(audio_files)

# Report each outcome: transcript and duration on success, error otherwise
for entry in batch_results:
    if "text" not in entry:
        print(f"❌ Failed {entry['file']}: {entry['error']}")
    else:
        print(f"📁 {entry['file']}: {entry['text']}")
        print(f"⏱️ Duration: {entry['duration']}s")
Transcription Segments and Subtitle Generation
Mirako's STT API returns detailed transcription segments with precise timing information for each spoken phrase or sentence. This segmented data includes start and end timestamps, making it perfect for creating subtitles, captions, or time-synchronized content.
Each transcription segment contains:
- start: Beginning timestamp in seconds
- end: Ending timestamp in seconds
- text: The transcribed text for that time segment
This timing data allows you to generate industry-standard subtitle formats like SRT files, which are widely supported by video players and streaming platforms.
Analyzing Transcription Segments
def analyze_transcription_segments(audio_file_path):
    """Transcribe an audio file and print per-segment timing details.

    Args:
        audio_file_path: Path to the audio file to analyze.

    Returns:
        The API's 'data' payload (full text, duration, segments),
        or None if the request failed.
    """
    with open(audio_file_path, "rb") as audio_file:
        audio_b64 = base64.b64encode(audio_file.read()).decode('utf-8')
    payload = {"audio": audio_b64}
    # A timeout keeps the call from blocking indefinitely on network issues.
    response = requests.post(
        f"{BASE_URL}/v1/speech/stt",
        headers=headers,
        json=payload,
        timeout=120
    )
    if response.status_code != 200:
        print(f"Error: {response.text}")
        return None

    result = response.json()['data']
    # Use .get for optional fields, consistent with the other examples.
    print(f"Full text: {result.get('text')}")
    print(f"Total duration: {result.get('input_duration')}s")
    print("\n--- Segment Analysis ---")
    # Each segment is expected to be a dict with 'start'/'end' timestamps
    # (seconds) and 'text'; skip anything malformed.
    for i, segment in enumerate(result.get('transcription', [])):
        if isinstance(segment, dict):
            start_time = segment.get('start', 0)
            end_time = segment.get('end', 0)
            text = segment.get('text', '')
            print(f"Segment {i+1}: [{start_time}s - {end_time}s] {text}")
    return result
Generating SRT Subtitle Files
def generate_srt_file(audio_file_path, output_srt_path):
    """Generate an SRT subtitle file from an audio transcription.

    Args:
        audio_file_path: Path to the audio file to transcribe.
        output_srt_path: Destination path for the generated .srt file.

    Returns:
        True on success, False if transcription or file writing failed.
    """
    # Get transcription with timed segments
    with open(audio_file_path, "rb") as audio_file:
        audio_b64 = base64.b64encode(audio_file.read()).decode('utf-8')
    payload = {"audio": audio_b64}
    # A timeout prevents the request from hanging indefinitely.
    response = requests.post(
        f"{BASE_URL}/v1/speech/stt",
        headers=headers,
        json=payload,
        timeout=120
    )
    if response.status_code != 200:
        print(f"❌ Error transcribing audio: {response.text}")
        return False
    result = response.json()['data']
    segments = result.get('transcription', [])

    def seconds_to_srt_time(seconds):
        """Convert seconds to SRT timestamp format (HH:MM:SS,mmm)."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = int(seconds % 60)
        milliseconds = int((seconds % 1) * 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"

    # Generate SRT content: sequence number, timestamp range, text, blank line.
    srt_content = []
    subtitle_count = 0  # entries actually written; empty segments are skipped
    for segment in segments:
        if isinstance(segment, dict) and segment.get('text'):
            text = segment.get('text', '').strip()
            if text:  # Only add non-empty segments
                # SRT sequence numbers must be consecutive, so number the
                # entries we keep instead of reusing the raw segment index
                # (which would leave gaps when segments are skipped).
                subtitle_count += 1
                start_ts = seconds_to_srt_time(segment.get('start', 0))
                end_ts = seconds_to_srt_time(segment.get('end', 0))
                srt_content.append(str(subtitle_count))
                srt_content.append(f"{start_ts} --> {end_ts}")
                srt_content.append(text)
                srt_content.append("")  # Blank line terminates each entry

    # Write SRT file
    try:
        with open(output_srt_path, 'w', encoding='utf-8') as srt_file:
            srt_file.write('\n'.join(srt_content))
        print(f"✅ SRT file generated: {output_srt_path}")
        # Report the number of subtitles actually written, not the raw
        # segment count (which includes skipped empty segments).
        print(f"📊 Generated {subtitle_count} subtitle segments")
        return True
    except Exception as e:
        print(f"❌ Error writing SRT file: {str(e)}")
        return False

# Generate SRT subtitle file
success = generate_srt_file("meeting_recording.wav", "meeting_subtitles.srt")
# Example output SRT format:
# 1
# 00:00:00,000 --> 00:00:03,500
# Welcome everyone to today's team meeting.
#
# 2
# 00:00:03,500 --> 00:00:07,200
# Let's start by reviewing our quarterly goals.
Response Format
The STT API returns a structured response containing:
- text: The complete transcribed text
- input_duration: Duration of the audio in seconds
- transcription: Array of transcription segments with timing information
- id: Unique request identifier