A Swift SDK for integrating ElevenLabs' conversational AI capabilities into your iOS and macOS applications. Built on top of LiveKit WebRTC for real-time audio streaming and communication.
- Xcode 16.3 or later
Add to your project using Swift Package Manager:
dependencies: [
.package(url: "https://github.com/elevenlabs/elevenlabs-swift-sdk.git", from: "2.0.17")
]

Then import the SDK and start a conversation:

import ElevenLabs
// 1. Start a conversation with your agent
let conversation = try await ElevenLabs.startConversation(
agentId: "your-agent-id",
config: ConversationConfig()
)
// 2. Observe conversation state and messages
conversation.$state
.sink { state in
print("Connection state: \(state)")
}
.store(in: &cancellables)
conversation.$messages
.sink { messages in
for message in messages {
print("\(message.role): \(message.content)")
}
}
.store(in: &cancellables)
// 3. Send messages and control the conversation
try await conversation.sendMessage("Hello!")
try await conversation.toggleMute()
await conversation.endConversation()

Requirements:

- iOS 14.0+ / macOS 11.0+
- Swift 5.9+
- Add NSMicrophoneUsageDescription to your Info.plist (see the runtime permission sketch below)
- Add NSCameraUsageDescription to your Info.plist. Your app will still work in development without it, but uploads to App Store Connect will fail.
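The microphone permission prompt normally appears automatically the first time audio capture starts. If you prefer to control when it appears, you can request access up front with standard AVFoundation APIs. A minimal sketch (the ensureMicrophonePermission helper is ours, not part of the SDK):

```swift
import AVFoundation

/// Requests microphone access and reports whether it was granted.
/// Works on both iOS and macOS.
func ensureMicrophonePermission() async -> Bool {
    switch AVCaptureDevice.authorizationStatus(for: .audio) {
    case .authorized:
        return true
    case .notDetermined:
        // Shows the system prompt with your NSMicrophoneUsageDescription text.
        return await AVCaptureDevice.requestAccess(for: .audio)
    default:
        // .denied or .restricted: direct the user to Settings if appropriate.
        return false
    }
}
```

Calling it before ElevenLabs.startConversation(...) keeps the prompt from interrupting the start of a conversation.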
The SDK provides a streamlined Conversation class that handles all aspects of real-time communication:
import ElevenLabs
import LiveKit
import Combine
@MainActor
class ConversationManager: ObservableObject {
@Published var conversation: Conversation?
private var cancellables = Set<AnyCancellable>()
func startConversation(agentId: String) async throws {
let config = ConversationConfig(
conversationOverrides: ConversationOverrides(textOnly: false)
)
conversation = try await ElevenLabs.startConversation(
agentId: agentId,
config: config
)
setupObservers()
}
private func setupObservers() {
guard let conversation else { return }
// Monitor connection state
conversation.$state
.sink { state in
print("State: \(state)")
}
.store(in: &cancellables)
// Monitor messages
conversation.$messages
.sink { messages in
print("Messages: \(messages.count)")
}
.store(in: &cancellables)
// Monitor agent state
conversation.$agentState
.sink { agentState in
print("Agent: \(agentState)")
}
.store(in: &cancellables)
// Handle client tool calls
conversation.$pendingToolCalls
.sink { toolCalls in
for toolCall in toolCalls {
Task {
await self.handleToolCall(toolCall)
}
}
}
.store(in: &cancellables)
// Monitor MCP (Model Context Protocol) tool calls
conversation.$mcpToolCalls
.sink { mcpCalls in
for call in mcpCalls {
print("MCP tool: \(call.toolName) - \(call.state)")
// Approve/reject if awaiting approval
if call.state == .awaitingApproval {
Task {
try? await conversation.sendMCPToolApproval(
toolCallId: call.toolCallId,
isApproved: true
)
}
}
}
}
.store(in: &cancellables)
// Monitor MCP connection status
conversation.$mcpConnectionStatus
.sink { status in
if let status = status {
for integration in status.integrations {
print("MCP \(integration.integrationType): \(integration.isConnected ? "connected" : "disconnected")")
}
}
}
.store(in: &cancellables)
// Monitor conversation metadata (includes conversation ID)
conversation.$conversationMetadata
.compactMap { $0 }
.sink { metadata in
print("Conversation ID: \(metadata.conversationId)")
print("Agent audio format: \(metadata.agentOutputAudioFormat)")
if let userFormat = metadata.userInputAudioFormat {
print("User audio format: \(userFormat)")
}
}
.store(in: &cancellables)
}
}

Handle tool calls from your agent with full parameter support:
private func handleToolCall(_ toolCall: ClientToolCallEvent) async {
do {
let parameters = try toolCall.getParameters()
let result = await executeClientTool(
name: toolCall.toolName,
parameters: parameters
)
if toolCall.expectsResponse {
try await conversation?.sendToolResult(
for: toolCall.toolCallId,
result: result
)
} else {
conversation?.markToolCallCompleted(toolCall.toolCallId)
}
} catch {
// Handle tool execution errors
if toolCall.expectsResponse {
try? await conversation?.sendToolResult(
for: toolCall.toolCallId,
result: ["error": error.localizedDescription],
isError: true
)
}
}
}
private func executeClientTool(name: String, parameters: [String: Any]) async -> String {
switch name {
case "get_weather":
let location = parameters["location"] as? String ?? "Unknown"
return "Weather in \(location): 22°C, Sunny"
case "get_time":
return "Current time: \(Date().ISO8601Format())"
case "alert_tool":
return "User clicked something"
default:
return "Unknown tool: \(name)"
}
}

For public agents, start a conversation with just the agent ID:

let conversation = try await ElevenLabs.startConversation(
agentId: "your-public-agent-id",
config: ConversationConfig()
)

Private agents require a conversation token fetched from your backend:

// Option 1: Direct method (recommended)
// Get a conversation token from your backend (never store API keys in your app)
let token = try await fetchConversationToken()
let conversation = try await ElevenLabs.startConversation(
conversationToken: token,
config: ConversationConfig()
)
// Option 2: Using auth configuration
let conversation = try await ElevenLabs.startConversation(
auth: .conversationToken(token),
config: ConversationConfig()
)

Here's a complete example showing how to fetch tokens and connect to private agents:
import Foundation
// Token service for fetching conversation tokens
actor TokenService {
private let apiKey: String
private let baseURL = "https://api.us.elevenlabs.io/v1/convai/conversation/token"
init(apiKey: String) {
self.apiKey = apiKey
}
func fetchConversationToken(for agentId: String) async throws -> String {
guard let url = URL(string: "\(baseURL)?agent_id=\(agentId)") else {
throw TokenServiceError.invalidURL
}
var request = URLRequest(url: url)
request.httpMethod = "GET"
request.setValue(apiKey, forHTTPHeaderField: "xi-api-key")
request.setValue("application/json", forHTTPHeaderField: "Accept")
let (data, response) = try await URLSession.shared.data(for: request)
guard let httpResponse = response as? HTTPURLResponse,
httpResponse.statusCode == 200 else {
throw TokenServiceError.apiError
}
let tokenResponse = try JSONDecoder().decode(TokenResponse.self, from: data)
return tokenResponse.token
}
}
struct TokenResponse: Codable {
let token: String
}
enum TokenServiceError: Error {
case invalidURL
case apiError
}
// Usage in your app
class ConversationManager {
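// ⚠️ Demo only: do not ship an ElevenLabs API key inside your app.
// In production, your own backend should call the token endpoint and return the token.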
private let tokenService = TokenService(apiKey: "your-api-key")
private let agentId = "your-private-agent-id"
func startPrivateAgentConversation() async throws -> Conversation {
// Fetch token from ElevenLabs API
let token = try await tokenService.fetchConversationToken(for: agentId)
// Start conversation with private agent
return try await ElevenLabs.startConversation(
conversationToken: token,
config: ConversationConfig()
)
}
}

Conversations can run in voice or text-only mode:

// Voice conversation (default)
let voiceConfig = ConversationConfig(
conversationOverrides: ConversationOverrides(textOnly: false)
)
// Text-only conversation
let textConfig = ConversationConfig(
conversationOverrides: ConversationOverrides(textOnly: true)
)
let conversation = try await ElevenLabs.startConversation(
agentId: agentId,
config: textConfig
)

Control the microphone and access the underlying audio tracks:

// Microphone control
try await conversation.toggleMute()
try await conversation.setMuted(true)
// Check microphone state
let isMuted = conversation.isMuted
// Access audio tracks for advanced use cases
let inputTrack = conversation.inputTrack
let agentAudioTrack = conversation.agentAudioTrack

The conversation metadata (including the conversation ID) is available after the conversation is initialized:
// Access conversation metadata directly
if let metadata = conversation.conversationMetadata {
let conversationId = metadata.conversationId
let agentAudioFormat = metadata.agentOutputAudioFormat
let userAudioFormat = metadata.userInputAudioFormat // Optional
}
// Or observe it reactively
conversation.$conversationMetadata
.compactMap { $0 }
.sink { metadata in
// Store or use the conversation ID
self.currentConversationId = metadata.conversationId
// Log conversation details
print("Started conversation: \(metadata.conversationId)")
}
.store(in: &cancellables)

The SDK is built with modern Swift patterns and reactive programming:
ElevenLabs (Main Module)
├── Conversation (Core conversation management)
├── ConnectionManager (LiveKit WebRTC integration)
├── DataChannelReceiver (Real-time message handling)
├── EventParser/EventSerializer (Protocol implementation)
├── TokenService (Authentication and connection details)
└── Dependencies (Dependency injection container)
- Conversation: Main class providing @Published properties for reactive UI updates
- ConnectionManager: Manages LiveKit room connections and audio streaming
- DataChannelReceiver: Handles incoming protocol events from ElevenLabs agents
- EventParser/EventSerializer: Handles protocol event parsing and serialization
- ClientToolCallEvent: Represents tool calls from agents with parameter extraction
The SDK provides automatic message management with reactive updates:
conversation.$messages
.sink { messages in
// Update your UI with the latest messages
self.chatMessages = messages.map { message in
ChatMessage(
id: message.id,
content: message.content,
isFromAgent: message.role == .agent
)
}
}
.store(in: &cancellables)

Observe the agent's state to drive listening and speaking indicators:

conversation.$agentState
.sink { state in
switch state {
case .listening:
// Agent is listening, show listening indicator
break
case .speaking:
// Agent is speaking, show speaking indicator
break
}
}
.store(in: &cancellables)

Track the connection state to update your UI:

conversation.$state
.sink { state in
switch state {
case .idle:
// Not connected
break
case .connecting:
// Show connecting indicator
break
case .active(let callInfo):
// Connected to agent: \(callInfo.agentId)
break
case .ended(let reason):
// Handle disconnection: \(reason)
break
case .error(let error):
// Handle error: \(error)
break
}
}
.store(in: &cancellables)

A complete SwiftUI integration example:

import SwiftUI
import ElevenLabs
import LiveKit
import Combine
struct ConversationView: View {
@StateObject private var viewModel = ConversationViewModel()
var body: some View {
VStack {
// Chat messages
ScrollView {
LazyVStack {
ForEach(viewModel.messages) { message in
MessageView(message: message)
}
}
}
// Controls
HStack {
Button(viewModel.isConnected ? "End" : "Start") {
Task {
if viewModel.isConnected {
await viewModel.endConversation()
} else {
await viewModel.startConversation()
}
}
}
Button(viewModel.isMuted ? "Unmute" : "Mute") {
Task {
await viewModel.toggleMute()
}
}
.disabled(!viewModel.isConnected)
}
}
.task {
await viewModel.setup()
}
}
}
@MainActor
class ConversationViewModel: ObservableObject {
@Published var messages: [Message] = []
@Published var isConnected = false
@Published var isMuted = false
private var conversation: Conversation?
private var cancellables = Set<AnyCancellable>()
func setup() async {
// Initialize your conversation manager
}
func startConversation() async {
do {
conversation = try await ElevenLabs.startConversation(
agentId: "your-agent-id",
config: ConversationConfig()
)
setupObservers()
} catch {
print("Failed to start conversation: \(error)")
}
}
func endConversation() async {
await conversation?.endConversation()
conversation = nil
}
func toggleMute() async {
try? await conversation?.toggleMute()
}
private func setupObservers() {
guard let conversation else { return }
conversation.$messages
.assign(to: &$messages)
conversation.$state
.map { $0.isActive }
.assign(to: &$isConnected)
conversation.$isMuted
.assign(to: &$isMuted)
}
}

The SDK provides comprehensive error handling through the onError callback:
let config = ConversationConfig(
onError: { error in
print("SDK Error: \(error.localizedDescription)")
// Handle different error types
switch error {
case .notConnected:
// Show "not connected" message
break
case .connectionFailed(let reason):
// Handle connection failure
print("Connection failed: \(reason)")
case .authenticationFailed(let reason):
// Handle auth error
print("Auth failed: \(reason)")
case .agentTimeout:
// Agent took too long to respond
break
case .localNetworkPermissionRequired:
// User needs to grant local network permission
break
}
}
)

Monitor the conversation startup progress with detailed state transitions:
let config = ConversationConfig(
onStartupStateChange: { state in
switch state {
case .idle:
print("Not started")
case .resolvingToken:
print("Fetching authentication token...")
case .connectingToRoom:
print("Connecting to LiveKit room...")
case .waitingForAgent(let timeout):
print("Waiting for agent (timeout: \(timeout)s)...")
case .agentReady(let report):
print("Agent ready in \(report.elapsed)s")
if report.viaGraceTimeout {
print(" (via grace timeout)")
}
case .sendingConversationInit:
print("Sending conversation initialization...")
case .active(let callInfo, let metrics):
print("✅ Connected to agent: \(callInfo.agentId)")
print(" Total startup time: \(metrics.total)s")
print(" - Token fetch: \(metrics.tokenFetch ?? 0)s")
print(" - Room connect: \(metrics.roomConnect ?? 0)s")
print(" - Agent ready: \(metrics.agentReady ?? 0)s")
print(" - Init: \(metrics.conversationInit ?? 0)s")
print(" - Attempts: \(metrics.conversationInitAttempts)")
case .failed(let stage, let metrics):
print("❌ Failed at \(stage)")
print(" Total time: \(metrics.total)s")
}
}
)

Listen to fine-grained conversation events:
let config = ConversationConfig(
// Agent response events
onAgentResponse: { text, eventId in
print("Agent said: \(text) [event: \(eventId)]")
},
// Agent response corrections (when agent self-corrects)
onAgentResponseCorrection: { original, corrected, eventId in
print("Agent corrected: '\(original)' → '\(corrected)'")
},
// User transcript events
onUserTranscript: { text, eventId in
print("You said: \(text) [event: \(eventId)]")
},
// Interruption detection
onInterruption: { eventId in
print("User interrupted agent [event: \(eventId)]")
},
// Feedback availability tracking
onCanSendFeedbackChange: { canSend in
// Enable/disable feedback UI based on whether feedback can be sent
self.showFeedbackButton = canSend
}
)

Get character-level timing information to highlight text as the agent speaks:
let config = ConversationConfig(
onAudioAlignment: { alignment in
// alignment.chars: ["H", "e", "l", "l", "o"]
// alignment.charStartTimesMs: [0, 100, 150, 200, 250]
// alignment.charDurationsMs: [100, 50, 50, 50, 100]
// Example: Highlight text character by character
for (index, char) in alignment.chars.enumerated() {
let startMs = alignment.charStartTimesMs[index]
let durationMs = alignment.charDurationsMs[index]
Task {
try? await Task.sleep(nanoseconds: UInt64(startMs * 1_000_000))
await highlightCharacter(at: index, duration: durationMs)
}
}
}
)

Control microphone behavior and voice processing:
let audioConfig = AudioPipelineConfiguration(
// Microphone mute strategy
// - .voiceProcessing: Mute by stopping voice processing
// - .restart: Mute by restarting the audio session
// - .inputMixer: Mute at the input mixer level (default)
microphoneMuteMode: .inputMixer,
// Keep mic warm to avoid first-word latency (default: true)
recordingAlwaysPrepared: true,
// Bypass WebRTC voice processing (AEC/NS/VAD)
// Set to true if you want raw audio without processing
voiceProcessingBypassed: false,
// Enable Auto Gain Control for consistent volume
voiceProcessingAGCEnabled: true,
// Detect speech while muted (useful for "tap to speak" UX)
onSpeechActivity: { event in
print("Speech detected while muted!")
// Show visual indicator that user is trying to speak
}
)
let config = ConversationConfig(
audioConfiguration: audioConfig
)

Fine-tune the connection handshake behavior:
let startupConfig = ConversationStartupConfiguration(
// How long to wait for agent to be ready (default: 3.0s)
agentReadyTimeout: 5.0,
// Retry delays for conversation init in seconds (default: [0, 0.5, 1.0])
// First attempt: immediate, 2nd: wait 0.5s, 3rd: wait 1.0s, etc.
initRetryDelays: [0, 0.5, 1.0, 2.0],
// Whether to fail if agent isn't ready in time (default: false)
// false = continue with grace period, true = throw error immediately
failIfAgentNotReady: false
)
let config = ConversationConfig(
startupConfiguration: startupConfig
)

Monitor real-time voice activity scores:
let config = ConversationConfig(
onVadScore: { score in
// score: 0.0 to 1.0 (higher = more speech detected)
updateVoiceActivityIndicator(score)
}
)

A complete configuration combining all available options:

let config = ConversationConfig(
// Core callbacks
onAgentReady: {
print("✅ Agent is ready!")
},
onDisconnect: {
print("🔌 Disconnected")
},
onError: { error in
print("❌ Error: \(error.localizedDescription)")
},
// Startup monitoring
onStartupStateChange: { state in
print("Startup: \(state)")
},
// Event callbacks
onAgentResponse: { text, eventId in
print("Agent: \(text)")
},
onUserTranscript: { text, eventId in
print("User: \(text)")
},
onInterruption: { eventId in
print("Interrupted!")
},
// Advanced features
onAudioAlignment: { alignment in
// Highlight words as agent speaks
},
onCanSendFeedbackChange: { canSend in
// Enable/disable feedback button
},
// Audio pipeline
audioConfiguration: AudioPipelineConfiguration(
microphoneMuteMode: .inputMixer,
recordingAlwaysPrepared: true,
voiceProcessingBypassed: false,
voiceProcessingAGCEnabled: true
),
// Network configuration
networkConfiguration: LiveKitNetworkConfiguration(
strategy: .automatic
),
// Startup tuning
startupConfiguration: ConversationStartupConfiguration(
agentReadyTimeout: 5.0,
initRetryDelays: [0, 0.5, 1.0, 2.0]
)
)
let conversation = try await ElevenLabs.startConversation(
agentId: "your-agent-id",
config: config
)