Skip to content

Commit be7ad56

Browse files
committed
Add subcommand transcribe to transcribe text from an audio file
1 parent d07fd2a commit be7ad56

6 files changed

Lines changed: 163 additions & 93 deletions

File tree

Cargo.lock

Lines changed: 17 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ edition = "2021"
1212
clap = { version = "4.5.4", features = ["derive", "cargo"] }
1313
color-print = "0.3.5"
1414
config = "0.14.0"
15-
reqwest = { version = "0.12.2", features = ["json"] }
15+
reqwest = { version = "0.12.2", features = ["json", "multipart"] }
1616
serde = "1.0.197"
1717
serde_derive = "1.0.197"
1818
serde_json = "1.0.115"

readme.md

Lines changed: 80 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -77,85 +77,86 @@ The fastest CLI tool for prompting LLMs
7777
Usage: cai [OPTIONS] [PROMPT]... [COMMAND]
7878
7979
Commands:
80-
fast Shortcut for `groq gemma2-9b-it`
81-
local Shortcut for 'ollama llama3.2'
82-
value Return only the value/answer without explanation for the provided question
83-
svg Generate an SVG graphic from a textual description
84-
ocr Extract text from an image
85-
rename Analyze and rename files to timestamp plus description
86-
changelog Generate a changelog starting from a given commit using OpenAI's GPT-4o
87-
reply Reply to a conversation passed via stdin. Add additional reply instructions as the prompt
88-
rewrite Fix spelling, grammar, and wording issues in text passed via stdin
89-
image Generate an image using GPT-5 image generation
90-
google Google [aliases: go]
91-
ge - Gemini Pro shortcut
92-
gf - Gemini Flash shortcut
93-
groq Groq [aliases: gr]
94-
ll - Llama 3 shortcut (🏆 Default)
95-
mi - Mixtral shortcut
96-
cerebras Cerebras [aliases: ce]
97-
deepseek DeepSeek [aliases: de]
98-
openai OpenAI [aliases: op]
99-
gp - GPT-4o shortcut
100-
gm - GPT-4o mini shortcut
101-
o3 - o3 shortcut
102-
o4m - o4-mini shortcut
103-
gpt41 - gpt-4.1 shortcut
104-
gpt41m - gpt-4.1-mini shortcut
105-
gpt41n - gpt-4.1-nano shortcut
106-
gpt5 - gpt-5 shortcut
107-
gpt5m - gpt-5-mini shortcut
108-
gpt5n - gpt-5-nano shortcut
109-
o1p - o1-pro shortcut
110-
anthropic Anthropic [aliases: an]
111-
cl - Claude Opus
112-
so - Claude Sonnet
113-
ha - Claude Haiku
114-
xai xAI
115-
grok - Grok
116-
llamafile Llamafile server hosted at http://localhost:8080 [aliases: lf]
117-
ollama Ollama server hosted at http://localhost:11434 [aliases: ol]
118-
all Simultaneously send prompt to each provider's default model:
119-
- Groq Llama 3.1
120-
- Anthropic Claude Sonnet 3.7
121-
- Google Gemini 2.0 Flash
122-
- OpenAI GPT-4o mini
123-
- Ollama Llama 3
124-
- Llamafile
125-
bash Use Bash development as the prompt context
126-
c Use C development as the prompt context
127-
cpp Use C++ development as the prompt context
128-
cs Use C# development as the prompt context
129-
docker Use Docker development as the prompt context
130-
elm Use Elm development as the prompt context
131-
fish Use Fish development as the prompt context
132-
fs Use F# development as the prompt context
133-
gd Use Godot and GDScript development as the prompt context
134-
git Use Git development as the prompt context
135-
gl Use Gleam development as the prompt context
136-
golang Use Go development as the prompt context
137-
hs Use Haskell development as the prompt context
138-
java Use Java development as the prompt context
139-
js Use JavaScript development as the prompt context
140-
kt Use Kotlin development as the prompt context
141-
ly Use LilyPond development as the prompt context
142-
lua Use Lua development as the prompt context
143-
nix Use Nix development as the prompt context
144-
oc Use OCaml development as the prompt context
145-
php Use PHP development as the prompt context
146-
pg Use Postgres development as the prompt context
147-
ps Use PureScript development as the prompt context
148-
py Use Python development as the prompt context
149-
rb Use Ruby development as the prompt context
150-
rs Use Rust development as the prompt context
151-
sql Use SQLite development as the prompt context
152-
sw Use Swift development as the prompt context
153-
ts Use TypeScript development as the prompt context
154-
ty Use Typst development as the prompt context
155-
wl Use Wolfram Language and Mathematica development as the prompt context
156-
zig Use Zig development as the prompt context
157-
jq Use jq development as the prompt context
158-
help Print this message or the help of the given subcommand(s)
80+
fast Shortcut for `groq gemma2-9b-it`
81+
local Shortcut for 'ollama llama3.2'
82+
value Return only the value/answer without explanation for the provided question
83+
svg Generate an SVG graphic from a textual description
84+
ocr Extract text from an image
85+
rename Analyze and rename files to timestamp plus description
86+
changelog Generate a changelog starting from a given commit using OpenAI's GPT-4o
87+
reply Reply to a conversation passed via stdin. Add additional reply instructions as the prompt
88+
rewrite Fix spelling, grammar, and wording issues in text passed via stdin
89+
transcribe Transcribe an audio file
90+
image Generate an image using GPT-5 image generation
91+
google Google [aliases: go]
92+
ge - Gemini Pro shortcut
93+
gf - Gemini Flash shortcut
94+
groq Groq [aliases: gr]
95+
ll - Llama 3 shortcut (🏆 Default)
96+
mi - Mixtral shortcut
97+
cerebras Cerebras [aliases: ce]
98+
deepseek DeepSeek [aliases: de]
99+
openai OpenAI [aliases: op]
100+
gp - GPT-4o shortcut
101+
gm - GPT-4o mini shortcut
102+
o3 - o3 shortcut
103+
o4m - o4-mini shortcut
104+
gpt41 - gpt-4.1 shortcut
105+
gpt41m - gpt-4.1-mini shortcut
106+
gpt41n - gpt-4.1-nano shortcut
107+
gpt5 - gpt-5 shortcut
108+
gpt5m - gpt-5-mini shortcut
109+
gpt5n - gpt-5-nano shortcut
110+
o1p - o1-pro shortcut
111+
anthropic Anthropic [aliases: an]
112+
cl - Claude Opus
113+
so - Claude Sonnet
114+
ha - Claude Haiku
115+
xai xAI
116+
grok - Grok
117+
llamafile Llamafile server hosted at http://localhost:8080 [aliases: lf]
118+
ollama Ollama server hosted at http://localhost:11434 [aliases: ol]
119+
all Simultaneously send prompt to each provider's default model:
120+
- Groq Llama 3.1
121+
- Anthropic Claude Sonnet 3.7
122+
- Google Gemini 2.0 Flash
123+
- OpenAI GPT-4o mini
124+
- Ollama Llama 3
125+
- Llamafile
126+
bash Use Bash development as the prompt context
127+
c Use C development as the prompt context
128+
cpp Use C++ development as the prompt context
129+
cs Use C# development as the prompt context
130+
docker Use Docker development as the prompt context
131+
elm Use Elm development as the prompt context
132+
fish Use Fish development as the prompt context
133+
fs Use F# development as the prompt context
134+
gd Use Godot and GDScript development as the prompt context
135+
git Use Git development as the prompt context
136+
gl Use Gleam development as the prompt context
137+
golang Use Go development as the prompt context
138+
hs Use Haskell development as the prompt context
139+
java Use Java development as the prompt context
140+
js Use JavaScript development as the prompt context
141+
kt Use Kotlin development as the prompt context
142+
ly Use LilyPond development as the prompt context
143+
lua Use Lua development as the prompt context
144+
nix Use Nix development as the prompt context
145+
oc Use OCaml development as the prompt context
146+
php Use PHP development as the prompt context
147+
pg Use Postgres development as the prompt context
148+
ps Use PureScript development as the prompt context
149+
py Use Python development as the prompt context
150+
rb Use Ruby development as the prompt context
151+
rs Use Rust development as the prompt context
152+
sql Use SQLite development as the prompt context
153+
sw Use Swift development as the prompt context
154+
ts Use TypeScript development as the prompt context
155+
ty Use Typst development as the prompt context
156+
wl Use Wolfram Language and Mathematica development as the prompt context
157+
zig Use Zig development as the prompt context
158+
jq Use jq development as the prompt context
159+
help Print this message or the help of the given subcommand(s)
159160
160161
Arguments:
161162
[PROMPT]... The prompt to send to the AI model

src/lib.rs

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,50 @@ pub async fn extract_text_from_file(
891891
exec_tool(&Some(model), opts, &prompt).await
892892
}
893893

894+
pub async fn transcribe_audio_file(
895+
opts: &ExecOptions,
896+
file_path: &str,
897+
) -> Result<(), Box<dyn Error + Send + Sync>> {
898+
let secrets_path_str = get_secrets_path_str();
899+
let full_config = get_full_config(&secrets_path_str)?;
900+
let model = &Model::Model(Provider::OpenAI, "gpt-4o-transcribe".to_string());
901+
let (_used_model, http_req) =
902+
get_http_req(&Some(model), &secrets_path_str, &full_config)?;
903+
904+
let file = std::fs::read(file_path)?;
905+
let part = reqwest::multipart::Part::bytes(file)
906+
.file_name(file_path.to_string())
907+
.mime_str("audio/mpeg")?;
908+
909+
let form = reqwest::multipart::Form::new()
910+
.text("model", http_req.model.clone())
911+
.part("file", part);
912+
913+
let client = reqwest::Client::new();
914+
let resp = client
915+
.post("https://api.openai.com/v1/audio/transcriptions")
916+
.bearer_auth(&http_req.api_key)
917+
.multipart(form)
918+
.send()
919+
.await?;
920+
921+
if resp.status().is_success() {
922+
let resp_json = resp.json::<Value>().await?;
923+
let text = format!("{}\n", resp_json["text"].as_str().unwrap_or_default());
924+
if opts.is_raw {
925+
println!("{text}");
926+
} else {
927+
highlight::text_via_bat(&text);
928+
}
929+
} else {
930+
let resp_json = resp.json::<Value>().await?;
931+
let resp_formatted = serde_json::to_string_pretty(&resp_json).unwrap();
932+
Err(resp_formatted)?;
933+
}
934+
935+
Ok(())
936+
}
937+
894938
pub async fn prompt_with_lang_cntxt(
895939
opts: &ExecOptions,
896940
cmd: &Commands,

src/main.rs

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ use std::io::{read_to_string, IsTerminal};
33

44
use cai::{
55
analyze_file_content, exec_tool, extract_text_from_file, generate_changelog,
6-
prompt_with_lang_cntxt, submit_prompt, Commands, ExecOptions, Model,
7-
Provider,
6+
prompt_with_lang_cntxt, submit_prompt, transcribe_audio_file, Commands,
7+
ExecOptions, Model, Provider,
88
};
99
use chrono::NaiveDateTime;
1010
use clap::{builder::styling, crate_version, Parser};
@@ -32,10 +32,7 @@ async fn process_rename(
3232
})
3333
.is_ok();
3434
let timestamp = if valid_timestamp {
35-
timestamp_norm
36-
.replace(":", "")
37-
.replace("z", "")
38-
.replace("t0000", "")
35+
timestamp_norm.replace([':', 'z'], "").replace("t0000", "")
3936
} else {
4037
chrono::Local::now().format("%Y-%m-%dt%H%M").to_string()
4138
};
@@ -75,16 +72,15 @@ async fn process_rename(
7572
.unwrap_or_else(|_| {
7673
chrono::Local::now().format("%Y-%m-%dt%H%M").to_string()
7774
})
78-
.to_string()
7975
})
8076
.unwrap_or_else(|_| {
8177
chrono::Local::now().format("%Y-%m-%dt%H%M").to_string()
8278
});
8379

8480
std::path::Path::new(file)
8581
.file_stem()
86-
.map(|file_name_no_ext| {
87-
file_name_no_ext.to_str().unwrap_or_default().to_string()
82+
.and_then(|file_name_no_ext| {
83+
file_name_no_ext.to_str().map(|s| s.to_string())
8884
})
8985
.map(|file_name| rename_file(file.to_string(), timestamp, file_name))
9086
.ok_or_else(|| {
@@ -183,7 +179,7 @@ async fn exec_with_args(args: Args, stdin: &str) {
183179
is_json: args.json,
184180
json_schema: args
185181
.json_schema
186-
.map(|schema_str| {
182+
.and_then(|schema_str| {
187183
serde_json::from_str(&schema_str).expect("Invalid JSON schema")
188184
})
189185
.map(|schema: Value| {
@@ -192,12 +188,11 @@ async fn exec_with_args(args: Args, stdin: &str) {
192188
if !schema_obj.contains_key("type") {
193189
schema_obj.insert("type".to_string(), "object".into());
194190
}
195-
let api_object = json!({
191+
json!({
196192
"name": "requested_json_schema",
197193
"strict": true,
198194
"schema": schema_obj,
199-
});
200-
api_object
195+
})
201196
}),
202197
subcommand: args.command.clone(),
203198
};
@@ -341,6 +336,12 @@ async fn exec_with_args(args: Args, stdin: &str) {
341336
)
342337
.await
343338
}
339+
// Transcribe the given audio file. On failure, print the actual error to
// stderr and exit with a non-zero status.
Commands::Transcribe { file } => {
  // `{err}` interpolates the error; the previous `{{_err}}` escaped the
  // braces and printed the literal text "{_err}" instead.
  if let Err(err) = transcribe_audio_file(&opts, file).await {
    eprintln!("Error transcribing file: {err}");
    std::process::exit(1);
  }
}
344345
Commands::Image { prompt } => {
345346
let image_prompt = prompt.join(" ").to_string();
346347
submit_prompt(

src/types.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,12 @@ pub enum Commands {
7373
prompt: Vec<String>,
7474
},
7575

76+
/// Transcribe an audio file
77+
#[clap()]
78+
Transcribe {
79+
/// The audio file to transcribe
80+
file: String,
81+
},
7682

7783
/// Generate an image using GPT-5 image generation
7884
#[clap()]
@@ -522,6 +528,7 @@ impl Commands {
522528
Commands::Changelog { .. } => Some("Changelog"),
523529
Commands::Reply { .. } => Some("Reply"),
524530
Commands::Rewrite { .. } => Some("Rewrite"),
531+
Commands::Transcribe { .. } => Some("Transcribe"),
525532
Commands::Image { .. } => Some("Image"),
526533

527534
// AI Providers

0 commit comments

Comments
 (0)