Add subcommand transcribe to transcribe text from an audio file

ad-si · ad-si · commit be7ad566d0c1 · 2025-08-15T19:59:15.000Z
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,7 +12,7 @@ edition = "2021"
 clap = { version = "4.5.4", features = ["derive", "cargo"] }
 color-print = "0.3.5"
 config = "0.14.0"
-reqwest = { version = "0.12.2", features = ["json"] }
+reqwest = { version = "0.12.2", features = ["json", "multipart"] }
 serde = "1.0.197"
 serde_derive = "1.0.197"
 serde_json = "1.0.115"
diff --git a/readme.md b/readme.md
@@ -77,85 +77,86 @@ The fastest CLI tool for prompting LLMs
 Usage: cai [OPTIONS] [PROMPT]... [COMMAND]
 
 Commands:
-  fast       Shortcut for `groq gemma2-9b-it`
-  local      Shortcut for 'ollama llama3.2'
-  value      Return only the value/answer without explanation for the provided question
-  svg        Generate an SVG graphic from a textual description
-  ocr        Extract text from an image
-  rename     Analyze and rename files to timestamp plus description
-  changelog  Generate a changelog starting from a given commit using OpenAI's GPT-4o
-  reply      Reply to a conversation passed via stdin. Add additional reply instructions as the prompt
-  rewrite    Fix spelling, grammar, and wording issues in text passed via stdin
-  image      Generate an image using GPT-5 image generation
-  google     Google [aliases: go]
-  ge         - Gemini Pro shortcut
-  gf         - Gemini Flash shortcut
-  groq       Groq [aliases: gr]
-  ll         - Llama 3 shortcut (🏆 Default)
-  mi         - Mixtral shortcut
-  cerebras   Cerebras [aliases: ce]
-  deepseek   DeepSeek [aliases: de]
-  openai     OpenAI [aliases: op]
-  gp         - GPT-4o shortcut
-  gm         - GPT-4o mini shortcut
-  o3         - o3 shortcut
-  o4m        - o4-mini shortcut
-  gpt41      - gpt-4.1 shortcut
-  gpt41m     - gpt-4.1-mini shortcut
-  gpt41n     - gpt-4.1-nano shortcut
-  gpt5       - gpt-5 shortcut
-  gpt5m      - gpt-5-mini shortcut
-  gpt5n      - gpt-5-nano shortcut
-  o1p        - o1-pro shortcut
-  anthropic  Anthropic [aliases: an]
-  cl         - Claude Opus
-  so         - Claude Sonnet
-  ha         - Claude Haiku
-  xai        xAI
-  grok       - Grok
-  llamafile  Llamafile server hosted at http://localhost:8080 [aliases: lf]
-  ollama     Ollama server hosted at http://localhost:11434 [aliases: ol]
-  all        Simultaneously send prompt to each provider's default model:
-             - Groq Llama 3.1
-             - Antropic Claude Sonnet 3.7
-             - Google Gemini 2.0 Flash
-             - OpenAI GPT-4o mini
-             - Ollama Llama 3
-             - Llamafile
-  bash       Use Bash development as the prompt context
-  c          Use C development as the prompt context
-  cpp        Use C++ development as the prompt context
-  cs         Use C# development as the prompt context
-  docker     Use Docker development as the prompt context
-  elm        Use Elm development as the prompt context
-  fish       Use Fish development as the prompt context
-  fs         Use F# development as the prompt context
-  gd         Use Godot and GDScript development as the prompt context
-  git        Use Git development as the prompt context
-  gl         Use Gleam development as the prompt context
-  golang     Use Go development as the prompt context
-  hs         Use Haskell development as the prompt context
-  java       Use Java development as the prompt context
-  js         Use JavaScript development as the prompt context
-  kt         Use Kotlin development as the prompt context
-  ly         Use LilyPond development as the prompt context
-  lua        Use Lua development as the prompt context
-  nix        Use Nix development as the prompt context
-  oc         Use OCaml development as the prompt context
-  php        Use PHP development as the prompt context
-  pg         Use Postgres development as the prompt context
-  ps         Use PureScript development as the prompt context
-  py         Use Python development as the prompt context
-  rb         Use Ruby development as the prompt context
-  rs         Use Rust development as the prompt context
-  sql        Use SQLite development as the prompt context
-  sw         Use Swift development as the prompt context
-  ts         Use TypeScript development as the prompt context
-  ty         Use Typst development as the prompt context
-  wl         Use Wolfram Language and Mathematica development as the prompt context
-  zig        Use Zig development as the prompt context
-  jq         Use jq development as the prompt context
-  help       Print this message or the help of the given subcommand(s)
+  fast        Shortcut for `groq gemma2-9b-it`
+  local       Shortcut for 'ollama llama3.2'
+  value       Return only the value/answer without explanation for the provided question
+  svg         Generate an SVG graphic from a textual description
+  ocr         Extract text from an image
+  rename      Analyze and rename files to timestamp plus description
+  changelog   Generate a changelog starting from a given commit using OpenAI's GPT-4o
+  reply       Reply to a conversation passed via stdin. Add additional reply instructions as the prompt
+  rewrite     Fix spelling, grammar, and wording issues in text passed via stdin
+  transcribe  Transcribe an audio file
+  image       Generate an image using GPT-5 image generation
+  google      Google [aliases: go]
+  ge          - Gemini Pro shortcut
+  gf          - Gemini Flash shortcut
+  groq        Groq [aliases: gr]
+  ll          - Llama 3 shortcut (🏆 Default)
+  mi          - Mixtral shortcut
+  cerebras    Cerebras [aliases: ce]
+  deepseek    DeepSeek [aliases: de]
+  openai      OpenAI [aliases: op]
+  gp          - GPT-4o shortcut
+  gm          - GPT-4o mini shortcut
+  o3          - o3 shortcut
+  o4m         - o4-mini shortcut
+  gpt41       - gpt-4.1 shortcut
+  gpt41m      - gpt-4.1-mini shortcut
+  gpt41n      - gpt-4.1-nano shortcut
+  gpt5        - gpt-5 shortcut
+  gpt5m       - gpt-5-mini shortcut
+  gpt5n       - gpt-5-nano shortcut
+  o1p         - o1-pro shortcut
+  anthropic   Anthropic [aliases: an]
+  cl          - Claude Opus
+  so          - Claude Sonnet
+  ha          - Claude Haiku
+  xai         xAI
+  grok        - Grok
+  llamafile   Llamafile server hosted at http://localhost:8080 [aliases: lf]
+  ollama      Ollama server hosted at http://localhost:11434 [aliases: ol]
+  all         Simultaneously send prompt to each provider's default model:
+              - Groq Llama 3.1
+              - Anthropic Claude Sonnet 3.7
+              - Google Gemini 2.0 Flash
+              - OpenAI GPT-4o mini
+              - Ollama Llama 3
+              - Llamafile
+  bash        Use Bash development as the prompt context
+  c           Use C development as the prompt context
+  cpp         Use C++ development as the prompt context
+  cs          Use C# development as the prompt context
+  docker      Use Docker development as the prompt context
+  elm         Use Elm development as the prompt context
+  fish        Use Fish development as the prompt context
+  fs          Use F# development as the prompt context
+  gd          Use Godot and GDScript development as the prompt context
+  git         Use Git development as the prompt context
+  gl          Use Gleam development as the prompt context
+  golang      Use Go development as the prompt context
+  hs          Use Haskell development as the prompt context
+  java        Use Java development as the prompt context
+  js          Use JavaScript development as the prompt context
+  kt          Use Kotlin development as the prompt context
+  ly          Use LilyPond development as the prompt context
+  lua         Use Lua development as the prompt context
+  nix         Use Nix development as the prompt context
+  oc          Use OCaml development as the prompt context
+  php         Use PHP development as the prompt context
+  pg          Use Postgres development as the prompt context
+  ps          Use PureScript development as the prompt context
+  py          Use Python development as the prompt context
+  rb          Use Ruby development as the prompt context
+  rs          Use Rust development as the prompt context
+  sql         Use SQLite development as the prompt context
+  sw          Use Swift development as the prompt context
+  ts          Use TypeScript development as the prompt context
+  ty          Use Typst development as the prompt context
+  wl          Use Wolfram Language and Mathematica development as the prompt context
+  zig         Use Zig development as the prompt context
+  jq          Use jq development as the prompt context
+  help        Print this message or the help of the given subcommand(s)
 
 Arguments:
   [PROMPT]...  The prompt to send to the AI model
diff --git a/src/lib.rs b/src/lib.rs
@@ -891,6 +891,103 @@ pub async fn extract_text_from_file(
   exec_tool(&Some(model), opts, &prompt).await
 }
 
+/// Guess the MIME type of an audio file from its file extension.
+///
+/// The extension is compared case-insensitively. Unknown or missing
+/// extensions fall back to `audio/mpeg`.
+fn audio_mime_type(file_path: &str) -> &'static str {
+  let extension = std::path::Path::new(file_path)
+    .extension()
+    .and_then(|ext| ext.to_str())
+    .map(str::to_ascii_lowercase);
+
+  match extension.as_deref() {
+    Some("wav") => "audio/wav",
+    Some("flac") => "audio/flac",
+    Some("ogg" | "oga") => "audio/ogg",
+    Some("m4a" | "mp4") => "audio/mp4",
+    Some("webm") => "audio/webm",
+    // Covers mp3, mpeg, mpga, and anything unrecognized
+    _ => "audio/mpeg",
+  }
+}
+
+/// Transcribe an audio file with OpenAI's `gpt-4o-transcribe` model
+/// and print the resulting text.
+///
+/// The file at `file_path` is uploaded as multipart form data. The transcript
+/// is printed with `println!` when `opts.is_raw` is set and passed to
+/// `highlight::text_via_bat` otherwise.
+///
+/// # Errors
+///
+/// Returns an error if loading the config or request settings fails,
+/// the file can't be read, the request fails, the API answers with
+/// a non-success status, or the response lacks a `text` field.
+pub async fn transcribe_audio_file(
+  opts: &ExecOptions,
+  file_path: &str,
+) -> Result<(), Box<dyn Error + Send + Sync>> {
+  let secrets_path_str = get_secrets_path_str();
+  let full_config = get_full_config(&secrets_path_str)?;
+  let model = &Model::Model(Provider::OpenAI, "gpt-4o-transcribe".to_string());
+  let (_used_model, http_req) =
+    get_http_req(&Some(model), &secrets_path_str, &full_config)?;
+
+  // A bare I/O error doesn't mention which path could not be read
+  let file = std::fs::read(file_path)
+    .map_err(|err| format!("Failed to read \"{file_path}\": {err}"))?;
+
+  // Upload only the base name so the local directory layout isn't disclosed
+  let upload_name = std::path::Path::new(file_path)
+    .file_name()
+    .map(|name| name.to_string_lossy().into_owned())
+    .unwrap_or_else(|| file_path.to_string());
+  let part = reqwest::multipart::Part::bytes(file)
+    .file_name(upload_name)
+    .mime_str(audio_mime_type(file_path))?;
+
+  let form = reqwest::multipart::Form::new()
+    .text("model", http_req.model.clone())
+    .part("file", part);
+
+  let client = reqwest::Client::new();
+  let resp = client
+    .post("https://api.openai.com/v1/audio/transcriptions")
+    .bearer_auth(&http_req.api_key)
+    .multipart(form)
+    .send()
+    .await?;
+
+  // Read the body as text first: error bodies aren't guaranteed to be JSON,
+  // and decoding them as JSON directly would hide the HTTP status
+  let status = resp.status();
+  let body = resp.text().await?;
+
+  if !status.is_success() {
+    let detail = serde_json::from_str::<Value>(&body)
+      .and_then(|json| serde_json::to_string_pretty(&json))
+      .unwrap_or(body);
+    return Err(
+      format!("Transcription request failed ({status}):\n{detail}").into(),
+    );
+  }
+
+  let resp_json: Value = serde_json::from_str(&body)?;
+  let transcript = resp_json["text"].as_str().ok_or_else(|| {
+    format!("Transcription response has no \"text\" field: {resp_json}")
+  })?;
+  let text = format!("{transcript}\n");
+
+  if opts.is_raw {
+    println!("{text}");
+  } else {
+    highlight::text_via_bat(&text);
+  }
+
+  Ok(())
+}
+
 pub async fn prompt_with_lang_cntxt(
   opts: &ExecOptions,
   cmd: &Commands,
diff --git a/src/main.rs b/src/main.rs
@@ -3,8 +3,8 @@ use std::io::{read_to_string, IsTerminal};
 
 use cai::{
   analyze_file_content, exec_tool, extract_text_from_file, generate_changelog,
-  prompt_with_lang_cntxt, submit_prompt, Commands, ExecOptions, Model,
-  Provider,
+  prompt_with_lang_cntxt, submit_prompt, transcribe_audio_file, Commands,
+  ExecOptions, Model, Provider,
 };
 use chrono::NaiveDateTime;
 use clap::{builder::styling, crate_version, Parser};
@@ -32,10 +32,7 @@ async fn process_rename(
           })
           .is_ok();
       let timestamp = if valid_timestamp {
-        timestamp_norm
-          .replace(":", "")
-          .replace("z", "")
-          .replace("t0000", "")
+        timestamp_norm.replace([':', 'z'], "").replace("t0000", "")
       } else {
         chrono::Local::now().format("%Y-%m-%dt%H%M").to_string()
       };
@@ -75,16 +72,15 @@ async fn process_rename(
               .unwrap_or_else(|_| {
                 chrono::Local::now().format("%Y-%m-%dt%H%M").to_string()
               })
-              .to_string()
           })
           .unwrap_or_else(|_| {
             chrono::Local::now().format("%Y-%m-%dt%H%M").to_string()
           });
 
         std::path::Path::new(file)
           .file_stem()
-          .map(|file_name_no_ext| {
-            file_name_no_ext.to_str().unwrap_or_default().to_string()
+          .and_then(|file_name_no_ext| {
+            file_name_no_ext.to_str().map(|s| s.to_string())
           })
           .map(|file_name| rename_file(file.to_string(), timestamp, file_name))
           .ok_or_else(|| {
@@ -183,7 +179,7 @@ async fn exec_with_args(args: Args, stdin: &str) {
     is_json: args.json,
     json_schema: args
       .json_schema
-      .map(|schema_str| {
+      .and_then(|schema_str| {
         serde_json::from_str(&schema_str).expect("Invalid JSON schema")
       })
       .map(|schema: Value| {
@@ -192,12 +188,11 @@ async fn exec_with_args(args: Args, stdin: &str) {
         if !schema_obj.contains_key("type") {
           schema_obj.insert("type".to_string(), "object".into());
         }
-        let api_object = json!({
+        json!({
           "name": "requested_json_schema",
           "strict": true,
           "schema": schema_obj,
-        });
-        api_object
+        })
       }),
     subcommand: args.command.clone(),
   };
@@ -341,6 +336,13 @@ async fn exec_with_args(args: Args, stdin: &str) {
         )
         .await
       }
+      Commands::Transcribe { file } => {
+        // Print the underlying error so the user sees why transcription failed
+        if let Err(err) = transcribe_audio_file(&opts, file).await {
+          eprintln!("Error transcribing file: {err}");
+          std::process::exit(1);
+        }
+      }
       Commands::Image { prompt } => {
         let image_prompt = prompt.join(" ").to_string();
         submit_prompt(
diff --git a/src/types.rs b/src/types.rs
@@ -73,6 +73,12 @@ pub enum Commands {
     prompt: Vec<String>,
   },
 
+  /// Transcribe an audio file
+  #[clap()]
+  Transcribe {
+    /// The audio file to transcribe
+    file: String,
+  },
 
   /// Generate an image using GPT-5 image generation
   #[clap()]
@@ -522,6 +528,7 @@ impl Commands {
       Commands::Changelog { .. } => Some("Changelog"),
       Commands::Reply { .. } => Some("Reply"),
       Commands::Rewrite { .. } => Some("Rewrite"),
+      Commands::Transcribe { .. } => Some("Transcribe"),
       Commands::Image { .. } => Some("Image"),
 
       // AI Providers