✨(summary) add localization support for transcription context text

cameledev · lebaudantoine · commit f5e0ddf69236 · 2026-02-25T18:07:19.000+01:00
Transcription and summarization results were always generated
using a French text structure (e.g. "Réunion du..."), regardless
of user preference or meeting language. Introduce basic localization
support so generated strings use the user's context language, then the
meeting language, then a configurable default (de, en, fr or nl).
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to
 ### Added
 
 - 👷(docker) add arm64 platform support for image builds
+- ✨(summary) add localization support for transcription context text
 
 ### Changed
 
diff --git a/gitlint/gitlint_emoji.py b/gitlint/gitlint_emoji.py
@@ -2,6 +2,7 @@
 Gitlint extra rule to validate that the message title is of the form
 "<gitmoji>(<scope>) <subject>"
 """
+
 from __future__ import unicode_literals
 
 import re
diff --git a/src/backend/core/recording/event/notification.py b/src/backend/core/recording/event/notification.py
@@ -167,6 +167,7 @@ def _notify_summary_service(recording):
                 owner_access.user.timezone
             ).strftime("%H:%M"),
             "download_link": f"{get_recording_download_base_url()}/{recording.id}",
+            "context_language": owner_access.user.language,
         }
 
         headers = {
diff --git a/src/summary/summary/api/route/tasks.py b/src/summary/summary/api/route/tasks.py
@@ -15,8 +15,8 @@
 settings = get_settings()
 
 
-class TaskCreation(BaseModel):
-    """Task data."""
+class TranscribeSummarizeTaskCreation(BaseModel):
+    """Transcription and summarization parameters."""
 
     owner_id: str
     filename: str
@@ -28,6 +28,7 @@ class TaskCreation(BaseModel):
     recording_time: Optional[str]
     language: Optional[str]
     download_link: Optional[str]
+    context_language: Optional[str] = None
 
     @field_validator("language")
     @classmethod
@@ -45,8 +46,8 @@ def validate_language(cls, v):
 
 
 @router.post("/")
-async def create_task(request: TaskCreation):
-    """Create a task."""
+async def create_transcribe_summarize_task(request: TranscribeSummarizeTaskCreation):
+    """Create a transcription and summarization task."""
     task = process_audio_transcribe_summarize_v2.apply_async(
         args=[
             request.owner_id,
@@ -59,6 +60,7 @@ async def create_task(request: TaskCreation):
             request.recording_time,
             request.language,
             request.download_link,
+            request.context_language,
         ],
         queue=settings.transcribe_queue,
     )
diff --git a/src/summary/summary/core/celery_worker.py b/src/summary/summary/core/celery_worker.py
@@ -18,6 +18,7 @@
 from summary.core.config import get_settings
 from summary.core.file_service import FileService, FileServiceException
 from summary.core.llm_service import LLMException, LLMObservability, LLMService
+from summary.core.locales import get_locale
 from summary.core.prompt import (
     FORMAT_NEXT_STEPS,
     FORMAT_PLAN,
@@ -121,6 +122,7 @@ def process_audio_transcribe_summarize_v2(
     recording_time: Optional[str],
     language: Optional[str],
     download_link: Optional[str],
+    context_language: Optional[str] = None,
 ):
     """Process an audio file by transcribing it and generating a summary.
 
@@ -129,6 +131,19 @@ def process_audio_transcribe_summarize_v2(
     2. Transcribes the audio using WhisperX model
     3. Sends the results via webhook
 
+    Args:
+        self: Celery task instance (passed on with bind=True).
+        owner_id: Unique identifier of the recording owner.
+        filename: Name of the audio file in MinIO storage.
+        email: Email address of the recording owner.
+        sub: OIDC subject identifier of the recording owner.
+        received_at: Unix timestamp when the recording was received.
+        room: Room name where the recording took place.
+        recording_date: Date of the recording (localized display string).
+        recording_time: Time of the recording (localized display string).
+        language: ISO 639-1 language code for transcription.
+        download_link: URL to download the original recording.
+        context_language: ISO 639-1 language code of the meeting summary context text.
     """
     logger.info(
         "Notification received | Owner: %s | Room: %s",
@@ -145,6 +160,7 @@ def process_audio_transcribe_summarize_v2(
         max_retries=settings.whisperx_max_retries,
     )
 
+    # Transcription
     try:
         with (
             file_service.prepare_audio_file(filename) as (audio_file, metadata),
@@ -183,7 +199,10 @@ def process_audio_transcribe_summarize_v2(
 
     metadata_manager.track_transcription_metadata(task_id, transcription)
 
-    formatter = TranscriptFormatter()
+    # Pick the locale of the context text, in decreasing priority:
+    # context_language, then the meeting language, then the configured default.
+    locale = get_locale(context_language, language)
+    formatter = TranscriptFormatter(locale)
 
     content, title = formatter.format(
         transcription,
@@ -221,6 +240,7 @@ def process_audio_transcribe_summarize_v2(
 
     metadata_manager.capture(task_id, settings.posthog_event_success)
 
+    # LLM Summarization
     if (
         analytics.is_feature_enabled("summary-enabled", distinct_id=owner_id)
         and settings.is_summary_enabled
@@ -336,9 +356,7 @@ def summarize_transcription(
     summary = tldr + "\n\n" + cleaned_summary + "\n\n" + next_steps
 
     data = {
-        "title": settings.summary_title_template.format(
-            title=title,
-        ),
+        "title": settings.summary_title_template.format(title=title),
         "content": summary,
         "email": email,
         "sub": sub,
diff --git a/src/summary/summary/core/config.py b/src/summary/summary/core/config.py
@@ -1,7 +1,7 @@
 """Application configuration and settings."""
 
 from functools import lru_cache
-from typing import Annotated, List, Optional, Set
+from typing import Annotated, List, Literal, Optional, Set
 
 from fastapi import Depends
 from pydantic import SecretStr
@@ -51,7 +51,6 @@ class Settings(BaseSettings):
 
     # Transcription processing
     hallucination_patterns: List[str] = ["Vap'n'Roll Thierry"]
-    hallucination_replacement_text: str = "[Texte impossible à transcrire]"
 
     # Webhook-related settings
     webhook_max_retries: int = 2
@@ -60,11 +59,10 @@ class Settings(BaseSettings):
     webhook_api_token: SecretStr
     webhook_url: str
 
+    # Locale
+    default_context_language: Literal["de", "en", "fr", "nl"] = "fr"
+
     # Output related settings
-    document_default_title: Optional[str] = "Transcription"
-    document_title_template: Optional[str] = (
-        'Réunion "{room}" du {room_recording_date} à {room_recording_time}'
-    )
     summary_title_template: Optional[str] = "Résumé de {title}"
 
     # Summary related settings
diff --git a/src/summary/summary/core/locales/__init__.py b/src/summary/summary/core/locales/__init__.py
@@ -0,0 +1,30 @@
+"""Locale support for the summary service."""
+
+from typing import Optional
+
+from summary.core.config import get_settings
+from summary.core.locales import de, en, fr, nl
+from summary.core.locales.strings import LocaleStrings
+
+_LOCALES = {"fr": fr, "en": en, "de": de, "nl": nl}
+
+
+def get_locale(*languages: Optional[str]) -> LocaleStrings:
+    """Return locale strings for the first matching language candidate.
+
+    Accept language codes in decreasing priority order and return the
+    locale for the first one that matches a known locale. Codes are matched
+    case-insensitively and regional variants ("en-AU", "fr_FR") resolve to
+    their base language. Fall back to the configured default_context_language.
+    """
+    for lang in languages:
+        if not lang:
+            continue
+        # Normalize so "EN", "en_US" and " fr-fr " resolve like "en" / "fr".
+        normalized = lang.strip().lower().replace("_", "-")
+        # Try the exact code first, then its base language ("en-au" -> "en").
+        for candidate in (normalized, normalized.split("-")[0]):
+            if candidate in _LOCALES:
+                return _LOCALES[candidate].STRINGS
+
+    return _LOCALES[get_settings().default_context_language].STRINGS
diff --git a/src/summary/summary/core/locales/de.py b/src/summary/summary/core/locales/de.py
@@ -0,0 +1,34 @@
+"""German locale strings."""
+
+from summary.core.locales.strings import LocaleStrings
+
+STRINGS = LocaleStrings(
+    empty_transcription="""
+**In Ihrer Transkription wurde kein Audioinhalt erkannt.**
+
+*Wenn Sie glauben, dass es sich um einen Fehler handelt, zögern Sie nicht,
+unseren technischen Support zu kontaktieren: visio@numerique.gouv.fr*
+
+.
+
+.
+
+.
+
+Einige Punkte, die wir Ihnen empfehlen zu überprüfen:
+- War ein Mikrofon aktiviert?
+- Waren Sie nah genug am Mikrofon?
+- Ist das Mikrofon von guter Qualität?
+- Dauert die Aufnahme länger als 30 Sekunden?
+
+""",
+    download_header_template=(
+        "\n*Laden Sie Ihre Aufnahme herunter, "
+        "indem Sie [diesem Link folgen]({download_link})*\n"
+    ),
+    hallucination_replacement_text="[Text konnte nicht transkribiert werden]",
+    document_default_title="Transkription",
+    document_title_template=(
+        'Besprechung "{room}" am {room_recording_date} um {room_recording_time}'
+    ),
+)
diff --git a/src/summary/summary/core/locales/en.py b/src/summary/summary/core/locales/en.py
@@ -0,0 +1,33 @@
+"""English locale strings."""
+
+from summary.core.locales.strings import LocaleStrings
+
+STRINGS = LocaleStrings(
+    empty_transcription="""
+**No audio content was detected in your transcription.**
+
+*If you believe this is an error, please do not hesitate to contact
+our technical support: visio@numerique.gouv.fr*
+
+.
+
+.
+
+.
+
+A few things we recommend you check:
+- Was a microphone enabled?
+- Were you close enough to the microphone?
+- Is the microphone of good quality?
+- Is the recording longer than 30 seconds?
+
+""",
+    download_header_template=(
+        "\n*Download your recording by [following this link]({download_link})*\n"
+    ),
+    hallucination_replacement_text="[Unable to transcribe text]",
+    document_default_title="Transcription",
+    document_title_template=(
+        'Meeting "{room}" on {room_recording_date} at {room_recording_time}'
+    ),
+)
diff --git a/src/summary/summary/core/locales/fr.py b/src/summary/summary/core/locales/fr.py
@@ -0,0 +1,33 @@
+"""French locale strings (default)."""
+
+from summary.core.locales.strings import LocaleStrings
+
+STRINGS = LocaleStrings(
+    empty_transcription="""
+**Aucun contenu audio n'a été détecté dans votre transcription.**
+
+*Si vous pensez qu'il s'agit d'une erreur, n'hésitez pas à contacter
+notre support technique : visio@numerique.gouv.fr*
+
+.
+
+.
+
+.
+
+Quelques points que nous vous conseillons de vérifier :
+- Un micro était-il activé ?
+- Étiez-vous suffisamment proche ?
+- Le micro est-il de bonne qualité ?
+- L'enregistrement dure-t-il plus de 30 secondes ?
+
+""",
+    download_header_template=(
+        "\n*Télécharger votre enregistrement en [suivant ce lien]({download_link})*\n"
+    ),
+    hallucination_replacement_text="[Texte impossible à transcrire]",
+    document_default_title="Transcription",
+    document_title_template=(
+        'Réunion "{room}" du {room_recording_date} à {room_recording_time}'
+    ),
+)
diff --git a/src/summary/summary/core/locales/nl.py b/src/summary/summary/core/locales/nl.py
@@ -0,0 +1,33 @@
+"""Dutch locale strings."""
+
+from summary.core.locales.strings import LocaleStrings
+
+STRINGS = LocaleStrings(
+    empty_transcription="""
+**Er is geen audio-inhoud gedetecteerd in uw transcriptie.**
+
+*Als u denkt dat dit een fout is, aarzel dan niet om contact op te nemen
+met onze technische ondersteuning: visio@numerique.gouv.fr*
+
+.
+
+.
+
+.
+
+Een paar punten die wij u aanraden te controleren:
+- Was er een microfoon ingeschakeld?
+- Was u dicht genoeg bij de microfoon?
+- Is de microfoon van goede kwaliteit?
+- Duurt de opname langer dan 30 seconden?
+
+""",
+    download_header_template=(
+        "\n*Download uw opname door [deze link te volgen]({download_link})*\n"
+    ),
+    hallucination_replacement_text="[Tekst kon niet worden getranscribeerd]",
+    document_default_title="Transcriptie",
+    document_title_template=(
+        'Vergadering "{room}" op {room_recording_date} om {room_recording_time}'
+    ),
+)
diff --git a/src/summary/summary/core/locales/strings.py b/src/summary/summary/core/locales/strings.py
@@ -0,0 +1,15 @@
+"""Locale types for the summary service."""
+
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class LocaleStrings:
+    """All translatable output strings for the summary pipeline."""
+
+    # transcript_formatter.py strings; every field is required (no defaults)
+    empty_transcription: str  # Markdown notice for a transcription with no audio
+    download_header_template: str  # carries a {download_link} placeholder
+    hallucination_replacement_text: str
+    document_default_title: str
+    document_title_template: str  # room / recording date / time placeholders
diff --git a/src/summary/summary/core/transcript_formatter.py b/src/summary/summary/core/transcript_formatter.py

Original file line number	Diff line number	Diff line change
`@@ -167,6 +167,7 @@ def _notify_summary_service(recording):`
`167`	`167`	`owner_access.user.timezone`
`168`	`168`	`).strftime("%H:%M"),`
`169`	`169`	`"download_link": f"{get_recording_download_base_url()}/{recording.id}",`
	`170`	`+ "context_language": owner_access.user.language,`
`170`	`171`	`}`
`171`	`172`
`172`	`173`	`headers = {`