DataHackIL · shaypal5 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -39,7 +39,7 @@ jobs:
         run: uv pip install --no-deps -e .
 
       - name: Run tests
-        run: .venv/bin/pytest tests/ -v --tb=short --cov avdp --cov-branch --cov-report=
+        run: .venv/bin/pytest tests/ -v --tb=short --cov synthbanshee --cov-branch --cov-report=
 
       - name: Upload raw coverage artifact
         if: always()
@@ -77,7 +77,7 @@ jobs:
         run: .venv/bin/ruff format --check .
 
       - name: mypy
-        run: .venv/bin/mypy avdp/ --ignore-missing-imports
+        run: .venv/bin/mypy synthbanshee/ --ignore-missing-imports
 
   coverage:
     name: "Combine coverage & generate XML"

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -31,4 +31,4 @@ repos:
           - pydantic>=2.6
           - types-PyYAML
         args: ["--ignore-missing-imports"]
-        files: ^avdp/
+        files: ^synthbanshee/
diff --git a/AGENTS.md b/AGENTS.md
@@ -22,7 +22,7 @@ All design decisions are documented. Read before changing anything structural:
 ## Repo structure
 
 ```
-avdp/                      ← main Python package
+synthbanshee/              ← main Python package
   config/                  ← Pydantic config models
   script/                  ← LLM-based script generation + Jinja2 templates
   tts/                     ← TTS rendering (Azure he-IL, Google he-IL)
@@ -131,7 +131,7 @@ OPENAI_API_KEY=...   # or ANTHROPIC_API_KEY for Codex script generation
 - Unit tests in `tests/unit/`, integration tests in `tests/integration/`
 - Run with `pytest`
 - Every module must have unit tests before the corresponding integration test is written
-- A generated clip is valid if and only if it passes `avdp.package.validator.validate_clip(clip_path)`
+- A generated clip is valid if and only if it passes `synthbanshee.package.validator.validate_clip(clip_path)`
 
 ## What NOT to do
 

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -22,7 +22,7 @@ All design decisions are documented. Read before changing anything structural:
 ## Repo structure
 
 ```
-avdp/                      ← main Python package
+synthbanshee/              ← main Python package
   config/                  ← Pydantic config models
   script/                  ← LLM-based script generation + Jinja2 templates
   tts/                     ← TTS rendering (Azure he-IL, Google he-IL)
@@ -134,14 +134,14 @@ Path fields (`transcript_path`, `scene_config`) and date/version fields are inte
 
 ## Preprocessing stack
 
-**No torchaudio.** The preprocessing pipeline (`avdp/augment/preprocessing.py`) uses only `scipy` + `soundfile` to avoid torch version incompatibilities. Operations in order: resample (polyphase), downmix to mono, Butterworth low-pass at 7.5 kHz, Wiener denoise, peak-normalize to −1.0 dBFS, silence pad ≥ 0.5 s.
+**No torchaudio.** The preprocessing pipeline (`synthbanshee/augment/preprocessing.py`) uses only `scipy` + `soundfile` to avoid torch version incompatibilities. Operations in order: resample (polyphase), downmix to mono, Butterworth low-pass at 7.5 kHz, Wiener denoise, peak-normalize to −1.0 dBFS, silence pad ≥ 0.5 s.
 
 ## Testing conventions
 
 - Unit tests in `tests/unit/`, integration tests in `tests/integration/`
 - Run with `pytest`
 - Every module must have unit tests before the corresponding integration test is written
-- A generated clip is valid if and only if it passes `avdp.package.validator.validate_clip(clip_path)`
+- A generated clip is valid if and only if it passes `synthbanshee.package.validator.validate_clip(clip_path)`
 - `validate_clip` checks: (1) all three files present, (2) WAV passes `validate_audio()`, (3) JSON parses as `ClipMetadata` with `is_synthetic=True`, (4) filename stem is ASCII-only lowercase
 
 ## Phase 0 status (complete as of 2026-04-07)
@@ -150,11 +150,11 @@ All Phase 0 milestones (0.2–0.6) are implemented and tested:
 
 | Milestone | Module(s) | Status |
 |---|---|---|
-| 0.2 Config schema | `avdp/config/` | Done |
-| 0.3 TTS renderer | `avdp/tts/` | Done |
-| 0.4 Preprocessing pipeline | `avdp/augment/preprocessing.py` | Done |
-| 0.5 Label schema & generator | `avdp/labels/` | Done |
-| 0.6 Happy path + validator + CLI | `avdp/package/validator.py`, `avdp/cli.py` | Done |
+| 0.2 Config schema | `synthbanshee/config/` | Done |
+| 0.3 TTS renderer | `synthbanshee/tts/` | Done |
+| 0.4 Preprocessing pipeline | `synthbanshee/augment/preprocessing.py` | Done |
+| 0.5 Label schema & generator | `synthbanshee/labels/` | Done |
+| 0.6 Happy path + validator + CLI | `synthbanshee/package/validator.py`, `synthbanshee/cli.py` | Done |
 
 CI runs ruff, mypy, and pytest (Python 3.11 + 3.12) on every PR and push to main.
 

diff --git a/README.md b/README.md
@@ -1 +1,104 @@
 # SynthBanshee
+
+**SynthBanshee** is a config-driven pipeline for generating large-scale synthetic Hebrew audio
+datasets. It is part of the [AVDP](https://datahack.org.il) (Audio Violence Dataset Project)
+initiative run by DataHack, which produces training data for two AI safety products:
+
+- **She-Proves** — a smartphone app that passively monitors for domestic violence incidents and
+  preserves audio evidence for legal use
+- **Elephant in the Room (הפיל שבחדר)** — a Raspberry Pi–class device in social-work offices that
+  alerts security when a social worker is being physically threatened
+
+The goal of SynthBanshee is to supply AI teams with a wide, deliberate distribution of voices,
+acoustic conditions, and violence types while real-data (actor recording) pipelines are built in
+parallel. A synthetic-to-real gap is expected and documented.
+
+---
+
+## How it works
+
+Every clip is produced by four sequential pipeline stages:
+
+```
+SceneConfig (YAML)
+  → [1] Script Generator   LLM fills a Jinja2 template → dialogue JSON
+  → [2] TTS Renderer       Azure he-IL SSML → per-speaker WAV segments
+  → [3] Acoustic Augmenter Room IR + device profile + noise → scene WAV
+  → [4] Label Generator    Script structure + augmentation log → AVDP-schema JSONL
+```
+
+Output per clip: `{clip_id}.wav` + `{clip_id}.txt` (Hebrew transcript) + `{clip_id}.json` (metadata).
+
+Audio spec: **16 kHz · mono · 16-bit PCM · −1.0 dBFS peak · ≥ 0.5 s silence pad**.
+
+---
+
+## Dataset tiers
+
+| Tier | Description | Target (per project) |
+|---|---|---|
+| A | Clean TTS, no acoustic augmentation | 1,000 clips |
+| B | Room simulation + device profile + background noise | 2,000 clips |
+| C | Hard negatives / confusors (arguments that de-escalate, ambient sounds) | 1,000 clips |
+
+Two projects — **She-Proves** (3–6 min clips, apartment rooms) and **Elephant in the Room**
+(1–4 min clips, clinic/welfare offices) — each receive the full tier stack.
+
+---
+
+## Label taxonomy
+
+Labels use a three-level hierarchy rather than a single binary Violence/Non-Violence flag:
+
+1. **Violence typology** (scene): `SV` · `IT` · `NEG` · `NEU`
+2. **Tier 1 category** (event): `PHYS` · `VERB` · `DIST` · `ACOU` · `EMOT` · `NONE`
+3. **Tier 2 subtype** (event): e.g. `PHYS_HARD` · `VERB_THREAT` · `DIST_SCREAM`
+
+Full taxonomy: `configs/taxonomy.yaml`.
+
+---
+
+## Current status
+
+Phase 0 (pipeline foundation) is complete. Phase 1 (500 Tier A clips/project) is the current focus.
+
+| Phase | Deliverable | Status |
+|---|---|---|
+| 0 | Single spec-compliant clip end-to-end | **Done** |
+| 1 | 500 Tier A clips/project | In progress |
+| 2 | 1,000–1,500 Tier B clips/project | Planned |
+| 3 | 4,000 clips/project, all tiers | Planned |
+
+---
+
+## Quick start
+
+```bash
+# Install (requires Python ≥ 3.11 and uv)
+uv pip install -e .
+
+# Generate a clip from a scene config
+synthbanshee generate --config configs/scenes/test_scene_001.yaml
+
+# Validate an existing clip
+synthbanshee validate data/he/clip_001.wav
+```
+
+API credentials required (set in environment or `.env`):
+
+```
+AZURE_TTS_KEY=...
+AZURE_TTS_REGION=...
+ANTHROPIC_API_KEY=...   # for script generation
+```
+
+---
+
+## Docs
+
+| Document | Contents |
+|---|---|
+| `docs/spec.md` | Audio format, file naming, label schema, IAA protocol |
+| `docs/implementation_plan.md` | Phased milestones, module map, API cost estimates |
+| `docs/design_approaches.md` | Design decisions and rationale |
+| `CLAUDE.md` | Claude Code context guide (pipeline constraints, conventions) |
diff --git a/avdp/config/__init__.py b/avdp/config/__init__.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -3,9 +3,9 @@ requires = ["hatchling"]
 build-backend = "hatchling.build"
 
 [project]
-name = "avdp-synth"
+name = "synthbanshee"
 version = "0.1.0"
-description = "Synthetic dataset generation framework for the Audio Violence Dataset Project (AVDP)"
+description = "SynthBanshee: synthetic Hebrew audio dataset generator for the AVDP initiative"
 readme = "README.md"
 requires-python = ">=3.11"
 license = { text = "MIT" }
@@ -63,10 +63,10 @@ dev = [
 ]
 
 [project.scripts]
-avdp = "avdp.cli:cli"
+synthbanshee = "synthbanshee.cli:cli"
 
 [tool.hatch.build.targets.wheel]
-packages = ["avdp"]
+packages = ["synthbanshee"]
 
 [tool.ruff]
 line-length = 100
@@ -86,14 +86,14 @@ testpaths = ["tests"]
 asyncio_mode = "auto"
 
 [tool.coverage.run]
-source = ["avdp"]
+source = ["synthbanshee"]
 branch = true
 relative_files = true
 
 [tool.coverage.paths]
 source = [
-    "avdp",
-    "/home/runner/work/*/*/avdp",
+    "synthbanshee",
+    "/home/runner/work/*/*/synthbanshee",
 ]
 
 [tool.coverage.report]

diff --git a/avdp/__init__.py → synthbanshee/__init__.py b/avdp/__init__.py → synthbanshee/__init__.py
diff --git a/avdp/augment/__init__.py → synthbanshee/augment/__init__.py b/avdp/augment/__init__.py → synthbanshee/augment/__init__.py
@@ -1,5 +1,5 @@
 """Audio augmentation pipeline: room simulation, device profiles, noise mixing, preprocessing."""
 
-from avdp.augment.preprocessing import PreprocessingResult, preprocess, validate_audio
+from synthbanshee.augment.preprocessing import PreprocessingResult, preprocess, validate_audio
 
 __all__ = ["PreprocessingResult", "preprocess", "validate_audio"]
diff --git a/avdp/augment/preprocessing.py → synthbanshee/augment/preprocessing.py b/avdp/augment/preprocessing.py → synthbanshee/augment/preprocessing.py
diff --git a/avdp/cli.py → synthbanshee/cli.py b/avdp/cli.py → synthbanshee/cli.py
@@ -66,7 +66,7 @@ def generate(
     dry_run: bool,
 ) -> None:
     """Generate a synthetic clip from a scene YAML config."""
-    from avdp.config.scene_config import SceneConfig
+    from synthbanshee.config.scene_config import SceneConfig
 
     console.print(f"[bold]Loading config:[/bold] {config}")
     scene = SceneConfig.from_yaml(config)
@@ -89,12 +89,12 @@ def generate(
     # --- TTS rendering (single-speaker stub for Phase 0) ---
     import tempfile
 
-    from avdp.augment.preprocessing import preprocess
-    from avdp.config.speaker_config import SpeakerConfig
-    from avdp.labels.generator import LabelGenerator, ScriptEvent
-    from avdp.labels.schema import PreprocessingApplied, SpeakerInfo
-    from avdp.package.validator import validate_clip
-    from avdp.tts.renderer import TTSRenderer
+    from synthbanshee.augment.preprocessing import preprocess
+    from synthbanshee.config.speaker_config import SpeakerConfig
+    from synthbanshee.labels.generator import LabelGenerator, ScriptEvent
+    from synthbanshee.labels.schema import PreprocessingApplied, SpeakerInfo
+    from synthbanshee.package.validator import validate_clip
+    from synthbanshee.tts.renderer import TTSRenderer
 
     renderer = TTSRenderer(cache_dir=cache_dir)
     label_gen = LabelGenerator()
@@ -220,7 +220,7 @@ def generate(
 @click.argument("clip", type=click.Path(exists=True, path_type=Path))
 def validate(clip: Path) -> None:
     """Validate an existing clip against the AVDP spec."""
-    from avdp.package.validator import validate_clip
+    from synthbanshee.package.validator import validate_clip
 
     result = validate_clip(clip)
     if result.is_valid:

diff --git a/synthbanshee/config/__init__.py b/synthbanshee/config/__init__.py
@@ -0,0 +1,14 @@
+"""Config schema and validation for the AVDP pipeline."""
+
+from synthbanshee.config.acoustic_config import AcousticSceneConfig, BackgroundEvent
+from synthbanshee.config.scene_config import SceneConfig
+from synthbanshee.config.speaker_config import SpeakerConfig
+from synthbanshee.config.taxonomy import load_taxonomy
+
+__all__ = [
+    "AcousticSceneConfig",
+    "BackgroundEvent",
+    "SceneConfig",
+    "SpeakerConfig",
+    "load_taxonomy",
+]
diff --git a/avdp/config/acoustic_config.py → synthbanshee/config/acoustic_config.py b/avdp/config/acoustic_config.py → synthbanshee/config/acoustic_config.py
diff --git a/avdp/config/scene_config.py → synthbanshee/config/scene_config.py b/avdp/config/scene_config.py → synthbanshee/config/scene_config.py
@@ -6,8 +6,8 @@
 import yaml
 from pydantic import BaseModel, Field, field_validator, model_validator
 
-from avdp.config.acoustic_config import AcousticSceneConfig
-from avdp.config.taxonomy import (
+from synthbanshee.config.acoustic_config import AcousticSceneConfig
+from synthbanshee.config.taxonomy import (
     scene_phase_values,
     speaker_role_codes,
     violence_typology_codes,

diff --git a/avdp/config/speaker_config.py → synthbanshee/config/speaker_config.py b/avdp/config/speaker_config.py → synthbanshee/config/speaker_config.py
@@ -9,7 +9,7 @@
 import yaml
 from pydantic import BaseModel, Field, field_validator, model_validator
 
-from avdp.config.taxonomy import speaker_role_codes
+from synthbanshee.config.taxonomy import speaker_role_codes
 
 _SPEAKER_ID_RE = re.compile(r"^[A-Z]{2,4}_[MF]_\d{1,3}-\d{1,3}_\d{3}$")
 

diff --git a/avdp/config/taxonomy.py → synthbanshee/config/taxonomy.py b/avdp/config/taxonomy.py → synthbanshee/config/taxonomy.py
diff --git a/avdp/labels/__init__.py → synthbanshee/labels/__init__.py b/avdp/labels/__init__.py → synthbanshee/labels/__init__.py
@@ -1,5 +1,5 @@
 """Label schema, auto-generator, and IAA utilities."""
 
-from avdp.labels.schema import ClipMetadata, EventLabel, WeakLabel
+from synthbanshee.labels.schema import ClipMetadata, EventLabel, WeakLabel
 
 __all__ = ["ClipMetadata", "EventLabel", "WeakLabel"]
diff --git a/avdp/labels/generator.py → synthbanshee/labels/generator.py b/avdp/labels/generator.py → synthbanshee/labels/generator.py
@@ -16,8 +16,8 @@
 
 import jsonlines
 
-from avdp import __version__
-from avdp.labels.schema import (
+from synthbanshee import __version__
+from synthbanshee.labels.schema import (
     ClipAcousticScene,
     ClipMetadata,
     EventLabel,

diff --git a/avdp/labels/schema.py → synthbanshee/labels/schema.py b/avdp/labels/schema.py → synthbanshee/labels/schema.py
@@ -11,7 +11,7 @@
 
 from pydantic import BaseModel, Field, field_validator, model_validator
 
-from avdp.config.taxonomy import (
+from synthbanshee.config.taxonomy import (
     emotional_state_values,
     intensity_levels,
     speaker_role_codes,

diff --git a/avdp/package/__init__.py → synthbanshee/package/__init__.py b/avdp/package/__init__.py → synthbanshee/package/__init__.py
@@ -1,5 +1,5 @@
 """Dataset packaging: assembly, manifests, splits, and clip validation."""
 
-from avdp.package.validator import ValidationResult, validate_clip
+from synthbanshee.package.validator import ValidationResult, validate_clip
 
 __all__ = ["ValidationResult", "validate_clip"]
diff --git a/avdp/package/validator.py → synthbanshee/package/validator.py b/avdp/package/validator.py → synthbanshee/package/validator.py
@@ -18,8 +18,8 @@
 
 from pydantic import ValidationError
 
-from avdp.augment.preprocessing import validate_audio
-from avdp.labels.schema import ClipMetadata
+from synthbanshee.augment.preprocessing import validate_audio
+from synthbanshee.labels.schema import ClipMetadata
 
 _ASCII_FILENAME_RE = re.compile(r"^[a-z0-9_\-]+$")
 

diff --git a/...t/templates/she_proves/stub_utterance.txt → ...t/templates/she_proves/stub_utterance.txt b/...t/templates/she_proves/stub_utterance.txt → ...t/templates/she_proves/stub_utterance.txt
diff --git a/avdp/tts/__init__.py → synthbanshee/tts/__init__.py b/avdp/tts/__init__.py → synthbanshee/tts/__init__.py
@@ -1,6 +1,6 @@
 """TTS rendering: SSML builder, Azure provider, render cache."""
 
-from avdp.tts.renderer import TTSRenderer
-from avdp.tts.ssml_builder import SSMLBuilder
+from synthbanshee.tts.renderer import TTSRenderer
+from synthbanshee.tts.ssml_builder import SSMLBuilder
 
 __all__ = ["SSMLBuilder", "TTSRenderer"]
diff --git a/avdp/tts/azure_provider.py → synthbanshee/tts/azure_provider.py b/avdp/tts/azure_provider.py → synthbanshee/tts/azure_provider.py
diff --git a/avdp/tts/renderer.py → synthbanshee/tts/renderer.py b/avdp/tts/renderer.py → synthbanshee/tts/renderer.py
@@ -13,9 +13,9 @@
 import hashlib
 from pathlib import Path
 
-from avdp.config.speaker_config import SpeakerConfig
-from avdp.tts.azure_provider import AzureProvider
-from avdp.tts.ssml_builder import SSMLBuilder
+from synthbanshee.config.speaker_config import SpeakerConfig
+from synthbanshee.tts.azure_provider import AzureProvider
+from synthbanshee.tts.ssml_builder import SSMLBuilder
 
 _DEFAULT_CACHE_DIR = Path("assets/speech")
 

diff --git a/avdp/tts/ssml_builder.py → synthbanshee/tts/ssml_builder.py b/avdp/tts/ssml_builder.py → synthbanshee/tts/ssml_builder.py