Enhancement: Full MCP integration for ARTAgent #22
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# ============================================================================
# Evaluation Framework CI
# Lints, unit-tests, schema-validates, and integration-checks the
# tests/evaluation suite on pushes and PRs touching it.
# ============================================================================
name: 🧪 Evaluation Framework Tests

on:
  push:
    branches: [main, staging]
    paths:
      - 'tests/evaluation/**'
      - '.github/workflows/test-evaluation-framework.yml'
  pull_request:
    branches: [main, staging]
    paths:
      - 'tests/evaluation/**'
      - 'src/**'
      - '.github/workflows/test-evaluation-framework.yml'
  workflow_dispatch:
    inputs:
      run_all_tests:
        description: 'Run all tests (not just evaluation)'
        required: false
        default: false
        type: boolean

env:
  CI: true
  PYTHONDONTWRITEBYTECODE: 1  # keep the runner workspace free of .pyc files
  PYTHONUNBUFFERED: 1         # stream Python output to the log immediately

# Least-privilege token: read-only access is sufficient for test jobs.
permissions:
  contents: read
  pull-requests: read

jobs:
| # ============================================================================ | |
| # JOB: Lint & Type Check | |
| # ============================================================================ | |
| lint: | |
| name: 🔍 Lint & Type Check | |
| runs-on: ubuntu-latest | |
| steps: | |
| - name: 🛒 Checkout | |
| uses: actions/checkout@v4 | |
| - name: 🐍 Setup Python 3.11 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.11' | |
| - name: 📦 Install uv | |
| uses: astral-sh/setup-uv@v4 | |
| with: | |
| version: "latest" | |
| - name: 🔧 Install dependencies | |
| run: | | |
| uv pip install --system ruff black isort | |
| - name: 🔍 Ruff lint | |
| run: | | |
| ruff check tests/evaluation/ --config pyproject.toml | |
| continue-on-error: true | |
| - name: 🎨 Black format check | |
| run: | | |
| black --check --diff tests/evaluation/ | |
| continue-on-error: true | |
| - name: 📑 isort import check | |
| run: | | |
| isort --check-only --diff tests/evaluation/ | |
| continue-on-error: true | |
| - name: 📋 Lint Summary | |
| run: | | |
| echo "## 🔍 Lint Results" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "| Check | Status |" >> $GITHUB_STEP_SUMMARY | |
| echo "|-------|--------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| Ruff | ✓ Checked |" >> $GITHUB_STEP_SUMMARY | |
| echo "| Black | ✓ Checked |" >> $GITHUB_STEP_SUMMARY | |
| echo "| isort | ✓ Checked |" >> $GITHUB_STEP_SUMMARY | |
| # ============================================================================ | |
| # JOB: Unit Tests | |
| # ============================================================================ | |
| unit-tests: | |
| name: 🧪 Unit Tests | |
| runs-on: ubuntu-latest | |
| needs: lint | |
| steps: | |
| - name: 🛒 Checkout | |
| uses: actions/checkout@v4 | |
| with: | |
| fetch-depth: 0 | |
| - name: 🐍 Setup Python 3.11 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.11' | |
| - name: 📦 Install uv | |
| uses: astral-sh/setup-uv@v4 | |
| with: | |
| version: "latest" | |
| - name: 🔧 Install dependencies | |
| run: | | |
| # Install project with dev dependencies | |
| uv pip install --system -e ".[dev]" | |
| - name: 📁 Create reports directory | |
| run: mkdir -p reports | |
| - name: 🧪 Run Evaluation Tests | |
| run: | | |
| set -o pipefail | |
| pytest tests/evaluation/ \ | |
| -v \ | |
| --tb=short \ | |
| --junitxml=reports/evaluation-tests.xml \ | |
| -m "not evaluation" \ | |
| 2>&1 | tee reports/test-output.txt | |
| env: | |
| PYTHONPATH: . | |
| - name: 📊 Test Results Summary | |
| if: always() | |
| run: | | |
| echo "## 🧪 Evaluation Framework Test Results" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| # Count tests from output | |
| PASSED=$(grep -oP '\d+(?= passed)' reports/test-output.txt | head -1 || echo "0") | |
| FAILED=$(grep -oP '\d+(?= failed)' reports/test-output.txt | head -1 || echo "0") | |
| SKIPPED=$(grep -oP '\d+(?= skipped)' reports/test-output.txt | head -1 || echo "0") | |
| echo "| Metric | Count |" >> $GITHUB_STEP_SUMMARY | |
| echo "|--------|-------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| ✅ Passed | $PASSED |" >> $GITHUB_STEP_SUMMARY | |
| echo "| ❌ Failed | $FAILED |" >> $GITHUB_STEP_SUMMARY | |
| echo "| ⏭️ Skipped | $SKIPPED |" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| # Module breakdown | |
| echo "### 📦 Test Modules" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "| Module | Description |" >> $GITHUB_STEP_SUMMARY | |
| echo "|--------|-------------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| test_metrics.py | Pluggable metrics |" >> $GITHUB_STEP_SUMMARY | |
| echo "| test_scenarios.py | End-to-end scenario tests |" >> $GITHUB_STEP_SUMMARY | |
| - name: 📤 Upload Test Results | |
| uses: actions/upload-artifact@v4 | |
| if: always() | |
| with: | |
| name: test-results | |
| path: | | |
| reports/evaluation-tests.xml | |
| reports/test-output.txt | |
| retention-days: 30 | |
| # ============================================================================ | |
| # JOB: Schema Validation | |
| # ============================================================================ | |
| schema-validation: | |
| name: 📋 Schema Validation | |
| runs-on: ubuntu-latest | |
| needs: lint | |
| steps: | |
| - name: 🛒 Checkout | |
| uses: actions/checkout@v4 | |
| - name: 🐍 Setup Python 3.11 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.11' | |
| - name: 📦 Install uv | |
| uses: astral-sh/setup-uv@v4 | |
| with: | |
| version: "latest" | |
| - name: 🔧 Install dependencies | |
| run: | | |
| uv pip install --system -e ".[dev]" | |
| - name: 📋 Validate Scenario Schemas | |
| run: | | |
| python -c " | |
| import sys | |
| sys.path.insert(0, '.') | |
| from pathlib import Path | |
| import yaml | |
| # Import schemas to validate they load correctly | |
| from tests.evaluation.schemas import ( | |
| ModelProfile, | |
| TurnEvent, | |
| TurnScore, | |
| RunSummary, | |
| ScenarioExpectations, | |
| SessionAgentConfig, | |
| ) | |
| print('✅ All schema imports successful') | |
| # Validate any YAML scenarios | |
| scenarios_dir = Path('tests/evaluation/scenarios') | |
| if scenarios_dir.exists(): | |
| for yaml_file in scenarios_dir.rglob('*.yaml'): | |
| print(f'📄 Validating: {yaml_file}') | |
| with open(yaml_file) as f: | |
| data = yaml.safe_load(f) | |
| print(f' ✓ Valid YAML structure') | |
| print('') | |
| print('✅ Schema validation complete!') | |
| " | |
| env: | |
| PYTHONPATH: . | |
| - name: 📋 Schema Validation Summary | |
| run: | | |
| echo "## 📋 Schema Validation" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "| Schema Module | Status |" >> $GITHUB_STEP_SUMMARY | |
| echo "|---------------|--------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| config.py | ✅ Valid |" >> $GITHUB_STEP_SUMMARY | |
| echo "| events.py | ✅ Valid |" >> $GITHUB_STEP_SUMMARY | |
| echo "| expectations.py | ✅ Valid |" >> $GITHUB_STEP_SUMMARY | |
| echo "| results.py | ✅ Valid |" >> $GITHUB_STEP_SUMMARY | |
| echo "| foundry.py | ✅ Valid |" >> $GITHUB_STEP_SUMMARY | |
| # ============================================================================ | |
| # JOB: Module Integration | |
| # ============================================================================ | |
| module-integration: | |
| name: 🔗 Module Integration | |
| runs-on: ubuntu-latest | |
| needs: [unit-tests, schema-validation] | |
| steps: | |
| - name: 🛒 Checkout | |
| uses: actions/checkout@v4 | |
| - name: 🐍 Setup Python 3.11 | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.11' | |
| - name: 📦 Install uv | |
| uses: astral-sh/setup-uv@v4 | |
| with: | |
| version: "latest" | |
| - name: 🔧 Install dependencies | |
| run: | | |
| uv pip install --system -e ".[dev]" | |
| - name: 🔗 Test Module Integration | |
| run: | | |
| python -c " | |
| import sys | |
| sys.path.insert(0, '.') | |
| print('🔗 Testing Module Integration') | |
| print('=' * 50) | |
| # Test all modules can be imported together | |
| from tests.evaluation import ( | |
| # Schemas | |
| ModelProfile, | |
| TurnEvent, | |
| ToolCall, | |
| HandoffEvent, | |
| ScenarioExpectations, | |
| RunSummary, | |
| TurnScore, | |
| # Core | |
| EventRecorder, | |
| MetricsScorer, | |
| ) | |
| # Import additional schemas from submodule | |
| from tests.evaluation.schemas import SessionAgentConfig | |
| print('✅ All module imports successful') | |
| print('') | |
| print('🎉 All module integrations verified!') | |
| " | |
| env: | |
| PYTHONPATH: . | |
| - name: 📋 Integration Summary | |
| run: | | |
| echo "## 🔗 Module Integration" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "All evaluation framework modules integrate correctly:" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "| Module | Components |" >> $GITHUB_STEP_SUMMARY | |
| echo "|--------|------------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| schemas | ModelProfile, TurnEvent, ScenarioExpectations |" >> $GITHUB_STEP_SUMMARY | |
| echo "| core | EventRecorder, MetricsScorer |" >> $GITHUB_STEP_SUMMARY | |
| # ============================================================================ | |
| # JOB: Summary | |
| # ============================================================================ | |
| summary: | |
| name: 📊 Test Summary | |
| runs-on: ubuntu-latest | |
| needs: [lint, unit-tests, schema-validation, module-integration] | |
| if: always() | |
| steps: | |
| - name: 📊 Generate Summary | |
| run: | | |
| echo "# 🧪 Evaluation Framework CI Summary" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "## Job Results" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "| Job | Status |" >> $GITHUB_STEP_SUMMARY | |
| echo "|-----|--------|" >> $GITHUB_STEP_SUMMARY | |
| echo "| 🔍 Lint & Type Check | ${{ needs.lint.result == 'success' && '🟢 Passed' || '🔴 Failed' }} |" >> $GITHUB_STEP_SUMMARY | |
| echo "| 🧪 Unit Tests | ${{ needs.unit-tests.result == 'success' && '🟢 Passed' || '🔴 Failed' }} |" >> $GITHUB_STEP_SUMMARY | |
| echo "| 📋 Schema Validation | ${{ needs.schema-validation.result == 'success' && '🟢 Passed' || '🔴 Failed' }} |" >> $GITHUB_STEP_SUMMARY | |
| echo "| 🔗 Module Integration | ${{ needs.module-integration.result == 'success' && '🟢 Passed' || '🔴 Failed' }} |" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "## Framework Components Tested" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Schemas**: Pydantic models for configs, events, results" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Metrics**: Pluggable metrics (tool precision, latency, cost, etc.)" >> $GITHUB_STEP_SUMMARY | |
| echo "- **Scenarios**: End-to-end scenario validation" >> $GITHUB_STEP_SUMMARY | |
| echo "" >> $GITHUB_STEP_SUMMARY | |
| echo "📖 See [Evaluation Framework Docs](https://azure-samples.github.io/art-voice-agent-accelerator/testing/model-evaluation/) for details." >> $GITHUB_STEP_SUMMARY | |
| - name: ✅ Check Results | |
| run: | | |
| if [[ "${{ needs.unit-tests.result }}" != "success" ]]; then | |
| echo "❌ Unit tests failed" | |
| exit 1 | |
| fi | |
| if [[ "${{ needs.schema-validation.result }}" != "success" ]]; then | |
| echo "❌ Schema validation failed" | |
| exit 1 | |
| fi | |
| if [[ "${{ needs.module-integration.result }}" != "success" ]]; then | |
| echo "❌ Module integration failed" | |
| exit 1 | |
| fi | |
| echo "✅ All evaluation framework tests passed!" |