From 89e487be148ce15b9ca98b14b9eb39482445377b Mon Sep 17 00:00:00 2001 From: Chitoku YATO Date: Sat, 23 Aug 2025 23:44:11 -0700 Subject: [PATCH 1/3] feat: Add enhanced GitHub token integration for Docker builds - Add get_github_token() function with multiple environment variable support - Implement preprocess_dockerfile_for_github_api() for automatic GitHub API call handling - Integrate GitHub token authentication into build_container() function - Replace ADD https://api.github.com calls with COPY instructions using pre-fetched data - Add automatic cleanup of temporary files after builds - Maintain backward compatibility with --no-github-api flag - Add comprehensive documentation and test script - Support multiple token environment variables: GITHUB_TOKEN, GITHUB_PAT, GH_TOKEN This enhancement provides higher rate limits (5000 vs 60 requests/hour) and more reliable builds by pre-fetching GitHub data using authenticated API calls instead of relying on Docker's ADD instruction during build time. --- docs/github-token-integration.md | 166 +++++++++++++++++++++++++++++++ jetson_containers/build.py | 2 +- jetson_containers/container.py | 49 ++++++++- jetson_containers/network.py | 91 ++++++++++++++++- test_github_token_integration.py | 123 +++++++++++++++++++++++ 5 files changed, 424 insertions(+), 7 deletions(-) create mode 100644 docs/github-token-integration.md create mode 100644 test_github_token_integration.py diff --git a/docs/github-token-integration.md b/docs/github-token-integration.md new file mode 100644 index 000000000..386a18224 --- /dev/null +++ b/docs/github-token-integration.md @@ -0,0 +1,166 @@ +# GitHub Token Integration + +This document describes the enhanced GitHub token integration feature that automatically handles GitHub API rate limiting during Docker builds. + +## Overview + +The GitHub token integration automatically detects GitHub API calls in Dockerfiles and pre-fetches the required data using authenticated API requests. 
This provides several benefits: + +- **Higher rate limits**: Authenticated requests get 5000 requests/hour vs 60 for unauthenticated +- **More reliable builds**: No more build failures due to rate limiting +- **Better caching**: Uses `COPY` instead of `ADD` for improved Docker layer caching +- **Automatic fallback**: Falls back to the `--no-github-api` workaround if needed + +## How It Works + +1. **Detection**: The build system scans Dockerfiles for `ADD https://api.github.com/...` lines +2. **Authentication**: Uses GitHub token from environment variables if available +3. **Pre-fetching**: Fetches commit hashes and other data using authenticated API calls +4. **File creation**: Creates temporary JSON files with the fetched data +5. **Dockerfile modification**: Replaces `ADD` with `COPY` instructions +6. **Build integration**: Passes commit hashes as build arguments +7. **Cleanup**: Automatically removes temporary files after build completion + +## Environment Variables + +The system supports multiple environment variable names for GitHub tokens: + +- `GITHUB_TOKEN` (primary) +- `GITHUB_PAT` (Personal Access Token) +- `GH_TOKEN` (GitHub CLI style) + +Set any of these variables before running builds: + +```bash +export GITHUB_TOKEN=ghp_your_token_here +./build.sh sudonim +``` + +## Usage Examples + +### Basic Usage (with token) + +```bash +# Set your GitHub token +export GITHUB_TOKEN=ghp_your_token_here + +# Build normally - GitHub API calls will be automatically handled +./build.sh sudonim +./build.sh mlc vllm +``` + +### Fallback Usage (without token) + +```bash +# If no token is set, the system will warn you but continue +./build.sh sudonim + +# Or explicitly disable GitHub API usage +./build.sh --no-github-api sudonim +``` + +### Multiple Packages + +```bash +# Build multiple packages with automatic GitHub API handling +./build.sh --multiple sudonim mlc vllm + +# Or chain them together +./build.sh sudonim mlc vllm +``` + +## Build Arguments + +When GitHub API 
calls are pre-processed, the commit hashes are automatically added as build arguments: + +- `GITHUB_DUSTY-NV_SUDONIM_COMMIT`: Latest commit SHA for dusty-nv/sudonim +- `GITHUB_DUSTY-NV_MLC_COMMIT`: Latest commit SHA for dusty-nv/mlc +- etc. (the name is the upper-cased `owner/repo` with only `/` replaced by `_`, so hyphens are kept) + +These can be used in Dockerfiles or build scripts for version tracking. + +## Technical Details + +### File Structure + +During preprocessing, the system creates: + +``` +package_directory/ +├── Dockerfile # Original Dockerfile +├── Dockerfile.with-github-data # Modified Dockerfile (temporary) +└── .github-api-temp/ # Temporary data directory + ├── dusty-nv_sudonim_main.json + └── dusty-nv_mlc_main.json +``` + +### Dockerfile Transformation + +**Before (original):** +```dockerfile +ADD https://api.github.com/repos/dusty-nv/sudonim/git/refs/heads/main /tmp/sudonim_version.json +``` + +**After (processed):** +```dockerfile +COPY .github-api-temp/dusty-nv_sudonim_main.json /tmp/sudonim_version.json +``` + +### Error Handling + +- **Token missing**: System continues with unauthenticated requests (may hit rate limits) +- **API failures**: Falls back to original Dockerfile behavior +- **Preprocessing errors**: Logs warnings and continues with original approach + +## Troubleshooting + +### Rate Limit Errors + +If you still see rate limit errors: + +1. **Check token**: Verify your GitHub token is set correctly +2. **Token permissions**: Ensure token has `repo` access for private repositories +3. **Token expiration**: GitHub tokens can expire; generate a new one if needed + +### Build Failures + +If builds fail with GitHub API issues: + +1. **Use fallback**: Add `--no-github-api` flag +2. **Check logs**: Look for GitHub API preprocessing messages +3. 
**Verify connectivity**: Ensure your system can reach GitHub's API + +### Debug Mode + +Enable verbose logging to see detailed GitHub API processing: + +```bash +./build.sh --verbose --log-level=debug sudonim +``` + +## Migration from --no-github-api + +The `--no-github-api` flag is still supported and works as before. The new integration: + +- **Enhances** the existing system rather than replacing it +- **Automatically** handles GitHub API calls when possible +- **Falls back** to the original workaround when needed +- **Maintains** backward compatibility + +## Future Enhancements + +Potential improvements for future versions: + +- **Caching**: Cache GitHub API responses to reduce API calls +- **Batch processing**: Process multiple repositories in single API calls +- **Webhook integration**: Trigger rebuilds on repository updates +- **Rate limit monitoring**: Track and report API usage + +## Contributing + +To contribute to this feature: + +1. **Test thoroughly**: Ensure your changes work with various GitHub API scenarios +2. **Handle errors gracefully**: Always provide fallback behavior +3. **Update documentation**: Keep this document current with any changes +4. 
**Follow patterns**: Use the existing logging and error handling patterns diff --git a/jetson_containers/build.py b/jetson_containers/build.py index f20807e47..0a93da2d9 100644 --- a/jetson_containers/build.py +++ b/jetson_containers/build.py @@ -53,7 +53,7 @@ parser.add_argument('--simulate', action='store_true', help="print out the build commands without actually building the containers") parser.add_argument('--push', type=str, default='', help="repo or user to push built container image to (no push by default)") -parser.add_argument('--no-github-api', action='store_true', help="disalbe Github API use to force rebuild on new git commits") +parser.add_argument('--no-github-api', action='store_true', help="disable GitHub API use to force rebuild on new git commits (fallback when no token is available)") parser.add_argument('--log-dir', '--logs', type=str, default=None, help="sets the directory to save container build logs to (default: jetson-containers/logs)") parser.add_argument('--log-level', type=str, default=None, choices=LogConfig.levels, help="sets the logging verbosity level") diff --git a/jetson_containers/container.py b/jetson_containers/container.py index 9644523fb..06cb16a62 100644 --- a/jetson_containers/container.py +++ b/jetson_containers/container.py @@ -28,6 +28,7 @@ split_container_name, query_yes_no, needs_sudo, sudo_prefix, get_env, get_dir, get_repo_dir ) +from .network import preprocess_dockerfile_for_github_api _NEWLINE_=" \\\n" # used when building command strings @@ -179,8 +180,28 @@ def build_container( if 'dockerfile' in pkg: cmd = f"{sudo_prefix()}DOCKER_BUILDKIT=0 docker build --network=host" + _NEWLINE_ cmd += f" --tag {container_name}" + _NEWLINE_ - if no_github_api: - dockerfilepath = os.path.join(pkg['path'], pkg['dockerfile']) + + dockerfilepath = os.path.join(pkg['path'], pkg['dockerfile']) + github_build_args = {} + + # Try to pre-process GitHub API calls if not explicitly disabled + if not no_github_api: + try: + 
processed_dockerfile, github_build_args = preprocess_dockerfile_for_github_api(dockerfilepath, pkg['path']) + if github_build_args: + # Merge with existing build args + if 'build_args' not in pkg: + pkg['build_args'] = {} + pkg['build_args'].update(github_build_args) + dockerfilepath = processed_dockerfile + log_info(f"Pre-processed GitHub API calls for {package}") + except Exception as e: + log_warning(f"Failed to pre-process GitHub API calls for {package}: {e}") + # Fall back to original Dockerfile + dockerfilepath = os.path.join(pkg['path'], pkg['dockerfile']) + + # Fall back to no_github_api logic if preprocessing failed or was disabled + if no_github_api or (dockerfilepath == os.path.join(pkg['path'], pkg['dockerfile']) and 'ADD https://api.github.com' in open(dockerfilepath, 'r').read()): with open(dockerfilepath, 'r') as fp: data = fp.read() if 'ADD https://api.github.com' in data: @@ -189,9 +210,9 @@ def build_container( os.system(f"sed 's|^ADD https://api.github.com|#[minus-github-api]ADD https://api.github.com|' -i {dockerfilepath_minus_github_api}") cmd += f" --file {os.path.join(pkg['path'], pkg['dockerfile'] + '.minus-github-api')}" + _NEWLINE_ else: - cmd += f" --file {os.path.join(pkg['path'], pkg['dockerfile'])}" + _NEWLINE_ + cmd += f" --file {dockerfilepath}" + _NEWLINE_ else: - cmd += f" --file {os.path.join(pkg['path'], pkg['dockerfile'])}" + _NEWLINE_ + cmd += f" --file {dockerfilepath}" + _NEWLINE_ cmd += f" --build-arg BASE_IMAGE={base}" + _NEWLINE_ @@ -280,6 +301,26 @@ def build_container( log_success(f'✅ `jetson-containers build {repo_name}` ({name})') log_success(f'⏱️ Total build time: {total_duration:.1f} seconds ({total_duration/60:.1f} minutes)') log_success('=====================================================================================') + + # Clean up temporary GitHub API files + try: + for pkg in packages: + pkg_info = find_package(pkg) + if 'path' in pkg_info: + temp_dir = os.path.join(pkg_info['path'], '.github-api-temp') + 
if os.path.exists(temp_dir): + import shutil + shutil.rmtree(temp_dir) + log_debug(f"Cleaned up temporary GitHub API directory: {temp_dir}") + + # Also clean up modified Dockerfiles + modified_dockerfile = os.path.join(pkg_info['path'], pkg_info['dockerfile'] + '.with-github-data') + if os.path.exists(modified_dockerfile): + os.remove(modified_dockerfile) + log_debug(f"Cleaned up modified Dockerfile: {modified_dockerfile}") + except Exception as e: + log_warning(f"Failed to clean up temporary GitHub API files: {e}") + log_success('=====================================================================================') return name diff --git a/jetson_containers/network.py b/jetson_containers/network.py index 33d90508d..eea9a9358 100644 --- a/jetson_containers/network.py +++ b/jetson_containers/network.py @@ -5,7 +5,7 @@ import time from typing import Dict, Literal -from .logging import log_error, log_warning, log_verbose +from .logging import log_error, log_warning, log_verbose, log_info def handle_text_request(url, retries=3, backoff=5): @@ -69,6 +69,19 @@ def handle_json_request(url: str, headers: Dict[str, str] = None, return None +def get_github_token(): + """Get GitHub token from environment variables with fallbacks""" + token = os.environ.get('GITHUB_TOKEN') or \ + os.environ.get('GITHUB_PAT') or \ + os.environ.get('GH_TOKEN') + + if not token: + log_warning("No GitHub token found. API calls will be unauthenticated and may hit rate limits.") + log_info("Set GITHUB_TOKEN, GITHUB_PAT, or GH_TOKEN environment variable for higher rate limits.") + + return token + + @functools.lru_cache(maxsize=None) def github_api(url: str): """ @@ -80,7 +93,7 @@ def github_api(url: str): Returns: dict or None: The parsed JSON response data as a dictionary, or None if an error occurs. 
""" - github_token = os.environ.get('GITHUB_TOKEN') + github_token = get_github_token() headers = {'Authorization': f'token {github_token}'} if github_token else None request_url = f'https://api.github.com/{url}' @@ -144,3 +157,77 @@ def get_json_value_from_url(url: str, notation: str = None): return None return data + + +def preprocess_dockerfile_for_github_api(dockerfile_path: str, pkg_path: str): + """ + Pre-process Dockerfile to replace GitHub API calls with pre-fetched data. + + This function: + 1. Detects GitHub API calls in Dockerfiles + 2. Pre-fetches the data using authenticated API calls + 3. Creates temporary files with the fetched data + 4. Modifies the Dockerfile to use COPY instead of ADD + + Args: + dockerfile_path (str): Path to the original Dockerfile + pkg_path (str): Path to the package directory + + Returns: + tuple: (modified_dockerfile_path, build_args_dict) or (original_path, None) if no changes + """ + import re + import json + import os + + with open(dockerfile_path, 'r') as fp: + content = fp.read() + + # Find all GitHub API calls + github_api_pattern = r'ADD https://api\.github\.com/repos/([^/]+/[^/]+)/git/refs/heads/([^\s]+)\s+([^\s]+)' + matches = re.findall(github_api_pattern, content) + + if not matches: + return dockerfile_path, None + + # Create a temporary directory for pre-fetched data + temp_dir = os.path.join(pkg_path, '.github-api-temp') + os.makedirs(temp_dir, exist_ok=True) + + modified_content = content + build_args = {} + + for owner_repo, branch, dest_path in matches: + log_info(f"Pre-fetching GitHub data for {owner_repo}/{branch}") + + # Fetch the commit hash using authenticated API + commit_sha = github_latest_commit(owner_repo, branch) + + if commit_sha: + # Create a temporary file with the commit data + temp_file = os.path.join(temp_dir, f"{owner_repo.replace('/', '_')}_{branch}.json") + with open(temp_file, 'w') as f: + json.dump({"sha": commit_sha, "ref": f"refs/heads/{branch}"}, f) + + # Replace ADD with COPY + 
old_line = f'ADD https://api.github.com/repos/{owner_repo}/git/refs/heads/{branch} {dest_path}' + new_line = f'COPY .github-api-temp/{os.path.basename(temp_file)} {dest_path}' + modified_content = modified_content.replace(old_line, new_line) + + # Add build arg for the commit SHA + build_args[f'GITHUB_{owner_repo.replace("/", "_").upper()}_COMMIT'] = commit_sha + + log_info(f"Successfully pre-fetched commit {commit_sha[:8]} for {owner_repo}/{branch}") + else: + log_warning(f"Failed to fetch commit for {owner_repo}/{branch}, keeping original ADD line") + + if modified_content != content: + # Write modified Dockerfile + modified_dockerfile = dockerfile_path + '.with-github-data' + with open(modified_dockerfile, 'w') as fp: + fp.write(modified_content) + + log_info(f"Created modified Dockerfile: {modified_dockerfile}") + return modified_dockerfile, build_args + + return dockerfile_path, None diff --git a/test_github_token_integration.py b/test_github_token_integration.py new file mode 100644 index 000000000..88b2bdc98 --- /dev/null +++ b/test_github_token_integration.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +""" +Test script for GitHub token integration functionality. +This script tests the new preprocess_dockerfile_for_github_api function. 
+""" + +import os +import sys +import tempfile +import shutil + +# Add the jetson_containers module to the path +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'jetson_containers')) + +from network import preprocess_dockerfile_for_github_api, get_github_token + +def test_github_token_detection(): + """Test GitHub token detection from environment variables""" + print("Testing GitHub token detection...") + + # Test with no token + token = get_github_token() + print(f"Token detected: {'Yes' if token else 'No'}") + + if token: + print(f"Token preview: {token[:8]}...") + else: + print("No token found - this is expected if GITHUB_TOKEN is not set") + + return True + +def test_dockerfile_preprocessing(): + """Test Dockerfile preprocessing functionality""" + print("\nTesting Dockerfile preprocessing...") + + # Create a temporary directory + temp_dir = tempfile.mkdtemp() + + try: + # Create a test Dockerfile with GitHub API calls + dockerfile_content = """# Test Dockerfile +FROM ubuntu:20.04 + +# This should be replaced +ADD https://api.github.com/repos/dusty-nv/sudonim/git/refs/heads/main /tmp/sudonim_version.json + +# This should also be replaced +ADD https://api.github.com/repos/dusty-nv/mlc/git/refs/heads/main /tmp/mlc_version.json + +RUN echo "Build complete" +""" + + dockerfile_path = os.path.join(temp_dir, 'Dockerfile') + with open(dockerfile_path, 'w') as f: + f.write(dockerfile_content) + + print(f"Created test Dockerfile: {dockerfile_path}") + + # Test preprocessing + result = preprocess_dockerfile_for_github_api(dockerfile_path, temp_dir) + modified_dockerfile, build_args = result + + print(f"Preprocessing result:") + print(f" Modified Dockerfile: {modified_dockerfile}") + print(f" Build args: {build_args}") + + if modified_dockerfile != dockerfile_path: + print("✅ Preprocessing successful!") + + # Show the modified content + with open(modified_dockerfile, 'r') as f: + modified_content = f.read() + print("\nModified Dockerfile content:") + print("=" * 
50) + print(modified_content) + print("=" * 50) + + # Check if .github-api-temp directory was created + temp_api_dir = os.path.join(temp_dir, '.github-api-temp') + if os.path.exists(temp_api_dir): + print(f"✅ Temporary API directory created: {temp_api_dir}") + print("Files in temp directory:") + for file in os.listdir(temp_api_dir): + print(f" - {file}") + else: + print("❌ Temporary API directory not created") + + else: + print("❌ Preprocessing failed - no changes made") + + return True + + finally: + # Clean up + shutil.rmtree(temp_dir) + print(f"\nCleaned up temporary directory: {temp_dir}") + +def main(): + """Main test function""" + print("GitHub Token Integration Test Suite") + print("=" * 50) + + try: + # Test 1: Token detection + test_github_token_detection() + + # Test 2: Dockerfile preprocessing + test_dockerfile_preprocessing() + + print("\n" + "=" * 50) + print("✅ All tests completed successfully!") + + except Exception as e: + print(f"\n❌ Test failed with error: {e}") + import traceback + traceback.print_exc() + return False + + return True + +if __name__ == "__main__": + success = main() + sys.exit(0 if success else 1) From f48273095017e13d79b3424ac957069277486ed2 Mon Sep 17 00:00:00 2001 From: Chitoku YATO Date: Sat, 23 Aug 2025 23:49:29 -0700 Subject: [PATCH 2/3] fix: Remove conflicting local shutil import in cleanup function --- jetson_containers/container.py | 1 - 1 file changed, 1 deletion(-) diff --git a/jetson_containers/container.py b/jetson_containers/container.py index 06cb16a62..7c1b7f697 100644 --- a/jetson_containers/container.py +++ b/jetson_containers/container.py @@ -309,7 +309,6 @@ def build_container( if 'path' in pkg_info: temp_dir = os.path.join(pkg_info['path'], '.github-api-temp') if os.path.exists(temp_dir): - import shutil shutil.rmtree(temp_dir) log_debug(f"Cleaned up temporary GitHub API directory: {temp_dir}") From 238f347ecb1385c918b4574b29f1cf2577b46e15 Mon Sep 17 00:00:00 2001 From: Chitoku YATO Date: Sun, 24 Aug 2025 
00:15:01 -0700 Subject: [PATCH 3/3] fix: Add cleanup for .minus-github-api temporary files - Add cleanup logic for Dockerfile.minus-github-api files - Ensures both types of temporary files are removed after builds - Prevents accumulation of temporary files in package directories - Maintains clean package directory state after all build scenarios --- jetson_containers/container.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/jetson_containers/container.py b/jetson_containers/container.py index 7c1b7f697..112d768d9 100644 --- a/jetson_containers/container.py +++ b/jetson_containers/container.py @@ -302,7 +302,7 @@ def build_container( log_success(f'⏱️ Total build time: {total_duration:.1f} seconds ({total_duration/60:.1f} minutes)') log_success('=====================================================================================') - # Clean up temporary GitHub API files + # Clean up temporary GitHub API files try: for pkg in packages: pkg_info = find_package(pkg) @@ -317,6 +317,12 @@ def build_container( if os.path.exists(modified_dockerfile): os.remove(modified_dockerfile) log_debug(f"Cleaned up modified Dockerfile: {modified_dockerfile}") + + # Clean up fallback Dockerfiles + fallback_dockerfile = os.path.join(pkg_info['path'], pkg_info['dockerfile'] + '.minus-github-api') + if os.path.exists(fallback_dockerfile): + os.remove(fallback_dockerfile) + log_debug(f"Cleaned up fallback Dockerfile: {fallback_dockerfile}") except Exception as e: log_warning(f"Failed to clean up temporary GitHub API files: {e}")