51 changes: 50 additions & 1 deletion rdagent/app/finetune/data_science/conf.py
@@ -1,5 +1,5 @@
import os

import torch
from pydantic_settings import SettingsConfigDict

from rdagent.app.data_science.conf import DS_RD_SETTING
@@ -38,3 +38,52 @@ def update_settings(competition: str):
if hasattr(DS_RD_SETTING, field_name):
setattr(DS_RD_SETTING, field_name, new_value)
DS_RD_SETTING.competition = competition

def get_training_config():
    cuda_available = torch.cuda.is_available()
    return {
        "device": "cuda" if cuda_available else "cpu",
        "batch_size": 32 if cuda_available else 16,
        "use_mixed_precision": cuda_available,
        "num_workers": 4 if cuda_available else 2,
        "pin_memory": cuda_available,
    }

class GPUConfig:
@staticmethod
def setup_cuda_optimizations():
if torch.cuda.is_available():
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

@staticmethod
def get_optimized_batch_size(base_batch_size=32):
if torch.cuda.is_available():
# Adjust based on available GPU memory
gpu_memory = torch.cuda.get_device_properties(0).total_memory
if gpu_memory > 8e9: # 8GB
return base_batch_size * 4
elif gpu_memory > 4e9: # 4GB
return base_batch_size * 2
return base_batch_size

def get_gpu_enhanced_config():
"""Get configuration optimized for GPU if available"""
gpu_available = torch.cuda.is_available()

return {
"training": {
"device": "cuda" if gpu_available else "cpu",
"use_amp": gpu_available,
"gradient_accumulation_steps": 1,
"max_grad_norm": 1.0
},
"data": {
"num_workers": 4 if gpu_available else 2,
"pin_memory": gpu_available,
"prefetch_factor": 2 if gpu_available else 1
},
"model": {
"use_compile": gpu_available,
"optimize_for_inference": gpu_available
}
}
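
A minimal sketch of how these helpers might be wired together at startup; the call sequence below is illustrative and not part of this diff:

# Hypothetical usage: apply CUDA tweaks once, then read device/batch choices.
from rdagent.app.finetune.data_science.conf import (
    GPUConfig,
    get_gpu_enhanced_config,
)

GPUConfig.setup_cuda_optimizations()
config = get_gpu_enhanced_config()

device = config["training"]["device"]
batch_size = GPUConfig.get_optimized_batch_size(base_batch_size=32)
print(f"Training on {device} with batch size {batch_size}")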
23 changes: 21 additions & 2 deletions rdagent/app/finetune/data_science/loop.py
@@ -1,14 +1,33 @@
import asyncio
from pathlib import Path

import fire

from rdagent.app.data_science.conf import DS_RD_SETTING
from rdagent.app.finetune.data_science.conf import update_settings
from rdagent.core.utils import import_class
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.data_science.loop import DataScienceRDLoop

import torch.nn as nn

from ...utils.gpu_utils import setup_gpu

class EnhancedTrainingLoop:
    def __init__(self, criterion=None):
        self.device = setup_gpu()
        # Default to MSE loss, a common choice for time-series regression.
        self.criterion = criterion if criterion is not None else nn.MSELoss()

    def train_time_series_model(self, model, data_loader, optimizer):
        model = model.to(self.device)
        model.train()

        for batch in data_loader:
            # Move data to the selected device
            inputs, targets = batch
            inputs = inputs.to(self.device)
            targets = targets.to(self.device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = self.criterion(outputs, targets)

            loss.backward()
            optimizer.step()

def main(
model: str | None = None,
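
A hedged usage sketch for EnhancedTrainingLoop; the dataset, model, and optimizer below are placeholders invented for illustration, not values from this PR:

# Hypothetical wiring: random tensors stand in for a real time-series dataset.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.randn(256, 20, 10), torch.randn(256, 1))
loader = DataLoader(dataset, batch_size=32, shuffle=True)

model = nn.Sequential(nn.Flatten(), nn.Linear(20 * 10, 1))
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

trainer = EnhancedTrainingLoop(criterion=nn.MSELoss())
trainer.train_time_series_model(model, loader, optimizer)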
37 changes: 36 additions & 1 deletion rdagent/app/general_model/general_model.py
@@ -1,5 +1,7 @@
import fire

import torch
import torch.nn as nn
from ..utils.gpu_utils import setup_gpu
from rdagent.components.coder.model_coder.task_loader import (
ModelExperimentLoaderFromPDFfiles,
)
@@ -10,7 +12,40 @@
from rdagent.scenarios.general_model.scenario import GeneralModelScenario
from rdagent.scenarios.qlib.developer.model_coder import QlibModelCoSTEER

class GPUEnhancedLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout=0.2):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.device = setup_gpu(verbose=False)

        # Dropout is only applied between stacked LSTM layers (num_layers > 1).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                            batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        # Initialize hidden states on the input's device so the model keeps
        # working after it is moved with .to() or .cuda().
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])  # take the last time step only
        return out

class TimeSeriesModelFactory:
def create_model(self, model_type, **kwargs):
model = None
if model_type == "lstm":
model = GPUEnhancedLSTM(
input_size=kwargs.get('input_size', 10),
hidden_size=kwargs.get('hidden_size', 50),
num_layers=kwargs.get('num_layers', 2),
output_size=kwargs.get('output_size', 1)
)
if model:
        model = model.to(setup_gpu(verbose=False))
return model

def extract_models_and_implement(report_file_path: str) -> None:
"""
This is a research copilot to automatically implement models from a report file or paper.
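
A small usage sketch for the factory; the shapes and hyperparameters are illustrative assumptions, not values used elsewhere in the repo:

# Hypothetical example: build the LSTM via the factory and run one forward pass
# on random data shaped (batch, sequence_length, input_size).
import torch

factory = TimeSeriesModelFactory()
model = factory.create_model(
    "lstm", input_size=10, hidden_size=50, num_layers=2, output_size=1
)

x = torch.randn(8, 30, 10).to(next(model.parameters()).device)
predictions = model(x)  # shape: (8, 1)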
33 changes: 33 additions & 0 deletions rdagent/app/utils/dl.py
@@ -0,0 +1,33 @@
import torch
from .gpu_utils import setup_gpu

class GPUDataLoader:
    def __init__(self, dataset, batch_size=32, shuffle=True):
        self.device = setup_gpu()
        cuda_available = torch.cuda.is_available()
        self.loader = torch.utils.data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=4 if cuda_available else 2,
            pin_memory=cuda_available
        )

def __iter__(self):
for data, target in self.loader:
yield data.to(self.device), target.to(self.device)

def create_gpu_optimized_loader(dataset, config=None):
"""Create data loader optimized for GPU"""
if config is None:
config = {}

batch_size = config.get('batch_size', 32)
if torch.cuda.is_available():
# Increase batch size for GPU
batch_size = batch_size * 2

return GPUDataLoader(
dataset,
batch_size=batch_size,
shuffle=config.get('shuffle', True)
)
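
A brief usage sketch; the TensorDataset contents are made up for illustration:

# Hypothetical example: wrap a dataset and iterate over device-resident batches.
import torch
from torch.utils.data import TensorDataset

dataset = TensorDataset(torch.randn(128, 10), torch.randint(0, 2, (128,)))
loader = create_gpu_optimized_loader(dataset, config={"batch_size": 16})

for data, target in loader:
    # Both tensors arrive already on the device chosen by setup_gpu().
    print(data.device, data.shape, target.shape)
    break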
177 changes: 177 additions & 0 deletions rdagent/app/utils/gpu_utils.py
@@ -0,0 +1,177 @@
import gc
import logging
import subprocess

import torch

logger = logging.getLogger(__name__)

def check_nvidia_drivers():
try:
result = subprocess.run(['nvidia-smi'], capture_output=True, text=True)
return result.returncode == 0
except FileNotFoundError:
return False

def is_cuda_built():
    try:
        if hasattr(torch.backends, 'cuda') and hasattr(torch.backends.cuda, 'is_built'):
            return torch.backends.cuda.is_built()
        # Fall back to runtime availability on older PyTorch builds.
        return torch.cuda.is_available()
    except Exception:
        return False

def setup_gpu(verbose=True):
if verbose:
print("Initializing GPU support...")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA built with PyTorch: {is_cuda_built()}")
print(f"CUDA available: {torch.cuda.is_available()}")

if not torch.cuda.is_available():
if verbose:
print("CUDA not available in PyTorch")
print("Possible solutions:")
print("1. Install PyTorch with CUDA support")
print("2. Update NVIDIA drivers")
print("3. Check CUDA toolkit installation")
return torch.device("cpu")

num_gpus = torch.cuda.device_count()
if num_gpus == 0:
if verbose:
print("No GPUs detected")
return torch.device("cpu")

if verbose:
print(f"Found {num_gpus} GPU(s)")

device = torch.device("cuda:0")

try:
test_tensor = torch.tensor([1.0, 2.0, 3.0]).cuda()
del test_tensor
if hasattr(torch.cuda, 'synchronize'):
torch.cuda.synchronize()

if verbose:
gpu_name = torch.cuda.get_device_name(0)
memory = torch.cuda.get_device_properties(0).total_memory / 1e9
print(f"Using GPU: {gpu_name}")
print(f"GPU Memory: {memory:.1f} GB")
            cuda_version = getattr(torch.version, 'cuda', None)
            print(f"CUDA version: {cuda_version or 'Unknown'}")

if hasattr(torch.backends, 'cudnn'):
if hasattr(torch.backends.cudnn, 'benchmark'):
torch.backends.cudnn.benchmark = True
if hasattr(torch.backends.cudnn, 'deterministic'):
torch.backends.cudnn.deterministic = False

return device

except Exception as e:
if verbose:
print(f"GPU initialization failed: {e}")
print("Falling back to CPU")
return torch.device("cpu")

def force_cuda_initialization():
    if torch.cuda.is_available():
        try:
            # Allocating a tensor on the GPU forces lazy CUDA context creation.
            x = torch.empty(1, device="cuda")
            del x
            torch.cuda.synchronize()
            return True
        except Exception as e:
            print(f"CUDA forced initialization failed: {e}")
            return False
    return False

def get_gpu_info():
info = {
"pytorch_version": torch.__version__,
"cuda_available": torch.cuda.is_available(),
"cuda_built": is_cuda_built(),
"gpu_count": torch.cuda.device_count() if torch.cuda.is_available() else 0,
"gpus": []
}

    info["cuda_version"] = getattr(torch.version, 'cuda', None) or "Unknown"

if torch.cuda.is_available():
for i in range(torch.cuda.device_count()):
try:
gpu_info = {
"name": torch.cuda.get_device_name(i),
"memory_total_gb": torch.cuda.get_device_properties(i).total_memory / 1e9,
}
try:
gpu_info["memory_allocated_gb"] = torch.cuda.memory_allocated(i) / 1e9
gpu_info["memory_reserved_gb"] = torch.cuda.memory_reserved(i) / 1e9
                except Exception:
gpu_info["memory_allocated_gb"] = 0
gpu_info["memory_reserved_gb"] = 0
info["gpus"].append(gpu_info)
except Exception as e:
print(f"Could not get info for GPU {i}: {e}")

return info

def print_gpu_memory():
if torch.cuda.is_available():
for i in range(torch.cuda.device_count()):
try:
allocated = torch.cuda.memory_allocated(i) / 1e9
reserved = torch.cuda.memory_reserved(i) / 1e9
total = torch.cuda.get_device_properties(i).total_memory / 1e9
print(f"GPU {i} - Allocated: {allocated:.2f}GB, Reserved: {reserved:.2f}GB, Total: {total:.1f}GB")
except Exception as e:
print(f"Could not get memory info for GPU {i}: {e}")

def clear_gpu_cache():
if torch.cuda.is_available():
try:
torch.cuda.empty_cache()
gc.collect()
except Exception as e:
print(f"Could not clear GPU cache: {e}")

def optimize_model_for_gpu(model):
if torch.cuda.is_available():
try:
model = model.cuda()
if hasattr(torch, 'compile'):
try:
model = torch.compile(model)
print("Model compilation enabled")
except Exception as e:
print(f"Model compilation failed: {e}")
except Exception as e:
print(f"Failed to move model to GPU: {e}")
return model

def check_pytorch_installation():
print("PyTorch Installation Check")
print("=" * 40)
print(f"Version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
print(f"Built with CUDA: {is_cuda_built()}")

if not torch.cuda.is_available():
print("\nRECOMMENDATION:")
print("To enable GPU support, install PyTorch with CUDA:")
print("For CUDA 11.8: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118")
print("For CUDA 12.1: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121")

return torch.cuda.is_available()
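
A short sketch of the intended call sequence; the function names come from this file, while the surrounding script is illustrative:

# Hypothetical startup sequence using the helpers above.
from rdagent.app.utils.gpu_utils import (
    check_pytorch_installation,
    clear_gpu_cache,
    get_gpu_info,
    setup_gpu,
)

check_pytorch_installation()
device = setup_gpu(verbose=True)

info = get_gpu_info()
print(f"Detected {info['gpu_count']} GPU(s), CUDA {info['cuda_version']}")

# ... training work happens here ...
clear_gpu_cache()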