Merged
25 commits
079e31e
add mapping for genders in metadata
carmichaelong Nov 13, 2024
f9e32be
add makeRequestWithRetry and initial test
carmichaelong Nov 20, 2024
e9fceec
add tests folder
carmichaelong Nov 20, 2024
6ff69d4
add pytest and fill out tests for requests retry
carmichaelong Nov 21, 2024
484f2bf
use mock for tests. add post to allowed methods for retry
carmichaelong Nov 22, 2024
bda3d82
replace requests calls with retry wrapper
carmichaelong Nov 22, 2024
57e14f4
do not use retries for dequeue loop in app
carmichaelong Nov 22, 2024
6b7bdeb
Merge pull request #215 from stanfordnmbl/gender-mapping
carmichaelong Dec 5, 2024
43cd355
Merge pull request #217 from stanfordnmbl/make-request-with-retry
carmichaelong Dec 5, 2024
4bfcf1c
Intercepting error of human detection pose not detected in hrnet.
AlbertoCasasOrtiz Dec 10, 2024
3e8702b
Merge branch 'main' into dev
antoinefalisse Dec 12, 2024
8c1459c
support for lying checkerboard
antoinefalisse Dec 12, 2024
cfe2c9e
Merge pull request #219 from stanfordnmbl/error-2-hrnet-detection-pose
AlbertoCasasOrtiz Dec 12, 2024
f69fe47
Merge pull request #220 from stanfordnmbl/flat_checkerboard
AlbertoCasasOrtiz Dec 17, 2024
0cb0372
Allowing running multiple instances of opencap in one server.
AlbertoCasasOrtiz Jan 15, 2025
f807e25
Modified Makefile and docker-compose to allow multiple instances in s…
AlbertoCasasOrtiz Jan 21, 2025
98a385d
Renamed scripts. Added script to stop all.
AlbertoCasasOrtiz Jan 23, 2025
26fa709
Added script to check container status and start single containers.
AlbertoCasasOrtiz Jan 28, 2025
666e165
Added ugly logging to docker containers. Added try catch to finally i…
AlbertoCasasOrtiz Jan 28, 2025
f42a602
catch exception if patch fails in except block
AlbertoCasasOrtiz Jan 30, 2025
b14f5bc
use different gpu for each instance
carmichaelong Feb 4, 2025
638807e
add error logging json file (optional) for machines running app.py
carmichaelong Feb 5, 2025
7d7e215
set defaults for INSTANCE_ID and CPU_SET to restore make run usage
carmichaelong Feb 5, 2025
116e8d4
retry test session with random waiting time, simplify makeRequestWith…
carmichaelong Feb 7, 2025
cc976fa
Merge pull request #226 from stanfordnmbl/server
AlbertoCasasOrtiz Feb 13, 2025
2 changes: 0 additions & 2 deletions .gitignore
@@ -22,6 +22,4 @@ Examples/reprocessDataServer.py
*.ini
*.stats

tests/

newsletter.py
56 changes: 44 additions & 12 deletions app.py
@@ -9,12 +9,13 @@
import glob
from datetime import datetime, timedelta
import numpy as np
from utilsAPI import getAPIURL, getWorkerType, getASInstance, unprotect_current_instance, get_number_of_pending_trials
from utilsAPI import getAPIURL, getWorkerType, getErrorLogBool, getASInstance, unprotect_current_instance, get_number_of_pending_trials
from utilsAuth import getToken
from utils import (getDataDirectory, checkTime, checkResourceUsage,
sendStatusEmail, checkForTrialsWithStatus,
getCommitHash, getHostname, postLocalClientInfo,
postProcessedDuration)
postProcessedDuration, makeRequestWithRetry,
writeToErrorLog)

logging.basicConfig(level=logging.INFO)

@@ -24,6 +25,9 @@
autoScalingInstance = getASInstance()
logging.info(f"AUTOSCALING TEST INSTANCE: {autoScalingInstance}")

ERROR_LOG = getErrorLogBool()
error_log_path = "/data/error_log.json"

# if true, will delete entire data directory when finished with a trial
isDocker = True

@@ -120,8 +124,10 @@
error_msg['error_msg'] = 'No videos uploaded. Ensure phones are connected and you have stable internet connection.'
error_msg['error_msg_dev'] = 'No videos uploaded.'

r = requests.patch(trial_url, data={"status": "error", "meta": json.dumps(error_msg)},
headers = {"Authorization": "Token {}".format(API_TOKEN)})
r = makeRequestWithRetry('PATCH',
trial_url,
data={"status": "error", "meta": json.dumps(error_msg)},
headers = {"Authorization": "Token {}".format(API_TOKEN)})
continue

# The following is now done in main, to allow reprocessing trials with missing videos
@@ -149,15 +155,33 @@

# note a result needs to be posted for the API to know we finished, but we are posting them
# automatically through processTrial now
r = requests.patch(trial_url, data={"status": "done"},
headers = {"Authorization": "Token {}".format(API_TOKEN)})
r = makeRequestWithRetry('PATCH',
trial_url,
data={"status": "done"},
headers = {"Authorization": "Token {}".format(API_TOKEN)})

logging.info('0.5s pause if need to restart.')
time.sleep(0.5)

except Exception as e:
r = requests.patch(trial_url, data={"status": "error"},
headers = {"Authorization": "Token {}".format(API_TOKEN)})
traceback.print_exc()
try:
r = makeRequestWithRetry('PATCH',
trial_url, data={"status": "error"},
headers = {"Authorization": "Token {}".format(API_TOKEN)})
traceback.print_exc()

if ERROR_LOG:
stack = traceback.format_exc()
writeToErrorLog(error_log_path, trial["session"], trial["id"],
e, stack)

except:
traceback.print_exc()

if ERROR_LOG:
stack = traceback.format_exc()
writeToErrorLog(error_log_path, trial["session"], trial["id"],
e, stack)

# Antoine: Removing this, it is too often causing the machines to stop. Not because
# the machines are failing, but because for instance the video is very long with a lot
@@ -172,8 +196,16 @@

finally:
# End process duration timer and post duration to database
process_end_time = datetime.now()
postProcessedDuration(trial_url, process_end_time - process_start_time)
try:
process_end_time = datetime.now()
postProcessedDuration(trial_url, process_end_time - process_start_time)
except Exception as e:
traceback.print_exc()

if ERROR_LOG:
stack = traceback.format_exc()
writeToErrorLog(error_log_path, trial["session"], trial["id"],
e, stack)

justProcessed = True

@@ -182,4 +214,4 @@
folders = glob.glob(os.path.join(getDataDirectory(isDocker=True),'Data','*'))
for f in folders:
shutil.rmtree(f)
logging.info('deleting ' + f)
logging.info('deleting ' + f)
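
Note: makeRequestWithRetry and writeToErrorLog are imported from utils, whose diff is not shown in this section. As a rough sketch only, based on the commit messages (retries with random waiting time, an optional JSON error log) rather than the actual utils.py implementation, they might look like:

import json
import os
import random
import time
from datetime import datetime

import requests

def makeRequestWithRetry(method, url, data=None, headers=None,
                         retries=5, backoff_factor=1.0):
    # Sketch: retry connection errors and 5xx responses with jittered
    # exponential backoff; PATCH/POST are retried the same way as GET.
    for attempt in range(retries):
        try:
            r = requests.request(method, url, data=data, headers=headers)
            if r.status_code < 500:
                r.raise_for_status()  # raise on 4xx, do not retry those
                return r
        except requests.exceptions.ConnectionError:
            pass
        # Random component per the "random waiting time" commit message.
        time.sleep(backoff_factor * (2 ** attempt) + random.uniform(0, 1))
    raise RuntimeError("{} {} failed after {} attempts".format(method, url, retries))

def writeToErrorLog(error_log_path, session_id, trial_id, exception, stack):
    # Sketch: append one JSON record per failure so that machines running
    # app.py keep a local, inspectable history of errors.
    entry = {"datetime": datetime.now().isoformat(),
             "session": session_id,
             "trial": trial_id,
             "error": str(exception),
             "stack": stack}
    log = []
    if os.path.exists(error_log_path):
        with open(error_log_path) as f:
            log = json.load(f)
    log.append(entry)
    with open(error_log_path, "w") as f:
        json.dump(log, f, indent=2)
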
23 changes: 14 additions & 9 deletions docker/Makefile
@@ -3,6 +3,10 @@ REPO_NAME := opencap
PROD_BRANCH := main
DEV_BRANCH := dev

# Initialize variables if not passed in
INSTANCE_ID ?= 0
CPU_SET ?= ""

# Determine the branch name
CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD)

@@ -68,12 +72,13 @@ endif

.PHONY: run
run:
ifeq ($(CURRENT_BRANCH),$(PROD_BRANCH))
aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com

else ifeq ($(CURRENT_BRANCH),$(DEV_BRANCH))
aws ecr get-login-password --region us-west-2 --profile opencap | docker login --username AWS --password-stdin 660440363484.dkr.ecr.us-west-2.amazonaws.com

endif

OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) docker-compose up
@echo "Usage: sudo make run INSTANCE_ID=<unique_id> CPU_SET=<cpu_set>"
@echo "Defaults: INSTANCE_ID=0, CPU_SET=\"\""

COMPOSE_PROJECT_NAME=opencap_$(INSTANCE_ID) \
OPENCAP_IMAGE_NAME=$(OPENCAP_IMAGE_NAME) \
OPENPOSE_IMAGE_NAME=$(OPENPOSE_IMAGE_NAME) \
MMPOSE_IMAGE_NAME=$(MMPOSE_IMAGE_NAME) \
INSTANCE_ID=$(INSTANCE_ID) \
CPU_SET=$(CPU_SET) \
docker compose up -d
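
With the INSTANCE_ID and CPU_SET defaults above, a bare make run still works and launches a single instance as compose project opencap_0 with no CPU pinning. Additional instances on the same server are started with explicit values, e.g. sudo make run INSTANCE_ID=1 CPU_SET="14-27", so each one gets a unique project name, GPU index, and CPU range.
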
34 changes: 34 additions & 0 deletions docker/check-containers-health.sh
@@ -0,0 +1,34 @@
#!/bin/bash

# Function to check if a container is running
is_container_alive() {
local container_name=$1
docker ps --filter "name=^/${container_name}$" --filter "status=running" --format '{{.Names}}' | grep -wq "$container_name"
return $?
}

# Loop through numbers 0 to 7
for n in {0..7}; do
# Container names
opencap_openpose="opencap_${n}-openpose-1"
opencap_mmpose="opencap_${n}-mmpose-1"
opencap_mobilecap="opencap_${n}-mobilecap-1"

# Check if all three containers are alive
if is_container_alive "$opencap_openpose" && \
is_container_alive "$opencap_mmpose" && \
is_container_alive "$opencap_mobilecap"; then
echo "All containers for instance $n are alive. Skipping."
continue
fi

# Check if any container exists
if docker ps -a --filter "name=^/opencap_${n}-(openpose|mmpose|mobilecap)-1$" --format '{{.Names}}' | grep -q "opencap_${n}"; then
echo "Some containers for instance $n are not alive. Stopping instance."
./stop-container.sh "$n"
./start-container.sh "$n"
else
echo "No containers for instance $n. Skipping."
fi

done
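
This health check is presumably meant to be run periodically (for example from cron): instances whose three containers are all running are left alone, partially dead instances are torn down and relaunched via the stop/start scripts below, and instance numbers with no containers at all are skipped.
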
25 changes: 22 additions & 3 deletions docker/docker-compose.yaml
@@ -13,8 +13,14 @@ services:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
device_ids: ["${INSTANCE_ID}"]
cpuset: "${CPU_SET}"
logging:
driver: "json-file"
options:
max-size: "100m" # Rotate when the log reaches 100MB
max-file: "7" # Keep the last 7 log files
openpose:
image: ${OPENPOSE_IMAGE_NAME}
volumes:
@@ -24,8 +30,14 @@
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
device_ids: ["${INSTANCE_ID}"]
cpuset: "${CPU_SET}"
logging:
driver: "json-file"
options:
max-size: "100m" # Rotate when the log reaches 100MB
max-file: "7" # Keep the last 7 log files
mmpose:
image: ${MMPOSE_IMAGE_NAME}
volumes:
@@ -35,7 +47,14 @@
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
device_ids: ["${INSTANCE_ID}"]
cpuset: "${CPU_SET}"
logging:
driver: "json-file"
options:
max-size: "100m" # Rotate when the log reaches 100MB
max-file: "7" # Keep the last 7 log files

volumes:
data: {}
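
The compose change replaces count: 1 with an explicit device_ids entry, pinning each instance to the GPU whose index equals its INSTANCE_ID instead of letting every instance claim the first free device, and the json-file logging options cap each container at seven rotated 100 MB log files. As an illustration only (the image name below is hypothetical, and the PR configures this through compose, not the SDK), roughly equivalent settings through the Docker Python SDK would be:

import docker
from docker.types import DeviceRequest, LogConfig

client = docker.from_env()
container = client.containers.run(
    "opencap-image",  # hypothetical image name, for illustration only
    detach=True,
    # Pin to one GPU by index, matching device_ids: ["${INSTANCE_ID}"].
    device_requests=[DeviceRequest(device_ids=["0"],
                                   capabilities=[["gpu"]])],
    cpuset_cpus="0-13",  # matching cpuset: "${CPU_SET}"
    # Rotating json-file logging, matching the options above.
    log_config=LogConfig(type=LogConfig.types.JSON,
                         config={"max-size": "100m", "max-file": "7"}),
)
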
51 changes: 51 additions & 0 deletions docker/start-container.sh
@@ -0,0 +1,51 @@
#!/bin/bash

# Configuration
MAX_INSTANCES=8
CPUS_PER_INSTANCE=14
GPUS_PER_INSTANCE=1

# Get the total number of CPUs and GPUs available
TOTAL_CPUS=$(nproc)
TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

# Check if an instance number is provided
if [ -z "$1" ]; then
echo "Usage: $0 <instance_number>"
echo "Provide the instance number to start (0 to $((MAX_INSTANCES - 1)))."
exit 1
fi

INSTANCE_NUMBER=$1

# Validate the instance number
if (( INSTANCE_NUMBER < 0 || INSTANCE_NUMBER >= MAX_INSTANCES )); then
echo "Error: Instance number must be between 0 and $((MAX_INSTANCES - 1))."
exit 1
fi

# Compute CPU and GPU offsets for the selected instance
CPU_START=$(( INSTANCE_NUMBER * CPUS_PER_INSTANCE ))
CPU_END=$(( CPU_START + CPUS_PER_INSTANCE - 1 ))
CPU_SET="${CPU_START}-${CPU_END}"

# Validate resource availability
if (( CPU_START + CPUS_PER_INSTANCE > TOTAL_CPUS )); then
echo "Error: Not enough CPUs available for instance $INSTANCE_NUMBER."
exit 1
fi

if (( INSTANCE_NUMBER >= TOTAL_GPUS )); then
echo "Error: Not enough GPUs available for instance $INSTANCE_NUMBER."
exit 1
fi

# Start the specific instance
echo "Starting instance $INSTANCE_NUMBER with CPU_SET=${CPU_SET} and GPU=${INSTANCE_NUMBER}"

# Run docker-compose for the specific instance
make run INSTANCE_ID=$INSTANCE_NUMBER CPU_SET=$CPU_SET

sleep 10

echo "Instance $INSTANCE_NUMBER started successfully."
60 changes: 60 additions & 0 deletions docker/start-containers.sh
@@ -0,0 +1,60 @@
#!/bin/bash

# Configuration
MAX_INSTANCES=8
CPUS_PER_INSTANCE=14
GPUS_PER_INSTANCE=1

# Get the total number of CPUs and GPUs available
TOTAL_CPUS=$(nproc)
TOTAL_GPUS=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)

# Read number of instances to start
if [ -z "$1" ]; then
echo "Usage: $0 <number_of_instances>"
echo "Provide the number of instances to start (max $MAX_INSTANCES)."
exit 1
fi

NUM_INSTANCES=$1

# Validate the number of instances
if (( NUM_INSTANCES > MAX_INSTANCES )); then
echo "Error: Maximum number of instances is $MAX_INSTANCES."
exit 1
fi

# Check if there are enough resources
if (( NUM_INSTANCES * CPUS_PER_INSTANCE > TOTAL_CPUS )); then
echo "Error: Not enough CPUs. Required: $((NUM_INSTANCES * CPUS_PER_INSTANCE)), Available: $TOTAL_CPUS."
exit 1
fi

if (( NUM_INSTANCES * GPUS_PER_INSTANCE > TOTAL_GPUS )); then
echo "Error: Not enough GPUs. Required: $((NUM_INSTANCES * GPUS_PER_INSTANCE)), Available: $TOTAL_GPUS."
exit 1
fi

# Display summary
echo "Starting $NUM_INSTANCES instances..."
echo "Total CPUs: $TOTAL_CPUS (using $CPUS_PER_INSTANCE per instance)"
echo "Total GPUs: $TOTAL_GPUS (using $GPUS_PER_INSTANCE per instance)"
echo

# Start instances
for (( i=0; i<NUM_INSTANCES; i++ )); do
INSTANCE_ID=$i
CPU_START=$(( i * CPUS_PER_INSTANCE ))
CPU_END=$(( CPU_START + CPUS_PER_INSTANCE - 1 ))
CPU_SET="${CPU_START}-${CPU_END}"

echo "Starting instance $INSTANCE_ID with CPU_SET=${CPU_SET} and GPU=${INSTANCE_ID}"

# Run docker-compose for each instance
make run INSTANCE_ID=$INSTANCE_ID CPU_SET=$CPU_SET

sleep 2
done

echo "All instances started successfully."

1 change: 1 addition & 0 deletions docker/stop-all-containers.sh
@@ -0,0 +1 @@
for i in $(seq 0 7); do ./stop-container.sh $i; done
25 changes: 25 additions & 0 deletions docker/stop-container.sh
@@ -0,0 +1,25 @@
#!/bin/bash

# Check if INSTANCE_ID is provided
if [ -z "$1" ]; then
echo "Usage: $0 <INSTANCE_ID>"
exit 1
fi

INSTANCE_ID=$1
COMPOSE_PROJECT_NAME="opencap_${INSTANCE_ID}"

echo "Stopping and removing containers for INSTANCE_ID=${INSTANCE_ID}..."

# Stop and remove containers associated with the project
docker-compose \
--project-name $COMPOSE_PROJECT_NAME \
down

# Verify if containers are removed
if [ $? -eq 0 ]; then
echo "Successfully stopped and removed containers for INSTANCE_ID=${INSTANCE_ID}."
else
echo "Failed to stop and remove containers for INSTANCE_ID=${INSTANCE_ID}."
fi
